【记录】一次pd.to_datetime处理异常
G-Research模拟环境下对于时间的处理异常,代码如下:
import datetime
import os
import sys

import numpy as np
import pandas as pd

try:
    import gresearch_crypto
except ImportError:
    # The competition API only exists inside the Kaggle runtime. Tolerate its
    # absence so the data-processing methods stay importable elsewhere;
    # __init__ reports the problem when an environment is actually needed.
    gresearch_crypto = None


class gresearch_guada():
    """Helpers for the G-Research crypto forecasting competition data.

    Holds the paths of the competition CSV files and the test iterator
    obtained from the ``gresearch_crypto`` environment, and offers methods to
    load the training data and to fill predictions with the nearest-in-time
    training ``Target``.
    """

    def __init__(self):
        # Training set.
        self.train = '/kaggle/input/g-research-crypto-forecasting/train.csv'
        # Supplemental training data (used as a validation set).
        self.supplemental_train = '/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv'
        # Asset details, covering the 14 crypto assets.
        self.asset_details = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'
        # Sample of the test data.
        self.example_test = '/kaggle/input/g-research-crypto-forecasting/example_test.csv'

        if gresearch_crypto is None:
            raise ImportError(
                "gresearch_crypto is not installed; it is only available "
                "inside the Kaggle competition environment"
            )
        self.env = gresearch_crypto.make_env()
        self.iter_test = self.env.iter_test()

    def dataReader(self, datasetName):
        """Load the ``Target``, ``Asset_ID`` and ``timestamp`` columns of a dataset.

        Args:
            datasetName: ``'train'`` or ``'supplemental_train'``.

        Returns:
            The selected CSV as a DataFrame, with ``Asset_ID`` read as ``int8``.

        Raises:
            ValueError: If ``datasetName`` is not one of the supported names.
        """
        if datasetName == 'train':
            csv_path = self.train
        elif datasetName == 'supplemental_train':
            csv_path = self.supplemental_train
        else:
            # Previously this only printed the message and then crashed with
            # UnboundLocalError on ``return df``; fail explicitly instead.
            raise ValueError("ERROR [1018] - message: 数据集传入参数错误!")

        return pd.read_csv(
            csv_path,
            usecols=['Target', 'Asset_ID', 'timestamp'],
            dtype={'Asset_ID': 'int8'},
        )

    def datetimeProc(self, datasetName):
        """Fill a test batch's ``Target`` with the nearest-in-time training value.

        Args:
            datasetName: Training DataFrame (despite the parameter name) with
                ``timestamp`` (epoch seconds), ``Asset_ID`` and ``Target``
                columns. It is not modified.

        Returns:
            The prediction DataFrame of the first batch yielded by
            ``self.iter_test`` with ``Target`` filled in (0 where no training
            data exists for the asset), or ``None`` if the iterator is empty.
        """
        # Index the training data by datetime on a copy, so the caller's
        # frame does not silently gain a 'datetime' column.
        history = (
            datasetName
            .assign(datetime=pd.to_datetime(datasetName['timestamp'], unit='s'))
            .set_index('datetime')
            .drop('timestamp', axis=1)
        )
        # Keep only 2021 data from June onwards.
        history = history[(history.index.year == 2021) & (history.index.month > 5)]

        # One gap-free, minute-by-minute frame per asset; missing minutes are
        # filled by interpolation.
        dfs = {
            asset_id: history[history['Asset_ID'] == asset_id].resample('1min').interpolate()
            for asset_id in history['Asset_ID'].unique()
        }
        # Release the filtered frame; only the per-asset frames are needed now.
        del history

        for datasetName_test, datasetName_pred in self.iter_test:
            print("---- 没有datetime的数据集:\n")
            print(datasetName_test['timestamp'])
            # Test timestamps are epoch *seconds* (e.g. 1623542400 is
            # 2021-06-13); parsing them with unit='ms' yields dates in
            # January 1970.
            datasetName_test['datetime'] = pd.to_datetime(
                datasetName_test['timestamp'], unit='s', errors='raise'
            )
            print(datasetName_test['datetime'])

            for _, row in datasetName_test.iterrows():
                asset_history = dfs.get(row['Asset_ID'])
                if asset_history is None:
                    # No training data for this asset: leave Target unset so
                    # the fillna(0) below supplies the fallback.
                    print("时间处理函数遭遇异常!")
                    continue
                # Index.get_loc(..., method='nearest') was removed in pandas
                # 2.0; get_indexer is the supported nearest-match lookup.
                nearest_pos = asset_history.index.get_indexer(
                    [row['datetime']], method='nearest'
                )[0]
                closest_train_sample = asset_history.iloc[nearest_pos]
                datasetName_pred.loc[
                    datasetName_pred['row_id'] == row['row_id'], 'Target'
                ] = closest_train_sample['Target']

            datasetName_pred['Target'] = datasetName_pred['Target'].fillna(0)
            print(datasetName_pred)
            # NOTE(review): the original indentation was lost; the return is
            # assumed to sit inside the batch loop, so only the first batch
            # is processed -- confirm against the original notebook.
            return datasetName_pred

        return None
......
异常代码段:
# Excerpt of the loop under investigation: print each test batch's raw
# timestamps, convert them with unit='ms', then print the converted column.
for datasetName_test, datasetName_pred in self.iter_test:
    raw_timestamps = datasetName_test['timestamp']
    print("---- 没有datetime的数据集:\n")
    print(raw_timestamps)
    datasetName_test['datetime'] = pd.to_datetime(
        raw_timestamps, unit='ms', errors='raise'
    )
    print(datasetName_test['datetime'])
异常Output:unit=ms时(时间戳 1623542400 实际以秒为单位,按毫秒解析后得到 1970-01-19 的错误日期)
This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set. ---- 没有datetime的数据集: 0 1623542400 1 1623542400 2 1623542400 3 1623542400 4 1623542400 5 1623542400 6 1623542400 7 1623542400 8 1623542400 9 1623542400 10 1623542400 11 1623542400 12 1623542400 13 1623542400 Name: timestamp, dtype: int64 0 1970-01-19 18:59:02.400 1 1970-01-19 18:59:02.400 2 1970-01-19 18:59:02.400 3 1970-01-19 18:59:02.400 4 1970-01-19 18:59:02.400 5 1970-01-19 18:59:02.400 6 1970-01-19 18:59:02.400 7 1970-01-19 18:59:02.400 8 1970-01-19 18:59:02.400 9 1970-01-19 18:59:02.400 10 1970-01-19 18:59:02.400 11 1970-01-19 18:59:02.400 12 1970-01-19 18:59:02.400 13 1970-01-19 18:59:02.400 Name: datetime, dtype: datetime64[ns]
异常Output:unit=s时(解析结果 2021-06-13 实际上是正确的;由于所有时间均为 00:00:00,pandas 只显示日期部分)
This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set. ---- 没有datetime的数据集: 0 1623542400 1 1623542400 2 1623542400 3 1623542400 4 1623542400 5 1623542400 6 1623542400 7 1623542400 8 1623542400 9 1623542400 10 1623542400 11 1623542400 12 1623542400 13 1623542400 Name: timestamp, dtype: int64 0 2021-06-13 1 2021-06-13 2 2021-06-13 3 2021-06-13 4 2021-06-13 5 2021-06-13 6 2021-06-13 7 2021-06-13 8 2021-06-13 9 2021-06-13 10 2021-06-13 11 2021-06-13 12 2021-06-13 13 2021-06-13 Name: datetime, dtype: datetime64[ns]