Python 缺失值填充——固定值填充法
#!/usr/bin/env python3
###################################
### Author: huangning           ###
## Email: tonyandrewhn@126.com   ##
###################################
"""Interactive missing-value filling for the loan-default data sets.

The two central pieces are ``bdci.dataset_FillBatch`` (walks every column of a
DataFrame) and ``bdci.dataset_Fillnan`` (asks the user how to fill one column).
"""

import datetime
import json
import sys
import time
import warnings
from collections import Counter

import numpy as np
import pandas as pd

from sel_def_logger import MyLog


class bdci():
    """Data-preparation helper for the personal-loan default data sets.

    NOTE(review): the driver script at the bottom of this file also calls
    DataReader, getCommon_cols, getleft_cols, getCommon_colsdata,
    dateTransformer and propMapping. They are not part of this excerpt and
    are presumably defined elsewhere on this class -- confirm.
    """

    def __init__(self):
        # Locations of the raw CSV files (author's local Windows paths).
        self.train_bank_path = 'D:\\huangning\\DataSet\\个贷违约预测\\train_dataset\\train_public.csv'
        self.train_internet_path = 'D:\\huangning\\DataSet\\个贷违约预测\\train_dataset\\train_internet.csv'
        self.testData_path = 'D:\\huangning\\DataSet\\个贷违约预测\\test_public.csv'
        # Textual work-year labels mapped to integers.
        self.work_year_map = {
            '10+ years': 10,
            '2 years': 2,
            '< 1 year': 0,
            '3 years': 3,
            '1 year': 1,
            '5 years': 5,
            '4 years': 4,
            '6 years': 6,
            '8 years': 8,
            '7 years': 7,
            '9 years': 9,
        }
        self.logging = MyLog().logger

    def dataset_Fillnan(self, train_data, columns):
        """Interactively fill the missing values of one column.

        The user is asked on stdin which strategy to use. Only strategy
        ``"0"`` (fixed value) is implemented; it asks for the value, showing
        the three most common non-null values of the column as a suggestion.

        Args:
            train_data: DataFrame that holds the column; it is modified in
                place.
            columns: Name of the single column to fill (concatenated into
                prompts and log messages, so it must be a ``str``).

        Returns:
            ``train_data`` after filling when the fixed-value strategy is
            chosen; otherwise a message string saying that the chosen
            strategy is not available yet.
        """
        func_name = sys._getframe().f_code.co_name
        self.logging.info("bdci." + func_name + ".service MSG: ------------------------- 开始对数据集缺失值填充...... ------------------------")
        fillType = input("请选择您需要缺失值填充方式(eg: ['固定值填充':'0','前置值填充':'1']:")

        if fillType == "0":
            # Fixed-value fill. NaNs are dropped before counting: every NaN
            # would otherwise become its own Counter key, which is slow and
            # pollutes the suggestion list.
            suggestions = Counter(train_data[columns].dropna()).most_common(3)
            fixed_value = input("请输入" + columns + "列固定填充值:(eg: 推荐值:" + str(suggestions) + "):")
            self.logging.info("bdci." + func_name + "service MSG: 已获取缺失值填充参数:{'缺失值填充方法':'固定值填充','固定值':" + fixed_value + "},准备开始缺失值填充......")

            # Assign the result back instead of calling fillna(inplace=True)
            # on the column selection, which is not guaranteed to update the
            # parent DataFrame. The value is used exactly as typed (a str).
            train_data[columns] = train_data[columns].fillna(fixed_value)

            # Success means "no nulls left", not "column has no rows".
            if train_data[columns].isnull().any():
                self.logging.error("bdci." + func_name + "service MSG: 数据集列" + columns + "缺失值以固定值方式填充失败,请查看原因!")
            else:
                self.logging.info("bdci." + func_name + "service MSG: 数据集列" + columns + "缺失值以固定值方式填充成功!")
            return train_data

        if fillType == "1":
            # Previous-value fill: input() returns a str, so compare to "1".
            return "前值填充法 暂未开放......"

        # Any other strategy.
        return "其他填充法 暂未开放......"

    def dataset_FillBatch(self, DataSetName):
        """Fill the missing values of every column that contains nulls.

        Each column with at least one null is handed to ``dataset_Fillnan``,
        which prompts the user; columns without nulls are only logged.

        Args:
            DataSetName: DataFrame to process; it is modified in place.

        Returns:
            The processed DataFrame. When no column needs filling, or the
            user only picks strategies that are not available, this is
            ``DataSetName`` itself.
        """
        func_name = sys._getframe().f_code.co_name
        self.logging.info("bdci." + func_name + "service MSG: ---------------- 开始批量化替换空缺值 ---------------")

        # Default to the input so the return value is always a DataFrame.
        train_data = DataSetName
        for cols in DataSetName.columns:
            if DataSetName[cols].isnull().any():
                # NOTE (author's "bug1"): filling reportedly becomes slow
                # starting from column f0.
                filled = self.dataset_Fillnan(DataSetName, cols)
                if isinstance(filled, pd.DataFrame):
                    train_data = filled
                else:
                    # dataset_Fillnan returned a "not available" message;
                    # keep the data set and record why the column was skipped.
                    self.logging.error("bdci." + func_name + "service MSG: 数据集列" + cols + "未完成填充:" + str(filled))
            else:
                self.logging.info("bdci." + func_name + "service MSG: 当前列" + cols + "没有空值,继续遍历......")

        self.logging.info("bdci." + func_name + "service MSG: 数据集列遍历完成......")
        return train_data


### Instantiate the class (note: rebinds the name `bdci` to the instance)
bdci = bdci()

### Load the data sets
train_bank = bdci.DataReader("train_bank")
train_internet = bdci.DataReader("train_internet")
test_public = bdci.DataReader("test_public")

### Keep the label column name consistent across data sets
train_bank.rename(columns={'isDefault': 'is_default'}, inplace=True)

### Column names shared by both training sets
common_cols = bdci.getCommon_cols(train_bank, train_internet)
print(len(common_cols))

### Column names that are not shared
train_internet_left = bdci.getleft_cols("train_internet")
train_bank_left = bdci.getleft_cols("train_bank")

### Data restricted to the shared columns
train1_data = bdci.getCommon_colsdata("train_internet")
train2_data = bdci.getCommon_colsdata("train_bank")
test_data = bdci.getCommon_colsdata("test_public")

## Convert dates into a format pandas recognises
train1_data = bdci.dateTransformer(train1_data)
train2_data = bdci.dateTransformer(train2_data)

bdci.logging.info("开始处理数据填充---------- 数据集train1_data.csv -----------")
train1_data_filled = bdci.dataset_FillBatch(train1_data)
train1_data_mapped = bdci.propMapping(train1_data_filled)
train1_data_mapped.to_csv('data\\train1_data.csv', sep=',', header=True, index=True)

bdci.logging.info("开始处理数据填充---------- 数据集train2_data.csv -----------")
train2_data_filled = bdci.dataset_FillBatch(train2_data)
train2_data_mapped = bdci.propMapping(train2_data_filled)
train2_data_mapped.to_csv('data\\train2_data.csv', sep=',', header=True, index=True)
主要是方法:
dataset_FillBatch() 和
dataset_Fillnan()