Python 缺失值填充——固定值填充法


#!/bin/python2
###    Author: huangning      ###
## Email: tonyandrewhn@126.com ##
#################################
import warnings
import time
import sys
import datetime
import json
import pandas as pd
import numpy as np

from collections import Counter
#sys.path.append("D:\\huangning\\自用脚本\\bdci-dev\\sef_def_logger.py")
from sel_def_logger import MyLog
#import sef_def_logger
# warnings.filterwarnings('ignore')

class bdci():
    """Interactive missing-value filling for the loan-default datasets.

    Only the fixed-value strategy is implemented; the other strategies are
    placeholders that return a "not available yet" message.
    """

    def __init__(self):
        # Machine-specific absolute paths of the raw CSV files.
        self.train_bank_path = 'D:\\huangning\\DataSet\\个贷违约预测\\train_dataset\\train_public.csv'
        self.train_internet_path = 'D:\\huangning\\DataSet\\个贷违约预测\\train_dataset\\train_internet.csv'
        self.testData_path = 'D:\\huangning\\DataSet\\个贷违约预测\\test_public.csv'
        # Maps the textual work-year categories to integers.
        self.work_year_map = {
            '10+ years': 10,
            '2 years': 2,
            '< 1 year': 0,
            '3 years': 3,
            '1 year': 1,
            '5 years': 5,
            '4 years': 4,
            '6 years': 6,
            '8 years': 8,
            '7 years': 7,
            '9 years': 9,
        }
        self.logging = MyLog().logger

    @staticmethod
    def _coerce_fill_value(series, raw_value):
        """Return ``raw_value`` as a number when ``series`` is numeric.

        ``input()`` always yields text; writing that text into a numeric
        column would turn it into a mixed object column. Text that does not
        parse as a number, or any value for a non-numeric column, is returned
        unchanged.
        """
        if not pd.api.types.is_numeric_dtype(series):
            return raw_value
        for caster in (int, float):
            try:
                return caster(raw_value)
            except ValueError:
                continue
        return raw_value

    def dataset_Fillnan(self, train_data, columns):
        """Fill the missing values of one column, asking the user how.

        Args:
            train_data: DataFrame to fill; the column is replaced in place.
            columns: label of the single column to fill.

        Returns:
            ``train_data`` after a fixed-value fill, or a message string when
            the chosen strategy is not implemented yet.
        """
        tag = "bdci." + sys._getframe().f_code.co_name
        self.logging.info(tag + ".service MSG: ------------------------- 开始对数据集缺失值填充...... ------------------------")
        fillType = input("请选择您需要缺失值填充方式(eg: ['固定值填充':'0','前置值填充':'1']:").strip()

        if fillType == "0":
            # Fixed-value fill.
            column = train_data[columns]
            # Drop NaN before counting: NaN never equals itself, so every
            # missing cell would otherwise become its own Counter key.
            suggestions = Counter(column.dropna()).most_common(3)
            fixed_value = input("请输入" + str(columns) + "列固定填充值:(eg: 推荐值:" + str(suggestions) + "):")
            self.logging.info(tag + "service MSG: 已获取缺失值填充参数:{'缺失值填充方法':'固定值填充','固定值':" + fixed_value + "},准备开始缺失值填充......")
            # Assign the result back instead of a chained inplace fillna,
            # which may operate on a temporary copy and leave the frame as is.
            train_data[columns] = column.fillna(self._coerce_fill_value(column, fixed_value))
            if train_data[columns].isnull().any():
                self.logging.error(tag + "service MSG: 数据集列" + str(columns) + "缺失值以固定值方式填充失败,请查看原因!")
            else:
                self.logging.info(tag + "service MSG: 数据集列" + str(columns) + "缺失值以固定值方式填充成功!")
            return train_data
        if fillType == "1":
            # Previous-value fill: not implemented yet.
            return "前值填充法 暂未开放......"
        # Any other strategy: not implemented yet.
        return "其他填充法 暂未开放......"

    def dataset_FillBatch(self, DataSetName):
        """Walk every column and fill those that contain missing values.

        Args:
            DataSetName: DataFrame to process; filled in place.

        Returns:
            The DataFrame after processing. It is returned unchanged when no
            column has missing values or when no fill could be applied.
        """
        tag = "bdci." + sys._getframe().f_code.co_name
        self.logging.info(tag + "service MSG: ---------------- 开始批量化替换空缺值 ---------------")
        filled_data = DataSetName
        for cols in DataSetName.columns:
            if not filled_data[cols].isnull().any():
                self.logging.info(tag + "service MSG: 当前列" + str(cols) + "没有空值,继续遍历......")
                continue
            outcome = self.dataset_Fillnan(filled_data, cols)
            if isinstance(outcome, pd.DataFrame):
                filled_data = outcome
            else:
                # An unimplemented strategy returns a message, not data;
                # keep the dataset so later steps still receive a DataFrame.
                self.logging.warning(tag + "service MSG: 当前列" + str(cols) + "未填充:" + str(outcome))
        self.logging.info(tag + "service MSG: 数据集列遍历完成......")
        return filled_data


### Driver script: load the datasets, keep their shared columns, fill the
### missing values interactively and write the results to CSV.

### Instantiate the class.
### NOTE: this rebinds the module-level name `bdci` from the class to its
### single instance; everything below uses `bdci` as the instance.
bdci = bdci()

### Load the datasets (DataReader is not defined in the visible part of the
### file; presumably it reads the CSV registered under the given name).
train_bank = bdci.DataReader("train_bank")
train_internet = bdci.DataReader("train_internet")
test_public = bdci.DataReader("test_public")

### Keep the label column name consistent across datasets.
train_bank.rename(columns={'isDefault':'is_default'},inplace=True)


### Column names shared by both training sets.
common_cols = bdci.getCommon_cols(train_bank,train_internet)
print(len(common_cols))

### Column names that are not shared.
train_internet_left = bdci.getleft_cols("train_internet")
train_bank_left = bdci.getleft_cols("train_bank")

### Data restricted to the shared columns.
train1_data = bdci.getCommon_colsdata("train_internet")
train2_data = bdci.getCommon_colsdata("train_bank")
test_data = bdci.getCommon_colsdata("test_public")



## Convert the date columns to a format pandas recognises.
train1_data = bdci.dateTransformer(train1_data)
train2_data = bdci.dateTransformer(train2_data)


bdci.logging.info("开始处理数据填充---------- 数据集train1_data.csv -----------")
train1_data_filled = bdci.dataset_FillBatch(train1_data)

train1_data_mapped = bdci.propMapping(train1_data_filled)
train1_data_mapped.to_csv('data\\train1_data.csv', sep=',', header=True, index=True)


# This message previously named train1_data.csv although train2_data is the
# dataset being processed here.
bdci.logging.info("开始处理数据填充---------- 数据集train2_data.csv -----------")
train2_data_filled = bdci.dataset_FillBatch(train2_data)
train2_data_mapped = bdci.propMapping(train2_data_filled)
train2_data_mapped.to_csv('data\\train2_data.csv', sep=',', header=True, index=True)

主要方法是 dataset_FillBatch()(逐列批量填充)和 dataset_Fillnan()(对单列做固定值填充)。