数据清洗和准备
一、处理缺失数据
data1 = pd.Series(['a','b',np.nan,'d'])
data1
Out:
0 a
1 b
2 NaN
3 d
dtype: object
data1.isnull()
Out:
0 False
1 False
2 True
3 False
dtype: bool
data1[0] = None
data1.isnull()
Out:
0 True
1 False
2 True
3 False
dtype: bool
滤除缺失数据
data1.dropna()
Out:
1 b
3 d
dtype: object
data1[data1.notnull()]
Out:
1 b
3 d
dtype: object
data2 = pd.DataFrame([[1.,6.5,3.],[1.,np.nan,np.nan],[np.nan,np.nan,np.nan],[np.nan,6.7,7.]])
data2
Out:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.7 7.0
data2.dropna()
Out:
0 1 2
0 1.0 6.5 3.0
# 丢弃全为nan的行
data2.dropna(how='all')
Out:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.7 7.0
data2.dropna(axis=1,how='all') # 丢弃全为NaN的列(data2中没有全为NaN的列,所以结果与原数据相同)
Out:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.7 7.0
df = pd.DataFrame(np.random.randn(7,3))
df
Out:
0 1 2
0 -1.617948 0.104032 2.028219
1 0.936990 0.318075 -0.475493
2 -0.955172 0.903966 1.072972
3 1.443032 0.371353 -0.326741
4 0.388359 0.054044 0.969463
5 0.717524 1.966696 1.008964
6 -0.305763 0.800637 1.161548
df.iloc[:4,1] = np.nan
df
Out:
0 1 2
0 -1.617948 NaN 2.028219
1 0.936990 NaN -0.475493
2 -0.955172 NaN 1.072972
3 1.443032 NaN -0.326741
4 0.388359 0.054044 0.969463
5 0.717524 1.966696 1.008964
6 -0.305763 0.800637 1.161548
df.iloc[:2,2] = np.nan
df
Out:
0 1 2
0 -1.617948 NaN NaN
1 0.936990 NaN NaN
2 -0.955172 NaN 1.072972
3 1.443032 NaN -0.326741
4 0.388359 0.054044 0.969463
5 0.717524 1.966696 1.008964
6 -0.305763 0.800637 1.161548
df.dropna()
Out:
0 1 2
4 0.388359 0.054044 0.969463
5 0.717524 1.966696 1.008964
6 -0.305763 0.800637 1.161548
df.dropna(thresh=2) # 只保留至少有2个非缺失值的行(即丢弃非缺失值少于2个的行)
Out:
0 1 2
2 -0.955172 NaN 1.072972
3 1.443032 NaN -0.326741
4 0.388359 0.054044 0.969463
5 0.717524 1.966696 1.008964
6 -0.305763 0.800637 1.161548
填充数据
df.fillna(0) # 将NaN填充成零
Out:
0 1 2
0 -1.617948 0.000000 0.000000
1 0.936990 0.000000 0.000000
2 -0.955172 0.000000 1.072972
3 1.443032 0.000000 -0.326741
4 0.388359 0.054044 0.969463
5 0.717524 1.966696 1.008964
6 -0.305763 0.800637 1.161548
df.fillna({1:0.9,2:0},inplace=True) # 按列标签填充:列1的缺失值填0.9,列2的缺失值填0;inplace=True 就地修改df,方法本身返回None(下面显示的是修改后的df)
Out:
0 1 2
0 -1.617948 0.900000 0.000000
1 0.936990 0.900000 0.000000
2 -0.955172 0.900000 1.072972
3 1.443032 0.900000 -0.326741
4 0.388359 0.054044 0.969463
5 0.717524 1.966696 1.008964
6 -0.305763 0.800637 1.161548
df
Out:
0 1 2
0 -1.617948 0.900000 0.000000
1 0.936990 0.900000 0.000000
2 -0.955172 0.900000 1.072972
3 1.443032 0.900000 -0.326741
4 0.388359 0.054044 0.969463
5 0.717524 1.966696 1.008964
6 -0.305763 0.800637 1.161548
df2 = pd.DataFrame(np.random.randn(6,3))
df2
Out:
0 1 2
0 1.242544 -0.764057 -0.182701
1 0.391988 0.826193 0.746514
2 0.063028 3.012261 0.371317
3 0.561835 1.380915 -1.038757
4 -0.317601 0.742460 2.828160
5 -0.623512 -0.351491 0.360894
df2.iloc[2:,1] = np.nan
df2.iloc[4:,2] = np.nan
df2
Out:
0 1 2
0 1.242544 -0.764057 -0.182701
1 0.391988 0.826193 0.746514
2 0.063028 NaN 0.371317
3 0.561835 NaN -1.038757
4 -0.317601 NaN NaN
5 -0.623512 NaN NaN
df2.fillna(method='ffill') # 前向填充:用同一列中上方最近的非缺失值填充NaN(pandas 2.1起method参数已弃用,推荐写成 df2.ffill())
Out:
0 1 2
0 1.242544 -0.764057 -0.182701
1 0.391988 0.826193 0.746514
2 0.063028 0.826193 0.371317
3 0.561835 0.826193 -1.038757
4 -0.317601 0.826193 -1.038757
5 -0.623512 0.826193 -1.038757
df2.fillna(method='ffill',limit=2) # limit=2:每列中连续的NaN最多向下填充2个,其余仍保持NaN
Out:
0 1 2
0 1.242544 -0.764057 -0.182701
1 0.391988 0.826193 0.746514
2 0.063028 0.826193 0.371317
3 0.561835 0.826193 -1.038757
4 -0.317601 NaN -1.038757
5 -0.623512 NaN -1.038757