import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
# Load the Titanic passenger data.
data = pd.read_csv("./Taitanic data/data.csv")
# Note: the label column is "Survived", and it is NOT the last column.

# Inspect the data: shows each column's dtype and its non-null count.
# Age and Cabin have missing values and need dedicated handling.
# NOTE(review): the info() call was missing from the exported source; it is
# restored here because the text below is DataFrame.info() output.
data.info()
# Output captured from the original notebook run (kept as comments so that
# the file parses):
#   RangeIndex: 891 entries, 0 to 890
#   Data columns (total 12 columns):
#   PassengerId 891 non-null int64
#   Survived 891 non-null int64
#   Pclass 891 non-null int64
#   Name 891 non-null object
#   Sex 891 non-null object
#   Age 714 non-null float64
#   SibSp 891 non-null int64
#   Parch 891 non-null int64
#   Ticket 891 non-null object
#   Fare 891 non-null float64
#   Cabin 204 non-null object
#   Embarked 889 non-null object
#   dtypes: float64(2), int64(5), object(5)
#   memory usage: 83.7+ KB
# Drop Name (little influence), Cabin (too many missing values) and
# Ticket (not much use here either).
data.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)
# Embarked has only two missing values, so drop just those two rows.
# .copy() makes the filtered frame independent of the original, so the
# column assignments below do not raise SettingWithCopyWarning.
data = data[data['Embarked'].notna()].copy()
# Distinct Embarked values seen in the original run: ['S', 'C', 'Q']
# Encode Sex and Embarked as integers.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
# Summary captured from the original notebook run after encoding (kept as
# comments so that the file parses):
#   Int64Index: 889 entries, 0 to 890
#   Data columns (total 9 columns):
#   PassengerId 889 non-null int64
#   Survived 889 non-null int64
#   Pclass 889 non-null int64
#   Sex 889 non-null int64
#   Age 712 non-null float64
#   SibSp 889 non-null int64
#   Parch 889 non-null int64
#   Fare 889 non-null float64
#   Embarked 889 non-null int64
#   dtypes: float64(2), int64(7)
#   memory usage: 69.5 KB
# Fill missing ages; median or mean would both work, the median is used here.
data.loc[data['Age'].isna(), 'Age'] = data['Age'].median()
# Frame shape in the original run: 889 rows × 9 columns
# Separate the feature columns from the label column.
# NOTE(review): PassengerId is still among the features at this point --
# confirm that keeping an identifier column is intended.
X = data.drop('Survived', axis=1)
y = data['Survived']
# Feature frame shape in the original run: 889 rows × 8 columns
from sklearn.model_selection import train_test_split
# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# The split is random, so reset every index to 0..n-1 to make later
# positional selection convenient.
for part in (X_train, X_test, y_train, y_test):
    part.index = range(part.shape[0])
# Training feature shape in the original run: 711 rows × 8 columns
from sklearn.tree import DecisionTreeClassifier
# Baseline: a decision tree with default hyper-parameters, fitted on the
# training split and scored (mean accuracy) on the held-out test split.
clf = DecisionTreeClassifier(random_state=2)
clf = clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
from sklearn.model_selection import cross_val_score
# Compare training accuracy with 10-fold cross-validated accuracy for tree
# depths 1..9 to see where the model starts to overfit.
model_scores = []  # accuracy on the training split, one entry per depth
cross_vscors = []  # mean 10-fold CV accuracy on the full data, one per depth
for depth in range(1, 10):
    clf = DecisionTreeClassifier(random_state=25
                                 , max_depth=depth
                                 # , criterion='entropy'  # entropy is usually considered when the model underfits
                                 )
    clf = clf.fit(X_train, y_train)
    score_tr = clf.score(X_train, y_train)
    # cross_val_score clones and refits the estimator on each fold of (X, y).
    cross_tr = cross_val_score(clf, X, y, cv=10).mean()
    # Record both scores; without this the plots below have nothing to draw
    # and fail on the x/y length mismatch.
    model_scores.append(score_tr)
    cross_vscors.append(cross_tr)
# The green curve is labelled 'test' but holds the cross-validation scores.
plt.plot(range(1, 10), model_scores, color='red', label='train')
plt.plot(range(1, 10), cross_vscors, color='green', label='test')
plt.xticks(range(1, 11))
plt.legend(loc='upper left')
# Display the figure when this runs as a plain script instead of a notebook.
plt.show()
import numpy as np
# gini_thresholds = np.linspace(0, 0.5, 50)  # usual value range for the Gini impurity
# entropy_threholds = np.linspace(0, 1, 50)
# Parameter grid handed to GridSearchCV; because the grid supplies these
# values, the estimator itself is instantiated without them.
parameters = {"splitter": ('best', 'random')
              , "criterion": ("gini", "entropy")
              , "min_samples_leaf": [*range(1, 50, 5)]
              , "min_impurity_decrease": [*np.linspace(0, 0.5, 20)]  # hard to tune without a grid search
              , "max_depth": [*range(1, 10)]
              }
clf = DecisionTreeClassifier(random_state=25)
# Exhaustive search: 2 * 2 * 10 * 20 * 9 = 7200 combinations, 10 folds each.
GS = GridSearchCV(clf, parameters, cv=10).fit(X_train, y_train)
# Partial repr of the fitted search from the original notebook run (already
# truncated by the export; kept as comments so that the file parses):
#   GridSearchCV(cv=10, error_score='raise-deprecating',
#   criterion='gini', max_depth=None,
#   presort=False, random_state=25,
#   0.47368421052631576, 0.5],
#   'min_samples_leaf': [1, 6, 11, 16, 21, 26, 31, 36, 41,
#   'splitter': ('best', 'random')},
#   pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
#   scoring=None, verbose=0)
# Best parameter combination found by the search.
print(GS.best_params_)
# Value in the original run:
#   {'criterion': 'gini',
#   'max_depth': 3,
#   'min_impurity_decrease': 0.0,
#   'min_samples_leaf': 1,
#   'splitter': 'best'}
# Best mean cross-validated score.
print(GS.best_score_)
# Drawback of grid search: every parameter supplied in the grid is always
# used. It never drops a parameter on its own, even though a model that
# leaves some of them out may sometimes perform better.