机器学习笔记013 kaggle实战代码


  1 import hashlib
  2 import os
  3 import tarfile
  4 import zipfile
  5 import requests
  6 
  7 # @save
  8 DATA_HUB = dict()
  9 DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
 10 
 11 
 12 def download(name, cache_dir=os.path.join('..', 'data')):  #@save
 13     """下载一个DATA_HUB中的文件,返回本地文件名"""
 14     assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
 15     url, sha1_hash = DATA_HUB[name]
 16     os.makedirs(cache_dir, exist_ok=True)
 17     fname = os.path.join(cache_dir, url.split('/')[-1])
 18     if os.path.exists(fname):
 19         sha1 = hashlib.sha1()
 20         with open(fname, 'rb') as f:
 21             while True:
 22                 data = f.read(1048576)
 23                 if not data:
 24                     break
 25                 sha1.update(data)
 26         if sha1.hexdigest() == sha1_hash:
 27             return fname  # 命中缓存
 28     print(f'正在从{url}下载{fname}...')
 29     r = requests.get(url, stream=True, verify=True)
 30     with open(fname, 'wb') as f:
 31         f.write(r.content)
 32     return fname
 33 
 34 def download_extract(name, folder=None):  #@save
 35     """下载并解压zip/tar文件"""
 36     fname = download(name)
 37     base_dir = os.path.dirname(fname)
 38     data_dir, ext = os.path.splitext(fname)
 39     if ext == '.zip':
 40         fp = zipfile.ZipFile(fname, 'r')
 41     elif ext in ('.tar', '.gz'):
 42         fp = tarfile.open(fname, 'r')
 43     else:
 44         assert False, '只有zip/tar文件可以被解压缩'
 45     fp.extractall(base_dir)
 46     return os.path.join(base_dir, folder) if folder else data_dir
 47 
 48 def download_all():  #@save
 49     """下载DATA_HUB中的所有文件"""
 50     for name in DATA_HUB:
 51         download(name)
 52 
 53 
 54 
 55 import numpy as np
 56 import pandas as pd
 57 import torch
 58 from torch import nn
 59 from d2l import torch as d2l
 60 
 61 
 62 DATA_HUB['kaggle_house_train'] = (  #@save
 63     DATA_URL + 'kaggle_house_pred_train.csv',
 64     '585e9cc93e70b39160e7921475f9bcd7d31219ce')
 65 
 66 DATA_HUB['kaggle_house_test'] = (  #@save
 67     DATA_URL + 'kaggle_house_pred_test.csv',
 68     'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
 69 
 70 train_data = pd.read_csv(download('kaggle_house_train'))
 71 test_data = pd.read_csv(download('kaggle_house_test'))
 72 
 73 print(train_data.shape)
 74 print(test_data.shape)
 75 
 76 print(train_data.iloc[0:4,[0,1,2,3,-3,-2,-1]])
 77 
 78 #删掉第一行ID,另外train比test多一列,也就是label结果,在这里也删掉他
 79 all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))
 80 
 81 print(all_features.iloc[0:4,[0,1,2,3,4,5,6,7,-3,-2,-1]])
 82 print("___________________________________________________________________________________________-")
 83 # 过滤出我们的数值特征
 84 numeric_features=all_features.dtypes[all_features.dtypes!='object'].index
 85 print(numeric_features.values)
 86 # 把数字特征中的每一个这一列,减去均值除以方差,用来标准化,将所有的特征放在一个尺度上,这个尺度的均值为0,方差为1
 87 all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
 88 # 在标准化数据之后,所有均值消失,因此我们可以将缺失值设置为0
 89 all_features[numeric_features] = all_features[numeric_features].fillna(0)
 90 
 91 all_features=pd.get_dummies(all_features,dummy_na=True) #dummy_na代表unKnown也算一个特征
 92 # oneHot编码之后的特征:
 93 print(all_features.shape)
 94 
 95 # 从pandas提取出NumPy格式,并将其转换为张量表示
 96 n_train=train_data.shape[0]
 97 train_features=torch.tensor(all_features[:n_train].values, dtype=torch.float32)
 98 test_features=torch.tensor(all_features[n_train:].values, dtype=torch.float32)
 99 train_labels=torch.tensor(train_data.SalePrice.values.reshape(-1,1),dtype=torch.float32)
100 # 开始训练!!!
101 loss=nn.MSELoss()
102 in_features=train_features.shape[1]
103 
def get_net():
    """Build a new single-layer linear model: ``in_features`` inputs, 1 output."""
    return nn.Sequential(nn.Linear(in_features, 1))
107 
# Relative error, i.e. (y - y_hat) / y, is what matters for prices.
# It is approximated here by comparing logarithms: the metric is the RMSE
# between log(prediction) and log(label).
def log_rmse(net, features, labels):
    """Return the RMSE between the logs of the predictions and the labels.

    Args:
        net: Model called as ``net(features)`` to produce predictions.
        features: Input tensor passed to ``net``.
        labels: Target tensor; its logarithm is taken.

    Returns:
        The metric as a Python float.
    """
    # This is evaluation only, so skip autograd bookkeeping for the forward
    # pass; the numeric result is unchanged.
    with torch.no_grad():
        # To keep the logarithm stable, predictions below 1 are raised to 1.
        # clamp only touches values outside [1, inf); the rest pass through
        # unchanged (unlike a squashing function such as sigmoid).
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        rmse = torch.sqrt(loss(torch.log(clipped_preds),
                               torch.log(labels)))
    return rmse.item()
119 
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train ``net`` with Adam and record the log-RMSE after every epoch.

    Args:
        net: Model to optimize in place.
        train_features, train_labels: Training tensors.
        test_features, test_labels: Evaluation tensors; evaluation is
            skipped when ``test_labels`` is None.
        num_epochs: Number of passes over the training data.
        learning_rate, weight_decay: Passed to the Adam optimizer.
        batch_size: Mini-batch size for the training iterator.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE lists; ``test_ls`` stays
        empty when no test labels are given.
    """
    train_ls = []
    test_ls = []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    # Adam is used here; per the author it is smoother than SGD and less
    # sensitive to the learning rate.
    optimizer = torch.optim.Adam(
        net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    evaluate_test = test_labels is not None
    for _ in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            batch_loss = loss(net(X), y)
            batch_loss.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if evaluate_test:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
138 
# K-fold cross-validation: build the train/validation split for one fold.
def get_k_fold_data(k, i, X, y):
    """Return the training and validation data for fold ``i`` out of ``k``.

    The first ``k * (X.shape[0] // k)`` rows are cut into ``k`` contiguous,
    equally sized folds; leftover rows at the end are not used.

    Args:
        k: Number of folds; must be greater than 1.
        i: Index of the fold used for validation, in ``range(k)``.
        X: Feature tensor, indexed as ``X[rows, :]``.
        y: Label tensor, indexed by row.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)`` where the training tensors
        are the other ``k - 1`` folds concatenated along dimension 0.

    Raises:
        AssertionError: If ``k <= 1``.
        ValueError: If ``i`` is not in ``range(k)``.
    """
    assert k > 1
    if not 0 <= i < k:
        # Without this check no fold is picked for validation and the
        # function would fail with an unbound-variable error.
        raise ValueError(f'fold index i={i} must be in range({k})')
    fold_size = X.shape[0] // k  # integer division
    X_parts, y_parts = [], []
    X_valid, y_valid = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)  # rows of fold j
        if j == i:  # fold i is held out as the validation set
            X_valid, y_valid = X[idx, :], y[idx]
        else:
            X_parts.append(X[idx, :])
            y_parts.append(y[idx])
    # Concatenate once at the end instead of re-copying the growing
    # training tensor on every iteration.
    return torch.cat(X_parts, 0), torch.cat(y_parts, 0), X_valid, y_valid
155 
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation and return the averaged final log-RMSEs.

    For each fold a fresh model from ``get_net()`` is trained on the other
    folds and evaluated on the held-out one. The learning curves of the
    first fold are plotted, and each fold's final metrics are printed.

    Returns:
        ``(mean_train_log_rmse, mean_valid_log_rmse)``, the averages over the
        ``k`` folds of the last-epoch values.
    """
    final_train_ls, final_valid_ls = [], []
    epochs_axis = list(range(1, num_epochs + 1))
    for i in range(k):
        fold_data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *fold_data, num_epochs,
                                   learning_rate, weight_decay, batch_size)
        final_train_ls.append(train_ls[-1])
        final_valid_ls.append(valid_ls[-1])
        if i == 0:
            # Only the first fold's curves are drawn.
            d2l.plot(epochs_axis, [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
            d2l.plt.show()
        print(f'折{i + 1},训练log rmse{float(train_ls[-1]):f}, '
              f'验证log rmse{float(valid_ls[-1]):f}')
    return sum(final_train_ls) / k, sum(final_valid_ls) / k
174 
# Hyperparameters for the cross-validation run.
k = 50
num_epochs = 100
lr = 5
weight_decay = 0
batch_size = 64

train_l, valid_l = k_fold(
    k, train_features, train_labels, num_epochs, lr, weight_decay,
    batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')