1 import hashlib
2 import os
3 import tarfile
4 import zipfile
5 import requests
6
#@save
# Registry mapping a dataset name to its (url, sha1_hash) pair.
DATA_HUB = {}
# Base URL that the dataset file names are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
10
11
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return its local file name.

    Args:
        name: Key into DATA_HUB; the entry is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory the file is stored in (created if missing).

    Returns:
        Path of the local file, ``cache_dir`` joined with the last URL segment.

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        # Hash the cached copy in 1 MiB chunks so large files are never
        # loaded into memory at once.
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            for data in iter(lambda: f.read(1048576), b''):
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'正在从{url}下载{fname}...')
    with requests.get(url, stream=True, verify=True) as r:
        # Fail before touching the cache file so an HTTP error page is never
        # stored in place of the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Stream the body to disk instead of buffering it via r.content.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname
33
def download_extract(name, folder=None):  #@save
    """Download a zip/tar archive registered in DATA_HUB and extract it.

    The archive is extracted into the directory that holds the downloaded
    file.

    Args:
        name: Key into DATA_HUB, passed to ``download``.
        folder: Optional sub-directory name; when given, the returned path is
            that folder inside the extraction directory.

    Returns:
        ``<extract dir>/<folder>`` if ``folder`` is truthy, otherwise the
        archive path with its last extension removed.

    Raises:
        AssertionError: If the file extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        # Raised explicitly (not `assert False`) so the check survives
        # `python -O`, which strips assert statements.
        raise AssertionError('只有zip/tar文件可以被解压缩')
    # NOTE(review): extractall() trusts the member paths inside the archive;
    # only use this with archives from a trusted source.
    with fp:  # close the archive handle even if extraction fails
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir
47
def download_all():  #@save
    """Download every file registered in DATA_HUB."""
    for dataset_name in DATA_HUB:
        download(dataset_name)
52
53
54
55 import numpy as np
56 import pandas as pd
57 import torch
58 from torch import nn
59 from d2l import torch as d2l
60
61
# Register the Kaggle house-price train and test CSVs as (url, sha1) pairs.
DATA_HUB.update({  #@save
    'kaggle_house_train': (
        DATA_URL + 'kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce'),
    'kaggle_house_test': (
        DATA_URL + 'kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90'),
})
69
# Fetch both CSVs (train first, then test) and load them as DataFrames.
train_data, test_data = (
    pd.read_csv(download(key))
    for key in ('kaggle_house_train', 'kaggle_house_test')
)

# Show the shape of each frame, one per line.
print(train_data.shape, test_data.shape, sep='\n')

# Preview the first four rows: first four and last three columns.
print(train_data.iloc[:4, [0, 1, 2, 3, -3, -2, -1]])
77
# Drop the first column (the ID) from both frames. The training frame has one
# more column than the test frame -- the label in the last position -- which
# is dropped here as well.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

print(all_features.iloc[0:4, [0, 1, 2, 3, 4, 5, 6, 7, -3, -2, -1]])
print("___________________________________________________________________________________________-")
# Select the numeric feature columns. select_dtypes(include='number') is used
# rather than `dtypes != 'object'` so that text columns stored with a
# dedicated string dtype are not mistaken for numeric ones.
numeric_features = all_features.select_dtypes(include='number').columns
print(numeric_features.values)
# Standardize every numeric column: subtract its mean and divide by its
# standard deviation, putting all numeric features on a common scale
# (mean 0, standard deviation 1).
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# After standardization each column has mean 0, so filling missing values
# with 0 is equivalent to filling them with the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# One-hot encode the remaining (categorical) columns. dummy_na=True makes a
# missing value its own category. dtype=float keeps the indicator columns
# numeric: with the bool default of pandas >= 2.0 the frame mixes bool and
# float columns, `.values` becomes an object array, and torch.tensor() then
# refuses to convert it.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
# Shape of the feature frame after one-hot encoding:
print(all_features.shape)
94
# Pull the NumPy data out of pandas and convert it to float32 tensors.
# The first n_train rows of all_features came from the training frame.
n_train = len(train_data)
train_features = torch.tensor(
    all_features.iloc[:n_train].to_numpy(), dtype=torch.float32)
test_features = torch.tensor(
    all_features.iloc[n_train:].to_numpy(), dtype=torch.float32)
# Labels are reshaped into a single column, one row per sample.
train_labels = torch.tensor(
    train_data['SalePrice'].to_numpy().reshape(-1, 1), dtype=torch.float32)
# Training setup: mean-squared-error loss and the model's input width.
loss = nn.MSELoss()
in_features = train_features.size(1)
103
def get_net():
    """Return a fresh model: one linear layer mapping in_features to 1 output."""
    return nn.Sequential(nn.Linear(in_features, 1))
107
# What matters here is the relative error (y - y_hat) / y rather than the
# absolute error, so the metric is computed on logarithms: the predictions
# and the labels are both log-transformed before taking the RMSE.
def log_rmse(net, features, labels):
    """Return the RMSE between log(predictions) and log(labels) as a float.

    Args:
        net: Callable model; ``net(features)`` yields the predictions.
        features: Input tensor passed to ``net``.
        labels: Target tensor, passed through ``torch.log``.

    Returns:
        The metric as a Python float (``Tensor.item()``).
    """
    # This is a pure evaluation metric, so no autograd graph is needed.
    with torch.no_grad():
        # Clamp predictions below 1 up to 1 to keep the logarithm stable.
        # Unlike a sigmoid, which remaps every value into an interval, clamp
        # only touches values outside the bound and leaves the rest as is.
        clipped_preds = torch.clamp(net(features), min=1)
        rmse = torch.sqrt(loss(torch.log(clipped_preds),
                               torch.log(labels)))
    return rmse.item()
119
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train ``net`` with Adam and record the log-RMSE after every epoch.

    Args:
        net: Model to optimize in place.
        train_features, train_labels: Training tensors.
        test_features, test_labels: Evaluation tensors; evaluation is skipped
            when ``test_labels`` is None.
        num_epochs: Number of passes over the training iterator.
        learning_rate, weight_decay: Passed to ``torch.optim.Adam``.
        batch_size: Passed to ``d2l.load_array``.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE lists; the second list is
        empty when ``test_labels`` is None.
    """
    batches = d2l.load_array((train_features, train_labels), batch_size)
    # Author's note: Adam is used because it is smoother than SGD and less
    # sensitive to the choice of learning rate.
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    evaluate_test = test_labels is not None
    train_history, test_history = [], []
    for _ in range(num_epochs):
        for X, y in batches:
            optimizer.zero_grad()
            batch_loss = loss(net(X), y)
            batch_loss.backward()
            optimizer.step()
        # Metrics are computed on the full tensors once per epoch.
        train_history.append(log_rmse(net, train_features, train_labels))
        if evaluate_test:
            test_history.append(log_rmse(net, test_features, test_labels))
    return train_history, test_history
138
# K-fold cross-validation: build the train/validation split for one fold.
def get_k_fold_data(k, i, X, y):
    """Split ``X``/``y`` into ``k`` contiguous folds and hold out fold ``i``.

    Folds are consecutive row ranges of size ``X.shape[0] // k``. The last
    fold also takes the ``X.shape[0] % k`` leftover rows, so every sample
    lands in exactly one fold instead of being dropped.

    Args:
        k: Number of folds, must be greater than 1.
        i: Index of the validation fold, ``0 <= i < k``.
        X: Feature tensor indexed as ``X[rows, :]``.
        y: Label tensor with the same first dimension as ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``, where the training tensors
        are the other folds concatenated along dimension 0 in fold order.

    Raises:
        AssertionError: If ``k <= 1``.
        ValueError: If ``i`` is not a valid fold index.
    """
    assert k > 1
    if not 0 <= i < k:
        raise ValueError(f'fold index i must satisfy 0 <= i < {k}, got {i}')
    n_samples = X.shape[0]
    fold_size = n_samples // k  # floor division
    X_parts, y_parts = [], []
    X_valid, y_valid = None, None
    for j in range(k):
        start = j * fold_size
        # The final fold runs to the end so remainder rows are not lost.
        stop = n_samples if j == k - 1 else start + fold_size
        idx = slice(start, stop)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:  # fold i is the validation set
            X_valid, y_valid = X_part, y_part
        else:
            X_parts.append(X_part)
            y_parts.append(y_part)
    X_train = torch.cat(X_parts, 0)
    y_train = torch.cat(y_parts, 0)
    return X_train, y_train, X_valid, y_valid
155
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation and return the averaged final log-RMSEs.

    For each fold a fresh network from ``get_net`` is trained on the split
    produced by ``get_k_fold_data``. The learning curves of the first fold
    are plotted, and each fold's final metrics are printed.

    Returns:
        ``(mean_train, mean_valid)``: the last-epoch training and validation
        log-RMSE, each averaged over the ``k`` folds.
    """
    train_total, valid_total = 0, 0
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        model = get_net()
        train_curve, valid_curve = train(model, *fold_data, num_epochs,
                                         learning_rate, weight_decay,
                                         batch_size)
        last_train, last_valid = train_curve[-1], valid_curve[-1]
        train_total += last_train
        valid_total += last_valid
        if fold == 0:
            # Only the first fold's curves are drawn.
            epochs = list(range(1, num_epochs + 1))
            d2l.plot(epochs, [train_curve, valid_curve],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
            d2l.plt.show()
        print(f'折{fold + 1},训练log rmse{float(last_train):f}, '
              f'验证log rmse{float(last_valid):f}')
    return train_total / k, valid_total / k
174
# Hyperparameters for the cross-validation run.
k = 50
num_epochs = 100
lr = 5
weight_decay = 0
batch_size = 64

# Cross-validate on the training set and report the fold-averaged metrics.
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')