Crawling the National Bureau of Statistics 2020 administrative division data
Reference: https://blog.csdn.net/qlx119/article/details/105289974
Create the tab_citys table in MySQL:
DROP TABLE IF EXISTS `tab_citys`;
CREATE TABLE `tab_citys` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `parent_id` int(11) DEFAULT NULL,
  `city_name_zh` varchar(20) NOT NULL,
  `city_name_en` varchar(20) DEFAULT NULL,
  `city_level` int(11) NOT NULL,
  `city_code` char(12) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=742037 DEFAULT CHARSET=utf8;
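Before wiring up the crawler, it can be worth confirming that the database and table are reachable with the same connection settings the script below uses. A minimal sanity-check sketch, assuming a local MySQL with user root, password root, and database xzqh (the same assumptions the script makes):

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Sanity check: confirm the xzqh database is reachable and tab_citys exists.
# Assumes the same credentials as the crawler script (root/root on localhost).
import pymysql

conn = pymysql.connect(host="localhost", port=3306, user="root",
                       passwd="root", db="xzqh", charset='utf8')
cur = conn.cursor()
cur.execute("SHOW COLUMNS FROM tab_citys")
for column in cur.fetchall():
    print(column)  # one tuple per column: name, type, nullability, key, ...
cur.close()
conn.close()

If this prints the six columns defined above, the script's hard-coded connection parameters will work as-is.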
Create a Python script named xzqh.py:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Purpose: fetch province/city/county data
# Version: v1.1
import os

import pymysql
import requests
import lxml.etree as etree


class chinese_city():
    # set up the base URLs, the database connection, and the tr-class map per level
    def __init__(self):
        self.baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
        self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
        self.conn = pymysql.connect(host="localhost", port=3306, user="root",
                                    passwd="root", db="xzqh", charset='utf8')
        self.cur = self.conn.cursor()
        # XPath per administrative level: 1 province, 2 city, 3 county, 4 town, 5 village
        self.trdic = {
            1: '//tr[@class="provincetr"]',
            2: '//tr[@class="citytr"]',
            3: '//tr[@class="countytr"]',
            4: '//tr[@class="towntr"]',
            5: '//tr[@class="villagetr"]'
        }

    def __del__(self):
        if self.cur:
            self.cur.close()
        if self.conn:
            self.conn.close()

    def crawl_page(self, url):
        '''Fetch one page of the division-code site, retrying up to 3 times.'''
        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
        i = 0
        while i < 3:
            try:
                html = requests.get(url, headers=headers, timeout=20)
                html.encoding = 'gbk'  # the site serves GBK-encoded pages
                return html.text
            except requests.exceptions.RequestException:
                i += 1
                print('timeout: ' + url)

    # parse the province index page and return a list of provinces
    def parseProvince(self):
        html = self.crawl_page(self.baseUrl)
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        nodes = tree.xpath('//tr[@class="provincetr"]')
        id = 1
        values = []
        for node in nodes:
            items = node.xpath('./td')
            for item in items:
                value = {}
                nexturl = item.xpath('./a/@href')
                province = item.xpath('./a/text()')
                value['url'] = self.base + "".join(nexturl)
                value['name'] = "".join(province)
                value['code'] = 0
                value['pid'] = 0
                value['id'] = id
                value['level'] = 1
                print(repr(value['name']))
                id = id + 1
                last_id = self.insert_to_db(value)
                value['id'] = last_id  # replace the local counter with the database row id
                values.append(value)
                print(value)
        return values

    # parse a child page for the given level (2 city, 3 county, 4 town)
    def parse(self, trid, pid, url):
        if url.strip() == '':
            return None
        html = self.crawl_page(url)
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))

        nodes = tree.xpath(self.trdic.get(trid))
        if trid == 3 and len(nodes) == 0:
            # some prefecture-level cities have no county layer and list towns directly
            nodes = tree.xpath(self.trdic.get(4))
            print('city with towns directly under it: ' + url)

        path = os.path.basename(url)
        base_url = url.replace(path, '')  # child links are relative to the current page
        id = 1
        values = []
        for node in nodes:
            value = {}
            nexturl = node.xpath('./td[1]/a/@href')
            code = node.xpath('./td[1]/a/text()')
            if len(code) == 0:
                code = node.xpath('./td[1]/text()')  # leaf rows carry plain text, no link
            name = node.xpath('./td[2]/a/text()')
            if len(name) == 0:
                name = node.xpath('./td[2]/text()')
            value['code'] = "".join(code)
            urltemp = "".join(nexturl)
            value['url'] = base_url + urltemp if urltemp else ''
            value['name'] = "".join(name)
            print(repr(value['name']))
            value['id'] = id
            value['pid'] = pid
            value['level'] = trid
            id = id + 1
            last_id = self.insert_to_db(value)
            value['id'] = last_id
            values.append(value)
            print(value)
        return values

    # parse a village (committee) page; its rows have three plain-text columns
    def parseVillager(self, trid, pid, url):
        if url.strip() == '':
            return None
        html = self.crawl_page(url)
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
        nodes = tree.xpath(self.trdic.get(trid))
        id = 1
        values = []
        for node in nodes:
            value = {}
            nexturl = node.xpath('./td[1]/a/@href')  # village rows normally have no link
            code = node.xpath('./td[1]/text()')
            vcode = node.xpath('./td[2]/text()')  # urban-rural classification code, unused here
            name = node.xpath('./td[3]/text()')
            value['code'] = "".join(code)
            value['url'] = "".join(nexturl)
            value['name'] = "".join(name)
            print(repr(value['name']))
            value['id'] = id
            value['pid'] = pid
            value['level'] = trid
            id = id + 1
            last_id = self.insert_to_db(value)
            value['id'] = last_id
            values.append(value)
            print(value)
        return values

    # insert one record and return the auto-increment id of the new row
    def insert_to_db(self, taobao):
        lastid = 0
        try:
            sql = 'INSERT INTO tab_citys VALUES(%s, %s, %s, %s, %s, %s)'
            param = (0, taobao.get("pid"), taobao.get("name"), '',
                     taobao.get("level"), taobao.get("code"))
            self.cur.execute(sql, param)
            lastid = self.cur.lastrowid
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return lastid

    # walk the whole hierarchy from provinces down to villages
    def parseChineseCity(self):
        values = self.parseProvince()
        for value in values:
            citys = self.parse(2, value['id'], value['url'])
            if citys is not None:
                for city in citys:
                    countys = self.parse(3, city['id'], city['url'])
                    # everything below fetches town and village data; delete it if not needed
                    if countys is not None:
                        for county in countys:
                            towns = self.parse(4, county['id'], county['url'])
                            if towns is not None:
                                for town in towns:
                                    self.parseVillager(5, town['id'], town['url'])


if __name__ == '__main__':
    chinese_city = chinese_city()
    chinese_city.parseChineseCity()
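After a run completes, the parent_id column encodes the whole tree, so the data can be walked level by level with plain SQL. A minimal sketch (assuming the same local root/root credentials as the script) that lists each province with the number of its prefecture-level cities:

import pymysql

conn = pymysql.connect(host="localhost", port=3306, user="root",
                       passwd="root", db="xzqh", charset='utf8')
cur = conn.cursor()
# Provinces were inserted with city_level = 1; their children carry the
# province's auto-increment id in parent_id.
cur.execute("SELECT id, city_name_zh FROM tab_citys WHERE city_level = 1")
for province_id, province_name in cur.fetchall():
    cur.execute("SELECT city_name_zh, city_code FROM tab_citys "
                "WHERE parent_id = %s AND city_level = 2", (province_id,))
    cities = cur.fetchall()
    print(province_name, len(cities), 'cities')
cur.close()
conn.close()

The same pattern (filter on parent_id and city_level) extends down to counties, towns, and villages.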
If the interpreter complains about missing libraries, install them with pip:
pip install pymysql
pip install lxml
pip install requests
Run the script:
python ./xzqh.py
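A full crawl down to village level writes on the order of 700,000 rows (the dump above shows AUTO_INCREMENT=742037), so it can take a long time. A small sketch (same assumed credentials) to check progress from a second terminal by counting rows per administrative level:

import pymysql

conn = pymysql.connect(host="localhost", port=3306, user="root",
                       passwd="root", db="xzqh", charset='utf8')
cur = conn.cursor()
# city_level: 1 province, 2 city, 3 county, 4 town, 5 village
cur.execute("SELECT city_level, COUNT(*) FROM tab_citys GROUP BY city_level")
for level, count in cur.fetchall():
    print('level', level, ':', count, 'rows')
cur.close()
conn.close()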
Good luck!