Python 爬虫
需要引入的包
import requests
import re
import time
import tldextract
主要方法
# Fetch the Baidu News front page.
# Spoof a desktop Chrome User-Agent so the site serves the regular HTML page
# instead of a bot-blocked or mobile variant.
kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'} # request headers
url = "http://news.baidu.com" # target URL
r = requests.get(url, headers=kv) # send the request and fetch the page
r.encoding = r.apparent_encoding # let requests guess the charset from the body, avoiding mojibake
html = r.text # decoded page content; consumed by the link-extraction step below
以百度新闻为例
# Extract every href value from the page (both double- and single-quoted
# attributes), then keep only external https links.
# NOTE(review): the original regex literal was garbled by HTML extraction;
# reconstructed from the surviving `(?<=` lookbehind prefix — confirm against
# the original tutorial source.
links = re.findall(r'(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')', html)
news_links = []
for link in links:
    # Skip non-https links (relative paths, javascript:, plain http, ...).
    if not link.startswith('https'):
        continue
    # Skip Baidu's own pages — we only want external news sites.
    tld = tldextract.extract(link)
    if tld.domain == "baidu":
        continue
    # Downgrade the scheme: only the first "https" occurrence is replaced,
    # so an "https" appearing later in the URL path is left untouched.
    news_links.append(link.replace("https", "http", 1))
# Download each collected news page and persist it.
for link in news_links:
    # Use a distinct local name so we do not shadow the front-page `html`
    # variable defined above.
    page_html = requests.get(link, headers=kv).text
    # save_to_db is defined elsewhere in the project; stores (url, html).
    save_to_db(link, page_html)