爬虫系列之requests
爬取百度内容:
# Fetch the Baidu homepage and print its decoded HTML.
import requests

url = "https://www.baidu.com"

if __name__ == '__main__':
    try:
        # Spoof a browser User-Agent; some sites reject requests' default one.
        kv = {'user-agent': 'Mozilla/5.0'}
        # timeout guards against hanging forever on a stalled connection
        r = requests.get(url, headers=kv, timeout=10)
        r.raise_for_status()  # raise HTTPError for any non-2xx status code
        # Use the encoding guessed from the body, not the header default,
        # so Chinese text is decoded correctly.
        r.encoding = r.apparent_encoding
        print(r.text)
        # print(r.request.headers)
    except requests.exceptions.RequestException:
        # Catch only requests errors; a bare `except:` would also swallow
        # KeyboardInterrupt and programming errors.
        print("爬虫失败")
在URL中填上 http://www.baidu.com/s?wd=keyword,其中 keyword 就是我们要在百度搜索的内容。requests 提供了 params 参数,可以自动把查询参数编码并追加到 URL 中。
# Perform a Baidu search by appending the query via the `params` argument
# (equivalent to http://www.baidu.com/s?wd=keyword) and print the result size.
import requests

url = "http://www.baidu.com/s"
keyword = "python"

if __name__ == '__main__':
    try:
        # Spoof a browser User-Agent; some sites reject requests' default one.
        kv = {'user-agent': 'Mozilla/5.0'}
        # requests url-encodes this dict and appends it as ?wd=<keyword>
        wd = {'wd': keyword}
        # timeout guards against hanging forever on a stalled connection
        r = requests.get(url, headers=kv, params=wd, timeout=10)
        print(r.request.url)  # show the final URL with the query string attached
        r.raise_for_status()  # raise HTTPError for any non-2xx status code
        r.encoding = r.apparent_encoding
        print(len(r.text))
    except requests.exceptions.RequestException:
        # Catch only requests errors; a bare `except:` would also swallow
        # KeyboardInterrupt and programming errors.
        print("爬虫失败")
爬取图片
# Download an image to a local directory, skipping the download when the
# file already exists.
import requests
import os

url = "http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg"

# Bug fix: the original used {'header': 'Mozilla/5.0'}, which sends a
# meaningless "header" header — the key must be 'user-agent'.
kv = {'user-agent': 'Mozilla/5.0'}
root = "D://pic_save//"
# Name the local file after the last path segment of the URL.
path = root + url.split('/')[-1]

if __name__ == '__main__':
    try:
        if not os.path.exists(root):
            # makedirs also creates missing parent directories
            os.makedirs(root)
        if not os.path.exists(path):
            r = requests.get(url, headers=kv, timeout=10)
            # Fail fast on HTTP errors instead of saving an error page as a .jpg.
            r.raise_for_status()
            print(r.status_code)
            with open(path, 'wb') as f:
                f.write(r.content)
            print("文件已保存成功")
        else:
            print("文件已存在")
    except (requests.exceptions.RequestException, OSError):
        # Bug fix: the original branch was the bare expression ("爬虫失败"),
        # which evaluates the string and prints nothing.
        print("爬虫失败")