A novel scraper — using https://www.tadu.com/book/catalogue/891710/ (the novel's table of contents) as an example
- Target website: https://www.tadu.com/book/catalogue/891710/
- Target content: the main text of the novel《没有爸爸也能活》, Chapter 1 through Chapter 31.
- Task requirements: write two crawlers. Crawler 1 reads the catalogue page at https://www.tadu.com/book/catalogue/891710/ and stores every chapter URL in Redis; crawler 2 pops the URLs from Redis, fetches each chapter's text, and saves it to MongoDB.
1. First, fetch the catalogue page source with requests and use XPath to extract the chapter URLs, pushing each one onto the Redis list url_queue:
def get_source(self, url, headers):  # fetch the page source of a URL
    return requests.get(url, headers=headers).content.decode()

def url2redis(self):  # store the chapter URLs from the catalogue page in Redis
    source = self.get_source(self.url, self.HEADERS)
    selector = lxml.html.fromstring(source)  # build the parse tree
    url_lst = selector.xpath('//div[@class="chapter clearfix"]/a/@href')
    for url in url_lst:
        url = 'https://www.tadu.com' + url.strip()
        self.client1.lpush('url_queue', url)
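After this step the queue can be sanity-checked directly in Redis. The few lines below are only an illustrative check, not part of the original scripts, and assume Redis is running locally on its default port:

import redis

client = redis.StrictRedis()
print(client.llen('url_queue'))           # how many chapter URLs are queued
print(client.lrange('url_queue', 0, 4))   # first few entries (returned as bytes)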
2. Next, pop the chapter URLs from url_queue one by one. Because the chapter text is loaded asynchronously, the first version used Selenium: webdriver drives Chrome so that the JavaScript runs, the data is received, and the rendered page source can be retrieved.
def wait(url):  # fetch the rendered page source of a chapter URL
    # chromedriver.exe drives Chrome so the JS-loaded part of the page gets rendered
    driver = webdriver.Chrome(r'.\chromedriver.exe')
    driver.get(url)  # open the page
    try:
        # wait up to 30 s for the element with class "read_details" to appear
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "read_details")))
    except Exception:
        print('Page took too long to load.')
    html = driver.page_source  # rendered page source
    driver.quit()              # close the browser to release memory
    return html
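One common way to cut down the overhead of opening a visible browser window for every chapter is to run Chrome headless. The variant below is only a sketch of that idea and is not part of the original code; it assumes the same Selenium-3-style webdriver.Chrome(executable_path, options=...) call used above:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

def wait_headless(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')     # render the page without a visible window
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(r'.\chromedriver.exe', options=options)
    driver.get(url)
    try:
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'read_details')))
    except Exception:
        print('Page took too long to load.')
    html = driver.page_source
    driver.quit()
    return html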
3. Finally, take the rendered source of each chapter, extract the chapter title and body text with XPath, append them to a list of dicts, and insert that list into MongoDB:
def article2mongodb(self):  # write the content of every queued chapter into MongoDB
    while self.client1.llen('url_queue') > 0:
        url = self.client1.lpop('url_queue').decode()
        html = wait(url)
        selector = lxml.html.fromstring(html)
        chapter_name = selector.xpath('//div[@class="clearfix"]/h4/text()')[0]
        content = selector.xpath('//div[@id="partContent"]/p/text()')
        self.content_lst.append({'title': chapter_name, 'content': content})
    self.handler.insert_many(self.content_lst)
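Once the run finishes, the result can be checked straight from MongoDB. The snippet below is only an illustrative check, not part of the original scripts; it assumes MongoDB is running locally and uses the database and collection names from the source ('spicer' / 'article'):

from pymongo import MongoClient

handler = MongoClient()['spicer']['article']
print(handler.count_documents({}))        # number of stored chapters
doc = handler.find_one()
if doc:
    print(doc['title'])                   # chapter title
    print(''.join(doc['content'])[:100])  # first 100 characters of the body text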
4. Full source:
# -*- coding: utf-8 -*-
# @Time    : 2022/3/24 17:21
# @Author  : ck
# @File    : get_article.py
# @Software: PyCharm
import requests                                          # HTTP requests
import lxml.html                                         # HTML parsing
import redis                                             # Redis client
from pymongo import MongoClient                          # MongoDB client
from selenium import webdriver                           # browser driver
from selenium.webdriver.support.ui import WebDriverWait  # explicit waits
from selenium.webdriver.common.by import By              # element locators
from selenium.webdriver.support import expected_conditions as EC  # wait conditions


def wait(url):  # fetch the rendered page source of a chapter URL
    # chromedriver.exe drives Chrome so the JS-loaded part of the page gets rendered
    driver = webdriver.Chrome(r'.\chromedriver.exe')
    driver.get(url)  # open the page
    try:
        # wait up to 30 s for the element with class "read_details" to appear
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "read_details")))
    except Exception:
        print('Page took too long to load.')
    html = driver.page_source  # rendered page source
    driver.quit()              # close the browser to release memory
    return html


class get_article(object):  # crawler class: catalogue -> Redis -> chapter text -> MongoDB
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }  # request headers

    def __init__(self, url):
        self.url = url
        self.content_lst = []
        self.client1 = redis.StrictRedis()  # connect to the local Redis server
        self.handler = db['article']        # collection handle (db is created in __main__)
        self.url2redis()
        self.article2mongodb()

    def get_source(self, url, headers):  # fetch the page source of a URL
        return requests.get(url, headers=headers).content.decode()

    def url2redis(self):  # store the chapter URLs from the catalogue page in Redis
        source = self.get_source(self.url, self.HEADERS)
        selector = lxml.html.fromstring(source)  # build the parse tree
        url_lst = selector.xpath('//div[@class="chapter clearfix"]/a/@href')
        for url in url_lst:
            url = 'https://www.tadu.com' + url.strip()
            self.client1.lpush('url_queue', url)

    def article2mongodb(self):  # write the content of every queued chapter into MongoDB
        while self.client1.llen('url_queue') > 0:
            url = self.client1.lpop('url_queue').decode()
            html = wait(url)
            selector = lxml.html.fromstring(html)
            chapter_name = selector.xpath('//div[@class="clearfix"]/h4/text()')[0]
            content = selector.xpath('//div[@id="partContent"]/p/text()')
            self.content_lst.append({'title': chapter_name, 'content': content})
        self.handler.insert_many(self.content_lst)


if __name__ == '__main__':
    client0 = MongoClient()
    db = client0['spicer']  # create/select the database
    article = get_article('https://www.tadu.com/book/catalogue/891710')
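To run the script as written, a local Redis server and a local MongoDB server both need to be running on their default ports, and a chromedriver.exe matching the installed Chrome version has to sit next to get_article.py.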
5. In practice, however, a fresh Chrome instance is launched for every single chapter, which uses far too much memory, so a different approach is needed. Because the content is loaded asynchronously, the data request for each chapter can be found in the browser's developer tools (Network tab). After working out each data file's content, URL, and request method, a plain requests-based crawler is enough to fetch everything. The idea is offered for reference only; the concrete implementation is left as an exercise.
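As a rough illustration of that idea, the sketch below replaces wait() with a plain requests call. The URL passed in is a placeholder: the real async endpoint, its parameters, and its response format have to be copied from the request observed in DevTools.

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}

def get_chapter_html(content_url):
    """content_url: the async chapter-content URL copied from DevTools (placeholder here)."""
    resp = requests.get(content_url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    return resp.text  # parse with lxml.html and the same XPath expressions as before

# In article2mongodb(), 'html = wait(url)' would then become
# 'html = get_chapter_html(<async URL for that chapter>)', with no browser involved.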