Python web scraping: data parsing with Scrapy
Basic data parsing
First, create the project:
cd study_scrapy/  # enter the directory where the project should live
scrapy startproject study_scrapy02  # create the project
cd study_scrapy02/  # enter the project
scrapy genspider gushi www.xxx.com  # create the spider file
Modify the spider file and the settings file. First the spider:
import scrapy


class GushiSpider(scrapy.Spider):
    name = 'gushi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://so.gushiwen.cn/mingjus/']

    def parse(self, response):
        pass
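For reference, start_urls works because the base Spider's default start_requests issues a request for each listed URL with parse as the callback; a rough sketch of the equivalent:

def start_requests(self):
    # roughly what scrapy.Spider does by default
    for url in self.start_urls:
        yield scrapy.Request(url, callback=self.parse)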
And in settings.py:

# only output error-level logs
LOG_LEVEL = 'ERROR'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
Write the data-parsing logic in parse
This time our goal is to parse the famous quotes and their authors from gushiwen.cn (古诗文网).
def parse(self, response):
    div_list = response.xpath('//*[@id="html"]/body/div[2]/div[1]/div[2]/div')
    for div in div_list:
        # extract() pulls the stored data out of a Selector object
        content = div.xpath('./a[1]/text()')[0].extract()
        author = div.xpath('./a[2]/text()')[0].extract()
        print(content, author)
        # only look at the first entry for this test
        break
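A note on extract(): indexing with [0] raises an IndexError when the XPath matches nothing, while extract_first() (and its modern alias get()) returns None instead. The equivalent idioms side by side:

content = div.xpath('./a[1]/text()')[0].extract()     # raises IndexError on no match
content = div.xpath('./a[1]/text()').extract_first()  # returns None on no match
content = div.xpath('./a[1]/text()').get()            # modern alias of extract_first()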
Run the spider:
scrapy crawl gushi
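If you want to persist the results rather than just print them, you can have parse yield dict items and let Scrapy's feed exports write them to a file; assuming parse is changed to yield {'content': content, 'author': author}, the crawl command becomes:

scrapy crawl gushi -o gushi.json  # feed export: write yielded items to a file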
Dynamic data parsing
Parsing dynamically loaded data requires a downloader middleware: the data is rendered by JavaScript after the page loads, so the raw HTML fetched by Scrapy's downloader does not contain it, and we instead intercept the response and swap in one rendered by a real browser.
As an example, we'll scrape job titles and their detail descriptions from BOSS直聘 (www.zhipin.com).
Create the project:
cd study_scrapy  # enter the projects directory
scrapy startproject bossPro  # create a project named bossPro
cd bossPro  # enter the project
scrapy genspider boss www.zhipin.com  # create the spider file
Modify settings.py:
# only show error-level logs
LOG_LEVEL = 'ERROR'

# UA spoofing
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15'

# ignore robots.txt, the "gentlemen's agreement"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# item pipelines
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'bossPro.pipelines.BossproPipeline': 300,
}

# downloader middleware
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'bossPro.middlewares.BossproDownloaderMiddleware': 543,
}
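Note that enabling 'bossPro.pipelines.BossproPipeline' in ITEM_PIPELINES requires that class to exist in pipelines.py, or the crawl fails at startup; scrapy startproject generates a pass-through template equivalent to this sketch:

class BossproPipeline:
    def process_item(self, item, spider):
        # no-op for now; persistence logic would go here
        return item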
Write boss.py:
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver import ChromeOptions


class BossSpider(scrapy.Spider):
    name = 'boss'
    # allowed_domains = ['www.zhipin.com']
    start_urls = ['https://www.zhipin.com/job_detail/?query=python&city=101010100&industry=&position=']

    def __init__(self):
        service = Service('/Users/soutsukyou/PyCharm_Workspace/网络爬虫/study_selenium/chromedriver')
        chrome_options = ChromeOptions()
        # evade webdriver detection
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # instantiate the browser
        self.bro = webdriver.Chrome(service=service, options=chrome_options)

    def parse_detail(self, response):
        job_desc = response.xpath('//div[@class="job-sec"]/div//text()').extract()
        job_desc = ''.join(job_desc)
        print(job_desc)

    def parse(self, response):
        li_list = response.xpath('//*[@id="main"]/div/div[3]/ul/li')
        for li in li_list:
            # query relative to each li, not the whole response
            title = li.xpath('.//span[@class="job-name"]/a/text()').extract()
            print(title)
            # area = li.xpath('.//span[@class="job-area-wrapper"]/span/text()').extract()
            # money = li.xpath('.//div[@class="job-limit clearfix"]/span/text()').extract()
            # limit = li.xpath('.//div[@class="job-limit clearfix"]/p//text()').extract()
            # for this test we only grab the title and detail link, with no persistent storage
            detail_url = li.xpath('.//span[@class="job-name"]/a/@href').extract_first()
            # request the detail page
            yield scrapy.Request('https://www.zhipin.com' + detail_url, callback=self.parse_detail)

    def closed(self, reason):
        # quit the browser when the spider finishes
        self.bro.quit()
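In a real crawl you usually want each job's title and description in a single record. A common pattern (a sketch; the 'title' meta key is just illustrative) is to carry list-page fields to the detail callback through the request's meta dict:

# in parse(): attach the title to the detail request
yield scrapy.Request('https://www.zhipin.com' + detail_url,
                     callback=self.parse_detail,
                     meta={'title': title})

# in parse_detail(): read it back from response.meta
def parse_detail(self, response):
    title = response.meta['title']
    job_desc = ''.join(response.xpath('//div[@class="job-sec"]/div//text()').extract())
    print(title, job_desc)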
Write the DownloaderMiddleware in middlewares.py:
from scrapy.http import HtmlResponse


class BossproDownloaderMiddleware:
    def process_response(self, request, response, spider):
        # the spider argument is the spider object; grab its browser
        bro = spider.bro
        # pick out the targeted response and tamper with it: url -> request -> response
        bro.get(request.url)
        page_text = bro.page_source  # includes the dynamically loaded data
        # build a new response object containing the dynamically loaded data
        response = HtmlResponse(url=bro.current_url, body=page_text, encoding='utf-8', request=request)
        return response
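As written, every response the project downloads is re-rendered through selenium. The comment about picking out the targeted responses usually becomes a guard clause; a sketch of that pattern (the time.sleep is a crude, assumed wait for the JS to render):

import time

def process_response(self, request, response, spider):
    if spider.name != 'boss':
        return response  # leave other spiders' responses untouched
    bro = spider.bro
    bro.get(request.url)
    time.sleep(2)  # crude wait for dynamically loaded content
    return HtmlResponse(url=bro.current_url, body=bro.page_source,
                        encoding='utf-8', request=request)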
Run the spider:
scrapy crawl boss