Python Crawler: Data Parsing in Scrapy


Basic data parsing

First, create the project

cd study_scrapy/  # enter the directory where the project will live

scrapy startproject study_scrapy02  # create the project

cd study_scrapy02/  # enter the project

scrapy genspider gushi www.xxx.com  # create the spider file
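After these commands, Scrapy has scaffolded the standard project layout:

study_scrapy02/
├── scrapy.cfg            # deployment configuration
└── study_scrapy02/
    ├── __init__.py
    ├── items.py          # item definitions
    ├── middlewares.py    # spider / downloader middlewares
    ├── pipelines.py      # item pipelines
    ├── settings.py       # project settings
    └── spiders/
        └── gushi.py      # created by genspider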

Modify the spider file gushi.py:

import scrapy

class GushiSpider(scrapy.Spider):
    name = 'gushi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://so.gushiwen.cn/mingjus/']

    def parse(self, response):
        pass

And in settings.py:

# Show error-level logs only
LOG_LEVEL = 'ERROR'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

Write the data-parsing logic in parse

This time the goal is to parse the famous quotes and their authors from gushiwen.cn (古诗文网).

    def parse(self, response):
        div_list = response.xpath('//*[@id="html"]/body/div[2]/div[1]/div[2]/div')
        for div in div_list:
            # extract() pulls out the data stored in a Selector object
            content = div.xpath('./a[1]/text()')[0].extract()
            author = div.xpath('./a[2]/text()')[0].extract()
            print(content, author)
            break  # only inspect the first entry for now
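A side note: newer Scrapy versions also expose get() and getall() on selectors as the preferred aliases for extract_first() and extract(), so the two extractions above could equally be written as:

            content = div.xpath('./a[1]/text()').get()
            author = div.xpath('./a[2]/text()').get()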

Run the spider

scrapy crawl gushi
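If even the error log is unwanted, the crawl command also accepts --nolog, which silences Scrapy's logging entirely instead of relying on LOG_LEVEL:

scrapy crawl gushi --nolog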

Dynamic data parsing

Parsing dynamically loaded data requires a downloader middleware.
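For orientation: a downloader middleware sits between the engine and the downloader, and the scaffold Scrapy generates in middlewares.py exposes three hooks (these are the standard Scrapy signatures; the class name below matches the bossPro project created next):

class BossproDownloaderMiddleware:
    def process_request(self, request, spider):
        # sees every request on its way to the downloader
        return None

    def process_response(self, request, response, spider):
        # sees every response on its way back to the engine;
        # this is the hook used below to swap in Selenium-rendered HTML
        return response

    def process_exception(self, request, exception, spider):
        # called when downloading (or process_request) raises an exception
        pass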

We'll take scraping job listings and their detail descriptions from BOSS直聘 (zhipin.com) as the example.

Create the project

cd study_scrapy  # enter the workspace directory

scrapy startproject bossPro  # create a project named bossPro

cd bossPro  # enter the project

scrapy genspider boss www.zhipin.com  # create the spider file

Modify settings.py

# Show error-level logs only
LOG_LEVEL = 'ERROR'

# UA spoofing
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15'

# Ignore the robots.txt "gentlemen's agreement"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Item pipeline
# (enabled here, though this example only prints results and never yields items)
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'bossPro.pipelines.BossproPipeline': 300,
}

# Downloader middleware
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'bossPro.middlewares.BossproDownloaderMiddleware': 543,
}

Write boss.py

import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver import ChromeOptions

class BossSpider(scrapy.Spider):
    name = 'boss'
    # allowed_domains = ['www.zhipin.com']
    start_urls = ['https://www.zhipin.com/job_detail/?query=python&city=101010100&industry=&position=']

    def __init__(self):
        service = Service('/Users/soutsukyou/PyCharm_Workspace/网络爬虫/study_selenium/chromedriver')
        chrome_options = ChromeOptions()
        # evade webdriver detection
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # instantiate the browser once per spider
        self.bro = webdriver.Chrome(service=service, options=chrome_options)

    def parse_detail(self, response):
        # the job description is loaded dynamically; the middleware below
        # swaps in the Selenium-rendered page so this XPath can see it
        job_desc = response.xpath('//div[@class="job-sec"]/div//text()').extract()
        job_desc = ''.join(job_desc)
        print(job_desc)

    def parse(self, response):
        li_list = response.xpath('//*[@id="main"]/div/div[3]/ul/li')
        for li in li_list:
            # query relative to each li, not against the whole response
            title = li.xpath('.//span[@class="job-name"]/a/text()').extract_first()
            print(title)
            # area = li.xpath('.//span[@class="job-area-wrapper"]/span/text()').extract_first()
            # money = li.xpath('.//div[@class="job-limit clearfix"]/span/text()').extract_first()
            # limit = li.xpath('.//div[@class="job-limit clearfix"]/p//text()').extract()
            # here we only test the listing titles
            # and skip persistent storage
            detail_url = li.xpath('.//span[@class="job-name"]/a/@href').extract_first()
            # send a request to the detail page
            yield scrapy.Request('https://www.zhipin.com' + detail_url, callback=self.parse_detail)
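One thing this spider leaves out: the Chrome instance is never shut down. Scrapy calls a spider's closed(reason) method when the crawl finishes, so a small addition (not in the original) tidies up the browser:

    def closed(self, reason):
        # quit the Selenium browser once the spider finishes
        self.bro.quit()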

Write the DownloaderMiddleware in middlewares.py

    # requires `from scrapy.http import HtmlResponse` at the top of middlewares.py
    def process_response(self, request, response, spider):
        # the spider argument is the spider object; grab its browser instance
        bro = spider.bro
        # pick out the responses to tamper with: url -> request -> response
        # (as written, every response is re-rendered through Selenium)
        bro.get(request.url)
        page_text = bro.page_source  # contains the dynamically loaded data
        # tamper with the targeted response:
        # build a new response object around the rendered page
        response = HtmlResponse(url=bro.current_url, body=page_text, encoding='utf-8', request=request)
        return response
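Re-rendering every response is what this example wants, since both the listing and the detail pages on zhipin.com are built by JavaScript. If only some pages needed it, a guard could let the rest pass through untouched; a minimal sketch, assuming (hypothetically) that only the start_urls listing page were dynamic:

    def process_response(self, request, response, spider):
        # hypothetical guard: re-render only the listing page in start_urls
        if request.url in spider.start_urls:
            spider.bro.get(request.url)
            return HtmlResponse(url=spider.bro.current_url,
                                body=spider.bro.page_source,
                                encoding='utf-8', request=request)
        # everything else passes through untouched
        return response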

Run the spider

scrapy crawl boss