Scrapy 框架 中间件,信号,定制命令


中间件

下载器中间件

写中间件

from scrapy.http import HtmlResponse
from scrapy.http import Request

class Md1(object):
    """Sample downloader middleware that logs every request and response.

    Each hook prints what it receives and passes the object through
    unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware instance.

        ``crawler`` gives access to the running crawler; this example does
        not use it.
        """
        return cls()

    def process_request(self, request, spider):
        """Log the outgoing request and return None.

        Returning None lets the next middleware's process_request run; any
        other outcome changes the flow as outlined in the comments below.
        """
        print('md1.process_request', request)

        # Other things this hook could do instead (shown for reference only):
        #
        # 1. Return a Response. The process_response chain then runs,
        #    starting from the last middleware:
        #        import requests
        #        result = requests.get(request.url)
        #        return HtmlResponse(url=request.url, status=200, headers=None, body=result.content)
        #
        # 2. Return a Request. The current request is dropped and the new one
        #    goes back to the scheduler as a fresh task:
        #        return Request('https://dig.chouti.com/r/tec/hot/1')
        #
        # 3. Raise an exception. A process_exception hook has to handle it,
        #    otherwise an error is reported:
        #        from scrapy.exceptions import IgnoreRequest
        #        raise IgnoreRequest
        #
        # 4. Modify the request (the usual case) and then return nothing:
        #        request.headers['user-agent'] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        #        return None
        return None

    def process_response(self, request, response, spider):
        """Log the downloaded response and hand it on unchanged.

        Per the Scrapy template this hook must either:
        - return a Response object (it replaces the current response),
        - return a Request object (a new task is started), or
        - raise IgnoreRequest.
        """
        print('m1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        """Do nothing, so the exception keeps propagating.

        Called when a download handler or a process_request() (from other
        downloader middleware) raises an exception. Per the Scrapy template
        this hook must either:
        - return None: continue processing this exception (the usual choice),
        - return a Response object: stops the process_exception() chain, or
        - return a Request object: stops the process_exception() chain; the
          current task is abandoned and a new one is created.
        """
        return None

配置文件

# Downloader middlewares enabled for the project, keyed by import path.
DOWNLOADER_MIDDLEWARES = {    
    #'xdb.middlewares.XdbDownloaderMiddleware': 543,
    # 'xdb.proxy.XdbProxyMiddleware':751,
    'xdb.md.Md1':666,    # order value, still in the 0-1000 range; the smaller value goes first
    'xdb.md.Md2':667,
}

执行顺序梳理

调度器 给 下载器的时候先走 process_request(从第一个中间件往最后一个走),然后根据返回情况判断接下来的方向
  返回 None 继续下一个中间件的 process_request
  返回 Response 不再执行后面的 process_request,进入 process_response 流程(从最后一个下载中间件开始)
  返回 Request 返回 调度器开启新任务 
  抛出 异常  进入已启用下载中间件的 process_exception 进行异常处理

下载器 还给 爬虫的时候要走 process_response(从最后一个中间件往第一个走),然后根据返回情况判断接下来的方向
  不能返回 None(必须返回 Response、Request 或者抛出 IgnoreRequest)
  返回 Response 替换当前Response 进入上一个下载中间件的 process_response 流程
  返回 Request 返回 调度器开启新任务 放弃当前的任务
  抛出 IgnoreRequest 异常 调用该请求的 errback;如果没有代码处理这个异常,该请求会被忽略

应用场景 - 随机 User-Agent 

开源的组件  导入

from fake_useragent import UserAgent

配置文件中设置选择方式

# Name of the fake_useragent UserAgent attribute to read for each request;
# RandomUserAgentMiddlware looks this setting up via crawler.settings.
RANDOM_UA_TYPE = "random"

根据配置文件中的选择方式设置模式

class RandomUserAgentMiddlware(object):
    """Downloader middleware that fills in a User-Agent from fake_useragent.

    Which ``UserAgent`` attribute is read is controlled by the
    ``RANDOM_UA_TYPE`` setting (default ``"random"``).
    """

    def __init__(self, crawler):
        """Create the UserAgent source and read the selection mode from settings."""
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler so its settings are available."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Set the User-Agent header unless the request already carries one."""
        # Look the attribute up by name so the settings value picks the mode.
        chosen_agent = getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', chosen_agent)

应用场景 - IP代理

写个脚本完成对 西刺代理IP的爬虫

并存入数据库

# -*- coding: utf-8 -*-
import requests
from scrapy.selector import Selector
import pymysql

# Module-level MySQL connection and cursor, opened at import time and shared
# by crawl_ips() and GetIP below.
# NOTE(review): host and credentials are hard-coded here; presumably fine for
# a local demo only -- confirm before reusing elsewhere.
conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="article_spider", charset="utf8")
cursor = conn.cursor()


def crawl_ips():
    """Scrape the xicidaili free-proxy listing and store the rows in ``proxy_ip``.

    Walks the listing pages, reads ip, port and speed from every table row
    and inserts them through the module-level ``cursor``/``conn``. Rows whose
    speed or address cells cannot be read are skipped instead of aborting the
    whole crawl.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    # Placeholders let the driver escape the scraped values itself.
    # NOTE(review): the proxy type is stored as the literal 'HTTP', exactly as
    # before; the value scraped from the page was never written to the table.
    insert_sql = (
        "insert proxy_ip(ip, port, speed, proxy_type) "
        "VALUES(%s, %s, %s, 'HTTP')"
    )

    for page in range(1568):
        page_response = requests.get(
            "http://www.xicidaili.com/nn/{0}".format(page), headers=headers
        )
        selector = Selector(text=page_response.text)
        all_trs = selector.css("#ip_list tr")

        ip_list = []
        for tr in all_trs[1:]:  # the first row is the table header
            speed_titles = tr.css(".bar::attr(title)").extract()
            if not speed_titles or not speed_titles[0]:
                continue
            try:
                # The title is expected to look like "0.123秒"; keep the number.
                speed = float(speed_titles[0].split("秒")[0])
            except ValueError:
                continue

            all_texts = tr.css("td::text").extract()
            if len(all_texts) < 2:
                continue
            ip_list.append((all_texts[0], all_texts[1], speed))

        for ip, port, speed in ip_list:
            cursor.execute(insert_sql, (ip, port, speed))

        # One commit per page keeps every scraped page durable.
        conn.commit()


class GetIP(object):
    """Pick a working proxy out of the ``proxy_ip`` table.

    All methods use the module-level ``cursor`` and ``conn``.
    """

    def delete_ip(self, ip):
        """Remove a dead proxy from the table; always returns True."""
        # Parameterized so the value is escaped by the driver.
        cursor.execute("delete from proxy_ip where ip=%s", (ip,))
        conn.commit()
        return True

    def judge_ip(self, ip, port, timeout=10):
        """Check whether the proxy ``ip:port`` can fetch a test page.

        The proxy is deleted from the table when the request fails or the
        status code is not 2xx.

        :param ip: proxy host.
        :param port: proxy port.
        :param timeout: seconds to wait for the test request, so a dead proxy
            cannot block the caller indefinitely.
        :return: True if the proxy works, False otherwise.
        """
        http_url = "http://www.baidu.com"
        proxy_url = "http://{0}:{1}".format(ip, port)
        try:
            response = requests.get(
                http_url, proxies={"http": proxy_url}, timeout=timeout
            )
        except Exception:
            # Best-effort probe: any failure simply means the proxy is unusable.
            usable = False
        else:
            usable = 200 <= response.status_code < 300

        if usable:
            print("effective ip")
            return True

        print("invalid ip and port")
        self.delete_ip(ip)
        return False

    def get_random_ip(self):
        """Return a random working proxy URL, or None when the table is empty.

        Invalid proxies are deleted by judge_ip(), so each retry draws from a
        smaller table. A loop is used instead of recursion so a long run of
        dead proxies cannot exhaust the call stack.
        """
        random_sql = """
              SELECT ip, port FROM proxy_ip
            ORDER BY RAND()
            LIMIT 1
            """
        while True:
            cursor.execute(random_sql)
            row = cursor.fetchone()
            if row is None:
                return None

            ip, port = row[0], row[1]
            if self.judge_ip(ip, port):
                return "http://{0}:{1}".format(ip, port)


# print (crawl_ips())
if __name__ == "__main__":
    # Manual check when run as a script: draw one proxy from the database.
    # The returned URL is not used here.
    get_ip = GetIP()
    get_ip.get_random_ip()

设置中间件来调用脚本设置代理 IP

class RandomProxyMiddleware(object):
    """Downloader middleware that assigns a proxy to every request."""

    def process_request(self, request, spider):
        """Store a proxy URL obtained from GetIP in ``request.meta["proxy"]``."""
        request.meta["proxy"] = GetIP().get_random_ip()

爬虫中间件

写中间件

class Sd1(object):
    """Pass-through spider middleware.

    Not every hook has to be defined; when one is missing Scrapy behaves as
    if the middleware left the passed objects untouched.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Factory used by Scrapy to create the middleware."""
        return cls()

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider.

        Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Forward everything the spider produced for ``response``.

        Must return an iterable of Request, dict or Item objects.
        """
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Leave exceptions from the spider or process_spider_input() unhandled.

        Should return either None or an iterable of Response, dict or Item
        objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Forward the spider's start requests unchanged.

        Runs only once, when the spider starts. Works like
        process_spider_output() but has no response associated; must return
        only requests (not items).
        """
        yield from start_requests

配置文件

# Spider middlewares enabled for the project, keyed by import path.
SPIDER_MIDDLEWARES = {
   # 'xdb.middlewares.XdbSpiderMiddleware': 543,
    'xdb.sd.Sd1': 666,    # order value; presumably ranked the same way as in DOWNLOADER_MIDDLEWARES
    'xdb.sd.Sd2': 667,
}

执行流程

1. 第一次启动爬虫文件封装好 request 之后 走 process_start_requests 上传给引擎

2. 引擎将封装好的 request 给调度器

3. 调度器 继续执行 给下载器

4. 下载器 下载了内容之后给 引擎

5. 引擎再给 爬虫文件的时候要走 process_spider_input

6. 爬虫文件处理完之后如果有 yield 就要再走 process_spider_output 给引擎

应用

- 深度

- 优先级

信号

使用框架预留的位置,帮助你自定义一些功能

使用实例

from scrapy import signals

class MyExtend(object):
    """Extension that prints a message when a spider opens and when it closes."""

    def __init__(self):
        """Nothing to set up."""

    @classmethod
    def from_crawler(cls, crawler):
        """Create the extension and hook its handlers up to the crawler's signals."""
        ext = cls()

        # Bind the function to run when each signal fires.
        crawler.signals.connect(ext.x1, signal=signals.spider_opened)
        crawler.signals.connect(ext.x2, signal=signals.spider_closed)

        return ext

    def x1(self, spider):
        """Handler connected to spider_opened."""
        print('open')

    def x2(self, spider):
        """Handler connected to spider_closed."""
        print('close')
# Available signal types, as they can be seen in `from scrapy import signals`.
# Each signal is a unique sentinel object.
engine_started = object()
engine_stopped = object()

spider_opened = object()
spider_idle = object()
spider_closed = object()
spider_error = object()

request_scheduled = object()
request_dropped = object()
response_received = object()
response_downloaded = object()

item_scraped = object()
item_dropped = object()
# settings.py

# Register the MyExtend extension (import path -> order value).
EXTENSIONS = { 'xdb.ext.MyExtend':666, }

定制命令

单爬虫运行

import sys
from scrapy.cmdline import execute

if __name__ == '__main__':
    # Hand Scrapy the argv list for `scrapy crawl chouti --nolog`, so the
    # "chouti" spider can be started by running this script.
    execute(["scrapy","crawl","chouti","--nolog"])

所有爬虫

- 在spiders同级创建任意目录,如:commands
- 在其中创建 crawlall.py 文件 (此处文件名就是自定义的命令)
- 在settings.py 中添加配置 COMMANDS_MODULE = '项目名称.目录名称'
- 在项目目录执行命令:scrapy crawlall 
# crawlall.py

from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):
    """Custom ``scrapy crawlall`` command that runs every spider in the project."""

    # The command only makes sense inside a Scrapy project.
    requires_project = True

    def syntax(self):
        """Return the usage string shown for the command."""
        return '[options]'

    def short_desc(self):
        """Return the one-line description shown for the command."""
        return 'Runs all of the spiders'

    def run(self, args, opts):
        """Schedule every spider known to the project, then start crawling.

        All parsed command-line options are forwarded to each spider as
        keyword arguments.
        """
        process = self.crawler_process
        # Prefer ``spider_loader``; fall back to the legacy ``spiders``
        # attribute for Scrapy versions that only provide the old name.
        loader = getattr(process, 'spider_loader', None)
        if loader is None:
            loader = process.spiders

        for name in loader.list():
            process.crawl(name, **opts.__dict__)
        process.start()
# settings.py

# Package holding the custom commands, written as
# '<project name>.<directory name>'.
COMMANDS_MODULE = "xdb.commands"

爬虫暂停重启

原理

爬虫的暂停重启是需要文件支持

在启动的命令里选择一个路径

不同的爬虫不能共用,

相同的爬虫如果共用同一个路径,就会基于这个文件保存的上一次状态继续爬取

该命令的中断方式是:在 windows 命令行里按一次 Ctrl+c,或者在 Linux 里面用 kill 给爬虫进程发送信号(例如 kill -2 <进程号>);不要用 kill -9 强制杀进程,否则爬虫来不及保存状态

因此在 pycharm 中的中断是做不到的, 只能在命令行中处理

scrapy crawl lagou -s JOBDIR=job_info/001

配置文件方式

指定 文件路径可以在 settings.py 中设置  

这样就是全局设置了

# Project-wide job directory that holds the state used to pause and resume a crawl.
JOBDIR="job_info/001"

或者在单爬虫类中设置

# Per-spider settings override, placed in the spider class body. The attribute
# must be spelled ``custom_settings``; a misspelled name is silently ignored.
custom_settings = {
    "JOBDIR": "job_info/001",
}

总结

但是还是和上面的说法一样.....pycharm 里面没办法中断, 因此还是没有啥意义, 

还是只能使用命令行方式

相关