8. Middleware and CrawlSpider usage


Collecting multiple fields at once

items.py

import scrapy

class Test1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # define the data fields in items
    title = scrapy.Field()
    pic = scrapy.Field()

spider1.py

datas_pic = response.xpath("//a[@class='u-card']/img")
for item in datas_pic:
    # store the fields with the Item class
    pic = item.xpath("@data-src").extract()
    title = item.xpath("@alt").extract()
    topipeline1 = Test1Item(pic=pic, title=title)
    yield topipeline1
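
For context, a minimal sketch of the complete spider1.py this fragment belongs to; the spider name and start URL are assumptions borrowed from the CrawlSpider example further down in this note:

import scrapy
from test1.items import Test1Item

class Spider1Spider(scrapy.Spider):
    # name and start URL are assumed for illustration
    name = 'spider1'
    allowed_domains = ['4399dmw.com']
    start_urls = ['http://www.4399dmw.com/search/dh-1-0-0-0-0-0-0/']

    def parse(self, response):
        # every <img> inside a u-card link carries both fields we need
        datas_pic = response.xpath("//a[@class='u-card']/img")
        for item in datas_pic:
            pic = item.xpath("@data-src").extract()
            title = item.xpath("@alt").extract()
            yield Test1Item(pic=pic, title=title)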

pipelines.py

import json

class Test1Pipeline:
    # runs when the spider opens
    def open_spider(self, spider):
        # open the output file and get ready to write
        self.fp = open("spider1.json", "w", encoding='utf-8')

    # runs every time the spider yields an item
    def process_item(self, item, spider):
        # dict(item) is needed because a scrapy.Item is not directly JSON serializable
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(item_json + '\n')
        return item

    # runs when the spider closes
    def close_spider(self, spider):
        # close the file when we are done
        self.fp.close()
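
The pipeline only runs once it is enabled in settings.py; this is the same registration that is used again later for spider2:

ITEM_PIPELINES = {
    'test1.pipelines.Test1Pipeline': 300
}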

Downloading images

Method 1:

spider1.py

    def parse(self, response):
        # image URL xpath: //a[@class='u-card']/img/@src
        # title xpath:     //a[@class='u-card']/img/@alt
        datas_pic = response.xpath("//a[@class='u-card']/img")
        for item in datas_pic:
            pic = "http:" + str(item.xpath("@data-src")[0].extract())
            title = item.xpath("@alt").extract()
            topipeline1 = Test1Item(pic=pic, title=title)
            yield topipeline1
  • The URLs scraped here have no http: in front (they are protocol-relative), so the prefix has to be added manually.
  • extract() returns a list of strings; what xpath() itself returns is a list of Selector objects rather than plain text, so the conversion has to be done explicitly (see the example below).
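
A quick illustration of that difference, using a hypothetical HTML snippet just to show the return types:

from scrapy.selector import Selector

sel = Selector(text='<a class="u-card"><img data-src="//img.example.com/1.jpg" alt="demo"></a>')
imgs = sel.xpath("//a[@class='u-card']/img")

print(imgs.xpath("@data-src"))               # SelectorList of Selector objects
print(imgs.xpath("@data-src").extract())     # ['//img.example.com/1.jpg']  (list of strings)
print(imgs.xpath("@data-src")[0].extract())  # '//img.example.com/1.jpg'    (single string)
print(imgs.xpath("@alt").extract_first())    # 'demo', or None when nothing matches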

pipelines.py

import json
import os
from urllib import request

class Test1Pipeline:
    # runs when the spider opens
    def open_spider(self, spider):
        # open the output file and get ready to write
        self.fp = open("spider1.json", "w", encoding='utf-8')
        # make sure an images folder exists next to the project package; create it if missing
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    # runs every time the spider yields an item
    def process_item(self, item, spider):
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(item_json + '\n')
        # pull the fields out of the item
        title = item['title'][0]
        pic = item['pic']
        path1 = os.path.join(self.path, 'pic1')
        if not os.path.exists(path1):
            os.mkdir(path1)
        # download the image and save it with a .jpg extension
        request.urlretrieve(pic, os.path.join(path1, title + '.jpg'))
        return item

    # runs when the spider closes
    def close_spider(self, spider):
        # close the file when we are done
        self.fp.close()

Method 2:

Add a new pipeline class and the file storage path in settings

import os

ITEM_PIPELINES = {
    'test1.pipelines.newImagePipeline':300
}

IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')

Override the image download class in pipelines

import os

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

from test1 import settings

# custom image download pipeline
class newImagePipeline(ImagesPipeline):
    # called before the image requests are sent
    def get_media_requests(self, item, info):
        # item['pic'] is expected to be a list of image URLs
        for image_url in item['pic']:
            yield Request(image_url)

    # called to decide where each downloaded image is stored
    def file_path(self, request, response=None, info=None, *, item=None):
        # reuse the default path ('full/<sha1>.jpg') as the starting point
        path = super(newImagePipeline, self).file_path(request, response, info)
        # group the images into a folder per category; this assumes the item carries a
        # 'category' field (the Test1Item above only defines title/pic/jianjie, so
        # item.get('title') would work the same way)
        category = item.get('category')
        # image root directory from settings
        image_store = settings.IMAGES_STORE
        category_path = os.path.join(image_store, category)
        if not os.path.exists(category_path):
            os.mkdir(category_path)
        # keep only the file name by stripping the default 'full/' prefix
        image_name = path.replace("full/", "")
        # full path the image will be written to
        image_path = os.path.join(category_path, image_name)
        return image_path
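
If the per-category folders are not needed, the built-in ImagesPipeline can also be used directly without subclassing. A minimal sketch of the settings (it stores files as IMAGES_STORE/full/<sha1>.jpg and needs Pillow installed; the field named in IMAGES_URLS_FIELD must hold a list of URLs):

import os

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 300
}
# root directory for downloaded images
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
# which item field holds the image URLs (the default is 'image_urls')
IMAGES_URLS_FIELD = 'pic'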

Middleware

middlewares: when too many images are downloaded under the same browser identity, the site may flag the crawler as malicious, so we fake the UA (User-Agent) to get around the anti-scraping check.

Modify DOWNLOADER_MIDDLEWARES in settings.py

DOWNLOADER_MIDDLEWARES = {
    'test1.middlewares.useragentMiddleware': 543,
}

Add the class in middlewares.py

import random

class useragentMiddleware(object):
    # pool of User-Agent strings (a list, so random.choice works)
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.33',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.94 Safari/537.34',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.95 Safari/537.35',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.96 Safari/537.36'
    ]

    def process_request(self, request, spider):
        # pick a random UA
        user_agent = random.choice(self.USER_AGENTS)
        # set it as the request header
        request.headers['User-Agent'] = user_agent

Using proxy IPs

settings.py

DOWNLOADER_MIDDLEWARES = {
    'test1.middlewares.proxyMiddleware': 543,
}

middlewares.py

import base64
import random

class proxyMiddleware(object):
    # open proxies; Scrapy expects the full proxy URL including the scheme
    PROXIES = ['http://1.2.3.4:8080', 'http://2.2.2.2:9090']

    def process_request(self, request, spider):
        """
        # plain open proxy: just pick one at random
        proxy = random.choice(self.PROXIES)
        request.meta['proxy'] = proxy
        """
        """
        # private proxy that needs its own credentials
        proxy = 'http://1.2.3.4:9090'
        user_pass = "admin:admin888"
        request.meta['proxy'] = proxy
        # the credentials have to be base64-encoded for HTTP Basic auth
        bs64_user_pass = base64.b64encode(user_pass.encode('utf-8'))
        request.headers['Proxy-Authorization'] = "Basic " + bs64_user_pass.decode('utf-8')
        """

    def process_response(self, request, response, spider):
        if response.status != 200:
            print("request failed")
        # a downloader middleware must pass the response (or a new request) along
        return response

CrawlSpider

Go into the spider directory and create spider2

scrapy genspider -t crawl spider2 [target domain]

Open cmd in the test1 project directory; you can test ahead of time whether the title content can be scraped:

scrapy shell [target URL]
title = response.xpath("//a[@class='u-card']/img/@alt").get()
title

Saving the data to a file

settings.py

ITEM_PIPELINES = {
   'test1.pipelines.Test1Pipeline': 300
}

spider2.py

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from test1.items import Test1Item

class Spider2Spider(CrawlSpider):
    name = 'spider2'
    allowed_domains = ['4399dmw.com']
    start_urls = ['http://www.4399dmw.com/search/dh-1-0-0-0-0-0-0/']

    # the rules decide where the crawl goes
    rules = (
        Rule(LinkExtractor(allow=r'.+dh-1-0-0-0-0-\d-0\/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        datas_pic = response.xpath("//a[@class='u-card']/img")
        for item in datas_pic:
            pic = "http:" + str(item.xpath("@data-src")[0].extract())
            title = item.xpath("@alt").extract()
            topipeline1 = Test1Item(pic=pic, title=title)
            yield topipeline1

The crawl's direction can be controlled by defining different rules (Rule) for what to extract and follow.
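
A short illustration of the main knobs a Rule exposes (the deny pattern and restrict_xpaths below are hypothetical, only to show the options):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

rules = (
    # allow: regex the extracted links must match; follow=True keeps crawling from those pages
    Rule(LinkExtractor(allow=r'.+dh-1-0-0-0-0-\d-0\/'), follow=True),
    # deny and restrict_xpaths narrow down which links get extracted;
    # callback names the method that parses the matched pages
    Rule(LinkExtractor(allow=r'.+\/dh\/.+\/',
                       deny=r'.+\/video\/.+',
                       restrict_xpaths="//div[@class='u-list']"),
         callback='parse_detail', follow=False),
)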

Adding a synopsis

Add a jianjie (synopsis) field in items.py

jianjie = scrapy.Field()
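
For clarity, the full item class after the addition (just restating the fields already defined above):

import scrapy

class Test1Item(scrapy.Item):
    title = scrapy.Field()
    pic = scrapy.Field()
    jianjie = scrapy.Field()   # synopsis text scraped from the detail page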

spider2.py

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from test1.items import Test1Item

class Spider2Spider(CrawlSpider):
    name = 'spider2'
    allowed_domains = ['4399dmw.com']
    start_urls = ['http://www.4399dmw.com/search/dh-1-0-0-0-0-0-0/']

    # the rules decide where the crawl goes: whether a fetched page should be followed further, and which function handles which page
    rules = (
        Rule(LinkExtractor(allow=r'.+dh-1-0-0-0-0-\d-0\/'),follow=True),
        Rule(LinkExtractor(allow=r'.+\/dh\/.+\/'),callback='parse_detail',follow=False)
    )

    def parse_detail(self, response):
        title = response.xpath("//div[@class='works__main']/h1/text()").extract()[0]
        jianjie = response.xpath("//div[@class='main']/div/p/text()").extract()[0]
        pic = "http:" + str(response.xpath("//div[@class='works__main']//img[@class='works__img']/@data-src")[0].extract())
        topipeline2 = Test1Item(jianjie=jianjie,pic=pic,title=title)
        yield topipeline2