8. Middleware and CrawlSpider usage
Collecting multiple fields at the same time
items.py
import scrapy

class Test1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # declare the item fields here
    title = scrapy.Field()
    pic = scrapy.Field()
spider1.py
datas_pic = response.xpath("//a[@class='u-card']/img")
for item in datas_pic:
    # store the data in the Item
    pic = item.xpath("@data-src").extract()
    title = item.xpath("@alt").extract()
    topipeline1 = Test1Item(pic=pic, title=title)
    yield topipeline1
pipelines.py
import json

class Test1Pipeline:
    # runs once when the spider is opened
    def open_spider(self, spider):
        # open the output file for writing
        self.fp = open("spider1.json", "w", encoding='utf-8')

    # runs for every item the spider yields
    def process_item(self, item, spider):
        item_json = json.dumps(dict(item), ensure_ascii=False)
        # item_json = json.dumps(item, ensure_ascii=False)
        self.fp.write(item_json + '\n')
        return item

    # runs once when the spider is closed
    def close_spider(self, spider):
        # close the output file
        self.fp.close()
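The pipeline only takes effect once it is registered in settings.py; a minimal sketch (300 is just the conventional priority value):

ITEM_PIPELINES = {
    'test1.pipelines.Test1Pipeline': 300,
}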
Downloading the images
Method 1:
spider1.py
def parse(self, response):
    # image URL: //a[@class='u-card']/img/@src
    # title text: //a[@class='u-card']/img/@alt
    datas_pic = response.xpath("//a[@class='u-card']/img")
    for item in datas_pic:
        pic = "http:" + str(item.xpath("@data-src")[0].extract())
        title = item.xpath("@alt").extract()
        topipeline1 = Test1Item(pic=pic, title=title)
        yield topipeline1
- The URLs scraped here have no http: in front (they are protocol-relative), so the scheme has to be added by hand.
- extract() returns a list of strings; what xpath() gives back are Selector objects (references, not plain text), so the values have to be extracted before use. A sketch using get()/urljoin() instead follows below.
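A small sketch of the same extraction written with get() and urljoin(), which avoids both the manual [0] indexing and the hand-built "http:" prefix (same XPath as above):

def parse(self, response):
    for item in response.xpath("//a[@class='u-card']/img"):
        # get() returns the first match as a plain string (extract_first() is the older name)
        pic = item.xpath("@data-src").get()
        title = item.xpath("@alt").get()
        # urljoin() resolves protocol-relative URLs such as //img.example.com/1.jpg
        yield Test1Item(pic=response.urljoin(pic), title=title)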
pipelines.py
import json
import os
from urllib import request

class Test1Pipeline:
    # runs once when the spider is opened
    def open_spider(self, spider):
        # open the output file for writing
        self.fp = open("spider1.json", "w", encoding='utf-8')
        # create an images folder next to the project package if it does not exist yet
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        if not os.path.exists(self.path):
            os.mkdir(self.path)

    # runs for every item the spider yields
    def process_item(self, item, spider):
        item_json = json.dumps(dict(item), ensure_ascii=False)
        # item_json = json.dumps(item, ensure_ascii=False)
        self.fp.write(item_json + '\n')
        # read the fields off the item
        title = item['title'][0]
        pic = item['pic']
        path1 = os.path.join(self.path, 'pic1')
        if not os.path.exists(path1):
            os.mkdir(path1)
        # download the image and save it with a .jpg extension
        request.urlretrieve(pic, os.path.join(path1, title + '.jpg'))
        return item

    # runs once when the spider is closed
    def close_spider(self, spider):
        # close the output file
        self.fp.close()
Method 2:
Use Scrapy's built-in ImagesPipeline, which downloads the images through the normal downloader instead of blocking on urlretrieve calls. Register the new pipeline class and the image storage path in settings.py:
import os

ITEM_PIPELINES = {
    'test1.pipelines.newImagePipeline': 300
}
# IMAGES_STORE is the setting ImagesPipeline reads for its storage root
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
Override the image download pipeline class in pipelines.py:
import os

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

from test1 import settings

# customised image download pipeline
class newImagePipeline(ImagesPipeline):
    # called before the image requests are sent
    def get_media_requests(self, item, info):
        for image_url in item['pic']:
            yield Request(image_url)

    # decides where each downloaded image is stored
    def file_path(self, request, response=None, info=None, *, item=None):
        # reuse the path the parent class would generate (e.g. full/<sha1>.jpg)
        path = super(newImagePipeline, self).file_path(request, response, info)
        # group images into folders by category (the item needs a 'category' field for this)
        category = item.get('category')
        # image root configured in settings
        image_store = settings.IMAGES_STORE
        category_path = os.path.join(image_store, category)
        if not os.path.exists(category_path):
            os.makedirs(category_path)
        # keep only the file name, dropping the default full/ prefix
        image_name = path.replace("full/", "")
        # final path of the image on disk
        image_path = os.path.join(category_path, image_name)
        return image_path
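The item keyword argument is only handed to file_path() on newer Scrapy versions (2.4+); on older versions the item can be attached to the image request instead. A sketch of that variant (not part of the original code):

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class newImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['pic']:
            # attach the item so file_path() can read it back from request.meta
            yield Request(image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None, *, item=None):
        item = request.meta['item']
        # ... build the per-category path exactly as above ...
        return super().file_path(request, response, info)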
Middleware
middlewares: when too many downloads come from the same browser identity, the site tends to flag the crawler as malicious, so the User-Agent is faked to get around this anti-scraping check.
Edit DOWNLOADER_MIDDLEWARES in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'test1.middlewares.useragentMiddleware': 543,
}
Add the class in middlewares.py:
import random

class useragentMiddleware(object):
    # pool of User-Agent strings to pick from
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.94 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.96 Safari/537.36',
    ]

    def process_request(self, request, spider):
        # pick a random User-Agent
        user_agent = random.choice(self.USER_AGENTS)
        # set it on the outgoing request headers
        request.headers['User-Agent'] = user_agent
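To confirm the header really rotates, a tiny throwaway spider can echo back what the server receives (a sketch; httpbin.org is just a public echo service, not part of this project):

import scrapy

class UACheckSpider(scrapy.Spider):
    name = 'uacheck'
    # httpbin returns the User-Agent header it received as JSON
    start_urls = ['https://httpbin.org/user-agent'] * 3

    def parse(self, response):
        # with the middleware enabled this should print different USER_AGENTS entries
        self.logger.info(response.text)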
Using proxy IPs
settings.py
DOWNLOADER_MIDDLEWARES = {
    'test1.middlewares.proxyMiddleware': 543,
}
middlewares.py
import base64
import random

class proxyMiddleware(object):
    # plain (open) proxies
    PROXIES = ['http://1.2.3.4:8080', 'http://2.2.2.2:9090']

    def process_request(self, request, spider):
        """
        proxy = random.choice(self.PROXIES)
        request.meta['proxy'] = proxy
        """
        """
        # private proxy with its own credentials
        proxy = 'http://1.2.3.4:9090'
        user_pass = "admin:admin888"
        request.meta['proxy'] = proxy
        # the credentials have to be base64-encoded for Basic auth
        bs64_user_pass = base64.b64encode(user_pass.encode('utf-8'))
        request.headers['Proxy-Authorization'] = "Basic " + bs64_user_pass.decode('utf-8')
        """

    def process_response(self, request, response, spider):
        if response.status != 200:
            print("request failed")
        # a downloader middleware must hand the response (or a new request) back
        return response
CrawlSpider
Go into the spiders directory and create spider2:
scrapy genspider -t crawl spider2 [target domain]
Open a terminal inside the test1 project; whether the title content can be grabbed can be tested ahead of time:
scrapy shell [target URL]
title = response.xpath("//a[@class='u-card']/img/@alt").get()
title
Downloading the files
settings.py
ITEM_PIPELINES = {
    'test1.pipelines.Test1Pipeline': 300
}
spider2.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from test1.items import Test1Item

class Spider2Spider(CrawlSpider):
    name = 'spider2'
    allowed_domains = ['4399dmw.com']
    start_urls = ['http://www.4399dmw.com/search/dh-1-0-0-0-0-0-0/']
    # the rules decide which links the crawler follows
    rules = (
        Rule(LinkExtractor(allow=r'.+dh-1-0-0-0-0-\d-0\/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        datas_pic = response.xpath("//a[@class='u-card']/img")
        for item in datas_pic:
            pic = "http:" + str(item.xpath("@data-src")[0].extract())
            title = item.xpath("@alt").extract()
            topipeline1 = Test1Item(pic=pic, title=title)
            yield topipeline1
The direction of the crawl is controlled by defining different rules: each Rule describes which links to extract and how to handle the pages behind them.
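The allow pattern can be checked before wiring it into a Rule; extract_links() lists every URL the rule would follow (a quick check run inside scrapy shell on one of the listing pages):

from scrapy.linkextractors import LinkExtractor

le = LinkExtractor(allow=r'.+dh-1-0-0-0-0-\d-0\/')
for link in le.extract_links(response):
    # each result is a Link object; .url is the absolute URL the Rule would request
    print(link.url)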
Adding the synopsis (jianjie)
Add a jianjie field to the item in items.py:
jianjie = scrapy.Field()
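items.py then carries three fields, the earlier two plus the new one:

import scrapy

class Test1Item(scrapy.Item):
    title = scrapy.Field()
    pic = scrapy.Field()
    jianjie = scrapy.Field()   # synopsis text from the detail page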
spider2.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from test1.items import Test1Item

class Spider2Spider(CrawlSpider):
    name = 'spider2'
    allowed_domains = ['4399dmw.com']
    start_urls = ['http://www.4399dmw.com/search/dh-1-0-0-0-0-0-0/']
    # the rules decide where the crawler goes: whether a matched page is followed
    # further and which callback handles it
    rules = (
        Rule(LinkExtractor(allow=r'.+dh-1-0-0-0-0-\d-0\/'), follow=True),
        Rule(LinkExtractor(allow=r'.+\/dh\/.+\/'), callback='parse_detail', follow=False),
    )

    def parse_detail(self, response):
        title = response.xpath("//div[@class='works__main']/h1/text()").extract()[0]
        jianjie = response.xpath("//div[@class='main']/div/p/text()").extract()[0]
        pic = "http:" + str(response.xpath("//div[@class='works__main']//img[@class='works__img']/@data-src")[0].extract())
        topipeline2 = Test1Item(jianjie=jianjie, pic=pic, title=title)
        yield topipeline2
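The crawl is then started from the project root with the standard command:

scrapy crawl spider2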