Storing Scrapy-crawled data in MySQL asynchronously
As an example, we will use a Scrapy crawler to scrape the detail page of every article on Jianshu (jianshu.com):
1. Create a Scrapy project named jbook (scrapy startproject jbook), then, inside the project, generate the CrawlSpider: scrapy genspider -t crawl js jianshu.com
Once the project has been created, it is convenient to add a small launcher script, start.py, in the project root:
from scrapy import cmdline
# command that starts the spider
cmdline.execute('scrapy crawl js'.split())
Next, change a few of the default settings in settings.py:
1. Disable robots.txt compliance:
ROBOTSTXT_OBEY = False
2. Add a set of default request headers so the requests look less like a bot:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
}
That completes the project configuration; now we can write the spider itself.
First define the fields we want to extract, in items.py:
import scrapy


class JbookItem(scrapy.Item):
    title = scrapy.Field()        # title
    content = scrapy.Field()      # article body (HTML)
    article_id = scrapy.Field()   # article ID
    source_url = scrapy.Field()   # source URL
    author = scrapy.Field()       # author
    avatar = scrapy.Field()       # author avatar URL
    pub_time = scrapy.Field()     # publication date
The spider code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from jbook.items import JbookItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        # follow every article URL found on the current page
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath('//div[@class="article"]/h1/text()').get().strip()  # title
        # content = response.xpath('//div[@class="show-content-free"]//text()').getall()
        # cont = ''.join(content).strip()  # plain-text body
        content = response.xpath('//div[@class="show-content-free"]').get()  # body HTML
        url = response.url
        url1 = url.split('?')[0]
        article_id = url1.split('/')[-1]  # article ID
        author = response.xpath('//div[@class="info"]/span[@class="name"]/a/text()').get().strip()  # author
        avatar = response.xpath('//div[@class="author"]/a[@class="avatar"]/img/@src').get()
        avat = 'https:' + avatar.split('?')[0]  # avatar URL
        pub_time = response.xpath('//span[@class="publish-time"]/text()').get().strip()  # publication date
        item = JbookItem(title=title,
                         content=content,
                         article_id=article_id,
                         source_url=url,
                         author=author,
                         avatar=avat,
                         pub_time=pub_time)
        yield item
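Note that .get() returns None whenever an XPath matches nothing (for example if Jianshu changes its page layout), and calling .strip() on None raises an AttributeError inside the callback. A small defensive variant of the extraction, as a sketch of my own rather than part of the original spider:

# Sketch: fall back to an empty string when a node is missing,
# so a layout change yields an empty field instead of crashing parse_detail.
title = (response.xpath('//div[@class="article"]/h1/text()').get() or '').strip()
author = (response.xpath('//div[@class="info"]/span[@class="name"]/a/text()').get() or '').strip()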
The item is then handed to the item pipeline (pipelines.py) for storage:
import pymysql


class JbookPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root1234',
            'database': 'jbook',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)  # database connection
        self.cursor = self.conn.cursor()         # cursor object
        self._sql = None                         # lazily built SQL statement

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (
            item['title'],
            item['content'],
            item['author'],
            item['avatar'],
            item['pub_time'],
            item['article_id'],
            item['source_url'],
        ))
        self.conn.commit()  # commit the INSERT
        return item

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article (id, title, content, author, avatar, pub_time, article_id, source_url)
                values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
The above is the traditional, synchronous way of storing the data: each INSERT and commit runs in the engine's thread and blocks the crawl until MySQL responds.
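The INSERT statement assumes that the jbook database already contains an article table with matching columns. A one-off script to create it could look like the following sketch (the column types and lengths are my own assumptions, not taken from the original project):

import pymysql

# One-off helper: create the article table that the pipeline inserts into.
# Column types and lengths below are assumptions; adjust to your needs.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root1234', database='jbook', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS article (
                id INT AUTO_INCREMENT PRIMARY KEY,
                title VARCHAR(255),
                content LONGTEXT,
                author VARCHAR(64),
                avatar VARCHAR(255),
                pub_time VARCHAR(64),
                article_id VARCHAR(32),
                source_url VARCHAR(255)
            ) DEFAULT CHARSET=utf8
        """)
    conn.commit()
finally:
    conn.close()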
from twisted.enterprise import adbapi
from pymysql import cursors


class JbookTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root1234',
            'database': 'jbook',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # connection pool: Twisted runs the blocking pymysql calls in worker threads
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article (id, title, content, author, avatar, pub_time, article_id, source_url)
                values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql

    def process_item(self, item, spider):
        # run insert_item inside a pooled transaction, without blocking the engine
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)  # log any insertion error
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (
            item['title'],
            item['content'],
            item['author'],
            item['avatar'],
            item['pub_time'],
            item['article_id'],
            item['source_url'],
        ))

    def handle_error(self, error, item, spider):
        print('=' * 20 + 'error' + '=' * 20)
        print(error)
        print('=' * 20 + 'error' + '=' * 20)
The above stores the data asynchronously through Twisted's database connection pool (adbapi): the blocking pymysql calls run in a thread pool, so inserts no longer stall the crawl.
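Hard-coding the connection parameters in the pipeline works, but it is more common to keep them in settings.py and build the pool in a from_crawler classmethod. A sketch of that variant, assuming setting names such as MYSQL_HOST and MYSQL_PASSWORD (these names are my own, not part of the original project); the sql property, process_item, insert_item and handle_error stay exactly the same as above:

from twisted.enterprise import adbapi
from pymysql import cursors


class JbookTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool
        self._sql = None

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        dbparams = {
            'host': settings.get('MYSQL_HOST', '127.0.0.1'),      # assumed setting name
            'port': settings.getint('MYSQL_PORT', 3306),          # assumed setting name
            'user': settings.get('MYSQL_USER', 'root'),           # assumed setting name
            'password': settings.get('MYSQL_PASSWORD', ''),       # assumed setting name
            'database': settings.get('MYSQL_DATABASE', 'jbook'),  # assumed setting name
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor,
        }
        return cls(adbapi.ConnectionPool('pymysql', **dbparams))

    # sql, process_item, insert_item and handle_error: unchanged from the version above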
Finally, enable the pipeline in settings.py:
ITEM_PIPELINES = {
    # 'jbook.pipelines.JbookPipeline': 300,
    'jbook.pipelines.JbookTwistedPipeline': 300,
}
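With the pipeline enabled, the crawl is started by running the start.py script from the beginning of the article. To confirm that rows are actually arriving while the spider runs, a small check script such as this can be used (a sketch reusing the hard-coded credentials from the pipeline; adjust them to your setup):

import pymysql

# Quick sanity check: count the rows stored so far.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root1234', database='jbook', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('SELECT COUNT(*) FROM article')
    print(cursor.fetchone())
conn.close()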