"Data Collection" Lab 4
Assignment ①
- Requirements: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage route to crawl book data from Dangdang.
- Candidate site: http://www.dangdang.com/
- Keyword: chosen freely by the student
- Output:
id | title | author | publisher | date | price | detail |
---|---|---|---|---|---|---|
1 | Python算法图解 | 何韬 | 清华大学出版社 | 2021-04-01 | ¥34.50 | 用到算法。数据结构是算法的基础,数组、字典、堆、栈、链表... |
.. | .. | .. | .. | .. | .. | .. |
Gitee link: Assignment 4-1
1. Parsing the page
- Locate where the information sits in the page
- Extract the list of books with XPath:
lis=selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]")
- Locate each field within a list item
- Extract the fields with XPath:
title=li.xpath("./a[position()=1]/@title").extract_first()
price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
detail = li.xpath("./p[@class='detail']/text()").extract_first()
2. Write the item class in items.py
import scrapy

class BookItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field()
author=scrapy.Field()
date=scrapy.Field()
publisher=scrapy.Field()
detail=scrapy.Field()
price=scrapy.Field()
3. Write the data-processing class in pipelines.py
import sqlite3

class BookPipeline(object):
    def open_spider(self, spider):
        print("opened")
self.con = sqlite3.connect("books.db")
self.cursor = self.con.cursor()
try:
try:
self.cursor.execute("create table books(bTitle varchar(512),bAuthor varchar(256),bPublisher varchar(256),"
"bDate varchar(32),bPrice varchar(16),bDetail varchar(256),"
"constraint pk_books primary key (bTitle,bAuthor));")
except:
self.cursor.execute("delete from books")
self.opened = True
self.count = 1
except Exception as err:
print(err)
self.opened = False
def close_spider(self, spider):
if self.opened:
self.con.commit()
self.con.close()
self.opened=False
print("closed")
print("总共爬取",self.count,"本书籍")
def process_item(self, item, spider):
try:
print(item["title"])
print(item["author"])
print(item["publisher"])
print(item["date"])
print(item["price"])
print(item["detail"])
print()
if self.opened:
self.cursor.execute("insert into books (bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values (?,?,?,?,?,?)",
(item["title"],item["author"],item["publisher"],item["date"],item["price"],item["detail"]))
self.count+=1
except Exception as err:
print(err)
return item
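The assignment specifies MySQL, while the pipeline above writes to SQLite for simplicity. Below is a minimal sketch of a MySQL-backed variant, assuming the pymysql driver; the class name BookMySQLPipeline and the connection parameters (host, user, password, dangdang database) are illustrative placeholders, not part of the original project.
import pymysql

class BookMySQLPipeline(object):
    def open_spider(self, spider):
        # placeholder credentials; replace with your own MySQL settings
        self.con = pymysql.connect(host="127.0.0.1", user="root", password="root",
                                   database="dangdang", charset="utf8mb4")
        self.cursor = self.con.cursor()
        self.cursor.execute("create table if not exists books (bTitle varchar(512), bAuthor varchar(256),"
                            "bPublisher varchar(256), bDate varchar(32), bPrice varchar(16), bDetail varchar(1024))")
        self.count = 0

    def process_item(self, item, spider):
        self.cursor.execute("insert into books (bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values (%s,%s,%s,%s,%s,%s)",
                            (item["title"], item["author"], item["publisher"],
                             item["date"], item["price"], item["detail"]))
        self.count += 1
        return item

    def close_spider(self, spider):
        self.con.commit()
        self.con.close()
        print("Crawled", self.count, "books in total")
To use it instead of the SQLite pipeline, this class would be registered in ITEM_PIPELINES in settings.py.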
4. Write settings.py
BOT_NAME = 'bank'
SPIDER_MODULES = ['bank.spiders']
NEWSPIDER_MODULE = 'bank.spiders'
ITEM_PIPELINES = {'bank.pipelines.BookPipeline': 300,}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'bank (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
5. Write the spider in MySpider.py
import scrapy
from bs4 import UnicodeDammit
# BookItem comes from this project's items.py; adjust the import path to your project layout
from ..items import BookItem

class MySpider(scrapy.Spider):
name = "mySpider"
key = 'python'
source_url='http://search.dangdang.com/'
def start_requests(self):
url = MySpider.source_url+"?key="+MySpider.key
yield scrapy.Request(url=url,callback=self.parse)
def parse(self, response):
try:
dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
data = dammit.unicode_markup
selector=scrapy.Selector(text=data)
            lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
for li in lis:
title=li.xpath("./a[position()=1]/@title").extract_first()
price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
detail = li.xpath("./p[@class='detail']/text()").extract_first()
                # detail is sometimes missing, so the result is None
item=BookItem()
item["title"]=title.strip() if title else ""
item["author"]=author.strip() if author else ""
item["date"] = date.strip()[1:] if date else ""
item["publisher"] = publisher.strip() if publisher else ""
item["price"] = price.strip() if price else ""
item["detail"] = detail.strip() if detail else ""
yield item
            # on the last page, link is None
link=selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
if link:
url=response.urljoin(link)
yield scrapy.Request(url=url, callback=self.parse)
except Exception as err:
print(err)
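With the pipeline registered in ITEM_PIPELINES, the spider can be run from the project root with scrapy crawl mySpider (the name defined above).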
6. Results
- Console output
- Database screenshot
7. Reflections
- This task reproduces the code from the textbook, so no difficulties were encountered.
Assignment ②
- Requirements: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage route to crawl foreign-exchange data.
- Candidate site: China Merchants Bank: http://fx.cmbchina.com/hq/
- Output: MySQL database storage, in the following format
id | currency | tsp | csp | tbp | cbp | time |
---|---|---|---|---|---|---|
1 | 港币 | 82.23 | 82.23 | 81.91 | 81.33 | 10:22:05 |
.. | .. | .. | .. | .. | .. | .. |
Gitee link: Assignment 4-2
1. Parsing the page
- Extract the information with XPath:
currency = selector.xpath('//div[@id="realRateInfo"]//td[1]/text()').extract()
tsp = selector.xpath('//div[@id="realRateInfo"]//td[4]/text()').extract()
csp = selector.xpath('//div[@id="realRateInfo"]//td[5]/text()').extract()
tbp = selector.xpath('//div[@id="realRateInfo"]//td[6]/text()').extract()
cbp = selector.xpath('//div[@id="realRateInfo"]//td[7]/text()').extract()
time = selector.xpath('//div[@id="realRateInfo"]//td[8]/text()').extract()
2. Write the item class in items.py
import scrapy

class BankItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
id = scrapy.Field()
currency = scrapy.Field()
tsp = scrapy.Field()
csp = scrapy.Field()
tbp = scrapy.Field()
cbp = scrapy.Field()
time = scrapy.Field()
3. Write the data-processing class in pipelines.py
import sqlite3

class BankPipeline:
    def open_spider(self, spider):
        print("opened")
self.con = sqlite3.connect("bank.db")
self.cursor = self.con.cursor()
try:
try:
self.cursor.execute("create table bank (bId varchar(4),bCurrency varchar(16),bTsp varchar(8),bCsp varchar(8),bTbp varchar(8),"
"bCbp varchar(8),bTime varchar(32),"constraint pk_bank primary key (bId,bCurrency));")
except:
self.cursor.execute("delete from bank")
self.opened = True
self.count = 1
except Exception as err:
print(err)
self.opened = False
def close_spider(self, spider):
try:
if self.opened:
self.con.commit()
self.con.close()
self.opened = False
except Exception as err:
print(err)
print("closed")
print("总共爬取", self.count - 1, "项信息")
def process_item(self, item, spider):
try:
print(self.count,item['currency'],item['tsp'],item['csp'],item['tbp'],item['cbp'], item['time'])
if self.opened:
self.cursor.execute("insert into bank (bId,bCurrency,bTsp,bCsp,bTbp,bCbp,bTime) values(?,?,?,?,?,?,?)",
(self.count,item['currency'],item['tsp'],item['csp'],item['tbp'],item['cbp'], item['time']))
self.count += 1
except Exception as err:
print(err)
return item
4. Write the spider in MySpider.py
import scrapy
from bs4 import UnicodeDammit
# BankItem comes from this project's items.py; adjust the import path to your project layout
from ..items import BankItem

class MySpider(scrapy.Spider):
    # subclass of scrapy.Spider
name = "bankSpider"
source_url = "http://fx.cmbchina.com/hq/"
page = 0
count = 1
def start_requests(self):
url = MySpider.source_url
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
try:
try:
dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
data = dammit.unicode_markup
except Exception as err:
print(err)
selector = scrapy.Selector(text=data)
currency = selector.xpath('//div[@id="realRateInfo"]//td[1]/text()').extract()
            tsp = selector.xpath('//div[@id="realRateInfo"]//td[4]/text()').extract()
            csp = selector.xpath('//div[@id="realRateInfo"]//td[5]/text()').extract()
            tbp = selector.xpath('//div[@id="realRateInfo"]//td[6]/text()').extract()
            cbp = selector.xpath('//div[@id="realRateInfo"]//td[7]/text()').extract()
            time = selector.xpath('//div[@id="realRateInfo"]//td[8]/text()').extract()
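            # the first extracted cell belongs to the table's header row, so start from index 1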
for i in range(1,len(currency)):
item = BankItem()
item["currency"] = currency[i].strip() if currency[i] else ""
item["tsp"] = tsp[i].strip() if tsp[i] else ""
item["csp"] = csp[i].strip() if csp[i] else ""
item["tbp"] = tbp[i].strip() if tbp[i] else ""
item["cbp"] = cbp[i].strip() if cbp[i] else ""
item["time"] = time[i].strip() if time[i] else ""
yield item
MySpider.page += 1
print("MySpider.page:", MySpider.page)
except Exception as err:
print(err)
5. Results
- Console output
- Database screenshot
6. Reflections
- I am becoming more comfortable with the structure of a Scrapy crawler;
- This task does not require pagination, so it was relatively easy and no difficulties were encountered.
Assignment ③
- Requirements: Become proficient with Selenium for locating HTML elements, crawling Ajax-loaded pages, and waiting for HTML elements; use the Selenium framework + MySQL storage route to crawl stock data from the "沪深A股", "上证A股" and "深证A股" boards.
- Candidate site: Eastmoney (东方财富网)
- Output: MySQL database storage, in the following format
No. | Code | Name | Latest price | Change % | Change | Volume | Turnover | Amplitude | High | Low | Open | Prev. close |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 688093 | N世华 | 28.47 | 62.22% | 10.92 | 26.13万 | 7.6亿 | 22.3% | 32.0 | 28.08 | 30.2 | 17.55 |
2 | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Gitee link: Assignment 4-3
1. Parsing the page
- Switching boards
lis = driver.find_elements(By.XPATH, '//div[@id="tab"]/ul/li')
tab = lis[i].find_element(By.XPATH,'.//a')
driver.execute_script("arguments[0].click();", tab)
- Pagination
time.sleep(3)
input = driver.find_element(By.XPATH, '//div[@class="dataTables_wrapper"]//input')
input.clear()
input.send_keys(page_number)  # page_number: the page to jump to
go = driver.find_element(By.XPATH,'//div[@class="dataTables_wrapper"]//a[@class="paginte_go"]')
go.click()
- Locating the data fields
2. Database operations
class StocksDB:
def openDB(self):
def closeDB(self):
def insert(self, tab,id, num, name, new, up, upprice, com, comprice, f, max,min,today,yes):
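Only the interface of StocksDB is listed above; the following is a minimal sketch of one possible implementation, assuming SQLite storage as in the pipelines of the previous assignments (the table and column names are illustrative).
import sqlite3

class StocksDB:
    def openDB(self):
        self.con = sqlite3.connect("stocks.db")
        self.cursor = self.con.cursor()
        # one table for all three boards; tab records which board a row came from
        self.cursor.execute("create table if not exists stocks ("
                            "sTab varchar(16), sId varchar(8), sNum varchar(16), sName varchar(32),"
                            "sNew varchar(16), sUp varchar(16), sUpprice varchar(16), sCom varchar(16),"
                            "sComprice varchar(16), sF varchar(16), sMax varchar(16), sMin varchar(16),"
                            "sToday varchar(16), sYes varchar(16))")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, tab, id, num, name, new, up, upprice, com, comprice, f, max, min, today, yes):
        self.cursor.execute("insert into stocks values (?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                            (tab, id, num, name, new, up, upprice, com, comprice, f, max, min, today, yes))
The get_info() function in step 5 calls db.insert() with exactly these arguments, so this sketch would slot in behind it.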
3. Load the page: login()
def login():
    # open the stock listing page
driver.get('http://quote.eastmoney.com/center/gridlist.html#hs_a_board')
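The snippets in this assignment use a driver, the By class and a db object without showing how they are created; a minimal setup sketch follows (Chrome is an assumption, any browser with a matching driver works).
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# one shared browser instance used by login(), drop_scroll(), get_info() and to_searchs()
driver = webdriver.Chrome()
# the database wrapper from step 2
db = StocksDB()
db.openDB()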
4. Scroll down the page
def drop_scroll():
for x in range(1, 11, 2):
time.sleep(0.5)
        # scroll position as a fraction of the page height
j = x/10
js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j
        # run the JS above
driver.execute_script(js)
5. Extract the data: get_info()
def get_info(tab):
    # after inspecting the page, the data sits in the table rows (tr elements)
    trs = driver.find_elements(By.XPATH, '//tbody/tr')
    # walk through each tr and pull out the fields
    for tr in trs:
        # serial number
        id = tr.find_element(By.XPATH, './/td[1]').text
        # stock code
        num = tr.find_element(By.XPATH, './/td[2]/a').text
        # stock name
        name = tr.find_element(By.XPATH, './/td[3]/a').text
        # latest price
        new = tr.find_element(By.XPATH, './/td[5]/span').text
        # change percentage
        up = tr.find_element(By.XPATH, './/td[6]/span').text
        # change amount
        upprice = tr.find_element(By.XPATH, './/td[7]/span').text
        # volume
        com = tr.find_element(By.XPATH, './/td[8]').text
        # turnover
        comprice = tr.find_element(By.XPATH, './/td[9]').text
        # amplitude
        f = tr.find_element(By.XPATH, './/td[10]').text
        # high
        max = tr.find_element(By.XPATH, './/td[11]/span').text
        # low
        min = tr.find_element(By.XPATH, './/td[12]/span').text
        # open
        today = tr.find_element(By.XPATH, './/td[13]/span').text
        # previous close
        yes = tr.find_element(By.XPATH, './/td[14]').text
        # insert one record into the database
        print(tab, id, num, name, new, up, upprice, com, comprice, f, max, min, today, yes)
        db.insert(tab, id, num, name, new, up, upprice, com, comprice, f, max, min, today, yes)
        print(str(id) + ' has been inserted')
6. Switching boards and pages
def to_searchs():
    lis = driver.find_elements(By.XPATH, '//div[@id="tab"]/ul/li')
    # crawl the three boards
    for i in range(3):
        time.sleep(3)
        # re-locate the tabs each time to avoid stale element references
        lis = driver.find_elements(By.XPATH, '//div[@id="tab"]/ul/li')
        tab = lis[i].find_element(By.XPATH, './/a')
        print(tab.text)
        text = tab.text
        driver.execute_script("arguments[0].click();", tab)
        # crawl three pages per board
        for page in range(3):
            # wait for the new page to load, otherwise "element is not attached to the page document"
            time.sleep(3)
            # locate the page-number input box
            input = driver.find_element(By.XPATH, '//div[@class="dataTables_wrapper"]//input')
            # clear the input box
            input.clear()
            input.send_keys(page + 1)
            # locate the "Go" button
            go = driver.find_element(By.XPATH, '//div[@class="dataTables_wrapper"]//a[@class="paginte_go"]')
            go.click()
            # scroll down so the page data is fully loaded
            drop_scroll()
            # extract the data
            get_info(text)
7. Results
- Console output
- Database screenshot
8. Reflections
- This task made me more familiar with using the Selenium framework;
- When switching pages, an "element is not attached to the page document" error appears because the new page has not finished loading right after entering it; adding time.sleep(3) solves the problem;
- When switching boards, tab.click() raises an "element click intercepted" error, so I use driver.execute_script("arguments[0].click();", tab) instead.
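As an alternative to the fixed time.sleep(3) delays, Selenium's explicit waits can pause only until the table rows are actually present again. A minimal sketch, reusing the //tbody/tr locator from get_info() (the helper name wait_for_rows is hypothetical):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_rows(driver, timeout=10):
    # block until at least one data row is attached to the freshly rendered page
    WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.XPATH, '//tbody/tr')))
Calling wait_for_rows(driver) after go.click() or after switching a board would replace the sleeps and continue as soon as the table is re-rendered.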