Scraping Chinese Classical Poetry: Data Acquisition
Data Acquisition
Source: the Xungushici site (寻古诗词网)
From its poetry section we scraped the poems of five dynasties, Tang, Song, Yuan, Ming, and Qing, to build the poetry knowledge graph.
Poems are scraped dynasty by dynasty and page by page, and the pagination follows an obvious pattern.
The first page of Tang poems has this URL: https://www.xungushici.com/shicis/cd-tang-p-1
Second page: https://www.xungushici.com/shicis/cd-tang-p-2
Given this pattern, the pages can be walked one by one.
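For illustration, here is a minimal sketch that enumerates listing-page URLs for all five dynasties, assuming the same cd-<dynasty>-p-<n> pattern holds throughout; the page counts below are placeholders, not the site's real totals (the real scripts further down hardcode their own ranges):

# Hypothetical helper: enumerate listing-page URLs per dynasty.
# The dynasty slugs match the site's URL scheme shown above;
# the page counts are placeholders for illustration only.
BASE = 'https://www.xungushici.com/shicis/cd-%s-p-%d'
PAGES = {'tang': 3, 'song': 3, 'yuan': 3, 'ming': 3, 'qing': 3}

def page_urls():
    for dynasty, n in PAGES.items():
        for p in range(1, n + 1):
            yield BASE % (dynasty, p)

for u in page_urls():
    print(u)  # e.g. https://www.xungushici.com/shicis/cd-tang-p-1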
The fields to collect are: poem title, dynasty, author, poem text, category, translation, annotations, appreciation, and creation background.
poem.py collects the poem title, dynasty, author, poem text, translation, appreciation, and creation background:
import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers
pom_list = []
k = 1
for i in range(10001, 10654):  # listing pages of Qing-dynasty poems
    url = 'https://www.xungushici.com/shicis/cd-qing-p-' + str(i)
    r = requests.get(url, headers=headers)
    content = r.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')
    hed = soup.find('div', class_='col col-sm-12 col-lg-9')
    card_list = hed.find_all('div', class_='card mt-3')
    for it in card_list:
        content = {}
        # 1.1 every poem card on the listing page links to a detail page
        href = it.find('h4', class_='card-title').a['href']
        real_href = 'https://www.xungushici.com' + href
        title = it.find('h4', class_='card-title').a.text
        # 2.1 fetch the poem's detail page
        get = requests.get(real_href).text
        selector = etree.HTML(get)
        # 2.2 title
        xtitle = selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/h3/text()')[0]
        # 2.3 dynasty
        desty = selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/p/a/text()')[0]
        # 2.4 author (some pages use a <span>, others a second <a>)
        if len(selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/p/span/text()')) == 0:
            author = selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/p/a[2]/text()')[0]
        else:
            author = selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/p/span/text()')[0]
        # 2.5 poem text (bare text or wrapped in <p>, depending on the page)
        ans = ""
        if len(selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/div[1]/p/text()')) == 0:
            artical = selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/div[1]/text()')
        else:
            artical = selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/div[1]/p/text()')
        for line in artical:
            ans = ans + line.replace("\r", "").replace("\t", "").replace("\n", "")
        # 2.6 translation: consecutive <p> blocks, stop at the first empty one
        trans = ""
        for j in range(2, 8):
            path = '/html/body/div[1]/div/div[1]/div[2]/div[2]/p[%d]' % j
            if selector.xpath(path + '/text()') == []:
                break
            for line in selector.xpath(path + '/text()'):
                trans = trans + line + "\n"
        # 2.7 appreciation
        appear = ""
        for j in range(1, 19):
            path = '/html/body/div[1]/div/div[1]/div[3]/div[2]/p[%d]' % j
            if selector.xpath(path + '/text()') == []:
                break
            for line in selector.xpath(path + '/text()'):
                appear = appear + line + "\n"
        # 2.8 creation background
        text_back = ""
        for line in selector.xpath('/html/body/div[1]/div/div[1]/div[4]/div[2]/p/text()'):
            text_back = text_back + line + "\n"
        content['title'] = xtitle
        content['desty'] = desty
        content['author'] = author
        content['content'] = ans
        content['trans_content'] = trans
        content['appear'] = appear
        content['background'] = text_back
        pom_list.append(content)
        print("poem #" + str(k))
        k = k + 1

import xlwt

xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
sheet1.write(0, 0, 'title')
sheet1.write(0, 1, 'desty')
sheet1.write(0, 2, 'author')
sheet1.write(0, 3, 'content')
sheet1.write(0, 4, 'trans_content')
sheet1.write(0, 5, 'appear')
sheet1.write(0, 6, 'background')
for i in range(0, len(pom_list)):
    sheet1.write(i + 1, 0, pom_list[i]['title'])
    sheet1.write(i + 1, 1, pom_list[i]['desty'])
    sheet1.write(i + 1, 2, pom_list[i]['author'])
    sheet1.write(i + 1, 3, pom_list[i]['content'])
    sheet1.write(i + 1, 4, pom_list[i]['trans_content'])
    sheet1.write(i + 1, 5, pom_list[i]['appear'])
    sheet1.write(i + 1, 6, pom_list[i]['background'])
# note: xlwt writes the legacy .xls format even though the file is named .xlsx
xl.save("qing3.xlsx")
The category and annotation fields were left out of this first pass and had to be re-scraped later:
import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers
pom_list = []
k = 1
for i in range(1, 1000):  # listing pages of Yuan-dynasty poems
    url = 'https://www.xungushici.com/shicis/cd-yuan-p-' + str(i)
    r = requests.get(url, headers=headers)
    content = r.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')
    hed = soup.find('div', class_='col col-sm-12 col-lg-9')
    card_list = hed.find_all('div', class_='card mt-3')
    for it in card_list:
        content = {}
        # 1.1 every poem card links to a detail page
        href = it.find('h4', class_='card-title').a['href']
        real_href = 'https://www.xungushici.com' + href
        title = it.find('h4', class_='card-title').a.text
        print(title)
        # 2.1 fetch the detail page
        r2 = requests.get(real_href, headers=headers)
        content2 = r2.content.decode('utf-8')
        soup2 = BeautifulSoup(content2, 'html.parser')
        zhu = ""
        # find() returns None (not []) when nothing matches, so test for None
        card_div = soup2.find('div', class_='card mt-3')
        if card_div is None or card_div.find('div', class_='card-body') is None:
            content['title'] = title
            content['zhu'] = "无"  # "无" = missing-value sentinel used throughout
            pom_list.append(content)
            print("poem #" + str(k))
            k = k + 1
            continue
        card_body = card_div.find('div', class_='card-body')
        p_list = card_body.find_all('p')
        # everything after the <strong>注释</strong> marker is the annotation
        flag = 1
        for p in p_list:
            if str(p).find('strong') != -1 and p.find('strong').text == '注释':
                flag = 0
                continue
            if flag == 0:
                zhu = zhu + str(p)
        if len(zhu) == 0:
            zhu = "无"
        content['title'] = title
        content['zhu'] = zhu
        pom_list.append(content)
        print("poem #" + str(k))
        k = k + 1

import xlwt

xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
sheet1.write(0, 0, 'title')
sheet1.write(0, 12, 'zhu')  # column 12, so the sheet can be merged into the main table later
for i in range(0, len(pom_list)):
    sheet1.write(i + 1, 0, pom_list[i]['title'])
    sheet1.write(i + 1, 12, pom_list[i]['zhu'])
# xlwt writes the legacy .xls format even though the file is named .xlsx
xl.save("yuan.xlsx")
Obtaining the poem form, composition date, cipai (词牌名), and qupai (曲牌名)
poem_data.py extracts the composition date: a CRF lexical analyzer segments the creation-background and appreciation texts, picks out time and number words, and filters them.
# Extract the concrete composition date of a poem from its appreciation and background texts
import re
from pyhanlp import *
import pandas as pd

# HanLP part-of-speech tags: person "nr", place "ns", organization "nt";
# here we keep time words ("t") and number words ("m")
def demo_CRF_lexical_analyzer(text):
    CRFnewSegment = HanLP.newSegment("crf")
    term_list = CRFnewSegment.seg(text)
    ans = []
    for it in term_list:
        if str(it.nature) == 't' or str(it.nature) == 'm':
            ans.append(str(it.word))
    return ans

from xlrd import open_workbook
from xlutils.copy import copy

# write the extraction results back into the original excel file
def write_to(data, file):
    print(len(data))
    xl = open_workbook(file)
    excel = copy(xl)
    sheet1 = excel.get_sheet(0)
    sheet1.write(0, 9, "data")
    for i in range(0, len(data)):
        sheet1.write(i + 1, 9, data[i])
    excel.save(file)

# list the excel files under a folder
import os
def get_filename(path, filetype):  # path and file extension, e.g. '.xlsx'
    name = []
    for root, dirs, files in os.walk(path):
        for i in files:
            if os.path.splitext(i)[1] == filetype:
                name.append(i)
    return name  # file names including the extension

if __name__ == '__main__':
    file = 'data/'
    file_list = get_filename(file, '.xlsx')
    for it in file_list:
        newfile = file + it
        pome_time = []
        print("processing " + str(newfile))
        data = pd.read_excel(newfile).fillna("无")  # "无" = missing-value sentinel
        appear = data.appear
        back = data.background
        maxn = min(len(appear), 5000)  # cap at 5000 rows per file
        for i in range(maxn):
            print("row " + str(i + 1) + ":")
            app = appear[i]
            bk = back[i]
            if app == "无" and bk == "无":
                pome_time.append("无")
                continue
            app_time = demo_CRF_lexical_analyzer(app)  # candidates from the appreciation
            bk_time = demo_CRF_lexical_analyzer(bk)    # candidates from the background
            # prefer the first digit-bearing token from the background, then the appreciation
            f = False
            for token in bk_time:
                if re.search(r'\d', token):
                    print(token)
                    pome_time.append(token)
                    f = True
                    break
            if not f:
                for token in app_time:
                    if re.search(r'\d', token):
                        print(token)
                        pome_time.append(token)
                        f = True
                        break
            if not f:
                pome_time.append("无")
        write_to(pome_time, newfile)
Analysis of poem form, cipai, and qupai
Poem forms: seven-character regulated verse (七言律诗), seven-character quatrain (七言绝句), seven-character (七言), five-character regulated verse (五言律诗), five-character quatrain (五言绝句), and five-character (五言).
The cipai and qupai names are scraped from the pages described later; matching them against the poem titles then yields each poem's tune name.
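The post does not show the form-classification code itself; a plausible minimal sketch (the helper classify_form and its punctuation-based splitting are assumptions, not the author's code) counts characters per line and the number of lines:

# Hypothetical sketch of the form classification:
# a quatrain (绝句) has 4 lines, a regulated verse (律诗) has 8,
# with 5 or 7 characters per line.
import re

def classify_form(content):
    lines = [s for s in re.split(r'[,。?!]', content) if s]
    if not lines:
        return "无"
    n = len(lines[0])  # characters per line
    if any(len(s) != n for s in lines) or n not in (5, 7):
        return "无"    # irregular: not a plain 五言/七言 form
    prefix = "五言" if n == 5 else "七言"
    if len(lines) == 4:
        return prefix + "绝句"
    if len(lines) == 8:
        return prefix + "律诗"
    return prefix

print(classify_form("床前明月光,疑是地上霜。举头望明月,低头思故乡。"))  # 五言绝句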
Final tally:
480,122 poems in total
Tang: 48,330; Song: 200,000; Yuan: 39,240; Ming: 100,000; Qing: 92,550
Poet data
The poet names for each dynasty are collected from the poets' personal biography pages, and the following fields are extracted from the biography text:
# Extract from each poet's biography: birth year, death year, courtesy name (字), art name (号)
import pandas as pd
import re
from xlrd import open_workbook
from xlutils.copy import copy

def read_author():
    file = "data2/author.xlsx"
    data = pd.read_excel(file).fillna("无")
    produce = list(data.produce)  # biography text
    i = 1
    bg = []         # birth years
    ed = []         # death years
    zi = []         # courtesy names (字)
    hao = []        # art names (号)
    pome_self = []  # raw biographies
    num = list(data.num)  # number of poems per poet
    for it in produce:
        pome_self.append(it)
        print("poet #" + str(i) + ":")
        # birth/death years: the first number is taken as the birth year; the death
        # year is the first later number with at least as many digits and more
        # than 15 years after it
        datas = re.findall(r"\d+", it)
        if len(datas) >= 2:
            bg.append(datas[0] + "年")
            flag = False
            for j in range(1, len(datas)):
                if len(datas[j]) >= len(datas[0]) and int(datas[j]) - int(datas[0]) > 15:
                    ed.append(datas[j] + "年")
                    flag = True
                    break
            if flag == False:
                ed.append("无")
        else:
            bg.append("无")
            ed.append("无")
        # courtesy name: the text between "字" and the next comma or period
        # (inside [...] characters are already alternatives, so [,。] not [,|。])
        ztext = re.findall(r".*字(.*?)[,。]", it)
        if len(ztext) != 0:
            zi.append(",".join(ztext))  # join the matches: xlwt cannot write a list
        else:
            zi.append("无")
        # art name: same pattern with "号"
        htext = str(re.findall(r".*号(.*?)[,。]", it)).replace('“', '').replace('”', '').replace('[', '').replace(']', '').replace('\'', '')
        if len(htext) != 0:
            hao.append(htext)
        else:
            hao.append("无")
        i = i + 1
    # write the four new columns back into the same file
    xl = open_workbook(file)
    excel = copy(xl)
    sheet1 = excel.get_sheet(0)
    sheet1.write(0, 6, "begin_time")
    sheet1.write(0, 7, "end_time")
    sheet1.write(0, 8, "zi")
    sheet1.write(0, 9, "hao")
    for i in range(0, len(bg)):
        sheet1.write(i + 1, 6, bg[i])
        sheet1.write(i + 1, 7, ed[i])
        sheet1.write(i + 1, 8, zi[i])
        sheet1.write(i + 1, 9, hao[i])
    excel.save(file)

if __name__ == '__main__':
    read_author()
Results of the poet-friend extraction:
Poet information
Collecting all collective appellations (合称) of poets:
import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers
hc = []
url = 'https://www.xungushici.com/authors'
r = requests.get(url, headers=headers)
content = r.content.decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
orign_href = 'https://www.xungushici.com'
hecheng = soup.find('div', id='divHeCheng')
li_list = hecheng.find_all('li', class_='m-1 badge badge-light')
dic = {}
for i in range(1, len(li_list)):  # start at 1, skipping the first list item
    href = orign_href + li_list[i].a['href']
    hc_name = li_list[i].a.text
    hc.append(hc_name)
    r2 = requests.get(href, headers=headers)
    content2 = r2.content.decode('utf-8')
    soup2 = BeautifulSoup(content2, 'html.parser')
    pomdiv = soup2.find('div', class_='col col-sm-12 col-lg-9')
    card = pomdiv.find_all('div', class_='card mt-3')
    author_list = []
    for it in card:
        h4 = it.find('h4', class_='card-title')
        list_a = h4.find_all('a')
        desty = list_a[0].text   # dynasty
        author = list_a[1].text  # poet name
        author_list.append(author)
    dic[hc_name] = ",".join(author_list)

import xlwt

xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
sheet1.write(0, 0, "hc")
sheet1.write(0, 1, 'author')
for i in range(0, len(hc)):
    sheet1.write(i + 1, 0, hc[i])
    sheet1.write(i + 1, 1, dic[hc[i]])
xl.save("common_name.xlsx")  # xlwt writes legacy .xls despite the name
for it in hc:
    print(it + ": " + dic[it])
This sheet is then read back and matched against the poet table to add a collective-appellation (合称) column; a possible merge is sketched below.
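The post does not show the merge code; a minimal sketch under two assumptions, namely that the poet sheet data2/author.xlsx has a name column and that both files are readable as real .xlsx:

import pandas as pd

# Hypothetical merge sketch: map each poet to their collective appellation.
# common_name.xlsx is the sheet written above, columns hc and author ("a,b,c").
hc = pd.read_excel("common_name.xlsx")
authors = pd.read_excel("data2/author.xlsx")

# invert hc -> "a,b,c" into author -> hc
author2hc = {}
for _, row in hc.iterrows():
    for name in str(row["author"]).split(","):
        author2hc[name] = row["hc"]

authors["hc"] = authors["name"].map(author2hc).fillna("无")  # "无" = no 合称
authors.to_excel("data2/author.xlsx", index=False)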
Poet trajectory information
Data source: the site's cipai listing pages (https://www.xungushici.com/cipais):
import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers
cipai = []
for i in range(1, 7):  # the cipai list spans six pages
    url = 'https://www.xungushici.com/cipais/p' + str(i)
    r = requests.get(url, headers=headers)
    content = r.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')
    hed = soup.find('ul', class_='list-unstyled d-flex flex-row flex-wrap align-items-center w-100')
    li_list = hed.find_all('li', class_='m-1 badge badge-light')
    for it in li_list:
        if it.a is not None:
            cipai.append(it.a.text)

import xlwt

xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
sheet1.write(0, 0, "title")
for i in range(0, len(cipai)):
    sheet1.write(i + 1, 0, cipai[i])
xl.save("cipai_name.xlsx")
Sample output:
Extracting qupai names from the Yuan poem titles
import pandas as pd
import xlwt

# read the Yuan-dynasty poems and derive qupai names from their titles
def read(file):
    data = pd.read_excel(file)
    title = data.title
    qu_list = []  # collected qupai (tune) names
    for it in title:
        if it.find('·') != -1:
            # the part of the title before the '·' is the qupai name
            qu = it.split('·')
            qu_list.append(qu[0])
    new_qu = list(set(qu_list))  # deduplicate
    # save the qupai names
    xl = xlwt.Workbook()
    sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
    sheet1.write(0, 0, "qu_name")
    for i in range(0, len(new_qu)):
        sheet1.write(i + 1, 0, new_qu[i])
    xl.save("qupai_name.xlsx")
Sample output:
Feihualing (飞花令)
We iterate over the 500,000 scraped poems and check each sentence for the feihualing keywords; every hit is stored with its sentence, author, poem title, and keyword.
Difficulty: xlwt can store at most 65,536 rows per sheet, while openpyxl (true .xlsx) supports about a million rows (1,048,576). Since there are far more matching sentences than 65,536, openpyxl is used for storage.
import pandas as pd
import openpyxl

# read the feihualing keywords
def read_word():
    data = pd.read_excel('data2/word.xlsx')
    words = data.word
    return words

# walk every sentence of every poem and record the keyword hits
def read(file, words, write_file):
    data = pd.read_excel(file)
    title = data.title
    content = data.content
    author = data.author
    ans_sentens = []
    ans_author = []
    ans_title = []
    ans_key = []
    for i in range(len(title)):
        print("poem #" + str(i))
        cont = content[i]
        aut = author[i]
        tit = title[i]
        # split the poem into sentences on the full stop "。"
        sents = cont.replace('\n', '').split('。')
        for it in sents:
            key_list = [k for k in words if it.find(k) != -1]
            if len(key_list) != 0:
                ans_sentens.append(it)
                ans_author.append(aut)
                ans_title.append(tit)
                ans_key.append(",".join(key_list))
    # store sentence, author, title and keywords via openpyxl (true .xlsx, ~1M-row limit)
    xl = openpyxl.Workbook()
    sheet1 = xl.create_sheet(index=0)
    sheet1.cell(1, 1, "sentens")
    sheet1.cell(1, 2, "author")
    sheet1.cell(1, 3, "title")
    sheet1.cell(1, 4, "keys")
    for i in range(0, len(ans_key)):
        sheet1.cell(i + 2, 1, ans_sentens[i])
        sheet1.cell(i + 2, 2, ans_author[i])
        sheet1.cell(i + 2, 3, ans_title[i])
        sheet1.cell(i + 2, 4, ans_key[i])
    xl.save(write_file)
    print("saved to " + write_file)

# list the excel files under a folder
import os
def get_filename(path, filetype):  # path and file extension, e.g. '.xlsx'
    name = []
    for root, dirs, files in os.walk(path):
        for i in files:
            if os.path.splitext(i)[1] == filetype:
                name.append(i)
    return name

if __name__ == '__main__':
    file = 'data/'
    words = read_word()
    file_list = get_filename(file, '.xlsx')
    for i in range(len(file_list)):
        new_file = file + file_list[i]
        print(new_file)
        sentences_file = "sentences/sentence" + str(i + 1) + ".xlsx"
        read(new_file, words, sentences_file)
Sample output:
Summary
The data collected above is the foundation of the Chinese classical poetry knowledge graph: from it we build the entity nodes, then the relations between entities, and store everything in a Neo4j database (the data can also be imported into MySQL first).
Neo4j is well suited to the later front-end pages, since it excels at displaying relations between entities and at backing a question-answering system. One possible import path is sketched below.
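As a minimal py2neo sketch of that import step (the connection settings, node labels, and WROTE relationship are illustrative assumptions, not from the original post):

from py2neo import Graph, Node, Relationship

# assumed local Neo4j instance; adjust the URI and credentials
graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))

# illustrative labels and properties built from the scraped fields
poet = Node("Poet", name="李白", desty="唐")
poem = Node("Poem", title="静夜思", content="床前明月光,疑是地上霜。举头望明月,低头思故乡。")
graph.merge(poet, "Poet", "name")    # merge avoids duplicate nodes on re-runs
graph.merge(poem, "Poem", "title")
graph.create(Relationship(poet, "WROTE", poem))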
Source: https://www.cnblogs.com/xiaofengzai/p/15753530.html