Crawling Chinese Classical Poetry Knowledge: Data Acquisition


Data Acquisition

Data source: the Xungushici poetry site (寻古诗词网).

To build the poetry knowledge graph, poems from the five dynasties Tang, Song, Yuan, Ming, and Qing were crawled from the site's poetry section.

Crawling is done dynasty by dynasty and page by page; the pagination follows an obvious pattern.

The first page of Tang poems has the following URL: https://www.xungushici.com/shicis/cd-tang-p-1
The second page: https://www.xungushici.com/shicis/cd-tang-p-2

Given this pattern, every page can be crawled in turn by incrementing the page number.
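For illustration, a small helper (not part of the original scripts) can generate the listing-page URLs for any dynasty from this pattern; the dynasty key and page count passed in are placeholders:

def listing_urls(dynasty, pages):
    # build https://www.xungushici.com/shicis/cd-<dynasty>-p-<page> for pages 1..pages
    return ['https://www.xungushici.com/shicis/cd-%s-p-%d' % (dynasty, p)
            for p in range(1, pages + 1)]

for url in listing_urls('tang', 3):
    print(url)
# https://www.xungushici.com/shicis/cd-tang-p-1
# https://www.xungushici.com/shicis/cd-tang-p-2
# https://www.xungushici.com/shicis/cd-tang-p-3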

The data to collect includes: poem title, dynasty, author, poem text, category, translation, annotations, appreciation, and creation background.

poem.py obtains the poem title, dynasty, author, poem text, translation, appreciation, and creation background:

import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers
pom_list=[]
k=1
for i in range(10001,10654):
    # listing pages for the Qing dynasty
    url='https://www.xungushici.com/shicis/cd-qing-p-'+str(i)
    r=requests.get(url,headers=headers)
    content=r.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')

    hed=soup.find('div',class_='col col-sm-12 col-lg-9')
    list=hed.find_all('div',class_="card mt-3")
    # print(len(list))

    for it in list:
        content = {}
        # 1.1 Get each poem entry on this listing page
        href=it.find('h4',class_='card-title').a['href']
        real_href='https://www.xungushici.com'+href
        title=it.find('h4',class_='card-title').a.text
        # 2.1 Crawl the poem's detail page
        get = requests.get(real_href, headers=headers).text
        selector = etree.HTML(get)
        # 2.2 Title
        xtitle=selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/h3/text()')[0]
        # 2.3 Dynasty
        desty=selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/p/a/text()')[0]
        # 2.4 Author
        if len(selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/p/span/text()'))==0:
            author=selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/p/a[2]/text()')[0]
        else:
            author =selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/p/span/text()')[0]
        # 2.5 Poem text
        ans=""
        if len(selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/div[1]/p/text()'))==0:
            artical=selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/div[1]/text()')
            for it in artical:
                ans=ans+it.replace("\r","").replace("\t","").replace("\n","")
        else:
            artical = selector.xpath('/html/body/div[1]/div/div[1]/div[1]/div/div[1]/p/text()')
            for it in artical:
                ans=ans+it.replace("\r","").replace("\t","").replace("\n","")
        # 2.6 Translation
        trans=""
        for j in range(2,8):
            path='/html/body/div[1]/div/div[1]/div[2]/div[2]/p[%d]'%j
            if selector.xpath(path+'/text()')==[]:
                break
            else:
                translist=selector.xpath(path+'/text()')
                for it in translist:
                    trans = trans + it + "\n"
        # 2.7 Appreciation
        appear=""
        for j in range(1,19):
            path='/html/body/div[1]/div/div[1]/div[3]/div[2]/p[%d]'%j
            if selector.xpath(path+'/text()')==[]:
                break
            else:
                apperlist=selector.xpath(path+'/text()')
                for it in apperlist:
                    appear = appear + it + "\n"
        # 2.8 Creation background
        background=selector.xpath('/html/body/div[1]/div/div[1]/div[4]/div[2]/p/text()')
        text_back=""
        if background!=[]:
            for it in background:
                text_back=text_back+it+"\n"
        content['title']=xtitle
        content['desty']=desty
        content['author']=author
        content['content']=ans
        content['trans_content']=trans
        content['appear']=appear
        content['background']=text_back
        pom_list.append(content)
        print(""+str(k)+"")
        k=k+1

import xlwt

xl = xlwt.Workbook()
# add a worksheet to the workbook
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

sheet1.write(0,0,"title")
sheet1.write(0,1,'desty')
sheet1.write(0,2,'author')
sheet1.write(0,3,'content')
sheet1.write(0,4,'trans_content')
sheet1.write(0,5,'appear')
sheet1.write(0,6,'background')

for i in range(0,len(pom_list)):
    sheet1.write(i+1,0,pom_list[i]['title'])
    sheet1.write(i+1, 1, pom_list[i]['desty'])
    sheet1.write(i+1, 2, pom_list[i]['author'])
    sheet1.write(i+1, 3, pom_list[i]['content'])
    sheet1.write(i+1, 4, pom_list[i]['trans_content'])
    sheet1.write(i+1, 5, pom_list[i]['appear'])
    sheet1.write(i+1, 6, pom_list[i]['background'])
xl.save("qing3.xlsx")
# print(pom_list)

The category and annotation fields were re-crawled and filled in later:

import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers
pom_list=[]
k=1
for i in range(1,1000):
    url='https://www.xungushici.com/shicis/cd-yuan-p-'+str(i)
    r=requests.get(url,headers=headers)
    content=r.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')

    hed=soup.find('div',class_='col col-sm-12 col-lg-9')
    list=hed.find_all('div',class_="card mt-3")
    # print(len(list))

    for it in list:
        content = {}
        # 1.1 Get each poem entry on this listing page
        href=it.find('h4',class_='card-title').a['href']
        real_href='https://www.xungushici.com'+href
        title=it.find('h4',class_='card-title').a.text
        print(title)
        # 2.1 Crawl the poem's detail page
        r2 = requests.get(real_href, headers=headers)
        content2 = r2.content.decode('utf-8')
        soup2 = BeautifulSoup(content2, 'html.parser')
        zhu = ""
        if soup2.find('div',class_='card mt-3')==[]:
            zhu=""
            content['title'] = title
            content['zhu'] = zhu
            pom_list.append(content)
            print("" + str(k) + "")
            k = k + 1
            continue
        card_div=soup2.find('div',class_='card mt-3')

        if card_div==None or card_div.find('div',class_='card-body')==[]:
            zhu=""
            content['title'] = title
            content['zhu'] = zhu
            pom_list.append(content)
            print("" + str(k) + "")
            k = k + 1
            continue
        card_body=card_div.find('div',class_='card-body')
        p_list=card_body.find_all('p')
        flag=1
        for it in p_list:
            # paragraphs after the '注释' heading hold the annotation text
            if str(it).find('strong')!=-1 and it.find('strong').text=='注释':
                flag=0
                continue
            if flag==0:
                zhu=zhu+str(it)
        if len(zhu)==0:
            zhu=""
        content['title']=title
        content['zhu']=zhu
        pom_list.append(content)
        print(""+str(k)+"")
        k=k+1

import xlwt

xl = xlwt.Workbook()
# add a worksheet to the workbook
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

sheet1.write(0,0,"title")

sheet1.write(0,12,'zhu')

for i in range(0,len(pom_list)):
    sheet1.write(i+1,0,pom_list[i]['title'])
    sheet1.write(i+1, 12, pom_list[i]['zhu'])
xl.save("yuan.xlsx")
# print(pom_list)

Obtaining the poem form, composition time, cipai (词牌) names, and qupai (曲牌) names

poem_data.py extracts the composition time: a CRF lexical analyzer is run over the creation-background and appreciation text, time-related words are extracted, and the results are filtered.

# Extract the specific composition time of each poem from its appreciation and background text

import re
from pyhanlp import *
import pandas as pd
#person names: "nr"
#place names: "ns"
#organisation names: "nt"

def demo_CRF_lexical_analyzer(text):
    # CRF segmentation; keep only time words (nature 't') and numerals ('m')
    CRFnewSegment = HanLP.newSegment("crf")
    term_list = CRFnewSegment.seg(text)
    ans=[]
    for it in term_list:
        if str(it.nature)=='t' or str(it.nature)=='m':
            ans.append(str(it.word))
    #print(ans)
    return ans

from xlrd import open_workbook
from xlutils.copy import copy

# Write the extracted composition times back into the original Excel file
def write_to(data,file):
    print(len(data))
    xl =open_workbook(file)
    excel = copy(xl)
    sheet1 = excel.get_sheet(0)

    sheet1.write(0, 9, "data")
    for i in range(0, len(data)):
        sheet1.write(i + 1, 9, data[i])

    excel.save(file)

# Collect the Excel files in a given directory
import os
def get_filename(path,filetype):  # path and file extension, e.g. '.xlsx'
    name = []
    for root,dirs,files in os.walk(path):
        for i in files:
            if os.path.splitext(i)[1]==filetype:
                name.append(i)
    return name            # list of file names with the given extension

if __name__ == '__main__':
    file = 'data/'
    list = get_filename(file, '.xlsx')
    for it in list:
        newfile = file + it
        pome_time = []
        print("开始"+str(newfile))
        data=pd.read_excel(newfile).fillna("")
        appear=data.appear
        back=data.background
        maxn = min(len(appear), 5000)  # cap the rows processed per file
        for i in range(maxn):
            print(""+str(i+1)+"个:")
            app=appear[i]
            bk=back[i]
            if app=="" and bk =="":
                pome_time.append("")
                print("")
                continue
            #print("===============欣赏===================")
            app_time=demo_CRF_lexical_analyzer(app)
            #print("===============背景===================")
            bk_time=demo_CRF_lexical_analyzer(bk)

            # prefer a digit-containing time word from the background text,
            # falling back to the appreciation text
            f=False
            for it in bk_time:
                if re.search(r'\d', it):
                    print(it)
                    pome_time.append(it)
                    f=True
                    break
            if f==False:
                for it in app_time:
                    if re.search(r'\d', it):
                        print(it)
                        pome_time.append(it)
                        f=True
                        break
            if f==False:
                pome_time.append("")
                print("")

        write_to(pome_time,newfile)

Analysis of poem form, cipai names, and qupai names

Poem forms: seven-character regulated verse (七言律诗), seven-character quatrain (七言绝句), seven-character (七言), five-character regulated verse (五言律诗), five-character quatrain (五言绝句), five-character (五言).

The cipai and qupai names are crawled from the site afterwards; the poem titles are then analysed against them to determine each poem's tune name.
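The post does not show the form-classification code itself; a minimal sketch, assuming the form can be decided from line length and line count alone (the function name classify_form and the punctuation set are my own):

import re

def classify_form(content):
    # split the poem into single lines on common Chinese punctuation
    lines = [s for s in re.split(r'[，。！？；]', content.replace('\n', '')) if s]
    lengths = set(len(s) for s in lines)
    if lengths == {5}:
        base = '五言'
    elif lengths == {7}:
        base = '七言'
    else:
        return ''            # irregular line lengths: likely a ci or qu rather than shi
    if len(lines) == 4:
        return base + '绝句'  # quatrain: four lines
    if len(lines) == 8:
        return base + '律诗'  # regulated verse: eight lines
    return base

print(classify_form('床前明月光，疑是地上霜。举头望明月，低头思故乡。'))  # -> 五言绝句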

Final results:

 

Total: 480,122 poems

Tang: 48,330; Song: 200,000; Yuan: 39,240; Ming: 100,000; Qing: 92,550

Poet Data

The poets of each dynasty, together with their attributes, are obtained from the poets' biographies.

#Extract from each poet's biography: year of birth, year of death, courtesy name (字), and art name (号)

import pandas as pd
import re
from xlrd import open_workbook
from xlutils.copy import copy

def read_author():
    file= "data2/author.xlsx"
    data=pd.read_excel(file).fillna("")
    produce=list(data.produce)
    i=1
    bg=[]
    ed=[]
    zi=[]
    hao=[]
    pome_self=[]
    # number of poems by each poet
    num=list(data.num)

    for it in produce:
        # keep the poet's biography text
        pome_self.append(it)

        print(""+str(i)+"个诗人:")
        # 获取诗人出生,去世的年份
        datas=re.findall(r"\d+",it)
        if len(datas)>=2:
            bg.append(datas[0])
            #print("born in "+datas[0])
            flag=False
            for j in range(1,len(datas)):
                if len(datas[j])>=len(datas[0]) and int(datas[j])-int(datas[0])>15:
                    ed.append(datas[j]+"")
                    #print("死于"+datas[j])
                    flag=True
                    break
            if flag==False:
                ed.append("")
        else:
            bg.append("")
            ed.append("")

        # courtesy name (字) and art name (号)
        ztext=re.findall(r".*字(.*?)[,|。]",it)
        if len(ztext)!=0:
            zi.append(",".join(ztext))
        else:
            zi.append("")
        #print(ztext)
        htext = str(re.findall(r".*号(.*?)[,|。]", it)).replace('[','').replace(']','').replace('\'','')
        if len(htext)!=0:
            hao.append(htext)
        else:
            hao.append("")
        #print(htext)
        i = i + 1

    xl = open_workbook(file)
    excel = copy(xl)
    sheet1 = excel.get_sheet(0)

    sheet1.write(0, 6, "begin_time")
    sheet1.write(0,7,"end_time")
    sheet1.write(0,8,"zi")
    sheet1.write(0,9,"hao")
    for i in range(0, len(bg)):
        sheet1.write(i + 1, 6, bg[i])
        sheet1.write(i + 1, 7, ed[i])
        sheet1.write(i + 1, 8, zi[i])
        sheet1.write(i + 1, 9, hao[i])

    excel.save(file)




if __name__ == '__main__':
    read_author()

The results of the poet-friend extraction are:

Poet Information

Collect all joint appellations (合称):

import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers

hc=[]

url='https://www.xungushici.com/authors'
r=requests.get(url,headers=headers)
content=r.content.decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
orign_href='https://www.xungushici.com'


hecheng=soup.find('div',id='divHeCheng')
list=hecheng.find_all('li',class_="m-1 badge badge-light")
dic={}
for i in range(1,len(list)):
    href=orign_href+list[i].a['href']
    hecheng_name=list[i].a.text
    hc.append(hecheng_name)
    r2 = requests.get(href, headers=headers)
    content2 = r2.content.decode('utf-8')
    soup2 = BeautifulSoup(content2, 'html.parser')
    pomdiv=soup2.find('div',class_='col col-sm-12 col-lg-9')
    card=pomdiv.find_all('div',class_='card mt-3')
    author_list=[]
    for it in card:
        h4=it.find('h4',class_='card-title')
        list_a=h4.find_all('a')
        desty=list_a[0].text
        author=list_a[1].text
        author_list.append(author)
    dic[hecheng_name]=",".join(author_list)

import xlwt

xl = xlwt.Workbook()
# add a worksheet to the workbook
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

sheet1.write(0,0,"hc")
sheet1.write(0,1,'author')
for i in range(0,len(hc)):
    sheet1.write(i+1,0,hc[i])
    sheet1.write(i+1,1,dic[hc[i]])

xl.save("common_name.xlsx")


for it in hc:
    print(it+": "+dic[it])

This table is later read and matched against the poet table to add a joint-appellation (合称) column, as sketched below:
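A hedged sketch of that merge step (it is not shown in the post; the 'name' column of the poet table and the output path are assumptions):

import pandas as pd

authors = pd.read_excel('data2/author.xlsx').fillna("")
hc = pd.read_excel('common_name.xlsx')

# map each poet name to the joint appellations that mention him or her
name_to_hc = {}
for _, row in hc.iterrows():
    for author in str(row['author']).split(','):
        name_to_hc.setdefault(author, []).append(row['hc'])

# assumes the poet table has a 'name' column; adjust to the real header if it differs
authors['hecheng'] = authors['name'].map(lambda n: ",".join(name_to_hc.get(n, [])))
authors.to_excel('data2/author_with_hecheng.xlsx', index=False)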

Poet Trajectory Information

Acquisition source:

 

import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers
cipai=[]

for i in range(1,7):
    url='https://www.xungushici.com/cipais/p'+str(i)
    r=requests.get(url,headers=headers)
    content=r.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')

    hed=soup.find('ul',class_='list-unstyled d-flex flex-row flex-wrap align-items-center w-100')
    list=hed.find_all('li',class_="m-1 badge badge-light")

    for it in list:
        if it.a!=None:
            cipai.append(it.a.text)

import xlwt

xl = xlwt.Workbook()
# add a worksheet to the workbook
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

sheet1.write(0,0,"title")
for i in range(0,len(cipai)):
    sheet1.write(i+1,0,cipai[i])

xl.save("cipai_name.xlsx")

Result preview:

Collecting qupai names from the titles of the crawled Yuan-dynasty poems:

import pandas as pd
import xlwt



# Read the Yuan-dynasty poems
def read(file):
    data=pd.read_excel(file).fillna("")
    title=data.title
    # list collecting the qupai (melody) names
    qu_list=[]
    for it in title:
        if it.find('·')!=-1:
            # the qupai name is the part of the title before the '·' separator
            qu=it.split('·')
            qu_list.append(qu[0])
    new_qu=list(set(qu_list))
    # save the deduplicated qupai names
    xl = xlwt.Workbook()
    # add a worksheet to the workbook
    sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
    sheet1.write(0, 0, "qu_name")

    for i in range(0, len(new_qu)):
        sheet1.write(i + 1, 0, new_qu[i])

    xl.save("qupai_name.xlsx")

Result preview:

Feihualing (飞花令)

By iterating over the roughly 500,000 crawled poems, each line is checked for the Feihualing keywords it contains; every match is stored as: line, author, poem title, keyword.

Difficulty: xlwt can write at most 65,536 rows per sheet, whereas openpyxl (the .xlsx format) allows roughly one million rows (1,048,576). Because there are far more poem lines than the former allows, openpyxl is used for storage.

import pandas as pd
import re
import openpyxl

# Read the Feihualing keywords
def read_word():
    data=pd.read_excel('data2/word.xlsx')
    words=data.word
    return words

# Iterate over every poem and check each line for keywords
def read(file,words,write_file):
    data=pd.read_excel(file).fillna("")
    title=data.title
    content=data.content
    author=data.author
    # split each poem into single lines
    ans_sentens = []
    ans_author = []
    ans_title = []
    ans_key = []
    for i in range(len(title)):
        print("poem #" + str(i))
        cont=content[i]
        aut=author[i]
        tit=title[i]
        # the original delimiter was lost; split on common Chinese punctuation
        # to break the poem into single lines
        sents=[s for s in re.split(r'[，。！？；]', cont.replace('\n','')) if s]
        for it in sents:
            key_list = []
            for k in words:
                if it.find(k)!=-1:
                    key_list.append(k)
            if len(key_list)!=0:
                ans_sentens.append(it)
                ans_author.append(aut)
                ans_title.append(tit)
                ans_key.append(",".join(key_list))

    # store the matching sentence, author, title, and keys
    xl = openpyxl.Workbook()
    # create a worksheet in the workbook
    sheet1 = xl.create_sheet(index=0)
    sheet1.cell(1, 1, "sentens")
    sheet1.cell(1, 2, "author")
    sheet1.cell(1, 3, "title")
    sheet1.cell(1, 4, "keys")

    for i in range(0, len(ans_key)):
        sheet1.cell(i + 2, 1, ans_sentens[i])
        sheet1.cell(i + 2, 2, ans_author[i])
        sheet1.cell(i + 2, 3, ans_title[i])
        sheet1.cell(i + 2, 4, ans_key[i])
    xl.save(write_file)
    print("保存成功到-"+write_file)

# Collect the Excel files in a given directory
import os
def get_filename(path,filetype):  # path and file extension, e.g. '.xlsx'
    name = []
    for root,dirs,files in os.walk(path):
        for i in files:
            if os.path.splitext(i)[1]==filetype:
                name.append(i)
    return name            # list of file names with the given extension

if __name__ == '__main__':
    file='data/'
    words=read_word()
    list = get_filename(file, '.xlsx')
    for i in range(len(list)):
        new_file=file+list[i]
        print(new_file)
        sentences_file = "sentences/sentence" + str(i+1) + ".xlsx"
        read(new_file,words,sentences_file)

Result preview:

 

Summary

The data above forms the foundation of the Chinese classical poetry knowledge graph. From it, the entity nodes can be created and the relations between entities established, and the result can then be stored in a Neo4j database; the data can also first be imported into a MySQL database.

This makes the later front-end pages easier to build: Neo4j is well suited to displaying the relations between entities and to building the question-answering system.
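As one possible shape of that import step, a minimal sketch of writing a poet node, a poem node, and the relation between them into Neo4j with py2neo (the labels, relationship name, and connection details are assumptions, not the project's actual schema):

from py2neo import Graph, Node, Relationship

graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))

poet = Node("Poet", name="李白", dynasty="唐")
poem = Node("Poem", title="静夜思", content="床前明月光，疑是地上霜。举头望明月，低头思故乡。")
graph.create(Relationship(poet, "WROTE", poem))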

Source: https://www.cnblogs.com/xiaofengzai/p/15753530.html