Python XPath 语法与实例
语法:
article 选取所有article元素的所有子节点 /article 选取根元素article article/a 选取所有属于article的子元素的a元素 //div 选取所有div元素(不论出现在文档任何地方) article//div 选取所有属于article元素的后代的div元素,不管它出现在article之下的任何位置 //@class 选取所有名为class的属性 /article/div[1] 选取属于article子元素的第一个div的所有子节点 /article/div[last()] 选取属于article子元素的最后一个div的所有子节点 /article/div[last()-1] 选取属于article子元素的倒数第二个div的所有子节点 //div[@lang] 选取所有拥有lang属性的div元素 //div[@lang='eng'] 选取所有lang属性为eng的div元素 /div/* 选取属于div元素的所有子节点 //* 选取所有元素 //div[@*] 选取所有带属性的div元素 //div/a | //div/p 选取所有div元素的a和p元素 //span | //ul 选取文档中的span和ul元素 article/div/p | //span 选取所有属于article元素的div元素下的p元素和所有的span元素
实例:
记录实例1:
import requests
from lxml import html

# lxml's html module exposes the same etree interface as `from lxml import etree`.
etree = html.etree

url = 'https://book.douban.com/top250'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
}

# Fetch the page and parse the raw HTML into an element tree.
page_text = requests.get(url=url, headers=headers).text
root = etree.HTML(page_text)

# The trailing @title selects the attribute values of the entry links in the
# first table, not the <a> elements themselves.
film = root.xpath('//*[@id="content"]/div/div[1]/div/table[1]/tr/td[2]/div[1]/a/@title')
print('电影名称:', film)
2:
import requests
from lxml import html

# lxml's html module exposes the same etree interface as `from lxml import etree`.
etree = html.etree

url = 'https://book.douban.com/top250'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
}
data = requests.get(url=url, headers=headers).text
s = etree.HTML(data)

# One <table> element per book entry on the page.
tables = s.xpath('//*[@id="content"]/div/div[1]/div/table')
for info in tables:
    title = info.xpath('./tr/td[2]/div[1]/a/@title')[0]
    href = info.xpath("./tr/td[2]/div[1]/a/@href")[0]
    score = info.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
    # The rating count comes wrapped in parentheses/whitespace; strip them off.
    commitmentNum = info.xpath('./tr/td[2]/div[2]/span[3]/text()')[0].strip("(").strip().strip(")")
    # Some entries have no one-line description span, so guard the [0] access.
    scribe = info.xpath("./tr/td[2]/p[2]/span/text()")
    if scribe:
        # BUG FIX: the original format string had only four placeholders for
        # five arguments, so scribe[0] was silently dropped — and evaluating
        # scribe[0] raised IndexError whenever the span was missing.
        print("{} {} {} {} {}".format(title, href, score, commitmentNum, scribe[0]))
    else:
        print("{} {} {} {}".format(title, href, score, commitmentNum))
实例3:
import csv

import requests
from lxml import html

# lxml's html module exposes the same etree interface as `from lxml import etree`.
etree = html.etree

# Hoisted out of the loop: these never change between pages.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
}

# newline='' lets the csv module control line endings itself (csv docs requirement).
with open('./ttt1.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    for page in range(10):
        # Each results page shows 25 books; `start` is the item offset.
        url = 'https://book.douban.com/top250?start={}'.format(page * 25)
        data = requests.get(url=url, headers=headers).text
        s = etree.HTML(data)
        # One <table> element per book entry on the page.
        tables = s.xpath('//*[@id="content"]/div/div[1]/div/table')
        for info in tables:
            title = info.xpath('./tr/td[2]/div[1]/a/@title')[0]
            href = info.xpath("./tr/td[2]/div[1]/a/@href")[0]
            score = info.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
            # The rating count comes wrapped in parentheses/whitespace; strip them.
            commitmentNum = info.xpath('./tr/td[2]/div[2]/span[3]/text()')[0].strip("(").strip().strip(")")
            # Some entries have no one-line description span.
            scribe = info.xpath("./tr/td[2]/p[2]/span/text()")
            row = [title, href, score, commitmentNum]
            if scribe:
                row.append(scribe[0])
            # BUG FIX: the original joined fields with bare commas, producing a
            # malformed CSV whenever a title/description itself contained a
            # comma; csv.writer quotes such fields correctly.
            writer.writerow(row)
实例4:爬取我自己的博客首页内容。
import requests
from lxml import html

# lxml's html module exposes the same etree interface as `from lxml import etree`.
etree = html.etree

url = 'https://www.cnblogs.com/zjxcyr/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
}
html_data = requests.get(url=url, headers=headers).text
s = etree.HTML(html_data)

# Each post on the blog index page lives in a <div class="day"> container.
files = s.xpath('//*[@class="day"]')
print(files)

with open('./p2.txt', 'w', encoding='utf-8') as f:
    for file in files:
        print(file)
        # BUG FIX: the original indexed [0] unconditionally, so a single entry
        # missing any of these nodes raised IndexError and aborted the whole
        # scrape; fall back to '' instead.
        date_nodes = file.xpath("./div/a/text()")
        title_nodes = file.xpath("./div/a/span/text()")
        comment_nodes = file.xpath("./div/div/text()")
        date = date_nodes[0] if date_nodes else ''
        title = title_nodes[0] if title_nodes else ''
        comment = comment_nodes[0] if comment_nodes else ''
        print(date, title, comment)
        f.write("{}{}{}\n\n\n".format(date, title, comment))