python xpath


语法:

article                      选取所有article元素的所有子节点
  
/article                     选取根元素article
  
article/a                    选取所有属于article的子元素的a元素
  
//div                        选取所有div子元素(不论出现在文档任何地方)
  
article//div                 选取所有属于article元素的后代的div元素不管它出现在article之下的任何位置
  
//@class                     选取所有名为class的属性
  
/article/div[1]              选取属于article子元素的第一个div所有子节点
  
/article/div[last()]         选取属于article子元素的最后一个div所有子节点
  
/article/div[last()-1]       选取属于article子元素的倒数第二个div所有子节点
  
//div[@lang]                 选取所有拥有lang属性的div元素
  
//div[@lang='eng']           选取所有lang属性为eng的div元素
  
/div/*                       选取属于div元素的所有子节点
  
//*                          选取所有元素
  
//div[@*]                    选取所有带属性的div元素
  
//div/a | //div/p            选取所有div元素的a和p元素
  
//span | //ul                选取文档中的span和ul元素
  
article/div/p | //span       选取所有属于article元素的div元素和所有的span元素

实例:

记录实例1:

import requests
from lxml import html

# lxml no longer exposes `from lxml import etree` in some bundled builds;
# bind it from the html module instead (same module object).
etree = html.etree

target_url = 'https://book.douban.com/top250'
request_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

# Fetch the page body and parse it into an lxml element tree.
page_text = requests.get(url=target_url, headers=request_headers).text
tree = etree.HTML(page_text)

# Read the book titles of the first table from the @title attribute of the
# link elements; xpath() returns a list of attribute strings.
film = tree.xpath('//*[@id="content"]/div/div[1]/div/table[1]/tr/td[2]/div[1]/a/@title')
print('电影名称:', film)

2:

import requests
from lxml import html

etree = html.etree

url = 'https://book.douban.com/top250'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

data = requests.get(url=url, headers=headers).text
s = etree.HTML(data)
# Each <table> node is one book entry; extract the fields relative to it.
file = s.xpath('//*[@id="content"]/div/div[1]/div/table')

for info in file:
    title = info.xpath('./tr/td[2]/div[1]/a/@title')[0]
    href = info.xpath("./tr/td[2]/div[1]/a/@href")[0]
    score = info.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
    # The rating count is wrapped in parentheses; strip them and whitespace.
    commitmentNum = info.xpath('./tr/td[2]/div[2]/span[3]/text()')[0].strip("(").strip().strip(")")
    scribe = info.xpath("./tr/td[2]/p[2]/span/text()")
    # BUG FIX: the original format string had only four placeholders, so the
    # description passed as the fifth argument was silently dropped; it also
    # indexed scribe[0] unconditionally, which raises IndexError for entries
    # without a description (example 3 below guards this the same way).
    if scribe:
        print("{}    {}    {}   {}   {}".format(title, href, score, commitmentNum, scribe[0]))
    else:
        print("{}    {}    {}   {}".format(title, href, score, commitmentNum))

3:

import requests
from lxml import html

# Hoisted out of the page loop: the original rebuilt the headers dict and
# rebound `etree` on every iteration even though both are loop-invariant.
etree = html.etree
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

with open('./ttt1.csv', 'w', encoding='utf-8') as f:
    # Douban's Top 250 paginates 25 books per page; walk the 10 pages via
    # the ?start= offset parameter.
    for a in range(10):
        url = 'https://book.douban.com/top250?start={}'.format(a * 25)
        data = requests.get(url=url, headers=headers).text
        s = etree.HTML(data)
        file = s.xpath('//*[@id="content"]/div/div[1]/div/table')
        for info in file:
            title = info.xpath('./tr/td[2]/div[1]/a/@title')[0]
            href = info.xpath("./tr/td[2]/div[1]/a/@href")[0]
            score = info.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
            # The rating count is wrapped in parentheses; strip them off.
            commitmentNum = info.xpath('./tr/td[2]/div[2]/span[3]/text()')[0].strip("(").strip().strip(")")
            scribe = info.xpath("./tr/td[2]/p[2]/span/text()")
            # Some books carry no one-line description; write a shorter row.
            if len(scribe) > 0:
                f.write("{},{},{},{},{}\n".format(title, href, score, commitmentNum, scribe[0]))
            else:
                f.write("{},{},{},{}\n".format(title, href, score, commitmentNum))

实例4:爬取我自己的博客首页内容。

import requests
from lxml import html

etree = html.etree

blog_url = 'https://www.cnblogs.com/zjxcyr/'
request_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

# Fetch the blog front page and parse it into an lxml element tree.
page_text = requests.get(url=blog_url, headers=request_headers).text
tree = etree.HTML(page_text)

# Every post on the front page sits inside an element with class="day".
posts = tree.xpath('//*[@class="day"]')
print(posts)

with open('./p2.txt', 'w', encoding='utf-8') as out:
    for post in posts:
        print(post)
        # Date, title and summary text are nested under each post container.
        date = post.xpath("./div/a/text()")[0]
        title = post.xpath("./div/a/span/text()")[0]
        comment = post.xpath("./div/div/text()")[0]
        print(date, title, comment)
        out.write("{}{}{}\n\n\n".format(date, title, comment))

doing~~~~~~~