【Python爬虫】PyQuery解析库


PyQuery解析库

阅读目录

  • 初始化
  • 基本CSS选择器
  • 查找元素
  • 遍历
  • 获取信息
  • DOM操作
  • 伪类选择器

PyQuery 是 Python 仿照 jQuery 的严格实现。语法与 jQuery 几乎完全相同。

官方文档:http://pyquery.readthedocs.io/

安装

pip install pyquery

初始化

字符串初始化

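html = '''
<div>
    <ul>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''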
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('li'))
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
输出结果

    URL初始化

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    print(doc('head'))
    <head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下,你就知道</title></head>
    输出结果

    文件初始化

    from pyquery import PyQuery as pq
    doc = pq(filename='demo.html')
    print(doc('li'))
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    输出结果

    基本CSS选择器

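    html = '''
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
    '''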
    from pyquery import PyQuery as pq
    doc = pq(html)
    print(doc('#container .list li'))
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    输出结果

    查找元素

    子元素

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    print(type(items))
    print(items)
    lis = items.find('li')
    print(type(lis))
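    print(lis)
    <class 'pyquery.pyquery.PyQuery'>
    <ul class="list">
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
    <class 'pyquery.pyquery.PyQuery'>
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    输出结果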
    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    lis = items.children()
    print(type(lis))
    print(lis)
    <class 'pyquery.pyquery.PyQuery'>
    
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    输出结果
    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    lis = items.children('.active')
    print(lis)
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    输出结果

    父元素

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    container = items.parent()
    print(type(container))
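    print(container)
    <class 'pyquery.pyquery.PyQuery'>
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>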
    输出结果
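    html = '''
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    '''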
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    parents = items.parents()
    print(type(parents))
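    print(parents)
    <class 'pyquery.pyquery.PyQuery'>
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>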
    输出结果
    parent = items.parents('.wrap')
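    print(parent)
    <div class="wrap">
        <div id="container">
            <ul class="list">
                <li class="item-0">first item</li>
                <li class="item-1"><a href="link2.html">second item</a></li>
                <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
                <li class="item-1 active"><a href="link4.html">fourth item</a></li>
                <li class="item-0"><a href="link5.html">fifth item</a></li>
            </ul>
        </div>
    </div>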
    输出结果

    兄弟元素

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.list .item-0.active')
    print(li.siblings())
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0">first item</li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    输出结果
    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.list .item-0.active')
    print(li.siblings('.active'))
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    输出结果

    遍历

    单个元素

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    lis = doc('li').items()
    print(type(lis))
    for li in lis:
        print(li)
    <class 'generator'>
    
    <li class="item-0">first item</li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    输出结果

    获取信息

    获取属性

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('.item-0.active a')
    print(a)
    print(a.attr('href'))
    print(a.attr.href)
    <a href="link3.html"><span class="bold">third item</span></a>
    link3.html
    link3.html
    输出结果

    获取文本

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('.item-0.active a')
    print(a)
    print(a.text())
    <a href="link3.html"><span class="bold">third item</span></a>
    third item
    输出结果

    获取HTML

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    print(li.html())
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <a href="link3.html"><span class="bold">third item</span></a>

    DOM操作

    addClass、removeClass

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.removeClass('active')
    print(li)
    li.addClass('active')
    print(li)
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    输出结果

    attr、css

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.attr('name', 'link')
    print(li)
    li.css('font-size', '14px')
    print(li)
    <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
    输出结果

    remove

    html = '''
    <div class="wrap">
        Hello, World
        <p>This is a paragraph.</p>
    </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    wrap = doc('.wrap')
    print(wrap.text())
    wrap.find('p').remove()
    print(wrap.text())
    Hello, World This is a paragraph.
    Hello, World
    输出结果

    其他DOM方法 http://pyquery.readthedocs.io/en/latest/api.html

    伪类选择器

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('li:first-child')
    print(li)
    li = doc('li:last-child')
    print(li)
    li = doc('li:nth-child(2)')
    print(li)
    li = doc('li:gt(2)')
    print(li)
    li = doc('li:nth-child(2n)')
    print(li)
    li = doc('li:contains(second)')
    print(li)
    <li class="item-0">first item</li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-1 active"><a href="link4.html">fourth item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    输出结果

    更多CSS选择器可以查看 http://www.w3school.com.cn/css/index.asp