2. JSON Data Processing and BeautifulSoup Usage


JSON

Online data generator: https://www.onlinedatagenerator.com/

Loading JSON data

import requests
import json
# from pprint import pprint

def main():
    url = "http://192.168.223.143/test.json"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
        "Referer": "https://www.baidu.com"
    }
    resp = requests.get(url=url, headers=headers)
    # decode the response bytes into a JSON string
    json_str = resp.content.decode("utf-8")
    # print(json_str)
    # parse the JSON string into Python objects (dicts and lists)
    ret1 = json.loads(json_str)
    print(ret1['objects'][4]['EmailAddress'])
    # pretty-print the parsed structure
    # pprint(ret1)

if __name__ == '__main__':
    main()
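
Since the response body is JSON, requests can also decode it directly with resp.json(); a minimal sketch reusing the url and headers above (the 'objects'/'EmailAddress' keys are an assumption about how test.json is structured):

# Minimal sketch: resp.json() parses the JSON body directly,
# equivalent to json.loads(resp.content.decode("utf-8")) above.
# The 'objects'/'EmailAddress' keys assume the structure of test.json.
resp = requests.get(url=url, headers=headers)
data = resp.json()
print(data['objects'][4]['EmailAddress'])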

Saving to and reading from a file

with open("b.txt","w",encoding="utf-8") as file:
	  # ensure_ascii=False:可以显示中文
	  # indent=2:把子节点往后移两个空格,不移就会显示一行,影响美观
		file.write(json.dumps(ret1,ensure_ascii=False,indent=2))
with open("b.txt","r",encoding="utf-8") as file:
		ret2 = json.load(file)
		print(ret2)
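
json.dump writes straight into an open file object, so the dumps-then-write step above can be collapsed into one call; a minimal sketch with the same arguments:

# Minimal sketch: json.dump serializes directly into the file object,
# equivalent to file.write(json.dumps(ret1, ensure_ascii=False, indent=2)).
with open("b.txt", "w", encoding="utf-8") as file:
    json.dump(ret1, file, ensure_ascii=False, indent=2)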

BeautifulSoup instead of regular expressions

Purpose: parsing the HTML source of a web page.
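
To show why a parser beats regular expressions here, a minimal side-by-side sketch (the HTML snippet below is made up for illustration):

import re
from bs4 import BeautifulSoup

# made-up snippet for illustration only
html = '<div class="m-hd"><a href="/a">first</a><a href="/b">second</a></div>'

# regular expression: tied to the exact text, breaks if quoting or attribute order changes
print(re.findall(r'<a href="([^"]+)">', html))       # ['/a', '/b']

# BeautifulSoup: works on the parsed tag structure instead of the raw text
soup = BeautifulSoup(html, "html.parser")
print([a.get('href') for a in soup.find_all('a')])   # ['/a', '/b']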

Usage

pip install bs4
from bs4 import BeautifulSoup

# save the page source
html_doc = resp.content.decode("utf-8")

# let BeautifulSoup parse the page source (specify the parser explicitly)
soup = BeautifulSoup(html_doc, "html.parser")

# find the first <div> tag with class='m-hd'
print(soup.find('div', class_='m-hd'))

# find all <a> tags with class='u-card'; the result is a list, here taking the item at index 2
print(soup.find_all('a', class_='u-card')[2])

# iterate over every <a> tag with class='u-card' in the page
cards = soup.find_all('a', class_='u-card')
for card in cards:
    # get the text content, stripped of surrounding whitespace
    print(card.get_text().strip())

# CSS selector: the element with id='j-anime-nav-collect' that is a direct child of a <div>;
# select() returns a list, take the first match's text
print(soup.select("div > #j-anime-nav-collect")[0].get_text())

# CSS selector: elements with class='item' that are direct children of a <ul>;
# select() returns a list, here taking the text of the item at index 1
print(soup.select("ul > .item")[1].get_text())

# get the page title
print(soup.title.string)
print(soup.title.get_text())

# get all <img> tags as a list
print(soup.find_all('img'))

# find the first element with class='u-tt', regardless of tag name
print(soup.find(class_='u-tt').get_text())

cards = soup.find('div', class_='lst-item').find_all('a', class_='u-card')
for item in cards:
    # pull the details out of each card
    name = item.find('p', class_='u-tt').get_text()
    # print(name)
    # the image URL lives in the <img> tag's data-src attribute; the site's src is only a
    # placeholder, so check the actual markup case by case
    pic_url = item.find('img').get('data-src')
    print(name + '-----' + pic_url)
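
As a rough cheat sheet, every find/find_all call above has a CSS-selector equivalent via select()/select_one() (a sketch against the same soup object):

# Rough equivalences between find/find_all and CSS selectors (same soup as above):
soup.find('div', class_='m-hd')            # roughly soup.select_one('div.m-hd')
soup.find_all('a', class_='u-card')        # roughly soup.select('a.u-card')
soup.select('div > #j-anime-nav-collect')  # id selector as a direct child of a <div>
soup.select('ul > .item')                  # class selector as a direct child of a <ul>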

Two small BeautifulSoup examples

Scrape the anime titles and image URLs from pages 1 through 14

import requests
from bs4 import BeautifulSoup

def scrapy(page):
    url = "http://www.4399dmw.com/search/dh-1-0-0-0-0-{}-0/".format(page)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
        "Referer": "https://www.baidu.com"
    }
    resp = requests.get(url=url, headers=headers)
    html_doc = resp.content.decode("utf-8")
    soup = BeautifulSoup(html_doc, "html.parser")
    cards = soup.find('div', class_='lst').find_all('a', class_='u-card')
    for item in cards:
        name = item.find('p', class_='u-tt').get_text()
        pic_url = item.find('img').get('data-src')

        print(name + '—————' + pic_url)

def main():
    # the listing pages are numbered 1 through 14
    for i in range(1, 15):
        print("Scraping page " + str(i))
        scrapy(i)

if __name__ == '__main__':
    main()
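
The loop above only prints each image URL; to actually save the pictures to disk, a small helper like the following could be called inside the loop (the pictures/ directory, the .jpg extension, and the assumption that data-src holds an absolute URL are guesses to verify against the real markup):

import os
import requests

def save_image(name, pic_url, headers):
    # Minimal sketch: download one image and store it as pictures/<name>.jpg.
    # Assumes pic_url is an absolute URL and name is safe to use as a filename.
    os.makedirs("pictures", exist_ok=True)
    resp = requests.get(pic_url, headers=headers)
    with open(os.path.join("pictures", name + ".jpg"), "wb") as f:
        f.write(resp.content)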

Scrape the "You may also like" section of the Naruto page

import requests
from bs4 import BeautifulSoup

def main():
    url = "http://www.4399dmw.com/huoying/donghua/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
        "Referer": "https://www.baidu.com"
    }
    resp = requests.get(url=url, headers=headers)
    html_doc = resp.content.decode('utf-8')
    soup = BeautifulSoup(html_doc, "html.parser")
    items = soup.find_all('div', class_='works__info')[3].find_all('a')
    for item in items:
        print(item.get_text())

if __name__ == '__main__':
    main()
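
To tie this back to the JSON part of this section, the scraped titles could be written to a file instead of printed; a minimal sketch reusing the same lookup (the result.json filename is arbitrary):

import json

def save_titles(soup):
    # Minimal sketch: collect the recommendation titles and dump them as JSON,
    # reusing the ensure_ascii/indent options from the JSON section above.
    items = soup.find_all('div', class_='works__info')[3].find_all('a')
    titles = [item.get_text().strip() for item in items]
    with open("result.json", "w", encoding="utf-8") as file:
        json.dump(titles, file, ensure_ascii=False, indent=2)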