CVPR 顶会热词爬取


Python 代码

"""
@Project :PycharmProjects 
@File    :test.py
@IDE     :PyCharm 
@Author  :Cra2iTeT
@Date    :2022/5/13 10:38 
"""

import urllib
from bs4 import BeautifulSoup
import requests


def get(url, timeout=30):
    """Download *url* once and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails (this includes
            ``HTTPError`` for non-success status codes).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Imported here so the submodule is guaranteed to be loaded; a bare
    # ``import urllib`` does not make ``urllib.request`` available by itself.
    import urllib.request

    headers = {
        'User-Agent': 'Mozilla/5.0(Macintosh;Inter Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gerko) Chrome/52.0.2743.116 Safari/537.36'
    }

    # One request only, carrying the browser-like User-Agent header.
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode('utf-8')


# Listing page to crawl. A single paper's detail page looks like:
# https://openaccess.thecvf.com/content_ICCV_2019/html/Rossler_FaceForensics_Learning_to_Detect_Manipulated_Facial_Images_ICCV_2019_paper.html
url = "https://openaccess.thecvf.com/ICCV2019?day=2019-10-29"


def _append_text(path, text):
    """Append *text* to the UTF-8 file at *path*, creating it if needed."""
    with open(path, 'a', encoding='utf8') as fp:
        fp.write(text)


def _extract_year(citation):
    """Return the text between ``"CV), "`` and the following ``", pp. "``.

    Raises:
        ValueError: If either marker is missing from *citation*.
    """
    marker = "CV), "
    start = citation.index(marker) + len(marker)
    # Search for the end marker only after the start marker.
    return citation[start:citation.index(", pp. ", start)]


def main():
    """Crawl the listing page at ``url`` and append each paper's fields to files.

    For every ``<dt>`` entry that contains a link, the linked detail page is
    fetched and five values are appended to five files in the current
    directory: first ``<i>`` text (author), ``#abstract`` text, absolute link,
    link text (title) and the year taken from the ``#authors`` text.
    """
    from urllib.parse import urljoin

    base_url = "https://openaccess.thecvf.com/"
    listing = BeautifulSoup(get(url), "html.parser")

    for entry in listing.find_all('dt'):
        link = entry.find('a')
        if link is None:
            # Same as the original script: stop at the first <dt> without a link.
            break

        # urljoin avoids a doubled slash when the href is root-relative ("/...").
        paper_url = urljoin(base_url, link['href'])
        article = link.text

        # Parse everything before writing so a failure on this paper cannot
        # leave the five output files with differing record counts.
        detail = BeautifulSoup(get(paper_url), "html.parser")
        author = detail.find_all('i')[0].text
        abstract = detail.find(id='abstract').text
        year = _extract_year(detail.find(id='authors').text)

        _append_text('./newTestAuthor.json', author + "\n")
        # NOTE(review): no separator is appended to the abstract. The Java
        # importer further down this file discards the first line of this
        # output, which suggests the abstract text itself starts with a
        # newline — confirm before changing this write.
        _append_text('./newTestAbstract.json', abstract)
        _append_text('./newTestHref.json', paper_url + "\n")
        _append_text('./newTestArticle.json', article + "\n")
        _append_text('./newTestYear.json', year + "\n")


if __name__ == "__main__":
    main()

爬取结果

 导入数据库代码

     BufferedReader hrefBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestHref.json"));
        BufferedReader authorBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAuthor.json"));
        BufferedReader articleBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestArticle.json"));
        BufferedReader abstractBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAbstract.json"));
        BufferedReader yearBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestYear.json"));

        String lineHref = null;
        String lineAuthor = null;
        String lineArticle = null;
        String lineAbstract = abstractBufferedReader.readLine();
        String lineYear = null;

        while ((lineHref = hrefBufferedReader.readLine()) != null) {
            lineAbstract = abstractBufferedReader.readLine();
            lineAuthor = authorBufferedReader.readLine();
            lineArticle = articleBufferedReader.readLine();
            lineYear = yearBufferedReader.readLine();
            Paper paper = new Paper();
            paper.setHref(lineHref);
            paper.setAuthor(lineAuthor);
            paper.setArticle(lineArticle);
            paper.setPaperAbstract(lineAbstract);
            paper.setYear(lineYear);
            paperMapper.insert(paper);
        }

请将绝对路径更换成自己的；Java 这部分代码需要引入 mybatis-plus 依赖。