cvpr顶会热词爬取
Py代码
"""Crawl paper metadata from the CVF open-access ICCV 2019 listing page.

@Project :PycharmProjects
@File    :test.py
@IDE     :PyCharm
@Author  :Cra2iTeT
@Date    :2022/5/13 10:38

For every paper linked from the listing page, the paper's detail page is
downloaded and five values (authors, abstract, detail-page URL, title, year)
are appended to five separate files in the current working directory.
"""
import urllib.request

from bs4 import BeautifulSoup
import requests  # noqa: F401  (not used by the code below)

BASE_URL = "https://openaccess.thecvf.com/"
REQUEST_TIMEOUT = 30  # seconds; keeps one stalled connection from hanging the crawl


def get(url):
    """Download *url* once and return the response body decoded as UTF-8.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The page content as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for an
            HTTP error status).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0(Macintosh;Inter Mac OS X 10_11_4) '
                      'AppleWebKit/537.36 (KHTML, like Gerko) '
                      'Chrome/52.0.2743.116 Safari/537.36'
    }
    # A single request per page, sent with the headers defined above.
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the HTTP response as soon as it is read.
    with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode('utf-8')


def _append(path, text):
    """Append *text* to the UTF-8 encoded file at *path*."""
    with open(path, 'a', encoding='utf8') as fp:
        fp.write(text)


# Example of a single paper's detail page:
# "https://openaccess.thecvf.com/content_ICCV_2019/html/Rossler_FaceForensics_Learning_to_Detect_Manipulated_Facial_Images_ICCV_2019_paper.html"
url = "https://openaccess.thecvf.com/ICCV2019?day=2019-10-29"
listing = BeautifulSoup(get(url), "html.parser")

for entry in listing.find_all('dt'):
    link = entry.find('a')  # the <a> tag inside this <dt>
    if link is None:
        # Stop at the first <dt> that carries no link.
        break

    href = link.get('href')  # path of the paper's detail page
    article = link.text
    paper_url = BASE_URL + href

    paper = BeautifulSoup(get(paper_url), "html.parser")
    # The text of the first <i> element on the detail page is stored as the
    # author list.
    author = paper.find_all('i')[0].text
    abstract = paper.find(id='abstract').text
    citation = paper.find(id='authors').text
    # The year is whatever sits between "CV), " and ", pp. " in the text of
    # the element with id "authors".
    year = citation[citation.index("CV), ") + 5:citation.index(", pp. ")]

    _append('./newTestAuthor.json', author + "\n")
    # NOTE(review): no newline is appended to the abstract; the importer that
    # reads this file skips its first line, so the abstract text presumably
    # begins with a newline of its own - confirm.
    _append('./newTestAbstract.json', abstract)
    _append('./newTestHref.json', paper_url + "\n")
    _append('./newTestArticle.json', article + "\n")
    _append('./newTestYear.json', year + "\n")
爬取结果
导入数据库代码
BufferedReader hrefBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestHref.json")); BufferedReader authorBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAuthor.json")); BufferedReader articleBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestArticle.json")); BufferedReader abstractBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestAbstract.json")); BufferedReader yearBufferedReader = new BufferedReader(new FileReader("C:\\CS\\Python\\PycharmProjects\\test2\\newTestYear.json")); String lineHref = null; String lineAuthor = null; String lineArticle = null; String lineAbstract = abstractBufferedReader.readLine(); String lineYear = null; while ((lineHref = hrefBufferedReader.readLine()) != null) { lineAbstract = abstractBufferedReader.readLine(); lineAuthor = authorBufferedReader.readLine(); lineArticle = articleBufferedReader.readLine(); lineYear = yearBufferedReader.readLine(); Paper paper = new Paper(); paper.setHref(lineHref); paper.setAuthor(lineAuthor); paper.setArticle(lineArticle); paper.setPaperAbstract(lineAbstract); paper.setYear(lineYear); paperMapper.insert(paper); }
绝对路径请更换成自己的；Java 这部分代码需要引入 mybatis-plus 依赖。