Python Web Scraping Skills
Scraper 1: Scraping the Tech Track Top 100 companies
Reference: https://towardsdatascience.com/data-science-skills-web-scraping-using-python-d1a85ef607ed
First, import the libraries we will need:
- requests, for sending HTTP requests
- bs4 (BeautifulSoup), for parsing the fetched HTML
- csv, for saving the extracted content
- the extracted data could also be saved in JSON format (see the sketch after this list), though this scraper writes CSV
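A minimal sketch of that JSON option, assuming the same `rows` structure that `abstract()` below returns (a header row followed by one list per company); the filename `fasttrack100.json` and the helper name `save_json_file` are illustrative, not part of the original script:

```python
import json

def save_json_file(rows):
    # rows[0] is the header row; zip each data row with it
    # to build a list of dicts, then dump the list as JSON.
    header = rows[0]
    records = [dict(zip(header, row)) for row in rows[1:]]
    with open("fasttrack100.json", "w", encoding="utf-8") as f_output:
        json.dump(records, f_output, ensure_ascii=False, indent=2)
```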
```python
import requests
from bs4 import BeautifulSoup
import csv

# one User-Agent header, shared by all requests
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/75.0.3770.90 Safari/537.36'}

def get_html(bas_url, headers):
    r = requests.get(bas_url, headers=headers)
    print(r.status_code)
    r.encoding = r.apparent_encoding  # guess the correct encoding from the content
    return r.text

def parse_html(html):
    soup = BeautifulSoup(html, "html.parser")
    return soup

def abstract(soup):
    table = soup.find('table', attrs={'class': "tableSorter"})
    results = table.find_all('tr')
    print("number of results:", len(results))
    rows = []
    # header row
    rows.append(['rank', 'company', 'website', 'location', 'yearend',
                 '2-years growth', 'international sales', 'total sales',
                 'staff', 'comment'])
    for result in results:
        data = result.find_all('td')
        if not data:  # skip rows with no <td> cells (e.g. the header <tr>)
            continue
        rank = data[0].getText()
        company = data[1].find('span', attrs={"class": "company-name"}).getText()
        # follow the company's profile link to extract its official website
        href = data[1].find("a").get("href")
        html_com = get_html(href, HEADERS)
        soup2 = parse_html(html_com)
        try:
            tr = soup2.find_all("tr")[-1]     # the website sits in the last table row
            website = tr.find('a').get('href')
        except (AttributeError, IndexError):  # profile page has no such link
            website = None
        location = data[2].getText()
        yearend = data[3].getText()
        growth = data[4].getText()
        # strip footnote markers (* and ?) and thousands separators
        internationalSales = data[5].getText().strip("*?").replace(",", "")
        totalsales = data[6].getText().strip("*?").replace(",", "")
        staff = data[7].getText()
        comment = data[8].getText()
        rows.append([rank, company, website, location, yearend, growth,
                     internationalSales, totalsales, staff, comment])
    return rows

def save_csv_file(rows):
    # newline="" prevents blank lines between rows on Windows
    with open("fasttrack100.csv", 'w', newline='', encoding="utf-8") as f_output:
        csv_output = csv.writer(f_output)
        csv_output.writerows(rows)

def main():
    bas_url = "https://www.fasttrack.co.uk/league-tables/sme-export-track-100/league-table/"
    total_html = get_html(bas_url, HEADERS)
    soup = parse_html(total_html)
    rows_info = abstract(soup)
    save_csv_file(rows_info)

if __name__ == "__main__":
    main()
```
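Note that `abstract()` fires one extra HTTP request per table row to fetch each company's profile page, so a full run makes over a hundred requests and takes a while; the status code printed by `get_html()` doubles as a crude progress indicator. A quick way to sanity-check the `fasttrack100.csv` written above is to read it back with the same `csv` module:

```python
import csv

# print the header plus the first few data rows of the saved file
with open("fasttrack100.csv", newline='', encoding="utf-8") as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)
        if i >= 3:
            break
```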