实例一>>爬取airbnb房屋信息
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# Location of the downloaded chromedriver executable.
driver = webdriver.Chrome(executable_path='D:\\chromedriver.exe')
try:
    # Page that the browser opens automatically.
    driver.get("https://www.airbnb.cn/s/Shenzhen--China/homes")

    # Every rental listing card found on the page.
    rent_list = driver.find_elements_by_css_selector('div._gig1e7')

    for eachhouse in rent_list:
        # Review count: fall back to 0 when the card has no such element.
        # Only the "not found" case is caught so that real failures
        # (and Ctrl-C) are no longer silently swallowed.
        try:
            comment = eachhouse.find_element_by_css_selector('span._1clmxfj').text
        except NoSuchElementException:
            comment = 0

        # Price. The raw text can be cleaned further if needed, e.g.
        # .replace("每晚", "").replace("价格", "").replace("\n", "")
        price = eachhouse.find_element_by_css_selector('div._1ixtnfc').text

        # Listing name.
        name = eachhouse.find_element_by_css_selector('div._qrfr9x5').text

        # Listing type / details.
        details = eachhouse.find_element_by_css_selector('div._1dir9an').text

        print(comment, price, name, details)
finally:
    # Always close the browser, even when a lookup above raises.
    driver.quit()
重点是通过【检查】来查看元素的定位名称,根据类型选择对应函数。
有时候会报错:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document (Session info: chrome=86.0.4240.183)
这个问题是有时会出现,有时不会出现。
异常: selenium.common.exceptions.StaleElementReferenceException(msg=None, screen=None, stacktrace=None)
基类: selenium.common.exceptions.WebDriverException
当对某个元素的引用已经“过时”(stale)时抛出该异常。
“过时”是指该元素已不再存在于页面的 DOM 中。
selenium的常见异常网站: https://blog.51cto.com/12965114/1931478
# `ex` is selenium.common.exceptions
# (from selenium.common import exceptions as ex).
try:
    details = eachhouse.find_element_by_css_selector('div._1dir9an').text
except ex.StaleElementReferenceException:
    # The listing card is no longer attached to the page document.
    print("error")
    # Explicit placeholder so later code never sees an unbound name or a
    # value left over from a previous listing.
    details = None
使用 try...except 捕获异常后,程序可以正常运行,但仍有大量数据没有爬取到。
'''
Pagination notes.

Long form of the search URL:
https://www.airbnb.cn/s/Shenzhen--China/homes?refinement_paths%5B%5D=%2Fhomes
&current_tab_id=home_tab&selected_tab_id=home_tab&screen_size=large
&hide_dates_and_guests_filters=false&place_id=ChIJkVLh0Aj0AzQRyYCStw1V7v0
&s_tag=22t2AOn4
&last_search_session_id=0cb302bf-293c-4c1c-a7e2-7df7dd6c9ad6
&items_offset=20/40
&section_offset=6

Short form used below:
https://www.airbnb.cn/s/Shenzhen--China/homes?items_offset=40
items_offset = page index x 20
'''
from selenium import webdriver
from selenium.common import exceptions as ex


def _text_or_none(card, selector):
    """Return the text of the first element matching *selector* inside *card*.

    When the card has gone stale, print "error" and return None, so the
    caller never hits an unbound name or reuses the previous listing's value.
    """
    try:
        return card.find_element_by_css_selector(selector).text
    except ex.StaleElementReferenceException:
        print("error")
        return None


# Location of the downloaded chromedriver executable.
driver = webdriver.Chrome(executable_path='D:\\chromedriver.exe')
try:
    for i in range(0, 3):
        # Each result page is addressed by an offset of 20 items.
        link = 'https://www.airbnb.cn/s/Shenzhen--China/homes?items_offset=' + str(i * 20)
        driver.get(link)

        rent_list = driver.find_elements_by_css_selector('div._gig1e7')
        for eachhouse in rent_list:
            # Review count: default to 0 when the element is missing or the
            # card is stale. Narrower than a bare except, so unrelated
            # failures and Ctrl-C still surface.
            try:
                comment = eachhouse.find_element_by_css_selector('span._1clmxfj').text
            except (ex.NoSuchElementException, ex.StaleElementReferenceException):
                comment = 0

            # Price, listing name and listing type; None marks a stale card.
            price = _text_or_none(eachhouse, 'div._1ixtnfc')
            name = _text_or_none(eachhouse, 'div._qrfr9x5')
            details = _text_or_none(eachhouse, 'div._1dir9an')

            print(comment, price, name, details)
finally:
    # Always close the browser, even when a page load or lookup raises.
    driver.quit()