实例一>>爬取airbnb房屋信息


from selenium import webdriver

# Imported here so this snippet is self-contained; only `webdriver` is
# imported above it.
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)

# Path to the locally downloaded chromedriver binary (this script drives Chrome).
driver = webdriver.Chrome(executable_path='D:\\chromedriver.exe')
try:
    # Open the search results page.
    driver.get("https://www.airbnb.cn/s/Shenzhen--China/homes")
    # Find every rental listing on the page.
    rent_list = driver.find_elements_by_css_selector('div._gig1e7')

    # Print the review count, price, name and listing type of each rental.
    for eachhouse in rent_list:
        # Review count: fall back to 0 when the review element cannot be read.
        # Only the Selenium lookup errors are caught, so Ctrl-C and genuine
        # programming errors are no longer silently swallowed.
        try:
            comment = eachhouse.find_element_by_css_selector('span._1clmxfj').text
        except (NoSuchElementException, StaleElementReferenceException):
            comment = 0
        # Price (raw text; optionally clean it up with
        # .replace("每晚", "").replace("价格", "").replace("\n", "")).
        price = eachhouse.find_element_by_css_selector('div._1ixtnfc').text
        # Listing name.
        name = eachhouse.find_element_by_css_selector('div._qrfr9x5').text
        # Listing type / details.
        details = eachhouse.find_element_by_css_selector('div._1dir9an').text

        print(comment, price, name, details)
finally:
    # Always close the browser and stop the chromedriver process, even when a
    # lookup above raises.
    driver.quit()

重点是通过【检查】来查看元素的定位名称,根据类型选择对应函数。

有时候会报错:

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=86.0.4240.183)

这个问题是有时会出现,有时不会出现。

异常:   selenium.common.exceptions.StaleElementReferenceException(msg=None, screen=None, stacktrace=None)
基类:     selenium.common.exceptions.WebDriverException

                当对某个元素的引用已经“过时”(stale)时抛出该异常。
                “过时”是指这个元素已经不在当前页面的DOM中(例如页面刷新或重新渲染之后)。

selenium的常见异常网站: https://blog.51cto.com/12965114/1931478

        try:
            details = eachhouse.find_element_by_css_selector('div._1dir9an')
            details = details.text
        except ex.StaleElementReferenceException:
            print("error")

使用 try ... except 捕获异常后,程序可以正常运行,但仍有大量的数据没有爬到值

'''
https://www.airbnb.cn/s/Shenzhen--China/homes?refinement_paths%5B%5D=%2Fhomes&
current_tab_id=home_tab&selected_tab_id=home_tab&screen_size=large
&hide_dates_and_guests_filters=false&place_id=ChIJkVLh0Aj0AzQRyYCStw1V7v0
&s_tag=22t2AOn4
&last_search_session_id=0cb302bf-293c-4c1c-a7e2-7df7dd6c9ad6
&items_offset=20/40
&section_offset=6


https://www.airbnb.cn/s/Shenzhen--China/homes?items_offset=40
相应页数×20
'''
from selenium import webdriver
from selenium.common import exceptions as ex
def _read_text(card, selector, default=None):
    """Return the text of the first element matching *selector* inside *card*.

    If the element reference has gone stale, print "error" and return
    *default*, so the caller never reuses a value left over from a previous
    listing or hits an unassigned name.
    """
    try:
        return card.find_element_by_css_selector(selector).text
    except ex.StaleElementReferenceException:
        print("error")
        return default


# Path to the locally downloaded chromedriver binary (this script drives Chrome).
driver = webdriver.Chrome(executable_path='D:\\chromedriver.exe')
try:
    # Visit the first three result pages; items_offset grows by 20 per page.
    for i in range(0, 3):
        link = 'https://www.airbnb.cn/s/Shenzhen--China/homes?items_offset=' + str(i * 20)
        # driver.get() returns None, so its result is not kept.
        driver.get(link)
        rent_list = driver.find_elements_by_css_selector('div._gig1e7')
        for eachhouse in rent_list:
            # Review count: fall back to 0 when the review element cannot be
            # read. Only Selenium lookup errors are caught, so Ctrl-C and
            # genuine programming errors are not swallowed.
            try:
                comment = eachhouse.find_element_by_css_selector('span._1clmxfj').text
            except (ex.NoSuchElementException, ex.StaleElementReferenceException):
                comment = 0
            # Price, name and listing type; each becomes None if the element
            # went stale while being read.
            price = _read_text(eachhouse, 'div._1ixtnfc')
            name = _read_text(eachhouse, 'div._qrfr9x5')
            details = _read_text(eachhouse, 'div._1dir9an')

            print(comment, price, name, details)
finally:
    # Always close the browser and stop the chromedriver process.
    driver.quit()