第一步:爬取分类url
from requests_html import HTMLSession
session =HTMLSession()
#https://www.flipkart.com/lc/getData?dataSourceId=websiteNavigationMenuDS_1.0
#还有个t参数,目前可以不加
res =session.get('https://www.flipkart.com/lc/getData?dataSourceId=websiteNavigationMenuDS_1.0', verify=False)
appliances_columns_list=res.json().get('navData').get('appliances').get('tabs')[0].get('columns')
men_columns_list=res.json().get('navData').get('men').get('tabs')[0].get('columns')
women_columns_list=res.json().get('navData').get('women').get('tabs')[0].get('columns')
baby_kids_columns_list=res.json().get('navData').get('baby-kids').get('tabs')[0].get('columns')
home_kitchen_columns_list=res.json().get('navData').get('home-kitchen').get('tabs')[0].get('columns')
nav_columns_lists =appliances_columns_list+men_columns_list+women_columns_list+baby_kids_columns_list+home_kitchen_columns_list
for nav_columns_list in nav_columns_lists:
for title_url_type_dict in nav_columns_list:
print(title_url_type_dict.get('url'))
第二步:获取翻页
#page参数控制
如:https://www.flipkart.com/womens-footwear/pr?sid=osp,iko&page=3
第三步:获取详情页url
from requests_html import HTMLSession
session =HTMLSession()
res =session.get('https://www.flipkart.com/womens-footwear/pr?sid=osp,iko&page=3', verify=False)
print(res.html.xpath('//*[@id="container"]/div/div[3]/div[2]/div[1]/div[2]/div/div/div/div/div/a[1]/@href')[4:])
第四步:详情页访问