Scraping 药智网 (yaozh.com) data with BeautifulSoup
This post uses the BeautifulSoup module to scrape drug registration data from db.yaozh.com.
Tips:
1. When scraping multiple pages, test with a single page first; otherwise your IP can easily get banned.
2. A data-cleaning pattern I use often (a small runnable example follows after this list):
   reg = re.compile('pattern')
   data = reg.sub('replacement string', data)
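A minimal sketch of that compile/sub pattern; the whitespace-collapsing regex and the sample string here are illustrative, not taken from the scraper below:

# -*- coding: utf-8 -*-
import re

data = ' Aspirin \n Tablets \t'
# Pattern of characters to replace (illustrative: runs of whitespace)
reg = re.compile(r'[\n\t ]+')
# Substitute each run with a single space, then trim the ends
data = reg.sub(' ', data).strip()
print(data)  # -> Aspirin Tablets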
The code (there really isn't much of it):
# encoding=utf-8
from bs4 import BeautifulSoup
import urllib2
import time


class YBZC():
    def __init__(self):
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}

    def getHtml(self, pageIndex):
        try:
            url = 'http://db.yaozh.com/zhuce?p=' + str(pageIndex)
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            html = response.read()
            return html
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                print u"Connection failed:", e.reason
            return None

    def getItems(self):
        for i in range(1, 13):
            html = self.getHtml(i)  # bug fix: the page index must be passed in
            if html is None:        # skip pages that failed to download
                continue
            soup = BeautifulSoup(html, "html.parser")
            tr_list = soup.find_all('tr')
            # Table header (only written once, on the first page)
            if i == 1:
                for item in tr_list[0]:
                    if item not in ['\n', '\t', ' ']:
                        with open('yaopinzhuce1030.txt', 'a') as f:
                            f.write(item.get_text(strip=True).encode('utf-8') + '|')
            # =========================2015-10-30================================
            # The first version pulled all the data down and only filtered it
            # when writing to the file; this version filters before writing.
            # Back then I hadn't thought of (and didn't understand) the
            # get_text() method, which is why the code was not concise...
            # ===================================================================
            # list_tit = []
            # for ths in tr_list[0]:
            #     if ths.find('a'):
            #         for item in ths:
            #             if type(item) != unicode:
            #                 list_tit.append(item.string)
            #     else:
            #         list_tit.append(ths.get_text(strip=True))
            # for item in list_tit:
            #     if item not in ['', ' ', '\n', '\t']:
            #         with open('yaopinzhuce_new.txt', 'a') as f:
            #             f.write(item.encode('utf-8') + '|')
            # Table body rows
            f = open('yaopinzhuce1030.txt', 'a')
            for tr in tr_list[1:]:
                f.write('\n')
                for item in tr:
                    if item not in ['', ' ', '\n']:
                        if item.string is None:
                            f.write('None' + '|')
                        else:
                            f.write(item.string.encode('utf-8') + '|')
            f.close()
            print 'sleeping... page %d/12 loaded' % i
            time.sleep(5)  # throttle requests so the IP is less likely to be banned


spider = YBZC()
spider.getItems()
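Note that the listing above is Python 2 only (urllib2, print statements). On Python 3, urllib2 was split into urllib.request and urllib.error; a minimal sketch of just the fetch step under that assumption:

# Python 3 sketch of getHtml(); urllib2 became urllib.request / urllib.error.
from urllib.request import Request, urlopen
from urllib.error import URLError


def get_html(page_index):
    url = 'http://db.yaozh.com/zhuce?p=' + str(page_index)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    try:
        # urlopen() returns a response object usable as a context manager
        with urlopen(Request(url, headers=headers)) as response:
            return response.read().decode('utf-8')  # assumes the page is UTF-8
    except URLError as e:
        if hasattr(e, 'reason'):
            print('Connection failed:', e.reason)
        return None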