python网页抓取
在wiktionary 这个网页上,将 Frequency lists as of 2006-04-16: 里几个页面中的4万个单词和对应词频都抓取下来, 生成一个文本文件,格式为两列,
第1列为词频,第2列为单词:
譬如这样
56271872 the
33950064 and
页面在 http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#Project_Gutenberg
我刚刚学习python,谢谢各位大牛!麻烦加一下注释,感激不尽
[解决办法]
# coding=utf8
#
# Scrape the Project Gutenberg word-frequency lists from Wiktionary and save
# them as a two-column text file, one "<count> <word>" pair per line, e.g.
#
#     56271872 the
#     33950064 and
#
# The code runs unchanged on Python 2.6+ and Python 3.
import pprint  # not used below; kept in case other code in the file needs it
import re

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: build_opener() lives in urllib.request
    import urllib.request as urllib2

# Pages that make up the 2006-04-16 Project Gutenberg frequency list.
# NOTE(review): only the "1-10000" URL appeared in the original script; the
# other three are assumed to follow the same naming scheme -- confirm against
# the index page before relying on them.
WIKI_BASE_URL = ("http://en.wiktionary.org/wiki/"
                 "Wiktionary:Frequency_lists/PG/2006/04/")
WIKI_PAGE_RANGES = ("1-10000", "10001-20000", "20001-30000", "30001-40000")

# One list entry looks like:  <a href="/wiki/the" title="the">the</a> = 56271872
# Group 1 is the word (taken from the title attribute), group 2 the count.
# Only all-lowercase words match, so entries with capitals or punctuation
# are skipped.
_ENTRY_RE = re.compile(
    r'<a href="/wiki/[a-z]*"\s*title="([a-z]+)">[a-z]*</a>\s*=\s*([0-9]+)')


def gethtml(url):
    """Download *url* and return the page body as text.

    Returns -1 (after printing the error) if anything goes wrong.
    """
    try:
        opener = urllib2.build_opener()
        # Send an explicit User-agent instead of the library default.
        opener.addheaders = [('User-agent', 'MyWikipediaInfoScraper/0.1')]
        response = opener.open(url)
        try:
            page = response.read()
        finally:
            response.close()
        # Python 3 hands back bytes; the regex below needs text.
        if not isinstance(page, str):
            page = page.decode('utf-8')
        return page
    except Exception as ex:
        print(ex)
        return -1


def readlocalfile():
    """Return the contents of a locally saved page (for offline testing).

    Returns -1 (after printing the error) if the file cannot be read.
    """
    try:
        with open('./res/wikipage1.txt', 'r') as fh:
            return fh.read()
    except Exception as ex:
        print(ex)
        return -1


def getdata(html, datafile='./res/wikidata.txt', mode='w'):
    """Extract every "word = count" entry from *html* and write it out.

    Each entry becomes one "<count> <word>" line in *datafile*.  *mode* is
    the file mode: 'w' starts a fresh file, 'a' appends to an existing one.
    Entries that lack a numeric count are skipped.

    Returns None on success and -1 (after printing the error) on failure.
    """
    try:
        entries = _ENTRY_RE.findall(html)  # list of (word, count) tuples
        with open(datafile, mode) as fh:
            fh.writelines('%s %s\n' % (count, word) for word, count in entries)
        return None
    except Exception as ex:
        print(ex)
        return -1


def getwikidata(urls=None):
    """Fetch every frequency-list page and store its entries in one file.

    *urls* defaults to the pages named by WIKI_PAGE_RANGES.  The first page
    overwrites the output file; the following pages are appended to it.

    Returns None on success, -2 if a page could not be fetched or saved,
    and -1 on any other error.
    """
    try:
        if urls is None:
            urls = [WIKI_BASE_URL + page for page in WIKI_PAGE_RANGES]
        for index, url in enumerate(urls):
            # For offline testing, swap gethtml(url) for readlocalfile().
            html = gethtml(url)
            if html == -1:
                return -2
            if getdata(html, mode='w' if index == 0 else 'a') == -1:
                return -2
        return None
    except Exception as ex:
        print(ex)
        return -1


if __name__ == '__main__':
    print('begin...')
    ret = getwikidata()
    if ret in (-1, -2):
        print('end. but have some errors')
    else:
        print('end. ok')