抽取豆瓣小组文章的程序
很悲催,python常用的正则表达式一点都不熟,写了个漏洞百出的只看楼主的代码,先贴上来,有空再完善。
# -*- coding: utf8 -*-import urllib2import re#from BeautifulSoup import BeautifulSoup##def (i,title)=ExtractTitle(rawdata):## buf=[]#### print rawdata[0]## i=0## while(rawdata[i]):## m = rawdata[i].find('<title>')## i=i+1## rawdata[i]## i++## while (m==-1)#### if m!=-1:## i=i+1## print rawdata[i]def Extract(rawdata): author=[] title=[] i=0 content=[] link=[] while i < len(rawdata): m = rawdata[i].find('<title>') if m!=-1: title=rawdata[i+1] print title###find the author tmpline = rawdata[i].find('topic-doc') if tmpline!=-1: print rawdata[i-2] posBeg=rawdata[i-2].find('alt=') posEnd=rawdata[i-2].find('/>') author = rawdata[i-2][posBeg+5:posEnd-1] print "author: "+author i=i+1 ## find the content written by the author while i < len(rawdata): contentLine = rawdata[i].find('alt="'+author+'"') if contentLine!=-1: while i< len(rawdata): pLine = rawdata[i].find('<p>') if pLine!=-1: while i< len(rawdata): pEndLine=rawdata[i].find('</p>') if pEndLine !=-1: print rawdata[i] content.append(rawdata[i]) break i+=1 break i+=1 i+=1 i=i+1 def ExtractLink(rawdata,start): i=start links=[] while i< len(rawdata): line = rawdata[i].find('paginator') if line!=-1: tmpRow = rawdata[i] posBeg=tmpRow.find("href=") if posBeg!=-1: tmpRow=tmpRow[posBeg+6:-1] posBeg=0 while posBeg!=-1: posEnd=tmpRow.find(">")## print posBeg## print posEnd## print tmpRow[posBeg:posEnd] links.append(tmpRow[posBeg:posEnd-1]) posBeg=tmpRow.find("href") if posBeg==-1: break tmpRow=tmpRow[posBeg+6:-1] posBeg=0 break i+=1 links=links[0:len(links)-1] return linksdef ExtractAuthorContent(rawdata): author=[] title=[] pos=0 pEnd=0 print len(rawdata) title,pos=findTitle(rawdata,pos) author,pos=findAuthor(rawdata,pos) ## find the content written by the author content,pEnd=extractAllSections(rawdata,author,pos) return author## print content## print pEnd##def ExtractContent(rawdata,author): pos=0 print len(rawdata) print author ## find the content written by the author content,pEnd=extractAllSections(rawdata,author,pos)def 
extractSection(rawdata,start): i=start content=[] done=0 while i< len(rawdata): pBeginLine = rawdata[i].find('<p>') if pBeginLine!=-1: break i+=1 while i< len(rawdata): pEndLine=rawdata[i].find('</p>') content.append(rawdata[i]) if pEndLine !=-1: print rawdata[i] break i+=1 return (content,i) def extractAllSections(rawdata,author,start): i=start content=[] pEnd=0 count=0 while i < len(rawdata): contentLine = rawdata[i].find('alt="'+author+'"') if contentLine!=-1: tmpContent,i=extractSection(rawdata,i) content.append(tmpContent) if i!=rawdata: count+=1 pEnd=i i+=1 i+=1 return (content,pEnd) def findTitle(rawdata,start): i=start while i < len(rawdata): m = rawdata[i].find('<title>') if m!=-1: title=rawdata[i+1] print title break i+=1 return(title,i) def findAuthor(rawdata,start): i=start author=0 while i < len(rawdata): tmpline = rawdata[i].find('topic-doc') if tmpline!=-1: posBeg=rawdata[i-2].find('alt=') posEnd=rawdata[i-2].find('/>') author = rawdata[i-2][posBeg+5:posEnd-1] print "author: "+author break i+=1 return (author,i) #data=file('E:/petrelli/play/crawl_douban/douban_2.htm','r').readlines()#soup = BeautifulSoup(data)#print soup.prettify()#Extract(data)data = urllib2.urlopen('http://www.douban.com/group/topic/9737262/').readlines()links=ExtractLink(data,0)author=ExtractAuthorContent(data)for link in links: print link## data = urllib2.urlopen(link).readlines()## ExtractContent(data,author) #ExtractAuthorContent(data)#for line in data:# print line?