Problem with Queue in a thread pool
I wrote a multithreaded crawler: it fetches a page from a URL, extracts the URLs inside that page, and goes on to crawl those. It uses Queue, but when I run it from the XP command line the cursor often just stops moving. It's probably a synchronization problem; I've been debugging for a long time without figuring it out. Posting the code, please point out what's wrong:
#coding=utf-8
from __future__ import with_statement
from BeautifulSoup import BeautifulSoup
import urllib2
from threading import Thread
from Queue import Queue
import time
import socket

socket.setdefaulttimeout(5)

class Fetcher:  # wrap everything in a class; adapted from an example found online
    def __init__(self, th_num):
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.lock = Lock()      # thread lock
        self.q_req = Queue()    # task queue
        self.q_ans = Queue()    # result queue
        self.urls = []          # URLs collected from the fetched pages
        self.th_num = th_num
        for i in range(th_num):     # fetch threads
            t = Thread(target=self.thread_get)
            t.setDaemon(True)
            t.start()
        for i in range(th_num):     # result-handling threads
            t = Thread(target=self.thread_put)
            t.setDaemon(True)
            t.start()

    def join(self):  # wait for both queues to drain before tearing down
        time.sleep(0.5)
        print '=====================im done'
        self.q_req.join()
        self.q_ans.join()

    def push(self, req):
        self.q_req.put(req)

    def thread_put(self):
        while True:
            try:
                if not self.q_ans.empty():
                    url = self.q_ans.get()
                    self.urls.extend(url)
                    self.q_ans.task_done()
            except Queue.empty, qe:
                print qe, 'Queue=========='
                continue
            except Exception, e:
                print e, 'other,excp========'
                continue

    def thread_get(self):
        print 'i am starting------'
        while True:
            try:
                if self.q_req.empty():  # spin until a request shows up
                    continue
                req = self.q_req.get()
            except Queue.empty, qe:
                print 'enmpty-----------'
                continue
            urls = []
            ans = ''
            try:
                ans = self.opener.open(req).read()
                soup = BeautifulSoup(ans)
                for a in soup.findAll('a'):
                    try:
                        if a['href'].startswith('http'):
                            urls.append(a['href'])
                    except KeyError, e:
                        print e, '=======================KeyError=in=soup=findAll'
                        continue
                    except Exception, ex:
                        print ex, '========================Exception=in=soup=findAll'
                        continue
                self.q_ans.put(urls)
                self.q_req.task_done()
            except UnicodeEncodeError, ue:
                print 'unicode----------------------wrong'
                print ue
                print req
                continue
            except urllib2.URLError, ue:
                print 'conn-----------rufuse'
                print ue
                print req
                continue
            except Exception, what:
                print 'other--exception----------in- threadget----'
                print what
                print req
                continue
            time.sleep(0.1)  # don't spam
        print 'get=========='

def run(links, th_num=10):
    f = Fetcher(th_num)
    for url in links:
        f.push(url)
    f.join()
    return f.urls

if __name__ == "__main__":
    links = ['http://kingdowin.com/', ]
    deep = 2  # crawl depth
    while deep > 0:
        urls = run(links)
        deep -= 1
        links = urls
    print links
    print "Exiting Main Thread"

Sample output (after the last lines the cursor just stops moving):
i am starting------
i am starting------
=====================im done
[u'http://my.4399.com/userapp.php?id=100111', u'http://my.kingdowin.com', u'http://apps.renren.com/tdsheep/', u'http://www.pengyou.com/index.php?mod=appmanager&act=openapp&type=qzone&appid=16488', u'http://www.renren.com', u'http://uchome.developer.manyou.com/', u'http://www.myspace.cn/', u'http://www.facebook.com', u'http://www.pengyou.com', u'http://www.kaixin001.com']
i am starting------
i am starting------
i am starting------
i am starting------
i am starting------
i am starting------
i am starting------
i am starting------
i am starting------
i am starting------
=====================im done
conn-----------rufuse
<urlopen error timed out>
http://www.facebook.com
conn-----------rufuse
<urlopen error timed out>
http://my.kingdowin.com
It runs fine for me.
About this line: self.lock = Lock() # thread lock
Here it raises "Lock is not defined". After I either add
from threading import Lock
or comment the line out, it runs.
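That is, if you keep the lock, the threading import at the top would become:

from threading import Thread, Lock

Either fix works because self.lock is never actually used anywhere else in the posted code.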
[Solution]
It seems that once Queue.get() succeeds, task_done() has to be called no matter what exception the subsequent processing raises; otherwise Queue.join() will block forever. That is exactly what happens in thread_get: every except branch does continue before self.q_req.task_done() is reached, so the final q_req.join() never returns.
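For what it's worth, a minimal sketch of that idea (Python 2, keeping the same q_req/q_ans attributes as above; fetch_and_parse is a hypothetical helper standing in for the opener.open + BeautifulSoup work):

    def thread_get(self):
        while True:
            req = self.q_req.get()   # blocking get: no empty()-polling needed
            try:
                urls = self.fetch_and_parse(req)  # hypothetical helper: open + extract links
                self.q_ans.put(urls)
            except Exception, e:
                print e, req         # log and keep the worker alive
            finally:
                self.q_req.task_done()  # runs even on exceptions, so q_req.join() can return

thread_put would need the same treatment: block on self.q_ans.get() and call self.q_ans.task_done() in a finally block after each get().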