python多线程抓取网页的问题
刚接触python利用生产者消费者模型在写一个抓取网页分析的小东东,才写到生产者线程,就一堆问题,代码如下:
#!/usr/bin/python
from sgmllib import SGMLParser
from Queue import Queue
import urllib
import random
import threading
class URLLister(SGMLParser):
def __init__(self, fType, queue):
self.fType = fType
self.data = queue
def start_a(self, attrs):
href = [v for k, v in attrs if k=='href']
if href:
tmpfileType = '.' + self.fType
tmp = href[0][-len(tmpfileType):].lower()
if tmp == tmpfileType:
self.data.put(href)
class Producer(threading.Thread):
def __init__(self, t_name ,searchURL1 , searchURL2 , pageNum , fType , queue):
threading.Thread.__init__(self, name=t_name)
self.searchURL1 = searchURL1
self.searchURL2 = searchURL2
self.pageNum = pageNum
self.fType = fType
self.data=queue
def run(self):
for i in range(0 , self.pageNum):
url = self.searchURL1 + self.fType + self.searchURL2 + str(i * 10)
urllink=urllib.urlopen(url)
urldata=urllink.read()
urllink.close()
lister=URLLister(self.fType , self.data)
lister.feed(urldata)
fileType='doc'
queue = Queue()
pageNum = 5
urlBaidu1 = 'http://www.baidu.com/s?wd=filetype%3A'
urlBaidu2 = '&pn='
producer = Producer('Pro', urlBaidu1 , urlBaidu2 , pageNum , fileType , queue)
producer.start()
producer.join()
之前将URLLister放在主线程中运行时正确的,但是放在生产者线程中时,报错
Exception in thread Pro.:
Traceback (most recent call last):
File "C:\Python27\lib\threading.py", line 552, in __bootstrap_inner
self.run()
File "E:\code\17python\1sys_path\test.py", line 43, in run
lister.feed(urldata)
File "C:\Python27\lib\sgmllib.py", line 103, in feed
self.rawdata = self.rawdata + data
AttributeError: URLLister instance has no attribute 'rawdata'
由于刚接触python,有点无从下手了。请各位指点下,谢谢!
[解决办法]