I wrote a crawler based on someone else's code, but I can't make sense of where it goes wrong. Please advise.
# -*- coding: gbk -*-
'''
Created on 2009-9-7
@author: Ken
'''
import urllib
import urllib2
import random
import os
import re
from sgmllib import SGMLParser


class URLLister(SGMLParser):
    '''Collect the image src addresses and link hrefs found in the HTML into lists.'''
    def reset(self):
        SGMLParser.reset(self)
        self.img = []
        self.urls = []

    def start_img(self, attrs):
        img = [v for k, v in attrs if k == 'src']
        if img:
            self.img.extend(img)

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)


def get_docum(url):
    url = url + '//'
    sock = urllib.urlopen(url)
    file = sock.read()
    sock.close()
    return file


def is_img(url):
    global imglenth
    request = urllib2.Request(url)
    opener = urllib2.build_opener()
    try:
        con = opener.open(request)
        Type = con.headers.dict['content-type'][:5]       # is the content-type the link returns an image?
        Length = int(con.headers.dict['content-length'])  # check the image size
        if Length > imglenth:
            # print "Here is a type :" + Type
            return bool(Type)
        else:
            print 'image too small'
            return 0
    except:
        print 'The image cannot be found on the server, or its address cannot be recognized!'
        print url


def get_file_name(ospath, imgname):
    name = 'P' + str(random.randint(10000000, 99999999))
    filepath = "%s%s.%s" % (ospath, name, (imgname.split('.'))[-1])
    return filepath
def get_img(rq):
    parser = URLLister()
    doc = get_docum(rq)
    parser.feed(doc)
    img = parser.img
    parser.close()
    for i in range(0, len(img)):
        if img[i][0:4] != 'http':  # make relative paths absolute
            img[i] = rq + img[i]
    return img


def get_url(rq):
    parser = URLLister()
    doc = get_docum(rq)
    parser.feed(doc)
    urls = parser.urls
    parser.close()
    for i in range(0, len(urls)):
        if urls[i][0:4] != 'http':  # make relative paths absolute
            urls[i] = rq + urls[i]
    return urls


def compare_subs(str_a):
    splitresult = str_a.split('/')
    # print "after strip url " + str(splitresult)
    str_x = splitresult[len(splitresult) - 1]
    # print "get the last segment of url " + str_x
    # print "Type of last segment of url " + str(type(str_x))
    if type(str_x) == str:
        res = str_x.split('.')
        # print "after strip last segment of url " + str(res)
        str_x = res[len(res) - 1]
        comp_jpg = re.search(str_x, 'jpg', re.IGNORECASE)
        comp_gif = re.search(str_x, 'gif', re.IGNORECASE)
        if bool(comp_jpg or comp_gif):
            return is_img(str_a)
        else:
            return False
    else:
        return False


def depth(url, dep, ospath):
    '''The three parameters are:
    url:    address of the site to download from
    dep:    depth to crawl to
    ospath: local folder the images are downloaded into
    '''
    global num
    if dep <= 0:
        return 0
    else:
        img = get_img(url)
        for j in range(0, len(img)):
            print img[j]
            if compare_subs(img[j]):
                # if is_img(img[j]) == 'image':
                isExist = True
                while isExist:  # keep generating names until the file does not already exist
                    filepath = get_file_name(ospath, img[j])
                    if not os.path.exists(filepath):
                        isExist = False
                        try:
                            urllib.urlretrieve(img[j], filepath)
                            print 'Downloaded image number %d' % (num + 1)
                            num += 1
                        except:
                            print 'The image cannot be downloaded, or its address cannot be recognized!'
                            print img[j]
            else:
                pass
        urls = get_url(url)
        if len(urls) > 0:
            for url in urls:
                depth(url, (dep - 1), ospath)
        else:
            return 0
        return 1


if __name__ == '__main__':
    imglenth = 100000  # minimum size of the images worth downloading
    num = 0
    depth('http://www.yoka.com/club/one/pic/', 2, "E:\ppp\\")
    print '******************************** Done crawling!! ******************************************'
Here is the error log:
http://www.googleadservices.com/pagead/conversion/998693514/?value=0&label=6ZKeCLbmigMQirWb3AM&guid=ON&script=0
Traceback (most recent call last):
File "E:\web get c\example_001.py", line 142, in <module>
depth('http://www.yoka.com/club/one/pic/', 2, "E:\ppp\\")
File "E:\web get c\example_001.py", line 133, in depth
depth(url, (dep - 1), ospath)
File "E:\web get c\example_001.py", line 114, in depth
if (compare_subs(img[j])):
File "E:\web get c\example_001.py", line 89, in compare_subs
comp_jpg = re.search(str_x, 'jpg', re.IGNORECASE)
File "D:\Python27\lib\re.py", line 142, in search
return _compile(pattern, flags).search(string)
File "D:\Python27\lib\re.py", line 244, in _compile
raise error, v # invalid expression
sre_constants.error: nothing to repeat
[Solution]
Here: comp_jpg = re.search(str_x, 'jpg', re.IGNORECASE)
The regular expression here is the problem. re.search(pattern, string, flags) takes the pattern first, so str_x, the last piece of the URL, is being compiled as a regex. Look at the URL printed just before the traceback: its last path segment is ?value=0&label=6ZKeCLbmigMQirWb3AM&guid=ON&script=0, which contains no dot, so str_x is that whole segment and begins with '?'. A quantifier with nothing in front of it to repeat is exactly what sre_constants.error: nothing to repeat is complaining about.
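You can reproduce the crash in isolation. A minimal sketch, using the failing URL segment from the log above:

import re

str_x = '?value=0&label=6ZKeCLbmigMQirWb3AM&guid=ON&script=0'
try:
    re.search(str_x, 'jpg', re.IGNORECASE)    # str_x compiled as the pattern: '?' has nothing to repeat
except re.error, e:
    print 'sre_constants.error:', e
print re.search('jpg', str_x, re.IGNORECASE)  # argument order fixed: returns None instead of crashing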
[Solution]
re.search('jpg', str_x, re.IGNORECASE)
Swap the two arguments: the pattern comes first, the string to search comes second.
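Even with the arguments in the right order, re.search('jpg', str_x) matches 'jpg' anywhere in the segment, so something like 'notjpg' would still pass. A minimal sketch of a stricter check, assuming only .jpg and .gif are wanted; the helper name has_img_extension and the example URLs are mine, not from the original code:

import re

def has_img_extension(url):
    # Last path segment, then the part after its last dot.
    ext = url.split('/')[-1].split('.')[-1]
    # The pattern is a fixed literal, anchored at the start by re.match
    # and at the end by $, so URL characters can never be misread as regex syntax.
    return bool(re.match(r'(jpg|gif)$', ext, re.IGNORECASE))

print has_img_extension('http://example.com/a/b.JPG')       # True
print has_img_extension('http://example.com/pic/?value=0')  # False, and no crash

Because the string being tested is never used as a pattern, ad-tracker URLs full of '?' and '&', like the one in the log, can no longer take the crawler down.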