参考别人的代码写了个爬虫,出错的地方看不明白,请赐教解决方法

2013-01-25

参考别人的代码写了个爬虫,出错的地方看不明白,请赐教#-*- coding:gbk-*-Created on 2009-9-7 @author:

参考别人的代码写了个爬虫,出错的地方看不明白,请赐教
#-*- coding:gbk-*-
'''
Created on 2009-9-7

@author: Ken
'''
import  urllib
import urllib2
import random
import os
import re
from sgmllib import SGMLParser

class URLLister(SGMLParser):
    '''获取html中的图片地址\url地址，装入list中'''
    def reset(self):
        SGMLParser.reset(self)
        self.img = []
        self.urls = []
    def start_img(self, attrs):
        img = [v for k, v in attrs if k == 'src']
        if img:
            self.img.extend(img)
    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)

def get_docum(url):
    url = url + '//'
    sock = urllib.urlopen(url)
    file = sock.read()
    sock.close()
    return file

def is_img(url):
    global imglenth
    request = urllib2.Request(url)
    opener = urllib2.build_opener()
    try:
        con = opener.open(request)
        Type = con.headers.dict['content-type'][:5] #判断链接返回的 content-type是不是图片。
        Length = int(con.headers.dict['content-length'])#判断图片大小
        if Length > imglenth:
#            print "Here is a type :" + Type
            return bool(Type)

        else:
            print "图片 too small"
            return 0
    except:
        print '该图片无法在服务器找到或者图片地址无法识别！'
        print url

def get_file_name(ospath, imgname):
    name = 'P' + str(random.randint(10000000, 99999999))
    filepath = "%s%s.%s" % (ospath, name, (imgname.split('.'))[-1])
    return filepath

def get_img(rq):
    parser = URLLister();    doc = get_docum(rq);    parser.feed(doc);    img = parser.img
    parser.close()
    for i in range(0, len(img)):

        if img[i][0:4] != 'http':#处理绝对路径
            img[i] = rq + img[i]
    return img

def get_url(rq):
    parser = URLLister();    doc = get_docum(rq);    parser.feed(doc);    urls = parser.urls
    parser.close()
    for i in range(0, len(urls)):
        if urls[i][0:4] != 'http': #处理绝对路径
            urls[i] = rq + urls[i]
    return urls

def compare_subs(str_a):
    splitresult = str_a.split('/')
#    print "after strip url " + str(splitresult)
    str_x = splitresult[len(splitresult) - 1]
#    print "get the last segment of url " + str_x
#    print "Type of last segment of url " + str(type(str_x))
    if (type(str_x) == str):
        res = str_x.split('.')
#        print "after strip last segment of url " + str(res)
        str_x = res[len(res) - 1]
        comp_jpg = re.search(str_x, 'jpg', re.IGNORECASE)
        comp_gif = re.search(str_x, 'gif', re.IGNORECASE)
        if bool(comp_jpg or comp_gif):
            return is_img(str_a)
        else:
            return False
    else:
        return False

def depth(url, dep, ospath):
    '''三个参数分别是
    url ：需要下载的网站地址
    dep ：需要遍历的深度
    ospath：图片下载的本地文件夹
    '''

    global num
    if dep <= 0:
        return 0
    else:
        img = get_img(url)
        for j in range(0, len(img)):
            print (img[j])

            if (compare_subs(img[j])):
#            if (is_img(img[j]) == 'image'):
                isExist = True;
                while(isExist): #判断文件是否已经存在
                    filepath = get_file_name(ospath, img[j]);

                    if (not os.path.exists(filepath)):
                        isExist = False;
                try:
                    urllib.urlretrieve(img[j], filepath)
                    print '已经下载好第%d张图片' % (num + 1)
                    num += 1
                except:
                    print '该图片无法下载或者图片地址无法识别！'
                    print img[j]
            else:
                pass
        urls = get_url(url)
        if len(urls) > 0:
            for url in urls:
                depth(url, (dep - 1), ospath)
        else:
            return 0
        return 1

if __name__ == '__main__':
    imglenth = 100000           #设置需要下载的图片大小。
    num = 0
    depth('http://www.yoka.com/club/one/pic/', 2, "E:\ppp\")
    print '********************************我爬完了！！******************************************'

这里是错误日志:
http://www.googleadservices.com/pagead/conversion/998693514/?value=0&label=6ZKeCLbmigMQirWb3AM&guid=ON&script=0
Traceback (most recent call last):
  File "E:\web get c\example_001.py", line 142, in <module>
    depth('http://www.yoka.com/club/one/pic/', 2, "E:\ppp\")
  File "E:\web get c\example_001.py", line 133, in depth
    depth(url, (dep - 1), ospath)
  File "E:\web get c\example_001.py", line 114, in depth
    if (compare_subs(img[j])):
  File "E:\web get c\example_001.py", line 89, in compare_subs
    comp_jpg = re.search(str_x, 'jpg', re.IGNORECASE)
  File "D:\Python27\lib\re.py", line 142, in search
    return _compile(pattern, flags).search(string)
  File "D:\Python27\lib\re.py", line 244, in _compile
    raise error, v # invalid expression
sre_constants.error: nothing to repeat

[解决办法]
这里：comp_jpg = re.search(str_x, 'jpg', re.IGNORECASE)
这里的正则表达式出问题了
[解决办法]
re.search('jpg', str_x, re.IGNORECASE)

热点排行

perl python

参考别人的代码写了个爬虫,出错的地方看不明白,请赐教解决方法