首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 开发语言 > perl python >

参照别人的代码写了个爬虫,出错的地方看不明白,请赐教

2012-10-16 
参考别人的代码写了个爬虫,出错的地方看不明白,请赐教#-*- coding:gbk-*-Created on 2009-9-7@author:

参考别人的代码写了个爬虫,出错的地方看不明白,请赐教
#-*- coding:gbk-*-
'''
Created on 2009-9-7
 
@author: Ken
'''
import urllib
import urllib2
import random
import os
import re
from sgmllib import SGMLParser
 

 
class URLLister(SGMLParser):
  '''获取html中的图片地址\url地址,装入list中'''
  def reset(self):
  SGMLParser.reset(self)
  self.img = []
  self.urls = []
  def start_img(self, attrs):
  img = [v for k, v in attrs if k == 'src']
  if img:
  self.img.extend(img)
  def start_a(self, attrs):
  href = [v for k, v in attrs if k == 'href']
  if href:
  self.urls.extend(href)
 

def get_docum(url):
  url = url + '//'
  sock = urllib.urlopen(url)
  file = sock.read()
  sock.close()
  return file
 
def is_img(url):
  global imglenth
  request = urllib2.Request(url)
  opener = urllib2.build_opener()
  try:
  con = opener.open(request)
  Type = con.headers.dict['content-type'][:5] #判断链接返回的 content-type是不是图片。
  Length = int(con.headers.dict['content-length'])#判断图片大小
  if Length > imglenth:
# print "Here is a type :" + Type
  return bool(Type)

  else:
  print "图片 too small"
  return 0
  except:
  print '该图片无法在服务器找到或者图片地址无法识别!'
  print url
   
def get_file_name(ospath, imgname): 
  name = 'P' + str(random.randint(10000000, 99999999))
  filepath = "%s%s.%s" % (ospath, name, (imgname.split('.'))[-1])
  return filepath  

def get_img(rq):
  parser = URLLister(); doc = get_docum(rq); parser.feed(doc); img = parser.img
  parser.close()
  for i in range(0, len(img)):
  if img[i][0:4] != 'http':#处理绝对路径
  img[i] = rq + img[i]
  return img
 
def get_url(rq):
  parser = URLLister(); doc = get_docum(rq); parser.feed(doc); urls = parser.urls
  parser.close()
  for i in range(0, len(urls)):
  if urls[i][0:4] != 'http': #处理绝对路径
  urls[i] = rq + urls[i]
  return urls

def compare_subs(str_a):
  splitresult = str_a.split('/')
# print "after strip url " + str(splitresult)
  str_x = splitresult[len(splitresult) - 1]
# print "get the last segment of url " + str_x
# print "Type of last segment of url " + str(type(str_x))
  if (type(str_x) == str):
  res = str_x.split('.')
# print "after strip last segment of url " + str(res)
  str_x = res[len(res) - 1]
  comp_jpg = re.search(str_x, 'jpg', re.IGNORECASE)
  comp_gif = re.search(str_x, 'gif', re.IGNORECASE)
  if bool(comp_jpg or comp_gif):
  return is_img(str_a)
  else:
  return False
  else:
  return False

 
def depth(url, dep, ospath):
  '''三个参数分别是
  url : 需要下载的网站地址
  dep :需要遍历的深度 
  ospath:图片下载的本地文件夹  
  '''
   
  global num
  if dep <= 0:
  return 0
  else:
  img = get_img(url)
  for j in range(0, len(img)):
  print (img[j])


   
  if (compare_subs(img[j])):
# if (is_img(img[j]) == 'image'):
  isExist = True;
  while(isExist): #判断文件是否已经存在
  filepath = get_file_name(ospath, img[j]);
  if (not os.path.exists(filepath)):
  isExist = False;
  try:
  urllib.urlretrieve(img[j], filepath)
  print '已经下载好第%d张图片' % (num + 1)
  num += 1
  except:
  print '该图片无法下载或者图片地址无法识别!'
  print img[j]
  else:
  pass
  urls = get_url(url)
  if len(urls) > 0:
  for url in urls:
  depth(url, (dep - 1), ospath)
  else:
  return 0
  return 1
 

if __name__ == '__main__':
  imglenth = 100000 #设置需要下载的图片大小。 
  num = 0
  depth('http://www.yoka.com/club/one/pic/', 2, "E:\ppp\\")
  print '********************************我爬完了!!******************************************'

这里是错误日志:
http://www.googleadservices.com/pagead/conversion/998693514/?value=0&label=6ZKeCLbmigMQirWb3AM&guid=ON&script=0
Traceback (most recent call last):
  File "E:\web get c\example_001.py", line 142, in <module>
  depth('http://www.yoka.com/club/one/pic/', 2, "E:\ppp\\")
  File "E:\web get c\example_001.py", line 133, in depth
  depth(url, (dep - 1), ospath)
  File "E:\web get c\example_001.py", line 114, in depth
  if (compare_subs(img[j])):
  File "E:\web get c\example_001.py", line 89, in compare_subs
  comp_jpg = re.search(str_x, 'jpg', re.IGNORECASE)
  File "D:\Python27\lib\re.py", line 142, in search
  return _compile(pattern, flags).search(string)
  File "D:\Python27\lib\re.py", line 244, in _compile
  raise error, v # invalid expression
sre_constants.error: nothing to repeat



[解决办法]
这里:comp_jpg = re.search(str_x, 'jpg', re.IGNORECASE)
这里的正则表达式出问题了
[解决办法]
re.search('jpg', str_x, re.IGNORECASE)

热点排行