python + requests + lxml的几个例证

2013-10-14

python + requests + lxml的几个例子。例子没有加入失败后重做的功能，这个也可以考虑增加。第三个例子加入了访问频率控制。

python + requests + lxml的几个例子
例子没有加入失败后重做的功能，这个也可以考虑增加。
第三个例子加入了访问频率控制
遍历图片的例子加入多线程，明显爬得快很多

解析163新闻列表的例子：

#!/usr/bin/python
# encoding=gbk
# NOTE: the declaration above has to match the encoding this script is really
# saved in (the original author saved it as GBK).
#
# Walks the news list on the news.163.com front page and stores every link and
# the article behind it in MySQL.  Only plain news articles are understood;
# blog links, topic pages and the like are stored without body text.  To
# support them, inspect the URL and add a dedicated handler.
#
# Every print uses the single-argument parenthesised form, which produces the
# same output as the original Python 2 print statements.
import sys
import requests
import datetime
import time
import MySQLdb
import chardet
import lxml.html.soupparser as soupparser
import lxml.etree as etree

# Seconds to wait for a server before giving up; without a timeout a single
# stalled connection blocks the whole crawl forever.
REQUEST_TIMEOUT = 30

INSERT_LINK_SQL = "insert into links(url,text) value(%s,%s)"
INSERT_TEXT_SQL = "insert into texts(url,title,text,statue) value(%s,%s,%s,%s)"

start_datetime = datetime.datetime.now()


def parseFromWin1252(str):
    """Repair text that was wrongly decoded as windows-1252 instead of GBK.

    Some articles come back as mojibake; re-encoding to windows-1252 and
    decoding as GBK restores them.  Text that cannot take that round trip,
    and None, is returned unchanged.

    The parameter keeps its original name (which shadows the builtin) so
    existing callers are unaffected.
    """
    if str is None:
        # Elements without text yield None; previously this raised an
        # AttributeError that nothing caught.
        return str
    try:
        return str.encode("windows-1252").decode("GBK")
    except (UnicodeEncodeError, UnicodeDecodeError):
        return str


def _saveText(cursor, link, title, text, status):
    """Insert one row into the texts table."""
    cursor.execute(INSERT_TEXT_SQL, [link, title, text, status])


def resolveAndSaveNewContentFromLink(link, linkTitle, cursor):
    """Download one article and store its title and body text.

    The row's status column records the outcome:
      "Success"   - title and body paragraphs were found
      "NO_TITLE"  - no <h1 id="h1title">, probably a topic/overview page
      "TypeError" - the parser raised TypeError on this page
      "ReqError"  - the page could not be downloaded at all
    In every non-success case the link text is stored as the title.
    """
    print(u"处理: %s" % link)
    try:
        request = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # One unreachable article must not abort the whole crawl.
        print(u"$ReqError: %s %r" % (link, error))
        _saveText(cursor, link, parseFromWin1252(linkTitle), "", "ReqError")
        return

    try:
        dom = soupparser.fromstring(request.content)
        # "//" searches the whole document, so there is no need to index
        # into the root element first (which could raise IndexError).
        titles = dom.xpath("//h1[@id='h1title']")
        if titles:
            title = parseFromWin1252(titles[0].text)
            print(u"@TITLE: %s %s %s" % (request.encoding, title, link))
            paragraphs = dom.xpath("//div[@id='endText']//p")
            text = parseFromWin1252(
                "<br><br>".join(p.text for p in paragraphs if p.text is not None))
            status = "Success"
        else:
            title = parseFromWin1252(linkTitle)
            print(u"#NO_TITLE: %s %s %s" % (request.encoding, title, link))
            text = ""
            status = "NO_TITLE"
    except TypeError:
        title = parseFromWin1252(linkTitle)
        print(u"$TypeError: %s %s %s" % (request.encoding, title, link))
        text = ""
        status = "TypeError"

    _saveText(cursor, link, title, text, status)


def _saveLinkAndContent(href, text, cursor):
    """Store one front-page link, then fetch and store the article behind it."""
    cursor.execute(INSERT_LINK_SQL, [href, text])
    resolveAndSaveNewContentFromLink(href, text, cursor)


def resolveAndSaveLinks(body, cursor):
    """Process every <a> of the main news list that carries its own text.

    Anchors whose text sits inside an <em> child have text None here and are
    handled by resolveAndSaveEmInLinks instead.
    """
    print(u"解析html的Link")
    links = body.xpath("//ul[@class='mod-list main-list']//a")
    print(u"处理数据")
    count = 1
    for item in links:
        href = item.get("href")
        if item.text is None or href is None:
            # No text: an <em> link.  No href: nothing to fetch.
            continue
        _saveLinkAndContent(href, item.text, cursor)
        print(u"完成 <resolveAndSaveLinks>[%s:%s]" % (len(links), count))
        count = count + 1
        print("----------------------")
    print(u"保存数据完成,记录数[ %s ]" % len(links))


def resolveAndSaveEmInLinks(body, cursor):
    """Process the links of the main news list whose text is inside an <em>."""
    print(u"解析html的包含em元素的Link")
    ems = body.xpath("//ul[@class='mod-list main-list']//em")
    print(u"处理数据")
    count = 1
    for item in ems:
        href = item.getparent().get("href")
        if href is None:
            # The <em> is not directly inside an anchor; nothing to fetch.
            continue
        _saveLinkAndContent(href, item.text, cursor)
        print(u"完成 <resolveAndSaveEmInLinks>[%s:%s]" % (len(ems), count))
        count = count + 1
        print("----------------------")
    print(u"保存数据完成,记录数[ %s ]" % len(ems))


def resolve():
    """Crawl the front page and refill the links and texts tables."""
    print(u"打开链接")
    req = requests.get("http://news.163.com/", timeout=REQUEST_TIMEOUT)
    dom = soupparser.fromstring(req.content)
    print(u"链接数据库")
    conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("delete from links")
            cursor.execute("delete from texts")
            print(u"解析并保存到数据库")
            # Links with plain text first, then the ones wrapped in <em>.
            resolveAndSaveLinks(dom, cursor)
            resolveAndSaveEmInLinks(dom, cursor)
            # MySQLdb does not autocommit; without this the rows are
            # discarded on transactional tables when the connection closes.
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    print(u"遍历完成")


# Run the crawl and report how long it took.
resolve()
end_datetime = datetime.datetime.now()
# total_seconds() also counts whole days, unlike the .seconds attribute.
print(u"耗时 %s 秒" % int((end_datetime - start_datetime).total_seconds()))

遍历糗事百科的文章，只遍历导航上面的几个分类，热门，最新，等等

#!/usr/bin/ScanningQiuShiBaiKe.py
# encoding=gbk
# NOTE: the declaration above has to match the encoding this script is really
# saved in (the original author saved it as GBK).
#
# Crawls the articles of qiushibaike.com for the categories in the site's
# navigation bar (8hr, hot, ...) and stores them, including the thumbnail
# image bytes, in the qs_article table.
import sys
import os
import MySQLdb
import requests
import datetime
import time
import lxml.html.soupparser as soupparser
import lxml.etree as etree

# Placeholder inside a paging URL suffix that is replaced by the page number.
currentPageId = "currentPageId"

# Seconds to wait for a server before giving up on a request.
REQUEST_TIMEOUT = 30

# Directory where thumbnails are parked between download and DB insert.
IMAGE_TMP_DIR = "/home/pandy/tmp/"

INSERT_ARTICLE_SQL = (
    "INSERT INTO qs_article ( type, url, detial_link, detail, user_link, user, content,img, img_content, status) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")


def getImageFile(imgUrl):
    """Download imgUrl into IMAGE_TMP_DIR and return the local file name.

    Raises requests.RequestException when the download fails (including a
    non-2xx HTTP status) and IOError/OSError when the file cannot be written.
    """
    local_filename = IMAGE_TMP_DIR + imgUrl.split('/')[-1]
    # stream=True so the body is written chunk by chunk instead of being
    # held in memory as a whole.
    r = requests.get(imgUrl, stream=True, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    # Reported only after the file has really been written.
    print(u"下载文件成功:  %s" % local_filename)
    return local_filename


def _anchorTextAndHref(blocks):
    """Return (text, href) of the first <a> in blocks[0], or ("", "")."""
    if not blocks:
        return "", ""
    anchors = blocks[0].xpath(".//a")
    if not anchors:
        return "", ""
    return anchors[0].text, anchors[0].get("href")


def _readImageBytes(imgUrl):
    """Download imgUrl and return its bytes wrapped for MySQL, or None.

    The image is parked in a temporary file that is always removed again.
    A failed download is reported and yields None so the article itself is
    still stored.
    """
    try:
        local_filename = getImageFile(imgUrl)
    except (requests.RequestException, IOError, OSError) as error:
        print(u"下载文件失败:  %s %r" % (imgUrl, error))
        return None
    try:
        with open(local_filename, "rb") as f:
            return MySQLdb.Binary(f.read())
    finally:
        os.remove(local_filename)


def scannintArticle(cursor, type, url, article):
    """Store one article element of a listing page in qs_article.

    cursor  - open MySQLdb cursor
    type    - category label stored with the row (name shadows the builtin;
              kept for compatibility with existing callers)
    url     - URL of the listing page the article was found on
    article - lxml element of the article block
    """
    # Relative XPath (".//") searches inside this article only, so the
    # fragment does not have to be serialised and re-parsed.
    detailText, detailLink = _anchorTextAndHref(article.xpath(".//div[@class='detail']"))
    userText, userLink = _anchorTextAndHref(article.xpath(".//div[@class='author']"))

    contents = article.xpath(".//div[@class='content']")
    # Exactly one value for the single content column; the original appended
    # two here when the block was missing, which broke the INSERT.
    content = contents[0].text if contents else ""

    imgUrl = ""
    imgContent = None
    thumbs = article.xpath(".//div[@class='thumb']")
    if thumbs:
        images = thumbs[0].xpath(".//img")
        if images and images[0].get("src"):
            imgUrl = images[0].get("src")
            imgContent = _readImageBytes(imgUrl)

    # Order follows the column list of INSERT_ARTICLE_SQL: each *_link
    # column gets the href and its neighbour the anchor text (the original
    # had these two swapped).
    values = [type, url, detailLink, detailText, userLink, userText,
              content, imgUrl, imgContent, "Success"]
    print(values)
    cursor.execute(INSERT_ARTICLE_SQL, values)


def scanning4typeArticle(cursor, type, url):
    """Scan one listing page; on any failure store a single "Error" row."""
    print(url)
    try:
        request = requests.get(url, timeout=REQUEST_TIMEOUT)
        dom = soupparser.fromstring(request.content)
        # Every article block on this page.
        articleList = dom.xpath("//div[@class='block untagged mb15 bs2']")
        for article in articleList:
            scannintArticle(cursor, type, url, article)
    except Exception as error:
        # Exception rather than a bare except, so Ctrl-C still stops the
        # crawl; the cause is printed instead of being swallowed.
        print("Error: %r" % (error,))
        values = [type, url, '', '', '', '', '', '', None, "Error"]
        cursor.execute(INSERT_ARTICLE_SQL, values)


def scanning4type(cursor, type, url, subfix):
    """Read the page count of one category and scan its pages one by one.

    subfix is appended to url to form the paging URL; every occurrence of
    currentPageId in the result is replaced by the page number.
    """
    print(u"开始扫描文章")
    request = requests.get(url, timeout=REQUEST_TIMEOUT)
    dom = soupparser.fromstring(request.content)
    # The last-but-one anchor of the paging bar holds the highest page number.
    pagebars = dom.xpath("//div[@class='pagebar']/a")
    if len(pagebars) > 2:
        maxPageSize = int(pagebars[-2].text) + 1
        pageUrlTemplate = "".join([url, subfix])
        for i in range(1, maxPageSize):
            scanning4typeArticle(cursor, type, pageUrlTemplate.replace(currentPageId, str(i)))
    print(u"扫描文章完成")


def main():
    """Empty qs_article and refill it from the configured categories."""
    conn = MySQLdb.connect(host="192.168.0.196", user="root", passwd="", db="python", charset="utf8")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("delete from qs_article")
            # Categories to scan: the first entries of the navigation bar.
            scanning4type(cursor, "8HR", "http://www.qiushibaike.com/8hr", "".join(["/page/", "currentPageId", "?s=4602020"]))
            # Further categories, disabled by the original author:
            #scanning4type(cursor, "HOT", "http://www.qiushibaike.com/hot", "".join(["/page/", "currentPageId", "?s=4602057"]))
            #scanning4type(cursor, "IMGRANK", "http://www.qiushibaike.com/imgrank", "".join(["/page/", "currentPageId", "?s=4602057"]))
            #scanning4type(cursor, "LATE", "http://www.qiushibaike.com/late", "".join(["/page/", "currentPageId", "?s=4602057"]))
            # MySQLdb does not autocommit; without this the rows are
            # discarded on transactional tables when the connection closes.
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()


# Run the crawl.
main()

遍历新浪一些博客的图片,加入了访问频率控制

#!/usr/bin/python
# encoding=gbk
# NOTE: the declaration above has to match the encoding this script is really
# saved in (the original author saved it as GBK).
#
# Downloads the pictures of Sina "qing" blog posts found through the tag
# search API, pausing regularly to keep the request rate down.
# Sample API URL:
# http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=3&type=2&blogid=67f899b332002zdw&ch=
import sys
import os
import requests
import MySQLdb
import lxml.html.soupparser as soupparser
import lxml.etree as etree
import json
import time

maxPage = 100  # highest index page to scan

# Request-rate control.
DEFAULT_OPEN_PAGE_FREQUENCY = 1   # pause (s) between picture pages (disabled below)
DEFAULT_OPEN_IMAGE_FREQUENCY = 3  # pause (s) taken after each batch of images
DEFAULT_IMAGE_COUNT = 0           # running count of <img> elements processed
DEFAULT_IMAGE_SIZE = 20           # batch size: pause after this many images

MAX_RETRIES = 5       # connection retries per request
REQUEST_TIMEOUT = 30  # seconds to wait for a server before giving up

# Directory that receives one sub-directory per post title.
IMAGE_ROOT_DIR = u"/mnt/E/新浪图集/"

# Retries are configured on a session adapter.  The original assigned
# requests.adapters.DEFAULT_RETRIES after import, but in requests releases
# whose HTTPAdapter takes a max_retries argument that default is bound when
# the class is defined, so the assignment changed nothing.
SESSION = requests.Session()
SESSION.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES))
SESSION.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES))


def _dirNameFor(title):
    """Turn a post title into a single, non-empty directory name."""
    name = (title or u"").strip().replace("/", "_").replace("\\", "_")
    # Empty titles used to put the files straight into IMAGE_ROOT_DIR and a
    # "/" in a title created nested directories.
    return name or u"无题"


def saveImage(title, imageSrc):
    """Download imageSrc into IMAGE_ROOT_DIR/<title>/<file name from URL>."""
    title = _dirNameFor(title)
    print(u"标题:%s     图片:%s" % (title, imageSrc))
    dirStr = IMAGE_ROOT_DIR + title + "/"
    if not os.path.exists(dirStr):
        os.makedirs(dirStr)
    fileName = imageSrc.split('/')[-1]
    request = SESSION.get(imageSrc, stream=True, timeout=REQUEST_TIMEOUT)
    with open(dirStr + fileName, "wb") as file:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                file.write(chunk)


def listPicPage(pageUrl):
    """Open one picture page and download every image on it."""
    global DEFAULT_IMAGE_COUNT
    request = SESSION.get(pageUrl, timeout=REQUEST_TIMEOUT)
    dom = soupparser.fromstring(request.content)
    # "//" searches the whole document, so the root can be queried directly.
    title = dom.xpath("//h3[@class='title']")
    titleStr = title[0].text if title else ""
    imageList = dom.xpath("//div[@class='imgArea']/img[@class='qImg']")
    print(u"遍历图片页面，  标题:%s,   地址: %s " % (titleStr, pageUrl))
    for image in imageList:
        # Two attributes may carry the address: prefer real_src, fall back
        # to src when it is missing or empty.
        imageSrc = image.get("real_src") or image.get("src")
        if imageSrc:
            saveImage(titleStr, imageSrc)
        # Rate control: pause after every DEFAULT_IMAGE_SIZE images.
        DEFAULT_IMAGE_COUNT = DEFAULT_IMAGE_COUNT + 1
        if DEFAULT_IMAGE_COUNT % DEFAULT_IMAGE_SIZE == 0:
            print(u"图片计数:%s, 休息 %s 秒钟后继续\n" % (DEFAULT_IMAGE_COUNT, DEFAULT_OPEN_IMAGE_FREQUENCY))
            time.sleep(DEFAULT_OPEN_IMAGE_FREQUENCY)


def listPicIndex():
    """Walk index pages 1..maxPage and visit every picture page they list."""
    lastPage = 0
    for i in range(1, maxPage + 1):
        lastPage = i
        url = "http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=" + str(
            i) + "&type=2&blogid=67f899b332002zdw&ch="
        request = SESSION.get(url, timeout=REQUEST_TIMEOUT)
        json_obj = json.loads(request.content)
        for item in json_obj["data"]["list"]:
            # Each entry is an HTML snippet; its <a class="pic"> points to
            # the page that really shows the pictures.
            dom = soupparser.fromstring(item)
            link = dom.xpath("//a[@class='pic']")
            if len(link) > 0:
                listPicPage(link[0].get("href"))
            print(u"---------------------------------------------完成一个图片链接, 页数: %s" % i)
            # Optional extra rate control between picture pages:
            # time.sleep(DEFAULT_OPEN_PAGE_FREQUENCY)
    print(u"---------------------------------------------完成页数 %s : %s" % (maxPage, lastPage))


def main():
    listPicIndex()


if __name__ == "__main__":
    main()

上面的例子改成多线程

#!/usr/bin/python
# encoding=gbk
# NOTE: the declaration above has to match the encoding this script is really
# saved in (the original author saved it as GBK).
#
# Multi-threaded variant of the Sina "qing" blog picture crawler: the index
# pages are split into segments of PAGE_SIZE pages, one thread per segment.
# Sample API URL:
# http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=3&type=2&blogid=67f899b332002zdw&ch=
import sys
import os
import errno
import requests
import MySQLdb
import lxml.html.soupparser as soupparser
import lxml.etree as etree
import json
import time
import threading

MAX_PAGE = 100  # highest index page to scan
MAX_ERROR = 10  # failures a thread tolerates (and retries) before it gives up
PAGE_SIZE = 5   # index pages per thread
DEFAULT_OPEN_PAGE_FREQUENCY = 2  # pause (s) after each index entry
DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY = 5  # pause (s) before retrying after an error

MAX_RETRIES = 5       # connection retries per request
REQUEST_TIMEOUT = 30  # seconds to wait for a server before giving up

# Directory that receives one sub-directory per post title.
IMAGE_ROOT_DIR = u"/mnt/E/新浪图集/"

_threadState = threading.local()


def _session():
    """Return this thread's requests session, creating it on first use.

    Retries are configured on the session adapter.  The original assigned
    requests.adapters.DEFAULT_RETRIES after import, but in requests releases
    whose HTTPAdapter takes a max_retries argument that default is bound when
    the class is defined, so the assignment changed nothing.  One session per
    thread avoids sharing connection state between threads.
    """
    session = getattr(_threadState, "session", None)
    if session is None:
        session = requests.Session()
        session.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES))
        session.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES))
        _threadState.session = session
    return session


def _dirNameFor(title):
    """Turn a post title into a single, non-empty directory name."""
    name = (title or u"").strip().replace("/", "_").replace("\\", "_")
    # Empty titles used to put the files straight into IMAGE_ROOT_DIR and a
    # "/" in a title created nested directories.
    return name or u"无题"


def _ensureDir(path):
    """Create path if needed; safe when another thread creates it first."""
    try:
        os.makedirs(path)
    except OSError as error:
        # An exists-check followed by makedirs races between threads; an
        # already existing directory is simply fine.
        if error.errno != errno.EEXIST:
            raise


def saveImage(thName, title, imageSrc, currentPath):
    """Download imageSrc into IMAGE_ROOT_DIR/<title>/<file name from URL>.

    thName and currentPath (the index page number) are used for logging only.
    """
    title = _dirNameFor(title)
    print(u"线程名称:%s,  页码:%s,   标题:%s     图片:%s" % (thName, currentPath, title, imageSrc))
    dirStr = IMAGE_ROOT_DIR + title + "/"
    _ensureDir(dirStr)
    fileName = imageSrc.split('/')[-1]
    request = _session().get(imageSrc, stream=True, timeout=REQUEST_TIMEOUT)
    with open(dirStr + fileName, "wb") as file:
        for chunk in request.iter_content(chunk_size=1024):
            if chunk:  # skip keep-alive chunks
                file.write(chunk)


def listPicPage(thName, pageUrl, currentPath):
    """Open one picture page and download every image on it."""
    request = _session().get(pageUrl, timeout=REQUEST_TIMEOUT)
    dom = soupparser.fromstring(request.content)
    # "//" searches the whole document, so the root can be queried directly.
    title = dom.xpath("//h3[@class='title']")
    titleStr = title[0].text if title else ""
    imageList = dom.xpath("//div[@class='imgArea']/img[@class='qImg']")
    for image in imageList:
        # Two attributes may carry the address: prefer real_src, fall back
        # to src when it is missing or empty.
        imageSrc = image.get("real_src") or image.get("src")
        if imageSrc:
            saveImage(thName, titleStr, imageSrc, currentPath)


def listPicIndex(thName, startPath, endPath):
    """Walk index pages startPath..endPath (inclusive) for one thread.

    A picture page that fails is retried after a pause.  Once the thread has
    used up MAX_ERROR failures it reports that and stops.  Always returns
    True, which the calling thread takes as "this segment is done".
    """
    error_count = 0  # failures of this thread so far
    for i in range(startPath, endPath + 1):
        url = "http://qing.blog.sina.com.cn/blog/api/tagresult.php?tag=%E7%BE%8E%E5%A5%B3&page=" + str(
            i) + "&type=2&blogid=67f899b332002zdw&ch="
        print(url)
        request = _session().get(url, timeout=REQUEST_TIMEOUT)
        json_obj = json.loads(request.content)
        for item in json_obj["data"]["list"]:
            # Each entry is an HTML snippet; its <a class="pic"> points to
            # the page that really shows the pictures.
            dom = soupparser.fromstring(item)
            link = dom.xpath("//a[@class='pic']")
            if len(link) > 0:
                href = link[0].get("href")
                while True:
                    try:
                        listPicPage(thName, href, i)
                        break
                    except Exception:
                        # The retry happens inside the loop, so a second
                        # failure is handled too; the original retried once
                        # outside any handler, which killed the thread.
                        if error_count >= MAX_ERROR:
                            print(u"出错超过预设次数，退出爬虫。")
                            return True
                        error_count = error_count + 1
                        print(u"---------------------------------------------休眠%s秒钟后重试, 页数:%s" % (
                            DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY, i))
                        time.sleep(DEFAULT_OPEN_PAGE_ERROR_WAIT_FREQUENCY)
            # Rate control.
            time.sleep(DEFAULT_OPEN_PAGE_FREQUENCY)
    print(u"---------------------------------------------完成页数 %s : %s" % (MAX_PAGE, endPath))
    return True


class MyThread(threading.Thread):
    """Worker thread that crawls one segment of index pages."""

    def __init__(self, name, startPath, endPage):
        threading.Thread.__init__(self)
        self.name = name
        self.is_stop = False
        self.startPage = startPath
        self.endPage = endPage

    def run(self):
        while not self.is_stop:
            # listPicIndex returns True when the segment is finished, which
            # ends the loop.
            self.is_stop = listPicIndex(self.name, self.startPage, self.endPage)

    def stop(self):
        # Sets the stop flag; it is only checked between passes in run(), so
        # a pass that is already running is not interrupted.
        self.is_stop = True


if __name__ == "__main__":
    # One thread per segment.  Segments are start..start+PAGE_SIZE-1 so that
    # neighbouring threads do not both crawl the boundary page, and the range
    # runs to MAX_PAGE + 1 so the last page is never left out.
    for count, startPath in enumerate(range(1, MAX_PAGE + 1, PAGE_SIZE), 1):
        endPath = min(startPath + PAGE_SIZE - 1, MAX_PAGE)
        print("%s , %s" % (startPath, endPath))
        t = MyThread("Thread " + str(count), startPath, endPath)
        t.start()

热点排行

XML SOAP

python + request + lxml的几个例证