用Heritrix 抓取网站的特定信息问题
我用 Heritrix 1.14 抓取网易笔记本电脑产品库中的笔记本电脑信息,包括每个产品的综述、参数以及一些图片。入口地址选的是笔记本电脑产品库:http://product.tech.163.com/digi/nb/
扩展Extractor代码如下:
package my.extractor;import java.io.BufferedReader;import java.io.IOException;import java.io.StringReader;import java.util.logging.Level;import java.util.logging.Logger;import org.apache.commons.httpclient.URIException;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.extractor.Extractor;import org.archive.crawler.extractor.Link;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.Type;import org.archive.io.ReplayCharSequence;import org.archive.util.HttpRecorder;import org.archive.util.TextUtils;public class Notebook163Extractor extends Extractor{ /********************************************* * 扩展Extractor类 , 实现对网易163笔记本的产品列表链接地址进行分析, * 得到的结果就是各个品牌笔记本的入口地址。 * ***********************************************/ private static final long serialVerisonUID=1L; protected boolean ignoreUnexpectedHTML=true; //此为品牌集中地页面 private static final String urlNotebook163="http://product.tech.163.com/digi/nb"; private static final String url163="http://product.tech.163.com/nb"; //日志 private static final Logger logger=Logger.getLogger(Notebook163Extractor.class.getName()); public Notebook163Extractor(String name){ this(name,"Notebook163 extractor. 
Extracts links from HTML documents"); } public Notebook163Extractor(String name,String description){ super(name,description); } protected void extract(CrawlURI curi){ //获取URL的字符串 String url=curi.toString(); // 如果当前正在处理的URL是品牌汇集页面 if(url.equals(urlNotebook163)||url.equals(url163)){ ReplayCharSequence cs=null; try{ HttpRecorder hr=curi.getHttpRecorder(); if(hr==null){ throw new IOException("The recorder is null here."); } //获取该页面的内容 cs=hr.getReplayCharSequence(); }catch(IOException e){ curi.addLocalizedError(this.getName(), e, "Failed get of replay char sequence"+curi.toString()+" "+e.getMessage()); logger.log(Level.SEVERE,"Failed get of replay char sequence in "+Thread.currentThread().getName(),e); } //如果页面内容无法获取,则返回 if(cs==null){ return; } String content =cs.toString(); try{ BufferedReader reader=new BufferedReader(new StringReader(content)); //字符串的流 String line=reader.readLine();// 一行行的读 while(line!=null){ //当遇到包含特定标识的行时,也就是遇到了产品的品牌页面链接地址 if(line.contains("<li><a href=\"/digi/nb/brand/") ||line.contains("<li><a href=\"/nb/parameter/")) { String fullUrl=null;// 截取出URI,并在在前面加上域名构成URL fullUrl="http://product.tech.163.com"+ line.substring(line.indexOf("href=")+5, line.lastIndexOf(".html")+5); //将链接加入到待处理列表中 addLinkFromString(curi,fullUrl,"",Link.NAVLINK_HOP); // 在控制台打印URL System.out.println(fullUrl); }else if(line.contains("<div class=\"bigPic\">")) { String fullUrl=null; // 截取出URI,并在在前面加上域名构成URL fullUrl=line.substring(line.indexOf("href=")+5, line.lastIndexOf(".jpg")+4); //将链接加入到待处理列表中 addLinkFromString(curi,fullUrl,"",Link.NAVLINK_HOP); // 在控制台打印 System.out.println(fullUrl); } line=reader.readLine(); } }catch(Exception e){ e.printStackTrace(); } } } /*此方法将当前解析出来的URL加入到待处理列表中 * */ private void addLinkFromString(CrawlURI curi,String uri,CharSequence context,char hopType){ try{ curi.createAndAddLinkRelativeToBase(uri, context.toString(), hopType); }catch(URIException e){ if(getController()!=null){ getController().logUriError(e, curi.getUURI(), uri); } else{ 
logger.info("Failed createAndAddLinkRelativeToBase"+curi+", "+uri +uri+", "+context+", "+hopType+": "+e); } } }}
package my.postprocessor;import java.util.logging.Logger;import org.archive.crawler.datamodel.CandidateURI;import org.archive.crawler.postprocessor.FrontierScheduler;public class FrontierSchedulerFor163Notebook extends FrontierScheduler{ public static Logger LOGGER=Logger.getLogger(FrontierSchedulerFor163Notebook.class.getName()); public FrontierSchedulerFor163Notebook(String name){ super(name); } protected void schedule(CandidateURI caUri){ // 首先取得要加入等待队列的URL的字符串 String url=caUri.toString(); try{ //如果URL是产品页 if(url.indexOf("product.tech.163.com/nb/product/")!=-1 || url.indexOf("product.tech.163.com/digi/nb/brand/")!=1 || url.indexOf("product.tech.163.com/nb/parameter/")!=1 || url.indexOf("img2.cache.netease.com/photo")!=1 //只需一张图片 || url.endsWith(".jpg") || url.indexOf("robots.txt")!=-1 || url.indexOf("dns:")!=-1){ // 最后检索URL是否带有“#”号 //去除论坛链接 if(url.indexOf("#")==-1){ getController().getFrontier().schedule(caUri); } }else{ return; } }catch(Exception e){ e.printStackTrace(); }finally{ } }}