基于Spindle的增强HTTP Spider
基于Spindle的增强HTTP Spider zz:http://www.iteye.com/news/1731 构建于lucene之上的可用的Java开源Spider……
基于Spindle的增强HTTP Spider
zz:http://www.iteye.com/news/1731
构建于lucene之上的可用的Java开源Spider少之又少,spindle长期没有更新且功能不够完善,故而自己参考其源
代码重新编写了一个可扩展的WebCrawler,本着开源共享,共同进步的想法发布于此,期冀得到大家的批评指正,
有任何意见及建议均可Email联系我 (kaninebruno@hotmail.com)
以下代码基于lucene-2.3.1,htmlparser-1.6,je-analysis-1.5.3,以及自己修改过的cpdetector-1.0.5;
下载地址分别为
htmlparser:http://sourceforge.net/project/showfiles.php?group_id=24399
je-analysis:http://www.jesoft.cn/je-analysis-1.5.3.jar
lucene就不用说了,cpdetector-1.0.5见附件.
spindle的官方站点:http://www.bitmechanic.com/projects/spindle/
Java代码
package?com.huizhi.kanine.util;????import?java.io.BufferedReader;??import?java.io.File;??import?java.io.FileNotFoundException;??import?java.io.IOException;??import?java.io.InputStream;??import?java.io.InputStreamReader;??import?java.io.UnsupportedEncodingException;??import?java.net.HttpURLConnection;??import?java.net.MalformedURLException;??import?java.net.SocketException;??import?java.net.SocketTimeoutException;??import?java.net.URL;??import?java.net.UnknownHostException;??import?java.nio.charset.Charset;??import?java.util.ArrayList;??import?java.util.Date;??import?java.util.HashSet;????import?jeasy.analysis.MMAnalyzer;????import?org.apache.lucene.analysis.Analyzer;??import?org.apache.lucene.document.DateTools;??import?org.apache.lucene.document.Document;??import?org.apache.lucene.document.Field;??import?org.apache.lucene.index.CorruptIndexException;??import?org.apache.lucene.index.IndexReader;??import?org.apache.lucene.index.IndexWriter;??import?org.apache.lucene.index.Term;??import?org.apache.lucene.search.Hits;??import?org.apache.lucene.search.IndexSearcher;??import?org.apache.lucene.search.TermQuery;??import?org.apache.lucene.store.Directory;??import?org.apache.lucene.store.LockObtainFailedException;??import?org.apache.lucene.store.RAMDirectory;??import?org.htmlparser.Parser;??import?org.htmlparser.PrototypicalNodeFactory;??import?org.htmlparser.filters.AndFilter;??import?org.htmlparser.filters.HasAttributeFilter;??import?org.htmlparser.filters.NodeClassFilter;??import?org.htmlparser.tags.BaseHrefTag;??import?org.htmlparser.tags.FrameTag;??import?org.htmlparser.tags.LinkTag;??import?org.htmlparser.tags.MetaTag;??import?org.htmlparser.util.EncodingChangeException;??import?org.htmlparser.util.NodeIterator;??import?org.htmlparser.util.NodeList;??import?org.htmlparser.util.ParserException;??import?org.htmlparser.visitors.HtmlPage;????import?cpdetector.io.ASCIIDetector;??import?cpdetector.io.CodepageDetectorProxy;??import?cpdetector.io.JChardetFacade;??import?cpdete
ctor.io.ParsingDetector;??import?cpdetector.io.UnicodeDetector;??????/**??*?@author?张波???*?E-mail:kaninebruno@hotmail.com???*?Created?On?:?2008-03-30??*/??public?class?SiteCapturer?implements?Runnable{????????????/*?基准(初始)URL?*/??????protected?URL?mSource;????????/*?索引文件的存放位置?*/??????protected?String?mTarget;????????/**??????*?待解析的URL地址集合,所有新检测到的链接均存放于此;??????*?解析时按照先入先出(First-In?First-Out)法则线性取出??????*/??????protected?ArrayList?mPages;????????/*?已解析的URL地址集合,避免链接的重复抓取?*/??????protected?HashSet?mFinished;????????protected?Parser?mParser;????????????/*?StringBuffer的缓冲区大小?*/??????protected??final?int?TRANSFER_SIZE?=?4096;????????????/*?当前平台的行分隔符?*/??????protected?static?String?lineSep?=?System.getProperty("line.separator");????????????/*?程序运行线程数,默认2个线程?*/??????protected?int?mthreads;????????????protected?ArrayList?threadList;????????????/*?存储于磁盘的IndexWriter?*/??????protected?IndexWriter?FSDWriter;????????????/*?存储于内存的IndexWriter?*/??????protected?IndexWriter?RAMWriter;????????protected?IndexSearcher?indexSearcher;????????protected?RAMDirectory?ramDirectory;????????????/*?筛选页面内容的分词器?*/??????protected?Analyzer?luceneAnalyzer;????????/*?解析页面时的字符编码?*/??????protected?String?charset;????????????/*?统计已抓取的页面数量?*/??????protected?int?count?=?0;????????????/*?基准端口?*/??????protected?int?mPort;????????????/*?基准主机?*/??????protected?String?mHost;????????????/*?检测索引中是否存在当前URL信息,避免重复抓取?*/??????protected?boolean?mCheck;????????/*?索引操作的写入线程锁?*/??????public?static?final?Object?indexLock?=?new?Object();????????????public?SiteCapturer()?{??????????mSource?=?null;??????????mTarget?=?null;??????????mthreads?=?2;??????????mCheck?=?false;??????????mPages?=?new?ArrayList();??????????mFinished?=?new?HashSet();??????????mParser?=?new?Parser();??????????PrototypicalNodeFactory?factory?=?new?PrototypicalNodeFactory();??????????factory.registerTag(new?LocalLinkTag());??????????factory.registerTag(new?LocalFrameTag());??????????factory.registerTag(new?LocalBaseHrefTag());??????????mParser.setNodeFacto
ry(factory);??????}????????public?String?getSource()?{??????????return?mSource.toString();??????}????????public?void?setSource(String?source)?{??????????if?(source.endsWith("/"))??????????????source?=?source.substring(0,?source.length()?-?1);??????????try?{??????????????mSource?=?new?URL(source);??????????}?catch?(MalformedURLException?e)?{??????????????System.err.println("Invalid?URL?:?"?+?getSource());??????????}??????}????????public?String?getTarget()?{??????????return?(mTarget);??????}????????public?void?setTarget(String?target)?{??????????mTarget?=?target;??????}????????????public?int?getThreads()?{??????????return?(mthreads);??????}????????public?void?setThreads(int?threads)?{??????????mthreads?=?threads;??????}????????????public?boolean?isMCheck()?{??????????return?mCheck;??????}????????public?void?setMCheck(boolean?check)?{??????????mCheck?=?check;??????}????????/**??????*?程序入口,在此初始化mPages、IndexWriter??????*?通过协调各线程间的活动完成website的抓取工作??????*?任务完成后将所有的索引片段合并为一个以优化检索??????*/??????public?void?capture(){????????????mPages.clear();??????????mPages.add(getSource());????????????????????int?responseCode?=?0;??????????String?contentType?=?"";????????????????????try?{??????????????HttpURLConnection?uc?=?(HttpURLConnection)?mSource.openConnection();??????????????responseCode?=?uc.getResponseCode();??????????????contentType?=?uc.getContentType();??????????}?catch?(MalformedURLException?mue)?{??????????????System.err.println("Invalid?URL?:?"?+?getSource());??????????}?catch?(IOException?ie)?{??????????????if?(ie?instanceof?UnknownHostException)?{??????????????????System.err.println("UnknowHost?:?"?+?getSource());??????????????}?else?if?(ie?instanceof?SocketException)?{??????????????????System.err.println("Socket?Error?:?"?+?ie.getMessage()?+?"?"??????????????????????????+?getSource());??????????????}?else??????????????????ie.printStackTrace();??????????}????????????????????if?(responseCode?==?HttpURLConnection.HTTP_OK??????????????????&&?contentType.startsWith("text/html"
))?{????????????????????????????mPort?=?mSource.getPort();??????????????mHost?=?mSource.getHost();??????????????charset?=?autoDetectCharset(mSource);????????????????/*?存放索引文件的位置?*/??????????????File?indexDir?=?new?File(mTarget);??????????????/*?标记是否重新建立索引,true为重新建立索引?*/??????????????boolean?flag?=?true;??????????????if?(!indexDir.exists())?{??????????????????/*?如果文件夹不存在则创建?*/??????????????????indexDir.mkdir();??????????????}?else?if?(IndexReader.indexExists(mTarget))?{??????????????????/*?如果已存在索引,则追加索引?*/??????????????????flag?=?false;??????????????????File?lockfile?=?new?File(mTarget?+?File.separator?+?"write.lock");??????????????????if?(lockfile.exists())??????????????????????lockfile.delete();??????????????}??????????????luceneAnalyzer?=?new?MMAnalyzer();??????????????ramDirectory?=?new?RAMDirectory();????????????????try?{??????????????????FSDWriter?=?new?IndexWriter(indexDir,?luceneAnalyzer,?flag);??????????????????RAMWriter?=?new?IndexWriter(ramDirectory,?luceneAnalyzer,?true);????????????????????????????????????while?(mCheck)?{??????????????????????IndexReader?indexReader?=?IndexReader.open(mTarget);??????????????????????indexSearcher?=?new?IndexSearcher(indexReader);??????????????????}????????????????????????????????????long?start?=?System.currentTimeMillis();??????????????????threadList?=?new?ArrayList();????????????????????for?(int?i?=?0;?i?<?mthreads;?i++)?{??????????????????????Thread?t?=?new?Thread(this,?"K-9?Spider?Thread?#"?+?(i?+?1));??????????????????????t.start();??????????????????????threadList.add(t);??????????????????}??????????????????while?(threadList.size()?>?0)?{??????????????????????Thread?child?=?(Thread)?threadList.remove(0);??????????????????????try?{??????????????????????????child.join();??????????????????????}?catch?(InterruptedException?e)?{??????????????????????????e.printStackTrace();??????????????????????}??????????????????}??????????????????long?elapsed?=?System.currentTimeMillis()?-?start;????????????????????RAMWriter.close();????
??????????????FSDWriter.addIndexes(new?Directory[]?{?ramDirectory?});??????????????????FSDWriter.optimize();??????????????????FSDWriter.close();????????????????????System.out.println("Finished?in?"?+?(elapsed?/?1000)??????????????????????????+?"?seconds");??????????????????System.out.println("The?Count?of?the?Links?Captured?is?"??????????????????????????+?count);??????????????}?catch?(CorruptIndexException?cie)?{??????????????????cie.printStackTrace();??????????????}?catch?(LockObtainFailedException?lofe)?{??????????????????lofe.printStackTrace();??????????????}?catch?(IOException?ie)?{??????????????????ie.printStackTrace();??????????????}??????????}??????????}????????????public?void?run()?{??????????String?url;??????????while?((url?=?dequeueURL())?!=?null)?{??????????????if?(isToBeCaptured(url))??????????????????process(url);??????????}??????????mthreads--;??????}????????/**??????*?判断提取到的链接是否符合解析条件;标准为Port及Host与基准URL相同且类型为text/html或text/plain??????*/??????public?boolean?isToBeCaptured?(String?url){??????????boolean?flag?=?false;????????????????????HttpURLConnection?uc?=?null;??????????int?responseCode?=?0;??????????String?contentType?=?"";??????????String?host?=?"";??????????int?port?=?0;????????????????????try?{??????????????URL?source?=?new?URL(url);??????????????String?protocol?=?source.getProtocol();??????????????if?(protocol?!=?null?&&?protocol.equals("http"))?{??????????????????host?=?source.getHost();??????????????????port?=?source.getPort();??????????????????uc?=?(HttpURLConnection)?source.openConnection();??????????????????uc.setConnectTimeout(8000);??????????????????responseCode?=?uc.getResponseCode();??????????????????contentType?=?uc.getContentType();??????????????}??????????}?catch?(MalformedURLException?mue)?{??????????????System.err.println("Invalid?URL?:?"?+?url);??????????}?catch?(IOException?ie)?{??????????????if?(ie?instanceof?UnknownHostException)?{??????????????????System.err.println("UnknowHost?:?"?+?url);??????????????}?else?if?(ie?instanceof
?SocketException)?{??????????????????System.err.println("Socket?Error?:?"?+?ie.getMessage()?+?"?"??????????????????????????+?url);??????????????}?else?if?(ie?instanceof?SocketTimeoutException)?{??????????????????System.err.println("Socket?Connection?Time?Out?:?"?+?url);??????????????}?else?if?(ie?instanceof?FileNotFoundException)?{??????????????????System.err.println("broken?link?"??????????????????????????+?((FileNotFoundException)?ie.getCause()).getMessage()??????????????????????????+?"?ignored");??????????????}?else??????????????????ie.printStackTrace();??????????}????????????????????if?(port?==?mPort??????????????????&&?responseCode?==?HttpURLConnection.HTTP_OK??????????????????&&?host.equals(mHost)??????????????????&&?(contentType.startsWith("text/html")?||?contentType??????????????????????????.startsWith("text/plain")))??????????????flag?=?true;??????????return?flag;??????}????????/*?从URL队列mPages里取出单个的URL?*/??????public?synchronized?String?dequeueURL()?{??????????while?(true)?{??????????????if?(mPages.size()?>?0)?{??????????????????String?url?=?(String)?mPages.remove(0);??????????????????mFinished.add(url);????????????????????????????????????if?(isToBeCaptured(url))?{??????????????????????int?bookmark;??????????????????????NodeList?list;??????????????????????NodeList?robots;??????????????????????MetaTag?robot;??????????????????????String?content;??????????????????????try?{??????????????????????????bookmark?=?mPages.size();??????????????????????????/*?获取页面所有节点?*/??????????????????????????mParser.setURL(url);??????????????????????????try?{??????????????????????????????list?=?new?NodeList();??????????????????????????????for?(NodeIterator?e?=?mParser.elements();?e??????????????????????????????????????.hasMoreNodes();)??????????????????????????????????list.add(e.nextNode());??????????????????????????}?catch?(EncodingChangeException?ece)?{??????????????????????????????/*?解码出错的异常处理?*/??????????????????????????????mParser.reset();??????????????????????????????list?=?n
ew?NodeList();??????????????????????????????for?(NodeIterator?e?=?mParser.elements();?e??????????????????????????????????????.hasMoreNodes();)??????????????????????????????????list.add(e.nextNode());??????????????????????????}??????????????????????????/**??????????????????????????*?依据?http://www.robotstxt.org/wc/meta-user.html?处理??????????????????????????*?Robots??tag??????????????????????????*/??????????????????????????robots?=?list??????????????????????????????????.extractAllNodesThatMatch(??????????????????????????????????????????new?AndFilter(new?NodeClassFilter(??????????????????????????????????????????????????MetaTag.class),??????????????????????????????????????????????????new?HasAttributeFilter("name",??????????????????????????????????????????????????????????"robots")),?true);??????????????????????????if?(0?!=?robots.size())?{??????????????????????????????robot?=?(MetaTag)?robots.elementAt(0);??????????????????????????????content?=?robot.getAttribute("content")??????????????????????????????????????.toLowerCase();??????????????????????????????if?((-1?!=?content.indexOf("none"))??????????????????????????????????????||?(-1?!=?content.indexOf("nofollow")))??????????????????????????????????for?(int?i?=?bookmark;?i?<?mPages.size();?i++)??????????????????????????????????????mPages.remove(i);??????????????????????????}??????????????????????}?catch?(ParserException?pe)?{??????????????????????????pe.printStackTrace();??????????????????????}??????????????????}??????????????????return?url;??????????????}?else?{??????????????????mthreads--;??????????????????if?(mthreads?>?0)?{??????????????????????try?{??????????????????????????wait();??????????????????????????mthreads++;??????????????????????}?catch?(InterruptedException?ie)?{??????????????????????????ie.printStackTrace();??????????????????????}??????????????????}?else?{??????????????????????notifyAll();??????????????????????return?null;??????????????????}??????????????}??????????}??????}????????/**??????*?处理单独的URL地址,解析页
面并加入到lucene索引中;通过自动探测页面编码保证抓取工作的顺利执行??????*/??????protected?void?process(String?url)?{????????????????????String?result[];??????????String?content?=?null;??????????String?title?=?null;????????????/*?此项操作较耗性能,故默认不予检测?*/??????????if?(mCheck)?{??????????????try?{??????????????????TermQuery?query?=?new?TermQuery(new?Term("url",?url));??????????????????Hits?hits?=?indexSearcher.search(query);??????????????????if?(hits.length()?>?0)?{??????????????????????System.out.println("The?URL?:?"?+?url??????????????????????????????+?"?has?already?been?captured");??????????????????}?else?{??????????????????????result?=?parseHtml(url,?charset);??????????????????????content?=?result[0];??????????????????????title?=?result[1];??????????????????}??????????????}?catch?(IOException?ie)?{??????????????????ie.printStackTrace();??????????????}??????????}?else?{??????????????result?=?parseHtml(url,?charset);??????????????content?=?result[0];??????????????title?=?result[1];??????????}????????????????????if?(content?!=?null?&&?content.trim().length()?>?0)?{????????????????Document?document?=?new?Document();??????????????document.add(new?Field("content",?content,?Field.Store.YES,??????????????????????Field.Index.TOKENIZED,??????????????????????Field.TermVector.WITH_POSITIONS_OFFSETS));??????????????document.add(new?Field("url",?url,?Field.Store.YES,??????????????????????Field.Index.UN_TOKENIZED));??????????????document.add(new?Field("title",?title,?Field.Store.YES,??????????????????????Field.Index.TOKENIZED,??????????????????????Field.TermVector.WITH_POSITIONS_OFFSETS));??????????????document.add(new?Field("date",?DateTools.timeToString(new?Date()??????????????????????.getTime(),?DateTools.Resolution.DAY),?Field.Store.YES,??????????????????????Field.Index.UN_TOKENIZED));????????????????????????????synchronized?(indexLock)?{??????????????????try?{??????????????????????RAMWriter.addDocument(document);??????????????????????/**??????????????????????*?当存放索引的内存使用大于指定值时将其写入硬盘;采用此方法的目的是?????????????????
?????*?通过内存缓冲避免频繁的IO操作,提高索引创建性能;??????????????????????*/??????????????????????if?(RAMWriter.ramSizeInBytes()?>?512?*?1024)?{??????????????????????????RAMWriter.close();??????????????????????????FSDWriter.addIndexes(new?Directory[]?{?ramDirectory?});??????????????????????????RAMWriter?=?new?IndexWriter(ramDirectory,??????????????????????????????????luceneAnalyzer,?true);??????????????????????}??????????????????????count++;??????????????????????System.out.println(Thread.currentThread().getName()??????????????????????????????+?":?Finished?Indexing?URL:?"?+?url);??????????????????}?catch?(CorruptIndexException?cie)?{??????????????????????cie.printStackTrace();??????????????????}?catch?(IOException?ie)?{??????????????????????ie.printStackTrace();??????????????????}??????????????}??????????}??????}????????/**??????*?Link?tag?that?rewrites?the?HREF.??????*?The?HREF?is?changed?to?a?local?target?if?it?matches?the?source.??????*/??????class?LocalLinkTag?extends?LinkTag?{??????????public?void?doSemanticAction()?{????????????????String?link?=?getLink();??????????????if?(link.endsWith("/"))??????????????????link?=?link.substring(0,?link.length()?-?1);??????????????int?pos?=?link.indexOf("#");??????????????if?(pos?!=?-1)??????????????????link?=?link.substring(0,?pos);????????????????/*?将链接加入到处理队列中?*/??????????????if?(!(mFinished.contains(link)?||?mPages.contains(link)))??????????????????mPages.add(link);????????????????setLink(link);??????????}??????}????????/**??????*?Frame?tag?that?rewrites?the?SRC?URLs.?The?SRC?URLs?are?mapped?to?local??????*?targets?if?they?match?the?source.??????*/??????class?LocalFrameTag?extends?FrameTag?{??????????public?void?doSemanticAction()?{????????????????String?link?=?getFrameLocation();??????????????if?(link.endsWith("/"))??????????????????link?=?link.substring(0,?link.length()?-?1);??????????????int?pos?=?link.indexOf("#");??????????????if?(pos?!=?-1)??????????????????link?=?link.substring(0,?pos);????????????????/*?将链接加入到处理队列中?*/??????????????if
?(!(mFinished.contains(link)?||?mPages.contains(link)))??????????????????mPages.add(link);????????????????setFrameLocation(link);??????????}??????}????????/**??????*?Base?tag?that?doesn't?show.?The?toHtml()?method?is?overridden?to?return??????*?an?empty?string,?effectively?shutting?off?the?base?reference.??????*/??????class?LocalBaseHrefTag?extends?BaseHrefTag?{????????????????????public?String?toHtml()?{??????????????return?("");??????????}??????}????????????/*?自动探测页面编码,避免中文乱码的出现?*/??????protected?String?autoDetectCharset(URL?url)?{????????????????????CodepageDetectorProxy?detector?=?CodepageDetectorProxy.getInstance();??????????/**??????????*?ParsingDetector可用于检查HTML、XML等文件或字符流的编码??????????*?构造方法中的参数用于指示是否显示探测过程的详细信息??????????*?为false则不显示??????????*/???????????detector.add(new?ParsingDetector(false));??????????detector.add(JChardetFacade.getInstance());??????????detector.add(ASCIIDetector.getInstance());??????????detector.add(UnicodeDetector.getInstance());????????????????????Charset?charset?=?null;??????????try?{??????????????charset?=?detector.detectCodepage(url);??????????}?catch?(MalformedURLException?mue)?{??????????????mue.printStackTrace();??????????}?catch?(IOException?ie)?{??????????????ie.printStackTrace();??????????}??????????if?(charset?==?null)??????????????charset?=?Charset.defaultCharset();??????????return?charset.name();??????}????????/*?按照指定编码解析标准的html页面,为建立索引做准备*/??????protected?String[]?parseHtml(String?url,?String?charset)?{????????????String?result[]?=?null;??????????String?content?=?null;????????????????????tr