I'm writing a page extractor (parser) and have run into a problem!!!
The long block of code below is copied from the book 《开发自己的搜索引擎》 (Developing Your Own Search Engine). I'm still a beginner, and I have just one question (written as a comment inside the code).
package com.luceneheritrixbook.extractor.pconline.mobile;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;

import org.archive.crawler.datamodel.CrawlURI;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;

// MY QUESTION: what are these two packages below? Are they part of the
// htmlparser library, or something you are supposed to write yourself?
// Either way, these two lines are flagged as errors:
// 1) com.luceneheritrixbook has no util subpackage;
// 2) com.luceneheritrixbook.extractor has no Extractor class.
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;

public class ExtractPconlineMobile extends Extractor {

    public void extract() {
        BufferedWriter bw = null;

        // Matches <td class="td1"> and <td class="td2"> cells (spec name/value pairs).
        NodeFilter attributes_filter = new AndFilter(
                new TagNameFilter("td"),
                new OrFilter(
                        new HasAttributeFilter("class", "td1"),   // was "tdl" (lowercase L), presumably a typo
                        new HasAttributeFilter("class", "td2")));
        // Matches the <h1> page title.
        NodeFilter title_filter = new TagNameFilter("h1");
        // Matches the big product photo: <img class="bigimg">.
        NodeFilter image_filter = new AndFilter(
                new TagNameFilter("img"),
                new HasAttributeFilter("class", "bigimg"));

        // Step 1: extract the title and open an output file named after it.
        try {
            NodeList title_nodes = this.getParser().parse(title_filter);
            for (int i = 0; i < title_nodes.size(); i++) {
                Node node_title = title_nodes.elementAt(i);
                String[] names = node_title.toPlainTextString().split(" "); // was split(""), presumably a lost space
                StringBuffer title = new StringBuffer();
                for (int k = 0; k < names.length; k++) {
                    title.append(names[k]).append("-");
                }
                title.append((new Date()).getTime());

                String path = this.getOutputPath();   // was getOutputParth(), presumably a typo
                bw = new BufferedWriter(new FileWriter(new File(path + title + ".txt")));

                // Rebuild the original URL from the Heritrix mirror path: take
                // everything after ".../mirror" and turn backslashes into slashes.
                int startPos = getInputFilePath().indexOf("mirror") + 6;   // was getInuputFilePath()
                String url_seg = getInputFilePath().substring(startPos);
                url_seg = url_seg.replaceAll("\\\\", "/");
                String url = "http:/" + url_seg;   // url_seg already starts with "/"
                System.out.println(url);
                bw.write(url + NEWLINE);
                for (int k = 0; k < names.length; k++) {
                    bw.write(names[k] + NEWLINE);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        this.getParser().reset();

        // Step 2: extract the spec table; "td1" cells are labels, "td2" cells are values.
        try {
            NodeList attributes_nodes = this.getParser().parse(attributes_filter);
            for (int i = 0; i < attributes_nodes.size(); i++) {
                TableColumn node = (TableColumn) attributes_nodes.elementAt(i);
                String name = node.getAttribute("class");
                String result = node.toPlainTextString(); // was getAttribute("class") again, which would only ever write "td1"/"td2"
                if (name.equals("td1")) {
                    bw.write(StringUtils.trim(result) + ":");
                } else if (name.equals("td2")) {
                    bw.write(StringUtils.trim(result));
                    bw.newLine();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        this.getParser().reset();

        // Step 3: extract the big product image and copy it under a hashed file name.
        try {
            NodeList image_nodes = this.getParser().parse(image_filter);
            for (int i = 0; i < image_nodes.size(); i++) {
                ImageTag node = (ImageTag) image_nodes.elementAt(i);
                String image_url = node.getAttribute("src");
                String fileType = image_url.substring(image_url.lastIndexOf(".") + 1);
                String new_image_file = StringUtils.encodePassword(image_url, HASH_ALGORITHM)
                        + "." + fileType;   // was new_iamge_file
                image_url = StringUtils.replace(image_url, "+", " ");
                copyImage(image_url, new_image_file);
                bw.write(image_url + NEWLINE);
                bw.write(SEPARATOR + NEWLINE);
                bw.write(new_image_file + NEWLINE);
                System.out.println(image_url);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        try {
            if (bw != null) {
                bw.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
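My own guess, from the package names, is that these two imports are NOT from htmlparser (all of htmlparser's classes live under org.htmlparser); com.luceneheritrixbook looks like the book's own companion code, which I probably need to add to the project separately. Judging only from how the extractor above calls them, the two missing classes would need to declare roughly the following. This is just a minimal sketch I reconstructed from the usage, not the book's real implementation; the field names, constant values, and the hex-MD5 body of encodePassword are my assumptions.

package com.luceneheritrixbook.extractor;

import org.htmlparser.Parser;

public abstract class Extractor {
    // Constants the subclass writes into its output; the exact values are assumptions.
    protected static final String NEWLINE = System.getProperty("line.separator");
    protected static final String SEPARATOR = "-----";
    protected static final String HASH_ALGORITHM = "MD5";

    private Parser parser;          // parser over the mirrored HTML file
    private String inputFilePath;   // path of the mirrored page on disk
    private String outputPath;      // directory the extracted .txt files go to
    // (constructor/setters omitted from this sketch)

    public Parser getParser() { return parser; }
    public String getInputFilePath() { return inputFilePath; }
    public String getOutputPath() { return outputPath; }

    // Copies a mirrored image file into the output directory under a new name.
    protected void copyImage(String imageUrl, String newFileName) {
        // left unimplemented in this sketch
    }

    public abstract void extract();
}

package com.luceneheritrixbook.util;

import java.security.MessageDigest;

public final class StringUtils {
    // Null-safe trim, as implied by StringUtils.trim(result) above.
    public static String trim(String s) { return s == null ? "" : s.trim(); }

    // Literal substring replacement, as implied by StringUtils.replace(url, "+", " ").
    public static String replace(String s, String target, String replacement) {
        return s.replace(target, replacement);
    }

    // Hashes a string with the given algorithm and returns it hex-encoded;
    // the body here is my assumption about what the book's helper does.
    public static String encodePassword(String text, String algorithm) {
        try {
            MessageDigest md = MessageDigest.getInstance(algorithm);
            byte[] digest = md.digest(text.getBytes("UTF-8"));
            StringBuilder hex = new StringBuilder();
            for (byte b : digest) hex.append(String.format("%02x", b));
            return hex.toString();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}

If something like this is right, then the compile errors just mean the book's com.luceneheritrixbook sources (from its companion CD/download) aren't on my build path yet. Can anyone confirm?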