htmlparser提取网页正文
htmlparser提取网页正文：import org.htmlparser.Parser; import org.htmlparser.beans.StringBean; …
htmlparser提取网页正文
mport?org.htmlparser.Parser;? ??import?org.htmlparser.beans.StringBean;? ??import?org.htmlparser.filters.NodeClassFilter;? ??import?org.htmlparser.parserapplications.StringExtractor;? ??import?org.htmlparser.tags.BodyTag;? ??import?org.htmlparser.util.NodeList;? ??import?org.htmlparser.util.ParserException;? ??? ??public?class?GetContent?{? ?????? ??????public?void?getContentUsingStringBean(String?url)?{? ??????????StringBean?sb?=?new?StringBean();? ??????????sb.setLinks(true);??????????? ??????????sb.setCollapse(true);?? ??????????sb.setReplaceNonBreakingSpaces(true);//?If?true?regular?space? ??????????sb.setURL("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html");? ??????????System.out.println("The?Content?is?:\n"?+?sb.getStrings());? ??? ??????}? ?????? ??????public?void?getContentUsingStringExtractor(String?url,?boolean?link)?{? ??????????StringExtractor?se?=?new?StringExtractor(url);? ??????????String?text?=?null;? ??????????try?{? ??????????????text?=?se.extractStrings(link);? ??????????????System.out.println("The?content?is?:\n"?+?text);? ??????????}?catch?(ParserException?e)?{? ??????????????e.printStackTrace();? ??????????}? ??????}? ??? ??????public?void?getContentUsingParser(String?url)?{? ??????????NodeList?nl;? ??????????try?{? ??????????????Parser?p?=?new?Parser(url);? ??????????????nl?=?p.parse(new?NodeClassFilter(BodyTag.class));? ??????????????BodyTag?bt?=?(BodyTag)?nl.elementAt(0);? ??????????????System.out.println(bt.toPlainTextString());? ??????????????}?catch?(ParserException?e)?{? ??????????????e.printStackTrace();? ??????????}? ??????}? ?????? ??????public?static?void?main(String[]?args)?{ ??????????GetContent?g?=?new?GetContent(); ??//??????g.getContentUsingStringBean(""); ??//??????g.getContentUsingParser("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html"); ??????????g.getContentUsingStringExtractor("http://www.sina.com.cn/",?false); ??????}??