HTMLParser Code Collection 3
Fetching the Weather Forecast via Baidu
http://htmlparser.com.cn/post/20090917323.html
import java.io.File;
import java.io.FileOutputStream;
import java.io.UnsupportedEncodingException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class GetWeather {

    // Fetch the weather forecast from a Baidu search result page.
    // The query string is URL-encoded; it stands for 天气 (weather), encode it yourself.
    // Note: the url parameter is unused in the original; the Baidu query is hard-coded.
    public static void getWeather(String url) throws ParserException {
        Parser parser = new Parser("http://www.baidu.com/s?wd=%CC%EC%C6%F8");
        NodeFilter filter = new HasAttributeFilter("class", "al_tr");
        NodeList nodelist = parser.extractAllNodesThatMatch(filter);
        for (String a : nodelist.elementAt(0).toPlainTextString().trim().split(" ")) {
            if (!"".equals(a))
                System.out.println(a);
        }
    }

    // Collect the URLs of the weather icons on the same page.
    public static void getWeatherImage() throws ParserException {
        Parser parser = new Parser("http://www.baidu.com/s?wd=%CC%EC%C6%F8");
        NodeFilter filter = new HasAttributeFilter("class", "al_tr");
        NodeList nodelist = parser.extractAllNodesThatMatch(filter);
        nodelist = nodelist.elementAt(0).getChildren();
        NodeFilter filter1 = new NodeClassFilter(ImageTag.class);
        nodelist = nodelist.extractAllNodesThatMatch(filter1, true);
        for (int i = 0; i < nodelist.size(); i++) {
            ImageTag image = (ImageTag) nodelist.elementAt(i);
            downLoadImg(image.getImageURL(), String.valueOf(i));
        }
    }

    // Download one weather icon to /home/weather/<name>.gif.
    public static void downLoadImg(String url, String name) {
        HttpClient hc = new HttpClient();
        GetMethod gm = new GetMethod(url);
        try {
            hc.executeMethod(gm);
            String path = "/home/weather/";
            File file = new File(path);
            if (!file.exists()) {
                file.mkdirs();
            }
            String imagepath = path + name + ".gif";
            file = new File(imagepath);
            if (!file.exists()) {
                file.createNewFile();
            }
            FileOutputStream out = new FileOutputStream(file);
            out.write(gm.getResponseBody());
            out.close();
        } catch (Exception e) {
            e.printStackTrace(); // the original swallowed this silently
        }
    }

    public static void main(String[] args) throws UnsupportedEncodingException, ParserException {
        getWeatherImage();
    }
}
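For reference, the hard-coded query string above is just the GB2312 URL-encoding of 天气 ("weather"). A minimal sketch of producing it yourself with java.net.URLEncoder:

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

public class EncodeQuery {
    public static void main(String[] args) throws UnsupportedEncodingException {
        // prints %CC%EC%C6%F8, the value used in the Baidu query above
        System.out.println(URLEncoder.encode("天气", "GB2312"));
    }
}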
// Using NekoHTML together with XPath.
// Requires org.cyberneko.html.parsers.DOMParser, org.apache.xpath.XPathAPI (Xalan),
// org.w3c.dom.*, org.xml.sax.InputSource and javax.xml.transform.TransformerException.
DOMParser parser = new DOMParser();
try {
    // set the default encoding of the page
    parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "gb2312");
    /* The Xerces HTML DOM implementation does not support namespaces
       and cannot represent XHTML documents with namespace information.
       Therefore, in order to use the default HTML DOM implementation with
       NekoHTML's DOMParser to parse XHTML documents, you must turn off
       namespace processing. */
    parser.setFeature("http://xml.org/sax/features/namespaces", false);
    String strURL = "http://product.dangdang.com/product.aspx?product_id=9317290";
    BufferedReader in = new BufferedReader(
            new InputStreamReader(new URL(strURL).openStream()));
    parser.parse(new InputSource(in));
    in.close();
} catch (Exception e) {
    e.printStackTrace();
}
Document doc = parser.getDocument();
// tags should be in upper case
String productsXpath = "/HTML/BODY/DIV[2]/DIV[4]/DIV[2]/DIV/DIV[3]/UL[@class]/LI[9]";
NodeList products;
try {
    products = XPathAPI.selectNodeList(doc, productsXpath);
    System.out.println("found: " + products.getLength());
    Node node = null;
    for (int i = 0; i < products.getLength(); i++) {
        node = products.item(i);
        System.out.println(i + ":\n" + node.getTextContent());
    }
} catch (TransformerException e) {
    e.printStackTrace();
}
// Extract <input> elements with class "forminput" from a VeryCD page.
// (The method signature is added here for completeness; the original was a bare fragment.)
public static void extractInputs() throws ParserException {
    ConnectionManager manager = Page.getConnectionManager();
    Parser parser = new Parser(manager.openConnection("http://www.verycd.com/topics/2760827/"));
    parser.setEncoding("GBK");
    // alternative filters from the original, kept for reference:
    // extract <img> links inside <a> tags:
    // NodeFilter filter = new AndFilter(new TagNameFilter("a"),
    //         new HasChildFilter(new TagNameFilter("img")));
    // NodeFilter filter = new TagNameFilter("title");
    // extract the <input> elements:
    NodeFilter filter = new AndFilter(new TagNameFilter("input"),
            new HasAttributeFilter("class", "forminput"));
    // NodeList nodeList = parser.parse(filter);
    NodeList nodeList = parser.extractAllNodesThatMatch(filter);
    NodeIterator it = nodeList.elements();
    while (it.hasMoreNodes()) {
        Node node = it.nextNode();
        System.out.println(node.toHtml());
    }
}
// Extract all links and image links from a page.
public static void extracLinks(String url) {
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("gb2312");
        // filter for <frame> tags, used to pull out the link in the src attribute
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter combines the <a>, <img> and <frame> filters (logical OR)
        OrFilter orFilter = new OrFilter(new NodeClassFilter(LinkTag.class),
                new NodeClassFilter(ImageTag.class));
        OrFilter linkFilter = new OrFilter(orFilter, frameFilter);
        // all tags that pass the filter
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) { // <a> tag
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink();  // URL
                String text = link.getLinkText(); // anchor text
                System.out.println(linkUrl + "**********" + text);
            } else if (tag instanceof ImageTag) { // <img> tag
                ImageTag image = (ImageTag) list.elementAt(i);
                System.out.print(image.getImageURL() + "********"); // image URL
                System.out.println(image.getText());                // image text
            } else { // <frame> tag
                // extract the link in the src attribute, e.g. <frame src="test.html"/>
                String frame = tag.getText();
                int start = frame.indexOf("src=");
                frame = frame.substring(start);
                int end = frame.indexOf(" ");
                if (end == -1)
                    end = frame.indexOf(">");
                frame = frame.substring(5, end - 1);
                System.out.println(frame);
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
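A minimal, hypothetical entry point for the method above; the target URL is a placeholder:

public static void main(String[] args) {
    extracLinks("http://www.baidu.com"); // placeholder URL, any page works
}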
public void test2() throws ParserException {
    ConnectionManager manager = Page.getConnectionManager();
    Parser parser = new Parser(manager.openConnection("http://www.verycd.com/sto/datum/computer/page1"));
    // extract <a> tags whose parent is an <h3>
    NodeFilter filter = new AndFilter(new TagNameFilter("a"),
            new HasParentFilter(new TagNameFilter("h3")));
    NodeList nodes = parser.parse(filter);
    NodeIterator it = nodes.elements();
    while (it.hasMoreNodes()) {
        Node node = it.nextNode();
        // inspect the node type in a debugger, then cast to the concrete type
        // to pull out the data you want
        if (node instanceof LinkTag) {
            LinkTag linkNode = (LinkTag) node;
            System.out.println("http://www.verycd.com" + linkNode.getAttribute("href"));
        }
    }
}
ConnectionManager manager = Page.getConnectionManager();
Parser parser = new Parser(manager.openConnection("http://huodong.sodao.com/39/info#"));
// StringFilter directly locates the text nodes containing the given string
NodeFilter filter = new StringFilter("减价幅度");
NodeList nodes = parser.parse(filter);
NodeIterator it = nodes.elements();
while (it.hasMoreNodes()) {
    Node node = it.nextNode();
    // inspect the node type in a debugger, then cast to the concrete type
    System.out.println(node.getParent().toHtml());
    // then pull out the wanted text with indexOf (see the sketch below)
}
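A minimal sketch of the indexOf extraction the last comment hints at; the substring logic is an assumption about the page layout:

// inside the while loop above:
String html = node.getParent().toHtml();
int pos = html.indexOf("减价幅度");
if (pos != -1) {
    // hypothetical: print whatever follows the label "减价幅度"
    System.out.println(html.substring(pos + "减价幅度".length()).trim());
}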
/**
 * Several ways to extract images with htmlparser.
 */
public class Test {

    public static void test1(String url) throws ParserException {
        Parser parser = new Parser();
        parser.setURL(url);
        parser.setEncoding("GBK");
        // method 1: anonymous NodeFilter
        NodeFilter imageFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return (node instanceof ImageTag);
            }
        };
        NodeList images = parser.extractAllNodesThatMatch(imageFilter);
        // method 2:
        // NodeFilter imageFilter = new TagNameFilter("img");
        // method 3:
        // NodeFilter imageFilter = new NodeClassFilter(ImageTag.class);
        // NodeList images = parser.parse(imageFilter);
        System.out.println("Size: " + images.size());
        for (NodeIterator it = images.elements(); it.hasMoreNodes();) {
            ImageTag node = (ImageTag) it.nextNode();
            System.out.println(node.getAttribute("src"));
        }
    }

    public static void test2(String url) throws ParserException {
        Parser parser = new Parser();
        parser.setURL(url);
        parser.setEncoding("GBK");
        NodeFilter divFilter = new AndFilter(new TagNameFilter("div"),
                new HasAttributeFilter("id", "text_content"));
        NodeList divs = parser.parse(divFilter);
        System.out.println(removeTag(divs.elementAt(0).toHtml()));
    }

    // strip all <...> tags from a string
    public static String removeTag(String content) {
        if (null != content && !"".equals(content.trim())) {
            while (content.indexOf("<") >= 0 && content.indexOf(">") >= 0) {
                int i = content.indexOf("<");
                int j = content.indexOf(">");
                if (i < j) {
                    String content1 = content.substring(0, i);
                    String content2 = content.substring(j + 1, content.length());
                    content = content1 + content2;
                } else {
                    break; // ">" before "<": bail out to avoid an infinite loop
                }
            }
        }
        return content;
    }

    public static void main(String[] args) throws ParserException {
        String url = "http://news.dayoo.com/china/200908/11/53868_10386441.htm";
        Test.test2(url);
    }
}
=====================================================================
/**
 * Read plain text, links and the title separately.
 * @param result the HTML content of the page
 * @throws Exception
 */
public static void readTextAndLinkAndTitle(String result) throws Exception {
    Parser parser;
    NodeList nodelist;
    parser = Parser.createParser(result, "utf8");
    NodeFilter textFilter = new NodeClassFilter(TextNode.class);
    NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
    NodeFilter titleFilter = new NodeClassFilter(TitleTag.class);
    OrFilter lastFilter = new OrFilter();
    lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter, titleFilter });
    nodelist = parser.parse(lastFilter);
    Node[] nodes = nodelist.toNodeArray();
    String line = "";
    for (int i = 0; i < nodes.length; i++) {
        Node node = nodes[i];
        if (node instanceof TextNode) {
            TextNode textnode = (TextNode) node;
            line = textnode.getText();
        } else if (node instanceof LinkTag) {
            LinkTag link = (LinkTag) node;
            line = link.getLink();
        } else if (node instanceof TitleTag) {
            TitleTag titlenode = (TitleTag) node;
            line = titlenode.getTitle();
        }
        if (isTrimEmpty(line))
            continue;
        System.out.println(line);
    }
}

// helper assumed by the snippet above but not shown in the original:
// true when the string is null or blank after trimming
private static boolean isTrimEmpty(String s) {
    return s == null || s.trim().length() == 0;
}
/**
 * @author rrong_m
 * @throws ParserException
 */
public static void getWords(String url) throws ParserException {
    Parser parser = new Parser(url);
    NodeFilter filter = new HasAttributeFilter("id", "word_more_con");
    NodeList nodelist = parser.extractAllNodesThatMatch(filter);
    NodeFilter filter1 = new NodeClassFilter(LinkTag.class);
    nodelist = nodelist.extractAllNodesThatMatch(filter1, true);
    for (int i = 0; i < nodelist.size(); i++) {
        LinkTag link = (LinkTag) nodelist.elementAt(i);
        System.out.println(link.getLinkText() + ":" + link.getLink());
    }
}
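A minimal, hypothetical invocation of the method above; the original does not name the page that carries the id="word_more_con" element, so the URL below is a placeholder:

public static void main(String[] args) throws ParserException {
    getWords("http://example.com/hotwords"); // placeholder URL
}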
public List getChangyu(String id) throws HttpException, IOException, ParserException {
    HttpClient hc = new HttpClient();
    hc.getParams().setContentCharset("gb2312");
    PostMethod pm = new PostMethod("http://61.145.121.47/custSearch.jsp");
    pm.setParameter("bnos", "111111111111");
    hc.executeMethod(pm);
    // cap how much of the response body is read; unbounded reads of very
    // large responses cause an error
    String temp = pm.getResponseBodyAsString(50000);
    // System.out.print(temp);
    // parse the HTML string (createParser takes content, not a URL)
    Parser parser = Parser.createParser(temp, "gb2312");
    NodeFilter filter = new HasAttributeFilter("cellpadding", "-1");
    NodeList nodelist = parser.extractAllNodesThatMatch(filter);
    if (nodelist.size() > 0) {
        NodeFilter filter1 = new NodeClassFilter(TableTag.class);
        nodelist = nodelist.extractAllNodesThatMatch(filter1, true);
        if (nodelist.size() > 0) {
            TableTag table = (TableTag) nodelist.elementAt(0);
            TableRow[] rows = table.getRows();
            if (rows.length > 1) {
                for (int i = 2; i < rows.length; i++) {
                    TableRow row = rows[i];
                    TableColumn td = row.getColumns()[0];
                    TableColumn td1 = row.getColumns()[1];
                    System.out.println(td.toPlainTextString() + " " + td1.toPlainTextString());
                }
            }
        }
    }
    return null; // the original only prints the rows; the return value is a stub
}
public static void getIp(String ip) throws ParserException {
    String url = "http://www.ip138.com/ips.asp?ip=" + ip;
    Parser parser = new Parser(url);
    parser.setEncoding("gb2312"); // set the encoding, otherwise the output is garbled
    NodeList nodelist = null;
    NodeFilter filter = new HasAttributeFilter("class", "ul1"); // define the filter
    nodelist = parser.extractAllNodesThatMatch(filter);
    if (nodelist.size() > 0) {
        System.out.println("The IP you queried: " + ip);
        System.out.println(nodelist.elementAt(0).toPlainTextString());
    }
}

public static void main(String[] args) throws ParserException {
    getIp("125.33.192.180");
}
public static ArrayList<String> getParagraphList(String content) {
    ArrayList<String> paraList = new ArrayList<String>();
    Parser myParser = new Parser();
    NodeList nodeList = null;
    NodeFilter paraFilter = new NodeClassFilter(ParagraphTag.class);
    try {
        myParser.setInputHTML(content);
        nodeList = myParser.parse(paraFilter);
        for (int i = 0; i < nodeList.size(); i++) { // was i <= size(), an off-by-one error
            ParagraphTag tag = (ParagraphTag) nodeList.elementAt(i);
            if (tag != null) {
                // System.out.println(tag.getStringText());
                // System.out.println("***********************************");
                paraList.add(tag.getStringText());
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return paraList;
}
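A minimal usage sketch for the method above; the HTML literal is purely illustrative:

public static void main(String[] args) {
    ArrayList<String> paras = getParagraphList(
            "<html><body><p>first paragraph</p><p>second paragraph</p></body></html>");
    for (String p : paras) {
        System.out.println(p);
    }
}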
// Extract keywords and description from the <meta> tags.
List segments = source.findAllElements(Tag.META);
getKeywordsDesc(segments);
if (null != segments) {
    String keywordsStr = null;
    String descriptionStr = null;
    int sumSegments = segments.size();
    if (sumSegments > 0) {
        for (int i = 0; i < sumSegments; i++) {
            String metaStr = segments.get(i).toString();
            if (null != metaStr && !"".equals(metaStr.trim())) {
                // description
                if (metaStr.indexOf("description") != -1 && metaStr.indexOf("content") != -1) {
                    metaStr = metaStr.replaceAll("\"", "").replaceAll("/", "");
                    descriptionStr = metaStr.substring(metaStr.indexOf("content"));
                    descriptionStr = descriptionStr.substring(
                            descriptionStr.indexOf("=") + 1,
                            descriptionStr.length() - 1);
                    descriptionStr = TextHtml.html2text(descriptionStr);
                    // set inside the if, mirroring the keywords branch below
                    parserBean.setDescription(removeTag(descriptionStr));
                }
                // keywords
                if (metaStr.indexOf("keywords") != -1 && metaStr.indexOf("content") != -1) {
                    metaStr = metaStr.replaceAll("\"", "").replaceAll("/", "");
                    keywordsStr = metaStr.substring(metaStr.indexOf("content"));
                    keywordsStr = keywordsStr.substring(
                            keywordsStr.indexOf("=") + 1,
                            keywordsStr.length() - 1);
                    keywordsStr = TextHtml.html2text(keywordsStr);
                    parserBean.setKeywords(removeTag(keywordsStr));
                }
            }
        } // for over
    }
}
/**
 * Extract keywords and description from the <meta> tags.
 */
private void getKeywordsDesc(List segments) {
    if (null != segments) {
        String keywords = null;
        String description = null;
        int sumSegments = segments.size();
        for (int i = 0; i < sumSegments; i++) {
            String segment = segments.get(i).toString().toLowerCase();
            if (null != segment && !"".equals(segment.trim())) {
                // keywords from the meta tag
                if (segment.indexOf("keywords") > 0 && segment.indexOf("content") > 0) {
                    String patternStr = "< *meta *name *= *\"? *keywords *\"? *content *= *\"?(.*) *\"? */? *>";
                    keywords = Regex(patternStr, segment);
                    if (null == keywords) {
                        patternStr = "< *meta *content *= *\"?(.*) *\"? *name *= *\"? *keywords *\"? */? *>";
                        keywords = Regex(patternStr, segment);
                    }
                    if (null != keywords) {
                        keywords = removeTag(keywords);
                        // assign inside the null check to avoid a NullPointerException
                        this.keywords = keywords.replace("/", "").replace("\"", "");
                    }
                }
                // description from the meta tag
                if (segment.indexOf("description") > 0 && segment.indexOf("content") > 0) {
                    String patternStr = "< *meta name *= *\"? *description *\"? *content *= *\"?(.*) *\"? */? *>";
                    description = Regex(patternStr, segment);
                    if (null == description) {
                        patternStr = "< *meta *content *= *\"?(.*) *\"? *name *= *\"? *description *\"? */? *>";
                        description = Regex(patternStr, segment);
                    }
                    if (null != description) {
                        description = removeTag(description);
                        this.description = description.replace("/", "").replace("\"", "");
                    }
                }
            }
        }
    }
}
// Run patternStr against segment and return the last match of capture group 1.
private String Regex(String patternStr, String segment) {
    String str = null;
    Pattern p = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(segment);
    while (m.find()) {
        str = m.group(1);
    }
    return str;
}
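A minimal sketch of the helper in use; the test method, its placement, and the sample meta string are hypothetical, and the pattern here is deliberately simpler than the ones above, just to show the call shape:

// hypothetical test method, placed in the same class as Regex():
void testRegex() {
    String segment = "<meta name=\"keywords\" content=\"java, html\">";
    System.out.println(Regex("content *= *\"([^\"]*)\"", segment)); // java, html
}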
// List the name/content pairs of all <meta> tags on a page.
Parser parser = new Parser(url);
NodeFilter filter = new NodeClassFilter(MetaTag.class);
NodeList nodelist = parser.extractAllNodesThatMatch(filter);
for (Node node : nodelist.toNodeArray()) {
    MetaTag meta = (MetaTag) node;
    System.out.println(meta.getAttribute("name") + ":" + meta.getAttribute("content"));
}
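A minimal, hypothetical harness for the fragment above, assuming it is wrapped in a method printMetaTags(String url) (that name is invented here, and the URL is a placeholder):

public static void main(String[] args) throws ParserException {
    printMetaTags("http://www.baidu.com"); // placeholder URL
}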