Java 中使用 HTMLParser 解析 HTML 文档
package com.web.test;import java.io.BufferedReader;import java.io.InputStreamReader;import java.net.URL;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.tags.LinkTag;import org.htmlparser.util.NodeList;/** * JAVA中使用Htmlparse解析HTML文档,使用htmlparse遍历出HTML文档的所有超链接(<a>标记)。 * * @author YYmmiinngg */public class ReadHTML2{public static void main(String[] args){try{//1.网页HTMLString strUrl = "http://www.boc.cn/finadata/lilv/";URL url = new URL(strUrl);InputStreamReader isr = new InputStreamReader(url.openStream());BufferedReader br = new BufferedReader(isr);String htmlString = "";//2.本地HTML// File f=new File("fortest.htm");//输入流// InputStreamReader isr1=new InputStreamReader(new FileInputStream(f));// BufferedReader br=new BufferedReader(isr1);//获取html转换成StringString s;String allContent = "";while ((s = br.readLine()) != null){allContent = allContent + s;}//使用后HTML Parser 控件Parser myParser = Parser.createParser(allContent, "utf-8");try{// 通过过滤器过滤出<A>标签 NodeList nodeList = myParser.extractAllNodesThatMatch(new NodeFilter(){//实现该方法,用以过滤标签 public boolean accept(Node node){if (node instanceof LinkTag) //<A>标记 return true;return false;}});// 打印 for (int i = 0; i < nodeList.size(); i++){LinkTag n = (LinkTag) nodeList.elementAt(i);System.out.print(n.getStringText() + " ==>> ");System.out.println(n.extractLink());}}catch (Exception e){e.printStackTrace();}}catch (Exception e){e.printStackTrace();}}}