跪求解决！利用Htmlparser抓取网页正文时出错，求教大神解决

2012-12-23

跪求解决！！！利用Htmlparser抓取网页正文时出错，求教大神解决!

跪求解决！！！利用Htmlparser抓取网页正文时出错，求教大神解决!
跪求解决！！！
利用Htmlparser抓取网页正文时出错，求教大神解决!
/** 正文通常包含在TABLE、DIV或ParagraphTag标签里，因此包含文字最多的DIV或TABLE通常就是正文所在的位置。 **/
/**
 * Record of the statistics used to judge whether a table is valid.
 *
 * <p>Plain mutable bean holding five integer counters; every counter starts
 * at {@code 0}. The counter names suggest counts of rows, cells, links, text
 * and scripts found in a table -- presumably filled in by the extraction
 * code; confirm against the caller.
 */
public class TableValid {

    private int trnum;
    private int tdnum;
    private int linknum;
    private int textnum;
    private int scriptnum;

    // ---- trnum ----

    /** @return the current {@code trnum} counter */
    public int getTrnum() {
        return this.trnum;
    }

    /** @param trnum new value of the {@code trnum} counter */
    public void setTrnum(int trnum) {
        this.trnum = trnum;
    }

    // ---- tdnum ----

    /** @return the current {@code tdnum} counter */
    public int getTdnum() {
        return this.tdnum;
    }

    /** @param tdnum new value of the {@code tdnum} counter */
    public void setTdnum(int tdnum) {
        this.tdnum = tdnum;
    }

    // ---- linknum ----

    /** @return the current {@code linknum} counter */
    public int getLinknum() {
        return this.linknum;
    }

    /** @param linknum new value of the {@code linknum} counter */
    public void setLinknum(int linknum) {
        this.linknum = linknum;
    }

    // ---- textnum ----

    /** @return the current {@code textnum} counter */
    public int getTextnum() {
        return this.textnum;
    }

    /** @param textnum new value of the {@code textnum} counter */
    public void setTextnum(int textnum) {
        this.textnum = textnum;
    }

    // ---- scriptnum ----

    /** @return the current {@code scriptnum} counter */
    public int getScriptnum() {
        return this.scriptnum;
    }

    /** @param scriptnum new value of the {@code scriptnum} counter */
    public void setScriptnum(int scriptnum) {
        this.scriptnum = scriptnum;
    }
}
//table中的内容

import java.util.List;

/**
 * Holder for the content collected from one table.
 *
 * <p>Plain mutable bean: a list of links, a text buffer, two row counters and
 * a string marker. Nothing is initialised here, so object-typed properties
 * are {@code null} and the counters are {@code 0} until a setter is called.
 * Setters store the given reference as-is (no defensive copy).
 */
public class TableContext {

    private List<?> linkList;
    private StringBuffer textBuffer;
    private int tableRow;
    private int totalRow;
    private String sign;

    /** @return the stored link list, or {@code null} if none was set */
    public List<?> getLinkList() {
        return this.linkList;
    }

    /** @param linkList list to store; kept by reference */
    public void setLinkList(List<?> linkList) {
        this.linkList = linkList;
    }

    /** @return the stored text buffer, or {@code null} if none was set */
    public StringBuffer getTextBuffer() {
        return this.textBuffer;
    }

    /** @param textBuffer buffer to store; kept by reference */
    public void setTextBuffer(StringBuffer textBuffer) {
        this.textBuffer = textBuffer;
    }

    /** @return the {@code tableRow} value */
    public int getTableRow() {
        return this.tableRow;
    }

    /** @param tableRow new {@code tableRow} value */
    public void setTableRow(int tableRow) {
        this.tableRow = tableRow;
    }

    /** @return the {@code totalRow} value */
    public int getTotalRow() {
        return this.totalRow;
    }

    /** @param totalRow new {@code totalRow} value */
    public void setTotalRow(int totalRow) {
        this.totalRow = totalRow;
    }

    /** @return the {@code sign} marker, or {@code null} if none was set */
    public String getSign() {
        return this.sign;
    }

    /** @param sign new {@code sign} marker */
    public void setSign(String sign) {
        this.sign = sign;
    }
}

/**
 * Record of whether a table column is valid.
 *
 * <p>Plain mutable bean pairing an integer {@code tdNum} with a boolean
 * {@code valid} flag. Both fields are package-private, so code in the same
 * package may also read or write them directly.
 */
public class TableColumnValid {

    int tdNum;
    boolean valid;

    /** @return the stored {@code tdNum} value ({@code 0} until set) */
    public int getTdNum() {
        return this.tdNum;
    }

    /** @param tdNum new {@code tdNum} value */
    public void setTdNum(int tdNum) {
        this.tdNum = tdNum;
    }

    /** @return the stored validity flag ({@code false} until set) */
    public boolean isValid() {
        return this.valid;
    }

    /** @param valid new validity flag */
    public void setValid(boolean valid) {
        this.valid = valid;
    }
}

//页面内容
import org.htmlparser.Node;

/**
 * Holder for the content collected from one page.
 *
 * <p>Plain mutable bean: a text buffer, an integer {@code number} and an
 * htmlparser {@link Node}. Nothing is initialised here, so the buffer and
 * the node are {@code null} and {@code number} is {@code 0} until a setter
 * is called. Setters store the given reference as-is.
 */
public class PageContext {

    private StringBuffer textBuffer;
    private int number;
    private Node node;

    /** @return the stored text buffer, or {@code null} if none was set */
    public StringBuffer getTextBuffer() {
        return this.textBuffer;
    }

    /** @param textBuffer buffer to store; kept by reference */
    public void setTextBuffer(StringBuffer textBuffer) {
        this.textBuffer = textBuffer;
    }

    /** @return the stored {@code number} value */
    public int getNumber() {
        return this.number;
    }

    /** @param number new {@code number} value */
    public void setNumber(int number) {
        this.number = number;
    }

    /** @return the stored node, or {@code null} if none was set */
    public Node getNode() {
        return this.node;
    }

    /** @param node node to store; kept by reference */
    public void setNode(Node node) {
        this.node = node;
    }
}

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.Html;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ParagraphTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableHeader;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

//正文抽取主程序
public class ExtractContext {
protected static final String lineSign = System
.getProperty("line.separator");
protected static final int lineSign_size = lineSign.length();

/** 定义系统上下文* */
public static final ApplicationContext context = new ClassPathXmlApplicationContext(
new String[] { "newwatch/persistence.xml", "newwatch/biz-util.xml",
"newwatch/biz-dao.xml" });

/**
* @param <ChannelLinkDO>
* @param args
*/
public static void main(String[] args) {
ExtractContext console = new ExtractContext();
ChannelLinkDO c = new ChannelLinkDO();
c.setEncode("gb2312");
c.setLink("http://www.qiche.com.cn/files/200712/12016.shtml");
c.setLinktext("test");
console.makeContext(c);
}

/**
* 收集HTML页面信息
*

* @param url
* @param urlEncode
*/
public void makeContext(ChannelLinkDO c) {
String metakeywords = "<META content={0} name=keywords>";
String metatitle = "<TITLE>{0}</TITLE>";
String metadesc = "<META content={0} name=description>";
String netshap = "<p> 正文快照: 时间{0}</p> ";

String tempLeate = "<LI class=active><A href="{0}" target=_blank>{1}</A></LI>";
String crop = "<p><A href="{0}" target=_blank>{1}</A></p> ";

try {
String siteUrl = getLinkUrl(c.getLink());
Parser parser = new Parser(c.getLink());
parser.setEncoding(c.getEncode());
for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
Node node = (Node) e.nextNode();
if (node instanceof Html) {
PageContext context = new PageContext();
context.setNumber(0);
context.setTextBuffer(new StringBuffer());
// 抓取出内容
extractHtml(node, context, siteUrl);
StringBuffer testContext = context.getTextBuffer();
String srcfilePath = "D:/kuaiso/site/templeate/context.vm";
String destfilePath = "D:/kuaiso/site/test/test.htm";
BufferedReader reader = new BufferedReader(
new InputStreamReader(new FileInputStream(
srcfilePath), "gbk"));
BufferedWriter writer = new BufferedWriter(
new OutputStreamWriter(new FileOutputStream(
destfilePath), "gbk"));
String lineContext = context.getTextBuffer().toString();
String line;
while ((line = reader.readLine()) != null) {
int start = line.indexOf("#context");
if (start >= 0) {
String tempCrop = StringUtils.replace(crop, "{0}", c
.getLink());
tempCrop = StringUtils.replace(tempCrop, "{1}",
" 原文链接： " + c.getLink());
writer.write(tempCrop + lineSign);
writer.write(netshap + lineSign);
writer.write(lineContext + lineSign);
continue;
}
int start1 = line.indexOf("#titledesc");
if (start1 >= 0) {
String tempLine = StringUtils.replace(tempLeate,
"{0}", "test.htm");
tempLine = StringUtils.replace(tempLine, "{1}",
"标题: " + c.getLinktext());

writer.write(tempLine + lineSign);
continue;
}
int start2 = line.indexOf("#metatitle");
if (start2 >= 0) {
metatitle = StringUtils.replace(metatitle, "{0}", c
.getLinktext());
writer.write(metatitle + lineSign);
continue;
}
int start3 = line.indexOf("#metadesc");
if (start3 >= 0) {
metadesc = StringUtils.replace(metadesc, "{0}", c
.getLinktext());
writer.write(metadesc + lineSign);
continue;
}
writer.write(line + lineSign);

}
writer.flush();
writer.close();
reader.close();
}
}
} catch (Exception e) {
System.out.println(e);
}
}

// 从一个字符串中提取出链接
private String getLinkUrl(String link) {
String urlDomaiPattern = "(http://[^/]*?" + "/)(.*?)";
Pattern pattern = Pattern.compile(urlDomaiPattern,
Pattern.CASE_INSENSITIVE + Pattern.DOTALL);
Matcher matcher = pattern.matcher(link);
String url = "";
while (matcher.find()) {
int start = matcher.start(1);
int end = matcher.end(1);
url = link.substring(start, end - 1).trim();
}
return url;
}

/**由于字符限制还有 extractHtml、setLinkImg、extractParagraph、getSpanWord、isValidTable、findTD、collapse方法在下个回复中**/

}//class ExtractContext

[解决办法]
我也有同样的问题，不知你解决没有？
[解决办法]
系统没找着commons-logging.jar
你要把它加入到classpath中去，如果是用Eclipse，在项目的“Build Path”的设置中把它添加进来

热点排行

Java相关

跪求解决！利用Htmlparser抓取网页正文时出错，求教大神解决

跪求解决！利用Htmlparser抓取网页正文时出错，求教大神解决