用Java实现将HTML保存为txt文本时,怎样去掉输出中多余的CSS样式内容 body { font-family: SimSun; font-size:22px; ..... }?
编写了一个java类,将一个html网页保存为txt文本,保存后的txt文本内容都正确,但是总是带着
body {
font-family: SimSun;
font-size:22px;
font-style:italic;
font-weight:bold;
color:#00F;
}
这段CSS样式文本不应该出现在txt里,不知道该怎样去掉,求大侠帮忙。
java部分代码:
package format.conversion;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import javax.servlet.jsp.tagext.BodyTag;
import javax.swing.JFileChooser;
import javax.swing.filechooser.FileNameExtensionFilter;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
public class HtmlToTxt {
public static void main(String[] args) throws Exception {
HtmlToTxt test=new HtmlToTxt();
test.go();
}
public void go(){
try{
JFileChooser fileSave=new JFileChooser(".");
FileNameExtensionFilter extension=new FileNameExtensionFilter("txt Files(.txt)","txt");
fileSave.setFileFilter(extension);
fileSave.showSaveDialog(null);
File file=fileSave.getSelectedFile();
if(!file.getPath().endsWith(".txt")){
file=new File(file.getPath()+".txt");
}
String outputFile =file.toString();
FileWriter writer=new FileWriter(outputFile);
String content = readTextFile("WebRoot/Report.html","UTF-8");
String txtcontent=getText(content);
writer.write(txtcontent);
writer.close();
System.out.println("txt文件保存成功!");
System.out.println("文件保存路径为:"+new File(outputFile).toURI().toURL());
}catch(IOException ex){
System.out.println("txt文件保存失败!");
}catch(ParserException ex){
System.out.println("字符转换失败");
}
}
/*----------------获取文本内容和标题----------------------*/
public static String getText(String content) throws ParserException {
Parser myParser; //htmlParser对html页面解析
NodeList nodeList = null;
StringBuilder result = new StringBuilder();
myParser = Parser.createParser(content, "UTF-8");
NodeFilter textFilter = new NodeClassFilter(TextNode.class NodeFilter linkFilter = new NodeClassFilter(LinkTag.class
OrFilter lastFilter = new OrFilter();
lastFilter.setPredicates(new NodeFilter[] { textFilter, linkFilter});
nodeList = myParser.parse(lastFilter);//获取节点列表
Node[] nodes = nodeList.toNodeArray(); //获取节点数组
String line = "";
for (int i = 0; i < nodes.length; i++) {
Node anode = (Node) nodes[i];
if (anode instanceof TextNode) { TextNode textnode = (TextNode) anode;
line = textnode.getText();
} else if (anode instanceof LinkTag) {
LinkTag linknode = (LinkTag) anode;
line = linknode.getLink();
}
if (isTrimEmpty(line))
continue;
result.append(line);
}
return result.toString();
}
/*-------------------读取html文件-------------------*/
public static String readTextFile(String sFileName, String sEncode) {
StringBuffer sbStr = new StringBuffer(); //字符串变量
try {
File ff = new File(sFileName);
InputStreamReader read = new InputStreamReader(new FileInputStream(
ff), sEncode); BufferedReader ins = new BufferedReader(read);
String dataLine = "";
while (null != (dataLine = ins.readLine())) {
sbStr.append(dataLine);
sbStr.append("\r\n");
}
ins.close();
} catch (Exception e) {
e.printStackTrace();
}
return sbStr.toString();
}
public static boolean isTrimEmpty(String astr) {
if ((null == astr) || (astr.length() == 0)) {
return true;
}
if (isBlank(astr.trim())) {
return true;
}
return false;
}
public static boolean isBlank(String astr) {
if ((null == astr) || (astr.length() == 0)) {
return true;
} else {
return false;
}
}
}
java html
[解决办法]