lucene入门-使用pdfbox解析中文PDF
很多人使用 PDFBox 无法解析中文 PDF,其实是编程时没有指定字符集导致的;指定字符集后,PDFBox 完全可以解析中文 PDF。
下载JAR文件
下载pdfbox
http://incubator.apache.org/pdfbox/
下载相关的jar
http://commons.apache.org/downloads/download_logging.cgi
引入external下的所有包
笔者BLOG地址:http://blog.163.com/sukerl@126/
以下是JAVA代码,注意其中 new PDFTextStripper("GBK") 一处指定了字符集(原文中以红色标出):
package extract;

import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.*;

/**
 * Extracts the text of a PDF file with PDFBox, passing an explicit encoding
 * name ("GBK") to {@code PDFTextStripper} so that Chinese text is handled.
 *
 * NOTE(review): this uses {@code PDDocument.load(String)} and the
 * {@code PDFTextStripper(String)} constructor, which presumably targets the
 * PDFBox 1.x API; confirm against the PDFBox version on the classpath.
 */
public class ExtractorPDF {

    /**
     * Reads the whole text content of a PDF file.
     *
     * @param file path of the PDF file to read
     * @return the extracted text, or an empty string if an
     *         {@link IOException} occurred (the stack trace is printed)
     */
    public static String getText(String file) {
        String s = "";
        PDDocument pdfdoc = null;
        try {
            pdfdoc = PDDocument.load(file);
            // The encoding name is the point of this example: it is what
            // makes Chinese PDFs extract correctly.
            PDFTextStripper stripper = new PDFTextStripper("GBK");
            s = stripper.getText(pdfdoc);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeDocument(pdfdoc);
        }
        return s;
    }

    /**
     * Extracts the text of a PDF file and writes it to a text file.
     *
     * An {@link IOException} raised while loading, extracting or opening the
     * output file is printed and not rethrown.
     *
     * @param doc      path of the PDF file to read
     * @param filename path of the text file to create
     * @throws Exception declared for callers; any non-I/O failure propagates
     */
    public static void toTextFile(String doc, String filename) throws Exception {
        PDDocument pdfdoc = null;
        PrintWriter pw = null;
        try {
            pdfdoc = PDDocument.load(doc);
            PDFTextStripper stripper = new PDFTextStripper("GBK");
            // NOTE(review): FileWriter encodes with the platform default
            // charset; confirm that this is the intended output encoding.
            pw = new PrintWriter(new FileWriter(filename));
            stripper.writeText(pdfdoc, pw);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the writer flushes buffered text to disk; without it
            // the output file can end up empty or truncated.
            if (pw != null) {
                pw.close();
            }
            closeDocument(pdfdoc);
        }
    }

    /**
     * Closes a document if it was opened, printing (not propagating) any
     * {@link IOException} raised by the close.
     */
    private static void closeDocument(PDDocument pdfdoc) {
        try {
            if (pdfdoc != null) {
                pdfdoc.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Demo entry point: prints the text of a sample PDF and also writes it
     * to a text file next to it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            String sc = getText("D:/workspace/testsearch2/htmls/xxxx.pdf");
            System.out.print(sc);
            toTextFile("D:/workspace/testsearch2/htmls/xxxx.pdf",
                    "D:/workspace/testsearch2/htmls/xxxx.txt");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}