lucene根据文件类型自动解析的工厂类
阅读本章之前 请先参考其他几篇解析各类文档的章节
http://wuzhaohuixy-qq-com.iteye.com/blog/780437
http://wuzhaohuixy-qq-com.iteye.com/blog/780431
http://wuzhaohuixy-qq-com.iteye.com/blog/780426
http://wuzhaohuixy-qq-com.iteye.com/blog/780423
这里主要讲解如何根据文件类型自动选择解析器来解析文档（ppt、pdf、txt、doc、rtf、html、htm）
用java中的反射机制
先准备属性文件
parser.properties
txt=com.cs.TextParser
doc=com.cs.DocParser
rtf=com.cs.DocParser
ppt=com.cs.PPTParser
pdf=com.cs.PdfParser
html=com.cs.EasyHtmlParser
htm=com.cs.EasyHtmlParser
package com.cs;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.lang.reflect.Constructor;import java.lang.reflect.InvocationTargetException;import java.util.Properties;public class ParserFactory {//该类一加载就把配置文件读到内存static Properties ps ;static{ps = new Properties() ;try {ps.load(new FileInputStream("E:\\EclipseStudyWorkspace\\LuceneParse\\src\\parser.properties")) ;} catch (FileNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}//该方法根据文件的后缀名确定该文件的类型 然后格局配置文件的类型分词建立索引public static Parsable getParser(File file){String ext = file.getAbsolutePath().substring(file.getAbsolutePath().lastIndexOf(".") + 1) ; String className = ps.getProperty(ext) ;Parsable parser = null ;if (className != null){try {//此处不能直接Class.forName().newInstance()//原因:要传参数//所以要先拿到构造器 然后根据构造器区newInstance() 此时就可以传入参数了Class clazz = Class.forName(className) ;//根据参数的不同拿到不同的构造器Constructor constructor = clazz.getConstructor(new Class[]{File.class}) ;//此处传入的参数是Class类型parser = (Parsable)constructor.newInstance(new Object[]{file}) ;} catch (ClassNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (SecurityException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (NoSuchMethodException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IllegalArgumentException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (InstantiationException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IllegalAccessException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (InvocationTargetException e) {// TODO Auto-generated catch blocke.printStackTrace();}}return parser ;}}
package com.cs;import java.io.File;public class Test {/** * @param args */public static void main(String[] args) {Parsable parser = null ;parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\搜索引擎-基础.ppt"));//parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\123.pdf"));//parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\www.htm"));//parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\XPDF使用文档.doc"));//parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\文档.txt"));//parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\BaseItem.html"));System.out.println(" content : "+parser.getContent()) ;}}