Lucene 如何解析 PPT 文档（借助 Apache POI 提取文本内容）
首先在工程中加入以下 jar 包：poi-3.0.2-FINAL-20080204.jar、poi-contrib-3.0.2-FINAL-20080204.jar、poi-scratchpad-3.0.2-FINAL-20080204.jar。
package com.cs;

/**
 * Contract for a document parser that exposes a title, the full text
 * content and a short summary of a document.
 */
public interface Parsable {

    /**
     * @return the title of the document
     */
    String getTitle();

    /**
     * @return the text content of the document
     */
    String getContent();

    /**
     * @return a summary of the document
     */
    String getSummary();
}
package com.cs;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStream;import org.apache.poi.hslf.HSLFSlideShow;import org.apache.poi.hslf.model.Slide;import org.apache.poi.hslf.model.TextRun;import org.apache.poi.hslf.usermodel.SlideShow;public class PPTParser implements Parsable {private File file;private String content;public PPTParser(File file) {this.file = file;}public String getContent() {if (content != null) {return content;}// HSLFSlideShow contains the main functionality for the Powerpoint file// "reader". It is only a very basic class for now// SlideShow is a friendly wrapper on top of the more scary// HSLFSlideShowInputStream is;try {is = new FileInputStream(file);SlideShow ss = new SlideShow(new HSLFSlideShow(is));Slide[] slides = ss.getSlides();StringBuffer sb = new StringBuffer();for (int i = 0; i < slides.length; i++) {// This class represents a run of text in a powerpoint document.// That run could be text on a sheet, or text in a note.// It is only a very basic class for nowTextRun[] t = slides[i].getTextRuns();for (int j = 0; j < t.length; j++) {sb.append(t[j].getText());}sb.append(slides[i].getTitle());}content = sb.toString();return content;} catch (FileNotFoundException e) {// TODO Auto-generated catch blocke.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}return null;}public String getSummary() {String summary;if (content == null) {getContent();}if (content.length() > 200) {summary = content.substring(0, 200);} else {summary = content;}return summary;}public String getTitle() {// TODO Auto-generated method stubreturn file.getName();}public static void main(String[] args) {PPTParser pptParser = new PPTParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\搜索引擎-基础.ppt")) ;System.out.println("ppt content : "+pptParser.getContent()) ;}}