Lucene+Paoding完整实例
一个小小的搜索例子,实现对某个文件夹下的文件进行搜索
这里只有主要代码,整个project在附件中,导入到MyEclipse中时根据自己的情况修改配置文件中paoding-dic-home.properties的地址,当然,前提是你必须有庖丁解牛的字典,在页面搜索“项目”,会出现结果(基本每个文件中都有项目这个词)
附件中有项目 T_Search;数据文件目录为 E:\lucene\data,索引目录为 E:\lucene\index
MIndexer.java:创建索引(对文件进行创建,先把文件内容读取成String)
public class MIndexer {public void createIndex() { long start = System.currentTimeMillis(); try { // 获取Paoding中文分词器 Analyzer analyzer = new PaodingAnalyzer(); // indexWriter建立索引,E:\lucene\index建立索引的目录 IndexWriter writer = new IndexWriter("E:\\lucene\\index", analyzer, true,IndexWriter.MaxFieldLength.UNLIMITED); //E:\lucene\data建立索引的数据,主要是.txt、.pdf文件 indexDocs(writer, new File("E:\\lucene\\data")); writer.optimize(); writer.close(); System.out.println("用时:" + (System.currentTimeMillis() - start) + " 毫秒"); } catch (IOException e) { e.printStackTrace(); } } // 遍历文件夹文件,对需要的文件建立索引 static void indexDocs(IndexWriter writer, File file) throws IOException { if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { if (file.getName().endsWith(".htm") || file.getName().endsWith(".html") || file.getName().endsWith(".jsp") || file.getName().endsWith(".php") || file.getName().endsWith(".txt") || file.getName().endsWith(".pdf")) { try { // 针对参数文件建立索引文档 ,一个Document就相当于一跳记录 Document doc = new Document(); // Field.Index.ANALYZED 文件名称 建立索引,分词 doc.add(new Field("filename", file.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); if(file.getName().endsWith(".pdf")){ doc.add(new Field("contents", pdf2txt(file), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); }else { doc.add(new Field("contents", ReadFile(file), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));} writer.addDocument(doc); } catch (FileNotFoundException fnfe) { ; } } } } } // 用字符串形式,读取一个File的内容 public static String ReadFile(File f) { String line = null; StringBuffer temp = new StringBuffer(); try { BufferedReader br = new BufferedReader(new InputStreamReader( new FileInputStream(f), "UTF-8")); while ((line = br.readLine()) != null) { temp.append(line); } } catch 
(FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return temp.toString(); } //若文件为pdf,就用这个读取 public static String pdf2txt(File pfile) {String _content = "";if (pfile.exists() && pfile.getName().lastIndexOf(".pdf") >= 1) {String textFile = String.format("%s%s%s%s%s.txt",pfile.getPath().substring(0,pfile.getPath().lastIndexOf(pfile.getName())),System.getProperty("file.separator"), "temp", System.getProperty("file.separator"), pfile.getName().substring(0, pfile.getName().lastIndexOf(".pdf")));if (!new File(textFile.substring(0, textFile.lastIndexOf(new File(textFile).getName()))).exists()) {new File(textFile.substring(0, textFile.lastIndexOf(new File(textFile).getName()))).mkdirs();}PDDocument pdDoc = null;COSDocument cosDoc = null;try {pdDoc = PDDocument.load(pfile);PDFParser parser = new PDFParser(new FileInputStream(pfile));parser.parse();cosDoc = parser.getDocument();PDFTextStripper stripper = new PDFTextStripper();_content = stripper.getText(new PDDocument(cosDoc));} catch (IOException e) {e.printStackTrace();} finally {try {cosDoc.close();pdDoc.close();if (new File(textFile).exists()) {new File(textFile).delete();}} catch (IOException e) {e.printStackTrace();}}}return _content;}}
?
?
MSearcher.java:搜索,返回符合条件的List
?
public class MSearcher {public List<MBean> searchIndex(String keyword, boolean highlight,int content_length, int start, int length) {String indexpath = "E:\\lucene\\index"; // 索引所在目录List<MBean> mList = new ArrayList<MBean>();if (indexpath != null && new File(indexpath).exists()&& keyword != null && !keyword.trim().equals("") && length > 0) {start = (start > 0) ? start : 1;String[] FIELD = { "filename", "contents" };// 获取Paoding中文分词器Analyzer analyzer = new PaodingAnalyzer();FSDirectory directory;IndexReader reader;Searcher searcher;try {directory = FSDirectory.getDirectory(indexpath);reader = IndexReader.open(directory);String queryString = keyword;/* * 下面这个表示要同时搜索这两个域,而且只要一个域里面有满足我们搜索的内容就行 SHOULD表示查询条件为or * MUST表示查询条件为and MUST_NOT表示查询条件为not */BooleanClause.Occur[] flags = new BooleanClause.Occur[] {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };Query query = MultiFieldQueryParser.parse(queryString, FIELD,flags, analyzer);searcher = new IndexSearcher(directory);query = query.rewrite(reader);//分页,取出前start + length - 1条数据TopDocCollector collector = new TopDocCollector(start + length - 1);searcher.search(query, collector);ScoreDoc[] hits = collector.topDocs().scoreDocs;BoldFormatter formatter = new BoldFormatter();Highlighter highlighter = new Highlighter(formatter,new QueryScorer(query));highlighter.setTextFragmenter(new SimpleFragmenter(content_length));for (int i = start - 1; i < hits.length; i++) {MBean mBean = new MBean();Document doc = searcher.doc(hits[i].doc);String _filename = doc.get(FIELD[0]);String _contents = doc.get(FIELD[1]);int maxNumFragmentsRequired = 5;String fragmentSeparator = "...";TermPositionVector tpv_filename = (TermPositionVector) reader.getTermFreqVector(hits[i].doc, FIELD[0]);TermPositionVector tpv_contents = (TermPositionVector) reader.getTermFreqVector(hits[i].doc, FIELD[1]);String high_filename = "";String high_contents = "";if (tpv_filename != null) {TokenStream token_filename = 
TokenSources.getTokenStream(tpv_filename);high_filename = highlighter.getBestFragments(token_filename, _filename,maxNumFragmentsRequired, fragmentSeparator);}if (tpv_contents != null) {TokenStream token_contents = TokenSources.getTokenStream(tpv_contents);high_contents = highlighter.getBestFragments(token_contents, _contents,maxNumFragmentsRequired, fragmentSeparator);}mBean.setFilename((high_filename != null && !high_filename.equals("")) ? high_filename : _filename);mBean.setContents((high_contents != null && !high_contents.equals("")) ? high_contents: (_contents.length() > content_length ? _contents.substring(0, content_length) : _contents));mList.add(mBean);}searcher.close();reader.close();} catch (ParseException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}}return mList;}public Integer searchIndexLength(String keyword, boolean highlight,int content_length, int start, int length, int maxLength) {int _count = 0;String indexpath = "E:\\lucene\\index";if (indexpath != null && new File(indexpath).exists()&& keyword != null && !keyword.trim().equals("") && length > 0) {start = (start > 0) ? start : 1;String[] FIELD = { "filename", "contents" };Analyzer analyzer = new PaodingAnalyzer();FSDirectory directory;IndexReader reader;Searcher searcher;try {directory = FSDirectory.getDirectory(indexpath);reader = IndexReader.open(directory);String queryString = keyword;BooleanClause.Occur[] flags = new BooleanClause.Occur[] {BooleanClause.Occur.SHOULD,BooleanClause.Occur.SHOULD };Query query = MultiFieldQueryParser.parse(queryString, FIELD,flags, analyzer);searcher = new IndexSearcher(reader);query = query.rewrite(reader);TopDocCollector collector = new TopDocCollector(maxLength);searcher.search(query, collector);ScoreDoc[] hits = collector.topDocs().scoreDocs;_count = hits.length;searcher.close();reader.close();} catch (ParseException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();}}return _count;}}
?
Search.java:处理用户请求的Servlet
public class Search extends HttpServlet {private static final Integer NUMBER = 10;//每页显示10条private static final Integer CONTENT_LENGTH = 50;private static final Boolean HIGHLIGHT = true;private static final long serialVersionUID = 1L;private MSearcher mSearcher = new MSearcher();@Overridepublic void doPost(HttpServletRequest request, HttpServletResponse response)throws ServletException, IOException {request.setCharacterEncoding("UTF-8");String q = request.getParameter("q") != null ? request.getParameter("q").trim() : request.getParameter("q");System.out.println("----"+q);List<MBean> mList = new ArrayList<MBean>();List<PBean> pList = new ArrayList<PBean>();int start = request.getParameter("start")!= null ? Integer.valueOf(request.getParameter("start")): 0;int all_count = 0;all_count = mSearcher.searchIndexLength( q, HIGHLIGHT,CONTENT_LENGTH, start, NUMBER, NUMBER * 1000);mList = mSearcher.searchIndex( q, HIGHLIGHT,CONTENT_LENGTH, start, NUMBER);pList = getPageList(all_count, start);if (start > NUMBER) {request.setAttribute("previous", start - NUMBER);}if (start < all_count - NUMBER) {request.setAttribute("next", NUMBER + (start != 0 ? start : 1));}request.setAttribute("q", q);request.setAttribute("start", start);request.setAttribute("pList", pList);request.setAttribute("mList", mList.isEmpty() ? null : mList);request.getRequestDispatcher("/index.jsp").forward(request, response);}@Overridepublic void doGet(HttpServletRequest request, HttpServletResponse response)throws ServletException, IOException {doPost(request, response);}private static List<PBean> getPageList(int all_count, int start) {MIndexer mIndexer = new MIndexer();mIndexer.createIndex();List<PBean> pList = new ArrayList<PBean>();int all_page = (all_count <= 0) ? 1 : (all_count / NUMBER + (all_count% NUMBER > 0 ? 1 : 0));int now_page = (start <= 0) ? 1: (start / NUMBER + (start % NUMBER > 0 ? 1 : 0));for (int i = (now_page - 10 > 0 ? now_page - 10 : 1); i <= (((now_page + 9) <= all_page) ? 
(now_page + 9): all_page); i++) {PBean pBean = new PBean();pBean.setPage(i);pBean.setStart((pBean.getPage() - 1) * NUMBER + 1);pList.add(pBean);}return pList;}}
?
?