Lucene学习之构建简单的文档库
在构建一个简单的文档库时,需要满足两个条件:第一,必须能抓取文档的内容;第二,根据抓取的内容构建文档库的索引,然后根据索引进行搜索。前面两篇博客中介绍的tika和Lucene可以分别满足这两个条件。本篇博客就结合这两个框架来创建一个简单的文档库。在介绍Lucene入门时,我们使用了FileReader来读入字符文件,在这里我们就要使用tika来读入并解析各种文档了。我们只需要在Lucene入门时所使用的代码中修改一行代码即可。为了便于大家运行和调试,将全部代码贴出,修改的代码在140行(即indexDocs方法中调用tika.parse(file)的地方):
?
package com.hsdl.lucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.apache.tika.Tika;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Minimal document-library demo: Tika extracts the text of every file under a
 * directory, Lucene indexes it, and a single query is then run against the
 * resulting index.
 */
public class LuceneDemo2 {

    /** Name of the index field that holds the text extracted from each file. */
    private static final String CONTENT_FIELD_NAME = "content";

    /** Tika facade used to turn a file into a Reader over its extracted text. */
    private static final Tika tika = new Tika();

    /**
     * Indexes the documents under the hard-coded document directory and then
     * searches the index for a fixed keyword. Failures are printed to stderr.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Analyzer analyzer = new IKAnalyzer();
        try {
            String docPath = "D:/work/lucene/tika/doc";
            String indexPath = "D:/work/lucene/tika/index";
            // Build the index.
            createIndex(analyzer, indexPath, docPath);
            // Run the query.
            search(analyzer, indexPath, "微信");
        } catch (IOException e) {
            // Also covers CorruptIndexException and LockObtainFailedException,
            // which are IOException subclasses and were handled identically.
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates (or appends to) the index at {@code indexPath} from the file or
     * directory at {@code docPath}.
     *
     * <p>The writer and the directory are always closed, even when indexing
     * fails, so the index write lock is not left behind. Because the writer is
     * closed rather than rolled back, documents added before a failure are
     * still committed.
     *
     * @param analyzer  analyzer used to tokenize the document content
     * @param indexPath file-system directory that holds the index
     * @param docPath   file or directory whose contents are indexed
     * @throws IOException               if reading documents or writing the index fails
     * @throws CorruptIndexException     if the existing index is corrupt
     * @throws LockObtainFailedException if the index write lock cannot be obtained
     */
    private static void createIndex(Analyzer analyzer, String indexPath,
            String docPath) throws IOException, CorruptIndexException,
            LockObtainFailedException {
        Directory directory = FSDirectory.open(new File(indexPath));
        try {
            IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_45, analyzer);
            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
            IndexWriter iwriter = new IndexWriter(directory, iwConfig);
            try {
                indexDocs(iwriter, new File(docPath));
            } finally {
                iwriter.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Searches the content field of the index at {@code indexPath} and prints
     * the hit count followed by up to ten best-scoring documents.
     *
     * @param analyzer  analyzer used to parse the query string
     * @param indexPath file-system directory that holds the index
     * @param queryStr  query text; terms are combined with AND
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           if the index cannot be read
     * @throws ParseException        if {@code queryStr} is not a valid query
     */
    private static void search(Analyzer analyzer, String indexPath,
            String queryStr) throws CorruptIndexException, IOException,
            ParseException {
        Directory directory = FSDirectory.open(new File(indexPath));
        try {
            IndexReader ireader = DirectoryReader.open(directory);
            try {
                IndexSearcher isearcher = new IndexSearcher(ireader);

                // Build the Query with the classic QueryParser.
                QueryParser qp = new QueryParser(Version.LUCENE_45, CONTENT_FIELD_NAME, analyzer);
                qp.setDefaultOperator(QueryParser.AND_OPERATOR);
                Query query = qp.parse(queryStr);

                // Fetch the 10 highest-scoring hits.
                TopDocs topDocs = isearcher.search(query, 10);
                System.out.println("命中:" + topDocs.totalHits);

                // Print the results.
                ScoreDoc[] scoreDocs = topDocs.scoreDocs;
                System.out.println(scoreDocs.length);
                for (int i = 0; i < scoreDocs.length; i++) {
                    Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                    System.out.println("内容:" + targetDoc.toString());
                    System.out.println(targetDoc.get("fileName") + "["
                            + targetDoc.get("path") + "]");
                }
            } finally {
                ireader.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Indexes the given file, or recursively every readable file below it when
     * it is a directory. Unreadable files and directories are skipped silently.
     *
     * @param iwriter writer the documents are added to
     * @param file    file or directory to index
     * @throws IOException if a file cannot be parsed or the index cannot be written
     */
    public static void indexDocs(IndexWriter iwriter, File file) throws IOException {
        if (!file.canRead()) {
            return;
        }

        if (file.isDirectory()) {
            // list() returns null when the directory cannot be listed.
            String[] files = file.list();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(iwriter, new File(file, files[i]));
                }
            }
            return;
        }

        // Tika opens the file itself and exposes the extracted text as a Reader,
        // so no separate FileInputStream is needed.
        Reader content = tika.parse(file);
        try {
            Document doc = new Document();
            // Every document is stored with the same fixed ID value.
            doc.add(new StringField("ID", "10000", Field.Store.YES));
            doc.add(new TextField(CONTENT_FIELD_NAME, content));
            doc.add(new StringField("fileName", file.getName(), Field.Store.YES));
            doc.add(new StringField("path", file.getAbsolutePath(), Field.Store.YES));
            iwriter.addDocument(doc);
        } finally {
            // Release the parser's resources even if addDocument fails before
            // the content has been consumed.
            content.close();
        }
    }
}
?