lucene 查询示范

2012-07-23

lucene 查询示例 · 排序：Lucene 默认按照相关度（score）排序。为了支持其他排序方式（比如按日期），我们在添加 Field 时，必须保证该 field 被索引且不被分词。

lucene 查询示例

排序
Lucene 默认按照相关度（score）排序。为了支持其他的排序方式（比如按日期），我们在添加 Field 的时候，必须保证该 field 被索引（Index）且不被分词（tokenized），并且用于排序的字段只能是数字、日期、字符三种类型之一。

实体类

public class Article {
? private String id;
? private String title;
? private String keyWords;
? private String content;
? private int order;
????????????? 省略set..get 方法
}

组织数据

import java.util.ArrayList;import java.util.List;import com.company.project.entity.Article;public class DATAUTIls {public static List<Article> luceneDatas = new ArrayList<Article>();static {Article a1 = new Article();a1.setContent("我们都是中国人" );a1.setId("1");a1.setTitle("法眼看中国是怎么样的一个中国" ) ;//有两个中国a1.setKeyWords("中国，中国，中国") ;a1.setOrder(1);Article a2 = new Article();a2.setContent("我们是两个中国 中国" );a2.setId("2");a2.setTitle("法眼看中国是怎么样的一个中国 中国" ) ;//有两个中国a2.setKeyWords("中国，中国") ;a2.setOrder(2);Article a3 = new Article();a3.setContent("我们都是中国人" );a3.setId("3");a3.setTitle("法眼看怎么样的一个中国" ) ;//有两个中国a3.setKeyWords("中国 ") ;a3.setOrder(3);Article a4 = new Article();a4.setContent("我们都是国中人" );a4.setId("4");a4.setTitle("法眼看" ) ;//有两个中国a4.setKeyWords("无") ;a4.setOrder(4);luceneDatas.add(a1);luceneDatas.add(a3);luceneDatas.add(a2);luceneDatas.add(a4);}}

建造索引

import java.io.File;import java.io.IOException;import java.util.List;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.IndexWriterConfig.OpenMode;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.store.NRTCachingDirectory;import org.apache.lucene.util.Version;import com.company.project.entity.Article;public class IndexRunner { private String INDEX_STORe_PATH = "D:\\workplace\\company\\mylucene\\indexstore";      public IndexRunner(){};     public IndexRunner(String index_path)     {         this.INDEX_STORe_PATH = index_path;         File dir = new File(index_path);         if(dir.exists())         {         dir.mkdir();         }     }          //创建索引     public void createIndex(List<Article> datas,boolean isCreate) throws IOException     {                  //待创建得文档目录         Directory dir = FSDirectory.open(new File(INDEX_STORe_PATH));         //选择得分词工具         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);         //建立索引的配置类，包含了一个解析器         IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_34, analyzer);          //设置我们的解析器是新建还是追加更新         if(isCreate){             iwc.setOpenMode(OpenMode.CREATE);//每次建立都覆盖原来的索引         }         else{         iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);//每次都追加更新         }                               NRTCachingDirectory cachedFSDir = new NRTCachingDirectory(dir, 5.0, 60.0);         iwc.setMergeScheduler(cachedFSDir.getMergeScheduler());                           //索引的建立类 第一个参数索引的存放位置，第二个参数索引的配置对象         IndexWriter writer = new IndexWriter(dir, iwc);                                            for(int i=0;i<datas.size();i++)         {         Article article =   datas.get(i );         /*   
        * Field.Store.YES:存储字段值（未分词前的字段值） Field.Store.NO:不存储,存储与索引没有关系           * Field.Store.COMPRESS:压缩存储,用于长文本或二进制，但性能受损 Field.Index.ANALYZED:分词建索引           * Field.Index.ANALYZED_NO_NORMS:分词建索引，但是Field的值不像通常那样被保存，而是只取一个byte，这样节约存储空间           * Field.Index.NOT_ANALYZED:不分词且索引           * Field.Index.NOT_ANALYZED_NO_NORMS:不分词建索引，Field的值去一个byte保存           */          Field f0 = new Field("title", article.getTitle(), Field.Store.YES, Field.Index.ANALYZED);          Field f1 = new Field("content",article.getContent(),Field.Store.YES,Field.Index.ANALYZED);          Field f2 = new Field("order",String.valueOf(article.getOrder()),Field.Store.YES,Field.Index.NOT_ANALYZED);          Field f3 = new Field("id",String.valueOf(article.getId()),Field.Store.YES,Field.Index.NOT_ANALYZED);                    Document doc = new Document();          doc.add(f0) ;          doc.add(f1);          doc.add(f2);          doc.add(f3);           writer.addDocument(doc);           }                    //这个方法在新增索引的情况会很有用，就是讲原来散落的索引文件重新进行整理合并！         //          writer.forceMerge(1);                  writer.close();         System.out.println("索引创建成功");                        }     public static void main(String[] args) {                 IndexRunner indexRunner = new IndexRunner();         try {         indexRunner.createIndex(DATAUTIls.luceneDatas,true);         } catch (IOException e) {             // TODO Auto-generated catch block             e.printStackTrace();         }     }}

查询

此处有三种查询，一种是多字段查询一个关键字，一种是多字段组合查询，还有一种是分页查询

import java.io.File;import java.io.IOException;import java.io.StringReader;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.TokenStream;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.index.CorruptIndexException;import org.apache.lucene.index.IndexReader;import org.apache.lucene.queryParser.ParseException;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.Filter;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TopDocs;import org.apache.lucene.search.highlight.Highlighter;import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;import org.apache.lucene.search.highlight.QueryScorer;import org.apache.lucene.search.highlight.SimpleHTMLFormatter;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import com.company.project.entity.Article;public class SearchRunner { private static String PATH = "D:\\workplace\\company\\mylucene\\indexstore"; public static void main(String [] arg) throws Exception{ String[] queryFileds = { "title", "content" };  String queryString = "中国"; //SearchRunner.searchList(queryFileds, queryString );   // SearchRunner.combinationSearch();    SearchRunner.pagingSearch("中", null); }  /**  * 在多个字段查找同一个值  */ public static void searchList(String[] queryFileds,String queryString) throws Exception {  // 查询的字符串:输入不存在的字符串是查询不到的,如：中国   IndexReader reader = IndexReader.open(FSDirectory.open(new File(PATH)));  IndexSearcher searcher = new IndexSearcher(reader);      Query query = LuceneUtils.createQuery(queryFileds, queryString);  // 在搜索器中进行查询  // 对查询内容进行过滤  Filter filter = null;  // 一次在索引器查询多少条数据  int queryCount = 100;  TopDocs results = searcher.search(query, filter, queryCount);  
System.out.println("总符合: " + results.totalHits + "条数！");  // 显示记录  for (ScoreDoc sr : results.scoreDocs)  {   // 文档编号   int docID = sr.doc;   // 真正的内容   Document doc = searcher.doc(docID);   System.out.println("inof = " + doc.get("title"));   System.out.println("info2 = " + doc.get("content"));  } }  public static void   combinationSearch() throws CorruptIndexException, IOException, ParseException{ IndexReader reader = IndexReader.open(FSDirectory.open(new File(PATH)));  IndexSearcher searcher = new IndexSearcher(reader);//选择得分词工具        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);   QueryParser parser = new QueryParser(Version.LUCENE_34, "content",                  analyzer);     //注意此处AND一定要大写   Query query  = parser.parse("content:中      国  AND title:中      国");   // 一次在索引器查询多少条数据 int queryCount = 100;// Sort sort = new Sort(new SortField("order",SortField.DOUBLE,false)); //排序 false 升序 true降序   //TopDocs results = searcher.search(query, queryCount,sort); TopDocs results = searcher.search(query, queryCount);   System.out.println("总符合: " + results.totalHits + "条数！");           // 显示记录      for (ScoreDoc sr : results.scoreDocs)      {      //Sort(field,true)       // 文档编号       int docID = sr.doc;       // 真正的内容       Document doc = searcher.doc(docID);       System.out.println("id="+doc.get("id")+"\torder="+doc.get("order")+"\ttitle = " + doc.get("title")+"\tcontent = " + doc.get("content"));     }      }//分页查询 public static Map pagingSearch(String title,String content) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException{ Map pager = new HashMap(); List<Article> blogList=new ArrayList<Article>() ;   TokenStream tokenStream=null;        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);  //获取IndexSearcher 对象  IndexReader reader = IndexReader.open(FSDirectory.open(new File(PATH)));  IndexSearcher indexSearch = new IndexSearcher(reader);    QueryParser queryParser =  new QueryParser(Version.LUCENE_34, 
"content",  analyzer);   //搜索条件的结合                 String str="";              if(title!=null &&title.length()>0){              str="title:"+title;              }                            if(content!=null &&content.length()>0){              if(str.trim().length()>0)                  {                  str +=" AND";                  }              str="content:"+content;              }             //设置搜索条件                 Query query=queryParser.parse(str);             //查询搜索引擎                TopDocs result = indexSearch.search(query, 10);            //上一页的最后一个document索引    第一页为0,其余也该页的起始记录条数             int index=2;             ScoreDoc scoreDoc=null;                 //如果当前页是第一页面scoreDoc=null。                 if(index>0){                    //因为索引是从0开始所以要index-1                   scoreDoc=result.scoreDocs[index-1];                }              //分页处理               int pageSize = 2;              TopDocs hits= indexSearch.searchAfter(scoreDoc, query, pageSize);              //设置分页的总记录数                                                       //循环hits.scoreDocs数据，并使用indexSearch.doc方法把Document还原，再拿出对应的字段的值                 for (int i = 0; i < hits.scoreDocs.length; i++) {               ScoreDoc sdoc = hits.scoreDocs[i];                 Document doc = indexSearch.doc(sdoc.doc);              Article article = new Article();              String stitle = doc.get("title");              String scontent = doc.get("content");              String id = doc.get("id");                          //加亮处理                SimpleHTMLFormatter simplehtml=new SimpleHTMLFormatter("<font color='red'>", "</font>");               Highlighter highlighter = new Highlighter(simplehtml,new QueryScorer(query));              if(title!=null){                tokenStream = analyzer.tokenStream("title",new StringReader(title));              String highLightText = highlighter.getBestFragment(tokenStream, title);               article.setTitle(highLightText==null?title:highLightText);              }else     
         {              article.setTitle(stitle);              }                                          if(content!=null){                tokenStream = analyzer.tokenStream("content",new StringReader(content));              String highLightText = highlighter.getBestFragment(tokenStream, content);               article.setContent(highLightText==null?title:highLightText);              }else              {              article.setContent(scontent);              }              article.setId(id);              System.out.println(article);              blogList.add(article);              }              pager.put("content",hits.totalHits);              pager.put("data",blogList); return pager; }}

热点排行

软件架构设计

lucene 查询示范