Lucene使用IKAnalyzer分词实例 及 IKAnalyzer扩展词库
IK分词器还支持通过配置IKAnalyzer.cfg.xml文件来扩充您的专有词典。谷歌拼音词库下载:http://ishare.iask.sina.com.cn/f/14446921.html?from=like
在web项目的src目录下创建IKAnalyzer.cfg.xml文件,内容如下
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer 扩展配置</comment>

    <!-- User-defined extension dictionaries; this example lists two paths separated by ';' -->
    <entry key="ext_dict">/dicdata/use.dic.dic;/dicdata/googlepy.dic</entry>

    <!-- User-defined extension stop-word dictionaries -->
    <entry key="ext_stopwords">/dicdata/ext_stopword.dic</entry>
</properties>
org.wltea.analyzer.cfg下Configuration接口中的定义:getExtDictionarys() 获取扩展字典配置路径;getExtStopWordDictionarys() 获取扩展停止词典配置路径;getMainDictionary() 获取主词典路径;getQuantifierDicionary() 获取量词词典路径。org.wltea.analyzer.cfg.DefaultConfig类是对Configuration接口的实现。
org.wltea.analyzer.dic下的Dictionary类中相关的方法
package com.icrate.service.study.demo;/** * * * @version : 1.0 * * @author : 苏若年 <a href="mailto:DennisIT@163.com">发送邮件</a> * * @since : 1.0 创建时间: 2013-4-7 下午01:52:49 * * @function: TODO * */public class Medicine { private Integer id; private String name; private String function; public Medicine() { } public Medicine(Integer id, String name, String function) { super(); this.id = id; this.name = name; this.function = function; } //getter and setter() public String toString(){ return this.id + "," +this.name + "," + this.function; }}
package com.icrate.service.study.demo;

import java.util.ArrayList;
import java.util.List;

/**
 * Singleton source of the hard-coded sample medicines used to build the demo index.
 *
 * @version 1.0
 * @author 苏若年 <a href="mailto:DennisIT@163.com">send mail</a>
 * @since 1.0 (created 2013-04-07)
 */
public class DataFactory {

    /** The single shared instance, created when the class is initialised. */
    private static final DataFactory INSTANCE = new DataFactory();

    /** Sample rows as {name, function}; ids are assigned 1..n in array order. */
    private static final String[][] SAMPLES = {
        {"银花 感冒颗粒", "功能主治:银花感冒颗粒 ,头痛,清热,解表,利咽。"},
        {"感冒 止咳糖浆", "功能主治:感冒止咳糖浆,解表清热,止咳化痰。"},
        {"感冒灵颗粒", "功能主治:解热镇痛。头痛 ,清热。"},
        {"感冒灵胶囊", "功能主治:银花感冒颗粒 ,头痛,清热,解表,利咽。"},
        {"仁和 感冒颗粒", "功能主治:疏风清热,宣肺止咳,解表清热,止咳化痰。"},
    };

    /** Not instantiable from outside; use {@link #getInstance()}. */
    private DataFactory() {
    }

    /**
     * Returns the shared factory instance.
     *
     * @return the singleton
     */
    public static DataFactory getInstance() {
        return INSTANCE;
    }

    /**
     * Builds a fresh, mutable list holding the five sample medicines.
     *
     * @return a new list on every call, with ids 1 through 5
     */
    public List<Medicine> getData() {
        List<Medicine> medicines = new ArrayList<Medicine>();
        for (int row = 0; row < SAMPLES.length; row++) {
            String[] sample = SAMPLES[row];
            medicines.add(new Medicine(row + 1, sample[0], sample[1]));
        }
        return medicines;
    }
}
package com.icrate.service.study.demo;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Demonstrates building, updating, deleting from and searching a Lucene 3.5 index that is
 * tokenised with the IK analyzer.
 *
 * @version 1.1
 * @author 苏若年 <a href="mailto:DennisIT@163.com">send mail</a>
 * @since 1.0 (created Apr 3, 2013)
 */
public class LuceneIKUtil {

    /** Lucene compatibility version used for writers and query parsers. */
    private static final Version LUCENE_VERSION = Version.LUCENE_35;

    /** Maximum number of hits fetched by {@link #search(String[], String)}. */
    private static final int MAX_HITS = 10;

    private final Directory directory;
    private final Analyzer analyzer;

    /**
     * Creates a utility working on the index stored in the given directory.
     *
     * @param indexFilePath file-system path of the index directory
     * @throws IllegalStateException if the directory cannot be opened; failing here avoids
     *         a later NullPointerException on the first index operation
     */
    public LuceneIKUtil(String indexFilePath) {
        try {
            directory = FSDirectory.open(new File(indexFilePath));
            analyzer = new IKAnalyzer();
        } catch (IOException e) {
            throw new IllegalStateException("Cannot open index directory: " + indexFilePath, e);
        }
    }

    /** Creates a utility working on the default index path {@code /luence/index}. */
    public LuceneIKUtil() {
        this("/luence/index");
    }

    /**
     * Rebuilds the index from scratch: removes every existing document, then adds one
     * document per medicine returned by {@link DataFactory#getData()}.
     *
     * @throws Exception if the index cannot be opened or written
     */
    public void createIndex() throws Exception {
        IndexWriter indexWriter = openWriter();
        try {
            indexWriter.deleteAll();
            for (Medicine medicine : DataFactory.getInstance().getData()) {
                indexWriter.addDocument(
                        addDocument(medicine.getId(), medicine.getName(), medicine.getFunction()));
            }
        } finally {
            // Always release the writer (and its write lock), even when indexing fails.
            indexWriter.close();
        }
    }

    /**
     * Builds the Lucene document for one medicine. Despite its name this method does not
     * write to the index; the caller passes the returned document to an IndexWriter.
     *
     * @param id       medicine id; stored and indexed as a single untokenised term
     * @param name     medicine name; stored and analysed
     * @param function medicine function text; stored and analysed
     * @return the populated document
     */
    public Document addDocument(Integer id, String name, String function) {
        Document doc = new Document();
        // Field.Index.NO           : not indexed
        // Field.Index.ANALYZED     : tokenised and indexed
        // Field.Index.NOT_ANALYZED : indexed as one term, not tokenised
        doc.add(new Field("id", String.valueOf(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("name", name, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("function", function, Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /**
     * Replaces the document whose {@code id} term matches the given id. Failures are
     * reported on standard error and otherwise ignored.
     *
     * @param id      id of the document to replace
     * @param title   new value for the {@code name} field
     * @param content new value for the {@code function} field
     */
    public void update(Integer id, String title, String content) {
        try {
            IndexWriter indexWriter = openWriter();
            try {
                Term term = new Term("id", String.valueOf(id));
                indexWriter.updateDocument(term, addDocument(id, title, content));
            } finally {
                indexWriter.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Deletes the document whose {@code id} term matches the given id. Failures are
     * reported on standard error and otherwise ignored.
     *
     * @param id id of the document to delete
     */
    public void delete(Integer id) {
        try {
            IndexWriter indexWriter = openWriter();
            try {
                indexWriter.deleteDocuments(new Term("id", String.valueOf(id)));
            } finally {
                indexWriter.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches the given fields for the keyword and returns up to ten hits with the matched
     * terms wrapped in {@code <font color='red'>} tags.
     *
     * <p>Failures are reported on standard error; the hits collected so far are returned.
     *
     * @param fields  names of the fields to search
     * @param keyword query text, parsed with Lucene query syntax
     * @return the matching medicines with highlighted name and function; never {@code null}
     */
    public List<Medicine> search(String[] fields, String keyword) {
        List<Medicine> result = new ArrayList<Medicine>();
        IndexReader indexReader = null;
        IndexSearcher indexSearcher = null;
        try {
            // Open the index read-only.
            indexReader = IndexReader.open(directory, true);
            indexSearcher = new IndexSearcher(indexReader);

            MultiFieldQueryParser queryParser =
                    new MultiFieldQueryParser(LUCENE_VERSION, fields, analyzer);
            Query query = queryParser.parse(keyword);

            TopDocs topDocs = indexSearcher.search(query, MAX_HITS);
            int totalCount = topDocs.totalHits;
            System.out.println("共检索出 "+totalCount+" 条记录");

            Highlighter highlighter = newHighlighter(query);
            for (ScoreDoc scDoc : topDocs.scoreDocs) {
                Document document = indexSearcher.doc(scDoc.doc);

                Medicine medicine = new Medicine();
                medicine.setId(Integer.valueOf(document.get("id")));
                medicine.setName(highlightOrOriginal(highlighter, "name", document.get("name")));
                medicine.setFunction(
                        highlightOrOriginal(highlighter, "function", document.get("function")));
                result.add(medicine);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Either may be null if opening the index failed.
            if (indexSearcher != null) {
                try {
                    indexSearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // A searcher built from a reader does not close that reader, so close it here.
            if (indexReader != null) {
                try {
                    indexReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return result;
    }

    /** Opens a new writer on the index directory using this instance's analyzer. */
    private IndexWriter openWriter() throws IOException {
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LUCENE_VERSION, analyzer);
        return new IndexWriter(directory, indexWriterConfig);
    }

    /** Builds a highlighter that marks query terms in red and uses 100-character fragments. */
    private Highlighter newHighlighter(Query query) {
        // SimpleHTMLFormatter() wraps matches in <B>..</B>; the two-argument form used here
        // wraps them in the given opening and closing tags instead.
        Formatter formatter = new SimpleHTMLFormatter("<font color='red'>","</font>");
        // QueryScorer scores fragments by the terms extracted from the query.
        Scorer fragmentScorer = new QueryScorer(query);
        Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
        // SimpleFragmenter splits the text into fragments of the given size.
        Fragmenter fragmenter = new SimpleFragmenter(100);
        highlighter.setTextFragmenter(fragmenter);
        return highlighter;
    }

    /** Returns the best highlighted fragment of the text, or the text itself if there is none. */
    private String highlightOrOriginal(Highlighter highlighter, String fieldName, String text)
            throws Exception {
        String fragment = highlighter.getBestFragment(analyzer, fieldName, text);
        return fragment != null ? fragment : text;
    }

    /**
     * Demo entry point: rebuilds the index under {@code F:/index}, replaces document 2 and
     * prints the hits for the keyword "感冒".
     *
     * @param args unused
     */
    public static void main(String[] args) {
        LuceneIKUtil luceneProcess = new LuceneIKUtil("F:/index");
        try {
            luceneProcess.createIndex();
        } catch (Exception e) {
            e.printStackTrace();
        }

        // Update test
        luceneProcess.update(2, "测试内容", "修改测试。。。");

        // Search test
        String[] fields = {"name", "function"};
        List<Medicine> list = luceneProcess.search(fields, "感冒");
        for (Medicine medicine : list) {
            System.out.println("("+medicine.getId()+")"+medicine.getName() + "\t" + medicine.getFunction());
        }

        // Delete test
        //luceneProcess.delete(1);
    }
}
加载扩展词典:/dicdata/use.dic.dic加载扩展词典:/dicdata/googlepy.dic加载扩展停止词典:/dicdata/ext_stopword.dic共检索出 4 条记录(1)银花 <font color='red'>感冒</font>颗粒 功能主治:银花<font color='red'>感冒</font>颗粒 ,头痛,清热,解表,利咽。(4)<font color='red'>感冒</font>灵胶囊 功能主治:银花<font color='red'>感冒</font>颗粒 ,头痛,清热,解表,利咽。(3)<font color='red'>感冒</font>灵颗粒 功能主治:解热镇痛。头痛 ,清热。(5)仁和 <font color='red'>感冒</font>颗粒 功能主治:疏风清热,宣肺止咳,解表清热,止咳化痰。
/** * 判断是否已经存在索引文件 * @param indexPath * @return */ private boolean isExistIndexFile(String indexPath) throws Exception{ File file = new File(indexPath); if (!file.exists()) { file.mkdirs(); } String indexSufix="/segments.gen"; //根据索引文件segments.gen是否存在判断是否是第一次创建索引 File indexFile=new File(indexPath+indexSufix); return indexFile.exists(); }
9. IKTokenizer的incrementToken方法调用了IKSegmentation的next方法,next的作用是获得下一个分词结果。next在第一次被调用的时候,需要加载文本输入流,并将其读入buffer,此时便遍历子分词器,对buffer中的文本内容进行分词处理,然后把分词结果添加到context的lexemeSet中。
转载请注明出处:[http://www.cnblogs.com/dennisit/archive/2013/04/07/3005847.html]