重赏:关于lucene 搜索的问题 满意就结贴
这是创建索引:
public void creatIndex()throws Exception { String dataPath = CmsContext.projectPath+PropertiesUtil.getProperty("lucene_source_dir"); String indexPath = CmsContext.projectPath +PropertiesUtil.getProperty("lucene_index_dir"); int i = 0 ; File indexDir = new File(indexPath);//索引存储目录 File dir = new File(dataPath);// 待索引的数据文件目录 Analyzer analyzer = null;// 采用的分词器 FSDirectory directory = null; IndexWriter iwriter = null; FileInputStream fis = null; BufferedReader reader = null; try{ if(!indexDir.exists())//如果不存在索引目录,则建立此目录 indexDir.mkdirs(); analyzer = new IKAnalyzer(); directory = FSDirectory.open(indexDir);//打开目录 iwriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);//索引生成器 List<File> files = new ArrayList<File>(); FileUtil.getAllFiles(files, dir.getAbsolutePath());//获得目录下所有的文件,不包括文件夹 for (File file:files) { if(file.isFile()){ Document doc = new Document(); fis = new FileInputStream(file); String content = ""; reader = new BufferedReader(new InputStreamReader(fis,PropertiesUtil.getProperty("static_charset")));//需要加上编码方式,否则中文都为乱码 StringBuffer buffer = new StringBuffer(""); content = reader.readLine(); while (content != null) { buffer.append(content); content = reader.readLine(); } String html = buffer.toString(); String title = Util.fetchContentByTag(html, "title"); String filePath = file.getPath().replace(CmsContext.projectPath,CmsContext.webCtx).replace("\\", "/"); doc.add(new Field("title", (title==null?file.getName():title), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("path", filePath, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("content", html2Text(html), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("time", Util.date2str(new Date(file.lastModified()), "yyyy-MM-dd"), Field.Store.YES, Field.Index.ANALYZED)); iwriter.addDocument(doc); i++; } } }catch(Exception e){throw e;} finally { if(reader!=null)reader.close(); if(fis!=null)fis.close(); if(iwriter!=null){ iwriter.optimize(); iwriter.close(); } if(directory!=null)directory.close(); if(analyzer!=null)analyzer.close(); } }
public JSONObject searchIndex(String key, String luceneIndexDir) throws Exception { JSONObject obj = new JSONObject(); JSONArray aryJson = new JSONArray(); Directory dir = null; IndexSearcher search = null; Analyzer analyzer = null; try{ File indexDir = new File(luceneIndexDir); dir = FSDirectory.open(indexDir); search = new IndexSearcher(dir,true);//read-only Term term = new Term("content", key); Query query = new TermQuery(term); TopDocs topDocs = search.search(query, 10); ScoreDoc[] hits = topDocs.scoreDocs; // 正常产生的查询 // for (int i = 0; i < hits.length; i++) {// Document doc =// search.doc(hits[i].doc); // System.out.print(doc.get("title") + ":");// System.out.println(doc.get("content")); // } // 高亮设置 analyzer = new IKAnalyzer();// 设定分词器 SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<span>", "</span>"); // 设定高亮显示的格式,也就是对高亮显示的词组加上前缀后缀 Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(150)); /* * 设置每次返回的字符数.想必大家在使用搜索引擎的时候也没有一并把全部数据展示出来吧, 当然这里也是设定只展示部分数据 */ for (int i = 0; i < hits.length; i++) { Document doc = search.doc(hits[i].doc); TokenStream tokenStream = analyzer.tokenStream("", new StringReader(doc.get("content"))); String str = highlighter.getBestFragment(tokenStream, doc.get("content")); if(str==null || str.length()==0) str= doc.get("title"); JSONObject element = null; element = new JSONObject(); element.put("title", doc.get("title")); element.put("path", doc.get("path")); element.put("time", doc.get("time")); element.put("content", str); //System.out.println(doc.get("title")); aryJson.add(element); } obj.put("luceneIndexForm", aryJson); }catch(Exception e){e.printStackTrace();} finally { if(analyzer!=null)analyzer.close(); if(search!=null)search.close(); if(dir!=null)dir.close(); } return obj; }
public static String html2Text(String inputString) { String htmlStr = inputString; //含html标签的字符串 String textStr =""; java.util.regex.Pattern p_script; java.util.regex.Matcher m_script; java.util.regex.Pattern p_style; java.util.regex.Matcher m_style; java.util.regex.Pattern p_html; java.util.regex.Matcher m_html; try { String regEx_script = "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>"; // 定义script的正则表达式{或<script[^>]*?>[\\s\\S]*?<\\/script> String regEx_style = "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>"; // 定义style的正则表达式{或<style[^>]*?>[\\s\\S]*?<\\/style> String regEx_html = "<[^>]+>"; //定义HTML标签的正则表达式 p_script = Pattern.compile(regEx_script,Pattern.CASE_INSENSITIVE); m_script = p_script.matcher(htmlStr); htmlStr = m_script.replaceAll(""); //过滤script标签 p_style = Pattern.compile(regEx_style,Pattern.CASE_INSENSITIVE); m_style = p_style.matcher(htmlStr); htmlStr = m_style.replaceAll(""); //过滤style标签 p_html = Pattern.compile(regEx_html,Pattern.CASE_INSENSITIVE); m_html = p_html.matcher(htmlStr); htmlStr = m_html.replaceAll(""); //过滤html标签 textStr = htmlStr; }catch(Exception e) { System.err.println("Html2Text: " + e.getMessage()); } return textStr;//返回文本字符串 }