首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > JAVA > Java Web开发 >

lucene对xml检索有关问题

2012-02-17 
lucene对xml检索问题:我对文件夹里的xml文件建立了索引,但是为什么就检索不到呢?请各位大侠指点指点。建立索引的类……

lucene对xml检索问题
我对文件夹里的xml文件建立了索引,但是为什么就检索不到呢,请各位大侠指点指点:

建立索引的类:LuceneIndexLocalDisk

package Test;

import java.io.IOException;
import java.io.File;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.*;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;


/*******************************************************************
 * 本代码完成本地指定目录的遍历和文件查找。对指定后缀的文件进行分析,利用Lucene建立
 * 索引,为后续检索使用做好准备。
 *******************************************************************/
public class LuceneIndexLocalDisk {

private static String Dest_Index_Path = "D:\\jy\\";
private static String Text_File_Path = "D:\\jy\\";


/*========================================================
* 主函数,指定索引目录和待分析的目录,生成Lucene索引
*========================================================*/
public static void main(String[] args) {

File indexpath = new File(Dest_Index_Path);
File localPath = new File(Text_File_Path);

try {
int nums = indexBuilder(indexpath,localPath);
System.out.println("Index Finished " + nums + " docs");
} catch (IOException e) {
e.printStackTrace();
}
}
/*========================================================
* 索引创建函数,生成IndexWriter创建索引,调用子目录索引函数,并优化
* 存储本地磁盘索引
*========================================================*/
public static int indexBuilder( File indexPath , File localPath ) 
throws IOException{
if(!localPath.exists() || !localPath.isDirectory() || !localPath.canRead()){
throw new IOException(localPath + "不存在或者不允许访问" );
}
System.out.println("目标路径完好");
IndexWriter FSWriter = new IndexWriter(indexPath,new StandardAnalyzer(),true);
FSWriter.setUseCompoundFile(true);

SubindexBuilder(FSWriter,localPath);
int num = FSWriter.docCount();
FSWriter.optimize();
FSWriter.close();
return num;
}

/*========================================================
* 判断当前文件名是否符合文件后缀要求
*========================================================*/
private static boolean IsValidType(String name){
if(name.endsWith(".xml"))
{
return true;
} else {
return false;
}
}
/*========================================================
* 处理各种不同类型文档,调用相应的参数,合并到本地磁盘索引当中
*========================================================*/
private static void fileindexBuilder(IndexWriter fswriter,File subfile)  
throws IOException{

if( subfile.isHidden() || !subfile.exists() || !subfile.canRead()){
return ;
}
  String strname = subfile.getName();
  int dotpos = strname.indexOf(".");
  HandleXml hnxml=new HandleXml();
  if( (dotpos >0) && (dotpos < strname.length()))
  {  
  hnxml.handle(fswriter ,subfile);
  }
   
}



/*========================================================
* 递归函数,递归分析目录,如果找到子目录,继续递归;如果找到文件分析索引
*========================================================*/
private static void SubindexBuilder(IndexWriter fswriter,File subPath)  


throws IOException{

File[] filelist = subPath.listFiles();
System.out.println(subPath.getAbsolutePath() + " :子目录个数 " + filelist.length);

for(int i = 0; i< filelist.length;i++){
File file = filelist[i];
if(file.isDirectory()){
SubindexBuilder(fswriter,file);
} else if(IsValidType(file.getName())){
fileindexBuilder(fswriter,file);
}
}
}
}


解析xml的类:HandleXml

package Test;

import java.io.File;
import java.io.IOException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

public class HandleXml {

public void handle(IndexWriter fswriter,File subPath)
{
try {// 处理分析XML文档,并索引文档内容
Directory ramdirectory = new RAMDirectory();
Analyzer TextAnalyzer = new StandardAnalyzer(); // 生成分析器
// 根据指定文件创建输入流

Document document = new Document() ; // 由Office文件生成文档对象
// 获取DOM对象树的生成器
DocumentBuilderFactory builderfactory = DocumentBuilderFactory.newInstance();

try {
// 获取 DocumentBuilder实例
DocumentBuilder builder = builderfactory.newDocumentBuilder();
////从 XML 文档获取 DOM 文档实例
org.w3c.dom.Document documentW3c = builder.parse(subPath);
//获取某节点的集合
NodeList nodelist = documentW3c.getElementsByTagName("item");
// 获取节点列表的总长度
int listnum = nodelist.getLength();
System.out.println("--------节点数量:"+listnum + "--------");
for (int i = 0; i < listnum; i++) {
// 获取节点
Element eltItem = (Element) nodelist.item(i);
// 获取节点的各项属性
Node eltTitle = eltItem.getElementsByTagName("title").item(0);
Node eltLink = eltItem.getElementsByTagName("addr").item(0);
Node eltDescription = eltItem.getElementsByTagName("content").item(0);
 
Stringtitle = eltTitle.getFirstChild().getNodeValue();  
Stringaddr = eltLink.getFirstChild().getNodeValue();  
Stringcontent = eltDescription.getFirstChild().getNodeValue();
 
Field field_title=new Field("title",title,Field.Store.YES,Field.Index.UN_TOKENIZED);
document.add(field_title);
 
Field field_addr=new Field("addr",addr,Field.Store.YES,Field.Index.UN_TOKENIZED);
document.add(field_addr);
 
Field field_content=new Field("content",content,Field.Store.YES,Field.Index.UN_TOKENIZED);
document.add(field_content);
// 输出结果
fswriter.addDocument(document);
System.out.print("标题:");
System.out.println(title);
System.out.print("链接:");
System.out.println(addr);
System.out.print("描述:");


System.out.println(content);
System.out.println("----------------------------\n");
}
} catch (ParserConfigurationException e) {
e.printStackTrace();
} catch (SAXException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
System.out.println("----------创建索引:XML 文件内容 ----------");
//System.out.println(document);
fswriter.addDocument(document); // 添加文档到索引

}catch (IOException e) {
e.printStackTrace();
}
System.out.println("----------创建索引:Office 文件成功. ----------");
}
}


检索代码:
public class SearchTool {
private static String Dest_Index_Path = "D:\\jy";

public static void main(String[] args) {

@SuppressWarnings("unused")
File indexpath = new File(Dest_Index_Path);
SearchTool tool = new SearchTool();

try {
tool.BasicSearch("故宫");
} catch (Exception e) {
e.printStackTrace();
}
}

  public void BasicSearch(String keyWord) throws ParseException {

Analyzer analyzer = new SimpleAnalyzer();
try {
IndexSearcher searcher = new IndexSearcher(Dest_Index_Path);
QueryParser parser = new QueryParser("content", analyzer);
Query query = parser.parse(keyWord);
System.out.println(query.toString());
Hits hits = null;
hits = searcher.search(query);
System.out.println(hits.length());
for (int i = 0; i < hits.length(); i++) {
System.out.println(hits.doc(i));
System.out.println(hits.doc(i).getField("title"));
}
hits = null;
System.gc();
} catch (CorruptIndexException e1) {
e1.printStackTrace();
} catch (IOException e1) {
e1.printStackTrace();
}

}




D://JY中的 xml文件

<?xml version="1.0" encoding="GB2312"?>
<root>
<channel>
<title>wood</title>
<addr/>
<content>The channel:wood。The class of code:null</content>
</channel>
<item>
<title>植物大战僵尸</title>
<addr>http://www.node.com</addr>
<content>一款非常好玩的塔防游戏</content>
</item>
<item>
<title>故宫的传说</title>
<addr>http://www.chinanet.com</addr>
<content>想知道故 宫的由来吗?紫禁城里到底如何金碧辉煌</content>
</item>
<item>
<title>英语学习指导</title>
<addr>http://www.english.com</addr>
<content>指导性的文章</content>
</item>
</root>

[解决办法]
没法帮你调试,我用的是 Lucene 3。我之前说的你都改了吗?我又看了一下你的代码:
new Field("content",content,Field.Store.YES,Field.Index.UN_TOKENIZED) —— 需要被检索的 Field,怎么最后一个参数设置成不切词了?这样只有查询与 content 完全一致时才会有结果。
另外,writer 用完就关掉吧。

热点排行