
Extending TokenFilter to Implement Second-Pass Tokenization

2012-09-17 

After the text has been tokenized by paoding, each resulting token is tokenized a second time; the second pass here uses bigram (two-character CJK) segmentation.


import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import java.util.Vector;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class MyCJKFilter extends TokenFilter {

    // A TokenFilter shares its AttributeSource with the wrapped stream, so these
    // attributes also reflect the current token of the input; there is no need to
    // re-fetch them from input inside incrementToken().
    private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    private final PositionIncrementAttribute posAtt =
            (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);

    // Tokens waiting to be emitted: the original first-pass tokens plus the bigrams
    // produced by the second pass.
    private final Vector<Token> bufferToken = new Vector<Token>();

    // Number of first-pass tokens consumed; non-zero means the input stream is exhausted.
    private int count = 0;

    // Analyzer for the second pass; CJKAnalyzer segments CJK text into overlapping bigrams.
    private final CJKAnalyzer analyzer = new CJKAnalyzer();

    // Records first-pass terms so a bigram identical to an original token is not emitted twice.
    private final Map<String, Token> map = new HashMap<String, Token>();

    protected MyCJKFilter(TokenStream input) {
        super(input);
    }

    @Override
    public final boolean incrementToken() throws IOException {
        // Drain the buffer first: copy the next buffered token into this stream's attributes.
        if (this.bufferToken.size() > 0) {
            Token t = this.bufferToken.remove(0);
            this.termAtt.setTermBuffer(t.term());
            this.offsetAtt.setOffset(t.startOffset(), t.endOffset());
            this.posAtt.setPositionIncrement(t.getPositionIncrement());
            return true;
        }
        // Buffer empty and input already consumed: the stream is finished.
        if (this.bufferToken.size() == 0 && this.count > 0) {
            count = 0;
            return false;
        }

        map.clear();

        // First pass: pull every token from the wrapped stream (e.g. paoding output) and buffer it.
        while (input.incrementToken()) {
            String term = this.termAtt.term();
            Token tokenOri = new Token(term, this.offsetAtt.startOffset(), this.offsetAtt.endOffset());
            this.bufferToken.add(tokenOri);
            map.put(term, tokenOri);

            // Second pass: re-tokenize the term with the CJK bigram analyzer and buffer the pieces.
            // Note that these offsets are relative to the term string, not the original text.
            TokenStream ts = this.analyzer.tokenStream("", new StringReader(term));
            while (ts.incrementToken()) {
                TermAttribute ta = (TermAttribute) ts.getAttribute(TermAttribute.class);
                // Skip bigrams that duplicate a first-pass term.
                if (map.containsKey(ta.term())) {
                    continue;
                }
                OffsetAttribute offa = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
                this.bufferToken.add(new Token(ta.term(), offa.startOffset(), offa.endOffset()));
            }
            ts.close();
            count++;
        }

        // Emit the first buffered token, or report end of stream if nothing was produced.
        if (bufferToken.size() > 0) {
            return this.incrementToken();
        } else {
            return false;
        }

    }

}
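
To try the filter, wrap it around the token stream produced by the first-pass analyzer and iterate the result. The sketch below is a minimal example of wiring it in, not taken from the original post: it assumes paoding's PaodingAnalyzer class (net.paoding.analysis.analyzer.PaodingAnalyzer) is on the classpath and properly configured, and that the demo class lives in the same package as MyCJKFilter, since its constructor is protected.

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class MyCJKFilterDemo {

    public static void main(String[] args) throws Exception {
        // First pass: paoding segmentation (class and package name assumed from the paoding-analysis project).
        PaodingAnalyzer paoding = new PaodingAnalyzer();
        TokenStream first = paoding.tokenStream("content", new StringReader("中华人民共和国成立了"));

        // Second pass: wrap the stream so every paoding token is also split into CJK bigrams.
        TokenStream ts = new MyCJKFilter(first);
        TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
        OffsetAttribute off = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);

        while (ts.incrementToken()) {
            System.out.println(term.term() + " [" + off.startOffset() + "," + off.endOffset() + "]");
        }
        ts.close();
    }
}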
