扩展 TokenFilter,实现二次分词。
经过 paoding 分词后,再对每个 token 进行二次分词;此处采用二元切分法(bigram)。
public class MyCJKFilter extends TokenFilter {
??? private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);;
??? private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
??? private PositionIncrementAttribute posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
??? protected MyCJKFilter(TokenStream input) {
??? ??? super(input);
??? }
??? private Vector<Token> bufferToken = new Vector<Token>();
??? private int count = 0;
??? private CJKAnalyzer analyzer = new CJKAnalyzer();
??? Map<String, Token> map = new HashMap<String, Token>();
??? @Override
??? public final boolean incrementToken() throws IOException {
??? ??? if (this.bufferToken.size() > 0) {
??? ??? ??? Token t = this.bufferToken.remove(0);
??? ??? ??? this.termAtt.setTermBuffer(t.term());
??? ??? ??? this.offsetAtt.setOffset(t.startOffset(), t.endOffset());
??? ??? ??? this.posAtt.setPositionIncrement(t.getPositionIncrement());
??? ??? ??? return true;
??? ??? }
??? ??? if (this.bufferToken.size() == 0 && this.count > 0) {
??? ??? ??? // System.out.println("count is > 0!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
??? ??? ??? count = 0;
??? ??? ??? return false;
??? ??? }
??? ??? map.clear();
??? ??? while (input.incrementToken()) {
??? ??? ??? this.termAtt = (TermAttribute) input.getAttribute(TermAttribute.class);
??? ??? ??? this.offsetAtt = (OffsetAttribute) input.getAttribute(OffsetAttribute.class);
??? ??? ??? this.posAtt = (PositionIncrementAttribute) input.getAttribute(PositionIncrementAttribute.class);
??? ??? ??? String term = this.termAtt.term();
??? ??? ??? Token tokenOri = new Token(term, this.offsetAtt.startOffset(), this.offsetAtt.endOffset());
??? ??? ??? this.bufferToken.add(tokenOri);
??? ??? ??? map.put(term, tokenOri);
??? ??? ??? // System.out.println(term + "-->" + this.offsetAtt.startOffset() + "," + this.offsetAtt.endOffset());
??? ??? ??? TokenStream ts = this.analyzer.tokenStream("", new StringReader(term));
??? ??? ??? while (ts.incrementToken()) {
??? ??? ??? ??? TermAttribute ta = (TermAttribute) ts.getAttribute(TermAttribute.class);
??? ??? ??? ??? if (map.containsKey(ta.term())) {
??? ??? ??? ??? ??? continue;
??? ??? ??? ??? }
??? ??? ??? ??? OffsetAttribute offa = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
??? ??? ??? ??? // System.out.println(ta.term() + "-->" + offa.startOffset() + "," + offa.endOffset());
??? ??? ??? ??? Token token = new Token(ta.term(), offa.startOffset(), offa.endOffset());
??? ??? ??? ??? if (token == null) {
??? ??? ??? ??? ??? // System.out.println("ts.next() is null");
??? ??? ??? ??? } else {
??? ??? ??? ??? ??? this.bufferToken.add(token);
??? ??? ??? ??? ??? // System.out.println("add to vector, term=" + token.term());
??? ??? ??? ??? }
??? ??? ??? }
??? ??? ??? count++;
??? ??? }
??? ??? if (bufferToken.size() > 0) {
??? ??? ??? return this.incrementToken();
??? ??? } else {
??? ??? ??? return false;
??? ??? }
??? }
}