Solr: exact phrase query with an EdgeNGramFilterFactory

前端 未结 4 1079
谎友^
谎友^ 2021-02-04 17:06

In Solr (3.3), is it possible to make a field letter-by-letter searchable through an EdgeNGramFilterFactory while also remaining sensitive to phrase queries?

For example,

4条回答
  •  臣服心动
    2021-02-04 17:59

    I've made a fix to EdgeNGramFilter so positions within a token are not incremented anymore:

        public class CustomEdgeNGramTokenFilterFactory extends TokenFilterFactory {

            // Largest n-gram to emit; falls back to EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE.
            private int maxGramSize = 0;

            // Smallest n-gram to emit; falls back to EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE.
            private int minGramSize = 0;

            /**
             * Reads the {@code minGramSize}/{@code maxGramSize} factory arguments from the
             * schema configuration, using the EdgeNGramTokenFilter defaults when absent.
             *
             * @param args factory parameters from the field type definition (string-valued)
             * @throws NumberFormatException if a size argument is present but not an integer
             */
            @Override
            public void init(Map<String, String> args) {  // was raw Map: args.get(...) returned Object, not String
                super.init(args);

                String maxArg = args.get("maxGramSize");
                maxGramSize = (maxArg != null
                        ? Integer.parseInt(maxArg)
                        : EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);

                String minArg = args.get("minGramSize");
                minGramSize = (minArg != null
                        ? Integer.parseInt(minArg)
                        : EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
            }

            /** Wraps {@code input} in a {@code CustomEdgeNGramTokenFilter} using the configured gram sizes. */
            @Override
            public CustomEdgeNGramTokenFilter create(TokenStream input) {
                return new CustomEdgeNGramTokenFilter(input, minGramSize, maxGramSize);
            }
        }
    
    /**
     * Edge n-gram filter that keeps every gram of a token at the SAME position
     * (position increment 0 after the first gram), so that phrase queries still
     * match across prefix-expanded tokens.
     *
     * <p>Fixes relative to the stock 3.x EdgeNGramTokenFilter behavior:
     * offsets are reported relative to the source token's start offset (not 0),
     * and position increments of tokens skipped for being shorter than
     * {@code minGram} are accumulated instead of silently dropped.
     */
    public class CustomEdgeNGramTokenFilter extends TokenFilter {
        private final int minGram;
        private final int maxGram;
        private char[] curTermBuffer;   // chars of the token currently being expanded, null when exhausted
        private int curTermLength;
        private int curGramSize;        // size of the next gram to emit
        private int curTokStart;        // start offset of the source token, for correct gram offsets
        private int pendingIncrement;   // increment carried over skipped (too-short) tokens

        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
        private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);

        /**
         * Creates a filter that generates edge n-grams in the sizes of the given range.
         *
         * @param input   {@link org.apache.lucene.analysis.TokenStream} holding the input to be tokenized
         * @param minGram the smallest n-gram to generate
         * @param maxGram the largest n-gram to generate
         * @throws IllegalArgumentException if {@code minGram < 1} or {@code minGram > maxGram}
         */
        public CustomEdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
            super(input);

            if (minGram < 1) {
                throw new IllegalArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram) {
                throw new IllegalArgumentException("minGram must not be greater than maxGram");
            }

            this.minGram = minGram;
            this.maxGram = maxGram;
        }

        @Override
        public final boolean incrementToken() throws IOException {
            while (true) {
                if (curTermBuffer == null) {
                    if (!input.incrementToken()) {
                        return false;
                    }
                    // Accumulate (don't overwrite) so increments of tokens that were
                    // too short to produce any gram are not lost.
                    pendingIncrement += positionIncrementAttribute.getPositionIncrement();
                    curTermBuffer = termAtt.buffer().clone();
                    curTermLength = termAtt.length();
                    curTokStart = offsetAtt.startOffset();
                    curGramSize = minGram;
                }
                // Emit while the gram fits both the size range and the token length.
                if (curGramSize <= maxGram && curGramSize <= curTermLength) {
                    // Offsets are relative to the source token, not to 0.
                    offsetAtt.setOffset(curTokStart, curTokStart + curGramSize);
                    positionIncrementAttribute.setPositionIncrement(pendingIncrement);
                    pendingIncrement = 0;  // subsequent grams stack at the same position
                    termAtt.copyBuffer(curTermBuffer, 0, curGramSize);
                    curGramSize++;
                    return true;
                }
                curTermBuffer = null;  // token exhausted; pull the next one
            }
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            curTermBuffer = null;
            pendingIncrement = 0;  // do not leak increment state across stream reuse
        }
    }
    

提交回复
热议问题