Finding the position of search hits from Lucene

2020-12-09 06:05

With Lucene, what would be the recommended approach for locating matches in search results?

More specifically, suppose index documents have a field "fullText" which stores the plain-text content of a document, and the goal is to find where in that text the query terms matched.

2 Answers
  • 2020-12-09 06:48

    TermFreqVector is what I used. Here is a working demo that prints both the term positions and the start/end character offsets of each match:

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.TermFreqVector;
    import org.apache.lucene.index.TermPositionVector;
    import org.apache.lucene.index.TermVectorOffsetInfo;
    import org.apache.lucene.queryParser.ParseException;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopScoreDocCollector;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    public class Search {
        public static void main(String[] args) throws IOException, ParseException {
            Search s = new Search();
            s.doSearch(args[0], args[1]);
        }

        Search() {
        }

        public void doSearch(String db, String querystr) throws IOException, ParseException {
            // 1. Specify the analyzer for tokenizing text.
            //    The same analyzer should be used as was used for indexing.
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

            Directory index = FSDirectory.open(new File(db));

            // 2. Query
            Query q = new QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(querystr);

            // 3. Search
            int hitsPerPage = 10;
            IndexSearcher searcher = new IndexSearcher(index, true);
            IndexReader reader = IndexReader.open(index, true);
            searcher.setDefaultFieldSortScoring(true, false);
            TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. Display term positions and character offsets.
            //    Note: "contents" must have been indexed with term vectors that
            //    include positions and offsets, otherwise getTermFreqVector()
            //    returns null.
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                TermFreqVector tfvector = reader.getTermFreqVector(docId, "contents");
                TermPositionVector tpvector = (TermPositionVector) tfvector;
                // This part works only if there is one term in the query string
                // (and it must be in analyzed form, e.g. lower-cased by
                // StandardAnalyzer); otherwise iterate this section over the
                // query terms.
                int termidx = tfvector.indexOf(querystr);
                int[] termposx = tpvector.getTermPositions(termidx);
                TermVectorOffsetInfo[] tvoffsetinfo = tpvector.getOffsets(termidx);

                for (int j = 0; j < termposx.length; j++) {
                    System.out.println("termpos : " + termposx[j]);
                }
                for (int j = 0; j < tvoffsetinfo.length; j++) {
                    int offsetStart = tvoffsetinfo[j].getStartOffset();
                    int offsetEnd = tvoffsetinfo[j].getEndOffset();
                    System.out.println("offsets : " + offsetStart + " " + offsetEnd);
                }

                // Print some info about where the hit was found...
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("path"));
            }

            // The reader and searcher can only be closed when there is
            // no need to access the documents any more.
            reader.close();
            searcher.close();
        }
    }
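
    Keep in mind that this only works if the "contents" field was indexed with term vectors storing positions and offsets. A minimal indexing-time sketch (assuming the same Lucene 3.x API as the demo, with a made-up document and path) might look like this:

    import java.io.File;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    public class BuildIndex {
        public static void main(String[] args) throws Exception {
            Directory dir = FSDirectory.open(new File(args[0]));
            IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_CURRENT,
                    new StandardAnalyzer(Version.LUCENE_CURRENT));
            IndexWriter writer = new IndexWriter(dir, conf);

            Document doc = new Document();
            // "path" is only stored so the search demo can print it.
            doc.add(new Field("path", "example.txt", Field.Store.YES, Field.Index.NOT_ANALYZED));
            // Term vectors with positions AND offsets are required for
            // getTermPositions() and getOffsets() in the search demo.
            doc.add(new Field("contents", "the quick brown fox jumps over the lazy dog",
                    Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
            writer.addDocument(doc);
            writer.close();
        }
    }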
    
  • 2020-12-09 06:56

    Here is a solution for Lucene 5.2.1. It works only for single-word queries, but it should demonstrate the basic principles.

    The basic idea is:

    1. Get a TokenStream for each document that matches your query.
    2. Create a QueryScorer and initialize it with the retrieved token stream.
    3. Loop over the tokens of the stream (via tokenStream.incrementToken()) and check whether each token matches the search criteria (via queryScorer.getTokenScore()).

    Here is the code:

    import java.io.IOException;
    import java.util.List;
    import java.util.Vector;
    
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.de.GermanAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
    import org.apache.lucene.search.highlight.QueryScorer;
    import org.apache.lucene.search.highlight.TokenSources;
    
    public class OffsetSearcher {
    
        private IndexReader reader;
    
        public OffsetSearcher(IndexWriter indexWriter) throws IOException { 
            reader = DirectoryReader.open(indexWriter, true); 
        }
    
        public OffsetData[] getTermOffsets(Query query) throws IOException, InvalidTokenOffsetsException 
        {
            List<OffsetData> result = new Vector<>();
    
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs topDocs = searcher.search(query, 1000);
    
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;   
    
            Document doc;
            TokenStream tokenStream;
            CharTermAttribute termAtt;
            OffsetAttribute offsetAtt;
            QueryScorer queryScorer;
            OffsetData offsetData;
            String txt, tokenText;
            for (int i = 0; i < scoreDocs.length; i++) 
            {
                int docId = scoreDocs[i].doc;
                doc = reader.document(docId);
    
                // Stored text of the hit, and a token stream rebuilt from its
                // term vectors (or by re-analyzing the text if no vectors are stored).
                txt = doc.get(RunSearch.CONTENT);
                tokenStream = TokenSources.getTokenStream(RunSearch.CONTENT, reader.getTermVectors(docId), txt, new GermanAnalyzer(), -1);
    
                termAtt = (CharTermAttribute)tokenStream.addAttribute(CharTermAttribute.class);
                offsetAtt = (OffsetAttribute)tokenStream.addAttribute(OffsetAttribute.class);
    
                // The QueryScorer decides, token by token, whether a token
                // contributes to the query match; init() may wrap the stream.
                queryScorer = new QueryScorer(query);
                queryScorer.setMaxDocCharsToAnalyze(RunSearch.MAX_DOC_CHARS);
                TokenStream newStream = queryScorer.init(tokenStream);
                if (newStream != null) {
                    tokenStream = newStream;
                }
                queryScorer.startFragment(null);
    
                tokenStream.reset();
    
                // Walk the token stream; a positive token score means the
                // current token is part of a query match.
                int startOffset, endOffset;
                for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset() < RunSearch.MAX_DOC_CHARS); next = tokenStream.incrementToken())
                {
                    startOffset = offsetAtt.startOffset();
                    endOffset = offsetAtt.endOffset();
    
                    if ((endOffset > txt.length()) || (startOffset > txt.length()))
                    {
                        throw new InvalidTokenOffsetsException("Token " + termAtt.toString() + " exceeds length of provided text sized " + txt.length());
                    }
    
                    float res = queryScorer.getTokenScore();
                    if (res > 0.0F && startOffset <= endOffset) {
                        tokenText = txt.substring(startOffset, endOffset);
                        offsetData = new OffsetData(tokenText, startOffset, endOffset, docId);
                        result.add(offsetData);
                    }           
                }   
            }
    
            return result.toArray(new OffsetData[result.size()]);
        }
    
    
        public void close() throws IOException {
            reader.close();
        }
    
    
        public static class OffsetData {
    
            public String phrase;
            public int startOffset;
            public int endOffset;
            public int docId;
    
            public OffsetData(String phrase, int startOffset, int endOffset, int docId) {
                super();
                this.phrase = phrase;
                this.startOffset = startOffset;
                this.endOffset = endOffset;
                this.docId = docId;
            }
    
        }
    
    }
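
    A small driver might look like the sketch below. It is only an illustration and assumes that RunSearch.CONTENT is simply the field name (here "content"), that RunSearch.MAX_DOC_CHARS is large enough for the sample text, and that the field is indexed with a GermanAnalyzer and term vectors so TokenSources can rebuild the token stream from them:

    import org.apache.lucene.analysis.de.GermanAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.FieldType;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class OffsetSearcherDemo {
        public static void main(String[] args) throws Exception {
            Directory dir = new RAMDirectory();
            IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new GermanAnalyzer()));

            // Store the text and its term vectors (positions + offsets) so
            // TokenSources.getTokenStream() does not have to re-analyze the text.
            FieldType ft = new FieldType(TextField.TYPE_STORED);
            ft.setStoreTermVectors(true);
            ft.setStoreTermVectorPositions(true);
            ft.setStoreTermVectorOffsets(true);

            Document doc = new Document();
            doc.add(new Field("content", "Der schnelle braune Fuchs springt über den faulen Hund", ft));
            writer.addDocument(doc);
            writer.commit();

            // OffsetSearcher opens its reader from the writer, so keep the writer open.
            OffsetSearcher searcher = new OffsetSearcher(writer);
            Query query = new QueryParser("content", new GermanAnalyzer()).parse("fuchs");
            for (OffsetSearcher.OffsetData d : searcher.getTermOffsets(query)) {
                System.out.println(d.phrase + " @ [" + d.startOffset + ", " + d.endOffset + ") in doc " + d.docId);
            }
            searcher.close();
            writer.close();
        }
    }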
    