Term-document matrix in Lucene

懵懂的女人 提交于 2021-02-08 06:51:57


I am trying to get a term-document matrix from Lucene. It seems that most of the SO questions are for outdated APIs with different classes. I tried combining insight from these two questions to get a term vector from every document:

  • Term Vector Frequency in Lucene 4.0
  • Is it possible to iterate through documents stored in Lucene Index?

The relevant code is below, but DocsEnum is not recognized in the current API. How can I get a term vector or a count of all terms for every document?

IndexReader reader = DirectoryReader.open(index);

for (int i = 0;  i < reader.maxDoc(); i++) {
    Document doc = reader.document(i);
    Terms terms = reader.getTermVector(i, "country_text");

    if (terms != null && terms.size() > 0) {
        // access the terms for this field
        TermsEnum termsEnum = terms.iterator(); 
        BytesRef term = null;

        // explore the terms for this field
        while ((term = termsEnum.next()) != null) {
            // enumerate through documents, in this case only one
            DocsEnum docsEnum = termsEnum.docs(null, null); 
            int docIdEnum;
            while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                // get the term frequency in the document 
                System.out.println(term.utf8ToString()+ " " + docIdEnum + " " + docsEnum.freq()); 

Full code:

import java.io.*;
import java.util.Iterator;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
import org.json.simple.parser.JSONParser;

public class LuceneIndex {

    public static void main(String[] args) throws IOException, ParseException {

        String jsonFilePath = "wiki_data.json";
        JSONParser parser = new JSONParser();
        // Specify the analyzer for tokenizing text.
        StandardAnalyzer analyzer = new StandardAnalyzer();
        // create the index
        Directory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        IndexWriter w = new IndexWriter(index, config);

        try {     
            JSONArray a = (JSONArray) parser.parse(new FileReader(jsonFilePath));

            for (Object o : a) {
                JSONObject country = (JSONObject) o;
                String countryName = (String) country.get("country_name");
                String cityName = (String) country.get("city_name");
                String countryText = (String) country.get("country_text");
                String cityText = (String) country.get("city_text");
                addDoc(w, countryName, cityName, countryText, cityText);

            IndexReader reader = DirectoryReader.open(index);

            for (int i = 0;  i < reader.maxDoc(); i++) {
                Document doc = reader.document(i);
                Terms terms = reader.getTermVector(i, "country_text");

                if (terms != null && terms.size() > 0) {
                    // access the terms for this field
                    TermsEnum termsEnum = terms.iterator(); 
                    BytesRef term = null;

                    // explore the terms for this field
                    while ((term = termsEnum.next()) != null) {
                        // enumerate through documents, in this case only one
                        DocsEnum docsEnum = termsEnum.docs(null, null); 
                        int docIdEnum;
                        while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                            // get the term frequency in the document 
                            System.out.println(term.utf8ToString()+ " " + docIdEnum + " " + docsEnum.freq()); 

            // reader can be closed when there
            // is no need to access the documents any more.

        } catch (FileNotFoundException e) {
        } catch (IOException e) {
        } catch (org.json.simple.parser.ParseException e) {

    private static void addDoc(IndexWriter w, String countryName, String cityName, 
            String countryText, String cityText) throws IOException {
        Document doc = new Document();
        doc.add(new StringField("country_name", countryName, Field.Store.YES));
        doc.add(new StringField("city_name", cityName, Field.Store.YES));
        doc.add(new TextField("country_text", countryText, Field.Store.YES));
        doc.add(new TextField("city_text", cityText, Field.Store.YES));




First, thank you for your code: I had a small bug, and your code helped me fix it.

For me it works with this: (Lucene 7.2.1)

for(int i = 0; i < reader.maxDoc(); i++){
    Document doc = reader.document(i);
    Terms terms = reader.getTermVector(i, "text");

    if (terms != null && terms.size() > 0) {
        // access the terms for this field
        TermsEnum termsEnum = terms.iterator();
        BytesRef term = null;

        // explore the terms for this field
        while ((term = termsEnum.next()) != null) {
            // enumerate through documents, in this case only one
            PostingsEnum docsEnum = termsEnum.postings(null); 
            int docIdEnum;
            while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                // get the term frequency in the document
                System.out.println(term.utf8ToString()+ " " + docIdEnum + " " + docsEnum.freq());

The change here is that I used PostingsEnum; DocsEnum is no longer available in Lucene 7.2.1.

However, the reason it didn't work for you is the way you add your documents:

private void addDoc(IndexWriter w, String text, String name, String id) throws IOException {
    Document doc = new Document();
    // Create own FieldType to store Term Vectors
    FieldType ft = new FieldType();
    ft.setStoreTermVectors(true);  //Store Term Vectors
    StoredField t = new StoredField("text",text,ft);

    doc.add(new StringField("name", name, Field.Store.YES));
    doc.add(new StringField("id", id, Field.Store.YES));

You have to create your own FieldType. None of the standard ones will save the term vectors.


According to this question, you should not use TextField when you need per-document term frequencies, because its predefined field types do not store term vectors. Use "Field" with your own FieldType instead.


You can also do it by defining your field type like this:

 FieldType myFieldType = new FieldType(TextField.TYPE_STORED);

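Then enable term vectors on it with myFieldType.setStoreTermVectors(true), create the field with this type, and re-index your documents. Finally, you can get the term vectors!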

