Try to answer some boolean queries using Term-Document-Incidence-Matrix [closed]

旧城冷巷雨未停 提交于 2019-12-13 11:24:18

问题


I try answer some simple boolean query in these ways NOT x NOT y NOT z also x AND y AND z and also like this x OR y OR z x,y,z are some words and any of them belongs to a different file.txtor maybe all of them belongs to same file.txt, no matter .

I've written a class TermDocMatrix:

it must be able to answer a boolean query , I prepared some methods in class TermDocMatrix{ }for these purposes but it doesn't work. I even debug the code step by step, I realized loops have no turn. I don't know why, codes seems fine.

well you can see code in here :

class TermDocMatrix
{
    //stores distinct terms
    public HashSet<string> distinctTerm = new HashSet<string>();
    //stores document id and its contents without splitting
    public Dictionary<int, string> documentContentList = new Dictionary<int, string>();
    //stores document and its terms collection
    public Dictionary<string, List<string>> documentCollection = new Dictionary<string, List<string>>();
    public Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>();

     //stop words collection
    public List<string> stopWords = new List<string> { "on", "of", "The", "an", "a", "in" };
    //boolean operators list
    public string[] booleanOperator = new string[] { "AND", "OR", "NOT" };

    private string _FileName = "words";
    public string _Path = "";
    int _lastDocNum = 0;

    public TermDocMatrix(string IndexPath,string FileName)
    {

        if (_Path.EndsWith("\\") == false) _Path += "\\";
        if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
        LogManager.Configure(_Path + _FileName + ".txt", false);
        // read all files
        LoadFiles();
    }

    private void LoadFiles()
    {
        int count = 0;

        if (File.Exists(_Path + _FileName + ".txt") == false)
            return;
        // load words
        string b = File.ReadAllText(_Path + _FileName + ".txt");
        String[] TermsCollection = RemoveStopsWords(b.ToUpper().Split(' '));
        foreach (string term in TermsCollection)
        {
            //prepeare distinct terms collection
            //remove stop words
            if (!stopWords.Contains(term))
            {
                distinctTerm.Add(term);
            }
        }
        //add document and their terms collection
        documentCollection.Add(_FileName, TermsCollection.ToList());
        //add document and its content for displaying the search result
        documentContentList.Add(count, b);
        count++;
    }
    public string ProcessFiles(string query)
    {
        termDocumentIncidenceMatrix = GetTermDocumentIncidenceMatrix(distinctTerm, documentCollection);
        do
        {
            List<int> lst = ProcessQuery(query);
            int count = 0;
            if (lst != null)
            {
                foreach (int a in lst)
                {
                    if (a == 1)
                    {
                         return documentContentList[count];
                    }
                    count++;
                }
            }
            else
            {
                return "No search result found";
            }

        } while (1 == 1);
    }
    public int WordCount()
    {
        return documentCollection.Count;
    }

    public int DocumentCount
    {
        get
        {
            return _lastDocNum;
        }
    }

    private void FilterQueryTerm(ref string[] str)
    {
        List<string> _queryTerm = new List<string>();


        foreach (string queryTerm in str)
        {
            if (queryTerm.ToUpper().Equals("BUT") || termDocumentIncidenceMatrix.ContainsKey(queryTerm.ToUpper()) || booleanOperator.Contains(queryTerm))
            {
                _queryTerm.Add(queryTerm);

            }
        }

        str = _queryTerm.ToArray();
    }

    //prepares Term Document Incidence Matrix
    public Dictionary<string, List<int>> GetTermDocumentIncidenceMatrix(HashSet<string> distinctTerms, Dictionary<string, List<string>> documentCollection)
    {
        Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>();
        List<int> incidenceVector = new List<int>();
        foreach (string term in distinctTerms)
        {
            //incidence vector for each terms
            incidenceVector = new List<int>();
            foreach (KeyValuePair<string, List<string>> p in documentCollection)
            {

                if (p.Value.Contains(term))
                {
                    //document contains the term
                    incidenceVector.Add(1);

                }
                else
                {
                    //document do not contains the term
                    incidenceVector.Add(0);
                }
            }
            termDocumentIncidenceMatrix.Add(term, incidenceVector);

        }
        return termDocumentIncidenceMatrix;
    }
    //removes all stop words
    public string[] RemoveStopsWords(string[] str)
    {
        List<string> terms = new List<string>();
        foreach (string term in str)
        {
            if (!stopWords.Contains(term))
            {
                terms.Add(term);
            }
        }
        return terms.ToArray();
    }
    //process the boolean query
    public List<int> ProcessQuery(string query)
    {

        //query boolean operator
        string bitWiseOp = string.Empty;
        string[] queryTerm = RemoveStopsWords(query.ToUpper().Split(' '));

        //remove query term that doesnot appears on document collection
        FilterQueryTerm(ref queryTerm);
        List<int> previousTermIncidenceV = null;
        List<int> nextTermsIncidenceV = null;
        //holds the bitwise operation result
        List<int> resultSet = null;
        //suppose on query X AND Y, X is previousTerm term and Y is nextTerm
        Boolean hasPreviousTerm = false;
        Boolean hasNotOperation = false;
        foreach (string term in queryTerm)
        {
            //is a term
            if (!booleanOperator.Contains(term) && !term.Equals("BUT"))
            {
                //query case: structure AND NOT analysis
                if (hasNotOperation)
                {

                    if (hasPreviousTerm)
                    {
                        nextTermsIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV);
                    }
                    //query case: eg.NOT analysis
                    else
                    {
                        previousTermIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV);
                        resultSet = previousTermIncidenceV;
                    }
                    hasNotOperation = false;
                }
                else if (!hasPreviousTerm)
                {
                    previousTermIncidenceV = GetTermIncidenceVector(term);
                    resultSet = previousTermIncidenceV;
                    hasPreviousTerm = true;
                }
                else
                {

                    nextTermsIncidenceV = GetTermIncidenceVector(term);
                }
            }
            else if (term.Equals("NOT"))
            {
                //indicates that the  term in the next iteration should be complemented.
                hasNotOperation = true;
            }
            else
            {
                //'BUT' also should be evaluated as AND eg. structure BUT NOT semantic should be evaluated as structure AND NOT semantic
                if (term.Equals("BUT"))
                {
                    bitWiseOp = "AND";
                }
                else
                    bitWiseOp = term;
            }

            if (nextTermsIncidenceV != null && !hasNotOperation)
            {
                resultSet = ProcessBooleanOperator(bitWiseOp, previousTermIncidenceV, nextTermsIncidenceV);
                previousTermIncidenceV = resultSet;
                hasPreviousTerm = true;
                nextTermsIncidenceV = null;
            }
        }

        return resultSet;
    }

    //Process Boolean operators
    public List<int> ProcessBooleanOperator(string op, List<int> previousTermV, List<int> nextTermV)
    {
        List<int> resultSet = new List<int>();
        if (op.Equals("NOT"))
        {
            foreach (int a in previousTermV)
            {
                if (a == 1)
                {
                    resultSet.Add(0);
                }
                else
                {
                    resultSet.Add(1);
                }
            }
        }
        else if (op.ToUpper().Equals("AND")) //bitwise AND operation
        {
            for (int a = 0; a < previousTermV.Count; a++)
            {
                if (previousTermV[a] == 1 && nextTermV[a] == 1)
                {
                    resultSet.Add(1);
                }
                else
                {
                    resultSet.Add(0);
                }
            }
        }
        else if (op.ToUpper().Equals("OR")) //bitwise OR operation
        {
            for (int a = 0; a < previousTermV.Count; a++)
            {
                if (previousTermV[a] == 0 && nextTermV[a] == 0)
                {
                    resultSet.Add(0);
                }
                else
                {
                    resultSet.Add(1);
                }
            }
        }
        return resultSet;
    }

    //returns term incidence vector
    public List<int> GetTermIncidenceVector(string term)
    {
        return termDocumentIncidenceMatrix[term.ToUpper()];

    }
}

you need to now about another class named Class LogManager I used it on my Class TermDocMatrix. Here :

namespace WindowsFormsApplication1
 {
   internal class FileLogger
   {
    public static readonly FileLogger Instance = new FileLogger();

    private string _filename;
    private bool _showMethodName = false;
    private string _FilePath = "";

    public bool ShowMethodNames
    {
        get { return _showMethodName; }
    }

    public void Init(string filename, bool showmethodnames)
    {
        _showMethodName = showmethodnames;
        _filename = filename;
        // handle folder names as well -> create dir etc.
        _FilePath = Path.GetDirectoryName(filename);
        if (_FilePath != "")
        {
            _FilePath = Directory.CreateDirectory(_FilePath).FullName;
            if (_FilePath.EndsWith("\\") == false)
                _FilePath += "\\";
        }
    }
}
internal static class LogManager
{

    public static void Configure(string filename, bool showmethodnames)
    {
        FileLogger.Instance.Init(filename, showmethodnames);
    }
}

}

it must work but it don't. tell me why don't it work, please. when I ask for answer I just see this "No search result found", no matter I'd typed what kind of boolean query.


回答1:


Your problem is in this line: (ProcessFiles function)

String[] termsCollection = RemoveStopsWords(file.ToUpper().Split(' '));

you're splitting the name of the file and not its content That's why you have no search results

you should do something like this instead:

String[] termsCollection = RemoveStopsWords(File.ReadAllText(file).ToUpper().Split(' '));

Now change your TermDocMatrix constructor:

public TermDocMatrix(string IndexPath,string FileName)
{
    if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
    LogManager.Configure(System.IO.Path.Combine(_Path, _FileName + ".txt"), false);
    // read all files
    LoadFiles();
}

And your LoadFiles function:

private void LoadFiles()
{
    int count = 0;

    if (File.Exists(System.IO.Path.Combine(_Path, _FileName + ".txt")) == false)
        return;
    // load words
    string b = File.ReadAllText(System.IO.Path.Combine(_Path, _FileName + ".txt"));

    .....
}


来源:https://stackoverflow.com/questions/16468094/try-to-answer-some-boolean-queries-using-term-document-incidence-matrix

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!