I am trying to answer some simple Boolean queries of these forms: NOT x NOT y NOT z,
also x AND y AND z,
and also x OR y OR z,
where x, y and z are words. Each of them may belong to a different .txt file,
or all of them may belong to the same .txt file;
it doesn't matter.
I've written a class, TermDocMatrix,
which must be able to answer a Boolean query. I prepared some methods in class TermDocMatrix { }
for this purpose, but it doesn't work. I even debugged the code step by step and realized that the loops never iterate. I don't know why; the code seems fine.
You can see the code here:
// Boolean-retrieval helper: holds the loaded terms, the per-document term lists and the
// term-document incidence matrix that ProcessQuery evaluates.
class TermDocMatrix
// Distinct terms; passed to GetTermDocumentIncidenceMatrix, where each one becomes a matrix key.
public HashSet<string> distinctTerm = new HashSet<string>();
// Document id -> raw, unsplit document text (this is what ProcessFiles returns as a result).
public Dictionary<int, string> documentContentList = new Dictionary<int, string>();
// Document name -> list of the terms found in that document.
public Dictionary<string, List<string>> documentCollection = new Dictionary<string, List<string>>();
// Term -> incidence vector; reassigned by ProcessFiles on every call.
public Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>();
// Stop words collection.
// NOTE(review): the entries are lower/mixed case, but LoadFiles and ProcessQuery upper-case
// the text before the stop-word check, and List<string>.Contains is case-sensitive, so none
// of these entries can ever match.
public List<string> stopWords = new List<string> { "on", "of", "The", "an", "a", "in" };
// Boolean operators recognised in a query (compared against the upper-cased query tokens).
public string[] booleanOperator = new string[] { "AND", "OR", "NOT" };
// Base name of the data file; ".txt" is appended wherever the path is built.
private string _FileName = "words";
// Directory prefix prepended to _FileName when the file path is built.
public string _Path = "";
// Value returned by DocumentCount. NOTE(review): never modified in the code shown here.
int _lastDocNum = 0;
// Creates the IndexPath directory when it does not exist and configures the log manager.
// NOTE(review): IndexPath and FileName are never copied into _Path / _FileName in the lines
// shown, so every path below is built from the field defaults ("" and "words"). With
// _Path == "" the first statement turns it into "\".
public TermDocMatrix(string IndexPath,string FileName)
// Make sure the directory prefix ends with a backslash before file names are appended to it.
if (_Path.EndsWith("\\") == false) _Path += "\\";
if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
// NOTE(review): this is the same path expression LoadFiles reads its words from.
LogManager.Configure(_Path + _FileName + ".txt", false);
// read all files (the statement that performs the load is not visible in this listing)
// Reads the text file at _Path + _FileName + ".txt", tokenises it into upper-cased terms and
// records the document in documentCollection and documentContentList.
private void LoadFiles()
// Document id used as the key into documentContentList.
int count = 0;
// Missing-file check. NOTE(review): the statement it guards is not visible in this listing.
if (File.Exists(_Path + _FileName + ".txt") == false)
// load words
string b = File.ReadAllText(_Path + _FileName + ".txt");
// Upper-case the whole text, split on single spaces only, then pass through the stop-word filter.
// NOTE(review): newlines, tabs and punctuation are not separators here, so they stay
// attached to the neighbouring terms.
String[] TermsCollection = RemoveStopsWords(b.ToUpper().Split(' '));
foreach (string term in TermsCollection)
//prepare distinct terms collection
//remove stop words
// NOTE(review): term is upper-case here while stopWords holds lower/mixed-case entries,
// so this case-sensitive check is always true.
if (!stopWords.Contains(term))
//add document and their terms collection
// Keyed by _FileName; Dictionary.Add throws if the same key is added twice.
documentCollection.Add(_FileName, TermsCollection.ToList());
//add document and its content for displaying the search result
documentContentList.Add(count, b);
// Rebuilds the incidence matrix, evaluates the boolean query and returns the stored text of a
// matching document, or the literal "No search result found".
public string ProcessFiles(string query)
termDocumentIncidenceMatrix = GetTermDocumentIncidenceMatrix(distinctTerm, documentCollection);
// Result vector from ProcessQuery; null when the query contained no term that was evaluated.
List<int> lst = ProcessQuery(query);
// Key into documentContentList.
// NOTE(review): no increment of count is visible in this listing; if it never advances,
// only the document stored under key 0 can be returned.
int count = 0;
if (lst != null)
foreach (int a in lst)
// A value of 1 marks a document that satisfies the query.
if (a == 1)
return documentContentList[count];
return "No search result found";
// NOTE(review): dangling tail of a do/while loop whose opening "do" is not visible here.
} while (1 == 1);
// Returns the number of entries in documentCollection.
// NOTE(review): documentCollection is keyed by document name, so despite the method name this
// is a count of documents, not of words.
public int WordCount()
return documentCollection.Count;
// Exposes _lastDocNum (the surrounding get accessor is not visible in this listing).
public int DocumentCount
return _lastDocNum;
// Replaces the caller's array (passed by ref) with the contents of the local _queryTerm list.
// The condition below accepts "BUT", any token that is a key of termDocumentIncidenceMatrix,
// and any entry of booleanOperator; the statement that adds accepted tokens to _queryTerm is
// not visible in this listing.
private void FilterQueryTerm(ref string[] str)
List<string> _queryTerm = new List<string>();
foreach (string queryTerm in str)
// booleanOperator.Contains is case-sensitive; the other two checks upper-case the token first.
if (queryTerm.ToUpper().Equals("BUT") || termDocumentIncidenceMatrix.ContainsKey(queryTerm.ToUpper()) || booleanOperator.Contains(queryTerm))
str = _queryTerm.ToArray();
// Builds the term-document incidence matrix: one dictionary entry per distinct term, whose
// value is a freshly created incidence vector filled while iterating documentCollection.
// Returns a new dictionary; the field of the same name is shadowed by the local below.
public Dictionary<string, List<int>> GetTermDocumentIncidenceMatrix(HashSet<string> distinctTerms, Dictionary<string, List<string>> documentCollection)
Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>();
List<int> incidenceVector = new List<int>();
foreach (string term in distinctTerms)
// Start a new incidence vector for each term so vectors are not shared between terms.
incidenceVector = new List<int>();
foreach (KeyValuePair<string, List<string>> p in documentCollection)
// Exact, case-sensitive membership test against the document's term list.
if (p.Value.Contains(term))
// document contains the term
// document does not contain the term
// (the statements that record either outcome are not visible in this listing)
termDocumentIncidenceMatrix.Add(term, incidenceVector);
return termDocumentIncidenceMatrix;
// Filters str against stopWords and returns the local terms list as a new array.
// The statement that adds non-stop-word tokens to terms is not visible in this listing.
// NOTE(review): List<string>.Contains is case-sensitive and stopWords holds lower/mixed-case
// entries, while both visible callers pass upper-cased tokens, so nothing is filtered out.
public string[] RemoveStopsWords(string[] str)
List<string> terms = new List<string>();
foreach (string term in str)
if (!stopWords.Contains(term))
return terms.ToArray();
// Processes the boolean query: walks the query tokens from left to right, combining term
// incidence vectors with ProcessBooleanOperator, and returns the resulting vector.
// Returns null when no token led to an assignment of resultSet.
public List<int> ProcessQuery(string query)
// Boolean operator to apply between the previous and the next term.
string bitWiseOp = string.Empty;
// Upper-case and split on single spaces, the same tokenisation LoadFiles applies to the document text.
string[] queryTerm = RemoveStopsWords(query.ToUpper().Split(' '));
// Remove query terms that do not appear in the document collection.
FilterQueryTerm(ref queryTerm);
// Left-hand operand (running result) and right-hand operand of the pending operator.
List<int> previousTermIncidenceV = null;
List<int> nextTermsIncidenceV = null;
//holds the bitwise operation result
List<int> resultSet = null;
// In the query X AND Y, X is the previous term and Y is the next term.
Boolean hasPreviousTerm = false;
// Set when a NOT token was just read, so the following term must be complemented.
Boolean hasNotOperation = false;
foreach (string term in queryTerm)
// The token is a term (neither a boolean operator nor "BUT").
if (!booleanOperator.Contains(term) && !term.Equals("BUT"))
//query case: structure AND NOT analysis
if (hasNotOperation)
if (hasPreviousTerm)
// Complement the term's vector; for "NOT" ProcessBooleanOperator only reads its first vector argument.
nextTermsIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV);
// Query case: e.g. NOT analysis (no previous term yet).
previousTermIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV);
resultSet = previousTermIncidenceV;
hasNotOperation = false;
// First plain term of the query: it becomes the left-hand operand and the provisional result.
else if (!hasPreviousTerm)
previousTermIncidenceV = GetTermIncidenceVector(term);
resultSet = previousTermIncidenceV;
hasPreviousTerm = true;
nextTermsIncidenceV = GetTermIncidenceVector(term);
else if (term.Equals("NOT"))
//indicates that the term in the next iteration should be complemented.
hasNotOperation = true;
// 'BUT' is evaluated as AND, e.g. "structure BUT NOT semantic" is evaluated as "structure AND NOT semantic".
if (term.Equals("BUT"))
bitWiseOp = "AND";
bitWiseOp = term;
// Apply the pending operator once a right-hand operand exists and no NOT is pending.
if (nextTermsIncidenceV != null && !hasNotOperation)
resultSet = ProcessBooleanOperator(bitWiseOp, previousTermIncidenceV, nextTermsIncidenceV);
// The combined vector becomes the left-hand operand for the next operator.
previousTermIncidenceV = resultSet;
hasPreviousTerm = true;
nextTermsIncidenceV = null;
return resultSet;
// Applies one boolean operator to incidence vectors and returns a new list.
// "NOT" iterates previousTermV only; "AND" and "OR" index both vectors up to
// previousTermV.Count, so nextTermV must be at least that long.
// Any other op value matches no branch, and the empty list is returned.
// The statements that append to resultSet inside each branch are not visible in this listing.
// NOTE(review): the "NOT" comparison is case-sensitive, while "AND"/"OR" upper-case op first.
public List<int> ProcessBooleanOperator(string op, List<int> previousTermV, List<int> nextTermV)
List<int> resultSet = new List<int>();
if (op.Equals("NOT"))
foreach (int a in previousTermV)
if (a == 1)
else if (op.ToUpper().Equals("AND")) //bitwise AND operation
for (int a = 0; a < previousTermV.Count; a++)
if (previousTermV[a] == 1 && nextTermV[a] == 1)
else if (op.ToUpper().Equals("OR")) //bitwise OR operation
for (int a = 0; a < previousTermV.Count; a++)
// Tests for the only case in which OR is false: both operands are 0.
if (previousTermV[a] == 0 && nextTermV[a] == 0)
return resultSet;
// Returns the incidence vector stored for the upper-cased term.
// Uses the dictionary indexer, so a term that is not a key throws KeyNotFoundException.
public List<int> GetTermIncidenceVector(string term)
return termDocumentIncidenceMatrix[term.ToUpper()];
You also need to know about another class, LogManager,
which I use in my TermDocMatrix class.
Here it is:
namespace WindowsFormsApplication1
// Holds the log file settings (file name, directory and the show-method-names flag).
internal class FileLogger
// Shared instance used by LogManager.Configure.
public static readonly FileLogger Instance = new FileLogger();
private string _filename;
private bool _showMethodName = false;
private string _FilePath = "";
// Read-only view of the flag passed to Init.
public bool ShowMethodNames
get { return _showMethodName; }
// Stores the settings and creates the directory part of filename.
public void Init(string filename, bool showmethodnames)
_showMethodName = showmethodnames;
_filename = filename;
// handle folder names as well -> create dir etc.
_FilePath = Path.GetDirectoryName(filename);
// An empty string means filename has no directory component.
// NOTE(review): Path.GetDirectoryName can also return null (e.g. for a root path), which
// this check does not cover.
if (_FilePath != "")
// CreateDirectory returns a DirectoryInfo; its FullName is stored as the directory path.
_FilePath = Directory.CreateDirectory(_FilePath).FullName;
// Append a trailing backslash when the stored path lacks one.
if (_FilePath.EndsWith("\\") == false)
_FilePath += "\\";
// Static entry point for configuring the shared FileLogger instance.
internal static class LogManager
// Forwards both arguments unchanged to FileLogger.Instance.Init.
public static void Configure(string filename, bool showmethodnames)
FileLogger.Instance.Init(filename, showmethodnames);
It should work, but it doesn't. Please tell me why. When I ask for an answer I only see "No search result found", no matter what kind of Boolean query I type.
Your problem is in this line (in ProcessFiles):
String[] termsCollection = RemoveStopsWords(file.ToUpper().Split(' '));
You're splitting the name of the file and not its content. That's why you have no search results.
You should do something like this instead:
String[] termsCollection = RemoveStopsWords(File.ReadAllText(file).ToUpper().Split(' '));
Now change your TermDocMatrix constructor:
// Creates the IndexPath directory when it does not exist and configures the log manager,
// building the file path with Path.Combine instead of manual backslash handling.
// NOTE(review): IndexPath and FileName are not assigned to _Path / _FileName in the lines
// shown, so Path.Combine receives whatever those fields already hold.
public TermDocMatrix(string IndexPath,string FileName)
if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
LogManager.Configure(System.IO.Path.Combine(_Path, _FileName + ".txt"), false);
// read all files (the statement that performs the load is not visible in this listing)
And your LoadFiles method:
private void LoadFiles()
int count = 0;
if (File.Exists(System.IO.Path.Combine(_Path, _FileName + ".txt")) == false)
// load words
string b = File.ReadAllText(System.IO.Path.Combine(_Path, _FileName + ".txt"));