问题
I try answer some simple boolean query in these ways NOT x NOT y NOT z
also x AND y AND z
and also like this x OR y OR z
x,y,z
are some words and any of them belongs to a different file.txt
or maybe all of them belongs to same file.txt
, no matter .
I've written a class TermDocMatrix
:
it must be able to answer a boolean query , I prepared some methods in class TermDocMatrix{ }
for these purposes but it doesn't work. I even debug the code step by step, I realized loops have no turn. I don't know why, codes seems fine.
well you can see code in here :
class TermDocMatrix
{
//stores distinct terms
public HashSet<string> distinctTerm = new HashSet<string>();
//stores document id and its contents without splitting
public Dictionary<int, string> documentContentList = new Dictionary<int, string>();
//stores document and its terms collection
public Dictionary<string, List<string>> documentCollection = new Dictionary<string, List<string>>();
public Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>();
//stop words collection
public List<string> stopWords = new List<string> { "on", "of", "The", "an", "a", "in" };
//boolean operators list
public string[] booleanOperator = new string[] { "AND", "OR", "NOT" };
private string _FileName = "words";
public string _Path = "";
int _lastDocNum = 0;
public TermDocMatrix(string IndexPath,string FileName)
{
if (_Path.EndsWith("\\") == false) _Path += "\\";
if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
LogManager.Configure(_Path + _FileName + ".txt", false);
// read all files
LoadFiles();
}
private void LoadFiles()
{
int count = 0;
if (File.Exists(_Path + _FileName + ".txt") == false)
return;
// load words
string b = File.ReadAllText(_Path + _FileName + ".txt");
String[] TermsCollection = RemoveStopsWords(b.ToUpper().Split(' '));
foreach (string term in TermsCollection)
{
//prepeare distinct terms collection
//remove stop words
if (!stopWords.Contains(term))
{
distinctTerm.Add(term);
}
}
//add document and their terms collection
documentCollection.Add(_FileName, TermsCollection.ToList());
//add document and its content for displaying the search result
documentContentList.Add(count, b);
count++;
}
public string ProcessFiles(string query)
{
termDocumentIncidenceMatrix = GetTermDocumentIncidenceMatrix(distinctTerm, documentCollection);
do
{
List<int> lst = ProcessQuery(query);
int count = 0;
if (lst != null)
{
foreach (int a in lst)
{
if (a == 1)
{
return documentContentList[count];
}
count++;
}
}
else
{
return "No search result found";
}
} while (1 == 1);
}
public int WordCount()
{
return documentCollection.Count;
}
public int DocumentCount
{
get
{
return _lastDocNum;
}
}
private void FilterQueryTerm(ref string[] str)
{
List<string> _queryTerm = new List<string>();
foreach (string queryTerm in str)
{
if (queryTerm.ToUpper().Equals("BUT") || termDocumentIncidenceMatrix.ContainsKey(queryTerm.ToUpper()) || booleanOperator.Contains(queryTerm))
{
_queryTerm.Add(queryTerm);
}
}
str = _queryTerm.ToArray();
}
//prepares Term Document Incidence Matrix
public Dictionary<string, List<int>> GetTermDocumentIncidenceMatrix(HashSet<string> distinctTerms, Dictionary<string, List<string>> documentCollection)
{
Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>();
List<int> incidenceVector = new List<int>();
foreach (string term in distinctTerms)
{
//incidence vector for each terms
incidenceVector = new List<int>();
foreach (KeyValuePair<string, List<string>> p in documentCollection)
{
if (p.Value.Contains(term))
{
//document contains the term
incidenceVector.Add(1);
}
else
{
//document do not contains the term
incidenceVector.Add(0);
}
}
termDocumentIncidenceMatrix.Add(term, incidenceVector);
}
return termDocumentIncidenceMatrix;
}
//removes all stop words
public string[] RemoveStopsWords(string[] str)
{
List<string> terms = new List<string>();
foreach (string term in str)
{
if (!stopWords.Contains(term))
{
terms.Add(term);
}
}
return terms.ToArray();
}
//process the boolean query
public List<int> ProcessQuery(string query)
{
//query boolean operator
string bitWiseOp = string.Empty;
string[] queryTerm = RemoveStopsWords(query.ToUpper().Split(' '));
//remove query term that doesnot appears on document collection
FilterQueryTerm(ref queryTerm);
List<int> previousTermIncidenceV = null;
List<int> nextTermsIncidenceV = null;
//holds the bitwise operation result
List<int> resultSet = null;
//suppose on query X AND Y, X is previousTerm term and Y is nextTerm
Boolean hasPreviousTerm = false;
Boolean hasNotOperation = false;
foreach (string term in queryTerm)
{
//is a term
if (!booleanOperator.Contains(term) && !term.Equals("BUT"))
{
//query case: structure AND NOT analysis
if (hasNotOperation)
{
if (hasPreviousTerm)
{
nextTermsIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV);
}
//query case: eg.NOT analysis
else
{
previousTermIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV);
resultSet = previousTermIncidenceV;
}
hasNotOperation = false;
}
else if (!hasPreviousTerm)
{
previousTermIncidenceV = GetTermIncidenceVector(term);
resultSet = previousTermIncidenceV;
hasPreviousTerm = true;
}
else
{
nextTermsIncidenceV = GetTermIncidenceVector(term);
}
}
else if (term.Equals("NOT"))
{
//indicates that the term in the next iteration should be complemented.
hasNotOperation = true;
}
else
{
//'BUT' also should be evaluated as AND eg. structure BUT NOT semantic should be evaluated as structure AND NOT semantic
if (term.Equals("BUT"))
{
bitWiseOp = "AND";
}
else
bitWiseOp = term;
}
if (nextTermsIncidenceV != null && !hasNotOperation)
{
resultSet = ProcessBooleanOperator(bitWiseOp, previousTermIncidenceV, nextTermsIncidenceV);
previousTermIncidenceV = resultSet;
hasPreviousTerm = true;
nextTermsIncidenceV = null;
}
}
return resultSet;
}
//Process Boolean operators
public List<int> ProcessBooleanOperator(string op, List<int> previousTermV, List<int> nextTermV)
{
List<int> resultSet = new List<int>();
if (op.Equals("NOT"))
{
foreach (int a in previousTermV)
{
if (a == 1)
{
resultSet.Add(0);
}
else
{
resultSet.Add(1);
}
}
}
else if (op.ToUpper().Equals("AND")) //bitwise AND operation
{
for (int a = 0; a < previousTermV.Count; a++)
{
if (previousTermV[a] == 1 && nextTermV[a] == 1)
{
resultSet.Add(1);
}
else
{
resultSet.Add(0);
}
}
}
else if (op.ToUpper().Equals("OR")) //bitwise OR operation
{
for (int a = 0; a < previousTermV.Count; a++)
{
if (previousTermV[a] == 0 && nextTermV[a] == 0)
{
resultSet.Add(0);
}
else
{
resultSet.Add(1);
}
}
}
return resultSet;
}
//returns term incidence vector
public List<int> GetTermIncidenceVector(string term)
{
return termDocumentIncidenceMatrix[term.ToUpper()];
}
}
you need to now about another class named Class LogManager
I used it on my Class TermDocMatrix
. Here :
namespace WindowsFormsApplication1
{
internal class FileLogger
{
public static readonly FileLogger Instance = new FileLogger();
private string _filename;
private bool _showMethodName = false;
private string _FilePath = "";
public bool ShowMethodNames
{
get { return _showMethodName; }
}
public void Init(string filename, bool showmethodnames)
{
_showMethodName = showmethodnames;
_filename = filename;
// handle folder names as well -> create dir etc.
_FilePath = Path.GetDirectoryName(filename);
if (_FilePath != "")
{
_FilePath = Directory.CreateDirectory(_FilePath).FullName;
if (_FilePath.EndsWith("\\") == false)
_FilePath += "\\";
}
}
}
internal static class LogManager
{
public static void Configure(string filename, bool showmethodnames)
{
FileLogger.Instance.Init(filename, showmethodnames);
}
}
}
it must work but it don't. tell me why don't it work, please. when I ask for answer I just see this "No search result found", no matter I'd typed what kind of boolean query.
回答1:
Your problem is in this line: (ProcessFiles
function)
String[] termsCollection = RemoveStopsWords(file.ToUpper().Split(' '));
you're splitting the name of the file and not its content That's why you have no search results
you should do something like this instead:
String[] termsCollection = RemoveStopsWords(File.ReadAllText(file).ToUpper().Split(' '));
Now change your TermDocMatrix
constructor:
public TermDocMatrix(string IndexPath,string FileName)
{
if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
LogManager.Configure(System.IO.Path.Combine(_Path, _FileName + ".txt"), false);
// read all files
LoadFiles();
}
And your LoadFiles
function:
private void LoadFiles()
{
int count = 0;
if (File.Exists(System.IO.Path.Combine(_Path, _FileName + ".txt")) == false)
return;
// load words
string b = File.ReadAllText(System.IO.Path.Combine(_Path, _FileName + ".txt"));
.....
}
来源:https://stackoverflow.com/questions/16468094/try-to-answer-some-boolean-queries-using-term-document-incidence-matrix