问题
How would the Jaro–Winkler distance string comparison algorithm be implemented in C#?
回答1:
public static class JaroWinklerDistance
{
/* The Winkler modification will not be applied unless the
* percent match was at or above the mWeightThreshold percent
* without the modification.
* Winkler's paper used a default value of 0.7
*/
private static readonly double mWeightThreshold = 0.7;
/* Size of the prefix to be concidered by the Winkler modification.
* Winkler's paper used a default value of 4
*/
private static readonly int mNumChars = 4;
/// <summary>
/// Returns the Jaro-Winkler distance between the specified
/// strings. The distance is symmetric and will fall in the
/// range 0 (perfect match) to 1 (no match).
/// </summary>
/// <param name="aString1">First String</param>
/// <param name="aString2">Second String</param>
/// <returns></returns>
public static double distance(string aString1, string aString2) {
return 1.0 - proximity(aString1,aString2);
}
/// <summary>
/// Returns the Jaro-Winkler distance between the specified
/// strings. The distance is symmetric and will fall in the
/// range 0 (no match) to 1 (perfect match).
/// </summary>
/// <param name="aString1">First String</param>
/// <param name="aString2">Second String</param>
/// <returns></returns>
public static double proximity(string aString1, string aString2)
{
int lLen1 = aString1.Length;
int lLen2 = aString2.Length;
if (lLen1 == 0)
return lLen2 == 0 ? 1.0 : 0.0;
int lSearchRange = Math.Max(0,Math.Max(lLen1,lLen2)/2 - 1);
// default initialized to false
bool[] lMatched1 = new bool[lLen1];
bool[] lMatched2 = new bool[lLen2];
int lNumCommon = 0;
for (int i = 0; i < lLen1; ++i) {
int lStart = Math.Max(0,i-lSearchRange);
int lEnd = Math.Min(i+lSearchRange+1,lLen2);
for (int j = lStart; j < lEnd; ++j) {
if (lMatched2[j]) continue;
if (aString1[i] != aString2[j])
continue;
lMatched1[i] = true;
lMatched2[j] = true;
++lNumCommon;
break;
}
}
if (lNumCommon == 0) return 0.0;
int lNumHalfTransposed = 0;
int k = 0;
for (int i = 0; i < lLen1; ++i) {
if (!lMatched1[i]) continue;
while (!lMatched2[k]) ++k;
if (aString1[i] != aString2[k])
++lNumHalfTransposed;
++k;
}
// System.Diagnostics.Debug.WriteLine("numHalfTransposed=" + numHalfTransposed);
int lNumTransposed = lNumHalfTransposed/2;
// System.Diagnostics.Debug.WriteLine("numCommon=" + numCommon + " numTransposed=" + numTransposed);
double lNumCommonD = lNumCommon;
double lWeight = (lNumCommonD/lLen1
+ lNumCommonD/lLen2
+ (lNumCommon - lNumTransposed)/lNumCommonD)/3.0;
if (lWeight <= mWeightThreshold) return lWeight;
int lMax = Math.Min(mNumChars,Math.Min(aString1.Length,aString2.Length));
int lPos = 0;
while (lPos < lMax && aString1[lPos] == aString2[lPos])
++lPos;
if (lPos == 0) return lWeight;
return lWeight + 0.1 * lPos * (1.0 - lWeight);
}
}
回答2:
You can take a look on Lucene.Net ,
it implement Jaro–Winkler distance algorithm ,
and its score is different from which leebickmtu post ,
you can take it as reference
the url is below :
http://lucenenet.apache.org/docs/3.0.3/db/d12/_jaro_winkler_distance_8cs_source.html
回答3:
Use below class to use jaro winkler. i have customized both algorithm jaro and jaro-winkler.
Visit on Github for DLL.
using System;
using System.Linq;
namespace Search
{
public static class EditDistance
{
private struct JaroMetrics
{
public int Matches;
public int Transpositions;
}
private static EditDistance.JaroMetrics Matches(string s1, string s2)
{
string text;
string text2;
if (s1.Length > s2.Length)
{
text = s1;
text2 = s2;
}
else
{
text = s2;
text2 = s1;
}
int num = Math.Max(text.Length / 2 - 1, 0);
int[] array = new int[text2.Length];
int i;
for (i = 0; i < array.Length; i++)
{
array[i] = -1;
}
bool[] array2 = new bool[text.Length];
int num2 = 0;
for (int j = 0; j < text2.Length; j++)
{
char c = text2[j];
int k = Math.Max(j - num, 0);
int num3 = Math.Min(j + num + 1, text.Length);
while (k < num3)
{
if (!array2[k] && c == text[k])
{
array[j] = k;
array2[k] = true;
num2++;
break;
}
k++;
}
}
char[] array3 = new char[num2];
char[] ms2 = new char[num2];
i = 0;
int num4 = 0;
while (i < text2.Length)
{
if (array[i] != -1)
{
array3[num4] = text2[i];
num4++;
}
i++;
}
i = 0;
num4 = 0;
while (i < text.Length)
{
if (array2[i])
{
ms2[num4] = text[i];
num4++;
}
i++;
}
int num5 = array3.Where((char t, int mi) => t != ms2[mi]).Count<char>();
EditDistance.JaroMetrics result;
result.Matches = num2;
result.Transpositions = num5 / 2;
return result;
}
public static float JaroWinkler(this string s1, string s2, float prefixScale, float boostThreshold)
{
prefixScale = ((prefixScale > 0.25f) ? 0.25f : prefixScale);
prefixScale = ((prefixScale < 0f) ? 0f : prefixScale);
float num = s1.Jaro(s2);
int num2 = 0;
for (int i = 0; i < Math.Min(s1.Length, s2.Length); i++)
{
if (s1[i] != s2[i])
{
break;
}
num2++;
}
return (num < boostThreshold) ? num : (num + prefixScale * (float)num2 * (1f - num));
}
public static float JaroWinkler(this string s1, string s2, float prefixScale)
{
return s1.JaroWinkler(s2, prefixScale, 0.7f);
}
public static float JaroWinkler(this string s1, string s2)
{
return s1.JaroWinkler(s2, 0.1f, 0.7f);
}
public static float Jaro(this string s1, string s2)
{
EditDistance.JaroMetrics jaroMetrics = EditDistance.Matches(s1, s2);
float num = (float)jaroMetrics.Matches;
int transpositions = jaroMetrics.Transpositions;
float result;
if (num == 0f)
{
result = 0f;
}
else
{
float num2 = (num / (float)s1.Length + num / (float)s2.Length + (num - (float)transpositions) / num) / 3f;
result = num2;
}
return result;
}
public static int LevenshteinDistance(this string source, string target)
{
int result;
if (string.IsNullOrEmpty(source))
{
if (string.IsNullOrEmpty(target))
{
result = 0;
}
else
{
result = target.Length;
}
}
else if (string.IsNullOrEmpty(target))
{
result = source.Length;
}
else
{
if (source.Length > target.Length)
{
string text = target;
target = source;
source = text;
}
int length = target.Length;
int length2 = source.Length;
int[,] array = new int[2, length + 1];
for (int i = 1; i <= length; i++)
{
array[0, i] = i;
}
int num = 0;
for (int j = 1; j <= length2; j++)
{
num = (j & 1);
array[num, 0] = j;
int num2 = num ^ 1;
for (int i = 1; i <= length; i++)
{
int num3 = (target[i - 1] == source[j - 1]) ? 0 : 1;
array[num, i] = Math.Min(Math.Min(array[num2, i] + 1, array[num, i - 1] + 1), array[num2, i - 1] + num3);
}
}
result = array[num, length];
}
return result;
}
}
}
回答4:
You can use the below code which works very well for all the kind of strings.After getting the result you need to multiply with 100 to get the percentage of similarity. I hope it will solves your problem.
public class JaroWinkler
{
private const double defaultMismatchScore = 0.0;
private const double defaultMatchScore = 1.0;
/// <summary>
/// Gets the similarity between two strings by using the Jaro-Winkler algorithm.
/// A value of 1 means perfect match. A value of zero represents an absolute no match
/// </summary>
/// <param name="_firstWord"></param>
/// <param name="_secondWord"></param>
/// <returns>a value between 0-1 of the similarity</returns>
///
public static double RateSimilarity(string _firstWord, string _secondWord)
{
// Converting to lower case is not part of the original Jaro-Winkler implementation
// But we don't really care about case sensitivity in DIAMOND and wouldn't decrease security names similarity rate just because
// of Case sensitivity
_firstWord = _firstWord.ToLower();
_secondWord = _secondWord.ToLower();
if ((_firstWord != null) && (_secondWord != null))
{
if (_firstWord == _secondWord)
//return (SqlDouble)defaultMatchScore;
return defaultMatchScore;
else
{
// Get half the length of the string rounded up - (this is the distance used for acceptable transpositions)
int halfLength = Math.Min(_firstWord.Length, _secondWord.Length) / 2 + 1;
// Get common characters
StringBuilder common1 = GetCommonCharacters(_firstWord, _secondWord, halfLength);
int commonMatches = common1.Length;
// Check for zero in common
if (commonMatches == 0)
//return (SqlDouble)defaultMismatchScore;
return defaultMismatchScore;
StringBuilder common2 = GetCommonCharacters(_secondWord, _firstWord, halfLength);
// Check for same length common strings returning 0 if is not the same
if (commonMatches != common2.Length)
//return (SqlDouble)defaultMismatchScore;
return defaultMismatchScore;
// Get the number of transpositions
int transpositions = 0;
for (int i = 0; i < commonMatches; i++)
{
if (common1[i] != common2[i])
transpositions++;
}
int j = 0;
j += 1;
// Calculate Jaro metric
transpositions /= 2;
double jaroMetric = commonMatches / (3.0 * _firstWord.Length) + commonMatches / (3.0 * _secondWord.Length) + (commonMatches - transpositions) / (3.0 * commonMatches);
//return (SqlDouble)jaroMetric;
return jaroMetric;
}
}
//return (SqlDouble)defaultMismatchScore;
return defaultMismatchScore;
}
/// <summary>
/// Returns a string buffer of characters from string1 within string2 if they are of a given
/// distance seperation from the position in string1.
/// </summary>
/// <param name="firstWord">string one</param>
/// <param name="secondWord">string two</param>
/// <param name="separationDistance">separation distance</param>
/// <returns>A string buffer of characters from string1 within string2 if they are of a given
/// distance seperation from the position in string1</returns>
private static StringBuilder GetCommonCharacters(string firstWord, string secondWord, int separationDistance)
{
if ((firstWord != null) && (secondWord != null))
{
StringBuilder returnCommons = new StringBuilder(20);
StringBuilder copy = new StringBuilder(secondWord);
int firstWordLength = firstWord.Length;
int secondWordLength = secondWord.Length;
for (int i = 0; i < firstWordLength; i++)
{
char character = firstWord[i];
bool found = false;
for (int j = Math.Max(0, i - separationDistance); !found && j < Math.Min(i + separationDistance, secondWordLength); j++)
{
if (copy[j] == character)
{
found = true;
returnCommons.Append(character);
copy[j] = '#';
}
}
}
return returnCommons;
}
return null;
}
}
来源:https://stackoverflow.com/questions/19123506/jaro-winkler-distance-algorithm-in-c-sharp