Jaro–Winkler distance algorithm in C#

房东的猫 提交于 2019-11-26 19:38:10

问题


How would the Jaro–Winkler distance string comparison algorithm be implemented in C#?


回答1:


public static class JaroWinklerDistance
{
    /* The Winkler modification will not be applied unless the 
     * percent match was at or above the mWeightThreshold percent 
     * without the modification. 
     * Winkler's paper used a default value of 0.7
     */
    private static readonly double mWeightThreshold = 0.7;

    /* Size of the prefix to be concidered by the Winkler modification. 
     * Winkler's paper used a default value of 4
     */
    private static readonly int mNumChars = 4;


    /// <summary>
    /// Returns the Jaro-Winkler distance between the specified  
    /// strings. The distance is symmetric and will fall in the 
    /// range 0 (perfect match) to 1 (no match). 
    /// </summary>
    /// <param name="aString1">First String</param>
    /// <param name="aString2">Second String</param>
    /// <returns></returns>
    public static double distance(string aString1, string aString2) {
        return 1.0 - proximity(aString1,aString2);
    }


    /// <summary>
    /// Returns the Jaro-Winkler distance between the specified  
    /// strings. The distance is symmetric and will fall in the 
    /// range 0 (no match) to 1 (perfect match). 
    /// </summary>
    /// <param name="aString1">First String</param>
    /// <param name="aString2">Second String</param>
    /// <returns></returns>
    public static double proximity(string aString1, string aString2)
    {
        int lLen1 = aString1.Length;
        int lLen2 = aString2.Length;
        if (lLen1 == 0)
            return lLen2 == 0 ? 1.0 : 0.0;

        int  lSearchRange = Math.Max(0,Math.Max(lLen1,lLen2)/2 - 1);

        // default initialized to false
        bool[] lMatched1 = new bool[lLen1];
        bool[] lMatched2 = new bool[lLen2];

        int lNumCommon = 0;
        for (int i = 0; i < lLen1; ++i) {
            int lStart = Math.Max(0,i-lSearchRange);
            int lEnd = Math.Min(i+lSearchRange+1,lLen2);
            for (int j = lStart; j < lEnd; ++j) {
                if (lMatched2[j]) continue;
                if (aString1[i] != aString2[j])
                    continue;
                lMatched1[i] = true;
                lMatched2[j] = true;
                ++lNumCommon;
                break;
            }
        }
        if (lNumCommon == 0) return 0.0;

        int lNumHalfTransposed = 0;
        int k = 0;
        for (int i = 0; i < lLen1; ++i) {
            if (!lMatched1[i]) continue;
            while (!lMatched2[k]) ++k;
            if (aString1[i] != aString2[k])
                ++lNumHalfTransposed;
            ++k;
        }
        // System.Diagnostics.Debug.WriteLine("numHalfTransposed=" + numHalfTransposed);
        int lNumTransposed = lNumHalfTransposed/2;

        // System.Diagnostics.Debug.WriteLine("numCommon=" + numCommon + " numTransposed=" + numTransposed);
        double lNumCommonD = lNumCommon;
        double lWeight = (lNumCommonD/lLen1
                         + lNumCommonD/lLen2
                         + (lNumCommon - lNumTransposed)/lNumCommonD)/3.0;

        if (lWeight <= mWeightThreshold) return lWeight;
        int lMax = Math.Min(mNumChars,Math.Min(aString1.Length,aString2.Length));
        int lPos = 0;
        while (lPos < lMax && aString1[lPos] == aString2[lPos])
            ++lPos;
        if (lPos == 0) return lWeight;
        return lWeight + 0.1 * lPos * (1.0 - lWeight);

    }


}



回答2:


You can take a look on Lucene.Net ,

it implement Jaro–Winkler distance algorithm ,

and its score is different from which leebickmtu post ,

you can take it as reference

the url is below :

http://lucenenet.apache.org/docs/3.0.3/db/d12/_jaro_winkler_distance_8cs_source.html




回答3:


Use below class to use jaro winkler. i have customized both algorithm jaro and jaro-winkler.

Visit on Github for DLL.

using System;
using System.Linq;

namespace Search
{
    public static class EditDistance
    {
        private struct JaroMetrics
        {
            public int Matches;

            public int Transpositions;
        }

        private static EditDistance.JaroMetrics Matches(string s1, string s2)
        {
            string text;
            string text2;
            if (s1.Length > s2.Length)
            {
                text = s1;
                text2 = s2;
            }
            else
            {
                text = s2;
                text2 = s1;
            }
            int num = Math.Max(text.Length / 2 - 1, 0);
            int[] array = new int[text2.Length];
            int i;
            for (i = 0; i < array.Length; i++)
            {
                array[i] = -1;
            }
            bool[] array2 = new bool[text.Length];
            int num2 = 0;
            for (int j = 0; j < text2.Length; j++)
            {
                char c = text2[j];
                int k = Math.Max(j - num, 0);
                int num3 = Math.Min(j + num + 1, text.Length);
                while (k < num3)
                {
                    if (!array2[k] && c == text[k])
                    {
                        array[j] = k;
                        array2[k] = true;
                        num2++;
                        break;
                    }
                    k++;
                }
            }
            char[] array3 = new char[num2];
            char[] ms2 = new char[num2];
            i = 0;
            int num4 = 0;
            while (i < text2.Length)
            {
                if (array[i] != -1)
                {
                    array3[num4] = text2[i];
                    num4++;
                }
                i++;
            }
            i = 0;
            num4 = 0;
            while (i < text.Length)
            {
                if (array2[i])
                {
                    ms2[num4] = text[i];
                    num4++;
                }
                i++;
            }
            int num5 = array3.Where((char t, int mi) => t != ms2[mi]).Count<char>();
            EditDistance.JaroMetrics result;
            result.Matches = num2;
            result.Transpositions = num5 / 2;
            return result;
        }



        public static float JaroWinkler(this string s1, string s2, float prefixScale, float boostThreshold)
        {
            prefixScale = ((prefixScale > 0.25f) ? 0.25f : prefixScale);
            prefixScale = ((prefixScale < 0f) ? 0f : prefixScale);
            float num = s1.Jaro(s2);
            int num2 = 0;
            for (int i = 0; i < Math.Min(s1.Length, s2.Length); i++)
            {
                if (s1[i] != s2[i])
                {
                    break;
                }
                num2++;
            }
            return (num < boostThreshold) ? num : (num + prefixScale * (float)num2 * (1f - num));
        }

        public static float JaroWinkler(this string s1, string s2, float prefixScale)
        {
            return s1.JaroWinkler(s2, prefixScale, 0.7f);
        }

        public static float JaroWinkler(this string s1, string s2)
        {
            return s1.JaroWinkler(s2, 0.1f, 0.7f);
        }

        public static float Jaro(this string s1, string s2)
        {
            EditDistance.JaroMetrics jaroMetrics = EditDistance.Matches(s1, s2);
            float num = (float)jaroMetrics.Matches;
            int transpositions = jaroMetrics.Transpositions;
            float result;
            if (num == 0f)
            {
                result = 0f;
            }
            else
            {
                float num2 = (num / (float)s1.Length + num / (float)s2.Length + (num - (float)transpositions) / num) / 3f;
                result = num2;
            }
            return result;
        }

        public static int LevenshteinDistance(this string source, string target)
        {
            int result;
            if (string.IsNullOrEmpty(source))
            {
                if (string.IsNullOrEmpty(target))
                {
                    result = 0;
                }
                else
                {
                    result = target.Length;
                }
            }
            else if (string.IsNullOrEmpty(target))
            {
                result = source.Length;
            }
            else
            {
                if (source.Length > target.Length)
                {
                    string text = target;
                    target = source;
                    source = text;
                }
                int length = target.Length;
                int length2 = source.Length;
                int[,] array = new int[2, length + 1];
                for (int i = 1; i <= length; i++)
                {
                    array[0, i] = i;
                }
                int num = 0;
                for (int j = 1; j <= length2; j++)
                {
                    num = (j & 1);
                    array[num, 0] = j;
                    int num2 = num ^ 1;
                    for (int i = 1; i <= length; i++)
                    {
                        int num3 = (target[i - 1] == source[j - 1]) ? 0 : 1;
                        array[num, i] = Math.Min(Math.Min(array[num2, i] + 1, array[num, i - 1] + 1), array[num2, i - 1] + num3);
                    }
                }
                result = array[num, length];
            }
            return result;
        }
    }
}



回答4:


You can use the below code which works very well for all the kind of strings.After getting the result you need to multiply with 100 to get the percentage of similarity. I hope it will solves your problem.

    public class JaroWinkler
{
    private const double defaultMismatchScore = 0.0;
    private const double defaultMatchScore = 1.0;

    /// <summary>
    /// Gets the similarity between two strings by using the Jaro-Winkler algorithm.
    /// A value of 1 means perfect match. A value of zero represents an absolute no match
    /// </summary>
    /// <param name="_firstWord"></param>
    /// <param name="_secondWord"></param>
    /// <returns>a value between 0-1 of the similarity</returns>
    /// 
    public static double RateSimilarity(string _firstWord, string _secondWord)
    {
        // Converting to lower case is not part of the original Jaro-Winkler implementation
        // But we don't really care about case sensitivity in DIAMOND and wouldn't decrease security names similarity rate just because
        // of Case sensitivity
        _firstWord = _firstWord.ToLower();
        _secondWord = _secondWord.ToLower();

        if ((_firstWord != null) && (_secondWord != null))
        {
            if (_firstWord == _secondWord)
                //return (SqlDouble)defaultMatchScore;
                return defaultMatchScore;
            else
            {
                // Get half the length of the string rounded up - (this is the distance used for acceptable transpositions)
                int halfLength = Math.Min(_firstWord.Length, _secondWord.Length) / 2 + 1;

                // Get common characters
                StringBuilder common1 = GetCommonCharacters(_firstWord, _secondWord, halfLength);
                int commonMatches = common1.Length;

                // Check for zero in common
                if (commonMatches == 0)
                    //return (SqlDouble)defaultMismatchScore;
                    return defaultMismatchScore;

                StringBuilder common2 = GetCommonCharacters(_secondWord, _firstWord, halfLength);

                // Check for same length common strings returning 0 if is not the same
                if (commonMatches != common2.Length)
                    //return (SqlDouble)defaultMismatchScore;
                    return defaultMismatchScore;

                // Get the number of transpositions
                int transpositions = 0;
                for (int i = 0; i < commonMatches; i++)
                {
                    if (common1[i] != common2[i])
                        transpositions++;
                }

                int j = 0;
                j += 1;

                // Calculate Jaro metric
                transpositions /= 2;
                double jaroMetric = commonMatches / (3.0 * _firstWord.Length) + commonMatches / (3.0 * _secondWord.Length) + (commonMatches - transpositions) / (3.0 * commonMatches);
                //return (SqlDouble)jaroMetric;
                return jaroMetric;
            }
        }

        //return (SqlDouble)defaultMismatchScore;
        return defaultMismatchScore;
    }

    /// <summary>
    /// Returns a string buffer of characters from string1 within string2 if they are of a given
    /// distance seperation from the position in string1.
    /// </summary>
    /// <param name="firstWord">string one</param>
    /// <param name="secondWord">string two</param>
    /// <param name="separationDistance">separation distance</param>
    /// <returns>A string buffer of characters from string1 within string2 if they are of a given
    /// distance seperation from the position in string1</returns>
    private static StringBuilder GetCommonCharacters(string firstWord, string secondWord, int separationDistance)
    {
        if ((firstWord != null) && (secondWord != null))
        {
            StringBuilder returnCommons = new StringBuilder(20);
            StringBuilder copy = new StringBuilder(secondWord);
            int firstWordLength = firstWord.Length;
            int secondWordLength = secondWord.Length;

            for (int i = 0; i < firstWordLength; i++)
            {
                char character = firstWord[i];
                bool found = false;

                for (int j = Math.Max(0, i - separationDistance); !found && j < Math.Min(i + separationDistance, secondWordLength); j++)
                {
                    if (copy[j] == character)
                    {
                        found = true;
                        returnCommons.Append(character);
                        copy[j] = '#';
                    }
                }
            }
            return returnCommons;
        }
        return null;
    }
}


来源:https://stackoverflow.com/questions/19123506/jaro-winkler-distance-algorithm-in-c-sharp

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!