DotNet Soundex Function

删除回忆录丶 提交于 2019-11-30 16:03:10

I know this is late, but I also needed something similar (though no database involved), and the only answer isn't accurate (fails for 'Tymczak' and 'Pfister').

This is what I came up with:

/// <summary>
/// Minimal driver that checks <c>Soundex.Generate</c> against a set of
/// well-known names and degenerate inputs.
/// </summary>
class Program
{
    /// <summary>
    /// Runs every check in sequence. Each assertion passes the expected code
    /// first and the computed code second, which is the argument order
    /// <c>Assert.AreEqual</c> uses when it reports a mismatch.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(string[] args)
    {
        // Single letter: padded with zeros.
        Assert.AreEqual("H000", Soundex.Generate("H"));

        // Names that differ only in vowels / same-class consonants share a code.
        Assert.AreEqual("R163", Soundex.Generate("Robert"));
        Assert.AreEqual("R163", Soundex.Generate("Rupert"));
        Assert.AreEqual("R150", Soundex.Generate("Rubin"));

        // H between two letters of the same class does not separate them.
        Assert.AreEqual("A261", Soundex.Generate("Ashcraft"));
        Assert.AreEqual("A261", Soundex.Generate("Ashcroft"));

        // A vowel between two letters of the same class does separate them.
        Assert.AreEqual("T522", Soundex.Generate("Tymczak"));

        // Second letter in the same class as the first letter is not coded.
        Assert.AreEqual("P236", Soundex.Generate("Pfister"));

        Assert.AreEqual("G362", Soundex.Generate("Gutierrez"));
        Assert.AreEqual("J250", Soundex.Generate("Jackson"));
        Assert.AreEqual("V532", Soundex.Generate("VanDeusen"));
        Assert.AreEqual("D250", Soundex.Generate("Deusen"));

        // W is ignored.
        Assert.AreEqual("S630", Soundex.Generate("Sword"));
        Assert.AreEqual("S630", Soundex.Generate("Sord"));

        // Non-letters are stripped before encoding.
        Assert.AreEqual("L230", Soundex.Generate("Log-out"));
        Assert.AreEqual("L230", Soundex.Generate("Logout"));

        // Inputs without any letters yield the empty code.
        Assert.AreEqual(Soundex.Empty, Soundex.Generate("123"));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(""));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(null));
    }
}

/// <summary>
/// Generates four-character Soundex codes (one letter followed by three
/// digits) for arbitrary text.
/// </summary>
public static class Soundex
{
    /// <summary>Code returned when the input contains no letters A-Z.</summary>
    public const string Empty = "0000";

    // Anything that is not an upper-case ASCII letter is discarded up front.
    private static readonly Regex Sanitiser = new Regex(@"[^A-Z]", RegexOptions.Compiled);

    // A digit followed by one or more repeats of itself, each repeat optionally
    // preceded by H/W. The whole run collapses to a single digit, no matter
    // how many H/W-separated repeats it contains (e.g. "2H2H2" -> "2").
    private static readonly Regex CollapseRepeatedNumbers = new Regex(@"(\d)(?:[WH]*\1)+", RegexOptions.Compiled);

    // Letters that carry no code: the vowels plus H and W.
    private static readonly Regex RemoveUncodedLetters = new Regex(@"[AEIOUYHW]", RegexOptions.Compiled);

    /// <summary>
    /// Computes the Soundex code of <paramref name="Phrase"/>.
    /// </summary>
    /// <param name="Phrase">Text to encode; may be null or empty.</param>
    /// <returns>
    /// The first letter of the sanitised input followed by three digits, or
    /// <see cref="Empty"/> when the input contains no letters A-Z.
    /// </returns>
    public static string Generate(string Phrase)
    {
        // Upper-case with the invariant culture so the result does not depend
        // on the current thread culture (e.g. Turkish maps 'i' to a dotted
        // capital that the sanitiser would then throw away), then remove
        // non-alphas.
        Phrase = Sanitiser.Replace((Phrase ?? string.Empty).ToUpperInvariant(), string.Empty);

        // Nothing to soundex, return empty
        if (Phrase.Length == 0)
        {
            return Empty;
        }

        // Convert consonants to their numerical representation; vowels, H and
        // W stay as letters for now because they influence collapsing.
        var Numified = Numify(Phrase);

        // Remove repeated numerics (characters of the same sound class), even
        // if separated by H or W. Vowels still separate repeats.
        Numified = CollapseRepeatedNumbers.Replace(Numified, "$1");

        // Position 0 still represents the first letter (collapsing keeps the
        // first character of every run), and that letter is emitted verbatim
        // below, so its code/letter is dropped here.
        Numified = Numified.Substring(1);

        // Remove the letters that carry no code.
        Numified = RemoveUncodedLetters.Replace(Numified, string.Empty);

        // Concatenate, pad and trim to ensure X### format.
        return (Phrase[0] + Numified).PadRight(4, '0').Substring(0, 4);
    }

    /// <summary>Maps every character of <paramref name="Phrase"/> through the per-character overload.</summary>
    private static string Numify(string Phrase)
    {
        return new string(Phrase.Select(c => Numify(c)).ToArray());
    }

    /// <summary>
    /// Returns the Soundex digit for a coded consonant, or the character
    /// itself when it has no code.
    /// </summary>
    private static char Numify(char Character)
    {
        switch (Character)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return Character;
        }
    }
}

Based on the answer of Dotnet Services and tigrou, I have corrected the algorithm in order to reflect the function described in Wikipedia.

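Test cases such as Ashcraft = A226, Tymczak = T522, Pfister = P236 and Honeyman = H555 are now working correctly. (Note: A226 for Ashcraft is the result of the variant in which H and W separate letters of the same code; under the rule described on Wikipedia, where H and W are ignored between two letters of the same code, Ashcraft is A261.)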

/// <summary>
/// Computes a four-character Soundex-style code: the first character of
/// <paramref name="data"/> (upper-cased) followed by up to three digits,
/// padded with '0'.
/// </summary>
/// <remarks>
/// Every character without a code (vowels, H, W and non-letters) resets the
/// remembered code, so two same-class consonants separated by any such
/// character are both coded. As a consequence "Ashcraft" yields A226.
/// Casing uses the invariant culture so the result does not depend on the
/// current thread culture.
/// </remarks>
/// <param name="data">Text to encode; may be null or empty.</param>
/// <returns>The code, or "0000" when <paramref name="data"/> is null or empty.</returns>
public static string Soundex(string data)
{
    StringBuilder result = new StringBuilder(4);

    if (!string.IsNullOrEmpty(data))
    {
        // '\0' stands for "no code".
        char previousCode = '\0';

        // Keep the initial character.
        result.Append(char.ToUpperInvariant(data[0]));

        // Start at 0 so the first letter's code is remembered and an
        // immediately following letter of the same class ("Pf...") is skipped.
        for (int i = 0; i < data.Length && result.Length < 4; i++)
        {
            char currentCode;
            switch (char.ToLowerInvariant(data[i]))
            {
                case 'b': case 'f': case 'p': case 'v':
                    currentCode = '1';
                    break;
                case 'c': case 'g': case 'j': case 'k': case 'q': case 's': case 'x': case 'z':
                    currentCode = '2';
                    break;
                case 'd': case 't':
                    currentCode = '3';
                    break;
                case 'l':
                    currentCode = '4';
                    break;
                case 'm': case 'n':
                    currentCode = '5';
                    break;
                case 'r':
                    currentCode = '6';
                    break;
                default:
                    currentCode = '\0';
                    break;
            }

            // The first letter is already in the result, so its code is never appended.
            if (i > 0 && currentCode != '\0' && currentCode != previousCode)
            {
                result.Append(currentCode);
            }

            // Always retain the previous code, even when it is "no code".
            previousCode = currentCode;
        }
    }

    return result.ToString().PadRight(4, '0');
}

According to the algorithm described in Wikipedia:

    /// <summary>
    /// Computes the four-character Soundex code of <paramref name="word"/>
    /// using a sequence of regular-expression replacements.
    /// </summary>
    /// <remarks>
    /// Rules applied: the first letter is kept; consonants map to digits 1-6;
    /// adjacent letters with the same code (including the first letter and
    /// letters separated only by H or W) are coded once; vowels separate
    /// repeats and are then dropped; the result is padded with '0' or
    /// truncated to four characters.
    /// </remarks>
    /// <param name="word">Text to encode; may be null or empty.</param>
    /// <returns>The code, or "0000" when the input contains no letters A-Z.</returns>
    private string Soundex(string word)
    {
        // Upper-case independently of the current culture and keep only A-Z,
        // so punctuation and digits can never leak into the code.
        word = Regex.Replace((word ?? "").ToUpperInvariant(), "[^A-Z]", "");
        if (word.Length == 0)
        {
            return "0000";
        }

        // Encode every consonant, including the first letter: its code is
        // needed to suppress an immediately following letter of the same class.
        string codes = word;
        codes = Regex.Replace(codes, "[BFPV]", "1");
        codes = Regex.Replace(codes, "[CGJKQSXZ]", "2");
        codes = Regex.Replace(codes, "[DT]", "3");
        codes = Regex.Replace(codes, "L", "4");
        codes = Regex.Replace(codes, "[MN]", "5");
        codes = Regex.Replace(codes, "R", "6");

        // H and W do not separate equal codes, so drop them before collapsing.
        // Position 0 is left alone so it keeps standing for the first letter.
        codes = codes[0] + Regex.Replace(codes.Substring(1), "[HW]", "");

        // Collapse runs of the same digit. Vowels are still present here, so
        // equal codes separated by a vowel stay separate.
        codes = Regex.Replace(codes, @"(\d)\1+", "$1");

        // Drop the first position (emitted as a letter below), then the vowels.
        codes = Regex.Replace(codes.Substring(1), "[AEIOUY]", "");

        return (word[0] + codes).PadRight(4, '0').Substring(0, 4);
    }

You could use something like this in C#, mirroring SQL's SOUNDEX function:

/// <summary>
/// Computes the four-character Soundex code of <paramref name="data"/>:
/// the first character (upper-cased) followed by up to three digits,
/// padded with '0'.
/// </summary>
/// <remarks>
/// The first letter's own code is remembered so that a following letter of
/// the same class is not coded ("Pfister" -> P236). Vowels reset the
/// remembered code, so same-class consonants separated by a vowel are coded
/// twice ("Tymczak" -> T522). H, W and non-letters are transparent: they
/// neither produce a digit nor reset the remembered code
/// ("Ashcraft" -> A261). Casing uses the invariant culture.
/// </remarks>
/// <param name="data">Text to encode; may be null or empty.</param>
/// <returns>The code, or "0000" when <paramref name="data"/> is null or empty.</returns>
public static string Soundex(string data)
{
    StringBuilder result = new StringBuilder(4);

    if (!string.IsNullOrEmpty(data))
    {
        // '\0' stands for "no code".
        char previousCode = '\0';

        // Keep the initial character.
        result.Append(char.ToUpperInvariant(data[0]));

        // Start at 0 so the first letter's code takes part in the
        // duplicate check for the letter that follows it.
        for (int i = 0; i < data.Length && result.Length < 4; i++)
        {
            char currentCode = '\0';
            bool transparent = false;

            switch (char.ToLowerInvariant(data[i]))
            {
                case 'b': case 'f': case 'p': case 'v':
                    currentCode = '1';
                    break;
                case 'c': case 'g': case 'j': case 'k': case 'q': case 's': case 'x': case 'z':
                    currentCode = '2';
                    break;
                case 'd': case 't':
                    currentCode = '3';
                    break;
                case 'l':
                    currentCode = '4';
                    break;
                case 'm': case 'n':
                    currentCode = '5';
                    break;
                case 'r':
                    currentCode = '6';
                    break;
                case 'a': case 'e': case 'i': case 'o': case 'u': case 'y':
                    // Vowel: no digit, but it separates equal codes.
                    break;
                default:
                    // H, W and anything that is not a letter.
                    transparent = true;
                    break;
            }

            if (i == 0)
            {
                // Already emitted as a letter; only remember its code.
                previousCode = currentCode;
                continue;
            }

            if (transparent)
            {
                // Leave previousCode untouched.
                continue;
            }

            if (currentCode != '\0' && currentCode != previousCode)
            {
                result.Append(currentCode);
            }

            previousCode = currentCode;
        }
    }

    return result.ToString().PadRight(4, '0');
}
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!