I have a database table that has a column of SQLServer Soundex encoded last name + first name. In my C# program I would like to convert a string using soundex for use in my query.
Is there either a standard string function for Soundex in the .NET library, or is there an open source library that implements it (perhaps as an extension method on string)?
I know this is late, but I also needed something similar (though no database involved), and the only answer isn't accurate (fails for 'Tymczak' and 'Pfister').
This is what I came up with:
/// <summary>
/// Smoke-test driver: checks Soundex.Generate against a list of known name/code pairs.
/// </summary>
class Program
{
    /// <summary>
    /// Runs every check in order. Each assertion passes the expected code first and the
    /// generated code second, so a failing check reports the two values the right way round.
    /// </summary>
    public static void Main(string[] args)
    {
        // Single letter is padded with zeros.
        Assert.AreEqual("H000", Soundex.Generate("H"));

        // Names that should share a code, and near misses that should not.
        Assert.AreEqual("R163", Soundex.Generate("Robert"));
        Assert.AreEqual("R163", Soundex.Generate("Rupert"));
        Assert.AreEqual("R150", Soundex.Generate("Rubin"));

        // Same-coded consonants separated by H collapse to one digit.
        Assert.AreEqual("A261", Soundex.Generate("Ashcraft"));
        Assert.AreEqual("A261", Soundex.Generate("Ashcroft"));

        // Same-coded consonants separated by a vowel are both kept.
        Assert.AreEqual("T522", Soundex.Generate("Tymczak"));

        // Second letter in the same class as the first letter is dropped.
        Assert.AreEqual("P236", Soundex.Generate("Pfister"));

        Assert.AreEqual("G362", Soundex.Generate("Gutierrez"));
        Assert.AreEqual("J250", Soundex.Generate("Jackson"));
        Assert.AreEqual("V532", Soundex.Generate("VanDeusen"));
        Assert.AreEqual("D250", Soundex.Generate("Deusen"));
        Assert.AreEqual("S630", Soundex.Generate("Sword"));
        Assert.AreEqual("S630", Soundex.Generate("Sord"));

        // Non-letters are ignored.
        Assert.AreEqual("L230", Soundex.Generate("Log-out"));
        Assert.AreEqual("L230", Soundex.Generate("Logout"));

        // Inputs without any letters yield the empty code.
        Assert.AreEqual(Soundex.Empty, Soundex.Generate("123"));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(""));
        Assert.AreEqual(Soundex.Empty, Soundex.Generate(null));
    }
}
/// <summary>
/// Generates four-character Soundex codes: the first letter of the input followed by
/// three digits describing the consonant sounds that follow it.
/// </summary>
public static class Soundex
{
    /// <summary>Code returned when the input contains no letters A-Z.</summary>
    public const string Empty = "0000";

    // Matches every character that is not an upper-case ASCII letter.
    private static readonly Regex Sanitiser = new Regex(@"[^A-Z]", RegexOptions.Compiled);

    // Matches an optional digit, repeats of that digit, and any number of H/W runs each
    // optionally followed by more repeats of the digit. Replacing the match with "$1"
    // collapses the run to a single digit and drops the H/W characters. When no digit was
    // captured the back-reference cannot match, so the pattern only consumes (and removes)
    // the H/W run. Looping the "[WH]+\1*" part handles chains such as 2H2H2 in one match.
    private static readonly Regex CollapseRepeatedNumbers = new Regex(@"(\d)?\1*(?:[WH]+\1*)*", RegexOptions.Compiled);

    // Matches the letters that carry no code but still separate repeated digits.
    private static readonly Regex RemoveVowelSounds = new Regex(@"[AEIOUY]", RegexOptions.Compiled);

    /// <summary>
    /// Computes the Soundex code of <paramref name="Phrase"/>.
    /// </summary>
    /// <param name="Phrase">Text to encode; may be null. Characters other than A-Z (after upper-casing) are ignored.</param>
    /// <returns>A letter followed by three digits, or <see cref="Empty"/> when no letters remain.</returns>
    public static string Generate(string Phrase)
    {
        // Upper-case with the invariant culture: a culture-sensitive ToUpper() can map 'i'
        // to a non-ASCII capital (e.g. under tr-TR), which the sanitiser would then strip.
        string letters = Sanitiser.Replace((Phrase ?? string.Empty).ToUpperInvariant(), string.Empty);

        // Nothing left to encode.
        if (letters.Length == 0)
        {
            return Empty;
        }

        char initial = letters[0];

        // Replace coded consonants by their digit; vowels, H and W stay as letters for now.
        string digits = Numify(letters);

        // Collapse letters of the same sound class, even when separated by H or W.
        digits = CollapseRepeatedNumbers.Replace(digits, "$1");

        // The first position is represented by the retained letter, so drop its own
        // encoding (a digit, or the letter itself when it is a vowel).
        if (digits.Length > 0 && digits[0] == Numify(initial))
        {
            digits = digits.Substring(1);
        }

        // Vowels have done their job as separators and can go now.
        digits = RemoveVowelSounds.Replace(digits, string.Empty);

        // Concatenate, pad and trim to the X### format.
        return (initial + digits).PadRight(4, '0').Substring(0, 4);
    }

    // Maps every character of the phrase through Numify(char).
    private static string Numify(string Phrase)
    {
        char[] mapped = Phrase.ToCharArray();
        for (int i = 0; i < mapped.Length; i++)
        {
            mapped[i] = Numify(mapped[i]);
        }

        return new string(mapped);
    }

    // Returns the Soundex digit for a coded consonant; any other character is returned unchanged.
    private static char Numify(char Character)
    {
        switch (Character)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return Character;
        }
    }
}
Based on the answer of Dotnet Services and tigrou, I have corrected the algorithm in order to reflect the function described in Wikipedia.
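Test cases such as Ashcraft = A226, Tymczak = T522, Pfister = P236 and Honeyman = H555 are now working correctly. (Note: A226 results from treating H and W like vowels, i.e. as separators; under the standard American Soundex rule that H and W do not separate letters with the same code, Ashcraft encodes to A261.)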
/// <summary>
/// Computes a four-character Soundex code: the first character of <paramref name="data"/>
/// (upper-cased) followed by up to three digits, right-padded with '0'.
/// </summary>
/// <remarks>
/// Every character without a code (vowels, H, W and non-letters alike) resets the
/// "previous code", so two same-coded consonants separated by any such character are both
/// emitted. "Ashcraft" therefore encodes to A226 with this implementation.
/// </remarks>
/// <param name="data">Text to encode; null or empty yields "0000".</param>
/// <returns>A string of exactly four characters.</returns>
public static string Soundex(string data)
{
    const char NoCode = '\0';
    StringBuilder result = new StringBuilder(4);

    if (!string.IsNullOrEmpty(data))
    {
        // Keep the initial character. Invariant casing avoids culture-specific mappings
        // (e.g. 'i' becoming a dotted capital I under tr-TR).
        result.Append(char.ToUpperInvariant(data[0]));

        char previousCode = NoCode;

        // Start at 0 so the first letter's code can suppress a same-coded second letter ("Pf...").
        for (int i = 0; i < data.Length && result.Length < 4; i++)
        {
            char currentCode;
            switch (char.ToLowerInvariant(data[i]))
            {
                case 'b': case 'f': case 'p': case 'v':
                    currentCode = '1';
                    break;
                case 'c': case 'g': case 'j': case 'k': case 'q': case 's': case 'x': case 'z':
                    currentCode = '2';
                    break;
                case 'd': case 't':
                    currentCode = '3';
                    break;
                case 'l':
                    currentCode = '4';
                    break;
                case 'm': case 'n':
                    currentCode = '5';
                    break;
                case 'r':
                    currentCode = '6';
                    break;
                default:
                    currentCode = NoCode;
                    break;
            }

            // The first letter is already in the result, so its code is never appended.
            if (i > 0 && currentCode != NoCode && currentCode != previousCode)
            {
                result.Append(currentCode);
            }

            // Always remember the previous code, even when it is "no code".
            previousCode = currentCode;
        }
    }

    if (result.Length < 4)
    {
        result.Append('0', 4 - result.Length);
    }

    return result.ToString();
}
According to the algorithm described in wikipedia
/// <summary>
/// Computes the American Soundex code of <paramref name="word"/>: its first character
/// (upper-cased) followed by three digits.
/// </summary>
/// <remarks>
/// Rules applied to the characters after the first:
/// consecutive letters with the same code are emitted once; H and W (and any character
/// outside A-Z) are skipped without separating equal codes; vowels (A, E, I, O, U, Y) are
/// not emitted but do separate equal codes. The first letter's own code suppresses an
/// immediately following letter of the same class (e.g. "Pfister" gives P236).
/// </remarks>
/// <param name="word">Text to encode; null or empty yields "0000".</param>
/// <returns>A string of exactly four characters.</returns>
private string Soundex(string word)
{
    if (string.IsNullOrEmpty(word))
    {
        return "0000";
    }

    // Digit for each letter A..Z; '0' marks letters that carry no code.
    const string Codes = "01230120022455012623010202";

    // Invariant casing keeps the result independent of the current culture.
    string upper = word.ToUpperInvariant();

    char[] code = { upper[0], '0', '0', '0' };
    int filled = 1;

    // Seed with the first letter's code so a same-class second letter is not repeated.
    char previous = (upper[0] >= 'A' && upper[0] <= 'Z') ? Codes[upper[0] - 'A'] : '0';

    for (int i = 1; i < upper.Length && filled < code.Length; i++)
    {
        char letter = upper[i];

        // H, W and non-letters are transparent: they neither emit nor separate.
        if (letter == 'H' || letter == 'W' || letter < 'A' || letter > 'Z')
        {
            continue;
        }

        char digit = Codes[letter - 'A'];
        if (digit != '0' && digit != previous)
        {
            code[filled++] = digit;
        }

        // Vowels set previous to '0', which lets the same code be emitted again after them.
        previous = digit;
    }

    return new string(code);
}
You could use something like this in c# per SQL
/// <summary>
/// Computes a four-character Soundex code: the first character of <paramref name="data"/>
/// (upper-cased) followed by up to three digits, right-padded with '0'.
/// </summary>
/// <remarks>
/// Consecutive letters with the same code are emitted once. Vowels (A, E, I, O, U, Y) are
/// not emitted but separate equal codes, so "Tymczak" gives T522. H, W and non-letters are
/// skipped without separating equal codes, so "Ashcraft" gives A261. The first letter's own
/// code suppresses a same-class second letter, so "Pfister" gives P236.
/// </remarks>
/// <param name="data">Text to encode; null or empty yields "0000".</param>
/// <returns>A string of exactly four characters.</returns>
public static string Soundex(string data)
{
    const char NoCode = '\0';
    StringBuilder result = new StringBuilder(4);

    if (!string.IsNullOrEmpty(data))
    {
        // Invariant casing keeps the first character independent of the current culture.
        result.Append(char.ToUpperInvariant(data[0]));

        char previousCode = NoCode;

        // Index 0 is visited only to seed previousCode with the first letter's code.
        for (int i = 0; i < data.Length && result.Length < 4; i++)
        {
            char currentCode;
            switch (char.ToLowerInvariant(data[i]))
            {
                case 'b': case 'f': case 'p': case 'v':
                    currentCode = '1';
                    break;
                case 'c': case 'g': case 'j': case 'k': case 'q': case 's': case 'x': case 'z':
                    currentCode = '2';
                    break;
                case 'd': case 't':
                    currentCode = '3';
                    break;
                case 'l':
                    currentCode = '4';
                    break;
                case 'm': case 'n':
                    currentCode = '5';
                    break;
                case 'r':
                    currentCode = '6';
                    break;
                case 'a': case 'e': case 'i': case 'o': case 'u': case 'y':
                    // A vowel separates repeated codes: forget the previous one.
                    previousCode = NoCode;
                    continue;
                default:
                    // H, W and non-letters: ignored, previous code is retained.
                    continue;
            }

            if (i > 0 && currentCode != previousCode)
            {
                result.Append(currentCode);
            }

            previousCode = currentCode;
        }
    }

    if (result.Length < 4)
    {
        result.Append('0', 4 - result.Length);
    }

    return result.ToString();
}
来源:https://stackoverflow.com/questions/11121936/dotnet-soundex-function