How do I remove diacritics (accents) from a string in .NET?

前端 未结 20 2860
南方客
南方客 2020-11-21 05:44

I'm trying to convert some strings that are in French Canadian and basically, I'd like to be able to take out the French accent marks in the letters while keeping the letters themselves.

相关标签:
20条回答
  • 2020-11-21 06:08

    I needed something that converts all major Unicode characters, and the top-voted answer left a few out, so I've created a C# version of CodeIgniter's convert_accented_characters($str) that is easily customisable:

    using System;
    using System.Text;
    using System.Collections.Generic;
    
    /// <summary>
    /// Table-driven transliteration of accented Latin, Greek and Cyrillic
    /// characters to plain ASCII text.
    /// </summary>
    public static class Strings
    {
        // Each row is { characters to replace, replacement text }.
        // Rows are consulted in the order listed, so when a character appears in
        // more than one row (for example 'Ä', listed under both "Ae" and "A") the
        // first row wins. An array is used instead of a Dictionary so that this
        // precedence does not depend on unspecified enumeration order.
        static readonly string[,] foreign_characters =
        {
            { "äæǽ", "ae" },
            { "öœ", "oe" },
            { "ü", "ue" },
            { "Ä", "Ae" },
            { "Ü", "Ue" },
            { "Ö", "Oe" },
            { "ÀÁÂÃÄÅǺĀĂĄǍΑΆẢẠẦẪẨẬẰẮẴẲẶА", "A" },
            { "àáâãåǻāăąǎªαάảạầấẫẩậằắẵẳặа", "a" },
            { "Б", "B" },
            { "б", "b" },
            { "ÇĆĈĊČ", "C" },
            { "çćĉċč", "c" },
            { "Д", "D" },
            { "д", "d" },
            { "ÐĎĐΔ", "Dj" },
            { "ðďđδ", "dj" },
            { "ÈÉÊËĒĔĖĘĚΕΈẼẺẸỀẾỄỂỆЕЭ", "E" },
            { "èéêëēĕėęěέεẽẻẹềếễểệеэ", "e" },
            { "Ф", "F" },
            { "ф", "f" },
            { "ĜĞĠĢΓГҐ", "G" },
            { "ĝğġģγгґ", "g" },
            { "ĤĦ", "H" },
            { "ĥħ", "h" },
            { "ÌÍÎÏĨĪĬǏĮİΗΉΊΙΪỈỊИЫ", "I" },
            { "ìíîïĩīĭǐįıηήίιϊỉịиыї", "i" },
            { "Ĵ", "J" },
            { "ĵ", "j" },
            { "ĶΚК", "K" },
            { "ķκк", "k" },
            { "ĹĻĽĿŁΛЛ", "L" },
            { "ĺļľŀłλл", "l" },
            { "М", "M" },
            { "м", "m" },
            { "ÑŃŅŇΝН", "N" },
            // \u0149 is LATIN SMALL LETTER N PRECEDED BY APOSTROPHE; written as an
            // escape so it cannot be confused with an apostrophe followed by 'n'.
            { "ñńņň\u0149νн", "n" },
            { "ÒÓÔÕŌŎǑŐƠØǾΟΌΩΏỎỌỒỐỖỔỘỜỚỠỞỢО", "O" },
            { "òóôõōŏǒőơøǿºοόωώỏọồốỗổộờớỡởợо", "o" },
            { "П", "P" },
            { "п", "p" },
            { "ŔŖŘΡР", "R" },
            { "ŕŗřρр", "r" },
            { "ŚŜŞȘŠΣС", "S" },
            { "śŝşșšſσςс", "s" },
            { "ȚŢŤŦτТ", "T" },
            { "țţťŧт", "t" },
            { "ÙÚÛŨŪŬŮŰŲƯǓǕǗǙǛŨỦỤỪỨỮỬỰУ", "U" },
            { "ùúûũūŭůűųưǔǖǘǚǜυύϋủụừứữửựу", "u" },
            { "ÝŸŶΥΎΫỲỸỶỴЙ", "Y" },
            { "ýÿŷỳỹỷỵй", "y" },
            { "В", "V" },
            { "в", "v" },
            { "Ŵ", "W" },
            { "ŵ", "w" },
            { "ŹŻŽΖЗ", "Z" },
            { "źżžζз", "z" },
            { "ÆǼ", "AE" },
            { "ß", "ss" },
            // \u0132 / \u0133 are the single-character IJ / ij ligatures. They are
            // written as escapes so the keys cannot be mistaken for (or silently
            // normalised into) the plain letter pairs, which would make every
            // ordinary 'I', 'J', 'i' and 'j' expand to "IJ" / "ij".
            { "\u0132", "IJ" },
            { "\u0133", "ij" },
            { "Œ", "OE" },
            { "ƒ", "f" },
            { "ξ", "ks" },
            { "π", "p" },
            { "β", "v" },
            { "μ", "m" },
            { "ψ", "ps" },
            { "Ё", "Yo" },
            { "ё", "yo" },
            { "Є", "Ye" },
            { "є", "ye" },
            { "Ї", "Yi" },
            { "Ж", "Zh" },
            { "ж", "zh" },
            { "Х", "Kh" },
            { "х", "kh" },
            { "Ц", "Ts" },
            { "ц", "ts" },
            { "Ч", "Ch" },
            { "ч", "ch" },
            { "Ш", "Sh" },
            { "ш", "sh" },
            { "Щ", "Shch" },
            { "щ", "shch" },
            { "ЪъЬь", "" },
            { "Ю", "Yu" },
            { "ю", "yu" },
            { "Я", "Ya" },
            { "я", "ya" },
        };

        // Per-character view of the table, built once so a lookup does not have
        // to scan every row for every input character.
        static readonly Dictionary<char, string> replacements = BuildReplacements();

        // Flattens the row table into a char -> replacement map, keeping the
        // first mapping seen for each character.
        static Dictionary<char, string> BuildReplacements()
        {
            Dictionary<char, string> map = new Dictionary<char, string>();
            int rows = foreign_characters.GetLength(0);
            for (int row = 0; row < rows; row++)
            {
                string replacement = foreign_characters[row, 1];
                foreach (char source in foreign_characters[row, 0])
                {
                    // Some rows repeat a character and some characters occur in
                    // several rows; earlier entries take precedence.
                    if (!map.ContainsKey(source))
                    {
                        map.Add(source, replacement);
                    }
                }
            }
            return map;
        }

        /// <summary>
        /// Returns the first character of the replacement for <paramref name="c"/>.
        /// </summary>
        /// <param name="c">The character to transliterate.</param>
        /// <returns>
        /// The first character of the mapped replacement (so 'ß' yields 's').
        /// <paramref name="c"/> is returned unchanged when it is not in the table
        /// or when its replacement is empty, because a single char cannot
        /// represent "removed".
        /// </returns>
        public static char RemoveDiacritics(this char c)
        {
            string replacement;
            if (replacements.TryGetValue(c, out replacement) && replacement.Length > 0)
            {
                return replacement[0];
            }
            return c;
        }

        /// <summary>
        /// Replaces every character found in the table with its replacement text
        /// and copies all other characters through unchanged.
        /// </summary>
        /// <param name="s">The text to transliterate.</param>
        /// <returns>
        /// The transliterated text. Characters mapped to an empty replacement
        /// (the Cyrillic hard and soft signs) are dropped.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
        public static string RemoveDiacritics(this string s)
        {
            if (s == null)
            {
                throw new ArgumentNullException("s");
            }

            StringBuilder result = new StringBuilder(s.Length);
            foreach (char c in s)
            {
                string replacement;
                if (replacements.TryGetValue(c, out replacement))
                {
                    // An empty replacement legitimately appends nothing.
                    result.Append(replacement);
                }
                else
                {
                    result.Append(c);
                }
            }
            return result.ToString();
        }
    }
    

    Usage

    // String overload: every character found in the table is replaced.
    string plainText = "crème brûlée".RemoveDiacritics(); // creme brulee

    // Char overload: works on a single character.
    char plainChar = "Ã"[0].RemoveDiacritics(); // A
    
    0 讨论(0)
  • 2020-11-21 06:08

    I really like the concise and functional code provided by azrafe7. So, I have changed it a little bit to convert it to an extension method:

    public static class StringExtensions
    {
        /// <summary>
        /// Removes diacritical marks (accents) from <paramref name="text"/> while
        /// keeping the base letters.
        /// </summary>
        /// <remarks>
        /// The text is decomposed (Unicode form D) so each accented letter becomes
        /// a base letter followed by combining marks; the marks are dropped and
        /// the remainder is recomposed (form C). This avoids depending on a legacy
        /// code page's best-fit mapping and leaves characters without a
        /// decomposition untouched instead of turning them into '?'.
        /// </remarks>
        /// <param name="text">The text to process; may be null.</param>
        /// <returns>
        /// The text without non-spacing marks, or <see cref="string.Empty"/> when
        /// <paramref name="text"/> is null or empty.
        /// </returns>
        public static string RemoveDiacritics(this string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return string.Empty;
            }

            string decomposed = text.Normalize(System.Text.NormalizationForm.FormD);
            System.Text.StringBuilder kept = new System.Text.StringBuilder(decomposed.Length);

            foreach (char c in decomposed)
            {
                // Combining accents are classified as non-spacing marks once the
                // text has been decomposed; everything else is copied through.
                if (System.Globalization.CharUnicodeInfo.GetUnicodeCategory(c)
                    != System.Globalization.UnicodeCategory.NonSpacingMark)
                {
                    kept.Append(c);
                }
            }

            return kept.ToString().Normalize(System.Text.NormalizationForm.FormC);
        }
    }
    
    0 讨论(0)
  • 2020-11-21 06:10

    this did the trick for me...

    // Sample input; replace with the text to convert. The variable must be
    // assigned before it is read, otherwise the snippet does not compile.
    string accentedStr = "crème brûlée";

    // Encode into the single-byte ISO-8859-8 code page.
    // NOTE(review): this relies on the encoder substituting the base letter for
    // each accented letter, and on this code page being available on the target
    // runtime (newer runtimes presumably need CodePagesEncodingProvider
    // registered first) - confirm before relying on it.
    byte[] tempBytes = System.Text.Encoding.GetEncoding("ISO-8859-8").GetBytes(accentedStr);

    // Decode as ASCII, matching the intent of the result: these bytes are not
    // UTF-8, so decoding them as UTF-8 would misread any byte above 0x7F.
    string asciiStr = System.Text.Encoding.ASCII.GetString(tempBytes);
    

    quick&short!

    0 讨论(0)
  • 2020-11-21 06:10

    Try HelperSharp package.

    There is a method RemoveAccents:

     /// <summary>
     /// Converts <paramref name="source"/> to 7-bit text by round-tripping it
     /// through code page 1251 and ASCII, then replaces every character that is
     /// not a letter, a digit or one of <c>= - _ /</c> with a space.
     /// </summary>
     /// <param name="source">The text to convert.</param>
     /// <returns>The converted text.</returns>
     public static string RemoveAccents(this string source)
     {
         // 8 bit characters
         // NOTE(review): assumes code page 1251 is available on the target
         // runtime and maps accented letters to their base letters - confirm.
         byte[] singleByte = Encoding.GetEncoding(1251).GetBytes(source);

         // 7 bit characters
         string ascii = Encoding.ASCII.GetString(singleByte);

         // The previous pattern "[^a-zA-Z0-9]=-_/" closed the character class
         // too early, so it only matched a non-alphanumeric character followed
         // by the literal text "=-_/". The allowed punctuation now sits inside
         // the class, with '-' escaped so it is not read as a range.
         return Regex.Replace(ascii, @"[^a-zA-Z0-9=\-_/]", " ");
     }
    
    0 讨论(0)
  • 2020-11-21 06:11

    Popping this library here in case you haven't already considered it. It looks like it comes with a full range of unit tests.

    https://github.com/thomasgalliker/Diacritics.NET

    0 讨论(0)
  • 2020-11-21 06:14

    You can use the string extension from the MMLib.Extensions NuGet package:

    using MMLib.RapidPrototyping.Generators;
    /// <summary>
    /// Checks that RemoveDiacritics turns an accented sample string into its
    /// unaccented form.
    /// </summary>
    public void ExtensionsExample()
    {
      const string expected = "aacceeii";
      string accented = "aácčeéií";

      string actual = accented.RemoveDiacritics();

      Assert.AreEqual(expected, actual);
    } 
    

    NuGet page: https://www.nuget.org/packages/MMLib.Extensions/ — CodePlex project site: https://mmlib.codeplex.com/

    0 讨论(0)
提交回复
热议问题