How can I convert this string:
This string contains the Unicode character Pi(π)
into an escaped A
To store actual Unicode codepoints, you have to first decode the String's UTF-16 codeunits to UTF-32 codeunits (which are currently the same as the Unicode codepoints). Use System.Text.Encoding.UTF32.GetBytes()
for that, and then write the resulting bytes to the StringBuilder
as needed,i.e.
static void Main(string[] args)
{
String originalString = "This string contains the unicode character Pi(π)";
Byte[] bytes = Encoding.UTF32.GetBytes(originalString);
StringBuilder asAscii = new StringBuilder();
for (int idx = 0; idx < bytes.Length; idx += 4)
{
uint codepoint = BitConverter.ToUInt32(bytes, idx);
if (codepoint <= 127)
asAscii.Append(Convert.ToChar(codepoint));
else
asAscii.AppendFormat("\\u{0:x4}", codepoint);
}
Console.WriteLine("Final string: {0}", asAscii);
Console.ReadKey();
}
A small patch to @Adam Sills's answer which solves FormatException
on cases where the input string like "c:\u00ab\otherdirectory\" plus RegexOptions.Compiled
makes the Regex
compilation much faster:
private static Regex DECODING_REGEX = new Regex(@"\\u(?<Value>[a-fA-F0-9]{4})", RegexOptions.Compiled);
private const string PLACEHOLDER = @"#!#";
public static string DecodeEncodedNonAsciiCharacters(this string value)
{
return DECODING_REGEX.Replace(
value.Replace(@"\\", PLACEHOLDER),
m => {
return ((char)int.Parse(m.Groups["Value"].Value, NumberStyles.HexNumber)).ToString(); })
.Replace(PLACEHOLDER, @"\\");
}
Here is my current implementation:
public static class UnicodeStringExtensions
{
public static string EncodeNonAsciiCharacters(this string value) {
var bytes = Encoding.Unicode.GetBytes(value);
var sb = StringBuilderCache.Acquire(value.Length);
bool encodedsomething = false;
for (int i = 0; i < bytes.Length; i += 2) {
var c = BitConverter.ToUInt16(bytes, i);
if ((c >= 0x20 && c <= 0x7f) || c == 0x0A || c == 0x0D) {
sb.Append((char) c);
} else {
sb.Append($"\\u{c:x4}");
encodedsomething = true;
}
}
if (!encodedsomething) {
StringBuilderCache.Release(sb);
return value;
}
return StringBuilderCache.GetStringAndRelease(sb);
}
public static string DecodeEncodedNonAsciiCharacters(this string value)
=> Regex.Replace(value,/*language=regexp*/@"(?:\\u[a-fA-F0-9]{4})+", Decode);
static readonly string[] Splitsequence = new [] { "\\u" };
private static string Decode(Match m) {
var bytes = m.Value.Split(Splitsequence, StringSplitOptions.RemoveEmptyEntries)
.Select(s => ushort.Parse(s, NumberStyles.HexNumber)).SelectMany(BitConverter.GetBytes).ToArray();
return Encoding.Unicode.GetString(bytes);
}
}
This passes a test:
public void TestBigUnicode() {
var s = "\U00020000";
var encoded = s.EncodeNonAsciiCharacters();
var decoded = encoded.DecodeEncodedNonAsciiCharacters();
Assert.Equals(s, decoded);
}
with the encoded value: "\ud840\udc00"
This implementation makes use of a StringBuilderCache (reference source link)