Question
A device encodes the string "🤛🏽" as "\uD83E\uDD1B\uD83C\uDFFD". The hexadecimal numbers in that string come from the UTF-16 encoding of the character, while the Unicode code points U+1F91B, U+1F3FD take their numbers from the UTF-32 encoding.
Taking the latter, in Swift we can write a literal like "\u{1F91B}\u{1F3FD}" and we get the character "🤛🏽" as expected.
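Both encodings can be inspected directly in Swift. A minimal check, assuming Foundation is available for String(format:):

import Foundation

let fist = "🤛🏽"
print(fist.unicodeScalars.map { String(format: "U+%X", $0.value) })
// ["U+1F91B", "U+1F3FD"]  (the UTF-32 code points)
print(fist.utf16.map { String(format: "%04X", $0) })
// ["D83E", "DD1B", "D83C", "DFFD"]  (the UTF-16 code units)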
How can I convert the UTF-16 hex string "\uD83E\uDD1B\uD83C\uDFFD" into "🤛🏽"?
I've tried splitting the string and converting it to a [UInt32] array of 32-bit integers, then using that to create Unicode scalars, but this only works for Unicode characters that can be expressed in a single UTF-32 code point.
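The reason it fails for anything outside the Basic Multilingual Plane is that the two halves of a surrogate pair are not valid Unicode scalars on their own; a minimal demonstration:

print(UnicodeScalar(0xD83E as UInt32) as Any)  // nil: 0xD83E is a high surrogate, not a scalar
print(UnicodeScalar(0x1F91B as UInt32)!)       // 🤛: an actual code point works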
Here is the source code I'm using.
import Foundation

extension String {
    func decodeBlock() -> String {
        // Split on the "\u" escape prefix and parse each chunk as hex.
        let strings = self.components(separatedBy: "\\u")
        var scalars: [UInt32] = []
        var value: UInt32 = 0
        for string in strings {
            let scanner = Scanner(string: string)
            if scanner.scanHexInt32(&value) {
                scalars.append(value)
            }
        }
        // Decode the parsed values as if they were UTF-32 code units.
        let utf32chars = scalars
        var str = ""
        var generator = utf32chars.makeIterator()
        var utf32 = UTF32()
        var done = false
        while !done {
            switch utf32.decode(&generator) {
            case .emptyInput:
                done = true
            case .scalarValue(let val):
                str.append(Character(val))
            case .error:
                // Surrogate values such as 0xD83E are not valid UTF-32
                // code units, so this branch is hit for emoji like "🤛🏽".
                return "$"
            }
        }
        return str
    }
}
It is adapted from the code in an answer to a similar question. https://stackoverflow.com/a/41412056/731773
The source of the encoded string is the escapeJava function of org.apache.commons.lang.StringEscapeUtils.
Answer 1:
This is a little bit of a cheat, but UTF-16 happens to be the encoding used by NSString, so you can borrow the methods of NSString to achieve it:
import Foundation

extension String {
    func decodeBlock() -> String? {
        // Parse each "\uXXXX" chunk as a UTF-16 code unit.
        var chars = [unichar]()
        for substr in self.components(separatedBy: "\\u") where !substr.isEmpty {
            if let value = UInt16(substr, radix: 16) {
                chars.append(value)
            } else {
                return nil
            }
        }
        // NSString is UTF-16 backed, so it reassembles the surrogate pairs.
        return NSString(characters: chars, length: chars.count) as String
    }
}
if let decoded = "\\uD83E\\uDD1B\\uD83C\\uDFFD".decodeBlock() {
    print(decoded)
} else {
    print("Cannot decode")
}
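The same trick also works without going through NSString: run the parsed UInt16 values through Swift's UTF16 codec, which reassembles surrogate pairs itself. A minimal sketch under the same parsing assumptions (the method name decodeBlockUTF16 is illustrative):

import Foundation

extension String {
    func decodeBlockUTF16() -> String? {
        // Parse each "\uXXXX" chunk as a UTF-16 code unit, as above.
        var chars = [UInt16]()
        for substr in self.components(separatedBy: "\\u") where !substr.isEmpty {
            guard let value = UInt16(substr, radix: 16) else { return nil }
            chars.append(value)
        }
        // Let the stdlib's UTF-16 codec combine surrogate pairs into scalars.
        var result = ""
        var iterator = chars.makeIterator()
        var codec = UTF16()
        while true {
            switch codec.decode(&iterator) {
            case .scalarValue(let scalar):
                result.unicodeScalars.append(scalar)
            case .emptyInput:
                return result
            case .error:
                return nil  // unpaired surrogate
            }
        }
    }
}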
Answer 2:
This is definitely cheating since it just uses the built-in method in JavaScript, but it works.
import JavaScriptCore

extension String {
    func decode() -> String {
        // Get a JavaScript context.
        let context = JSContext()
        let encodedString = self
        // Define a JavaScript function that replaces each \uXXXX escape
        // with the corresponding UTF-16 code unit.
        let jsFunctionText = "var decode = function(encodedString) {\n" +
            "  var r = /\\\\u([\\d\\w]{4})/gi;\n" +
            "  x = encodedString\n" +
            "  x = x.replace(r, function (match, grp) {\n" +
            "    return String.fromCharCode(parseInt(grp, 16)); } );\n" +
            "  x = unescape(x);\n" +
            "  return x\n" +
            "}"
        context?.evaluateScript(jsFunctionText)
        // Call the JavaScript function with the encoded string.
        let jsFunction = context?.objectForKeyedSubscript("decode")
        let decodedValue = jsFunction?.call(withArguments: [encodedString])
        if let decodedString = decodedValue?.toString() {
            return decodedString
        } else {
            return self
        }
    }
}
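It can be called the same way as the method in the first answer; for example:

print("\\uD83E\\uDD1B\\uD83C\\uDFFD".decode())  // 🤛🏽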
Source: https://stackoverflow.com/questions/44664541/how-to-decode-a-utf16-string-into-a-unicode-character