I am pulling a JSON file from a site and one of the strings received is:
The Weeknd &lsquo;King Of The Fall&rsquo; [Video Premiere] | @TheWeeknd | #SoPhi
Swift 3 version of @akashivskyy's extension,
extension String {
    /// Creates a string from `htmlEncodedString` by loading it as an HTML
    /// document and taking the resulting plain text.
    ///
    /// If the input cannot be converted to UTF-8 data, or the HTML import
    /// throws, the new string is the unmodified input (the thrown error is
    /// printed first).
    ///
    /// - Parameter htmlEncodedString: Text that may contain HTML markup or
    ///   character references.
    init(htmlEncodedString: String) {
        self.init()

        guard let htmlData = htmlEncodedString.data(using: .utf8) else {
            self = htmlEncodedString
            return
        }

        // Import as an HTML document, interpreting the bytes as UTF-8.
        var readingOptions = [String : Any]()
        readingOptions[NSDocumentTypeDocumentAttribute] = NSHTMLTextDocumentType
        readingOptions[NSCharacterEncodingDocumentAttribute] = String.Encoding.utf8.rawValue

        let result: String
        do {
            result = try NSAttributedString(data: htmlData, options: readingOptions, documentAttributes: nil).string
        } catch {
            print("Error: \(error)")
            result = htmlEncodedString
        }
        self = result
    }
}
This would be my approach. You could add the entities dictionary from the gist that Michael Waterfall mentions: https://gist.github.com/mwaterfall/25b4a6a06dc3309d9555
extension String {
    /// Returns a copy of the string in which the five predefined entities
    /// `&quot;`, `&apos;`, `&lt;`, `&gt;` and `&amp;` are replaced by the
    /// characters they stand for.
    ///
    /// Only these five named entities are handled; other named or numeric
    /// character references are left untouched.
    ///
    /// - Returns: The decoded string, or the receiver itself when it is empty.
    func htmlDecoded() -> String {
        guard !isEmpty else { return self }

        // An ordered list (not a dictionary) so the replacement order is
        // deterministic. `&amp;` must be handled last: otherwise input such as
        // "&amp;lt;" would first become "&lt;" and then be wrongly decoded
        // a second time to "<".
        let entities: [(name: String, value: String)] = [
            ("&quot;", "\""),
            ("&apos;", "'"),
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&amp;", "&"),
        ]

        return entities.reduce(self) { partial, entity in
            partial.replacingOccurrences(of: entity.name, with: entity.value)
        }
    }
}
Examples used:
// The input carries its double quotes as `&quot;` character references.
let encoded = "this is so &quot;good&quot;"
let decoded = encoded.htmlDecoded() // "this is so \"good\""
OR
// Same example in a single expression: decode the literal directly.
let encoded = "this is so &quot;good&quot;".htmlDecoded() // "this is so \"good\""
I was looking for a pure Swift 3.0 utility to escape to/unescape from HTML character references (i.e. for server-side Swift apps on both macOS and Linux) but didn't find any comprehensive solutions, so I wrote my own implementation: https://github.com/IBM-Swift/swift-html-entities
The package, HTMLEntities
, works with HTML4 named character references as well as hex/dec numeric character references, and it will recognize special numeric character references per the W3 HTML5 spec (i.e. &#x80;
should be unescaped as the Euro sign (unicode U+20AC
) and NOT as the unicode character for U+0080
, and certain ranges of numeric character references should be replaced with the replacement character U+FFFD
when unescaping).
Usage example:
import HTMLEntities

// Encode example: escape the HTML-significant characters of a raw string.
let html = "<script>alert(\"abc\")</script>"
print(html.htmlEscape())
// Prints "&lt;script&gt;alert(&quot;abc&quot;)&lt;/script&gt;"
// NOTE(review): whether named or numeric references are emitted may depend on
// the library version/options — confirm against the HTMLEntities README.

// Decode example: turn character references back into the original characters.
let htmlencoded = "&lt;script&gt;alert(&quot;abc&quot;)&lt;/script&gt;"
print(htmlencoded.htmlUnescape())
// Prints "<script>alert(\"abc\")</script>"
And for OP's example:
// The OP's string, with the curly quotes still encoded as `&lsquo;` / `&rsquo;`.
print("The Weeknd &lsquo;King Of The Fall&rsquo; [Video Premiere] | @TheWeeknd | #SoPhi ".htmlUnescape())
// prints "The Weeknd ‘King Of The Fall’ [Video Premiere] | @TheWeeknd | #SoPhi "
Edit: HTMLEntities
now supports HTML5 named character references as of version 2.0.0. Spec-compliant parsing is also implemented.
Swift 4:
The total solution that finally worked for me with HTML code and newline characters and single quotes
extension String {
    /// The plain-text content obtained by loading the receiver's UTF-8 bytes
    /// as an HTML document, or the receiver unchanged if that import fails.
    var htmlDecoded: String {
        let readingOptions: [NSAttributedString.DocumentReadingOptionKey: Any] = [
            .documentType: NSAttributedString.DocumentType.html,
            .characterEncoding: String.Encoding.utf8.rawValue
        ]
        guard let attributed = try? NSAttributedString(
            data: Data(utf8),
            options: readingOptions,
            documentAttributes: nil
        ) else {
            return self
        }
        return attributed.string
    }
}
Usage:
// Reads the `htmlDecoded` property of `yourStringWithHtmlcode`, which is not
// defined in this snippet (presumably a placeholder for the caller's
// HTML-encoded string).
let yourStringEncoded = yourStringWithHtmlcode.htmlDecoded
I then had to apply some more filters to get rid of single quotes (for example, don't, hasn't, It's, etc.), and new line characters like \n
:
// Keep only the characters that are not a newline, tab, or carriage return.
var yourNewString = String(yourStringEncoded.filter { character in
    !"\n\t\r".contains(character)
})
// Then delete every single-quote character (literal comparison, whole string).
yourNewString = yourNewString.replacingOccurrences(of: "\'", with: "", options: .literal, range: nil)
Updated answer working on Swift 3
extension String {
/// Creates a string from `htmlEncodedString` by loading it as an HTML
/// document and taking the resulting plain text.
///
/// The initializer fails (returns `nil`) when the input cannot be converted
/// to UTF-8 data or when the HTML import throws.
///
/// - Parameter htmlEncodedString: Text that may contain HTML markup or
///   character references.
init?(htmlEncodedString: String) {
    // Fail gracefully instead of force-unwrapping: this initializer is failable.
    guard let encodedData = htmlEncodedString.data(using: .utf8) else {
        return nil
    }

    // State the encoding explicitly so the importer reads the bytes as UTF-8;
    // without it non-ASCII characters in the input may be misinterpreted.
    let attributedOptions: [String: Any] = [
        NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
        NSCharacterEncodingDocumentAttribute: String.Encoding.utf8.rawValue
    ]

    guard let attributedString = try? NSAttributedString(data: encodedData, options: attributedOptions, documentAttributes: nil) else {
        return nil
    }
    self.init(attributedString.string)
}