How do I decode HTML entities in Swift?

后端 未结 23 1894
一生所求
一生所求 2020-11-22 01:47

I am pulling a JSON file from a site and one of the strings received is:

The Weeknd &#8216;King Of The Fall&#8217;


        
23条回答
  •  你的背包
    2020-11-22 02:46

    @akashivskyy's answer is great and demonstrates how to utilize NSAttributedString to decode HTML entities. One possible disadvantage (as he stated) is that all HTML markup is removed as well, so

     <strong> 4 &lt; 5 &amp; 3 &gt; 2</strong>
    

    becomes

    4 < 5 & 3 > 2
    

    On OS X there is CFXMLCreateStringByUnescapingEntities() which does the job:

    // The input contains named (&lt; &amp; &gt;), hexadecimal (&#x20ac;)
    // and decimal (&#64;) character references.
    let encoded = " 4 &lt; 5 &amp; 3 &gt; 2 . Price: 12 &#x20ac;.  &#64; "
    // Passing `nil` as the entities dictionary is how this example asks
    // CoreFoundation to unescape the references.
    let decoded = CFXMLCreateStringByUnescapingEntities(nil, encoded as CFString, nil) as String
    print(decoded)
    //  4 < 5 & 3 > 2 . Price: 12 €.  @ 
    

    but this is not available on iOS.

    Here is a pure Swift implementation. It decodes character entity references like &lt; using a dictionary, and all numeric character entities like &#64; or &#x20ac;. (Note that I did not list all 252 HTML entities explicitly.)

    Swift 4:

    // Mapping from XML/HTML character entity reference to character
    // From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
    //
    // Each key is the complete reference as it appears in the text,
    // including the leading "&" and the trailing ";", because lookups are
    // made with the whole "&...;" span.
    private let characterEntities : [ Substring : Character ] = [
        // XML predefined entities:
        "&quot;"    : "\"",
        "&amp;"     : "&",
        "&apos;"    : "'",
        "&lt;"      : "<",
        "&gt;"      : ">",

        // HTML character entity references:
        "&nbsp;"    : "\u{00a0}",
        // ...
        "&diams;"   : "♦",
    ]
    
    extension String {

        /// Returns a new string made by replacing in the `String`
        /// all HTML character entity references with the corresponding
        /// character.
        ///
        /// Numeric references (decimal `&#64;`, hexadecimal `&#x20ac;`) are
        /// always decoded; named references (`&lt;`) are decoded when they
        /// are listed in `characterEntities`. Text that is not a valid
        /// reference (an unknown name, a number that is not a Unicode scalar
        /// value, or a bare `&`) is copied to the result unchanged.
        var stringByDecodingHTMLEntities : String {

            // ===== Utility functions =====

            // Convert the number in the string to the corresponding
            // Unicode character, e.g.
            //    decodeNumeric("64", 10)   --> "@"
            //    decodeNumeric("20ac", 16) --> "€"
            // Returns `nil` if the text is not an unsigned number in the
            // given base or the number is not a Unicode scalar value.
            func decodeNumeric(_ string : Substring, base : Int) -> Character? {
                // The integer initializer accepts a leading sign, which is
                // not allowed in a character reference, so reject it here.
                guard !string.hasPrefix("+"), !string.hasPrefix("-"),
                    let code = UInt32(string, radix: base),
                    let uniScalar = UnicodeScalar(code) else { return nil }
                return Character(uniScalar)
            }

            // Decode the HTML character entity to the corresponding
            // Unicode character, return `nil` for invalid input.
            // `entity` is the full reference including "&" and ";".
            //     decode("&#64;")    --> "@"
            //     decode("&#x20ac;") --> "€"
            //     decode("&lt;")     --> "<"
            //     decode("&foo;")    --> nil
            func decode(_ entity : Substring) -> Character? {

                if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                    // Strip the "&#x" prefix and the trailing ";".
                    return decodeNumeric(entity.dropFirst(3).dropLast(), base: 16)
                } else if entity.hasPrefix("&#") {
                    // Strip the "&#" prefix and the trailing ";".
                    return decodeNumeric(entity.dropFirst(2).dropLast(), base: 10)
                } else {
                    return characterEntities[entity]
                }
            }

            // ===== Method starts here =====

            var result = ""
            var position = startIndex

            // Find the next '&' and copy the characters preceding it to `result`:
            while let ampRange = self[position...].range(of: "&") {
                result.append(contentsOf: self[position ..< ampRange.lowerBound])
                position = ampRange.lowerBound

                // Find the next ';' after the '&':
                guard let semiRange = self[position...].range(of: ";") else {
                    // No matching ';'.
                    break
                }

                // A reference never contains '&', so only the LAST '&' before
                // the ';' can start one. Copy any earlier bare ampersands
                // verbatim; otherwise "AT&T &amp; Co" would look up the span
                // "&T &amp;" as a single entity and leave "&amp;" undecoded.
                if let lastAmp = self[position ..< semiRange.lowerBound]
                    .range(of: "&", options: .backwards) {
                    result.append(contentsOf: self[position ..< lastAmp.lowerBound])
                    position = lastAmp.lowerBound
                }

                // Everything from that '&' through the ';' is the candidate entity.
                let entity = self[position ..< semiRange.upperBound]
                position = semiRange.upperBound

                if let decoded = decode(entity) {
                    // Replace by decoded character:
                    result.append(decoded)
                } else {
                    // Invalid entity, copy verbatim:
                    result.append(contentsOf: entity)
                }
            }
            // Copy remaining characters to `result`:
            result.append(contentsOf: self[position...])
            return result
        }
    }
    

    Example:

    // The input contains named (&lt; &amp; &gt;), hexadecimal (&#x20ac;)
    // and decimal (&#64;) character references.
    let encoded = " 4 &lt; 5 &amp; 3 &gt; 2 . Price: 12 &#x20ac;.  &#64; "
    let decoded = encoded.stringByDecodingHTMLEntities
    print(decoded)
    //  4 < 5 & 3 > 2 . Price: 12 €.  @
    

    Swift 3:

    // Mapping from XML/HTML character entity reference to character
    // From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
    //
    // Each key is the complete reference as it appears in the text,
    // including the leading "&" and the trailing ";", because lookups are
    // made with the whole "&...;" span.
    private let characterEntities : [ String : Character ] = [
        // XML predefined entities:
        "&quot;"    : "\"",
        "&amp;"     : "&",
        "&apos;"    : "'",
        "&lt;"      : "<",
        "&gt;"      : ">",

        // HTML character entity references:
        "&nbsp;"    : "\u{00a0}",
        // ...
        "&diams;"   : "♦",
    ]
    
    extension String {

        /// Returns a new string made by replacing in the `String`
        /// all HTML character entity references with the corresponding
        /// character.
        ///
        /// Numeric references (decimal `&#64;`, hexadecimal `&#x20ac;`) are
        /// always decoded; named references (`&lt;`) are decoded when they
        /// are listed in `characterEntities`. Text that is not a valid
        /// reference (an unknown name, a number that is not a Unicode scalar
        /// value, or a bare `&`) is copied to the result unchanged.
        var stringByDecodingHTMLEntities : String {

            // ===== Utility functions =====

            // Convert the number in the string to the corresponding
            // Unicode character, e.g.
            //    decodeNumeric("64", 10)   --> "@"
            //    decodeNumeric("20ac", 16) --> "€"
            // Returns `nil` if the text is not an unsigned number in the
            // given base or the number is not a Unicode scalar value.
            func decodeNumeric(_ string : String, base : Int) -> Character? {
                // The integer initializer accepts a leading sign, which is
                // not allowed in a character reference, so reject it here.
                guard !string.hasPrefix("+"), !string.hasPrefix("-"),
                    let code = UInt32(string, radix: base),
                    let uniScalar = UnicodeScalar(code) else { return nil }
                return Character(uniScalar)
            }

            // Decode the HTML character entity to the corresponding
            // Unicode character, return `nil` for invalid input.
            // `entity` is the full reference including "&" and ";".
            //     decode("&#64;")    --> "@"
            //     decode("&#x20ac;") --> "€"
            //     decode("&lt;")     --> "<"
            //     decode("&foo;")    --> nil
            func decode(_ entity : String) -> Character? {

                if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                    // Strip the "&#x" prefix and the trailing ";".
                    let digits = entity.index(entity.startIndex, offsetBy: 3) ..< entity.index(entity.endIndex, offsetBy: -1)
                    return decodeNumeric(entity.substring(with: digits), base: 16)
                } else if entity.hasPrefix("&#") {
                    // Strip the "&#" prefix and the trailing ";".
                    let digits = entity.index(entity.startIndex, offsetBy: 2) ..< entity.index(entity.endIndex, offsetBy: -1)
                    return decodeNumeric(entity.substring(with: digits), base: 10)
                } else {
                    return characterEntities[entity]
                }
            }

            // ===== Method starts here =====

            var result = ""
            var position = startIndex

            // Find the next '&' and copy the characters preceding it to `result`:
            while let ampRange = self.range(of: "&", range: position ..< endIndex) {
                result.append(self[position ..< ampRange.lowerBound])
                position = ampRange.lowerBound

                // Find the next ';' after the '&':
                if let semiRange = self.range(of: ";", range: position ..< endIndex) {

                    // A reference never contains '&', so only the LAST '&'
                    // before the ';' can start one. Copy any earlier bare
                    // ampersands verbatim; otherwise "AT&T &amp; Co" would
                    // look up the span "&T &amp;" as a single entity and
                    // leave "&amp;" undecoded.
                    if let lastAmp = self.range(of: "&", options: .backwards,
                                                range: position ..< semiRange.lowerBound) {
                        result.append(self[position ..< lastAmp.lowerBound])
                        position = lastAmp.lowerBound
                    }

                    // Everything from that '&' through the ';' is the candidate entity.
                    let entity = self[position ..< semiRange.upperBound]
                    position = semiRange.upperBound

                    if let decoded = decode(entity) {
                        // Replace by decoded character:
                        result.append(decoded)
                    } else {
                        // Invalid entity, copy verbatim:
                        result.append(entity)
                    }
                } else {
                    // No matching ';'.
                    break
                }
            }
            // Copy remaining characters to `result`:
            result.append(self[position ..< endIndex])
            return result
        }
    }
    

    Swift 2:

    // Mapping from XML/HTML character entity reference to character
    // From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
    //
    // Each key is the complete reference as it appears in the text,
    // including the leading "&" and the trailing ";", because lookups are
    // made with the whole "&...;" span.
    private let characterEntities : [ String : Character ] = [
        // XML predefined entities:
        "&quot;"    : "\"",
        "&amp;"     : "&",
        "&apos;"    : "'",
        "&lt;"      : "<",
        "&gt;"      : ">",

        // HTML character entity references:
        "&nbsp;"    : "\u{00a0}",
        // ...
        "&diams;"   : "♦",
    ]
    
    extension String {
    
        /// Returns a new string made by replacing in the `String`
        /// all HTML character entity references with the corresponding
        /// character.
        ///
        /// The string is scanned for spans that start at an `&` and end at
        /// the next `;`. A span that decodes is replaced by the decoded
        /// character; a span that does not is copied unchanged. Scanning
        /// stops at the first `&` that has no `;` after it, and the rest of
        /// the string is copied as is.
        var stringByDecodingHTMLEntities : String {
    
            // ===== Utility functions =====
    
            // Convert the number in the string to the corresponding
            // Unicode character, e.g.
            //    decodeNumeric("64", 10)   --> "@"
            //    decodeNumeric("20ac", 16) --> "€"
            //
            // Although the return type is optional, every path through this
            // helper returns a Character; it never returns `nil`.
            // NOTE(review): strtoul presumably stops at the first character
            // that is not a digit in `base` (so the trailing ";" passed in by
            // `decode` is ignored) and yields 0 when nothing can be parsed,
            // which would turn e.g. "&#zz;" into U+0000 - confirm.
            // NOTE(review): `code` is not range-checked before it is wrapped
            // in UnicodeScalar; confirm what happens for surrogates or
            // values above 0x10FFFF.
            func decodeNumeric(string : String, base : Int32) -> Character? {
                let code = UInt32(strtoul(string, nil, base))
                return Character(UnicodeScalar(code))
            }
    
            // Decode the HTML character entity to the corresponding
            // Unicode character. `entity` is the full reference including
            // "&" and ";". Named references that are not in
            // `characterEntities` return `nil`.
            //     decode("&#64;")    --> "@"
            //     decode("&#x20ac;") --> "€"
            //     decode("&lt;")     --> "<"
            //     decode("&foo;")    --> nil
            func decode(entity : String) -> Character? {
    
                if entity.hasPrefix("&#x") || entity.hasPrefix("&#X"){
                    // Skip "&#x"; the trailing ";" is still part of the argument.
                    return decodeNumeric(entity.substringFromIndex(entity.startIndex.advancedBy(3)), base: 16)
                } else if entity.hasPrefix("&#") {
                    // Skip "&#"; the trailing ";" is still part of the argument.
                    return decodeNumeric(entity.substringFromIndex(entity.startIndex.advancedBy(2)), base: 10)
                } else {
                    return characterEntities[entity]
                }
            }
    
            // ===== Method starts here =====
    
            var result = ""
            var position = startIndex
    
            // Find the next '&' and copy the characters preceding it to `result`:
            while let ampRange = self.rangeOfString("&", range: position ..< endIndex) {
                result.appendContentsOf(self[position ..< ampRange.startIndex])
                position = ampRange.startIndex
    
                // Find the next ';' and copy everything from '&' to ';' into `entity`
                // NOTE(review): if a bare '&' precedes a valid reference
                // (e.g. "AT&T &amp;"), the whole span "&T &amp;" becomes one
                // `entity`, fails to decode and is copied verbatim, so the
                // valid reference stays undecoded - confirm whether intended.
                if let semiRange = self.rangeOfString(";", range: position ..< endIndex) {
                    let entity = self[position ..< semiRange.endIndex]
                    position = semiRange.endIndex
    
                    if let decoded = decode(entity) {
                        // Replace by decoded character:
                        result.append(decoded)
                    } else {
                        // Invalid entity, copy verbatim:
                        result.appendContentsOf(entity)
                    }
                } else {
                    // No matching ';'.
                    break
                }
            }
            // Copy remaining characters to `result`:
            result.appendContentsOf(self[position ..< endIndex])
            return result
        }
    }
    

提交回复
热议问题