HTML character decoding in Objective-C / Cocoa Touch

后端 未结 13 1983
我寻月下人不归
我寻月下人不归 2020-11-22 10:24

First of all, I found this: Objective C HTML escape/unescape, but it doesn't work for me.

My encoded characters (come from a RSS feed, btw) look like this: &a

相关标签:
13条回答
  • 2020-11-22 10:55

    Check out my NSString category for HTML. Here are the methods available:

    // Methods offered by the NSString HTML category mentioned above. Only the
    // declarations are shown; the per-method notes are inferred from the names
    // alone and should be confirmed against the category's implementation.
    // Presumably strips markup and returns plain text — TODO confirm.
    - (NSString *)stringByConvertingHTMLToPlainText;
    // Presumably replaces HTML entities with the characters they denote — TODO confirm.
    - (NSString *)stringByDecodingHTMLEntities;
    // Presumably the inverse: escapes special characters as HTML entities — TODO confirm.
    - (NSString *)stringByEncodingHTMLEntities;
    // Presumably converts line breaks to <br> tags — TODO confirm.
    - (NSString *)stringWithNewLinesAsBRs;
    // Presumably removes line breaks and surrounding/redundant whitespace — TODO confirm.
    - (NSString *)stringByRemovingNewLinesAndWhitespace;
    
    0 讨论(0)
  • 2020-11-22 10:55

    The one by Daniel is basically very nice, and I fixed a few issues there:

    1. Removed the skipped characters for NSScanner (otherwise whitespace between two consecutive entities would be ignored):

      // Passing nil clears the scanner's skipped-character set so that no
      // characters are silently skipped between scan operations.
      [scanner setCharactersToBeSkipped:nil];

    2. Fixed the parsing of isolated '&' symbols (I am not sure what the 'correct' output is for this; I just compared it against Firefox):

    e.g.

        &#ABC DF & B'  & C' Items (288)
    

    here is the modified code:

    /**
     * Returns the receiver with XML character entities decoded.
     *
     * Decoded forms:
     *   - the five predefined XML entities: &amp; &apos; &quot; &lt; &gt;
     *   - decimal numeric references (&#NNN;) and hexadecimal ones (&#xHHH;);
     *     the terminating ';' is consumed when present but is not required.
     *
     * Everything else that begins with '&' (an isolated ampersand, an unknown
     * named entity, a malformed numeric reference) is copied through unchanged.
     * A numeric reference whose value is 0 or lies above U+10FFFF is replaced by
     * U+FFFD, and values above U+FFFF are emitted as a UTF-16 surrogate pair
     * instead of being truncated to 16 bits.
     *
     * @return The receiver itself when it contains no '&'; otherwise a newly
     *         built decoded string.
     */
    - (NSString *)stringByDecodingXMLEntities {
        // Short-circuit if there are no ampersands.
        if ([self rangeOfString:@"&" options:NSLiteralSearch].location == NSNotFound) {
            return self;
        }

        // Decoding never makes the text longer, so the receiver's length is a
        // sufficient capacity hint.
        NSMutableString *result = [NSMutableString stringWithCapacity:[self length]];

        NSScanner *scanner = [NSScanner scannerWithString:self];

        // Do not skip anything: whitespace between two entities must be preserved.
        [scanner setCharactersToBeSkipped:nil];

        // Characters that end a malformed numeric reference.
        NSCharacterSet *boundaryCharacterSet = [NSCharacterSet characterSetWithCharactersInString:@" \t\n\r;"];

        while (![scanner isAtEnd]) {
            // Copy everything up to the next '&' (or the end of the string).
            NSString *nonEntityString = nil;
            if ([scanner scanUpToString:@"&" intoString:&nonEntityString]) {
                [result appendString:nonEntityString];
            }
            if ([scanner isAtEnd]) {
                break;
            }

            // The scanner is now positioned on an '&'.
            if ([scanner scanString:@"&amp;" intoString:NULL]) {
                [result appendString:@"&"];
            }
            else if ([scanner scanString:@"&apos;" intoString:NULL]) {
                [result appendString:@"'"];
            }
            else if ([scanner scanString:@"&quot;" intoString:NULL]) {
                [result appendString:@"\""];
            }
            else if ([scanner scanString:@"&lt;" intoString:NULL]) {
                [result appendString:@"<"];
            }
            else if ([scanner scanString:@"&gt;" intoString:NULL]) {
                [result appendString:@">"];
            }
            else if ([scanner scanString:@"&#" intoString:NULL]) {
                BOOL gotNumber = NO;
                unsigned charCode = 0;
                NSString *xForHex = @"";

                // Is it hex or decimal?
                if ([scanner scanString:@"x" intoString:&xForHex]) {
                    gotNumber = [scanner scanHexInt:&charCode];
                }
                else {
                    NSUInteger digitsStart = [scanner scanLocation];
                    int decimalValue = 0;
                    if ([scanner scanInt:&decimalValue]) {
                        if (decimalValue >= 0) {
                            charCode = (unsigned)decimalValue;
                            gotNumber = YES;
                        }
                        else {
                            // A negative value is not a character reference; rewind so
                            // the text is copied through as a malformed reference.
                            [scanner setScanLocation:digitsStart];
                        }
                    }
                }

                if (gotNumber) {
                    if (charCode == 0 || charCode > 0x10FFFF) {
                        // Not a usable Unicode code point.
                        [result appendString:@"\uFFFD"];
                    }
                    else if (charCode > 0xFFFF) {
                        // Outside the BMP: encode as a UTF-16 surrogate pair.
                        unsigned offset = charCode - 0x10000;
                        unichar surrogates[2] = {
                            (unichar)(0xD800 + (offset >> 10)),
                            (unichar)(0xDC00 + (offset & 0x3FF))
                        };
                        [result appendString:[NSString stringWithCharacters:surrogates length:2]];
                    }
                    else {
                        [result appendFormat:@"%C", (unichar)charCode];
                    }

                    // The terminating semicolon is optional.
                    [scanner scanString:@";" intoString:NULL];
                }
                else {
                    // Malformed reference: copy it through up to the next boundary.
                    NSString *unknownEntity = @"";
                    [scanner scanUpToCharactersFromSet:boundaryCharacterSet intoString:&unknownEntity];
                    [result appendFormat:@"&#%@%@", xForHex, unknownEntity];
                    NSLog(@"Expected numeric character entity but got &#%@%@;", xForHex, unknownEntity);
                }
            }
            else {
                // An isolated '&' or an unsupported named entity: emit the '&'
                // and let the following text be copied on the next pass.
                [scanner scanString:@"&" intoString:NULL];
                [result appendString:@"&"];
            }
        }

        return result;
    }
    
    0 讨论(0)
  • 2020-11-22 10:57

    As if you need another solution! This one is pretty simple and quite effective:

    @interface NSString (NSStringCategory)
    /**
     * Returns the receiver with numeric character references replaced by the
     * characters they denote.
     *
     * Both the decimal form (&#233;) and the hexadecimal form (&#xE9;) are
     * recognised, with any number of digits. A reference must be terminated by
     * ';'. Text that starts with "&#" but is not a well-formed reference to a
     * valid code point is copied through unchanged.
     *
     * @return The receiver itself when it contains no "&#"; otherwise a new string.
     */
    - (NSString *) stringByReplacingISO8859Codes;
    @end


    @implementation NSString (NSStringCategory)
    - (NSString *) stringByReplacingISO8859Codes
    {
        //*** Nothing to do when the string contains no &# prefix
        if ([self rangeOfString: @"&#" options: NSLiteralSearch].location == NSNotFound) {
            return self;
        }

        NSCharacterSet *decimalDigits = [NSCharacterSet characterSetWithCharactersInString: @"0123456789"];
        NSCharacterSet *hexDigits = [NSCharacterSet characterSetWithCharactersInString: @"0123456789abcdefABCDEF"];

        NSMutableString *decoded = [NSMutableString stringWithCapacity: [self length]];
        NSScanner *scanner = [NSScanner scannerWithString: self];
        //*** Keep whitespace: the scanner must not skip anything by itself
        [scanner setCharactersToBeSkipped: nil];

        // A single left-to-right pass: every iteration consumes at least the
        // "&#" prefix, so the loop always terminates.
        while (![scanner isAtEnd]) {
            //*** Copy the text in front of the next reference
            NSString *literalText = nil;
            if ([scanner scanUpToString: @"&#" intoString: &literalText]) {
                [decoded appendString: literalText];
            }
            if ([scanner isAtEnd]) {
                break;
            }

            NSUInteger referenceStart = [scanner scanLocation];
            [scanner scanString: @"&#" intoString: NULL];

            //*** An 'x' after the prefix selects hexadecimal, otherwise decimal
            BOOL isHex = [scanner scanString: @"x" intoString: NULL]
                      || [scanner scanString: @"X" intoString: NULL];

            NSString *digits = nil;
            BOOL wellFormed = [scanner scanCharactersFromSet: (isHex ? hexDigits : decimalDigits)
                                                  intoString: &digits]
                           && [scanner scanString: @";" intoString: NULL];

            unsigned long codePoint = 0;
            if (wellFormed) {
                // strtoul saturates on overflow, which the range check rejects.
                codePoint = strtoul([digits UTF8String], NULL, isHex ? 16 : 10);
                BOOL isSurrogate = (codePoint >= 0xD800 && codePoint <= 0xDFFF);
                wellFormed = (codePoint != 0 && codePoint <= 0x10FFFF && !isSurrogate);
            }

            if (!wellFormed) {
                //*** Not a usable reference: keep the prefix and rescan what follows
                [scanner setScanLocation: referenceStart + 2];
                [decoded appendString: @"&#"];
                continue;
            }

            //*** Use the code point to get the unicode character
            if (codePoint > 0xFFFF) {
                // Outside the BMP: encode as a UTF-16 surrogate pair.
                unsigned long offset = codePoint - 0x10000;
                unichar surrogates[2] = {
                    (unichar)(0xD800 + (offset >> 10)),
                    (unichar)(0xDC00 + (offset & 0x3FF))
                };
                [decoded appendString: [NSString stringWithCharacters: surrogates length: 2]];
            } else {
                unichar character = (unichar)codePoint;
                [decoded appendString: [NSString stringWithCharacters: &character length: 1]];
            }
        }

        return [decoded copy];
    }
    @end
    
    0 讨论(0)
  • 2020-11-22 10:59

    If you have the numeric character reference as a hexadecimal string, e.g. @"2318", you can build an NSString containing the corresponding Unicode character using strtoul:

    // The code point is given as hexadecimal digits, so it is parsed with base 16.
    NSString *unicodePoint = @"2318";
    // The cast keeps only the low 16 bits, so this works for BMP code points.
    unichar iconChar = (unichar) strtoul(unicodePoint.UTF8String, NULL, 16);
    NSString *recoded = [NSString stringWithFormat:@"%C", iconChar];
    NSLog(@"recoded: %@", recoded);
    // prints out "recoded: ⌘"
    
    0 讨论(0)
  • 2020-11-22 11:00

    Swift 3 version of Jugale's answer

    extension String {
        /// Named entities recognised by `stringByDecodingXMLEntities()`.
        ///
        /// Every key is a complete entity including its terminating ';'. Combined
        /// with case-sensitive scanning this means no key can match a prefix of
        /// another, so the (unspecified) dictionary iteration order cannot change
        /// the result.
        static private let mappings = [
            "&quot;" : "\"", "&amp;" : "&", "&apos;" : "'", "&lt;" : "<", "&gt;" : ">",
            "&nbsp;" : "\u{00A0}", "&iexcl;" : "¡", "&cent;" : "¢", "&pound;" : "£",
            "&curren;" : "¤", "&yen;" : "¥", "&brvbar;" : "¦", "&sect;" : "§",
            "&uml;" : "¨", "&copy;" : "©", "&ordf;" : "ª", "&laquo;" : "«",
            "&not;" : "¬", "&reg;" : "®", "&macr;" : "¯", "&deg;" : "°",
            "&plusmn;" : "±", "&sup2;" : "²", "&sup3;" : "³", "&acute;" : "´",
            "&micro;" : "µ", "&para;" : "¶", "&middot;" : "·", "&cedil;" : "¸",
            "&sup1;" : "¹", "&ordm;" : "º", "&raquo;" : "»", "&frac14;" : "¼",
            "&frac12;" : "½", "&frac34;" : "¾", "&iquest;" : "¿", "&times;" : "×",
            "&divide;" : "÷", "&ETH;" : "Ð", "&eth;" : "ð", "&THORN;" : "Þ",
            "&thorn;" : "þ", "&AElig;" : "Æ", "&aelig;" : "æ", "&OElig;" : "Œ",
            "&oelig;" : "œ", "&Aring;" : "Å", "&Oslash;" : "Ø", "&Ccedil;" : "Ç",
            "&ccedil;" : "ç", "&szlig;" : "ß", "&Ntilde;" : "Ñ", "&ntilde;" : "ñ",
        ]

        /// Returns the string with character entities decoded.
        ///
        /// Decodes the named entities listed in `mappings` plus decimal (`&#NNN;`)
        /// and hexadecimal (`&#xHHH;`) numeric references; the ';' after a numeric
        /// reference is consumed when present but is not required. Anything else
        /// that starts with '&' is copied through unchanged. A numeric reference
        /// that is not a valid Unicode scalar value (0, a surrogate, or above
        /// U+10FFFF) is replaced by U+FFFD.
        ///
        /// - Returns: `self` when it contains no '&', otherwise the decoded string.
        func stringByDecodingXMLEntities() -> String {

            guard let _ = self.range(of: "&", options: [.literal]) else {
                return self
            }

            var result = ""

            let scanner = Scanner(string: self)
            // Keep whitespace between entities, and tell "&eth;" from "&ETH;".
            scanner.charactersToBeSkipped = nil
            scanner.caseSensitive = true

            // Characters that end a malformed numeric reference.
            let boundaryCharacterSet = CharacterSet(charactersIn: " \t\n\r;")

            repeat {
                // Copy everything up to the next '&' (or the end of the string).
                var nonEntityString: NSString? = nil

                if scanner.scanUpTo("&", into: &nonEntityString) {
                    if let s = nonEntityString as? String {
                        result.append(s)
                    }
                }

                if scanner.isAtEnd {
                    break
                }

                // Try the named entities first.
                var didBreak = false
                for (k,v) in String.mappings {
                    if scanner.scanString(k, into: nil) {
                        result.append(v)
                        didBreak = true
                        break
                    }
                }

                if !didBreak {

                    if scanner.scanString("&#", into: nil) {

                        var gotNumber = false
                        var codePoint: UInt32 = 0
                        var xForHex: NSString? = nil

                        // Scanning is case-sensitive, so accept both spellings of the hex marker.
                        if scanner.scanString("x", into: &xForHex) || scanner.scanString("X", into: &xForHex) {
                            gotNumber = scanner.scanHexInt32(&codePoint)
                        }
                        else {
                            let digitsStart = scanner.scanLocation
                            var decimalValue: Int32 = 0
                            if scanner.scanInt32(&decimalValue) {
                                if decimalValue >= 0 {
                                    codePoint = UInt32(decimalValue)
                                    gotNumber = true
                                }
                                else {
                                    // A negative value is not a character reference; rewind so
                                    // the text is copied through as a malformed reference.
                                    scanner.scanLocation = digitsStart
                                }
                            }
                        }

                        if gotNumber {
                            if codePoint != 0, let scalar = UnicodeScalar(codePoint) {
                                result.append(String(Character(scalar)))
                            }
                            else {
                                result.append("\u{FFFD}")
                            }
                            // The terminating semicolon is optional.
                            scanner.scanString(";", into: nil)
                        }
                        else {
                            // Malformed reference: copy it through up to the next boundary.
                            var unknownEntity: NSString? = nil
                            scanner.scanUpToCharacters(from: boundaryCharacterSet, into: &unknownEntity)
                            let h = xForHex ?? ""
                            let u = unknownEntity ?? ""
                            result.append("&#\(h)\(u)")
                        }
                    }
                    else {
                        // An isolated '&' or an unsupported named entity.
                        scanner.scanString("&", into: nil)
                        result.append("&")
                    }
                }

            } while (!scanner.isAtEnd)

            return result
        }
    }
    
    0 讨论(0)
  • 2020-11-22 11:02

    Here's a Swift version of Walty Yeung's answer:

    extension String {
        /// Named entities recognised by `stringByDecodingXMLEntities()`.
        ///
        /// Every key is a complete entity including its terminating ';'. Combined
        /// with case-sensitive scanning this means no key can match a prefix of
        /// another, so the (unspecified) dictionary iteration order cannot change
        /// the result.
        static private let mappings = [
            "&quot;" : "\"", "&amp;" : "&", "&apos;" : "'", "&lt;" : "<", "&gt;" : ">",
            "&nbsp;" : "\u{00A0}", "&iexcl;" : "¡", "&cent;" : "¢", "&pound;" : "£",
            "&curren;" : "¤", "&yen;" : "¥", "&brvbar;" : "¦", "&sect;" : "§",
            "&uml;" : "¨", "&copy;" : "©", "&ordf;" : "ª", "&laquo;" : "«",
            "&not;" : "¬", "&reg;" : "®", "&macr;" : "¯", "&deg;" : "°",
            "&plusmn;" : "±", "&sup2;" : "²", "&sup3;" : "³", "&acute;" : "´",
            "&micro;" : "µ", "&para;" : "¶", "&middot;" : "·", "&cedil;" : "¸",
            "&sup1;" : "¹", "&ordm;" : "º", "&raquo;" : "»", "&frac14;" : "¼",
            "&frac12;" : "½", "&frac34;" : "¾", "&iquest;" : "¿", "&times;" : "×",
            "&divide;" : "÷", "&ETH;" : "Ð", "&eth;" : "ð", "&THORN;" : "Þ",
            "&thorn;" : "þ", "&AElig;" : "Æ", "&aelig;" : "æ", "&OElig;" : "Œ",
            "&oelig;" : "œ", "&Aring;" : "Å", "&Oslash;" : "Ø", "&Ccedil;" : "Ç",
            "&ccedil;" : "ç", "&szlig;" : "ß", "&Ntilde;" : "Ñ", "&ntilde;" : "ñ",
        ]

        /// Returns the string with character entities decoded.
        ///
        /// Decodes the named entities listed in `mappings` plus decimal (`&#NNN;`)
        /// and hexadecimal (`&#xHHH;`) numeric references; the ';' after a numeric
        /// reference is consumed when present but is not required. Anything else
        /// that starts with '&' is copied through unchanged. A numeric reference
        /// that is not a valid Unicode scalar value (0, a surrogate, or above
        /// U+10FFFF) is replaced by U+FFFD.
        ///
        /// - Returns: `self` when it contains no '&', otherwise the decoded string.
        func stringByDecodingXMLEntities() -> String {

            guard let _ = self.rangeOfString("&", options: [.LiteralSearch]) else {
                return self
            }

            var result = ""

            let scanner = NSScanner(string: self)
            // Keep whitespace between entities, and tell "&eth;" from "&ETH;".
            scanner.charactersToBeSkipped = nil
            scanner.caseSensitive = true

            // Characters that end a malformed numeric reference.
            let boundaryCharacterSet = NSCharacterSet(charactersInString: " \t\n\r;")

            repeat {
                // Copy everything up to the next '&' (or the end of the string).
                var nonEntityString: NSString? = nil

                if scanner.scanUpToString("&", intoString: &nonEntityString) {
                    if let s = nonEntityString as? String {
                        result.appendContentsOf(s)
                    }
                }

                if scanner.atEnd {
                    break
                }

                // Try the named entities first.
                var didBreak = false
                for (k,v) in String.mappings {
                    if scanner.scanString(k, intoString: nil) {
                        result.appendContentsOf(v)
                        didBreak = true
                        break
                    }
                }

                if !didBreak {

                    if scanner.scanString("&#", intoString: nil) {

                        var gotNumber = false
                        var codePoint: UInt32 = 0
                        var xForHex: NSString? = nil

                        // Scanning is case-sensitive, so accept both spellings of the hex marker.
                        if scanner.scanString("x", intoString: &xForHex) || scanner.scanString("X", intoString: &xForHex) {
                            gotNumber = scanner.scanHexInt(&codePoint)
                        }
                        else {
                            let digitsStart = scanner.scanLocation
                            var decimalValue: Int32 = 0
                            if scanner.scanInt(&decimalValue) {
                                if decimalValue >= 0 {
                                    codePoint = UInt32(decimalValue)
                                    gotNumber = true
                                }
                                else {
                                    // A negative value is not a character reference; rewind so
                                    // the text is copied through as a malformed reference.
                                    scanner.scanLocation = digitsStart
                                }
                            }
                        }

                        if gotNumber {
                            // Validate before constructing the scalar: creating a
                            // UnicodeScalar from an invalid value would trap here.
                            let isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF
                            if codePoint != 0 && codePoint <= 0x10FFFF && !isSurrogate {
                                result.append(Character(UnicodeScalar(codePoint)))
                            }
                            else {
                                result.appendContentsOf("\u{FFFD}")
                            }
                            // The terminating semicolon is optional.
                            scanner.scanString(";", intoString: nil)
                        }
                        else {
                            // Malformed reference: copy it through up to the next boundary.
                            var unknownEntity: NSString? = nil
                            scanner.scanUpToCharactersFromSet(boundaryCharacterSet, intoString: &unknownEntity)
                            let h = xForHex ?? ""
                            let u = unknownEntity ?? ""
                            result.appendContentsOf("&#\(h)\(u)")
                        }
                    }
                    else {
                        // An isolated '&' or an unsupported named entity.
                        scanner.scanString("&", intoString: nil)
                        result.appendContentsOf("&")
                    }
                }

            } while (!scanner.atEnd)

            return result
        }
    }
    
    0 讨论(0)
提交回复
热议问题