boost::property_tree::json_parser and two-byte wide characters

前端 未结 2 640
囚心锁ツ
囚心锁ツ 2021-02-09 01:27

Introduction

std::string text = "á";

"á" is a two-byte character (assuming a UTF-8 encoding).
So the following line prints 2.



        
相关标签:
2条回答
  • 2021-02-09 02:04

    Support for characters above the Basic Multilingual Plane:

        // Escapes a string so it can be written inside a JSON string literal.
        //
        // - Printable ASCII (0x20..0x7F) except '"', '/' and '\\' is copied
        //   unchanged.
        // - '\b', '\f', '\n', '\r', '/', '"' and '\\' use their two-character
        //   escapes.
        // - Every other value is written as "\uXXXX". Values above 0xFFFF are
        //   written as a UTF-16 surrogate pair ("\uD8xx\uDCxx").
        // - Values above 0x10FFFF are not Unicode code points; they are
        //   written as "\uFFFD" (REPLACEMENT CHARACTER) instead of being
        //   masked into a surrogate pair that would decode to a different
        //   character.
        //
        // Each element of `s` is treated as one code unit / code point after
        // conversion to the unsigned counterpart of Ch, so for narrow UTF-8
        // input every byte >= 0x80 is escaped individually as "\u00XX".
        //
        // s:       the string to escape.
        // returns: the escaped copy of s.
        template<class Ch>
    std::basic_string<Ch> create_escapes(const std::basic_string<Ch> &s)
    {
        typedef typename make_unsigned<Ch>::type UCh;

        // Local helper: appends one "\uXXXX" escape for a 16-bit code unit.
        struct u_escape
        {
            static void append(std::basic_string<Ch> &out, unsigned long unit)
            {
                const char *hexdigits = "0123456789ABCDEF";
                out += Ch('\\'); out += Ch('u');
                out += Ch(hexdigits[(unit >> 12) & 0xF]);
                out += Ch(hexdigits[(unit >> 8) & 0xF]);
                out += Ch(hexdigits[(unit >> 4) & 0xF]);
                out += Ch(hexdigits[unit & 0xF]);
            }
        };

        std::basic_string<Ch> result;
        typename std::basic_string<Ch>::const_iterator b = s.begin();
        typename std::basic_string<Ch>::const_iterator e = s.end();
        for (; b != e; ++b)
        {
            // Go through the unsigned type so that negative (signed) values
            // are seen as 0x80 and above rather than sign-extended.
            const unsigned long u =
                static_cast<unsigned long>(static_cast<UCh>(*b));

            // Upper bound is 0x7F: 0x80 is not ASCII and must be escaped
            // like every other non-ASCII value.
            if (u == 0x20 || u == 0x21 || (u >= 0x23 && u <= 0x2E) ||
                (u >= 0x30 && u <= 0x5B) || (u >= 0x5D && u <= 0x7F))
                result += *b;
            else if (*b == Ch('\b')) result += Ch('\\'), result += Ch('b');
            else if (*b == Ch('\f')) result += Ch('\\'), result += Ch('f');
            else if (*b == Ch('\n')) result += Ch('\\'), result += Ch('n');
            else if (*b == Ch('\r')) result += Ch('\\'), result += Ch('r');
            else if (*b == Ch('/')) result += Ch('\\'), result += Ch('/');
            else if (*b == Ch('"'))  result += Ch('\\'), result += Ch('"');
            else if (*b == Ch('\\')) result += Ch('\\'), result += Ch('\\');
            else if (u <= 0xFFFF)
            {
                u_escape::append(result, u);
            }
            else if (u <= 0x10FFFF)
            {
                // Supplementary plane: split into high and low surrogates.
                const unsigned long offset = u - 0x10000;
                u_escape::append(result, 0xD800 + (offset >> 10));
                u_escape::append(result, 0xDC00 + (offset & 0x3FF));
            }
            else
            {
                // Out of Unicode range: emit the replacement character.
                u_escape::append(result, 0xFFFD);
            }
        }
        return result;
    }
    
    0 讨论(0)
  • 2021-02-09 02:25

    I found some solutions. In general you need to specialize the boost::property_tree::json_parser::create_escapes template for [Ch = char], to provide your own "special occasion bug free escaping".

    The JSON standard assumes that all strings are UTF-16 encoded with "\uXXXX" escaping, but some libraries support UTF-8 encoding with "\xXX" escaping. If the JSON file can be encoded in UTF-8, you may pass through all characters higher than 0x7F, which was the intent of the original function.

    I put this code before using boost::property_tree::json_parser::write_json. It comes from boost_1_49_0/boost/property_tree/detail/json_parser_write.hpp:

    namespace boost { namespace property_tree { namespace json_parser
    {
        // Narrow-string specialization of create_escapes.
        //
        // Bytes >= 0x80 are copied through untouched (regardless of whether
        // plain char is signed or unsigned), so multi-byte UTF-8 sequences
        // reach the output verbatim. '"', '/', '\\', '\b', '\f', '\n' and
        // '\r' get their two-character escapes, any other byte below 0x20 is
        // written as "\u00XX", and all remaining ASCII is copied as is.
        template<>
        std::basic_string<char> create_escapes(const std::basic_string<char> &s)
        {
            static const char hex[] = "0123456789ABCDEF";

            std::basic_string<char> escaped;
            for (std::basic_string<char>::const_iterator it = s.begin();
                 it != s.end(); ++it)
            {
                // Classify on the unsigned value so high bytes do not show
                // up as negative numbers.
                const unsigned char byte = static_cast<unsigned char>(*it);
                switch (byte)
                {
                case '\b': escaped += '\\'; escaped += 'b';  break;
                case '\f': escaped += '\\'; escaped += 'f';  break;
                case '\n': escaped += '\\'; escaped += 'n';  break;
                case '\r': escaped += '\\'; escaped += 'r';  break;
                case '/':  escaped += '\\'; escaped += '/';  break;
                case '"':  escaped += '\\'; escaped += '"';  break;
                case '\\': escaped += '\\'; escaped += '\\'; break;
                default:
                    if (byte >= 0x20)
                    {
                        // Printable ASCII, DEL, or a UTF-8 byte: keep it.
                        escaped += *it;
                    }
                    else
                    {
                        // Remaining control characters: \u00XX.
                        escaped += '\\'; escaped += 'u';
                        escaped += '0';  escaped += '0';
                        escaped += hex[byte >> 4];
                        escaped += hex[byte & 0x0F];
                    }
                    break;
                }
            }
            return escaped;
        }
    } } }
    

    And the output I get:

    {
        "text": "aáb"
    }
    

    Also, the function boost::property_tree::json_parser::a_unicode has similar problems when reading escaped Unicode characters into signed chars.

    0 讨论(0)
提交回复
热议问题