Convert from UTF-8 to unicode c++

后端 未结 2 1500
旧巷少年郎
旧巷少年郎 2020-12-06 03:40

How do I convert ú within a C++ application, where the application receives the character in its UTF-8 encoding (%C3%BA), and store it as the equivalent Unicode code point (%FA)? I just want to…

相关标签:
2条回答
  • 2020-12-06 04:08

    I just wrote some code to do this yesterday...

    I'm not saying this is the "perfect" way to do this, but it appears to work for all the test cases I've run through it (I wrote both directions for that purpose).

    I'll leave it to you to translate "%NN" to an integer value.

    #include <iostream>
    #include <deque>
    
    // Encodes a single code point as a sequence of UTF-8 byte values.
    //
    // charcode: the code point to encode. Values below 128 (negative values
    //           included) are returned as a single element, unchanged.
    // Returns:  the encoded bytes, lead byte first, one byte value per element.
    //
    // No range or surrogate validation is performed; values above 0x1FFFFF
    // yield lead bytes with more than four marker bits (5- and 6-byte forms).
    std::deque<int> unicode_to_utf8(int charcode)
    {
        std::deque<int> bytes;
        if (charcode < 128)
        {
            bytes.push_back(charcode);
            return bytes;
        }

        const int tail_bits = 6;                    // payload bits per continuation byte
        const int tail_mask = (1 << tail_bits) - 1;
        int lead_bits = 6;                          // payload bits still free in the lead byte
        int lead_marker = 0xC0;                     // grows by one high bit per continuation byte

        // Peel six bits at a time off the low end until the remainder fits
        // into the payload bits left in the lead byte.
        for (; charcode >= (1 << lead_bits); --lead_bits)
        {
            bytes.push_front(0x80 | (charcode & tail_mask));
            charcode >>= tail_bits;
            lead_marker |= 1 << lead_bits;
        }

        bytes.push_front(lead_marker | charcode);
        return bytes;
    }
    
    
    // Decodes one UTF-8 encoded character from the front of `coded`.
    //
    // coded:   byte values, lead byte first. The bytes that make up the decoded
    //          character are removed from the front of the deque.
    // Returns: the decoded code point, or -1 when the input is malformed:
    //          an empty deque, a sequence that ends before all continuation
    //          bytes are present, a lead position holding a continuation byte
    //          (10xxxxxx), a lead value above 0xFD, or a continuation position
    //          holding something other than 10xxxxxx.
    //
    // On a malformed sequence the bytes consumed so far stay consumed; an
    // element that failed the continuation check is left in the deque so the
    // caller can retry from it. A first element below 128 (negative values
    // included) is returned unchanged. Overlong forms and surrogates are not
    // rejected, and lead bytes 0xF8-0xFD (5- and 6-byte forms) are accepted.
    int utf8_to_unicode(std::deque<int> &coded)
    {
        const int malformed = -1;
        const int other_bits = 6;
        const int other_mask = (1 << other_bits) - 1;

        if (coded.empty())
        {
            return malformed;
        }
        int t = coded.front();
        coded.pop_front();
        if (t < 128)
        {
            return t;
        }
        // A lead byte must start with binary 11. 0xFE and 0xFF never occur in
        // UTF-8 and would require shifting an int past its width below.
        if ((t & 0xC0) != 0xC0 || t > 0xFD)
        {
            return malformed;
        }

        int charcode = 0;
        int high_bit_mask = (1 << 6) - 1;
        int high_bit_shift = 0;
        int total_bits = 0;
        // Each marker bit after the first in the lead byte announces one
        // continuation byte; shifting the lead left exposes them one by one.
        while ((t & 0xC0) == 0xC0)
        {
            if (coded.empty() || (coded.front() & 0xC0) != 0x80)
            {
                return malformed;
            }
            t <<= 1;
            t &= 0xff;
            total_bits += other_bits;
            high_bit_mask >>= 1;
            high_bit_shift++;
            charcode <<= other_bits;
            charcode |= coded.front() & other_mask;
            coded.pop_front();
        }
        // Undo the shifts applied to the lead byte, keep only its payload bits,
        // and place them above the bits gathered from the continuation bytes.
        charcode |= ((t >> high_bit_shift) & high_bit_mask) << total_bits;
        return charcode;
    }
    
    // Interactive round-trip check: reads a decimal code point from standard
    // input, prints its UTF-8 bytes as \xNN values, decodes those bytes again
    // and prints the result in decimal and hex. Stops when input ends or
    // cannot be parsed as an integer.
    int main()
    {
        int charcode; 
    
        for(;;)
        {
            std::cout << "Enter unicode value:" << std::endl;
            if (!(std::cin >> charcode))
            {
                // After EOF or non-numeric input the stream stays in a failed
                // state, so every later read would fail immediately and the
                // loop would spin forever.
                break;
            }
            auto x = unicode_to_utf8(charcode);
            for(auto c : x)
            {
                std::cout << "\\x" << std::hex << c << " ";
            }
            std::cout << std::endl;
            int c = utf8_to_unicode(x);
            std::cout << "reversed:" << std::dec << c << std::hex << " in hex:" << c << std::endl;
        }
        return 0;
    }
    
    0 讨论(0)
  • 2020-12-06 04:20

    This is actually in the standard library:

    #include <string>
    #include <codecvt> // for std::codecvt_utf8
    #include <locale>  // for std::wstring_convert
    
    
    // Converter between UTF-8 byte strings and UTF-32 code point strings.
    // Note: std::wstring_convert and std::codecvt_utf8 are deprecated since C++17.
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;
    
    
    int main() {
    
        // "ú" written as its two UTF-8 bytes. A plain "ú" literal is only UTF-8
        // when the compiler's source and execution character sets are UTF-8;
        // with any other setting from_bytes would be handed non-UTF-8 input.
        std::string utf8_bytes = "\xC3\xBA";
        // from_bytes decodes the UTF-8 input into one char32_t per code point.
        std::u32string unicode_codepoints = conv_utf8_utf32.from_bytes(utf8_bytes);
    
        return 0;
    }
    

    The other way around is done with conv_utf8_utf32.to_bytes.

    Example with printing in your %hex format using printf:

    #include <string>
    #include <codecvt> // for std::codecvt_utf8
    #include <locale>  // for std::wstring_convert
    #include <cstdio>
    
    
    // Converter between UTF-8 byte strings and UTF-32 code point strings.
    // Note: std::wstring_convert and std::codecvt_utf8 are deprecated since C++17.
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;
    
    
    int main() {
    
        // "ú" written as its two UTF-8 bytes. A plain "ú" literal is only UTF-8
        // when the compiler's source and execution character sets are UTF-8.
        std::string utf8_bytes = "\xC3\xBA";
        // print the bytes in %hex format
        for (char byte: utf8_bytes) {
            // Go through unsigned char first so bytes >= 0x80 are not
            // sign-extended; %02X zero-pads so 0x0A prints as "%0A", not "% A".
            printf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(byte)));
        }   
        printf("\n");
    
    
        std::u32string unicode_codepoints = conv_utf8_utf32.from_bytes(utf8_bytes);
    
        // print the code points in %hex format
        for (char32_t chr: unicode_codepoints) {
            // %X expects unsigned int, so convert the char32_t explicitly.
            printf("%%%02X", static_cast<unsigned int>(chr));
        }   
        printf("\n");
    
    
        return 0;
    }
    
    0 讨论(0)
提交回复
热议问题