How do I convert ú within a C++ application, where the application receives the character in its UTF-8 encoding (%C3%BA), and store it as the equivalent Unicode code point (%FA)? I just want to …
I just wrote some code to do this yesterday...
I'm not saying this is the "perfect" way to do this, but it appears to work for all testcases I've run through it (I wrote both directions for that purpose).
I'll leave it to you to translate "%NN" to an integer value.
#include <iostream>
#include <deque>
// Encodes a single Unicode code point as UTF-8.
//
// charcode: the code point to encode.
// Returns the UTF-8 byte values (each in 0x00..0xFF), first byte at the
// front of the deque. The result always holds between 1 and 4 values.
//
// Values that are not Unicode scalar values (negative numbers, the surrogate
// range U+D800..U+DFFF, anything above U+10FFFF) cannot be represented in
// well-formed UTF-8; they are encoded as U+FFFD REPLACEMENT CHARACTER so the
// caller always receives a valid, non-empty sequence.
std::deque<int> unicode_to_utf8(int charcode)
{
    const bool is_surrogate = charcode >= 0xD800 && charcode <= 0xDFFF;
    if (charcode < 0 || charcode > 0x10FFFF || is_surrogate)
    {
        charcode = 0xFFFD;
    }

    std::deque<int> d;
    if (charcode < 0x80)
    {
        // ASCII is encoded as itself.
        d.push_back(charcode);
        return d;
    }

    // Number of continuation bytes (10xxxxxx) and the matching lead-byte
    // marker: 110xxxxx, 1110xxxx or 11110xxx.
    int continuation_count = 3;
    int lead_marker = 0xF0;
    if (charcode < 0x800)
    {
        continuation_count = 1;
        lead_marker = 0xC0;
    }
    else if (charcode < 0x10000)
    {
        continuation_count = 2;
        lead_marker = 0xE0;
    }

    // The lead byte carries the most significant payload bits...
    d.push_back(lead_marker | (charcode >> (6 * continuation_count)));
    // ...and every continuation byte carries the next 6 bits, high to low.
    for (int shift = 6 * (continuation_count - 1); shift >= 0; shift -= 6)
    {
        d.push_back(0x80 | ((charcode >> shift) & 0x3F));
    }
    return d;
}
// Decodes one UTF-8 encoded code point from the front of `coded`.
//
// coded: UTF-8 byte values (0x00..0xFF), first byte at the front. The values
//        that make up the decoded character are removed from the deque, so
//        the function can be called repeatedly to walk a longer sequence.
// Returns the decoded code point, or -1 if the input is empty or malformed
// (value outside the byte range, stray continuation byte, invalid lead byte,
// truncated sequence, overlong form, surrogate, or value above U+10FFFF).
//
// On malformed input the lead value is still consumed, but a value that
// fails the continuation check is left in place so the caller can try to
// resynchronise on it.
int utf8_to_unicode(std::deque<int> &coded)
{
    const int kInvalid = -1;

    if (coded.empty())
    {
        return kInvalid;
    }

    const int lead = coded.front();
    coded.pop_front();

    // Lead bytes above 0xF4 would encode values beyond U+10FFFF.
    if (lead < 0 || lead > 0xF4)
    {
        return kInvalid;
    }
    if (lead < 0x80)
    {
        return lead; // plain ASCII
    }

    int continuation_count = 0;
    int charcode = 0;
    int smallest_allowed = 0; // anything lower would be an overlong encoding
    if ((lead & 0xE0) == 0xC0)
    {
        continuation_count = 1;
        charcode = lead & 0x1F;
        smallest_allowed = 0x80;
    }
    else if ((lead & 0xF0) == 0xE0)
    {
        continuation_count = 2;
        charcode = lead & 0x0F;
        smallest_allowed = 0x800;
    }
    else if ((lead & 0xF8) == 0xF0)
    {
        continuation_count = 3;
        charcode = lead & 0x07;
        smallest_allowed = 0x10000;
    }
    else
    {
        // 0x80..0xBF: a continuation byte cannot start a sequence.
        return kInvalid;
    }

    for (int i = 0; i < continuation_count; ++i)
    {
        // Every trailing byte must exist and look like 10xxxxxx.
        if (coded.empty() || coded.front() < 0x80 || coded.front() > 0xBF)
        {
            return kInvalid;
        }
        charcode = (charcode << 6) | (coded.front() & 0x3F);
        coded.pop_front();
    }

    const bool is_surrogate = charcode >= 0xD800 && charcode <= 0xDFFF;
    if (charcode < smallest_allowed || charcode > 0x10FFFF || is_surrogate)
    {
        return kInvalid;
    }
    return charcode;
}
// Interactive round-trip check: repeatedly reads a decimal code point from
// standard input, prints its UTF-8 encoding as \xNN values, then decodes that
// encoding again and prints the result in decimal and in hex.
// The loop ends when no further number can be read (end of input or
// non-numeric text).
int main()
{
    int charcode = 0;
    for (;;)
    {
        std::cout << "Enter unicode value:" << std::endl;
        if (!(std::cin >> charcode))
        {
            // Without this check a failed extraction would leave the stream
            // in an error state and the loop would spin forever.
            break;
        }
        auto x = unicode_to_utf8(charcode);
        for (auto c : x)
        {
            std::cout << "\\x" << std::hex << c << " ";
        }
        std::cout << std::endl;
        // utf8_to_unicode removes the values it decodes from the front of x.
        int c = utf8_to_unicode(x);
        std::cout << "reversed:" << std::dec << c << std::hex << " in hex:" << c << std::endl;
    }
    return 0;
}
This is actually in the standard library:
#include <string>
#include <codecvt> // for std::codecvt_utf8
#include <locale> // for std::wstring_convert
// Converter between UTF-8 byte strings (std::string) and strings of
// char32_t code points (std::u32string); used below via from_bytes().
// NOTE(review): std::wstring_convert and std::codecvt_utf8 are deprecated
// since C++17 — confirm the target standard/toolchain still provides them.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;
// Minimal example: decode a UTF-8 byte string into char32_t code points
// using the file-level conv_utf8_utf32 converter.
int main()
{
    // NOTE(review): the literal only contains the UTF-8 bytes of "ú" if the
    // compiler's execution character set is UTF-8 — confirm build settings.
    const std::string utf8_bytes = "ú";

    // from_bytes() performs the UTF-8 -> char32_t conversion.
    const auto unicode_codepoints = conv_utf8_utf32.from_bytes(utf8_bytes);
    static_cast<void>(unicode_codepoints); // result is intentionally unused here

    return 0;
}
The other way around is done with `conv_utf8_utf32.to_bytes`.
Example that prints in your `%hex` format using `printf`:
#include <string>
#include <codecvt> // for std::codecvt_utf8
#include <locale> // for std::wstring_convert
#include <cstdio>
// Converter between UTF-8 byte strings (std::string) and strings of
// char32_t code points (std::u32string); used below via from_bytes().
// NOTE(review): std::wstring_convert and std::codecvt_utf8 are deprecated
// since C++17 — confirm the target standard/toolchain still provides them.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;
// Prints the string "ú" twice in percent-hex notation: first the individual
// bytes of the std::string, then the code points obtained by converting it
// with the file-level conv_utf8_utf32 converter.
int main() {
    std::string utf8_bytes = "ú";

    // print the bytes in %hex format
    for (char byte : utf8_bytes) {
        // Go through unsigned char first so a byte >= 0x80 is not
        // sign-extended when char is signed; %02X zero-pads values below
        // 0x10 (the previous "%2X" padded with a space, e.g. "% A").
        printf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(byte)));
    }
    printf("\n");

    std::u32string unicode_codepoints = conv_utf8_utf32.from_bytes(utf8_bytes);

    // print the code points in %hex format
    for (char32_t chr : unicode_codepoints) {
        // Cast explicitly so the argument type matches the %lX conversion.
        printf("%%%02lX", static_cast<unsigned long>(chr));
    }
    printf("\n");
    return 0;
}