Let's imagine I have a UTF-8 encoded std::string
containing the following:
óó
and I'd like to convert it to the following:
ÓÓ
There are some examples on StackOverflow but they use wide character strings, and other answers say you shouldn't be using wide character strings for UTF-8.
The article within (utf8everywhere) and answers apply to Windows. The C++ standard requires that wchar_t
be wide enough to accommodate all supported code units (32 bits wide), and it works perfectly fine with UTF-8. On Windows, wchar_t
is UTF-16 but if you're on Windows you have more problems than just that if we're going to be honest (namely their horrifying API).
It also appears that this problem can be very "tricky" in that the output might be dependent upon the user's locale.
Not really. Set the locale inside the code. Some programs like sort
don't work properly if you don't set the locale inside the shell, for example, so the onus is on the user.
I was expecting to just use something like std::toupper(), but the usage is really unclear to me because it seems like I'm not just converting one character at a time but an entire string.
The code example uses iterators. If you don't want to convert every character, don't.
Also, this Ideone example I put together seems to show that toupper() of 0xc3b3 is just 0xc3b3, which is an unexpected result. Calling setlocale to either UTF-8 or ISO8859-1 doesn't appear to change the outcome.
You have undefined behavior. The range of unsigned char
is 255. 0xc3b3
way surpasses that.
I'd love some guidance if you could shed some light on either what I'm doing wrong or why my question/premise is faulty!
This example works perfectly fine:
#include <clocale>
#include <cwctype>
#include <iostream>
#include <locale>
#include <string>
// Upper-cases a wide string one character at a time with std::towupper and
// prints it before and after the conversion.
//
// Returns 1 without printing the strings if the UTF-8 locale cannot be
// selected, because wide-character output and case mapping of non-ASCII
// characters both depend on LC_CTYPE.
int main()
{
    // setlocale returns a null pointer when the requested locale is not
    // available; continuing would make wcout fail silently on the first "ó".
    if (!std::setlocale(LC_CTYPE, "en_US.UTF-8")) {
        std::wcerr << L"error: locale en_US.UTF-8 is not available" << std::endl;
        return 1;
    }

    std::wstring str = L"óó";
    std::wcout << str << std::endl;

    // Convert in place; each wchar_t is mapped independently of its neighbours.
    for (std::wstring::iterator it = str.begin(); it != str.end(); ++it)
        *it = std::towupper(*it);

    std::wcout << str << std::endl;
}
Outputs: ÓÓ
This function covers the case-sensitive character sets I found in UTF-8 (there might be some bugs in the matching, but I think I got it right).
Well, this is shorter than a table-based solution, and does it not cover it all?
It could likely be written more elegantly, but it shows the idea. The cmp function must copy the strings for the operation, which might be slow for huge strings and consume memory. But we are no longer five people sharing a PDP-11 with 64 KB, as in the past.
This is something I would like to find in a place like this. (The upper case version is not needed for case insensitive cmp-cases, and might be added later.)
/*
 * StrToLwrExt - lower-case a NUL-terminated UTF-8 string in place.
 *
 * The function walks the bytes of pString. ASCII 'A'..'Z' are shifted by
 * 0x20. For a byte greater than 0xc0 the following byte (or bytes, for the
 * 0xe1 and 0xf0 prefixes) is inspected and, when it falls in one of the
 * hard-coded upper-case ranges below, rewritten to the matching lower-case
 * encoding. Every mapping only overwrites existing bytes, so the byte
 * length of the string never changes.
 *
 * pString  buffer to modify; may be NULL or empty, in which case nothing
 *          is done.
 * Returns  pString (unchanged pointer).
 *
 * Scanning stops at the terminating NUL even when the string ends in the
 * middle of a multi-byte sequence.
 *
 * NOTE(review): only the ranges touched by this revision were re-checked
 * against the Unicode tables; the other mappings are kept as they were.
 */
unsigned char *StrToLwrExt(unsigned char *pString)
{
    if (pString && *pString) {
        unsigned char *p = pString;
        unsigned char *pExtChar = 0;
        while (*p) {
            if ((*p >= 0x41) && (*p <= 0x5a)) // US ASCII
                (*p) += 0x20;
            else if (*p > 0xc0) {
                pExtChar = p; // remember the prefix byte, some cases rewrite it
                p++;
                if (!*p)
                    break; // truncated sequence: never step past the NUL
                switch (*pExtChar) {
                case 0xc3: // Latin 1
                    if ((*p >= 0x80)
                        && (*p <= 0x9e)
                        && (*p != 0x97)) // 0x97 is the multiplication sign
                        (*p) += 0x20; // US ASCII shift
                    break;
                case 0xc4: // Latin Extended
                    if ((*p >= 0x80)
                        && (*p <= 0xb7)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    else if ((*p >= 0xb9)
                        && (*p <= 0xbe)
                        && (*p % 2)) // Odd
                        (*p)++; // Next char is lwr
                    else if (*p == 0xbf) {
                        // Lower-case form lives under the next prefix byte
                        *pExtChar = 0xc5;
                        (*p) = 0x80;
                    }
                    break;
                case 0xc5: // Latin Extended
                    if ((*p >= 0x80)
                        && (*p <= 0x88)
                        && (*p % 2)) // Odd
                        (*p)++; // Next char is lwr
                    else if ((*p >= 0x8a)
                        && (*p <= 0xb7)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    else if ((*p >= 0xb9)
                        && (*p <= 0xbe)
                        && (*p % 2)) // Odd
                        (*p)++; // Next char is lwr
                    break;
                case 0xc6: // Latin Extended
                    switch (*p) {
                    case 0x82:
                    case 0x84:
                    case 0x87:
                    case 0x8b:
                    case 0x91:
                    case 0x98:
                    case 0xa0:
                    case 0xa2:
                    case 0xa4:
                    case 0xa7:
                    case 0xac:
                    case 0xaf:
                    case 0xb3:
                    case 0xb5:
                    case 0xb8:
                    case 0xbc:
                        (*p)++; // Next char is lwr
                        break;
                    default:
                        break;
                    }
                    break;
                case 0xc7: // Latin Extended
                    if (*p == 0x84)
                        (*p) = 0x86;
                    else if (*p == 0x85)
                        (*p)++; // Next char is lwr
                    else if (*p == 0x87)
                        (*p) = 0x89;
                    else if (*p == 0x88)
                        (*p)++; // Next char is lwr
                    else if (*p == 0x8a)
                        (*p) = 0x8c;
                    else if (*p == 0x8b)
                        (*p)++; // Next char is lwr
                    else if ((*p >= 0x8d)
                        && (*p <= 0x9c)
                        && (*p % 2)) // Odd
                        (*p)++; // Next char is lwr
                    else if ((*p >= 0x9e)
                        && (*p <= 0xaf)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    else if (*p == 0xb1)
                        (*p) = 0xb3;
                    else if (*p == 0xb2)
                        (*p)++; // Next char is lwr
                    else if (*p == 0xb4)
                        (*p)++; // Next char is lwr
                    else if (*p == 0xb8)
                        (*p)++; // Next char is lwr
                    else if (*p == 0xba)
                        (*p)++; // Next char is lwr
                    else if (*p == 0xbc)
                        (*p)++; // Next char is lwr
                    else if (*p == 0xbe)
                        (*p)++; // Next char is lwr
                    break;
                case 0xc8: // Latin Extended
                    if ((*p >= 0x80)
                        && (*p <= 0x9f)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    else if ((*p >= 0xa2)
                        && (*p <= 0xb3)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    else if (*p == 0xbb)
                        (*p)++; // Next char is lwr
                    break;
                case 0xcd: // Greek & Coptic
                    switch (*p) {
                    case 0xb0:
                    case 0xb2:
                    case 0xb6:
                        (*p)++; // Next char is lwr
                        break;
                    default:
                        if (*p == 0xbf) {
                            *pExtChar = 0xcf;
                            (*p) = 0xb3;
                        }
                        break;
                    }
                    break;
                case 0xce: // Greek & Coptic
                    if (*p == 0x86)
                        (*p) = 0xac;
                    else if (*p == 0x88)
                        (*p) = 0xad;
                    else if (*p == 0x89)
                        (*p) = 0xae;
                    else if (*p == 0x8a)
                        (*p) = 0xaf;
                    else if (*p == 0x8c) {
                        *pExtChar = 0xcf;
                        (*p) = 0x8c;
                    }
                    else if (*p == 0x8e) {
                        *pExtChar = 0xcf;
                        (*p) = 0x8d;
                    }
                    else if (*p == 0x8f) {
                        *pExtChar = 0xcf;
                        (*p) = 0x8e;
                    }
                    else if ((*p >= 0x91)
                        && (*p <= 0x9f))
                        (*p) += 0x20; // US ASCII shift
                    else if ((*p >= 0xa0)
                        && (*p <= 0xab)
                        && (*p != 0xa2)) {
                        // Lower-case forms continue under prefix 0xcf
                        *pExtChar = 0xcf;
                        (*p) -= 0x20;
                    }
                    break;
                case 0xcf: // Greek & Coptic
                    if (*p == 0x8f)
                        (*p) = 0xb4;
                    else if (*p == 0x91)
                        (*p)++; // Next char is lwr
                    else if ((*p >= 0x98)
                        && (*p <= 0xaf)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    else if (*p == 0xb4)
                        (*p) = 0x91;
                    else if (*p == 0xb7)
                        (*p)++; // Next char is lwr
                    else if (*p == 0xb9)
                        (*p) = 0xb2;
                    else if (*p == 0xbb)
                        (*p)++; // Next char is lwr
                    else if (*p == 0xbd) {
                        *pExtChar = 0xcd;
                        (*p) = 0xbb;
                    }
                    else if (*p == 0xbe) {
                        *pExtChar = 0xcd;
                        (*p) = 0xbc;
                    }
                    else if (*p == 0xbf) {
                        *pExtChar = 0xcd;
                        (*p) = 0xbd;
                    }
                    break;
                case 0xd0: // Cyrillic
                    if ((*p >= 0x80)
                        && (*p <= 0x8f)) {
                        *pExtChar = 0xd1;
                        (*p) += 0x10;
                    }
                    else if ((*p >= 0x90)
                        && (*p <= 0x9f))
                        (*p) += 0x20; // US ASCII shift
                    else if ((*p >= 0xa0)
                        && (*p <= 0xaf)) {
                        *pExtChar = 0xd1;
                        (*p) -= 0x20;
                    }
                    break;
                case 0xd1: // Cyrillic supplement
                    if ((*p >= 0xa0)
                        && (*p <= 0xbf)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    break;
                case 0xd2: // Cyrillic supplement
                    if (*p == 0x80)
                        (*p)++; // Next char is lwr
                    else if ((*p >= 0x8a)
                        && (*p <= 0xbf)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    break;
                case 0xd3: // Cyrillic supplement
                    if ((*p >= 0x81)
                        && (*p <= 0x8e)
                        && (*p % 2)) // Odd
                        (*p)++; // Next char is lwr
                    else if ((*p >= 0x90)
                        && (*p <= 0xbf)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    break;
                case 0xd4: // Cyrillic supplement & Armenian
                    if ((*p >= 0x80)
                        && (*p <= 0xaf)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    else if ((*p >= 0xb1)
                        && (*p <= 0xbf)) {
                        *pExtChar = 0xd5;
                        (*p) -= 0x10;
                    }
                    break;
                case 0xd5: // Armenian
                    if ((*p >= 0x80)
                        && (*p <= 0x96)
                        && (!(*p % 2))) // Even
                        (*p)++; // Next char is lwr
                    break;
                case 0xe1: // Three byte code
                    pExtChar = p; // second byte selects the block
                    p++;
                    if (!*p)
                        return pString; // truncated sequence
                    switch (*pExtChar) {
                    case 0x82: // Georgian
                        if ((*p >= 0xa0)
                            && (*p <= 0xbf)) {
                            *pExtChar = 0x83;
                            (*p) -= 0x10;
                        }
                        break;
                    case 0x83: // Georgian
                        // Grouping made explicit; && already bound tighter than ||
                        if (((*p >= 0x80)
                            && ((*p <= 0x85)
                                || (*p == 0x87)))
                            || (*p == 0x8d))
                            (*p) += 0x30;
                        break;
                    case 0xb8: // Latin extended
                        if ((*p >= 0x80)
                            && (*p <= 0xbf)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        break;
                    case 0xb9: // Latin extended
                        if ((*p >= 0x80)
                            && (*p <= 0xbf)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        break;
                    case 0xba: // Latin extended
                        if ((*p >= 0x80)
                            && (*p <= 0x94)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        else if ((*p >= 0x9e)
                            && (*p <= 0xbf)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        break;
                    case 0xbb: // Latin extended
                        if ((*p >= 0x80)
                            && (*p <= 0xbf)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        break;
                    case 0xbc: // Greek extended
                        if ((*p >= 0x88)
                            && (*p <= 0x8f))
                            (*p) -= 0x08;
                        else if ((*p >= 0x98)
                            && (*p <= 0x9f))
                            (*p) -= 0x08;
                        else if ((*p >= 0xa8)
                            && (*p <= 0xaf))
                            (*p) -= 0x08;
                        else if ((*p >= 0xb8)
                            && (*p <= 0xbf)) // U+1F38..U+1F3F; bound was 0x8f, which never matched
                            (*p) -= 0x08;
                        break;
                    case 0xbd: // Greek extended
                        if ((*p >= 0x88)
                            && (*p <= 0x8d))
                            (*p) -= 0x08;
                        else if ((*p >= 0x98)
                            && (*p <= 0x9f))
                            (*p) -= 0x08;
                        else if ((*p >= 0xa8)
                            && (*p <= 0xaf))
                            (*p) -= 0x08;
                        // 0xb8..0xbd (U+1F78..U+1F7D) are already lower case,
                        // so there is deliberately no branch for them here.
                        break;
                    case 0xbe: // Greek extended
                        if ((*p >= 0x88)
                            && (*p <= 0x8f))
                            (*p) -= 0x08;
                        else if ((*p >= 0x98)
                            && (*p <= 0x9f))
                            (*p) -= 0x08;
                        else if ((*p >= 0xa8)
                            && (*p <= 0xaf))
                            (*p) -= 0x08;
                        else if ((*p >= 0xb8)
                            && (*p <= 0xb9))
                            (*p) -= 0x08;
                        break;
                    case 0xbf: // Greek extended
                        if ((*p >= 0x88)
                            && (*p <= 0x8c))
                            (*p) -= 0x08;
                        else if ((*p >= 0x98)
                            && (*p <= 0x9b))
                            (*p) -= 0x08;
                        else if ((*p >= 0xa8)
                            && (*p <= 0xac))
                            (*p) -= 0x08;
                        break;
                    default:
                        break;
                    }
                    break;
                case 0xf0: // Four byte code
                    pExtChar = p; // second byte
                    p++;
                    if (!*p)
                        return pString; // truncated sequence
                    switch (*pExtChar) {
                    case 0x90:
                        pExtChar = p; // third byte selects the block
                        p++;
                        if (!*p)
                            return pString; // truncated sequence
                        switch (*pExtChar) {
                        case 0x92: // Osage
                            if ((*p >= 0xb0)
                                && (*p <= 0xbf)) {
                                // U+104B0..U+104BF -> U+104D8..U+104E7
                                *pExtChar = 0x93;
                                (*p) -= 0x18;
                            }
                            break;
                        case 0x93: // Osage
                            if ((*p >= 0x80)
                                && (*p <= 0x93))
                                // U+104C0..U+104D3 -> U+104E8..U+104FB: the
                                // capital/small distance is 0x28 code points
                                (*p) += 0x28;
                            break;
                        default:
                            break;
                        }
                        break;
                    default:
                        break;
                    }
                    break;
                default:
                    break;
                }
                pExtChar = 0;
            }
            p++;
        }
    }
    return pString;
}
/*
 * StrnCiCmp - case-insensitive comparison of at most ztCount bytes of two
 * NUL-terminated UTF-8 strings.
 *
 * Both strings are copied, the copies are lower-cased with StrToLwrExt and
 * then compared byte by byte as unsigned values.
 *
 * s1, s2   strings to compare.
 * ztCount  maximum number of bytes to compare; comparison also stops at the
 *          first NUL.
 * Returns  0 when the compared bytes are equal after lower-casing, otherwise
 *          the difference of the first differing bytes (s1 minus s2).
 *          Returns -1 when either pointer is NULL or memory cannot be
 *          allocated, so -1 alone does not distinguish "less" from "error".
 *
 * Empty strings are compared like any other string ("" equals "").
 */
int StrnCiCmp(const char *s1, const char *s2, size_t ztCount)
{
    unsigned char *pStr1Low;
    unsigned char *pStr2Low;
    int iDiff = 0;
    size_t i;

    if (!s1 || !s2)
        return (-1);

    /* Work on copies: StrToLwrExt modifies its argument in place. */
    pStr1Low = (unsigned char *)calloc(strlen(s1) + 1, sizeof(char));
    pStr2Low = (unsigned char *)calloc(strlen(s2) + 1, sizeof(char));
    if (!pStr1Low || !pStr2Low) {
        /* free(NULL) is a no-op, so this releases whichever one succeeded. */
        free(pStr1Low);
        free(pStr2Low);
        return (-1);
    }

    strcpy((char *)pStr1Low, s1);
    strcpy((char *)pStr2Low, s2);
    StrToLwrExt(pStr1Low);
    StrToLwrExt(pStr2Low);

    for (i = 0; i < ztCount; i++) {
        iDiff = pStr1Low[i] - pStr2Low[i];
        /* When iDiff is 0 both bytes are equal, so one NUL test suffices. */
        if (iDiff != 0 || !pStr1Low[i])
            break;
    }

    free(pStr1Low);
    free(pStr2Low);
    return iDiff;
}
/*
 * StrCiCmp - case-insensitive comparison of two whole strings.
 *
 * Forwards to StrnCiCmp with the largest possible byte count, so the
 * comparison is bounded only by the strings' terminating NUL.
 * Returns whatever StrnCiCmp returns.
 */
int StrCiCmp(const char *s1, const char *s2)
{
    const size_t ztNoLimit = (size_t)(-1);

    return StrnCiCmp(s1, s2, ztNoLimit);
}
/*
 * StrCiStr - case-insensitive substring search.
 *
 * Tries every byte offset of s1 in turn and asks StrnCiCmp whether the
 * first strlen(s2) bytes at that offset match s2.
 *
 * Returns a pointer into s1 at the first matching offset, or 0 when there
 * is no match or when either argument is NULL or empty.
 */
char *StrCiStr(const char *s1, const char *s2)
{
    const char *pCursor;
    size_t ztNeedle;

    if (!s1 || !*s1 || !s2 || !*s2)
        return (0);

    ztNeedle = strlen(s2);
    for (pCursor = s1; *pCursor; pCursor++) {
        if (StrnCiCmp(pCursor, s2, ztNeedle) == 0)
            return (char *)pCursor;
    }
    return (0);
}
These case insensitive features are definitely needed in search facilities.
Well, I have the same need as described above. UTF-8 is pretty smooth in most ways, but upper/lower-case handling is not that great. It looks like it fell off the to-do list once the rest was done, even though it used to be one of the major topics on such lists. I patched the IBM keyboard driver in 1984, before IBM shipped it but when copies were already available. I also patched DisplayWrite 1 and 3 (a PC-DOS word processor) before IBM wanted to ship them in Europe, and I have done an awful lot of conversion between PC-DOS (CP850) or CP1252 (Windows) and the national EBCDIC code pages of IBM 3270 mainframe terminal systems. All of them had case sensitivity on the to-do list. All national ASCII versions and the CP1252 Windows tables (but not PC-DOS CP850) had a shift of 0x20 between 0x40-0x5F and 0x60-0x7F for flipping between upper and lower case.
What to do about it?
tolower() and toupper() do not work on multi-byte UTF-8 characters, i.e. on anything outside US-ASCII, because they only operate on a single byte. A string-based solution would work, though, and there are solutions for just about everything else.
Western Europeans are lucky
UTF-8 places CP1252 (Windows 8-bit/Latin-1) as its first additional table, the Latin-1 Supplement (Unicode block), as is. This means that it is possible to shift the letters (C3 XX) just like regular US-ASCII. See the code sample below.
Greeks, Russians, Icelanders and Eastern Europeans are not that lucky
For the Icelanders the Đ/đ - D with stroke (same as the th sound of the word the) is just punched out from CP1252.
The ISO 8-bit character sets of the Greeks, Russians and Eastern Europeans (CP1253, CP1251 and CP1257) could have been used as is (the way the Latin CP1252 was used directly). Then simple shifting would have worked for them too. Instead, someone filled the tables pretty randomly (as in the PC-DOS 8-bit ASCII).
There is only one working solution, the same as for PC-DOS ASCII: make translation tables. I will do it next Christmas (when I need it badly), but here is a hint on how to do it in case someone else is in a hurry.
How to do solutions for the Greeks, Russians, Icelanders and Eastern Europeans
In the program code, make a separate table for each first byte of the UTF-8 encoding used for Eastern European, Greek and Cyrillic letters. Index each table by the second byte of the letter; at the position of every upper-case letter store the second byte of the matching lower-case letter, and make a second set of tables that maps the other way around.
Then identify which first byte relates to each table. That way the program can select the right table, read the right position, and get the upper- or lower-case character it needs. Then modify the letter-case functions below (the ones I have made for Latin-1) to use tables instead of shifting by 0x20 for those first UTF-8 bytes where tables must be used. It will work smoothly, and new computers have no problem with memory and power.
UTF8 letter case related functions Latin1 samples
I believe this works, although I have only tested it briefly. It only handles the Latin-1 and US-ASCII parts of UTF-8.
/*
 * StrToLwrUft8Latin1 - lower-case the US-ASCII and Latin-1 Supplement
 * letters of a NUL-terminated UTF-8 string in place.
 *
 * Only 'A'..'Z' and the two-byte sequences C3 80..C3 9E (except C3 97, the
 * multiplication sign) are changed; each is shifted by 0x20. ASCII
 * punctuation such as '@', '[' and '_', and C3 9F (sharp s, already lower
 * case), are left untouched.
 *
 * pString  buffer to modify; NULL or empty input is returned unchanged.
 * Returns  pString.
 */
unsigned char *StrToLwrUft8Latin1(unsigned char *pString)
{
    int afterC3 = 0; /* non-zero while *p is the byte following a 0xc3 prefix */
    unsigned char *p;

    if (!pString)
        return pString;

    for (p = pString; *p; p++) {
        if (afterC3) {
            if ((*p >= 0x80) && (*p <= 0x9e) && (*p != 0x97))
                *p += 0x20;
            afterC3 = 0; /* the prefix covers exactly one following byte */
        } else if ((*p >= 'A') && (*p <= 'Z')) {
            *p += 0x20;
        } else if (*p == 0xc3) {
            afterC3 = 1;
        }
    }
    return pString;
}
/*
 * StrToUprUft8Latin1 - upper-case the US-ASCII and Latin-1 Supplement
 * letters of a NUL-terminated UTF-8 string in place.
 *
 * Only 'a'..'z' and the two-byte sequences C3 A0..C3 BE (except C3 B7, the
 * division sign) are changed; each is shifted down by 0x20. ASCII
 * punctuation such as '`', '{' and '~' is left untouched, and so is C3 BF,
 * which the 0x20 shift cannot map (the range stops at 0xbe, as before).
 *
 * pString  buffer to modify; NULL or empty input is returned unchanged.
 * Returns  pString.
 */
unsigned char *StrToUprUft8Latin1(unsigned char *pString)
{
    int afterC3 = 0; /* non-zero while *p is the byte following a 0xc3 prefix */
    unsigned char *p;

    if (!pString)
        return pString;

    for (p = pString; *p; p++) {
        if (afterC3) {
            if ((*p >= 0xa0) && (*p <= 0xbe) && (*p != 0xb7))
                *p -= 0x20;
            afterC3 = 0; /* the prefix covers exactly one following byte */
        } else if ((*p >= 'a') && (*p <= 'z')) {
            *p -= 0x20;
        } else if (*p == 0xc3) {
            afterC3 = 1;
        }
    }
    return pString;
}
/*
 * Fold one byte of a UTF-8 string to lower case for the US-ASCII and
 * Latin-1 Supplement ranges. afterC3 is non-zero when the previous byte was
 * the 0xc3 prefix.
 */
static unsigned char FoldUtf8Latin1Byte(unsigned char c, int afterC3)
{
    if (afterC3) {
        /* C3 80..C3 9E are capitals, except C3 97 (multiplication sign). */
        if ((c >= 0x80) && (c <= 0x9e) && (c != 0x97))
            return (unsigned char)(c + 0x20);
        return c;
    }
    if ((c >= 'A') && (c <= 'Z'))
        return (unsigned char)(c + 0x20);
    return c;
}

/*
 * StrnCiCmpLatin1 - case-insensitive comparison of at most ztCount bytes of
 * two NUL-terminated UTF-8 strings, folding only US-ASCII and Latin-1
 * Supplement letters. No memory is allocated.
 *
 * Bytes are compared at full 8-bit width, so a prefix byte such as 0xc3 no
 * longer compares equal to the ASCII letter that shares its low seven bits.
 *
 * s1, s2   strings to compare.
 * ztCount  maximum number of bytes to compare; comparison also stops at the
 *          first NUL.
 * Returns  0 when the compared bytes are equal after folding, otherwise the
 *          difference of the first differing folded bytes (s1 minus s2).
 *          Returns 0 when either pointer is NULL (kept from the previous
 *          behaviour).
 */
int StrnCiCmpLatin1(const char *s1, const char *s2, size_t ztCount)
{
    int afterC3First = 0;
    int afterC3Second = 0;

    if (!s1 || !s2)
        return 0;

    for (; ztCount--; s1++, s2++) {
        const unsigned char c1 = (unsigned char)*s1;
        const unsigned char c2 = (unsigned char)*s2;
        const int iDiff = FoldUtf8Latin1Byte(c1, afterC3First)
            - FoldUtf8Latin1Byte(c2, afterC3Second);

        if (iDiff != 0 || !c1 || !c2)
            return iDiff;

        /* A 0xc3 prefix affects only the single byte that follows it. */
        afterC3First = (!afterC3First && c1 == 0xc3);
        afterC3Second = (!afterC3Second && c2 == 0xc3);
    }
    return 0;
}
/*
 * StrCiCmpLatin1 - case-insensitive comparison of two whole strings.
 *
 * Forwards to StrnCiCmpLatin1 with the largest possible byte count, so the
 * comparison is bounded only by the strings' terminating NUL.
 * Returns whatever StrnCiCmpLatin1 returns.
 */
int StrCiCmpLatin1(const char *s1, const char *s2)
{
    const size_t ztNoLimit = (size_t)(-1);

    return StrnCiCmpLatin1(s1, s2, ztNoLimit);
}
/*
 * StrCiStrLatin1 - case-insensitive substring search.
 *
 * Tries every byte offset of s1 in turn and asks StrnCiCmpLatin1 whether
 * the first strlen(s2) bytes at that offset match s2.
 *
 * Returns a pointer into s1 at the first matching offset, or 0 when there
 * is no match or when either argument is NULL or empty.
 */
char *StrCiStrLatin1(const char *s1, const char *s2)
{
    char *pCursor;
    size_t ztNeedle;

    if (!s1 || !*s1 || !s2 || !*s2)
        return (0);

    ztNeedle = strlen(s2);
    for (pCursor = (char *)s1; *pCursor; pCursor++) {
        if (StrnCiCmpLatin1(pCursor, s2, ztNeedle) == 0)
            return pCursor;
    }
    return (0);
}
There is no standard way to do Unicode case conversion in C++. There are ways that work on some C++ implementations, but the standard doesn't require them to.
If you want guaranteed Unicode case conversion, you will need to use a library like ICU or Boost.Locale (aka: ICU with a more C++-like interface).