How to uppercase/lowercase UTF-8 characters in C++?

后端 未结 4 2107
野性不改
野性不改 2021-02-19 09:59

Let's imagine I have a UTF-8 encoded std::string containing the following:

óó

and I'd like to convert it to the following:

ÓÓ

4条回答
  •  难免孤独
    2021-02-19 10:19

    This function covers the case-sensitive character sets in UTF-8 that I found (there might be some bugs in the matching, but I think I got it right).

    Well, this is shorter than a table-based solution, and does it cover it all?

    It could likely be written more elegantly, but it shows the picture. The cmp function must copy the strings for the operation, which might be slow for huge strings and eat memory. But we are no longer five people sharing a PDP-11 with 64 KB, like in the past, are we?

    This is something I would like to find in a place like this. (The uppercase version is not needed for case-insensitive comparisons, and might be added later.)

    /*
     * Lowercases a NUL-terminated UTF-8 string in place and returns pString.
     *
     * ASCII 'A'..'Z' are shifted by 0x20. Multi-byte sequences are recognised
     * by their lead byte and rewritten byte-for-byte using hard-coded ranges
     * for parts of Latin-1, Latin Extended, Greek, Cyrillic, Armenian,
     * Georgian, Greek Extended and Osage. Every mapping keeps the encoded
     * length unchanged, so the buffer never grows or shrinks.
     *
     * A NULL or empty pString is returned untouched. Bytes that match none of
     * the ranges are left as they are. The input is not validated as UTF-8;
     * the only guarantee for malformed input is that scanning stops at the
     * NUL terminator, even when it falls inside a multi-byte sequence.
     *
     * NOTE(review): the range tables are hand-written and have not been
     * checked exhaustively against the Unicode case-mapping data.
     */
    unsigned char *StrToLwrExt(unsigned char *pString)
    {
        if (pString && *pString) {
            unsigned char *p = pString;
            while (*p) {
                if ((*p >= 0x41) && (*p <= 0x5a)) // US ASCII
                    (*p) += 0x20;
                else if (*p > 0xc0) {
                    // pExtChar tracks the byte that selects the table;
                    // p points at the byte that is tested and modified.
                    unsigned char *pExtChar = p;
                    p++;
                    switch (*pExtChar) {
                    case 0xc3: // Latin 1
                        if ((*p >= 0x80)
                            && (*p <= 0x9e)
                            && (*p != 0x97)) // 0x97 is the multiplication sign
                            (*p) += 0x20; // Lowercase is 0x20 above
                        break;
                    case 0xc4: // Latin Extended
                        if ((*p >= 0x80)
                            && (*p <= 0xb7)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        else if ((*p >= 0xb9)
                            && (*p <= 0xbe)
                            && (*p % 2)) // Odd
                            (*p)++; // Next char is lwr
                        else if (*p == 0xbf) {
                            // Lowercase lives under the next lead byte
                            *pExtChar = 0xc5;
                            (*p) = 0x80;
                        }
                        break;
                    case 0xc5: // Latin Extended
                        if ((*p >= 0x80)
                            && (*p <= 0x88)
                            && (*p % 2)) // Odd
                            (*p)++; // Next char is lwr
                        else if ((*p >= 0x8a)
                            && (*p <= 0xb7)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        else if (*p == 0xb8) {
                            // U+0178 (Y with diaeresis) -> U+00FF (c3 bf)
                            *pExtChar = 0xc3;
                            (*p) = 0xbf;
                        }
                        else if ((*p >= 0xb9)
                            && (*p <= 0xbe)
                            && (*p % 2)) // Odd
                            (*p)++; // Next char is lwr
                        break;
                    case 0xc6: // Latin Extended
                        switch (*p) {
                        case 0x82:
                        case 0x84:
                        case 0x87:
                        case 0x8b:
                        case 0x91:
                        case 0x98:
                        case 0xa0:
                        case 0xa2:
                        case 0xa4:
                        case 0xa7:
                        case 0xac:
                        case 0xaf:
                        case 0xb3:
                        case 0xb5:
                        case 0xb8:
                        case 0xbc:
                            (*p)++; // Next char is lwr
                            break;
                        default:
                            break;
                        }
                        break;
                    case 0xc7: // Latin Extended
                        if (*p == 0x84)
                            (*p) = 0x86;
                        else if (*p == 0x85)
                            (*p)++; // Next char is lwr
                        else if (*p == 0x87)
                            (*p) = 0x89;
                        else if (*p == 0x88)
                            (*p)++; // Next char is lwr
                        else if (*p == 0x8a)
                            (*p) = 0x8c;
                        else if (*p == 0x8b)
                            (*p)++; // Next char is lwr
                        else if ((*p >= 0x8d)
                            && (*p <= 0x9c)
                            && (*p % 2)) // Odd
                            (*p)++; // Next char is lwr
                        else if ((*p >= 0x9e)
                            && (*p <= 0xaf)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        else if (*p == 0xb1)
                            (*p) = 0xb3;
                        else if (*p == 0xb2)
                            (*p)++; // Next char is lwr
                        else if (*p == 0xb4)
                            (*p)++; // Next char is lwr
                        else if (*p == 0xb8)
                            (*p)++; // Next char is lwr
                        else if (*p == 0xba)
                            (*p)++; // Next char is lwr
                        else if (*p == 0xbc)
                            (*p)++; // Next char is lwr
                        else if (*p == 0xbe)
                            (*p)++; // Next char is lwr
                        break;
                    case 0xc8: // Latin Extended
                        if ((*p >= 0x80)
                            && (*p <= 0x9f)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        else if ((*p >= 0xa2)
                            && (*p <= 0xb3)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        else if (*p == 0xbb)
                            (*p)++; // Next char is lwr
                        break;
                    case 0xcd: // Greek & Coptic
                        switch (*p) {
                        case 0xb0:
                        case 0xb2:
                        case 0xb6:
                            (*p)++; // Next char is lwr
                            break;
                        default:
                            if (*p == 0xbf) {
                                *pExtChar = 0xcf;
                                (*p) = 0xb3;
                            }
                            break;
                        }
                        break;
                    case 0xce: // Greek & Coptic
                        if (*p == 0x86)
                            (*p) = 0xac;
                        else if (*p == 0x88)
                            (*p) = 0xad;
                        else if (*p == 0x89)
                            (*p) = 0xae;
                        else if (*p == 0x8a)
                            (*p) = 0xaf;
                        else if (*p == 0x8c) {
                            *pExtChar = 0xcf;
                            (*p) = 0x8c;
                        }
                        else if (*p == 0x8e) {
                            *pExtChar = 0xcf;
                            (*p) = 0x8d;
                        }
                        else if (*p == 0x8f) {
                            *pExtChar = 0xcf;
                            (*p) = 0x8e;
                        }
                        else if ((*p >= 0x91)
                            && (*p <= 0x9f))
                            (*p) += 0x20; // Lowercase is 0x20 above
                        else if ((*p >= 0xa0)
                            && (*p <= 0xab)
                            && (*p != 0xa2)) {
                            // Lowercase wraps into the next lead byte
                            *pExtChar = 0xcf;
                            (*p) -= 0x20;
                        }
                        break;
                    case 0xcf: // Greek & Coptic
                        if (*p == 0x8f)
                            (*p) = 0xb4;
                        else if (*p == 0x91)
                            (*p)++; // Next char is lwr
                        else if ((*p >= 0x98)
                            && (*p <= 0xaf)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        else if (*p == 0xb4)
                            (*p) = 0x91;
                        else if (*p == 0xb7)
                            (*p)++; // Next char is lwr
                        else if (*p == 0xb9)
                            (*p) = 0xb2;
                        else if (*p == 0xbb)
                            (*p)++; // Next char is lwr
                        else if (*p == 0xbd) {
                            *pExtChar = 0xcd;
                            (*p) = 0xbb;
                        }
                        else if (*p == 0xbe) {
                            *pExtChar = 0xcd;
                            (*p) = 0xbc;
                        }
                        else if (*p == 0xbf) {
                            *pExtChar = 0xcd;
                            (*p) = 0xbd;
                        }
                        break;
                    case 0xd0: // Cyrillic
                        if ((*p >= 0x80)
                            && (*p <= 0x8f)) {
                            *pExtChar = 0xd1;
                            (*p) += 0x10;
                        }
                        else if ((*p >= 0x90)
                            && (*p <= 0x9f))
                            (*p) += 0x20; // Lowercase is 0x20 above
                        else if ((*p >= 0xa0)
                            && (*p <= 0xaf)) {
                            // Lowercase wraps into the next lead byte
                            *pExtChar = 0xd1;
                            (*p) -= 0x20;
                        }
                        break;
                    case 0xd1: // Cyrillic supplement
                        if ((*p >= 0xa0)
                            && (*p <= 0xbf)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        break;
                    case 0xd2: // Cyrillic supplement
                        if (*p == 0x80)
                            (*p)++; // Next char is lwr
                        else if ((*p >= 0x8a)
                            && (*p <= 0xbf)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        break;
                    case 0xd3: // Cyrillic supplement
                        if ((*p >= 0x81)
                            && (*p <= 0x8e)
                            && (*p % 2)) // Odd
                            (*p)++; // Next char is lwr
                        else if ((*p >= 0x90)
                            && (*p <= 0xbf)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        break;
                    case 0xd4: // Cyrillic supplement & Armenian
                        if ((*p >= 0x80)
                            && (*p <= 0xaf)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        else if ((*p >= 0xb1)
                            && (*p <= 0xbf)) {
                            *pExtChar = 0xd5;
                            (*p) -= 0x10;
                        }
                        break;
                    case 0xd5: // Armenian
                        if ((*p >= 0x80)
                            && (*p <= 0x96)
                            && (!(*p % 2))) // Even
                            (*p)++; // Next char is lwr
                        break;
                    case 0xe1: // Three byte code
                        if (!*p)
                            break; // String ended right after the lead byte
                        pExtChar = p;
                        p++;
                        switch (*pExtChar) {
                        case 0x82: // Georgian
                            if ((*p >= 0xa0)
                                && (*p <= 0xbf)) {
                                *pExtChar = 0x83;
                                (*p) -= 0x10;
                            }
                            break;
                        case 0x83: // Georgian
                            // Parenthesised to match how && and || bind
                            if (((*p >= 0x80)
                                && ((*p <= 0x85)
                                    || (*p == 0x87)))
                                || (*p == 0x8d))
                                (*p) += 0x30;
                            break;
                        case 0xb8: // Latin extended
                            if ((*p >= 0x80)
                                && (*p <= 0xbf)
                                && (!(*p % 2))) // Even
                                (*p)++; // Next char is lwr
                            break;
                        case 0xb9: // Latin extended
                            if ((*p >= 0x80)
                                && (*p <= 0xbf)
                                && (!(*p % 2))) // Even
                                (*p)++; // Next char is lwr
                            break;
                        case 0xba: // Latin extended
                            if ((*p >= 0x80)
                                && (*p <= 0x94)
                                && (!(*p % 2))) // Even
                                (*p)++; // Next char is lwr
                            else if ((*p >= 0x9e)
                                && (*p <= 0xbf)
                                && (!(*p % 2))) // Even
                                (*p)++; // Next char is lwr
                            break;
                        case 0xbb: // Latin extended
                            if ((*p >= 0x80)
                                && (*p <= 0xbf)
                                && (!(*p % 2))) // Even
                                (*p)++; // Next char is lwr
                            break;
                        case 0xbc: // Greek extended
                            if ((*p >= 0x88)
                                && (*p <= 0x8f))
                                (*p) -= 0x08;
                            else if ((*p >= 0x98)
                                && (*p <= 0x9f))
                                (*p) -= 0x08;
                            else if ((*p >= 0xa8)
                                && (*p <= 0xaf))
                                (*p) -= 0x08;
                            else if ((*p >= 0xb8)
                                && (*p <= 0xbf)) // U+1F38..U+1F3F; bound was 0x8f, so never matched
                                (*p) -= 0x08;
                            break;
                        case 0xbd: // Greek extended
                            if ((*p >= 0x88)
                                && (*p <= 0x8d))
                                (*p) -= 0x08;
                            else if ((*p >= 0x98)
                                && (*p <= 0x9f))
                                (*p) -= 0x08;
                            else if ((*p >= 0xa8)
                                && (*p <= 0xaf))
                                (*p) -= 0x08;
                            // No 0xb8.. range here: U+1F78 onward are
                            // already lowercase letters.
                            break;
                        case 0xbe: // Greek extended
                            if ((*p >= 0x88)
                                && (*p <= 0x8f))
                                (*p) -= 0x08;
                            else if ((*p >= 0x98)
                                && (*p <= 0x9f))
                                (*p) -= 0x08;
                            else if ((*p >= 0xa8)
                                && (*p <= 0xaf))
                                (*p) -= 0x08;
                            else if ((*p >= 0xb8)
                                && (*p <= 0xb9))
                                (*p) -= 0x08;
                            break;
                        case 0xbf: // Greek extended
                            if ((*p >= 0x88)
                                && (*p <= 0x8c))
                                (*p) -= 0x08;
                            else if ((*p >= 0x98)
                                && (*p <= 0x9b))
                                (*p) -= 0x08;
                            else if ((*p >= 0xa8)
                                && (*p <= 0xac))
                                (*p) -= 0x08;
                            break;
                        default:
                            break;
                        }
                        break;
                    case 0xf0: // Four byte code
                        if (!*p)
                            break; // String ended right after the lead byte
                        pExtChar = p;
                        p++;
                        switch (*pExtChar) {
                        case 0x90:
                            if (!*p)
                                break; // String ended after the second byte
                            pExtChar = p;
                            p++;
                            switch (*pExtChar) {
                            case 0x92: // Osage
                                if ((*p >= 0xb0)
                                    && (*p <= 0xbf)) {
                                    *pExtChar = 0x93;
                                    (*p) -= 0x18;
                                }
                                break;
                            case 0x93: // Osage
                                if ((*p >= 0x80)
                                    && (*p <= 0x93))
                                    (*p) += 0x18;
                                break;
                            default:
                                break;
                            }
                            break;
                        default:
                            break;
                        }
                        break;
                    default:
                        break;
                    }
                    // A truncated sequence leaves p on the terminator; stop
                    // here so the p++ below cannot step past the string.
                    if (!*p)
                        break;
                }
                p++;
            }
        }
        return pString;
    }
    
    /*
     * Case-insensitive comparison of at most ztCount bytes of s1 and s2.
     *
     * Both strings are copied to heap buffers, lowercased with StrToLwrExt,
     * and compared byte by byte. ztCount counts bytes, not characters.
     *
     * Returns 0 when the compared prefixes are equal, otherwise the
     * difference between the first pair of differing (lowercased) bytes.
     * Returns -1 when either pointer is NULL or an allocation fails, so -1
     * is ambiguous between "s1 sorts first" and "error".
     *
     * Empty strings are compared like any other string: two empty strings
     * are equal, and a non-empty string sorts after an empty one.
     */
    int StrnCiCmp(const char *s1, const char *s2, size_t ztCount)
    {
        int iResult = -1;
        unsigned char *pStr1Low;
        unsigned char *pStr2Low;

        if (!s1 || !s2)
            return (-1);

        pStr1Low = calloc(strlen(s1) + 1, sizeof(char));
        pStr2Low = calloc(strlen(s2) + 1, sizeof(char));
        if (pStr1Low && pStr2Low) {
            const unsigned char *p1 = pStr1Low;
            const unsigned char *p2 = pStr2Low;
            strcpy((char *)pStr1Low, s1);
            strcpy((char *)pStr2Low, s2);
            StrToLwrExt(pStr1Low);
            StrToLwrExt(pStr2Low);
            iResult = 0;
            for (; ztCount--; p1++, p2++) {
                int iDiff = *p1 - *p2;
                // Equal bytes that are both NUL mean both strings ended.
                if (iDiff != 0 || !*p1) {
                    iResult = iDiff;
                    break;
                }
            }
        }
        // free(NULL) is a no-op, so this also covers the case where only
        // one of the two allocations succeeded.
        free(pStr1Low);
        free(pStr2Low);
        return iResult;
    }
    /*
     * Case-insensitive comparison of two whole strings.
     * Forwards to StrnCiCmp with the largest size_t value as the byte limit,
     * so the comparison runs until a difference or a terminator is found.
     * The return value is whatever StrnCiCmp returns.
     */
    int StrCiCmp(const char *s1, const char *s2)
    {
        const size_t ztNoLimit = (size_t)(-1);

        return StrnCiCmp(s1, s2, ztNoLimit);
    }
    /*
     * Case-insensitive substring search: finds the first occurrence of s2
     * in s1, comparing the StrToLwrExt-lowercased forms of both strings.
     *
     * Returns a pointer into s1 at the start of the match, or a null pointer
     * when there is no match, when either argument is NULL or empty, or when
     * a working buffer cannot be allocated. As with strstr, the const
     * qualifier of s1 is dropped on the returned pointer.
     *
     * Each string is lowercased once into a temporary copy and searched with
     * strstr, instead of copying and lowercasing both strings again at every
     * haystack position. StrToLwrExt never changes the byte length, so an
     * offset in the lowercased copy is the same offset in s1.
     * Requires <stdlib.h> and <string.h>.
     */
    char *StrCiStr(const char *s1, const char *s2)
    {
        char *pFound = 0;

        if (s1 && *s1 && s2 && *s2) {
            unsigned char *pHayLow = calloc(strlen(s1) + 1, sizeof(char));
            unsigned char *pNeedleLow = calloc(strlen(s2) + 1, sizeof(char));
            if (pHayLow && pNeedleLow) {
                const char *pHit;
                strcpy((char *)pHayLow, s1);
                strcpy((char *)pNeedleLow, s2);
                StrToLwrExt(pHayLow);
                StrToLwrExt(pNeedleLow);
                pHit = strstr((const char *)pHayLow, (const char *)pNeedleLow);
                if (pHit)
                    pFound = (char *)s1 + (pHit - (const char *)pHayLow);
            }
            free(pHayLow);
            free(pNeedleLow);
        }
        return pFound;
    }
    

提交回复
热议问题