How to compare multibyte characters in C

前端 未结 4 1212
感动是毒
感动是毒 2021-01-17 19:11

I am trying to parse text and find certain characters in it, using the code below. It works with plain ASCII characters such as abcdef, but it does not work with multibyte characters such as öçşğüı.

4条回答
  •  走了就别回头了
    2021-01-17 20:03

    See wiki here: https://en.wikipedia.org/wiki/UTF-8 In particular, there is a table with the bit patterns.

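    Here's another way to scan a UTF-8 string and split it into one entry per character. Note that this is only an example, not an exact decoder: it stores each character's raw UTF-8 bytes and does not compute the numeric code point (refer to the wiki for the bit patterns needed for that):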

    // utf8scan -- convert utf8 to codepoints (example)
    
    // inpbuf -- NUL-terminated UTF-8 text that main() scans
    char inpbuf[1000];
    // uni -- scratch buffer; not referenced by any code visible here
    char uni[8];

    // codepoint_t -- one character of input
    // utf8 holds the character's raw UTF-8 bytes (at most 4). Despite the
    // name, no decoding to a Unicode scalar value is done. code overlays
    // the same storage so a caller can clear the whole entry with a single
    // assignment, as main() does.
    typedef union {
        char utf8[4];
        unsigned int code;
    } codepoint_t;

    // outbuf -- one entry per input character, terminated by an entry whose
    // code is 0
    codepoint_t outbuf[1000];

    // unidecode -- copy one multibyte UTF-8 character into a codepoint_t
    // lhs: destination; bytes beyond the ones copied are left untouched, so
    //      the caller should clear lhs->code first
    // rhs: points at the first (lead) byte of the character
    // RETURNS: updated rhs pointer -- the first byte that is not a
    //          continuation byte (the NUL terminator, an ASCII byte, or the
    //          lead byte of the next character)
    // At most sizeof(lhs->utf8) bytes are stored. Any further continuation
    // bytes in a malformed, over-long sequence are skipped rather than
    // written past the end of lhs->utf8.
    char *
    unidecode(codepoint_t *lhs,char *rhs)
    {
        int idx;
        int chr;

        // the lead byte is always stored
        lhs->utf8[0] = *rhs;

        // empty input: do not step over the terminator
        if (*rhs == 0)
            return rhs;
        ++rhs;

        for (idx = 1;  ;  ++rhs) {
            // chr may be negative where plain char is signed; only bits 7
            // and 6 are tested and both survive sign extension
            chr = *rhs;

            // end of string
            if (chr == 0)
                break;

            // start of new ascii char (0xxxxxxx)
            if ((chr & 0x80) == 0)
                break;

            // start of new unicode char (11xxxxxx); a continuation byte is
            // 10xxxxxx
            if (chr & 0x40)
                break;

            // store the continuation byte only while there is room
            if (idx < (int) sizeof(lhs->utf8))
                lhs->utf8[idx++] = (char) chr;
        }

        return rhs;
    }
    
    // main -- split the text in inpbuf into one outbuf entry per character
    // Nothing in this function fills inpbuf; it scans whatever NUL-terminated
    // text the buffer holds when the program starts.
    // RETURNS: 0 always
    int
    main(void)
    {
        char *src = inpbuf;
        codepoint_t *dst = outbuf;

        while (*src != 0) {
            // clear the whole entry so unused trailing bytes read as zero
            dst->code = 0;

            if (*src & 0x80) {
                // high bit set: multibyte character, copy it and skip past
                src = unidecode(dst,src);
            }
            else {
                // plain ascii char occupies a single byte
                dst->utf8[0] = *src;
                ++src;
            }

            ++dst;
        }

        // terminate the output with an all-zero entry
        dst->code = 0;

        return 0;
    }
    

提交回复
热议问题