I am trying to parse text and find certain characters in it, using the code below. It works with plain ASCII characters such as abcdef,
but it does not work with non-ASCII characters such as öçşğüı.
See the Wikipedia article on UTF-8 (https://en.wikipedia.org/wiki/UTF-8); in particular, it contains a table with the bit patterns.
Here's another way to scan/convert a UTF-8 string into codepoints
[not exact, just an example -- refer to the wiki]:
// utf8scan -- convert utf8 to codepoints (example)
// input text: main scans this buffer as a NUL-terminated string
char inpbuf[1000];
// NOTE(review): not referenced anywhere in the visible code -- confirm whether it is still needed
char uni[8];
// codepoint_t -- storage for one character of the input
// utf8[] holds the raw encoded bytes of the character; code overlays the same
// storage so an entry can be cleared or compared with a single integer
// operation. Note that code is the packed byte sequence, not the decoded
// Unicode scalar value.
typedef union {
    char utf8[4];
    unsigned int code;
} codepoint_t;

// output entries: main stores one entry per input character and finishes the
// list with an all-zero entry
codepoint_t outbuf[1000];

// unidecode -- copy one multi-byte utf8 sequence into a codepoint entry
//
// lhs: destination entry. Only the bytes that belong to the sequence are
//      written; the caller should zero lhs->code first if the unused trailing
//      bytes must read as zero.
// rhs: pointer to the first (lead) byte of the sequence
//
// The lead byte is stored unconditionally, then every following continuation
// byte (bit pattern 10xxxxxx) is stored after it. Scanning stops at the end of
// the string, at an ascii byte, or at the lead byte of the next sequence.
//
// Malformed input with more continuation bytes than utf8[] can hold is still
// consumed, but the surplus bytes are dropped instead of being written past
// the end of the array.
//
// RETURNS: updated rhs pointer (first byte that is not part of this sequence);
// rhs itself if it already points at the string terminator
char *
unidecode(codepoint_t *lhs,char *rhs)
{
    unsigned int idx = 0;

    // empty input: never step over the terminator
    if (*rhs == 0)
        return rhs;

    // lead byte
    lhs->utf8[idx++] = *rhs++;

    for (; ; ++rhs) {
        // go through unsigned char so the bit tests do not depend on the
        // signedness of plain char
        unsigned char chr = (unsigned char) *rhs;

        // anything that is not 10xxxxxx ends the sequence: end of string
        // (0x00), an ascii char (0xxxxxxx) or a new lead byte (11xxxxxx)
        if ((chr & 0xC0) != 0x80)
            break;

        // store only while there is room -- surplus bytes are skipped
        if (idx < sizeof(lhs->utf8))
            lhs->utf8[idx++] = (char) chr;
    }

    return rhs;
}
// main -- main program
// Walks inpbuf one character at a time, stores the bytes of each character in
// the next outbuf entry, and closes the list with an all-zero entry.
int
main(void)
{
    char *src = inpbuf;
    codepoint_t *dst = outbuf;

    while (*src != 0) {
        // start every entry from zero
        dst->code = 0;

        if (*src & 0x80) {
            // multi-byte char: unidecode fills the entry and hands back the
            // position of the next char
            src = unidecode(dst,src);
        }
        else {
            // ascii char: a single byte
            dst->utf8[0] = *src;
            ++src;
        }

        ++dst;
    }

    // add EOS
    dst->code = 0;

    return 0;
}