I'm programming something that counts the number of UTF-8 characters in a file. I've already written the base code but now, I'm stuck in the part where the characters are su…
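There are multiple options you may take:
- read the file as a wide-character stream and count the characters (see `main1` below);
- read the file into a buffer and convert the multibyte string to a wide string (see `main2` below);
- use a library that works on UTF-8 strings directly (see `main3` below, which uses `libunistring`);
- write your own `utf8_strlen`-ish solution that relies on specific properties of UTF-8 strings and checks the bytes yourself, as shown in other answers.

Here is an example program that has to be compiled with `-lunistring` under Linux, with rudimentary error checking done with `assert`: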
#include <assert.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
// Variant 1: read "file.txt" as a wide-character stream and count the
// characters that fgetwc() returns.
//
// The locale is switched to "en_US.UTF-8" first. The result is printed to
// stdout. A missing locale, an unopenable file, or a read/decoding error
// trips an assert.
void main1()
{
    // read the file as wide characters
    const char *l = setlocale(LC_ALL, "en_US.UTF-8");
    assert(l);
    FILE *file = fopen("file.txt", "r");
    assert(file);
    size_t count = 0;
    while (fgetwc(file) != WEOF) {
        count++;
    }
    // fgetwc() returns WEOF both at end-of-file and on a read or encoding
    // error. Only a clean end-of-file means every character was counted, so
    // refuse to print a silently truncated result.
    assert(feof(file) && !ferror(file));
    fclose(file);
    printf("Number of characters: %zu\n", count);
}
// Helper: read the whole file `filename` into a freshly allocated buffer.
//
// On success returns the buffer, which holds the file's bytes followed by a
// terminating '\0', and stores the number of bytes read (terminator not
// included) in *strlen. The caller owns the buffer and must free() it.
//
// On failure (file cannot be opened, out of memory, read error) returns NULL
// and stores 0 in *strlen.
char *file_to_buf(const char *filename, size_t *strlen) {
    *strlen = 0;

    FILE *file = fopen(filename, "r");
    if (file == NULL) {
        return NULL;
    }

    size_t cap = 64;
    size_t n = 0;
    char *ret = malloc(cap);
    if (ret == NULL) {
        goto fail;
    }

    for (int c; (c = fgetc(file)) != EOF;) {
        // Always keep one spare byte for the terminating '\0'.
        if (n + 1 == cap) {
            if (cap > (size_t)-1 / 2) {
                goto fail;
            }
            // Grow geometrically; keep `ret` valid if realloc fails so it
            // can still be freed.
            char *grown = realloc(ret, cap * 2);
            if (grown == NULL) {
                goto fail;
            }
            ret = grown;
            cap *= 2;
        }
        ret[n++] = (char)c;
    }

    // fgetc() returns EOF on read errors too; do not hand back partial data.
    if (ferror(file)) {
        goto fail;
    }

    fclose(file);
    ret[n] = '\0';
    *strlen = n;
    return ret;

fail:
    free(ret);
    fclose(file);
    return NULL;
}
// Variant 2: load "file.txt" into memory and count its characters by
// decoding the multibyte string one character at a time.
//
// The locale is switched to "en_US.UTF-8" first, so the bytes are assumed to
// be UTF-8. The result is printed to stdout. A missing locale, an unreadable
// file, or an invalid/incomplete multibyte sequence trips an assert.
//
// The same could be done in a streaming fashion, decoding while reading
// byte by byte from the file and managing a small buffer.
void main2() {
    const char *l = setlocale(LC_ALL, "en_US.UTF-8");
    assert(l);
    size_t len = 0;
    char *str = file_to_buf("file.txt", &len);
    assert(str);

    // Decode with an explicit byte limit rather than relying on the '\0'
    // terminator, so a NUL byte inside the file does not end the count early.
    mbstate_t ps = {0};
    size_t count = 0;
    for (size_t off = 0; off < len; count++) {
        size_t used = mbrtowc(NULL, str + off, len - off, &ps);
        // (size_t)-1: invalid sequence, (size_t)-2: sequence cut off at the end
        assert(used != (size_t)-1 && used != (size_t)-2);
        // mbrtowc() returns 0 for the null character, which is one byte long.
        off += (used != 0) ? used : 1;
    }

    printf("Number of characters: %zu\n", count);
    free(str);
}
#include <unistr.h> // u8_mbsnlen from libunistring
// Variant 3: load "file.txt" into memory and let libunistring's
// u8_mbsnlen() count the characters. The result is printed to stdout.
void main3() {
    size_t nbytes = 0;
    char *contents = file_to_buf("file.txt", &nbytes);
    assert(contents);

    // for simplicity uint8_t is assumed to be the same type as unsigned char
    const uint8_t *units = (const uint8_t *)contents;
    size_t nchars = u8_mbsnlen(units, nbytes);

    printf("Number of characters: %zu\n", nchars);
    free(contents);
}
// Entry point: runs the three counting variants one after another.
int main() {
    main1(); // wide-character stream
    main2(); // in-memory multibyte decoding
    main3(); // libunistring
    return 0;
}