How to iterate through unicode characters and print them on the screen with printf in C?

后端 未结 4 1209
不知归路
不知归路 2021-01-05 21:00

I want to iterate through all (at least the 16 bit) unicode characters and print them on the screen with C.

I know there are related questions on SO but they don't…

4条回答
  •  一整个雨季
    2021-01-05 21:21

    In C99, you can use wide character to multibyte character conversion functions wctomb() or wcrtomb() to convert each code point to a local representation, using the current character set. (The code points are in the current character set, not Unicode.) Remember to use setlocale() to ensure conversion functions are aware of the user locale (most importantly, the current character set used). The conversion functions use the LC_CTYPE category, but you should still use setlocale(LC_ALL, ""); as for any other locale-aware program.

    (Not all systems have the C.UTF-8 locale installed, so I do not recommend trying to override the locale to the standard C with UTF-8 using setlocale(LC_ALL, "C.UTF-8");. It works on some systems, but not all. AFAIK it does not work in Fedora-based Linux distributions, for example.)

    Because you want to output all Unicode code points, I suggest a different approach: Use one of the Universal Character Set Transformation Formats, i.e. UTF-8, UTF-16 (UCS-2 was superseded by UTF-16 in 1996), or UTF-32 (also known as UCS-4). UTF-8 is the one most often used on the Web -- in particular, on this very web page you're looking at right now -- and is very easy to use.

    For further reading on why you should prefer UTF-8 over "native wide strings", see utf8everywhere.org.

    If you want truly portable code, you can use this header file, utf8.h, to convert UTF-8 to unicode code points (utf8_to_code()) and Unicode code points to UTF-8 (code_to_utf8()):

    #ifndef   UTF8_H
    #define   UTF8_H
    #include <stdlib.h>
    #include <errno.h>
    
    /* Largest number of bytes code_to_utf8() writes for a single code
     * (its longest form is the 6-byte sequence). Callers that want a
     * NUL-terminated string need room for UTF8_MAXLEN + 1 bytes. */
    #define   UTF8_MAXLEN 6
    
    /*
     * Decode one UTF-8 sequence starting at buffer.
     *
     * buffer   Bytes to decode. Continuation bytes are examined one at a
     *          time and decoding stops at the first byte that is not a
     *          continuation byte, so a NUL-terminated buffer is never read
     *          past its terminator.
     * codeptr  Receives the decoded value on success; may be NULL when only
     *          the sequence length is wanted. It is left untouched on failure.
     *
     * Returns the number of bytes in the sequence (1 to 6), or 0 with errno
     * set to:
     *   EINVAL  buffer is NULL,
     *   0       buffer points at a NUL byte (end of string, not an error),
     *   EILSEQ  the lead byte or one of the continuation bytes is invalid.
     *
     * The original 1-6 byte scheme is accepted; overlong forms and
     * surrogate values are not rejected.
     */
    static size_t utf8_to_code(const unsigned char *const buffer, unsigned int *const codeptr)
    {
        unsigned int code;
        size_t       len;
        size_t       i;

        if (!buffer) {
            errno = EINVAL;
            return 0;
        }

        if (buffer[0] == 0U) {
            errno = 0;
            return 0;
        }

        /* Classify the lead byte: payload bits and total sequence length. */
        if (buffer[0] < 128U) {
            code = buffer[0];
            len = 1;
        } else if (buffer[0] < 192U) {
            /* A continuation byte cannot start a sequence. */
            errno = EILSEQ;
            return 0;
        } else if (buffer[0] < 224U) {
            code = buffer[0] - 192U;
            len = 2;
        } else if (buffer[0] < 240U) {
            code = buffer[0] - 224U;
            len = 3;
        } else if (buffer[0] < 248U) {
            code = buffer[0] - 240U;
            len = 4;
        } else if (buffer[0] < 252U) {
            code = buffer[0] - 248U;
            len = 5;
        } else if (buffer[0] < 254U) {
            code = buffer[0] - 252U;
            len = 6;
        } else {
            /* 0xFE and 0xFF never appear in UTF-8. */
            errno = EILSEQ;
            return 0;
        }

        /* Each continuation byte must be 10xxxxxx and contributes six bits.
         * Checking before advancing keeps us from reading past a NUL. */
        for (i = 1; i < len; i++) {
            if (buffer[i] < 128U || buffer[i] >= 192U) {
                errno = EILSEQ;
                return 0;
            }
            code = (code << 6U) | (buffer[i] - 128U);
        }

        if (codeptr)
            *codeptr = code;
        return len;
    }
    
    /*
     * Encode code as a UTF-8 byte sequence in buffer.
     *
     * buffer  Destination; must have room for up to UTF8_MAXLEN (6) bytes.
     *         No terminating NUL is written.
     * code    Value to encode, 0 to 2147483647 inclusive.
     *
     * Returns the number of bytes written (1 to 6), or 0 with errno set to
     * EINVAL when buffer is NULL or code is larger than 2147483647.
     *
     * Surrogate values (0xD800-0xDFFF) are encoded like any other value;
     * callers that need strictly valid Unicode must filter them.
     */
    static size_t code_to_utf8(unsigned char *const buffer, const unsigned int code)
    {
        /* Same NULL handling as utf8_to_code(): report, do not crash. */
        if (!buffer) {
            errno = EINVAL;
            return 0;
        }

        if (code < 128U) {
            buffer[0] = (unsigned char)code;
            return 1;
        }
        if (code < 2048U) {
            buffer[0] = (unsigned char)(0xC0U | (code >> 6U));
            buffer[1] = (unsigned char)(0x80U | (code & 0x3FU));
            return 2;
        }
        if (code < 65536U) {
            buffer[0] = (unsigned char)(0xE0U | (code >> 12U));
            buffer[1] = (unsigned char)(0x80U | ((code >> 6U) & 0x3FU));
            buffer[2] = (unsigned char)(0x80U | (code & 0x3FU));
            return 3;
        }
        if (code < 2097152U) {
            buffer[0] = (unsigned char)(0xF0U | (code >> 18U));
            buffer[1] = (unsigned char)(0x80U | ((code >> 12U) & 0x3FU));
            buffer[2] = (unsigned char)(0x80U | ((code >> 6U) & 0x3FU));
            buffer[3] = (unsigned char)(0x80U | (code & 0x3FU));
            return 4;
        }
        if (code < 67108864U) {
            buffer[0] = (unsigned char)(0xF8U | (code >> 24U));
            buffer[1] = (unsigned char)(0x80U | ((code >> 18U) & 0x3FU));
            buffer[2] = (unsigned char)(0x80U | ((code >> 12U) & 0x3FU));
            buffer[3] = (unsigned char)(0x80U | ((code >> 6U) & 0x3FU));
            buffer[4] = (unsigned char)(0x80U | (code & 0x3FU));
            return 5;
        }
        if (code <= 2147483647U) {
            buffer[0] = (unsigned char)(0xFCU | (code >> 30U));
            buffer[1] = (unsigned char)(0x80U | ((code >> 24U) & 0x3FU));
            buffer[2] = (unsigned char)(0x80U | ((code >> 18U) & 0x3FU));
            buffer[3] = (unsigned char)(0x80U | ((code >> 12U) & 0x3FU));
            buffer[4] = (unsigned char)(0x80U | ((code >> 6U) & 0x3FU));
            buffer[5] = (unsigned char)(0x80U | (code & 0x3FU));
            return 6;
        }

        /* Values above 31 bits have no representation in this scheme. */
        errno = EINVAL;
        return 0;
    }
    
    #endif /* UTF8_H */
    

    It is not fast, but it should be easy to understand, and supports all possible Unicode code points (U+0000 to U+10FFFF, inclusive), on all systems with at least 32-bit unsigned ints. On systems with 16-bit unsigned ints, your compiler may warn about unreachable code, and it'll only support the first 65536 code points (U+0000 to U+FFFF).

    Using the above utf8.h, you can easily write a C program that outputs an HTML page containing the Unicode characters you want (excluding control characters U+0000-U+001F and U+007F-U+009F, inclusive, and invalid code points U+D800-U+DFFF, inclusive). For example, page.c:

    #include <stdlib.h>
    #include <stdio.h>
    #include <errno.h>
    #include "utf8.h"
    
    /*
     * Write an HTML5 page to standard output that lists the code points
     * U+0020 to U+FFFF, one element per code point, skipping the control
     * ranges 0-31 and 127-159 and the surrogate range 0xD800-0xDFFF.
     *
     * The page declares UTF-8 as its character set, and the characters
     * that are special in HTML (space, ", &, <, >) are emitted as entities.
     */
    int main(void)
    {
        unsigned char  ch[UTF8_MAXLEN + 1];
        unsigned int   i;
        const char    *str;
        const char    *row;
        size_t         n, len;

        /* HTML5 DOCTYPE */
        printf("<!DOCTYPE html>\n");
        printf("<html>\n");

        /* Header part. */
        printf(" <head>\n");
        printf("  <title> Unicode character list </title>\n");
        printf("  <meta charset=\"UTF-8\">\n");
        printf("  <style> .odd { background: #eeeeee; } .code { font-family: monospace; } </style>\n");
        printf(" </head>\n");

        /* Body part. */
        printf(" <body>\n");

        n = 0;
        for (i = 0U; i <= 0xFFFFU; i++) {

            /* Skip Unicode control characters. */
            if (i <= 31U || (i >= 127U && i <= 159U))
                continue;

            /* Skip invalid Unicode code points. */
            if (i >= 0xD800U && i <= 0xDFFFU)
                continue;

            len = code_to_utf8(ch, i);
            if (len > 0) {
                ch[len] = '\0';

                /* HTML does not like " & < > and collapses plain spaces. */
                if (i == 32U)
                    str = "&nbsp;";
                else
                if (i == 34U)
                    str = "&quot;";
                else
                if (i == 38U)
                    str = "&amp;";
                else
                if (i == 60U)
                    str = "&lt;";
                else
                if (i == 62U)
                    str = "&gt;";
                else
                    str = (const char *)ch;

                /* Alternate the row class so a stylesheet can stripe rows. */
                row = (n & 1) ? "odd" : "even";

                printf("  <div class=\"%s\" title=\"%u in decimal, &amp;#%u; = %s\">", row, i, i, str);
                printf("<span class=\"code\">U+%04X</span>", i);
                printf(" <span class=\"char\">%s</span>", str);
                printf("</div>\n");

                n++;
            }
        }

        printf(" </body>\n");
        printf("</html>\n");

        return EXIT_SUCCESS;
    }

    Redirect the output to a file, and you can open the file in whatever browser you prefer. If your browser is sane, and does not treat local files any different to those it obtains from a web server, then you should see the correct output.

    (If you see multiple characters per code point after U+00A0, your browser has decided that, because the file is local, it should use a different character set than the one the page explicitly declares. Switch to a sane browser if that happens, or override the character set selection.)

    If you want, you can just print the codes out as UTF-8 text, say using text.c:

    #include <stdlib.h>
    #include <stdio.h>
    #include <errno.h>
    #include "utf8.h"
    
    /*
     * Print each code point from U+0000 to U+FFFF as a line of the form
     * "U+XXXX <char> " on standard output, leaving out the control ranges
     * 0-31 and 127-159 and the surrogate range 0xD800-0xDFFF.
     */
    int main(void)
    {
        unsigned char  utf8[UTF8_MAXLEN + 1];
        unsigned int   code;

        for (code = 0U; code <= 0xFFFFU; code++) {
            /* Control characters would disturb the terminal or editor. */
            const int is_control   = (code <= 31U) || (code >= 127U && code <= 159U);
            /* Surrogates are not valid code points on their own. */
            const int is_surrogate = (code >= 0xD800U) && (code <= 0xDFFFU);

            if (!is_control && !is_surrogate) {
                const size_t used = code_to_utf8(utf8, code);

                if (used > 0) {
                    /* Terminate the bytes so they can be printed with %s. */
                    utf8[used] = '\0';
                    printf("U+%04X %s \n", code, utf8);
                }
            }
        }

        return EXIT_SUCCESS;
    }
    

    but then you must either be sure your terminal or terminal emulator supports UTF-8 and uses an UTF-8 locale, or you redirect the output to a text file and open that file in an editor which either assumes the file uses UTF-8 or lets you explicitly select the UTF-8 character set.

    Note that there is a space before and after each character. Because some of the code points are combining characters, they may not show up at all unless they can be combined with another character, and most (all?) combine with space just fine.

    If you use Windows, then you must conform to Microsoft stupidity, and add a special "byte order mark" -- printf("\xEF\xBB\xBF"); -- to the beginning of the output, so that its utilities like Notepad recognize the file as UTF-8. It's a Windows-only wart; treat it as such.

    Questions?

提交回复
热议问题