How to get the UNICODE code from each character of a UTF-8 string?

前端 未结 2 886
夕颜
夕颜 2021-01-27 10:58

With C++11, given a UTF-8 encoded std::string, how can I get the Unicode code point of each character of the text as a uint32_t?

Something like:

相关标签:
2条回答
  • 2021-01-27 11:34

    Using <utf8.h> from http://utfcpp.sourceforge.net/ you could code:

     // Sanitize `str` in place: the bytes are run through utfcpp's
     // utf8::replace_invalid, which copies valid sequences and substitutes
     // the library's default replacement marker for invalid ones.
     static inline void fix_utf8_string(std::string& str)
     {
       std::string sanitized;
       // Size hint only; replacements may make the result longer.
       sanitized.reserve(str.size());
       utf8::replace_invalid(str.begin(), str.end(),
                             std::back_inserter(sanitized));
       // Adopt the sanitized buffer by swapping instead of copying it back.
       str.swap(sanitized);
     }
    
     // Report whether the NUL-terminated C string `s` passes utf8::is_valid.
     // A null pointer is reported as not valid.
     static inline bool valid_utf8_cstr(const char*s)
     {
       if (s == nullptr)
         return false;
       // The range checked excludes the terminating NUL.
       return utf8::is_valid(s, s + strlen(s));
     }
    
     // Length of the NUL-terminated C string `s` as measured by
     // utf8::distance over its bytes (terminator excluded).
     // A null pointer yields 0.
     static inline size_t
     utf8_length(const char*s)
     {
       if (s == nullptr)
         return 0;
       const char* const stop = s + strlen(s);
       return utf8::distance(s, stop);
     }
    
    
     // Walk the NUL-terminated UTF-8 string `s`, decoding one code point at
     // a time with utf8::next and passing it to `f` together with its
     // zero-based index.  The walk ends at the terminating NUL or as soon
     // as `f` returns true.  Returns the number of code points for which
     // `f` returned false (the one that stopped the walk is not counted);
     // a null `s` yields 0.
     static inline size_t
     utf8_foreach_if(const char*s, 
                     std::function<bool(uint32_t,size_t)>f)
     {
       if (s == nullptr)
         return 0;
       size_t visited = 0;
       for (const char* cursor = s; *cursor != 0; ++visited)
         {
           // Bound the decode window to at most four bytes, stopping early
           // at the NUL so no byte beyond the terminator is ever read.
           const char* limit = cursor + 1;
           while (limit != cursor + 4 && *limit != 0)
             ++limit;
           const uint32_t codepoint = utf8::next(cursor, limit);
           if (f(codepoint, visited))
             break;
         }
       return visited;
     }
    
     // std::string overload: decode every code point of `s` with utf8::next
     // and pass it to `f` together with its zero-based index.  The walk ends
     // at the end of the string or as soon as `f` returns true.  Returns the
     // number of code points for which `f` returned false (the one that
     // stopped the walk is not counted); an empty string yields 0.
     static inline size_t
     utf8_foreach_if(const std::string& s, 
                     std::function<bool(uint32_t,size_t)>f)
     {
       size_t ix = 0;
       const char* pc = s.data();
       const char* const epc = pc + s.size();
       // Iterate up to the real end of the string rather than the first NUL
       // byte: a std::string may legitimately contain embedded '\0'
       // characters (U+0000), and stopping there would drop what follows.
       while (pc != epc)
         {
           const uint32_t c = utf8::next(pc, epc);
           if (f(c, ix))
             break;
           ++ix;
         }
       return ix;
     }
    

    This is extracted from some code licensed under GPLv3 that I will release in a few weeks or months.

    0 讨论(0)
  • 2021-01-27 11:49

    You can simply convert the string into a UTF-32 encoded one, using the provided conversion facet and std::wstring_convert from <locale>:

    #include <codecvt>
    #include <locale>
    #include <string>
    
    // Decode the UTF-8 bytes of `utf8str` into a UTF-32 string with
    // std::wstring_convert, then visit each resulting char32_t in order.
    // from_bytes reports a failed conversion by throwing std::range_error.
    void foo(std::string const & utf8str)
    {
         using Utf8Decoder =
             std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;

         Utf8Decoder decoder;
         const std::u32string codepoints = decoder.from_bytes(utf8str);

         for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
         {
              const char32_t u = *it;
              (void)u;
              /* ... */
         }
    }
    
    0 讨论(0)
提交回复
热议问题