How can I convert wstring to u16string?

后端 未结 2 512
时光取名叫无心
时光取名叫无心 2021-01-01 01:41

I want to convert wstring to u16string in C++.

I can convert wstring to string, and the reverse, but I don't know how to convert to u16string.

2条回答
  •  借酒劲吻你
    2021-01-01 02:25

    Update

    I had thought the standard version did not work, but in fact this was simply due to bugs in the Visual C++ and libstdc++ 3.4.21 runtime libraries. It does work with clang++ -std=c++14 -stdlib=libc++. Here is a version that tests whether the standard method works on your compiler:

    #include <codecvt>
    #include <cstddef>
    #include <cstdlib>
    #include <cstring>
    #include <cwchar>
    #include <iostream>
    #include <locale>
    #include <vector>
    
    using std::cout;
    using std::endl;
    using std::exit;
    using std::memcmp;
    using std::size_t;
    
    using std::wcout;
    
    #if _WIN32 || _WIN64
    // Windows needs a little non-standard magic for this to work.
    #include <io.h>
    #include <fcntl.h>
    #include <locale.h>
    #endif
    
    using std::size_t;
    
    // Prepares the locale machinery so that wide-character output through
    // wcout can work.  Takes no arguments and returns nothing.
    void init_locale(void)
    {
    #if _WIN32 || _WIN64
      // Windows-only, non-standard setup: select the ".1200" locale, then put
      // stdout into _O_U16TEXT mode.
      setlocale( LC_ALL, ".1200" );
      _setmode( _fileno(stdout), _O_U16TEXT );
    #else
      // The locale name is left empty here; the right explicit name differs
      // between operating systems (for example "en_US.utf8").
      const std::locale chosen("");
      std::locale::global(chosen);
      // Give wcout a copy of the global locale that was just installed.
      std::wcout.imbue(std::locale());
    #endif
    }
    
    // Self-test: checks whether std::codecvt_utf16 on this toolchain turns a
    // wide-character string and a UTF-32 string into the expected UTF-16 bytes,
    // and reports the outcome of each conversion on wcout.
    //
    // Always returns EXIT_SUCCESS; the verdicts are only printed.
    // NOTE: std::codecvt_utf16 is deprecated since C++17, so expect deprecation
    // warnings from newer compilers.
    int main(void)
    {
      constexpr char16_t msg_utf16[] = u"¡Hola, mundo! \U0001F600";
      constexpr wchar_t msg_w[] = L"¡Hola, mundo! \U0001F600";
      constexpr char32_t msg_utf32[] = U"¡Hola, mundo! \U0001F600";
      constexpr char msg_utf8[] = u8"¡Hola, mundo! \U0001F600";

      init_locale();

      // Both converters below are asked for little-endian output.  Build the
      // reference bytes explicitly in that order, code unit by code unit, so the
      // comparison does not depend on the byte order of the host.  The
      // terminating NUL of each literal is part of every conversion.
      std::vector<char> expected;
      expected.reserve(sizeof(msg_utf16));
      for ( const char16_t unit : msg_utf16 ) {
        expected.push_back(static_cast<char>(unit & 0xFFU));
        expected.push_back(static_cast<char>((unit >> 8) & 0xFFU));
      }
      const std::size_t max_len = expected.size();

      const std::codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> converter_w;
      std::vector<char> out(max_len);
      // The conversion state must start out zero-initialised; an indeterminate
      // mbstate_t makes the result of out() unpredictable.
      std::mbstate_t state{};
      const wchar_t* from_w = nullptr;
      char* to_next = nullptr;

      converter_w.out( state, msg_w, msg_w + sizeof(msg_w)/sizeof(wchar_t), from_w,
                       out.data(), out.data() + out.size(), to_next );

      // Some runtime libraries were seen to emit UTF-8 here, so test for that
      // first (only when the buffer is large enough to hold the UTF-8 form).
      if ( out.size() >= sizeof(msg_utf8) &&
           std::memcmp( msg_utf8, out.data(), sizeof(msg_utf8) ) == 0 ) {
        std::wcout << L"std::codecvt_utf16<wchar_t> converts to UTF-8, not UTF-16!" << std::endl;
      } else if ( std::memcmp( expected.data(), out.data(), max_len ) != 0 ) {
        std::wcout << L"std::codecvt_utf16<wchar_t> conversion not equal!" << std::endl;
      } else {
        std::wcout << L"std::codecvt_utf16<wchar_t> conversion is correct." << std::endl;
      }

      // Start the second conversion from a zeroed buffer and a fresh state.
      out.assign(max_len, '\0');
      state = std::mbstate_t{};

      const std::codecvt_utf16<char32_t, 0x10FFFF, std::little_endian> converter_u32;
      const char32_t* from_u32 = nullptr;
      converter_u32.out( state, msg_utf32, msg_utf32 + sizeof(msg_utf32)/sizeof(char32_t), from_u32,
                         out.data(), out.data() + out.size(), to_next );

      if ( std::memcmp( expected.data(), out.data(), max_len ) != 0 ) {
        std::wcout << L"std::codecvt_utf16<char32_t> conversion not equal!" << std::endl;
      } else {
        std::wcout << L"std::codecvt_utf16<char32_t> conversion is correct." << std::endl;
      }

      std::wcout << msg_w << std::endl;
      return EXIT_SUCCESS;
    }
    

    Previous

    A bit late to the game, but here’s a version that additionally checks whether wchar_t is 32 bits wide (as it is on Linux), and if so, performs surrogate-pair conversion. I recommend saving this source as UTF-8 with a BOM. Here is a link to it on ideone.

    #include <cstddef>
    #include <cstdlib>
    #include <cwchar>
    #include <iomanip>
    #include <iostream>
    #include <locale>
    #include <string>
    
    #if _WIN32 || _WIN64
    // Windows needs a little non-standard magic for this to work.
    #include <io.h>
    #include <fcntl.h>
    #include <locale.h>
    #endif
    
    using std::size_t;
    
    /* Sets up the runtime locale so that std::wcout can be used for
     * wide-character output.  No parameters, no return value.
     */
    void init_locale(void)
    {
    #if _WIN32 || _WIN64
      // Non-standard Windows path: switch to the ".1200" locale and set stdout
      // to _O_U16TEXT mode.
      constexpr char windows_locale[] = ".1200";
      setlocale( LC_ALL, windows_locale );
      _setmode( _fileno(stdout), _O_U16TEXT );
    #else
      // An empty locale name is used; depending on the OS an explicit name such
      // as "en_US.utf8" may be the correct one instead.
      const std::locale selected("");
      std::locale::global(selected);
      // wcout receives a copy of the now-current global locale.
      std::wcout.imbue(std::locale());
    #endif
    }
    
    /* Creates a UTF-16 string from a wide-character string.
     *
     * ws: the wide-character input.
     * Returns: a newly built std::u16string.
     *
     * When wchar_t has the same size as char16_t, the input is assumed to be
     * UTF-16 already (or at least to convert to it code unit by code unit), so
     * it is copied unchanged and nothing is validated.
     *
     * Otherwise each wide character is treated as one code point:
     *   - values above U+10FFFF (which includes negative values of a signed
     *     wchar_t) and every surrogate code point U+D800..U+DFFF are replaced
     *     by the sentinel U+FFFD;
     *   - code points in the Basic Multilingual Plane are copied as one unit;
     *   - all other code points are written as a surrogate pair.
     * Noncharacters are left intact.
     */
    std::u16string make_u16string( const std::wstring& ws )
    {
      if ( sizeof(wchar_t) == sizeof(char16_t) ) {
        return std::u16string( ws.begin(), ws.end() );
      }

      constexpr char16_t sentinel = u'\uFFFD';
      constexpr char32_t max_code_point = 0x10FFFFU;
      constexpr char32_t first_supplementary = 0x10000U;

      std::u16string result;
      // Worst case: every input character needs a surrogate pair.
      result.reserve( 2 * ws.length() );

      for ( const wchar_t wc : ws ) {
        // Converting to an unsigned type maps a negative wchar_t to a value far
        // above max_code_point, so a single range test rejects it.
        const char32_t chr = static_cast<char32_t>(wc);
        const bool is_surrogate = ( chr >= 0xD800U && chr <= 0xDFFFU );

        if ( chr > max_code_point || is_surrogate ) {
          // Invalid code point: substitute the replacement character.
          result.push_back(sentinel);
        } else if ( chr < first_supplementary ) {
          // In the BMP: one UTF-16 code unit.
          result.push_back(static_cast<char16_t>(chr));
        } else {
          // Supplementary plane: split the 20-bit offset into two 10-bit halves.
          const char32_t offset = chr - first_supplementary;
          result.push_back(static_cast<char16_t>( 0xD800U + (offset / 0x400U) ));
          result.push_back(static_cast<char16_t>( 0xDC00U + (offset % 0x400U) ));
        }
      }

      /* The returned string is shrunk to fit, which might not be the right
       * choice if the caller intends to append more to it.
       */
      result.shrink_to_fit();

      return result;
    }
    
    // Converts a fixed wide-character test string with make_u16string and
    // compares it, code unit by code unit, against the same text written as a
    // UTF-16 literal.  Prints the first difference and returns EXIT_FAILURE if
    // one is found; otherwise prints the wide test string and returns
    // EXIT_SUCCESS.
    int main(void)
    {
      static const std::wstring wtest(L"☪☮∈✡℩☯✝ \U0001F644");
      static const std::u16string u16test(u"☪☮∈✡℩☯✝ \U0001F644");
      const std::u16string converted = make_u16string(wtest);

      init_locale();

      std::wcout << L"sizeof(wchar_t) == " << sizeof(wchar_t) << L".\n";

      // The index runs up to and including length(), so the terminating NUL of
      // the reference string takes part in the comparison as well.
      const std::size_t last = u16test.length();
      std::size_t pos = 0;
      while ( pos <= last ) {
        const unsigned expected_unit = static_cast<unsigned>(u16test[pos]);
        const unsigned actual_unit = static_cast<unsigned>(converted[pos]);

        if ( expected_unit != actual_unit ) {
          // Report both code units and the position, then give up.
          std::wcout << std::hex << std::showbase
                     << std::right << std::setfill(L'0');
          std::wcout << std::setw(4) << actual_unit << L" ≠ "
                     << std::setw(4) << expected_unit << L" at "
                     << pos << L'.' << std::endl;
          return EXIT_FAILURE;
        }

        ++pos;
      }

      std::wcout << wtest << std::endl;

      return EXIT_SUCCESS;
    }
    

    Footnote

    Since someone asked: The reason I suggest UTF-8 with BOM is that some compilers, including MSVC 2015, will assume a source file is encoded according to the current code page unless there is a BOM or you specify an encoding on the command line. No encoding works on all toolchains, unfortunately, but every tool I’ve used that’s modern enough to support C++14 also understands the BOM.

提交回复
热议问题