How do I HTML-/ URL-Encode a std::wstring containing Unicode characters?

后端 未结 4 1715
猫巷女王i
猫巷女王i 2021-01-06 02:36

I have one more question. Suppose I had a std::wstring that looks like this:

ドイツ語で検索していてこちらのサイトにたどり着きました。

How could I get it URL-encoded or HTML-encoded?

相关标签:
4条回答
  • 2021-01-06 02:50

    Here's a version that converts from UTF-16 (wchar) to hex-encoded UTF-8 using the Win32-specific WideCharToMultiByte() function.

    #include <string>
    #include <iostream>
    #include <stdio.h>
    #include <windows.h>
    
    
    // Converts a UTF-16 wide string to UTF-8 and returns every resulting byte
    // as a percent-escape ("%XX", upper-case hex).
    //
    // input   - wide string to encode; it may contain embedded NUL characters,
    //           which are encoded as "%00" rather than ending the conversion.
    // returns - the escaped string, or an empty string when the input is empty,
    //           too long for the Win32 API, or the conversion fails.
    std::string wstring_to_utf8_hex(const std::wstring &input)
    {
      std::string output;
      if (input.empty()) {
        return output;
      }

      // WideCharToMultiByte takes an int length; refuse inputs that do not fit
      // instead of silently passing a wrapped value.
      const int cchInput = static_cast<int>(input.size());
      if (cchInput <= 0 || static_cast<std::wstring::size_type>(cchInput) != input.size()) {
        return output;
      }

      // First call: ask how many UTF-8 bytes are needed. Passing the explicit
      // length (not -1) keeps embedded NULs and excludes a trailing terminator.
      const int cbNeeded = WideCharToMultiByte(CP_UTF8, 0, input.data(), cchInput, NULL, 0, NULL, NULL);
      if (cbNeeded <= 0) {
        return output;
      }

      // std::string owns the scratch buffer, so nothing leaks if an append throws.
      std::string utf8(static_cast<std::string::size_type>(cbNeeded), '\0');
      const int cbWritten = WideCharToMultiByte(CP_UTF8, 0, input.data(), cchInput, &utf8[0], cbNeeded, NULL, NULL);
      if (cbWritten <= 0) {
        return output;
      }

      static const char kHexDigits[] = "0123456789ABCDEF";
      output.reserve(static_cast<std::string::size_type>(cbWritten) * 3);
      for (int i = 0; i < cbWritten; ++i) {
        const unsigned char byte = static_cast<unsigned char>(utf8[static_cast<std::string::size_type>(i)]);
        output.push_back('%');
        output.push_back(kHexDigits[byte >> 4]);
        output.push_back(kHexDigits[byte & 0x0F]);
      }
      return output;
    }
    
    
    // Demo entry point: percent-encodes a sample Japanese sentence and prints
    // the result on stdout.
    int main(int, char*[])
    {
      const std::wstring sample = L"ドイツ語で検索していてこちらのサイトにたどり着きました。";
      const std::string encoded = wstring_to_utf8_hex(sample);
      std::cout << "result=" << encoded << std::endl;
      return 0;
    }
    

    To go the other way, you'll need to use some parsing to decode the hex values into a UTF-8 buffer, and then call the complementary MultiByteToWideChar() to get it back into a wchar array.

    #include <string>
    #include <iostream>
    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <windows.h>
    
    // Returns the value (0-15) of a hexadecimal digit, or -1 if c is not one.
    // Written with plain range checks so it is safe for any char value,
    // including bytes >= 0x80 on platforms where char is signed.
    static int hex_digit_value(char c)
    {
      if (c >= '0' && c <= '9') return c - '0';
      if (c >= 'a' && c <= 'f') return c - 'a' + 10;
      if (c >= 'A' && c <= 'F') return c - 'A' + 10;
      return -1;
    }

    // Decodes escape sequences in input into raw bytes.
    //
    // Recognised forms:
    //   "%XX"    - two hex digits (either case) become one byte;
    //   "%#NNN"  - a run of decimal digits, optionally followed by ';',
    //              becomes one byte holding the low 8 bits of the number.
    // Everything else, including a lone or malformed '%', is copied through
    // unchanged. Embedded NUL characters in input are preserved.
    //
    // input   - the escaped text.
    // returns - the decoded byte string.
    std::string unhexlify(const std::string &input)
    {
      const std::string::size_type len = input.size();
      std::string output;
      output.reserve(len);

      std::string::size_type i = 0;
      while (i < len) {
        // Both escape forms need at least two characters after the '%'.
        if (input[i] == '%' && i + 2 < len) {
          const int hi = hex_digit_value(input[i + 1]);
          const int lo = hex_digit_value(input[i + 2]);
          if (hi >= 0 && lo >= 0) {
            output.push_back(static_cast<char>(static_cast<unsigned char>(hi * 16 + lo)));
            i += 3;
            continue;
          }
          if (input[i + 1] == '#' && input[i + 2] >= '0' && input[i + 2] <= '9') {
            // Only the low 8 bits end up in the output byte, so reduce while
            // accumulating; this cannot overflow however long the digit run is.
            unsigned int value = 0;
            i += 2;
            while (i < len && input[i] >= '0' && input[i] <= '9') {
              value = (value * 10u + static_cast<unsigned int>(input[i] - '0')) & 0xFFu;
              ++i;
            }
            if (i < len && input[i] == ';') {
              ++i;
            }
            output.push_back(static_cast<char>(static_cast<unsigned char>(value)));
            continue;
          }
        }
        output.push_back(input[i]);
        ++i;
      }
      return output;
    }
    
    
    // Decodes a percent-escaped UTF-8 string (see unhexlify) and converts the
    // resulting bytes to a UTF-16 wide string.
    //
    // input   - escaped text such as "%E3%83%89".
    // returns - the decoded wide string, including any NUL characters produced
    //           by escapes such as "%00"; an empty string when the decoded input
    //           is empty, too long for the Win32 API, or the conversion fails.
    std::wstring utf8_hex_to_wstring(const std::string &input)
    {
      const std::string utf8 = unhexlify(input);
      if (utf8.empty()) {
        return std::wstring();
      }

      // MultiByteToWideChar takes an int length; refuse inputs that do not fit.
      const int cbInput = static_cast<int>(utf8.size());
      if (cbInput <= 0 || static_cast<std::string::size_type>(cbInput) != utf8.size()) {
        return std::wstring();
      }

      // First call: ask how many wide characters are needed. The explicit
      // length (not -1) keeps decoded NUL bytes from ending the conversion.
      const int cchNeeded = MultiByteToWideChar(CP_UTF8, 0, utf8.data(), cbInput, NULL, 0);
      if (cchNeeded <= 0) {
        return std::wstring();
      }

      // Convert straight into the result so no manually managed buffer is needed.
      std::wstring output(static_cast<std::wstring::size_type>(cchNeeded), L'\0');
      const int cchWritten = MultiByteToWideChar(CP_UTF8, 0, utf8.data(), cbInput, &output[0], cchNeeded);
      if (cchWritten <= 0) {
        return std::wstring();
      }
      output.resize(static_cast<std::wstring::size_type>(cchWritten));
      return output;
    }
    
    // Demo entry point: decodes a percent-escaped UTF-8 string and reports on
    // stdout whether it matches the expected Japanese sentence.
    int main(int, char*[])
    {
      const std::wstring expected = L"ドイツ語で検索していてこちらのサイトにたどり着きました。";
      const std::string escaped = "%E3%83%89%E3%82%A4%E3%83%84%E8%AA%9E%E3%81%A7%E6%A4%9C%E7%B4%A2%E3%81%97%E3%81%A6%E3%81%84%E3%81%A6%E3%81%93%E3%81%A1%E3%82%89%E3%81%AE%E3%82%B5%E3%82%A4%E3%83%88%E3%81%AB%E3%81%9F%E3%81%A9%E3%82%8A%E7%9D%80%E3%81%8D%E3%81%BE%E3%81%97%E3%81%9F%E3%80%82";
      const std::wstring decoded = utf8_hex_to_wstring(escaped);
      const char *verdict = (decoded == expected) ? "yes" : "no";
      std::cout << "match?=" << verdict << std::endl;
      return 0;
    }
    
    0 讨论(0)
  • 2021-01-06 02:59

    Before you can convert a char to a URL escape sequence, you have to convert your wstring* into the ISO-Latin charset, which is what is used for URLs. ICU could be a good place to start: you can pass your wstring to it and get an ISO-Latin sequence. Then simply iterate through the resulting chars and convert each one to an escape sequence:

    // Builds a string of '%'-prefixed hex values, one per element of myWString.
    // setw(2) is only a minimum width: any element above 0xFF is written with
    // more than two hex digits.
    std::stringstream URL;
    URL << std::hex << std::setfill('0');
    for (const auto ch : myWString)
       URL << '%' << std::setw(2) << static_cast<int>(ch);
    

    Take a look here for more info on how to format the string.

    * I'm assuming that your wstring is UTF-16, which is usually the case, although you didn't specify.

    This might help also.

    0 讨论(0)
  • 2021-01-06 03:01

    Here is an example which shows two methods, one based on the Qt library and one based on the ICU library. Both should be fairly platform-independent:

    #include <iostream>
    #include <sstream>
    #include <iomanip>
    #include <stdexcept>
    
    #include <boost/scoped_array.hpp>
    
    #include <QtCore/QString>
    #include <QtCore/QUrl>
    #include <QtCore/QVector>
    
    #include <unicode/utypes.h>
    #include <unicode/ustring.h>
    #include <unicode/unistr.h>
    #include <unicode/schriter.h>
    
    void encodeQt() {
      const QString str = QString::fromWCharArray(L"ドイツ語で検索していてこちらのサイトにたどり着きました。");
      const QUrl url = str;
      std::cout << "URL encoded: " << url.toEncoded().constData() << std::endl;
      typedef QVector<uint> CodePointVector;
      const CodePointVector codePoints = str.toUcs4();
      std::stringstream htmlEncoded;
      for (CodePointVector::const_iterator it = codePoints.constBegin(); it != codePoints.constEnd(); ++it) {
        htmlEncoded << "&#" << *it << ';';
      }
      std::cout << "HTML encoded: " << htmlEncoded.str() << std::endl;
    }
    
    void encodeICU() {
      const std::wstring cppString = L"ドイツ語で検索していてこちらのサイトにたどり着きました。";
      int bufSize = cppString.length() * 2;
      boost::scoped_array<UChar> strBuffer(new UChar[bufSize]);
      int size = 0;
      UErrorCode error = U_ZERO_ERROR;
      u_strFromWCS(strBuffer.get(), bufSize, &size, cppString.data(), cppString.length(), &error);
      if (error) return;
      const UnicodeString str(strBuffer.get(), size);
      bufSize = str.length() * 4;
      boost::scoped_array<char> buffer(new char[bufSize]);
      u_strToUTF8(buffer.get(), bufSize, &size, str.getBuffer(), str.length(), &error);
      if (error) return;
      const std::string urlUtf8(buffer.get(), size);
      std::stringstream urlEncoded;
      urlEncoded << std::hex << std::setfill('0');
      for (std::string::const_iterator it = urlUtf8.begin(); it != urlUtf8.end(); ++it) {
        urlEncoded << '%' << std::setw(2) << static_cast<unsigned int>(static_cast<unsigned char>(*it));
      }
      std::cout << "URL encoded: " << urlEncoded.str() << std::endl;
      std::stringstream htmlEncoded;
      StringCharacterIterator it = str;
      while (it.hasNext()) {
        const UChar32 pt = it.next32PostInc();
        htmlEncoded << "&#" << pt << ';';
      }
      std::cout << "HTML encoded: " << htmlEncoded.str() << std::endl;
    }
    
    
    // Demo entry point: runs the Qt-based encoder, then the ICU-based one.
    int main() {
      encodeQt();
      encodeICU();
      return 0;
    }
    
    0 讨论(0)
  • 2021-01-06 03:03

    First, convert to UTF-8. Then normal URL/HTML encoding will do the right thing.

    0 讨论(0)
提交回复
热议问题