I have yet another question. If I had a std::wstring looking like this:
ドイツ語で検索していてこちらのサイトにたどり着きました。
How could I possibly get it
Here's a version that converts from UTF-16 (wchar) to hex-encoded UTF-8 using the Win32-specific WideCharToMultiByte() function.
#include <string>
#include <iostream>
#include <stdio.h>
#include <windows.h>
// Converts a UTF-16 wide string to UTF-8 and returns every UTF-8 byte
// percent-encoded as "%XX" (two uppercase hex digits).
//
// input:   the wide string to encode; it is passed to WideCharToMultiByte()
//          with its explicit length, so embedded NUL characters are encoded
//          as "%00" instead of cutting the result short.
// returns: the encoded string, or an empty string when the input is empty,
//          is too long for the int-based Win32 API, or the conversion fails.
std::string wstring_to_utf8_hex(const std::wstring &input)
{
    std::string output;

    // WideCharToMultiByte() takes an int length and rejects a length of 0,
    // so bail out early for empty input and for sizes that do not survive
    // the round trip through int.
    const int cchInput = static_cast<int>(input.size());
    if (cchInput <= 0 || static_cast<std::wstring::size_type>(cchInput) != input.size()) {
        return output;
    }

    // First call: ask how many UTF-8 bytes the conversion will produce.
    const int cbNeeded = WideCharToMultiByte(CP_UTF8, 0, input.c_str(), cchInput, NULL, 0, NULL, NULL);
    if (cbNeeded <= 0) {
        return output;
    }

    // A std::string owns the scratch buffer so nothing leaks if an
    // allocation below throws.
    std::string utf8(static_cast<std::string::size_type>(cbNeeded), '\0');
    const int cbWritten = WideCharToMultiByte(CP_UTF8, 0, input.c_str(), cchInput, &utf8[0], cbNeeded, NULL, NULL);
    if (cbWritten <= 0) {
        return output;
    }
    utf8.resize(static_cast<std::string::size_type>(cbWritten));

    static const char kHexDigits[] = "0123456789ABCDEF";
    output.reserve(utf8.size() * 3);
    for (std::string::const_iterator it = utf8.begin(); it != utf8.end(); ++it) {
        // Go through unsigned char so bytes >= 0x80 index the table correctly.
        const unsigned char byte = static_cast<unsigned char>(*it);
        output.push_back('%');
        output.push_back(kHexDigits[byte >> 4]);
        output.push_back(kHexDigits[byte & 0x0F]);
    }
    return output;
}
// Demo entry point: percent-encodes a sample Japanese sentence and prints it.
int main(int, char*[])
{
    const std::wstring sample = L"ドイツ語で検索していてこちらのサイトにたどり着きました。";
    const std::string encoded = wstring_to_utf8_hex(sample);

    std::cout << "result=" << encoded << std::endl;
    return 0;
}
To go the other way, you'll need to use some parsing to decode the hex values into a UTF-8 buffer, and then call the complementary MultiByteToWideChar() to get it back into a wchar array.
#include <string>
#include <iostream>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
// Returns the numeric value (0-15) of a character that isxdigit() accepted.
static int unhexlify_digit_value(unsigned char c)
{
    return isdigit(c) ? c - '0' : toupper(c) - 'A' + 10;
}

// Decodes escape sequences in `input` into raw bytes.
//
// Recognised forms:
//   "%XX"     two hex digits (either case) -> the byte with that value
//   "%#NNN;"  decimal digits, optional ';' -> the low 8 bits of that number
// Anything else, including a '%' that does not start a valid sequence, is
// copied through unchanged.
//
// input:   the text to decode; the whole string is processed, including any
//          embedded NUL characters.
// returns: the decoded byte string.
std::string unhexlify(const std::string &input)
{
    std::string output;
    output.reserve(input.size());

    // c_str() guarantees a '\0' at end[0], so looking one character ahead of
    // any p < end is always in bounds.
    const char *p = input.c_str();
    const char *const end = p + input.size();

    while (p != end) {
        if (p[0] == '%') {
            // <ctype.h> functions require values representable as unsigned
            // char; passing a negative plain char is undefined behaviour.
            const unsigned char first = static_cast<unsigned char>(p[1]);
            if (isxdigit(first)) {
                // first is not '\0', so p + 1 < end and p[2] is readable.
                const unsigned char second = static_cast<unsigned char>(p[2]);
                if (isxdigit(second)) {
                    const int value = unhexlify_digit_value(first) * 16 +
                                      unhexlify_digit_value(second);
                    output.push_back(static_cast<char>(value));
                    p += 3;
                    continue;
                }
            } else if (first == '#' && isdigit(static_cast<unsigned char>(p[2]))) {
                p += 2;
                // Accumulate in unsigned arithmetic: wrap-around is well
                // defined and only the low 8 bits are kept anyway.
                unsigned int value = 0;
                while (p != end && isdigit(static_cast<unsigned char>(*p))) {
                    value = value * 10u + static_cast<unsigned int>(*p - '0');
                    ++p;
                }
                output.push_back(static_cast<char>(static_cast<unsigned char>(value & 0xFFu)));
                if (p != end && *p == ';') {
                    ++p;
                }
                continue;
            }
        }
        // Not part of a valid escape sequence: copy the character verbatim.
        output.push_back(*p++);
    }
    return output;
}
// Decodes a percent-encoded UTF-8 string (see unhexlify()) and converts the
// resulting UTF-8 bytes to a UTF-16 wide string.
//
// input:   the percent-encoded text.
// returns: the decoded wide string, or an empty string when there is nothing
//          to decode, the decoded data is too long for the int-based Win32
//          API, or the conversion fails. The explicit byte count is passed to
//          MultiByteToWideChar(), so a decoded NUL (e.g. from "%00") is kept
//          instead of cutting the result short.
std::wstring utf8_hex_to_wstring(const std::string &input)
{
    std::wstring output;
    const std::string utf8 = unhexlify(input);

    // MultiByteToWideChar() takes an int length and rejects a length of 0,
    // so bail out early for empty data and for sizes that do not survive
    // the round trip through int.
    const int cbInput = static_cast<int>(utf8.size());
    if (cbInput <= 0 || static_cast<std::string::size_type>(cbInput) != utf8.size()) {
        return output;
    }

    // First call: ask how many wide characters the conversion will produce.
    const int cchNeeded = MultiByteToWideChar(CP_UTF8, 0, utf8.c_str(), cbInput, NULL, 0);
    if (cchNeeded <= 0) {
        return output;
    }

    // A std::wstring owns the scratch buffer so nothing leaks if an
    // allocation throws.
    std::wstring widebuf(static_cast<std::wstring::size_type>(cchNeeded), L'\0');
    const int cchWritten = MultiByteToWideChar(CP_UTF8, 0, utf8.c_str(), cbInput, &widebuf[0], cchNeeded);
    if (cchWritten > 0) {
        widebuf.resize(static_cast<std::wstring::size_type>(cchWritten));
        output.swap(widebuf);
    }
    return output;
}
// Demo entry point: decodes a percent-encoded sample and reports whether it
// matches the expected Japanese sentence.
int main(int, char*[])
{
    const std::wstring expected = L"ドイツ語で検索していてこちらのサイトにたどり着きました。";
    const std::string encoded = "%E3%83%89%E3%82%A4%E3%83%84%E8%AA%9E%E3%81%A7%E6%A4%9C%E7%B4%A2%E3%81%97%E3%81%A6%E3%81%84%E3%81%A6%E3%81%93%E3%81%A1%E3%82%89%E3%81%AE%E3%82%B5%E3%82%A4%E3%83%88%E3%81%AB%E3%81%9F%E3%81%A9%E3%82%8A%E7%9D%80%E3%81%8D%E3%81%BE%E3%81%97%E3%81%9F%E3%80%82";

    const std::wstring decoded = utf8_hex_to_wstring(encoded);
    const char *verdict = (decoded == expected) ? "yes" : "no";

    std::cout << "match?=" << verdict << std::endl;
    return 0;
}
You see, before you can convert a char to a URL escape sequence, you have to convert your wstring* into the ISO-Latin charset, which is what is used for URLs. ICU could be a good place to start: you can pass your wstring to it and get an ISO-Latin sequence back. Then, simply iterate through the resulting chars and convert them to the escape sequence:
// Builds the escaped text in URL: every element of myWString is written as
// '%' followed by its value in hex, zero-padded to at least two digits.
std::stringstream URL;
URL << std::hex;
for (const auto unit : myWString) {
    // NOTE(review): setw(2) is only a minimum width, so an element whose
    // value exceeds 0xFF prints more than two digits -- confirm the string
    // has already been converted to single-byte units before this loop.
    URL << '%' << std::setfill('0') << std::setw(2) << static_cast<int>(unit);
}
Take a look here for more info on how to format the string.
* I'm assuming that your wstring is UTF-16, which is usually the case, although you didn't specify.
This might help also.
Here is an example which shows two methods, one based on the Qt library and one based on the ICU library. Both should be fairly platform-independent:
#include <iostream>
#include <sstream>
#include <iomanip>
#include <stdexcept>
#include <boost/scoped_array.hpp>
#include <QtCore/QString>
#include <QtCore/QUrl>
#include <QtCore/QVector>
#include <unicode/utypes.h>
#include <unicode/ustring.h>
#include <unicode/unistr.h>
#include <unicode/schriter.h>
void encodeQt() {
const QString str = QString::fromWCharArray(L"ドイツ語で検索していてこちらのサイトにたどり着きました。");
const QUrl url = str;
std::cout << "URL encoded: " << url.toEncoded().constData() << std::endl;
typedef QVector<uint> CodePointVector;
const CodePointVector codePoints = str.toUcs4();
std::stringstream htmlEncoded;
for (CodePointVector::const_iterator it = codePoints.constBegin(); it != codePoints.constEnd(); ++it) {
htmlEncoded << "&#" << *it << ';';
}
std::cout << "HTML encoded: " << htmlEncoded.str() << std::endl;
}
void encodeICU() {
const std::wstring cppString = L"ドイツ語で検索していてこちらのサイトにたどり着きました。";
int bufSize = cppString.length() * 2;
boost::scoped_array<UChar> strBuffer(new UChar[bufSize]);
int size = 0;
UErrorCode error = U_ZERO_ERROR;
u_strFromWCS(strBuffer.get(), bufSize, &size, cppString.data(), cppString.length(), &error);
if (error) return;
const UnicodeString str(strBuffer.get(), size);
bufSize = str.length() * 4;
boost::scoped_array<char> buffer(new char[bufSize]);
u_strToUTF8(buffer.get(), bufSize, &size, str.getBuffer(), str.length(), &error);
if (error) return;
const std::string urlUtf8(buffer.get(), size);
std::stringstream urlEncoded;
urlEncoded << std::hex << std::setfill('0');
for (std::string::const_iterator it = urlUtf8.begin(); it != urlUtf8.end(); ++it) {
urlEncoded << '%' << std::setw(2) << static_cast<unsigned int>(static_cast<unsigned char>(*it));
}
std::cout << "URL encoded: " << urlEncoded.str() << std::endl;
std::stringstream htmlEncoded;
StringCharacterIterator it = str;
while (it.hasNext()) {
const UChar32 pt = it.next32PostInc();
htmlEncoded << "&#" << pt << ';';
}
std::cout << "HTML encoded: " << htmlEncoded.str() << std::endl;
}
// Demo entry point: runs the Qt-based encoder, then the ICU-based one.
int main() {
    encodeQt();
    encodeICU();
    return 0;
}
First, convert to UTF-8. Then, normal URL/HTML encode would do the right thing.