I have Javascript in an XHTML web page that is passing UTF-8 encoded strings. It needs to continue to pass the UTF-8 version, as well as decode it. How is it possible to decode a UTF-8 string for display?
<script type="text/javascript">
// <![CDATA[
function updateUser(usernameSent){
var usernameReceived = usernameSent; // Current value: Größe
var usernameDecoded = usernameReceived; // Decode to: Größe
var html2id = '';
html2id += 'Encoded: ' + usernameReceived + '<br />Decoded: ' + usernameDecoded;
document.getElementById('userId').innerHTML = html2id;
}
// ]]>
</script>
To answer the original question: here is how you decode utf-8 in javascript:
http://ecmanaut.blogspot.ca/2006/07/encoding-decoding-utf8-in-javascript.html
Specifically,
function encode_utf8(s) {
return unescape(encodeURIComponent(s));
}
function decode_utf8(s) {
return decodeURIComponent(escape(s));
}
I just used this in my code, and it works perfectly.
This should work:
// http://www.onicos.com/staff/iz/amuse/javascript/expert/utf.txt
/* utf.js - UTF-8 <=> UTF-16 convertion
*
* Copyright (C) 1999 Masanao Izumo <iz@onicos.co.jp>
* Version: 1.0
* LastModified: Dec 25 1999
* This library is free. You can redistribute it and/or modify it.
*/
function Utf8ArrayToStr(array) {
var out, i, len, c;
var char2, char3;
out = "";
len = array.length;
i = 0;
while(i < len) {
c = array[i++];
switch(c >> 4)
{
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
// 0xxxxxxx
out += String.fromCharCode(c);
break;
case 12: case 13:
// 110x xxxx 10xx xxxx
char2 = array[i++];
out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
break;
case 14:
// 1110 xxxx 10xx xxxx 10xx xxxx
char2 = array[i++];
char3 = array[i++];
out += String.fromCharCode(((c & 0x0F) << 12) |
((char2 & 0x3F) << 6) |
((char3 & 0x3F) << 0));
break;
}
}
return out;
}
Check out the JSFiddle demo.
@albert's solution was the closest I think but it can only parse up to 3 byte utf-8 characters
function utf8ArrayToStr(array) {
var out, i, len, c;
var char2, char3;
out = "";
len = array.length;
i = 0;
// XXX: Invalid bytes are ignored
while(i < len) {
c = array[i++];
if (c >> 7 == 0) {
// 0xxx xxxx
out += String.fromCharCode(c);
continue;
}
// Invalid starting byte
if (c >> 6 == 0x02) {
continue;
}
// #### MULTIBYTE ####
// How many bytes left for thus character?
var extraLength = null;
if (c >> 5 == 0x06) {
extraLength = 1;
} else if (c >> 4 == 0x0e) {
extraLength = 2;
} else if (c >> 3 == 0x1e) {
extraLength = 3;
} else if (c >> 2 == 0x3e) {
extraLength = 4;
} else if (c >> 1 == 0x7e) {
extraLength = 5;
} else {
continue;
}
// Do we have enough bytes in our data?
if (i+extraLength > len) {
var leftovers = array.slice(i-1);
// If there is an invalid byte in the leftovers we might want to
// continue from there.
for (; i < len; i++) if (array[i] >> 6 != 0x02) break;
if (i != len) continue;
// All leftover bytes are valid.
return {result: out, leftovers: leftovers};
}
// Remove the UTF-8 prefix from the char (res)
var mask = (1 << (8 - extraLength - 1)) - 1,
res = c & mask, nextChar, count;
for (count = 0; count < extraLength; count++) {
nextChar = array[i++];
// Is the char valid multibyte part?
if (nextChar >> 6 != 0x02) {break;};
res = (res << 6) | (nextChar & 0x3f);
}
if (count != extraLength) {
i--;
continue;
}
if (res <= 0xffff) {
out += String.fromCharCode(res);
continue;
}
res -= 0x10000;
var high = ((res >> 10) & 0x3ff) + 0xd800,
low = (res & 0x3ff) + 0xdc00;
out += String.fromCharCode(high, low);
}
return {result: out, leftovers: []};
}
This returns {result: "parsed string", leftovers: [list of invalid bytes at the end]}
in case you are parsing the string in chunks.
EDIT: fixed the issue that @unhammer found.
Here is a solution handling all Unicode code points include upper (4 byte) values and supported by all modern browsers (IE and others > 5.5). It uses decodeURIComponent(), but NOT the deprecated escape/unescape functions:
function utf8_to_str(a) {
for(var i=0, s=''; i<a.length; i++) {
var h = a[i].toString(16)
if(h.length < 2) h = '0' + h
s += '%' + h
}
return decodeURIComponent(s)
}
Tested and available on GitHub
To create UTF-8 from a string:
function utf8_from_str(s) {
for(var i=0, enc = encodeURIComponent(s), a = []; i < enc.length;) {
if(enc[i] === '%') {
a.push(parseInt(enc.substr(i+1, 2), 16))
i += 3
} else {
a.push(enc.charCodeAt(i++))
}
}
return a
}
Tested and available on GitHub
Update @Albert's answer adding condition for emoji.
function Utf8ArrayToStr(array) {
var out, i, len, c;
var char2, char3, char4;
out = "";
len = array.length;
i = 0;
while(i < len) {
c = array[i++];
switch(c >> 4)
{
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
// 0xxxxxxx
out += String.fromCharCode(c);
break;
case 12: case 13:
// 110x xxxx 10xx xxxx
char2 = array[i++];
out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
break;
case 14:
// 1110 xxxx 10xx xxxx 10xx xxxx
char2 = array[i++];
char3 = array[i++];
out += String.fromCharCode(((c & 0x0F) << 12) |
((char2 & 0x3F) << 6) |
((char3 & 0x3F) << 0));
break;
case 15:
// 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
char2 = array[i++];
char3 = array[i++];
char4 = array[i++];
out += String.fromCodePoint(((c & 0x07) << 18) | ((char2 & 0x3F) << 12) | ((char3 & 0x3F) << 6) | (char4 & 0x3F));
break;
}
return out;
}
// String to Utf8 ByteBuffer
function strToUTF8(str){
return Uint8Array.from(encodeURIComponent(str).replace(/%(..)/g,(m,v)=>{return String.fromCodePoint(parseInt(v,16))}), c=>c.codePointAt(0))
}
// Utf8 ByteArray to string
function UTF8toStr(ba){
return decodeURIComponent(ba.reduce((p,c)=>{return p+'%'+c.toString(16),''}))
}
Perhaps using the textDecoder will be sufficient.
Not supported in all browsers though. But it might be sufficient if you use crosswalk or any other use case where you know what browser is used.
var decoder = new TextDecoder('utf-8'),
decodedMessage;
decodedMessage = decoder.decode(message.data);
This is what I found after a more specific Google search than just UTF-8 encode/decode. so for those who are looking for a converting library to convert between encodings, here you go.
https://github.com/inexorabletash/text-encoding
var uint8array = new TextEncoder().encode(str);
var str = new TextDecoder(encoding).decode(uint8array);
Paste from repo readme
All encodings from the Encoding specification are supported:
utf-8 ibm866 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-5 iso-8859-6 iso-8859-7 iso-8859-8 iso-8859-8-i iso-8859-10 iso-8859-13 iso-8859-14 iso-8859-15 iso-8859-16 koi8-r koi8-u macintosh windows-874 windows-1250 windows-1251 windows-1252 windows-1253 windows-1254 windows-1255 windows-1256 windows-1257 windows-1258 x-mac-cyrillic gb18030 hz-gb-2312 big5 euc-jp iso-2022-jp shift_jis euc-kr replacement utf-16be utf-16le x-user-defined
(Some encodings may be supported under other names, e.g. ascii, iso-8859-1, etc. See Encoding for additional labels for each encoding.)
I reckon the easiest way would be to use a built-in js functions decodeURI() / encodeURI().
function (usernameSent) {
var usernameEncoded = usernameSent; // Current value: utf8
var usernameDecoded = decodeURI(usernameReceived); // Decoded
// do stuff
}
Using my 1.6KB library, you can do
ToString(FromUTF8(Array.from(usernameReceived)))
I searched for a simple solution and this works well for me:
//input data
view = new Uint8Array(data);
//output string
serialString = ua2text(view);
//convert UTF8 to string
function ua2text(ua) {
s = "";
for (var i = 0; i < ua.length; i++) {
s += String.fromCharCode(ua[i]);
}
return s;
}
Only issue I have is sometimes I get one character at a time. This might be by design with my source of the arraybuffer. I'm using https://github.com/xseignard/cordovarduino to read serial data on an android device.
来源:https://stackoverflow.com/questions/13356493/decode-utf-8-with-javascript