Using Javascript's atob to decode base64 doesn't properly decode utf-8 strings

后端 未结 10 1986
野趣味
野趣味 2020-11-22 16:24

I'm using the JavaScript window.atob() function to decode a base64-encoded string (specifically the base64-encoded content from the GitHub API). Problem is I'…

相关标签:
10条回答
  • 2020-11-22 16:50

    Things change. The escape/unescape methods have been deprecated.

    You can URI encode the string before you Base64-encode it. Note that this doesn't produce Base64-encoded UTF8, but rather Base64-encoded URL-encoded data. Both sides must agree on the same encoding.

    See working example here: http://codepen.io/anon/pen/PZgbPW

    // Round-trip a Unicode string through Base64. The string is URI-encoded
    // first so that btoa() only ever receives ASCII characters; the result is
    // Base64 of the percent-encoded text, not Base64 of raw UTF-8 bytes.
    // encode string
    var base64 = btoa(encodeURIComponent('€ 你好 æøåÆØÅ'));
    // decode string (must be fed the value produced by the encode step above)
    var str = decodeURIComponent(atob(base64));
    // str is now === '€ 你好 æøåÆØÅ'
    

    For OP's problem a third party library such as js-base64 should solve the problem.

    0 讨论(0)
  • 2020-11-22 16:52

    I would assume that one might want a solution that produces a widely useable base64 URI. Please visit data:text/plain;charset=utf-8;base64,4pi44pi54pi64pi74pi84pi+4pi/ to see a demonstration (copy the data uri, open a new tab, paste the data URI into the address bar, then press enter to go to the page). Despite the fact that this URI is base64-encoded, the browser is still able to recognize the high code points and decode them properly. The minified encoder+decoder is 1058 bytes (+Gzip→589 bytes)

    // Minified build of btoaUTF8(string, addBOM) and atobUTF8(base64, keepBOM).
    // Both functions are attached to the global object (global, else self, else this).
    // atobUTF8 decodes the Base64 first and only then looks for the UTF-8 BOM bytes.
    !function(e){"use strict";function h(b){var a=b.charCodeAt(0);if(55296<=a&&56319>=a)if(b=b.charCodeAt(1),b===b&&56320<=b&&57343>=b){if(a=1024*(a-55296)+b-56320+65536,65535<a)return d(240|a>>>18,128|a>>>12&63,128|a>>>6&63,128|a&63)}else return d(239,191,189);return 127>=a?b:2047>=a?d(192|a>>>6,128|a&63):d(224|a>>>12,128|a>>>6&63,128|a&63)}function k(b){var a=b.charCodeAt(0)<<24,f=l(~a),c=0,e=b.length,g="";if(5>f&&e>=f){a=a<<f>>>24+f;for(c=1;c<f;++c)a=a<<6|b.charCodeAt(c)&63;65535>=a?g+=d(a):1114111>=a?(a-=65536,g+=d((a>>10)+55296,(a&1023)+56320)):c=0}for(;c<e;++c)g+="\ufffd";return g}var m=Math.log,n=Math.LN2,l=Math.clz32||function(b){return 31-m(b>>>0)/n|0},d=String.fromCharCode,p=atob,q=btoa;e.btoaUTF8=function(b,a){return q((a?"\u00ef\u00bb\u00bf":"")+b.replace(/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g,h))};e.atobUTF8=function(b,a){b=p(b);a||"\u00ef\u00bb\u00bf"!==b.substring(0,3)||(b=b.substring(3));return b.replace(/[\xc0-\xff][\x80-\xbf]*/g,k)}}(""+void 0==typeof global?""+void 0==typeof self?this:self:global);
    

    Below is the source code used to generate it.

    var fromCharCode = String.fromCharCode;
    /**
     * Base64-encodes a JavaScript string after converting it to UTF-8 bytes.
     *
     * @param {string} inputString - text to encode.
     * @param {boolean} [BOMit] - when truthy, the bytes EF BB BF (UTF-8 byte
     *     order mark) are placed in front of the encoded text.
     * @returns {string} Base64 of the resulting byte string.
     */
    var btoaUTF8 = (function(encodeBase64, toUtf8Bytes){"use strict";
        return function(inputString, BOMit){
            var prefix = BOMit ? "\xEF\xBB\xBF" : "";
            // Each match is either one non-ASCII BMP code unit or a high
            // surrogate together with its (optional) low surrogate.
            var binary = inputString.replace(
                /[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, toUtf8Bytes
            );
            return encodeBase64(prefix + binary);
        };
    })(btoa, function(matched){"use strict";
        // Converts one regex match into a "binary string" whose char codes are
        // the UTF-8 bytes of that match.
        var code = matched.charCodeAt(0);
        var startsWithHighSurrogate = code >= 0xD800 && code <= 0xDBFF;
        if (startsWithHighSurrogate) {
            var trail = matched.charCodeAt(1); // NaN when the match is a single unit
            var trailIsLowSurrogate = trail >= 0xDC00 && trail <= 0xDFFF;
            if (!trailIsLowSurrogate) {
                // Unpaired high surrogate: emit U+FFFD as bytes EF BF BD.
                return fromCharCode(0xef, 0xbf, 0xbd);
            }
            // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
            code = (code - 0xD800) * 0x400 + trail - 0xDC00 + 0x10000;
            if (code > 0xffff) {
                // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                return fromCharCode(
                    0xf0 | (code >>> 18),
                    0x80 | ((code >>> 12) & 0x3f),
                    0x80 | ((code >>> 6) & 0x3f),
                    0x80 | (code & 0x3f)
                );
            }
        }
        if (code <= 0x007f) {
            return matched;
        }
        if (code <= 0x07ff) {
            // Two bytes: 110xxxxx 10xxxxxx
            return fromCharCode(0xc0 | (code >>> 6), 0x80 | (code & 0x3f));
        }
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        return fromCharCode(
            0xe0 | (code >>> 12),
            0x80 | ((code >>> 6) & 0x3f),
            0x80 | (code & 0x3f)
        );
    });
    

    Then, to decode the base64 data, either HTTP get the data as a data URI or use the function below.

    // Count-leading-zeros of a 32-bit integer; falls back to a logarithm-based
    // computation where Math.clz32 is unavailable.
    var clz32 = Math.clz32 || (function(naturalLog, lnOfTwo){"use strict";
        return function(value) {
            return 31 - naturalLog(value >>> 0) / lnOfTwo | 0;
        };
    })(Math.log, Math.LN2);
    var fromCharCode = String.fromCharCode;
    /**
     * Decodes Base64 whose payload is UTF-8 text into a JavaScript string.
     *
     * @param {string} inputString - Base64 data.
     * @param {boolean} [keepBOM] - when falsy, a leading UTF-8 byte order mark
     *     (EF BB BF) is dropped from the decoded bytes before conversion.
     * @returns {string} the decoded text; byte sequences that cannot be decoded
     *     are replaced by one U+FFFD per byte.
     */
    var atobUTF8 = (function(decodeBase64, fromUtf8Bytes){"use strict";
        return function(inputString, keepBOM){
            var binary = decodeBase64(inputString);
            var hasBOM = binary.substring(0,3) === "\xEF\xBB\xBF";
            if (hasBOM && !keepBOM) {
                binary = binary.substring(3);
            }
            // Lead byte 0b11xxxxxx (0xc0-0xff) followed by any number of
            // continuation bytes 0b10xxxxxx (0x80-0xbf).
            return binary.replace(/[\xc0-\xff][\x80-\xbf]*/g, fromUtf8Bytes);
        };
    })(atob, function(sequence){"use strict";
        // The number of leading 1 bits in the lead byte is the declared length
        // of the sequence; move the byte to the top of a 32-bit word to count them.
        var leadWord = sequence.charCodeAt(0) << 24;
        var declaredLength = clz32(~leadWord);
        var available = sequence.length;
        var consumed = 0;
        var text = "";
        if (declaredLength < 5 && available >= declaredLength) {
            // Strip the length prefix, keeping only the payload bits of the lead byte.
            var value = (leadWord << declaredLength) >>> (24 + declaredLength);
            for (consumed = 1; consumed < declaredLength; ++consumed) {
                value = (value << 6) | (sequence.charCodeAt(consumed) & 0x3f);
            }
            if (value <= 0xFFFF) {
                text += fromCharCode(value); // BMP code point
            } else if (value <= 0x10FFFF) {
                // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
                value -= 0x10000;
                text += fromCharCode(
                    (value >> 10) + 0xD800,   // high surrogate
                    (value & 0x3ff) + 0xDC00  // low surrogate
                );
            } else {
                consumed = 0; // out of range: every byte becomes a replacement character
            }
        }
        // Anything not consumed above (all bytes, or surplus continuation
        // bytes) is reported as U+FFFD.
        while (consumed < available) {
            text += "\ufffd";
            ++consumed;
        }
        return text;
    });
    

    The advantage of being more standard is that this encoder and this decoder are more widely applicable because they can be used as a valid URL that displays correctly. Observe.

    // Live demo: the text typed into #source is unescaped, encoded as a
    // Base64 data: URI (UTF-8 with BOM), written to #urlBar and loaded into
    // the #mainframe iframe.
    (function(window){
        "use strict";
        var sourceEle = document.getElementById("source");
        var urlBarEle = document.getElementById("urlBar");
        var mainFrameEle = document.getElementById("mainframe");
        var gotoButton = document.getElementById("gotoButton");
        var parseInt = window.parseInt;
        var fromCodePoint = String.fromCodePoint;
        var fromCharCode = String.fromCharCode;

        // Single-character escapes are resolved by table lookup. JSON.parse is
        // not usable for this: JSON has no \v or \' escape and throws on them.
        var simpleEscapes = {
            "b": "\b", "f": "\f", "n": "\n", "r": "\r", "t": "\t", "v": "\v",
            "\"": "\"", "'": "'", "\\": "\\"
        };

        /**
         * Expands backslash escapes typed by the user into the characters
         * they denote. Handled forms: \u{H...}, \uHHHH, \xHH, the
         * single-character escapes \b \f \n \r \t \v \" \' \\, octal \0NNN and
         * decimal \NNN. An escape that does not yield a valid code point is
         * replaced by one U+FFFD per character of the escape.
         *
         * @param {string} str - raw text from the source box.
         * @returns {string} the text with escapes expanded.
         */
        function unescape(str){
            // \u{...} has to be tried before \uHHHH: the latter also accepts
            // zero hex digits and would otherwise consume just "\u".
            return str.replace(/\\u\{[^}]*\}|\\u[\da-fA-F]{0,4}|\\x[\da-fA-F]{0,2}|\\[bfnrtv"'\\]|\\0[0-7]{1,3}|\\\d{1,3}/g, function(match){
                try{
                    if (match.startsWith("\\u{"))
                        // skip the three characters "\u{" and the closing "}"
                        return fromCodePoint(parseInt(match.slice(3,-1),16));
                    if (match.startsWith("\\u") || match.startsWith("\\x"))
                        return fromCodePoint(parseInt(match.substring(2),16));
                    if (match.startsWith("\\0") && match.length > 2)
                        return fromCodePoint(parseInt(match.substring(2),8));
                    if (/^\\\d/.test(match)) return fromCodePoint(+match.slice(1));
                }catch(e){
                    // fromCodePoint throws RangeError for NaN or out-of-range values
                    return "\ufffd".repeat(match.length);
                }
                return simpleEscapes[match.charAt(1)];
            });
        }

        // Rebuild the data: URI from the source box and (always) reload the iframe.
        function whenChange(){
            try{
                urlBarEle.value = "data:text/plain;charset=UTF-8;base64," + btoaUTF8(unescape(sourceEle.value), true);
            } finally{ gotoURL(); }
        }
        sourceEle.addEventListener("change",whenChange,{passive:1});
        sourceEle.addEventListener("input",whenChange,{passive:1});

        // IFrame Setup:
        function gotoURL(){mainFrameEle.src = urlBarEle.value}
        gotoButton.addEventListener("click", gotoURL, {passive: 1});
        function urlChanged(){urlBarEle.value = mainFrameEle.src}
        mainFrameEle.addEventListener("load", urlChanged, {passive: 1});
        // Not registered as passive: preventDefault() is ignored in passive listeners.
        urlBarEle.addEventListener("keypress", function(evt){
            // KeyboardEvent.key is case-sensitive; the Enter key reports "Enter".
            if (evt.key === "Enter") {
                evt.preventDefault();
                gotoURL(); // load whatever the user typed into the URL bar
            }
        });

        /**
         * Base64-encodes a string as UTF-8 bytes.
         * @param {string} inputString - text to encode.
         * @param {boolean} [BOMit] - prepend the UTF-8 byte order mark when truthy.
         * @returns {string} Base64 text.
         */
        var btoaUTF8 = (function(btoa, replacer){
            "use strict";
            return function(inputString, BOMit){
                return btoa((BOMit?"\xEF\xBB\xBF":"") + inputString.replace(
                    /[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, replacer
                ));
            }
        })(btoa, function(nonAsciiChars){
            "use strict";
            // make the UTF string into a binary UTF-8 encoded string
            var point = nonAsciiChars.charCodeAt(0);
            if (point >= 0xD800 && point <= 0xDBFF) {
                var nextcode = nonAsciiChars.charCodeAt(1);
                if (nextcode !== nextcode) { // NaN because string is 1 code point long
                    return fromCharCode(0xef/*11101111*/, 0xbf/*10111111*/, 0xbd/*10111101*/);
                }
                // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
                if (nextcode >= 0xDC00 && nextcode <= 0xDFFF) {
                    point = (point - 0xD800) * 0x400 + nextcode - 0xDC00 + 0x10000;
                    if (point > 0xffff) {
                        return fromCharCode(
                            (0x1e/*0b11110*/<<3) | (point>>>18),
                            (0x2/*0b10*/<<6) | ((point>>>12)&0x3f/*0b00111111*/),
                            (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
                            (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
                        );
                    }
                } else {
                    return fromCharCode(0xef, 0xbf, 0xbd);
                }
            }
            // Return the matched text itself; no other identifier holds it in this scope.
            if (point <= 0x007f) { return nonAsciiChars; }
            else if (point <= 0x07ff) {
                return fromCharCode((0x6<<5)|(point>>>6), (0x2<<6)|(point&0x3f/*00111111*/));
            } else {
                return fromCharCode(
                    (0xe/*0b1110*/<<4) | (point>>>12),
                    (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
                    (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
                );
            }
        });
        // Render once with the initial contents of the source box.
        setTimeout(whenChange, 0);
    })(window);
    /* Fade any image slightly while it is being pressed (:active state). */
    img:active{opacity:0.8}
    <!-- Demo layout: a source text box on top and, below it, a bordered frame
         holding a URL bar, a go button and an iframe. -->
    <center>
    <textarea id="source" style="width:66.7vw">Hello \u1234 W\186\0256ld!
    Enter text into the top box. Then the URL will update automatically.
    </textarea><br />
    <!-- Bordered frame: URL bar and go button on the first row, iframe underneath. -->
    <div style="width:66.7vw;display:inline-block;height:calc(25vw + 1em + 6px);border:2px solid;text-align:left;line-height:1em">
    <input id="urlBar" style="width:calc(100% - 1em - 13px)" /><img id="gotoButton" src="" style="width:calc(1em + 4px);line-height:1em;vertical-align:-40%;cursor:pointer" />
    <iframe id="mainframe" style="width:66.7vw;height:25vw" frameBorder="0"></iframe>
    </div>
    </center>

    In addition to being very standardized, the above code snippets are also very fast. Instead of an indirect chain of succession where the data has to be converted several times between various forms (such as in Riccardo Galli's response), the above code snippet is as direct as performantly possible. It uses only one simple fast String.prototype.replace call to process the data when encoding, and only one to decode the data when decoding. Another plus is that (especially for big strings), String.prototype.replace allows the browser to automatically handle the underlying memory management of resizing the string, leading to a significant performance boost, especially in evergreen browsers like Chrome and Firefox that heavily optimize String.prototype.replace. Finally, the icing on the cake is that for you Latin script exclūsīvō users, strings which don't contain any code points above 0x7f are extra fast to process because the string remains unmodified by the replacement algorithm.

    I have created a github repository for this solution at https://github.com/anonyco/BestBase64EncoderDecoder/

    0 讨论(0)
  • 2020-11-22 16:53

    If treating strings as bytes is more your thing, you can use the following functions

    /**
     * Decodes Base64 text into the bytes it represents.
     *
     * @param {string} ascii - Base64-encoded data.
     * @returns {Uint8Array} one element per decoded byte.
     */
    function u_atob(ascii) {
        var binary = atob(ascii);
        var bytes = new Uint8Array(binary.length);
        // atob() yields one character (code 0-255) per byte.
        for (var i = 0; i < binary.length; i++) {
            bytes[i] = binary.charCodeAt(i);
        }
        return bytes;
    }
    
    /**
     * Encodes bytes as Base64 text.
     *
     * @param {ArrayBuffer|ArrayLike<number>} buffer - anything accepted by the
     *     Uint8Array constructor.
     * @returns {string} Base64 representation of the bytes.
     */
    function u_btoa(buffer) {
        var view = new Uint8Array(buffer);
        // Turn every byte into the character with the same code, as btoa() expects.
        var chars = Array.from(view, byte => String.fromCharCode(byte));
        return btoa(chars.join(''));
    }
    
    
    // example, it works also with astral plane characters such as '                                                                    
    0 讨论(0)
  • 2020-11-22 16:55

    Here is the 2018 updated solution as described in the Mozilla Development Resources

    TO ENCODE FROM UNICODE TO B64

    /**
     * Base64-encodes a string as UTF-8.
     *
     * @param {string} str - text to encode.
     * @returns {string} Base64 of the UTF-8 bytes of str.
     */
    function b64EncodeUnicode(str) {
        // Step 1: percent-encode, which spells non-ASCII text as UTF-8 %XX escapes.
        var percentEncoded = encodeURIComponent(str);
        // Step 2: collapse each %XX escape into the single raw byte it names,
        // giving a byte string that btoa() accepts.
        var binary = percentEncoded.replace(/%([0-9A-F]{2})/g, function (escape, hex) {
            return String.fromCharCode(parseInt(hex, 16));
        });
        return btoa(binary);
    }
    
    b64EncodeUnicode('✓ à la mode'); // "4pyTIMOgIGxhIG1vZGU="
    b64EncodeUnicode('\n'); // "Cg=="
    

    TO DECODE FROM B64 TO UNICODE

    /**
     * Decodes Base64 that contains UTF-8 text.
     *
     * @param {string} str - Base64-encoded UTF-8 data.
     * @returns {string} the decoded text.
     */
    function b64DecodeUnicode(str) {
        // Reverse of encoding: raw bytes -> %XX escapes -> original string.
        var binary = atob(str);
        var percentEncoded = '';
        for (var i = 0; i < binary.length; i++) {
            var hex = binary.charCodeAt(i).toString(16);
            // every escape needs exactly two hex digits
            percentEncoded += '%' + (hex.length < 2 ? '0' + hex : hex);
        }
        return decodeURIComponent(percentEncoded);
    }
    
    b64DecodeUnicode('4pyTIMOgIGxhIG1vZGU='); // "✓ à la mode"
    b64DecodeUnicode('Cg=='); // "\n"
    
    0 讨论(0)
提交回复
热议问题