Getting the length of a string that contains Unicode characters exceeding 0xFFFF

前端 未结 4 1596
没有蜡笔的小新
没有蜡笔的小新 2021-01-14 06:23

I’m using this character, double sharp \'

相关标签:
4条回答
  • 2021-01-14 07:00
    // Code-point count: iterating a string yields one entry per code point,
    // so a surrogate pair is counted once instead of twice.
    String.prototype.codes = function() {
        return Array.from(this).length;
    };
    // Grapheme count as reported by the third-party `grapheme-splitter`
    // package (loaded on every call, as the package is required lazily).
    String.prototype.chars = function() {
        const Splitter = require('grapheme-splitter');
        const splitter = new Splitter();
        return splitter.countGraphemes(this);
    };
    
    console.log("F                                                                    
    0 讨论(0)
  • 2021-01-14 07:03

    Javascript (and Java) strings use UTF-16 encoding.

    Unicode codepoint U+0046 (F) is encoded in UTF-16 using 1 codeunit: 0x0046

    Unicode codepoint U+1D12A (𝄪) is encoded in UTF-16 using 2 codeunits: 0xD834 0xDD2A

    0 讨论(0)
  • 2021-01-14 07:13

    Here is the function I wrote to get the length of a string in code points

    /**
     * Counts the Unicode code points in a string.
     *
     * `string.length` counts UTF-16 code units, so any character above U+FFFF
     * (encoded as a surrogate pair) contributes 2 to it. Here a well-formed
     * surrogate pair counts as one, and an unpaired surrogate also counts as one.
     *
     * @param {string} string - Text to measure.
     * @returns {number} Number of code points in `string`.
     */
    function nbUnicodeLength(string){
        const unitCount = string.length;
        let codePointCount = 0;
        let unitIndex = 0;
        while (unitIndex < unitCount) {
            // codePointAt() yields a value above 0xFFFF only when a high surrogate
            // is immediately followed by a low surrogate, i.e. the character
            // occupies two code units; everything else occupies one.
            unitIndex += string.codePointAt(unitIndex) > 0xFFFF ? 2 : 1;
            codePointCount += 1;
        }
        return codePointCount;
    }
    
    0 讨论(0)
  • 2021-01-14 07:14

    To summarize my comments:

    That's just the length of that string.

    Some chars involve other chars as well, even if it looks like a single character. "̉mủt̉ả̉̉̉t̉ẻd̉W̉ỏ̉r̉̉d̉̉".length == 24

    This (great) blog post has a function that returns the correct length:

    /**
     * Approximates the number of visible symbols in a string.
     *
     * Counts code points, ignores variation selectors (U+FE00..U+FE0F), and
     * treats symbols glued together by a zero-width joiner (U+200D) as one.
     * Combining marks are still counted as separate symbols.
     *
     * The previous implementation divided the total by the number of
     * joiner-separated pieces, which is only right when the whole string is a
     * single joined sequence; any surrounding text produced a fractional result.
     *
     * @param {string} str - Text to measure.
     * @returns {number} Integer count of symbols.
     */
    function fancyCount(str){
      const joiner = "\u{200D}";
      const segments = str.split(joiner);
      let count = 0;
      let previousHadSymbols = false;

      for(const segment of segments){
        // removing the variation selectors
        const symbols = Array.from(segment.replace(/[\ufe00-\ufe0f]/g, "")).length;
        count += symbols;

        // A joiner fuses the last symbol of the previous segment with the first
        // symbol of this one, so the pair must be counted once, not twice.
        // Stray joiners (next to an empty segment) fuse nothing.
        if(previousHadSymbols && symbols > 0){
          count -= 1;
        }
        previousHadSymbols = symbols > 0;
      }

      return count;
    }
    
    console.log(fancyCount("F                                                                    
    0 讨论(0)
提交回复
热议问题