Getting the length of a string that contains Unicode characters exceeding 0xFFFF

前端未结

关注

 4  1599

I’m using this character, double sharp '𝄪' (U+1D12A).


                      
              相关标签:


      
      
        
          4条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  梦谈多话        
                
              
                            
                2021-01-14 07:00
              
            
            
                                                                       
/**
 * Number of Unicode code points in the string.
 *
 * Iterating a string walks it code point by code point, so a surrogate
 * pair contributes 1 (a lone surrogate also contributes 1).
 *
 * @returns {number} code-point count
 */
String.prototype.codes = function () {
    let total = 0;
    // The loop variable is unused on purpose: only the number of steps matters.
    for (const _codePoint of this) {
        total += 1;
    }
    return total;
};
/**
 * Number of graphemes in the string, as reported by the
 * `grapheme-splitter` package's `countGraphemes`.
 *
 * The package is required lazily, on every call, so merely defining this
 * method does not need the package to be installed.
 *
 * @returns {number} whatever `countGraphemes` returns for this string
 */
String.prototype.chars = function () {
    const Splitter = require('grapheme-splitter');
    const splitter = new Splitter();
    return splitter.countGraphemes(this);
};

console.log("F                                                                    
                                                        
            

            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          

          	          
            
           
            
                              
                
              
              
                
                  小蘑菇        
                
              
                            
                2021-01-14 07:03
              
            
            
                                                                       
JavaScript (and Java) strings use UTF-16 encoding.

Unicode codepoint U+0046 (F) is encoded in UTF-16 using 1 codeunit: 0x0046

Unicode codepoint U+1D12A (𝄪) is encoded in UTF-16 using 2 codeunits: 0xD834 0xDD2A

                                                        

            

            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          

          	          
            
           
            
                              
                
              
              
                
                  庸人自扰        
                
              
                            
                2021-01-14 07:13
              
            
            
                                                                       
Here's the function I wrote to get a string's length in code points:
/**
 * Counts the Unicode code points in a UTF-16 string.
 *
 * A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
 * (0xDC00-0xDFFF) is counted once; every other code unit, including an
 * unpaired surrogate, is counted on its own.
 *
 * @param {string} string - text to measure
 * @returns {number} number of code points
 */
function nbUnicodeLength(string) {
    const unitCount = string.length;
    let codePoints = 0;
    let pos = 0;

    while (pos < unitCount) {
        // charCodeAt yields the UTF-16 code unit (0..65535) at this index.
        const lead = string.charCodeAt(pos);
        const isHighSurrogate = lead >= 0xD800 && lead <= 0xDBFF;
        let step = 1;

        // Only look ahead when there is a following code unit to pair with.
        if (isHighSurrogate && pos + 1 < unitCount) {
            const trail = string.charCodeAt(pos + 1);
            const isLowSurrogate = trail >= 0xDC00 && trail <= 0xDFFF;
            if (isLowSurrogate) {
                step = 2;
            }
        }

        pos += step;
        codePoints += 1;
    }

    return codePoints;
}

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  北海茫月        
                
              
                            
                2021-01-14 07:14
              
            
            
                                                                       
To summarize my comments:

That's just the length of that string.

Some chars involve other chars as well, even if it looks like a single character. "̉mủt̉ả̉̉̉t̉ẻd̉W̉ỏ̉r̉̉d̉̉".length == 24

From this (great) blog post, they have a function that will return correct length:



/**
 * Counts the visible symbols in a string, treating emoji joined with
 * ZERO WIDTH JOINER (U+200D) as one symbol and ignoring variation
 * selectors (U+FE00-U+FE0F).
 *
 * Every code point counts as one symbol, and each joiner that sits between
 * two non-empty pieces merges its neighbours, removing one symbol from the
 * total. This keeps the result an integer even when a joined sequence is
 * mixed with other text (dividing by the number of pieces only works when
 * the whole string is a single joined sequence).
 *
 * Combining marks are not merged with their base character.
 *
 * @param {string} str - text to measure
 * @returns {number} number of symbols
 */
function fancyCount(str){
  const joiner = "\u{200D}";

  // Code-point length of each joiner-separated piece, variation selectors removed.
  const sizes = str
    .split(joiner)
    .map((piece) => Array.from(piece.replace(/[\ufe00-\ufe0f]/g, "")).length);

  let count = 0;
  for (let i = 0; i < sizes.length; i += 1) {
    count += sizes[i];
    // A joiner only merges when it actually has a symbol on both sides;
    // leading, trailing or doubled joiners are ignored.
    if (i > 0 && sizes[i] > 0 && sizes[i - 1] > 0) {
      count -= 1;
    }
  }

  return count;
}

console.log(fancyCount("F                                                                    
                                                        
            

            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          

          	          
                             

        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复