Count number of characters present in foreign language

后端 未结 2 1580
孤独总比滥情好
孤独总比滥情好 2021-01-05 07:41

Is there an optimal way to implement a character count for non-English letters? For example, if we take the word "Mother" in English, it is a 6-letter word…

相关标签:
2条回答
  • 2021-01-05 07:51

    You can ignore combining marks in the count calculation with this function:

    /**
     * Counts the UTF-16 code units of a string that are left after every
     * combining mark matched by the pattern below has been removed.
     *
     * The pattern covers the general combining-mark ranges plus a hand-picked
     * list of Tamil signs, so a base letter followed by such a sign counts once.
     *
     * @param {string} str - Text to measure.
     * @returns {number} Length of the text with the matched marks stripped.
     */
    function charCount( str ) {
        const combiningMarks = /[\u0300-\u036f\u1dc0-\u1dff\u20d0-\u20ff\ufe20-\ufe2f\u0b82\u0b83\u0bbe\u0bbf\u0bc0-\u0bc2\u0bc6-\u0bc8\u0bca-\u0bcd\u0bd7]/g;
        const withoutMarks = str.replace( combiningMarks, "" );
        return withoutMarks.length;
    }
    
    // Sample Tamil inputs, each annotated with the count that gets printed.
    // Paint the text character by character to verify; for instance 'யெ' is
    // a single character, not 2.
    for (const sample of [
        'மதர்',                  // 3
        "மெய்யெழுத்துக்கள்",      // 9
        "ஒவ்வொன்றுடனும்",        // 8
        "தமிழ்",                 // 3
        "வருகின்றனர்.",          // 8
        "எழுதப்படும்",            // 7
    ]) {
        console.log(charCount(sample));
    }
    

    The Tamil signs and marks are not composed into single characters with their target character in unicode, so normalization wouldn't help. I have added all the Tamil combining marks or signs manually to the regex, but it also includes the ranges for normal combining marks, so charCount("ä") is 1 regardless of normalization form.

    0 讨论(0)
  • 2021-01-05 07:58

    Update

    Back from lunch =) I'm afraid the previous approach won't work well for every foreign language, so I added another fiddle with a possible alternative:

    // Lookup table of Unicode non-spacing marks (general category Mn), stored
    // in the form escape() produces for a single UTF-16 code unit, e.g.
    // "%u0BCD". It is generated from the engine's own Unicode data instead of
    // being pasted in, so its size depends on the engine's Unicode version.
    // Only the BMP is scanned because the lookup is done per code unit.
    var UnicodeNsm = (() => {
        const nonSpacingMark = /^\p{Mn}$/u;
        const escapedMarks = [];
        for (let unit = 0; unit <= 0xffff; unit += 1) {
            const ch = String.fromCharCode(unit);
            if (nonSpacingMark.test(ch)) {
                escapedMarks.push(escape(ch));
            }
        }
        return escapedMarks;
    })();
    /**
     * Counts the UTF-16 code units of a string that are not listed in the
     * file-level UnicodeNsm table.
     *
     * Each code unit is passed through escape() before the lookup, so the
     * table is expected to hold escape()-style strings such as "%u0BCD".
     *
     * @param {string} str - Text to measure.
     * @returns {number} Number of code units whose escaped form is not in UnicodeNsm.
     */
    function countNSMString(str) {
        // Build the lookup once per call: a Set gives constant-time membership
        // checks instead of scanning the whole table for every code unit.
        const nonSpacingMarks = new Set(UnicodeNsm);
        let visibleCount = 0;
        // split("") deliberately iterates code units, not code points.
        for (const unit of str.split("")) {
            if (!nonSpacingMarks.has(escape(unit))) {
                visibleCount += 1;
            }
        }
        return visibleCount;
    }
    
    // Sample inputs passed to logL below, one per language.
    // NOTE(review): each looks like a word for "mother" — confirm.
    var English = "Mother";  
    var Tamil = "மதர்";
    var Vietnamese = "mẹ"
    var Hindi = "मां"
    
    /**
     * Logs one line comparing the count returned by countNSMString with the
     * raw String length of the same text.
     *
     * Example line: "மதர் has 3 visible Characters and 4 normal Characters"
     *
     * @param {string} str - Text to report on.
     */
    function logL (str) {
        const visible = countNSMString(str);
        const message = `${str} has ${visible} visible Characters and ${str.length} normal Characters`;
        console.log(message);
    }
    
    // Output reported by the author, in call order:
    //   "Mother has 6 visible Characters and 6 normal Characters"
    //   "மதர் has 3 visible Characters and 4 normal Characters"
    //   "mẹ has 2 visible Characters and 3 normal Characters"
    //   "मां has 1 visible Characters and 3 normal Characters"
    for (const word of [English, Tamil, Vietnamese, Hindi]) {
        logL(word);
    }
    

    So this just checks whether each character in the string is a Unicode NSM (non-spacing mark) character and, if so, leaves it out of the count. This should work for most languages, not only Tamil, and an array with 1280 elements shouldn't be much of a performance issue.

    Here is a list with the Unicode NSM's http://www.fileformat.info/info/unicode/category/Mn/list.htm

    Here is the corresponding JSBin


    After experimenting a bit with string operations, it turns out String.indexOf returns the same for

    "ர்" and for "ர" meaning
    "ர்ரர".indexOf("ர்") == "ர்ரர".indexOf("ர" + "்") //true but
    "ர்ரர".indexOf("ர") == "ர்ரர".indexOf("ர" + "ர") //false

    I took this opportunity and tried something like this

    //ர்
    
    // Sample inputs for countStr: runs of the letter ர, some of them followed
    // by one or two ் signs.
    var char = "ரர்ர்ரர்்";
    var char2 = "ரரர்ர்ரர்்";    
    var char3 = "ர்ரர்ர்ரர்்";
    
    /**
     * Heuristic character counter based on String.prototype.indexOf.
     *
     * Walks the string one UTF-16 code unit at a time. For each unit it builds
     * the pair "unit + next unit" and, when the first occurrence of the unit
     * and the first occurrence of the pair start at the same index, treats the
     * pair as one character and skips the next unit.
     *
     * NOTE(review): the comparison looks at first occurrences in the whole
     * string, not at the current position, so it is not a real combining-mark
     * test; for example countStr("ab") returns 1.
     *
     * @param {string} str - Text to measure.
     * @returns {number} Number of steps taken through the string.
     */
    function countStr(str) {
        const units = str.split("");
        let total = 0;
        let pos = 0;
        while (pos < units.length) {
            const current = units[pos];
            // At the last unit the neighbour is undefined, so the pair becomes
            // e.g. "xundefined", which normally is not found in the string.
            const pair = current + units[pos + 1];
            const pairStartsAtFirstHit = str.indexOf(current) === str.indexOf(pair);
            total += 1;
            pos += pairStartsAtFirstHit ? 2 : 1;
        }
        return total;
    }
    
    
    console.log("--");

    // Results reported by the author, in order: 6, 7, 7
    for (const sample of [char, char2, char3]) {
        console.log(countStr(sample));
    }
    

    This seems to work for the strings above. It may need some adjustments, as I don't know much about encodings, but maybe it's a starting point for you.

    Here's the JSBin

    0 讨论(0)
提交回复
热议问题