Is there any optimal way to implement character count for non English letters? For example, if we take the word \"Mother\" in English, it is a 6 letter wor
You can ignore combining marks in the count calculation with this function:
/**
 * Counts the characters of a string while ignoring combining marks.
 *
 * Every UTF-16 code unit matched by the pattern below is removed first, and
 * the length of what remains is returned.
 *
 * @param {string} str - Text to measure.
 * @returns {number} Length of `str` once the listed marks are stripped.
 */
function charCount( str ) {
    // General combining-mark ranges (\u0300-\u036f, \u1dc0-\u1dff,
    // \u20d0-\u20ff, \ufe20-\ufe2f) followed by the Tamil signs and marks
    // that were added by hand.
    var combiningMarks = /[\u0300-\u036f\u1dc0-\u1dff\u20d0-\u20ff\ufe20-\ufe2f\u0b82\u0b83\u0bbe\u0bbf\u0bc0-\u0bc2\u0bc6-\u0bc8\u0bca-\u0bcd\u0bd7]/g;
    var withoutMarks = str.replace( combiningMarks, "" );
    return withoutMarks.length;
}
// Logs charCount for each sample; the expected count is noted beside it.
// Paint the text character by character to verify, for instance 'யெ' is a single character, not 2
[
    'மதர்',              // 3
    // More tests on random Tamil text:
    "மெய்யெழுத்துக்கள்", // 9
    "ஒவ்வொன்றுடனும்",    // 8
    "தமிழ்",             // 3
    "வருகின்றனர்.",      // 8
    "எழுதப்படும்"        // 7
].forEach(function ( sample ) {
    console.log(charCount(sample));
});
In Unicode, the Tamil signs and marks are not composed into single characters with their base character, so normalization wouldn't help. I have added all the Tamil combining marks and signs to the regex manually; it also includes the ranges for the general combining marks, so charCount("ä") is 1 regardless of normalization form.
Back from lunch =) I'm afraid that the previous approach won't work as well for every foreign language, so I added another fiddle with a possible way:
// Lookup table consulted by countNSMString. The 1280 entries are elided here:
// "[Array 1280]" is a placeholder for the real list, not runnable JavaScript.
var UnicodeNsm = [Array 1280] //Holds every Unicode non-spacing mark (NSM) in escape()-encoded form
/**
 * Counts the UTF-16 code units of a string that are not listed in UnicodeNsm.
 *
 * @param {string} str - Text to measure.
 * @returns {number} Number of code units whose escape() form is absent from
 *     UnicodeNsm.
 */
function countNSMString(str) {
    // UnicodeNsm holds its entries in escape() form, so every unit is escaped
    // the same way before the lookup.
    var visibleUnits = str.split("").filter(function (unit) {
        return UnicodeNsm.indexOf(escape(unit)) === -1;
    });
    return visibleUnits.length;
}
// Sample words for the visible-vs-raw length comparison printed by logL.
// NOTE(review): the expected outputs quoted next to the logL calls imply the
// Vietnamese sample is stored decomposed (3 UTF-16 units) — confirm the
// literal is not normalized to NFC when the file is saved.
var English = "Mother";
var Tamil = "மதர்";
var Vietnamese = "mẹ"
var Hindi = "मां"
/**
 * Prints one line comparing the count from countNSMString with the raw
 * String.length of the given text.
 *
 * @param {string} str - Text to report on.
 */
function logL (str) {
    var visibleCount = countNSMString(str);
    var rawLength = str.length;
    // e.g. "மதர் has 3 visible Characters"
    var report = str + " has " + visibleCount + " visible Characters and " + rawLength + " normal Characters";
    console.log(report);
}
// Reports each sample word in turn. Output quoted by the author, in order:
//   "Mother has 6 visible Characters and 6 normal Characters"
//   "மதர் has 3 visible Characters and 4 normal Characters"
//   "mẹ has 2 visible Characters and 3 normal Characters"
//   "मां has 1 visible Characters and 3 normal Characters"
[English, Tamil, Vietnamese, Hindi].forEach(function (word) {
    logL(word);
});
This simply checks every character of the string against the list of Unicode non-spacing mark (NSM) characters and leaves any match out of the count. It should work for most languages, not only Tamil, and an array with 1280 elements shouldn't be much of a performance issue.
Here is a list of the Unicode NSMs: http://www.fileformat.info/info/unicode/category/Mn/list.htm
Here is the corresponding JSBin
After experimenting a bit with string operations, it turns out that
String.indexOf
returns the same result for
"ர்"
and for "ர",
meaning
"ர்ரர".indexOf("ர்") == "ர்ரர".indexOf("ர" + "்") //true
but
"ர்ரர".indexOf("ர") == "ர்ரர".indexOf("ர" + "ர")
//false
I took this opportunity and tried something like this
//ர்
// Sample inputs for countStr, built from the letter ர and the sign ் in
// different arrangements.
var char = "ரர்ர்ரர்்";
var char2 = "ரரர்ர்ரர்்";
var char3 = "ர்ரர்ர்ரர்்";
function countStr(str) {
var chars = str.split("");
var count = 0;
for(var i = 0, ilen = chars.length;i<ilen;i++) {
var chars2 = chars[i] + chars[i+1];
if (str.indexOf(chars[i]) == str.indexOf(chars2))
i += 1;
count++;
}
return count;
}
console.log("--");
// Logs countStr for each sample; the author's quoted results are 6, 7, 7.
[char, char2, char3].forEach(function (sample) {
    console.log(countStr(sample));
});
This seems to work for the strings above. It may need some adjustments, as I don't know much about encodings, but maybe it's a point you can start from.
Here's the JSBin