For a poor man\'s implementation of near-collation-correct sorting on the client side I need a JavaScript function that does efficient single character rep
https://stackoverflow.com/a/37511463
With ES2015/ES6 String.Prototype.Normalize(),
const str = "Crème Brulée" str.normalize('NFD').replace(/[\u0300-\u036f]/g, "") > 'Creme Brulee'
Two things are happening here:
normalize()
ing toNFD
Unicode normal form decomposes combined graphemes into the combination of simple ones. Theè
ofCrème
ends up expressed ase
+̀
.- Using a regex character class to match the U+0300 → U+036F range, it is now trivial to
g
lobally get rid of the diacritics, which the Unicode standard conveniently groups as the Combining Diacritical Marks Unicode block.See comment for performance testing.
Alternatively, if you just want sorting
Intl.Collator has sufficient support ~85% right now, a polyfill is also available here but I haven't tested it.
const c = new Intl.Collator(); ['creme brulee', 'crème brulée', 'crame brulai', 'crome brouillé', 'creme brulay', 'creme brulfé', 'creme bruléa'].sort(c.compare) [ 'crame brulai','creme brulay','creme bruléa','creme brulee', 'crème brulée','creme brulfé','crome brouillé' ] ['creme brulee', 'crème brulée', 'crame brulai', 'crome brouillé'].sort((a,b) => a>b) ["crame brulai", "creme brulee", "crome brouillé", "crème brulée"]
Answer os Crisalin is almost perfect. Just improved the performance to avoid create new RegExp on each run.
var normalizeConversions = [
{ regex: new RegExp('ä|æ|ǽ', 'g'), clean: 'ae' },
{ regex: new RegExp('ö|œ', 'g'), clean: 'oe' },
{ regex: new RegExp('ü', 'g'), clean: 'ue' },
{ regex: new RegExp('Ä', 'g'), clean: 'Ae' },
{ regex: new RegExp('Ü', 'g'), clean: 'Ue' },
{ regex: new RegExp('Ö', 'g'), clean: 'Oe' },
{ regex: new RegExp('À|Á|Â|Ã|Ä|Å|Ǻ|Ā|Ă|Ą|Ǎ', 'g'), clean: 'A' },
{ regex: new RegExp('à|á|â|ã|å|ǻ|ā|ă|ą|ǎ|ª', 'g'), clean: 'a' },
{ regex: new RegExp('Ç|Ć|Ĉ|Ċ|Č', 'g'), clean: 'C' },
{ regex: new RegExp('ç|ć|ĉ|ċ|č', 'g'), clean: 'c' },
{ regex: new RegExp('Ð|Ď|Đ', 'g'), clean: 'D' },
{ regex: new RegExp('ð|ď|đ', 'g'), clean: 'd' },
{ regex: new RegExp('È|É|Ê|Ë|Ē|Ĕ|Ė|Ę|Ě', 'g'), clean: 'E' },
{ regex: new RegExp('è|é|ê|ë|ē|ĕ|ė|ę|ě', 'g'), clean: 'e' },
{ regex: new RegExp('Ĝ|Ğ|Ġ|Ģ', 'g'), clean: 'G' },
{ regex: new RegExp('ĝ|ğ|ġ|ģ', 'g'), clean: 'g' },
{ regex: new RegExp('Ĥ|Ħ', 'g'), clean: 'H' },
{ regex: new RegExp('ĥ|ħ', 'g'), clean: 'h' },
{ regex: new RegExp('Ì|Í|Î|Ï|Ĩ|Ī|Ĭ|Ǐ|Į|İ', 'g'), clean: 'I' },
{ regex: new RegExp('ì|í|î|ï|ĩ|ī|ĭ|ǐ|į|ı', 'g'), clean: 'i' },
{ regex: new RegExp('Ĵ', 'g'), clean: 'J' },
{ regex: new RegExp('ĵ', 'g'), clean: 'j' },
{ regex: new RegExp('Ķ', 'g'), clean: 'K' },
{ regex: new RegExp('ķ', 'g'), clean: 'k' },
{ regex: new RegExp('Ĺ|Ļ|Ľ|Ŀ|Ł', 'g'), clean: 'L' },
{ regex: new RegExp('ĺ|ļ|ľ|ŀ|ł', 'g'), clean: 'l' },
{ regex: new RegExp('Ñ|Ń|Ņ|Ň', 'g'), clean: 'N' },
{ regex: new RegExp('ñ|ń|ņ|ň|ʼn', 'g'), clean: 'n' },
{ regex: new RegExp('Ò|Ó|Ô|Õ|Ō|Ŏ|Ǒ|Ő|Ơ|Ø|Ǿ', 'g'), clean: 'O' },
{ regex: new RegExp('ò|ó|ô|õ|ō|ŏ|ǒ|ő|ơ|ø|ǿ|º', 'g'), clean: 'o' },
{ regex: new RegExp('Ŕ|Ŗ|Ř', 'g'), clean: 'R' },
{ regex: new RegExp('ŕ|ŗ|ř', 'g'), clean: 'r' },
{ regex: new RegExp('Ś|Ŝ|Ş|Š', 'g'), clean: 'S' },
{ regex: new RegExp('ś|ŝ|ş|š|ſ', 'g'), clean: 's' },
{ regex: new RegExp('Ţ|Ť|Ŧ', 'g'), clean: 'T' },
{ regex: new RegExp('ţ|ť|ŧ', 'g'), clean: 't' },
{ regex: new RegExp('Ù|Ú|Û|Ũ|Ū|Ŭ|Ů|Ű|Ų|Ư|Ǔ|Ǖ|Ǘ|Ǚ|Ǜ', 'g'), clean: 'U' },
{ regex: new RegExp('ù|ú|û|ũ|ū|ŭ|ů|ű|ų|ư|ǔ|ǖ|ǘ|ǚ|ǜ', 'g'), clean: 'u' },
{ regex: new RegExp('Ý|Ÿ|Ŷ', 'g'), clean: 'Y' },
{ regex: new RegExp('ý|ÿ|ŷ', 'g'), clean: 'y' },
{ regex: new RegExp('Ŵ', 'g'), clean: 'W' },
{ regex: new RegExp('ŵ', 'g'), clean: 'w' },
{ regex: new RegExp('Ź|Ż|Ž', 'g'), clean: 'Z' },
{ regex: new RegExp('ź|ż|ž', 'g'), clean: 'z' },
{ regex: new RegExp('Æ|Ǽ', 'g'), clean: 'AE' },
{ regex: new RegExp('ß', 'g'), clean: 'ss' },
{ regex: new RegExp('IJ', 'g'), clean: 'IJ' },
{ regex: new RegExp('ij', 'g'), clean: 'ij' },
{ regex: new RegExp('Œ', 'g'), clean: 'OE' },
{ regex: new RegExp('ƒ', 'g'), clean: 'f' }
];
Usage:
function(str){
normalizeConversions.forEach(function(normalizeEntry){
str = str.replace(normalizeEntry.regex, normalizeEntry.clean);
});
return str;
};
I just wanted to post my solution using String#localeCompare
const base_chars = [
'1', '2', '3', '4', '5', '6', '7', '8', '9',
'0', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
'-', '_', ' '
];
const fix = str => str.normalize('NFKD').split('')
.map(c => base_chars.find(bc => bc.localeCompare(c, 'en', { sensitivity: 'base' })==0))
.join('');
const str = 'OÒ óëå-123';
console.log(`fix(${str}) = ${fix(str)}`);
I can't speak to what you are trying to do specifically with the function itself, but if you don't like the regex being built every time, here are two solutions and some caveats about each.
Here is one way to do this:
function makeSortString(s) {
if(!makeSortString.translate_re) makeSortString.translate_re = /[öäüÖÄÜ]/g;
var translate = {
"ä": "a", "ö": "o", "ü": "u",
"Ä": "A", "Ö": "O", "Ü": "U" // probably more to come
};
return ( s.replace(makeSortString.translate_re, function(match) {
return translate[match];
}) );
}
This will obviously make the regex a property of the function itself. The only thing you may not like about this (or you may, I guess it depends) is that the regex can now be modified outside of the function's body. So, someone could do this to modify the interally-used regex:
makeSortString.translate_re = /[a-z]/g;
So, there is that option.
One way to get a closure, and thus prevent someone from modifying the regex, would be to define this as an anonymous function assignment like this:
var makeSortString = (function() {
var translate_re = /[öäüÖÄÜ]/g;
return function(s) {
var translate = {
"ä": "a", "ö": "o", "ü": "u",
"Ä": "A", "Ö": "O", "Ü": "U" // probably more to come
};
return ( s.replace(translate_re, function(match) {
return translate[match];
}) );
}
})();
Hopefully this is useful to you.
UPDATE: It's early and I don't know why I didn't see the obvious before, but it might also be useful to put you translate
object in a closure as well:
var makeSortString = (function() {
var translate_re = /[öäüÖÄÜ]/g;
var translate = {
"ä": "a", "ö": "o", "ü": "u",
"Ä": "A", "Ö": "O", "Ü": "U" // probably more to come
};
return function(s) {
return ( s.replace(translate_re, function(match) {
return translate[match];
}) );
}
})();
If you want to achieve sorting where "ä" comes after "a" and is not treated as the same, then you can use a function like mine.
You can always change the alphabet to get different or even weird sortings. However, if you want some letters to be equivalent, then you have to manipulate the strings like a = a.replace(/ä/, 'a')
or similar, as many have already replied above. I've included the uppercase letters if someone wants to have all uppercase words before all lowercase words (then you have to ommit .toLowerCase()
).
function sortbyalphabet(a,b) {
alphabet = "0123456789AaÀàÁáÂâÃãÄäBbCcÇçDdÈèÉéÊêËëFfGgHhÌìÍíÎîÏïJjKkLlMmNnÑñOoÒòÓóÔôÕõÖöPpQqRrSsTtÙùÚúÛûÜüVvWwXxÝýŸÿZz";
a = a.toLowerCase();
b = b.toLowerCase();
shorterone = (a.length > b.length ? a : b);
for (i=0; i<shorterone.length; i++){
diff = alphabet.indexOf(a.charAt(i)) - alphabet.indexOf(b.charAt(i));
if (diff!=0){
return diff;
}
}
// sort the shorter first
return a.length - b.length;
}
var n = ["ast", "Äste", "apfel", "äpfel", "à"];
console.log(n.sort(sortbyalphabet));
// should return ["apfel", "ast", "à", "äpfel", "äste"]
Not a single answer mentions String.localeCompare, which happens to do exactly what you originally wanted, but not what you're asking for.
var list = ['a', 'b', 'c', 'o', 'u', 'z', 'ä', 'ö', 'ü'];
list.sort((a, b) => a.localeCompare(b));
console.log(list);
//Outputs ['a', 'ä', 'b', 'c', 'o', 'ö', 'u', 'ü', 'z']
The second and third parameter are not supported by older browsers though. It's an option worth considering nonetheless.