I\'m writing a Chrome extension that involves doing a lot of the following job: sanitizing strings that might contain HTML tags, by converting
All-in-one script:
// HTML entities Encode/Decode
function htmlspecialchars(str) {
var map = {
"&": "&",
"<": "<",
">": ">",
"\"": """,
"'": "'" // ' -> ' for XML only
};
return str.replace(/[&<>"']/g, function(m) { return map[m]; });
}
function htmlspecialchars_decode(str) {
var map = {
"&": "&",
"<": "<",
">": ">",
""": "\"",
"'": "'"
};
return str.replace(/(&|<|>|"|')/g, function(m) { return map[m]; });
}
function htmlentities(str) {
var textarea = document.createElement("textarea");
textarea.innerHTML = str;
return textarea.innerHTML;
}
function htmlentities_decode(str) {
var textarea = document.createElement("textarea");
textarea.innerHTML = str;
return textarea.value;
}
http://pastebin.com/JGCVs0Ts
You could try passing a callback function to perform the replacement:
var tagsToReplace = {
'&': '&',
'<': '<',
'>': '>'
};
function replaceTag(tag) {
return tagsToReplace[tag] || tag;
}
function safe_tags_replace(str) {
return str.replace(/[&<>]/g, replaceTag);
}
Here is a performance test: http://jsperf.com/encode-html-entities to compare with calling the replace
function repeatedly, and using the DOM method proposed by Dmitrij.
Your way seems to be faster...
Why do you need it, though?
I'm not entirely sure about speed, but if you are looking for simplicity I would suggest using the lodash/underscore escape function.
The AngularJS source code also has a version inside of angular-sanitize.js.
var SURROGATE_PAIR_REGEXP = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g,
// Match everything outside of normal chars and " (quote character)
NON_ALPHANUMERIC_REGEXP = /([^\#-~| |!])/g;
/**
* Escapes all potentially dangerous characters, so that the
* resulting string can be safely inserted into attribute or
* element text.
* @param value
* @returns {string} escaped text
*/
function encodeEntities(value) {
return value.
replace(/&/g, '&').
replace(SURROGATE_PAIR_REGEXP, function(value) {
var hi = value.charCodeAt(0);
var low = value.charCodeAt(1);
return '&#' + (((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000) + ';';
}).
replace(NON_ALPHANUMERIC_REGEXP, function(value) {
return '&#' + value.charCodeAt(0) + ';';
}).
replace(/</g, '<').
replace(/>/g, '>');
}
Martijn's method as single function with handling " mark (using in javascript) :
function escapeHTML(html) {
var fn=function(tag) {
var charsToReplace = {
'&': '&',
'<': '<',
'>': '>',
'"': '"'
};
return charsToReplace[tag] || tag;
}
return html.replace(/[&<>"]/g, fn);
}
The fastest method is:
function escapeHTML(html) {
return document.createElement('div').appendChild(document.createTextNode(html)).parentNode.innerHTML;
}
This method is about twice faster than the methods based on 'replace', see http://jsperf.com/htmlencoderegex/35 .
Source: https://stackoverflow.com/a/17546215/698168