I'm currently creating a Node.js web scraper/proxy, but I'm having trouble parsing relative URLs found in the scripting part of the source. I figured a regex would do the trick.
A reliable way to convert urls from relative to absolute is to use the built-in url module.
Example:
// Load Node's built-in `url` module.
// NOTE(review): newer Node.js documentation lists url.resolve() as a legacy
// API and suggests `new URL(relative, base)` instead — confirm for your version.
var url = require('url');
// Resolve the relative reference (2nd argument) against the base URL (1st
// argument); the expected result is shown on the line that follows.
url.resolve("http://www.example.org/foo/bar/", "../baz/qux.html");
>> gives 'http://www.example.org/foo/baz/qux.html'
If you use a regex to find all non-absolute URLs, you can then just prefix them with the current URL and that should be it.
The URLs you need to fix would be ones which don't start either with a /
or http(s)://
(or other protocol markers, if you care about them)
As an example, let's say you're scraping http://www.example.com/
. If you encounter a relative URL, let's say foo/bar
, you would simply prefix the URL being scraped to it like so: http://www.example.com/foo/bar
For a regex to scrape the URLs from the page, there are probably plenty of good ones available if you google a bit so I'm not going to start inventing a poor one here :)
This is Rob W's answer ("Advanced HTML string replacement functions") from this thread, plus some refactoring from me to make JSLint happy.
I would have posted it as a comment on that answer, but I don't have enough reputation points.
/*jslint browser: true */
/*jslint regexp: true */
/*jslint unparam: true*/
/*jshint strict: false */
/**
 * convertRelToAbsUrl
 *
 * Resolves a URL found in the current page against `location`.
 *
 * - URLs that start with a whitelisted scheme (http, https, file, ftp, ftps,
 *   mailto, javascript, data:image/...) are returned untouched.
 * - Protocol-relative ("//host/x") and root-relative ("/x") URLs are prefixed
 *   with the current protocol / protocol + host and returned as-is.
 * - Empty or whitespace-only input yields ''.
 * - Everything else (including URLs with a non-whitelisted scheme) is treated
 *   as a path relative to the directory of the current document; "." and ".."
 *   segments are collapsed without ever climbing above the host, and the
 *   characters " ' < > are percent-encoded in the result.
 *
 * Changes compared with the earlier version: "./x" no longer resolves one
 * directory too high, a page URL ending in "/" keeps its last directory, the
 * page's own query string / fragment no longer leak into the base, dot-files
 * such as ".well-known" are preserved, and "?query" / "#fragment" references
 * stay on the current document.
 *
 * https://stackoverflow.com/a/7544757/1983903
 *
 * @param {String} url
 * @return {String} updated url
 */
function convertRelToAbsUrl(url) {
    var baseUrl = null,
        parsed = null,
        authority = '',
        docPath = '',
        cut = -1,
        relPath = '',
        suffix = '',
        dir = '',
        segments = null,
        segment = '',
        stack = [],
        endsInDot = false,
        resolved = '',
        i = 0;

    if (/^(https?|file|ftps?|mailto|javascript|data:image\/[^;]{2,9};):/i.test(url)) {
        return url; // url is already absolute
    }
    if (url.substring(0, 2) === '//') {
        return location.protocol + url;
    }
    if (url.charAt(0) === '/') {
        return location.protocol + '//' + location.host + url;
    }
    if (/^\s*$/.test(url)) {
        return ''; // empty = return nothing
    }

    // Split the page URL into "scheme://authority" and path; the page's own
    // query string and fragment are ignored on purpose.
    baseUrl = location.href;
    parsed = /^([^\/?#]*\/\/[^\/?#]*)?([^?#]*)/.exec(baseUrl);
    authority = parsed[1] || '';
    docPath = parsed[2];

    // Keep the reference's query/fragment out of the dot-segment handling.
    cut = url.search(/[?#]/);
    relPath = cut === -1 ? url : url.substring(0, cut);
    suffix = cut === -1 ? '' : url.substring(cut);

    if (relPath === '') {
        // "#frag" keeps the page's path and query; "?query" keeps only the path.
        resolved = suffix.charAt(0) === '#'
            ? baseUrl.replace(/#[\s\S]*$/, '')
            : authority + docPath;
    } else {
        // Directory of the current document (everything up to the last slash).
        dir = docPath.substring(0, docPath.lastIndexOf('/') + 1);
        if (dir.charAt(0) !== '/') {
            dir = '/' + dir;
        }
        segments = (dir + relPath).split('/');
        // segments[0] is the empty string in front of the leading slash.
        for (i = 1; i < segments.length; i += 1) {
            segment = segments[i];
            endsInDot = segment === '.' || segment === '..';
            if (segment === '..') {
                stack.pop(); // no-op at the root, so ".." cannot escape the host
            } else if (segment !== '.') {
                stack.push(segment);
            }
        }
        resolved = authority + '/' + stack.join('/');
        if (endsInDot && stack.length > 0) {
            resolved += '/'; // a trailing "." or ".." names a directory
        }
    }

    // Escape characters that could break out of an HTML attribute.
    return (resolved + suffix).replace(/"/g, '%22').replace(/'/g, '%27')
        .replace(/</g, '%3C').replace(/>/g, '%3E');
}
/**
 * convertAllRelativeToAbsoluteUrls
 *
 * Rewrites URLs inside a string of HTML by passing each one through
 * convertRelToAbsUrl. The following places are searched, in this order:
 * <meta http-equiv=refresh content=...>, href attributes, src attributes,
 * <object data>, <applet codebase>, <param name=movie value>, url(...) inside
 * <style> elements and url(...) inside style attributes.
 *
 * The URL resolver is taken from `this.convertRelToAbsUrl` when the function
 * is invoked with a receiver that has such a method (the historical call
 * style); otherwise the file-level convertRelToAbsUrl function is used, so
 * the function also works in strict mode / ES modules where `this` is
 * undefined.
 *
 * https://stackoverflow.com/a/7544757/1983903
 *
 * @param {String} html
 * @return {String} updated html
 */
function convertAllRelativeToAbsoluteUrls(html) {
    var me = this,
        resolveUrl = (me && typeof me.convertRelToAbsUrl === 'function')
            ? function (url) { return me.convertRelToAbsUrl(url); }
            : convertRelToAbsUrl,
        // An attribute name may not be preceded by one of these characters.
        att = '[^-a-z0-9:._]',
        // Numeric-entity prefixes (decimal / hexadecimal, any number of leading
        // zeros) and the no-break-space entity. They are assembled by
        // concatenation so that tools which unescape HTML entities cannot
        // corrupt this source text.
        decPrefix = '&' + '#0*',
        hexPrefix = '&' + '#x0*',
        nbsp = '&' + 'nbsp;?',
        entityEnd = '(?:;|(?!\\d))',
        // Pre-built patterns for characters that need escaping in a RegExp.
        ents = {
            ' ' : '(?:\\s|' + nbsp + '|' + decPrefix + '32' + entityEnd + '|' + hexPrefix + '20' + entityEnd + ')',
            '(' : '(?:\\(|' + decPrefix + '40' + entityEnd + '|' + hexPrefix + '28' + entityEnd + ')',
            ')' : '(?:\\)|' + decPrefix + '41' + entityEnd + '|' + hexPrefix + '29' + entityEnd + ')',
            '.' : '(?:\\.|' + decPrefix + '46' + entityEnd + '|' + hexPrefix + '2e' + entityEnd + ')'
        },
        charMap = {}, // per-character pattern cache used by ae()
        s = ents[' '] + '*', // short-hand for common use
        // Matches anything inside a tag; must be wrapped in < and >.
        any = '(?:[^>\"\']*(?:\"[^\"]*\"|\'[^\']*\'))*?[^>]*',
        slashRE = null,
        dotRE = null;

    /* "Any Entity": returns a RegExp source that matches `string` written
     * literally or with any of its characters as a numeric HTML entity. */
    function ae(string) {
        var allCharsLowerCase = string.toLowerCase(),
            allCharsUpperCase = string.toUpperCase(),
            reRes = '',
            charLowerCase = null,
            charUpperCase = null,
            reSub = null,
            i = null;
        if (ents[string]) {
            return ents[string];
        }
        for (i = 0; i < string.length; i++) {
            charLowerCase = allCharsLowerCase.charAt(i);
            if (charMap[charLowerCase]) {
                reRes += charMap[charLowerCase];
                continue;
            }
            charUpperCase = allCharsUpperCase.charAt(i);
            reSub = [charLowerCase];
            reSub.push(decPrefix + charLowerCase.charCodeAt(0) + entityEnd);
            reSub.push(hexPrefix + charLowerCase.charCodeAt(0).toString(16) + entityEnd);
            if (charLowerCase !== charUpperCase) {
                reSub.push(decPrefix + charUpperCase.charCodeAt(0) + entityEnd);
                reSub.push(hexPrefix + charUpperCase.charCodeAt(0).toString(16) + entityEnd);
            }
            reSub = '(?:' + reSub.join('|') + ')';
            reRes += (charMap[charLowerCase] = reSub);
        }
        return (ents[string] = reRes);
    }

    /* replace() callback: resolves the URL captured in group 2. */
    function by(match, group1, group2, group3) {
        return group1 + resolveUrl(group2) + group3;
    }

    slashRE = new RegExp(ae('/'), 'g');
    dotRE = new RegExp(ae('.'), 'g');

    /* replace() callback: like by(), but first decodes entity-encoded
     * slashes and dots in the captured URL. */
    function by2(match, group1, group2, group3) {
        group2 = group2.replace(slashRE, '/').replace(dotRE, '.');
        return group1 + resolveUrl(group2) + group3;
    }

    /* "Create Replace": within every match of `selector`, rewrites the value
     * that follows `attribute` + `marker` (double-quoted, single-quoted or
     * unquoted). `delimiter` adds characters that end the value; `end` is a
     * pattern the value must stop before. */
    function cr(selector, attribute, marker, delimiter, end) {
        var re1 = null,
            re2 = null,
            re3 = null;
        if (typeof selector === 'string') {
            selector = new RegExp(selector, 'gi');
        }
        attribute = att + attribute;
        marker = typeof marker === 'string' ? marker : '\\s*=\\s*';
        delimiter = typeof delimiter === 'string' ? delimiter : '';
        end = typeof end === 'string' ? '?)(' + end : ')(';
        re1 = new RegExp('(' + attribute + marker + '")([^"' + delimiter + ']+' + end + ')', 'gi');
        re2 = new RegExp('(' + attribute + marker + '\')([^\'' + delimiter + ']+' + end + ')', 'gi');
        re3 = new RegExp('(' + attribute + marker + ')([^"\'][^\\s>' + delimiter + ']*' + end + ')', 'gi');
        html = html.replace(selector, function (match) {
            return match.replace(re1, by).replace(re2, by).replace(re3, by);
        });
    }

    /* "Create Replace Inline": within every match of `selector`, takes the
     * quoted value of `attribute` and rewrites the URLs that follow `front`
     * inside that value.
     * NOTE(review): re1/re2 capture the attribute value without its quotes,
     * while at1/at2 require a quote after the URL and at3 is only built when
     * `delimiter` is a string — confirm which inputs actually reach by2. */
    function cri(selector, attribute, front, flags, delimiter, end) {
        var re1 = null,
            re2 = null,
            at1 = null,
            at2 = null,
            at3 = null,
            handleAttr = null;
        if (typeof selector === 'string') {
            selector = new RegExp(selector, 'gi');
        }
        attribute = att + attribute;
        flags = typeof flags === 'string' ? flags : 'gi';
        re1 = new RegExp('(' + attribute + '\\s*=\\s*")([^"]*)', 'gi');
        re2 = new RegExp("(" + attribute + "\\s*=\\s*')([^']+)", 'gi');
        at1 = new RegExp('(' + front + ')([^"]+)(")', flags);
        at2 = new RegExp("(" + front + ")([^']+)(')", flags);
        if (typeof delimiter === 'string') {
            end = typeof end === 'string' ? end : '';
            at3 = new RegExp('(' + front + ')([^\"\'][^' + delimiter + ']*' + (end ? '?)(' + end + ')' : ')()'), flags);
            handleAttr = function (match, g1, g2) {
                return g1 + g2.replace(at1, by2).replace(at2, by2).replace(at3, by2);
            };
        } else {
            handleAttr = function (match, g1, g2) {
                return g1 + g2.replace(at1, by2).replace(at2, by2);
            };
        }
        html = html.replace(selector, function (match) {
            return match.replace(re1, handleAttr).replace(re2, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content=" ; url= " > */
    cri('<meta' + any + att + 'http-equiv\\s*=\\s*(?:\"' + ae('refresh')
        + '\"' + any + '>|\'' + ae('refresh') + '\'' + any + '>|' + ae('refresh')
        + '(?:' + ae(' ') + any + '>|>))', 'content', ae('url') + s + ae('=') + s, 'i');
    cr('<' + any + att + 'href\\s*=' + any + '>', 'href'); /* Linked elements */
    cr('<' + any + att + 'src\\s*=' + any + '>', 'src'); /* Embedded elements */
    cr('<object' + any + att + 'data\\s*=' + any + '>', 'data'); /* <object data= > */
    cr('<applet' + any + att + 'codebase\\s*=' + any + '>', 'codebase'); /* <applet codebase= > */
    /* <param name=movie value= >*/
    cr('<param' + any + att + 'name\\s*=\\s*(?:\"' + ae('movie') + '\"' + any + '>|\''
        + ae('movie') + '\'' + any + '>|' + ae('movie') + '(?:' + ae(' ') + any + '>|>))', 'value');
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi,
        'url', '\\s*\\(\\s*', '', '\\s*\\)'); /* <style> */
    cri('<' + any + att + 'style\\s*=' + any + '>', 'style',
        ae('url') + s + ae('(') + s, 0, s + ae(')'), ae(')')); /*< style=" url(...) " > */
    return html;
}
Note for the OP, because he requested such a function: change base_url
to your proxy's base URL in order to achieve the desired results.
Two functions will be shown below (the usage guide is contained within the code). Make sure that you don't skip any part of the explanation of this answer to fully understand the function's behaviour.
rel_to_abs(url)
- This function returns absolute URLs. When an absolute URL with a commonly trusted protocol is passed, it is returned immediately. Otherwise, an absolute URL is generated from base_url and the function argument. Relative URLs are correctly parsed (../ ; ./ ; . ; //).

replace_all_rel_by_abs
- This function parses all occurrences of URLs which have a significant meaning in HTML, such as CSS url(), links and external resources. See the code for a full list of parsed instances. See this answer for an adjusted implementation to sanitise HTML strings from an external source (to embed in the document).

rel_to_abs
- Parsing relative URLs
/*
 @name rel_to_abs
 @description Converts a relative / unknown URL to an absolute URL, using the
              current page (location) as the base.
              - URLs starting with a whitelisted scheme are returned untouched.
              - "//host/x" gets the current protocol, "/x" the current
                protocol and host; both are returned without further changes.
              - Empty / whitespace-only input returns "".
              - Anything else is resolved against the directory of the current
                document: "." and ".." segments are collapsed (".." never
                climbs above the host), the page's own query string and
                fragment are ignored, and "?query" / "#fragment" references
                stay on the current document.
              The characters " ' < > in the result are percent-encoded.
 @param String url URL to convert
 @return String absolute URL, or "" for empty input
*/
function rel_to_abs(url){
    /* Only accept commonly trusted protocols:
     * Only data-image URLs are accepted, Exotic flavours (escaped slash,
     * html-entitied characters) are not supported to keep the function fast */
    if(/^(https?|file|ftps?|mailto|javascript|data:image\/[^;]{2,9};):/i.test(url))
        return url; //Url is already absolute

    if(url.substring(0,2) === "//")
        return location.protocol + url;
    if(url.charAt(0) === "/")
        return location.protocol + "//" + location.host + url;
    if(/^\s*$/.test(url))
        return ""; //Empty = Return nothing

    /* URL of the document the relative URLs belong to */
    var base_url = location.href;
    /* Split into "scheme://authority" and path; query/fragment are dropped */
    var parts = /^([^\/?#]*\/\/[^\/?#]*)?([^?#]*)/.exec(base_url);
    var authority = parts[1] || "";
    var doc_path = parts[2];

    /* Keep the reference's own query/fragment out of the path handling */
    var cut = url.search(/[?#]/);
    var rel_path = cut === -1 ? url : url.substring(0, cut);
    var suffix = cut === -1 ? "" : url.substring(cut);

    var resolved;
    if(rel_path === ""){
        /* "#frag" keeps path and query of the page, "?query" only the path */
        resolved = suffix.charAt(0) === "#"
            ? base_url.replace(/#[\s\S]*$/, "")
            : authority + doc_path;
    } else {
        /* Directory of the current document: everything up to the last slash */
        var dir = doc_path.substring(0, doc_path.lastIndexOf("/") + 1);
        if(dir.charAt(0) !== "/") dir = "/" + dir;

        var segments = (dir + rel_path).split("/");
        var stack = [];
        var ends_in_dot = false;
        /* segments[0] is the empty string in front of the leading slash */
        for(var i=1; i<segments.length; i++){
            var segment = segments[i];
            ends_in_dot = segment === "." || segment === "..";
            if(segment === ".."){
                stack.pop(); /* No-op at the root: ".." cannot escape the host */
            } else if(segment !== "."){
                stack.push(segment);
            }
        }
        resolved = authority + "/" + stack.join("/");
        /* A trailing "." or ".." refers to a directory */
        if(ends_in_dot && stack.length > 0) resolved += "/";
    }

    /* Escape certain characters to prevent XSS */
    return (resolved + suffix).replace(/"/g,"%22").replace(/'/g,"%27")
        .replace(/</g,"%3C").replace(/>/g,"%3E");
}
Cases / examples:
- http://foo.bar : Already an absolute URL, thus returned immediately.
- /doo : Relative to the root. Returns the current root + the provided relative URL.
- ./meh : Relative to the current directory.
- ../booh : Relative to the parent directory.
The function converts relative paths to ../ and performs a search-and-replace (http://domain/sub/anything-but-a-slash/../me to http://domain/sub/me).
replace_all_rel_by_abs
- Convert all relevant occurrences of URLs
<script> contents and event handlers are not replaced, because it's near-impossible to create a fast-and-secure filter to parse JavaScript.
This script is served with some comments inside. Regular expressions are dynamically created, because an individual RE can have a size of 3000 characters. <meta http-equiv=refresh content=.. >
can be obfuscated in various ways, hence the size of the RE.
/*
 @name replace_all_rel_by_abs
 @description Rewrites URLs inside a string of HTML by passing each one
              through rel_to_abs. Searched, in this order: <meta
              http-equiv=refresh content=..>, href attributes, src attributes,
              <object data>, <applet codebase>, <param name=movie value>,
              url(..) inside <style> elements and url(..) inside style
              attributes.
 @param String html HTML source to process
 @return String the HTML with the matched URLs replaced
*/
function replace_all_rel_by_abs(html){
    /*HTML/XML Attribute may not be prefixed by these characters (common
       attribute chars.  This list is not complete, but will be sufficient
       for this function (see http://www.w3.org/TR/REC-xml/#NT-NameChar). */
    var att = "[^-a-z0-9:._]";

    /* Numeric-entity prefixes (decimal / hexadecimal, any number of leading
       zeros) and the no-break-space entity. They are assembled by
       concatenation so that tools which unescape HTML entities cannot
       corrupt this source text. */
    var dec = "&" + "#0*";
    var hex = "&" + "#x0*";
    var nbsp = "&" + "nbsp;?";

    var entityEnd = "(?:;|(?!\\d))";
    var ents = {" ":"(?:\\s|"+nbsp+"|"+dec+"32"+entityEnd+"|"+hex+"20"+entityEnd+")",
                "(":"(?:\\(|"+dec+"40"+entityEnd+"|"+hex+"28"+entityEnd+")",
                ")":"(?:\\)|"+dec+"41"+entityEnd+"|"+hex+"29"+entityEnd+")",
                ".":"(?:\\.|"+dec+"46"+entityEnd+"|"+hex+"2e"+entityEnd+")"};
                /* Placeholders to filter obfuscations */
    var charMap = {};
    var s = ents[" "]+"*"; //Short-hand for common use
    var any = "(?:[^>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^>]*";
    /* ^ Important: Must be pre- and postfixed by < and >.
     *   This RE should match anything within a tag!  */

    /*
      @name ae
      @description  Converts a given string in a sequence of the original
                      input and the HTML entity
      @param String string  String to convert
      */
    function ae(string){
        var all_chars_lowercase = string.toLowerCase();
        if(ents[string]) return ents[string];
        var all_chars_uppercase = string.toUpperCase();
        var RE_res = "";
        for(var i=0; i<string.length; i++){
            var char_lowercase = all_chars_lowercase.charAt(i);
            if(charMap[char_lowercase]){
                RE_res += charMap[char_lowercase];
                continue;
            }
            var char_uppercase = all_chars_uppercase.charAt(i);
            var RE_sub = [char_lowercase];
            RE_sub.push(dec + char_lowercase.charCodeAt(0) + entityEnd);
            RE_sub.push(hex + char_lowercase.charCodeAt(0).toString(16) + entityEnd);
            if(char_lowercase !== char_uppercase){
                /* Note: RE ignorecase flag has already been activated */
                RE_sub.push(dec + char_uppercase.charCodeAt(0) + entityEnd);
                RE_sub.push(hex + char_uppercase.charCodeAt(0).toString(16) + entityEnd);
            }
            RE_sub = "(?:" + RE_sub.join("|") + ")";
            RE_res += (charMap[char_lowercase] = RE_sub);
        }
        return(ents[string] = RE_res);
    }

    /*
      @name by
      @description  2nd argument for replace().
      */
    function by(match, group1, group2, group3){
        /* Note that this function can also be used to remove links:
         * return group1 + "javascript://" + group3; */
        return group1 + rel_to_abs(group2) + group3;
    }
    /*
      @name by2
      @description  2nd argument for replace(). Parses relevant HTML entities
      */
    var slashRE = new RegExp(ae("/"), 'g');
    var dotRE = new RegExp(ae("."), 'g');
    function by2(match, group1, group2, group3){
        /*Note that this function can also be used to remove links:
         * return group1 + "javascript://" + group3; */
        group2 = group2.replace(slashRE, "/").replace(dotRE, ".");
        return group1 + rel_to_abs(group2) + group3;
    }
    /*
      @name cr
      @description            Selects a HTML element and performs a
                                search-and-replace on attributes
      @param String selector  HTML substring to match
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String marker    Optional RegExp-escaped; marks the prefix
      @param String delimiter Optional RegExp escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to end
                                before an occurrence of <end>
     */
    function cr(selector, attribute, marker, delimiter, end){
        if(typeof selector === "string") selector = new RegExp(selector, "gi");
        attribute = att + attribute;
        marker = typeof marker === "string" ? marker : "\\s*=\\s*";
        delimiter = typeof delimiter === "string" ? delimiter : "";
        end = typeof end === "string" ? "?)("+end : ")(";
        var re1 = new RegExp('('+attribute+marker+'")([^"'+delimiter+']+'+end+')', 'gi');
        var re2 = new RegExp("("+attribute+marker+"')([^'"+delimiter+"]+"+end+")", 'gi');
        var re3 = new RegExp('('+attribute+marker+')([^"\'][^\\s>'+delimiter+']*'+end+')', 'gi');
        html = html.replace(selector, function(match){
            return match.replace(re1, by).replace(re2, by).replace(re3, by);
        });
    }
    /*
      @name cri
      @description            Selects an attribute of a HTML element, and
                                performs a search-and-replace on certain values
      @param String selector  HTML element to match
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String front     RegExp-escaped; attribute value, prefix to match
      @param String flags     Optional RegExp flags, default "gi"
      @param String delimiter Optional RegExp-escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to end
                                before an occurrence of <end>
      NOTE(review): re1/re2 capture the attribute value without its quotes,
      while at1/at2 require a quote after the URL and at3 is only built when
      <delimiter> is a string - confirm which inputs actually reach by2.
     */
    function cri(selector, attribute, front, flags, delimiter, end){
        if(typeof selector === "string") selector = new RegExp(selector, "gi");
        attribute = att + attribute;
        flags = typeof flags === "string" ? flags : "gi";
        var re1 = new RegExp('('+attribute+'\\s*=\\s*")([^"]*)', 'gi');
        var re2 = new RegExp("("+attribute+"\\s*=\\s*')([^']+)", 'gi');
        var at1 = new RegExp('('+front+')([^"]+)(")', flags);
        var at2 = new RegExp("("+front+")([^']+)(')", flags);
        var handleAttr;
        if(typeof delimiter === "string"){
            end = typeof end === "string" ? end : "";
            var at3 = new RegExp("("+front+")([^\"'][^"+delimiter+"]*" + (end?"?)("+end+")":")()"), flags);
            handleAttr = function(match, g1, g2){
                return g1+g2.replace(at1, by2).replace(at2, by2).replace(at3, by2);
            };
        } else {
            handleAttr = function(match, g1, g2){
                return g1+g2.replace(at1, by2).replace(at2, by2);
            };
        }
        html = html.replace(selector, function(match){
             return match.replace(re1, handleAttr).replace(re2, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    cri("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+">|'"+ae("refresh")+"'"+any+">|"+ae("refresh")+"(?:"+ae(" ")+any+">|>))", "content", ae("url")+s+ae("=")+s, "i");

    cr("<"+any+att+"href\\s*="+any+">", "href"); /* Linked elements */
    cr("<"+any+att+"src\\s*="+any+">", "src"); /* Embedded elements */

    cr("<object"+any+att+"data\\s*="+any+">", "data"); /* <object data= > */
    cr("<applet"+any+att+"codebase\\s*="+any+">", "codebase"); /* <applet codebase= > */

    /* <param name=movie value= >*/
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+">|'"+ae("movie")+"'"+any+">|"+ae("movie")+"(?:"+ae(" ")+any+">|>))", "value");

    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)"); /* <style> */
    cri("<"+any+att+"style\\s*="+any+">", "style", ae("url")+s+ae("(")+s, 0, s+ae(")"), ae(")")); /*< style=" url(...) " > */
    return html;
}
A short summary of the private functions:
rel_to_abs(url)
- Converts relative / unknown URLs to absolute URLs.
replace_all_rel_by_abs(html)
- Replaces all relevant occurrences of URLs within a string of HTML by absolute URLs.
ae
- Any Entity - Returns a RE pattern to deal with HTML entities.
by
- replace by - This short function requests the actual URL replacement (rel_to_abs). This function may be called hundreds, if not thousands, of times. Be careful not to add a slow algorithm to this function (customisation).
cr
- Create Replace - Creates and executes a search-and-replace. Example: href="..." (within any HTML tag).
cri
- Create Replace Inline - Creates and executes a search-and-replace. Example: url(..) within the style attribute of HTML tags.

Open any page, and paste the following bookmarklet in the location bar:
javascript:void(function(){var s=document.createElement("script");s.src="http://rob.lekensteyn.nl/rel_to_abs.js";document.body.appendChild(s)})();
The injected code contains the two functions, as defined above, plus the test case, shown below. Note: The test case does not modify the HTML of the page, but shows the parsed results in a textarea (optionally).
// Timing harness: run the rewriter over the current document's markup and
// report how long it took. The page itself is never modified; the rewritten
// HTML is only shown (on request) in a full-screen textarea.
var t = new Date().getTime();
var result = replace_all_rel_by_abs(document.documentElement.innerHTML);
var txt;
if (confirm(new Date().getTime() - t + " milliseconds to execute\n\nPut results in new textarea?")) {
    txt = document.createElement("textarea");
    txt.style.cssText = "position:fixed;top:0;left:0;width:100%;height:99%";
    txt.value = result;
    // Double-click removes the textarea again.
    txt.ondblclick = function () {
        this.parentNode.removeChild(this);
    };
    document.body.appendChild(txt);
}
See also:
From a comment by Rob W above about the base tag I wrote an injection function:
/**
 * Injects a <base> element into the <head> of an HTML string.
 *
 * Any <base> element already present inside <head> is removed, and `base` is
 * inserted (surrounded by single spaces) just before </head>. All other head
 * content is kept. If the markup has no <head>...</head> pair it is returned
 * unchanged.
 *
 * Changes compared with the earlier version: an existing <base> no longer
 * causes the whole head content to be discarded, tags that merely contain
 * "head" or "base" in their name (<thead>, <header>, <basefont>) are no
 * longer matched, and "$" sequences inside `base` are inserted literally.
 *
 * @param {String} html  HTML source
 * @param {String} base  markup of the <base> element to insert
 * @return {String} updated html
 */
function injectBase(html, base) {
    return html.replace(/(<head(?=[\s>])[^>]*>)([\s\S]*?)(<\/head\s*>)/gi, function (match, open, inner, close) {
        // Remove only the <base> elements; keep <title>, <meta>, ... untouched.
        var cleaned = inner.replace(/<base(?=[\s\/>])[^>]*>/gi, "");
        // A replacer function is used so `base` is never parsed for $1, $& etc.
        return open + cleaned + " " + base + " " + close;
    });
}