I need to get the HTML with current styles (maybe inlined) of a page that finished rendering and finished running scripts, using a server side application which will be given ju
PhantomJS is a headless (GUI-less) WebKit with JavaScript API. It runs on all major platforms, as I requested in my question.
It can run Javascript scripts to control the GUI-less web browser. It has a powerful API, and lots and lots of examples.
In my spare time over the last 2-3 days I wrote the solution to my question, and it covers all requirements beautifully. I haven't found a webpage for which it wouldn't work.
.
Usage, command line:
phantomjs save_as_html.js http://stackoverflow.com/q/12215844/584490 saved.html
.
Javascript is allowed to run for n
seconds after everything else loads, it should work even for web pages generated entirely by javascript.
.
Notes:
Where possible, XHR loading of resources is prefered over HTML5's canvas rendering because of reduced file size and preventing quality loss (reusing original files is better than anything).
<link>
and <img>
tags are kept in place, and data:
URIs are used inside the href and src attributes respectively, instead of URLs. The same is true for background-image
, which is read using getComputedStyle() on all DOM nodes.
<script>
tags and event handler attributes are removed.
<link>
tags with rel="alternative"
are removed also (maybe they shouldn't be, and instead be fixed into an absolute URL, if relative).
<iframe>
is currently not handled, and its src attribute is beeing set to about:blank
.
.
Beware all cross site scripting security restrictions are lifted, so that all resources can be loaded. Make sure you don't try to save malicious webpages while using some secret credentials of your Facebook account :).
.
save_as_html.js
contents:
//http://stackoverflow.com/a/12256190/584490
var page = require('webpage').create();
page.onConsoleMessage = function (msg) { console.log(msg); };
var system = require('system');
var address, output, size;
if (system.args.length!=3)
{
console.log('Usage: save_as_html.js URL filename');
phantom.exit(1);
}
else
{
address = system.args[1];
output = system.args[2];
page.viewportSize = {
width: 1680,
height: 1050,
};
//SECURITY_ERR: DOM Exception 18: An attempt was made to break through the security policy of the user agent.
//Enable cross site scripting:
page.settings.XSSAuditingEnabled=false;
page.settings.localToRemoteUrlAccessEnabled=true;
page.settings.webSecurityEnabled=false;
page.settings.userAgent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1";
page.settings.ignoreSslErrors=true;
page.open(address, function (status){
if (status!=='success')
{
console.log("Unable to load URL, returned status: "+status);
phantom.exit(1);
}
else
{
window.setTimeout(function (){
page.evaluate(function(){
var nodeList=document.getElementsByTagName("*");
var arrEventHandlerAttributes=[
"onblur", "onchange", "onclick", "ondblclick", "onfocus", "onkeydown", "onkeyup", "onkeypress", "onkeyup","onload",
"onmousedown", "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onreset", "onselect", "onsubmit", "onunload"
];
//http://stackoverflow.com/a/7372816/584490
var base64Encode=function(str)
{
var CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
var out = "", i = 0, len = str.length, c1, c2, c3;
while (i < len) {
c1 = str.charCodeAt(i++) & 0xff;
if (i == len) {
out += CHARS.charAt(c1 >> 2);
out += CHARS.charAt((c1 & 0x3) << 4);
out += "==";
break;
}
c2 = str.charCodeAt(i++);
if (i == len) {
out += CHARS.charAt(c1 >> 2);
out += CHARS.charAt(((c1 & 0x3) << 4) | ((c2 & 0xF0) >> 4));
out += CHARS.charAt((c2 & 0xF) << 2);
out += "=";
break;
}
c3 = str.charCodeAt(i++);
out += CHARS.charAt(c1 >> 2);
out += CHARS.charAt(((c1 & 0x3) << 4) | ((c2 & 0xF0) >> 4));
out += CHARS.charAt(((c2 & 0xF) << 2) | ((c3 & 0xC0) >> 6));
out += CHARS.charAt(c3 & 0x3F);
}
return out;
};
for(var n=nodeList.length-1; n>0; n--)
{
try
{
var el=nodeList[n];
if(el.nodeName=="IMG" && el.src.substr(0, 5)!="data:")
{
/*var canvas=document.createElement("canvas");
canvas.width=parseInt(el.width);
canvas.height=parseInt(el.height);
var ctx=canvas.getContext("2d");
ctx.drawImage(el, 0, 0);
el.src=canvas.toDataURL();*/
var xhr=new XMLHttpRequest();
xhr.open(
"get",
el.src,
/*Asynchronous*/ false
);
xhr.overrideMimeType("text/plain; charset=x-user-defined");
xhr.send(null);
var strResponseContentType=xhr.getResponseHeader("Content-type").split(";")[0].replace(/[^a-z0-9\/-]/gi, "");
el.src="data:"+strResponseContentType+";base64,"+base64Encode(xhr.responseText);
}
else if(el.nodeName=="LINK")
{
if(el.rel=="alternate")
{
el.parentNode.removeChild(el);
}
else if(el.href.substr(0, 5)!="data:")
{
var xhr=new XMLHttpRequest();
xhr.open(
"get",
el.href,
/*Asynchronous*/ false
);
xhr.overrideMimeType("text/plain; charset=x-user-defined");
xhr.send(null);
//var strResponseContentType=xhr.getResponseHeader("Content-type").split(";")[0].replace(/[^a-z0-9\/-]/gi, "");
//el.href="data:"+strResponseContentType+";base64,"+base64Encode(xhr.responseText);
el.href="data:"+el.type+";base64,"+base64Encode(xhr.responseText);
}
continue;
}
else if(el.nodeName=="SCRIPT")
{
el.parentNode.removeChild(el);
continue;
}
else if(el.nodeName=="IFRAME")
{
el.src="about:blank";
continue;
}
for(var z=arrEventHandlerAttributes.length-1; z>=0; z--)
el.removeAttribute(arrEventHandlerAttributes[z]);
var strBackgroundImageURL=window.getComputedStyle(el).getPropertyValue("background-image").replace("/[\s]/g", "");
if(strBackgroundImageURL.substr(0, 4)=="url(" && strBackgroundImageURL.substr(4, 5)!="data:")
{
strBackgroundImageURL=strBackgroundImageURL.substr(4, strBackgroundImageURL.length-5);
/*var imageTemp=document.createElement("img");
imageTemp.src=strBackgroundImageURL;
imageTemp.onload=function(e){
var canvas=document.createElement("canvas");
canvas.width=parseInt(imageTemp.width);
canvas.height=parseInt(imageTemp.height);
var ctx=canvas.getContext("2d");
ctx.drawImage(imageTemp, 0, 0);
el.style.backgroundImage="url("+canvas.toDataURL()+")";
};
if (imageTemp.complete)
imageTemp.onload();
*/
var xhr=new XMLHttpRequest();
xhr.open(
"get",
strBackgroundImageURL,
/*Asynchronous*/ false
);
xhr.overrideMimeType("text/plain; charset=x-user-defined");
xhr.send(null);
var strResponseContentType=xhr.getResponseHeader("Content-type").split(";")[0].replace(/[^a-z0-9\/-]/gi, "");
el.style.backgroundImage="url("+"data:"+strResponseContentType+";base64,"+base64Encode(xhr.responseText)+")";
}
if(el.nodeName=="A")
{
el.href="#";//TODO convert relative paths to absolute ones (keep URLs);
el.setAttribute("onclick", "return false;");//TODO: remove this when the above is fixed.
}
else if(el.nodeName=="FORM")
{
el.setAttribute("action", "");
el.setAttribute("onsubmit", "return false;");
}
}
catch(error)
{
//what can be done about it?
}
}
});
require("fs").write(output, page.content, "w");
phantom.exit();
}, 1000);
}
});
}