Is there a way in JS to get the entire HTML within the html tags, as a string?
document.documentElement.??
document.documentElement.outerHTML
I am using outerHTML
for elements (the main <html>
container), and XMLSerializer
for anything else including <!DOCTYPE>
, random comments outside the <html>
container, or whatever else might be there. It seems that whitespace isn't preserved outside the <html>
element, so I'm adding newlines by default with sep="\n"
.
function get_document_html(sep="\n") {
let html = "";
let xml = new XMLSerializer();
for (let n of document.childNodes) {
if (n.nodeType == Node.ELEMENT_NODE)
html += n.outerHTML + sep;
else
html += xml.serializeToString(n) + sep;
}
return html;
}
console.log(get_document_html().slice(0, 200));
I just need doctype html and should work fine in IE11, Edge and Chrome. I used below code it works fine.
function downloadPage(element, event) {
var isChrome = /Chrome/.test(navigator.userAgent) && /Google Inc/.test(navigator.vendor);
if ((navigator.userAgent.indexOf("MSIE") != -1) || (!!document.documentMode == true)) {
document.execCommand('SaveAs', '1', 'page.html');
event.preventDefault();
} else {
if(isChrome) {
element.setAttribute('href','data:text/html;charset=UTF-8,'+encodeURIComponent('<!doctype html>' + document.documentElement.outerHTML));
}
element.setAttribute('download', 'page.html');
}
}
and in your anchor tag use like this.
<a href="#" onclick="downloadPage(this,event);" download>Download entire page.</a>
Example
function downloadPage(element, event) {
var isChrome = /Chrome/.test(navigator.userAgent) && /Google Inc/.test(navigator.vendor);
if ((navigator.userAgent.indexOf("MSIE") != -1) || (!!document.documentMode == true)) {
document.execCommand('SaveAs', '1', 'page.html');
event.preventDefault();
} else {
if(isChrome) {
element.setAttribute('href','data:text/html;charset=UTF-8,'+encodeURIComponent('<!doctype html>' + document.documentElement.outerHTML));
}
element.setAttribute('download', 'page.html');
}
}
I just need doctype html and should work fine in IE11, Edge and Chrome.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
<p>
<a href="#" onclick="downloadPage(this,event);" download><h2>Download entire page.</h2></a></p>
<p>Some image here</p>
<p><img src="https://placeimg.com/250/150/animals"/></p>
To also get things outside the <html>...</html>
, most importantly the <!DOCTYPE ...>
declaration, you could walk through document.childNodes, turning each into a string:
const html = [...document.childNodes]
.map(node => nodeToString(node))
.join('\n') // could use '' instead, but whitespace should not matter.
function nodeToString(node) {
switch (node.nodeType) {
case node.ELEMENT_NODE:
return node.outerHTML
case node.TEXT_NODE:
// Text nodes should probably never be encountered, but handling them anyway.
return node.textContent
case node.COMMENT_NODE:
return `<!--${node.textContent}-->`
case node.DOCUMENT_TYPE_NODE:
return doctypeToString(node)
default:
throw new TypeError(`Unexpected node type: ${node.nodeType}`)
}
}
I published this code as document-outerhtml on npm.
edit Note the code above depends on a function doctypeToString
; its implementation could be as follows (code below is published on npm as doctype-to-string):
function doctypeToString(doctype) {
if (doctype === null) {
return ''
}
// Checking with instanceof DocumentType might be neater, but how to get a
// reference to DocumentType without assuming it to be available globally?
// To play nice with custom DOM implementations, we resort to duck-typing.
if (!doctype
|| doctype.nodeType !== doctype.DOCUMENT_TYPE_NODE
|| typeof doctype.name !== 'string'
|| typeof doctype.publicId !== 'string'
|| typeof doctype.systemId !== 'string'
) {
throw new TypeError('Expected a DocumentType')
}
const doctypeString = `<!DOCTYPE ${doctype.name}`
+ (doctype.publicId ? ` PUBLIC "${doctype.publicId}"` : '')
+ (doctype.systemId
? (doctype.publicId ? `` : ` SYSTEM`) + ` "${doctype.systemId}"`
: ``)
+ `>`
return doctypeString
}
I believe document.documentElement.outerHTML
should return that for you.
According to MDN, outerHTML
is supported in Firefox 11, Chrome 0.2, Internet Explorer 4.0, Opera 7, Safari 1.3, Android, Firefox Mobile 11, IE Mobile, Opera Mobile, and Safari Mobile. outerHTML
is in the DOM Parsing and Serialization specification.
The MSDN page on the outerHTML property notes that it is supported in IE 5+. Colin's answer links to the W3C quirksmode page, which offers a good comparison of cross-browser compatibility (for other DOM features too).
I tried the various answers to see what is returned. I'm using the latest version of Chrome.
The suggestion document.documentElement.innerHTML;
returned <head> ... </body>
Gaby's suggestion document.getElementsByTagName('html')[0].innerHTML;
returned the same.
The suggestion document.documentElement.outerHTML;
returned <html><head> ... </body></html>
which is everything apart from the 'doctype'.
You can retrieve the doctype object with document.doctype;
This returns an object, not a string, so if you need to extract the details as strings for all doctypes up to and including HTML5 it is described here: Get DocType of an HTML as string with Javascript
I only wanted HTML5, so the following was enough for me to create the whole document:
alert('<!DOCTYPE HTML>' + '\n' + document.documentElement.outerHTML);