phantomjs not waiting for “full” page load

前端 未结 14 996
南旧
南旧 2020-11-22 13:48

I'm using PhantomJS v1.4.1 to load some web pages. I don't have access to their server side; I am just given links pointing to them. I'm using an obsolete version of Phantom

相关标签:
14条回答
  • 2020-11-22 14:52

    I would rather periodically check the document.readyState status (https://developer.mozilla.org/en-US/docs/Web/API/document.readyState). Although this approach is a bit clunky, you can be sure that inside the onPageReady function you are working with a fully loaded document.

    // Address to load and the PhantomJS page object that will load it.
    var url = "http://example.com/index.html";
    var page = require("webpage").create();
    
    /**
     * Prints the outer HTML of the loaded document to the console and then
     * terminates PhantomJS.
     */
    function onPageReady() {
        // Runs inside the page: serializes the whole document element.
        var readOuterHtml = function () {
            return document.documentElement.outerHTML;
        };

        console.log(page.evaluate(readOuterHtml));
        phantom.exit();
    }
    
    // Open the page, then poll document.readyState until it reports
    // "complete" before handing over to onPageReady.
    page.open(url, function (status) {
        // Stop early on a failed load instead of polling and printing
        // whatever document happens to be present.
        if (status !== "success") {
            console.error("Unable to load " + url + " (status: " + status + ")");
            phantom.exit(1);
            return;
        }

        // Re-armed through a nested setTimeout (rather than setInterval) so
        // that one check always finishes before the next one is scheduled.
        function checkReadyState() {
            setTimeout(function () {
                var readyState = page.evaluate(function () {
                    return document.readyState;
                });

                if ("complete" === readyState) {
                    onPageReady();
                } else {
                    checkReadyState();
                }
            });
        }

        checkReadyState();
    });
    

    Additional explanation:

    Using a nested setTimeout instead of setInterval prevents checkReadyState calls from "overlapping" and avoids race conditions when one execution takes longer than expected. setTimeout has a default delay of 4ms (https://stackoverflow.com/a/3580085/1011156), so active polling will not drastically affect program performance.

    document.readyState === "complete" means that document is completely loaded with all resources (https://html.spec.whatwg.org/multipage/dom.html#current-document-readiness).

    0 讨论(0)
  • 2020-11-22 14:52

    I found this solution useful in a NodeJS app. I use it only in desperate cases, because it relies on a timeout to wait for the full page load.

    The second argument is the callback function which is going to be called once the response is ready.

    // Declared with var so the binding stays local to this script instead of
    // becoming an implicit global (which also throws in strict mode).
    var phantom = require('phantom');
    
    /**
     * Opens a URL in a new PhantomJS instance, waits for a delay after the
     * page reports success, then hands the document markup to a callback.
     *
     * @param {string} anUrl - Address of the page to open.
     * @param {function(string)} callbackDone - Receives the innerHTML of the
     *     document element. Not invoked when the page fails to open.
     * @param {number} [delayMs=5000] - Milliseconds to wait after the page
     *     opens before reading its markup. Anything that is not a
     *     non-negative number falls back to 5000.
     */
    var fullLoad = function(anUrl, callbackDone, delayMs) {
        var waitMs = (typeof delayMs === 'number' && delayMs >= 0) ? delayMs : 5000;

        phantom.create(function (ph) {
            ph.createPage(function (page) {
                page.open(anUrl, function (status) {
                    if (status !== 'success') {
                        console.error("phantom: error opening " + anUrl, status);
                        ph.exit();
                        return;
                    }

                    // Give the page time to settle before reading it.
                    setTimeout(function () {
                        page.evaluate(function () {
                            return document.documentElement.innerHTML;
                        }, function (result) {
                            // Shut the PhantomJS instance down before handing
                            // the result over, so it is released even if the
                            // callback throws.
                            ph.exit();
                            callbackDone(result);
                        });
                    }, waitMs);
                });
            });
        });
    };
    
    /**
     * Placeholder consumer for the markup handed over by fullLoad.
     *
     * @param {string} htmlBody - Markup of the loaded page.
     */
    function callback(htmlBody) {
        // do something with the htmlBody
    }
    
    // Example invocation: 'your/url/' is a placeholder for the page address;
    // callback receives the page markup.
    fullLoad('your/url/', callback);
    
    0 讨论(0)
提交回复
热议问题