How to get all html data after all scripts and page loading is done? (puppeteer)

前端 未结 3 377
太阳男子
太阳男子 2021-01-04 23:23

Finally I figured out how to use Node.js and installed all the libraries/extensions. So puppeteer is working, but, as it was previously with Xmlhttp... it gets only the template/body of the p

相关标签:
3条回答
  • 2021-01-04 23:54

    If you want the full HTML, the same as what the browser inspector shows, here it is:

        const puppeteer = require('puppeteer');

        /**
         * Opens https://example.org/ in a headless browser, waits for the
         * network to go idle, and prints the serialized root element (the
         * live DOM after scripts have run) to stdout.
         *
         * Any error is logged to stderr; the browser is closed either way.
         */
        (async function main() {
          try {
            const browser = await puppeteer.launch();
            try {
              const [page] = await browser.pages();

              // 'networkidle0' delays resolution until outstanding requests
              // have settled, so script-injected content is in the DOM.
              await page.goto('https://example.org/', { waitUntil: 'networkidle0' });
              const data = await page.evaluate(() => document.documentElement.outerHTML);

              console.log(data);
            } finally {
              // Close on both success and failure so a failed navigation
              // does not leave the launched browser running.
              await browser.close();
            }
          } catch (err) {
            console.error(err);
          }
        })();
    
    0 讨论(0)
  • 2021-01-05 00:00

    // Evaluates in the page context and returns the root element's outerHTML
    // (the <html> element with all of its descendants) as a string.
    // NOTE(review): assumes `page` is an already-navigated Puppeteer page and
    // that this runs inside an async function — neither is shown in this snippet.
    let bodyHTML = await page.evaluate(() => document.documentElement.outerHTML);

    This

    0 讨论(0)
  • 2021-01-05 00:06

    Some notes:

    1. You do not need cheerio with puppeteer, and you do not need to re-parse page.content(): you already have the full DOM with all scripts run, and you can evaluate any code in the window context, as in a browser, using page.evaluate(), transferring serializable data between the web API context and the Node.js API context.

    2. Try to use async/await only, this will simplify your code and flow.

    3. If you need to wait till all the scripts and other dependencies are loaded, use waitUntil: 'networkidle0' in page.goto().

    4. If you suspect that the document's scripts need some time to reach the required state, use one of the wait helpers such as page.waitForSelector(), or fall back to page.waitFor(milliseconds).

    Here is a simple script that outputs all tag names in a page.

    'use strict';

    const puppeteer = require('puppeteer');

    /**
     * Opens https://example.org/ in a headless browser, waits for the
     * network to go idle, and prints an array with the tag name of every
     * element in the document.
     *
     * Any error is logged to stderr; the browser is closed either way.
     */
    (async function main() {
      try {
        const browser = await puppeteer.launch();
        try {
          const [page] = await browser.pages();

          await page.goto('https://example.org/', { waitUntil: 'networkidle0' });

          // Runs in the page context; only the serializable array of
          // strings is transferred back to Node.js.
          const data = await page.evaluate(
            () => Array.from(document.querySelectorAll('*'), (elem) => elem.tagName)
          );

          console.log(data);
        } finally {
          // Close on both success and failure so a failed navigation
          // does not leave the launched browser running.
          await browser.close();
        }
      } catch (err) {
        console.error(err);
      }
    })();
    

    If you describe your task in more detail, we can try to write something more appropriate.


    Script for www.bezrealitky.cz (task from a comment below):

    'use strict';

    const fs = require('fs');
    const puppeteer = require('puppeteer');

    /**
     * Loads a bezrealitky.cz search-result page, keeps clicking the
     * "load more" button until it disappears, then saves three files to
     * the current directory:
     *   - full.html     : the whole page as returned by page.content()
     *   - articles.html : outerHTML of the results container
     *   - articles.txt  : innerText of every article, separated by a dashed line
     *
     * Any error is logged to stderr; the browser is closed either way.
     */
    (async function main() {
      // Kept as one literal: a line break inside a quoted string is a syntax error.
      const searchUrl = 'https://www.bezrealitky.cz/vyhledat?offerType=pronajem&estateType=byt&disposition=&ownership=&construction=&equipped=&balcony=&order=timeOrder_desc&boundary=%5B%5B%7B%22lat%22%3A50.171436864513%2C%22lng%22%3A14.506905276796942%7D%2C%7B%22lat%22%3A50.154133576294%2C%22lng%22%3A14.599004629591036%7D%2C%7B%22lat%22%3A50.14524430128%2C%22lng%22%3A14.58773054712799%7D%2C%7B%22lat%22%3A50.129307131988%2C%22lng%22%3A14.60087568578706%7D%2C%7B%22lat%22%3A50.122604734575%2C%22lng%22%3A14.659116306376973%7D%2C%7B%22lat%22%3A50.106512499343%2C%22lng%22%3A14.657434650206028%7D%2C%7B%22lat%22%3A50.090685542974%2C%22lng%22%3A14.705099547441932%7D%2C%7B%22lat%22%3A50.072175921973%2C%22lng%22%3A14.700004206235008%7D%2C%7B%22lat%22%3A50.056898491904%2C%22lng%22%3A14.640206899053055%7D%2C%7B%22lat%22%3A50.038528576841%2C%22lng%22%3A14.666852728301023%7D%2C%7B%22lat%22%3A50.030955909657%2C%22lng%22%3A14.656128752460972%7D%2C%7B%22lat%22%3A50.013435368522%2C%22lng%22%3A14.66854956530301%7D%2C%7B%22lat%22%3A49.99444182116%2C%22lng%22%3A14.640153080292066%7D%2C%7B%22lat%22%3A50.010839032542%2C%22lng%22%3A14.527474219359988%7D%2C%7B%22lat%22%3A49.970771602447%2C%22lng%22%3A14.46224174052395%7D%2C%7B%22lat%22%3A49.970669964027%2C%22lng%22%3A14.400648545303966%7D%2C%7B%22lat%22%3A49.941901176098%2C%22lng%22%3A14.395563234671044%7D%2C%7B%22lat%22%3A49.948384148423%2C%22lng%22%3A14.337635637038034%7D%2C%7B%22lat%22%3A49.958376114735%2C%22lng%22%3A14.324977842107955%7D%2C%7B%22lat%22%3A49.9676286223%2C%22lng%22%3A14.34491711110104%7D%2C%7B%22lat%22%3A49.971859099005%2C%22lng%22%3A14.326815050839059%7D%2C%7B%22lat%22%3A49.990608728081%2C%22lng%22%3A14.342731259186962%7D%2C%7B%22lat%22%3A50.002211140429%2C%22lng%22%3A14.29483886971002%7D%2C%7B%22lat%22%3A50.023596577558%2C%22lng%22%3A14.315872285282012%7D%2C%7B%22lat%22%3A50.058309376419%2C%22lng%22%3A14.248086830069042%7D%2C%7B%22lat%22%3A50.073179111%2C%22lng%22%3A14.290193274400963%7D%2C%7B%22lat%22%3A50.102973823639%2C%22lng%22%3A14.224439442359994%7D%2C%7B%22lat%22%3A50.130060800171%2C%22lng%22%3A14.302396419107936%7D%2C%7B%22lat%22%3A50.116019827009%2C%22lng%22%3A14.360785349547996%7D%2C%7B%22lat%22%3A50.148005694843%2C%22lng%22%3A14.365662825877052%7D%2C%7B%22lat%22%3A50.14142969454%2C%22lng%22%3A14.394903042943952%7D%2C%7B%22lat%22%3A50.171436864513%2C%22lng%22%3A14.506905276796942%7D%2C%7B%22lat%22%3A50.171436864513%2C%22lng%22%3A14.506905276796942%7D%5D%5D&hasDrawnBoundary=1&mapBounds=%5B%5B%7B%22lat%22%3A50.289447077141126%2C%22lng%22%3A14.68724263943227%7D%2C%7B%22lat%22%3A50.289447077141126%2C%22lng%22%3A14.087801111111958%7D%2C%7B%22lat%22%3A50.039169221047985%2C%22lng%22%3A14.087801111111958%7D%2C%7B%22lat%22%3A50.039169221047985%2C%22lng%22%3A14.68724263943227%7D%2C%7B%22lat%22%3A50.289447077141126%2C%22lng%22%3A14.68724263943227%7D%5D%5D&center=%7B%22lat%22%3A50.16447196305031%2C%22lng%22%3A14.387521875272125%7D&zoom=11&locationInput=praha&limit=15';

      try {
        const browser = await puppeteer.launch();
        try {
          const [page] = await browser.pages();
          // NOTE(review): 0 is documented by Puppeteer as "no timeout"; the
          // waits below can therefore block indefinitely if the site changes.
          page.setDefaultTimeout(0);

          await page.goto(searchUrl);

          await page.waitForSelector('#search-content button.btn-icon');

          // The "load more" button is present while further results exist.
          while (await page.$('#search-content button.btn-icon') !== null) {
            const articlesForNow = (await page.$$('#search-content article')).length;
            console.log(`Articles for now: ${articlesForNow}. Getting more...`);

            // Start the click and the wait together so the wait is already
            // pending when the new articles are appended to the list.
            await Promise.all([
              page.evaluate(
                () => { document.querySelector('#search-content button.btn-icon').click(); }
              ),
              page.waitForFunction(
                (old) => document.querySelectorAll('#search-content article').length > old,
                {},
                articlesForNow
              ),
            ]);
          }

          const articlesAll = (await page.$$('#search-content article')).length;
          console.log(`All articles: ${articlesAll}.`);

          fs.writeFileSync('full.html', await page.content());
          fs.writeFileSync('articles.html', await page.evaluate(
            () => document.querySelector('#search-content div.b-filter__inner').outerHTML
          ));
          fs.writeFileSync('articles.txt', await page.evaluate(
            () => [...document.querySelectorAll('#search-content article')]
              .map(({ innerText }) => innerText)
              .join(`\n${'-'.repeat(50)}\n`)
          ));
          console.log('Saved.');
        } finally {
          // Close on both success and failure so an error mid-scrape does
          // not leave the launched browser running.
          await browser.close();
        }
      } catch (err) {
        console.error(err);
      }
    })();
    
    0 讨论(0)
提交回复
热议问题