Trouble modifying an object within page.evaluate in Puppeteer

爱⌒轻易说出口 提交于 2021-01-28 20:00:24

问题


I am creating a Twitter scraper as a project. Tweets are rendered in the DOM as you scroll down so I want to use Puppeteer to scroll, extract data and save it into a predefined object, then continue scrolling. The problem is that the script is not actually modifying the object provided and I am left with an empty object.

The for loop that extracts the data works when called outside the scrolling function (i.e. I can extract the first tweets rendered in the page). The scrolling function itself works; I took it from the question "Puppeteer - scroll down until you can't anymore".

For testing purposes I set the scrolling function to only scroll 20 times (it is otherwise designed to scroll until it can't scroll anymore). Here is my code:

// Express route from the question: opens the requested Twitter profile in
// Puppeteer, scrolls the page 20 times and tries to collect tweets into `obj`.
// NOTE: this is the failing version being asked about - `obj` is still empty
// when it is sent, because the object mutated inside page.evaluate() is the
// copy that lives in the page context (see the answer below).
app.get('/scrape', async (req, res) => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.setJavaScriptEnabled(true)
    await page.goto(`https://twitter.com/${req.query.url}`);
    await page.setJavaScriptEnabled(true)
    // Intended result container: tweet id -> { date, text }.
    let obj = {}
    await autoScroll(page, obj)
    // Scrolls the page in 400px steps every 400ms and scrapes after each step.
    async function autoScroll(page, obj) {
        // Everything inside this callback runs in the browser page, not in Node.
        await page.evaluate(async (obj) => {
            await new Promise((resolve, reject) => {
                var totalHeight = 0;
                var distance = 400;
                var count = 0
                var timer = setInterval(() => {
                    var scrollHeight = document.body.scrollHeight;
                    window.scrollBy(0, distance);
                    totalHeight += distance;
                    // Probe up to 100 child positions of the tweet list.
                    for (let i = 0; i < 100; i++) {
                        let id, date, text
                        try {
                            id = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].getAttribute('data-tweet-id')
                            date = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[1].getAttribute('title')
                            text = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].childNodes[3].childNodes[3].childNodes[1].innerHTML
                            // Only modifies the page-side copy of obj.
                            obj[id] = { date: date, text: text }
                            console.log(i)
                        // Any missing node along the path throws; that index is skipped.
                        } catch (err) { continue }
                    }
                    count++
                    // Original stop condition (scroll until the bottom), disabled for testing:
                    //if(totalHeight >= scrollHeight){
                    if (count === 20) {
                        clearInterval(timer);
                        resolve();
                    }
                }, 400);
            });
        }, obj);
    }
    res.send(obj)
    await browser.close();
})

The request sends an empty object every time. I don't receive any error messages or console logs; if there are any, I can't see them because they are produced in the context of the headless Chrome browser that Puppeteer launches.

Any help would be appreciated!


回答1:


The arguments you pass to page.evaluate will be JSON-serialized and transferred to the page context.

The properties you assign to obj in your page.evaluate() function will only be present in the page context, not in the script where you called page.evaluate.

You can work around this by returning the obj object from the function instead of passing it as parameter:

// Build the object inside the page and hand it back as the resolved value of
// the promise; page.evaluate() then returns it to the calling script.
let obj = await page.evaluate(async() => {
  return new Promise(resolve => {
      // Local to the page context.
      let obj = {};
      // ...
      // set something on obj
      obj['foo'] = 'bar';

      // resolve with the obj
      resolve(obj);
      // ...
  });
});

Integrated in your code snippet:

// Express route: opens the requested Twitter profile in Puppeteer, scrolls it
// while collecting tweets, and responds with an object keyed by tweet id.
app.get('/scrape', async (req, res) => {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.setJavaScriptEnabled(true);
        await page.goto(`https://twitter.com/${req.query.url}`);
        const obj = await autoScroll(page);
        res.send(obj);
    } catch (err) {
        // Without this the rejected handler promise would leave the request hanging.
        if (!res.headersSent) {
            res.status(500).send({ error: err.message });
        }
    } finally {
        // Always release the browser, even when navigation or scraping fails.
        await browser.close();
    }

    /**
     * Scrolls the page step by step and scrapes the tweets rendered so far
     * after every step.
     *
     * The collected object is created inside the page and returned from
     * page.evaluate(), because arguments passed into page.evaluate() are
     * copies: mutating them in the page does not affect the caller's object.
     *
     * @param page - Puppeteer page that has already navigated to the profile.
     * @returns Promise resolving to { [tweetId]: { date, text } }.
     */
    async function autoScroll(page) {
        return page.evaluate(() => {
            const obj = {};
            return new Promise((resolve) => {
                const distance = 400;
                // Fixed number of steps for testing. To scroll to the bottom
                // instead, accumulate the scrolled distance and stop once it
                // reaches document.body.scrollHeight.
                const maxScrolls = 20;
                let count = 0;
                const timer = setInterval(() => {
                    window.scrollBy(0, distance);
                    // Probe up to 100 child positions of the tweet list.
                    for (let i = 0; i < 100; i++) {
                        try {
                            // Shared prefix of all three lookups. Kept inside the
                            // try so a missing node only skips this index.
                            const tweet = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1];
                            const id = tweet.getAttribute('data-tweet-id');
                            const date = tweet.childNodes[3].childNodes[1].childNodes[3].childNodes[1].getAttribute('title');
                            const text = tweet.childNodes[3].childNodes[3].childNodes[1].innerHTML;
                            obj[id] = { date, text };
                            console.log(i);
                        } catch (err) {
                            // Expected for indices that are not tweets; try the next one.
                            continue;
                        }
                    }
                    count++;
                    if (count === maxScrolls) {
                        clearInterval(timer);
                        resolve(obj);
                    }
                }, 400);
            });
        });
    }
});

If you're using a transpiler like babel you might need to pass the function as a string to page.evaluate, e.g.:

// A string passed to page.evaluate() is evaluated as an expression in the
// page, so the async function has to be invoked for its body to run.
await page.evaluate(`(async() => {
  return Promise.resolve(42);
})()`);

(puppeteer will call .toString() on your function to get the source, which might contain references to helpers used by babel, which aren't present in the page context)

Edit:
To debug your selectors you can try to launch puppeteer in non-headless mode. That way you get a real browser window where you can access the dev console. e.g.:

// headless: false opens a real browser window, so the dev console is available.
const browser = await puppeteer.launch({headless: false});


来源:https://stackoverflow.com/questions/57525681/trouble-modifying-an-object-within-page-evaluate-in-puppeteer

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!