问题
I am creating a Twitter scraper as a project. Tweets are rendered in the DOM as you scroll down so I want to use Puppeteer to scroll, extract data and save it into a predefined object, then continue scrolling. The problem is that the script is not actually modifying the object provided and I am left with an empty object.
The for loop that extracts the data works when called outside the scrolling function (i.e. I can extract the first tweets rendered in the page). The scrolling function itself works; I got it from the question "Puppeteer - scroll down until you can't anymore".
For testing purposes I set the scrolling function to only scroll 20 times (it is otherwise designed to scroll until it can't scroll anymore). Here is my code:
// Route handler for GET /scrape: opens https://twitter.com/<req.query.url> in a
// Puppeteer page, scrolls it while trying to collect tweets into `obj`, then
// sends `obj` as the response and closes the browser.
app.get('/scrape', async (req, res) => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.setJavaScriptEnabled(true)
await page.goto(`https://twitter.com/${req.query.url}`);
await page.setJavaScriptEnabled(true)
// Accumulator that the scrolling routine is expected to fill (tweet id -> { date, text }).
let obj = {}
await autoScroll(page, obj)
// Scrolls the page on a timer and scrapes tweets after every scroll step.
// Declared after its call site; this works because function declarations are hoisted.
async function autoScroll(page, obj) {
// NOTE(review): `obj` is handed to page.evaluate() as an argument. Per the Puppeteer
// docs, arguments are serialized into the page context, so writes made to `obj`
// inside this callback presumably never reach the outer `obj` - confirm.
await page.evaluate(async (obj) => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
// Pixels scrolled per timer tick.
var distance = 400;
// Number of timer ticks executed so far.
var count = 0
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
// Probe up to 100 child positions of the tweet list on every tick.
for (let i = 0; i < 100; i++) {
let id, date, text
try {
// Fixed childNodes paths into the page markup; any missing node along a path throws.
id = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].getAttribute('data-tweet-id')
date = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[1].getAttribute('title')
text = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].childNodes[3].childNodes[3].childNodes[1].innerHTML
// Keyed by tweet id, so a tweet seen again on a later tick overwrites its earlier entry.
obj[id] = { date: date, text: text }
console.log(i)
// Any failed lookup for index i is ignored and the loop moves on to the next index.
} catch (err) { continue }
}
count++
//if(totalHeight >= scrollHeight){
// Test setting: stop after 20 ticks instead of at the bottom of the page.
if (count === 20) {
clearInterval(timer);
resolve();
}
}, 400);
});
}, obj);
}
// Sends the outer `obj` declared above, then shuts the browser down.
res.send(obj)
await browser.close();
})
The request sends an empty object every time. I don't receive any error messages or console logs; if there are any, I can't see them because they are emitted in the context of the headless Chrome browser that Puppeteer launches.
Any help would be appreciated!
回答1:
The arguments you pass to page.evaluate will be JSON-serialized and transferred to the page context. The properties you assign to obj in your page.evaluate() function will only be present in the page context, not in the script where you called page.evaluate.
You can work around this by returning the obj object from the function instead of passing it as a parameter:
// Build the result inside the page and receive it as the return value of
// page.evaluate(), instead of passing an object in and mutating it.
let obj = await page.evaluate(async() => {
return new Promise(resolve => {
// This `obj` is local to the callback; it is a different variable from the outer `obj`.
let obj = {};
// ...
// set something on obj
obj['foo'] = 'bar';
// resolve with the obj
resolve(obj);
// ...
});
});
Integrated in your code snippet:
/**
 * GET /scrape?url=<handle>
 *
 * Opens https://twitter.com/<handle> in a Puppeteer page, scrolls it while
 * collecting tweets, and responds with an object keyed by tweet id whose
 * values are `{ date, text }`.
 *
 * The browser is always closed, even when navigation or scraping fails; in
 * that case the client receives a 500 response instead of a hanging request.
 */
app.get('/scrape', async (req, res) => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.setJavaScriptEnabled(true);
    await page.goto(`https://twitter.com/${req.query.url}`);
    const obj = await autoScroll(page);
    res.send(obj);
  } catch (err) {
    // Without this, a rejection in the async handler would leave the request
    // unanswered. Skip it if the response was already started.
    if (!res.headersSent) {
      res.status(500).send({ error: err.message });
    }
  } finally {
    // Release the browser process on both the success and the failure path.
    if (browser) {
      await browser.close();
    }
  }

  /**
   * Scrolls the page on a timer, scraping the tweet list after every step.
   *
   * @param page - Puppeteer page that has already navigated to the profile.
   * @returns Promise resolving to the collected `{ [tweetId]: { date, text } }`
   *   object, built in the page and returned by value from page.evaluate().
   */
  function autoScroll(page) {
    return page.evaluate(() => {
      // Everything this callback uses must be declared inside it: the function
      // runs in the page context and cannot see variables of the Node script.
      const SCROLL_STEP_PX = 400;
      const TICK_MS = 400;
      // Test setting: stop after a fixed number of ticks rather than at the
      // bottom of the page.
      const MAX_TICKS = 20;
      const MAX_ROWS = 100;
      const obj = {};

      return new Promise((resolve) => {
        let count = 0;
        const timer = setInterval(() => {
          window.scrollBy(0, SCROLL_STEP_PX);

          for (let i = 0; i < MAX_ROWS; i++) {
            try {
              // Fixed path to the i-th tweet element; every lookup below is
              // relative to it. A missing node anywhere along a path throws.
              const tweet = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1];
              const id = tweet.getAttribute('data-tweet-id');
              const date = tweet.childNodes[3].childNodes[1].childNodes[3].childNodes[1].getAttribute('title');
              const text = tweet.childNodes[3].childNodes[3].childNodes[1].innerHTML;
              // Keyed by id, so a tweet seen again on a later tick overwrites
              // its earlier entry instead of being duplicated.
              obj[id] = { date, text };
              console.log(i);
            } catch (err) {
              // Best effort: positions that do not hold a tweet (or hold
              // different markup) are skipped.
            }
          }

          count++;
          if (count === MAX_TICKS) {
            clearInterval(timer);
            resolve(obj);
          }
        }, TICK_MS);
      });
    });
  }
});
If you're using a transpiler like Babel, you might need to pass the function as a string to page.evaluate, e.g.:
// A string passed to page.evaluate() is evaluated as an expression in the page.
// The async function therefore has to be invoked inside the string; otherwise
// the expression only yields a function object and 42 is never produced.
await page.evaluate(`(async () => {
return Promise.resolve(42);
})()`);
(Puppeteer will call .toString() on your function to get its source, which might contain references to helpers used by Babel that aren't present in the page context.)
Edit:
To debug your selectors you can try to launch puppeteer in non-headless mode.
That way you get a real browser window where you can access the dev console.
e.g.:
// headless: false opens a real, visible browser window whose dev console can be inspected.
const browser = await puppeteer.launch({headless: false});
来源:https://stackoverflow.com/questions/57525681/trouble-modifying-an-object-within-page-evaluate-in-puppeteer