爬虫文件
baidu.js
const puppeteer = require("puppeteer"); const path = require('path'); const pathToExtension = path.join(__dirname, './chrome-mac/Chromium.app/Contents/MacOS/Chromium'); var exec = require('child_process').execSync; const conf = { headless: false, executablePath: pathToExtension, defaultViewport: { width: 1300, height: 900 }, }; const run = async (browserEndpoint) => { //var count = exec('ps -ef |grep Chromium |grep -v "grep" |awk \'{print $8}\'|wc -l'); if (browserEndpoint == "") { var browser = await puppeteer.launch(conf) const _browserEndpoint = await await browser.wsEndpoint(); console.log("_browserEndpoint",_browserEndpoint) browserEndpoint=_browserEndpoint } var browser = await puppeteer.connect({"browserWSEndpoint":browserEndpoint}) const page = await browser.newPage() await page.goto('https://www.baidu.com/', {waitUntil: 'networkidle2'}); //addScriptTag需要加在goto的后面,然后就可以在evaluate里使用jQuery的语法了。 await page.addScriptTag({ url: 'https://code.jquery.com/jquery-3.2.1.min.js', }); await page.waitFor('#u1') // 可以接收evaluate内部打印的console内容 page.on('console', msg => { for (let i = 0; i < msg.args().length; i++) { console.log(`${i}: ${msg.args([i])}`) } }) const result = await page.evaluate(() => { let data = []; // 初始化空数组来存储数据 let elements = $("#u1"); // 获取所有元素 for (let element of elements) { let title = element.innerText; // 获取标题 let url = element.href;//获取网址 data.push({title, url}); // 存入数组 } return data; }); console.log(result); const dic = { "result": result, "browserEndpoint": browserEndpoint } await page.close() return dic }; module.exports = { run }
服务文件
server.js
const http = require('http');
const spider = require("./baidu.js");

// WebSocket endpoint of the shared browser; empty until the first crawl succeeds.
let browserEndpoint = "";

// Headers sent with every response.
const RESPONSE_HEADERS = {
  'Content-Type': 'text/plain;charset=UTF-8',
  'Access-Control-Allow-origin': '*',
};

/**
 * Runs one crawl and writes its outcome to `res`.
 *
 * On success the endpoint returned by the crawler is remembered for the next
 * request and the scraped data is sent as a JSON string. On failure the error
 * is logged and a 500 is sent. The response is always ended, so the client
 * never hangs.
 *
 * @param {http.ServerResponse} res - Response to complete.
 * @returns {Promise<void>} Never rejects; crawl errors are reported via `res`.
 */
const handleCrawl = async (res) => {
  try {
    const resultDict = await spider.run(browserEndpoint);
    browserEndpoint = resultDict.browserEndpoint;
    console.log("browserEndpoint", browserEndpoint);
    res.writeHead(200, RESPONSE_HEADERS);
    res.end(JSON.stringify(resultDict.result));
  } catch (err) {
    console.error('crawl failed', err);
    if (!res.headersSent) {
      res.writeHead(500, RESPONSE_HEADERS);
    }
    res.end('crawl failed');
  }
};

http.createServer((req, res) => {
  console.log("in", browserEndpoint);

  if (req.method.toUpperCase() !== 'POST') {
    // Only POST triggers a crawl; answer everything else instead of hanging.
    req.resume();
    res.writeHead(405, Object.assign({ Allow: 'POST' }, RESPONSE_HEADERS));
    res.end();
    return;
  }

  req.on('error', (err) => {
    console.error('request failed', err);
  });
  req.on('end', () => {
    // handleCrawl reports its own errors, so nothing is left unhandled here.
    handleCrawl(res);
  });
  // The body is not used; drain it so 'end' fires without buffering it.
  req.resume();
}).listen(9001, () => {
  console.log('开启服务端口9001');
});