await

puppeteer获取课程并写入json

懵懂的女人 提交于 2019-11-27 06:00:36
const puppeteer = require ( 'puppeteer' ) ; const fs = require ( 'fs' ) ; const imoocUrl = 'https://www.imooc.com/course/list?c=fe' ; ; ( async ( ) => { const browser = await puppeteer . launch ( { headless : false , defaultViewport : { width : 1920 , height : 1080 } } ) ; const page = await browser . newPage ( ) ; console . log ( 'start open url:' , imoocUrl ) ; await page . goto ( imoocUrl ) ; //操作数据 console . log ( 'operate dom by console' ) ; const result = await page . evaluate ( ( ) => { let $ = window . $ ; let data = [ ] ; let courseList = $ ( '.moco-course-list' ) . find ( '.course

async-await用法记录

不打扰是莪最后的温柔 提交于 2019-11-27 05:31:19
一、概览:async 是用于声明异步函数的关键字;await 是操作符,必须在 async 函数内部才能使用(ES 模块的顶层 await 除外)。await 后面可以是任意值,但一般跟 Promise 对象;如果跟的不是 Promise,则直接得到该值本身。 1、Promise 被 resolve 时传入的值就是 await 表达式的结果。 2、Promise 被 reject 时不会作为值返回,而是在 await 处抛出异常,需要使用 try-catch 捕获。 来源: https://blog.51cto.com/9161018/2429215

puppeteer

岁酱吖の 提交于 2019-11-27 05:21:44
centos7 简单些 npm install -g cnpm --registry=https://registry.npm.taobao.org cnpm i puppeteer 再配置信息 #依赖库 yum install pango.x86_64 libXcomposite.x86_64 libXcursor.x86_64 libXdamage.x86_64 libXext.x86_64 libXi.x86_64 libXtst.x86_64 cups-libs.x86_64 libXScrnSaver.x86_64 libXrandr.x86_64 GConf2.x86_64 alsa-lib.x86_64 atk.x86_64 gtk3.x86_64 -y #字体 yum install ipa-gothic-fonts xorg-x11-fonts-100dpi xorg-x11-fonts-75dpi xorg-x11-utils xorg-x11-fonts-cyrillic xorg-x11-fonts-Type1 xorg-x11-fonts-misc -y 普通用户 const puppeteer = require('puppeteer'); (async () => { const browser = await puppeteer.launch(

puppeteer爬虫服务

此生再无相见时 提交于 2019-11-26 16:42:44
爬虫文件 baidu.js const puppeteer = require("puppeteer"); const path = require('path'); const pathToExtension = path.join(__dirname, './chrome-mac/Chromium.app/Contents/MacOS/Chromium'); var exec = require('child_process').execSync; const conf = { headless: false, executablePath: pathToExtension, defaultViewport: { width: 1300, height: 900 }, }; const run = async (browserEndpoint) => { //var count = exec('ps -ef |grep Chromium |grep -v "grep" |awk \'{print $8}\'|wc -l'); if (browserEndpoint == "") { var browser = await puppeteer.launch(conf) const _browserEndpoint = await await browser.wsEndpoint(

tornado实现高并发爬虫

无人久伴 提交于 2019-11-26 16:39:17
from pyquery import PyQuery as pq from tornado import ioloop, gen, httpclient, queues from urllib.parse import urljoin base_url = "http://www.baidu.com" concurrency = 8 async def get_url_links(url): response = await httpclient.AsyncHTTPClient().fetch(url) html = response.body.decode("utf-8") p = pq(html) links = [] for tag_a in p("a").items(): links.append(urljoin(base_url, tag_a.attr("href"))) return links async def main(): seen_set = set() q = queues.Queue() async def fetch_url(current_url): if current_url in seen_set: return print(f"获取:{current_url}") seen_set.add(current_url) next_urls =