yandex 图片自动下载命令行程序
一个在 yandex 上搜索图片并下载到本地的 node cli 程序.
使用帮助: $0 <搜索关键词> [-t=超时(默认 1000)] [-r 是否替换下载(默认 false)] 示例: $0 shop # 搜索 shop 并下载 $0 shop -t=0 # 不使用超时 $0 shop -t=0 -r # 覆盖重名文件
一个群友让做的程序, ¥100. 需求就是使用 yandex 搜索图片, 并自动下载到本地.
分析
- 打开 https://yandex.com/images/search?text=shop
- 查看 images/search? 请求, p={当前页数}
- 返回结果 blocks[0].params.lastPage 是最大页数
- 返回结果中所有 img_url={原图地址}
win 下简单的命令行交互
这里用批处理实现, 其实 node 中也可以做交互的.
C:. │ yandex 图片自动下载.bat │ └─lib node_modules index.js node.exe
yandex 图片自动下载.bat 文件内容
color f0 && chcp 936 && cls && @echo off title yandex 图片自动下载 mode con cols=120 lines=30 :down set /p keyword=输入关键词开始下载: .\lib\node.exe .\lib\index.js "%keyword%" echo ========================== echo 全部下载完成 goto :down
源码
index.js
const fs = require('fs') const url = require('url') const path = require('path') const fly = require('flyio') const text = process.argv[2] if(text === undefined) { console.log(` 使用帮助: $0 <搜索关键词> [-t=超时(默认 10000)] [-r 是否替换下载(默认 false)] 示例: $0 shop # 搜索 shop 并下载 $0 shop -t=0 # 不使用超时 $0 shop -t=0 -r # 覆盖重名文件 `) return } const argv = parseArgv() const userConfig = { text, // 用户搜索的关键词 timeout: argv['-t'] === undefined ? 10000 : Number(argv['-t']), // 设置超时时间, 默认 10 秒 replace: argv['-r'] || false, // 是否替换下载, 默认否 } const config = userConfig fly.config.timeout = config.timeout const downHistoryFile = `${process.cwd()}/download.txt` fs.writeFileSync(downHistoryFile, '') const downDir = `${process.cwd()}/download` new Promise(async () => { let text = userConfig.text let {urls: lastUrls = [], lastPage = 0} = (await getImgUrls({text}).catch(err => console.log('err', err))) || {} if(lastPage === 0) { console.log('没有搜索结果, 你可以更换关键词重试') return } for (let p = 0; p < lastPage; p++) { let curPage = p + 1 let urls = [] if(curPage === 1) { urls = lastUrls } else { let getRes = await getImgUrls({text, p: curPage}).catch(err => console.log('err', err)) urls = (getRes || {}).urls || [] // 获取页数 } urls = [...new Set(urls)] // 去除重复的 url // urls = urls.slice(0, 3) const paseUrls = [] urls.map(item => { const {pathname, href} = url.parse(item) let {base, ext, name} = path.parse(pathname); ext = ext || '.jpg' // 如果没有扩展名时, 默认给定为 jpg // let fullName = paseUrls.find(item => item.fullName === base) // 如果保存文件名重复, 则添加时间戳处理 // ? `${name}_${uuid()}_${ext}` // : base // let fullName = `${name}_${uuid()}_${ext}` let fullName = `${name || uuid()}${ext}` paseUrls.push({ name, fullName, href, }) }) console.log(`正在下载第 ${curPage}/${lastPage} 页`) await Promise.all(paseUrls.map(item => download(item.href, item.fullName))).finally(async () => { console.log(`第 ${curPage}/${lastPage} 页下载完成`) }) } }) function hasFile(filePath) { return fs.existsSync(filePath) } function parseArgv() { return process.argv.slice(2).reduce((acc, arg) => { let [k, v = true] = arg.split('=') acc[k] = v return acc }, {}) } async function getImgUrls(arg = {}) { arg = { text: 'shop', p: 1, ...arg, } return new Promise((resolve, reject) => { console.log(`正在查找图片, 关键词: ${arg.text}, 页: ${arg.p}`) let api = `https://yandex.com/images/search?format=json&request=%7B%22blocks%22%3A%5B%7B%22block%22%3A%22serp-controller%22%2C%22params%22%3A%7B%7D%2C%22version%22%3A2%7D%2C%7B%22block%22%3A%22serp-list_infinite_yes%22%2C%22params%22%3A%7B%22initialPageNum%22%3A0%7D%2C%22version%22%3A2%7D%2C%7B%22block%22%3A%22more_direction_next%22%2C%22params%22%3A%7B%7D%2C%22version%22%3A2%7D%2C%7B%22block%22%3A%22gallery__items%3Aajax%22%2C%22params%22%3A%7B%7D%2C%22version%22%3A2%7D%5D%2C%22bmt%22%3A%7B%22lb%22%3A%22%22%7D%2C%22amt%22%3A%7B%22las%22%3A%22justifier-height%3D1%3Bthumb-underlay%3D1%3Bjustifier-setheight%3D1%3Bfitimages-height%3D1%3Bjustifier-fitincuts%3D1%3Bchunk.219.0%3D1%3Bchunk.169.0%3D1%22%7D%7D&yu=2650822041574437239&p=${arg.p}&text=${encodeURIComponent(arg.text)}&rpt=image&uinfo=sw-1920-sh-1200-ww-905-wh-1045-pd-2-wp-16x10_2560x1600` // console.log('api', api) fly.get(api) .then(res => { const data = res.data handleData(data, r => resolve(r)) }).catch(err => { reject(err) }) }) } function execSync(cmd, out = false) { const child_process = require('child_process') let str = child_process.execSync(cmd).toString().trim() out && console.log(str) return str } async function handleData(data, cb) { try { const lastPage = data.blocks[0].params.lastPage const urls = [] data.blocks.forEach(item => { (item.html.match(/img_url=.*?&/g) || []).forEach(item => { urls.push(decodeURIComponent((item.match(/img_url=(.*?)&/) || [])[1] || "")) }) }) cb({ lastPage, urls, }) } catch (error) { console.log('error', error) cb({ msg: '搜索结果出现错误', error, lastPage: 0, urls: [], }) } } async function download(u, p) { fs.writeFileSync(downHistoryFile, fs.readFileSync(downHistoryFile) + u + '\r\n') const savePath = `${downDir}/${p}` if(!config.replace && hasFile(savePath)) { console.log(`跳过已下载 ${p}`) return true } else { if(!hasFile(downDir)) { fs.mkdirSync(downDir) } return fly.download(u, savePath) .then(d => { console.log(`下载成功: ${p}`) }) .catch(err => console.log('err', err.message)) } } function uuid(sep = '') { let increment = process.increment === undefined ? (process.increment = 1) : (process.increment = (process.increment + 1)) return `${Date.now()}_${increment}`.replace(/_/g, sep) }
package.json
{ "name": "getImg", "version": "1.0.0", "description": "", "main": "index.js", "keywords": [], "author": "xw", "dependencies": { "flyio": "^0.6.14", "request": "^2.88.0" }, "license": "ISC" }
注:
本程序有个小问题, 懒得解决, 看大家能不能发现.