目的: 获得目标背单词网站中的单词, 写了一个简单的小爬虫, 使用jdk11
到此, 思路明确!
第一步, 把冰箱门...., 串词了,Sorry!!
第一步, 调用登录接口, 拿到sessionid!
第二步, 带着sessionid到单词列表页, 拿到body, 转成Document, 开始"借鉴"单词!
是不是So easy!
package com.***;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.HashMap;
/**
 * Simple crawler (JDK 11 HttpClient + jsoup) for a vocabulary site.
 * Step 1: POST the login form and extract the session id ("sid") from the
 *         Set-Cookie response headers.
 * Step 2: GET the word-list page with that sid cookie, parse the returned
 *         HTML and print every word (English : Chinese) from #wordListTable.
 *
 * @author jqw1122@foxmail.com
 * @description crawl away
 * @date 2/23/2019 17:14
 */
public class Crawler {

    @Test
    public void crawler() {
        String loginUrl = "http://www.cikuang.me/login";
        // Form-encoded credentials. NOTE(review): hard-coding a password in
        // source/blog posts is unsafe — rotate it and load from config instead.
        String fromBody = "username=jqw1122@foxamil.com&password=qweqwe123";
        String wordSetUrl = "http://www.cikuang.me/member/learningset?id=4573";
        HttpClient httpClient = HttpClient.newBuilder().build();

        // Step 1: login request.
        HttpRequest loginRequest = HttpRequest.newBuilder()
                .uri(URI.create(loginUrl))
                .header("Content-Type", "application/x-www-form-urlencoded")
                .POST(HttpRequest.BodyPublishers.ofString(fromBody))
                .build();

        httpClient.sendAsync(loginRequest, HttpResponse.BodyHandlers.ofString())
                .thenApply(HttpResponse::headers)
                .thenAccept(headers -> {
                    // Many cookies come back; we only care about "sid".
                    var cookieMap = new HashMap<String, String>();
                    var setCookies = headers.map().get("set-cookie");
                    // BUG FIX: guard against a missing Set-Cookie header
                    // (headers.map().get(...) returns null -> was an NPE).
                    if (setCookies != null) {
                        setCookies.forEach(c -> {
                            for (String pair : c.split(";")) {
                                // BUG FIX: split with limit 2 so cookie values that
                                // themselves contain '=' are no longer dropped by the
                                // old "length == 2" check.
                                String[] kv = pair.split("=", 2);
                                if (kv.length == 2) {
                                    // trim: attribute names after "; " carry a leading space
                                    cookieMap.put(kv[0].trim(), kv[1]);
                                }
                            }
                        });
                    }
                    String sid = cookieMap.get("sid");
                    if (sid == null) {
                        System.out.println("login failed: no sid cookie in response");
                        return;
                    }

                    // Step 2: fetch the word-list page carrying the sid cookie.
                    HttpRequest wordRequest = HttpRequest.newBuilder()
                            .uri(URI.create(wordSetUrl))
                            .header("Content-Type", "application/x-www-form-urlencoded")
                            .header("Cookie", "sid=" + sid)
                            .GET()
                            .build();
                    httpClient.sendAsync(wordRequest, HttpResponse.BodyHandlers.ofString())
                            .thenApply(HttpResponse::body)
                            .thenAccept(htmlString -> {
                                // Parse the body into a Document so the table is queryable.
                                Document htmlDocument = Jsoup.parse(htmlString);
                                Element wordListTable = htmlDocument.getElementById("wordListTable");
                                // BUG FIX: getElementById returns null when the id is absent
                                // (e.g. session expired) — was an unguarded NPE.
                                if (wordListTable == null) {
                                    System.out.println("wordListTable not found - not logged in?");
                                    return;
                                }
                                Elements trs = wordListTable.getElementsByTag("tr");
                                trs.forEach(tr -> {
                                    Elements tds = tr.children();
                                    // Column 0 wraps the English word in a child element;
                                    // column 1 holds the Chinese translation directly.
                                    String en = tds.get(0).child(0).text();
                                    String cn = tds.get(1).text();
                                    System.out.println("单词---->>> " + en + ":" + cn);
                                });
                            }).join();  // block so the @Test method waits for completion
                }).join();
    }
}
爬虫2: 目的:获取KMF中托福-听力-所有练习题的题目的音频
/**
 * Crawler 2: scrapes every TOEFL listening exercise on KMF.
 * Phase 1: walk the paged exercise index and collect all detail-page links.
 * Phase 2: visit each detail page (in parallel) and record the exercise
 *          title plus its audio URL ("data-url" attribute).
 * Phase 3: download all audio files (in parallel) to a local directory.
 *
 * @author jqw1122@foxmail.com
 * @date 2/23/2019 17:14
 */
public class Crawler {

    @Test
    public void crawlerKMF() {
        String mainUrl = "https://toefl.kmf.com";
        String mainUrl1 = "https://toefl.kmf.com/listen/ets/order/";
        String localFilePath = "C:\\kmf_audio\\";
        HttpClient httpClient = HttpClient.newBuilder().build();

        // Phase 1: walk the paged index (sections 0..5, pages 1..4) and
        // collect every exercise detail link. The .join() makes each request
        // effectively synchronous, so the plain ArrayList is safe here.
        List<String> detailUrlList = new ArrayList<>();
        for (int i = 0; i <= 5; i++) {
            for (int j = 1; j <= 4; j++) {
                String url = mainUrl1 + i + "/0/" + j;
                HttpRequest request = HttpRequest.newBuilder()
                        .uri(URI.create(url))
                        .header("Content-Type", "application/x-www-form-urlencoded")
                        .GET()
                        .build();
                httpClient.sendAsync(request, HttpResponse.BodyHandlers.ofString())
                        .thenApply(HttpResponse::body)
                        .thenAccept(bodyString -> {
                            Document htmlDocument = Jsoup.parse(bodyString);
                            Elements links = htmlDocument.getElementsByAttributeValue("class", "check-links js-check-link");
                            links.forEach(tagA -> detailUrlList.add(tagA.attr("href")));
                            System.out.println("page detail number:" + links.size());
                        }).join();
            }
        }
        System.out.println("page/file number: " + detailUrlList.size());

        // Phase 2: visit every detail page in parallel and record
        // (fileName, fileUrl) pairs.
        // BUG FIX: this list is filled from a parallelStream; a plain
        // ArrayList is not thread-safe under concurrent add() and can silently
        // lose elements or throw — wrap it in a synchronized list.
        List<Map<String, String>> fileList =
                java.util.Collections.synchronizedList(new ArrayList<>());
        System.out.println(LocalTime.now().toString() + " start get audio file url in detail page");
        detailUrlList.parallelStream().forEach(href -> {
            HttpRequest request = HttpRequest.newBuilder()
                    .uri(URI.create(mainUrl + href))
                    .header("Content-Type", "application/x-www-form-urlencoded")
                    .GET()
                    .build();
            httpClient.sendAsync(request, HttpResponse.BodyHandlers.ofString())
                    .thenApply(HttpResponse::body)
                    .thenAccept(bodyString -> {
                        Document htmlDocument = Jsoup.parse(bodyString);
                        Elements titles = htmlDocument.getElementsByAttributeValue("class", "i-title js-top-title");
                        Elements audios = htmlDocument.getElementsByAttributeValue("class", "question-audio-cont js-question-audio g-player-control video-left-content js-player-record");
                        // BUG FIX: guard against pages missing the expected elements —
                        // titles.get(0)/audios.get(0) was an unguarded IndexOutOfBounds.
                        if (titles.isEmpty() || audios.isEmpty()) {
                            System.out.println("skip page without title/audio: " + href);
                            return;
                        }
                        String fileName = titles.get(0).text();
                        String fileUrl = audios.get(0).attr("data-url");
                        fileList.add(Map.of("fileName", fileName.toLowerCase().replace(" ", "_") + ".mp3", "fileUrl", fileUrl));
                    }).join();
        });
        System.out.println(LocalTime.now().toString() + " finish get audio file url in detail page! start downloading files to local!");

        // BUG FIX: ensure the target directory exists up front — Files.copy
        // fails with NoSuchFileException when C:\kmf_audio\ is missing (the
        // original had this call commented out inside the download loop).
        try {
            Files.createDirectories(Paths.get(localFilePath));
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        // Phase 3: download the audio files in parallel; each stream is closed
        // by try-with-resources, failures are logged per file and don't abort
        // the remaining downloads.
        fileList.parallelStream().forEach(t -> {
            try (InputStream ins = new URL(t.get("fileUrl")).openStream()) {
                Path target = Paths.get(localFilePath, t.get("fileName"));
                Files.copy(ins, target, StandardCopyOption.REPLACE_EXISTING);
            } catch (IOException e) {
                System.out.println("download failed! fileName:" + t.get("fileName") + " fileUrl:" + t.get("fileUrl"));
                e.printStackTrace();
            }
        });
        System.out.println(LocalTime.now().toString() + " download completed");
    }
}
下载成功了....
来源:oschina
链接:https://my.oschina.net/u/2757155/blog/3013965