package com.oa.test;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
/**
 * Downloads an HTML page, extracts the rows of every {@code <table>} in it with jsoup,
 * writes the cell texts to a tab-separated text file and returns the parsed rows.
 */
public class Demo02 {

    /** Page that is downloaded by {@link #getHtml()}. */
    private static final String PAGE_URL = "http://www.dyhjw.com/dyhjw/etf.html";

    /** File the extracted table is written to. */
    private static final String OUTPUT_PATH = "d://黄金.txt";

    /** Number of leading cells {@link #readHtml(String)} reads from each row. */
    private static final int REQUIRED_CELLS = 5;

    /**
     * Downloads the page, echoes its HTML to standard output and exports the table.
     *
     * @param args unused
     * @throws IOException if the page cannot be downloaded
     */
    public static void main(String[] args) throws IOException {
        System.out.println("开始");
        Demo02 d = new Demo02();
        String str = d.getHtml();
        System.out.println(str);
        d.readHtml(str);
        System.out.println("结束");
    }

    /**
     * Downloads {@link #PAGE_URL} and returns its body decoded as UTF-8.
     *
     * @return the page source; lines are joined with {@code '\n'}
     * @throws IOException if the connection cannot be opened or read
     */
    public String getHtml() throws IOException {
        StringBuilder html = new StringBuilder();
        URLConnection conn = new URL(PAGE_URL).openConnection();
        // try-with-resources: the stream and reader are closed even when reading fails.
        try (InputStream in = conn.getInputStream();
             BufferedReader lines = new BufferedReader(new InputStreamReader(in, "utf-8"))) {
            String line;
            while ((line = lines.readLine()) != null) {
                // Keep the line break: gluing lines together would fuse tokens that the
                // page author split across lines (e.g. a tag name and its first attribute).
                html.append(line).append('\n');
            }
        }
        return html.toString();
    }

    /**
     * Parses {@code html}, writes every data row to {@link #OUTPUT_PATH} (one row per line,
     * each cell followed by a tab) and returns the rows whose numeric cells could be parsed.
     *
     * <p>The first {@code <tr>} matched is treated as the header and skipped. Rows with fewer
     * than {@value #REQUIRED_CELLS} {@code <td>} cells are ignored. A row whose 2nd-4th cells
     * are not valid numbers is still written to the file but is left out of the returned list.
     * An {@link IOException} while writing is reported on standard error and ends the export;
     * rows collected up to that point are still returned.
     *
     * @param html page source to parse
     * @return one {@code Object[]} per parsed row:
     *         {@code {String, Double, Double, Double, String}} taken from cells 0-4
     */
    public List<Object[]> readHtml(String html) {
        Document document = Jsoup.parse(html);
        Elements trs = document.select("table").select("tr");
        System.out.println("===" + trs.size());

        List<Object[]> list = new ArrayList<Object[]>();
        // try-with-resources avoids the NullPointerException the old finally block raised
        // when the FileWriter could not be created in the first place.
        try (FileWriter fWriter = new FileWriter(new File(OUTPUT_PATH))) {
            fWriter.append("日期(北京)\t净持仓量(盎司)\t净持仓量(吨)\t总价值(美元)\t总价值(美元)\t影响(金银)\t\n");
            for (int i = 1; i < trs.size(); i++) {
                Elements tds = trs.get(i).select("td");
                if (tds.size() < REQUIRED_CELLS) {
                    // e.g. a header row made of <th> cells in a further table
                    continue;
                }
                writeRow(fWriter, tds);
                Object[] row = toRecord(tds);
                if (row != null) {
                    list.add(row);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return list;
    }

    /** Writes all cells of one table row on a single line, each cell followed by a tab. */
    private static void writeRow(Writer out, Elements tds) throws IOException {
        for (int j = 0; j < tds.size(); j++) {
            out.append(tds.get(j).text()).append('\t');
        }
        out.append('\n');
    }

    /**
     * Converts the first five cells of a row into a record, or returns {@code null}
     * (after reporting the row on standard error) when a numeric cell cannot be parsed.
     */
    private static Object[] toRecord(Elements tds) {
        try {
            return new Object[] {
                tds.get(0).text(),
                Double.parseDouble(tds.get(1).text()),
                Double.parseDouble(tds.get(2).text()),
                Double.parseDouble(tds.get(3).text()),
                tds.get(4).text()
            };
        } catch (NumberFormatException e) {
            System.err.println("Skipping row with non-numeric cell: " + tds.text());
            return null;
        }
    }
}
另一个版本
package com.oa.test;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
/**
 * Downloads an HTML page, extracts the rows of every {@code <table>} in it with jsoup
 * and writes them to a tab-separated text file, one table row per line.
 */
public class Demo03 {

    /** Page that is downloaded by {@link #getHtml()}. */
    private static final String PAGE_URL = "http://www.dyhjw.com/dyhjw/etf.html";

    /** File the extracted table is written to. */
    private static final String OUTPUT_PATH = "d://黄金.txt";

    /**
     * Downloads the page and exports its table rows.
     *
     * @param args unused
     * @throws IOException if the page cannot be downloaded or the file cannot be written
     */
    public static void main(String[] args) throws IOException {
        System.out.println("开始");
        Demo03 d = new Demo03();
        String str = d.getHtml();
        d.readHtml(str);
        System.out.println("结束");
    }

    /**
     * Downloads {@link #PAGE_URL} and returns its body decoded as UTF-8.
     *
     * @return the page source; lines are joined with {@code '\n'}
     * @throws IOException if the connection cannot be opened or read
     */
    public String getHtml() throws IOException {
        StringBuilder html = new StringBuilder();
        URLConnection conn = new URL(PAGE_URL).openConnection();
        // try-with-resources: the stream and reader are closed even when reading fails.
        try (InputStream in = conn.getInputStream();
             BufferedReader lines = new BufferedReader(new InputStreamReader(in, "utf-8"))) {
            String line;
            while ((line = lines.readLine()) != null) {
                // Keep the line break: gluing lines together would fuse tokens that the
                // page author split across lines (e.g. a tag name and its first attribute).
                html.append(line).append('\n');
            }
        }
        return html.toString();
    }

    /**
     * Parses {@code html} and writes every table row to {@link #OUTPUT_PATH}.
     *
     * <p>The first {@code <tr>} matched is treated as the header and skipped; a fixed header
     * line is written instead. Each remaining row becomes one line: every {@code <td>} text
     * followed by a tab, terminated by {@code "\r\n"}. Rows without any {@code <td>} cell
     * are skipped so they do not produce blank lines.
     *
     * @param html page source to parse
     * @throws IOException if the output file cannot be created or written
     */
    public void readHtml(String html) throws IOException {
        Document document = Jsoup.parse(html);
        Elements trs = document.select("table").select("tr");

        // try-with-resources: the writer is flushed and closed even when a write fails.
        try (FileWriter fWriter = new FileWriter(new File(OUTPUT_PATH))) {
            fWriter.append("日期(美国)\t净持仓量(吨)\t总价值(美元)\t增减(吨)\t影响(金银)\r\n");
            for (int i = 1; i < trs.size(); i++) {
                Elements tds = trs.get(i).select("td");
                if (tds.isEmpty()) {
                    // e.g. a header row made of <th> cells in a further table
                    continue;
                }
                for (int j = 0; j < tds.size(); j++) {
                    fWriter.append(tds.get(j).text() + "\t"); // tab-separated cells
                }
                fWriter.append("\r\n"); // end of table row
            }
        }
    }
}
下载图片的爬虫
package com.oa.test;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.UUID;
/**
 * Downloads an HTML page, finds the pin images in it with jsoup and saves each image
 * to a local directory under a random file name.
 */
public class Demo04 {

    /** Page that is downloaded by {@link #getHtml()}. */
    private static final String PAGE_URL = "https://huaban.com/favorite/beauty/";

    /** Directory the images are saved to; created on demand. */
    private static final String IMAGE_DIR = "D:\\images\\";

    /**
     * Downloads the page and saves every image found in it.
     *
     * @param args unused
     * @throws Exception if the page or an image cannot be downloaded or saved
     */
    public static void main(String[] args) throws Exception {
        System.out.println("开始");
        Demo04 d = new Demo04();
        String str = d.getHtml();
        d.readHtml(str);
        System.out.println("结束");
    }

    /**
     * Downloads {@link #PAGE_URL} and returns its body decoded as UTF-8.
     *
     * @return the page source; lines are joined with {@code '\n'}
     * @throws Exception if the connection cannot be opened or read
     */
    public String getHtml() throws Exception {
        StringBuilder html = new StringBuilder();
        URLConnection conn = new URL(PAGE_URL).openConnection();
        // try-with-resources: the stream and reader are closed even when reading fails.
        try (InputStream in = conn.getInputStream();
             BufferedReader lines = new BufferedReader(new InputStreamReader(in, "utf-8"))) {
            String line;
            while ((line = lines.readLine()) != null) {
                // Keep the line break: gluing lines together would fuse tokens that the
                // page author split across lines (e.g. a tag name and its first attribute).
                html.append(line).append('\n');
            }
        }
        return html.toString();
    }

    /**
     * Parses {@code html} and downloads the image of every pin found in it.
     *
     * <p>A pin is a {@code div} whose class attribute is exactly {@code "pin wfc"}; its image
     * is the {@code img} inside the {@code a} whose class attribute is exactly
     * {@code "img x layer-view"}. Pins without an image source are skipped. (An earlier
     * variant of this method selected {@code img[class=large]} under
     * {@code div[id=waterfall]} instead.)
     *
     * @param html page source to parse
     * @throws Exception if an image cannot be downloaded or saved; remaining pins are
     *         not processed in that case
     */
    public void readHtml(String html) throws Exception {
        Document document = Jsoup.parse(html);
        Elements elements = document.select("div[class=pin wfc]");
        for (int i = 0; i < elements.size(); i++) {
            String src = elements.get(i).select("a[class=img x layer-view]").select("img").attr("src");
            System.out.println(src);
            if (src.length() > 0) {
                downloadImages(toAbsoluteUrl(src));
            }
        }
    }

    /**
     * Downloads {@code src} and stores it in {@link #IMAGE_DIR} as {@code <random UUID>.png}.
     *
     * @param src absolute URL of the image
     * @throws Exception if the directory cannot be created, or the image cannot be
     *         downloaded or written
     */
    public void downloadImages(String src) throws Exception {
        System.out.println("==" + src);
        File dir = new File(IMAGE_DIR);
        // FileOutputStream does not create missing parent directories.
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException("Cannot create image directory: " + dir);
        }
        URL url = new URL(src);
        // try-with-resources: both streams are closed even when the copy fails midway.
        try (InputStream is = url.openStream();
             FileOutputStream fos = new FileOutputStream(IMAGE_DIR + UUID.randomUUID() + ".png")) {
            byte[] buf = new byte[1024];
            int length;
            while ((length = is.read(buf)) != -1) {
                fos.write(buf, 0, length);
            }
        }
    }

    /**
     * Returns {@code src} unchanged when it already starts with {@code http://} or
     * {@code https://}; otherwise prefixes it with {@code "http:"}, which turns a
     * protocol-relative source such as {@code //host/path} into a full URL.
     */
    private static String toAbsoluteUrl(String src) {
        boolean absolute = src.regionMatches(true, 0, "http://", 0, 7)
                || src.regionMatches(true, 0, "https://", 0, 8);
        return absolute ? src : "http:" + src;
    }
}
来源:https://blog.csdn.net/kongfanyu/article/details/102778837