背景
现如今,数据成为了越来越重要的网络资源,越来越有价值。无论是数据的分析还是前后端页面的数据交互,都离不开真实有效的数据。项目开发中,甲方不可能实时提供数据,我们只能从目标网站抓取数据并入库。
数据作用
决策支持
提升效益
数据的直接变现方式
数据资源交易
行业报告
广告平台
数据抓取的难点
1、目标网站有反爬取策略
2、目标网站模板会进行定时或实时变动
3、目标网站URL抓取失败
4、IP被封禁
解决办法:
购买代理IP库,随机获取IP进行数据抓取
部署多个应用分别进行抓取,降低单个节点的访问频率
设置每个页面抓取的时间间隔
5、用户登录限制
数据抓取的原理
实质上就是java程序模拟浏览器进行目标网站的访问,无论是请求目标服务器的接口还是请求目标网页内容,都是要在java程序中对数据进行解析。最简单的抓取方式有httpclient请求目标服务器接口,jsoup请求目标页面内容,把请求的数据进行解析然后入库。另外要做好爬取的实时监控,如果URL请求失败3次,就放弃该URL的抓取。
总体架构的设计
1、数据流向
1、确定数据爬取目标
2、数据采集
1、下载数据
2、解析数据
3、存储入库(database,HDFS)
3、分析查询服务
2、模块划分
1、数据采集模块
2、数据分析模块
3、报表管理模块
4、系统管理与监控模块
3、模块解读
技术选型
数据采集层
JSoup、HttpClient、HTMLCleaner、XPath+正则表达式
数据存取层
HBase+Redis
数据处理层
solr+elasticsearch
数据展示层
springboot+thymeleaf+jquery+echarts/highcharts
部署方案
爬虫项目:多台服务器
爬虫分类URL定时:一台服务器
HBase:集群
solr:服务器集群
Redis:服务器集群
爬虫监控:一台服务器
web项目:多台服务器
zookeeper:服务器集群
简单demo-windows环境开发--没有跳过登录的功能
package com.pshdhx.task;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.imegaware.crawler.util.HttpSender;
import com.imegaware.crawler.util.JdbcUtil;
import com.imegaware.crawler.util.LoginUtil;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.util.EntityUtils;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
public class Zhenbankongqi {
/**
 * Fetches the real-time town-level air data for one district (city code 370302)
 * and prints the parsed JSON response.
 *
 * <p>Steps: obtain a session id via {@code LoginUtil.getLoginCookie()}, build a POST
 * request whose query parameters are carried in the URL, send it through
 * {@code HttpSender.sendHttpPost}, and parse the response body as JSON. Persisting the
 * result through {@code saveData} is intentionally left disabled in this demo.
 *
 * <p>Errors are printed to stderr and never propagated to the caller.
 *
 * @author pshdhx
 */
public void zichuan() {
    // Log in to obtain the session cookie.
    String cookie = "";
    try {
        cookie = LoginUtil.getLoginCookie();
    } catch (Exception e1) {
        // Best effort: the request below is still sent, with an empty session id.
        e1.printStackTrace();
    }
    // 1. Prepare the request parameters (values are already URL-encoded: %2C is a comma).
    StringBuilder params = new StringBuilder();
    params.append("Method=QueryRealTimeData");
    params.append("&city=370302");
    params.append("&levels=1%2C2%2C64");
    params.append("&codes=107%2C132%2C101%2C141%2C140%2C102%2C106%2C108%2C144%2C127%2C130%2C129%2C126%2C350");
    params.append("&subname=");
    // 2. Create the POST request; the parameters travel in the query string.
    HttpPost httpPost = new HttpPost(
            "http://60.210.111.130:8002/ReDevelop/ajax/ZiBo/CityAirTown/RealTime/RealTimeDataQUIDYN_ZB/RealTimeData.ashx"
                    + "?" + params);
    // 3. Set the request headers. Only plain parameters are sent, so the Content-Type is
    //    form-urlencoded. The original code first set application/json and then overwrote
    //    it further down; the header is now set once, in its original position.
    httpPost.setHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
    httpPost.setHeader("Host", "60.210.111.130:8002");
    httpPost.setHeader("User-Agent",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0");
    httpPost.setHeader("Accept", "application/json, text/javascript, */*; q=0.01");
    httpPost.setHeader("Accept-Language", "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2");
    httpPost.setHeader("Accept-Encoding", "gzip, deflate");
    httpPost.setHeader("X-Requested-With", "XMLHttpRequest");
    httpPost.setHeader("Origin", "http://60.210.111.130:8002");
    httpPost.setHeader("Connection", "keep-alive");
    httpPost.setHeader("Referer",
            "http://60.210.111.130:8002/ReDevelop/Page/ZiBo/CityAirTown/RealTime/RealTimeDataQUIDYN_ZB/RealTimeData.aspx");
    httpPost.setHeader("Cookie", "ASP.NET_SessionId=" + cookie + "; autoLogin=null; user=null; pwd=null");
    // 4. Send the request and read the response.
    try {
        HttpEntity responseEntity = HttpSender.sendHttpPost(httpPost);
        if (responseEntity != null) {
            // Fall back to UTF-8 when the response declares no charset; without a default,
            // EntityUtils decodes as ISO-8859-1 and garbles the Chinese text.
            String body = EntityUtils.toString(responseEntity, "UTF-8");
            JSONObject jsonObject = JSON.parseObject(body);
            System.out.println("响应内容为:" + jsonObject);
            // 5. Persist to the database (disabled in this demo).
            //saveData(jsonObject);
        }
    } catch (Exception e) {
        // Demo-level handling: report the failure and return.
        e.printStackTrace();
    }
}
/**
 * Inserts the rows of a real-time data response into the {@code town_air} table.
 *
 * <p>Reads the {@code rows} array from {@code param}, adds one batched INSERT per
 * element, executes the batch, and prints the sum of the returned update counts.
 *
 * <p>NOTE(review): the connection and statement are closed only on the success path
 * (the close call sits inside the try and there is no finally), so any exception
 * thrown before it skips the close — confirm and move it into a finally block.
 *
 * @author pshdhx
 * @param param JSON object that contains a {@code rows} array of records
 * @throws RuntimeException wrapping any exception raised while inserting
 */
private static void saveData(JSONObject param) throws RuntimeException {
// Running total of the update counts reported by the batch.
int rows = 0;
PreparedStatement pstmt;
try {
Connection conn = JdbcUtil.getConnection();
// 22 positional parameters; task_time is filled by the database's now().
String sql = "INSERT INTO town_air(subid,stcode,subname,datetime,val_107,val_132,val_101,val_141,val_140,val_102,val_106,val_108,aqi,chief,iaqitype,levelst,val_144,val_127,val_130,val_129,val_126,val_350,task_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,now());";
pstmt = conn.prepareStatement(sql);
JSONArray row = param.getJSONArray("rows");
// One batch entry per element of the "rows" array.
for (int i = 0; i < row.size(); i++) {
JSONObject data = row.getJSONObject(i);
pstmt.setString(1, data.getString("SubID"));
pstmt.setString(2, data.getString("Stcode"));
pstmt.setString(3, data.getString("SubName"));
pstmt.setString(4, data.getString("DateTime"));
pstmt.setString(5, data.getString("val_107"));
// NOTE(review): the next line is an elision left by the author — presumably the
// remaining setString calls for parameters 6-22; it is not valid Java as written.
......
pstmt.addBatch();
}
int[] x = pstmt.executeBatch();
JdbcUtil.close(conn, pstmt, null);
// Sum the per-statement update counts.
// NOTE(review): JDBC drivers may report negative status codes here instead of row
// counts — verify against the driver in use.
for (int i : x) {
rows += i;
}
System.out.println("入库完成,共插入" + rows + "行数据");
} catch (Exception e) {
// Any failure is rethrown unchecked with the original exception as its cause.
throw new RuntimeException("数据入库失败!", e);
}
}
/**
 * Demo entry point: creates a crawler instance and runs the crawl once.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Zhenbankongqi crawler = new Zhenbankongqi();
    crawler.zichuan();
}
简单demo-deepin(Linux)环境开发(用到了一个比较好的包,hutool.cn)---没有跳过登录的功能
package com.pshdhx.psd;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.util.EntityUtils;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.imegaware.crawler.util.HttpSender;
import com.imegaware.crawler.util.JdbcUtil;
import com.imegaware.crawler.util.LoginUtil;
import cn.hutool.http.Header;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpUtil;
/**
 * Demo that requests the real-time air data with hutool's chained {@code HttpRequest}
 * API and prints the raw response body.
 */
public class Test {

    /** Scheme, host and port of the target site; also sent as the Origin header. */
    private static final String SITE_ORIGIN = "http://60.210.111.130:8002";

    /** Handler that serves the real-time data. */
    private static final String DATA_URL =
            "http://60.210.111.130:8002/ReDevelop/ajax/ZiBo/CityAirTown/RealTime/RealTimeDataQUIDYN_ZB/RealTimeData.ashx";

    /** Page sent as the Referer header. */
    private static final String REFERER_URL =
            "http://60.210.111.130:8002/SewagePlant/RealTime/RealTimeDataQUIDYN/RealTimeData.aspx";

    /** Browser identification sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36";

    /** Request timeout in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20000;

    /**
     * Logs in, posts the query form and prints the response body.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String sessionId = "";
        try {
            sessionId = LoginUtil.getLoginCookie();
        } catch (Exception loginFailure) {
            // The request is still sent, with an empty session id.
            loginFailure.printStackTrace();
        }

        String cookieHeader = "ASP.NET_SessionId=" + sessionId + "; autoLogin=null; user=null; pwd=null";

        // Build the request as one chain; header() is called once per header.
        HttpRequest request = HttpRequest.post(DATA_URL)
                .header(Header.USER_AGENT, USER_AGENT)
                .header(Header.CONTENT_TYPE, "application/x-www-form-urlencoded; charset=UTF-8")
                .header(Header.ACCEPT, "application/json, text/javascript, */*; q=0.01")
                .header(Header.ACCEPT_LANGUAGE, "zh-CN,zh;q=0.9")
                .header(Header.ACCEPT_ENCODING, "gzip, deflate")
                .header(Header.HOST, "60.210.111.130:8002")
                .header(Header.ORIGIN, SITE_ORIGIN)
                .header(Header.CONNECTION, "keep-alive")
                .header(Header.REFERER, REFERER_URL)
                .header(Header.COOKIE, cookieHeader)
                .form(buildFormParams())
                .timeout(TIMEOUT_MILLIS);

        String responseBody = request.execute().body();
        System.out.println(responseBody);
    }

    /** Builds the form fields of the real-time data query. */
    private static HashMap<String, Object> buildFormParams() {
        HashMap<String, Object> form = new HashMap<>();
        form.put("city", "0");
        form.put("Method", "QueryRealTimeData");
        form.put("levels", "1,2,64");
        form.put("codes", "107,132,101,141,140,102,106,108,144,127,130,129,126,350");
        form.put("subname", "");
        return form;
    }
}
希望此篇文章对您有帮助!
来源:oschina
链接:https://my.oschina.net/u/4301811/blog/4713569