【推荐】2019 Java 开发者跳槽指南.pdf(吐血整理) >>>
一、列表页
// Crawler settings for the listing pages: one seed URL per city site and a
// single repeated field that collects the links found on each listing page.
var configs = {
    domains: ["fang.com"],
    // Every seed URL shares the same path; only the city subdomain differs.
    // The empty prefix stands for the bare office.fang.com host.
    scanUrls: [
        "", "sh", "tj", "cq", "hf", "nb", "hz", "gz", "sz",
        "dg", "nn", "hn", "zz", "cd", "wuhan", "cs", "sjz", "xian",
        "nanjing", "suzhou", "wuxi", "cz", "jn", "qd", "nc", "changchun", "dl"
    ].map(function (city) {
        var host = city === "" ? "office.fang.com" : city + ".office.fang.com";
        return "http://" + host + "/shou/house/";
    }),
    // Content-page and helper-page URL regexes are not configured here.
    autoFindUrls: true,
    fields: [
        {
            // Extracted item: every href matched by the selector
            // (XPath is the default selector type).
            name: "m_url",
            repeated: true,
            selector: "//div/dl/dt/a/@href"
        }
    ]
};
/**
 * Helper-page callback: queues the "next page" link of a listing page.
 *
 * @param {Object} page    Page being processed (not used by this callback).
 * @param {*}      content Page content, handed to extract().
 * @param {Object} site    Crawler site object; addUrl() queues the next page.
 * @returns {boolean} Always false, whether or not a next page was queued.
 */
configs.onProcessHelperPage = function(page, content, site) {
    // href of the pager's "next" anchor
    var nextPageUrl = extract(content, "//a[@id='PageControl1_hlk_next']/@href");
    if (!nextPageUrl) {
        return false; // no next page: nothing to add to the crawl queue
    }
    site.addUrl(nextPageUrl);
    // NOTE(review): the previous comment here claimed the callback returns
    // true to enable automatic content-page discovery, but the code has
    // always returned false. The return value is kept unchanged; confirm
    // which behaviour is actually wanted.
    return false;
};
// Build the crawler from the configuration above and start it.
var crawler = new Crawler(configs);
crawler.start();
二、详情页的链接
// Settings for the second pass: each crawled page yields a single link,
// stored in the d_url field.
var configs = {
    domains: ["fang.com"],
    // Starts empty; scan URLs are added at runtime by the initCrawl callback.
    scanUrls: [],
    // Content-page and helper-page URL regexes are not configured here.
    autoFindUrls: false,
    fields: [
        // Extracted item (XPath selector).
        { name: "d_url", selector: "//div[@class='inforTxt']/dl[2]/dt/a[1]/@href" }
    ]
};
/**
 * Seeds the crawl from a previously collected data source.
 *
 * Reads the source in batches of 100 records and queues every URL stored in
 * each record's m_url field as a scan URL. Records without an m_url value are
 * skipped.
 *
 * @param {Object} site Crawler site object (uses async() and addScanUrl()).
 */
configs.initCrawl = function(site) {
    var sourceId = 11223662; // replace with the ID of your own data source
    var query = 'source{}';
    var src = shenjian.readSource(sourceId, query);
    site.async(function(src) {
        var batch = src.nextBatch(100);
        // The loop ends on the first falsy batch.
        while (batch) {
            for (var i = 0; i < batch.length; i++) {
                // Declared locally: the bare assignment used before leaked a
                // global and throws in strict mode.
                var urls = batch[i].m_url;
                if (!urls) {
                    continue; // nothing was extracted for this record
                }
                if (typeof urls === 'string') {
                    urls = [urls]; // a lone URL must not be iterated per character
                }
                for (var j = 0; j < urls.length; j++) {
                    site.addScanUrl(urls[j]);
                }
            }
            batch = src.nextBatch(100);
        }
    }, src);
};
// Build the crawler from the configuration above and start it.
var crawler = new Crawler(configs);
crawler.start();
三、详情页
// Settings for the detail pages: building attributes are read from the
// "xiangqing" definition list; the four coordinate fields have no selector
// and are filled in by the afterExtractPage callback.
var configs = (function () {
    // All attribute rows live under the same <dl class="xiangqing"> list.
    var DETAIL_LIST = "//dl[@class='xiangqing']";

    // XPath (the default selector type) for the n-th <dd> of the detail list,
    // optionally followed by a sub-path.
    function detailRow(n, subPath) {
        return DETAIL_LIST + "/dd[" + n + "]" + (subPath || "");
    }

    return {
        domains: ["fang.com"],
        // Starts empty; scan URLs are added at runtime by initCrawl.
        scanUrls: [],
        // Content-page and helper-page URL regexes are not configured here.
        autoFindUrls: false,
        enableJS: true,
        fields: [
            { name: "name", selector: "//span[@class='biaoti']" },      // building name
            { name: "area", selector: detailRow(1) },                   // district
            { name: "address", selector: detailRow(2, "/span") },       // building address
            { name: "property_type", selector: detailRow(4) },          // property type
            { name: "level", selector: detailRow(5) },                  // office-building grade
            { name: "mtime", selector: detailRow(9) },                  // completion date
            { name: "floor_area", selector: detailRow(13) },            // site area
            { name: "covered_area", selector: detailRow(14) },          // built floor area
            { name: "longitude" },                                      // Baidu longitude
            { name: "latitude" },                                       // Baidu latitude
            { name: "gaode_lon" },                                      // Gaode longitude
            { name: "gaode_lat" }                                       // Gaode latitude
        ]
    };
})();
/**
 * Seeds the crawl from the data source holding the d_url links.
 *
 * Reads the source in batches of 100 records and queues one "xiangqing/"
 * URL per record: the suffix is appended when the stored link has no 'esf'
 * in it, otherwise the first 'esf' is replaced by 'xiangqing/'. Records
 * without a usable d_url are skipped.
 *
 * @param {Object} site Crawler site object (uses async() and addScanUrl()).
 */
configs.initCrawl = function(site) {
    var sourceId = 11225420; // replace with the ID of your own data source
    var query = 'source{}';
    var src = shenjian.readSource(sourceId, query);
    site.async(function(src) {
        var batch = src.nextBatch(100);
        // The loop ends on the first falsy batch.
        while (batch) {
            for (var i = 0; i < batch.length; i++) {
                // Declared locally: the bare assignment used before leaked a
                // global and throws in strict mode.
                var baseUrl = batch[i].d_url;
                if (typeof baseUrl !== 'string' || baseUrl === '') {
                    continue; // no link was extracted for this record
                }
                if (baseUrl.indexOf('esf') === -1) {
                    site.addScanUrl(baseUrl + 'xiangqing/');
                } else {
                    site.addScanUrl(baseUrl.replace('esf', 'xiangqing/'));
                }
            }
            batch = src.nextBatch(100);
        }
    }, src);
};
/**
 * After-download callback: fetches the map iframe embedded in the page and
 * stores the text of its first body <script> on page.contextData as JSON,
 * for afterExtractPage to parse.
 *
 * When the page has no map iframe, no request is made and an empty string is
 * stored instead.
 *
 * @param {Object} page Downloaded page; page.raw is read, page.contextData is written.
 * @param {Object} site Crawler site object (uses requestUrl()).
 * @returns {Object} The same page object.
 */
configs.afterDownloadPage = function(page, site) {
    var frameSrc = extract(page.raw, "//div[@class='blmapbox']/iframe/@src");
    var scriptText = "";
    if (frameSrc) {
        // The iframe src is normally protocol-relative; leave it alone when it
        // already carries a scheme.
        var frameUrl = /^https?:/i.test(frameSrc) ? frameSrc : "https:" + frameSrc;
        // NOTE(review): the original passed this object through a Python-style
        // `headers = {...}` argument, which only created a global and handed
        // the bare header map over as the second argument. The argument value
        // is kept as it was; confirm whether requestUrl expects
        // `{ headers: {...} }` instead.
        var requestOptions = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3941.4 Safari/537.36"
        };
        var frameHtml = site.requestUrl(frameUrl, requestOptions);
        scriptText = extract(frameHtml, "/html/body/script[1]") || "";
    }
    page.contextData = JSON.stringify(scriptText);
    return page;
};
/*
 * afterExtractField callback: post-processes each extracted field value.
 *
 * For the labelled detail fields the first occurrence of the label
 * (e.g. "所属区域:") is removed from the value; the label may be followed
 * by an ASCII or a full-width colon. Every other field, and any value that
 * is not a string, is returned unchanged.
 *
 * @param {string} fieldName Name of the field that was just extracted.
 * @param {*}      data      Extracted value.
 * @param {Object} page      Current page (unused).
 * @param {Object} site      Crawler site object (unused).
 * @returns {*} The cleaned value.
 */
configs.afterExtractField = function(fieldName, data, page, site) {
    // Label printed in front of the value, keyed by field name.
    var labels = {
        area: "所属区域",
        property_type: "物业类别",
        level: "写字楼等级",
        mtime: "竣工时间",
        floor_area: "占地面积",
        covered_area: "建筑面积"
    };
    var hasLabel = Object.prototype.hasOwnProperty.call(labels, fieldName);
    // A field that failed to extract is not a string; pass it through
    // instead of throwing on .replace.
    if (!hasLabel || typeof data !== "string") {
        return data;
    }
    return data.replace(new RegExp(labels[fieldName] + "[:：]"), "");
};
/**
 * After-extract callback: fills the four coordinate fields of a record.
 *
 * page.contextData (JSON written by afterDownloadPage) holds the text of the
 * map script. The second and fourth quoted values on the matched script line
 * are stored as data.longitude / data.latitude (the Baidu coordinates), and
 * are converted to data.gaode_lon / data.gaode_lat with the commonly
 * published BD-09 -> GCJ-02 approximation. When either coordinate is
 * missing, blank or not numeric, both Gaode values are set to 0.
 *
 * @param {Object} page Current page; page.contextData is read.
 * @param {Object} data Extracted record; mutated and returned.
 * @param {Object} site Crawler site object (unused).
 * @returns {Object} The same data object.
 */
configs.afterExtractPage = function(page, data, site) {
    // Coordinate text -> finite number, or null when absent / blank / not numeric.
    function toCoordinate(text) {
        if (typeof text !== "string" || text.trim() === "") {
            return null;
        }
        var value = Number(text);
        return isFinite(value) ? value : null;
    }

    // Splits the matched script lines on their (JSON-escaped) double quotes.
    // The regex and the stringify/replace/split steps are kept exactly as
    // they were, because the page's script format is not visible from here.
    function quotedPieces(scriptText) {
        if (typeof scriptText !== "string") {
            return [];
        }
        var matchesJson = JSON.stringify(scriptText.match(/[px:]\d(.)+/g));
        return matchesJson.replace('[', '').split('\\\"');
    }

    var scriptText = typeof page.contextData === "string" ? JSON.parse(page.contextData) : null;
    var pieces = quotedPieces(scriptText);
    data.longitude = pieces[1];
    data.latitude = pieces[3];

    var lon = toCoordinate(pieces[1]);
    var lat = toCoordinate(pieces[3]);
    if (lon === null || lat === null) {
        // Same fallback the original used for empty strings; it now also
        // covers missing values, which used to yield NaN.
        data.gaode_lon = 0;
        data.gaode_lat = 0;
        return data;
    }

    var x_pi = 3.14159265358979324 * 3000.0 / 180.0;
    var x = lon - 0.0065;
    var y = lat - 0.006;
    var z = Math.sqrt(x * x + y * y) - 0.00002 * Math.sin(y * x_pi);
    var theta = Math.atan2(y, x) - 0.000003 * Math.cos(x * x_pi);
    data.gaode_lon = z * Math.cos(theta);
    data.gaode_lat = z * Math.sin(theta);
    return data;
};
// Build the crawler from the configuration above and start it.
var crawler = new Crawler(configs);
crawler.start();
爬取结果:
来源:oschina
链接:https://my.oschina.net/u/3892643/blog/3144036