// 【推荐】2019 Java 开发者跳槽指南.pdf(吐血整理) >>>
/*
神箭手云_爬虫开发
支持原生JavaScript
开发教程:http://docs.shenjian.io/develop/crawler/doc/concept/crawler.html
*/
// Crawler configuration: target domain, URL patterns and the fields to extract
// from each detail page.
var configs = {
  domains: ["fang.com"],
  // Left empty here; scan URLs are added at runtime by configs.initCrawl.
  scanUrls: [],
  // Content-page URL patterns.
  contentUrlRegexes: [/https:\/\/.*/],
  // List-page URL patterns (may be left empty).
  helperUrlRegexes: [/https:\/\/.*/],
  autoFindUrls: false,
  enableJS: true,
  // Each pair is [field name, selector]; selectors are XPath (the default type).
  fields: [
    // Property (building) name
    ["name", "//span[@class='biaoti']"],
    // District the property belongs to
    ["area", "//dl[@class='xiangqing']/dd[1]"],
    // Street address
    ["address", "//dl[@class='xiangqing']/dd[2]/span"],
    // Property type
    ["property_type", "//dl[@class='xiangqing']/dd[4]"],
    // Office-building grade
    ["level", "//dl[@class='xiangqing']/dd[5]"],
    // Completion date
    ["mtime", "//dl[@class='xiangqing']/dd[9]"],
    // Land (site) area
    ["floor_area", "//dl[@class='xiangqing']/dd[13]"],
    // Built (floor) area
    ["covered_area", "//dl[@class='xiangqing']/dd[14]"],
    // Longitude: empty selector, the value is assigned in configs.afterExtractPage
    ["longitude", ""],
    // Latitude: empty selector, the value is assigned in configs.afterExtractPage
    ["latitude", ""]
  ].map(function (pair) {
    return { name: pair[0], selector: pair[1] };
  })
};
/*
 initCrawl callback: seeds the crawl with scan URLs read from a data source.

 Reads records from the data source in batches of 100 and, for every record,
 registers `<d_url>xiangqing/` as a scan URL.

 @param site  site object passed in by the crawler; used here for
              site.async and site.addScanUrl.
*/
configs.initCrawl = function (site) {
  var sourceId = 11164939; // Replace with the ID of your own data source.
  print(sourceId);
  var query = 'source{}';
  var src = shenjian.readSource(sourceId, query);
  site.async(function (src) {
    var infos = src.nextBatch(100);
    // An empty batch is treated as exhaustion as well, since an empty array is truthy.
    // NOTE(review): the end-of-data return value of nextBatch is not visible here — confirm.
    while (infos && infos.length > 0) {
      for (var i = 0; i < infos.length; i++) {
        var scanUrl = infos[i].d_url + "xiangqing/";
        print(scanUrl);
        site.addScanUrl(scanUrl);
      }
      // Advance to the next batch; without this the loop would reprocess
      // the first batch forever.
      infos = src.nextBatch(100);
    }
  }, src);
};
/*
 afterDownloadPage callback: fetches the map iframe embedded in the detail page
 and stashes the text of its first <script> element on the page object.

 The text is stored JSON-encoded in page.contextData. It is always a JSON string
 (an empty one when the iframe or the script is missing), so that the consumer
 can JSON.parse it and call string methods on the result without crashing.

 @param page  downloaded page; page.raw is read, page.contextData is written.
 @param site  site object; used for site.requestUrl.
 @returns the same page object.
*/
configs.afterDownloadPage = function (page, site) {
  var mapSrc = extract(page.raw, "//div[@class='blmapbox']/iframe/@src");
  var scriptText = "";
  if (mapSrc) {
    // A protocol-relative src ("//host/path") needs a scheme; any other value is used as-is.
    var mapUrl = mapSrc.indexOf("//") === 0 ? "https:" + mapSrc : mapSrc;
    // Request headers are passed under the `headers` key of the options object.
    // NOTE(review): confirm against the platform's site.requestUrl(url, options) documentation.
    var mapHtml = site.requestUrl(mapUrl, {
      headers: {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3941.4 Safari/537.36"
      }
    });
    scriptText = extract(mapHtml, "/html/body/script[1]") || "";
  }
  page.contextData = JSON.stringify(scriptText);
  return page;
};
/*
 afterExtractField callback: post-processes each extracted field value.

 For the labelled detail fields, removes the first occurrence of the field's
 label text (e.g. "所属区域:") from the extracted string. All other fields, and
 any value that is not a string (for example when nothing was extracted), are
 returned unchanged.

 @param fieldName  name of the field being processed.
 @param data       extracted value for that field.
 @param page       current page (unused).
 @param site       site object (unused).
 @returns the cleaned value.
*/
configs.afterExtractField = function (fieldName, data, page, site) {
  // Label prefix to strip, keyed by field name.
  var labelByField = {
    area: "所属区域:",
    property_type: "物业类别:",
    level: "写字楼等级:",
    mtime: "竣工时间:",
    floor_area: "占地面积:",
    covered_area: "建筑面积:"
  };
  var hasLabel = Object.prototype.hasOwnProperty.call(labelByField, fieldName);
  // Guard against non-string values: calling .replace on null would throw.
  if (!hasLabel || typeof data !== "string") {
    return data;
  }
  return data.replace(labelByField[fieldName], "");
};
/*
 afterExtractPage callback: fills data.longitude and data.latitude from the
 script text that configs.afterDownloadPage stored (JSON-encoded) in
 page.contextData.

 When contextData is missing, or does not decode to a string, both fields are
 set to undefined instead of throwing.

 @param page  current page; page.contextData is read.
 @param data  extracted record; longitude and latitude are assigned on it.
 @param site  site object (unused).
 @returns the same data object.
*/
configs.afterExtractPage = function (page, data, site) {
  var scriptText = null;
  if (typeof page.contextData === "string") {
    scriptText = JSON.parse(page.contextData);
  }
  var parts = [];
  if (typeof scriptText === "string") {
    // NOTE(review): `[px:]` is a character class (one of 'p', 'x', ':'), not the
    // literal text "px:". The pattern is kept unchanged because the exact format
    // of the map script is not visible here — confirm before tightening it.
    var matches = scriptText.match(/[px:]\d(.)+/g);
    // Serialising the matches turns every double quote inside them into \",
    // so splitting on \" yields the quoted values at the odd indexes.
    parts = JSON.stringify(matches).replace('[', '').split('\\\"');
  }
  // First quoted value -> longitude, second quoted value -> latitude.
  data.longitude = parts[1];
  data.latitude = parts[3];
  return data;
};
// Create the crawler from the configs object defined in this file and start it.
// NOTE(review): `Crawler` is not defined in this file; presumably it is provided
// by the hosting crawler runtime — confirm.
var crawler = new Crawler(configs);
crawler.start();
// 来源:oschina
// 链接:https://my.oschina.net/u/3892643/blog/3143514