数据爬取一例 | 易学教程

最近赋闲在家, 想找点事情做, 就想着爬点小视频, 于是有了下面这个程序. 没什么难度, 只是给大家分享一下思路.

下面就用 https：// www。avtb6677 。 com来举例:

1, 找一个工具来下载相关页面代码, 我用的是teleport来只下载页面, 不下载其他的, 这大概等了1个小时就好了, 这工具挺好的,比写代码来爬全站来得快.

2, 这一个小时期间, 我写完了以下代码, 大概只用了20分钟吧.

package aa;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;

/**
 * Reads every HTML file in {@link #html_dir}, extracts the metadata of the
 * {@code <video id="player">} element and appends it to {@link #list} as one
 * JSON object per line.
 *
 * <p>Pages that cannot be parsed are reported on stderr and skipped, so a
 * single bad file never stops the run.
 */
public class AvTaoBao6677 {

	/** Output file, one JSON object per line. Referenced by other classes, hence public. */
	public static String list = "c:/aaa.list";
	/** Directory that holds the previously downloaded HTML pages. */
	static String html_dir = "C:/Users/Administrator/workspace/avtb6677/html";
	/** Site origin; prepended to poster paths that do not start with "http". */
	static String host = "https://www.avtb6677.com";

	/**
	 * Processes every regular file in {@link #html_dir}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		File[] pages = new File(html_dir).listFiles();
		if (pages == null) {
			// listFiles() returns null when the path is missing or not a directory.
			System.err.println("Not a readable directory: " + html_dir);
			return;
		}
		File out = new File(list);
		for (File page : pages) {
			if (!page.isFile()) {
				continue;
			}
			try {
				// Serialize once and reuse the text for both console and file.
				String json = JSON.toJSONString(extract(page));
				System.out.println(json);
				FileUtils.write(out, json + "\n", true);
			} catch (Exception e) {
				// Boundary catch: report which page failed and carry on with the next one.
				System.err.println("Skipping " + page + ": " + e);
				e.printStackTrace();
			}
		}
	}

	/**
	 * Builds the attribute map for one page.
	 *
	 * @param page HTML file, parsed as UTF-8
	 * @return the attributes that get serialized to JSON
	 * @throws IOException if the file cannot be read
	 * @throws IllegalStateException if the page has no usable video/source element
	 */
	private static HashMap<String, String> extract(File page) throws IOException {
		Document doc = Jsoup.parse(page, "UTF-8"); // the encoding matters for non-ASCII titles
		Element video = get_video(doc);
		Element source = get_source(video);

		String poster = video.attr("poster");
		if (!poster.startsWith("http")) {
			poster = host + poster;
		}

		HashMap<String, String> attrs = new HashMap<String, String>();
		attrs.put("name", page.getName());
		attrs.put("title", get_title(doc.select("title").text()));
		// The key used to be "url " (trailing space), which readers asking for "url" never found.
		attrs.put("url", doc.select("meta[property=\"og:url\"]").attr("content"));
		attrs.put("host", host);
		attrs.put("poster", poster);
		attrs.put("src", source.attr("src"));
		attrs.put("tppabs", source.attr("tppabs"));
		attrs.put("type", source.attr("type"));
		attrs.put("label", source.attr("label"));
		attrs.put("res", source.attr("res"));
		return attrs;
	}

	/**
	 * Returns the part of the page title before the first '-', trimmed.
	 * When there is no '-' (or it is the first character) the whole text is used.
	 */
	private static String get_title(String text) {
		int dash = text.indexOf('-');
		return (dash > 0 ? text.substring(0, dash) : text).trim();
	}

	/**
	 * Picks the {@code <source>} child with the highest numeric {@code res}
	 * attribute. Ties go to the first one in document order. If no source has
	 * a numeric {@code res}, the first source is returned.
	 *
	 * @throws IllegalStateException if the video has no {@code <source>} at all
	 */
	private static Element get_source(Element video) {
		Elements sources = video.select("source");
		if (sources.isEmpty()) {
			throw new IllegalStateException("<video> element has no <source> children");
		}
		Element best = sources.get(0);
		int bestRes = parse_res(best.attr("res"));
		for (Element candidate : sources) {
			int res = parse_res(candidate.attr("res"));
			if (res > bestRes) {
				bestRes = res;
				best = candidate;
			}
		}
		return best;
	}

	/** Parses a {@code res} attribute; returns -1 when it is missing or not an integer. */
	private static int parse_res(String value) {
		try {
			return Integer.parseInt(value.trim());
		} catch (NumberFormatException e) {
			return -1;
		}
	}

	/**
	 * Finds the {@code <video id="player">} element.
	 *
	 * @throws IllegalStateException if the document does not contain one
	 */
	private static Element get_video(Document doc2) {
		Element video = doc2.select("video#player").first();
		if (video == null) {
			throw new IllegalStateException("no <video id=\"player\"> element found");
		}
		return video;
	}

}

3, 上面的程序依赖了3个jar: commons-io-2.4.jar, fastjson-1.2.31.jar, jsoup-1.8.1.jar (第5步的下载程序另外还用到了 Apache HttpClient 以及 httpclient.HttpUtil 工具类, 文中未给出).

4, 跑下这个程序就出来你需要的数据了, 注意tppabs字段就是你需要的数据了.

{
    "res": "360",
    "title": "最新网红美少女押尾貓VIP",
    "host": "https://www.avtb6677.com",
    "url ": "https://www.avtb6677.com/141225/最新网红美少女押尾貓vip",
    "name": "index-2378.htm",
    "poster": "https://www.avtb6677.com/1media/videos/tmb/000/041/225/player.jpg",
    "label": "360p",
    "tppabs": "https://gigi.kobeblackmanba.com/1media/videos/mp4/41225.mp4?st=XO86Hl9Ly8mP7ebEyAoS6w&e=1580634483",
    "src": "141225.mp4-st=XO86Hl9Ly8mP7ebEyAoS6w&e=1580634483",
    "type": "video/mp4"
}

5, 上面的程序已经把json串逐行保存到了文件里面, 下面这个程序再逐行解析并下载.

package aa;

import httpclient.HttpUtil;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.commons.io.FileUtils;
import org.apache.http.client.ClientProtocolException;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;

/**
 * Reads the JSON-lines file named by {@link #list}, re-fetches each page to
 * resolve the current video source URL, and downloads the video on a small
 * thread pool.
 *
 * <p>Request headers are loaded from {@link #header_txt} as
 * {@code name=value} lines. After start-up the shared {@link #headers} map is
 * never modified; every request works on its own copy, so worker threads never
 * observe a map that is being changed.
 */
public class Down {

	static String list = AvTaoBao6677.list;
	/** Number of concurrent download threads. */
	static final int THREAD_POOL_SIZE = 2;
	/**
	 * Debug limit: only this many lines of the list are looked at per run.
	 * Raise it (e.g. to Integer.MAX_VALUE) to process the whole list.
	 */
	static final int DEBUG_MAX_LINES = 6;
	static String header_txt = "C:/Users/Administrator/workspace/avtb6677/header.txt";
	static HttpUtil HttpUtil = new HttpUtil ( );
	/** Base headers shared by all requests; filled once in main and then only read. */
	static HashMap<String, String> headers = new HashMap<String, String>();
	private static ExecutorService downloadExcutorService = Executors.newFixedThreadPool(THREAD_POOL_SIZE);

	/**
	 * Entry point.
	 *
	 * @param args unused
	 * @throws IOException if the header file or the list file cannot be read
	 */
	public static void main(String[] args) throws IOException {
		try {
			headers.putAll(getHeader(FileUtils.readLines(new File(header_txt))));
			headers.put("progress", "true");
			int seen = 0;
			for (String line : FileUtils.readLines(new File(list))) {
				if (seen++ >= DEBUG_MAX_LINES) {
					break;
				}
				if (StringUtil.isBlank(line)) {
					continue;
				}
				try {
					process(line);
				} catch (Exception e) {
					// Includes IOException from the page fetch: one failing entry must not end the run.
					System.err.println(line);
					e.printStackTrace();
				}
			}
		} finally {
			// Lets queued downloads finish, then allows the JVM to exit. The HTTP client
			// itself is deliberately not closed here because downloads may still be running.
			downloadExcutorService.shutdown();
		}
	}

	/**
	 * Handles one JSON line: skips it when the target file already exists,
	 * otherwise resolves the source URL and queues the download.
	 */
	private static void process(String line) throws IOException {
		System.out.println(line);
		java.util.Map<String, Object> entry = JSON.parseObject(line);
		String url = text(entry, "url");
		if (url == null) {
			// Older list files were written with a trailing space in the key.
			url = text(entry, "url ");
		}
		String title = text(entry, "title");
		if (StringUtil.isBlank(url) || StringUtil.isBlank(title)) {
			throw new IllegalArgumentException("entry has no url or title");
		}
		File target = new File("Z:/avtb6677/mp4", safe_file_name(title) + ".mp4");
		if (target.exists()) {
			return;
		}
		HashMap<String, String> requestHeaders = headers_for(url);
		String source = get_Source(url, requestHeaders);
		System.out.println("source:" + source);
		if (StringUtil.isBlank(source)) {
			throw new IllegalStateException("page has no video source url: " + url);
		}
		download(source, target, requestHeaders);
	}

	/** Returns the value stored under {@code key} as text, or null when absent. */
	private static String text(java.util.Map<String, Object> entry, String key) {
		Object value = entry.get(key);
		return value == null ? null : value.toString();
	}

	/** Trims the title and replaces characters that Windows forbids in file names. */
	private static String safe_file_name(String title) {
		return title.trim().replaceAll("[\\\\/:*?\"<>|]", "_");
	}

	/**
	 * Parses {@code name=value} lines. Only the first '=' separates name and
	 * value, so values may themselves contain '='. Lines without a name or
	 * without '=' (for example blank lines) are ignored.
	 */
	private static HashMap<String, String> getHeader(List<String> readLines) {
		HashMap<String, String> parsed = new HashMap<String, String>();
		for (String line : readLines) {
			int eq = line.indexOf('=');
			if (eq <= 0) {
				continue;
			}
			parsed.put(line.substring(0, eq).trim(), line.substring(eq + 1).trim());
		}
		return parsed;
	}

	/** Copies the base headers and adds the ":path" entry for the given page url. */
	private static HashMap<String, String> headers_for(String url) {
		HashMap<String, String> copy = new HashMap<String, String>(headers);
		copy.put(":path", path_of(url));
		return copy;
	}

	/**
	 * Returns everything from the first '/' after the "scheme://host" part,
	 * or "/" when the url has no path.
	 */
	private static String path_of(String url) {
		int schemeEnd = url.indexOf("://");
		int pathStart = url.indexOf('/', schemeEnd < 0 ? 0 : schemeEnd + 3);
		return pathStart < 0 ? "/" : url.substring(pathStart);
	}

	/** Queues the download; a failure is reported together with the affected source and file. */
	private static void download(final String source, final File file,
			final HashMap<String, String> requestHeaders) {
		downloadExcutorService.execute(new Runnable() {
			@Override
			public void run() {
				try {
					HttpUtil.DOWNLOAD(source, file, requestHeaders);
				} catch (RuntimeException e) {
					System.err.println("download failed: " + source + " -> " + file);
					e.printStackTrace();
				}
			}
		});
	}

	/**
	 * Fetches the page and returns the {@code src} of its best source element,
	 * falling back to {@code tppabs} when {@code src} is blank.
	 */
	private static String get_Source(String url, HashMap<String, String> requestHeaders)
			throws ClientProtocolException, IOException {
		String html = HttpUtil.GET(url, requestHeaders).getHtml();
		Document doc = Jsoup.parse(html);
		Element source = get_source(get_video(doc));
		String src = source.attr("src");
		return StringUtil.isBlank(src) ? source.attr("tppabs") : src;
	}

	/**
	 * Picks the {@code <source>} child with the highest numeric {@code res}
	 * attribute. Ties go to the first one in document order. If no source has
	 * a numeric {@code res}, the first source is returned.
	 *
	 * @throws IllegalStateException if the video has no {@code <source>} at all
	 */
	private static Element get_source(Element video) {
		Elements sources = video.select("source");
		if (sources.isEmpty()) {
			throw new IllegalStateException("<video> element has no <source> children");
		}
		Element best = sources.get(0);
		int bestRes = parse_res(best.attr("res"));
		for (Element candidate : sources) {
			int res = parse_res(candidate.attr("res"));
			if (res > bestRes) {
				bestRes = res;
				best = candidate;
			}
		}
		return best;
	}

	/** Parses a {@code res} attribute; returns -1 when it is missing or not an integer. */
	private static int parse_res(String value) {
		try {
			return Integer.parseInt(value.trim());
		} catch (NumberFormatException e) {
			return -1;
		}
	}

	/**
	 * Finds {@code video#player} inside {@code div#player-container}.
	 *
	 * @throws IllegalStateException if either element is missing
	 */
	private static Element get_video(Document doc2) {
		Element player = doc2.select("div#player-container").first();
		if (player == null) {
			throw new IllegalStateException("no <div id=\"player-container\"> element found");
		}
		Element video = player.select("video#player").first();
		if (video == null) {
			throw new IllegalStateException("no <video id=\"player\"> element found");
		}
		return video;
	}

}

OK, 都是你喜欢的模样.

来源：oschina

链接：https://my.oschina.net/wmhx/blog/3162743

标签

fastjson

Teleport

Commons-IO

jsoup