最近赋闲在家, 就想找点事情做,就想着爬爬小视频什么的,于是就有了下面这个程序, 没什么难度, 就是给大家分享下思路
下面就用 https:// www。avtb6677 。 com来举例:
1, 找一个工具来下载相关页面代码, 我用的是teleport来只下载页面, 不下载其他的, 这大概等了1个小时就好了, 这工具挺好的,比写代码来爬全站来得快.
2, 这一个小时期间, 我写完了以下代码, 大概只用了20分钟吧.
package aa;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;

/**
 * Walks a directory of already-downloaded HTML pages, extracts the metadata of the
 * {@code <video id="player">} element found in each page and appends one JSON object
 * per page (one per line) to the file named by {@link #list}.
 */
public class AvTaoBao6677 {

    /** Output file; one JSON object is appended per processed page. */
    public static String list = "c:/aaa.list";

    /** Directory that holds the downloaded HTML pages. */
    static String html_dir = "C:/Users/Administrator/workspace/avtb6677/html";

    /** Prefix prepended to poster paths that are not absolute URLs. */
    static String host = "https://www.avtb6677.com";

    /**
     * Processes every regular file in {@link #html_dir}. A page that cannot be parsed or
     * that has no usable video element is reported on stderr and skipped, so a single bad
     * page never stops the run.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // listFiles() returns null when the path is missing or is not a directory.
        File[] pages = new File(html_dir).listFiles();
        if (pages == null) {
            System.err.println("Not a readable directory: " + html_dir);
            return;
        }

        File out = new File(list);
        for (File ff : pages) {
            if (!ff.isFile()) {
                continue;
            }
            try {
                // The encoding must match the one the pages were saved with.
                Document doc = Jsoup.parse(ff, "UTF-8");

                Element video = get_video(doc);
                if (video == null) {
                    System.err.println("No video#player element in " + ff.getName() + ", skipped");
                    continue;
                }
                Element source = get_source(video);
                if (source == null) {
                    System.err.println("No <source> element in " + ff.getName() + ", skipped");
                    continue;
                }

                String poster = video.attr("poster");
                // Only relative, non-empty poster paths are prefixed with the host.
                if (poster.length() > 0 && !poster.startsWith("http")) {
                    poster = host + poster;
                }

                HashMap<String, String> attrs = new HashMap<String, String>();
                attrs.put("name", ff.getName());
                attrs.put("title", get_title(doc.select("title").text()));
                // The key used to be "url " (trailing space), which a consumer asking for
                // "url" could never find.
                attrs.put("url", doc.select("meta[property=\"og:url\"]").attr("content"));
                attrs.put("host", host);
                attrs.put("poster", poster);
                attrs.put("src", source.attr("src"));
                attrs.put("tppabs", source.attr("tppabs"));
                attrs.put("type", source.attr("type"));
                attrs.put("label", source.attr("label"));
                attrs.put("res", source.attr("res"));

                // Serialise once and reuse the result for both the console and the file.
                String json = JSON.toJSONString(attrs);
                System.out.println(json);
                FileUtils.write(out, json + "\n", true);
            } catch (Exception e) {
                // Boundary of the per-file work: report which page failed and carry on.
                System.err.println("Failed to process " + ff.getName());
                e.printStackTrace();
            }
        }
    }

    /**
     * Returns the part of the page title before the first '-', trimmed. When the title has
     * no '-' (or starts with one) the whole title is returned, trimmed.
     *
     * @param text raw text of the page's {@code <title>} element
     * @return the cleaned-up title
     */
    private static String get_title(String text) {
        int dash = text.indexOf('-');
        String head = dash > 0 ? text.substring(0, dash) : text;
        return head.trim();
    }

    /**
     * Picks the {@code <source>} child with the highest numeric {@code res} attribute.
     * A source whose {@code res} is missing or not a number counts as resolution 0, so it
     * is only chosen when nothing better exists; ties keep the earlier element.
     *
     * @param video the {@code <video>} element to inspect
     * @return the best source element, or {@code null} when there is none
     */
    private static Element get_source(Element video) {
        Element best = null;
        int bestRes = -1;
        for (Element candidate : video.select("source")) {
            int res;
            try {
                res = Integer.parseInt(candidate.attr("res").trim());
            } catch (NumberFormatException notNumeric) {
                res = 0;
            }
            if (res > bestRes) {
                best = candidate;
                bestRes = res;
            }
        }
        return best;
    }

    /**
     * Finds the player element of a page.
     *
     * @param doc2 the parsed page
     * @return the first {@code video#player} element, or {@code null} when the page has none
     */
    private static Element get_video(Document doc2) {
        return doc2.select("video#player").first();
    }
}
3. 上面这个程序依赖了3个jar: commons-io-2.4.jar, fastjson-1.2.31.jar, jsoup-1.8.1.jar (后面第5步的下载程序另外还用到了 Apache HttpClient 以及 httpclient.HttpUtil 这个工具类)
4, 跑下这个程序就出来你需要的数据了, 注意tppabs字段就是你需要的数据了.
{ "res": "360", "title": "最新网红美少女押尾貓VIP", "host": "https://www.avtb6677.com", "url ": "https://www.avtb6677.com/141225/最新网红美少女押尾貓vip", "name": "index-2378.htm", "poster": "https://www.avtb6677.com/1media/videos/tmb/000/041/225/player.jpg", "label": "360p", "tppabs": "https://gigi.kobeblackmanba.com/1media/videos/mp4/41225.mp4?st=XO86Hl9Ly8mP7ebEyAoS6w&e=1580634483", "src": "141225.mp4-st=XO86Hl9Ly8mP7ebEyAoS6w&e=1580634483", "type": "video/mp4" }
5, 上面的程序已经把每条json串逐行追加保存到 c:/aaa.list 文件里了, 接下来用下面这个程序逐行解析并下载:
package aa;

import httpclient.HttpUtil;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.commons.io.FileUtils;
import org.apache.http.client.ClientProtocolException;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;

/**
 * Reads the line-per-entry JSON list, fetches each entry's page to resolve the media URL
 * and hands the actual download to a small fixed thread pool.
 */
public class Down {

    /** Input file: one JSON object per line. */
    static String list = AvTaoBao6677.list;

    /** Number of concurrent download threads. */
    static final int THREAD_POOL_SIZE = 2;

    /**
     * Debug cap on how many list lines are handled per run (the original hard-coded
     * behaviour). Set to 0 or a negative value to process the whole list.
     */
    static int DEBUG_MAX_LINES = 6;

    /** File holding the request headers, one {@code name=value} pair per line. */
    static String header_txt = "C:/Users/Administrator/workspace/avtb6677/header.txt";

    static HttpUtil HttpUtil = new HttpUtil();

    /**
     * Request headers. This map is only ever touched by the main thread; download tasks
     * receive their own copy (see {@link #download}).
     */
    static HashMap<String, String> headers = new HashMap<String, String>();

    private static ExecutorService downloadExcutorService =
            Executors.newFixedThreadPool(THREAD_POOL_SIZE);

    /**
     * Loads the headers, then walks the list and queues one download per entry whose
     * target file does not exist yet. A failing entry is reported and skipped.
     *
     * @param args unused
     * @throws IOException when the header file or the list file cannot be read
     */
    public static void main(String[] args) throws IOException {
        try {
            headers.putAll(getHeader(FileUtils.readLines(new File(header_txt))));
            headers.put("progress", "true");

            int seen = 0;
            for (String line : FileUtils.readLines(new File(list))) {
                if (DEBUG_MAX_LINES > 0 && seen++ >= DEBUG_MAX_LINES) {
                    break;
                }
                if (line.trim().length() == 0) {
                    continue;
                }
                try {
                    System.out.println(line);
                    JSONObject entry = JSON.parseObject(line);

                    String url = entry.getString("url");
                    if (url == null) {
                        // Lists produced by the original extractor use the key "url "
                        // (with a trailing space).
                        url = entry.getString("url ");
                    }
                    String title = entry.getString("title");
                    if (StringUtil.isBlank(url) || StringUtil.isBlank(title)) {
                        System.err.println("Entry without url/title, skipped: " + line);
                        continue;
                    }

                    File ff = new File("Z:/avtb6677/mp4", safeFileName(title) + ".mp4");
                    if (ff.exists()) {
                        continue;
                    }

                    String source = get_Source(url);
                    System.out.println("source:" + source);
                    if (StringUtil.isBlank(source)) {
                        System.err.println("No media URL found for " + url);
                        continue;
                    }
                    download(source, ff);
                } catch (Exception e) {
                    // Boundary of the per-entry work: network and parse errors alike
                    // must not stop the remaining entries.
                    System.err.println(line);
                    e.printStackTrace();
                }
            }
        } finally {
            // Lets queued downloads finish but allows the JVM to exit afterwards.
            // The HTTP client itself is deliberately not closed here, because downloads
            // may still be running on the pool threads.
            downloadExcutorService.shutdown();
        }
    }

    /**
     * Turns a scraped title into something usable as a Windows file name by replacing
     * path separators, reserved characters and control characters with '_'.
     *
     * @param title raw title taken from the list
     * @return the trimmed, sanitised name (without extension)
     */
    private static String safeFileName(String title) {
        return title.trim().replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
    }

    /**
     * Parses {@code name=value} lines. Only the first '=' separates name from value, so
     * values may themselves contain '='. Lines without a name or without '=' are ignored.
     *
     * @param readLines raw lines of the header file
     * @return the parsed headers
     */
    private static HashMap<String, String> getHeader(List<String> readLines) {
        HashMap<String, String> parsed = new HashMap<String, String>();
        for (String line : readLines) {
            int eq = line.indexOf('=');
            if (eq <= 0) {
                continue;
            }
            parsed.put(line.substring(0, eq).trim(), line.substring(eq + 1).trim());
        }
        return parsed;
    }

    /**
     * Queues a download on the thread pool.
     *
     * @param source media URL to fetch
     * @param file   destination file
     */
    private static void download(final String source, final File file) {
        // Copy taken on the submitting thread, so the worker never reads the shared map
        // while the main thread keeps modifying it for the next entry.
        final HashMap<String, String> snapshot = new HashMap<String, String>(headers);
        downloadExcutorService.execute(new Runnable() {
            public void run() {
                try {
                    HttpUtil.DOWNLOAD(source, file, snapshot);
                } catch (RuntimeException e) {
                    System.err.println("Download failed: " + source + " -> " + file);
                    e.printStackTrace();
                }
            }
        });
    }

    /**
     * Fetches the page at {@code url} and returns the media URL of its best source:
     * the {@code src} attribute, or {@code tppabs} when {@code src} is blank.
     *
     * @param url page URL
     * @return the media URL (may be blank when the page carries neither attribute)
     * @throws ClientProtocolException declared for the HTTP call made through HttpUtil
     * @throws IOException             on I/O failure, or when the page lacks the expected
     *                                 player/source elements
     */
    private static String get_Source(String url) throws ClientProtocolException, IOException {
        headers.put(":path", pathOf(url));
        String html = HttpUtil.GET(url, headers).getHtml();
        Document doc = Jsoup.parse(html);

        Element video = get_video(doc);
        if (video == null) {
            throw new IOException("No div#player-container video#player in " + url);
        }
        Element source = get_source(video);
        if (source == null) {
            throw new IOException("No <source> element in " + url);
        }

        String src = source.attr("src");
        String tppabs = source.attr("tppabs");
        return StringUtil.isBlank(src) ? tppabs : src;
    }

    /**
     * Returns the path part of a URL (everything from the first '/' after the host),
     * or "/" when the URL has no path.
     */
    private static String pathOf(String url) {
        int scheme = url.indexOf("://");
        int slash = url.indexOf('/', scheme < 0 ? 0 : scheme + 3);
        return slash < 0 ? "/" : url.substring(slash);
    }

    /**
     * Picks the {@code <source>} child with the highest numeric {@code res} attribute.
     * A source whose {@code res} is missing or not a number counts as resolution 0, so it
     * is only chosen when nothing better exists; ties keep the earlier element.
     *
     * @param video the {@code <video>} element to inspect
     * @return the best source element, or {@code null} when there is none
     */
    private static Element get_source(Element video) {
        Element best = null;
        int bestRes = -1;
        for (Element candidate : video.select("source")) {
            int res;
            try {
                res = Integer.parseInt(candidate.attr("res").trim());
            } catch (NumberFormatException notNumeric) {
                res = 0;
            }
            if (res > bestRes) {
                best = candidate;
                bestRes = res;
            }
        }
        return best;
    }

    /**
     * Finds the player element inside the player container.
     *
     * @param doc2 the parsed page
     * @return the first {@code video#player} inside {@code div#player-container}, or
     *         {@code null} when either element is missing
     */
    private static Element get_video(Document doc2) {
        Element player = doc2.select("div#player-container").first();
        if (player == null) {
            return null;
        }
        return player.select("video#player").first();
    }
}
OK, 都是你喜欢的模样.
来源:oschina
链接:https://my.oschina.net/wmhx/blog/3162743