How to return html of a page using robobrowser

后端 未结 1 933
北海茫月
北海茫月 2021-01-18 13:45

I'm experimenting with http://robobrowser.readthedocs.org/en/latest/readme.html, a new Python library based on the Beautiful Soup library. I'm trying to test it out by ope…

相关标签:
1条回答
  • 2021-01-18 14:09

    You can try using the parsed property.

    Code:

    from robobrowser import RoboBrowser
    
    url = "http://www.google.com"
    br = RoboBrowser(history=True)
    br.open(url)
    print br.parsed
    

    Result:

    <!DOCTYPE html>
    <html itemscope="" itemtype="http://schema.org/WebPage" lang="en-PH"><head><meta content="/images/google_favicon_128.png" itemprop="image"/><title>Google</title><script>(function(){
    window.google={kEI:"-RFgU9LgJsq6uATKqYGoDg",getEI:function(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||google.kEI},https:function(){return"https:"==window.location.protocol},kEXPI:"17259,4000116,4007661,4007830,4008142,4009033,4009641,4010806,4010858,4010899,4011228,4011258,4011679,4012373,4012504,4012860,4013374,4013414,4013591,4013723,4013758,4013787,4013823,4013941,4013967,4013979,4014016,4014033,4014093,4014431,4014515,4014636,4014671,4014810,4014813,4014828,4014991,4015119,4015155,4015234,4015260,4015519,4015550,4015587,4015635,4015638,4015639,4015772,4015901,4016005,4016042,4016127,4016309,4016363,4016367,4016371,4016391,4016425,4016452,4016466,4016479,4016487,4016638,4016703,4016730,4016786,4016800,4016824,4016855,4016933,4016969,4016978,4017042,4017079,4017177,4017202,4017206,4017280,4017285,4017294,4017301,4017336,4017556,8300015,8300017,8500165,8500223,8500240,8500252,8500255,8500283,8500306,8500313,8500325,8500332,8500349,10200002,10200012,10200029,10200038,10200040,10200048,10200053,10200055,10200066,10200083,10200120,10200134,10200136,10200155,10200157,10200159,10200164,10200195,10200211,10200215,10200221,10200224,10200231,10200236,10200242,10200246,10200252",kCSI:{e:"17259,4000116,4007661,4007830,4008142,4009033,4009641,4010806,4010858,4010899,4011228,4011258,4011679,4012373,4012504,4012860,4013374,4013414,4013591,4013723,4013758,4013787,4013823,4013941,4013967,4013979,4014016,4014033,4014093,4014431,4014515,4014636,4014671,4014810,4014813,4014828,4014991,4015119,4015155,4015234,4015260,4015519,4015550,4015587,4015635,4015638,4015639,4015772,4015901,4016005,4016042,4016127,4016309,4016363,4016367,4016371,4016391,4016425,4016452,4016466,4016479,4016487,4016638,4016703,4016730,4016786,4016800,4016824,4016855,4016933,4016969,4016978,4017042,4017079,4017177,4017202,4017206,4017280,4017285,4017294,4017301,4017336,4017556,8300015,8300017,8500165,8500223,8500240,8500252,8500255,8500283,8500306,8500313,8500325,85
00332,8500349,10200002,10200012,10200029,10200038,10200040,10200048,10200053,10200055,10200066,10200083,10200120,10200134,10200136,10200155,10200157,10200159,10200164,10200195,10200211,10200215,10200221,10200224,10200231,10200236,10200242,10200246,10200252",ei:"-RFgU9LgJsq6uATKqYGoDg"},authuser:0,ml:function(){},kHL:"en",time:function(){return(new Date).getTime()},log:function(a,b,c,h,k){var d=
    new Image,f=google.lc,e=google.li,g="";d.onerror=d.onload=d.onabort=function(){delete f[e]};f[e]=d;c||-1!=b.search("&ei=")||(g="&ei="+google.getEI(h));c=c||"/"+(k||"gen_204")+"?atyp=i&ct="+a+"&cad="+b+g+"&zx="+google.time();a=/^http:/i;a.test(c)&&google.https()?(google.ml(Error("GLMM"),!1,{src:c}),delete f[e]):(d.src=c,google.li=e+1)},lc:[],li:0,y:{},x:function(a,b){google.y[a.id]=[a,b];return!1},load:function(a,b,c){google.x({id:a+l++},function(){google.load(a,b,c)})}};var l=0;})();
    (function(){google.sn="webhp";google.timers={};google.startTick=function(a,b){var f=google.time();google.timers[a]={t:{start:f},bfr:!!b};};google.tick=function(a,b,f){google.timers[a]||google.startTick(a);google.timers[a].t[b]=f||google.time()};google.startTick("load",!0);
    try{}catch(d){}})();
    var _gjwl=location;function _gjuc(){var a=_gjwl.href.indexOf("#");if(0<=a&&(a=_gjwl.href.substring(a),0<a.indexOf("&q=")||0<=a.indexOf("#q="))&&(a=a.substring(1),-1==a.indexOf("#"))){for(var d=0;d<a.length;){var b=d;"&"==a.charAt(b)&&++b;var c=a.indexOf("&",b);-1==c&&(c=a.length);b=a.substring(b,c);if(0==b.indexOf("fp="))a=a.substring(0,d)+a.substring(c,a.length),c=d;else if("cad=h"==b)return 0;d=c}_gjwl.href="/search?"+a+"&cad=h";return 1}return 0}
    function _gjh(){!_gjuc()&&window.google&&google.x&&google.x({id:"GJH"},function(){google.nav&&google.nav.gjh&&google.nav.gjh()})};
    window._gjh&&_gjh();</script><style>#gbar,#guser{font-size:13px;padding-top:1px !important;}#gbar{height:22px}#guser{padding-bottom:7px !important;text-align:right}.gbh,.gbd{border-top:1px solid #c9d7f1;font-size:1px}.gbh{height:0;position:absolute;top:24px;width:100%}@media all{.gb1{height:22px;margin-right:.5em;vertical-align:top}#gbar{float:left}}a.gb1,a.gb4{text-decoration:underline !important}a.gb1,a.gb4{color:#00c !important}.gbi .gb4{color:#dd8e27 !important}.gbf .gb4{color:#900 !important}</style><style>body,td,a,p,.h{font-family:arial,sans-serif}body{margin:0;overflow-y:scroll}#gog{padding:3px 8px 0}td{line-height:.8em}.gac_m td{line-height:17px}form{margin-bottom:20px}.h{color:#36c}.q{color:#00c}.ts td{padding:0}.ts{border-collapse:collapse}em{font-weight:bold;font-style:normal}.lst{height:25px;width:496px}.gsfi,.lst{font:18px arial,sans-serif}.gsfs{font:17px arial,sans-serif}.ds{display:inline-box;display:inline-block;margin:3px 0 4px;margin-left:4px}input{font-family:inherit}a.gb1,a.gb2,a.gb3,a.gb4{color:#11c !important}body{background:#fff;color:black}a{color:#11c;text-decoration:none}a:hover,a:active{text-decoration:underline}.fl a{color:#36c}a:visited{color:#551a8b}a.gb1,a.gb4{text-decoration:underline}a.gb3:hover{text-decoration:none}#ghead a.gb2:hover{color:#fff !important}.sblc{padding-top:5px}.sblc a{display:block;margin:2px 0;margin-left:13px;font-size:11px}.lsbb{background:#eee;border:solid 1px;border-color:#ccc #999 #999 #ccc;height:30px}.lsbb{display:block}.ftl,#fll a{display:inline-block;margin:0 12px}.lsb{background:url(/images/srpr/nav_logo80.png) 0 -258px repeat-x;border:none;color:#000;cursor:pointer;height:30px;margin:0;outline:0;font:15px arial,sans-serif;vertical-align:top}.lsb:active{background:#ccc}.lst:focus{outline:none}#addlang a{padding:0 3px}</style><script></script></head><body bgcolor="#fff"><script>(function(){var src='/images/nav_logo176.png';var iesg=false;document.body.onload = function(){window.n && window.n();if 
(document.images){new Image().src=src;}
    if (!iesg){document.f&&document.f.q.focus();document.gbqf&&document.gbqf.q.focus();}
    }
    })();</script><textarea id="csi" style="display:none"></textarea><div id="mngb"> <div id="gbar"><nobr><b class="gb1">Search</b> <a class="gb1" href="http://www.google.com.ph/imghp?hl=en&amp;tab=wi">Images</a> <a class="gb1" href="http://maps.google.com.ph/maps?hl=en&amp;tab=wl">Maps</a> <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a> <a class="gb1" href="http://www.youtube.com/?gl=PH&amp;tab=w1">YouTube</a> <a class="gb1" href="http://news.google.com.ph/nwshp?hl=en&amp;tab=wn">News</a> <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a> <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a> <a class="gb1" href="http://www.google.com.ph/intl/en/options/" style="text-decoration:none"><u>More</u> »</a></nobr></div><div id="guser" width="100%"><nobr><span class="gbi" id="gbn"></span><span class="gbf" id="gbf"></span><span id="gbe"></span><a class="gb4" href="http://www.google.com.ph/history/optout?hl=en">Web History</a> | <a class="gb4" href="/preferences?hl=en">Settings</a> | <a class="gb4" href="https://accounts.google.com/ServiceLogin?hl=en&amp;continue=http://www.google.com.ph/%3Fgfe_rd%3Dcr%26ei%3D-BFgU62INOmNiAeYnYDoAg" id="gb_70" target="_top">Sign in</a></nobr></div><div class="gbh" style="left:0"></div><div class="gbh" style="right:0"></div> </div><center><br clear="all" id="lgpd"/><div id="lga"><div style="padding:28px 0 3px"><div align="left" id="hplogo" onload="window.lol&amp;&amp;lol()" style="height:110px;width:276px;background:url(/images/srpr/logo9w.png) no-repeat" title="Google"><div nowrap="" style="color:#777;font-size:16px;font-weight:bold;position:relative;top:70px;left:218px">Philippines</div></div></div><br/></div><form action="/search" name="f"><table cellpadding="0" cellspacing="0"><tr valign="top"><td width="25%"> </td><td align="center" nowrap=""><input name="ie" type="hidden" value="ISO-8859-1"/><input name="hl" type="hidden" value="en-PH"/><input name="source" type="hidden" 
value="hp"/><div class="ds" style="height:32px;margin:4px 0"><input autocomplete="off" class="lst" maxlength="2048" name="q" size="57" style="color:#000;margin:0;padding:5px 8px 0 6px;vertical-align:top" title="Google Search" value=""/></div><br style="line-height:0"/><span class="ds"><span class="lsbb"><input class="lsb" name="btnG" type="submit" value="Google Search"/></span></span><span class="ds"><span class="lsbb"><input class="lsb" name="btnI" onclick="if(this.form.q.value)this.checked=1; else top.location='/doodles/'" type="submit" value="I'm Feeling Lucky"/></span></span></td><td align="left" class="fl sblc" nowrap="" width="25%"><a href="/advanced_search?hl=en-PH&amp;authuser=0">Advanced search</a><a href="/language_tools?hl=en-PH&amp;authuser=0">Language tools</a></td></tr></table><input id="gbv" name="gbv" type="hidden" value="1"/></form><div id="gac_scont"></div><div style="font-size:83%;min-height:3.5em"><br/><div id="als"><font id="addlang" size="-1">Google.com.ph offered in: <a href="http://www.google.com.ph/setprefs?sig=0_SbhOaVIheKTw2jFHRcEg8o-Evng%3D&amp;hl=tl&amp;source=homepage">Filipino</a> <a href="http://www.google.com.ph/setprefs?sig=0_SbhOaVIheKTw2jFHRcEg8o-Evng%3D&amp;hl=ceb&amp;source=homepage">Cebuano</a></font><br/><br/></div></div><span id="footer"><div style="font-size:10pt"><div id="fll" style="margin:19px auto;text-align:center"><a href="/intl/en/ads/">Advertising Programs</a><a href="http://www.google.com.ph/intl/en/services/">Business Solutions</a><a href="/intl/en/about.html">About Google</a><a href="http://www.google.com.ph/setprefdomain?prefdom=US&amp;sig=0_dQ2pwXFotFQfDlj9qmDCkzdxCdA%3D" id="fehl">Google.com</a></div></div><p style="color:#767676;font-size:8pt">© 2013 - <a href="/intl/en/policies/">Privacy &amp; Terms</a></p></span></center><div id="xjsd"></div><div data-jiis="bp" id="xjsi"><script>if(google.y)google.y.first=[];(function(){function b(a){window.setTimeout(function(){var 
c=document.createElement("script");c.src=a;document.getElementById("xjsd").appendChild(c)},0)}google.dljp=function(a){google.xjsu=a;b(a)};google.dlj=b;})();
    if(!google.xjs){window._=window._||{};window._._DumpException=function(e){throw e};if(google.timers&&google.timers.load.t){google.timers.load.t.xjsls=new Date().getTime();}google.dljp('/xjs/_/js/k\x3dxjs.hp.en_US.RLLpSOAzMFM.O/m\x3dsb_he,pcc/rt\x3dj/d\x3d1/sv\x3d1/rs\x3dAItRSTOBXfxSyWrXjOGBi9e9cIs5cEBO6A');google.xjs=1;}google.pmc={"sb_he":{"agen":true,"cgen":true,"client":"heirloom-hp","dh":true,"ds":"","eqch":true,"fl":true,"host":"google.com.ph","jam":0,"jsonp":true,"msgs":{"cibl":"Clear Search","dym":"Did you mean:","lcky":"I\u0026#39;m Feeling Lucky","lml":"Learn more","oskt":"Input tools","psrc":"This search was removed from your \u003Ca href=\"/history\"\u003EWeb History\u003C/a\u003E","psrl":"Remove","sbit":"Search by image","srch":"Google Search"},"ovr":{},"pq":"","qcpw":false,"scd":10,"sce":5,"stok":"QGTPqfOgiEZ_AI3e5vphR6-NOmw"},"pcc":{}};google.y.first.push(function(){if(google.med){google.med('init');google.initHistory();google.med('history');}});if(google.j&&google.j.en&&google.j.xi){window.setTimeout(google.j.xi,0);}</script></div><script>(function(){if(google.timers&&google.timers.load.t){var b,c,d,e,g=function(a,f){a.removeEventListener?(a.removeEventListener("load",f,!1),a.removeEventListener("error",f,!1)):(a.detachEvent("onload",f),a.detachEvent("onerror",f))},h=function(a){e=(new Date).getTime();++c;a=a||window.event;a=a.target||a.srcElement;g(a,h)},k=document.getElementsByTagName("img");b=k.length;for(var l=c=0,m;l<b;++l)m=k[l],m.complete||"string"!=typeof m.src||!m.src?++c:m.addEventListener?(m.addEventListener("load",h,!1),m.addEventListener("error",
    h,!1)):(m.attachEvent("onload",h),m.attachEvent("onerror",h));d=b-c;var n=function(){if(google.timers.load.t){google.timers.load.t.ol=(new Date).getTime();google.timers.load.t.iml=e;google.kCSI.imc=c;google.kCSI.imn=b;google.kCSI.imp=d;void 0!==google.stt&&(google.kCSI.stt=google.stt);google.csiReport&&google.csiReport()}};window.addEventListener?window.addEventListener("load",n,!1):window.attachEvent&&
    window.attachEvent("onload",n);google.timers.load.t.prt=e=(new Date).getTime()};})();
    </script></body></html>
    [Finished in 4.3s]
    

    How you scrape the page from there is up to you, though. In any case, if you're having difficulty getting to know a new library such as this one, always exhaust the documentation first.

    From the site itself:

    parsed
        Lazily parse response content, using HTML parser specified by the browser.
    

    Source

    Alternatively, you can call dir(br) to list the attributes of the browser object, which gives the following:

    ['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_build_url', '_cursor', '_maxlen', '_states', '_traverse', '_update_state', 'back', 'find', 'find_all', 'follow_link', 'forward', 'get_form', 'get_forms', 'get_link', 'get_links', 'history', 'open', 'parsed', 'parser', 'response', 'select', 'session', 'state', 'submit_form', 'timeout', 'url']
    [Finished in 5.3s]
    

    As you can see, parsed appears towards the end of the list.

    Hope this helps.

    0 讨论(0)
提交回复
热议问题