How to return html of a page using robobrowser

陌路散爱 提交于 2019-12-30 18:50:39

问题


I'm experimenting with http://robobrowser.readthedocs.org/en/latest/readme.html, a new Python library based on the Beautiful Soup library. I'm trying to test it out by opening an HTML page and returning it within a Django app, but I can't figure out how to do this most simple task. My Django app contains:

def index(request):    
    """Django view: open the URL posted in field 'p' with RoboBrowser and return a response.

    NOTE(review): this is the asker's non-working attempt. The response is
    built from the return value of ``browser.open()``; presumably that is
    not the page body, and the HTML has to be read from the browser object
    afterwards (e.g. its ``parsed`` property) -- confirm against the
    RoboBrowser documentation.
    """

    # Read the target URL from the POST data; if 'p' is missing, the default
    # False is stringified, so p becomes the literal string 'False'.
    p=str(request.POST.get('p', False)) # p='https://www.yahoo.com/'
    # history=True is passed through to RoboBrowser unchanged.
    browser = RoboBrowser(history=True)
    # Fetch the page; whatever open() returns is kept and sent to the client.
    postedmessage = browser.open(p)
    return HttpResponse(postedmessage)

How can I return all the page's HTML?


回答1:


You can try using the parsed property.

Code:

# Fetch a page with RoboBrowser and print its parsed HTML.
from robobrowser import RoboBrowser

url = "http://www.google.com"
br = RoboBrowser(history=True)
# open() loads the page into the browser; the document is then read from
# the `parsed` property rather than from open()'s return value.
br.open(url)
# Function-call form of print works on both Python 2 and Python 3
# (the original `print br.parsed` statement is a SyntaxError on Python 3).
print(br.parsed)

Result:

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en-PH"><head><meta content="/images/google_favicon_128.png" itemprop="image"/><title>Google</title><script>(function(){
window.google={kEI:"-RFgU9LgJsq6uATKqYGoDg",getEI:function(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||google.kEI},https:function(){return"https:"==window.location.protocol},kEXPI:"17259,4000116,4007661,4007830,4008142,4009033,4009641,4010806,4010858,4010899,4011228,4011258,4011679,4012373,4012504,4012860,4013374,4013414,4013591,4013723,4013758,4013787,4013823,4013941,4013967,4013979,4014016,4014033,4014093,4014431,4014515,4014636,4014671,4014810,4014813,4014828,4014991,4015119,4015155,4015234,4015260,4015519,4015550,4015587,4015635,4015638,4015639,4015772,4015901,4016005,4016042,4016127,4016309,4016363,4016367,4016371,4016391,4016425,4016452,4016466,4016479,4016487,4016638,4016703,4016730,4016786,4016800,4016824,4016855,4016933,4016969,4016978,4017042,4017079,4017177,4017202,4017206,4017280,4017285,4017294,4017301,4017336,4017556,8300015,8300017,8500165,8500223,8500240,8500252,8500255,8500283,8500306,8500313,8500325,8500332,8500349,10200002,10200012,10200029,10200038,10200040,10200048,10200053,10200055,10200066,10200083,10200120,10200134,10200136,10200155,10200157,10200159,10200164,10200195,10200211,10200215,10200221,10200224,10200231,10200236,10200242,10200246,10200252",kCSI:{e:"17259,4000116,4007661,4007830,4008142,4009033,4009641,4010806,4010858,4010899,4011228,4011258,4011679,4012373,4012504,4012860,4013374,4013414,4013591,4013723,4013758,4013787,4013823,4013941,4013967,4013979,4014016,4014033,4014093,4014431,4014515,4014636,4014671,4014810,4014813,4014828,4014991,4015119,4015155,4015234,4015260,4015519,4015550,4015587,4015635,4015638,4015639,4015772,4015901,4016005,4016042,4016127,4016309,4016363,4016367,4016371,4016391,4016425,4016452,4016466,4016479,4016487,4016638,4016703,4016730,4016786,4016800,4016824,4016855,4016933,4016969,4016978,4017042,4017079,4017177,4017202,4017206,4017280,4017285,4017294,4017301,4017336,4017556,8300015,8300017,8500165,8500223,8500240,8500252,8500255,8500283,8500306,8500313,8500325,850033
2,8500349,10200002,10200012,10200029,10200038,10200040,10200048,10200053,10200055,10200066,10200083,10200120,10200134,10200136,10200155,10200157,10200159,10200164,10200195,10200211,10200215,10200221,10200224,10200231,10200236,10200242,10200246,10200252",ei:"-RFgU9LgJsq6uATKqYGoDg"},authuser:0,ml:function(){},kHL:"en",time:function(){return(new Date).getTime()},log:function(a,b,c,h,k){var d=
new Image,f=google.lc,e=google.li,g="";d.onerror=d.onload=d.onabort=function(){delete f[e]};f[e]=d;c||-1!=b.search("&ei=")||(g="&ei="+google.getEI(h));c=c||"/"+(k||"gen_204")+"?atyp=i&ct="+a+"&cad="+b+g+"&zx="+google.time();a=/^http:/i;a.test(c)&&google.https()?(google.ml(Error("GLMM"),!1,{src:c}),delete f[e]):(d.src=c,google.li=e+1)},lc:[],li:0,y:{},x:function(a,b){google.y[a.id]=[a,b];return!1},load:function(a,b,c){google.x({id:a+l++},function(){google.load(a,b,c)})}};var l=0;})();
(function(){google.sn="webhp";google.timers={};google.startTick=function(a,b){var f=google.time();google.timers[a]={t:{start:f},bfr:!!b};};google.tick=function(a,b,f){google.timers[a]||google.startTick(a);google.timers[a].t[b]=f||google.time()};google.startTick("load",!0);
try{}catch(d){}})();
var _gjwl=location;function _gjuc(){var a=_gjwl.href.indexOf("#");if(0<=a&&(a=_gjwl.href.substring(a),0<a.indexOf("&q=")||0<=a.indexOf("#q="))&&(a=a.substring(1),-1==a.indexOf("#"))){for(var d=0;d<a.length;){var b=d;"&"==a.charAt(b)&&++b;var c=a.indexOf("&",b);-1==c&&(c=a.length);b=a.substring(b,c);if(0==b.indexOf("fp="))a=a.substring(0,d)+a.substring(c,a.length),c=d;else if("cad=h"==b)return 0;d=c}_gjwl.href="/search?"+a+"&cad=h";return 1}return 0}
function _gjh(){!_gjuc()&&window.google&&google.x&&google.x({id:"GJH"},function(){google.nav&&google.nav.gjh&&google.nav.gjh()})};
window._gjh&&_gjh();</script><style>#gbar,#guser{font-size:13px;padding-top:1px !important;}#gbar{height:22px}#guser{padding-bottom:7px !important;text-align:right}.gbh,.gbd{border-top:1px solid #c9d7f1;font-size:1px}.gbh{height:0;position:absolute;top:24px;width:100%}@media all{.gb1{height:22px;margin-right:.5em;vertical-align:top}#gbar{float:left}}a.gb1,a.gb4{text-decoration:underline !important}a.gb1,a.gb4{color:#00c !important}.gbi .gb4{color:#dd8e27 !important}.gbf .gb4{color:#900 !important}</style><style>body,td,a,p,.h{font-family:arial,sans-serif}body{margin:0;overflow-y:scroll}#gog{padding:3px 8px 0}td{line-height:.8em}.gac_m td{line-height:17px}form{margin-bottom:20px}.h{color:#36c}.q{color:#00c}.ts td{padding:0}.ts{border-collapse:collapse}em{font-weight:bold;font-style:normal}.lst{height:25px;width:496px}.gsfi,.lst{font:18px arial,sans-serif}.gsfs{font:17px arial,sans-serif}.ds{display:inline-box;display:inline-block;margin:3px 0 4px;margin-left:4px}input{font-family:inherit}a.gb1,a.gb2,a.gb3,a.gb4{color:#11c !important}body{background:#fff;color:black}a{color:#11c;text-decoration:none}a:hover,a:active{text-decoration:underline}.fl a{color:#36c}a:visited{color:#551a8b}a.gb1,a.gb4{text-decoration:underline}a.gb3:hover{text-decoration:none}#ghead a.gb2:hover{color:#fff !important}.sblc{padding-top:5px}.sblc a{display:block;margin:2px 0;margin-left:13px;font-size:11px}.lsbb{background:#eee;border:solid 1px;border-color:#ccc #999 #999 #ccc;height:30px}.lsbb{display:block}.ftl,#fll a{display:inline-block;margin:0 12px}.lsb{background:url(/images/srpr/nav_logo80.png) 0 -258px repeat-x;border:none;color:#000;cursor:pointer;height:30px;margin:0;outline:0;font:15px arial,sans-serif;vertical-align:top}.lsb:active{background:#ccc}.lst:focus{outline:none}#addlang a{padding:0 3px}</style><script></script></head><body bgcolor="#fff"><script>(function(){var src='/images/nav_logo176.png';var iesg=false;document.body.onload = function(){window.n && window.n();if 
(document.images){new Image().src=src;}
if (!iesg){document.f&&document.f.q.focus();document.gbqf&&document.gbqf.q.focus();}
}
})();</script><textarea id="csi" style="display:none"></textarea><div id="mngb"> <div id="gbar"><nobr><b class="gb1">Search</b> <a class="gb1" href="http://www.google.com.ph/imghp?hl=en&amp;tab=wi">Images</a> <a class="gb1" href="http://maps.google.com.ph/maps?hl=en&amp;tab=wl">Maps</a> <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a> <a class="gb1" href="http://www.youtube.com/?gl=PH&amp;tab=w1">YouTube</a> <a class="gb1" href="http://news.google.com.ph/nwshp?hl=en&amp;tab=wn">News</a> <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a> <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a> <a class="gb1" href="http://www.google.com.ph/intl/en/options/" style="text-decoration:none"><u>More</u> »</a></nobr></div><div id="guser" width="100%"><nobr><span class="gbi" id="gbn"></span><span class="gbf" id="gbf"></span><span id="gbe"></span><a class="gb4" href="http://www.google.com.ph/history/optout?hl=en">Web History</a> | <a class="gb4" href="/preferences?hl=en">Settings</a> | <a class="gb4" href="https://accounts.google.com/ServiceLogin?hl=en&amp;continue=http://www.google.com.ph/%3Fgfe_rd%3Dcr%26ei%3D-BFgU62INOmNiAeYnYDoAg" id="gb_70" target="_top">Sign in</a></nobr></div><div class="gbh" style="left:0"></div><div class="gbh" style="right:0"></div> </div><center><br clear="all" id="lgpd"/><div id="lga"><div style="padding:28px 0 3px"><div align="left" id="hplogo" onload="window.lol&amp;&amp;lol()" style="height:110px;width:276px;background:url(/images/srpr/logo9w.png) no-repeat" title="Google"><div nowrap="" style="color:#777;font-size:16px;font-weight:bold;position:relative;top:70px;left:218px">Philippines</div></div></div><br/></div><form action="/search" name="f"><table cellpadding="0" cellspacing="0"><tr valign="top"><td width="25%"> </td><td align="center" nowrap=""><input name="ie" type="hidden" value="ISO-8859-1"/><input name="hl" type="hidden" value="en-PH"/><input name="source" type="hidden" 
value="hp"/><div class="ds" style="height:32px;margin:4px 0"><input autocomplete="off" class="lst" maxlength="2048" name="q" size="57" style="color:#000;margin:0;padding:5px 8px 0 6px;vertical-align:top" title="Google Search" value=""/></div><br style="line-height:0"/><span class="ds"><span class="lsbb"><input class="lsb" name="btnG" type="submit" value="Google Search"/></span></span><span class="ds"><span class="lsbb"><input class="lsb" name="btnI" onclick="if(this.form.q.value)this.checked=1; else top.location='/doodles/'" type="submit" value="I'm Feeling Lucky"/></span></span></td><td align="left" class="fl sblc" nowrap="" width="25%"><a href="/advanced_search?hl=en-PH&amp;authuser=0">Advanced search</a><a href="/language_tools?hl=en-PH&amp;authuser=0">Language tools</a></td></tr></table><input id="gbv" name="gbv" type="hidden" value="1"/></form><div id="gac_scont"></div><div style="font-size:83%;min-height:3.5em"><br/><div id="als"><font id="addlang" size="-1">Google.com.ph offered in: <a href="http://www.google.com.ph/setprefs?sig=0_SbhOaVIheKTw2jFHRcEg8o-Evng%3D&amp;hl=tl&amp;source=homepage">Filipino</a> <a href="http://www.google.com.ph/setprefs?sig=0_SbhOaVIheKTw2jFHRcEg8o-Evng%3D&amp;hl=ceb&amp;source=homepage">Cebuano</a></font><br/><br/></div></div><span id="footer"><div style="font-size:10pt"><div id="fll" style="margin:19px auto;text-align:center"><a href="/intl/en/ads/">Advertising Programs</a><a href="http://www.google.com.ph/intl/en/services/">Business Solutions</a><a href="/intl/en/about.html">About Google</a><a href="http://www.google.com.ph/setprefdomain?prefdom=US&amp;sig=0_dQ2pwXFotFQfDlj9qmDCkzdxCdA%3D" id="fehl">Google.com</a></div></div><p style="color:#767676;font-size:8pt">© 2013 - <a href="/intl/en/policies/">Privacy &amp; Terms</a></p></span></center><div id="xjsd"></div><div data-jiis="bp" id="xjsi"><script>if(google.y)google.y.first=[];(function(){function b(a){window.setTimeout(function(){var 
c=document.createElement("script");c.src=a;document.getElementById("xjsd").appendChild(c)},0)}google.dljp=function(a){google.xjsu=a;b(a)};google.dlj=b;})();
if(!google.xjs){window._=window._||{};window._._DumpException=function(e){throw e};if(google.timers&&google.timers.load.t){google.timers.load.t.xjsls=new Date().getTime();}google.dljp('/xjs/_/js/k\x3dxjs.hp.en_US.RLLpSOAzMFM.O/m\x3dsb_he,pcc/rt\x3dj/d\x3d1/sv\x3d1/rs\x3dAItRSTOBXfxSyWrXjOGBi9e9cIs5cEBO6A');google.xjs=1;}google.pmc={"sb_he":{"agen":true,"cgen":true,"client":"heirloom-hp","dh":true,"ds":"","eqch":true,"fl":true,"host":"google.com.ph","jam":0,"jsonp":true,"msgs":{"cibl":"Clear Search","dym":"Did you mean:","lcky":"I\u0026#39;m Feeling Lucky","lml":"Learn more","oskt":"Input tools","psrc":"This search was removed from your \u003Ca href=\"/history\"\u003EWeb History\u003C/a\u003E","psrl":"Remove","sbit":"Search by image","srch":"Google Search"},"ovr":{},"pq":"","qcpw":false,"scd":10,"sce":5,"stok":"QGTPqfOgiEZ_AI3e5vphR6-NOmw"},"pcc":{}};google.y.first.push(function(){if(google.med){google.med('init');google.initHistory();google.med('history');}});if(google.j&&google.j.en&&google.j.xi){window.setTimeout(google.j.xi,0);}</script></div><script>(function(){if(google.timers&&google.timers.load.t){var b,c,d,e,g=function(a,f){a.removeEventListener?(a.removeEventListener("load",f,!1),a.removeEventListener("error",f,!1)):(a.detachEvent("onload",f),a.detachEvent("onerror",f))},h=function(a){e=(new Date).getTime();++c;a=a||window.event;a=a.target||a.srcElement;g(a,h)},k=document.getElementsByTagName("img");b=k.length;for(var l=c=0,m;l<b;++l)m=k[l],m.complete||"string"!=typeof m.src||!m.src?++c:m.addEventListener?(m.addEventListener("load",h,!1),m.addEventListener("error",
h,!1)):(m.attachEvent("onload",h),m.attachEvent("onerror",h));d=b-c;var n=function(){if(google.timers.load.t){google.timers.load.t.ol=(new Date).getTime();google.timers.load.t.iml=e;google.kCSI.imc=c;google.kCSI.imn=b;google.kCSI.imp=d;void 0!==google.stt&&(google.kCSI.stt=google.stt);google.csiReport&&google.csiReport()}};window.addEventListener?window.addEventListener("load",n,!1):window.attachEvent&&
window.attachEvent("onload",n);google.timers.load.t.prt=e=(new Date).getTime()};})();
</script></body></html>
[Finished in 4.3s]

How it scrapes is up to you, though. In any case, if you're having difficulties getting to know new libraries such as this, always exhaust the documentation.

From the site itself:

parsed
    Lazily parse response content, using HTML parser specified by the browser.

Source

Or you can call dir(br) to list the browser object's attributes and get the following:

['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_build_url', '_cursor', '_maxlen', '_states', '_traverse', '_update_state', 'back', 'find', 'find_all', 'follow_link', 'forward', 'get_form', 'get_forms', 'get_link', 'get_links', 'history', 'open', 'parsed', 'parser', 'response', 'select', 'session', 'state', 'submit_form', 'timeout', 'url']
[Finished in 5.3s]

As you can see, towards the end, parsed is shown.

Hope this helps.



来源:https://stackoverflow.com/questions/23372311/how-to-return-html-of-a-page-using-robobrowser

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!