Where can I find some “hello world”-simple Beautiful Soup examples?

后端 未结 2 1220
半阙折子戏
半阙折子戏 2021-02-04 16:55

I'd like to do a very simple replacement using Beautiful Soup. Let's say I want to visit all A tags in a page and append "?foo" to their href. Can someone post or link to an example?

2条回答
  •  野性不改
    2021-02-04 17:43

    my example:

    # Request headers passed along with every page fetch.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "ru,en-us;q=0.7,en;q=0.3",
        "Accept-Charset": "windows-1251,utf-8;q=0.7,*;q=0.7",
        "Accept-Encoding": "identity, *;q=0",
        "Connection": "Keep-Alive",
    }
    # HTTP proxy address (inserted into "http://<PROXY>/"); None disables proxying.
    PROXY = None
    # Default socket timeout handed to socket.setdefaulttimeout().
    timeout = 60
    
    
    def _cell_text(cell):
        """Return all text nodes of *cell* joined together and stripped."""
        # An empty cell yields "" instead of failing on ``contents[0]``.
        return "".join(cell.findAll(text=True)).strip(" \t\n")


    def _decode_entities(text):
        """Drop ``&nbsp;`` and decode ``&quot;`` / ``&amp;`` left in *text*."""
        text = text.replace("&nbsp;", "").replace("&quot;", '"')
        # ``&amp;`` goes last so that "&amp;quot;" is not decoded twice.
        return text.replace("&amp;", "&")


    def parse_manuf_page_about(page_str_about):
        """Download a manufacturer "about us" page and extract its details.

        Args:
            page_str_about: URL of the page to download.

        Returns:
            A ``(res, slovar)`` tuple.  ``res`` is ``False`` (and ``slovar`` is
            empty) when the page cannot be downloaded or does not contain the
            expected ``div`` blocks; otherwise ``res`` is ``True`` and
            ``slovar`` maps each table header text to its cell text, plus the
            keys "Company_Info", "Contact_Person" (only when a contact link
            is present) and "Gold member" ("Y" or "N").
        """
        slovar = {}
        socket.setdefaulttimeout(timeout)
        if PROXY is not None:
            proxy_handler = urllib2.ProxyHandler({"http": "http://" + PROXY + "/"})
            urllib2.install_opener(urllib2.build_opener(proxy_handler))
        page_request = urllib2.Request(url=page_str_about, headers=HEADERS)
        try:
            response = urllib2.urlopen(url=page_request)
            try:
                page = response.read()
            finally:
                # Release the connection even when reading fails.
                response.close()
        except Exception as error:
            print(str(error))
            return False, slovar

        soup = BeautifulSoup(page)

        # Company description: first <p> inside the "win aboutUs" block.
        about_blocks = soup.findAll('div', {"class": "win aboutUs"})
        paragraphs = about_blocks[0].findAll("p") if about_blocks else []
        table_blocks = soup.findAll('div', {"class": "win"})
        if not paragraphs or not table_blocks:
            # Unexpected page layout: report failure instead of raising IndexError.
            return False, slovar
        # Stored up front so it is kept even when the table has no rows.
        slovar["Company_Info"] = " ".join(paragraphs[0].findAll(text=True)).strip(" \t\n")

        for row in table_blocks[0].findAll("tr"):
            headers = row.findAll("th")
            values = row.findAll("td")
            if not headers or not values:
                # Skip incomplete rows rather than reuse the previous row's text.
                continue
            # As before, the last <th>/<td> of the row wins.
            key = _cell_text(headers[-1])
            # Replace unwanted browser entities in the value.
            slovar[str(key)] = _decode_entities(_cell_text(values[-1]))

        # Parse the contact block and take the contact name from it.
        select_contact = soup.findAll('a', {"class": "member-name"})
        if select_contact:
            slovar["Contact_Person"] = select_contact[-1].contents[0]

        # Gold-partner status is inferred from the presence of the member logo link.
        select_gold_part = soup.findAll('a', {"class": "memberLogo"})
        slovar["Gold member"] = "Y" if select_gold_part else "N"
        return True, slovar
    

    This code parses a single manufacturer's page on Alibaba.com. You can see the page here: http://xmxinhuafeng.en.alibaba.com/aboutus.html

提交回复
热议问题