Python：bs4的使用 | 易学教程

from bs4 import BeautifulSoup  soup = BeautifulSoup("<html>A Html Text</html>", "html.parser")



lxml HTML	BeautifulSoup(html, "lxml")
lxml XML	BeautifulSoup(html, ["lxml", "xml"]) BeautifulSoup(html, "xml")
html5lib	BeautifulSoup(html, "html5lib")

soup.prettify()  # the parentheses are required: a bare `soup.prettify` is only the bound method, not the formatted markup string

soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')  tag = soup.b  type(tag)  # <class 'bs4.element.Tag'>

Name

tag.name # 'b'

Attributes

# `class` is a multi-valued attribute, so Beautiful Soup returns its value as
# a list even when the markup contains a single class name.
tag['class']      # ['boldest']
tag.attrs         # {'class': ['boldest']}
type(tag.attrs)   # <class 'dict'>

soup = BeautifulSoup('<p class="body strikeout"></p>')  print(soup.p['class'])  # ['body', 'strikeout']  print(soup.p.attrs)     # {'class': ['body', 'strikeout']}

soup = BeautifulSoup('<p id="my id"></p>', 'html.parser') print(soup.p['id'])    # 'my id'

soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')  s = soup.b.string  print(s)        # Extremely bold  print(type(s))  # <class 'bs4.element.NavigableString'>

soup = BeautifulSoup("<b><!--This is a comment--></b>")  comment = soup.b.string  print(comment)          # This is a comment  print(type(comment))    # <class 'bs4.element.Comment'>

# The line breaks between the tags are text nodes of their own, so they
# appear in .contents as '\n' strings on either side of the <span> tag.
soup = BeautifulSoup("""<div>
<span>test</span>
</div>
""")

element = soup.div.contents

print(element)          # ['\n', <span>test</span>, '\n']

soup = BeautifulSoup("""<div>     <p><span><b>test</b></span></p> </div> """)  element = soup.p.string  print(element)          # test  print(type(element))    # <class 'bs4.element.NavigableString'>

soup = BeautifulSoup("""<div>     <p>      </p>     <p>test 1</p>     <p>test 2</p> </div> """, 'html.parser')  element = soup.div.stripped_strings  print(list(element))          # ['test 1', 'test 2']

soup = BeautifulSoup("""<div>     <p>test 1</p><b>test 2</b><h>test 3</h></div> """, 'html.parser')  print(soup.b.next_sibling)      # <h>test 3</h>  print(soup.b.previous_sibling)  # <p>test 1</p>  print(soup.h.next_sibling)      # None

html = """ <div>     <p class="title"><b>The Dormouse's story</b></p>     <p class="story">Once upon a time there were three little sisters; and their names were     <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,     <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and     <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a></p> </div> """  soup = BeautifulSoup(html, 'html.parser')

soup.find_all('b')  # [<b>The Dormouse's story</b>]

import re  # required for re.compile below; not imported anywhere else in this walkthrough

# A compiled pattern is matched against tag names, so "^b" selects every tag
# whose name starts with the letter "b".
soup.find_all(re.compile("^b"))  # [<b>The Dormouse's story</b>]

soup.find_all(["a", "b"])

True

soup.find_all(True)

def has_class_but_no_id(tag):
    """Return True when *tag* has a ``class`` attribute and no ``id`` attribute."""
    if tag.has_attr('id'):
        return False
    return tag.has_attr('class')


# find_all accepts a function and keeps every tag for which it returns True.
matches = soup.find_all(has_class_but_no_id)
print(matches)

[<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were     <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a></p>]

data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')  print(data_soup.find_all(data-foo="value"))  # SyntaxError: keyword can't be an expression

data_soup = BeautifulSoup('<div data-foo="value">foo!</div>')  print(data_soup.find_all(attrs={"data-foo": "value"}))  # [<div data-foo="value">foo!</div>]

css_soup = BeautifulSoup('<p class="body bold strikeout"></p>')

# A single class name matches any tag that carries that class, and the exact
# class string as written in the markup matches too.
print(css_soup.find_all("p", class_="strikeout"))
print(css_soup.find_all("p", class_="body"))
print(css_soup.find_all("p", class_="body bold strikeout"))
# each prints: [<p class="body bold strikeout"></p>]

# A multi-class string is compared against the whole attribute value, so a
# subset of the classes or a different order finds nothing.
print(css_soup.find_all("p", class_="body strikeout"))
print(css_soup.find_all("p", class_="strikeout body"))
# each prints: []

soup.find_all('b')  soup('b')

from bs4 import BeautifulSoup

# Sample document used by every selector example below.
html = """ <html> <head><title>标题</title></head> <body>  <p class="title" name="dromouse"><b>标题</b></p>  <div name="divlink">   <p>    <a href="http://example.com/1" class="sister" id="link1">链接1</a>    <a href="http://example.com/2" class="sister" id="link2">链接2</a>    <a href="http://example.com/3" class="sister" id="link3">链接3</a>   </p>  </div>  <p></p>  <div name='dv2'></div> </body> </html> """

soup = BeautifulSoup(html, 'lxml')

# Select by tag name.
print(soup.select('title'))             # [<title>标题</title>]

# Select by walking down through nested tags.
print(soup.select("html head title"))   # [<title>标题</title>]

# Select by class.
print(soup.select('.sister'))
# [<a class="sister" href="http://example.com/1" id="link1">链接1</a>,
#  <a class="sister" href="http://example.com/2" id="link2">链接2</a>,
#  <a class="sister" href="http://example.com/3" id="link3">链接3</a>]

# Select by id; a comma-separated selector list matches either id.
print(soup.select('#link1, #link2'))
# [<a class="sister" href="http://example.com/1" id="link1">链接1</a>,
#  <a class="sister" href="http://example.com/2" id="link2">链接2</a>]

# Combined lookup: the element with id "link1" anywhere inside a <p>.
print(soup.select('p #link1'))
# [<a class="sister" href="http://example.com/1" id="link1">链接1</a>]

# Direct children only.
print(soup.select("head > title"))
# [<title>标题</title>]
print(soup.select("p > #link1"))
# [<a class="sister" href="http://example.com/1" id="link1">链接1</a>]
print(soup.select("p > a:nth-of-type(2)"))  # nth-of-type is a CSS selector
# [<a class="sister" href="http://example.com/2" id="link2">链接2</a>]

# Sibling lookup (searches forward from the reference element).
print(soup.select("#link1 ~ .sister"))
# [<a class="sister" href="http://example.com/2" id="link2">链接2</a>,
#  <a class="sister" href="http://example.com/3" id="link3">链接3</a>]
print(soup.select("#link1 + .sister"))
# [<a class="sister" href="http://example.com/2" id="link2">链接2</a>]

# Select by attribute value.
print(soup.select('a[href="http://example.com/1"]'))

# ^= : the attribute value starts with the given text.
print(soup.select('a[href^="http://example.com/"]'))

# *= : the attribute value contains the given text.
print(soup.select('a[href*=".com/"]'))

# Select every tag that has the given attribute at all.
print(soup.select('[name]'))

# Return only the first matching element.
print(soup.select_one(".sister"))

文章来源: Python：bs4的使用

标签

sister

test

python

soup

element

class