from bs4 import BeautifulSoup

# Build a parse tree using Python's built-in HTML parser.
soup = BeautifulSoup("<html>A Html Text</html>", "html.parser")
| Parser | Usage |
| --- | --- |
| lxml HTML | `BeautifulSoup(html, "lxml")` |
| lxml XML | `BeautifulSoup(html, ["lxml", "xml"])` or `BeautifulSoup(html, "xml")` |
| html5lib | `BeautifulSoup(html, "html5lib")` |
# prettify() returns the parse tree as an indented string. Keep the
# parentheses: a bare `soup.prettify` evaluates to the bound method itself,
# not to the formatted markup.
soup.prettify()
# A Tag object corresponds to an element in the original markup.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
type(tag)  # <class 'bs4.element.Tag'>
Name
# Every tag exposes its element name through .name.
tag.name # 'b'
Attributes
# Attributes are accessed like dictionary items. `class` is a multi-valued
# attribute, so its value is always a list, even with a single class.
tag['class']     # ['boldest']
tag.attrs        # {'class': ['boldest']}
type(tag.attrs)  # <class 'dict'>
# A multi-valued attribute such as class is returned as a list of its values.
soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
print(soup.p['class'])  # ['body', 'strikeout']
print(soup.p.attrs)     # {'class': ['body', 'strikeout']}
# id is not multi-valued, so a value containing a space stays one string.
soup = BeautifulSoup('<p id="my id"></p>', 'html.parser')
print(soup.p['id'])  # my id
# The text inside a tag is exposed as a NavigableString via .string.
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
s = soup.b.string
print(s)        # Extremely bold
print(type(s))  # <class 'bs4.element.NavigableString'>
# An HTML comment is parsed into a Comment object; printing it shows only
# the comment text, without the <!-- --> markers.
soup = BeautifulSoup("<b><!--This is a comment--></b>", 'html.parser')
comment = soup.b.string
print(comment)        # This is a comment
print(type(comment))  # <class 'bs4.element.Comment'>
# .contents lists a tag's direct children, including whitespace-only strings
# (the newlines between the tags below).
soup = BeautifulSoup("""<div>
<span>test</span>
</div>
""", 'html.parser')
element = soup.div.contents
print(element)  # ['\n', <span>test</span>, '\n']
# When a tag has exactly one child, .string descends through the chain of
# single children until it reaches the text.
soup = BeautifulSoup("""<div>
<p><span><b>test</b></span></p>
</div>
""", 'html.parser')
element = soup.p.string
print(element)        # test
print(type(element))  # <class 'bs4.element.NavigableString'>
# .stripped_strings yields every text node with surrounding whitespace
# removed and skips strings that are whitespace only.
soup = BeautifulSoup("""<div>
<p> </p>
<p>test 1</p>
<p>test 2</p>
</div>
""", 'html.parser')
element = soup.div.stripped_strings
print(list(element))  # ['test 1', 'test 2']
# .next_sibling / .previous_sibling move between nodes that share a parent.
# The three tags are written back to back, so no whitespace string sits
# between them.
soup = BeautifulSoup("""<div>
<p>test 1</p><b>test 2</b><h>test 3</h></div>
""", 'html.parser')
print(soup.b.next_sibling)      # <h>test 3</h>
print(soup.b.previous_sibling)  # <p>test 1</p>
print(soup.h.next_sibling)      # None
# Sample document used by the find_all() examples.
html = """
<div>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a></p>
</div>
"""
soup = BeautifulSoup(html, 'html.parser')
# A string filter matches tags whose name equals that string.
soup.find_all('b') # [<b>The Dormouse's story</b>]
import re

# A compiled regular expression is matched against each tag name
# (here: every tag whose name starts with "b").
soup.find_all(re.compile("^b"))  # [<b>The Dormouse's story</b>]
# A list filter matches tags whose name equals any item in the list.
soup.find_all(["a", "b"])
True
# True matches every tag in the document (text strings are not included).
soup.find_all(True)
def has_class_but_no_id(tag):
    """Return True for a tag that defines ``class`` but not ``id``.

    Intended as a filter function for ``find_all()``, which calls it once
    per tag and keeps the tags for which it returns a true value.
    """
    return tag.has_attr('class') and not tag.has_attr('id')


print(soup.find_all(has_class_but_no_id))
[<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a></p>]
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
# `data-foo` is not a valid Python identifier, so it cannot be used as a
# keyword argument. The call below is deliberately left commented out
# because it does not even parse (the exact message varies by Python version):
#     print(data_soup.find_all(data-foo="value"))
#     SyntaxError: keyword can't be an expression
# Attribute names that are not valid Python identifiers go in the attrs dict.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')
print(data_soup.find_all(attrs={"data-foo": "value"}))
# [<div data-foo="value">foo!</div>]
css_soup = BeautifulSoup('<p class="body bold strikeout"></p>', 'html.parser')

# Searching for any single CSS class, or for the exact full class string,
# finds the tag. Each of these three calls prints the same result.
print(css_soup.find_all("p", class_="strikeout"))
print(css_soup.find_all("p", class_="body"))
print(css_soup.find_all("p", class_="body bold strikeout"))
# [<p class="body bold strikeout"></p>]

# A multi-class string must equal the attribute value exactly: a partial or
# reordered string matches nothing.
print(css_soup.find_all("p", class_="body strikeout"))
print(css_soup.find_all("p", class_="strikeout body"))
# []
# Calling a BeautifulSoup object (or a Tag) is shorthand for find_all(), so
# these two lines are equivalent.
soup.find_all('b')
soup('b')
from bs4 import BeautifulSoup

# Sample document used by the CSS-selector examples below.
html = """
<html>
<head><title>标题</title></head>
<body>
<p class="title" name="dromouse"><b>标题</b></p>
<div name="divlink">
<p>
<a href="http://example.com/1" class="sister" id="link1">链接1</a>
<a href="http://example.com/2" class="sister" id="link2">链接2</a>
<a href="http://example.com/3" class="sister" id="link3">链接3</a>
</p>
</div>
<p></p>
<div name='dv2'></div>
</body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')

# Select by tag name
print(soup.select('title'))
# [<title>标题</title>]

# Select by descending through nested tags
print(soup.select("html head title"))
# [<title>标题</title>]

# Select by class
print(soup.select('.sister'))
# [<a class="sister" href="http://example.com/1" id="link1">链接1</a>,
#  <a class="sister" href="http://example.com/2" id="link2">链接2</a>,
#  <a class="sister" href="http://example.com/3" id="link3">链接3</a>]

# Select by id
print(soup.select('#link1, #link2'))
# [<a class="sister" href="http://example.com/1" id="link1">链接1</a>,
#  <a class="sister" href="http://example.com/2" id="link2">链接2</a>]

# Combined selector (descendant of <p> with the given id)
print(soup.select('p #link1'))
# [<a class="sister" href="http://example.com/1" id="link1">链接1</a>]

# Select direct children
print(soup.select("head > title"))
# [<title>标题</title>]

print(soup.select("p > #link1"))
# [<a class="sister" href="http://example.com/1" id="link1">链接1</a>]

print(soup.select("p > a:nth-of-type(2)"))
# [<a class="sister" href="http://example.com/2" id="link2">链接2</a>]
# nth-of-type is a CSS pseudo-class selector

# Select following siblings (searches forward only)
print(soup.select("#link1 ~ .sister"))
# [<a class="sister" href="http://example.com/2" id="link2">链接2</a>,
#  <a class="sister" href="http://example.com/3" id="link3">链接3</a>]

print(soup.select("#link1 + .sister"))
# [<a class="sister" href="http://example.com/2" id="link2">链接2</a>]

# Select by attribute value
print(soup.select('a[href="http://example.com/1"]'))

# ^= : attribute value starts with the given text
print(soup.select('a[href^="http://example.com/"]'))

# *= : attribute value contains the given text
print(soup.select('a[href*=".com/"]'))

# Select tags that have the given attribute at all
print(soup.select('[name]'))

# Return only the first matching element
print(soup.select_one(".sister"))