from bs4 import BeautifulSoup
from lxml import html, etree
file = 'hm.html'
htmlfile = open(file, 'r', encoding='utf-8')
htmlhandle = htmlfile.read()
soup = BeautifulSoup(htmlhandle, features='lxml')
#a = soup.text
a = soup.find_all(name='div', attrs={"class": "p"})[0].text  # text of the first <div class="p">
#a = soup.select('')
#print(a)  # the code above scrapes the page content
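# A small variant (just a sketch, assuming hm.html really contains a <div class="p">):
# the same lookup written with a CSS selector and a with-block so the file is closed
# automatically.
from bs4 import BeautifulSoup

with open('hm.html', 'r', encoding='utf-8') as f:        # file handle is closed automatically
    sketch_soup = BeautifulSoup(f.read(), features='lxml')

div_p = sketch_soup.select_one('div.p')                  # CSS-selector form of find_all(...)[0]
if div_p is not None:                                    # guard against a missing element
    print(div_p.get_text(strip=True))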
# scrape the URLs in the page
from bs4 import BeautifulSoup
from lxml import html, etree
file = 'hm.html'
htmlfile = open(file, 'r', encoding='utf-8')
htmlhandle = htmlfile.read()
soup = BeautifulSoup(htmlhandle, features='lxml')
#a = soup.find_all(name='li', attrs={"class": "last"})
#a = soup.a.attrs['href']
print(soup.select('a')[32]['href'])  # href of the 33rd <a> tag on the page
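# A more defensive sketch: soup.select('a')[32] depends on the exact link order in
# hm.html, so this variant just collects every href it can find on the page.
links = [tag.get('href') for tag in soup.find_all('a') if tag.get('href')]
for href in links:
    print(href)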
# get the title (the code below actually reads the chapter_update_time <div>)
from bs4 import BeautifulSoup
from lxml import html, etree
file = 'hm.html'
htmlfile = open(file, 'r', encoding='utf-8')
htmlhandle = htmlfile.read()
soup = BeautifulSoup(htmlhandle, features='lxml')
a = soup.find_all(name='div', attrs={"class": "chapter_update_time"})[0].text
print(a)
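# A sketch of the same lookup with surrounding whitespace stripped (it assumes the
# page has at least one <div class="chapter_update_time">).
update_time = soup.find('div', class_='chapter_update_time')
if update_time is not None:
    print(update_time.get_text(strip=True))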
# using a global variable
def ja(a, b):
    global c          # declare c as a global so it is visible outside the function
    c = a + b
    return c

def main():
    a = 1
    b = 2
    n = ja(a, b)      # n holds the return value; c is also set as a global
    print(c)          # prints 3 through the global c

if __name__ == '__main__':
    main()
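# For comparison, a sketch of the same sum without a global variable: returning the
# result and using the return value directly is the more common pattern.
def add_values(a, b):
    return a + b                 # no global needed

print(add_values(1, 2))          # 3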
# a small exercise in cleaning data with bs4
from bs4 import BeautifulSoup
from lxml import html, etree
file = 'hm.html'
htmlfile = open(file, 'r', encoding='utf-8')
htmlhandle = htmlfile.read()
soup = BeautifulSoup(htmlhandle, features='lxml')
a = soup.find_all(name='div', attrs={"class": "chapter_update_time"})[0].text
print(a)
# the same lookup written more compactly
from bs4 import BeautifulSoup
file = 'hm.html'
htmlfile = open(file, 'r', encoding='utf-8').read()
soup = BeautifulSoup(htmlfile, 'lxml')
a = soup.find_all(name='div', attrs={"class": "chapter_update_time"})[0].text
print(a)
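# A reusable sketch of the same cleaning step (the helper name and parameters are
# assumptions, not part of the exercise above).
from bs4 import BeautifulSoup

def first_div_text(path, class_name, encoding='utf-8'):
    """Return the stripped text of the first <div> with the given class, or None."""
    with open(path, 'r', encoding=encoding) as f:
        page = BeautifulSoup(f.read(), 'lxml')
    tag = page.find('div', class_=class_name)
    return tag.get_text(strip=True) if tag is not None else None

print(first_div_text('hm.html', 'chapter_update_time'))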
Source: CSDN
Author: IT小黑猪
Link: https://blog.csdn.net/weixin_46244909/article/details/104214512