First, let's walk through what each part of the code does:
These are the modules to import:
from os import path
import requests
from bs4 import BeautifulSoup
import json
import pymysql
import numpy as np
import time
Set the request URL:
# Request URL
url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'
To avoid being blocked by anti-scraping measures, masquerade as a browser:
# To avoid anti-scraping, masquerade as a browser:
# Create the header information
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
response = requests.get(url, headers=headers)  # Send the network request
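Note that if the site is slow or returns an error page, every later step fails with confusing messages. A slightly more defensive version of the same request adds a timeout and a status check; this is just a sketch, and the 10-second timeout is an arbitrary choice rather than something from the original script:

import requests

url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

# timeout and raise_for_status() are robustness additions, not part of the original script
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx response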
Decode the page content (optionally printing it to inspect it):
# print(response.content.decode('utf-8'))  # Print the page source, decoded from the byte stream
content = response.content.decode('utf-8')
# print(content)
Parse the page:
soup = BeautifulSoup(content, 'html.parser')  # Specify 'html.parser' as the BeautifulSoup parser
Next comes processing the extracted results:
'''
* find() returns the first matching tag
* find_all() returns a list of all matching results
'''
listA = soup.find_all(name='script', attrs={"id": "getAreaStat"})
# Worldwide confirmed cases
listB = soup.find_all(name='script', attrs={"id": "getListByCountryTypeService2"})
account = str(listA)  # Convert to a string
messages = account[52:-21]  # Slice from index 52 up to 21 characters from the end
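The offsets 52 and -21 are magic numbers, presumably found by inspecting the page source: they strip the JavaScript wrapper around the JSON payload (something like [<script id="getAreaStat">try { window.getAreaStat = at the front and }catch(e){}</script>] at the back), leaving only the JSON array. A less position-dependent way to get the same payload, sketched here as an alternative rather than as the author's method, is to take the script tag's own text and slice between the first '[' and the last ']':

# Sketch: extract the JSON array without hard-coded offsets.
# listA[0] is the <script id="getAreaStat"> tag found above.
script_text = listA[0].string      # the JavaScript inside the tag
start = script_text.find('[')      # first character of the JSON array
end = script_text.rfind(']') + 1   # one past its last character
messages = script_text[start:end]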
Convert the type:
messages_json = json.loads(messages)  # json.loads decodes a JSON string and returns the corresponding Python data types
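As a minimal standalone example of what this conversion does (the sample string here is made up):

import json

demo = '[{"provinceShortName": "湖北", "confirmedCount": 100}]'
parsed = json.loads(demo)                  # JSON array -> Python list of dicts
print(parsed[0].get('provinceShortName'))  # 湖北
print(parsed[0].get('curedCount'))         # None: .get() returns None for missing keys

This is also why the loops in the full script below can call .get() on each element: every element of messages_json is a plain Python dict.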
After that, the relevant data is appended to lists and the database operations are carried out.
The complete code is as follows:
from os import path
import requests
from bs4 import BeautifulSoup
import json
import pymysql
import numpy as np
import time

# Request URL
url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'
# To avoid anti-scraping, masquerade as a browser:
# Create the header information
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
response = requests.get(url, headers=headers)  # Send the network request
# print(response.content.decode('utf-8'))  # Print the page source, decoded from the byte stream
content = response.content.decode('utf-8')
# print(content)
soup = BeautifulSoup(content, 'html.parser')  # Specify 'html.parser' as the BeautifulSoup parser
listA = soup.find_all(name='script', attrs={"id": "getAreaStat"})
# Worldwide confirmed cases
listB = soup.find_all(name='script', attrs={"id": "getListByCountryTypeService2"})
# listA = soup.find_all(name='div', attrs={"class": "c-touchable-feedback c-touchable-feedback-no-default"})
account = str(listA)
# world_messages = str(listB)[87:-21]
messages = account[52:-21]
messages_json = json.loads(messages)
# world_messages_json = json.loads(world_messages)
valuesList = []
cityList = []
'''
worldList = []
for k in range(len(world_messages_json)):
    worldvalue = (world_messages_json[k].get('id'), world_messages_json[k].get('createTime'),
                  world_messages_json[k].get('modifyTime'), world_messages_json[k].get('tags'),
                  world_messages_json[k].get('countryType'), world_messages_json[k].get('continents'),
                  world_messages_json[k].get('provinceId'), world_messages_json[k].get('provinceName'),
                  world_messages_json[k].get('provinceShortName'), world_messages_json[k].get('cityName'),
                  world_messages_json[k].get('currentConfirmedCount'), world_messages_json[k].get('confirmedCount'),
                  world_messages_json[k].get('suspectedCount'), world_messages_json[k].get('curedCount'),
                  world_messages_json[k].get('deadCount'), world_messages_json[k].get('locationId'),
                  world_messages_json[k].get('countryShortCode'),)
    worldList.append(worldvalue)
'''
con = len(messages_json)  # city rows are numbered after the province rows
k = 0
for i in range(len(messages_json)):
    # value = messages_json[i]
    k = k + 1
    # One row per province: id, timestamp, province, city (None), counts, locationId
    value = (k, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
             messages_json[i].get('provinceShortName'), None,
             messages_json[i].get('confirmedCount'), messages_json[i].get('suspectedCount'),
             messages_json[i].get('curedCount'), messages_json[i].get('deadCount'),
             messages_json[i].get('locationId'))
    valuesList.append(value)
    cityValue = messages_json[i].get('cities')
    # print(cityValue)
    for j in range(len(cityValue)):
        con = con + 1
        # One row per city, carrying its parent province's short name
        cityValueList = (con, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
                         messages_json[i].get('provinceShortName'), cityValue[j].get('cityName'),
                         cityValue[j].get('confirmedCount'), cityValue[j].get('suspectedCount'),
                         cityValue[j].get('curedCount'), cityValue[j].get('deadCount'),
                         cityValue[j].get('locationId'))
        # print(cityValueList)
        cityList.append(cityValueList)
        # cityList.append(cityValue)
db = pymysql.connect("localhost", "root", "511924", "ceshi1", charset='utf8')
cursor = db.cursor()
array = np.asarray(valuesList[0])
# sql_clean_world = "TRUNCATE TABLE world_map"
# sql_clean_city = "TRUNCATE TABLE city_map"
# sql_clean_json = "TRUNCATE TABLE province_data_from_json"
sql_clean_province = "TRUNCATE TABLE info3"
# sql1 = "INSERT INTO city_map values (%s,%s,%s,%s,%s,%s,%s,%s)"
# sql_world = "INSERT INTO world_map values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
# sql = "INSERT INTO province_map values (0,'%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') "
sql = "INSERT INTO info3 values (%s,%s,%s,%s,%s,%s,%s,%s,%s) "
# sql = "INSERT INTO province_map (provinceName,provinceShortName,correntConfirmedCount,confirmedCount,suspectedCount,curedCount,deadCount,comment,locationId,statisticsData) values (0,'%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') "
# sql = """INSERT INTO province_map (provinceName,provinceShortName,correntConfirmedCount,confirmedCount,suspectedCount,curedCount,deadCount,comment,locationId,statisticsData) values ('湖北省', '湖北', 43334, 64786, 0, 18889, 2563, '', 420000, 'https://file1.dxycdn.com/2020/0223/618/3398299751673487511-135.json')"""
value_tuple = tuple(valuesList)
cityTuple = tuple(cityList)
# worldTuple = tuple(worldList)
# print(cityTuple)
# print(tuple(value_tuple))
try:
    # Empty the table before inserting the fresh snapshot
    # cursor.execute(sql_clean_city)
    cursor.execute(sql_clean_province)
    # cursor.executemany(sql, value_tuple)
    # cursor.executemany(sql1, cityTuple)
    db.commit()
except:
    print('Execution failed, rolling back (1)')
    db.rollback()
try:
    # Insert the province-level rows
    # cursor.execute(sql_clean_city)
    # cursor.execute(sql_clean_province)
    cursor.executemany(sql, value_tuple)
    # cursor.executemany(sql1, cityTuple)
    db.commit()
except:
    print('Execution failed, rolling back (3)')
    db.rollback()
try:
    # Insert the city-level rows
    # cursor.execute(sql_clean_city)
    # cursor.execute(sql_clean_province)
    # cursor.executemany(sql, value_tuple)
    cursor.executemany(sql, cityTuple)
    db.commit()
except:
    print('Execution failed, rolling back (4)')
    db.rollback()
# print(messages_json)
# print(account[52:-21])
# soupDiv = BeautifulSoup(listA, 'html.parser')
# listB = soupDiv.find_all(name='div', attrs={"class": "c-gap-bottom-zero c-line-clamp2"})
# for i in listA:
#     print(i)
# listA[12]
# print(listA)
db.close()
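The post never shows the schema of the info3 table, but each inserted tuple has nine fields: a running id, a timestamp, the province short name, the city name (None for province rows), four case counts, and the location id. If you want to run the script yourself, a table along the following lines should accept the inserts; this DDL is inferred from the tuples above, and the column names and types are assumptions, not taken from the original post:

import pymysql

# Inferred schema for info3; every name and type here is an assumption
# based on the 9-element tuples built above.
ddl = """
CREATE TABLE IF NOT EXISTS info3 (
    id INT,
    update_time DATETIME,
    province VARCHAR(50),
    city VARCHAR(50),
    confirmed INT,
    suspected INT,
    cured INT,
    dead INT,
    location_id INT
) DEFAULT CHARSET = utf8
"""

db = pymysql.connect(host='localhost', user='root', password='511924', database='ceshi1', charset='utf8')
try:
    with db.cursor() as cursor:
        cursor.execute(ddl)
    db.commit()
finally:
    db.close()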
Source: https://www.cnblogs.com/dazhi151/p/12461830.html