问题
I am a novice at Python and helping out on a school project. Any help is much appreciated. THANKS. I get an error when it gets to the year 2004 and 2003. And it is caused by the result_list list. The error is "ValueError: arrays must all be same length". How can I introduce code that fixes this. The scores are important....
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
#from openpyxl.writer.excel import ExcelWriter
import openpyxl
#from openpyxl import load_workbook
import csv
year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']
i=0
while i <= len(year_id)-1:
url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + str(year_id[i])
lehigh = requests.get(url).text
soup = BeautifulSoup(lehigh,'lxml')
date_list = []
for date in soup.find_all('div',class_="sidearm-schedule-game-opponent-date"):
date_list.append(date.get_text(strip=True, separator=' '))
name_list = []
for name in soup.find_all('div',class_="sidearm-schedule-game-opponent-name"):
name_list.append(name.get_text(strip=True, separator=' '))
result_list = []
for result in soup.find_all('div',class_="sidearm-schedule-game-result"):
result_list.append(result.get_text(strip=True, separator=' '))
opp_list = []
for opp in soup.find_all('div',class_="sidearm-schedule-game-opponent-text"):
opp_list.append(opp.get_text(strip=True, separator=' '))
conf_list = []
for conf in soup.find_all('div',class_="sidearm-schedule-game-conference-conference"):
conf_list.append(conf.get_text(strip=True))
dict = {'date':date_list,'opponent':name_list,'result':result_list,'list':opp_list,'conference':conf_list}
df = pd.DataFrame(dict)
workbook1 = openpyxl.load_workbook('lehigh.xlsx')
writer = pd.ExcelWriter('lehigh.xlsx', engine='openpyxl')
writer.book = workbook1
df.to_excel(writer, sheet_name=str(year_id[i]),index=False,startrow=0,startcol=0)
writer.save()
writer.close()
i = i+1
回答1:
Code is updated:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from itertools import zip_longest
d = []
n = []
res = []
op = []
yr = []
with requests.Session() as req:
for year in range(2003, 2020):
print(f"Extracting Year# {year}")
r = req.get(
f"https://lehighsports.com/sports/mens-soccer/schedule/{year}")
if r.status_code == 200:
soup = BeautifulSoup(r.text, 'html.parser')
for date in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-date flex-item-1'}):
d.append(date.get_text(strip=True, separator=" "))
for name in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-name'}):
n.append(name.get_text(strip=True))
for result in soup.findAll("div", {'class': 'sidearm-schedule-game-result'}):
result = result.get_text(strip=True)
res.append(result)
if len(d) != len(res):
res.append("None")
for opp in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-text'}):
op.append(opp.get_text(strip=True, separator=' '))
yr.append(year)
data = []
for items in zip_longest(yr, d, n, res, op):
data.append(items)
df = pd.DataFrame(data, columns=['Year', 'Date', 'Name', 'Result', 'Opponent']).to_excel(
'lehigh.xlsx', index=False)
Output: check-online
回答2:
A few things:
- You don't need to iterate through index. Just simply iterate through the list
- The reason you get error is the result list is a length of 23, while your other lists are length of 24. So you'll need to figure out how to deal with nulls, and deal with were they fall (they may not always be the last entry)
How I would do it, is I would grab each row, and then pull the data for that, as opposed to pulling each entity into a list. I then take all those rows on the site and create a table, and make a list of tables (1 table for each year). The way you can deal with the missing data is use try/except. I also added a little function (found here) that will take that list of tables and write them into separate excel sheets.
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup
#from openpyxl.writer.excel import ExcelWriter
import openpyxl
#from openpyxl import load_workbook
import csv
year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011','2010','2009','2008','2007','2006','2005','2004','2003']
results = []
for year in year_id:
url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + year
print (url)
lehigh = requests.get(url).text
soup = BeautifulSoup(lehigh,'lxml')
rows = soup.find_all('div',class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
sheet = pd.DataFrame()
for row in rows:
date = row.find('div',class_="sidearm-schedule-game-opponent-date").text.strip()
name = row.find('div',class_="sidearm-schedule-game-opponent-name").text.strip()
opp = row.find('div',class_="sidearm-schedule-game-opponent-text").text.strip()
conf = row.find('div',class_="sidearm-schedule-game-conference-conference").text.strip()
try:
result = row.find('div',class_="sidearm-schedule-game-result").text.strip()
except:
result = ''
df = pd.DataFrame([[year,date,name,opp,conf,result]], columns=['year','date','opponent','list','conference','result'])
sheet = sheet.append(df,sort=True).reset_index(drop=True)
results.append(sheet)
def save_xls(list_dfs, xls_path):
with ExcelWriter(xls_path) as writer:
for n, df in enumerate(list_dfs):
df.to_excel(writer,'%s' %year_id[n],index=False,)
writer.save()
save_xls(results,'lehigh.xlsx')
来源:https://stackoverflow.com/questions/59606758/beautifulsoup-requests-dataframe-saving-to-excel-arrays-error