I've written a script in Python, in combination with Selenium, to parse some dates available within a table on a webpage. The table is located under the header NPL Victoria.
The issue is that each row (`items`) contains either an idate or an itime, never both, so your except branches overwrite one of them with an empty string on every iteration.
I commented out your `except` handlers, and it prints fine for me:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
# Tournament page to scrape (Oddsportal, NPL Victoria); passed to get_content().
link = "http://www.oddsportal.com/soccer/australia/npl-victoria/"
def get_content(driver, url):
    """Print one ``date--time`` line per timed row of the tournament table.

    Loads *url*, waits for the ``#tournamentTable`` rows to be present and
    walks them in document order. A row that contains a date header updates
    the current date; a row that contains a ``td.table-time`` cell is printed
    together with the most recent date seen.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the page to scrape.

    Note:
        Reads the module-level ``wait`` (a ``WebDriverWait``), which the
        caller must create before invoking this function.
    """
    driver.get(url)
    rows = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tournamentTable tr"))
    )

    idate = ''
    for row in rows:
        # find_elements() returns an empty list when nothing matches, so no
        # exception has to be swallowed to detect "this row has no date".
        date_spans = row.find_elements(By.CSS_SELECTOR, "th span[class^='datet']")
        if date_spans:
            idate = date_spans[0].text

        time_cells = row.find_elements(By.CSS_SELECTOR, "td.table-time")
        if not time_cells:
            # Date-header (or other) row: nothing to print. Skipping here
            # avoids pairing a new date with the previous row's time.
            continue

        itime = time_cells[0].text
        if idate != '' and itime != '':
            print(f'{idate}--{itime}')
if __name__ == '__main__':
    # Script entry point: open a Chrome session, scrape, then clean up.
    driver = webdriver.Chrome()
    # `wait` stays a module-level name because get_content() reads it as a global.
    wait = WebDriverWait(driver, timeout=10)
    try:
        get_content(driver, url=link)
    finally:
        # Close the browser even when scraping raised.
        driver.quit()
Try the code below:
def get_content(driver, url):
    """Print ``date--time`` for every time cell found after each date row.

    Loads *url*, waits for the date rows (``tr.center.nob-border``) of
    ``#tournamentTable`` and, for each of them, prints the row's date paired
    with the text of every time cell selected by the XPath below.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the page to scrape.

    Note:
        Reads the module-level ``wait`` (a ``WebDriverWait``), which the
        caller must create before invoking this function.
    """
    row_selector = "#tournamentTable tr.center.nob-border"

    driver.get(url)
    dates = len(wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, row_selector))
    ))

    for d in range(dates):
        # The date rows are looked up again on every pass and indexed by
        # position, exactly as the original snippet did.
        item = driver.find_elements(By.CSS_SELECTOR, row_selector)[d]

        # Best-effort: a date row without the expected span yields "".
        try:
            idate = item.find_element(By.CSS_SELECTOR, "th span[class^='datet']").text
        except Exception:
            idate = ""

        # Following time cells for which no (d + 2)-th date row precedes them
        # -- presumably the cells located before the next date row; verify
        # against the live markup.
        time_xpath = ".//following::td[contains(@class, 'table-time') and not((preceding::tr[@class='center nob-border'])[%d])]" % (d + 2)
        for time_td in item.find_elements(By.XPATH, time_xpath):
            try:
                itime = time_td.text
            except Exception:
                itime = ""
            print(f'{idate}--{itime}')
I'm not using Selenium, but the selected dates can be extracted with just BeautifulSoup. The date-times are encoded as Unix timestamps inside the tags' class names:
from bs4 import BeautifulSoup
import requests
import re
import datetime
# A class token made of "t" followed by digits carries the Unix timestamp.
TIMESTAMP_CLASS_RE = re.compile(r't(\d+)')

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get('http://www.oddsportal.com/soccer/australia/npl-victoria/', headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

for td in soup.select('table#tournamentTable td.datet'):
    for c in td['class']:
        m = TIMESTAMP_CLASS_RE.match(c)
        if m is None:
            continue
        unix_timestamp = int(m[1])
        # Timezone-aware UTC conversion; utcfromtimestamp() is deprecated.
        d = datetime.datetime.fromtimestamp(unix_timestamp, tz=datetime.timezone.utc).strftime('%d %b %Y--%H:%M')
        print(d)
Prints:
10 Aug 2018--09:30
10 Aug 2018--10:15
11 Aug 2018--05:00
11 Aug 2018--05:00
11 Aug 2018--09:00
12 Aug 2018--06:00
12 Aug 2018--06:00
If you also want the matches printed:
# A class token made of "t" followed by digits carries the Unix timestamp.
TIMESTAMP_CLASS_RE = re.compile(r't(\d+)')

# Print each timestamped cell's UTC date-time followed by the text of the
# next <td> in the document.
for td in soup.select('table#tournamentTable td.datet'):
    for c in td['class']:
        m = TIMESTAMP_CLASS_RE.match(c)
        if m is None:
            continue
        unix_timestamp = int(m[1])
        # Timezone-aware UTC conversion; utcfromtimestamp() is deprecated.
        d = datetime.datetime.fromtimestamp(unix_timestamp, tz=datetime.timezone.utc).strftime('%d %b %Y--%H:%M')
        print(d, end=' ')
        print(td.find_next('td').text)
Prints:
10 Aug 2018--09:30 Melbourne Knights - Port Melbourne Sharks
10 Aug 2018--10:15 Pascoe Vale - Dandenong Thunder
11 Aug 2018--05:00 Avondale FC - Bentleigh Greens
11 Aug 2018--05:00 Northcote City - Bulleen
11 Aug 2018--09:00 Hume City - Oakleigh Cannons
12 Aug 2018--06:00 Heidelberg Utd - Green Gully
12 Aug 2018--06:00 South Melbourne - Kingston City