Failed to grab dates in a customized manner out of tabular content

前端未结

关注

 3  1269

I've written a script in Python in combination with Selenium to parse some dates available within a table in a webpage. The table is located under the header NPL Vict


                      
              相关标签:


      
      
        
          3条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  滥情空心        
                
              
                            
                2021-01-07 14:14
              
            
            
                                                                       
The issue is that each item is either an idate or an itime, so you are overwriting one of them every time.

I commented out your excepts, and it prints fine for me:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Page loaded by get_content(); its #tournamentTable rows are what gets scraped.
link = "http://www.oddsportal.com/soccer/australia/npl-victoria/"

def get_content(driver, url):
    """Print one ``date--time`` line for each timed row of the tournament table.

    Loads *url* in *driver*, waits for the rows matched by
    ``#tournamentTable tr`` and walks them in document order. A row that
    contains a ``th span`` whose class starts with ``datet`` updates the
    current date; a row that contains a ``td.table-time`` cell is printed
    together with the most recently seen date.

    Relies on a module-level ``wait`` (a ``WebDriverWait``) being bound
    before the call.
    """
    driver.get(url)
    idate = ''
    rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tournamentTable tr")))
    for items in rows:
        # The plural finders return an empty list when nothing matches, so no
        # exception has to be swallowed for rows lacking a date or a time.
        date_spans = items.find_elements_by_css_selector("th span[class^='datet']")
        if date_spans:
            idate = date_spans[0].text

        time_cells = items.find_elements_by_css_selector("td.table-time")
        if not time_cells:
            # Rows without a time cell (e.g. date headers) must not print:
            # doing so would pair the date with a stale time from an earlier row.
            continue

        itime = time_cells[0].text
        if idate != '' and itime != '':
            print(f'{idate}--{itime}')

if __name__ == '__main__':
    # Start a Chrome session for the scrape.
    driver = webdriver.Chrome()
    # Bound at module level on purpose: get_content() reads `wait` as a global.
    wait = WebDriverWait(driver,10)
    try:
        get_content(driver,link)
    finally:
        # Always shut the browser down, even if get_content() raised.
        driver.quit()

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  北海茫月        
                
              
                            
                2021-01-07 14:16
              
            
            
                                                                       
Try to use below code:

def get_content(driver,url):
    """Print a ``date--time`` line for every time cell found after each date header row.

    Loads *url*, waits for the header rows matched by
    ``#tournamentTable tr.center.nob-border`` and, for each header position,
    re-queries the header rows, reads the date text from the header and prints
    it paired with the text of every ``table-time`` cell selected by the XPath
    below. The XPath keeps only following cells that do not yet have a
    ``(position + 2)``-th preceding header row.

    Relies on a module-level ``wait`` being bound before the call.
    """
    header_selector = "#tournamentTable tr.center.nob-border"
    time_cells_xpath = ".//following::td[contains(@class, 'table-time') and not((preceding::tr[@class='center nob-border'])[%d])]"

    driver.get(url)
    header_count = len(wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, header_selector))))

    for position in range(header_count):
        # Header rows are looked up afresh on every pass and picked by position.
        header_row = driver.find_elements_by_css_selector(header_selector)[position]

        try:
            idate = header_row.find_element_by_css_selector("th span[class^='datet']").text
        except Exception:
            idate = ""

        time_cells = header_row.find_elements_by_xpath(time_cells_xpath % (position + 2))
        for time_td in time_cells:
            try:
                itime = time_td.text
            except Exception:
                itime = ""
            print(f'{idate}--{itime}')

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  小鲜肉        
                
              
                            
                2021-01-07 14:26
              
            
            
                                                                       
I'm not using Selenium, but the selected dates can be extracted with just requests and BeautifulSoup. The date-times are encoded as Unix timestamps inside the tag classes:

from bs4 import BeautifulSoup
import requests
import re
import datetime

# A browser-like User-Agent is sent along with the request.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
# The timeout keeps the script from hanging indefinitely on an unresponsive server.
r = requests.get('http://www.oddsportal.com/soccer/australia/npl-victoria/', headers=headers, timeout=30)
soup = BeautifulSoup(r.text, 'lxml')

# Look through each cell's classes for one starting with "t<digits>" and
# print those digits, read as a Unix timestamp, formatted in UTC.
for td in soup.select('table#tournamentTable td.datet'):
    for c in td['class']:
        # Match once and reuse the result instead of running the regex twice.
        m = re.match(r't(\d+)', c)
        if m:
            unix_timestamp = int(m[1])
            # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp().
            d = datetime.datetime.fromtimestamp(unix_timestamp, tz=datetime.timezone.utc).strftime('%d %b %Y--%H:%M')
            print(d)


Prints:

10 Aug 2018--09:30
10 Aug 2018--10:15
11 Aug 2018--05:00
11 Aug 2018--05:00
11 Aug 2018--09:00
12 Aug 2018--06:00
12 Aug 2018--06:00


If you want also the matches printed:

# For each cell class starting with "t<digits>", print the digits (read as a
# Unix timestamp, formatted in UTC) followed by the text of the next <td>.
for td in soup.select('table#tournamentTable td.datet'):
    for c in td['class']:
        # Match once and reuse the result instead of running the regex twice.
        m = re.match(r't(\d+)', c)
        if m:
            unix_timestamp = int(m[1])
            # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp().
            d = datetime.datetime.fromtimestamp(unix_timestamp, tz=datetime.timezone.utc).strftime('%d %b %Y--%H:%M')
            print(d, end=' ')
            print(td.find_next('td').text)


Prints:

10 Aug 2018--09:30 Melbourne Knights - Port Melbourne Sharks
10 Aug 2018--10:15 Pascoe Vale - Dandenong Thunder
11 Aug 2018--05:00 Avondale FC - Bentleigh Greens
11 Aug 2018--05:00 Northcote City - Bulleen
11 Aug 2018--09:00 Hume City - Oakleigh Cannons
12 Aug 2018--06:00 Heidelberg Utd - Green Gully
12 Aug 2018--06:00 South Melbourne - Kingston City

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复