Using rvest package when HTML table has two headers

后端 未结 2 1320
礼貌的吻别
礼貌的吻别 2021-01-07 06:17

I am using the following code to scrape an HTML table on AFL player data:

library(rvest)

website <-read_html(\"https://afltables.com/afl/stats/teams/adel         


        
相关标签:
2条回答
  • 2021-01-07 07:02


    library(rvest)
    #> Loading required package: xml2

    # Download and parse the page that holds the statistics tables.
    website <- read_html(
      "https://afltables.com/afl/stats/teams/adelaide/2017_gbg.html"
    )
    

    On this website, there are several tables, one per link displayed above the printed table on the main page. Calling html_table() on the result of html_nodes("table") returns all of the tables at once, as a list.

    # Select every <table> node on the page and parse each one;
    # the result is a list with one element per table.
    all_tables <- html_table(html_nodes(website, "table"))

    # Show only the top level of the list structure.
    str(all_tables, max.level = 1)
    #> List of 23
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    #>  $ :'data.frame':    34 obs. of  27 variables:
    

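    You can then select the table you want, but the headers are still not right: the column names repeat the table title, and the real header (Player, R1, R2, ...) ends up in the first data row.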

    # Preview the first rows of the first parsed table.
    head(all_tables[[1]])
    #>          Disposals Disposals Disposals Disposals Disposals Disposals
    #> 1           Player        R1        R2        R3        R4        R5
    #> 2     Atkins, Rory        19        19        19        23        29
    #> 3  Beech, Jonathon                                                  
    #> 4     Betts, Eddie        18        13        16        22        12
    #> 5      Brown, Luke        18        12        13         9        15
    #> 6 Cameron, Charlie        23        17        16        16        13
    #>   Disposals Disposals Disposals Disposals Disposals Disposals Disposals
    #> 1        R6        R7        R8        R9       R10       R11       R12
    #> 2        23        20        21        28        37        14        25
    #> 3                                                                    15
    #> 4        16        13         9        16        14        12        11
    #> 5        17        13        20        25        16        12          
    #> 6        13        14        10        18        13         8        13
    #>   Disposals Disposals Disposals Disposals Disposals Disposals Disposals
    #> 1       R14       R15       R16       R17       R18       R19       R20
    #> 2        28        15        23        18        19        16        16
    #> 3        12        11                                                  
    #> 4        14        11        13        16         8                  16
    #> 5        10        15        14        17        11        10        20
    #> 6        15                  10        20         6         9        17
    #>   Disposals Disposals Disposals Disposals Disposals Disposals Disposals
    #> 1       R21       R22       R23        QF        PF        GF       Tot
    #> 2        27        21        21        16        22        17       536
    #> 3                                                                    38
    #> 4         7        16        12        13        13         7       318
    #> 5        17        17         9        20        10        13       353
    #> 6        13        10        10        15        19        16       334
    

    With a little manipulation of the list and its tables using purrr and dplyr, you can clean up tables that have two header rows:

    # Re-parse every table, this time keeping both header rows as ordinary
    # data so they can be handled by hand below.
    all_tables <- website %>%
      html_nodes("table") %>%
      # Do not let rvest turn the first row into column names automatically.
      html_table(header = FALSE)

    library(purrr)
    #>
    #> Attaching package: 'purrr'
    #> The following object is masked from 'package:rvest':
    #>
    #>     pluck
    # dplyr provides slice(), used below to drop the two header rows.
    library(dplyr)

    # Store the cleaned list under its own name; the str() call below and
    # later lookups refer to all_tables_res.
    all_tables_res <- all_tables %>%
      # Name each list element after the cell in the first row, first column
      # of its table. pluck is a purrr function acting like x[[1]][1, 1] here.
      lmap(~ set_names(.x, nm = pluck(.x, 1, 1, 1))) %>%
      # For each table, use the second row as the column names,
      # then delete the first and second rows.
      map(function(tbl) {
        # Flatten the one-row data frame into a plain character vector so it
        # is a valid set of names.
        header <- as.character(unlist(tbl[2, ], use.names = FALSE))
        tbl %>%
          set_names(nm = header) %>%
          slice(-c(1, 2))
      })
    str(all_tables_res, 1)
    #> List of 23
    #>  $ Disposals              :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Kicks                  :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Marks                  :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Handballs              :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Goals                  :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Behinds                :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Hit Outs               :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Tackles                :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Rebounds               :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Inside 50s             :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Clearances             :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Clangers               :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Frees                  :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Frees Against          :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Brownlow Votes         :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Contested Possessions  :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Uncontested Possessions:Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Contested Marks        :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Marks Inside 50        :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ One Percenters         :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Bounces                :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ Goal Assists           :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    #>  $ % Played               :Classes 'tbl_df', 'tbl' and 'data.frame': 33 obs. of  27 variables:
    
    You can now access any table from the website by name.
    
    # Preview the first rows of the table stored under the name "Goals".
    head(all_tables_res$Goals)
    #> # A tibble: 6 x 27
    #>             Player    R1    R2    R3    R4    R5    R6    R7    R8    R9
    #>              <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
    #> 1     Atkins, Rory     3     1     -     2     1     -     1     1     -
    #> 2  Beech, Jonathon                                                      
    #> 3     Betts, Eddie     4     3     3     6     3     1     3     2     3
    #> 4      Brown, Luke     -     1     -     -     1     -     -     -     -
    #> 5 Cameron, Charlie     2     1     -     1     2     2     2     -     4
    #> 6     Crouch, Brad                             -     -     -     -     1
    #> # ... with 17 more variables: R10 <chr>, R11 <chr>, R12 <chr>, R14 <chr>,
    #> #   R15 <chr>, R16 <chr>, R17 <chr>, R18 <chr>, R19 <chr>, R20 <chr>,
    #> #   R21 <chr>, R22 <chr>, R23 <chr>, QF <chr>, PF <chr>, GF <chr>,
    #> #   Tot <chr>
    
    0 讨论(0)
  • 2021-01-07 07:06

    Firstly, and unrelated to your question: don't use table as a name for your objects, because table() is already a base R function. Masking it is considered bad practice, and I've been told that it will come back to bite you somewhere down the line.

    Moving on to the question: you are struggling with the type of data that html_table() gives you. It returns a list that contains a regular data.frame. The list you printed reports NULL for its number of columns and rows because it is a list with a single element: the data.frame. By selecting that first (and only) element of the list, you get the data frame you're actually interested in. This data frame has 27 columns and 34 rows.

    # Download and parse the page.
    website <- read_html(
      "https://afltables.com/afl/stats/teams/adelaide/2017_gbg.html"
    )

    # Keep only the first <table> node, parse it, and unwrap the single
    # data frame from the resulting list (same as result[[1]]).
    scraped <- website %>%
      html_nodes("table") %>%
      .[1] %>%
      html_table() %>%
      .[[1]]

    ncol(scraped)
    # 27
    nrow(scraped)
    # 34
    
    0 讨论(0)
提交回复
热议问题