Extracting html table with rowspan values

问题

The data frame I create with the following code (using the RCurl and XML packages) puts the three letter team abbreviation into only the first row that it spans. Is there another package or additional code I can add to keep the data in the proper column?

library(XML)
library(RCurl)
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T) [2])

回答1:

You are actually pretty close. The only thing you need to do is to get the data in the proper columns and rows as some of the rows have shifted to the left. You can achieve this as follows (with the help of the data.table and zoo packages):

# your original code
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding = "UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors = F, header = T)[2])

# assigning proper names to the columns
names(playoffs) <- c("shortcode","franchise","years","appearances")

# 1. shift the dat columnwise for the rows in which there is no shortcode
# 2. fill the resulting NA's with the last observation
# 3. only keep the last shortcode when the previous ones are the same
#    because only there the shortcode matches the franchise name
library(data.table)
library(zoo)
setDT(playoffs)[nchar(shortcode) > 3, `:=` (shortcode = NA,
                                            franchise = shortcode,
                                            years = franchise,
                                            appearances = years)
                ][, shortcode := na.locf(shortcode)
                  ][shortcode == shift(shortcode, 1L, type="lead"), shortcode := NA]

回答2:

Here's an answer. I took the liberty up the data

library(dplyr)
library(XML)
library(RCurl)
library(stringi)
library(zoo)
library(tidyr)

initial_data =
  "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams" %>%
  readLines(encoding = "UTF-8") %>%
  readHTMLTable(stringsAsFactors = F) %>%
  `[[`(2) %>%
  mutate(ID = 1:n(),
         test =
           V1 %>%
           stri_detect_regex("^[A-Z]{2,3}$"))

variable_names = c("franchise",
                   "years",
                   "initial_postseason_appearances")

shifts = 
  initial_data %>%
  filter(test %>% `!`) %>%
  setNames(c(variable_names,
             "trash",
             "ID",
             "test"))

team_initial =
  initial_data %>%
  filter(test) %>%
  setNames(c("initial_abbreviation",
             variable_names,
             "ID",
             "test")) %>%
  bind_rows(shifts) %>%
  arrange(ID) %>%
  separate(years, c("start", "end")) %>%
  mutate(abbreviation = initial_abbreviation %>% na.locf,
         split_postseason_appearances =
           initial_postseason_appearances %>%
           plyr::mapvalues("–", NA) %>%
           stri_split_fixed(", ") )

appearance = 
  team_initial %>%
  select(franchise,
         split_postseason_appearances) %>%
  unnest(split_postseason_appearances) %>%
  mutate(postseason_appearance =
           split_postseason_appearances %>%
           extract_numeric) %>%
  select(-split_postseason_appearances)

team = 
  team_initial %>%
  select(abbreviation,
         franchise,
         start,
         end)

回答3:

Consider an XML package solution, requiring various XPath expressions using xpathSApply() with for loop and if/then logic. To capture the row-spanned table records, various XPath string functions are used: string-length(), concat(), and substring():

library(XML)

# PARSE FROM URL
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
webpage <- readLines(url)
html = htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)

# INITIALIZE LISTS
code <- c()
team <- c()
year <- c()
postseason <- c()

# APPEND TO LISTS LOOPING ACROSS ALL TEAMS
numberofteams <- length(xpathSApply(html, "//table[2]//tr/td[1]"))

for (i in (1:numberofteams+1)) {
  # TR NODES WITH LETTER TEAM ABBREVIATION (STRING LENGTH=2 or 3)
  if (as.character(xpathSApply(html, sprintf("string-length(//table[2]/tr[%s]/td[1])", i), xmlValue)) %in% c("2","3")) {

    code <- c(code, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
    team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
    year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))
    postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[4]", i), xmlValue))
  } else {
    # TR NODES W/O LETTER TEAM ABBREVIATION       
    code <- c(code, xpathSApply(html, sprintf("substring(concat(//table[2]/tr[position()=%s-1]/td[position()=1 and string-length(.)=3],
                                               //table[2]/tr[position()=%s-2]/td[position()=1 and string-length(.)=3]), 1, 3)", i, i), xmlValue))
    team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
    year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
    postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))        

  }
}

# COMBINE LISTS INTO DATA FRAME
playoffs <- data.frame(code = unlist(code), 
                       team = unlist(team), 
                       year = unlist(year), 
                       postseason = unlist(postseason))

回答4:

Give htmltab a try:

install.packages("htmltab")
library(htmltab)

purl <- htmlParse(url_source)
htmltab(purl, which = 2)

来源：https://stackoverflow.com/questions/34121911/extracting-html-table-with-rowspan-values

标签

xml

html-table

rcurl