Why do I get an error message pointing to Inf values when trying to plot counts over time in R?

后端 未结 2 600
青春惊慌失措
青春惊慌失措 2021-01-25 23:54

I am using the code given in this answer to generate this plot

library(rvest)

cachedir <- "cache"
if (!dir.exists(cachedir)) dir.create(cachedir)

URL <         


        
2条回答
  •  滥情空心
    2021-01-25 23:58

    # Install packages if they are not already installed:
    necessary_packages <- c("rvest", "tidyverse")

    # Create a vector containing the names of any packages needing installation:
    new_packages <- setdiff(necessary_packages, rownames(installed.packages()))

    # If the vector has more than 0 values, install the new packages
    # (and their associated dependencies):
    if (length(new_packages) > 0) {
      install.packages(new_packages, dependencies = TRUE)
    }

    # Attach the packages to the session. library() is used rather than
    # require() so that a package which still cannot be loaded stops the script
    # here with an error, instead of returning FALSE and letting the script
    # fail later at the first call to one of its functions. invisible() keeps
    # the list returned by lapply() from being printed:
    invisible(lapply(necessary_packages, library, character.only = TRUE))
    
    # Address of the GitHub page that lists the daily report files:
    # URL => character scalar
    URL <- "https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports"

    # Folder in which the csv files are kept between runs:
    # covid_19_csv_dir_path => character scalar
    # Replace this with the path you want to use !
    covid_19_csv_dir_path <- "C:/Users/.../Documents/covid_19_csvs"

    # Make the folder on first use:
    if (!dir.exists(covid_19_csv_dir_path)) dir.create(covid_19_csv_dir_path)
    
    # Collect the target of every hyperlink on the listing page, keep the ones
    # ending in "csv" and turn them into raw-content download addresses:
    # csvlinks => character vector
    csvlinks <- local({
      page_hrefs <- html_attr(html_nodes(read_html(URL), "a"), "href")
      csv_hrefs <- grep("csv$", page_hrefs, value = TRUE)
      raw_links <- paste0("https://raw.githubusercontent.com", csv_hrefs)
      gsub("/blob", "", raw_links)
    })

    # File name of each link, i.e. everything after the last "/":
    # csv_names => vector
    csv_names <- sub(".*\\/", "", csvlinks)

    # Names of the files already present in the local folder:
    # csvs_stored_locally => vector
    csvs_stored_locally <- list.files(covid_19_csv_dir_path)

    # Links whose file name has no match among the local files, i.e. the ones
    # that still have to be downloaded: csvs_to_be_stored => vector
    csvs_to_be_stored <- csvlinks[is.na(match(csv_names, csvs_stored_locally))]
    
    # Conditionally execute the next segment if there are csvs to store:
    if (length(csvs_to_be_stored) > 0) {
      # File names of the csvs that still need downloading, in the same order
      # as csvs_to_be_stored: csv_names_to_store => vector
      csv_names_to_store <- csv_names[!(csv_names %in% csvs_stored_locally)]

      # Date of each version, parsed from its file name: version_dates => vector
      version_dates <- as.Date(gsub("\\.csv", "", csv_names_to_store),
                               "%m-%d-%Y")

      # Name for each dataframe in the list: df_names => vector
      df_names <- paste0("x_", gsub("[[:punct:]]", "_", version_dates))

      # Download each csv into a dataframe: df_list => list
      df_list <- lapply(csvs_to_be_stored, function(csv_link) {
        read.csv(csv_link, sep = ",")
      })

      # Clean the column names in each dataframe: surrounding whitespace is
      # trimmed, everything up to and including the last "." is dropped, and
      # any remaining punctuation or whitespace is removed.
      # NOTE(review): a name that uses "_" rather than "." as separator keeps
      # its prefix (e.g. "Province_State" becomes "ProvinceState"); confirm
      # this is intended, since later steps look for a column called "State".
      df_list <- lapply(df_list, function(x) {
        names(x) <- gsub("[[:punct:]]|\\s+|.*\\.", "",
                         trimws(names(x), "both"))
        x
      })

      # Store the version date as the first column of each dataframe and name
      # the list elements. The date is repeated to the number of rows so that
      # a csv without any data rows does not make cbind() fail:
      df_list <- setNames(
        lapply(seq_along(df_list), function(i) {
          cbind(version_date = rep(version_dates[i], nrow(df_list[[i]])),
                df_list[[i]])
        }),
        df_names
      )

      # Store the csv files in the directory. Two details matter here:
      #  * df_list[[i]] extracts the dataframe itself; df_list[i] would be a
      #    one-element named list and as.data.frame() would prefix every
      #    column name with the list name (e.g. "x_2020_01_22.State").
      #  * the file name is taken from csv_names_to_store, which lines up with
      #    df_list; indexing the full csv_names vector would save the data
      #    under the wrong name as soon as some files are already stored.
      for (i in seq_along(df_list)) {
        write.csv(df_list[[i]],
                  file.path(covid_19_csv_dir_path, csv_names_to_store[i]),
                  row.names = FALSE)
      }
    }
    
    # If there are files stored in the directory read them in:
    if (length(csvs_stored_locally) > 0) {
      # Read the csvs as dataframes into a list: ls_csvs => list
      ls_csvs <- lapply(csvs_stored_locally, function(csv_name) {
        stored_df <- read.csv(file.path(covid_19_csv_dir_path, csv_name))

        # read.csv() returns version_date as text, whereas the dataframes
        # downloaded in this run carry it as a Date. Convert it here so that
        # both sources share one column type when they are row-bound later:
        if ("version_date" %in% names(stored_df)) {
          stored_df$version_date <-
            as.Date(as.character(stored_df$version_date), "%Y-%m-%d")
        }

        stored_df
      })

      # Name each dataframe after its file: "x_" followed by the file name
      # without ".csv" and with punctuation replaced by "_":
      names(ls_csvs) <- paste0(
        "x_",
        gsub("[[:punct:]]", "_", gsub("\\.csv", "", csvs_stored_locally))
      )
    }
    
    # Work out which of the two lists were created above. The results are
    # scalars, so they are combined with && (scalar, short-circuiting) rather
    # than the elementwise & in the conditions below:
    have_local_csvs <- exists("ls_csvs")
    have_downloaded_csvs <- exists("df_list")

    if (have_local_csvs && have_downloaded_csvs) {

      # Both exist: stored dataframes first, then the ones downloaded in this
      # execution: combined_df_list => list
      combined_df_list <- c(ls_csvs, df_list)

      # Remove df_list, ls_csvs variables from the global environment:
      rm(df_list, ls_csvs)

    } else if (have_local_csvs) {

      # Only stored dataframes exist: combined_df_list => list
      combined_df_list <- ls_csvs

      # Remove the ls_csvs variable from the global environment:
      rm(ls_csvs)

    } else if (have_downloaded_csvs) {

      # Only freshly downloaded dataframes exist: combined_df_list => list
      combined_df_list <- df_list

    } else {

      # Neither list exists, so there is nothing to combine. Stop with an
      # explicit message instead of failing on a missing object:
      stop("No csv files were downloaded or found in ",
           covid_19_csv_dir_path, call. = FALSE)

    }

    # Trigger a garbage collection now that the intermediate lists are gone:
    gc()
    
    # Row-bind two dataframes whose columns may differ:
    # rbind_all_columns => function
    #   x, y: dataframes.
    #   Returns the rows of x followed by the rows of y; a column that is
    #   present in only one of the inputs is filled with NA for the rows that
    #   come from the other input.
    rbind_all_columns <- function(x, y) {
      # Add each missing column as an NA vector of exactly nrow() elements.
      # Assigning a length-one NA via x[, cols] <- NA fails on a dataframe
      # with zero rows, whereas a length-matched vector works for any size:
      for (missing_in_x in setdiff(colnames(y), colnames(x))) {
        x[[missing_in_x]] <- rep(NA, nrow(x))
      }
      for (missing_in_y in setdiff(colnames(x), colnames(y))) {
        y[[missing_in_y]] <- rep(NA, nrow(y))
      }
      rbind(x, y)
    }
    
    # Fold the list into one dataframe by row-binding its elements pairwise,
    # left to right: df => data.frame
    df <- Reduce(rbind_all_columns, combined_df_list)
    
    # Keep only the rows whose State is Washington or New York:
    # washington_vs_ny => data.frame
    washington_vs_ny <- df[df$State %in% c("Washington", "New York"), ]

    # Sort by version date, replace missing counts with 0 and keep only the
    # columns needed for charting: washington_vs_nyordered => data.frame
    washington_vs_nyordered <- local({
      by_date <- washington_vs_ny[order(washington_vs_ny$version_date), ]

      zero_filled <- within(by_date, {
        Recovered <- ifelse(is.na(Recovered), 0, Recovered)
        Deaths <- ifelse(is.na(Deaths), 0, Deaths)
        Confirmed <- ifelse(is.na(Confirmed), 0, Confirmed)
      })

      zero_filled[, c("version_date", "State", "Confirmed", "Deaths", "Recovered")]
    })
    
    # Stack the three count columns into long form for charting: the column
    # "vals" receives the name of the measure and "vars" receives its value.
    # version_date is then parsed as a Date: chart_data => data.frame
    chart_data <- local({
      count_cols <- c("Confirmed", "Deaths", "Recovered")

      # One output row per input row and count column:
      n_long_rows <- length(count_cols) * nrow(washington_vs_nyordered)

      long_df <- reshape(
        washington_vs_nyordered,
        direction = "long",
        varying = count_cols,
        v.names = "vars",
        idvar = c("version_date", "State"),
        timevar = "vals",
        times = count_cols,
        new.row.names = 1:n_long_rows
      )

      within(long_df, {
        version_date <- as.Date(as.character(version_date), "%Y-%m-%d")
      })
    })

    # Draw one line per measure over time, with one panel per State:
    ggplot(chart_data, aes(version_date, vars, colour = vals)) +
      geom_line() +
      facet_wrap(. ~ State)
    

提交回复
热议问题