In R, use nonstandard evaluation to select specific variables from data.frames

非 Y 不嫁゛ 提交于 2021-01-28 06:21:51

问题


I've got several large-ish data.frames set up like a relational database, and I'd like to make a single function to look for whatever variable I need and grab it from that particular data.frame and add it to the data.frame I'm currently working on. I've got a way to do this that works, but it requires temporarily making a list of all the data.frames, which seems inefficient. I suspect that nonstandard evaluation would solve this problem for me, but I'm not sure how to do it.

Here's what works but seems inefficient:

# Three sample tables that share an ID key (letters A-J); every other column
# holds ten random normal draws.
Table1 <- data.frame(
  ID   = LETTERS[1:10],
  ColA = rnorm(10),
  ColB = rnorm(10),
  ColC = rnorm(10)
)

Table2 <- data.frame(
  ID   = LETTERS[1:10],
  ColD = rnorm(10),
  ColE = rnorm(10),
  ColF = rnorm(10)
)

Table3 <- data.frame(
  ID   = LETTERS[1:10],
  ColG = rnorm(10),
  ColH = rnorm(10),
  ColI = rnorm(10)
)

# Lookup table with one row per (table name, column name) pair, four rows
# per table: the shared ID column followed by that table's own columns.
Key <- data.frame(
  Table      = rep(paste0("Table", 1:3), each = 4),
  ColumnName = c(
    "ID", "ColA", "ColB", "ColC",
    "ID", "ColD", "ColE", "ColF",
    "ID", "ColG", "ColH", "ColI"
  )
)

# function for grabbing info from other tables
#
# Uses the global `Key` lookup to find which of Table1/Table2/Table3 holds
# each requested column, then left-joins that column onto StartDF.
#
# StartDF:  data.frame to extend; it is joined on whatever columns it shares
#           with the looked-up table (ID in the example data).
# ColNames: character vector of column names listed in Key$ColumnName.
# Returns:  StartDF with one additional column per requested name.
# Errors:   if any entry of ColNames is not listed in Key.
grab <- function(StartDF, ColNames){

  AllDFs <- list(Table1 = Table1, Table2 = Table2, Table3 = Table3)

  # match() keeps the lookup aligned with ColNames. Filtering Key with %in%
  # returns tables in Key's row order instead, which pairs tables with the
  # wrong columns when ColNames is not given in that order, and returns
  # several rows for a name (such as "ID") that occurs in more than one table.
  WhichDF <- as.character(Key$Table[match(ColNames, Key$ColumnName)])

  Unknown <- ColNames[is.na(WhichDF)]
  if (length(Unknown) > 0) {
    stop("Columns not listed in Key: ", paste(Unknown, collapse = ", "),
         call. = FALSE)
  }

  TempDF <- StartDF

  # seq_along() makes an empty ColNames a no-op instead of iterating over 1:0
  for (i in seq_along(ColNames)) {
    ToAdd <- AllDFs[[WhichDF[i]]] %>%
      select(ID, all_of(ColNames[i]))

    TempDF <- TempDF %>% left_join(ToAdd)
  }

  TempDF
}

grab(Table1, c("ColE", "ColH"))

What would be great instead would be something like this:

# Sketch of the desired function; the body is intentionally left empty and
# only describes, in comments, the two steps that are wanted.
grab <- function(StartDF, ColNames){

      # Step 1: some function that returns the column names of all the
      # data.frames without me creating a new object that is a list of them

      # Step 2: some function that left_joins the correct data.frame plus the
      # column "ID" to my starting data.frame, again without needing to create
      # that list of all the data.frames

}

回答1:


Instead of creating the list manually, we can fetch the objects named in the 'Table' column of the 'Key' dataset directly with mget:

library(dplyr)
library(purrr)
# Add columns held in other tables to StartDF without building a list of
# all tables by hand.
#
# StartDF:  data.frame to extend; it is joined on whatever columns it shares
#           with each looked-up table (ID in the example data).
# ColNames: character vector of column names listed in Key$ColumnName.
# Returns:  StartDF with one additional column per requested name.
# Errors:   if any entry of ColNames is not listed in Key.
grab <- function(StartDF, ColNames){

  # Look up the table of each requested column in the order of ColNames.
  # Filtering Key with %in% would return the tables in Key's row order, so
  # Tables[[i]] and ColNames[i] would not line up for e.g.
  # c("ColH", "ColE").
  # as.character() covers the case where Key$Table is a factor.
  TableNames <- as.character(Key$Table[match(ColNames, Key$ColumnName)])

  Unknown <- ColNames[is.na(TableNames)]
  if (length(Unknown) > 0) {
    stop("Columns not listed in Key: ", paste(Unknown, collapse = ", "),
         call. = FALSE)
  }

  # Fetch the data.frames themselves by name from the global environment
  Tables <- mget(TableNames, envir = .GlobalEnv)

  TempDF <- StartDF

  # Left-join ID plus the requested column from each table in turn
  for (i in seq_along(ColNames)) {
    ToAdd <- Tables[[i]] %>%
      select(ID, all_of(ColNames[i]))

    TempDF <- TempDF %>%
      left_join(ToAdd)
  }

  TempDF
}

grab(Table1, c("ColE", "ColH"))

Another option is to replace the for loop with reduce:

# Same idea as the loop version, but the successive joins are folded with
# reduce() instead of a for loop.
#
# StartDF:  data.frame to extend; used as the starting value of the fold.
# ColNames: character vector of column names listed in Key$ColumnName.
# Returns:  StartDF with one additional column per requested name.
# Errors:   if any entry of ColNames is not listed in Key.
grab <- function(StartDF, ColNames) {
  # match() keeps the table lookup aligned with ColNames, so the pairing
  # done by map2() below is correct for any ordering of the request.
  TableNames <- as.character(Key$Table[match(ColNames, Key$ColumnName)])

  Unknown <- ColNames[is.na(TableNames)]
  if (length(Unknown) > 0) {
    stop("Columns not listed in Key: ", paste(Unknown, collapse = ", "),
         call. = FALSE)
  }

  mget(TableNames, envir = .GlobalEnv) %>%
    # keep only ID and the single requested column of each table
    map2(ColNames, ~ select(.x, ID, all_of(.y))) %>%
    # start from the caller's data.frame rather than a hard-coded Table1
    reduce(left_join, .init = StartDF)
}

grab(Table1, c("ColE", "ColH"))
#   ID       ColA       ColB        ColC        ColE        ColH
#1   A -0.9490093  0.5177143 -1.91015491  0.07777086  1.86277670
#2   B -0.7182786 -1.1019146 -0.70802738 -0.73965230  0.18375660
#3   C  0.5064516 -1.6904354  1.11106206  2.04315508 -0.65365228
#4   D  0.9362477  0.5260682 -0.03419651 -0.51628310 -1.17104181
#5   E  0.5636047 -0.9470895  0.43303304 -2.95928629  1.86425049
#6   F  1.0598531  0.4144901  0.10239896  1.57681703 -0.05382603
#7   G  1.1335047 -0.8282173 -0.28327898  2.02917831  0.50768462
#8   H  0.2941341  0.3261185 -0.15528127 -0.46470035 -0.86561320
#9   I -2.1434905  0.6567689  0.02298549  0.90822132  0.64360337
#10  J  0.4291258  1.3410147  0.67544567  0.12466251  0.75989623



回答2:


There is a serious bug in the accepted solution (reproduced below as grab_akrun): it only works when the names in the ColNames argument are given in the same order as their tables appear in Key; otherwise the function fails. Also, I redefined your data to use tibbles instead. They're basically the same as data frames, but their default settings are nicer (e.g. you don't need stringsAsFactors = FALSE).

library(tidyverse)

# The question's sample data rebuilt as tibbles: three tables keyed by ID
# (letters A-J), each with three columns of ten random normal draws.
Table1 <- tibble(
  ID   = LETTERS[1:10],
  ColA = rnorm(10),
  ColB = rnorm(10),
  ColC = rnorm(10)
)
Table2 <- tibble(
  ID   = LETTERS[1:10],
  ColD = rnorm(10),
  ColE = rnorm(10),
  ColF = rnorm(10)
)
Table3 <- tibble(
  ID   = LETTERS[1:10],
  ColG = rnorm(10),
  ColH = rnorm(10),
  ColI = rnorm(10)
)

# Lookup of which table holds which column (four rows per table).
Key <- tibble(
  Table      = rep(paste0("Table", 1:3), each = 4),
  ColumnName = c(
    "ID", "ColA", "ColB", "ColC",
    "ID", "ColD", "ColE", "ColF",
    "ID", "ColG", "ColH", "ColI"
  )
)

# Copy of the accepted answer's reduce-based function, kept with its original
# behaviour so that its sensitivity to the order of ColNames can be shown.
# Note that StartDF is never used: the join chain always starts from the
# global Table1.
grab_akrun <- function(StartDF, ColNames) {
  # names of the tables holding the requested columns, in Key's row order
  table_names <- Key %>%
    filter(ColumnName %in% ColNames) %>%
    pull(Table) %>%
    as.character()

  # the tables themselves, fetched by name from the global environment
  tables <- mget(table_names, envir = .GlobalEnv)

  # pair the i-th table with the i-th requested name and keep ID + that column
  trimmed <- map2(tables, ColNames, ~ select(.x, ID, .y))

  # fold everything onto Table1 with successive left joins
  reduce(c(list(Table1), trimmed), left_join)
}

grab_akrun(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 6
#>    ID      ColA   ColB   ColC   ColE   ColH
#>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 A     -0.658 -0.613  0.689 -0.850 -0.795
#>  2 B      0.143  0.732 -0.212 -1.74   1.99 
#>  3 C     -0.966 -0.570 -0.354  0.559 -1.11 
#>  4 D     -1.05   0.269 -0.856 -0.370 -1.35 
#>  5 E      0.255 -0.349  0.329  1.39   0.421
#>  6 F      1.51   1.38   0.707 -0.639  0.289
#>  7 G     -1.28   1.44  -1.35   1.94  -1.04 
#>  8 H     -1.56  -0.434  0.231  0.467  0.656
#>  9 I     -0.553 -1.64  -0.761  0.133  0.249
#> 10 J     -0.950  0.418 -0.843  0.593  0.343

This works, but if you change the order:

# Same request with the two column names swapped; this call fails.
grab_akrun(Table1, c("ColH", "ColE"))
#> Error: Unknown column `ColH`

Instead, you should approach it like this:

# Add columns held in other tables to StartDF, independent of the order in
# which the columns are requested.
#
# StartDF:  data frame to extend; used as the starting value of the fold and
#           joined on whatever columns it shares with each table.
# ColNames: character vector of column names to pull in. Names that are not
#           listed in Key$ColumnName are ignored.
# Returns:  StartDF with the requested columns added, in the order their
#           tables appear in Key.
grab_new <- function(StartDF, ColNames) {
  Key %>%
    filter(ColumnName %in% ColNames) %>%
    pluck("Table") %>%
    # mget() needs a character vector, so a factor column is converted
    as.character() %>%
    # fetch and join each table once even if several of its columns are asked
    unique() %>%
    mget(inherits = TRUE) %>%
    # from each table keep ID plus whichever requested columns it contains
    map(~ select(.x, ID, all_of(intersect(colnames(.x), ColNames)))) %>%
    reduce(left_join, .init = StartDF)
}

grab_new(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 6
#>    ID      ColA   ColB   ColC   ColE   ColH
#>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 A     -0.658 -0.613  0.689 -0.850 -0.795
#>  2 B      0.143  0.732 -0.212 -1.74   1.99 
#>  3 C     -0.966 -0.570 -0.354  0.559 -1.11 
#>  4 D     -1.05   0.269 -0.856 -0.370 -1.35 
#>  5 E      0.255 -0.349  0.329  1.39   0.421
#>  6 F      1.51   1.38   0.707 -0.639  0.289
#>  7 G     -1.28   1.44  -1.35   1.94  -1.04 
#>  8 H     -1.56  -0.434  0.231  0.467  0.656
#>  9 I     -0.553 -1.64  -0.761  0.133  0.249
#> 10 J     -0.950  0.418 -0.843  0.593  0.343
# Reversed column order gives the same result as the call above.
grab_new(Table1, c("ColH", "ColE"))
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 6
#>    ID      ColA   ColB   ColC   ColE   ColH
#>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 A     -0.658 -0.613  0.689 -0.850 -0.795
#>  2 B      0.143  0.732 -0.212 -1.74   1.99 
#>  3 C     -0.966 -0.570 -0.354  0.559 -1.11 
#>  4 D     -1.05   0.269 -0.856 -0.370 -1.35 
#>  5 E      0.255 -0.349  0.329  1.39   0.421
#>  6 F      1.51   1.38   0.707 -0.639  0.289
#>  7 G     -1.28   1.44  -1.35   1.94  -1.04 
#>  8 H     -1.56  -0.434  0.231  0.467  0.656
#>  9 I     -0.553 -1.64  -0.761  0.133  0.249
#> 10 J     -0.950  0.418 -0.843  0.593  0.343

Both calls work as expected, regardless of the order of the column names.

Created on 2020-01-21 by the reprex package (v0.3.0)



来源:https://stackoverflow.com/questions/59828866/in-r-use-nonstandard-evaluation-to-select-specific-variables-from-data-frames

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!