问题
I've got several large-ish data.frames set up like a relational database, and I'd like to make a single function to look for whatever variable I need and grab it from that particular data.frame and add it to the data.frame I'm currently working on. I've got a way to do this that works, but it requires temporarily making a list of all the data.frames, which seems inefficient. I suspect that nonstandard evaluation would solve this problem for me, but I'm not sure how to do it.
Here's what works but seems inefficient:
Table1 <- data.frame(ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10),
ColC = rnorm(10))
Table2 <- data.frame(ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10),
ColF = rnorm(10))
Table3 <- data.frame(ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10),
ColI = rnorm(10))
Key <- data.frame(Table = rep(c("Table1", "Table2", "Table3"), each = 4),
ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
"ID", paste0("Col", LETTERS[4:6]),
"ID", paste0("Col", LETTERS[7:9])))
# function for grabbing info from other tables
grab <- function(StartDF, ColNames){
AllDFs <- list(Table1, Table2, Table3)
names(AllDFs) <- c("Table1", "Table2", "Table3")
# Determine which data.frames have that column
WhichDF <- Key %>% filter(ColumnName %in% ColNames) %>%
select(Table)
TempDF <- StartDF
for(i in 1:length(ColNames)){
ToAdd <- AllDFs[WhichDF[i, 1]]
ToAdd <- ToAdd[[1]] %>%
select(c(ColNames[i], ID))
TempDF <- TempDF %>% left_join(ToAdd)
rm(ToAdd)
}
return(TempDF)
}
grab(Table1, c("ColE", "ColH"))
What would be great instead would be something like this:
grab <- function(StartDF, ColNames){
# Some function that returns the column names of all the data.frames
# without me creating a new object that is a list of them
# Some function that left_joins the correct data.frame plus the column
# "ID" to my starting data.frame, again without needing to create that list
# of all the data.frames
}
回答1:
Instead of creating the list
manually, we can directly get the values of the objects returned from the 'Table' column of 'Key' dataset with mget
library(dplyr)
library(purrr)
grab <- function(StartDF, ColNames){
# filter the rows of Key based on the ColNames input
# pull the Table column as a vector
# column was factor, so convert to character class
# return the value of the objects with mget in a list
Tables <- Key %>%
filter(ColumnName %in% ColNames) %>%
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv)
TempDF <- StartDF
# use the same left_joins in a loop after selecting only the
# ID and corresponding columns from 'ColNames'
for(i in seq_along(ColNames)){
ToAdd <- Tables[[i]] %>%
select(ColNames[i], ID)
TempDF <- TempDF %>%
left_join(ToAdd)
rm(ToAdd)
}
TempDF
}
grab(Table1, c("ColE", "ColH"))
Or another option is reduce
grab <- function(StartDF, ColNames) {
#only change is that instead of a for loop
# use reduce with left_join after selecting the corresponding columns
# with map
Key %>%
filter(ColumnName %in% ColNames) %>%
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv) %>%
map2(ColNames, ~ .x %>%
select(ID, .y)) %>%
append(list(Table1), .) %>%
reduce(left_join)
}
grab(Table1, c("ColE", "ColH"))
# ID ColA ColB ColC ColE ColH
#1 A -0.9490093 0.5177143 -1.91015491 0.07777086 1.86277670
#2 B -0.7182786 -1.1019146 -0.70802738 -0.73965230 0.18375660
#3 C 0.5064516 -1.6904354 1.11106206 2.04315508 -0.65365228
#4 D 0.9362477 0.5260682 -0.03419651 -0.51628310 -1.17104181
#5 E 0.5636047 -0.9470895 0.43303304 -2.95928629 1.86425049
#6 F 1.0598531 0.4144901 0.10239896 1.57681703 -0.05382603
#7 G 1.1335047 -0.8282173 -0.28327898 2.02917831 0.50768462
#8 H 0.2941341 0.3261185 -0.15528127 -0.46470035 -0.86561320
#9 I -2.1434905 0.6567689 0.02298549 0.90822132 0.64360337
#10 J 0.4291258 1.3410147 0.67544567 0.12466251 0.75989623
回答2:
There is a serious bug in the accepted solution. If you're not careful with the ordering in the ColNames
argument, then the function won't work. Also, I redefined your data to use tibbles instead. They're basically the same as data frames, but their default settings are nicer (e.g. you don't need StringsAsFactors = FALSE)
library(tidyverse)
Table1 <- tibble(
ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10), ColC = rnorm(10)
)
Table2 <- tibble(
ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10), ColF = rnorm(10)
)
Table3 <- tibble(
ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10), ColI = rnorm(10)
)
Key <- tibble(
Table = rep(c("Table1", "Table2", "Table3"), each = 4),
ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
"ID", paste0("Col", LETTERS[4:6]),
"ID", paste0("Col", LETTERS[7:9]))
)
grab_akrun <- function(StartDF, ColNames) {
#only change is that instead of a for loop
# use reduce with left_join after selecting the corresponding columns
# with map
Key %>%
filter(ColumnName %in% ColNames) %>%
pull(Table) %>%
as.character %>%
mget(envir = .GlobalEnv) %>%
map2(ColNames, ~ .x %>%
select(ID, .y)) %>%
append(list(Table1), .) %>%
reduce(left_join)
}
grab_akrun(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
This works, but if you change the order:
grab_akrun(Table1, c("ColH", "ColE"))
#> Error: Unknown column `ColH`
Instead, you should approach it like this:
grab_new <- function(StartDF, ColNames) {
Key %>%
filter(ColumnName %in% ColNames) %>%
pluck("Table") %>%
mget(inherits = TRUE) %>%
map(~select(.x, ID, intersect(colnames(.x), ColNames))) %>%
reduce(left_join, .init = StartDF)
}
grab_new(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
grab_new(Table1, c("ColH", "ColE"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#> ID ColA ColB ColC ColE ColH
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A -0.658 -0.613 0.689 -0.850 -0.795
#> 2 B 0.143 0.732 -0.212 -1.74 1.99
#> 3 C -0.966 -0.570 -0.354 0.559 -1.11
#> 4 D -1.05 0.269 -0.856 -0.370 -1.35
#> 5 E 0.255 -0.349 0.329 1.39 0.421
#> 6 F 1.51 1.38 0.707 -0.639 0.289
#> 7 G -1.28 1.44 -1.35 1.94 -1.04
#> 8 H -1.56 -0.434 0.231 0.467 0.656
#> 9 I -0.553 -1.64 -0.761 0.133 0.249
#> 10 J -0.950 0.418 -0.843 0.593 0.343
Which works as expected.
Created on 2020-01-21 by the reprex package (v0.3.0)
来源:https://stackoverflow.com/questions/59828866/in-r-use-nonstandard-evaluation-to-select-specific-variables-from-data-frames