问题
I am trying to join data.table x to z using a non-equi join. Table x contains two columns X1 and X2 that are used as the range to use for joining with column Z1 in z. The current code successfully does the non-equi join however certain columns are removed or renamed. I would like to return the 'ideal' data.table supplied, instead of the one I currently have which I would have to rename columns or join data further to get the 'ideal' data supplied.
> library(data.table)
>
> x <- data.table(Id = c("A", "B", "C", "C"),
+ X1 = c(1L, 3L, 5L, 7L),
+ X2 = c(8L,12L,9L,18L),
+ XY = c("x2", "x4", "x6", "x8"))
>
> z <- data.table(ID = "C", Z1 = 5:9, Z2 = paste0("z", 5:9))
>
> x
Id X1 X2 XY
1: A 1 8 x2
2: B 3 12 x4
3: C 5 9 x6
4: C 7 18 x8
> z
ID Z1 Z2
1: C 5 z5
2: C 6 z6
3: C 7 z7
4: C 8 z8
5: C 9 z9
>
> # suboptimal data return data format
> x[z, on = .(Id == ID, X1 <= Z1, X2 >= Z1)]
Id X1 X2 XY Z2
1: C 5 5 x6 z5
2: C 6 6 x6 z6
3: C 7 7 x6 z7
4: C 7 7 x8 z7
5: C 8 8 x6 z8
6: C 8 8 x8 z8
7: C 9 9 x6 z9
8: C 9 9 x8 z9
>
> # column names are Id, X1 and X2 from x which replaces ID and Z1. The contents of X1 and X2 are also changed to the original values of Z1.
> # XY and Z2 remain unchanged.
>
> # I want to create the following table where the original column names and values are retained, while still joining the table in a non-equi way.
>
> ideal <- data.table(ID = c("C", "C", "C", "C", "C", "C", "C", "C"),
+ Z1 = c(5, 6, 7, 7, 8, 8, 9, 9),
+ Z2 = c("Z5", "z6", "z7", "z7", "z8", "z8", "z9", "z9"),
+ X1 = c(5, 5, 5, 7, 5, 7, 5, 7),
+ X2 = c(9, 9, 9, 18, 9, 18, 9, 18),
+ XY = c("x6", "x6", "x6", "x8", "x6", "x8", "x6", "x8"))
>
> print(ideal)
ID Z1 Z2 X1 X2 XY
1: C 5 Z5 5 9 x6
2: C 6 z6 5 9 x6
3: C 7 z7 5 9 x6
4: C 7 z7 7 18 x8
5: C 8 z8 5 9 x6
6: C 8 z8 7 18 x8
7: C 9 z9 5 9 x6
8: C 9 z9 7 18 x8
回答1:
As commented by @Humpelstielzchen, it is possible to do by selecting the required columns manually. But one has to use the prefix x.
(x
in x.
refers to the parameter x
of [.data.table
rather than the name of the data.table) to recover columns in the the original data.table x
. Otherwise, incorrect output will be produced.
# desired
x[z, .(ID, Z1, Z2, X1 = x.X1, X2 = x.X2, XY), on = .(Id == ID, X1 <= Z1, X2 >= Z1)]
# ID Z1 Z2 X1 X2 XY
# 1: C 5 z5 5 9 x6
# 2: C 6 z6 5 9 x6
# 3: C 7 z7 5 9 x6
# 4: C 7 z7 7 18 x8
# 5: C 8 z8 5 9 x6
# 6: C 8 z8 7 18 x8
# 7: C 9 z9 5 9 x6
# 8: C 9 z9 7 18 x8
# undesired
x[z, on = .(Id == ID, X1 <= Z1, X2 >= Z1), .(ID, Z1, Z2, X1, X2, XY)]
# ID Z1 Z2 X1 X2 XY
# 1: C 5 z5 5 5 x6
# 2: C 6 z6 6 6 x6
# 3: C 7 z7 7 7 x6
# 4: C 7 z7 7 7 x8
# 5: C 8 z8 8 8 x6
# 6: C 8 z8 8 8 x8
# 7: C 9 z9 9 9 x6
# 8: C 9 z9 9 9 x8
packageVersion('data.table')
# '1.13.2'
In case that there are many other columns, which makes manual selection unfeasible, here is a workaround by repalcing the position of x
and z
during joining:
DT <- z[x, on = .(ID=Id, Z1 >= X1, Z1 <= X2), nomatch = NULL]
#' since for non-equi conditions, the values are from RHS while
#' the column names were from LHS, we known that `Z1` and `Z1.1`
#' correspond to `X1` and `X2`.
setnames(DT, c('Z1', 'Z1.1'), c('X1', 'X2'))
DT[z, Z1 := i.Z1, on = .(ID, Z2)]
# > DT
# ID X1 Z2 X2 XY Z1
# 1: C 5 z5 9 x6 5
# 2: C 5 z6 9 x6 6
# 3: C 5 z7 9 x6 7
# 4: C 5 z8 9 x6 8
# 5: C 5 z9 9 x6 9
# 6: C 7 z7 18 x8 7
# 7: C 7 z8 18 x8 8
# 8: C 7 z9 18 x8 9
回答2:
Make a copy?
copy_cols <- function(dt, nms) {
dt[, paste0(".", nms) := lapply(.SD, copy), .SDcols = nms]
}
copy_cols(x, c("X1", "X2"))
copy_cols(z, "Z1")
x[z, on = .(Id == ID, .X1 <= .Z1, .X2 >= .Z1)][, c(".X1", ".X2") := NULL][]
Output
Id X1 X2 XY Z1 Z2
1: C 5 9 x6 5 z5
2: C 5 9 x6 6 z6
3: C 5 9 x6 7 z7
4: C 7 18 x8 7 z7
5: C 5 9 x6 8 z8
6: C 7 18 x8 8 z8
7: C 5 9 x6 9 z9
8: C 7 18 x8 9 z9
回答3:
I ended up answering my own question.
data_table_tidy_join <- function(x,y, join_by){
x <- data.table(x)
y <- data.table(y)
# Determine single join names
single_join_names <- purrr::keep((stringr::str_split(join_by, "==|>=|<=")), ~length(.) == 1) %>% unlist()
# cols from x that won't require as matching in i
remove_from_x_names <- c(trimws(na.omit(stringr::str_extract(join_by, ".*(?=[=]{2})"))), single_join_names)
# names need to keep
x_names_keep_raw <- names(x)[!names(x) %in% remove_from_x_names]
y_names_keep_raw <- names(y)
# cols that exist in both x and y, but not being equi joined on
cols_rename_index <- x_names_keep_raw[x_names_keep_raw %in% y_names_keep_raw]
#rename so indexing works
x_names_keep <- x_names_keep_raw
y_names_keep <- y_names_keep_raw
# give prefix to necessary vars
x_names_keep[x_names_keep %in% cols_rename_index] <- paste("x.",cols_rename_index, sep ="")
y_names_keep[y_names_keep %in% cols_rename_index] <- paste("i.",cols_rename_index, sep ="")
# implement data.table call, keeping required cols
joined_data <-
x[y, on = join_by,
mget(c(paste0("i.", y_names_keep_raw),paste0("x.", x_names_keep_raw))) %>% set_names(c(y_names_keep,x_names_keep)),
mult = "all", allow.cartesian = TRUE, nomatch = NA] %>%
as_tibble()
return(joined_data)
}
> x <- data.table(Id = c("A", "B", "C", "C"),
+ X1 = c(1L, 3L, 5L, 7L),
+ X2 = c(8L,12L,9L,18L),
+ XY = c("x2", "x4", "x6", "x8"))
>
> z <- data.table(ID = "C", Z1 = 5:9, Z2 = paste0("z", 5:9))
>
> data_table_tidy_join(x, z, join_by = c("Id == ID","X1 <= Z1", "X2 >= Z1"))
# A tibble: 8 x 6
ID Z1 Z2 X1 X2 XY
<chr> <int> <chr> <int> <int> <chr>
1 C 5 z5 5 9 x6
2 C 6 z6 5 9 x6
3 C 7 z7 5 9 x6
4 C 7 z7 7 18 x8
5 C 8 z8 5 9 x6
6 C 8 z8 7 18 x8
7 C 9 z9 5 9 x6
8 C 9 z9 7 18 x8
来源:https://stackoverflow.com/questions/65000550/column-name-labelling-in-data-table-joins