问题
This follows from Extract rows with duplicate values in two or more fields but different values in another field
As suggested, I'm posting additional request separately. First code then question.
library(data.table)
# load the data
customers <- structure(list(
NAME = c("B V RAMANA ", "K KRISHNA", "B SUDARSHAN", "B ANNAPURNA ",
"BIKASH BAHADUR CHITRE", "KOTLA CHENNAMMA ", "K KRISHNA",
" B V RAMANA", "B ANNAPURNA", "ZAITOON BEE", "BIMAN BALAIAH",
" KOTLA CHENNAMMA ", "B V RAMANA"),
DOB = c("15-01-1960", "01-05-1964", "12-03-1975", "12-12-1962",
"14-05-1983", "15-07-1958", "01-05-1964", "15-01-1960",
"12-12-1962", "20-02-1960", "10-03-1964", "15-07-1958",
"15-01-1960"),
ID = c(" 502910", "502737", "502995", " 502878", "502984",
"502466", "502737", "502902 ", "502877 ", "503000",
"502979", "502467", "502902 "),
PIN = c(500033, 500050, 500032, 500084, 500032, 500032, 500084, 500035,
500084, 500084, 500032, 500032, 500032)),
.Names = c("NAME", "DOB", "ID", "PIN"),
class = c("data.table", "data.frame"), row.names = c(NA, -13L))
# function for Duplicate Key Exclusion
dupKeyEx <- function(DT, dup_cols, unique_cols) {
cols <- c(dup_cols, unique_cols)
mDT <- DT[!duplicated(DT, by=cols), .N, by=dup_cols][N > 1L]
ans <- unique(DT[mDT[, !"N"], on=dup_cols], by=cols)
setorderv(ans, c(dup_cols, unique_cols))
return(ans)
}
Running the function gets zero results because of whitespaces in the beginning or end of NAME
and ID
columns in customers
table:
dup_cols <- c("NAME", "DOB")
unique_cols <- "ID"
dupKeyEx(customers, dup_cols, unique_cols)
Empty data.table (0 rows) of 4 cols: NAME,DOB,ID,PIN
So we trim, ie, remove whitespaces from both ends of the relevant columns:
library(stringr)
customers[, `:=`(NAME = str_trim(NAME),
ID = str_trim(ID))]
Now we get the expected results:
dupKeyEx(customers, dup_cols, unique_cols)
NAME DOB ID PIN
1: B ANNAPURNA 12-12-1962 502877 500084
2: B ANNAPURNA 12-12-1962 502878 500084
3: B V RAMANA 15-01-1960 502902 500035
4: B V RAMANA 15-01-1960 502910 500033
5: KOTLA CHENNAMMA 15-07-1958 502466 500032
6: KOTLA CHENNAMMA 15-07-1958 502467 500032
I was wondering if the columns in dup_cols
and unique_cols
(together assigned in cols
variable inside the dupKeyEx
function) could be trimmed inside the function itself. That way, I won't need to remember to trim the relevant columns before using the dupKeyEx
function.
I searched but was unable to find ways to refer to columns inside cols
variable and apply stringr::str_trim()
on them inside the dupKeyEx
function. Any help would be appreciated.
回答1:
you can do this:
dupKeyEx <- function(DT, dup_cols, unique_cols) {
sapply(dup_cols,function(x) DT[[x]] <<- str_trim(DT[[x]]))
cols <- c(dup_cols, unique_cols)
mDT <- DT[!duplicated(DT, by=cols), .N, by=dup_cols][N > 1L]
ans <- unique(DT[mDT[, !"N"], on=dup_cols], by=cols)
setorderv(ans, c(dup_cols, unique_cols))
return(ans)
}
or follow @Frank's advice, using data.table
syntax. We make a copy of the table first not to modify the input:
dupKeyEx <- function(DT, dup_cols, unique_cols) {
DT<-copy(DT) # comment if you want to keep changes by reference
DT[, (dup_cols) := lapply(.SD, str_trim), .SDcols=dup_cols]
cols <- c(dup_cols, unique_cols)
mDT <- DT[!duplicated(DT, by=cols), .N, by=dup_cols][N > 1L]
ans <- unique(DT[mDT[, !"N"], on=dup_cols], by=cols)
setorderv(ans, c(dup_cols, unique_cols))
return(ans)
}
Edit by San: The following serves my purpose nicely:
dupKeyEx <- function(DT, dup_cols, unique_cols) {
cols <- c(dup_cols, unique_cols)
chr_cols <- cols[sapply(DT[, ..cols], is.character)]
DT[, (chr_cols) := lapply(.SD, stringr::str_trim), .SDcols=chr_cols]
mDT <- DT[!duplicated(DT, by=cols), .N, by=dup_cols][N > 1L]
ans <- unique(DT[mDT[, !"N"], on=dup_cols], by=cols)
setorderv(ans, c(dup_cols, unique_cols))
return(ans)
}
Thanks to both contributors. I've modified code to apply str_trim
only on character columns so as to avoid changing other column types to character. This change is made by reference because making copies of large tables costs resources including time. Also, trimming has no bad side effects in my analysis job - it is usually necessary.
Generally speaking, trimming all character columns should be done immediately after loading any large table by something like:
trimCharCols <- function(DT) {
colsDT <- names(DT)
chr_cols <- colsDT[sapply(DT, is.character)]
DT[, (chr_cols) := lapply(.SD, stringr::str_trim), .SDcols=chr_cols]
}
in which case two lines of code from dupKeyEx
function may be avoided. But I'm going to leave them there for the time being because I need this function to be "independent".
来源:https://stackoverflow.com/questions/45716873/how-to-refer-to-multiple-column-names-held-in-a-variable-inside-a-function