Conditionally replace values of multiple columns, from values of other multiple columns

问题

Suppose I have this dataset:

# Build a reproducible 60-row example and preview its first six rows.
# cbind() combines a character and a numeric vector into a character matrix,
# so `b` is stored as text (a factor on R < 4.0) rather than as a number.
set.seed(1234)
data.frame(
  cbind(
    a = rep(c("si", "no"), 30),
    b = rnorm(60)
  ),
  c = rep(c("d", "e", "f"), 20)
) %>%
  head()

Then I want to add many columns (only two in this example) that count the distinct values within each group (here, the groups are defined by column "a").

# Same example data, grouped by `a`; for each of `b` and `c`, add a column
# holding the per-group number of distinct values. Because the function list
# has a single named entry, its name is appended as a suffix
# (`b_dups_hash_ing`, `c_dups_hash_ing`).
set.seed(1234)
data.frame(
  cbind(a = rep(c("si", "no"), 30), b = rnorm(60)),
  c = rep(c("d", "e", "f"), 20)
) %>%
  group_by(a) %>%
  dplyr::mutate_at(
    vars(c(b, c)),
    .funs = list(dups_hash_ing = ~ n_distinct(.))
  )

This code leaves the following dataset:

The output of dput() for the resulting dataset (the data frame called `df` in the answer below) is:

structure(list(a = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 2L, 1L), .Label = c("no", "si"), class = "factor"), b = structure(c(22L, 
1L, 51L, 34L, 50L, 57L, 53L, 10L, 47L, 3L, 11L, 23L, 15L, 38L, 
58L, 39L, 41L, 17L, 28L, 21L, 37L, 45L, 29L, 46L, 32L, 48L, 56L, 
52L, 26L, 19L, 35L, 8L, 55L, 20L, 9L, 36L, 2L, 12L, 6L, 42L, 
49L, 43L, 59L, 54L, 31L, 13L, 60L, 44L, 14L, 30L, 7L, 5L, 16L, 
27L, 33L, 18L, 24L, 4L, 25L, 40L), .Label = c("-0.0997905884418961", 
"-0.151736536534977", "-0.198416273822079", "-0.254874652654534", 
"-0.274704218225806", "-0.304721068966714", "-0.324393300483657", 
"-0.400235237343163", "-0.415751788401515", "-0.50873701541522", 
"-0.538070788884863", "-0.60615111526422", "-0.659770093821306", 
"-0.684320344136007", "-0.789646852263761", "-0.933503340589868", 
"-0.965903210133575", "-1.07754212275943", "-1.11444896479736", 
"-1.60708093984972", "-2.07823754188738", "-2.7322195229558", 
"-2.85575865501923", "-3.23315213292314", "0.0295178303214797", 
"0.0326639575014441", "0.116845344986082", "0.162654708118265", 
"0.185513915583057", "0.186492083080971", "0.287709728313787", 
"0.311681028661359", "0.319160238648117", "0.413868915451097", 
"0.418057822385083", "0.42200837321742", "0.485226820569252", 
"0.487814635163685", "0.500694614280786", "0.594273774110513", 
"0.62021020366732", "0.629536099884472", "0.660212631820405", 
"0.677415500438328", "0.696768778564913", "0.700733515544461", 
"0.704180178465512", "0.760462361967838", "0.895171980275539", 
"0.912322161610113", "0.976031734922396", "1.1123628412626", 
"1.16910851401363", "1.17349757263239", "1.49349310261748", "1.84246362620766", 
"1.98373220068438", "2.16803253951933", "2.27348352044748", "2.91914013071762"
), class = "factor"), c = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 
1L, 2L, 3L, 1L, 2L, 3L), .Label = c("d", "e", "f"), class = "factor"), 
    a_dups_hash_ing = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), b_dups_hash_ing = c(30L, 30L, 30L, 30L, 
    30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 
    30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 
    30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 
    30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L, 
    30L, 30L, 30L, 30L, 30L, 30L, 30L, 30L), c_dups_hash_ing = c(3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -60L), groups = structure(list(
    a = structure(1:2, .Label = c("no", "si"), class = "factor"), 
    .rows = list(c(2L, 4L, 6L, 8L, 10L, 12L, 14L, 16L, 18L, 20L, 
    22L, 24L, 26L, 28L, 30L, 32L, 34L, 36L, 38L, 40L, 42L, 44L, 
    46L, 48L, 50L, 52L, 54L, 56L, 58L, 60L), c(1L, 3L, 5L, 7L, 
    9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L, 25L, 27L, 29L, 31L, 
    33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L, 49L, 51L, 53L, 55L, 
    57L, 59L))), row.names = c(NA, -2L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE))

What I need to do is replace each "_dups_hash_ing" column, column by column, with the value of the corresponding original column whenever the number of distinct cases per group is greater than one (and with 0 otherwise). I have to do this for more than 50 columns. Here is an example for a single column using mutate:

  # Keep the original value of `b` where its per-group distinct count exceeds 1,
  # otherwise use 0. NOTE(review): if `b` is a factor, ifelse() returns its
  # integer codes; wrap `b` in as.character() to keep the labels.
  dplyr::mutate(b_dups_hash_ing = ifelse(b_dups_hash_ing > 1, b, 0))

I need to repeat the code above for many variables. This is very similar to a mutate_at call; in the sketch below, the part in square brackets marks what I would like to express. The following example does not work, but it is what I would write in an ideal world, to help you better understand my problem.

# Pseudocode only (not valid R): `[original]` stands for the column that each
# `*_dups_hash_ing` column was derived from.
dplyr::mutate_at(vars(contains('_dups_hash_ing')), .funs = list(~ifelse(.>1,vars([original]),0)))

回答1:

Is this what you're looking for?

# Within each `*_dups_hash_ing` column, set values that are not greater than 1
# to 0 and preview the first six rows.
# NOTE(review): `.` in the "yes" branch is the count column itself, so counts
# above 1 are kept as-is (30 and 3 in the output below) rather than being
# replaced by the values of the original column.
df %>%
  dplyr::mutate_at(
    vars(contains('_dups_hash_ing')),
    ~ ifelse(. > 1, ., 0)
  ) %>%
  head()
#> # A tibble: 6 x 6
#> # Groups:   a [2]
#>   a     b                  c     a_dups_hash_ing b_dups_hash_ing c_dups_hash_ing
#>   <fct> <fct>              <fct>           <dbl>           <int>           <int>
#> 1 si    -2.7322195229558   d                   0              30               3
#> 2 no    -0.09979058844189… e                   0              30               3
#> 3 si    0.976031734922396  f                   0              30               3
#> 4 no    0.413868915451097  d                   0              30               3
#> 5 si    0.912322161610113  e                   0              30               3
#> 6 no    1.98373220068438   f                   0              30               3

来源：https://stackoverflow.com/questions/60328028/conditionally-replace-values-of-multiple-columns-from-values-of-other-multiple

标签

replace

dplyr

at-command

mutate