dplyr::mutate (assign na.rm =TRUE)

前端 未结 3 1204
野性不改
野性不改 2021-01-13 07:23

I have a data.frame that has 100 variables. I want to get the sum of three variables only using mutate (not summarise).

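If there is an NA in any of the three variables, I still want the sum of the remaining values (i.e. the equivalent of na.rm = TRUE).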

相关标签:
3条回答
  • 2021-01-13 07:33

    Another option, using tidyr:

    # Reshape var1:var3 to long form, total them per original row, then reshape
    # back to wide form. `rn` tags each original row so its values can be
    # regrouped after reshaping.
    df %>%
      mutate(rn = row_number()) %>%
      pivot_longer(var1:var3, names_to = "var", values_to = "varNum") %>%
      group_by(rn) %>%
      mutate(sum = sum(varNum, na.rm = TRUE)) %>%
      # Drop the per-row grouping so the result behaves like a plain table.
      ungroup() %>%
      pivot_wider(names_from = var, values_from = varNum)
    

    In case your dataset is poised to grow...

    0 讨论(0)
  • 2021-01-13 07:44

    rowwise() is my go-to function. It's like group_by() but it treats each row as an individual group.

    # rowwise() makes each row its own group, so sum() combines the three values
    # of that row only. ungroup() removes the row-wise grouping afterwards so
    # later verbs operate on the whole table again.
    df %>%
      rowwise() %>%
      mutate(Sum = sum(c(var1, var2, var3), na.rm = TRUE)) %>%
      ungroup()
    
    0 讨论(0)
  • 2021-01-13 07:52

    We can use Reduce with +

    # Replace NA with 0 in the three target columns only, then add those columns
    # element-wise. Restricting Reduce() to var1:var3 keeps every other column of
    # the data frame out of the total.
    df %>%
      mutate(across(var1:var3, ~ replace(.x, is.na(.x), 0))) %>%
      mutate(Sum = Reduce(`+`, select(., var1:var3)))
    #   var1 var2 var3 Sum
    #1    4    5    0   9
    #2    2    0    3   5
    #3    1    2    4   7
    #4    0    3    5   8
    #5    3    0    2   5
    #6    1    1    5   7
    

    Or with rowSums

    # rowSums() works on all rows at once and skips NA via na.rm = TRUE. The
    # columns are selected by name, so the result does not depend on where
    # var1:var3 sit in the data frame.
    df %>%
      mutate(Sum = rowSums(select(., var1:var3), na.rm = TRUE))
    #   var1 var2 var3 Sum
    #1    4    5   NA   9
    #2    2   NA    3   5
    #3    1    2    4   7
    #4   NA    3    5   8
    #5    3   NA    2   5
    #6    1    1    5   7
    

    Benchmarks

    # Benchmark setup: fix the RNG seed, then build a data frame with 1e6 rows
    # and three columns (var1, var2, var3) whose values are sampled with
    # replacement from NA and 1:5.
    set.seed(24)
    df1 <- as.data.frame(matrix(sample(c(NA, 1:5), 1e6 *3, replace=TRUE),
                    dimnames = list(NULL, paste0("var", 1:3)), ncol=3))
    # Approach 1: rowwise() + sum(..., na.rm = TRUE).
    system.time({
    df1 %>% rowwise() %>% mutate(Sum = sum(c(var1, var2, var3), na.rm = TRUE))
    })
    # user  system elapsed 
    #  21.50    0.03   21.66 
    
    # Approach 2: reshape to long form, sum per original row (rn), reshape back.
    system.time({
    df1 %>%
        mutate(rn = row_number()) %>%
        gather(var, varNum, var1:var3) %>%
        group_by(rn) %>%
        mutate(sum = sum(varNum, na.rm = TRUE)) %>% 
        spread(var, varNum)})
     # user  system elapsed 
     #  5.96    0.39    6.37 
    
    
    # Approach 3: replace every NA in the data frame with 0, then add the three
    # columns directly.
    system.time({
    replace(df1, is.na(df1), 0) %>% mutate(sum = var1 + var2 + var3)
    })
    
    # user  system elapsed 
    #   0.17    0.01    0.19 
    
    # Approach 4: replace NA with 0 in var1:var3, then fold the columns of the
    # piped data frame with `+` via Reduce().
    system.time({
    df1 %>% 
         mutate_each(funs(replace(., is.na(.), 0)), var1:var3) %>% 
         mutate(Sum = Reduce(`+`, .))      
    })
    # user  system elapsed 
    #   0.10    0.02    0.11 
    
    # Approach 5: rowSums() with na.rm = TRUE over the first three columns.
    system.time({
    df1 %>% 
       mutate(Sum = rowSums(.[names(.)[1:3]], na.rm = TRUE))
       })
    # user  system elapsed 
    #   0.04    0.00    0.03 
    
    0 讨论(0)
提交回复
热议问题