Separate string after last underscore

荒凉一梦 提交于 2019-12-06 01:27:39

# Sample data: one row per Name; each measurement column is named
# <prefix>_<number>_<suffix>.
df <- data.frame(
  Name = c("A", "B", "C"),
  Var_1_EVU = c(2, NA, NA),
  Var_1_BdS = c(NA, 3, 4),
  Var_2_BdS = c(NA, 3, 4)
)

df %>%
  # Stack every column except Name into (type, value) pairs.
  gather(key = "type", value = "value", -Name) %>%
  # No `sep` is given, so separate() uses its default separator, which
  # cuts each column name at every underscore: three pieces per name.
  separate(col = type, into = c("type", "type_num", "var")) %>%
  # Glue the first two pieces back together with no separator
  # (e.g. "Var" and "1" become "Var1").
  unite(col = type, type, type_num, sep = "") %>%
  # Back to wide format: one column per rebuilt name.
  spread(key = type, value = value)

#   Name var Var1 Var2
# 1    A BdS   NA   NA
# 2    A EVU    2   NA
# 3    B BdS    3    3
# 4    B EVU   NA   NA
# 5    C BdS    4    4
# 6    C EVU   NA   NA

An example using tidyr::extract to handle variable names that contain an arbitrary number of underscores:


# Sample data whose column names contain a varying number of underscores;
# only the piece after the LAST underscore is the type suffix.
df <- data.frame(Name = c("A","B","C"),
                 Var_x_1_EVU = c(2,NA,NA),
                 Var_x_1_BdS = c(NA,3,4),
                 Var_x_y_2_BdS = c(NA,3,4))

df %>% 
  # Stack every column except Name into (col_name, value) pairs.
  gather("col_name", "value", -Name) %>% 
  # Call extract() with its namespace: magrittr also exports a function
  # named extract(), and whichever package is attached last masks the other.
  # The first (.*) is greedy, so the name is split at its last underscore.
  tidyr::extract(col_name, c("var", "type"), "(.*)_(.*)") %>% 
  # Back to wide format: one column per variable name.
  spread(var, value)

#   Name type Var_x_1 Var_x_y_2
# 1    A  BdS      NA        NA
# 2    A  EVU       2        NA
# 3    B  BdS       3         3
# 4    B  EVU      NA        NA
# 5    C  BdS       4         4
# 6    C  EVU      NA        NA

You can avoid a potential problem with duplicate observations by first adding a row-number column with mutate(n = row_number()), which makes each observation unique. You can also avoid tidyr::extract being masked by magrittr's extract by calling it explicitly as tidyr::extract:


# Sample data in which Name is not unique ("A" appears twice).
dt <- data.table(
  Name = c("A", "A", "B", "C"),
  Var_1_EVU = c(1, 2, NA, NA),
  Var_1_BdS = c(1, NA, 3, 4),
  Var_x_2_BdS = c(1, NA, 3, 4)
)

dt %>%
  # Tag each input row with its position so repeated Names stay distinct.
  mutate(n = row_number()) %>%
  # Stack every column except n and Name into (col_name, value) pairs.
  gather(key = "col_name", value = "value", -n, -Name) %>%
  # The first (.*) is greedy, so the name is split at its last underscore.
  tidyr::extract(
    col = col_name,
    into = c("var", "type"),
    regex = "(.*)_(.*)"
  ) %>%
  # Back to wide format: one column per variable name.
  spread(key = var, value = value)

#   Name n type Var_1 Var_x_2
# 1    A 1  BdS     1       1
# 2    A 1  EVU     1      NA
# 3    A 2  BdS    NA      NA
# 4    A 2  EVU     2      NA
# 5    B 3  BdS     3       3
# 6    B 3  EVU    NA      NA
# 7    C 4  BdS     4       4
# 8    C 4  EVU    NA      NA

Here's an alternative data.table solution using tstrsplit/melt/dcast. I would personally stick with data.table in this case because spread doesn't have a fun argument; hence, if you have duplicates when spreading back to wide format, you will get an error.

library(data.table) # provides melt(), tstrsplit(), dcast() and `:=`
library(magrittr) # people like pipes these days
dt %>%
  # convert to long format like you did
  # (id.vars is spelled out instead of relying on partial matching of `id`)
  melt(., id.vars = "Name") %>%
  # split by the last underscore: the negative lookahead (?!.*_) only lets
  # the pattern match an underscore that has no later underscore after it
  .[, c("variable", "grp") := tstrsplit(variable, "_(?!.*_)", perl = TRUE)] %>%
  # convert back to wide format; naming value.var stops dcast() from
  # having to guess the value column
  dcast(., Name + grp ~ variable, value.var = "value")

# NOTE(review): the output below has columns Var_1 and Var_2 and one row per
# Name/grp pair, so `dt` here is presumably built from columns Var_1_EVU,
# Var_1_BdS and Var_2_BdS with unique Names -- confirm which `dt` was used.
#    Name grp Var_1 Var_2
# 1:    A BdS    NA    NA
# 2:    A EVU     2    NA
# 3:    B BdS     3     3
# 4:    B EVU    NA    NA
# 5:    C BdS     4     4
# 6:    C EVU    NA    NA