问题
I have a 67MM row data.table with people names and surname separated by spaces. I just need to create a new column for each word.
Here is an small subset of the data:
n <- structure(list(Subscription_Id = c("13.855.231.846.091.000",
"11.156.048.529.090.800", "24.940.584.090.830", "242.753.039.111.124",
"27.843.782.090.830", "13.773.513.145.090.800", "25.691.374.090.830",
"12.236.174.155.090.900", "252.027.904.121.210", "11.136.991.054.110.100"
), Account_Desc = c("AGUAYO CARLA", "LEIVA LILIANA", "FULLANA MARIA LAURA",
"PETREL SERGIO", "IPTICKET SRL", "LEDESMA ORLANDO", "CATTANEO LUIS RAUL",
"CABRAL CARMEN ESTELA", "ITURGOYEN HECTOR", "CASA CASILDO"),
V1 = c("AGUAYO", "LEIVA", "FULLANA", "PETREL", "IPTICKET",
"LEDESMA", "CATTANEO", "CABRAL", "ITURGOYEN", "CASA"), V2 = c("CARLA",
"LILIANA", "MARIA", "SERGIO", "SRL", "ORLANDO", "LUIS", "CARMEN",
"HECTOR", "CASILDO"), V3 = c(NA, NA, "LAURA", NA, NA, NA,
"RAUL", "ESTELA", NA, NA), `NA` = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
)), .Names = c("Subscription_Id", "Account_Desc", "V1", "V2",
"V3", NA), class = c("data.table", "data.frame"), row.names = c(NA,
-10L), .internal.selfref = <pointer: 0x0000000000200788>)
require("data.table")
n <- data.table(n)
Expected Output
# Subscription_Id Account_Desc V1 V2 V3 NA
# 1: 13.855.231.846.091.000 AGUAYO CARLA AGUAYO CARLA NA NA
# 2: 11.156.048.529.090.800 LEIVA LILIANA LEIVA LILIANA NA NA
# 3: 24.940.584.090.830 FULLANA MARIA LAURA FULLANA MARIA LAURA NA
1st Attempt
How to make this work would be the first question
library(stringr)
# This separates the strings, but i loose the Subscription_Id variable.
n[, str_split_fixed(Account_Desc, "[ +]", 4)]
# This doesn't work.
n[, paste0("V",1:4) := str_split_fixed(Account_Desc, "[ +]", 4)]
2nd Attempt
This works, but i seem to be doing the calculation 3 times. Not sure if its the most effiecient way
cols = paste0("V",1:3)
for(j in 1:3){
set(n,i=NULL,j=cols[j],value = sapply(strsplit(as.character(n$Account_Desc),"[ +]"), "[", j))
}
Let's use big_n to benchmarck
big_n <- data.table(Subscription_Id = rep(n[,Subscription_Id],1e7),
Account_Desc = rep(n[,Account_Desc],1e7)
)
回答1:
I don't work with datasets anywhere near this scale, so I have no idea if this is going to be of use or not. One thing that comes to mind is to use a matrix
and matrix indexing.
Since I'm impatient, I've only tried it on 1e5 rows on my slow system :-)
Create your sample data
big_n <- data.table(Subscription_Id = rep(n[,Subscription_Id],1e5),
Account_Desc = rep(n[,Account_Desc],1e5))
Write a function to create your matrix
StringMat <- function(input) {
Temp <- strsplit(input, " ", fixed = TRUE)
Lens <- vapply(Temp, length, 1L)
A <- unlist(Temp, use.names = FALSE)
Rows <- rep(sequence(length(Temp)), Lens)
Cols <- sequence(Lens)
m <- matrix(NA, nrow = length(Temp), ncol = max(Lens),
dimnames = list(NULL, paste0("V", sequence(max(Lens)))))
m[cbind(Rows, Cols)] <- A
m
}
Time it and view the output
system.time(outB1 <- cbind(big_n, StringMat(big_n$Account_Desc)))
# user system elapsed
# 4.524 0.000 4.533
outB1
# Subscription_Id Account_Desc V1 V2 V3
# 1: 13.855.231.846.091.000 AGUAYO CARLA AGUAYO CARLA NA
# 2: 11.156.048.529.090.800 LEIVA LILIANA LEIVA LILIANA NA
# 3: 24.940.584.090.830 FULLANA MARIA LAURA FULLANA MARIA LAURA
# 4: 242.753.039.111.124 PETREL SERGIO PETREL SERGIO NA
# 5: 27.843.782.090.830 IPTICKET SRL IPTICKET SRL NA
# ---
# 999996: 13.773.513.145.090.800 LEDESMA ORLANDO LEDESMA ORLANDO NA
# 999997: 25.691.374.090.830 CATTANEO LUIS RAUL CATTANEO LUIS RAUL
# 999998: 12.236.174.155.090.900 CABRAL CARMEN ESTELA CABRAL CARMEN ESTELA
# 999999: 252.027.904.121.210 ITURGOYEN HECTOR ITURGOYEN HECTOR NA
# 1000000: 11.136.991.054.110.100 CASA CASILDO CASA CASILDO NA
Correct the set_method
function and compare timings
set_method <- function(DT){
cols = paste0("V",1:3)
for(j in 1:3){
set(DT,i=NULL,j=cols[j],
value = sapply(strsplit(as.character(DT[, Account_Desc, with = TRUE]),
"[ +]"), "[", j))
}
}
system.time(set_method(big_n))
# user system elapsed
# 25.319 0.022 25.586
Reset the "big_n" dataset and try out str_split_fixed
(ouch!)
big_n[, c("V1", "V2", "V3") := NULL]
library(stringr)
system.time(outBrodie <- cbind(big_n, as.data.table(str_split_fixed(
big_n$Account_Desc, "[ +]", 4))))
# user system elapsed
# 204.966 0.514 206.910
回答2:
EDIT 3: Stealing Arun's blood and sweat:
cbind(n, as.data.table(str_split_fixed(n$Account_Desc, "[ +]", 4)))
This avoids the potentially costly by
and produces the same result (plus the original name column).
EDIT2: as per Arun's comment, maybe:
n.2[, c(paste0("V", 1:4)):=as.list(str_split_fixed(Account_Desc, "[ +]", 4)), by=Subscription_Id]
But you still have the by
. Old way:
n[, as.list(str_split_fixed(Account_Desc, "[ +]", 4)), by=Subscription_Id]
produces:
# Subscription_Id V1 V2 V3 V4
# 1: 13.855.231.846.091.000 AGUAYO CARLA
# 2: 11.156.048.529.090.800 LEIVA LILIANA
# 3: 24.940.584.090.830 FULLANA MARIA LAURA
# 4: 242.753.039.111.124 PETREL SERGIO
# 5: 27.843.782.090.830 IPTICKET SRL
# 6: 13.773.513.145.090.800 LEDESMA ORLANDO
# 7: 25.691.374.090.830 CATTANEO LUIS RAUL
# 8: 12.236.174.155.090.900 CABRAL CARMEN ESTELA
# 9: 252.027.904.121.210 ITURGOYEN HECTOR
# 10: 11.136.991.054.110.100 CASA CASILDO
EDIT: word of warning, some stringr
functions can be slow (not sure if this one is). If this is still slow for your process, you may want to write your own function using strsplit
and something to pad it to the appropriate length.
来源:https://stackoverflow.com/questions/21733345/best-way-to-manipulate-strings-in-big-data-table