In Python, scikit-learn has a great class called LabelEncoder that maps categorical levels (strings) to an integer representation.
Is there anything in R to do this?
I wrote the following, which I think works, although its efficiency and how well it scales have not yet been tested.
# Fit a label encoding on every character/factor column of a data frame and
# apply it in one step.
#
# Arguments:
#   df            A data frame. Columns that are neither character nor factor
#                 are returned untouched.
#   plug_missing  If TRUE (default), NA values in character/factor columns are
#                 replaced by the literal value "MISSING" before encoding, so
#                 they receive their own code. If FALSE, NA values stay NA in
#                 the encoded output.
#
# Returns:
#   An unnamed list of length two:
#     [[1]] a named list mapping each encoded column name to its distinct
#           non-NA values in order of first appearance (a value's code is its
#           position in that vector);
#     [[2]] the data frame with each encoded column replaced by numeric codes.
str2Int.fit_transform <- function(df, plug_missing = TRUE) {
  if (!is.data.frame(df)) {
    stop("`df` must be a data frame", call. = FALSE)
  }

  list_of_levels <- list()

  # seq_len() gives an empty loop for a zero-column frame; 1:ncol(df) would
  # iterate over c(1, 0) and fail.
  for (i in seq_len(ncol(df))) {
    # [[ ]] always extracts the column as a plain vector, whereas df[, i]
    # keeps a one-column frame for tibble-like inputs.
    column <- df[[i]]
    if (!is.character(column) && !is.factor(column)) {
      next
    }

    if (plug_missing) {
      if (is.factor(column)) {
        # A factor only accepts values that are among its levels. union()
        # adds "MISSING" without duplicating it when the level already exists
        # (a duplicated level makes factor() fail).
        column <- factor(column, levels = union(levels(column), "MISSING"))
      }
      column[is.na(column)] <- "MISSING"
    }

    # NA is dropped from the stored levels: factor() never assigns a code to
    # NA, so keeping it would make positions disagree with the codes.
    distinct_values <- unique(column[!is.na(column)])
    list_of_levels[[colnames(df)[i]]] <- distinct_values
    df[[i]] <- as.numeric(factor(column, levels = distinct_values))
  }

  list(list_of_levels, df)
}
# Apply a previously fitted label encoding to the character/factor columns of
# a data frame.
#
# Arguments:
#   df              A data frame to encode.
#   list_of_levels  Named list mapping column names to the vector of known
#                   values for that column; a value's code is its position in
#                   the vector.
#   plug_missing    If TRUE (default), NA values in character/factor columns
#                   are replaced by the literal value "MISSING" before
#                   encoding. This is applied to every character/factor
#                   column, including those without an entry in
#                   `list_of_levels`.
#
# Returns:
#   The data frame with every character/factor column that has an entry in
#   `list_of_levels` replaced by numeric codes. Values not found among the
#   fitted levels are encoded as NA. Other columns keep their type.
str2Int.transform <- function(df, list_of_levels, plug_missing = TRUE) {
  if (!is.data.frame(df)) {
    stop("`df` must be a data frame", call. = FALSE)
  }

  # seq_len() gives an empty loop for a zero-column frame; 1:ncol(df) would
  # iterate over c(1, 0) and fail.
  for (i in seq_len(ncol(df))) {
    # [[ ]] always extracts the column as a plain vector, whereas df[, i]
    # keeps a one-column frame for tibble-like inputs.
    column <- df[[i]]
    if (!is.character(column) && !is.factor(column)) {
      next
    }

    if (plug_missing) {
      if (is.factor(column)) {
        # A factor only accepts values that are among its levels. union()
        # adds "MISSING" without duplicating it when the level already exists
        # (a duplicated level makes factor() fail).
        column <- factor(column, levels = union(levels(column), "MISSING"))
      }
      column[is.na(column)] <- "MISSING"
    }

    fitted_levels <- list_of_levels[[colnames(df)[i]]]
    if (is.null(fitted_levels)) {
      # No encoding was fitted for this column: keep it un-encoded, but with
      # any NA replacement made above.
      df[[i]] <- column
    } else {
      df[[i]] <- as.numeric(factor(column, levels = fitted_levels))
    }
  }

  df
}
# ----------------------------------------------------
# Demo: exercise both encoders on a small sample frame
# ----------------------------------------------------

# fit_transform with the text columns held as character vectors
sample_dat <- data.frame(
  a_fact = c("Red", "Blue", "Blue", NA, "Green"),
  a_int = c(1, 2, 3, 4, 5),
  a_str = c("a", "b", "c", "a", "v"),
  stringsAsFactors = FALSE
)
result <- str2Int.fit_transform(sample_dat)
result[[1]] # fitted levels, one entry per encoded column
result[[2]] # data frame with the text columns replaced by codes

# fit_transform with the same data, text columns held as factors
sample_dat <- data.frame(
  a_fact = c("Red", "Blue", "Blue", NA, "Green"),
  a_int = c(1, 2, 3, 4, 5),
  a_str = c("a", "b", "c", "a", "v"),
  stringsAsFactors = TRUE
)
result <- str2Int.fit_transform(sample_dat)
result[[1]] # fitted levels, one entry per encoded column
result[[2]] # data frame with the text columns replaced by codes

# transform: re-apply the levels fitted above to the factor-based sample
str2Int.transform(sample_dat, result[[1]])