Label Encoder functionality in R?

前端未结

关注

 9  954

In python, scikit has a great function called LabelEncoder that maps categorical levels (strings) to integer representation.

Is there anything in R to do this?

相关标签:

9条回答

渐次进展

2021-02-06 09:05

# input P to the function below is a dataframe containing only categorical variables
numlevel <- function(P) { 

n <- dim(P)[2]

for(i in 1: n) {

  m <- length(unique(P[[i]]))

levels(P[[i]]) <- c(1:m)

}

return(P)

}

Q <- numlevel(P)

0 讨论(0)

谎友^

2021-02-06 09:10

Try CatEncoders package. It replicates the Python sklearn.preprocessing functionality.

# variable to encode values
colors = c("red", "red", "blue", "green")
lab_enc = LabelEncoder.fit(colors)

# new values are transformed to NA
values = transform(lab_enc, c('red', 'red', 'yellow'))
values

# [1]  3  3 NA


# doing the inverse: given the encoded numbers return the labels
inverse.transform(lab_enc, values)
# [1] "red" "red" NA

I would add the functionality of reporting the non-matching labels with a warning.

PS: It also has the OneHotEncoder function.

0 讨论(0)

伪装坚强ぢ

2021-02-06 09:14

df<- mtcars

head(df)

df$cyl  <- factor(df$cyl)

df$carb <- factor(df$carb)
vec <- sapply(df, is.factor)

catlevels <- sapply(df[vec], levels)

#store the levels for each category
#level appearing first is coded as 1, second as 2 so on

df <- sapply(df, as.numeric)

class(df) #matrix

df <- data.frame(df) 

#converting back to dataframe

head(df)

0 讨论(0)

借酒劲吻你

2021-02-06 09:14

# Data
Country <- c("France", "Spain", "Germany", "Spain", "Germany", "France")
Age <- c(34, 27, 30, 32, 42, 30)
Purchased <- c("No", "Yes", "No", "No", "Yes", "Yes")
df <- data.frame(Country, Age, Purchased)
df

# Output
  Country Age Purchased
1  France  34        No
2   Spain  27       Yes
3 Germany  30        No
4   Spain  32        No
5 Germany  42       Yes
6  France  30       Yes

Using CatEncoders package : Encoders for Categorical Variables

library(CatEncoders)

# Saving names of categorical variables
factors <- names(which(sapply(df, is.factor)))

# Label Encoder
for (i in factors){
  encode <- LabelEncoder.fit(df[, i])
  df[, i] <- transform(encode, df[, i])
}
df

# Output
  Country Age Purchased
1       1  34         1
2       3  27         2
3       2  30         1
4       3  32         1
5       2  42         2
6       1  30         2

Using R base : factor function

# Label Encoder
levels <- c("France", "Spain", "Germany", "No", "Yes")
labels <- c(1, 2, 3, 1, 2)
for (i in factors){
  df[, i] <- factor(df[, i], levels = levels, labels = labels, ordered = TRUE)
}
df

# Output
  Country Age Purchased
1       1  34         1
2       2  27         2
3       3  30         1
4       2  32         1
5       3  42         2
6       1  30         2

0 讨论(0)

眼角桃花

2021-02-06 09:23

If I correctly understand what do you want:

# function which returns function which will encode vectors with values  of 'vec' 
label_encoder = function(vec){
    levels = sort(unique(vec))
    function(x){
        match(x, levels)
    }
}

colors = c("red", "red", "blue", "green")

color_encoder = label_encoder(colors) # create encoder

encoded_colors = color_encoder(colors) # encode colors
encoded_colors

new_colors = c("blue", "green", "green")  # new vector
encoded_new_colors = color_encoder(new_colors)
encoded_new_colors

other_colors = c("blue", "green", "green", "yellow") 
color_encoder(other_colors) # NA's are introduced

# save and restore to disk
saveRDS(color_encoder, "color_encoder.RDS")
c_encoder = readRDS("color_encoder.RDS")
c_encoder(colors) # same result

# dealing with multiple columns

# create data.frame
set.seed(123) # make result reproducible
color_dataframe = as.data.frame(
    matrix(
        sample(c("red", "blue", "green",  "yellow"), 12, replace = TRUE),
        ncol = 3)
)
color_dataframe

# encode each column
for (column in colnames(color_dataframe)){
    color_dataframe[[column]] = color_encoder(color_dataframe[[column]])
}
color_dataframe

0 讨论(0)

遇见更好的自我

2021-02-06 09:24

I wrote the following which I think works, the efficiency of which and/or how it will scale is not yet tested

str2Int.fit_transform<-function(df, plug_missing=TRUE){

  list_of_levels=list()  #empty list   

  #loop through the columns
  for (i in 1: ncol(df))
  {

    #only   
    if (is.character(df[,i]) ||  is.factor(df[,i]) ){

      #deal with missing
      if(plug_missing){

        #if factor
        if (is.factor(df[,i])){
          df[,i] = factor(df[,i], levels=c(levels(df[,i]), 'MISSING'))
          df[,i][is.na(df[,i])] = 'MISSING' 


        }else{   #if character

          df[,i][is.na(df[,i])] = 'MISSING' 

        }
      }#end missing IF

      levels<-unique(df[,i]) #distinct levels
      list_of_levels[[colnames(df)[i]]] <- levels #set list with name of the columns to the levels
      df[,i] <- as.numeric(factor(df[,i], levels = levels))

    }#end if character/factor IF


  }#end loop

  return (list(list_of_levels,df)) #return the list of levels and the new DF

}#end of function



str2Int.transform<-function(df,list_of_levels,plug_missing=TRUE)
{
  #loop through the columns
  for (i in 1: ncol(df))
  {

    #only   
    if (is.character(df[,i]) ||  is.factor(df[,i]) ){


      #deal with missing
      if(plug_missing){

        #if factor
        if (is.factor(df[,i])){
          df[,i] = factor(df[,i], levels=c(levels(df[,i]), 'MISSING'))
          df[,i][is.na(df[,i])] = 'MISSING' 


        }else{   #if character

          df[,i][is.na(df[,i])] = 'MISSING' 

        }
      }#end missing IF

      levels=list_of_levels[[colnames(df)[i]]]

      if (! is.null(levels)){
        df[,i] <- as.numeric(factor(df[,i], levels = levels))
      }

    }# character or factor

  }#end of loop

  return(df)

}#end of function




######################################################
# Test the functions
######################################################



###Test fit transform

# as strings
sample_dat <- data.frame(a_fact=c('Red','Blue','Blue',NA,'Green'), a_int=c(1,2,3,4,5), a_str=c('a','b','c','a','v'),stringsAsFactors=FALSE)

result<-str2Int.fit_transform(sample_dat)
result[[1]] #list of levels
result[[2]] #transformed df

#as factors
sample_dat <- data.frame(a_fact=c('Red','Blue','Blue',NA,'Green'), a_int=c(1,2,3,4,5), a_str=c('a','b','c','a','v'),stringsAsFactors=TRUE)

result<-str2Int.fit_transform(sample_dat)
result[[1]] #list of levels
result[[2]] #transformed df



###Test transform
str2Int.transform(sample_dat,result[[1]])

0 讨论(0)

1 2 下一页