Label Encoder functionality in R?

前端 未结 9 916
你的背包
你的背包 2021-02-06 08:56

In python, scikit has a great function called LabelEncoder that maps categorical levels (strings) to integer representation.

Is there anything in R to do this?

相关标签:
9条回答
  • 2021-02-06 09:05
    # input P to the function below is a dataframe containing only categorical variables
    numlevel <- function(P) { 
    
    n <- dim(P)[2]
    
    for(i in 1: n) {
    
      m <- length(unique(P[[i]]))
    
    levels(P[[i]]) <- c(1:m)
    
    }
    
    return(P)
    
    }
    
    Q <- numlevel(P) 
    
    0 讨论(0)
  • 2021-02-06 09:10

    Try CatEncoders package. It replicates the Python sklearn.preprocessing functionality.

    # variable to encode values
    colors = c("red", "red", "blue", "green")
    lab_enc = LabelEncoder.fit(colors)
    
    # new values are transformed to NA
    values = transform(lab_enc, c('red', 'red', 'yellow'))
    values
    
    # [1]  3  3 NA
    
    
    # doing the inverse: given the encoded numbers return the labels
    inverse.transform(lab_enc, values)
    # [1] "red" "red" NA   
    

    I would add the functionality of reporting the non-matching labels with a warning.

    PS: It also has the OneHotEncoder function.

    0 讨论(0)
  • 2021-02-06 09:14
    df<- mtcars
    
    head(df)
    
    df$cyl  <- factor(df$cyl)
    
    df$carb <- factor(df$carb)
    vec <- sapply(df, is.factor)
    
    catlevels <- sapply(df[vec], levels)
    
    #store the levels for each category
    #level appearing first is coded as 1, second as 2 so on
    
    df <- sapply(df, as.numeric)
    
    class(df) #matrix
    
    df <- data.frame(df) 
    
    #converting back to dataframe
    
    head(df)
    
    0 讨论(0)
  • 2021-02-06 09:14
    # Data
    Country <- c("France", "Spain", "Germany", "Spain", "Germany", "France")
    Age <- c(34, 27, 30, 32, 42, 30)
    Purchased <- c("No", "Yes", "No", "No", "Yes", "Yes")
    df <- data.frame(Country, Age, Purchased)
    df
    
    # Output
      Country Age Purchased
    1  France  34        No
    2   Spain  27       Yes
    3 Germany  30        No
    4   Spain  32        No
    5 Germany  42       Yes
    6  France  30       Yes
    

    Using CatEncoders package : Encoders for Categorical Variables

    library(CatEncoders)
    
    # Saving names of categorical variables
    factors <- names(which(sapply(df, is.factor)))
    
    # Label Encoder
    for (i in factors){
      encode <- LabelEncoder.fit(df[, i])
      df[, i] <- transform(encode, df[, i])
    }
    df
    
    # Output
      Country Age Purchased
    1       1  34         1
    2       3  27         2
    3       2  30         1
    4       3  32         1
    5       2  42         2
    6       1  30         2
    

    Using R base : factor function

    # Label Encoder
    levels <- c("France", "Spain", "Germany", "No", "Yes")
    labels <- c(1, 2, 3, 1, 2)
    for (i in factors){
      df[, i] <- factor(df[, i], levels = levels, labels = labels, ordered = TRUE)
    }
    df
    
    # Output
      Country Age Purchased
    1       1  34         1
    2       2  27         2
    3       3  30         1
    4       2  32         1
    5       3  42         2
    6       1  30         2
    
    0 讨论(0)
  • 2021-02-06 09:23

    If I correctly understand what do you want:

    # function which returns function which will encode vectors with values  of 'vec' 
    label_encoder = function(vec){
        levels = sort(unique(vec))
        function(x){
            match(x, levels)
        }
    }
    
    colors = c("red", "red", "blue", "green")
    
    color_encoder = label_encoder(colors) # create encoder
    
    encoded_colors = color_encoder(colors) # encode colors
    encoded_colors
    
    new_colors = c("blue", "green", "green")  # new vector
    encoded_new_colors = color_encoder(new_colors)
    encoded_new_colors
    
    other_colors = c("blue", "green", "green", "yellow") 
    color_encoder(other_colors) # NA's are introduced
    
    # save and restore to disk
    saveRDS(color_encoder, "color_encoder.RDS")
    c_encoder = readRDS("color_encoder.RDS")
    c_encoder(colors) # same result
    
    # dealing with multiple columns
    
    # create data.frame
    set.seed(123) # make result reproducible
    color_dataframe = as.data.frame(
        matrix(
            sample(c("red", "blue", "green",  "yellow"), 12, replace = TRUE),
            ncol = 3)
    )
    color_dataframe
    
    # encode each column
    for (column in colnames(color_dataframe)){
        color_dataframe[[column]] = color_encoder(color_dataframe[[column]])
    }
    color_dataframe
    
    0 讨论(0)
  • 2021-02-06 09:24

    I wrote the following which I think works, the efficiency of which and/or how it will scale is not yet tested

    str2Int.fit_transform<-function(df, plug_missing=TRUE){
    
      list_of_levels=list()  #empty list   
    
      #loop through the columns
      for (i in 1: ncol(df))
      {
    
        #only   
        if (is.character(df[,i]) ||  is.factor(df[,i]) ){
    
          #deal with missing
          if(plug_missing){
    
            #if factor
            if (is.factor(df[,i])){
              df[,i] = factor(df[,i], levels=c(levels(df[,i]), 'MISSING'))
              df[,i][is.na(df[,i])] = 'MISSING' 
    
    
            }else{   #if character
    
              df[,i][is.na(df[,i])] = 'MISSING' 
    
            }
          }#end missing IF
    
          levels<-unique(df[,i]) #distinct levels
          list_of_levels[[colnames(df)[i]]] <- levels #set list with name of the columns to the levels
          df[,i] <- as.numeric(factor(df[,i], levels = levels))
    
        }#end if character/factor IF
    
    
      }#end loop
    
      return (list(list_of_levels,df)) #return the list of levels and the new DF
    
    }#end of function
    
    
    
    str2Int.transform<-function(df,list_of_levels,plug_missing=TRUE)
    {
      #loop through the columns
      for (i in 1: ncol(df))
      {
    
        #only   
        if (is.character(df[,i]) ||  is.factor(df[,i]) ){
    
    
          #deal with missing
          if(plug_missing){
    
            #if factor
            if (is.factor(df[,i])){
              df[,i] = factor(df[,i], levels=c(levels(df[,i]), 'MISSING'))
              df[,i][is.na(df[,i])] = 'MISSING' 
    
    
            }else{   #if character
    
              df[,i][is.na(df[,i])] = 'MISSING' 
    
            }
          }#end missing IF
    
          levels=list_of_levels[[colnames(df)[i]]]
    
          if (! is.null(levels)){
            df[,i] <- as.numeric(factor(df[,i], levels = levels))
          }
    
        }# character or factor
    
      }#end of loop
    
      return(df)
    
    }#end of function
    
    
    
    
    ######################################################
    # Test the functions
    ######################################################
    
    
    
    ###Test fit transform
    
    # as strings
    sample_dat <- data.frame(a_fact=c('Red','Blue','Blue',NA,'Green'), a_int=c(1,2,3,4,5), a_str=c('a','b','c','a','v'),stringsAsFactors=FALSE)
    
    result<-str2Int.fit_transform(sample_dat)
    result[[1]] #list of levels
    result[[2]] #transformed df
    
    #as factors
    sample_dat <- data.frame(a_fact=c('Red','Blue','Blue',NA,'Green'), a_int=c(1,2,3,4,5), a_str=c('a','b','c','a','v'),stringsAsFactors=TRUE)
    
    result<-str2Int.fit_transform(sample_dat)
    result[[1]] #list of levels
    result[[2]] #transformed df
    
    
    
    ###Test transform
    str2Int.transform(sample_dat,result[[1]])
    
    0 讨论(0)
提交回复
热议问题