Label Encoder functionality in R?

前端 未结 9 930
你的背包
你的背包 2021-02-06 08:56

In Python, scikit-learn has a great class called LabelEncoder that maps categorical levels (strings) to an integer representation.

Is there anything in R to do this?

9条回答
  •  遇见更好的自我
    2021-02-06 09:24

    I wrote the following, which I think works; its efficiency and how well it scales have not been tested yet.

    # Fit a label encoding on a data frame and apply it in one step.
    #
    # Every character or factor column is replaced by numeric codes, assigned
    # in order of first appearance of each distinct value. All other columns
    # are left untouched.
    #
    # Args:
    #   df: a data frame.
    #   plug_missing: if TRUE (default), NA entries in character/factor columns
    #     are replaced by the literal value 'MISSING' before encoding, so they
    #     receive their own code. If FALSE, NA entries stay NA in the output.
    #
    # Returns:
    #   An unnamed list of length two:
    #     [[1]] a list, named by column, holding the distinct values of each
    #           encoded column in code order (a factor when the column was a
    #           factor); pass it to str2Int.transform to encode new data.
    #     [[2]] the data frame with the encoded columns.
    str2Int.fit_transform <- function(df, plug_missing = TRUE) {
      list_of_levels <- list()

      # seq_len() instead of 1:ncol(df): a zero-column data frame must produce
      # an empty loop, not the indices c(1, 0).
      for (i in seq_len(ncol(df))) {
        # [[ ]] always extracts the column itself as a vector.
        column <- df[[i]]

        if (!is.character(column) && !is.factor(column)) {
          next
        }

        if (plug_missing) {
          # A factor only accepts declared levels, so 'MISSING' has to be
          # registered first -- but only if it is not there yet, because
          # duplicated factor levels are an error.
          if (is.factor(column) && !("MISSING" %in% levels(column))) {
            levels(column) <- c(levels(column), "MISSING")
          }
          column[is.na(column)] <- "MISSING"
        }

        # Distinct values in order of first appearance define the codes.
        seen <- unique(column)
        list_of_levels[[colnames(df)[i]]] <- seen
        df[[i]] <- as.numeric(factor(column, levels = seen))
      }

      list(list_of_levels, df)
    }
    
    
    
    # Encode a data frame with levels obtained from a previous fit.
    #
    # Each character or factor column whose name has an entry in
    # list_of_levels is replaced by the numeric position of its values within
    # that entry. Values that do not occur in the entry become NA. Character
    # or factor columns without an entry are not encoded, and non-text
    # columns are left untouched.
    #
    # Args:
    #   df: a data frame.
    #   list_of_levels: a list, named by column, of the values defining the
    #     codes (the first element returned by str2Int.fit_transform).
    #   plug_missing: if TRUE (default), NA entries in every character/factor
    #     column are replaced by the literal value 'MISSING' before encoding.
    #
    # Returns:
    #   The data frame with the encoded columns.
    str2Int.transform <- function(df, list_of_levels, plug_missing = TRUE) {
      # seq_len() instead of 1:ncol(df): a zero-column data frame must produce
      # an empty loop, not the indices c(1, 0).
      for (i in seq_len(ncol(df))) {
        # [[ ]] always extracts the column itself as a vector.
        column <- df[[i]]

        if (!is.character(column) && !is.factor(column)) {
          next
        }

        if (plug_missing) {
          # A factor only accepts declared levels, so 'MISSING' has to be
          # registered first -- but only if it is not there yet, because
          # duplicated factor levels are an error.
          if (is.factor(column) && !("MISSING" %in% levels(column))) {
            levels(column) <- c(levels(column), "MISSING")
          }
          column[is.na(column)] <- "MISSING"
        }

        known <- list_of_levels[[colnames(df)[i]]]
        if (!is.null(known)) {
          column <- as.numeric(factor(column, levels = known))
        }

        df[[i]] <- column
      }

      df
    }
    
    
    
    
    # ----------------------------------------------------
    # Demo: exercise both encoders on a small data frame
    # ----------------------------------------------------

    # --- fit_transform with text columns kept as character ---
    sample_dat <- data.frame(
      a_fact = c("Red", "Blue", "Blue", NA, "Green"),
      a_int = c(1, 2, 3, 4, 5),
      a_str = c("a", "b", "c", "a", "v"),
      stringsAsFactors = FALSE
    )
    result <- str2Int.fit_transform(sample_dat)
    result[[1]]  # levels recorded per encoded column
    result[[2]]  # data frame after encoding

    # --- fit_transform with text columns converted to factors ---
    sample_dat <- data.frame(
      a_fact = c("Red", "Blue", "Blue", NA, "Green"),
      a_int = c(1, 2, 3, 4, 5),
      a_str = c("a", "b", "c", "a", "v"),
      stringsAsFactors = TRUE
    )
    result <- str2Int.fit_transform(sample_dat)
    result[[1]]  # levels recorded per encoded column
    result[[2]]  # data frame after encoding

    # --- transform: re-encode the factor data frame with the levels just fitted ---
    str2Int.transform(sample_dat, result[[1]])
    

提交回复
热议问题