Smote for oversampling multiclass dataset

后端 未结 1 1656
傲寒
傲寒 2021-01-26 00:18

I am trying to adapt Smote technique of chawla et al 2002 describing in the following link https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume16/chawla02a-html/node6.html to

1条回答
  •  不知归路
    2021-01-26 00:38

    i am trying to adapt Smote technique of chawla et al 2002 describing in the following link :https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume16/chawla02a-html/node6.html to a multiclass dataset like iris,my present solution decribed in the code below is adapted from the code of the function SMOTE existing in the package DMwR and it works well.

    #NewSMOTE algorithm to balance a multiclass dataset without undersampling majority class(iris as example)
    #get an unbalanced iris dataset 
    set.seed(9850)
    gp <-runif(nrow(iris))
    iris<-iris[order(gp),]
    s<-sample(150,71)
    data<-iris[s,]
    #sort classes in decreasing order
    tab<-sort(table(data$Species),decreasing=TRUE) 
    #majoritycases extrction
    selMaj <- which(data$Species==names(tab[1]))
    #takes data with minority classes in order to oversample it
    data1<-data[data$Species!=names(tab[1]),]
    #versicolor and viriginica are minority classes
    Species <-factor(data1$Species)
    dataminExs<-data.frame()
    newExs1 <-data.frame()
    class <- unique(Species)
    for(i in 1:length(class)){
      # the column where the target variable is
      tgt <- which(names(data1) == "Species")
      #getting the minority class[i]
      minCl <- as.character(class[i])
       # get the cases of the minority class
      minExs <- which(data1[,tgt] == minCl)
      #concatenate all minority classes instances
       dataminExs<- rbind(dataminExs, data1[minExs,])
      print(nrow(dataminExs))
       #calculate percentage of oversampling
     perc.over= as.integer(length(selMaj)/length(minExs))*100
      # generate synthetic cases from these minExs
      newExs <- smote.exs(data1[minExs,],ncol(data1),perc.over,5)
        #all synthetic minority examples created
      print(nrow(newExs))
        newExs1<-rbind(newExs1, newExs)
      print(newExs1 )
    #alldatamin <-rbind(newExs1, dataminExs)
         }
    # the final data set (the undersample+the rare cases+the smoted exs)
    newdataset <- rbind(data[selMaj,],newExs1)
    
    #function to oversample minority class
    smote.exs <- function(data,tgt,N,k)
      # INPUTS:
      # data are the rare cases (the minority "class" cases)
      # tgt is the name of the target variable
      # N is the percentage of over-sampling to carry out;
      # and k is the number of nearest neighours to use for the generation
      # OUTPUTS:
      # The result of the function is a (N/100)*T set of generated
      # examples with rare values on the target
    {
      nomatr <- c()
      T <- matrix(nrow=dim(data)[1],ncol=dim(data)[2]-1)
      for(col in seq.int(dim(T)[2]))
        if (class(data[,col]) %in% c('factor','character')) {
          T[,col] <- as.integer(data[,col])
          nomatr <- c(nomatr,col)
        } else T[,col] <- data[,col]
    
      if (N < 100) { # only a percentage of the T cases will be SMOTEd
        nT <- NROW(T)
        idx <- sample(1:nT,as.integer((N/100)*nT))
        T <- T[idx,]
        N <- 100
      }
    
      p <- dim(T)[2]
      nT <- dim(T)[1]
    
      ranges <- apply(T,2,max)-apply(T,2,min)
    
      nexs <-  as.integer(N/100) # this is the number of artificial exs generated
      # for each member of T
      new <- matrix(nrow=nexs*nT,ncol=p)    # the new cases
    
      for(i in 1:nT) {
    
        # the k NNs of case T[i,]
        xd <- scale(T,T[i,],ranges)
        for(a in nomatr) xd[,a] <- xd[,a]==0
        dd <- drop(xd^2 %*% rep(1, ncol(xd)))
        kNNs <- order(dd)[2:(k+1)]
    
        for(n in 1:nexs) {
          # select randomly one of the k NNs
          neig <- sample(1:k,1)
    
          ex <- vector(length=ncol(T))
    
          # the attribute values of the generated case
          difs <- T[kNNs[neig],]-T[i,]
          new[(i-1)*nexs+n,] <- T[i,]+runif(1)*difs
          for(a in nomatr)
            new[(i-1)*nexs+n,a] <- c(T[kNNs[neig],a],T[i,a])[1+round(runif(1),0)]
    
        }
      }
      newCases <- data.frame(new)
      for(a in nomatr)
        newCases[,a] <- factor(newCases[,a],levels=1:nlevels(data[,a]),labels=levels(data[,a]))
    
      newCases[,tgt] <- factor(rep(data[1,tgt],nrow(newCases)),levels=levels(data[,tgt]))
      colnames(newCases) <- colnames(data)
      newCases
    }
    

    0 讨论(0)
提交回复
热议问题