I am trying to adapt Smote technique of chawla et al 2002 describing in the following link https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume16/chawla02a-html/node6.html to
i am trying to adapt Smote technique of chawla et al 2002 describing in the following link :https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume16/chawla02a-html/node6.html to a multiclass dataset like iris,my present solution decribed in the code below is adapted from the code of the function SMOTE existing in the package DMwR and it works well.
#NewSMOTE algorithm to balance a multiclass dataset without undersampling majority class(iris as example)
#get an unbalanced iris dataset
set.seed(9850)
gp <-runif(nrow(iris))
iris<-iris[order(gp),]
s<-sample(150,71)
data<-iris[s,]
#sort classes in decreasing order
tab<-sort(table(data$Species),decreasing=TRUE)
#majoritycases extrction
selMaj <- which(data$Species==names(tab[1]))
#takes data with minority classes in order to oversample it
data1<-data[data$Species!=names(tab[1]),]
#versicolor and viriginica are minority classes
Species <-factor(data1$Species)
dataminExs<-data.frame()
newExs1 <-data.frame()
class <- unique(Species)
for(i in 1:length(class)){
# the column where the target variable is
tgt <- which(names(data1) == "Species")
#getting the minority class[i]
minCl <- as.character(class[i])
# get the cases of the minority class
minExs <- which(data1[,tgt] == minCl)
#concatenate all minority classes instances
dataminExs<- rbind(dataminExs, data1[minExs,])
print(nrow(dataminExs))
#calculate percentage of oversampling
perc.over= as.integer(length(selMaj)/length(minExs))*100
# generate synthetic cases from these minExs
newExs <- smote.exs(data1[minExs,],ncol(data1),perc.over,5)
#all synthetic minority examples created
print(nrow(newExs))
newExs1<-rbind(newExs1, newExs)
print(newExs1 )
#alldatamin <-rbind(newExs1, dataminExs)
}
# the final data set (the undersample+the rare cases+the smoted exs)
newdataset <- rbind(data[selMaj,],newExs1)
#function to oversample minority class
smote.exs <- function(data,tgt,N,k)
# INPUTS:
# data are the rare cases (the minority "class" cases)
# tgt is the name of the target variable
# N is the percentage of over-sampling to carry out;
# and k is the number of nearest neighours to use for the generation
# OUTPUTS:
# The result of the function is a (N/100)*T set of generated
# examples with rare values on the target
{
nomatr <- c()
T <- matrix(nrow=dim(data)[1],ncol=dim(data)[2]-1)
for(col in seq.int(dim(T)[2]))
if (class(data[,col]) %in% c('factor','character')) {
T[,col] <- as.integer(data[,col])
nomatr <- c(nomatr,col)
} else T[,col] <- data[,col]
if (N < 100) { # only a percentage of the T cases will be SMOTEd
nT <- NROW(T)
idx <- sample(1:nT,as.integer((N/100)*nT))
T <- T[idx,]
N <- 100
}
p <- dim(T)[2]
nT <- dim(T)[1]
ranges <- apply(T,2,max)-apply(T,2,min)
nexs <- as.integer(N/100) # this is the number of artificial exs generated
# for each member of T
new <- matrix(nrow=nexs*nT,ncol=p) # the new cases
for(i in 1:nT) {
# the k NNs of case T[i,]
xd <- scale(T,T[i,],ranges)
for(a in nomatr) xd[,a] <- xd[,a]==0
dd <- drop(xd^2 %*% rep(1, ncol(xd)))
kNNs <- order(dd)[2:(k+1)]
for(n in 1:nexs) {
# select randomly one of the k NNs
neig <- sample(1:k,1)
ex <- vector(length=ncol(T))
# the attribute values of the generated case
difs <- T[kNNs[neig],]-T[i,]
new[(i-1)*nexs+n,] <- T[i,]+runif(1)*difs
for(a in nomatr)
new[(i-1)*nexs+n,a] <- c(T[kNNs[neig],a],T[i,a])[1+round(runif(1),0)]
}
}
newCases <- data.frame(new)
for(a in nomatr)
newCases[,a] <- factor(newCases[,a],levels=1:nlevels(data[,a]),labels=levels(data[,a]))
newCases[,tgt] <- factor(rep(data[1,tgt],nrow(newCases)),levels=levels(data[,tgt]))
colnames(newCases) <- colnames(data)
newCases
}