I have a set of documents that I am trying to cluster based on their vocabulary (that is, first making a corpus and then a sparse matrix with the DocumentTermMatrix function).
I had a similar problem some time ago.
Here is what I did:
require("tm")
require("skmeans")
require("slam")
# Most frequent words per cluster of an skmeans clustering.
#
# Arguments:
#   clus:   an skmeans object; clus$cluster holds one cluster id per document
#   dtm:    the Document Term Matrix that was clustered (documents in rows,
#           terms in columns)
#   first:  maximum number of words reported per cluster, e.g. the 10 most
#           frequent ones
#   unique: if FALSE all words of the DTM will be used;
#           if TRUE only cluster specific words will be used, i.e. words
#           which occur in more than one cluster will be ignored
#
# Result: a list with one element per cluster, named "CLUSTER_<id>". Each
#   element is a named numeric vector (word -> frequency within the cluster)
#   sorted by decreasing frequency and truncated to `first` entries. Words
#   that never occur in a cluster are not listed for that cluster.
#
# Signals an error if clus is not an skmeans object or if clus and dtm do not
# describe the same number of documents.
mfrq_words_per_cluster <- function(clus, dtm, first = 10, unique = TRUE) {
  if (!inherits(clus, "skmeans")) {
    stop("clus must be an skmeans object", call. = FALSE)
  }
  dtm <- as.simple_triplet_matrix(dtm)

  assignment <- clus$cluster
  if (length(assignment) != nrow(dtm)) {
    stop("clus and dtm must describe the same documents", call. = FALSE)
  }

  # Line the cluster assignments up with the rows of dtm. When both carry the
  # same set of unambiguous document names, match by name; otherwise rely on
  # position. (Going through table() would sort the documents by name and
  # silently pair the wrong rows with each cluster.)
  doc_ids <- rownames(dtm)
  clus_ids <- names(assignment)
  if (!is.null(doc_ids) && !is.null(clus_ids) &&
      anyDuplicated(doc_ids) == 0L && anyDuplicated(clus_ids) == 0L &&
      setequal(doc_ids, clus_ids)) {
    assignment <- assignment[doc_ids]
  }

  # Row indices of the member documents, one list element per cluster id.
  members <- split(seq_along(assignment), assignment)

  # Term-by-cluster frequency matrix. cbind() keeps it a matrix even when the
  # DTM has a single term, and drop = FALSE keeps one-document clusters 2-d.
  frq <- do.call(cbind, lapply(members, function(rows) {
    col_sums(dtm[rows, , drop = FALSE])
  }))

  if (unique) {
    # Keep only words that occur in exactly one cluster; drop = FALSE so that
    # a single surviving word (or a single cluster) does not collapse the
    # matrix into a plain vector.
    frq <- frq[rowSums(frq > 0) == 1, , drop = FALSE]
  }

  # Export to a list: per cluster, order the words and take the first x.
  res <- lapply(seq_len(ncol(frq)), function(k) {
    counts <- frq[, k]
    names(counts) <- rownames(frq)
    # A word with frequency 0 is not a frequent word of this cluster.
    counts <- counts[counts > 0]
    head(sort(counts, decreasing = TRUE), first)
  })
  names(res) <- paste0("CLUSTER_", names(members))
  res
}
A small example:
# Build a document-term matrix from the "crude" example corpus, cluster the
# documents into 3 groups and list the most frequent words of each group.
data("crude")
dtm <- DocumentTermMatrix(
  crude,
  control = list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE
  )
)
# Label the documents from the matrix itself instead of a hard-coded count.
rownames(dtm) <- paste0("Doc_", seq_len(nrow(dtm)))
# NOTE(review): skmeans' default fitting method appears to be stochastic;
# fixing the seed keeps the printed clusters reproducible between runs.
set.seed(1)
clus <- skmeans(dtm, 3)
# Cluster-specific words only (the default) ...
mfrq_words_per_cluster(clus, dtm)
# ... and the overall most frequent words per cluster.
mfrq_words_per_cluster(clus, dtm, unique = FALSE)
Hope this helps.