问题
I am following the tutorial over here : https://www.rpubs.com/loveb/som . This tutorial shows how to use the Kohonen Network (also called SOM, a type of machine learning algorithm) on the iris data.
I ran this code from the tutorial:
library(kohonen) #fitting SOMs
library(ggplot2) #plots
library(GGally) #plots
library(RColorBrewer) #colors, using predefined palettes
iris_complete <-iris[complete.cases(iris),]
iris_unique <- unique(iris_complete) # Remove duplicates
#scale data
iris.sc = scale(iris_unique[, 1:4]) #Levels/Factors cannot be scaled... But used in predictive SOM:s using xyf. Later.
#build grid
iris.grid = somgrid(xdim = 10, ydim=10, topo="hexagonal", toroidal = TRUE)
set.seed(33) #for reproducability
iris.som <- som(iris.sc, grid=iris.grid, rlen=700, alpha=c(0.05,0.01), keep.data = TRUE)
#plot 1
plot(iris.som, type="count")
#plot2
var <- 1 #define the variable to plot
plot(iris.som, type = "property", property = getCodes(iris.som)[,var], main=colnames(getCodes(iris.som))[var], palette.name=terrain.colors)
The above code fits a Kohonen Network on the iris data. Each observation from the data set is assigned to each one of the "colorful circles" (also called "neurons") in the below pictures.
My question: In these plots, how would you identify which observations were assigned to which circles? Suppose I wanted to know which observations belong in the circles outlined in with the black triangles below:
Is it possible to do this? Right now, I am trying to use iris.som$classif
to somehow trace which points are in which circle. Is there a better way to do this?
UPDATE: @Jonny Phelps showed me how to identify observations within a triangular form (see answer below). But i am still not sure if it possible to identify irregular shaped forms. E.g.
In a previous post (Labelling Points on a Plot (R Language)), a user showed me how to assign arbitrary numbers to each circle on the grid:
Based on the above plot, how could you use the "som$classif" statement to find out which observations were in circles 92,91,82,81,72 and 71?
Thanks
回答1:
EDIT: Now with Shiny App!
A plotly
solution is also possible, where you can mouse over individual neurons to display the associated iris rownames (called id here). Based on your iris.som
data and Jonny Phelps' grid approach, you can just assign the row numbers as concatenated strings to the individual neurons and have these shown upon mouseover:
library(ggplot2)
library(plotly)
ga <- data.frame(g=iris.som$unit.classif,
sample=seq_len(dim(iris.som$data[[1]])[1]))
grid_pts <- as.data.frame(iris.som$grid$pts)
grid_pts$column <- rep(1:iris.som$grid$xdim, by=iris.som$grid$ydim)
grid_pts$row <- rep(1:iris.som$grid$ydim, each=iris.som$grid$xdim)
grid_pts$classif <- 1:nrow(grid_pts)
grid_pts$id <- sapply(seq_along(grid_pts$classif),
function(x) paste(ga$sample[ga$g==x], collapse=", "))
grid_pts$count <- sapply(seq_along(grid_pts$classif),
function(x) length(ga$sample[ga$g==x]))
grid_pts$count <- factor(grid_pts$count, levels=0:max(grid_pts$count))
p1 <- ggplot(grid_pts, aes(x=x, y=y, colour=count, row=row, column=column, id=id)) +
geom_point(size=8) +
scale_colour_manual(values=c("grey50", heat.colors(length(unique(grid_pts$count))))) +
theme_void() +
theme(plot.margin=unit(c(1,rep(.3, 3)),"cm"))
ggplotly(p1)
Here is a full Shiny app that allows lasso selection and shows a table with the data:
invisible(suppressPackageStartupMessages(
lapply(c("shiny","dplyr","ggplot2", "plotly", "kohonen", "GGally", "DT"),
require, character.only=TRUE)))
iris_complete <- iris[complete.cases(iris),]
iris_unique <- unique(iris_complete) # Remove duplicates
#scale data
iris.sc = scale(iris_unique[, 1:4]) #Levels/Factors cannot be scaled... But used in predictive SOM:s using xyf. Later.
#build grid
iris.grid = somgrid(xdim = 10, ydim=10, topo="hexagonal", toroidal = TRUE)
set.seed(33) #for reproducability
iris.som <- som(iris.sc, grid=iris.grid, rlen=700, alpha=c(0.05,0.01), keep.data = TRUE)
ga <- data.frame(g=iris.som$unit.classif,
sample=seq_len(dim(iris.som$data[[1]])[1]))
grid_pts <- as.data.frame(iris.som$grid$pts)
grid_pts$column <- rep(1:iris.som$grid$xdim, by=iris.som$grid$ydim)
grid_pts$row <- rep(1:iris.som$grid$ydim, each=iris.som$grid$xdim)
grid_pts$classif <- 1:nrow(grid_pts)
grid_pts$id <- sapply(seq_along(grid_pts$classif),
function(x) paste(ga$sample[ga$g==x], collapse=", "))
grid_pts$count <- sapply(seq_along(grid_pts$classif),
function(x) length(ga$sample[ga$g==x]))
grid_pts$count <- factor(grid_pts$count, levels=0:max(grid_pts$count))
# Shiny app, adapted from https://gist.github.com/dgrapov/128e3be71965bf00495768e47f0428b9
ui <- fluidPage(
fluidRow(
column(12, plotlyOutput("plot", height = "600px")),
column(12, DT::dataTableOutput('data_table'))
)
)
server <- function(input, output){
output$plot <- renderPlotly({
req(data())
p <- ggplot(data = data()$data,
aes(x=x, y=y, classif=classif, colour=count, row=row, column=column, id=id)) +
geom_point(size=8) +
scale_colour_manual(
values=c("grey50", heat.colors(length(unique(grid_pts$count))))
) +
theme_void() +
theme(plot.margin=unit(c(1, rep(.3, 3)), "cm"))
obj <- data()$sel
if(nrow(obj) != 0) {
p <- p + geom_point(data=obj, mapping=aes(x=x, y=y, classif=classif,
count=count, row=row, column=column, id=id), color="blue",
size=5, inherit.aes=FALSE)
}
ggplotly(p, source="p1") %>% layout(dragmode = "lasso")
})
selected <- reactive({
event_data("plotly_selected", source = "p1")
})
output$data_table <- DT::renderDataTable(
data()$sel, filter='top', options=list(
pageLength=5, autoWidth=TRUE
)
)
data <- reactive({
tmp <- grid_pts
sel <- tryCatch(filter(grid_pts, paste(x, y, sep="_") %in%
paste(selected()$x, selected()$y, sep="_")),
error=function(e){NULL})
list(data=tmp, sel=sel)
})
}
shinyApp(ui,server)
回答2:
From what I can see, using iris.som$unit.classif
& iris.som$grid
is the way to go in isolating circles within the plotting grid. I have made an assumption that the classifier value matches the row index of iris.som$grid
so this will need some more validation. Let me know if this helps your problem :)
findTriangle <- function(top_row, top_column, side_length, iris.som,
reverse=FALSE){
# top_row: row index of the top most triangle value
# top_column: column index...
# side_length: how many rows does the triangle occupy?
# iris.som: the som object
# reverse: set to TRUE to flip the triangle
# make the grid
grid_pts <- as.data.frame(iris.som$grid$pts)
grid_pts$column <- rep(1:iris.som$grid$xdim, by=iris.som$grid$ydim)
grid_pts$row <- rep(1:iris.som$grid$ydim, each=iris.som$grid$xdim)
grid_pts$classif <- 1:nrow(grid_pts)
# starting point - top most point of the triangle
# use reverse for triangles the other way around
grid_pts$triangle <- FALSE
grid_pts[grid_pts$column == top_column & grid_pts$row == top_row, ][["triangle"]] <- TRUE
# loop through the remaining rows and fill out the triangle
value_row <- top_row
value_start_column <- grid_pts[grid_pts$triangle == TRUE,]$x
value_end_column <- grid_pts[grid_pts$triangle == TRUE,]$x
if(reverse){
row_move <- -1
}else{
row_move <- 1
}
# update triangle
for(row in 1:(side_length-1)){
value_row <- value_row + row_move
value_start_column <- value_start_column - 0.5
value_end_column <- value_end_column + 0.5
grid_pts[grid_pts$row == value_row &
grid_pts$x >= value_start_column &
grid_pts$x <= value_end_column, ]$triangle <- TRUE
}
# visualise
pl <- ggplot(grid_pts, aes(x=x, y=rev(row), col=as.factor(triangle))) +
geom_point(size=7) +
scale_color_manual(values=c("grey", "indianred")) +
theme_void()
print(pl)
return(grid_pts)
}
# take the grid and pick out the triangle
top_row <- 2
top_column <- 6
side_length <- 4
reverse <- FALSE # set to TRUE to flip the triangle ie go from the bottom
grid_pts <- findTriangle(top_row, top_column, side_length, iris.som, reverse)
# now add the classifier and merge to get the co-ordinates
iris.sc2 <- as.data.frame(iris.sc)
iris.sc2$classif <- iris.som$unit.classif
iris.sc2 <- merge(iris.sc2, grid_pts, by=c("classif"), all.x=TRUE)
# filter to the points in the triangle
iris.sc2[iris.sc2$triangle==TRUE,]
Output data:
classif Sepal.Length Sepal.Width Petal.Length Petal.Width x y column row triangle
21 16 -1.01537328 0.5506423 -1.3287735 -1.3042249 6.0 1.732051 6 2 TRUE
22 16 -1.01537328 0.3214643 -1.4419091 -1.3042249 6.0 1.732051 6 2 TRUE
39 25 -0.89501479 1.0089981 -1.3287735 -1.3042249 5.5 2.598076 5 3 TRUE
40 25 -0.77465630 1.0089981 -1.2722057 -1.3042249 5.5 2.598076 5 3 TRUE
41 25 -0.77465630 0.7798202 -1.3287735 -1.3042249 5.5 2.598076 5 3 TRUE
42 25 -1.01537328 0.7798202 -1.2722057 -1.3042249 5.5 2.598076 5 3 TRUE
43 25 -0.89501479 0.7798202 -1.2722057 -1.3042249 5.5 2.598076 5 3 TRUE
44 26 -0.89501479 0.5506423 -1.1590702 -0.9108454 6.5 2.598076 6 3 TRUE
45 26 -1.01537328 0.7798202 -1.2156380 -1.0419719 6.5 2.598076 6 3 TRUE
58 36 -0.53393933 0.7798202 -1.2722057 -1.0419719 6.0 3.464102 6 4 TRUE
59 36 -0.41358084 1.0089981 -1.3853413 -1.3042249 6.0 3.464102 6 4 TRUE
60 36 -0.53393933 0.7798202 -1.1590702 -1.3042249 6.0 3.464102 6 4 TRUE
61 37 -1.01537328 1.0089981 -1.2156380 -0.7797188 7.0 3.464102 7 4 TRUE
62 37 -1.01537328 1.0089981 -1.3853413 -1.1730984 7.0 3.464102 7 4 TRUE
63 37 -0.89501479 1.0089981 -1.3287735 -1.1730984 7.0 3.464102 7 4 TRUE
74 44 0.06785311 0.3214643 0.5945312 0.7937995 4.5 4.330127 4 5 TRUE
75 46 -0.65429782 1.4673539 -1.2722057 -1.3042249 6.5 4.330127 6 5 TRUE
76 46 -0.53393933 1.4673539 -1.2722057 -1.3042249 6.5 4.330127 6 5 TRUE
77 47 -0.89501479 1.6965319 -1.0459346 -1.0419719 7.5 4.330127 7 5 TRUE
78 47 -0.89501479 1.6965319 -1.2156380 -1.3042249 7.5 4.330127 7 5 TRUE
79 47 -0.89501479 1.4673539 -1.2722057 -1.0419719 7.5 4.330127 7 5 TRUE
80 47 -0.89501479 1.6965319 -1.2722057 -1.1730984 7.5 4.330127 7 5 TRUE
Validation plotting on the grid:
回答3:
I elaborated the example in my post, however, not on the iris data set but I suppose it is no problem: R, SOM, Kohonen Package, Outlier Detection and also added code snippets you might need. They show
- How to generate data, add outliers and depict them on plots
- How to train the SOM
- How to do the clustering
- How to use hierarchic clustering to add the cluster boundaries to the SOM plots
- Finally, I added the clusters predicted by SOM to compare them with the real clusters in which I generated the data
I think this answers your questions. It would also be nice to compare the performance of SOM with t-SNE. I have only used SOM as an experiment on the data I generated and on the real wine data set. It would also be nice to prepare heat maps if you have more than 2 variables. All the best to you analysis!
来源:https://stackoverflow.com/questions/65864333/identifying-points-by-color