I have a sample of 1 million records obtained from my original data. (For your reference, you may use this dummy data, which generates an approximately similar distribution.)
Look at the structure of the returned object (this should be documented in the help):
> # simple mixture of normals:
> x=c(rnorm(10000,8,2),rnorm(10000,17,4))
> xMix = normalmixEM(x, lambda=NULL, mu=NULL, sigma=NULL)
Now what:
> str(xMix)
List of 9
$ x : num [1:20000] 6.18 9.92 9.07 8.84 9.93 ...
$ lambda : num [1:2] 0.502 0.498
$ mu : num [1:2] 7.99 17.05
$ sigma : num [1:2] 2.03 4.02
$ loglik : num -59877
The lambda, mu, and sigma components define the returned normal densities. You can plot these in ggplot using `qplot` and `stat_function`. But first make a function that returns scaled normal densities:
# Normal density scaled by a mixing weight: lambda * dnorm(x, mean, sd).
# With the default lambda = 1 this is the plain normal density.
#   x      - points at which to evaluate the density
#   mean   - mean of the normal component
#   sd     - standard deviation of the normal component
#   lambda - multiplier applied to the density (the component's weight)
sdnorm <- function(x, mean = 0, sd = 1, lambda = 1) {
  unscaled <- dnorm(x, mean = mean, sd = sd)
  lambda * unscaled
}
Then:
# Kernel density of the raw sample, with each fitted component's weighted
# normal density (sdnorm) drawn on top as a filled polygon.
qplot(x, geom = "density") +
  # component 1
  stat_function(
    fun = sdnorm,
    args = list(
      mean = xMix$mu[1],
      sd = xMix$sigma[1],
      lambda = xMix$lambda[1]
    ),
    fill = "blue",
    geom = "polygon"
  ) +
  # component 2
  stat_function(
    fun = sdnorm,
    args = list(
      mean = xMix$mu[2],
      sd = xMix$sigma[2],
      lambda = xMix$lambda[2]
    ),
    fill = "#FF0000",
    geom = "polygon"
  )
Or use whatever `ggplot` skills you have. Transparent colours on the densities might be nice.
# Density-scaled histogram of the sample with both fitted components overlaid.
# The 8-digit hex fills carry an alpha channel ("80"), so the two polygons
# stay see-through where they overlap.
ggplot(data.frame(x = x)) +
  geom_histogram(
    aes(x = x, y = ..density..),
    fill = "white",
    color = "black"
  ) +
  # second component, drawn first (red)
  stat_function(
    fun = sdnorm,
    args = list(
      mean = xMix$mu[2],
      sd = xMix$sigma[2],
      lambda = xMix$lambda[2]
    ),
    fill = "#FF000080",
    geom = "polygon"
  ) +
  # first component, drawn on top (green)
  stat_function(
    fun = sdnorm,
    args = list(
      mean = xMix$mu[1],
      sd = xMix$sigma[1],
      lambda = xMix$lambda[1]
    ),
    fill = "#00FF0080",
    geom = "polygon"
  )
producing:
Here's a slightly different approach which uses `geom_polygon(...)` instead of multiple calls to `stat_function(...)`. One problem with `stat_function(...)` is that the secondary arguments (mu, sigma, and lambda in this example), which are passed using the `args=list(...)` parameter, cannot be included in an aesthetic mapping, so you have to make a separate call to `stat_function(...)` for each component, as in @Spacedman's solution.
This approach builds the PDFs outside of ggplot and uses a single call to `geom_polygon(...)`. As a result, it works without modification for an arbitrary number of distributions in the mixture.
# ggplot mixture plot
#
# Draws a density-scaled histogram of the fitted data and overlays one filled,
# semi-transparent polygon per mixture component (lambda * normal density).
#
# EM: a fitted mixture object providing the fields read below:
#       x         - the data the model was fitted to
#       posterior - matrix whose column names label the components
#       mu, sigma, lambda - per-component mean, sd and mixing weight
#     (e.g. the value returned by mixtools::normalmixEM()).
# Returns the ggplot object; the legend shows each component's mean.
gg.mixEM <- function(EM) {
  # library() instead of require(): a missing ggplot2 fails here with a clear
  # error rather than later with "could not find function".
  library(ggplot2)

  # Evaluation grid spanning the observed data range.
  grid <- with(EM, seq(min(x), max(x), length.out = 1000))

  # One row per component. Factor levels are fixed to the component order so
  # the legend labels below line up with the fill scale's breaks even when
  # alphabetical order differs (e.g. "comp.10" vs "comp.2").
  comp.names <- colnames(EM$posterior)
  pars <- with(EM, data.frame(
    comp = factor(comp.names, levels = comp.names),
    mu, sigma, lambda
  ))

  # Each grid point is repeated once per component; data.frame() recycles the
  # rows of `pars` alongside, giving every (x, component) pair.
  em.df <- data.frame(x = rep(grid, each = nrow(pars)), pars)
  em.df$y <- with(em.df, lambda * dnorm(x, mean = mu, sd = sigma))

  ggplot(data.frame(x = EM$x), aes(x, y = ..density..)) +
    geom_histogram(fill = NA, color = "black") +
    geom_polygon(
      data = em.df, aes(x, y, fill = comp),
      color = "grey50", alpha = 0.5
    ) +
    # Labels come from `pars` (one per component), not from the expanded
    # per-grid-point data frame.
    scale_fill_discrete(
      "Component\nMeans",
      labels = format(pars$mu, digits = 3)
    ) +
    theme_bw()
}
library(mixtools)

# ---- Example 1: two components ----
set.seed(1)  # fixed seed so the example is reproducible
# `mean` is recycled across the draws, so values alternate between a
# N(8, 2) and a N(17, 2) component.
b <- rnorm(2e6, mean = c(8, 17), sd = 2)
# Subsample 1m records. (`c` is kept as the variable name; calls to c()
# still resolve to the base function.)
c <- b[sample(length(b), 1e6)]
c2 <- normalmixEM(c, lambda = NULL, mu = NULL, sigma = NULL)
gg.mixEM(c2)

# ---- Example 2: three components ----
set.seed(1)
b <- rnorm(2e6, mean = c(8, 17, 30), sd = c(2, 3, 5))
c <- b[sample(length(b), 1e6)]
c3 <- normalmixEM(c, k = 3, lambda = NULL, mu = NULL, sigma = NULL)
gg.mixEM(c3)