Using geom_smooth for fitting a glm to fractions

大憨熊 提交于 2020-08-05 08:00:07

问题


This post is somewhat related to this post.

Here I have xy grouped data where y are fractions:

library(dplyr)
library(ggplot2)
library(ggpmisc)

set.seed(1)
df1 <- data.frame(value = c(0.8,0.5,0.4,0.2,0.5,0.6,0.5,0.48,0.52),
                 age = rep(c("d2","d4","d45"),3),
                 group = c("A","A","A","B","B","B","C","C","C")) %>%
  dplyr::mutate(time = as.integer(age)) %>%
  dplyr::arrange(group,time) %>%
  dplyr::mutate(group_age=paste0(group,"_",age))
df1$group_age <- factor(df1$group_age,levels=unique(df1$group_age))

What I'm trying to achieve is to plot df1 as a bar plot, like this:

ggplot(df1,aes(x=group_age,y=value,fill=age)) +
  geom_bar(stat='identity')

But I want to fit to each group a binomial glm with a logit link function, which estimates how these fractions are affected by time.

Let's say I have 100 observations per each age (time) in each group:

df2 <- do.call(rbind,lapply(1:nrow(df1),function(i){
  data.frame(age=df1$age[i],group=df1$group[i],time=df1$time[i],group_age=df1$group_age[i],value=c(rep(T,100*df1$value[i]),rep(F,100*(1-df1$value[i]))))
}))

Then the glm for each group (e.g., group A) is:

glm(value ~ time, dplyr::filter(df2, group == "A"), family = binomial(link='logit'))

So I would like to add to the plot above the estimated regression slopes for each group along with their corresponding p-values (similar to what I'm doing for the continuous df$value in this post).

I thought that using:

ggplot(df1,aes(x=group_age,y=value,fill=age)) +
  geom_bar(stat='identity') +
  geom_smooth(data=df2,mapping=aes(x=group_age,y=value,group=group),color="black",method='glm',method.args=list(family=binomial(link='logit')),size=1,se=T) +
  stat_poly_eq(aes(label=stat(p.value.label)),formula=my_formula,parse=T,npcx="center",npcy="bottom") +
  scale_x_log10(name="Age",labels=levels(df$age),breaks=1:length(levels(df$age))) +
  facet_wrap(~group) + theme_minimal()

Would work but I get the error:

Error in Math.factor(x, base) : ‘log’ not meaningful for factors

Any idea how to get it right?


回答1:


I believe this could help:

library(tidyverse)
library(broom)

df2$value <- as.numeric(df2$value)

#Estimate coefs
dfmodel <- df2 %>% group_by(group) %>%
  do(fitmodel = glm(value ~ time, data = .,family = binomial(link='logit')))
#Extract coeffs
dfCoef = tidy(dfmodel, fitmodel)
#Create labels
dfCoef %>% filter(term=='(Intercept)') %>% mutate(Label=paste0(round(estimate,3),'(p=',round(p.value,3),')'),
                                                  group_age=paste0(group,'_','d4')) %>%
  select(c(group,Label,group_age)) -> Labels
#Values
df2 %>% group_by(group,group_age) %>% summarise(value=sum(value)) %>% ungroup() %>%
  group_by(group) %>% filter(value==max(value)) %>% select(-group_age) -> values
#Combine
Labels %>% left_join(values) -> Labels
Labels %>% mutate(age=NA) -> Labels
#Plot
ggplot(df2,aes(x=group_age,y=value,fill=age)) +
  geom_text(data=Labels,aes(x=group_age,y=value,label=Label),fontface='bold')+
  geom_bar(stat='identity')+
  facet_wrap(.~group,scales='free')




回答2:


Thanks to Pedro Aphalo this is nearly a complete solution:

Generate the data.frame with the fractions (here use time as an integer by deleting "d" in age rather than using time as the levels of age):

library(dplyr)
library(ggplot2)
library(ggpmisc)

set.seed(1)
df1 <- data.frame(value = c(0.8,0.5,0.4,0.2,0.5,0.6,0.5,0.48,0.52),
                 age = rep(c("d2","d4","d45"),3),
                 group = c("A","A","A","B","B","B","C","C","C")) %>%
  dplyr::mutate(time = as.integer(gsub("d","",age))) %>%
  dplyr::arrange(group,time) %>%
  dplyr::mutate(group_age=paste0(group,"_",age))
df1$group_age <- factor(df1$group_age,levels=unique(df1$group_age))

Inflate df1 to 100 observations per each age in each group but specify value as an integer rather than a binary:

df2 <- do.call(rbind,lapply(1:nrow(df1),function(i){
  data.frame(age=df1$age[i],group=df1$group[i],time=df1$time[i],group_age=df1$group_age[i],value=c(rep(1,100*df1$value[i]),rep(0,100*(1-df1$value[i]))))
}))

And now plot it using geom_smooth and stat_fit_tidy:

ggplot(df1,aes(x=time,y=value,group=group,fill=age)) +
  geom_bar(stat='identity') +
  geom_smooth(data=df2,mapping=aes(x=time,y=value,group=group),color="black",method='glm',method.args=list(family=binomial(link='logit'))) +
  stat_fit_tidy(data=df2,mapping=aes(x=time,y=value,group=group,label=sprintf("P = %.3g",stat(x_p.value))),method='glm',method.args=list(formula=y~x,family=binomial(link='logit')),parse=T,label.x="center",label.y="top") +
  scale_x_log10(name="Age",labels=levels(df2$age),breaks=unique(df2$time)) +
  facet_wrap(~group) + theme_minimal()

Which gives (note that the scale_x_log10 is mainly a cosmetic approach to presenting the x-axis as time rather than levels of age):

The only imperfection is that the p-values seem to appear messed up.



来源:https://stackoverflow.com/questions/62986739/using-geom-smooth-for-fitting-a-glm-to-fractions

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!