问题
I have a set of spatial coordinate (x,y) data that has a response variable for each coordinate over the course of several years. The following code generates a similar data frame:
df <- data.frame(
id = rep(1:2, 2),
x = rep(c(25, 30),10),
y = rep(c(100, 200), 10),
year = rep(1980:1989, 2),
response = rnorm(20)
)
The resulting data frame:
head(df)
id x y year response
1 1 25 100 1980 0.1707431
2 2 30 200 1981 1.3562263
3 1 25 100 1982 -0.4590506
4 2 30 200 1983 1.3238410
5 1 25 100 1984 1.7765772
6 2 30 200 1985 -0.6258069
I want to run a linear regression on each cell through time to get the change in response variable. To do this, I use plyr and the ddply command:
require(plyr)
lm.df <- ddply(df, .(id), function(z)coef(lm(response ~ year, data = z)))
Then I recombine w/ the spatial data (the example here of the recombination is a bit simple, but works for the example.):
points <- data.frame(
id = c(1,2),
x = c(25,30),
y = c(100,200)
)
lm.stack <- merge(points, lm.df, by="id")
colnames(lm.stack) <- c("ID", "x", "y", "intercept", "slope")
print(lm.stack)
ID x y intercept slope
1 1 25 100 257.7291 -0.12985632
2 2 30 200 173.3676 -0.08708068
This works great. But what I want, is to be able to extract the adj r squared value of each lm model to be able to denote significant trends by coordinate cell w/ the ultimate goal of projecting a map of cells colored by significance value. This is where I need help and am greatly appreciative. Thanks!
回答1:
I would suggest a combination of packages dplyr
and broom
.
This approach will return everything you need to know about the model (info about coefficients and info about model itself) as a dataframe:
set.seed(25)
df <- data.frame(
id = rep(1:2, 2),
x = rep(c(25, 30),10),
y = rep(c(100, 200), 10),
year = rep(1980:1989, 2),
response = rnorm(20)
)
df
# id x y year response
# 1 1 25 100 1980 -0.21183360
# 2 2 30 200 1981 -1.04159113
# 3 1 25 100 1982 -1.15330756
# 4 2 30 200 1983 0.32153150
# 5 1 25 100 1984 -1.50012988
# 6 2 30 200 1985 -0.44553326
# 7 1 25 100 1986 1.73404543
# 8 2 30 200 1987 0.51129562
# 9 1 25 100 1988 0.09964504
# 10 2 30 200 1989 -0.05789111
# 11 1 25 100 1980 -1.74278763
# 12 2 30 200 1981 -1.32495298
# 13 1 25 100 1982 -0.54793388
# 14 2 30 200 1983 -1.45638428
# 15 1 25 100 1984 0.08268682
# 16 2 30 200 1985 0.92757895
# 17 1 25 100 1986 -0.71676933
# 18 2 30 200 1987 0.96239968
# 19 1 25 100 1988 1.54588458
# 20 2 30 200 1989 -1.00976361
library(dplyr)
library(broom)
df %>%
group_by(id) %>%
do({model = lm(response~year, data=.) # create your model
data.frame(tidy(model), # get coefficient info
glance(model))}) # get model info
# id term estimate std.error statistic p.value r.squared adj.r.squared sigma statistic.1 p.value.1 df logLik AIC BIC deviance df.residual
# 1 1 (Intercept) -492.2144842 213.19252113 -2.308779 0.04978386 0.3996362 0.32459069 0.9611139 5.325253 0.04987162 2 -12.67704 31.35409 32.26184 7.389919 8
# 2 1 year 0.2479705 0.10745580 2.307651 0.04987162 0.3996362 0.32459069 0.9611139 5.325253 0.04987162 2 -12.67704 31.35409 32.26184 7.389919 8
# 3 2 (Intercept) -258.6253012 196.88284243 -1.313600 0.22539989 0.1771294 0.07427055 0.8871395 1.722063 0.22582607 2 -11.87614 29.75227 30.66003 6.296132 8
# 4 2 year 0.1301582 0.09918521 1.312274 0.22582607 0.1771294 0.07427055 0.8871395 1.722063 0.22582607 2 -11.87614 29.75227 30.66003 6.296132 8
You can use group_by(id,x,y)
if you really want your x and y columns in your final dataframe. For example if you want a similar output to the one you provided you can do :
library(dplyr)
library(broom)
library(tidyr)
dd =
df %>%
group_by(id, x, y) %>%
do({model = lm(response~year, data=.) # create your model
data.frame(tidy(model), # get coefficient info
glance(model))}) # get model info
Then select the info you need:
dd %>%
select(id, x, y, term, estimate, adj.r.squared)
# id x y term estimate adj.r.squared
# 1 1 25 100 (Intercept) -492.2144842 0.32459069
# 2 1 25 100 year 0.2479705 0.32459069
# 3 2 30 200 (Intercept) -258.6253012 0.07427055
# 4 2 30 200 year 0.1301582 0.07427055
where you get one row for the intercept and one row for the slope.
Or, even reshape that data frame to:
dd %>%
select(id, x, y, term, estimate, adj.r.squared) %>%
spread(term, estimate)
# id x y adj.r.squared (Intercept) year
# 1 1 25 100 0.32459069 -492.2145 0.2479705
# 2 2 30 200 0.07427055 -258.6253 0.1301582
column year
is the slope
(coefficient of variable year
)
In a similar way you can also use package purrr
to create a new data frame that has list columns with the corresponding info:
set.seed(25)
df <- data.frame(
id = rep(1:2, 2),
x = rep(c(25, 30),10),
y = rep(c(100, 200), 10),
year = rep(1980:1989, 2),
response = rnorm(20)
)
library(dplyr)
library(broom)
library(tidyr)
library(purrr)
dd = df %>%
group_by(id, x, y) %>% # for each combination of those variables
nest() %>% # nest data (rest of columns)
mutate(Model = map(data, ~lm(response~year, data=.)), # use nested data to build model of interest
Coeff_Info = map(Model, tidy), # get coefficient info
Model_Info = map(Model, glance)) %>% # get model info
ungroup() # forget the grouping
# check how new dataset looks like
dd
# # A tibble: 2 x 7
# id x y data Model Coeff_Info Model_Info
# <int> <dbl> <dbl> <list> <list> <list> <list>
# 1 1 25 100 <tibble [10 x 2]> <S3: lm> <data.frame [2 x 5]> <data.frame [1 x 11]>
# 2 2 30 200 <tibble [10 x 2]> <S3: lm> <data.frame [2 x 5]> <data.frame [1 x 11]>
You can still access any element you want, but remember that some of your columns now have list elements:
# get coefficient info for both models
dd$Coeff_Info
# [[1]]
# term estimate std.error statistic p.value
# 1 (Intercept) -492.2144842 213.1925211 -2.308779 0.04978386
# 2 year 0.2479705 0.1074558 2.307651 0.04987162
#
# [[2]]
# term estimate std.error statistic p.value
# 1 (Intercept) -258.6253012 196.88284243 -1.313600 0.2253999
# 2 year 0.1301582 0.09918521 1.312274 0.2258261
# get the r squared value for 1st model
dd %>% # from new dataset
filter(id == 1) %>% # keep all info / rows of 1st model
pull(Model_Info) %>% # get model info
map_dbl("r.squared") # show r squared
# [1] 0.3996362
Or, alternatively
dd %>% # from new dataset
unnest(id, Model_Info) %>% # umnest id and model info
filter(id == 1) %>% # keep row of 1st model
pull(r.squared) # show the r squared
# [1] 0.3996362
回答2:
If you keep the models in a list, then you can easily extract whatever you want from them using lapply
or any of the other list traversal functions.
lms <- dlply(df, .(id), function(x) lm(response ~ year, data=x))
rsq <- lapply(lms, function(x) summary(x)$adj.r.squared) # get the adjusted r2
An alternative to making the list yourself, is lmList
from nlme
library(nlme)
lms2 <- lmList(response ~ year | id, data=df)
summary(lms2)$adj.r.squared
来源:https://stackoverflow.com/questions/32274779/extracting-p-values-from-multiple-linear-regression-lm-inside-of-a-ddply-funct