问题
Given the following data frame:
# Example input as produced by dput(): 6 rows, one column per year from -5 to 5.
# The answers below refer to this data frame as `dat`.
structure(list(`-5` = c(0, 1, 0, 0, 9, 22), `-4` = c(1, 3, 0,
0, 1, 17), `-3` = c(1, 3, 0, 0, 0, 12), `-2` = c(1, 3, 0, 0,
2, 10), `-1` = c(0, 0, 0, 4, 3, 9), `0` = c(0, 1, 0, 2, 2, 21
), `1` = c(0, 1, 1, 7, 1, 21), `2` = c(1, 0, 1, 2, 1, 10), `3` = c(0,
9, 0, 6, 1, 12), `4` = c(0, 2, 0, 5, 0, 18), `5` = c(0, 0, 0,
3, 0, 23)), .Names = c("-5", "-4", "-3", "-2", "-1", "0", "1",
"2", "3", "4", "5"), row.names = c(NA, 6L), class = "data.frame")
# -5 -4 -3 -2 -1 0 1 2 3 4 5
#1 0 1 1 1 0 0 0 1 0 0 0
#2 1 3 3 3 0 1 1 0 9 2 0
#3 0 0 0 0 0 0 1 1 0 0 0
#4 0 0 0 0 4 2 7 2 6 5 3
#5 9 1 0 2 3 2 1 1 1 0 0
#6 22 17 12 10 9 21 21 10 12 18 23
I would like R to give me the slope for all the data points in each row for columns -5:-1. Basically the slope for a linear regression trendline based on those 5 data points. Then a second slope for all the data points for the columns 1:5. The year 0 is ignored.
Basically this is what it would look like (the two last columns computed using Excel):
# Desired result: the input columns plus one least-squares slope per row for
# years -5..-1 ("Negative Years") and one for years 1..5 ("Positive Years").
# Row 2 of "Negative Years" is -0.2: the slope of (1, 3, 3, 3, 0) over -5:-1.
structure(list(`-5` = c(0, 1, 0, 0, 9, 22), `-4` = c(1, 3, 0,
0, 1, 17), `-3` = c(1, 3, 0, 0, 0, 12), `-2` = c(1, 3, 0, 0,
2, 10), `-1` = c(0, 0, 0, 4, 3, 9), `0` = c(0, 1, 0, 2, 2, 21
), `1` = c(0, 1, 1, 7, 1, 21), `2` = c(1, 0, 1, 2, 1, 10), `3` = c(0,
9, 0, 6, 1, 12), `4` = c(0, 2, 0, 5, 0, 18), `5` = c(0, 0, 0,
3, 0, 23), `Negative Years` = c(0, -0.2, 0, 0.8, -1.1, -3.3), `Positive Years` = c(-0.1,
0, -0.3, -0.5, -0.3, 1.2)), .Names = c("-5", "-4", "-3", "-2",
"-1", "0", "1", "2", "3", "4", "5", "Negative Years", "Positive Years"
), row.names = c(NA, 6L), class = "data.frame")
# -5 -4 -3 -2 -1 0 1 2 3 4 5 Negative Years Positive Years
#1 0 1 1 1 0 0 0 1 0 0 0 0.0 -0.1
#2 1 3 3 3 0 1 1 0 9 2 0 -0.2 0.0
#3 0 0 0 0 0 0 1 1 0 0 0 0.0 -0.3
#4 0 0 0 0 4 2 7 2 6 5 3 0.8 -0.5
#5 9 1 0 2 3 2 1 1 1 0 0 -1.1 -0.3
#6 22 17 12 10 9 21 21 10 12 18 23 -3.3 1.2
回答1:
This is what a statistician (not a data scientist) would do.
Let your data frame be dat.
## Estimate both slopes for every row of dat with a single multivariate
## linear model: each row of dat becomes one response column, and the
## interaction with the sign of the year gives a separate line on each side
## of year 0.
Y <- t(dat)                   ## response matrix (years in rows)
t <- as.numeric(rownames(Y))  ## time stamps, read from dat's column names
## Group index. Year 0 maps to NA, so lm() drops that observation. The levels
## are given explicitly because the default order of "-" and "+" follows the
## locale's collation, which would change which slope is the baseline.
id <- factor(c("-", NA, "+")[sign(t) + 2], levels = c("-", "+"))
fit <- lm(Y ~ t * id)         ## mlm
## "t" is the slope for the "-" group; "t:id+" is the slope difference.
m <- coef(fit)[c("t", "t:id+"), , drop = FALSE]  ## coefficient matrix
m["t:id+", ] <- m["t:id+", ] + m["t", ]          ## reverse contrast
rownames(m) <- c("Negative Years", "Positive Years")
round(t(m), 2)
#   Negative Years Positive Years
# 1            0.0           -0.1
# 2           -0.2            0.0
# 3            0.0           -0.3
# 4            0.8           -0.5
# 5           -1.1           -0.3
# 6           -3.3            1.2
Change column names to what you desire.
回答2:
# Append two columns to dat: the least-squares slope of each row over the
# negative years and over the positive years, rounded to 2 decimals.
years <- as.numeric(names(dat))

# Slope of the regression line of y on x.
fit_slope <- function(y, x) unname(coef(lm(y ~ x))[2])

# One rounded slope per row of dat, using only the columns flagged in `keep`.
# Year 0 is never selected, so no degenerate one-point model is fitted.
row_slopes <- function(keep) {
  slopes <- vapply(
    seq_len(nrow(dat)),
    function(i) fit_slope(unlist(dat[i, keep], use.names = FALSE), years[keep]),
    numeric(1)
  )
  round(slopes, 2)
}

cbind(
  dat,
  `Negative Years` = row_slopes(years < 0),
  `Positive Years` = row_slopes(years > 0)
)
-5 -4 -3 -2 -1 0 1 2 3 4 5 Negative Years Positive Years
1 0 1 1 1 0 0 0 1 0 0 0 0.0 -0.1
2 1 3 3 3 0 1 1 0 9 2 0 -0.2 0.0
3 0 0 0 0 0 0 1 1 0 0 0 0.0 -0.3
4 0 0 0 0 4 2 7 2 6 5 3 0.8 -0.5
5 9 1 0 2 3 2 1 1 1 0 0 -1.1 -0.3
6 22 17 12 10 9 21 21 10 12 18 23 -3.3 1.2
using tidyverse:
library(tidyverse)

# Per-row slopes before and after year 0, appended to dat.
# Rows are tracked by a numeric id rather than by the names X1, X2, ...
# so the slopes stay aligned with dat when it has 10 or more rows
# (as strings, "X10" sorts before "X2").
slopes <- dat %>%
  rowid_to_column() %>%
  pivot_longer(-rowid, names_to = "x", values_to = "val") %>%
  mutate(x = as.numeric(x)) %>%
  filter(x != 0) %>%  # year 0 belongs to neither trend line
  mutate(period = if_else(x < 0, "Negative Years", "Positive Years")) %>%
  group_by(rowid, period) %>%
  summarise(u = round(coef(lm(val ~ x))[[2]], 2), .groups = "drop") %>%
  pivot_wider(names_from = period, values_from = u) %>%
  arrange(rowid) %>%
  select(-rowid)

cbind(dat, as.data.frame(slopes))
-5 -4 -3 -2 -1 0 1 2 3 4 5 Negative Years Positive Years
1 0 1 1 1 0 0 0 1 0 0 0 0.0 -0.1
2 1 3 3 3 0 1 1 0 9 2 0 -0.2 0.0
3 0 0 0 0 0 0 1 1 0 0 0 0.0 -0.3
4 0 0 0 0 4 2 7 2 6 5 3 0.8 -0.5
5 9 1 0 2 3 2 1 1 1 0 0 -1.1 -0.3
6 22 17 12 10 9 21 21 10 12 18 23 -3.3 1.2
回答3:
A solution using the tidyverse packages, assuming that dat is your original data frame and dat2 is the final output.
library(tidyverse)

# Fit one linear model per (row, sign of year) and attach the rounded slopes
# to the original data frame as the columns "Negative" and "Positive".
dat2 <- dat %>%
  rowid_to_column() %>%                          # Keep the row id as a column
  pivot_longer(-rowid, names_to = "Column", values_to = "Value") %>%  # Long format
  mutate(Column = as.numeric(Column)) %>%        # Column names are the years
  filter(Column != 0) %>%                        # Remove Column == 0
  mutate(Sign = if_else(Column > 0, "Positive", "Negative")) %>%
  nest(data = c(Column, Value)) %>%              # One nested data frame per rowid and Sign
  mutate(
    LM = map(data, ~ lm(Value ~ Column, data = .x)),          # Fit lm to each nested data frame
    Slope = map_dbl(LM, ~ round(coef(.x)[[2]], digits = 1))   # Get the rounded slope
  ) %>%
  select(rowid, Sign, Slope) %>%                 # Select relevant columns
  pivot_wider(names_from = Sign, values_from = Slope) %>%     # Wide format
  left_join(rowid_to_column(dat), ., by = "rowid") %>%        # Merge to the original data frame
  select(-rowid)                                 # Remove the rowid column
dat2
# -5 -4 -3 -2 -1 0 1 2 3 4 5 Negative Positive
# 1 0 1 1 1 0 0 0 1 0 0 0 0.0 -0.1
# 2 1 3 3 3 0 1 1 0 9 2 0 -0.2 0.0
# 3 0 0 0 0 0 0 1 1 0 0 0 0.0 -0.3
# 4 0 0 0 0 4 2 7 2 6 5 3 0.8 -0.5
# 5 9 1 0 2 3 2 1 1 1 0 0 -1.1 -0.3
# 6 22 17 12 10 9 21 21 10 12 18 23 -3.3 1.2
来源:https://stackoverflow.com/questions/51180159/finding-the-slope-for-multiple-points-in-selected-columns