问题
I have the following DF:
DF = structure(list(ID = c(21785L, 21785L, 21785L), V1 = c(0.828273303,
6.404590021, 0.775568448), V2 = c(2L, 3L, 2L), V3 = c(NA, 1.122899914,
0.850113234), V4 = c(NA, 4L, 3L), V5 = c(NA, 0.866757168, 0.868943246
), V6 = c(NA, 5L, 4L), V7 = c(NA, 0.563804788, 0.728656328),
V8 = c(NA, 6L, 5L), V9 = c(NA, 0.888109208, 0.823803733),
V10 = c(NA, 7L, 6L), V11 = c(NA, 0.578834113, 0.863467391
), V12 = c(NA, 1L, 7L), V13 = c(NA, NA, 0.939920869)), class = "data.frame", row.names = c(5L,
163L, 167L))
Output:
Row ID V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
5 21785 0.8282733 2 NA NA NA NA NA NA NA NA NA NA NA
163 21785 6.4045900 3 1.1228999 4 0.8667572 5 0.5638048 6 0.8881092 7 0.5788341 1 NA
167 21785 0.7755684 2 0.8501132 3 0.8689432 4 0.7286563 5 0.8238037 6 0.8634674 7 0.9399209
The data can be broken down into 3 parts:
- ID per participant
- Odd Columns representing standardized heart rate
- Even columns representing day number of week (1 = sunday)
I have 100 plus unique participants and 3000 rows of data with unequal data per day, hence the NAs.
I would like to pivot the data into one column per part
- so that: col1 = ID, col2 = HR, col3 = Weekday
I have tried several methods based on similar questions such as:
# melt the data frame to put all the metrics in a single column
DF2 = reshape2::melt(DF, id.vars = c("ID"))
# split the data by ID
DF3 = split(DF2, DF2$ID)
# allocate empty DF with 3 columns for future appending
DF_Organized = data.frame()[1,3]
# make the data into 3 new columns, 1 for ID, HR, weekday
for (m in 1:length(DF3)){
DF_tmp = DF3[m] %>%
data.frame %>% na.omit() # convert to DF, remove NAs
setNames(., c("ID","colx","Value")) %>% # set names for clarity
mutate(ind = rep(c(1, 2),length.out = n())) %>% # assign 1 to amplitude and 2 to day values in each row
group_by(ind) %>% # group by value type
mutate(id = row_number()) %>% # make new column that determines location of data by previous assignment
spread(ind, Value) %>% # organize data by new ID
select(-id) #clean
# reorganize the NAs to the bottom
DF_tmp2 = setNames(do.call(function(...) rowr::cbind.fill(..., fill = NA),
lapply(DF_tmp, na.omit)),colnames(DF_tmp)) %>%
na.omit() %>%
select(-colx) %>%
setNames(., c("ID","HR","Weekday")) # set names for clarity
I get close but not accurate:
Actual Output:
> DF_tmp2
ID HR Weekday
1 21785 0.8282733 6.4045900
2 21785 0.7755684 2.0000000
3 21785 3.0000000 2.0000000
4 21785 1.1228999 0.8501132
. . . There's misalignment and inaccurate combinations. Any help is appreciated.
Expected Output:
> DF_tmp2
ID HR Weekday
1 21785 0.8282733 2.0000000
2 21785 6.4045900 3.0000000
3 21785 1.1228999 4.0000000
4 21785 0.8667572 5.0000000
5 21875 0.5638048 6.0000000
.
.
.
回答1:
1) pivot_longer Define the v.names
column names and the number of pairs k
. Then add V14
since V13
seems unmatched and then change the names to something which identifies the columns, i.e. ID, HR 1, Weekday 1, HR 2, Weekday 2, etc. With these names we can employ pivot_longer
.
library(dplyr)
library(purrr)
library(tidyr)
v.names <- c("HR", "Weekday")
k <- ncol(DF) %/% 2L # 7L = no. of (HR, Weekday) pairs
DF %>%
mutate(V14 = V12 %% 7L + 1L, n = 1:n()) %>%
set_names("ID", cross2(v.names, 1:k) %>% map(lift(paste)), "n") %>%
pivot_longer(-c(ID, n), names_to = c(".value", "Num"), names_sep = " ") %>%
drop_na %>%
arrange(n, Num) %>%
select(-n, -Num)
giving:
# A tibble: 14 x 3
ID HR Weekday
<int> <dbl> <dbl>
1 21785 0.828 2
2 21785 6.40 3
3 21785 1.12 4
4 21785 0.867 5
5 21785 0.564 6
6 21785 0.888 7
7 21785 0.579 1
8 21785 0.776 2
9 21785 0.850 3
10 21785 0.869 4
11 21785 0.729 5
12 21785 0.824 6
13 21785 0.863 7
14 21785 0.940 1
2) Base R We can alternately use reshape
in base R in much the same way. v.names
and k
are from above. Note that reshape
automatically adds an id
column giving the row number in the original data frame so we do not have to add it ourself as we did in (1).
DF2 <- transform(DF, V14 = V12 %% 7L + 1L)
names(DF2)[-1] <- outer(v.names, 1:k, paste)
long <- na.omit(reshape(DF2, dir = "long",
varying = lapply(v.names, grep, names(DF2)), v.names = v.names))
long[order(long$id, long$time), c("ID", "HR", "Weekday")]
3) data.table
Using DF2 from (2)
library(data.table)
DT2 <- data.table(DF2)[, row := .I]
DT2 <- na.omit(melt(DT2, idvars = c("ID", "row"),
measure.vars = sapply(v.names, grep, names(DT2), simplify = FALSE)))
setkey(DT2, row, Weekday)
DT2[, c("ID", "HR", "Weekday")]
回答2:
DF %>%
gather(col, val, -ID) %>%
mutate(col = if_else(str_ends(col, "0|2|4|6|8"), "Weekday", "HR")) %>%
group_by(col) %>%
mutate(instance = row_number()) %>%
spread(col, val) %>%
filter(!is.na(HR))
## A tibble: 14 x 4
# ID instance HR Weekday
# <int> <int> <dbl> <dbl>
# 1 21785 1 0.828 2
# 2 21785 2 6.40 3
# 3 21785 3 0.776 2
# 4 21785 5 1.12 4
# 5 21785 6 0.850 3
# 6 21785 8 0.867 5
# 7 21785 9 0.869 4
# 8 21785 11 0.564 6
# 9 21785 12 0.729 5
#10 21785 14 0.888 7
#11 21785 15 0.824 6
#12 21785 17 0.579 1
#13 21785 18 0.863 7
#14 21785 21 0.940 NA
回答3:
Hope this helps
for (m in 1:length(DF3)){
DF_tmp = DF3[m] %>%
data.frame %>%
na.omit() %>%
setNames(., c("ID","colx","Value")) %>% # set names for clarity
mutate(ind = substring(colx, 2) %>% as.numeric() %% 2) %>% # assign 1 to amplitude and 2 to day values in each row
group_by(ind) %>% # group by value type
mutate(id = row_number()) %>% # make new column that determines location of data by previous assignment
ungroup() %>%
select(-colx) %>%
spread(ind, Value)
# reorganize the NAs to the bottom
DF_tmp2 = setNames(do.call(function(...) rowr::cbind.fill(..., fill = NA),
lapply(DF_tmp, na.omit)),colnames(DF_tmp)) %>%
na.omit() %>%
select(-id) %>%
setNames(., c("ID","Weekday","HR")) %>% # set names for clarity
arrange(Weekday)
}
来源:https://stackoverflow.com/questions/58566740/pivot-by-group-for-unequal-data-size