I have three date columns as shown below
Id Date1 Date2 Date3
12 2005-12-22 NA NA
11 2009-10-11 NA NA
Using base R, we could get the column index of the non-NA value among the 'Date' columns in each row by matrix multiplication:
# Multiply each row's non-NA mask by the column positions 1..k; with exactly
# one non-NA date per row the product is that date's column position.
indx <- (!is.na(df1[-1])) %*% seq_along(df1[-1])
Or using `max.col` on the logical matrix (`!is.na(df1[-1])`):
# Column position of the non-NA date in each row. ties.method = "first" keeps
# the result deterministic: the default ("random") picks an arbitrary column
# when a row has several non-NA dates or none at all.
indx <- max.col(!is.na(df1[-1]), ties.method = "first")
Then create the new data.frame with 'Id' column from the 'df1', 'Date' from 'row/column' index and 'Index' from above.
# Pair every row number with its column index (a two-column index matrix) to
# pull exactly one date per row. seq_len() rather than 1:n keeps the row index
# empty when the data frame has zero rows.
data.frame(
  Id = df1[[1]],
  Date = df1[-1][cbind(seq_len(nrow(df1[-1])), indx)],
  Index = indx
)
# Id Date Index
#1 12 2005-12-22 1
#2 11 2009-10-11 1
#3 29 2005-04-11 2
#4 45 2008-11-06 3
#5 39 2006-01-02 3
#6 44 2005-04-16 2
Or using dplyr/tidyr
library(dplyr)
library(tidyr)
# Reshape to long form, keeping only the non-missing dates. `Index` is the
# numeric suffix of the source column (Date1 -> 1). pivot_longer() replaces the
# superseded gather(); arrange() restores gather()'s column-by-column row
# order and as.data.frame() keeps the plain data.frame return type.
df1 %>%
  pivot_longer(-Id, names_to = "Index", names_prefix = "Date",
               names_transform = list(Index = as.integer),
               values_to = "Date", values_drop_na = TRUE) %>%
  arrange(Index) %>%
  as.data.frame()
# Id Index Date
#1 12 1 2005-12-22
#2 11 1 2009-10-11
#3 29 2 2005-04-11
#4 44 2 2005-04-16
#5 45 3 2008-11-06
#6 39 3 2006-01-02
# Sample data: six ids, each with exactly one non-missing date spread across
# the three character columns Date1..Date3.
df1 <- data.frame(
  Id = c(12L, 11L, 29L, 45L, 39L, 44L),
  Date1 = c("2005-12-22", "2009-10-11", NA, NA, NA, NA),
  Date2 = c(NA, NA, "2005-04-11", NA, NA, "2005-04-16"),
  Date3 = c(NA, NA, NA, "2008-11-06", "2006-01-02", NA),
  stringsAsFactors = FALSE
)
This is a classic use of `reshape` to go from "wide" to "long" format. If `d` is your data.frame:
# Stack Date1..Date3 into a single "Date" column; "Index" records which of
# the three columns each value came from.
d2 <- reshape(
  d,
  direction = "long",
  idvar = "Id",
  varying = c("Date1", "Date2", "Date3"),
  v.names = "Date",
  timevar = "Index"
)
Result:
> d2
Id Index Date
12.1 12 1 2005-12-22
11.1 11 1 2009-10-11
29.1 29 1 <NA>
45.1 45 1 <NA>
39.1 39 1 <NA>
44.1 44 1 <NA>
12.2 12 2 <NA>
11.2 11 2 <NA>
29.2 29 2 2005-04-11
45.2 45 2 <NA>
39.2 39 2 <NA>
44.2 44 2 2005-04-16
12.3 12 3 <NA>
11.3 11 3 <NA>
29.3 29 3 <NA>
45.3 45 3 2008-11-06
39.3 39 3 2006-01-02
44.3 44 3 <NA>
If you don't want all the `NA` values (above), you can subset:
> # Keep only the rows that actually carry a date.
> d2[!is.na(d2[["Date"]]), ]
Id Index Date
12.1 12 1 2005-12-22
11.1 11 1 2009-10-11
29.2 29 2 2005-04-11
44.2 44 2 2005-04-16
45.3 45 3 2008-11-06
39.3 39 3 2006-01-02
You can consider `melt`ing your data.
Here's an example:
library(data.table)
library(reshape2)
# Long format with the NA dates dropped; `variable` names the source column.
mydt <- as.data.table(mydf)
melt(mydt, id.vars = "Id", na.rm = TRUE)
# Id variable value
# 1: 12 Date1 2005-12-22
# 2: 11 Date1 2009-10-11
# 3: 29 Date2 2005-04-11
# 4: 44 Date2 2005-04-16
# 5: 45 Date3 2008-11-06
# 6: 39 Date3 2006-01-02
## More specific to what you want:
long <- melt(as.data.table(mydf), id.vars = "Id", na.rm = TRUE)
# Strip the "Date" prefix by reference; the trailing [] makes the result print.
long[, variable := sub("Date", "", variable)][]
# Id variable value
# 1: 12 1 2005-12-22
# 2: 11 1 2009-10-11
# 3: 29 2 2005-04-11
# 4: 44 2 2005-04-16
# 5: 45 3 2008-11-06
# 6: 39 3 2006-01-02
You can also use `tidyr` with a small hack for the `id`:
library(tidyr)
# Blank out the NAs so that pasting the three date columns together leaves
# just the single real date in each row.
df[is.na(df)] <- ""
# `id` is the position of the non-empty date column, computed row by row with
# max.col(). The earlier ceiling(which(...) / nrow(df)) form returned the
# column numbers in column-major order (1, 1, 2, 2, 3, 3), so they were
# attached to the wrong rows.
transform(unite(df, "Date", Date1:Date3, sep = ""),
          id = max.col(df[-1] != "", ties.method = "first"))
# Id Date id
#1 12 2005-12-22 1
#2 11 2009-10-11 1
#3 29 2005-04-11 2
#4 45 2008-11-06 3
#5 39 2006-01-02 3
#6 44 2005-04-16 2