I have the dataframe shown below:
structure(
list(ID = c(\"P-1\", \" P-1\", \"P-1\", \"P-2\", \"P-3\", \"P-4\", \"P-5\"
Hopefully this will be enough to get you started. To go further, I'll need an expected output that looks like it comes from R, and a further explanation of how the variables are computed.
library(tidyverse)
# Example input: ten records with an ID, a timestamp (stored as text), a
# Status, a Flag, a numeric Value and two auxiliary flags (Flag2, Flag3).
# Column names are taken from the names of the list itself.
df <- structure(
  list(
    # the second ID carries a leading space; kept exactly as supplied
    ID = c(
      "P-1", " P-1", "P-1", "P-2", "P-3",
      "P-4", "P-5", "P-6", "P-7", "P-8"
    ),
    Date = c(
      "2020-03-16 12:11:33",
      "2020-03-16 13:16:04",
      "2020-03-16 06:13:55",
      "2020-03-16 10:03:43",
      "2020-03-16 12:37:09",
      "2020-03-16 06:40:24",
      "2020-03-16 09:46:45",
      "2020-03-16 12:07:44",
      "2020-03-16 14:09:51",
      "2020-03-16 09:19:23"
    ),
    Status = c("SA", "SA", "SA", "RE", "RE", "RE", "RE", "XA", "XA", "XA"),
    Flag = c("L", "L", "L", NA, "K", "J", NA, NA, "H", "G"),
    Value = c(
      5929.81, 5929.81, 5929.81, NA, 6969.33,
      740.08, NA, NA, 1524.8, NA
    ),
    # Flag2 mixes NA and empty strings; the two are distinct values
    Flag2 = c("CL", "CL", "CL", NA, "RY", "", NA, NA, "", NA),
    Flag3 = c(NA, NA, NA, NA, "RI", "PO", NA, "SS", "DDP", NA)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
# Turn `df` into one row per record carrying a 0/1 indicator column for each
# Flag value (missing flags go to a `[Null]` column), plus `count` and `Total`.
df2 <- df %>%
  mutate(
    # two buckets only: values inside [0, 15000] versus every other non-missing
    # value; a missing Value stays NA
    Value = ifelse(between(Value, 0, 15000), "0-15000", "15000-50000"),
    # substatus depends solely on which of Flag2 / Flag3 are NA; an empty
    # string counts as present
    substatus = case_when(
      is.na(Flag2) & is.na(Flag3) ~ "d",
      is.na(Flag3) ~ "a",
      is.na(Flag2) ~ "b",
      TRUE ~ "c"
    ),
    # keep only the calendar day of the timestamp
    Date = as.Date(Date),
    # missing flags get the label that will become their column name
    Flag = coalesce(Flag, "[Null]")
  ) %>%
  # these columns are not needed any more
  select(-Flag2, -Flag3, -ID) %>%
  # `temp` is the cell value for the pivot; `id` keeps every row distinct
  mutate(temp = 1, id = row_number()) %>%
  # one column per Flag value (L, K, ...); the Flag column itself disappears
  pivot_wider(
    names_from = Flag,
    values_from = temp,
    values_fill = list(temp = 0)
  ) %>%
  # push `[Null]` behind all other columns
  select(-`[Null]`, `[Null]`) %>%
  # the helper row id has served its purpose
  select(-id) %>%
  mutate(
    count = 1,
    # sum of the indicator columns, from L through `[Null]`
    Total = rowSums(select(., L:`[Null]`))
  )
df2
#> # A tibble: 10 x 12
#> Date Status Value substatus L K J H G `[Null]`
#>
#> 1 2020-03-16 SA 0-15~ a 1 0 0 0 0 0
#> 2 2020-03-16 SA 0-15~ a 1 0 0 0 0 0
#> 3 2020-03-16 SA 0-15~ a 1 0 0 0 0 0
#> 4 2020-03-16 RE d 0 0 0 0 0 1
#> 5 2020-03-16 RE 0-15~ c 0 1 0 0 0 0
#> 6 2020-03-16 RE 0-15~ c 0 0 1 0 0 0
#> 7 2020-03-16 RE d 0 0 0 0 0 1
#> 8 2020-03-16 XA b 0 0 0 0 0 1
#> 9 2020-03-16 XA 0-15~ c 0 0 0 1 0 0
#> 10 2020-03-16 XA d 0 0 0 0 1 0
#> # ... with 2 more variables: count , Total
# You didn't say what to do with NA values, so I left them as NA
# Build the printed report from `df2`: for each Date / Value / Status there is
# one total row (labelled with the status) followed by one row per substatus.
bind_rows(
  # --- per-substatus rows -------------------------------------------------
  df2 %>%
    # add the substatus values that do not occur for a given
    # Date/Status/Value; their numeric columns are NA until summed below
    complete(nesting(Date, Status, Value), substatus) %>%
    group_by(Date, Value, Status, substatus) %>%
    # na.rm = TRUE turns the all-NA rows created by complete() into zeros
    summarize_all(~sum(., na.rm = TRUE)) %>%
    # share of each row within its Status/Value combination
    group_by(Status, Value) %>%
    mutate(percent = paste(round(100 * Total / sum(Total), 2), "%")) %>%
    ungroup(),
  # --- per-status total rows ----------------------------------------------
  df2 %>%
    # the status name becomes the row label; the "_" suffix on the sort key
    # places the total row ahead of its substatus rows in arrange() below
    mutate(substatus = Status, Status = paste0(Status, "_")) %>%
    group_by(Date, Value, Status, substatus) %>%
    # `count` is 1 on every row of df2, so its sum is the group size; no
    # separate n() column or extra grouping level is required
    summarize_all(~sum(., na.rm = TRUE)) %>%
    # share of each status within its Value bucket
    group_by(Value) %>%
    mutate(percent = paste(round(100 * Total / sum(Total), 2), "%")) %>%
    # drop the grouping, as the first branch does
    ungroup()
) %>%
  arrange(Date, Value, desc(Status)) %>%
  # Status was only a sort key; the visible label lives in substatus
  mutate(Status = NULL) %>%
  rename(Status = substatus) %>%
  print(n = Inf)
#> # A tibble: 25 x 12
#> Date Value Status L K J H G `[Null]` count Total
#>
#> 1 2020-03-16 0-15~ XA 0 0 0 1 0 0 1 1
#> 2 2020-03-16 0-15~ a 0 0 0 0 0 0 0 0
#> 3 2020-03-16 0-15~ b 0 0 0 0 0 0 0 0
#> 4 2020-03-16 0-15~ c 0 0 0 1 0 0 1 1
#> 5 2020-03-16 0-15~ d 0 0 0 0 0 0 0 0
#> 6 2020-03-16 0-15~ SA 3 0 0 0 0 0 3 3
#> 7 2020-03-16 0-15~ a 3 0 0 0 0 0 3 3
#> 8 2020-03-16 0-15~ b 0 0 0 0 0 0 0 0
#> 9 2020-03-16 0-15~ c 0 0 0 0 0 0 0 0
#> 10 2020-03-16 0-15~ d 0 0 0 0 0 0 0 0
#> 11 2020-03-16 0-15~ RE 0 1 1 0 0 0 2 2
#> 12 2020-03-16 0-15~ a 0 0 0 0 0 0 0 0
#> 13 2020-03-16 0-15~ b 0 0 0 0 0 0 0 0
#> 14 2020-03-16 0-15~ c 0 1 1 0 0 0 2 2
#> 15 2020-03-16 0-15~ d 0 0 0 0 0 0 0 0
#> 16 2020-03-16 XA 0 0 0 0 1 1 2 2
#> 17 2020-03-16 a 0 0 0 0 0 0 0 0
#> 18 2020-03-16 b 0 0 0 0 0 1 1 1
#> 19 2020-03-16 c 0 0 0 0 0 0 0 0
#> 20 2020-03-16 d 0 0 0 0 1 0 1 1
#> 21 2020-03-16 RE 0 0 0 0 0 2 2 2
#> 22 2020-03-16 a 0 0 0 0 0 0 0 0
#> 23 2020-03-16 b 0 0 0 0 0 0 0 0
#> 24 2020-03-16 c 0 0 0 0 0 0 0 0
#> 25 2020-03-16 d 0 0 0 0 0 2 2 2
#> # ... with 1 more variable: percent