How to pivot a dataframe containing a column with sections and subsections in R

前端 未结 1 1583
时光取名叫无心
时光取名叫无心 2021-02-07 20:50

I have the dataframe shown below:

structure(
  list(ID = c(\"P-1\", \" P-1\", \"P-1\", \"P-2\", \"P-3\", \"P-4\", \"P-5\"         


        
1条回答
  •  傲寒
    傲寒 (楼主)
    2021-02-07 21:40

    Hopefully that'll be enough to get you started. To go further, I'll need an expected output that looks like it comes from R, and a further explanation of how the variables are computed.

    library(tidyverse)
    # Sample input: 10 records, all stored as character except the numeric Value.
    # Repeated entries are generated with rep()/paste() instead of being typed out.
    df <- structure(
      list(
        # note: the second ID deliberately keeps its leading space
        ID = c("P-1", " P-1", "P-1", paste0("P-", 2:8)),
        Date = paste(
          "2020-03-16",
          c("12:11:33", "13:16:04", "06:13:55", "10:03:43", "12:37:09",
            "06:40:24", "09:46:45", "12:07:44", "14:09:51", "09:19:23")
        ),
        Status = rep(c("SA", "RE", "XA"), times = c(3L, 4L, 3L)),
        Flag = c(rep("L", 3L), NA, "K", "J", NA, NA, "H", "G"),
        Value = c(rep(5929.81, 3L), NA, 6969.33, 740.08, NA, NA, 1524.8, NA),
        Flag2 = c(rep("CL", 3L), NA, "RY", "", NA, NA, "", NA),
        Flag3 = c(rep(NA, 4L), "RI", "PO", NA, "SS", "DDP", NA)
      ),
      row.names = c(NA, 10L),
      class = "data.frame"
    )
    
    # Reshape `df` into one row per record, with a 0/1 indicator column for each
    # distinct Flag value plus per-row `count` and `Total` helper columns.
    df2 <- df %>%
      mutate(
        # label the Value bucket; a missing Value stays NA
        Value = ifelse(Value >= 0 & Value <= 15000, "0-15000", "15000-50000"),
        # classify by which of Flag2 / Flag3 are present (conditions are
        # mutually exclusive, so their order does not matter)
        substatus = case_when(
          !is.na(Flag2) & !is.na(Flag3) ~ "c",
          is.na(Flag2) & !is.na(Flag3) ~ "b",
          !is.na(Flag2) & is.na(Flag3) ~ "a",
          TRUE ~ "d"
        ),
        # keep only the calendar day of the timestamp
        Date = as.Date(Date),
        # a missing Flag gets the name of the column it should end up in
        Flag = coalesce(Flag, "[Null]"),
        # constant 1 that pivot_wider() spreads into the indicator columns
        temp = 1,
        # row id so that every row stays distinct through the pivot
        id = row_number()
      ) %>%
      # these inputs are no longer needed once substatus is derived
      select(-Flag2, -Flag3, -ID) %>%
      # one column per Flag value (L, K, ...); Flag and temp are consumed here
      pivot_wider(
        names_from = Flag,
        values_from = temp,
        values_fill = list(temp = 0)
      ) %>%
      # move `[Null]` column to the end
      select(everything(), -`[Null]`, `[Null]`) %>%
      select(-id) %>%
      mutate(
        count = 1,
        # sum of the indicator columns, which run from L through `[Null]`
        Total = rowSums(select(., L:`[Null]`))
      )
    df2
    #> # A tibble: 10 x 12
    #>    Date       Status Value substatus     L     K     J     H     G `[Null]`
    #>    <date>     <chr>  <chr> <chr>     <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
    #>  1 2020-03-16 SA     0-15~ a             1     0     0     0     0        0
    #>  2 2020-03-16 SA     0-15~ a             1     0     0     0     0        0
    #>  3 2020-03-16 SA     0-15~ a             1     0     0     0     0        0
    #>  4 2020-03-16 RE     <NA>  d             0     0     0     0     0        1
    #>  5 2020-03-16 RE     0-15~ c             0     1     0     0     0        0
    #>  6 2020-03-16 RE     0-15~ c             0     0     1     0     0        0
    #>  7 2020-03-16 RE     <NA>  d             0     0     0     0     0        1
    #>  8 2020-03-16 XA     <NA>  b             0     0     0     0     0        1
    #>  9 2020-03-16 XA     0-15~ c             0     0     0     1     0        0
    #> 10 2020-03-16 XA     <NA>  d             0     0     0     0     1        0
    #> # ... with 2 more variables: count <dbl>, Total <dbl>
    
    # You didn't say what to do with NA values, so I left them as NA
    
    # Build and print the report: for every Date / Value bucket, one header row
    # per Status followed by that Status's a/b/c/d substatus rows.
    bind_rows(
      # detail rows: indicator sums per (Date, Value, Status, substatus)
      df2 %>%
        # add missing combinations of abcd
        complete(nesting(Date, Status, Value), substatus) %>%
        group_by(Date, Value, Status, substatus) %>%
        # rows introduced by complete() are all NA, so na.rm turns them into 0
        summarise(
          across(everything(), ~ sum(.x, na.rm = TRUE)),
          .groups = "drop"
        ) %>%
        group_by(Status, Value) %>%
        mutate(percent = paste(round(100 * Total / sum(Total), 2), "%")) %>%
        ungroup(),
      # header rows: one per (Date, Value, Status). The Status label is copied
      # into `substatus`, and the trailing "_" on Status makes the header sort
      # ahead of its detail rows under desc(Status) below.
      df2 %>%
        mutate(substatus = Status, Status = paste0(Status, "_")) %>%
        group_by(Date, Value, Status, substatus) %>%
        mutate(count = n()) %>%
        # keep count as a grouping column so it is carried through unsummed;
        # `.add` replaces the `add` argument deprecated in dplyr 1.0.0
        group_by(count, .add = TRUE) %>%
        summarise(
          across(everything(), ~ sum(.x, na.rm = TRUE)),
          .groups = "drop"
        ) %>%
        group_by(Value) %>%
        mutate(percent = paste(round(100 * Total / sum(Total), 2), "%")) %>%
        # drop the grouping, as the detail branch does
        ungroup()
    ) %>%
      arrange(Date, Value, desc(Status)) %>%
      # Status was only needed for ordering; the labels to show are in substatus
      mutate(Status = NULL) %>%
      rename(Status = substatus) %>%
      print(n = Inf)
    #> # A tibble: 25 x 12
    #>    Date       Value Status     L     K     J     H     G `[Null]` count Total
    #>    <date>     <chr> <chr>  <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>
    #>  1 2020-03-16 0-15~ XA         0     0     0     1     0        0     1     1
    #>  2 2020-03-16 0-15~ a          0     0     0     0     0        0     0     0
    #>  3 2020-03-16 0-15~ b          0     0     0     0     0        0     0     0
    #>  4 2020-03-16 0-15~ c          0     0     0     1     0        0     1     1
    #>  5 2020-03-16 0-15~ d          0     0     0     0     0        0     0     0
    #>  6 2020-03-16 0-15~ SA         3     0     0     0     0        0     3     3
    #>  7 2020-03-16 0-15~ a          3     0     0     0     0        0     3     3
    #>  8 2020-03-16 0-15~ b          0     0     0     0     0        0     0     0
    #>  9 2020-03-16 0-15~ c          0     0     0     0     0        0     0     0
    #> 10 2020-03-16 0-15~ d          0     0     0     0     0        0     0     0
    #> 11 2020-03-16 0-15~ RE         0     1     1     0     0        0     2     2
    #> 12 2020-03-16 0-15~ a          0     0     0     0     0        0     0     0
    #> 13 2020-03-16 0-15~ b          0     0     0     0     0        0     0     0
    #> 14 2020-03-16 0-15~ c          0     1     1     0     0        0     2     2
    #> 15 2020-03-16 0-15~ d          0     0     0     0     0        0     0     0
    #> 16 2020-03-16 <NA>  XA         0     0     0     0     1        1     2     2
    #> 17 2020-03-16 <NA>  a          0     0     0     0     0        0     0     0
    #> 18 2020-03-16 <NA>  b          0     0     0     0     0        1     1     1
    #> 19 2020-03-16 <NA>  c          0     0     0     0     0        0     0     0
    #> 20 2020-03-16 <NA>  d          0     0     0     0     1        0     1     1
    #> 21 2020-03-16 <NA>  RE         0     0     0     0     0        2     2     2
    #> 22 2020-03-16 <NA>  a          0     0     0     0     0        0     0     0
    #> 23 2020-03-16 <NA>  b          0     0     0     0     0        0     0     0
    #> 24 2020-03-16 <NA>  c          0     0     0     0     0        0     0     0
    #> 25 2020-03-16 <NA>  d          0     0     0     0     0        2     2     2
    #> # ... with 1 more variable: percent <chr>
    

    0 讨论(0)
提交回复
热议问题