row not consolidating duplicates in R when using multiple months in Date Filter

后端 未结 1 1435
梦毁少年i
梦毁少年i 2020-12-07 03:25

I am using the following code to summarize my data by a column

library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)

######         


        
相关标签:
1条回答
  • 2020-12-07 03:36

    The OP is wondering why the result is not consolidated for CUST_ID if more than one month of data is processed.

    The reason is that the monthly files are read in and aggregated one by one but a final aggregation step is needed to consolidate over all months.

    The code below is a simplified replacement of the double for loops. I have left out the code for testing for "fast filtering".

    The first part creates a list of files to be processed. The second part does the processing.

    # Build the full vector of input file paths: every *.txt found
    # recursively under each configured sub-folder.
    in_filenames <- list.files(
      file.path(in_directory, in_subfolders), 
      pattern = "\\.txt$", 
      full.names = TRUE, 
      recursive = TRUE)
    
    # Helper: read one monthly file and sum "Ext Sale" per CUST_ID.
    read_and_aggregate <- function(fn) {
      # code for "fast filter" test goes here
      message("Reading in ", fn)
      monthly <- fread(fn,
                       colClasses = c(CUSTOMER_TIER = "character"),
                       na.strings = "")
      # one row per customer for this single file
      monthly[, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
    }
    
    # Stack the per-file aggregates, then run a second aggregation over the
    # stacked result so a CUST_ID appearing in several months collapses to
    # one consolidated row (this is the step the OP's code was missing).
    mastertable <- rbindlist(lapply(in_filenames, read_and_aggregate))[
      , lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
    
    Processing file: Raw Data/AA-CA/AA-CA 2017-01.txt
    Processing file: Raw Data/AA-CA/AA-CA 2017-02.txt
    Processing file: Raw Data/CB-HZ/CB-HZ 2017-01.txt
    Processing file: Raw Data/CB-HZ/CB-HZ 2017-02.txt
    
    mastertable
    
         CUST_ID Ext Sale
    1: AK0010001  427.803
    2: CO0020001 1540.300
    3: CO0010001 -179.765
    

    Note that chaining of data.table expressions is used here.


    Edit 1:

    By request of the OP, here is the complete code (except for the "fast filtering" stuff). There are some additional lines which where modified. They are marked with ### MODIFIED.

    library(data.table, warn.conflicts = FALSE)
    library(lubridate, warn.conflicts = FALSE)
    
    ################
    ## PARAMETERS ##
    ################
    
    # Set path of major source folder for raw transaction data
    in_directory <- "Raw Data"   ### MODIFIED
    
    # List names of sub-folders (currently grouped by first two characters of CUST_ID)
    in_subfolders <- list("AA-CA", "CB-HZ")
    
    # Set location for output
    out_directory <- "YTD Master"   ### MODIFIED
    out_filename <- "OUTPUT.csv"
    
    # Set beginning and end of date range to be collected - year-month-day format
    date_range <- interval(as.Date("2017-01-01"), as.Date("2017-02-28"))   ### MODIFIED
    
    # Enable or disable filtering of raw files to only grab items bought within certain months to save space.
    # If false, all files will be scanned for unique items, which will take longer and be a larger file.
    date_filter <- TRUE
    
    
    ##########
    ## CODE ##
    ##########
    
    starttime <- Sys.time()
    
    # create vector of filenames to be processed (recursive scan for *.txt)
    in_filenames <- list.files(
      file.path(in_directory, in_subfolders), 
      pattern = "\\.txt$", 
      full.names = TRUE, 
      recursive = TRUE)
    
    # read and aggregate each file separately
    mastertable <- rbindlist(
      lapply(in_filenames, function(fn) {
        # code for fast filter test goes here
        message("Processing file: ", fn)
        temptable <- fread(fn,
                           colClasses = c(CUSTOMER_TIER = "character"),
                           na.strings = "")
        # aggregate by month: one row per CUST_ID for this single file
        temptable[, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
      })
    )[
      # second aggregation overall: consolidates a CUST_ID that appears
      # in more than one monthly file into a single row
      , lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
    
    # Save final table. file.path() inserts the path separator; the previous
    # paste0(out_directory, out_filename) concatenated the two parts directly
    # and wrote to "YTD MasterOUTPUT.csv" instead of "YTD Master/OUTPUT.csv".
    print("Saving master table")
    fwrite(mastertable, file.path(out_directory, out_filename))   ### MODIFIED
    # rm(mastertable)   ### MODIFIED
    
    print(Sys.time()-starttime)
    

    Edit 2

    The OP has asked to include the "fast filter" code which I had omitted for brevity.

    However, I have a different approach. Instead of reading the first line of each file to check if INVOICE_DT is within the given date_range my approach filters the file names. The file names contain the year-month in ISO 8601 format.

    So, a vector of allowed year-month strings is constructed from the given date_range. Only those file names which contain one of the allowed year-month strings are selected for further processing.

    However, selecting the proper files is only the first step. As the date range may start or end right in the middle of a month, we also need to filter the rows of each processed file. This step is missing from the OP's code.

    library(data.table, warn.conflicts = FALSE)
    library(magrittr)   ### MODIFIED
    # library(lubridate, warn.conflicts = FALSE)   ### MODIFIED
    
    ################
    ## PARAMETERS ##
    ################
    
    # Set path of major source folder for raw transaction data
    in_directory <- "Raw Data"   ### MODIFIED
    
    # List names of sub-folders (currently grouped by first two characters of CUST_ID)
    in_subfolders <- list("AA-CA", "CB-HZ")
    
    # Set location for output
    out_directory <- "YTD Master"   ### MODIFIED
    out_filename <- "OUTPUT.csv"
    
    # Set beginning and end of date range to be collected - year-month-day format
    # (character strings here, not a lubridate interval: %between% below
    # compares against these values directly)
    date_range <- c("2017-01-01", "2017-02-14")   ### MODIFIED
    
    # Enable or disable filtering of raw files to only grab items bought within certain months to save space.
    # If false, all files will be scanned for unique items, which will take longer and be a larger file.
    # date_filter <- TRUE   ### MODIFIED
    
    
    ##########
    ## CODE ##
    ##########
    
    starttime <- Sys.time()
    
    # create vector of filenames to be processed (recursive scan for *.txt)
    in_filenames <- list.files(
      file.path(in_directory, in_subfolders), 
      pattern = "\\.txt$", 
      full.names = TRUE, 
      recursive = TRUE)
    
    # filter file names: keep only those whose embedded year-month
    # (ISO 8601, e.g. "2017-01") falls within date_range
    selected_in_filenames <- 
      seq(as.Date(date_range[1]), 
          as.Date(date_range[2]), by = "1 month") %>% 
      format("%Y-%m") %>% 
      lapply(function(x) stringr::str_subset(in_filenames, x)) %>% 
      unlist()
    
    # read and aggregate each file separately
    mastertable <- rbindlist(
      lapply(selected_in_filenames, function(fn) {
        message("Processing file: ", fn)
        temptable <- fread(fn,
                           colClasses = c(CUSTOMER_TIER = "character"),
                           na.strings = "")
        # aggregate this file, keeping only rows inside date_range
        # (assumes fread parsed INVOICE_DT as IDate so %between% against the
        # character bounds coerces correctly — TODO confirm with the raw data;
        # quarter() here is data.table's, since lubridate is not attached)
        temptable[INVOICE_DT %between% date_range, 
                  lapply(.SD, sum), by = .(CUST_ID, QTR = quarter(INVOICE_DT)), 
                  .SDcols = c("Ext Sale")]
      })
    )[
      # second aggregation overall: consolidates each CUST_ID/QTR pair
      # that appears in more than one monthly file into a single row
      , lapply(.SD, sum), by = .(CUST_ID, QTR), .SDcols = c("Ext Sale")]
    
    # Save Final table
    print("Saving master table")
    fwrite(mastertable, file.path(out_directory, out_filename))
    # rm(mastertable)   ### MODIFIED
    
    print(Sys.time()-starttime)
    
    mastertable
    
         CUST_ID QTR Ext Sale
    1: AK0010001   1  209.970
    2: CO0020001   1 1540.300
    3: CO0010001   1   -1.565
    

    Note that date_range <- c("2017-01-01", "2017-02-14") now ends mid of February.

    0 讨论(0)
提交回复
热议问题