Convert unstructured csv file to a data frame

前端 未结 3 976
再見小時候
再見小時候 2021-01-13 16:07

I am learning R for text mining. I have a TV program schedule in the form of a CSV file. The programs usually start at 06:00 AM and go on until 05:00 AM the next day, which is called

3条回答
  •  被撕碎了的回忆
    2021-01-13 16:14

    An alternative solution with data.table:

    library(data.table)
    library(zoo)
    library(splitstackshape)
    
    # Sample schedule: for each broadcast day a weekday line, a date line, and
    # then alternating "time|program" and "info" lines.
    txt <- textConnection("Sunday|\n 01-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|\nMonday|\n 02-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|")
    tv <- readLines(txt)
    close(txt)  # release the connection once the lines are read
    # One row per input line, with the trailing "|" removed.
    DT <- data.table(tv)[, tv := gsub('[|]$', '', tv)]
    
    # Full English weekday names, matching the input. (weekdays() only works on
    # Date/POSIXt objects, so it cannot be called on 1:7.)
    wd <- c("Monday", "Tuesday", "Wednesday", "Thursday",
            "Friday", "Saturday", "Sunday")
    
    # Flag the weekday lines, copy the weekday onto those rows, carry it forward
    # to the rows below, number the days, take the second line of each day as
    # its date, and finally drop the weekday and date lines themselves.
    DT <- DT[, temp := tv %chin% wd
             ][(temp), day := tv
               ][, day := na.locf(day, na.rm = FALSE)
                 ][, temp := NULL
                   ][, idx := rleid(day)
                     ][, date := trimws(tv[2]), by = idx
                       ][, .SD[-c(1,2)], by = idx]
    
    # Split "time|program" into separate rows and label every row; the labels
    # are repeated explicitly because data.table does not recycle a length-3
    # vector over a larger group.
    DT <- cSplit(DT, splitCols = "tv", sep = "|", direction = "long")[
      , lbl := rep(c("Time","Program","Info"), length.out = .N), by = idx]
    # One row per programme, with Time / Program / Info as columns.
    DT <- dcast(DT, idx + day + date + rowid(lbl) ~ lbl, value.var = "tv")[, lbl := NULL]
    
    # Build the timestamp ("23.3" means 23:30). A programme that starts before
    # 06:00 and is earlier than the previous row belongs to the end of the
    # broadcast day, so it is moved to the next calendar day.
    DT <- DT[, datetime := as.POSIXct(paste(as.character(date), sprintf("%01.2f",as.numeric(as.character(Time)))), format = "%d-%b-%y %H.%M")
       ][, datetime := datetime + (+(datetime < shift(datetime, fill=datetime[1]) & hour(datetime) < 6) * 24 * 60 * 60)
         ][, .(datetime, Program, Info)]
    

    The result:

    > DT
                  datetime Program                               Info
    1: 2015-11-01 06:00:00     Tom some information about the program
    2: 2015-11-01 23:30:00   Jerry some information about the program
    3: 2015-11-02 05:00:00  Avatar some information about the program
    4: 2015-11-02 06:00:00     Tom some information about the program
    5: 2015-11-02 23:30:00   Jerry some information about the program
    6: 2015-11-03 05:00:00  Avatar some information about the program
    

    Explanation:

    1: read data, convert to a data.table & remove trailing |:

    txt <- textConnection("Sunday|\n 01-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|\nMonday|\n 02-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|")
    tv <- readLines(txt)
    close(txt)  # release the connection once the lines are read
    # one row per input line, with the trailing "|" removed
    DT <- data.table(tv)[, tv := gsub('[|]$', '', tv)]
    

    2: extract the weekdays into a new column

    # a vector with the full (English) weekday names; weekdays() only works on
    # Date/POSIXt objects, so it cannot be called on 1:7
    wd <- c("Monday", "Tuesday", "Wednesday", "Thursday",
            "Friday", "Saturday", "Sunday")
    # flag the weekday lines, copy the weekday onto those rows, then carry it
    # forward to the rows that follow (na.rm = FALSE keeps the row count)
    DT[, temp := tv %chin% wd
       ][(temp), day := tv
         ][, day := na.locf(day, na.rm = FALSE)
           ][, temp := NULL]
    

    3: create an index per day & create a column with the dates

    # number the consecutive runs of the same weekday
    DT[, idx := rleid(day)]
    # within each run, the second line holds the date
    DT[, date := tv[2], by = idx]
    

    4: remove unnecessary lines

    # the first two rows of every day (weekday and date) are no longer needed
    DT <- DT[, .SD[-(1:2)], by = idx]
    

    5: split the time and the program-name into separate rows & create a label column

    # split "time|program" into separate rows, then label the rows; the labels
    # are repeated explicitly because data.table does not recycle a length-3
    # vector over a larger group
    DT <- cSplit(DT, splitCols = "tv", sep = "|", direction = "long")[
      , lbl := rep(c("Time","Program","Info"), length.out = .N), by = idx]
    

    6: reshape into wide format using the 'rowid' function from the development version of data.table

    # 'lbl' is the label column created in step 5; rowid(lbl) numbers the
    # programmes so each one becomes its own row, and that helper column is
    # dropped afterwards
    DT <- dcast(DT, idx + day + date + rowid(lbl) ~ lbl, value.var = "tv")[, lbl := NULL]
    

    7: create a datetime column & move the late-night times to the next day

    # build the timestamp from the date and the decimal-style time ("23.3" -> 23:30)
    DT[, datetime := as.POSIXct(paste(as.character(date), sprintf("%01.2f",as.numeric(as.character(Time)))), format = "%d-%b-%y %H.%M")
       # a programme that starts before 06:00 and is earlier than the previous
       # row belongs to the next calendar day; hour() is needed here because
       # comparing the POSIXct itself with 6 is always FALSE
       ][, datetime := datetime + (+(datetime < shift(datetime, fill=datetime[1]) & hour(datetime) < 6) * 24 * 60 * 60)]
    

    8: keep the needed columns

    # keep only the timestamp, the program name and its description
    DT <- DT[, list(datetime, Program, Info)]
    

提交回复
热议问题