问题
I am learning R for text mining. I have a TV program schedule in the form of a CSV file. The programs usually start at 06:00 AM and go on until 05:00 AM the next day, which is called a broadcast day. For example: the programs for 15/11/2015 start at 06:00 AM and end at 05:00 AM the next day.
Here is some sample code showing what the schedule looks like:
# Sample schedule: each broadcast day is a weekday line, a date line, then
# alternating "time|program" and synopsis lines. "|" separates the two columns.
# `text =` lets read.table open and close its own connection, and TRUE/FALSE
# are spelled out because T/F are ordinary variables that can be reassigned.
read.table(
  text = "Sunday|\n 01-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|\nMonday|\n 02-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|",
  header = FALSE, sep = "|", stringsAsFactors = FALSE
)
whose output is as follows:
V1|V2
Sunday |
01-Nov-15 |
6 | Tom
some information about the program |
23.3 | Jerry
some information about the program |
5 | Avatar
some information about the program |
5.3 | Panda
some information about the program |
Monday |
02-Nov-15|
6 Jerry
some information about the program |
6.25 | Panda
some information about the program |
23.3 | Avatar
some information about the program |
7.25 | Tom
some information about the program |
I want to convert the above data into a form of data.frame
Date |Program|Synopsis
2015-11-1 06:00 |Tom | some information about the program
2015-11-1 23:30 |Jerry | some information about the program
2015-11-2 05:00 |Avatar | some information about the program
2015-11-2 05:30 |Panda | some information about the program
2015-11-2 06:00 |Jerry | some information about the program
2015-11-2 06:25 |Panda | some information about the program
2015-11-2 23:30 |Avatar | some information about the program
2015-11-3 07:25 |Tom | some information about the program
I am thankful for any suggestions/tips regarding functions or packages I should have a look at.
回答1:
It's a bit of a mess, but it seems to work:
# Parse the "|"-separated schedule into one row per program with a full start
# timestamp. Layout per broadcast day: a weekday line, a date line, then pairs
# of lines ("time|program" followed by the synopsis).
txt <- "Sunday|\n 01-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|\nMonday|\n 02-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|"
# `text =` makes read.table open and close its own connection.
df <- read.table(text = txt, header = FALSE, sep = "|", stringsAsFactors = FALSE)
cat(txt)
Sys.setlocale("LC_TIME", "English") # if needed
# Full weekday names as spelled by the current LC_TIME locale.
weekdays <- format(seq.Date(Sys.Date(), Sys.Date() + 6, 1), "%A")
# A new chunk starts at every weekday line.
days <- split(df, cumsum(df$V1 %in% weekdays))

# Convert one day's chunk (weekday row, date row, then time/synopsis rows)
# into a data frame with the calendar date in column 1 and the numeric h.mm
# start time in column 2, followed by the synopsis and program columns.
parse_day <- function(dayDF) {
  body <- dayDF[-(1:2), ]
  # A non-empty second column marks the first row of each program pair.
  pair_id <- cumsum(!body[, 2] == "")
  tmp <- cbind.data.frame(
    V1 = dayDF[2, 1],
    do.call(rbind, split(unlist(body), pair_id)),
    stringsAsFactors = FALSE
  )
  tmp[, 1] <- as.Date(tmp[, 1], "%d-%B-%y")
  tmp[, 2] <- as.numeric(tmp[, 2])
  tmp[, 5] <- NULL # always-empty second field of the synopsis row
  # Once the clock time drops (past midnight), that program and every later
  # one in the same broadcast day belong to the next calendar date.
  idx <- cumsum(c(FALSE, diff(tmp[, 2]) < 0)) > 0
  tmp[idx, 1] <- tmp[idx, 1] + 1
  tmp
}

days <- lapply(days, parse_day)
# Combine date and h.mm time into a POSIXct and drop the numeric time column.
days <- transform(do.call(rbind.data.frame, days), V1=as.POSIXct(paste(V1, sprintf("%.2f", V11)), format="%Y-%m-%d %H.%M"), V11=NULL)
names(days) <- c("Date", "Synopsis", "Program")
rownames(days) <- NULL
days[, c(1, 3, 2)]
# Date Program Synopsis
# 1 2015-11-01 06:00:00 Tom some information about the program
# 2 2015-11-01 23:30:00 Jerry some information about the program
# 3 2015-11-02 05:00:00 Avatar some information about the program
# 4 2015-11-02 06:00:00 Tom some information about the program
# 5 2015-11-02 23:30:00 Jerry some information about the program
# 6 2015-11-03 05:00:00 Avatar some information about the program
回答2:
An alternative solution with data.table:
library(data.table)
library(zoo)
library(splitstackshape)
# Read the raw schedule lines and drop the trailing "|" on each line.
txt <- textConnection("Sunday|\n 01-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|\nMonday|\n 02-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|")
tv <- readLines(txt)
close(txt) # release the connection once the lines are read
DT <- data.table(tv)[, tv := gsub('[|]$', '', tv)]
# Full English weekday names, matching the weekday header lines in the data.
wd <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
# Tag every line with its weekday, number the days, copy each day's date line
# onto all of its rows, then drop the weekday and date lines themselves.
DT <- DT[, temp := tv %chin% wd
][, day := ifelse(temp, tv, NA_character_)
][, day := na.locf(day)
][, temp := NULL
][, idx := rleid(day)
][, date := tv[2], by = idx
][, .SD[-c(1,2)], by = idx]
# One value per row; labels cycle Time, Program, Info within each day.
# length.out = .N avoids relying on := recycling a length-3 vector.
DT <- cSplit(DT, sep="|", "tv", "long")[, lbl := rep(c("Time","Program","Info"), length.out = .N), by = idx]
DT <- dcast(DT, idx + day + date + rowid(lbl) ~ lbl, value.var = "tv")[, lbl := NULL]
# Build the timestamp; a program whose timestamp falls before the previous
# one and whose clock time is before 6 belongs to the next calendar day.
# The clock time is compared, not the POSIXct value, which is never < 6.
DT <- DT[, datetime := as.POSIXct(paste(as.character(date), sprintf("%01.2f",as.numeric(as.character(Time)))), format = "%d-%b-%y %H.%M")
][, datetime := datetime + (+(datetime < shift(datetime, fill=datetime[1]) & as.numeric(as.character(Time)) < 6) * 24 * 60 * 60)
][, .(datetime, Program, Info)]
The result:
> DT
datetime Program Info
1: 2015-11-01 06:00:00 Tom some information about the program
2: 2015-11-01 23:30:00 Jerry some information about the program
3: 2015-11-02 05:00:00 Avatar some information about the program
4: 2015-11-02 06:00:00 Tom some information about the program
5: 2015-11-02 23:30:00 Jerry some information about the program
6: 2015-11-03 05:00:00 Avatar some information about the program
Explanation:
1: read data, convert to a data.table & remove trailing |:
# Read the raw schedule lines, then strip the trailing "|" from each line.
txt <- textConnection("Sunday|\n 01-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|\nMonday|\n 02-Nov-15|\n 6|Tom\n some information about the program|\n 23.3|Jerry\n some information about the program|\n 5|Avatar\n some information about the program|")
tv <- readLines(txt)
close(txt) # release the connection once the lines are read
DT <- data.table(tv)[, tv := gsub('[|]$', '', tv)]
2: extract the weekdays into a new column
# a vector with the full (English) weekday names used in the data
wd <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
# Mark weekday lines, keep the weekday only on those lines (NA elsewhere),
# then carry it forward over the lines that belong to that day.
DT[, temp := tv %chin% wd
][, day := ifelse(temp, tv, NA_character_)
][, day := na.locf(day)
][, temp := NULL]
3: create an index per day & create a column with the dates
# Number each run of identical weekdays ...
DT[, idx := rleid(day)]
# ... and copy the second line of every run (the date line) onto all its rows.
DT[, date := tv[2], by = idx]
4: remove unnecessary lines
# Within every day, discard the first two rows (weekday line and date line).
DT <- DT[, .SD[-(1:2)], by = idx]
5: split the time and the program-name into separate rows & create a label column
# Split "time|program" into separate rows, then label the rows of each day
# cyclically as Time, Program, Info. length.out = .N spells the cycle out
# to the group size instead of relying on := recycling a length-3 vector.
DT <- cSplit(DT, sep="|", "tv", "long")[, lbl := rep(c("Time","Program","Info"), length.out = .N), by = idx]
6: reshape into wide format using the 'rowid' function from the development version of data.table
# Spread the label column created in step 5 (`lbl`) into Time/Program/Info
# columns; rowid(lbl) numbers the programs so each one gets its own row.
# The helper column produced by rowid(lbl) is dropped afterwards.
DT <- dcast(DT, idx + day + date + rowid(lbl) ~ lbl, value.var = "tv")[, lbl := NULL]
7: create a datetime column & set the late night time to the next day
# Build the timestamp from the date string and the h.mm time.
# A program whose timestamp falls before the previous one and whose clock
# time is before 6 belongs to the next calendar day. The clock time (Time)
# is compared, not the POSIXct value, which is never below 6.
DT[, datetime := as.POSIXct(paste(as.character(date), sprintf("%01.2f",as.numeric(as.character(Time)))), format = "%d-%b-%y %H.%M")
][, datetime := datetime + (+(datetime < shift(datetime, fill=datetime[1]) & as.numeric(as.character(Time)) < 6) * 24 * 60 * 60)]
8: keep the needed columns
# Keep only the timestamp, program name and synopsis columns.
DT <- DT[, list(datetime, Program, Info)]
回答3:
1) This sets up some functions and then consists of four `transform(...) %>% subset(...)` code fragments linked together using a magrittr pipeline. We assume `DF` is the output of the `read.table` call in the question.
First, load the zoo package to get access to `na.locf`. Define a `Lead` function which shifts each element by 1 position. Also define a `datetime` function which converts a date plus an h.m number to a datetime.
Now convert the dates to "Date" class. The rows that are not dates will become NA. Use `Lead` to shift the `is.na` vector by 1 position and keep the rows where it is TRUE (rows whose following row is not a date), effectively removing the weekday rows. Now use `na.locf` to fill in the dates and keep only rows with duplicated dates, effectively removing the rows containing only a date. Next set `Program` to `V2` and `Synopsis` to `V1`, except we must shift `V1` using `Lead` since the Synopsis is on the second row of each pair. Keep only the odd positioned rows. Produce the datetime and pick out the desired columns.
library(magrittr)
library(zoo) # needed for na.locf
# Shift a vector one position towards the front: the first element is
# dropped and `fill` is appended at the end.
Lead <- function(x, fill = NA) {
  shifted <- x[-1]
  c(shifted, fill)
}
# Convert a date plus an "h.mm" clock number (e.g. 23.3 means 23:30, 6.25
# means 06:25) to a POSIXct in the local time zone.
#   date: a Date, or a character date that as.Date() understands.
#   time: numeric or character h.mm value(s), same length as `date` or length 1.
# Times before 6 are treated as belonging to the next calendar day.
datetime <- function(date, time) {
  time <- as.numeric(time)
  # floor() rather than rounding, so 7.5 stays in hour 7 and 23.55 in hour 23.
  hours <- floor(time)
  # round() absorbs floating-point noise such as 100 * 0.3 = 30.000000000000004.
  minutes <- round(100 * (time - hours))
  # Add a calendar day rather than 86400 seconds so DST changes cannot shift
  # the clock time.
  day <- as.Date(date) + (time < 6) # add day if time < 6
  as.POSIXct(
    sprintf("%s %02d:%02d", format(day), as.integer(hours), as.integer(minutes)),
    format = "%Y-%m-%d %H:%M"
  )
}
# Turn the raw read.table output DF into a Date/Program/Synopsis table.
# Each stage is a named intermediate; local() keeps them out of the workspace
# and its value (the final table) is what gets printed.
local({
  with_date <- transform(DF, date = as.Date(V1, "%d-%b-%y"))
  no_weekdays <- subset(with_date, Lead(is.na(date), TRUE)) # rm weekday rows
  dates_filled <- transform(no_weekdays, date = na.locf(date)) # fill in dates
  no_date_rows <- subset(dates_filled, duplicated(date)) # rm date rows
  paired <- transform(no_date_rows, Program = V2, Synopsis = Lead(V1))
  odd_rows <- subset(paired, c(TRUE, FALSE)) # keep odd positioned rows only
  stamped <- transform(odd_rows, Date = datetime(date, V1))
  subset(stamped, select = c("Date", "Program", "Synopsis"))
})
giving:
Date Program Synopsis
1 2015-11-01 06:00:00 Tom some information about the program
2 2015-11-01 23:30:00 Jerry some information about the program
3 2015-11-02 05:00:00 Avatar some information about the program
4 2015-11-02 06:00:00 Tom some information about the program
5 2015-11-02 23:30:00 Jerry some information about the program
6 2015-11-03 05:00:00 Avatar some information about the program
2) dplyr: here it is using dplyr and the `datetime` function above. We could have replaced the `transform` and `subset` functions in (1) with dplyr `mutate` and `filter`, and `Lead` with `lead`, but for variety we do it another way:
library(dplyr)
library(zoo) # na.locf
# Build the Date/Program/Synopsis table from the raw read.table output DF.
DF %>%
  # "%y" parses the two-digit year ("01-Nov-15"); rows that are not dates become NA
  mutate(date = as.Date(V1, "%d-%b-%y")) %>%
  filter(lead(is.na(date), default = TRUE)) %>% # rm weekday rows
  mutate(date = na.locf(date)) %>% # fill in dates
  group_by(date) %>%
  mutate(Program = V2, Synopsis = lead(V1)) %>%
  # row 1 of each group is the date line; rows 2, 4, ... hold time|program
  slice(seq(2, n(), by = 2)) %>%
  ungroup() %>%
  mutate(Date = datetime(date, V1)) %>%
  select(Date, Program, Synopsis)
giving:
Source: local data frame [6 x 3]
Date Program Synopsis
(time) (chr) (chr)
1 2015-11-01 06:00:00 Tom some information about the program
2 2015-11-01 23:30:00 Jerry some information about the program
3 2015-11-02 05:00:00 Avatar some information about the program
4 2015-11-02 06:00:00 Tom some information about the program
5 2015-11-02 23:30:00 Jerry some information about the program
6 2015-11-03 05:00:00 Avatar some information about the program
3) data.table: this also uses `na.locf` from zoo and `datetime` defined in (1):
library(data.table)
library(zoo)
# Same table built step by step: := adds columns in place, and each
# `dt <- dt[...]` keeps a subset of the rows or columns.
dt <- data.table(DF)
dt[, date := as.Date(V1, "%d-%b-%y")]
dt <- dt[shift(is.na(date), type = "lead", fill = TRUE)] # rm weekday rows
dt[, date := na.locf(date)] # fill in dates
dt <- dt[duplicated(date)] # rm date rows
dt[, Synopsis := shift(V1, type = "lead")] # align Synopsis
dt <- dt[seq(1, .N, 2)] # keep the time|program rows
dt[, c("Date", "Program") := list(datetime(date, V1), V2)]
dt <- dt[, list(Date, Program, Synopsis)]
giving:
> dt
Date Program Synopsis
1: 2015-11-01 06:00:00 Tom some information about the program
2: 2015-11-01 23:30:00 Jerry some information about the program
3: 2015-11-02 05:00:00 Avatar some information about the program
4: 2015-11-02 06:00:00 Tom some information about the program
5: 2015-11-02 23:30:00 Jerry some information about the program
6: 2015-11-03 05:00:00 Avatar some information about the program
UPDATE: Simplified (1) and added (2) and (3).
来源:https://stackoverflow.com/questions/33719058/convert-unstructured-csv-file-to-a-data-frame