问题
When reading a file, the `read.table` function uses `type.convert` to distinguish between logical, integer, numeric, complex, or factor columns and store them accordingly.
I'd like to add dates to the mix, so that columns containing dates are automatically recognized and parsed into `Date` objects. Only a few date formats should be recognized, e.g.
# Candidate date formats (strptime-style conversion specifications).
date.formats <- c("%m/%d/%Y", "%Y/%m/%d")
Here is an example:
# Sample input: a header row plus three data rows, whitespace-separated.
# The last two columns deliberately mix non-dates and inconsistent formats.
fh <- textConnection(
"num char date-format1 date-format2 not-all-dates not-same-formats
10 a 1/1/2013 2013/01/01 2013/01/01 1/1/2013
20 b 2/1/2013 2013/02/01 a 2013/02/01
30 c 3/1/2013 NA b 3/1/2013"
)
And the output of
# Desired usage: read the sample, then report the class inferred per column.
dat <- my.read.table(fh, header = TRUE, stringsAsFactors = FALSE,
date.formats = date.formats)
sapply(dat, class)
would give:
num => numeric
char => character
date-format1 => Date
date-format2 => Date
not-all-dates => character
not-same-formats => character # not a typo: date format must be consistent
Before I go and implement it from scratch, is something like this already available in a package? Or maybe someone already gave it a crack (or will) and is willing to share his code here? Thank you.
回答1:
You could use `lubridate::parse_date_time`, which is a bit stricter (and returns `POSIXct` data, hence the `as.Date()` wrapper below).
I've also added a bit more checking for existing NA values (may not be necessary).
For example:
library(lubridate)
#' Read a table and convert character columns that hold dates to `Date`.
#'
#' @param ... Arguments forwarded unchanged to `read.table()`.
#' @param date.formats Candidate formats, tried in order. Each one is passed
#'   as the second argument of `lubridate::parse_date_time()`.
#' @return The data frame produced by `read.table()`. A character column is
#'   replaced by a `Date` column when all of its non-missing values parse
#'   with one single format; the first such format wins.
my.read.table <- function(..., date.formats = c("%m/%d/%Y", "%Y/%m/%d")) {
  dat <- read.table(...)
  for (col.idx in seq_len(ncol(dat))) {
    x <- dat[[col.idx]]
    # Only plain character columns are candidates; factors and every other
    # type are left exactly as read.table() produced them.
    if (!is.character(x) || is.factor(x)) next
    # The mask of originally present values does not depend on the format,
    # so compute it once per column.
    present <- !is.na(x)
    if (!any(present)) next
    for (fmt in date.formats) {
      d <- as.Date(parse_date_time(x, fmt, quiet = TRUE))
      # A value that was present but failed to parse disqualifies this format.
      if (anyNA(d[present])) next
      dat[[col.idx]] <- d
      # Stop at the first matching format so that a later format cannot
      # overwrite an already successful conversion.
      break
    }
  }
  dat
}
# Read the sample with the default date.formats and inspect the column types.
dat <- my.read.table(fh, stringsAsFactors = FALSE,header=TRUE)
str(dat)
'data.frame': 3 obs. of 6 variables:
$ num : int 10 20 30
$ char : chr "a" "b" "c"
$ date.format1 : Date, format: "2013-01-01" "2013-02-01" "2013-03-01"
$ date.format2 : Date, format: "2013-01-01" "2013-02-01" NA
$ not.all.dates : chr "2013/01/01" "a" "b"
$ not.same.formats: chr "1/1/2013" "2013/02/01" "3/1/2013"
An alternative would be to use `options(warn = 2)` within the function and wrap the `parse_date_time(...)` call in a `try` statement:
#' Read a table and convert character columns that hold dates to `Date`.
#'
#' Variant that promotes warnings to errors while parsing, so a format is
#' rejected as soon as `parse_date_time()` signals any warning or error.
#'
#' @param ... Arguments forwarded unchanged to `read.table()`.
#' @param date.formats Candidate formats, tried in order. Each one is passed
#'   as the second argument of `lubridate::parse_date_time()`.
#' @return The data frame produced by `read.table()`, with each character
#'   column replaced by a `Date` column for the first format that parses it
#'   without a warning or error.
my.read.table <- function(..., date.formats = c("%m/%d/%Y", "%Y/%m/%d")) {
  dat <- read.table(...)
  # Turn warnings into errors for the parsing loop only; the previous setting
  # is restored on exit without clobbering other exit handlers.
  old.opts <- options(warn = 2)
  on.exit(options(old.opts), add = TRUE)
  for (col.idx in seq_len(ncol(dat))) {
    x <- dat[[col.idx]]
    # Only plain character columns are candidates; factors and every other
    # type are left exactly as read.table() produced them.
    if (!is.character(x) || is.factor(x)) next
    if (all(is.na(x))) next
    for (fmt in date.formats) {
      d <- try(as.Date(parse_date_time(x, fmt)), silent = TRUE)
      if (inherits(d, "try-error")) next
      dat[[col.idx]] <- d
      # Stop at the first matching format so that a later format cannot
      # overwrite an already successful conversion.
      break
    }
  }
  dat
}
回答2:
You can try with regular expressions.
#' Read a table and convert character columns that hold dates to `Date`.
#'
#' Each candidate format is translated into an anchored regular expression;
#' a column is converted only if every non-missing value matches that
#' expression in full and then parses to a valid date.
#'
#' @param ... Arguments forwarded unchanged to `read.table()`.
#' @param date.formats Candidate formats, tried in order. Only the `%m`, `%d`
#'   and `%Y` conversion specifications are translated; all other characters
#'   are used in the regular expression as they are.
#' @return The data frame produced by `read.table()`, with each qualifying
#'   character column replaced by a `Date` column (first matching format wins).
my.read.table <- function(..., date.formats = c("%m/%d/%Y", "%Y/%m/%d")) {
  # Regular-expression fragment for each supported conversion specification.
  formats <- c(
    "%m" = "[0-9]{1,2}",
    "%d" = "[0-9]{1,2}",
    "%Y" = "[0-9]{4}"
  )
  dat <- read.table(...)
  for (col.idx in seq_len(ncol(dat))) {
    x <- dat[[col.idx]]
    # Only plain character columns are candidates; factors and every other
    # type are left exactly as read.table() produced them.
    if (!is.character(x) || is.factor(x)) next
    present <- !is.na(x)
    if (!any(present)) next
    for (fmt in date.formats) {
      # Build the regex in its own variable: the untouched format string is
      # still needed for as.Date() below.
      pattern <- fmt
      for (k in names(formats)) {
        pattern <- gsub(k, formats[[k]], pattern, fixed = TRUE)
      }
      # Anchor so the whole value must match, not just a substring of it.
      pattern <- paste0("^", pattern, "$")
      if (!all(grepl(pattern, x[present]))) next
      d <- as.Date(x, format = fmt)
      # Right shape but impossible date (e.g. month 13): reject the format.
      if (anyNA(d[present])) next
      dat[[col.idx]] <- d
      break
    }
  }
  dat
}
# Read the sample and tabulate the class inferred for each column.
dat <- my.read.table(fh, header = TRUE, stringsAsFactors = FALSE)
as.data.frame(sapply(dat, class))
# sapply(dat, class)
# num integer
# char character
# date.format1 Date
# date.format2 Date
# not.all.dates character
# not.same.formats character
回答3:
Here is one I threw together quickly. It does not handle the last column properly because the `as.Date` function is not strict enough (for example, `as.Date("1/1/2013", "%Y/%m/%d")` parses without complaint).
#' Read a table and convert character columns that hold dates to `Date`.
#'
#' @param ... Arguments forwarded unchanged to `read.table()`.
#' @param date.formats Candidate `as.Date()` formats, tried in order.
#' @return The data frame produced by `read.table()`. A character column is
#'   replaced by a `Date` column for the first format under which none of its
#'   non-missing values parses to `NA`.
my.read.table <- function(..., date.formats = c("%m/%d/%Y", "%Y/%m/%d")) {
  dat <- read.table(...)
  for (col.idx in seq_len(ncol(dat))) {
    x <- dat[[col.idx]]
    # Only plain character columns are candidates; factors and every other
    # type are left exactly as read.table() produced them.
    if (!is.character(x) || is.factor(x)) next
    present <- !is.na(x)
    if (!any(present)) next
    for (f in date.formats) {
      d <- as.Date(x, format = f)
      if (anyNA(d[present])) next
      dat[[col.idx]] <- d
      # as.Date() is lenient: "1/1/2013" also parses under "%Y/%m/%d", giving
      # a date in year 1. Stop at the first format that works so a later,
      # looser match cannot overwrite the correct result.
      break
    }
  }
  dat
}
# Read the sample and tabulate the class inferred for each column.
dat <- my.read.table(fh, header = TRUE, stringsAsFactors = FALSE)
as.data.frame(sapply(dat, class))
# sapply(dat, class)
# num integer
# char character
# date.format1 Date
# date.format2 Date
# not.all.dates character
# not.same.formats Date
If you know a way to parse dates that is stricter about formats than `as.Date` (see the example above), please let me know.
Edit: To make the date parsing super strict, I can add
if (!identical(x, format(d, f))) next
For it to work, all my input dates will need leading zeroes where applicable, i.e. `01/01/2013` and not `1/1/2013`. I can live with that if that's the standard way.
来源:https://stackoverflow.com/questions/18390674/automatically-detect-date-columns-when-reading-a-file-into-a-data-frame