问题
I am trying to read in and work with a horribly formatted debug log. There are no consistent delimiters, and line breaks do not appear to be encoded consistently either.
What I'd like to do is read in and parse the data to have a new line for each date (YYYY-MM-DD format).
I am trying to work within the tidyverse
but cannot seem to get something that will parse the file correctly.
Is there a way to force lines to be delimited by a date pattern?
None of these work:
library(tidyverse)
# Attempt 1: base R read.table(), skipping the first line.
Log_File <- read.table("Example.txt", header = FALSE, fill = TRUE, skip = 1,
                       allowEscapes = TRUE)
# Attempt 2: read_delim() with a single space as delimiter, first two rows only.
Log_File <- read_delim("Example.txt", col_names = FALSE, delim = " ", n_max = 2)
# Attempt 3: read_lines(), skipping the first line and reading everything else.
Log_File <- read_lines("Example.txt", skip = 1, n_max = -1L, na = character(),
                       locale = default_locale(), progress = interactive())
> Log_File
V1 V2 V3 V4 V5 V6 V7
1 2019-09-20 14:06:18.952 [Error] [main] > CloudStorageExtension.swift[line:38]-downloadData(node:storageObj:value:): Error
2 2019-09-20 14:06:18.953 [Error] [main] > AlertService.swift[line:310]-retrieveProfileName(): Unable
3 error : {
4 code : 404,
5 message : Not Found. Could not get object ,
6 status : GET_OBJECT
7 }
8 }, bucket=integration-c5068.appspot.com, data=<7b0a2020 22657272 6f72223a 207b0a20 20202022
9 74206765 74206f62 6a656374 222c0a20 20202022 73746174 7573223a
10 ResponseErrorDomain=com.google.HTTPStatus, ResponseErrorCode=404}
11 2019-09-20 14:06:18.953 [Error] [main] > AlertService.swift[line:314]-retrieveProfileName(): AlertSettings
12 error : {
13 code : 404,
14 message : Not Found. Could not get object ,
15 status : GET_OBJECT
16 }
17 }, bucket=integration-c5068.appspot.com, data=<7b0a2020 22657272 6f72223a 207b0a20 20202022
18 74206765 74206f62 6a656374 222c0a20 20202022 73746174 7573223a
19 ResponseErrorDomain=com.google.HTTPStatus, ResponseErrorCode=404}
20 2019-09-20 14:06:18.957 [Error] [main] > CloudStorageExtension.swift[line:38]-downloadData(node:storageObj:value:): Error
I know linking to a text file is frowned upon, so here is some raw text, hopefully this works:
2019-09-20 14:06:18.952 [Error] [main] > CloudStorageExtension.swift[line:38]-downloadData(node:storageObj:value:): Error occurs when download filestorage data with description: Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist.
2019-09-20 14:06:18.953 [Error] [main] > AlertService.swift[line:310]-retrieveProfileName(): Unable to get AlertSettings Name: Error Domain=FIRStorageErrorDomain Code=-13010 "Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist." UserInfo={object=App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data, ResponseBody={
"error": {
"code": 404,
"message": "Not Found. Could not get object",
"status": "GET_OBJECT"
}
}, bucket=integration-c5068.appspot.com, data=<7b0a2020 22657272 6f72223a 207b0a20 20202022 636f6465 223a2034 30342c0a 20202020 226d6573 73616765 223a2022 4e6f7420 466f756e 642e2020 436f756c 64206e6f 74206765 74206f62 6a656374 222c0a20 20202022 73746174 7573223a 20224745 545f4f42 4a454354 220a2020 7d0a7d>, data_content_type=application/json; charset=UTF-8, NSLocalizedDescription=Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist., ResponseErrorDomain=com.google.HTTPStatus, ResponseErrorCode=404}
2019-09-20 14:06:18.953 [Error] [main] > AlertService.swift[line:314]-retrieveProfileName(): AlertSettings Name object missing: Error Domain=FIRStorageErrorDomain Code=-13010 "Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist." UserInfo={object=App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data, ResponseBody={
"error": {
"code": 404,
"message": "Not Found. Could not get object",
"status": "GET_OBJECT"
}
}, bucket=integration-c5068.appspot.com, data=<7b0a2020 22657272 6f72223a 207b0a20 20202022 636f6465 223a2034 30342c0a 20202020 226d6573 73616765 223a2022 4e6f7420 466f756e 642e2020 436f756c 64206e6f 74206765 74206f62 6a656374 222c0a20 20202022 73746174 7573223a 20224745 545f4f42 4a454354 220a2020 7d0a7d>, data_content_type=application/json; charset=UTF-8, NSLocalizedDescription=Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist., ResponseErrorDomain=com.google.HTTPStatus, ResponseErrorCode=404}
2019-09-20 14:06:18.957 [Error] [main] > CloudStorageExtension.swift[line:38]-downloadData(node:storageObj:value:): Error occurs when download filestorage data with description: Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist.
Here is a dput as read in with:
# Read with a single space as the delimiter and no header row; spell out FALSE
# because the shorthand F is an ordinary, reassignable variable.
Log_File <- read_delim("Example.txt", col_names = FALSE, delim = " ")
Data <- structure(list(X1 = c("2019-09-20", "2019-09-20", "error\": {\n \"code\": 404,\n \"message\": \"Not Found. Could not get object\",\n \"status\": \"GET_OBJECT",
" }", "},", "2019-09-20", "error\": {\n \"code\": 404,\n \"message\": \"Not Found. Could not get object\",\n \"status\": \"GET_OBJECT",
" }", "},", "2019-09-20"), X2 = c("14:06:18.952", "14:06:18.953",
NA, NA, "bucket=integration-c5068.appspot.com,", "14:06:18.953",
NA, NA, "bucket=integration-c5068.appspot.com,", "14:06:18.957"
), X3 = c("[Error]", "[Error]", NA, NA, "data=<7b0a2020", "[Error]",
NA, NA, "data=<7b0a2020", "[Error]"), X4 = c("[main]", "[main]",
NA, NA, "22657272", "[main]", NA, NA, "22657272", "[main]"),
X5 = c(">", ">", NA, NA, "6f72223a", ">", NA, NA, "6f72223a",
">"), X6 = c("CloudStorageExtension.swift[line:38]-downloadData(node:storageObj:value:):",
"AlertService.swift[line:310]-retrieveProfileName():", NA,
NA, "207b0a20", "AlertService.swift[line:314]-retrieveProfileName():",
NA, NA, "207b0a20", "CloudStorageExtension.swift[line:38]-downloadData(node:storageObj:value:):"
), X7 = c("Error", "Unable", NA, NA, "20202022", "AlertSettings",
NA, NA, "20202022", "Error"), X8 = c("occurs", "to", NA,
NA, "636f6465", "Name", NA, NA, "636f6465", "occurs"), X9 = c("when",
"get", NA, NA, "223a2034", "object", NA, NA, "223a2034",
"when"), X10 = c("download", "AlertSettings", NA, NA, "30342c0a",
"missing:", NA, NA, "30342c0a", "download"), X11 = c("filestorage",
"Name:", NA, NA, "20202020", "Error", NA, NA, "20202020",
"filestorage"), X12 = c("data", "Error", NA, NA, "226d6573",
"Domain=FIRStorageErrorDomain", NA, NA, "226d6573", "data"
), X13 = c("with", "Domain=FIRStorageErrorDomain", NA, NA,
"73616765", "Code=-13010", NA, NA, "73616765", "with"), X14 = c("description:",
"Code=-13010", NA, NA, "223a2022", "Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist.",
NA, NA, "223a2022", "description:"), X15 = c("Object", "Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist.",
NA, NA, "4e6f7420", "UserInfo={object=App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data,",
NA, NA, "4e6f7420", "Object"), X16 = c("App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data",
"UserInfo={object=App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data,",
NA, NA, "466f756e", "ResponseBody={", NA, NA, "466f756e",
"App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data"
), X17 = c("does", "ResponseBody={", NA, NA, "642e2020",
NA, NA, NA, "642e2020", "does"), X18 = c("not", NA, NA, NA,
"436f756c", NA, NA, NA, "436f756c", "not"), X19 = c("exist.",
NA, NA, NA, "64206e6f", NA, NA, NA, "64206e6f", "exist.")), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -10L), problems = structure(list(
row = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 4L,
5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 6L, 7L, 8L, 9L
), col = c("X1", "X1", "X1", "X1", "X1", "X1", "X1", "X1",
"X1", "X1", NA, NA, NA, NA, "X1", "X1", "X1", "X1", "X1",
"X1", "X1", "X1", "X1", "X1", NA, NA, NA, NA), expected = c("delimiter or quote",
"delimiter or quote", "delimiter or quote", "delimiter or quote",
"delimiter or quote", "delimiter or quote", "delimiter or quote",
"delimiter or quote", "delimiter or quote", "delimiter or quote",
"19 columns", "19 columns", "19 columns", "19 columns", "delimiter or quote",
"delimiter or quote", "delimiter or quote", "delimiter or quote",
"delimiter or quote", "delimiter or quote", "delimiter or quote",
"delimiter or quote", "delimiter or quote", "delimiter or quote",
"19 columns", "19 columns", "19 columns", "19 columns"),
actual = c(":", "c", ":", "m", ":", "N", ",", "s", ":", "G",
"17 columns", "1 columns", "1 columns", "40 columns", ":",
"c", ":", "m", ":", "N", ",", "s", ":", "G", "16 columns",
"1 columns", "1 columns", "40 columns"), file = c("'Example.txt'",
"'Example.txt'", "'Example.txt'", "'Example.txt'", "'Example.txt'",
"'Example.txt'", "'Example.txt'", "'Example.txt'", "'Example.txt'",
"'Example.txt'", "'Example.txt'", "'Example.txt'", "'Example.txt'",
"'Example.txt'", "'Example.txt'", "'Example.txt'", "'Example.txt'",
"'Example.txt'", "'Example.txt'", "'Example.txt'", "'Example.txt'",
"'Example.txt'", "'Example.txt'", "'Example.txt'", "'Example.txt'",
"'Example.txt'", "'Example.txt'", "'Example.txt'")), row.names = c(NA,
-28L), class = c("tbl_df", "tbl", "data.frame")), spec = structure(list(
cols = list(X1 = structure(list(), class = c("collector_character",
"collector")), X2 = structure(list(), class = c("collector_character",
"collector")), X3 = structure(list(), class = c("collector_character",
"collector")), X4 = structure(list(), class = c("collector_character",
"collector")), X5 = structure(list(), class = c("collector_character",
"collector")), X6 = structure(list(), class = c("collector_character",
"collector")), X7 = structure(list(), class = c("collector_character",
"collector")), X8 = structure(list(), class = c("collector_character",
"collector")), X9 = structure(list(), class = c("collector_character",
"collector")), X10 = structure(list(), class = c("collector_character",
"collector")), X11 = structure(list(), class = c("collector_character",
"collector")), X12 = structure(list(), class = c("collector_character",
"collector")), X13 = structure(list(), class = c("collector_character",
"collector")), X14 = structure(list(), class = c("collector_character",
"collector")), X15 = structure(list(), class = c("collector_character",
"collector")), X16 = structure(list(), class = c("collector_character",
"collector")), X17 = structure(list(), class = c("collector_character",
"collector")), X18 = structure(list(), class = c("collector_character",
"collector")), X19 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 0), class = "col_spec"))
Any suggestions for appending rows without dates to the previous row/line?
回答1:
I don't think you can use a delimiter to do that, but a simple pattern match (line starts with `20`) should suffice:
Sample data: with a real file you would use `readLines` to read all of the text in; I'm faking it here:
# With a real log file, read the physical lines straight from disk:
# loglines <- readLines(filename)
# Here the sample text is embedded as one string instead and split on "\n",
# giving a character vector with one element per physical line.
loglines <- strsplit('2019-09-20 14:06:18.952 [Error] [main] > CloudStorageExtension.swift[line:38]-downloadData(node:storageObj:value:): Error occurs when download filestorage data with description: Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist.
2019-09-20 14:06:18.953 [Error] [main] > AlertService.swift[line:310]-retrieveProfileName(): Unable to get AlertSettings Name: Error Domain=FIRStorageErrorDomain Code=-13010 "Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist." UserInfo={object=App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data, ResponseBody={
"error": {
"code": 404,
"message": "Not Found. Could not get object",
"status": "GET_OBJECT"
}
}, bucket=integration-c5068.appspot.com, data=<7b0a2020 22657272 6f72223a 207b0a20 20202022 636f6465 223a2034 30342c0a 20202020 226d6573 73616765 223a2022 4e6f7420 466f756e 642e2020 436f756c 64206e6f 74206765 74206f62 6a656374 222c0a20 20202022 73746174 7573223a 20224745 545f4f42 4a454354 220a2020 7d0a7d>, data_content_type=application/json; charset=UTF-8, NSLocalizedDescription=Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist., ResponseErrorDomain=com.google.HTTPStatus, ResponseErrorCode=404}
2019-09-20 14:06:18.953 [Error] [main] > AlertService.swift[line:314]-retrieveProfileName(): AlertSettings Name object missing: Error Domain=FIRStorageErrorDomain Code=-13010 "Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist." UserInfo={object=App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data, ResponseBody={
"error": {
"code": 404,
"message": "Not Found. Could not get object",
"status": "GET_OBJECT"
}
}, bucket=integration-c5068.appspot.com, data=<7b0a2020 22657272 6f72223a 207b0a20 20202022 636f6465 223a2034 30342c0a 20202020 226d6573 73616765 223a2022 4e6f7420 466f756e 642e2020 436f756c 64206e6f 74206765 74206f62 6a656374 222c0a20 20202022 73746174 7573223a 20224745 545f4f42 4a454354 220a2020 7d0a7d>, data_content_type=application/json; charset=UTF-8, NSLocalizedDescription=Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist., ResponseErrorDomain=com.google.HTTPStatus, ResponseErrorCode=404}
2019-09-20 14:06:18.957 [Error] [main] > CloudStorageExtension.swift[line:38]-downloadData(node:storageObj:value:): Error occurs when download filestorage data with description: Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist.', "\n")[[1]]
Using this example, we group the lines together with `grepl` (which returns a `logical` vector) and `cumsum` on that:
# Flag the lines that open a new record. Anchoring on a full "YYYY-MM-DD "
# prefix is stricter than a bare "^20": a continuation line that merely begins
# with "20" (for example a chunk of the hex dump) is not mistaken for a new
# record.
is_record_start <- grepl("^[0-9]{4}-[0-9]{2}-[0-9]{2} ", loglines)
is_record_start
# [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
# [13] FALSE FALSE FALSE TRUE
# The running total of the flags gives every line the number of the record it
# belongs to.
cumsum(is_record_start)
# [1] 1 2 2 2 2 2 2 2 3 3 3 3 3 3 3 4
So the first line is by itself, the next 7 are together, etc.
# Number the records: every line that opens with a "YYYY-MM-DD " date starts a
# new record, and cumsum() carries that number over the continuation lines
# that follow. The full date prefix is stricter than a bare "^20", so a
# continuation line that merely begins with "20" cannot split a record.
record_id <- cumsum(grepl("^[0-9]{4}-[0-9]{2}-[0-9]{2} ", loglines))
# split() + lapply() works directly on the character vector; by() is designed
# for data frames and would first coerce the lines through as.data.frame().
combined <- lapply(split(loglines, record_id), paste, collapse = "\n")
str(combined)
# List of 4
# $ 1: chr "2019-09-20 14:06:18.952 [Error] [main] > CloudStorageExtension.swift[line:38]-downloadData(node:storageObj:valu"| __truncated__
# $ 2: chr "2019-09-20 14:06:18.953 [Error] [main] > AlertService.swift[line:310]-retrieveProfileName(): Unable to get Aler"| __truncated__
# $ 3: chr "2019-09-20 14:06:18.953 [Error] [main] > AlertService.swift[line:314]-retrieveProfileName(): AlertSettings Name"| __truncated__
# $ 4: chr "2019-09-20 14:06:18.957 [Error] [main] > CloudStorageExtension.swift[line:38]-downloadData(node:storageObj:valu"| __truncated__
# perhaps for convenience, flatten to a plain character vector (one per record):
combined <- unlist(combined, use.names = FALSE)
# one element:
combined[[2]]
# [1] "2019-09-20 14:06:18.953 [Error] [main] > AlertService.swift[line:310]-retrieveProfileName(): Unable to get AlertSettings Name: Error Domain=FIRStorageErrorDomain Code=-13010 \"Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist.\" UserInfo={object=App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data, ResponseBody={\n \"error\": {\n \"code\": 404,\n \"message\": \"Not Found. Could not get object\",\n \"status\": \"GET_OBJECT\"\n }\n}, bucket=integration-c5068.appspot.com, data=<7b0a2020 22657272 6f72223a 207b0a20 20202022 636f6465 223a2034 30342c0a 20202020 226d6573 73616765 223a2022 4e6f7420 466f756e 642e2020 436f756c 64206e6f 74206765 74206f62 6a656374 222c0a20 20202022 73746174 7573223a 20224745 545f4f42 4a454354 220a2020 7d0a7d>, data_content_type=application/json; charset=UTF-8, NSLocalizedDescription=Object App/Data/Users/U0bGtkevMkc8Z94KFIoYSKy87sS2/Modes/RealMode/Alert/Data does not exist., ResponseErrorDomain=com.google.HTTPStatus, ResponseErrorCode=404}"
(Note the embedded newlines within each string. This can easily be changed by changing the `collapse=` argument.)
This example might be parsed with `read.fwf`, such as:
# A text connection treats every "\n" as a line break, so the newlines embedded
# in each combined entry must be flattened first; otherwise each continuation
# line comes back as its own, mostly-NA row instead of staying with its record.
flat <- gsub("\n", " ", combined, fixed = TRUE)
# Widths: 24 = "YYYY-MM-DD HH:MM:SS.mmm ", 8 = "[Error] ", 7 = "[main] ".
# The last width is sized to the longest flattened record, because a fixed 999
# could cut off a long message once it sits on a single line.
out <- read.fwf(
  textConnection(flat),
  widths = c(24, 8, 7, max(1L, nchar(flat))),
  stringsAsFactors = FALSE
)
str(out)
# Expected: a data.frame with one row per log record (4 for the sample) and
# four character columns: V1 timestamp, V2 level, V3 thread, V4 rest of message.
This might benefit from removing surrounding whitespace, such as with
# Assign into out[] so the trimmed columns replace the old ones in place;
# a plain `out <- lapply(...)` would turn the data frame into a bare list.
out[] <- lapply(out, trimws)
来源:https://stackoverflow.com/questions/58208207/force-date-as-new-line-on-reading-non-delimited-text-file