Question
I have a vector of JSONs (all with the same structure) and want to transform it into a data.frame. The following example does exactly what I want.
require(jsonlite) # fromJSON()
require(magrittr) # for the pipeline only
require(data.table) # rbindlist()
jsons <- c('{"num":1,"char":"a","list":{"x":1,"y":2}}',
'{"num":2,"char":"b","list":{"x":1,"y":2}}',
'{"num":3,"char":"c","list":{"x":1,"y":2}}')
df <- jsons %>%
lapply(fromJSON) %>%
lapply(as.data.frame.list, stringsAsFactors = F) %>%
rbindlist(fill = T)
Some elements of the JSON are objects, i.e. after fromJSON() some elements of the resulting list are themselves lists. I cannot apply unlist() to each list because the variables have different types, so I am using the as.data.frame.list() function. However, doing this for each JSON individually is too slow. Is there a way to do it more efficiently?
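For illustration, this is roughly what a single parsed document looks like; the nested "list" element is why unlist() is not an option (it would coerce num, char, x and y into one character vector):
str(fromJSON('{"num":1,"char":"a","list":{"x":1,"y":2}}'))
#> List of 3
#>  $ num : int 1
#>  $ char: chr "a"
#>  $ list:List of 2
#>   ..$ x: int 1
#>   ..$ y: int 2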
json <- '{"$schema":"http://json-schema.org/draft-04/schema#","title":"Product set","type":"array","items":{"title":"Product","type":"object","properties":{"id":{"description":"The unique identifier for a product","type":"number"},"name":{"type":"string"},"price":{"type":"number","minimum":0,"exclusiveMinimum":true},"tags":{"type":"array","items":{"type":"string"},"minItems":1,"uniqueItems":true},"dimensions":{"type":"object","properties":{"length":{"type":"number"},"width":{"type":"number"},"height":{"type":"number"}},"required":["length","width","height"]},"warehouseLocation":{"description":"Coordinates of the warehouse with the product","$ref":"http://json-schema.org/geo"}},"required":["id","name","price"]}}'
system.time(
  df <- json %>% rep(1000) %>%
    lapply(fromJSON) %>%
    lapply(as.data.frame.list, stringsAsFactors = FALSE) %>%
    rbindlist(fill = TRUE)
) # elapsed: ~2.72 s
I know there are plenty of similar questions, but most of the answers I have seen were about using as.data.frame() or data.frame(). Nobody mentioned speed. Maybe there is no better solution to this.
Answer 1:
I finally found the answer. It will be on CRAN soon.
devtools::install_github("jeremystan/tidyjson")
tidyjson::spread_all()
This function is about 10 times faster than my example above.
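As a rough sketch (assuming the GitHub version installed above; the exact column names may differ between versions), applying it to the jsons vector from the question looks like this:
library(tidyjson)
# spread_all() parses each document and spreads scalar fields, including
# nested ones, into columns (roughly: document.id, num, char, list.x, list.y)
df <- jsons %>% spread_all()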
Answer 2:
Try collapsing all the JSONs into one string. Here is an example of the solution:
require(jsonlite)
require(data.table)
json <- '{"$schema":"http://json-schema.org/draft-04/schema#","title":"Product set","type":"array","items":{"title":"Product","type":"object","properties":{"id":{"description":"The unique identifier for a product","type":"number"},"name":{"type":"string"},"price":{"type":"number","minimum":0,"exclusiveMinimum":true},"tags":{"type":"array","items":{"type":"string"},"minItems":1,"uniqueItems":true},"dimensions":{"type":"object","properties":{"length":{"type":"number"},"width":{"type":"number"},"height":{"type":"number"}},"required":["length","width","height"]},"warehouseLocation":{"description":"Coordinates of the warehouse with the product","$ref":"http://json-schema.org/geo"}},"required":["id","name","price"]}}'
n <- 1000
ex <- rep(json, n)
f1 <- function(x) {
  # original approach: parse and convert each JSON on its own
  res <- lapply(x, fromJSON)
  res <- lapply(res, as.data.frame.list, stringsAsFactors = FALSE)
  res <- rbindlist(res, fill = TRUE)
  return(res)
}
f2 <- function(x) {
  # collapse all documents into one JSON array and parse it in a single call
  res <- fromJSON(paste0("[", paste(x, collapse = ","), "]"), flatten = TRUE)
  # columns that are still lists are transposed and turned into data.tables,
  # then flatten() expands the resulting nested data.frame columns
  lst <- sapply(res, is.list)
  res[lst] <- lapply(res[lst], function(x) as.data.table(transpose(x)))
  res <- flatten(res)
  return(res)
}
bench::mark(
f1(ex), f2(ex), min_iterations = 100, check = FALSE
)
#> # A tibble: 2 x 14
#>   expression     min    mean  median      max `itr/sec` mem_alloc  n_gc n_itr total_time result memory time
#>   <chr>      <bch:t> <bch:t> <bch:t> <bch:tm>     <dbl> <bch:byt> <dbl> <int>   <bch:tm> <list> <list> <lis>
#> 1 f1(ex)       2.27s   2.35s   2.32s    2.49s     0.425        0B  5397   100      3.92m <data… <Rpro… <bch…
#> 2 f2(ex)     48.85ms 63.78ms 57.88ms 116.19ms    15.7          0B   143   100      6.38s <data… <Rpro… <bch…
#> # … with 1 more variable: gc <list>
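The gain comes from calling the parser once on a single JSON array instead of once per document. Applied to the three small JSONs from the original question, the collapsed call should give roughly:
fromJSON(paste0("[", paste(jsons, collapse = ","), "]"), flatten = TRUE)
#>   num char list.x list.y
#> 1   1    a      1      2
#> 2   2    b      1      2
#> 3   3    c      1      2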
Source: https://stackoverflow.com/questions/39343993/r-vector-of-jsons-to-data-frame