问题
I have to deal with JSON documents that contain nested documents and at some level have an array which in turn contains individual documents that conceptionally would map back to "data frame rows" when reading/parsing the JSON in R.
First order problem/question
I'm looking for a way to ensure that
either all
data frames
are always turned intotibbles
or that at least the "leaf data frames" become
tibbles
while the the "parent data frames" are allowed to becomelists
for arbitrary nested structures, either directly upon parsing via {jsonlite}
or afterwards via {purrr}
.
Second order problem/question
How do I traverse lists and apply map
recursively with {purrr}
"the right
way"?
Related
- https://hendrikvanb.gitlab.io/2018/07/nested_data-json_to_tibble/
- Ensure that data frames become tibbles when reading MongoDB data with {mongolite}
Example
Example data
json <- '[
{
"labels": ["label-a", "label-b"],
"levelOne": {
"levelTwo": {
"levelThree": [
{
"x": "A",
"y": 1,
"z": true
},
{
"x": "B",
"y": 2,
"z": false
}
]
}
},
"schema": "0.0.1"
},
{
"labels": ["label-a", "label-b"],
"levelOne": {
"levelTwo": {
"levelThree": [
{
"x": "A",
"y": 10,
"z": false
},
{
"x": "B",
"y": 20,
"z": true
}
]
}
},
"schema": "0.0.1"
}
]'
Result after parsing and turning into tibble
x <- json %>% jsonlite::fromJSON()
x %>% str()
# 'data.frame': 2 obs. of 3 variables:
# $ labels :List of 2
# ..$ : chr "label-a" "label-b"
# ..$ : chr "label-a" "label-b"
# $ levelOne:'data.frame': 2 obs. of 1 variable:
# ..$ levelTwo:'data.frame': 2 obs. of 1 variable:
# .. ..$ levelThree:List of 2
# .. .. ..$ :'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# .. .. ..$ :'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 10 20
# .. .. .. ..$ z: logi FALSE TRUE
# $ schema : chr "0.0.1" "0.0.1"
x_tbl <- x %>% tibble::as_tibble()
x_tbl %>% str()
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# $ labels :List of 2
# ..$ : chr "label-a" "label-b"
# ..$ : chr "label-a" "label-b"
# $ levelOne:'data.frame': 2 obs. of 1 variable:
# ..$ levelTwo:'data.frame': 2 obs. of 1 variable:
# .. ..$ levelThree:List of 2
# .. .. ..$ :'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# .. .. ..$ :'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 10 20
# .. .. .. ..$ z: logi FALSE TRUE
# $ schema : chr "0.0.1" "0.0.1"
Desired result
x_tbl$levelOne <- x_tbl$levelOne %>% tibble::as_tibble()
x_tbl$levelOne$levelTwo <- x_tbl$levelOne$levelTwo %>%
tibble::as_tibble()
x_tbl$levelOne$levelTwo$levelThree <- x_tbl$levelOne$levelTwo$levelThree %>%
purrr::map(tibble::as_tibble)
x_tbl %>% str()
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# $ labels :List of 2
# ..$ : chr "label-a" "label-b"
# ..$ : chr "label-a" "label-b"
# $ levelOne:Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 1 variable:
# ..$ levelTwo:Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 1 variable:
# .. ..$ levelThree:List of 2
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 10 20
# .. .. .. ..$ z: logi FALSE TRUE
# $ schema : chr "0.0.1" "0.0.1"
If I try to do that via dplyr::mutate()
or purrr::map*_df()
, I get the Error: Column
is of unsupported class data.frame
error
Current implementation
I have something which looks good at first sight, but duplicates the nested
structure as soon as you cast the list
to tibble
. Even if it did work as
desired, it seems to complicated and brittle as it was designed with one
specific use case/JSON structure in mind:
tidy_nested_data_frames <- function(
x
) {
is_data_frame_that_should_be_list <- function(x) {
is.data.frame(x) && purrr::map_lgl(x, is.data.frame)
}
y <- x %>%
purrr::map_if(is_data_frame_that_should_be_list, as.list)
# Check for next data frame columns to handle:
false <- function(.x) FALSE
class_info <- y %>%
purrr::map_if(is.list, ~.x %>% purrr::map(is.data.frame), .else = false)
trans_to_tibble <- function(x) {
x %>% purrr::map(tibble::as_tibble)
}
purrr::map2(class_info, y, function(.x, .y) {
go_deeper <- .x %>% as.logical() %>% all()
if (go_deeper) {
# Continue if data frame columns have been detected:
tidy_nested_data_frames(.y)
} else {
# Handle data frames that have list columns that themselves carry the data
# frames we want to turn into tibbles:
# NOTE:
# This probably does not generalize well yet as the logic seems to much
# tied to my current use case!
if (.y %>% is.data.frame()) {
.y %>%
purrr::map_if(is.list, trans_to_tibble)
} else {
.y
}
}
})
}
Testing
x <- json %>%
jsonlite::fromJSON() %>%
tidy_nested_data_frames()
x %>% str()
# List of 3
# $ labels :List of 2
# ..$ : chr [1:2] "label-a" "label-b"
# ..$ : chr [1:2] "label-a" "label-b"
# $ levelOne:List of 1
# ..$ levelTwo:List of 1
# .. ..$ levelThree:List of 2
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr [1:2] "A" "B"
# .. .. .. ..$ y: int [1:2] 1 2
# .. .. .. ..$ z: logi [1:2] TRUE FALSE
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr [1:2] "A" "B"
# .. .. .. ..$ y: int [1:2] 10 20
# .. .. .. ..$ z: logi [1:2] FALSE TRUE
# $ schema : chr [1:2] "0.0.1" "0.0.1"
x_tbl <- x %>% tibble::as_tibble()
x_tbl %>% str()
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# $ labels :List of 2
# ..$ : chr "label-a" "label-b"
# ..$ : chr "label-a" "label-b"
# $ levelOne:List of 2
# ..$ levelTwo:List of 1
# .. ..$ levelThree:List of 2
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 10 20
# .. .. .. ..$ z: logi FALSE TRUE
# ..$ levelTwo:List of 1
# .. ..$ levelThree:List of 2
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 1 2
# .. .. .. ..$ z: logi TRUE FALSE
# .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
# .. .. .. ..$ x: chr "A" "B"
# .. .. .. ..$ y: int 10 20
# .. .. .. ..$ z: logi FALSE TRUE
# $ schema : chr "0.0.1" "0.0.1"
EDIT 2020-01-14
Trying out the approach of Alland Cameron "as is" I get:
library(tibble)
x %>%
recursive_tibble() %>%
str()
# List of 3
# $ labels :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 2 variables:
# ..$ V1: chr [1:2] "label-a" "label-b"
# ..$ V2: chr [1:2] "label-a" "label-b"
# $ levelOne:List of 1
# ..$ levelTwo:List of 1
# .. ..$ levelThree:Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 6 variables:
# .. .. ..$ x1: chr [1:2] "A" "A"
# .. .. ..$ x2: chr [1:2] "B" "B"
# .. .. ..$ y1: chr [1:2] "1" "10"
# .. .. ..$ y2: chr [1:2] "2" "20"
# .. .. ..$ z1: chr [1:2] "TRUE" "FALSE"
# .. .. ..$ z2: chr [1:2] "FALSE" "TRUE"
# $ schema : chr [1:2] "0.0.1" "0.0.1"
Session info
sessioninfo::session_info()
# ─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
# setting value
# version R version 3.6.1 (2019-07-05)
# os Pop!_OS 19.10
# system x86_64, linux-gnu
# ui RStudio
# language en_US:en
# collate en_US.UTF-8
# ctype en_US.UTF-8
# tz UTC
# date 2020-01-14
#
# ─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
# ! package * version date lib source
# askpass 1.1 2019-01-13 [1] CRAN (R 3.6.1)
# assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.1)
# backports 1.1.5 2019-10-02 [1] CRAN (R 3.6.1)
# bmp 0.3 2017-09-11 [1] CRAN (R 3.6.1)
# callr 3.4.0 2019-12-09 [1] CRAN (R 3.6.1)
# cli 2.0.1 2020-01-08 [1] CRAN (R 3.6.1)
# colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.1)
# config 0.3 2018-03-27 [1] CRAN (R 3.6.1)
# confx 0.0.0.9012 2020-01-05 [1] github (rappster/confx@9695409)
# crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.1)
# curl 4.3 2019-12-02 [1] CRAN (R 3.6.1)
# R depot.dts.dce * 0.1.1.9003 <NA> [?] <NA>
# desc 1.2.0 2018-05-01 [1] CRAN (R 3.6.1)
# devtools 2.2.1 2019-09-24 [1] CRAN (R 3.6.1)
# digest 0.6.23 2019-11-23 [1] CRAN (R 3.6.1)
# dplyr 0.8.3 2019-07-04 [1] CRAN (R 3.6.1)
# ellipsis 0.3.0 2019-09-20 [1] CRAN (R 3.6.1)
# fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.1)
# fs 1.3.1 2019-05-06 [1] CRAN (R 3.6.1)
# glue 1.3.1 2019-03-12 [1] CRAN (R 3.6.1)
# here 0.1 2017-05-28 [1] CRAN (R 3.6.1)
# igraph 1.2.4.2 2019-11-27 [1] CRAN (R 3.6.1)
# imager 0.41.2 2019-01-23 [1] CRAN (R 3.6.1)
# jpeg 0.1-8.1 2019-10-24 [1] CRAN (R 3.6.1)
# jsonlite 1.6 2018-12-07 [1] CRAN (R 3.6.1)
# knitr 1.26 2019-11-12 [1] CRAN (R 3.6.1)
# later 1.0.0 2019-10-04 [1] CRAN (R 3.6.1)
# lifecycle 0.1.0 2019-08-01 [1] CRAN (R 3.6.1)
# lubridate 1.7.4 2018-04-11 [1] CRAN (R 3.6.1)
# magick 2.2 2019-08-26 [1] CRAN (R 3.6.1)
# magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.1)
# memoise 1.1.0 2017-04-21 [1] CRAN (R 3.6.1)
# mongolite 2.1.0 2019-05-09 [1] CRAN (R 3.6.1)
# munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.1)
# openssl 1.4.1 2019-07-18 [1] CRAN (R 3.6.1)
# pillar 1.4.3 2019-12-20 [1] CRAN (R 3.6.1)
# pkgbuild 1.0.6 2019-10-09 [1] CRAN (R 3.6.1)
# pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.1)
# pkgload 1.0.2 2018-10-29 [1] CRAN (R 3.6.1)
# plyr 1.8.5 2019-12-10 [1] CRAN (R 3.6.1)
# png 0.1-7 2013-12-03 [1] CRAN (R 3.6.1)
# prettyunits 1.0.2 2015-07-13 [1] CRAN (R 3.6.1)
# processx 3.4.1 2019-07-18 [1] CRAN (R 3.6.1)
# promises * 1.1.0 2019-10-04 [1] CRAN (R 3.6.1)
# ps 1.3.0 2018-12-21 [1] CRAN (R 3.6.1)
# purrr 0.3.3 2019-10-18 [1] CRAN (R 3.6.1)
# R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.1)
# Rcpp 1.0.3 2019-11-08 [1] CRAN (R 3.6.1)
# readbitmap 0.1.5 2018-06-27 [1] CRAN (R 3.6.1)
# remotes 2.1.0 2019-06-24 [1] CRAN (R 3.6.1)
# renv 0.9.2 2019-12-09 [1] CRAN (R 3.6.1)
# rlang 0.4.2 2019-11-23 [1] CRAN (R 3.6.1)
# rprojroot 1.3-2 2018-01-03 [1] CRAN (R 3.6.1)
# rstudioapi 0.10 2019-03-19 [1] CRAN (R 3.6.1)
# scales 1.1.0 2019-11-18 [1] CRAN (R 3.6.1)
# sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.1)
# stringi 1.4.3 2019-03-12 [1] CRAN (R 3.6.1)
# stringr 1.4.0 2019-02-10 [1] CRAN (R 3.6.1)
# testthat * 2.3.1 2019-12-01 [1] CRAN (R 3.6.1)
# tibble * 2.1.3 2019-06-06 [1] CRAN (R 3.6.1)
# tidyr 1.0.0 2019-09-11 [1] CRAN (R 3.6.1)
# tidyselect 0.2.5 2018-10-11 [1] CRAN (R 3.6.1)
# tiff 0.1-5 2013-09-04 [1] CRAN (R 3.6.1)
# usethis 1.5.1 2019-07-04 [1] CRAN (R 3.6.1)
# utf8 1.1.4 2018-05-24 [1] CRAN (R 3.6.1)
# vctrs 0.2.99.9001 2020-01-08 [1] github (r-lib/vctrs@ab84679)
# withr 2.1.2 2018-03-15 [1] CRAN (R 3.6.1)
# xfun 0.11 2019-11-12 [1] CRAN (R 3.6.1)
# yaml 2.2.0 2018-07-25 [1] CRAN (R 3.6.1)
#
# [1] /home/janko/R/x86_64-pc-linux-gnu-library/3.6
# [2] /usr/local/lib/R/site-library
# [3] /usr/lib/R/site-library
# [4] /usr/lib/R/library
回答1:
I guess you're going to have to use recursion to go through the list. Here's an idea I had, but I could only get it to work with fromJSON
from the rjson package rather than the jsonlite package.
The first step is to define a recursive function to check the depth of a list element:
depth <- function(list_entry)
{
if (is.list(list_entry) & !is.tibble(list_entry))
return(max(sapply(list_entry, depth)) + 1)
else
return(0)
}
The next function recursively tries to make a tibble out of depth-1 elements (if they are vectors) or out of depth-2 elements (if the tibble values are listed individually). If it finds a depth-0 element it will return it unchanged, and if the element is > 2 deep or not suitable to turn into a tibble, it will pass the children nodes recursively for the same treatment.
recursive_tibble <- function(json_list)
{
lapply(json_list, function(y)
{
if(depth(y) == 0)
return(unlist(y))
if(depth(y) == 1)
{
if (length(y) < 2)
return(unlist(y))
if (length(unique(names(y))) == 1)
return(as_tibble(do.call("rbind", lapply(y, unlist))))
if (length(unique(unlist(lapply(y, length)))) == 1)
return(as_tibble(do.call("cbind", lapply(y, unlist))))
else return(unlist(y))
}
if (depth(y) == 2)
{
if (length(y) < 2)
return(recursive_tibble(y))
if (all(do.call(`==`, lapply(y, names))))
return(as_tibble(do.call("rbind", lapply(y, unlist))))
}
else return(recursive_tibble(y))
})
}
So now you can do:
recursive_tibble(x)
#> List of 2
#> $ :List of 5
#> ..$ _id : chr "1234"
#> ..$ createdAt: chr "2020-01-13 09:00:00"
#> ..$ labels : chr [1:2] "label-a" "label-b"
#> ..$ levelOne :List of 1
#> .. ..$ levelTwo:List of 1
#> .. .. ..$ levelThree:Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
#> .. .. .. ..$ x: chr [1:2] "A" "B"
#> .. .. .. ..$ y: chr [1:2] "1" "2"
#> .. .. .. ..$ z: chr [1:2] "TRUE" "FALSE"
#> ..$ schema : chr "0.0.1"
#> $ :List of 5
#> ..$ _id : chr "5678"
#> ..$ createdAt: chr "2020-01-13 09:01:00"
#> ..$ labels : chr [1:2] "label-a" "label-b"
#> ..$ levelOne :List of 1
#> .. ..$ levelTwo:List of 1
#> .. .. ..$ levelThree:Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of 3 variables:
#> .. .. .. ..$ x: chr [1:2] "A" "B"
#> .. .. .. ..$ y: chr [1:2] "1" "2"
#> .. .. .. ..$ z: chr [1:2] "TRUE" "FALSE"
#> ..$ schema : chr "0.0.1"
来源:https://stackoverflow.com/questions/59717301/recursively-ensuring-tibbles-instead-of-data-frames-when-parsing-manipulating-ne