Converting nested list to dataframe

后端 未结 3 2181
南旧
南旧 2020-11-27 03:14

The goal is to convert a nested list, which sometimes contains missing records, into a data frame. An example of the structure when there are missing records is:



        
相关标签:
3条回答
  • 2020-11-27 03:50

    You can also use rbindlist from the data.table package (version 1.9.3 or later):

    ## rbindlist() is provided by the data.table package, not base R
    library(data.table)
    
    ## stack the records of mylist (the nested list to convert) row-wise;
    ## fill=TRUE allows records with differing field sets, and fields a record
    ## lacks show up as NA in the printed result below
    rbindlist(mylist, fill=TRUE)
    
    ## printed result:
    ##      Hit Project Year Rating      Launch  ID    Dept            Error
    ## 1:  True    Blue 2011      4 26 Jan 2012  19 1, 2, 4               NA
    ## 2: False      NA   NA     NA          NA  NA      NA Record not found
    ## 3:  True   Green 2004      8 29 Feb 2004 183    6, 8               NA
    
    0 讨论(0)
  • 2020-11-27 04:08

    I just developed a solution for this question that is applicable here, so I'll provide it here as well:

    ## Describe the type of a node: NULL for a NULL node, list(type='namedlist')
    ## for a list that carries names, otherwise list(type=<typeof>, len=<length>).
    tl <- function(e) {
        if (is.null(e)) {
            return(NULL)
        }
        kind <- typeof(e)
        isNamedList <- kind == 'list' && !is.null(names(e))
        if (isNamedList) {
            list(type='namedlist')
        } else {
            list(type=kind, len=length(e))
        }
    }
    ## Join the elements of v into a single comma-separated string.
    mkcsv <- function(v) {
        paste(v, collapse=',')
    }
    ## Render a key path as text: every key is prefixed with '/', a NULL key is
    ## shown as '*', and a multi-element key is comma-joined. An empty key list
    ## yields '/'.
    keyListToStr <- function(keyList) {
        keyToStr <- function(key) {
            if (is.null(key)) {
                return('*')
            }
            paste(key, collapse=',')
        }
        parts <- vapply(keyList, keyToStr, character(1L))
        paste0('/', parts, collapse='')
    }
    
    ## Recursively flatten one level of a nested list into data.frame columns.
    ##
    ## Arguments:
    ##   nodes    list holding, for each record (row), the node found at the
    ##            current key path; a NULL entry marks a record lacking this path
    ##   ...      extra arguments forwarded to every data.frame() call
    ##   keyList  key path from the main list down to the current level
    ##   sep      if non-NULL, multi-element atomic vectors are pasted into one
    ##            string using this separator; if NULL, every element position
    ##            becomes its own column
    ##   mkname   function(keyList,maxLen) that builds a column name
    ##
    ## Returns a list of single-column data.frames (possibly empty), one per
    ## leaf column found at or below this level.
    ## Stops if the non-NULL nodes at a level disagree on type, or if the type
    ## is neither a list nor one of the supported atomic types.
    ## Side effect: prints one trace line per call via cat().
    extractLevelColumns <- function(
        nodes, ## current level node selection
        ..., ## additional arguments to data.frame()
        keyList=list(), ## current key path under main list
        sep=NULL, ## optional string separator on which to join multi-element vectors; if NULL, will leave as separate columns
        mkname=function(keyList,maxLen) paste0(collapse='.',if (is.null(sep) && maxLen == 1L) keyList[-length(keyList)] else keyList) ## name builder from current keyList and character vector max length across node level; default to dot-separated keys, and remove last index component for scalars
    ) {
        cat(sprintf('extractLevelColumns(): %s\n',keyListToStr(keyList)));
        if (length(nodes) == 0L) return(list()); ## handle corner case of empty main list
        tlList <- lapply(nodes,tl);
        typeList <- do.call(c,lapply(tlList,`[[`,'type')); ## NULL nodes contribute nothing here
        if (length(typeList) == 0L) {
            ## every node at this level is NULL (e.g. a field that is explicitly NULL in
            ## all records); treat it as a scalar leaf with no values and emit an all-NA
            ## column, instead of failing below with an empty "inconsistent types ()" error
            naValues <- sapply(nodes,function(node) NA);
            naName <- if (is.null(sep)) mkname(c(keyList,1L),1L) else mkname(keyList,1L);
            return(list(setNames(data.frame(naValues,...),naName)));
        }; ## end if
        if (length(unique(typeList)) != 1L) stop(sprintf('error: inconsistent types (%s) at %s.',mkcsv(typeList),keyListToStr(keyList)));
        type <- typeList[1L];
        if (type == 'namedlist') { ## hash; recurse once per key seen in any node
            allKeys <- unique(do.call(c,lapply(nodes,names)));
            ret <- do.call(c,lapply(allKeys,function(key) {
                ## `[[` with a string key yields NULL for nodes that lack the key
                extractLevelColumns(lapply(nodes,`[[`,key),...,keyList=c(keyList,key),sep=sep,mkname=mkname);
            }));
        } else if (type == 'list') { ## array; recurse once per index position
            lenList <- do.call(c,lapply(tlList,`[[`,'len'));
            maxLen <- max(lenList,na.rm=TRUE);
            allIndexes <- seq_len(maxLen);
            ret <- do.call(c,lapply(allIndexes,function(index) {
                ## must be careful to translate out-of-bounds to NULL; happens automatically with string keys, but not with integer indexes
                children <- lapply(nodes,function(node) if (length(node) < index) NULL else node[[index]]);
                extractLevelColumns(children,...,keyList=c(keyList,index),sep=sep,mkname=mkname);
            }));
        } else if (type%in%c('raw','logical','integer','double','complex','character')) { ## atomic leaf node; build column
            lenList <- do.call(c,lapply(tlList,`[[`,'len'));
            maxLen <- max(lenList,na.rm=TRUE);
            if (is.null(sep)) {
                ## one column per element position; shorter (or missing) vectors are padded with NA
                ret <- lapply(seq_len(maxLen),function(i) {
                    values <- sapply(nodes,function(node) if (length(node) < i) NA else node[[i]]);
                    setNames(data.frame(values,...),mkname(c(keyList,i),maxLen));
                });
            } else {
                ## keep original type if maxLen is 1, IOW don't stringify
                values <- sapply(nodes,function(node) if (length(node) == 0L) NA else if (maxLen == 1L) node else paste(collapse=sep,node));
                ret <- list(setNames(data.frame(values,...),mkname(keyList,maxLen)));
            }; ## end if
        } else stop(sprintf('error: unsupported type %s at %s.',type,keyListToStr(keyList)));
        if (is.null(ret)) ret <- list(); ## handle corner case of exclusively empty sublists
        ret;
    }; ## end extractLevelColumns()
    ## Convenience entry point: collect the columns produced by
    ## extractLevelColumns() for mainList (extra arguments are forwarded
    ## unchanged) and column-bind them with cbind(). Yields NULL when there
    ## are no columns to bind.
    flattenList <- function(mainList,...) {
        columns <- extractLevelColumns(mainList,...)
        do.call(cbind,columns)
    }
    

    Execution:

    ## define data: three records; the second one lacks most fields and carries
    ## an Error field instead
    mylist <- list(
        list(Hit='True',Project='Blue',Year='2011',Rating='4',Launch='26 Jan 2012',ID='19',Dept='1, 2, 4'),
        list(Hit='False',Error='Record not found'),
        list(Hit='True',Project='Green',Year='2004',Rating='8',Launch='29 Feb 2004',ID='183',Dept='6, 8')
    )
    
    ## run it; every call of extractLevelColumns() prints one trace line naming
    ## the key path it is processing
    ## NOTE(review): keyListToStr() as defined above prefixes each key with '/',
    ## so with that definition the paths would print as '/', '/Hit', ... ; the
    ## trace recorded below presumably comes from an earlier revision - confirm
    df <- flattenList(mylist);
    ## extractLevelColumns():
    ## extractLevelColumns(): Hit
    ## extractLevelColumns(): Project
    ## extractLevelColumns(): Year
    ## extractLevelColumns(): Rating
    ## extractLevelColumns(): Launch
    ## extractLevelColumns(): ID
    ## extractLevelColumns(): Dept
    ## extractLevelColumns(): Error
    
    ## show the result: one row per record, one column per key seen in any
    ## record, with <NA> where a record lacks that key
    df;
    ##     Hit Project Year Rating      Launch   ID    Dept            Error
    ## 1  True    Blue 2011      4 26 Jan 2012   19 1, 2, 4             <NA>
    ## 2 False    <NA> <NA>   <NA>        <NA> <NA>    <NA> Record not found
    ## 3  True   Green 2004      8 29 Feb 2004  183    6, 8             <NA>
    

    My function is more powerful than data.table::rbindlist() as of 1.9.6, in that it can handle any number of nesting levels and different vector lengths across branches. In the linked question, my function correctly flattens the OP's list to a data.frame, but data.table::rbindlist() fails with "Error in rbindlist(jsonRList, fill = T) : Column 4 of item 16 is length 2, inconsistent with first column of that item which is length 1. rbind/rbindlist doesn't recycle as it already expects each item to be a uniform list, data.frame or data.table".

    0 讨论(0)
  • 2020-11-27 04:12

    You could create a list of data.frames:

    ## build one data.frame per record of mylist, keeping strings as character
    ## vectors rather than factors
    dfs <- lapply(mylist, function(rec) data.frame(rec, stringsAsFactors = FALSE))
    

    Then use one of these:

    ## rbind.fill() is provided by the plyr package; it row-binds the
    ## data.frames in dfs, filling columns a data.frame lacks with NA
    library(plyr)
    rbind.fill(dfs)
    

    or the faster

    ## rbind_all() is provided by the dplyr package; it row-binds the
    ## data.frames in dfs
    ## NOTE(review): rbind_all() was deprecated in later dplyr releases in
    ## favour of bind_rows() - confirm against the installed dplyr version
    library(dplyr)
    rbind_all(dfs)
    

    In the case of dplyr::rbind_all, I am surprised that it chooses to use "" instead of NA for missing data. If you remove stringsAsFactors = FALSE, you will get NA but at the cost of a warning... So suppressWarnings(rbind_all(lapply(mylist, data.frame))) would be an ugly but fast solution.

    0 讨论(0)
提交回复
热议问题