R XML - combining parent and child nodes into data frame

前端未结

关注

 3  1001

I have xml like this:


                      
              相关标签:


      
      
        
          3条回答        

        
                         				            
            
           
            
                              
                
              
              
                
                  感动是毒        
                
              
                            
                2020-12-06 16:21
              
            
            
                                                                       
Consider parsing meeting data by node index and expand it to the number of its child race elements, then column bind with race data:

doc <- xmlParse("/path/to/Source.xml")

# NUMBER OF MEETING NODES
mtg_num <- length(xpathSApply(doc, "//meeting"))

# DATAFRAME LIST OF EXPANDED MEETING ATTRS
meeting_list <- lapply(seq(mtg_num), function(i) {
  races_num <- length(xpathSApply(doc, sprintf("//meeting[%s]/race", i)))

  data.frame(
    meeting_id = rep(xpathSApply(doc, sprintf("//meeting[%s]/@id", i)), races_num),
    meeting_name = rep(xpathSApply(doc, sprintf("//meeting[%s]/@name", i)), races_num)
  )
})

# COLUMN BIND MEETING NODES, RACE NODES, AND RACE ATTRS
final_df <- cbind(do.call(rbind, meeting_list),
                  xmlToDataFrame(nodes = getNodeSet(doc, "//meeting/race")),
                  XML:::xmlAttrsToDataFrame(getNodeSet(doc, "//meeting/race")))


Output

head(final_df)

#   meeting_id      meeting_name  time       date ampm                                                       title type distance    group tipsAllowed predictorAllowed
# 1        195 Punchestown (IRE) 12:25 2018-01-13   pm                      Adare Manor Opportunity Handicap Chase    C     2m4f Handicap           1                1
# 2        195 Punchestown (IRE)  1:00 2018-01-13   pm         Total Event Rental (Kildare) Novice Chase (Grade 3)    C     2m4f  Grade 3           1                1
# 3        195 Punchestown (IRE)  1:35 2018-01-13   pm Connolly's RED MILLS Amateur National (Q.R.) Handicap Chase    C     3m1f Handicap           1                1
# 4        195 Punchestown (IRE)  2:10 2018-01-13   pm                Sky Bet Moscow Flyer Novice Hurdle (Grade 2)    H       2m  Grade 2           1                1
# 5        195 Punchestown (IRE)  2:45 2018-01-13   pm                              Sportinglife.com Maiden Hurdle    H       2m                    1                1
# 6        195 Punchestown (IRE)  3:20 2018-01-13   pm                       Leinster Leader Mares Handicap Hurdle    H  2m4f40y Handicap           1                1

#  bettingLink declaredRunners liveCommentary liveTab       raceDescription tvText betOffers     id perform_race_id perform_race_id_atr details_available race_status_code
# 1           1              10              1       1        Handicap Chase   ATR       <NA> 692415                                                     1                R
# 2           1               7              1       1  Novice Chase Grade 3   ATR       <NA> 692416                                                     1                R
# 3           1              12              1       1        Handicap Chase   ATR       <NA> 692417                                                     1                R
# 4           1               7              1       1 Novice Hurdle Grade 2   ATR       <NA> 692418                                                     1                R
# 5           1              17              1       1         Maiden Hurdle   ATR       <NA> 692419                                                     1                R
# 6           1               8              1       1       Handicap Hurdle   ATR       <NA> 692420                                                     1                R

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  甜味超标        
                
              
                            
                2020-12-06 16:24
              
            
            
                                                                       
Here's options with xml2 for XML handling and the tidyverse for munging. The attributes (xml_attrs returns a named character vector), node names, and node values can be read into a three-element list that can be coerced to a data frame:

library(tidyverse)
library(xml2)

x <- read_xml('races.xml')

races <- x %>% 
    xml_find_all('//race') %>% 
    map_dfr(~list(attrs = list(xml_attrs(.x)), 
                  variable = list(map(xml_children(.x), xml_name)), 
                  value = list(map(xml_children(.x), xml_text))))

races
#> # A tibble: 29 x 3
#>    attrs     variable    value      
#>    <list>    <list>      <list>     
#>  1 <chr [5]> <list [15]> <list [15]>
#>  2 <chr [5]> <list [15]> <list [15]>
#>  3 <chr [5]> <list [15]> <list [15]>
#>  4 <chr [5]> <list [15]> <list [15]>
#>  5 <chr [5]> <list [15]> <list [15]>
#>  6 <chr [5]> <list [15]> <list [15]>
#>  7 <chr [5]> <list [15]> <list [15]>
#>  8 <chr [5]> <list [16]> <list [16]>
#>  9 <chr [5]> <list [16]> <list [16]>
#> 10 <chr [5]> <list [16]> <list [16]>
#> # ... with 19 more rows

which can in turn be cleaned up with a lot of tidyr:
races_tidy <- races %>% 
    mutate(attr_names = map(attrs, names)) %>% 
    unnest(attr_names, attrs, .drop = FALSE) %>% 
    spread(attr_names, attrs) %>% 
    unnest(variable, value) %>% 
    unnest(variable, value) %>% 
    spread(variable, value) %>% 
    type_convert()    # fix variable types

This works, but the unnesting and spreading is fragile. Writing a more robust method is actually not too much more work, though, as you can just arrange the list columns before unnesting:

races_tidy2 <- races %>% 
    mutate(attrs = map(attrs, ~as_tibble(as.list(.x))), 
           data = map2(variable, value, ~as_tibble(set_names(.y, .x)))) %>% 
    unnest(attrs, data, .drop = TRUE) %>% 
    type_convert()

The most direct approach is to do the rearranging right while iterating over nodes. This is most concise and likely most efficient approach, but writing it correctly relies on careful manipulation of the data structures, so writing viable code may take longer.

races_tidy3 <- x %>% 
    xml_find_all('//race') %>% 
    map_dfr(~flatten(c(xml_attrs(.x), 
                       map(xml_children(.x), 
                           ~set_names(as.list(xml_text(.x)), xml_name(.x)))))) %>%
    type_convert()

races_tidy3
#> # A tibble: 29 x 21
#>        id perf… perf… deta… race… time  date       ampm  title type  dist…
#>     <int> <chr> <chr> <int> <chr> <tim> <date>     <chr> <chr> <chr> <chr>
#>  1 692415 <NA>  <NA>      1 R     12:25 2018-01-13 pm    Adar… C     2m4f 
#>  2 692416 <NA>  <NA>      1 R     01:00 2018-01-13 pm    Tota… C     2m4f 
#>  3 692417 <NA>  <NA>      1 R     01:35 2018-01-13 pm    Conn… C     3m1f 
#>  4 692418 <NA>  <NA>      1 R     02:10 2018-01-13 pm    Sky … H     2m   
#>  5 692419 <NA>  <NA>      1 R     02:45 2018-01-13 pm    Spor… H     2m   
#>  6 692420 <NA>  <NA>      1 R     03:20 2018-01-13 pm    Lein… H     2m4f…
#>  7 692421 <NA>  <NA>      1 R     03:50 2018-01-13 pm    Davi… B     2m   
#>  8 691061 <NA>  <NA>      1 R     12:40 2018-01-13 pm    Betf… H     2m   
#>  9 691060 <NA>  <NA>      1 R     01:15 2018-01-13 pm    Betf… C     2m54y
#> 10 691058 <NA>  <NA>      1 R     01:50 2018-01-13 pm    Betf… C     3m   
#> # ... with 19 more rows, and 10 more variables: group <chr>, tipsAllowed
#> #   <int>, predictorAllowed <int>, bettingLink <int>, declaredRunners
#> #   <int>, liveCommentary <int>, liveTab <int>, raceDescription <chr>,
#> #   tvText <chr>, betOffers <chr>

All return the same data, though column order is different for races_tidy.
all_equal(races_tidy, races_tidy2)
#> [1] TRUE

identical(races_tidy2, races_tidy3)
#> [1] TRUE

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
            
           
            
                              
                
              
              
                
                  花落未央        
                
              
                            
                2020-12-06 16:28
              
            
            
                                                                       
Alternatively, consider XSLT, the special-purpose langauge designed specifically to transform XML files such as flatter, simpler ones for your R needs. R can run XSLT 1.0 scripts with the xslt third-party package (extension of xml2). 

But also, XSLT is portable and can be run even outside R with Java, Python, PHP, or dedicated executables such as Saxon and Xalan. Below shows a system call to Unix's xsltproc. There is a similar batch call available for Windows. Once simplified, pass the new XML using XML's xmlToDataframe. 

Specifically, XSLT below parses down to race level and pulls meeting data from parent node.

XSLT (save as .xsl, a well-formed .xml file)

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" omit-xml-declaration="yes" indent="yes"/>
    <xsl:strip-space elements="*"/>

    <xsl:template match="/root/cards">
     <xsl:copy>
       <xsl:apply-templates select="meeting"/>
     </xsl:copy>
    </xsl:template>

    <xsl:template match="meeting">
       <xsl:apply-templates select="race"/>
    </xsl:template>

    <xsl:template match="race">
     <xsl:copy>
       <meeting_id><xsl:value-of select="ancestor::meeting/@id"/></meeting_id>
       <meeting_name><xsl:value-of select="ancestor::meeting/@name"/></meeting_name>
       <xsl:apply-templates select="@*"/>
       <xsl:copy-of select="*"/>
     </xsl:copy>
    </xsl:template>

    <xsl:template match="race/@*">
       <xsl:element name="{name(.)}"><xsl:value-of select="."/></xsl:element>
    </xsl:template>

</xsl:stylesheet>


R

library(XML)
library(xslt)

# LOAD XML AND XSL
input <- read_xml("/path/to/input.xml", package = "xslt")
style <- read_xml("/path/to/xslt_script.xsl", package = "xslt")

# TRANSFORM INPUT INTO OUTPUT
new_xml <- xml_xslt(input, style)
output <- as.character(new_xml)

# PARSE OUTPUT FROM STRING
doc <- xmlParse(output, asText=TRUE)

# COMMAND LINE CALL TO UNIX'S XSLTPROC (ALTERNATIVE TO xslt PACKAGE)
system("xsltproc -o /path/to/input.xml /path/to/xslt_script.xsl /path/to/output.xml")
doc <- xmlParse("/path/to/output.xml")

# BUILD DATAFRAME
df <- xmlToDataFrame(doc, nodes=getNodeSet(doc, '//race'))

                                                                        
                                                        
            
            
              
                
                0
              
                 
                
               讨论(0)
              
              
                                                   
              
                                                            
            
                      
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
          	          
                             
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复