问题
I am really new to XPath and R in general and am trying to convert an XML file into a data frame in R using XPath. With some help I have already managed to transform most of the information in the XML. However, now I am trying to take two consecutive elements and merge them into one data frame. Somehow I can't seem to get it right.
This is an excerpt of the xml data:
</customer-bootstrap-data>
<customer-bootstrap-data id="970911" customerName="HighIncome-1_4" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.1124173640233721,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.37606842556525066,-0.0,-0.0,-0.038684343289247636,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.8490012729862713,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0</netUsage>
</customer-bootstrap-data>
<customer-bootstrap-data id="970912" customerName="HighIncome-2_17" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.21395291779884928,-0.0,-0.0,-1.3581716633726693,-0.0,-0.0,-2.8140822306420716,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.0221045637055397,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-3.3,-3.223543705462774,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.153329511039221,-0.0,-0.0,-0.0,-0.0,-0.0,-0.820425411761537,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.7054631085029754,-0.0,-0.7130641168720118,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-2.5003661751788435,-0.0,-0.0,-3.3,-3.3,-0.6606989045692728,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.20818145620010853,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0493154269844851,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.0,-1.041919182358086,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.5334016276259916,-0.0,-0.0,-0.0</netUsage>
</customer-bootstrap-data>
I am able to generate a table with the first element using the following code:
# Pull the attribute vector of every <customer-bootstrap-data> node, then
# transpose the simplified result so that each node ends up as one row.
customerBoot <- data.frame(
  t(xpathSApply(xml, "//customer-bootstrap-data", xmlAttrs))
)
containing id, customerName and powerType. But I want the net usage to be also included for each customer id.
The following code selects all the information I want, but it doesn't let me transform the result into a data frame.
# Select every <customer-bootstrap-data> node anywhere in the document.
customerBoot <- getNodeSet(doc = xml, path = "//customer-bootstrap-data")
Any ideas going either way? I am looking for a fast solution.
Thank you!
回答1:
Let me know if this has speed issues (iterating over a huge XML doc in this manner can sometimes be slow):
library(XML)
library(purrr)
fil <- '<dat><customer-bootstrap-data id="970911" customerName="HighIncome-1_4" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.1124173640233721,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.37606842556525066,-0.0,-0.0,-0.038684343289247636,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.8490012729862713,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0</netUsage>
</customer-bootstrap-data>
<customer-bootstrap-data id="970912" customerName="HighIncome-2_17" powerType="ELECTRIC_VEHICLE">
<netUsage>0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.21395291779884928,-0.0,-0.0,-1.3581716633726693,-0.0,-0.0,-2.8140822306420716,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.0221045637055397,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-3.3,-3.223543705462774,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.153329511039221,-0.0,-0.0,-0.0,-0.0,-0.0,-0.820425411761537,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.7054631085029754,-0.0,-0.7130641168720118,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-2.5003661751788435,-0.0,-0.0,-3.3,-3.3,-0.6606989045692728,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.20818145620010853,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0493154269844851,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.0,-1.041919182358086,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-3.3,-0.5334016276259916,-0.0,-0.0,-0.0</netUsage>
</customer-bootstrap-data></dat>'
doc <- xmlParse(fil)
# One row per <customer-bootstrap-data> node, one column per XML attribute.
customerBoot <- xpathSApply(doc=doc, path="//customer-bootstrap-data", xmlAttrs)
customerBoot <- data.frame(t(customerBoot), stringsAsFactors=FALSE)
# purrr::by_row() is no longer part of purrr (it was moved to the purrrlyr
# package), so the wide X1..Xn columns are built with base R instead.
# Each customer node is visited once and its own <netUsage> child is read
# directly, rather than re-running a whole-document XPath query per id.
# The node set uses the same XPath as above, so it is in the same (document)
# order as the rows of customerBoot.
nodes <- getNodeSet(doc, "//customer-bootstrap-data")
usage <- lapply(nodes, function(node) {
  txt <- xpathSApply(node, "./netUsage", xmlValue)
  as.numeric(strsplit(txt, ",")[[1]])
})
# rbind() would silently recycle shorter vectors, so insist on equal lengths.
stopifnot(length(unique(lengths(usage))) == 1L)
usage <- do.call(rbind, usage)
colnames(usage) <- sprintf("X%d", seq_len(ncol(usage)))
customerBoot <- cbind(customerBoot, usage)
# limiting the "str()" equivalent "glimpse()" to 15 columns since there are >300 of them:
dplyr::glimpse(customerBoot[, 1:15])
## Observations: 2
## Variables: 15
## $ id (chr) "970911", "970912"
## $ customerName (chr) "HighIncome-1_4", "HighIncome-2_17"
## $ powerType (chr) "ELECTRIC_VEHICLE", "ELECTRIC_VEHICLE"
## $ X1 (dbl) 0, 0
## $ X2 (dbl) 0, 0
## $ X3 (dbl) 0, 0
## $ X4 (dbl) 0, 0
## $ X5 (dbl) 0, 0
## $ X6 (dbl) 0, 0
## $ X7 (dbl) 0, 0
## $ X8 (dbl) 0.0000000, -0.2139529
## $ X9 (dbl) 0, 0
## $ X10 (dbl) 0, 0
## $ X11 (dbl) 0.000000, -1.358172
## $ X12 (dbl) 0, 0
Alternate way with xml2:
library(purrr)
library(xml2)
doc <- read_xml(fil)
# For every customer node: take its attributes, look up the matching
# <netUsage> text by id, split it into numbers, and return a one-row data
# frame; map_df() row-binds the per-customer results.
xml_find_all(doc, "//customer-bootstrap-data") %>%
  xml_attrs() %>%
  map_df(function(x) {
    path <- sprintf("//customer-bootstrap-data[@id='%s']/netUsage", x["id"])
    # xml_find_first() is the current name for the deprecated xml_find_one()
    vals <- strsplit(xml_text(xml_find_first(doc, path)), ",")[[1]]
    # seq_along() stays empty for an empty vector, unlike 1:length(vals)
    vals <- setNames(as.numeric(vals), sprintf("X%d", seq_along(vals)))
    rbind.data.frame((c(as.list(x), as.list(vals))), stringsAsFactors=FALSE)
  })
UPDATE:
Here's one way to do it when netUsage is not of uniform length:
# Build one named list per customer (attributes + X1..Xn usage values) and let
# rbindlist(fill=TRUE) pad the shorter rows with NA.
# NOTE(review): this appears intended for a customerBoot that holds only the
# attribute columns (id, customerName, powerType); if X columns were already
# added they would be duplicated here -- confirm before running.
data.table::rbindlist(apply(customerBoot, 1, function(x) {
  path <- sprintf("//customer-bootstrap-data[@id='%s']/netUsage", x["id"])
  vals <- strsplit(xpathSApply(doc=doc, path=path, xmlValue), ",")[[1]]
  # seq_along() yields no names for an empty netUsage; 1:length(vals) would
  # produce c(1, 0) and make setNames() fail on the zero-length vector.
  c(as.list(x), as.list(setNames(as.numeric(vals), sprintf("X%d", seq_along(vals)))))
}), fill=TRUE)
来源:https://stackoverflow.com/questions/37565910/xpath-select-two-consecutive-elements-and-transform-them-into-one-data-frame