I have written a small python script to parse XML data based on Liza Daly's blog in Python. However, my code does not parse all the nodes. So for example when a person has had multiple addresses then it takes only the first available address. The XML tree would look like this:
- lgs
- entities
- entity
- id
- name
- addresses
- address
- address1
- address
- address1
- entity
- id
(...)
and this would be the python script:
import os
import time
from datetime import datetime
import lxml.etree as ET
import pandas as pd
# Location of the XML input, relative to the current working directory.
xml_file = '.\\FILE.XML'
# Drop any directory part, then separate the stem from the extension.
file_name, file_extension = os.path.splitext(os.path.split(xml_file)[1])
def fast_iter(context, *args, **kwargs):
    """Walk an iterparse context and collect selected child values column-wise.

    For every ``(event, elem)`` pair produced by ``context`` the text of the
    first node matching each configured path is appended to the matching
    column list; a path with no match contributes an empty string.  Because
    only the first match is taken, an element with several ``<address>``
    children still produces a single row.

    After an element has been read it is cleared and the already-processed
    siblings of it and of its ancestors are removed.

    :param context: iterable of ``(event, element)`` pairs, e.g. the result
        of ``lxml.etree.iterparse``.
    :param args: unused, accepted for call compatibility.
    :param kwargs: unused, accepted for call compatibility.
    :return: dict mapping column name to the list of collected values.
    """
    # NOTE(review): nothing below ever appends to "type", so its list stays
    # empty while the other columns grow -- confirm before building a
    # DataFrame from the result.
    xml_dict = {
        "Id": [],
        "name": [],
        "address": [],
        "type": [],
    }

    def try_to_append(element, xml_column, node):
        """Append the text of the first ``node`` match, or '' when absent."""
        try:
            xml_dict[xml_column].append(element.find(node).text)
        except AttributeError:
            # find() returned None: the element has no such descendant.
            xml_dict[xml_column].append('')

    for event, elem in context:
        try_to_append(elem, 'Id', 'id')
        try_to_append(elem, 'name', 'name')
        try_to_append(elem, 'address', 'addresses/address/address1')

        # Release the element's content, then drop the preceding siblings of
        # the element and of each of its ancestors.
        elem.clear()
        for ancestor in elem.xpath("ancestor-or-self::*"):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]

    # Drop the local reference to the exhausted parser context.
    del context
    return xml_dict
# Create the incremental parser, restricted to <entity> elements via the tag filter.
context = ET.iterparse(xml_file, tag='entity')
# Collect the per-entity values into a dict of column lists.
xml_dict = fast_iter(context)
# Build the DataFrame from that dict of columns.
df = pd.DataFrame(xml_dict)
So what I get is:
ID | name | address | Type
1 | John Doe | Pythonstreet 4 | A
But John Doe has 3 addresses on his name where he has lived. So I would expect:
ID | name | address | Type
1 | John Doe | Pythonstreet 4 | A
1 | John Doe | SQL street 33 | A
1 | John Doe | C++ street 99 | A
And as you can see, because the only extra node is address, the rest does not change and only the address is added extra. SQL Server would do this automatically when you use OPENROWSET() with BULK. However, this file is greater than 3GB so that wouldn't work.
UPDATE: XML Example (fake addresses)
<?xml version='1.0' encoding='UTF-8'?>
<ffl>
<version>12345</version>
<entities>
<entity id="1124353" version="12345">
<name>DAVID, Beckham</name>
<footId>1021</footId>
<footCode>FIF</footCode>
<createdDate>09/02/1991</createdDate>
<source>FIF</source>
<OriginalSource>FIFA</OriginalSource>
<pobs>
<pob>Leytonstone, London, United Kingdom</pob>
</pobs>
<dobs>
<dob>May 02, 1975</dob>
</dobs>
<titles>
<title>Football player</title>
</titles>
<addresses>
<address>
<address></address>
<city>London</city>
<country>UK</country>
<countryName>UNITED KINGDOM</countryName>
<postalCode>SE10 0JF</postalCode>
</address>
<address>
<address1>35-37 Parkgate Road</address1>
<city>London</city>
<country>UK</country>
<countryName>UNITED KINGDOM</countryName>
<postalCode>SW11 4NP</postalCode>
</address>
</addresses>
</entity>
<entity id="1184359" version="12345">
<name>CRISTIANO, Ronaldo</name>
<footId>1022</footId>
<footCode>FIF</footCode>
<createdDate>20/03/2003</createdDate>
<source>FIF</source>
<OriginalSource>FIFA</OriginalSource>
<pobs>
<pob>Funchal, Madeira, Portugal</pob>
</pobs>
<dobs>
<dob>February 05, 1985</dob>
</dobs>
<titles>
<title>Football player</title>
</titles>
<addresses>
<address>
<address>Avenida da Boavista 1837</address>
<city>Porto</city>
<country>PT</country>
<countryName>PORTUGAL</countryName>
<postalCode>4100-133</postalCode>
</address>
<address>
<address1>Extramuros 74</address1>
<city>Madrid</city>
<country>ES</country>
<countryName>Spain</countryName>
<postalCode>28400</postalCode>
</address>
<address>
<address1>Viale Certosa 29</address1>
<city>Turin</city>
<country>IT</country>
<countryName>Italy</countryName>
<postalCode>10093</postalCode>
</address>
</addresses>
</entity>
<entity id="1984359" version="12345">
<name>LIONEL, Messi</name>
<footId>1023</footId>
<footCode>FIF</footCode>
<createdDate>09/02/2008</createdDate>
<source>FIF</source>
<OriginalSource>FIFA</OriginalSource>
<pobs>
<pob>Rosario, Argentina</pob>
</pobs>
<dobs>
<dob>June 24, 1987</dob>
</dobs>
<titles>
<title>Football player</title>
</titles>
<addresses>
<address>
<address>Almeyra 2588</address>
<city>San Martin</city>
<state>Buenos Aires</state>
<country>AR</country>
<countryName>ARGENTINA</countryName>
<postalCode>N/A</postalCode>
</address>
<address>
<address1>Comandante Izarduy 67</address1>
<city>Barcelona</city>
<country>ES</country>
<countryName>SPAIN</countryName>
<postalCode>08940</postalCode>
</address>
<address>
<address1>Humahuaca 4425</address1>
<city>Buenos Aires</city>
<country>AR</country>
<countryName>ARGENTINA</countryName>
<postalCode>N/A</postalCode>
</address>
</addresses>
</entity>
</entities>
</ffl>
Comment: As it now only outputs results
Outputting results is only for demonstration, tracing and debugging.
To write a record and its addresses into a SQL database, for example using sqlite3, do:
# Insert the entity row; the named placeholders read 'id' and 'name' from the record dict.
c.execute("INSERT INTO entity(id, name) VALUES(:id, :name)", record)
# Build one parameter dict per address.  Every dict carries all three named
# bindings: the street line is stored under either 'address' or 'address1'
# depending on the entry, and a missing binding makes executemany() raise.
# Fresh dicts are built so the parsed record is left untouched.
addresses = [
    {
        'id': record['id'],
        'address': fields.get('address') or fields.get('address1'),
        'city': fields.get('city'),
    }
    for _tag, fields in record.get('addresses', [])
]
# NOTE(review): the table name is spelled 'adresses' here -- confirm it matches the schema.
c.executemany("INSERT INTO adresses(id, address, city) VALUES(:id, :address, :city)", addresses)
To flatten for pandas
Precondition outside the loop: df = pd.DataFrame()
# Detach the address list; what remains in 'record' is the per-entity base row.
addresses = record.pop('addresses', [])
# Build an independent dict per address.  Appending the same 'record' object
# repeatedly would leave every row pointing at one dict holding only the
# last address.  An entity without addresses still yields its base row.
df_records = [dict(record, **addr[1]) for addr in addresses] or [dict(record)]
# DataFrame.append() was removed in pandas 2.0; pd.concat() is the replacement.
df = pd.concat([df, pd.DataFrame(df_records)], ignore_index=True)
Question: Use etree.iterparse to include all nodes in XML file
The following class Entity does the following:
- Parses the XML file using lxml.etree.iterparse.
- There is no file size limit, as each <entity>...</entity> element tree is deleted after processing.
- Builds a dict {tag: value, ...} from every <entity>...</entity> tree.
- Uses a generator to yield each dict.
- Sequence elements, e.g. <addresses>/<address>, become a list of tuples [(address, {tag: text}), ...].
ToDo:
- To flatten into many records, loop over record['addresses'].
- Unify the different tag names address and address1.
- Flatten the sequence tags, e.g. <titles>, <pobs> and <dobs>.
from lxml import etree
class Entity:
    """Iterate over the ``<entity>`` elements of an XML file as dictionaries.

    Each ``<entity>`` is converted to a dict keyed by child tag name and is
    removed from the tree once the consumer asks for the next one.
    """

    def __init__(self, fh):
        """
        Initialize 'iterparse' to only generate 'end' events on tag '<entity>'

        :param fh: File handle of the XML file to parse
        """
        self.context = etree.iterparse(fh, events=("end",), tag=['entity'])

    @staticmethod
    def _is_element(node):
        """Return True for real elements.

        Comment and processing-instruction nodes expose a factory function
        instead of a string as their ``tag``; they carry no data and would
        otherwise end up as meaningless keys in the result.
        """
        return isinstance(node.tag, str)

    def _parse(self):
        """
        Parse the XML file for all '<entity>...</entity>' elements.
        Clear/delete the element tree after processing.

        :return: Yield the current '<entity>...</entity>' element tree
        """
        for event, elem in self.context:
            yield elem
            # Runs when the consumer requests the next entity: empty the
            # element just handled and drop the siblings preceding it.
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]

    def sequence(self, elements):
        """
        Expand a sequence element, e.g. <titles>, to tuples ('title', text).
        A child that has children of its own, e.g. <address>, becomes a
        tuple ('address', {tag: text, ...}).

        :param elements: The sequence element
        :return: List of tuples [(tag1, value), (tag2, value), ..., (tagn, value)]
        """
        _elements = []
        for elem in elements:
            if not self._is_element(elem):
                continue
            if len(elem):
                # Nested element: collapse its children into a dict.
                _elements.append((elem.tag, dict(self.sequence(elem))))
            else:
                _elements.append((elem.tag, elem.text))
        return _elements

    def __iter__(self):
        """
        Iterate all '<entity>...</entity>' element trees yielded from self._parse()

        :return: Yield a dict 'entity' {tag1: value, tag2: value, ..., tagn: value}
        """
        for xml_entity in self._parse():
            entity = {'id': xml_entity.attrib['id']}
            for elem in xml_entity:
                if not self._is_element(elem):
                    continue
                if len(elem):
                    # Child with children: store it as a list of (tag, value) tuples.
                    entity[elem.tag] = self.sequence(elem)
                else:
                    entity[elem.tag] = elem.text
            yield entity
if __name__ == "__main__":
    # Demonstration only: print every parsed record, then each of its fields.
    with open('.\\FILE.XML', 'rb') as in_xml:
        for record in Entity(in_xml):
            print("record:{}".format(record))
            for key, value in record.items():
                # List values (sequence elements) and plain values are
                # printed with the same format.
                print("{}:{}".format(key, value))
Output: Shows only the first Record and only 4 fields.
Note: There is a pitfall with the differing tag names: address and address1
record:{'id': '1124353', 'titles': {'title': 'Foot... (omitted for brevity) id:1124353 name:DAVID, Beckham titles:[('title', 'Football player')] addresses: address:{'city': 'London', 'address': None, 'post... (omitted for brevity) address:{'city': 'London', 'address1': '35-37 Par... (omitted for brevity)
Tested with Python: 3.5 - lxml.etree: 3.7.1
来源:https://stackoverflow.com/questions/53775497/iterparse-big-xml-with-low-memory-footprint-and-get-all-even-nested-sequence