问题
I'm trying to remove all the project1
nodes (along with their child elements) from the sample XML document below (the original document is about 30 GB) using a SAX parser. Either writing a separate modified file or editing the file in place would be fine.
sample.xml
<ROOT>
<test src="http://dfs.com">Hi</test>
<project1>This is old data<foo></foo></project1>
<bar>
<project1>ty</project1>
<foo></foo>
</bar>
</ROOT>
Here is my attempt:
parser.py
import xml.sax
from xml.sax.handler import ContentHandler
from xml.sax.saxutils import escape, quoteattr
class MyHandler(xml.sax.handler.ContentHandler):
def __init__(self, out_file):
self._charBuffer = []
self._result = []
self._out = open(out_file, 'w')
def _createElement(self, name, attrs):
attributes = attrs.items()
if attributes:
out = ''
for key, value in attributes:
out += ' {}={}'.format(key, value)
return '<{}{}>'.format(name, out)
return '<{}>'.format(name)
def _getCharacterData(self):
data = ''.join(self._charBuffer).strip()
self._charBuffer = []
self._out.write(data.strip()) #remove strip() if whitespace is important
def parse(self, f):
xml.sax.parse(f, self)
def characters(self, data):
self._charBuffer.append(data)
def startElement(self, name, attrs):
if not name == 'project1':
self._result.append({})
self._out.write(self._createElement(name, attrs))
def endElement(self, name):
if not name == 'project1': self._result[-1][name] = self._getCharacterData()
# Run the handler over sample.xml; its output goes to out.xml.
handler = MyHandler('out.xml')
handler.parse("sample.xml")
I can't get it to work.
回答1:
You could use an xml.sax.saxutils.XMLFilterBase implementation to filter out your project1 nodes.
Instead of assembling the XML strings yourself, you could use xml.sax.saxutils.XMLGenerator.
The following is Python 3 code; adjust the super() calls
if you require Python 2.
from xml.sax import make_parser
from xml.sax.saxutils import XMLFilterBase, XMLGenerator
class Project1Filter(XMLFilterBase):
"""This decides which SAX events to forward to the ContentHandler
We will not forward events when we are inside any elements with a
name specified in the 'tags_names_to_exclude' parameter
"""
def __init__(self, tag_names_to_exclude, parent=None):
super().__init__(parent)
# set of tag names to exclude
self._tag_names_to_exclude = tag_names_to_exclude
# _project_1_count keeps track of opened project1 elements
self._project_1_count = 0
def _forward_events(self):
# will return True when we are not inside a project1 element
return self._project_1_count == 0
def startElement(self, name, attrs):
if name in self._tag_names_to_exclude:
self._project_1_count += 1
if self._forward_events():
super().startElement(name, attrs)
def endElement(self, name):
if self._forward_events():
super().endElement(name)
if name in self._tag_names_to_exclude:
self._project_1_count -= 1
def characters(self, content):
if self._forward_events():
super().characters(content)
# override other content handler methods on XMLFilterBase as neccessary
def main(input_path='input.xml', output_path='out-small.xml',
         tag_names_to_exclude=None):
    """Copy *input_path* to *output_path* without the excluded elements.

    Args:
        input_path: XML file to read.
        output_path: File that receives the filtered XML.
        tag_names_to_exclude: Tag names to drop; defaults to
            ``{'project1', 'project2', 'project3'}``.
    """
    if tag_names_to_exclude is None:
        tag_names_to_exclude = {'project1', 'project2', 'project3'}
    reader = Project1Filter(tag_names_to_exclude, make_parser())
    # Open the file with the same encoding XMLGenerator declares in the XML
    # header; its default declaration is iso-8859-1, which need not match
    # the platform encoding used by a plain open(..., 'w').
    with open(output_path, 'w', encoding='utf-8') as f:
        handler = XMLGenerator(f, encoding='utf-8')
        reader.setContentHandler(handler)
        reader.parse(input_path)


if __name__ == "__main__":
    main()
来源:https://stackoverflow.com/questions/42325244/how-to-use-xml-sax-parser-to-read-and-write-a-large-xml