How to use xml sax parser to read and write a large xml?

前提是你 提交于 2019-11-29 06:45:32

You could use a xml.sax.saxutils.XMLFilterBase implementation to filter out your project1 nodes.

Instead of assembling the xml strings yourself you could use xml.sax.saxutils.XMLGenerator.

The following is Python3 code, adjust super if you require Python2.

from xml.sax import make_parser
from xml.sax.saxutils import XMLFilterBase, XMLGenerator


class Project1Filter(XMLFilterBase):
    """This decides which SAX events to forward to the ContentHandler

    We will not forward events when we are inside any elements with a
    name specified in the 'tags_names_to_exclude' parameter
    """

    def __init__(self, tag_names_to_exclude, parent=None):
        super().__init__(parent)

        # set of tag names to exclude
        self._tag_names_to_exclude = tag_names_to_exclude

        # _project_1_count keeps track of opened project1 elements
        self._project_1_count = 0

    def _forward_events(self):
        # will return True when we are not inside a project1 element
        return self._project_1_count == 0

    def startElement(self, name, attrs):
        if name in self._tag_names_to_exclude:
            self._project_1_count += 1

        if self._forward_events():
            super().startElement(name, attrs)

    def endElement(self, name):
        if self._forward_events():
            super().endElement(name)

        if name in self._tag_names_to_exclude:
            self._project_1_count -= 1

    def characters(self, content):
        if self._forward_events():
            super().characters(content)

    # override other content handler methods on XMLFilterBase as neccessary


def main():
    tag_names_to_exclude = {'project1', 'project2', 'project3'}
    reader = Project1Filter(tag_names_to_exclude, make_parser())

    with open('out-small.xml', 'w') as f:
        handler = XMLGenerator(f)
        reader.setContentHandler(handler)
        reader.parse('input.xml')


if __name__ == "__main__":
    main()
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!