Read XML file while it is being written (in Python)

一曲冷凌霜 提交于 2019-12-02 08:19:05

Yesterday I found Peter Gibson's answer about the undocumented xml.etree.ElementTree.XMLTreeBuilder._parser.EndElementHandler.

This example is similar to the other one but uses xml.etree.ElementTree (and watchdog).

It does not work when ElementTree is replaced by cElementTree :-/

import time
import watchdog.events
import watchdog.observers
import xml.etree.ElementTree

class XmlFileEventHandler(watchdog.events.PatternMatchingEventHandler):
    """Feed an XML file to an incremental ElementTree parser as it grows.

    Only paths matching ``*.xml`` are handled.  The first file reported as
    modified is opened once and kept open; every later event reads whatever
    is available on that same handle and feeds it to the parser.
    """

    def __init__(self):
        watchdog.events.PatternMatchingEventHandler.__init__(self, patterns=['*.xml'])
        self.xml_file = None
        self.parser = xml.etree.ElementTree.XMLTreeBuilder()

        def end_tag_event(tag):
            # Forward to the builder's private ``_end`` method and report
            # what it returns.
            node = self.parser._end(tag)
            # One pre-formatted string prints the same text under both the
            # Python 2 print statement and the Python 3 print function.
            print('tag= %s node= %s' % (tag, node))

        # Undocumented: install our callback as the end-element handler of
        # the object held in the builder's private ``_parser`` attribute.
        self.parser._parser.EndElementHandler = end_tag_event

    def on_modified(self, event):
        """Read newly available data from the file and feed it to the parser.

        ``event.src_path`` is only used to open the file on the first event;
        later events keep reading from the handle opened then.
        """
        if not self.xml_file:
            self.xml_file = open(event.src_path)
        data = self.xml_file.read()
        # Nothing new at the current position: leave the parser untouched.
        if data:
            self.parser.feed(data)

if __name__ == '__main__':
    observer = watchdog.observers.Observer()
    event_handler = XmlFileEventHandler()
    observer.schedule(event_handler, path='.')
    # Start outside the try block: if start() itself fails, the cleanup
    # below would call join() on a thread that was never started, and the
    # resulting RuntimeError would hide the real error.
    observer.start()
    try:
        # Keep the main thread alive until it is interrupted.
        while True:
            time.sleep(10)
    finally:
        observer.stop()
        observer.join()

While the script is running, remember to touch an XML file, or simulate on-the-fly writing with this one-line script:

# Copy in.xml to out.xml one line per second; IFS=, -r and the quoted "$line" keep each line byte-for-byte.
while IFS= read -r line; do printf '%s\n' "$line"; sleep 1; done <in.xml >out.xml &

For reference, xml.etree.ElementTree.iterparse does not seem to support a file that is still being written. My test code:

from __future__ import print_function, division
import xml.etree.ElementTree

if __name__ == '__main__':
    # Request only 'end' events and print each one with its element tag.
    for event, node in xml.etree.ElementTree.iterparse('f.xml', events=('end',)):
        print(event, node.tag)

My output:

end program
end version
end creator
end filename
end filesize
end tcpflow
end fileobject
end filename
end filesize
end tcpflow
end fileobject
end filename
end filesize
Traceback (most recent call last):
  File "./iter.py", line 9, in <module>
    for action, elem in context:
  File "/usr/lib64/python2.7/xml/etree/ElementTree.py", line 1281, in next
    self._root = self._parser.close()
  File "/usr/lib64/python2.7/xml/etree/ElementTree.py", line 1654, in close
    self._raiseerror(v)
  File "/usr/lib64/python2.7/xml/etree/ElementTree.py", line 1506, in _raiseerror
    raise err
xml.etree.ElementTree.ParseError: no element found: line 20, column 0

Three hours after posting my question I had not received any answer, but I have finally implemented the simple example I was looking for.

It is inspired by saaj's answer and is based on xml.sax and watchdog.

from __future__ import print_function, division
import time
import watchdog.events
import watchdog.observers
import xml.sax

class XmlStreamHandler(xml.sax.handler.ContentHandler):
  """SAX content handler that prints each start tag and each run of text.

  ``self.tag`` holds the name of the innermost element that is still open,
  or None when no element is open.
  """

  def __init__(self):
    xml.sax.handler.ContentHandler.__init__(self)
    self.tag = None
    # Names of the currently open elements, outermost first.
    self._open_tags = []

  def startElement(self, tag, attributes):
    """Print the tag name and its attribute items, then mark it as open."""
    print(tag, 'attributes=', attributes.items())
    self._open_tags.append(tag)
    self.tag = tag

  def endElement(self, tag):
    """Close the innermost element and make its parent current again."""
    # Without this, text that follows a closing tag would be reported
    # under the element that has just been closed.
    if self._open_tags:
      self._open_tags.pop()
    self.tag = self._open_tags[-1] if self._open_tags else None

  def characters(self, content):
    """Print a run of character data under the currently open element."""
    print(self.tag, 'content=', content)

class XmlFileEventHandler(watchdog.events.PatternMatchingEventHandler):
  """Feed an XML file to a SAX parser each time the file is modified.

  Only paths matching ``*.xml`` are handled.  The first file reported as
  modified is opened once and kept open; every later event reads whatever
  is available on that same handle and feeds it to the parser.
  """

  def __init__(self):
    watchdog.events.PatternMatchingEventHandler.__init__(self, patterns=['*.xml'])
    self.file = None
    self.parser = xml.sax.make_parser()
    self.parser.setContentHandler(XmlStreamHandler())

  def on_modified(self, event):
    """Read newly available data from the file and feed it to the parser.

    ``event.src_path`` is only used to open the file on the first event;
    later events keep reading from the handle opened then.
    """
    if not self.file:
      self.file = open(event.src_path)
    data = self.file.read()
    # A modification event does not guarantee new data at the current
    # position; skip the parser call when nothing was read.
    if data:
      self.parser.feed(data)

if __name__ == '__main__':
  observer = watchdog.observers.Observer()
  event_handler = XmlFileEventHandler()
  observer.schedule(event_handler, path='.')
  # Start outside the try block: if start() itself fails, the cleanup
  # below would call join() on a thread that was never started, and the
  # resulting RuntimeError would hide the real error.
  observer.start()
  try:
    # Keep the main thread alive until it is interrupted.
    while True:
      time.sleep(10)
  finally:
    observer.stop()
    observer.join()

While the script is running, remember to touch an XML file, or simulate on-the-fly writing with the following command:

# Copy in.xml to out.xml one line per second; IFS=, -r and the quoted "$line" keep each line byte-for-byte.
while IFS= read -r line; do printf '%s\n' "$line"; sleep 1; done <in.xml >out.xml &
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!