I have to monitor an XML file that is being written by a tool running all day long, but the XML file is properly completed and closed only at the end of the day.
Same constraint
Yesterday I found Peter Gibson's answer about the undocumented xml.etree.ElementTree.XMLTreeBuilder._parser.EndElementHandler.
This example is similar to the other one but uses xml.etree.ElementTree (and watchdog).
It does not work when ElementTree is replaced by cElementTree :-/
import time
import watchdog.events
import watchdog.observers
import xml.etree.ElementTree
class XmlFileEventHandler(watchdog.events.PatternMatchingEventHandler):
def __init__(self):
watchdog.events.PatternMatchingEventHandler.__init__(self, patterns=['*.xml'])
self.xml_file = None
self.parser = xml.etree.ElementTree.XMLTreeBuilder()
def end_tag_event(tag):
node = self.parser._end(tag)
print 'tag=', tag, 'node=', node
self.parser._parser.EndElementHandler = end_tag_event
def on_modified(self, event):
if not self.xml_file:
self.xml_file = open(event.src_path)
buffer = self.xml_file.read()
if buffer:
self.parser.feed(buffer)
if __name__ == '__main__':
    # Watch the current directory and route its events to the XML handler.
    watcher = watchdog.observers.Observer()
    watcher.schedule(XmlFileEventHandler(), path='.')
    try:
        watcher.start()
        # Nothing else to do in the main flow: stay alive until interrupted.
        while True:
            time.sleep(10)
    finally:
        # Always shut the observer down and wait for it before exiting.
        watcher.stop()
        watcher.join()
While the script is running, do not forget to touch one XML file, or simulate the on-the-fly writing using this one-line script:
# Replay in.xml into out.xml at one line per second, in the background.
# IFS= and -r keep indentation and backslashes intact, quoting "$line"
# prevents word splitting and glob expansion of the XML text, and the
# [ -n "$line" ] test still emits a last line that has no trailing newline.
while IFS= read -r line || [ -n "$line" ]; do printf '%s\n' "$line"; sleep 1; done <in.xml >out.xml &
For information, xml.etree.ElementTree.iterparse does not seem to support a file that is still being written. My test code:
from __future__ import print_function, division
import xml.etree.ElementTree
if __name__ == '__main__':
    # Ask only for 'end' events and print each one with the element's tag.
    for event, element in xml.etree.ElementTree.iterparse('f.xml', events=('end',)):
        print(event, element.tag)
My output:
end program
end version
end creator
end filename
end filesize
end tcpflow
end fileobject
end filename
end filesize
end tcpflow
end fileobject
end filename
end filesize
Traceback (most recent call last):
File "./iter.py", line 9, in <module>
for action, elem in context:
File "/usr/lib64/python2.7/xml/etree/ElementTree.py", line 1281, in next
self._root = self._parser.close()
File "/usr/lib64/python2.7/xml/etree/ElementTree.py", line 1654, in close
self._raiseerror(v)
File "/usr/lib64/python2.7/xml/etree/ElementTree.py", line 1506, in _raiseerror
raise err
xml.etree.ElementTree.ParseError: no element found: line 20, column 0
Three hours after posting my question, I had received no answer, but I have finally implemented the simple example I was looking for.
It is inspired by saaj's answer and is based on xml.sax and watchdog.
from __future__ import print_function, division
import time
import watchdog.events
import watchdog.observers
import xml.sax
class XmlStreamHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that prints element starts and text chunks.

    Open elements are tracked on a stack so that text following a closed
    child element is reported against the parent that actually contains it,
    instead of against whichever element happened to be opened last.

    Attributes:
        tag: name of the innermost element that is currently open, or
            ``None`` when no element is open.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        # Names of the elements that are open, outermost first.
        self._open_tags = []
        self.tag = None

    def startElement(self, tag, attributes):
        """Print the element name with its attributes and make it current."""
        print(tag, 'attributes=', attributes.items())
        self._open_tags.append(tag)
        self.tag = tag

    def endElement(self, tag):
        """Drop the closed element and make its parent current again."""
        if self._open_tags:
            self._open_tags.pop()
        self.tag = self._open_tags[-1] if self._open_tags else None

    def characters(self, content):
        """Print a chunk of text together with the element containing it."""
        print(self.tag, 'content=', content)
class XmlFileEventHandler(watchdog.events.PatternMatchingEventHandler):
    """Stream the growth of a watched ``*.xml`` file into a SAX parser.

    The file named by the first modification event is opened once and the
    handle is kept, so each later event feeds the parser only the data
    appended since the previous read.
    """

    def __init__(self):
        watchdog.events.PatternMatchingEventHandler.__init__(
            self, patterns=['*.xml'])
        # Parser whose callbacks are handled by XmlStreamHandler.
        sax_parser = xml.sax.make_parser()
        sax_parser.setContentHandler(XmlStreamHandler())
        self.parser = sax_parser
        # Opened lazily by on_modified() on the first event.
        self.file = None

    def on_modified(self, event):
        """Feed everything not yet read from the file to the parser.

        event -- the watchdog event; its ``src_path`` is only used to open
                 the file when no file has been opened yet.
        """
        stream = self.file or open(event.src_path)
        self.file = stream
        self.parser.feed(stream.read())
if __name__ == '__main__':
    # Watch the current directory and hand its events to the SAX-based handler.
    watcher = watchdog.observers.Observer()
    watcher.schedule(XmlFileEventHandler(), path='.')
    try:
        watcher.start()
        # Keep the main flow alive; wake up every ten seconds until interrupted.
        while True:
            time.sleep(10)
    finally:
        # Stop the observer and wait for it to finish, whatever ended the loop.
        watcher.stop()
        watcher.join()
While the script is running, do not forget to touch one XML file, or simulate the on-the-fly writing using the following command:
# Replay in.xml into out.xml at one line per second, in the background.
# IFS= and -r keep indentation and backslashes intact, quoting "$line"
# prevents word splitting and glob expansion of the XML text, and the
# [ -n "$line" ] test still emits a last line that has no trailing newline.
while IFS= read -r line || [ -n "$line" ]; do printf '%s\n' "$line"; sleep 1; done <in.xml >out.xml &