I am new to programming in python,´and i have some troubles understanding the concept. I wish to compare two xml files. These xml files are quite large. I will give an examp
This is actually a reasonably challenging problem (due to what "difference" means often being in the eye of the beholder here, as there will be semantically "equivalent" information that you probably don't want marked as differences).
You could try using xmldiff, which is based on work in the paper Change Detection in Hierarchically Structured Information.
Another script using xml.etree. Its awful but it works :)
#!/usr/bin/env python
import sys
import xml.etree.ElementTree as ET
from termcolor import colored
tree1 = ET.parse(sys.argv[1])
root1 = tree1.getroot()
tree2 = ET.parse(sys.argv[2])
root2 = tree2.getroot()
class Element:
def __init__(self,e):
self.name = e.tag
self.subs = {}
self.atts = {}
for child in e:
self.subs[child.tag] = Element(child)
for att in e.attrib.keys():
self.atts[att] = e.attrib[att]
print "name: %s, len(subs) = %d, len(atts) = %d" % ( self.name, len(self.subs), len(self.atts) )
def compare(self,el):
if self.name!=el.name:
raise RuntimeError("Two names are not the same")
print "----------------------------------------------------------------"
print self.name
print "----------------------------------------------------------------"
for att in self.atts.keys():
v1 = self.atts[att]
if att not in el.atts.keys():
v2 = '[NA]'
color = 'yellow'
else:
v2 = el.atts[att]
if v2==v1:
color = 'green'
else:
color = 'red'
print colored("first:\t%s = %s" % ( att, v1 ), color)
print colored("second:\t%s = %s" % ( att, v2 ), color)
for subName in self.subs.keys():
if subName not in el.subs.keys():
print colored("first:\thas got %s" % ( subName), 'purple')
print colored("second:\thasn't got %s" % ( subName), 'purple')
else:
self.subs[subName].compare( el.subs[subName] )
e1 = Element(root1)
e2 = Element(root2)
e1.compare(e2)
My approach to the problem was transforming each XML into a xml.etree.ElementTree and iterating through each of the layers. I also included the functionality to ignore a list of attributes while doing the comparison.
The first block of code holds the class used:
import xml.etree.ElementTree as ET
import logging
class XmlTree():
def __init__(self):
self.hdlr = logging.FileHandler('xml-comparison.log')
self.formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
@staticmethod
def convert_string_to_tree( xmlString):
return ET.fromstring(xmlString)
def xml_compare(self, x1, x2, excludes=[]):
"""
Compares two xml etrees
:param x1: the first tree
:param x2: the second tree
:param excludes: list of string of attributes to exclude from comparison
:return:
True if both files match
"""
if x1.tag != x2.tag:
self.logger.debug('Tags do not match: %s and %s' % (x1.tag, x2.tag))
return False
for name, value in x1.attrib.items():
if not name in excludes:
if x2.attrib.get(name) != value:
self.logger.debug('Attributes do not match: %s=%r, %s=%r'
% (name, value, name, x2.attrib.get(name)))
return False
for name in x2.attrib.keys():
if not name in excludes:
if name not in x1.attrib:
self.logger.debug('x2 has an attribute x1 is missing: %s'
% name)
return False
if not self.text_compare(x1.text, x2.text):
self.logger.debug('text: %r != %r' % (x1.text, x2.text))
return False
if not self.text_compare(x1.tail, x2.tail):
self.logger.debug('tail: %r != %r' % (x1.tail, x2.tail))
return False
cl1 = x1.getchildren()
cl2 = x2.getchildren()
if len(cl1) != len(cl2):
self.logger.debug('children length differs, %i != %i'
% (len(cl1), len(cl2)))
return False
i = 0
for c1, c2 in zip(cl1, cl2):
i += 1
if not c1.tag in excludes:
if not self.xml_compare(c1, c2, excludes):
self.logger.debug('children %i do not match: %s'
% (i, c1.tag))
return False
return True
def text_compare(self, t1, t2):
"""
Compare two text strings
:param t1: text one
:param t2: text two
:return:
True if a match
"""
if not t1 and not t2:
return True
if t1 == '*' or t2 == '*':
return True
return (t1 or '').strip() == (t2 or '').strip()
The second block of code holds a couple of XML examples and their comparison:
xml1 = "<note><to>Tove</to><from>Jani</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"
xml2 = "<note><to>Tove</to><from>Daniel</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"
tree1 = XmlTree.convert_string_to_tree(xml1)
tree2 = XmlTree.convert_string_to_tree(xml2)
comparator = XmlTree()
if comparator.xml_compare(tree1, tree2, ["from"]):
print "XMLs match"
else:
print "XMLs don't match"
Most of the credit for this code must be given to syawar