Building on another SO question, how can one check whether two well-formed XML snippets are semantically equal? All I need is "equal" or not, since I'm using this for unit tests.
I had the same problem: two documents I wanted to compare that had the same attributes but in different orders.
It seems that XML Canonicalization (C14N) in lxml works well for this, but I'm definitely not an XML expert. I'm curious to know if somebody else can point out drawbacks to this approach.
# Compare two XML strings three ways: Element objects directly, their
# pretty-printed serialisations, and their canonical (C14N) serialisations.
# NOTE(review): `etree`, `StringIO`, `xml_string1` and `xml_string2` are not
# defined in this snippet; presumably `from lxml import etree` and
# `from StringIO import StringIO` (Python 2, given the print statements) - confirm.

# One parser, created with remove_blank_text=True, is used for both documents.
parser = etree.XMLParser(remove_blank_text=True)
xml1 = etree.fromstring(xml_string1, parser)
xml2 = etree.fromstring(xml_string2, parser)
# 1) Direct comparison of the two parsed root elements.
print "xml1 == xml2: " + str(xml1 == xml2)
# 2) Comparison of the pretty-printed serialisations.
ppxml1 = etree.tostring(xml1, pretty_print=True)
ppxml2 = etree.tostring(xml2, pretty_print=True)
print "pretty(xml1) == pretty(xml2): " + str(ppxml1 == ppxml2)
# 3) Write each tree in canonical form into an in-memory buffer and compare
#    the buffer contents.
xml_string_io1 = StringIO()
xml1.getroottree().write_c14n(xml_string_io1)
cxml1 = xml_string_io1.getvalue()
xml_string_io2 = StringIO()
xml2.getroottree().write_c14n(xml_string_io2)
cxml2 = xml_string_io2.getvalue()
print "canonicalize(xml1) == canonicalize(xml2): " + str(cxml1 == cxml2)
Running this gives me:
$ python test.py
xml1 == xml2: False
pretty(xml1) == pretty(xml2): False
canonicalize(xml1) == canonicalize(xml2): True
Adapting Anentropic's great answer to Python 3 (basically, change iteritems() to items(), and basestring to str):
from lxml import etree
import xmltodict # pip install xmltodict
def normalise_dict(d):
    """
    Recursively convert a dict-like object (eg OrderedDict) into a plain dict.

    Nested mappings are converted too, and every list value is put into a
    canonical order, so two structures that differ only in the order of
    their list items normalise to equal results.

    :param d: anything ``dict()`` accepts (a mapping or an iterable of pairs).
    :return: a plain ``dict`` whose nested mappings are plain dicts and whose
        lists are sorted. The sort order is deterministic but otherwise
        arbitrary; it is meant for equality checks, not for display.
    """
    import json

    def _sort_key(item):
        # dicts and None cannot be ordered with "<" on Python 3, so sort on a
        # canonical JSON rendering instead; default=str keeps exotic leaf
        # values from breaking the key.
        return json.dumps(item, sort_keys=True, default=str)

    def _normalise(value):
        # Python 3 mappings expose items(), not iteritems().
        if hasattr(value, 'items'):
            return {k: _normalise(v) for k, v in value.items()}
        if isinstance(value, list):
            # Normalise first so that equal items produce equal sort keys.
            return sorted((_normalise(item) for item in value), key=_sort_key)
        return value

    return _normalise(dict(d))
def xml_compare(a, b):
    """
    Compares two XML documents (as str, bytes or etree).

    Anything that is not already serialised XML text is first serialised
    with ``etree.tostring``. Both documents are then parsed with xmltodict,
    passed through ``normalise_dict`` and compared, so element order is
    ignored to the extent that ``normalise_dict`` removes it.

    :param a: first document (``str``, ``bytes`` or an etree object).
    :param b: second document (``str``, ``bytes`` or an etree object).
    :return: True when the normalised structures are equal, else False.
    """
    # bytes is already serialised XML; only etree objects need tostring().
    if not isinstance(a, (str, bytes)):
        a = etree.tostring(a)
    if not isinstance(b, (str, bytes)):
        b = etree.tostring(b)
    a = normalise_dict(xmltodict.parse(a))
    b = normalise_dict(xmltodict.parse(b))
    return a == b
Here is a simple solution: convert the XML documents into dictionaries (with xmltodict) and compare the dictionaries.
import json
import xmltodict
class XmlDiff(object):
    """Holds the dictionary form of two XML documents and compares them."""

    def __init__(self, xml1, xml2):
        # Parse with xmltodict, then round-trip through JSON so that both
        # sides end up as the plain structures json.loads produces.
        def as_plain(document):
            parsed = xmltodict.parse(document)
            return json.loads(json.dumps(parsed))

        self.dict1 = as_plain(xml1)
        self.dict2 = as_plain(xml2)

    def equal(self):
        """Return the result of comparing the two stored dictionaries with ==."""
        same = self.dict1 == self.dict2
        return same
Unit test:
import unittest
class XMLDiffTestCase(unittest.TestCase):
    """Exercises XmlDiff.equal() on small <Stats> documents."""

    def test_xml_equal(self):
        # Same two attributes, written in a different order.
        first = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200" end="1276041599">
</Stats>"""
        second = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200" >
</Stats>"""
        diff = XmlDiff(first, second)
        self.assertTrue(diff.equal())

    def test_xml_not_equal(self):
        # The second document carries an extra "end" attribute.
        first = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats start="1275955200">
</Stats>"""
        second = """<?xml version='1.0' encoding='utf-8' standalone='yes'?>
<Stats end="1276041599" start="1275955200" >
</Stats>"""
        diff = XmlDiff(first, second)
        self.assertFalse(diff.equal())
Or as a simple Python function:
import json
import xmltodict
def xml_equal(a, b):
    """
    Compares two XML documents via their xmltodict representation.

    Each argument is parsed with ``xmltodict.parse``, round-tripped through
    JSON, and the two results are compared with ``==``.

    NOTE(review): the arguments are handed straight to ``xmltodict.parse``
    and nothing here sorts lists, so etree inputs and full insensitivity to
    element order are presumably not supported - confirm.
    """
    def to_plain(document):
        return json.loads(json.dumps(xmltodict.parse(document)))

    left = to_plain(a)
    right = to_plain(b)
    return left == right
Thinking about this problem, I came up with the following solution that renders XML elements comparable and sortable:
import xml.etree.ElementTree as ET
def cmpElement(x, y):
    """
    Three-way comparison of two XML elements, ignoring child order.

    Elements are compared, in this order, by type, tag, attributes,
    whitespace-stripped text (empty text counts as no text) and finally by
    their children, which are sorted with this same function before being
    compared pairwise. ``tail`` text is not taken into account.

    The function is self-contained: it does not use the ``cmp`` builtin or
    ``Element.getchildren()`` (both absent on Python 3) and does not need
    comparison operators to be defined on the element class.

    :param x: first element.
    :param y: second element (or any other object; a different type simply
        compares unequal).
    :return: -1, 0 or 1 when ``x`` sorts before, equal to or after ``y``.
    """
    from functools import cmp_to_key

    def _cmp(a, b):
        # Portable replacement for Python 2's cmp(); None sorts first so a
        # missing text value can be compared against a string.
        if a == b:
            return 0
        if a is None:
            return -1
        if b is None:
            return 1
        return -1 if a < b else 1

    # compare type (type objects are not orderable, so use their names)
    r = _cmp((type(x).__module__, type(x).__name__),
             (type(y).__module__, type(y).__name__))
    if r:
        return r
    # compare tag
    r = _cmp(x.tag, y.tag)
    if r:
        return r
    # compare tag attributes (dicts are not orderable, sorted pairs are)
    r = _cmp(sorted(x.attrib.items()), sorted(y.attrib.items()))
    if r:
        return r
    # compare stripped text content
    xtext = (x.text and x.text.strip()) or None
    ytext = (y.text and y.text.strip()) or None
    r = _cmp(xtext, ytext)
    if r:
        return r
    # compare sorted children: first differing pair decides, otherwise the
    # element with fewer children sorts first
    by_content = cmp_to_key(cmpElement)
    xchildren = sorted(x, key=by_content)
    ychildren = sorted(y, key=by_content)
    for xchild, ychild in zip(xchildren, ychildren):
        r = cmpElement(xchild, ychild)
        if r:
            return r
    return _cmp(len(xchildren), len(ychildren))
# Install the six rich-comparison methods on ET._ElementInterface; each one
# delegates to cmpElement and tests its -1/0/1 result.
# NOTE(review): `_ElementInterface` is a private ElementTree name and is
# presumably missing (or not patchable) on newer Python versions - confirm
# before relying on this.
# NOTE(review): __eq__ is replaced without a matching __hash__ - check whether
# elements are used as dict keys or set members anywhere.
ET._ElementInterface.__lt__ = lambda self, other: cmpElement(self, other) == -1
ET._ElementInterface.__gt__ = lambda self, other: cmpElement(self, other) == 1
ET._ElementInterface.__le__ = lambda self, other: cmpElement(self, other) <= 0
ET._ElementInterface.__ge__ = lambda self, other: cmpElement(self, other) >= 0
ET._ElementInterface.__eq__ = lambda self, other: cmpElement(self, other) == 0
ET._ElementInterface.__ne__ = lambda self, other: cmpElement(self, other) != 0