Comparing two xml files in python

后端 未结 3 1397
别那么骄傲
别那么骄傲 2020-12-03 12:28

I am new to programming in Python, and I have some trouble understanding the concepts. I wish to compare two XML files. These XML files are quite large. I will give an example…

相关标签:
3条回答
  • 2020-12-03 12:48

    This is actually a reasonably challenging problem (due to what "difference" means often being in the eye of the beholder here, as there will be semantically "equivalent" information that you probably don't want marked as differences).

    You could try using xmldiff, which is based on work in the paper Change Detection in Hierarchically Structured Information.

    0 讨论(0)
  • 2020-12-03 12:50

    Another script using xml.etree. It's awful, but it works :)

    #!/usr/bin/env python
    
    import sys
    import xml.etree.ElementTree as ET
    
    from termcolor import colored
    
    # Both input paths come from the command line; exit with a usage message
    # instead of an IndexError traceback when one of them is missing.
    if len(sys.argv) < 3:
        sys.exit("usage: %s FIRST.xml SECOND.xml" % sys.argv[0])

    # Parse each document and keep its root element.
    tree1 = ET.parse(sys.argv[1])
    root1 = tree1.getroot()

    tree2 = ET.parse(sys.argv[2])
    root2 = tree2.getroot()
    
    class Element:
        """Snapshot of one XML element, used to print a side-by-side diff.

        Attributes:
            name: the element's tag.
            subs: child elements wrapped as ``Element``, keyed by tag.
            atts: the element's attributes (name -> value).

        NOTE: ``subs`` is keyed by tag, so when several siblings share the
        same tag only the last one is kept and compared.
        """

        def __init__(self, e):
            """Recursively wrap the ElementTree element ``e``."""
            self.name = e.tag
            self.subs = {}
            for child in e:
                self.subs[child.tag] = Element(child)
            self.atts = dict(e.attrib)

            # Trace line emitted for every element that gets wrapped.
            print("name: %s, len(subs) = %d, len(atts) = %d" % (self.name, len(self.subs), len(self.atts)))

        def compare(self, el):
            """Print a colour-coded comparison of this element against ``el``.

            Attributes are printed in green when equal, red when the values
            differ and yellow when only one side has them.  Child elements
            found on only one side are printed in magenta; children found on
            both sides are compared recursively.

            :param el: the ``Element`` to compare against.
            :raises RuntimeError: if the two elements have different tags.
            """
            if self.name != el.name:
                raise RuntimeError("Two names are not the same")
            print("----------------------------------------------------------------")
            print(self.name)
            print("----------------------------------------------------------------")

            for att, v1 in self.atts.items():
                if att not in el.atts:
                    v2 = '[NA]'
                    color = 'yellow'
                else:
                    v2 = el.atts[att]
                    color = 'green' if v2 == v1 else 'red'
                print(colored("first:\t%s = %s" % (att, v1), color))
                print(colored("second:\t%s = %s" % (att, v2), color))

            # Attributes that exist only on the second element would otherwise
            # go unreported.
            for att, v2 in el.atts.items():
                if att not in self.atts:
                    print(colored("first:\t%s = %s" % (att, '[NA]'), 'yellow'))
                    print(colored("second:\t%s = %s" % (att, v2), 'yellow'))

            for sub_name, sub in self.subs.items():
                if sub_name not in el.subs:
                    # 'magenta' rather than 'purple': termcolor has no colour
                    # named 'purple'.
                    print(colored("first:\thas got %s" % sub_name, 'magenta'))
                    print(colored("second:\thasn't got %s" % sub_name, 'magenta'))
                else:
                    sub.compare(el.subs[sub_name])

            # Children that exist only on the second element.
            for sub_name in el.subs:
                if sub_name not in self.subs:
                    print(colored("first:\thasn't got %s" % sub_name, 'magenta'))
                    print(colored("second:\thas got %s" % sub_name, 'magenta'))
    
    
    
    # Wrap both parsed roots (first document first), then print the diff of
    # the first tree against the second.
    e1, e2 = Element(root1), Element(root2)
    e1.compare(e2)
    
    0 讨论(0)
  • 2020-12-03 12:51

    My approach to the problem was to parse each XML document into an xml.etree.ElementTree element and walk through the two trees layer by layer. I also included the ability to ignore a list of attributes while doing the comparison.

    The first block of code holds the class used:

    import xml.etree.ElementTree as ET
    import logging
    
    class XmlTree:
        """Structural comparison of two ``xml.etree.ElementTree`` elements.

        Every mismatch found by ``xml_compare`` is reported at DEBUG level
        through ``self.logger``, whose file handler writes to
        ``xml-comparison.log``.
        """

        def __init__(self):
            # delay=True: the log file is only opened when the first record is
            # actually written, so merely constructing a comparator does not
            # create or hold open a file.
            self.hdlr = logging.FileHandler('xml-comparison.log', delay=True)
            self.formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
            self.hdlr.setFormatter(self.formatter)

            self.logger = logging.getLogger(__name__)
            self.logger.setLevel(logging.DEBUG)
            # Loggers are shared by name; attach a handler only once so that
            # creating several XmlTree objects does not duplicate log lines.
            if not self.logger.handlers:
                self.logger.addHandler(self.hdlr)

        @staticmethod
        def convert_string_to_tree(xmlString):
            """Parse an XML string and return its root element."""
            return ET.fromstring(xmlString)

        def xml_compare(self, x1, x2, excludes=None):
            """
            Compares two xml etrees
            :param x1: the first tree
            :param x2: the second tree
            :param excludes: names to leave out of the comparison; a name is
                matched against attribute names and against the tags of
                x1's child elements (a child whose tag is listed is skipped)
            :return:
                True if both trees match, False at the first difference

            Children are compared pairwise, in document order.
            """
            excludes = () if excludes is None else excludes

            if x1.tag != x2.tag:
                self.logger.debug('Tags do not match: %s and %s', x1.tag, x2.tag)
                return False

            # Attributes of x1 must be present in x2 with the same value ...
            for name, value in x1.attrib.items():
                if name in excludes:
                    continue
                other = x2.attrib.get(name)
                if other != value:
                    self.logger.debug('Attributes do not match: %s=%r, %s=%r',
                                      name, value, name, other)
                    return False
            # ... and x2 must not carry attributes that x1 lacks.
            for name in x2.attrib:
                if name not in excludes and name not in x1.attrib:
                    self.logger.debug('x2 has an attribute x1 is missing: %s', name)
                    return False

            if not self.text_compare(x1.text, x2.text):
                self.logger.debug('text: %r != %r', x1.text, x2.text)
                return False
            if not self.text_compare(x1.tail, x2.tail):
                self.logger.debug('tail: %r != %r', x1.tail, x2.tail)
                return False

            # list(element) yields the direct children; Element.getchildren()
            # no longer exists on Python 3.9+.
            cl1 = list(x1)
            cl2 = list(x2)
            if len(cl1) != len(cl2):
                self.logger.debug('children length differs, %i != %i',
                                  len(cl1), len(cl2))
                return False

            for i, (c1, c2) in enumerate(zip(cl1, cl2), 1):
                if c1.tag in excludes:
                    continue
                if not self.xml_compare(c1, c2, excludes):
                    self.logger.debug('children %i do not match: %s', i, c1.tag)
                    return False
            return True

        def text_compare(self, t1, t2):
            """
            Compare two text strings
            :param t1: text one
            :param t2: text two
            :return:
                True if a match

            Two empty/None values match, a value of exactly '*' on either
            side matches anything, and otherwise the texts are compared with
            surrounding whitespace stripped.
            """
            if not t1 and not t2:
                return True
            if t1 == '*' or t2 == '*':
                return True
            return (t1 or '').strip() == (t2 or '').strip()
    

    The second block of code holds a couple of XML examples and their comparison:

    # Two sample documents that differ only in the text of <from>.
    xml1 = "<note><to>Tove</to><from>Jani</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"

    xml2 = "<note><to>Tove</to><from>Daniel</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>"

    tree1 = XmlTree.convert_string_to_tree(xml1)
    tree2 = XmlTree.convert_string_to_tree(xml2)

    comparator = XmlTree()

    # "from" is passed as an exclusion, so the differing <from> child is
    # left out of the comparison.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if comparator.xml_compare(tree1, tree2, ["from"]):
        print("XMLs match")
    else:
        print("XMLs don't match")
    

    Most of the credit for this code must be given to syawar

    0 讨论(0)
提交回复
热议问题