I'm learning Python and I'm trying to extract lists of all tags and corresponding values from any XML file. This is my code so far.
def ParseXml(XmlFile):
I believe this is what you need:
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Build the parse tree with the lxml backend. `yourlinkhere` is a placeholder
# left by the author -- presumably the markup fetched from a URL; confirm.
soup = BeautifulSoup(yourlinkhere, 'lxml')

# One entry per tag found by findChildren(): "name.child" when `tag.child`
# is truthy, otherwise just the tag name.
# NOTE(review): per the Beautiful Soup docs, attribute access like `tag.child`
# looks up an element literally named <child>; "any child element" was
# probably intended -- verify before relying on this.
lst = [
    str(tag.name) + '.' + str(tag.child) if tag.child else tag.name
    for tag in soup.findChildren()
]
Here's a recursive approach that uses only the standard library's xml.etree.ElementTree module (the ElementTree XML API):
import xml.etree.ElementTree as ET
def parse_node(node, ancestor_string=""):
    """Flatten the subtree rooted at *node* into three parallel lists.

    The subtree is walked depth-first in document order (the node itself
    first, then each child's subtree), so index ``i`` of every returned list
    describes the same element.

    Args:
        node: An ElementTree element; its ``tag``, ``text`` and ``attrib``
            are read and its children are iterated.
        ancestor_string: Dot-joined tag path of the node's ancestors, or an
            empty string when *node* has no ancestors to report.

    Returns:
        A ``(tag_list, text_list, attr_list)`` tuple where
        ``tag_list`` holds dot-joined tag paths (e.g. ``"root.child"``),
        ``text_list`` holds each element's stripped ``text`` (``""`` when
        the element has no text), and ``attr_list`` holds each element's
        ``attrib`` mapping (the same object, not a copy).

    Only ``node.text`` is collected; text following a child element
    (``tail``) is not included.
    """
    if ancestor_string:
        node_string = ".".join([ancestor_string, node.tag])
    else:
        node_string = node.tag

    text = node.text
    tag_list = [node_string]
    text_list = [text.strip() if text else ""]
    attr_list = [node.attrib]

    # Each child receives this node's full path so it can build its own.
    for child_node in node:
        child_tags, child_texts, child_attrs = parse_node(
            child_node, ancestor_string=node_string
        )
        tag_list.extend(child_tags)
        text_list.extend(child_texts)
        attr_list.extend(child_attrs)

    return tag_list, text_list, attr_list
def parse_xml(file_name):
    """Parse the XML file *file_name* and print its flattened contents.

    Prints three lists, one per line: the dot-joined tag paths, the element
    texts, and the element attribute mappings, as produced by ``parse_node``
    for the document's root element.

    Args:
        file_name: Source accepted by ``ET.parse`` (e.g. a path to an XML
            file).
    """
    # Use the caller's file; the original ignored it and always read "test.xml".
    tree = ET.parse(file_name)
    root_node = tree.getroot()
    tags, texts, attrs = parse_node(root_node)
    print(tags)
    print(texts)
    print(attrs)
def main():
    """Script entry point: run ``parse_xml`` on the sample file."""
    sample_file = "a.xml"
    parse_xml(sample_file)


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Notes:
- parse_node's ancestor_string argument is computed for each node in the tree and passed to its (direct) children.
- Having two functions (main and parse_xml) where one just calls the other only adds a useless level of nesting, but it's a good practice that I got used to.

Output (I've run the script with Python 2.7 and Python 3.5):
['Application', 'Application.UserAuthRequest', 'Application.UserAuthRequest.VendorApp', 'Application.UserAuthRequest.VendorApp.AppName', 'Application.ApplicationRequest', 'Application.ApplicationRequest.GUID', 'Application.ApplicationRequest.Type', 'Application.ApplicationRequest.File', 'Application.ApplicationRequest.FileExtension', 'Application.ApplicationRequest.FileExtension.Result', 'Application.ApplicationRequest.FileExtension.Result.ResultCode']
['', '', '', 'SING', '', 'ABD45129-PD1212-121DFL', 'Streaming', '', '', '', 'Success']
[{'Version': '2.01'}, {}, {}, {}, {'ID': '12-123-AH'}, {}, {'tc': '200'}, {}, {'VendorCode': '200'}, {}, {'tc': '1'}]