Better way to parse xml

核能气质少年 提交于 2020-01-12 03:36:12

问题


I've been parsing XML like this for years, and I have to admit when the number of different element becomes larger I find it a bit boring and exhausting to do, here is what I mean, sample dummy XML:

<?xml version="1.0"?>
<Order>
    <Date>2003/07/04</Date>
    <CustomerId>123</CustomerId>
    <CustomerName>Acme Alpha</CustomerName>
    <Item>
        <ItemId> 987</ItemId>
        <ItemName>Coupler</ItemName>
        <Quantity>5</Quantity>
    </Item>
    <Item>
        <ItemId>654</ItemId>
        <ItemName>Connector</ItemName>
        <Quantity unit="12">3</Quantity>
    </Item>
    <Item>
        <ItemId>579</ItemId>
        <ItemName>Clasp</ItemName>
        <Quantity>1</Quantity>
    </Item>
</Order>

This is relevant part (using sax) :

public class SaxParser extends DefaultHandler {

    boolean isItem = false;
    boolean isOrder = false;
    boolean isDate = false;
    boolean isCustomerId = false;
    private Order order;
    private Item item;

        @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
        if (localName.equalsIgnoreCase("ORDER")) {
            order = new Order();
        }

        if (localName.equalsIgnoreCase("DATE")) {
            isDate = true;
        }

        if (localName.equalsIgnoreCase("CUSTOMERID")) {
            isCustomerId = true;
        }

        if (localName.equalsIgnoreCase("ITEM")) {
            isItem = true;
        }
    }

    public void characters(char ch[], int start, int length) throws SAXException {

        if (isDate){
            SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
            String value = new String(ch, start, length);
            try {
                order.setDate(formatter.parse(value));
            } catch (ParseException e) {
                e.printStackTrace();
            }
        }

        if(isCustomerId){
            order.setCustomerId(Integer.valueOf(new String(ch, start, length)));
        }

        if (isItem) {
            item = new Item();
            isItem = false;
        }



    }

}

I'm wondering is there a way to get rid of these hideous booleans which keep growing with number of elements. There must be a better way to parse this relatively simple xml. Just by looking the lines of code necessary to do this task looks ugly.

Currently I'm using SAX parser, but I'm open to any other suggestions (other than DOM, I can't afford in memory parsers I have huge XML files).


回答1:


Here's an example of using JAXB with StAX.

Input document:

<?xml version="1.0" encoding="UTF-8"?>
<Personlist xmlns="http://example.org">
    <Person>
        <Name>Name 1</Name>
        <Address>
            <StreetAddress>Somestreet</StreetAddress>
            <PostalCode>00001</PostalCode>
            <CountryName>Finland</CountryName>
        </Address>
    </Person>
    <Person>
        <Name>Name 2</Name>
        <Address>
            <StreetAddress>Someotherstreet</StreetAddress>
            <PostalCode>43400</PostalCode>
            <CountryName>Sweden</CountryName>
        </Address>
    </Person>
</Personlist>

Person.java:

@XmlRootElement(name = "Person", namespace = "http://example.org")
public class Person {
    @XmlElement(name = "Name", namespace = "http://example.org")
    private String name;
    @XmlElement(name = "Address", namespace = "http://example.org")
    private Address address;

    public String getName() {
        return name;
    }

    public Address getAddress() {
        return address;
    }
}

Address.java:

public class Address {
    @XmlElement(name = "StreetAddress", namespace = "http://example.org")
    private String streetAddress;
    @XmlElement(name = "PostalCode", namespace = "http://example.org")
    private String postalCode;
    @XmlElement(name = "CountryName", namespace = "http://example.org")
    private String countryName;

    public String getStreetAddress() {
        return streetAddress;
    }

    public String getPostalCode() {
        return postalCode;
    }

    public String getCountryName() {
        return countryName;
    }
}

PersonlistProcessor.java:

public class PersonlistProcessor {
    public static void main(String[] args) throws Exception {
        new PersonlistProcessor().processPersonlist(PersonlistProcessor.class
                .getResourceAsStream("personlist.xml"));
    }

    // TODO: Instead of throws Exception, all exceptions should be wrapped
    // inside runtime exception
    public void processPersonlist(InputStream inputStream) throws Exception {
        JAXBContext jaxbContext = JAXBContext.newInstance(Person.class);
        XMLStreamReader xss = XMLInputFactory.newFactory().createXMLStreamReader(inputStream);
        // Create unmarshaller
        Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();
        // Go to next tag
        xss.nextTag();
        // Require Personlist
        xss.require(XMLStreamReader.START_ELEMENT, "http://example.org", "Personlist");
        // Go to next tag
        while (xss.nextTag() == XMLStreamReader.START_ELEMENT) {
            // Require Person
            xss.require(XMLStreamReader.START_ELEMENT, "http://example.org", "Person");
            // Unmarshall person
            Person person = (Person)unmarshaller.unmarshal(xss);
            // Process person
            processPerson(person);
        }
        // Require Personlist
        xss.require(XMLStreamReader.END_ELEMENT, "http://example.org", "Personlist");
    }

    private void processPerson(Person person) {
        System.out.println(person.getName());
        System.out.println(person.getAddress().getCountryName());
    }
}



回答2:


If you control the definition of the XML, you could use an XML binding tool, for example JAXB (Java Architecture for XML Binding.) In JAXB you can define a schema for the XML structure (XSD and others are supported) or annotate your Java classes in order to define the serialization rules. Once you have a clear declarative mapping between XML and Java, marshalling and unmarshalling to/from XML becomes trivial.

Using JAXB does require more memory than SAX handlers, but there exist methods to process the XML documents by parts: Dealing with large documents.

JAXB page from Oracle




回答3:


I've been using xsteam to serialize my own objects to xml and then load them back as Java objects. If you can represent everythign as POJOs and you properly annotate the POJOs to match the types in your xml file you might find it much easier to use.

When a String represents an object in XML, you can just write:

Order theOrder = (Order)xstream.fromXML(xmlString);

I have always used it to load an object into memory in a single line, but if you need to stream it and process as you go you should be able to use a HierarchicalStreamReader to iterate through the document. This might be very similar to Simple, suggested by @Dave.




回答4:


In SAX the parser "pushes" events at your handler, so you have to do all the housekeeping as you are used to here. An alternative would be StAX (the javax.xml.stream package), which is still streaming but your code is responsible for "pulling" events from the parser. This way the logic of what elements are expected in what order is encoded in the control flow of your program rather than having to be explicitly represented in booleans.

Depending on the precise structure of the XML there may be a "middle way" using a toolkit like XOM, which has a mode of operation where you parse a subtree of the document into a DOM-like object model, process that twig, then throw it away and parse the next one. This is good for repetitive documents with many similar elements that can each be processed in isolation - you get the ease of programming to a tree-based API within each twig but still have the streaming behaviour that lets you parse huge documents efficiently.

public class ItemProcessor extends NodeFactory {
  private Nodes emptyNodes = new Nodes();

  public Nodes finishMakingElement(Element elt) {
    if("Item".equals(elt.getLocalName())) {
      // process the Item element here
      System.out.println(elt.getFirstChildElement("ItemId").getValue()
         + ": " + elt.getFirstChildElement("ItemName").getValue());

      // then throw it away
      return emptyNodes;
    } else {
      return super.finishMakingElement(elt);
    }
  }
}

You can achieve a similar thing with a combination of StAX and JAXB - define JAXB annotated classes that represent your repeating element (Item in this example) and then create a StAX parser, navigate to the first Item start tag, and then you can unmarshal one complete Item at a time from the XMLStreamReader.




回答5:


As others suggested, a Stax model would be a better approach to minimize the memory foot print since it is a push based model. I have personally used Axio (Which is used in Apache Axis) and parse elements using XPath expressions which is less verbose than going through node elements as you have done in the code snippet provided.




回答6:


I've been using this library. It sits on top of the standard Java library and makes things easier for me. In particular, you can ask for a specific element or attribute by name, rather than using the big "if" statement you've described.

http://marketmovers.blogspot.com/2014/02/the-easy-way-to-read-xml-in-java.html




回答7:


There is another library which supports more compact XML parsing, RTXML. The library and its documentation is on rasmustorkel.com. I implemented the parsing of the file in the original question and I am including the complete program here:

package for_so;

import java.io.File;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import rasmus_torkel.xml_basic.read.TagNode;
import rasmus_torkel.xml_basic.read.XmlReadOptions;
import rasmus_torkel.xml_basic.read.impl.XmlReader;

public class Q15626686_ReadOrder
{
    public static class Order
    {
        public final Date            _date;
        public final int             _customerId;
        public final String          _customerName;
        public final ArrayList<Item> _itemAl;

        public
        Order(TagNode node)
        {
            _date = (Date)node.nextStringMappedFieldE("Date", Date.class);
            _customerId = (int)node.nextIntFieldE("CustomerId");
            _customerName = node.nextTextFieldE("CustomerName");
            _itemAl = new ArrayList<Item>();
            boolean finished = false;
            while (!finished)
            {
                TagNode itemNode = node.nextChildN("Item");
                if (itemNode != null)
                {
                    Item item = new Item(itemNode);
                    _itemAl.add(item);
                }
                else
                {
                    finished = true;
                }
            }
            node.verifyNoMoreChildren();
        }
    }

    public static final Pattern DATE_PATTERN = Pattern.compile("^(\\d\\d\\d\\d)\\/(\\d\\d)\\/(\\d\\d)$");

    public static class Date
    {
        public final String _dateString;
        public final int    _year;
        public final int    _month;
        public final int    _day;

        public
        Date(String dateString)
        {
            _dateString = dateString;
            Matcher matcher = DATE_PATTERN.matcher(dateString);
            if (!matcher.matches())
            {
                throw new RuntimeException(dateString + " does not match pattern " + DATE_PATTERN.pattern());
            }
            _year = Integer.parseInt(matcher.group(1));
            _month = Integer.parseInt(matcher.group(2));
            _day = Integer.parseInt(matcher.group(3));
        }
    }

    public static class Item
    {
        public final int      _itemId;
        public final String   _itemName;
        public final Quantity _quantity;

        public
        Item(TagNode node)
        {
            _itemId = node.nextIntFieldE("ItemId");
            _itemName = node.nextTextFieldE("ItemName");
            _quantity = new Quantity(node.nextChildE("Quantity"));
            node.verifyNoMoreChildren();
        }
    }

    public static class Quantity
    {
        public final int _unitSize;
        public final int _unitQuantity;

        public
        Quantity(TagNode node)
        {
            _unitSize = node.attributeIntD("unit", 1);
            _unitQuantity = node.onlyInt();
        }
    }

    public static void
    main(String[] args)
    {
        File xmlFile = new File(args[0]);
        TagNode orderNode = XmlReader.xmlFileToRoot(xmlFile, "Order", XmlReadOptions.DEFAULT);
        Order order = new Order(orderNode);
        System.out.println("Read order for " + order._customerName + " which has " + order._itemAl.size() + " items");
    }
}

You will notice that the retrieval functions end in N, E or D. They refer to what to do when the desired data item is not there. N stands for return Null, E stands for throw Exception and D stands for use Default.




回答8:


Solution without using outside package, or even XPath: use an enum "PARSE_MODE", probably in combination with a Stack<PARSE_MODE>:

1) The basic solution:

a) fields

private PARSE_MODE parseMode = PARSE_MODE.__UNDEFINED__;
// NB: essential that all these enum values are upper case, but this is the convention anyway
private enum PARSE_MODE {
    __UNDEFINED__, ORDER, DATE, CUSTOMERID, ITEM };
private List<String> parseModeStrings = new ArrayList<String>();
private Stack<PARSE_MODE> modeBreadcrumbs = new Stack<PARSE_MODE>();

b) make your List<String>, maybe in the constructor:

    for( PARSE_MODE pm : PARSE_MODE.values() ){
        // might want to check here that these are indeed upper case
        parseModeStrings.add( pm.name() );
    }

c) startElement and endElement:

@Override
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
    String localNameUC = localName.toUpperCase();
    // pushing "__UNDEFINED__" would mess things up! But unlikely name for an XML element
    assert ! localNameUC.equals( "__UNDEFINED__" );

    if( parseModeStrings.contains( localNameUC )){
        parseMode = PARSE_MODE.valueOf( localNameUC );
        // any "policing" to do with which modes are allowed to switch into 
        // other modes could be put here... 
        // in your case, go `new Order()` here when parseMode == ORDER
        modeBreadcrumbs.push( parseMode );
    } 
    else {
       // typically ignore the start of this element...
    }
}   

@Override
private void endElement(String uri, String localName, String qName) throws Exception {
    String localNameUC = localName.toUpperCase();
    if( parseModeStrings.contains( localNameUC )){
        // will not fail unless XML structure which is malformed in some way
        // or coding error in use of the Stack, etc.:
        assert modeBreadcrumbs.pop() == parseMode;
        if( modeBreadcrumbs.empty() ){
            parseMode = PARSE_MODE.__UNDEFINED__;
        }
        else {
            parseMode = modeBreadcrumbs.peek();
        }
    } 
    else {
       // typically ignore the end of this element...
    }

}

... so what does this all mean? At any one time you have knowledge of the "parse mode" you're in ... and you can also look at the Stack<PARSE_MODE> modeBreadcrumbs if you need to find out what other parse modes you passed through to get here...

Your characters method then becomes substantially cleaner:

public void characters(char[] ch, int start, int length) throws SAXException {
    switch( parseMode ){
    case DATE:
        // PS - this SimpleDateFormat object can be a field: it doesn't need to be created hundreds of times
        SimpleDateFormat formatter. ...
        String value = ...
        ...
        break;

    case CUSTOMERID:
        order.setCustomerId( ...
        break;

    case ITEM:
        item = new Item();
        // this next line probably won't be needed: when you get to endElement, if 
        // parseMode is ITEM, the previous mode will be restored automatically
        // isItem = false ;
    }

}

2) The more "professional" solution:
abstract class which concrete classes have to extend and which then have no ability to modify the Stack, etc. NB this examines qName rather than localName. Thus:

public abstract class AbstractSAXHandler extends DefaultHandler {
    protected enum PARSE_MODE implements SAXHandlerParseMode {
        __UNDEFINED__
    };
    // abstract: the concrete subclasses must populate...
    abstract protected Collection<Enum<?>> getPossibleModes();
    // 
    private Stack<SAXHandlerParseMode> modeBreadcrumbs = new Stack<SAXHandlerParseMode>();
    private Collection<Enum<?>> possibleModes;
    private Map<String, Enum<?>> nameToEnumMap;
    private Map<String, Enum<?>> getNameToEnumMap(){
        // lazy creation and population of map
        if( nameToEnumMap == null ){
            if( possibleModes == null ){
                possibleModes = getPossibleModes();
            }
            nameToEnumMap = new HashMap<String, Enum<?>>();
            for( Enum<?> possibleMode : possibleModes ){
                nameToEnumMap.put( possibleMode.name(), possibleMode ); 
            }
        }
        return nameToEnumMap;
    }

    protected boolean isLegitimateModeName( String name ){
        return getNameToEnumMap().containsKey( name );
    }

    protected SAXHandlerParseMode getParseMode() {
        return modeBreadcrumbs.isEmpty()? PARSE_MODE.__UNDEFINED__ : modeBreadcrumbs.peek();
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        try {
            _startElement(uri, localName, qName, attributes);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    // override in subclasses (NB I think caught Exceptions are not a brilliant design choice in Java)
    protected void _startElement(String uri, String localName, String qName, Attributes attributes)
            throws Exception {
        String qNameUC = qName.toUpperCase();
        // very undesirable ever to push "UNDEFINED"! But unlikely name for an XML element
        assert !qNameUC.equals("__UNDEFINED__") : "Encountered XML element with qName \"__UNDEFINED__\"!";
        if( getNameToEnumMap().containsKey( qNameUC )){
            Enum<?> newMode = getNameToEnumMap().get( qNameUC );
            modeBreadcrumbs.push( (SAXHandlerParseMode)newMode );
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        try {
            _endElement(uri, localName, qName);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    // override in subclasses
    protected void _endElement(String uri, String localName, String qName) throws Exception {
        String qNameUC = qName.toUpperCase();
        if( getNameToEnumMap().containsKey( qNameUC )){
            modeBreadcrumbs.pop(); 
        }
    }

    public List<?> showModeBreadcrumbs(){
        return org.apache.commons.collections4.ListUtils.unmodifiableList( modeBreadcrumbs );
    }

}

interface SAXHandlerParseMode {

}

Then, salient part of concrete subclass:

private enum PARSE_MODE implements SAXHandlerParseMode {
    ORDER, DATE, CUSTOMERID, ITEM
};

private Collection<Enum<?>> possibleModes;

@Override
protected Collection<Enum<?>> getPossibleModes() {
    // lazy initiation
    if (possibleModes == null) {
        List<SAXHandlerParseMode> parseModes = new ArrayList<SAXHandlerParseMode>( Arrays.asList(PARSE_MODE.values()) );
        possibleModes = new ArrayList<Enum<?>>();
        for( SAXHandlerParseMode parseMode : parseModes ){
            possibleModes.add( PARSE_MODE.valueOf( parseMode.toString() ));
        }
        // __UNDEFINED__ mode (from abstract superclass) must be added afterwards
        possibleModes.add( AbstractSAXHandler.PARSE_MODE.__UNDEFINED__ );
    }
    return possibleModes;
}

PS this is a starting point for more sophisticated stuff: for example, you might set up a List<Object> which is kept synchronised with the Stack<PARSE_MODE>: the Objects could then be anything you want, enabling you to "reach back" into the ascendant "XML nodes" of the one you're dealing with. Don't use a Map, though: the Stack can potentially contain the same PARSE_MODE object more than once. This in fact illustrates a fundamental characteristic of all tree-like structures: no individual node (here: parse mode) exists in isolation: its identity is always defined by the entire path leading to it.




回答9:


    import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

public class JXML {
private DocumentBuilder builder;
private Document doc = null;
private DocumentBuilderFactory factory ;
private XPathExpression expr = null;
private XPathFactory xFactory;
private XPath xpath;
private String xmlFile;
public static ArrayList<String> XMLVALUE ;  


public JXML(String xmlFile){
    this.xmlFile = xmlFile;
}


private void xmlFileSettings(){     
    try {
        factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        xFactory = XPathFactory.newInstance();
        xpath = xFactory.newXPath();
        builder = factory.newDocumentBuilder();
        doc = builder.parse(xmlFile);
    }
    catch (Exception e){
        System.out.println(e);
    }       
}



public String[] selectQuery(String query){
    xmlFileSettings();
    ArrayList<String> records = new ArrayList<String>();
    try {
        expr = xpath.compile(query);
        Object result = expr.evaluate(doc, XPathConstants.NODESET);
        NodeList nodes = (NodeList) result;
        for (int i=0; i<nodes.getLength();i++){             
            records.add(nodes.item(i).getNodeValue());
        }
        return records.toArray(new String[records.size()]);
    } 
    catch (Exception e) {
        System.out.println("There is error in query string");
        return records.toArray(new String[records.size()]);
    }       
}

public boolean updateQuery(String query,String value){
    xmlFileSettings();
    try{
        NodeList nodes = (NodeList) xpath.evaluate(query, doc, XPathConstants.NODESET);
        for (int idx = 0; idx < nodes.getLength(); idx++) {
          nodes.item(idx).setTextContent(value);
        }
        Transformer xformer = TransformerFactory.newInstance().newTransformer();
        xformer.transform(new DOMSource(doc), new StreamResult(new File(this.xmlFile)));
        return true;
    }catch(Exception e){
        System.out.println(e);
        return false;
    }
}




public static void main(String args[]){
    JXML jxml = new JXML("c://user.xml");
    jxml.updateQuery("//Order/CustomerId/text()","222");
    String result[]=jxml.selectQuery("//Order/Item/*/text()");
    for(int i=0;i<result.length;i++){
        System.out.println(result[i]);
    }
}

}



来源:https://stackoverflow.com/questions/15626686/better-way-to-parse-xml

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!