Remove specific XML nodes using Clojure

后端 未结 3 611
悲&欢浪女
悲&欢浪女 2021-01-24 07:45

I have the following XML structure:

(def xmlstr
\"
  
    AAA         


        
相关标签:
3条回答
  • 2021-01-24 08:06

    Examples below use full namespaces, not aliases. One way of solving this was using zippers:

    (defn remove-types-loc [types loc]
      (loop [loc loc]
        (if (clojure.zip/end? loc)
          (clojure.zip/root loc)
          (if (and (clojure.zip/branch? loc)
                   (some #(and (= (:tag %) :Type)
                               (contains? types (first (:content %)))) (clojure.zip/children loc)))
            (recur (clojure.zip/remove loc))
            (recur (clojure.zip/next loc))))))
    
    (clojure.data.xml/emit-str (remove-types-loc #{"B" "C"} zipxml))
    ;; => emits the expected result, with the two Type A Items
    

    The following gives the same result using core functions, but has quite a new nested levels and 'needs' two functions:

    (defn remove-types-in* [remove-types content]
      (update-in content [:content]
                 (fn [items]
                   (remove (fn [item]
                             (some #(and
                                     (= (:tag %) :Type)
                                     (contains? remove-types (first (:content %)))) (:content item)))
                    items))))
    
    (defn remove-types-in [remove-types xmldoc]
      (update-in xmldoc [:content] #(map (partial remove-types-in* remove-types) %)))
    
    (clojure.data.xml/emit-str (remove-types-in #{"B" "C"} xmldoc))
    ;; => same result as above
    

    Finally, when the structure is fixed and as simple as this one, it is easy to just construct the result manually. But this would break if the source gets more elements or attributes.

    (clojure.data.xml/emit-str
     (clojure.data.xml/sexp-as-element
      [:ROOT
       [:Items
        (for [i (clojure.data.zip.xml/xml-> zipxml :Items :Item)
              :let [t (clojure.data.zip.xml/xml1-> i :Type clojure.data.zip.xml/text)
                    n (clojure.data.zip.xml/xml1-> i :Note clojure.data.zip.xml/text)]
              :when (not (contains? #{"B" "C"} t))]
          [:Item
           [:Type t]
           [:Note n]])]]))
    ;; same as above
    

    Possibly a better version of the above, which will work even if Item structure changes:

    (clojure.data.xml/emit-str
     (clojure.data.xml/element
      :ROOT {}
      (clojure.data.xml/element
       :Items {}
       (for [n (xml-seq xmldoc)
             :when (and
                    (= :Item (:tag n))
                    (not (some #(and (= (:tag %) :Type)
                                     (contains? #{"B" "C"} (first (:content %))))
                               (:content n))))] n))))
    

    Didn't find any oneliners for doing it. Not sure if there are better / more readable ways of doing this using org.clojure or other libraries.

    For more complex XML editing, XSLT or XQuery Update is arguably a more 'native' solution. Here's a quick and dirty XSLT 2.0 solution using the open-source Saxon-HE S9API:

    ;; lein try net.sf.saxon/Saxon-HE "9.7.0-18"
    (defn remove-types-xslt [remove-types xmlstr]
      (let [processor (net.sf.saxon.s9api.Processor. false)
             compiler (.newXsltCompiler processor)
             exp (.compile compiler (javax.xml.transform.stream.StreamSource. (java.io.StringReader. "<xsl:transform version='2.0' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'><xsl:param name='remove-types'/><xsl:template match='@*|node()'><xsl:copy><xsl:apply-templates select='@*|node()'/></xsl:copy></xsl:template><xsl:template match='Item[Type[. = $remove-types]]'/></xsl:transform>")))
             src (.build (.newDocumentBuilder processor) (javax.xml.transform.stream.StreamSource. (java.io.StringReader. xmlstr)))
             sw (java.io.StringWriter.)
             out (doto (net.sf.saxon.s9api.Serializer.) (.setOutputWriter sw))
             t (doto (.load exp) (.setInitialContextNode src) (.setDestination out) (.setParameter (net.sf.saxon.s9api.QName. "remove-types") (net.sf.saxon.s9api.XdmValue. (for [remove-type remove-types] (net.sf.saxon.s9api.XdmAtomicValue. remove-type)))) (.transform))]
        sw))
    (str (remove-types-xslt #{"B" "C"} xmlstr))
    

    And for completeness, here's an even dirtier version using XQuery Update Facility. Note that this particular example uses Saxon-EE, and therefore requires a paid EE license.

    (defn remove-types-xq [remove-types xmlstr]
      (let [processor (net.sf.saxon.s9api.Processor. true)
            compiler (doto (.newXQueryCompiler processor) (.setUpdatingEnabled true))
            exp (.compile compiler "declare variable $remove-types as xs:string+ external;delete nodes //Items/Item[Type[. = $remove-types]]")
            src (.build (doto (.newDocumentBuilder processor) (.setTreeModel net.sf.saxon.om.TreeModel/LINKED_TREE)) (javax.xml.transform.stream.StreamSource. (java.io.StringReader. xmlstr)))
            e (doto (.load exp) (.setContextItem src) (.setExternalVariable (net.sf.saxon.s9api.QName. "remove-types") (net.sf.saxon.s9api.XdmValue. (for [remove-type remove-types] (net.sf.saxon.s9api.XdmAtomicValue. remove-type)))) (.run))]
        (when-let [res (first (iterator-seq (.getUpdatedDocuments e)))]
          (let [sw (java.io.StringWriter.)
                out (doto (net.sf.saxon.s9api.Serializer.) (.setOutputWriter sw))]
            (.writeXdmValue processor res out)
            sw))))
    (str (remove-types-xq #{"B" "C"} xmlstr))
    

    Except for all the stuff, delete nodes //Items/Item[Type[. = $remove-types]] is pretty succinct.

    0 讨论(0)
  • 2021-01-24 08:22

    The Clojure standard APIs provide convenient functions for manipulating XML and other tree structures. Removing (leaf) nodes can be done on depth-first traversal using clojure.walk:

    (require '[clojure.xml :as xml]
             '[clojure.walk :as walk])
    
    (def xmlstr
    "<ROOT>
      <Items>
        <Item><Type>A</Type><Note>AA</Note></Item>
        <Item><Type>B</Type><Note>BB</Note></Item>
        <Item><Type>C</Type><Note>CC</Note></Item>
        <Item><Type>A</Type><Note>AA</Note></Item>
      </Items>
    </ROOT>")
    
    (def xmldoc (xml/parse (java.io.ByteArrayInputStream. (.getBytes xmlstr))))
    
    (defn tag-matches [item tag]
      (= (:tag item) tag))
    
    (defn content-matches [item to-match]
      ((into #{} to-match)
       (apply str (:content item))))
    
    (defn match-criteria [item to-match]
      (some #(and (tag-matches % :Type)
                  (content-matches % to-match))
            (:content item)))
    
    (defn mk-xml-walker [& to-remove]
      (fn [form]
        (if (and (vector? form)
                 (some #(tag-matches % :Item) form))
          (filter (complement #(match-criteria % to-remove)) form)
          form)))
    
    (xml/emit (walk/postwalk (mk-xml-walker "B" "C") xmldoc))
    

    For magical one-liners, you may also want to check out Specter which provides a very concise syntax for manipulating nested data structures, like XML.

    0 讨论(0)
  • 2021-01-24 08:23

    The Tupelo library can easily solve this problem using tupelo.forest. You can find the API docs on GitHub Pages. Below is a test case using your example.

    Here we load your xml data and convert it first into enlive and then the native tree structure used by tupelo.forest:

    (ns tst.tupelo.forest-examples
      (:use tupelo.forest tupelo.test )
      (:require
        [clojure.data.xml :as dx]
        [clojure.java.io :as io]
        [clojure.set :as cs]
        [net.cgrand.enlive-html :as en-html]
        [schema.core :as s]
        [tupelo.core :as t]
        [tupelo.string :as ts]))
    (t/refer-tupelo)
    
    ; Discard any xml nodes of Type="A" or Type="B" (plus blank string nodes)
    (dotest
      (with-forest (new-forest)
        (let [xml-str         "<ROOT>
                                <Items>
                                  <Item><Type>A</Type><Note>AA1</Note></Item>
                                  <Item><Type>B</Type><Note>BB1</Note></Item>
                                  <Item><Type>C</Type><Note>CC1</Note></Item>
                                  <Item><Type>A</Type><Note>AA2</Note></Item>
                                </Items>
                              </ROOT>"
              enlive-tree     (->> xml-str
                                java.io.StringReader.
                                en-html/html-resource
                                first)
              root-hid        (add-tree-enlive enlive-tree)
              tree-1          (hid->tree root-hid)
    

    The hid suffix stands for "Hex ID" which is unique hex value that acts like a pointer to a node/leaf in the tree. At this stage we have just loaded the data in the forest data structure, creating tree-1 which looks like:

     (is= tree-1
       {:attrs {:tag :ROOT},
        :kids  [{:attrs {:tag :tupelo.forest/raw},
                 :value "\n                            "}
                {:attrs {:tag :Items},
                 :kids  [{:attrs {:tag :tupelo.forest/raw},
                          :value "\n                              "}
                         {:attrs {:tag :Item},
                          :kids  [{:attrs {:tag :Type}, :value "A"}
                                  {:attrs {:tag :Note}, :value "AA1"}]}
                         {:attrs {:tag :tupelo.forest/raw},
                          :value "\n                              "}
                         {:attrs {:tag :Item},
                          :kids  [{:attrs {:tag :Type}, :value "B"}
                                  {:attrs {:tag :Note}, :value "BB1"}]}
                         {:attrs {:tag :tupelo.forest/raw},
                          :value "\n                              "}
                         {:attrs {:tag :Item},
                          :kids  [{:attrs {:tag :Type}, :value "C"}
                                  {:attrs {:tag :Note}, :value "CC1"}]}
                         {:attrs {:tag :tupelo.forest/raw},
                          :value "\n                              "}
                         {:attrs {:tag :Item},
                          :kids  [{:attrs {:tag :Type}, :value "A"}
                                  {:attrs {:tag :Note}, :value "AA2"}]}
                         {:attrs {:tag :tupelo.forest/raw},
                          :value "\n                            "}]}
                {:attrs {:tag :tupelo.forest/raw},
                 :value "\n                          "}]})
    

    We next remove any blank strings with this code:

    blank-leaf-hid? (fn [hid] (and (leaf-hid? hid) ; ensure it is a leaf node
                                (let [value (hid->value hid)]
                                  (and (string? value)
                                    (or (zero? (count value)) ; empty string
                                      (ts/whitespace? value)))))) ; all whitespace string
    
    blank-leaf-hids (keep-if blank-leaf-hid? (all-hids))
    >>              (apply remove-hid blank-leaf-hids)
    tree-2          (hid->tree root-hid)
    

    yielding tree-2 which looks much neater:

    (is= tree-2
      {:attrs {:tag :ROOT},
       :kids  [{:attrs {:tag :Items},
                :kids  [{:attrs {:tag :Item},
                         :kids  [{:attrs {:tag :Type}, :value "A"}
                                 {:attrs {:tag :Note}, :value "AA1"}]}
                        {:attrs {:tag :Item},
                         :kids  [{:attrs {:tag :Type}, :value "B"}
                                 {:attrs {:tag :Note}, :value "BB1"}]}
                        {:attrs {:tag :Item},
                         :kids  [{:attrs {:tag :Type}, :value "C"}
                                 {:attrs {:tag :Note}, :value "CC1"}]}
                        {:attrs {:tag :Item},
                         :kids  [{:attrs {:tag :Type}, :value "A"}
                                 {:attrs {:tag :Note}, :value "AA2"}]}]}]})
    

    The final code fragment removes Type="B" or Type="C" nodes:

    type-bc-hid?    (fn [hid] (pos? (count (glue
                                (find-leaf-hids hid [:** :Type] "B")
                                (find-leaf-hids hid [:** :Type] "C")))))
    
    type-bc-hids    (find-hids-with root-hid [:** :Item] type-bc-hid?)
    >>              (apply remove-hid type-bc-hids)
    tree-3          (hid->tree root-hid)
    tree-3-hiccup   (hid->hiccup root-hid) ]
    

    yielding the final result tree shown in both tree format and hiccup format:

    (is= tree-3
      {:attrs {:tag :ROOT},
       :kids
              [{:attrs {:tag :Items},
                :kids  [{:attrs {:tag :Item},
                         :kids  [{:attrs {:tag :Type}, :value "A"}
                                 {:attrs {:tag :Note}, :value "AA1"}]}
                        {:attrs {:tag :Item},
                         :kids  [{:attrs {:tag :Type}, :value "A"}
                                 {:attrs {:tag :Note}, :value "AA2"}]}]}]})
    (is= tree-3-hiccup
      [:ROOT
       [:Items
        [:Item [:Type "A"] [:Note "AA1"]]
        [:Item [:Type "A"] [:Note "AA2"]]]]))))
    

    The full example can be found in the forest-examples unit test.

    Update

    Here is the most compact version with extra features removed:

    (dotest
      (with-forest (new-forest)
        (let [xml-str         "<ROOT>
                                <Items>
                                  <Item><Type>A</Type><Note>AA1</Note></Item>
                                  <Item><Type>B</Type><Note>BB1</Note></Item>
                                  <Item><Type>C</Type><Note>CC1</Note></Item>
                                  <Item><Type>A</Type><Note>AA2</Note></Item>
                                </Items>
                              </ROOT>"
              enlive-tree     (->> xml-str
                                java.io.StringReader.
                                en-html/xml-resource
                                first)
              root-hid        (add-tree-enlive enlive-tree)
              blank-leaf-hid? (fn [hid] (ts/whitespace? (hid->value hid)))
              has-bc-leaf?    (fn [hid] (or (has-child-leaf? hid [:** :Type] "B")
                                            (has-child-leaf? hid [:** :Type] "C")))
              blank-leaf-hids (keep-if blank-leaf-hid? (all-leaf-hids))
              >>              (apply remove-hid blank-leaf-hids)
              bc-item-hids    (find-hids-with root-hid [:** :Item] has-bc-leaf?)]
          (apply remove-hid bc-item-hids)
          (is= (hid->hiccup root-hid)
            [:ROOT
             [:Items
              [:Item [:Type "A"] [:Note "AA1"]]
              [:Item [:Type "A"] [:Note "AA2"]]]]))))
    
    0 讨论(0)
提交回复
热议问题