Expanded tree cache full error need to tune the query

问题

Description:

$enumValues will have sequence of strings that I have to look into
$assetSubGroup will have a element value from XML (for loop) i.e string that I have to match in above maintained sequence
If match is not, I have to hold few element values and return.

All three of my attempts below are giving me expanded tree cache full errors. There are ~470000 assets i.e XML I'm querying.

How can I tune these queries to avoid expanded tree cache errors?

approach 1:

let $query-name := "get-asset-sub-group-values"
let $output-dir := "D:\output\"

let $report-uri := concat($output-dir, $query-name, "_report0.txt")

let $enumValues := (:all sequence of strings goes here :)

let $map1 := map:new($enumValues ! map:entry(fn:string(.), fn:true()))

let $result1 := concat('asset-id' , "|",  'upi',  "|", 'assetSubGroup',  "|",  'asset-type',  "|", 'asset-sub-type', "|", 'originator', "|", 'originator-identifier',  "|",  'mm-project-id',  "|" , 'sap-project-id' , "&#10;")
let $result2 :=
  for $each-search-copy in cts:search(collection("metadata-search"), cts:element-value-query(xs:QName("AssetID"), "*"))/metadata
    let $asset-id := $each-search-copy/assetIdentifiers/assetIdentifier/AssetID[1]/text()
    let $upi := $each-search-copy/assetIdentifiers/assetIdentifier/SAPID[1]/text()
    let $asset-type := $each-search-copy/biblioCore/assetType[1]/text()
    let $asset-sub-type := $each-search-copy/biblioCore/assetSubType[1]/text()
    let $originator := $each-search-copy/biblioCore/originator[1]/text()
    let $originator-identifier := $each-search-copy/assetIdentifiers/assetIdentifier/OriginatorIdentifier[1]/text()
    let $mm-project-id := $each-search-copy/biblioCore/MMProjectID[1]/text()
    let $sap-project-id := $each-search-copy/biblioCore/SAPProjectID[1]/text()
    let $assetSubGroup := $each-search-copy/biblioCore/assetSubGroup[1]/text()
    let $map2 := map:new($assetSubGroup ! map:entry(fn:string(.), fn:true()))
    let $flag := map:keys($map2 - $map1)
    return
        if ($flag)
            then(

                     concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id , "&#10;")
                 )
            else (
                  if($assetSubGroup) then() 
                  else (     

                        concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id , "&#10;")

            ))
let $result3 := ($result1, $result2)    
return xdmp:save($report-uri, text{$result3}), xdmp:elapsed-time()

approach 2:

let $query-name := "get-asset-sub-group-values"
let $output-dir := "D:\output\"
let $report-uri := concat($output-dir, $query-name, "_report1.txt")

let $enumValues := (:all string value sequence goes here that has to match:)

let $map1 := map:new($enumValues ! map:entry(fn:string(.), fn:true()))

let $result1 :=( concat('asset-id' , "|",  'upi',  "|", 'assetSubGroup',  "|",  'asset-type',  "|", 'asset-sub-type', "|", 'originator', "|", 'originator-identifier',  "|",  'mm-project-id',  "|" , 'sap-project-id' , "&#10;"),

  for $each-search-copy in cts:search(collection("metadata-search"), cts:element-value-query(xs:QName("AssetID"), "*"))/metadata
    let $asset-id := $each-search-copy/assetIdentifiers/assetIdentifier/AssetID[1]/text()
    let $upi := $each-search-copy/assetIdentifiers/assetIdentifier/SAPID[1]/text()
    let $asset-type := $each-search-copy/biblioCore/assetType[1]/text()
    let $asset-sub-type := $each-search-copy/biblioCore/assetSubType[1]/text()
    let $originator := $each-search-copy/biblioCore/originator[1]/text()
    let $originator-identifier := $each-search-copy/assetIdentifiers/assetIdentifier/OriginatorIdentifier[1]/text()
    let $mm-project-id := $each-search-copy/biblioCore/MMProjectID[1]/text()
    let $sap-project-id := $each-search-copy/biblioCore/SAPProjectID[1]/text()
    let $assetSubGroup := $each-search-copy/biblioCore/assetSubGroup[1]/text()
    let $map2 := map:new($assetSubGroup ! map:entry(fn:string(.), fn:true()))
    let $flag := map:keys($map2 - $map1)
    return
        if ($flag)
            then(

                     concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id , "&#10;")
                 )
            else (
                  if($assetSubGroup) then() 
                  else (     

                        concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id , "&#10;")

            ))
            )

return xdmp:save($report-uri, text{$result1}), xdmp:elapsed-time()

approach 3:

let $query-name := "get-asset-sub-group-values"
let $output-dir :=  "D:\output\"
let $report-uri := concat($output-dir, $query-name, "_report2.txt")


let $enumValues := (:sequence of strings are passed:)

let $result1 :=( concat('asset-id' , "|",  'upi',  "|", 'assetSubGroup',  "|",  'asset-type',  "|", 'asset-sub-type', "|", 'originator', "|", 'originator-identifier',  "|",  'mm-project-id',  "|" , 'sap-project-id' , "&#10;"),

  for $each-search-copy in cts:search(collection("metadata-search"), cts:element-value-query(xs:QName("AssetID"), "*"))/metadata
    let $asset-id := $each-search-copy/assetIdentifiers/assetIdentifier/AssetID[1]/text()
    let $upi := $each-search-copy/assetIdentifiers/assetIdentifier/SAPID[1]/text()
    let $asset-type := $each-search-copy/biblioCore/assetType[1]/text()
    let $asset-sub-type := $each-search-copy/biblioCore/assetSubType[1]/text()
    let $originator := $each-search-copy/biblioCore/originator[1]/text()
    let $originator-identifier := $each-search-copy/assetIdentifiers/assetIdentifier/OriginatorIdentifier[1]/text()
    let $mm-project-id := $each-search-copy/biblioCore/MMProjectID[1]/text()
    let $sap-project-id := $each-search-copy/biblioCore/SAPProjectID[1]/text()
    let $assetSubGroup := $each-search-copy/biblioCore/assetSubGroup[1]/text()
    let $flag := ($assetSubGroup eq $enumValues)
    return
        if ($flag)
            then()
            else (
                  concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id , "&#10;")
                 ))

return xdmp:save($report-uri, text{$result1}), xdmp:elapsed-time()

回答1:

If you find that you are having trouble writing a "streamable" query, another alternative would be to use a batch tool, such as CORB2 that uses multiple threads to execute a module for each document to produce it's output in a separate transaction and collects the results into a final output file. By breaking out the work into separate transactions, you don't have to worry about expanded tree cache errors or timeouts, and you can adjust the thread count to perform more work concurrently and get it done faster than a single query execution.

An example CORB2 options file to produce your | delimited text file would look something like this (you would need to adjust the XCC-CONNECTION-URI value for your environment):

XCC-CONNECTION-URI=xcc://user:password@host:port
THREAD-COUNT=8
URIS-MODULE=selector.xqy|ADHOC
PROCESS-MODULE=process.xqy|ADHOC
PROCESS-TASK=com.marklogic.developer.corb.ExportBatchToFileTask
EXPORT-FILE-DIR=D:\output\
EXPORT-FILE-NAME=get-asset-sub-group-values_report.txt
PRE-BATCH-TASK=com.marklogic.developer.corb.PreBatchUpdateFileTask
EXPORT-FILE-TOP-CONTENT=asset-id|upi|assetSubGroup|asset-type|asset-sub-type|originator|originator-identifier|mm-project-id|sap-project-id
BATCH-URI-DELIM=|

Create the URIs selector module selector.xqy that will find all of the URIs to process:

xquery version "1.0-ml";
let $uris := cts:uris("", (), cts:and-query((
  cts:collection-query("metadata-search"), 
  cts:element-value-query(xs:QName("AssetID"), "*")
)) )
return (fn:count($uris), $uris)

Create the process module process.xqy that will be called for each URI:

xquery version "1.0-ml";

declare variable $URI external;

let $each-search-copy := fn:doc($URI)/metadata

let $asset-id := $each-search-copy/assetIdentifiers/assetIdentifier/AssetID[1]/text()
let $upi := $each-search-copy/assetIdentifiers/assetIdentifier/SAPID[1]/text()
let $asset-type := $each-search-copy/biblioCore/assetType[1]/text()
let $asset-sub-type := $each-search-copy/biblioCore/assetSubType[1]/text()
let $originator := $each-search-copy/biblioCore/originator[1]/text()
let $originator-identifier := $each-search-copy/assetIdentifiers/assetIdentifier/OriginatorIdentifier[1]/text()
let $mm-project-id := $each-search-copy/biblioCore/MMProjectID[1]/text()
let $sap-project-id := $each-search-copy/biblioCore/SAPProjectID[1]/text()
let $assetSubGroup := $each-search-copy/biblioCore/assetSubGroup[1]/text()
let $flag := ($assetSubGroup eq $enumValues)
return
    if ($flag)
        then()
        else (
              (: NOTE you could use string-join() instead of concat()

                 string-join(($asset-id, $upi, $assetSubGroup, $asset-type, $asset-sub-type, $originator, $originator-identifier, $mm-project-id, $sap-project-id), "|")  

              :)
              concat($asset-id , "|",  $upi,  "|", $assetSubGroup,  "|",  $asset-type,  "|", $asset-sub-type, "|", $originator, "|", $originator-identifier,  "|",  $mm-project-id,  "|" , $sap-project-id)
             ))

Invoking the CORB job like this (adjust paths and filenames for the XCC and CORB jars and your properties file):

java -server -cp .:marklogic-xcc-8.0.8.jar:marklogic-corb-2.4.1.jar
    -DOPTIONS-FILE=myjob.properties com.marklogic.developer.corb.Manager

Or, if you are using ml-gradle, use the corb task

回答2:

The biggest culprit in your approach is the fact that you try to concat the entire result into a single string. You also capture the result in a variable before inserting it into the text node.

Below approach will work better, as it allows MarkLogic to stream data while writing to disk:

xquery version "1.0-ml";

xdmp:save("/tmp/test.csv", document {
  (: insert csv header :)
  text { "uri&#10;" },

(: append csv records line by line :)
  for $doc in cts:search(collection(), cts:true-query())
  return text{ xdmp:node-uri($doc) || "&#10;" }
})

HTH!

来源：https://stackoverflow.com/questions/52002691/expanded-tree-cache-full-error-need-to-tune-the-query

标签

marklogic

marklogic-8

query-tuning