Suppose we have a collection of raw data:

{ "person": "David, age 102" }
{ "person": "Max, age 8" }

and we'd like to transform those documents so that "name" and "age" live in their own, properly typed fields.
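After the migration, each document should look roughly like this (note that the age ends up stored as a number, not a string):

{ "name": "David", "age": 102 }
{ "name": "Max", "age": 8 }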
Starting with MongoDB 3.4, mongod provides the $split operator, which splits a string into an array of substrings on a delimiter.
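For instance, splitting our sample string on the comma yields a two-element array (note the leading space in the second element):

{ "$split": [ "David, age 102", "," ] }   // evaluates to [ "David", " age 102" ]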
We then assign the newly computed array to a variable using the $let operator. That variable can be used in the "in" expression to build the "name" and "age" values, with the $arrayElemAt operator returning the element at a specified index: 0 for the first element and -1 for the last element.
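As a quick illustration of that indexing:

{ "$arrayElemAt": [ [ "David", " age 102" ], 0 ] }    // -> "David"
{ "$arrayElemAt": [ [ "David", " age 102" ], -1 ] }   // -> " age 102"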
Note that in the "in" expression we need to split the last element a second time, on spaces, in order to isolate the age as a string ("102" rather than " age 102").
Finally, we iterate the cursor, convert the age string to a number using Number() or parseInt(), and, for maximum efficiency, queue the updates as bulk operations that we submit with the bulkWrite() method to $set the values for those fields.
let requests = [];
db.coll.aggregate([
    { "$project": {
        "person": {
            "$let": {
                "vars": {
                    "infos": { "$split": [ "$person", "," ] }
                },
                "in": {
                    "name": { "$arrayElemAt": [ "$$infos", 0 ] },
                    "age": {
                        "$arrayElemAt": [
                            { "$split": [
                                { "$arrayElemAt": [ "$$infos", -1 ] },
                                " "
                            ]},
                            -1
                        ]
                    }
                }
            }
        }
    }}
]).forEach(document => {
    requests.push({
        "updateOne": {
            "filter": { "_id": document._id },
            "update": {
                "$set": {
                    "name": document.person.name,
                    "age": Number(document.person.age)
                },
                "$unset": { "person": "" }
            }
        }
    });
    if (requests.length === 500) {
        // Execute per 500 operations and re-init the queue
        db.coll.bulkWrite(requests);
        requests = [];
    }
});

// Flush any remaining queued operations
if (requests.length > 0) {
    db.coll.bulkWrite(requests);
}
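As a sanity check (assuming the two sample documents above), querying the collection should now return the reshaped documents:

db.coll.find({}, { "_id": 0 })
// { "name": "David", "age": 102 }
// { "name": "Max", "age": 8 }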
MongoDB 3.2 deprecates the old Bulk() API and its associated methods, and it provides the bulkWrite() method, but it doesn't provide the $split operator. The only option we have here is to use the mapReduce() method to transform the data, then update the collection using bulk operations.
var mapFunction = function() {
    var person = {},
        // "David, age 102" split on commas/whitespace -> [ "David", "age", "102" ]
        infos = this.person.split(/[,\s]+/);
    person["name"] = infos[0];
    person["age"] = infos[2];
    emit(this._id, person);
};
var results = db.coll.mapReduce(
    mapFunction,
    function(key, val) {},          // reduce is never called: each _id key is unique
    { "out": { "inline": 1 } }
)["results"];
let requests = [];
results.forEach(document => {
    requests.push({
        "updateOne": {
            "filter": { "_id": document._id },
            "update": {
                "$set": {
                    "name": document.value.name,
                    "age": Number(document.value.age)
                },
                "$unset": { "person": "" }
            }
        }
    });
    if (requests.length === 500) {
        // Execute per 500 operations and re-init the queue
        db.coll.bulkWrite(requests);
        requests = [];
    }
});

// Flush any remaining queued operations
if (requests.length > 0) {
    db.coll.bulkWrite(requests);
}
On MongoDB versions older than 3.2, where bulkWrite() is not available, we need to use the now deprecated Bulk() API instead.
var bulkOp = db.coll.initializeUnorderedBulkOp();
var count = 0;
results.forEach(function(document) {
    bulkOp.find({ "_id": document._id }).updateOne({
        "$set": {
            "name": document.value.name,
            "age": Number(document.value.age)
        },
        "$unset": { "person": "" }
    });
    count++;
    if (count === 500) {
        // Execute per 500 operations and re-init
        bulkOp.execute();
        bulkOp = db.coll.initializeUnorderedBulkOp();
        count = 0;    // reset so the next batch of 500 triggers another execute()
    }
});

// Flush any remaining queued operations
if (count > 0) {
    bulkOp.execute();
}
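Using an unordered bulk operation here lets the server process the updates in any order and keep going if an individual update fails, which is safe for this migration because each update targets a distinct _id.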