I have gone through several articles and examples, and have yet to find an efficient way to do this SQL query in MongoDB (where there are millions of rows documen
1) The easiest way to do this is via the aggregation framework. This takes two "$group" commands: the first one groups by distinct values, the second one counts all of the distinct values
pipeline = [
{ $group: { _id: "$myIndexedNonUniqueField"} },
{ $group: { _id: 1, count: { $sum: 1 } } }
];
//
// Run the aggregation command
//
R = db.runCommand(
{
"aggregate": "myCollection" ,
"pipeline": pipeline
}
);
printjson(R);
2) If you want to do this with Map/Reduce you can. This is also a two-phase process: in the first phase we build a new collection with a list of every distinct value for the key. In the second we do a count() on the new collection.
var SOURCE = db.myCollection;
var DEST = db.distinct
DEST.drop();
map = function() {
emit( this.myIndexedNonUniqueField , {count: 1});
}
reduce = function(key, values) {
var count = 0;
values.forEach(function(v) {
count += v['count']; // count each distinct value for lagniappe
});
return {count: count};
};
//
// run map/reduce
//
res = SOURCE.mapReduce( map, reduce,
{ out: 'distinct',
verbose: true
}
);
print( "distinct count= " + res.counts.output );
print( "distinct count=", DEST.count() );
Note that you cannot return the result of the map/reduce inline, because that will potentially overrun the 16MB document size limit. You can save the calculation in a collection and then count() the size of the collection, or you can get the number of results from the return value of mapReduce().
Following solution worked for me
db.test.distinct('user'); [ "alex", "England", "France", "Australia" ]
db.countries.distinct('country').length 4
db.myCollection.aggregate(
{$group : {_id : "$myIndexedNonUniqueField"} },
{$group: {_id:1, count: {$sum : 1 }}});
straight to result:
db.myCollection.aggregate(
{$group : {_id : "$myIndexedNonUniqueField"} },
{$group: {_id:1, count: {$sum : 1 }}})
.result[0].count;