Question
Benchmark with Python
import json
import os
from datetime import datetime

import pymongo
from bson import ObjectId

jsons = []

DB_URL = os.environ.get('DB_URL')
FILE_PATH = os.environ.get('FILE_PATH')

client = pymongo.MongoClient(DB_URL)
database = client[client.get_database().name]
collection_name = 'test_collection'

# Recreate the target collection and the compound index on p.k / p.v
database[collection_name].drop()
database[collection_name].create_index([('p.k', pymongo.ASCENDING),
                                        ('p.v', pymongo.ASCENDING)],
                                       background=True)


def writeJsons():
    # Flush the current batch to MongoDB and time the insert_many call
    global jsons
    if not jsons:
        return
    print('Start {}'.format(len(jsons)))
    ts = datetime.utcnow()
    database[collection_name].insert_many(jsons)
    print('{} {} seconds'.format(len(jsons), datetime.utcnow() - ts))
    jsons = []


# Read the exported collection line by line and insert in batches of 100k
with open(FILE_PATH) as fl:
    for line in fl.readlines():
        j = json.loads(line)
        # unwrap the extended JSON $oid / $numberLong wrappers
        j['_id'] = j['_id']['$oid']
        for o in j['p']:
            if isinstance(o['v'], dict):
                o['v'] = o['v']['$numberLong']
        jsons.append(j)
        if len(jsons) == 100000:
            writeJsons()

# flush the remainder
writeJsons()
Benchmark with Node.js
import mongoose from 'mongoose'
import process from 'process'
import events from 'events'
import fs from 'fs'
import readline from 'readline'

const DB_URL = process.env.DB_URL
const FILE_PATH = process.env.FILE_PATH

mongoose.connect(DB_URL, {
  useFindAndModify: false,
  useUnifiedTopology: true,
  useNewUrlParser: true
})

let db = mongoose.connection
db.on('error', err => console.log('Failed to connect at %s: %s', DB_URL, err))

// Schema-less model: validation, versioning and strict field checking are disabled
const DataSchema = new mongoose.Schema(
  {},
  { validateBeforeSave: false, versionKey: false, strict: false }
)
DataSchema.index({ 'p.k': 1, 'p.v': 1 }, { background: true })

const collectionName = 'test_collection'
const DataModel = mongoose.model(
  collectionName,
  DataSchema,
  collectionName
)

const fileStream = fs.createReadStream(FILE_PATH)
const rl = readline.createInterface({
  input: fileStream,
  crlfDelay: Infinity
})

let jsons = []
let promise = Promise.resolve()

// Flush the current batch of 100k documents to MongoDB and time the insert
async function writeJsons() {
  if (!jsons.length) {
    return
  }
  console.log(`Start ${jsons.length}`)
  const startTime = Date.now()
  const startTimeArray = process.hrtime()
  await DataModel.insertMany(jsons.splice(0, 100000), {
    bypassDocumentValidation: true
  })
  const diff = Date.now() - startTime
  console.log(
    `${jsons.length} ${diff / 1000} seconds ${process.hrtime(startTimeArray)}`
  )
  jsons = []
}

// Parse each exported line, unwrap the extended JSON wrappers, and pause
// reading once a full batch of 100k documents has accumulated
rl.on('line', line => {
  try {
    const json = JSON.parse(line.toString())
    json._id = mongoose.Types.ObjectId(json._id.$oid)
    for (const o of json.p) {
      if (typeof o.v === 'object') {
        o.v = o.v.$numberLong
      }
    }
    jsons.push(json)
    if (jsons.length === 100000) {
      rl.pause()
    }
  } catch (err) {
    console.log(line.toString(), err)
  }
})

rl.on('pause', async () => {
  promise = writeJsons().then(() => rl.resume())
})

rl.on('close', async () => {
  await promise
  await writeJsons()
})

db.once('open', async () => {
  console.log(`Successfully connected at ${DB_URL}`)
  try {
    // Drop the collection once connected; ignore the error if it does not exist
    await mongoose.connection.dropCollection(collectionName)
  } catch (err) {}
})
The input file is an exported collection, 613 MB in size, with more than 1.7M records.
Each document looks like:
{
  _id: ...,
  p: [
    { k: 'somekey', v: 'somevalue' },
    ...
  ]
}
MongoDB version 4.0 is used.
The results are quite consistent: Python 2 with pymongo 3.8 is 2-3 times faster than Node.js 10 with mongoose 5.7. Has anyone had a similar experience, or an opinion on why? I actually expected Node.js to be a little faster than Python.
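Not part of the original benchmark, but a diagnostic variant that might help narrow this down: mongoose wraps the official MongoDB Node driver, so inserting an already-parsed batch through the underlying native collection (DataModel.collection) skips mongoose's per-document hydration. A minimal sketch, assuming DataModel from the script above; writeJsonsNative is a hypothetical helper, not the author's code:

// Diagnostic sketch, not the original benchmark: write a batch of plain parsed
// documents through the native driver collection that mongoose wraps, skipping
// mongoose document hydration. Assumes DataModel from the script above.
async function writeJsonsNative(batch) {
  if (!batch.length) {
    return
  }
  const startTime = Date.now()
  // DataModel.collection is the underlying MongoDB Node driver collection
  await DataModel.collection.insertMany(batch, { ordered: true })
  console.log(`native driver: ${batch.length} docs in ${(Date.now() - startTime) / 1000} seconds`)
}

Comparing its timings with the mongoose insertMany timings would show how much of the gap comes from mongoose itself rather than from the driver or the network.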
Source: https://stackoverflow.com/questions/58226391/mongoose-vs-pymongo-drivers-write-insertmany-test