mongoose vs pymongo drivers - write (insertMany) test

若如初见. 提交于 2019-12-25 02:34:41

问题


Benchmark with python

import json
import os
from datetime import datetime

import pymongo
from bson import ObjectId

# Buffer of parsed documents waiting for the next bulk insert.
jsons = []

# Connection string and input file path are supplied through the environment.
DB_URL = os.environ.get('DB_URL')
FILE_PATH = os.environ.get('FILE_PATH')

client = pymongo.MongoClient(DB_URL)
# Use the default database of the connection URL.
database = client.get_database()

collection_name = 'test_collection'

# Start every run from an empty collection, then declare the compound index
# up front so that each insert pays the index-maintenance cost.
collection = database[collection_name]
collection.drop()

index_keys = [('p.k', pymongo.ASCENDING), ('p.v', pymongo.ASCENDING)]
collection.create_index(index_keys, background=True)

def writeJsons():
    """Bulk-insert the buffered documents and print how long it took.

    Does nothing when the module-level ``jsons`` buffer is empty. After a
    successful insert the buffer is rebound to a fresh empty list.
    """
    global jsons
    if jsons:
        batch_size = len(jsons)
        print('Start {}'.format(batch_size))
        started_at = datetime.utcnow()
        target = database[collection_name]
        target.insert_many(jsons)
        elapsed = datetime.utcnow() - started_at
        print('{} {} seconds'.format(batch_size, elapsed))
        jsons = []

# Stream the export one line at a time. Iterating the file object keeps memory
# bounded by the batch size instead of loading the whole file via readlines().
with open(FILE_PATH) as fl:
    for line in fl:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in
        # json.loads.
        if not line.strip():
            continue
        j = json.loads(line)
        # Store a real ObjectId, as the Node.js benchmark does, so that both
        # scripts insert the same data.
        j['_id'] = ObjectId(j['_id']['$oid'])
        for o in j['p']:
            # Unwrap extended-JSON {"$numberLong": "..."} values.
            if isinstance(o['v'], dict):
                o['v'] = o['v']['$numberLong']
        jsons.append(j)
        # Flush a full batch of 100000 documents.
        if len(jsons) == 100000:
            writeJsons()

# Flush whatever is left after the last full batch.
writeJsons()

Benchmark with node

import mongoose from 'mongoose'

import process from 'process'
import events from 'events'
import fs from 'fs'
import readline from 'readline'

// Connection string and input file path are supplied through the environment.
const { DB_URL, FILE_PATH } = process.env

const connectOptions = {
  useFindAndModify: false,
  useUnifiedTopology: true,
  useNewUrlParser: true
}
mongoose.connect(DB_URL, connectOptions)

const db = mongoose.connection
db.on('error', err => console.log('Failed to connect at %s: %s', DB_URL, err))

// Schema with no declared fields and strict mode off, so documents are stored
// as they are read; validation before save and the version key are disabled.
const schemaOptions = {
  validateBeforeSave: false,
  versionKey: false,
  strict: false
}
const DataSchema = new mongoose.Schema({}, schemaOptions)
DataSchema.index({ 'p.k': 1, 'p.v': 1 }, { background: true })

// The same string is used for the model name and the collection name.
const collectionName = 'test_collection'
const DataModel = mongoose.model(collectionName, DataSchema, collectionName)

// Read the input file line by line.
const fileStream = fs.createReadStream(FILE_PATH)
const rl = readline.createInterface({ input: fileStream, crlfDelay: Infinity })

// Parsed documents waiting for the next bulk insert.
let jsons = []

// Promise of the most recently started batch write.
let promise = Promise.resolve()

/**
 * Insert the next batch (at most 100000 documents) from the `jsons` buffer
 * and log how long the insert took.
 *
 * The batch is detached from the buffer synchronously, before the first
 * `await`. Lines that readline delivers while the insert is in flight are
 * appended to `jsons` and stay there for the next call; the buffer is never
 * reset afterwards, so those documents are not lost.
 *
 * @returns {Promise<void>} resolves once the batch is written; resolves
 *   immediately when the buffer is empty.
 */
async function writeJsons() {
  const batch = jsons.splice(0, 100000)
  if (!batch.length) {
    return
  }
  console.log(`Start ${batch.length}`)
  const startTime = Date.now()
  const startTimeArray = process.hrtime()
  await DataModel.insertMany(batch, {
    bypassDocumentValidation: true
  })
  const diff = Date.now() - startTime
  // Report the size of the batch that was written, not the current buffer.
  console.log(
    `${batch.length} ${diff / 1000} seconds ${process.hrtime(startTimeArray)}`
  )
}

// Parse each exported line into a document and buffer it for bulk insertion.
// Lines that fail to parse or convert are logged and skipped.
rl.on('line', line => {
  try {
    const json = JSON.parse(line.toString())
    // `new` works whether ObjectId is a plain constructor function or a class.
    json._id = new mongoose.Types.ObjectId(json._id.$oid)
    for (const o of json.p) {
      // Unwrap extended-JSON { $numberLong: "..." } values. `typeof null` is
      // also 'object', so null and arrays are excluded explicitly; otherwise
      // a null value would throw and an array would be replaced by undefined.
      if (o.v !== null && typeof o.v === 'object' && !Array.isArray(o.v)) {
        o.v = o.v.$numberLong
      }
    }
    jsons.push(json)
    // Once a full batch is buffered, pause reading so it can be written.
    if (jsons.length === 100000) {
      rl.pause()
    }
  } catch (err) {
    console.log(line.toString(), err)
  }
})

// When reading pauses, write the buffered batch and then resume reading.
// A failed insert is logged instead of leaving an unhandled rejection with
// the reader paused forever.
rl.on('pause', () => {
  promise = writeJsons()
    .catch(err => console.log('Failed to write batch: %s', err))
    .then(() => rl.resume())
})

// At end of input, wait for the in-flight batch, write the remainder, and
// close the connection so the process can exit.
rl.on('close', async () => {
  try {
    await promise
    await writeJsons()
  } catch (err) {
    console.log('Failed to write final batch: %s', err)
  } finally {
    await mongoose.disconnect()
  }
})

// Once connected, drop the target collection so each run starts empty.
db.once('open', async () => {
  console.log(`Successfully connected at ${DB_URL}`)
  try {
    await mongoose.connection.dropCollection(collectionName)
  } catch (err) {
    // A missing collection (server error 26, NamespaceNotFound) is expected
    // on a first run; any other failure is reported instead of being hidden.
    const missing =
      err && (err.code === 26 || err.codeName === 'NamespaceNotFound')
    if (!missing) {
      console.log('Failed to drop collection %s: %s', collectionName, err)
    }
  }
})

The input file is an exported collection; it is 613 MB in size and contains more than 1.7M records.

Each document looks like

{_id:...,
 p: [
     {k:'somekey', v: 'somevalue'},
     ...
     ]
}

MongoDB version 4.0 is used.

The results are quite consistent: Python 2 with pymongo 3.8 is 2-3 times faster than Node.js 10 with mongoose 5.7. Has anyone had a similar experience, or does anyone have an opinion? I actually expected Node.js to be a little faster than Python.

来源:https://stackoverflow.com/questions/58226391/mongoose-vs-pymongo-drivers-write-insertmany-test

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!