How can I import bulk data from a CSV file into DynamoDB?

我在风中等你 · 2021-01-31 15:08

I am trying to import a CSV file data into AWS DynamoDB.

Here's what my CSV file looks like:

first_name  last_name
sri ram
Rahul   Dravid
JetPay  Underw         


        
14 Answers

独厮守ぢ · 2021-01-31 15:59

    Before getting to my code, some notes on testing this locally:

    I recommend using a local version of DynamoDB so you can sanity-check the script before you start incurring charges. I made some small modifications before posting this, so be sure to test it by whatever means make sense to you. There is a fake batch-upload job commented out in buildBatchWriteJob, which you can use in place of any DynamoDB service, remote or local, to verify on stdout that the script does what you need.

    dynamodb-local

    See dynamodb-local on npmjs, or follow the manual install instructions.

    If you went the manual install route, you can start dynamodb-local with something like this:

    java -Djava.library.path=./DynamoDBLocal_lib \
         -jar DynamoDBLocal.jar \
         -inMemory \
         -sharedDb
    

    The npm route may be simpler.
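
    For completeness, a rough sketch of the programmatic route. I'm going from memory of the dynamodb-local package README here, so treat the launch/stop signatures as assumptions and verify them against the package docs:

    // Sketch only: launch(port, dbPath, additionalArgs) and stop(port) are what I recall
    // from the dynamodb-local README -- double-check before relying on this.
    const DynamoDbLocal = require('dynamodb-local')
    const port = 8000

    DynamoDbLocal.launch(port, null, ['-inMemory', '-sharedDb'])
        .then(() => console.log(`dynamodb-local listening on :${port}`))
        .catch(err => console.error('Failed to launch dynamodb-local:', err))

    // ...and when you're done:
    // DynamoDbLocal.stop(port)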

    dynamodb-admin

    Along with that, see dynamodb-admin.

    I installed dynamodb-admin with npm i -g dynamodb-admin. It can then be run with:

    dynamodb-admin
    

    Using them:

    dynamodb-local defaults to localhost:8000.

    dynamodb-admin is a web page that defaults to localhost:8001. Once you launch these two services, open localhost:8001 in your browser to view and manipulate the database.
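
    If you start dynamodb-local on a non-default port or host, dynamodb-admin can be pointed at it through an environment variable (DYNAMO_ENDPOINT, if I'm remembering its README correctly):

    DYNAMO_ENDPOINT=http://localhost:8000 dynamodb-admin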

    The script below doesn't create the table it writes to; create it first, either through dynamodb-admin or from code.
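
    If you'd rather create the table from code, here's a minimal sketch using the same AWS SDK v2 client the script uses. The "Provider" table name and the "id" string partition key are assumptions pulled from the script below; adjust them to your own schema.

    const AWS = require("aws-sdk")

    AWS.config.update({
        region: "us-west-1"
        ,endpoint: "http://localhost:8000"     // Comment out to hit live DynamoDB service.
    });
    const db = new AWS.DynamoDB()

    db.createTable({
        TableName: "Provider",                 // must match the table the import script writes to
        AttributeDefinitions: [ { AttributeName: "id", AttributeType: "S" } ],
        KeySchema: [ { AttributeName: "id", KeyType: "HASH" } ],
        ProvisionedThroughput: { ReadCapacityUnits: 5, WriteCapacityUnits: 25 }
    }, (err, data) => {
        if (err) console.error("createTable failed:", err)
        else console.log("Created table:", data.TableDescription.TableName)
    })

    Against dynamodb-local the throughput numbers don't matter much, but against the live service the WriteCapacityUnits value is the WCU that feeds the delay calculation discussed below.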

    Credit goes to...

    • Ben Nadel.

    The code

    • I'm not as experienced with JS & Node.js as I am with other languages, so please forgive any JS faux pas.
    • You'll notice each group of concurrent batches is purposely slowed down by 900ms. This was a hacky solution, and I'm leaving it here to serve as an example (and because of laziness, and because you're not paying me).
    • If you increase MAX_CONCURRENT_BATCHES, you will want to calculate the appropriate delay based on your WCU, item size, batch size, and the new concurrency level (see the small sketch after this list).
    • Another approach would be to turn on Auto Scaling and implement exponential backoff for each failed batch. As I mention in one of the code comments below, this really shouldn't be necessary: some back-of-the-envelope math tells you how many writes you can actually do given your WCU limit and data size, and then you just let the code run at a predictable rate the entire time.
    • You might wonder why I didn't just let the AWS SDK handle concurrency. Good question; it probably would have made this slightly simpler. You could experiment by applying MAX_CONCURRENT_BATCHES to the maxSockets config option (there's a commented sketch of that near the top of the script) and modifying the code that creates arrays of batches so that it only passes individual batches forward.
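
    Here's a small back-of-the-envelope sketch of that delay calculation. It is not part of the original script, and the WCU and item-size numbers are placeholders you'd swap for your own:

    // Rough sizing of the delay between groups of batches. The provisioned WCU and
    // item size below are assumptions -- plug in your own numbers.
    const MAX_RECORDS_PER_BATCH = 25        // same constant as in the script below
    const MAX_CONCURRENT_BATCHES = 1        // same constant as in the script below
    const PROVISIONED_WCU = 25              // your table's write capacity units
    const ITEM_SIZE_KB = 1                  // average item size, rounded up to a whole KB

    // One write of an item up to 1 KB costs 1 WCU, so each group of concurrent
    // batches consumes roughly this many write units:
    const writesPerGroup = MAX_RECORDS_PER_BATCH * MAX_CONCURRENT_BATCHES * Math.ceil(ITEM_SIZE_KB)

    // To stay under the provisioned rate, each group should take at least this long:
    const delayMs = Math.ceil((writesPerGroup / PROVISIONED_WCU) * 1000)
    console.log(`Delay each group of batches by at least ${delayMs}ms`)   // 1000ms with these numbers

    With these numbers that works out to about a second per group, which is the same ballpark as the hard-coded 900ms below.
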
    /**
     * Uploads CSV data to DynamoDB.
     *
     * 1. Streams a CSV file line-by-line.
     * 2. Parses each line to a JSON object.
     * 3. Collects batches of JSON objects.
     * 4. Converts batches into the PutRequest format needed by AWS.DynamoDB.batchWriteItem
     *    and runs 1 or more batches at a time.
     */
    
    const AWS = require("aws-sdk")
    const chalk = require('chalk')
    const fs = require('fs')
    const split = require('split2')
    const through2 = require('through2')
    const { Writable } = require('stream');
    
    const CSV_FILE_PATH = __dirname + "/../assets/whatever.csv"
    
    // A whitelist of the CSV columns to ingest.
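    // For the CSV in the question this would be something like ["first_name", "last_name"]
    // (plus whatever attribute you use as the table's partition key).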
    const CSV_KEYS = [
        "id",
        "name", 
        "city"
    ]
    
    // Inadequate WCU will cause ProvisionedThroughputExceededException errors, which this script
    // does not currently retry. Retries aren't necessary as long as you consistently
    // stay under your WCU, which isn't hard to predict.
    
    // The maximum number of records per call to AWS.DynamoDB.batchWriteItem (the API allows at most 25).
    // See https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_BatchWriteItem.html
    const MAX_RECORDS_PER_BATCH = 25
    
    // The number of batches to upload concurrently.  
    // https://docs.aws.amazon.com/sdk-for-javascript/v2/developer-guide/node-configuring-maxsockets.html
    const MAX_CONCURRENT_BATCHES = 1
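
    // (Sketch, not in the original script.) If you'd rather let the SDK's HTTP agent cap
    // concurrency instead of batching by hand, you could bound the socket pool like this --
    // http.Agent rather than https.Agent because the local endpoint below is plain HTTP:
    //
    //     const http = require('http')
    //     AWS.config.update({
    //         httpOptions: { agent: new http.Agent({ maxSockets: MAX_CONCURRENT_BATCHES }) }
    //     })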
    
    // MAKE SURE TO LAUNCH `dynamodb-local` EXTERNALLY FIRST IF USING LOCALHOST!
    AWS.config.update({
        region: "us-west-1"
        ,endpoint: "http://localhost:8000"     // Comment out to hit live DynamoDB service.
    });
    const db = new AWS.DynamoDB()
    
    // Create a file line reader.
    var fileReaderStream = fs.createReadStream(CSV_FILE_PATH)
    var lineReaderStream = fileReaderStream.pipe(split())
    
    var linesRead = 0
    
    // Attach a stream that transforms text lines into JSON objects.
    var skipHeader = true
    var csvParserStream = lineReaderStream.pipe(
        through2(
            {
                objectMode: true,
                highWaterMark: 1
            },
            function handleWrite(chunk, encoding, callback) {
    
                // ignore CSV header
                if (skipHeader) {
                    skipHeader = false
                    callback()
                    return
                }
    
                linesRead++
    
                // transform line into stringified JSON
                const values = chunk.toString().split(',')
                const ret = {}
                CSV_KEYS.forEach((keyName, index) => {
                    ret[keyName] = values[index]
                })
                ret.line = linesRead
    
                console.log(chalk.cyan.bold("csvParserStream:", 
                    "line:", linesRead + ".", 
                    chunk.length, "bytes.", 
                    ret.id
                ))
    
                callback(null, ret)
            }
        )
    )
    
    // Attach a stream that collects incoming json lines to create batches. 
    // Outputs an array (<= MAX_CONCURRENT_BATCHES) of arrays (<= MAX_RECORDS_PER_BATCH).
    var batchingStream = (function batchObjectsIntoGroups(source) {
        var batchBuffer = []
        var idx = 0
    
        var batchingStream = source.pipe(
            through2.obj(
                {
                    objectMode: true,
                    writableObjectMode: true,
                    highWaterMark: 1
                },
                function handleWrite(item, encoding, callback) {
                    var batchIdx = Math.floor(idx / MAX_RECORDS_PER_BATCH)
    
                    if (idx % MAX_RECORDS_PER_BATCH == 0 && batchIdx < MAX_CONCURRENT_BATCHES) {
                        batchBuffer.push([])
                    }
    
                    batchBuffer[batchIdx].push(item)
    
                    if (MAX_CONCURRENT_BATCHES == batchBuffer.length &&
                        MAX_RECORDS_PER_BATCH == batchBuffer[MAX_CONCURRENT_BATCHES-1].length) 
                    {
                        this.push(batchBuffer)
                        batchBuffer = []
                        idx = 0
                    } else {
                        idx++
                    }
    
                    callback()
                },
                function handleFlush(callback) {
                    if (batchBuffer.length) {
                        this.push(batchBuffer)
                    }
    
                    callback()
                }
            )
        )
    
        return (batchingStream);
    })(csvParserStream)
    
    // Attach a stream that transforms batch buffers to collections of DynamoDB batchWrite jobs.
    var databaseStream = new Writable({
    
        objectMode: true,
        highWaterMark: 1,
    
        write(batchBuffer, encoding, callback) {
            console.log(chalk.yellow(`Batch being processed.`))
    
            // Create `batchBuffer.length` batchWrite jobs.
            var jobs = batchBuffer.map(batch => 
                buildBatchWriteJob(batch)
            )
    
            // Run multiple batch-write jobs concurrently.
            Promise
                .all(jobs)
                .then(results => {
                    console.log(chalk.bold.red(`${batchBuffer.length} batches completed.`))
                })
                .catch(error => {
                    console.log( chalk.red( "ERROR" ), error )
                    callback(error)
                })
                .then( () => {
                    console.log( chalk.bold.red("Resuming file input.") )
    
                    setTimeout(callback, 900) // slow down the uploads. calculate this based on WCU, item size, batch size, and concurrency level.
                })
    
            // return false
        }
    })
    batchingStream.pipe(databaseStream)
    
    // Builds a batch-write job that runs as an async promise.
    function buildBatchWriteJob(batch) {
        let params = buildRequestParams(batch)
    
        // This was being used temporarily prior to hooking up the script to any dynamo service.
    
        // let fakeJob = new Promise( (resolve, reject) => {
    
        //     console.log(chalk.green.bold( "Would upload batch:", 
        //         pluckValues(batch, "line")
        //     ))
    
        //     let t0 = new Date().getTime()
    
        //     // fake timing
        //     setTimeout(function() {
        //         console.log(chalk.dim.yellow.italic(`Batch upload time: ${new Date().getTime() - t0}ms`))
        //         resolve()
        //     }, 300)
        // })
        // return fakeJob
    
        let promise = new Promise(
            function(resolve, reject) {
                let t0 = new Date().getTime()
    
                let printItems = function(msg) {
                    console.log(chalk.green.bold(msg, pluckValues(batch, "id")))
                }
    
                let processItemsCallback = function (err, data) {
                  if (err) { 
                     console.error(`Failed at batch: ${pluckValues(batch, "line")}, ${pluckValues(batch, "id")}`)
                     console.error("Error:", err)
                     reject()
                  } else {
                    var params = {}
                    params.RequestItems = data.UnprocessedItems
    
                    var numUnprocessed = Object.keys(params.RequestItems).length
                    if (numUnprocessed != 0) {
                        console.log(`Encountered unprocessed items for ${numUnprocessed} table(s).`)
                        printItems("Retrying unprocessed items:")
                        db.batchWriteItem(params, processItemsCallback)
                    } else {
                        console.log(chalk.dim.yellow.italic(`Batch upload time: ${new Date().getTime() - t0}ms`))
    
                        resolve()
                    }
                  }
                }
                db.batchWriteItem(params, processItemsCallback)
            }
        )
        return (promise)
    }
    
    // Build request payload for the batchWrite
    function buildRequestParams(batch) {
    
        var params = {
            RequestItems: {}
        }
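    // "Provider" is the destination table name -- change it to match your own table.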
        params.RequestItems.Provider = batch.map(obj => {
    
            let item = {}
    
            CSV_KEYS.forEach((keyName, index) => {
                if (obj[keyName] && obj[keyName].length > 0) {
                    item[keyName] = { "S": obj[keyName] }
                }
            })
    
            return {
                PutRequest: {
                    Item: item
                }
            }
        })
        return params
    }
    
    function pluckValues(batch, fieldName) {
        var values = batch.map(item => {
            return (item[fieldName])
        })
        return (values)
    }
    
