I am trying to import CSV file data into AWS DynamoDB.
Here's what my CSV file looks like:
first_name last_name
sri ram
Rahul Dravid
JetPay Underw
As a lowly dev without perms to create a Data Pipeline, I had to use this JavaScript instead. Hassan Sidique's code was slightly out of date, but this version worked for me:
var fs = require('fs');
var parse = require('csv-parse');
var async = require('async');
const AWS = require('aws-sdk');

// Note: this is the low-level DynamoDB client (not the DocumentClient),
// so every attribute value has to carry its type ("S", "N", ...).
const dynamodbDocClient = new AWS.DynamoDB({ region: "eu-west-1" });

var csv_filename = "./CSV.csv";

var rs = fs.createReadStream(csv_filename);
var parser = parse({
    columns: true,
    delimiter: ','
}, function(err, data) {
    // BatchWriteItem accepts at most 25 put requests per call,
    // so split the parsed rows into chunks of 25.
    var split_arrays = [], size = 25;
    while (data.length > 0) {
        let cur25 = data.splice(0, size);
        let item_data = [];
        for (var i = cur25.length - 1; i >= 0; i--) {
            const this_item = {
                "PutRequest": {
                    "Item": {
                        // your column names here will vary, but you'll need to define the type
                        "Title": {
                            "S": cur25[i].Title
                        },
                        "Col2": {
                            "N": cur25[i].Col2
                        },
                        "Col3": {
                            "N": cur25[i].Col3
                        }
                    }
                }
            };
            item_data.push(this_item);
        }
        split_arrays.push(item_data);
    }

    var data_imported = false;
    var chunk_no = 1;
    async.each(split_arrays, (item_data, callback) => {
        const params = {
            RequestItems: {
                "tagPerformance": item_data
            }
        };
        // Note: any UnprocessedItems returned by the call are not retried here.
        dynamodbDocClient.batchWriteItem(params, function(err, res) {
            if (err === null) {
                console.log('Success chunk #' + chunk_no);
                data_imported = true;
            } else {
                console.log(err);
                console.log('Fail chunk #' + chunk_no);
                data_imported = false;
            }
            chunk_no++;
            callback();
        });
    }, () => {
        // runs after all chunks have been written
        console.log('all data imported....');
    });
});

rs.pipe(parser);
I wrote a tool to do this using parallel execution; it requires no dependencies or developer tooling installed on the machine (it's written in Go).
It's available for macOS, Linux, Windows and Docker: https://github.com/a-h/ddbimport
Here are the results of my tests showing that it can import a lot faster in parallel using AWS Step Functions.
I'm describing the tool in more detail at the AWS Community Summit on 15th May 2020 at 11:55 BST - https://www.twitch.tv/awscomsum
Follow the instructions in the following link to import data into existing tables in DynamoDB:
https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/SampleData.LoadData.html
Please note, the table name is what you must find here: https://console.aws.amazon.com/dynamodbv2/home
The table name is used inside the JSON file; the name of the JSON file itself is not important. For example, I have a table called Country-kdezpod7qrap7nhpjghjj-staging, so to import data into that table I must create a JSON file like this:
{
    "Country-kdezpod7qrap7nhpjghjj-staging": [
        {
            "PutRequest": {
                "Item": {
                    "id": {
                        "S": "ir"
                    },
                    "__typename": {
                        "S": "Country"
                    },
                    "createdAt": {
                        "S": "2021-01-04T12:32:09.012Z"
                    },
                    "name": {
                        "S": "Iran"
                    },
                    "self": {
                        "N": "1"
                    },
                    "updatedAt": {
                        "S": "2021-01-04T12:32:09.012Z"
                    }
                }
            }
        }
    ]
}
If you don't know how to create the items for each PutRequest, you can create an item in your DB with a mutation and then try to duplicate it; that will show you the structure of one item.
If you have a huge list of items in your CSV file, you can use the following npm tool to generate the JSON file:
https://www.npmjs.com/package/json-dynamo-putrequest
Then we can use the following command to import the data:
aws dynamodb batch-write-item --request-items file://Country.json
If it imports the data successfully, you should see the following output:
{
    "UnprocessedItems": {}
}
Also, please note that with this method you can only have 25 PutRequest items in your array, so if you want to push 100 items you need to create 4 files.
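If you'd rather script that chunking yourself, here is a minimal Python sketch; the CSV path, column names, and output file names are hypothetical placeholders (only the table name matches the example above). It reads a CSV and writes one batch-write-item JSON file per 25 rows:
import csv
import json

TABLE_NAME = "Country-kdezpod7qrap7nhpjghjj-staging"  # your table name
CSV_PATH = "countries.csv"                            # hypothetical input file
BATCH_SIZE = 25                                       # batch-write-item limit

def row_to_put_request(row):
    # Map CSV columns to typed DynamoDB attributes; adjust names/types to your schema.
    return {
        "PutRequest": {
            "Item": {
                "id": {"S": row["id"]},
                "name": {"S": row["name"]}
            }
        }
    }

with open(CSV_PATH, newline="") as f:
    rows = list(csv.DictReader(f))

# Write one JSON file per 25 rows: Country-0.json, Country-1.json, ...
for n, start in enumerate(range(0, len(rows), BATCH_SIZE)):
    chunk = rows[start:start + BATCH_SIZE]
    payload = {TABLE_NAME: [row_to_put_request(r) for r in chunk]}
    with open("Country-{}.json".format(n), "w") as out:
        json.dump(payload, out, indent=2)
Each generated file can then be imported with aws dynamodb batch-write-item --request-items file://Country-0.json, and so on.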
The simplest solution is probably to use a template / solution made by AWS:
Implementing bulk CSV ingestion to Amazon DynamoDB https://aws.amazon.com/blogs/database/implementing-bulk-csv-ingestion-to-amazon-dynamodb/
With this approach, you use the provided template to create a CloudFormation stack that includes an S3 bucket, a Lambda function, and a new DynamoDB table. The Lambda function is triggered on upload to the S3 bucket and inserts the rows into the table in batches.
In my case, I wanted to insert into an existing table, so I just changed the Lambda function's environment variable once the stack was created.
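If you want to script that last step, a minimal boto3 sketch might look like the following. The function name and the environment variable key ("TABLE_NAME") are assumptions; check the deployed Lambda's configuration for the names the template actually uses.
import boto3

lambda_client = boto3.client("lambda")
function_name = "csv-to-dynamodb-CsvToDDBLambdaFunction"  # hypothetical; copy the real name from the stack

# Environment is replaced wholesale by update_function_configuration,
# so merge the new value into the existing variables first.
current = lambda_client.get_function_configuration(FunctionName=function_name)
variables = current.get("Environment", {}).get("Variables", {})
variables["TABLE_NAME"] = "my-existing-table"  # assumed variable name; verify in the console

lambda_client.update_function_configuration(
    FunctionName=function_name,
    Environment={"Variables": variables},
)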
You can try using batch writes and multiprocessing to speed up your bulk import.
import csv
import time
import boto3
from multiprocessing.dummy import Pool as ThreadPool

pool = ThreadPool(4)

current_milli_time = lambda: int(round(time.time() * 1000))
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('table_name')
CSV_PATH = 'your_file.csv'  # path to the CSV you want to import

def add_users_in_batch(data):
    # batch_writer() buffers the puts and automatically retries unprocessed items
    with table.batch_writer() as batch:
        for item in data:
            batch.put_item(Item=item)

def run_batch_migration():
    start = current_milli_time()
    row_count = 0
    batch = []
    batches = []
    with open(CSV_PATH, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for row in reader:
            row_count += 1
            item = {
                'email': row[0],
                'country': row[1]
            }
            batch.append(item)
            if row_count % 25 == 0:
                batches.append(batch)
                batch = []
    if batch:
        # don't forget the final partial batch
        batches.append(batch)
    pool.map(add_users_in_batch, batches)
    print('Number of rows processed - ', str(row_count))
    end = current_milli_time()
    print('Total time taken for migration : ', str((end - start) / 1000), ' secs')

if __name__ == "__main__":
    run_batch_migration()
Another quick workaround is to load your CSV into RDS or any other MySQL instance first, which is quite easy to do (https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Introduction.html), and then use DMS (AWS Database Migration Service) to migrate the entire data set to DynamoDB. You'll have to create a role for DMS before you can load the data, but this works wonderfully without having to run any scripts.
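For the first step (getting the CSV into MySQL), a rough sketch using pymysql is shown below; the host, credentials, database, table, and column names are all placeholders, and the DMS endpoints and replication task are then configured in the DMS console as described above.
import csv
import pymysql

# Placeholders only: point this at your own RDS/MySQL instance and database.
conn = pymysql.connect(
    host="my-rds-instance.abc123.eu-west-1.rds.amazonaws.com",
    user="admin",
    password="secret",
    database="staging"
)
try:
    with conn.cursor() as cur:
        # Staging table mirroring the CSV columns from the question.
        cur.execute(
            "CREATE TABLE IF NOT EXISTS people ("
            "first_name VARCHAR(255), "
            "last_name VARCHAR(255))"
        )
        with open("people.csv", newline="") as f:
            rows = list(csv.DictReader(f))
        cur.executemany(
            "INSERT INTO people (first_name, last_name) VALUES (%s, %s)",
            [(r["first_name"], r["last_name"]) for r in rows]
        )
    conn.commit()
finally:
    conn.close()
Once the data is in MySQL, DMS takes over: create a source (MySQL) endpoint, a target (DynamoDB) endpoint, a replication instance, and a full-load task, along with the IAM role mentioned above.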