Node.js & Amazon S3: How to iterate through all files in a bucket?

Asked by 情书的邮戳 on 2020-12-02 14:17

Is there any Amazon S3 client library for Node.js that allows listing of all files in S3 bucket?

The best-known, aws2js and knox, don't seem to have this functionality.

15 Answers
  • 2020-12-02 14:46

    In fact, aws2js supports listing the objects in a bucket at a low level via the s3.get() method call. To do so, pass the prefix query parameter, which is documented on the Amazon S3 REST API page:

    var s3 = require('aws2js').load('s3', awsAccessKeyId, awsSecretAccessKey);    
    s3.setBucket(bucketName);
    
    var folder = encodeURI('some/path/to/S3/folder');
    var url = '?prefix=' + folder;
    
    s3.get(url, 'xml', function (error, data) {
        console.log(error);
        console.log(data);
    });
    

    The data variable in the above snippet contains a list of the objects in the bucketName bucket, up to the 1,000-key limit of a single ListObjects response.
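
    For larger folders you have to repeat the request with the marker query parameter set to the last key of the previous page. A rough sketch of that loop against the same aws2js API as above (the field names follow the ListBucketResult XML; how aws2js represents a single-element Contents list is an assumption worth verifying):

    function listAll(marker, keys, done) {
        var url = '?prefix=' + folder +
                  (marker ? '&marker=' + encodeURIComponent(marker) : '');
        s3.get(url, 'xml', function (error, data) {
            if (error) return done(error);
            // data.Contents may be a single object rather than an array
            // when the page holds one key; normalize before iterating.
            var contents = [].concat(data.Contents || []);
            contents.forEach(function (item) { keys.push(item.Key); });
            if (data.IsTruncated === 'true' && contents.length)
                listAll(contents[contents.length - 1].Key, keys, done); // next page
            else
                done(null, keys);
        });
    }

    listAll(null, [], function (error, keys) {
        console.log(error || keys);
    });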

  • 2020-12-02 14:47

    Using the official aws-sdk:

    var allKeys = [];
    function listAllKeys(marker, cb)
    {
      s3.listObjects({Bucket: s3bucket, Marker: marker}, function(err, data){
        if (err) return cb(err);
        allKeys = allKeys.concat(data.Contents); // collect this page of keys

        if (data.IsTruncated)
          listAllKeys(data.NextMarker, cb); // fetch the next page
        else
          cb();
      });
    }
    

    See s3.listObjects.

    Edit 2017: Same basic idea, but listObjectsV2( ... ) is now recommended and uses a ContinuationToken (see s3.listObjectsV2):

    var allKeys = [];
    function listAllKeys(token, cb)
    {
      var opts = { Bucket: s3bucket };
      if(token) opts.ContinuationToken = token;
    
      s3.listObjectsV2(opts, function(err, data){
        if (err) return cb(err);
        allKeys = allKeys.concat(data.Contents);
    
        if(data.IsTruncated)
          listAllKeys(data.NextContinuationToken, cb);
        else
          cb();
      });
    }
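
    A minimal usage sketch, assuming an aws-sdk v2 client; the region and bucket name are placeholders:

    var AWS = require('aws-sdk');
    var s3 = new AWS.S3({ region: 'us-east-1' }); // placeholder region
    var s3bucket = 'my-bucket';                   // placeholder bucket name

    listAllKeys(null, function (err) {
      if (err) return console.error(err);
      console.log('Fetched ' + allKeys.length + ' keys');
    });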
    
  • 2020-12-02 14:48

    Using an Async Generator

    Import S3

    const { S3 } = require("aws-sdk");
    const s3 = new S3();
    

    Create a generator function to retrieve the full list of files:

    async function* listAllKeys(opts) {
      opts = { ...opts };
      do {
        const data = await s3.listObjectsV2(opts).promise();
        opts.ContinuationToken = data.NextContinuationToken;
        yield data;
      } while (opts.ContinuationToken);
    }
    

    Prepare the AWS parameters, based on the API docs:

    const opts = {
      Bucket: "bucket-xyz" /* required */,
      // ContinuationToken: 'STRING_VALUE',
      // Delimiter: 'STRING_VALUE',
      // EncodingType: url,
      // FetchOwner: true || false,
      // MaxKeys: 'NUMBER_VALUE',
      // Prefix: 'STRING_VALUE',
      // RequestPayer: requester,
      // StartAfter: 'STRING_VALUE'
    };
    

    Use the generator:

    async function main() {
      // using for of await loop
      for await (const data of listAllKeys(opts)) {
        console.log(data.Contents);
      }
    }
    main();
    

    That's it.

    Or Lazy Load

    async function main() {
      const keys = listAllKeys(opts);
      console.log(await keys.next());
      // {value: {…}, done: false}
      console.log(await keys.next());
      // {value: {…}, done: false}
      console.log(await keys.next());
      // {value: undefined, done: true}
    }
    main();
    

    Or use the generator to make an Observable function:

    const lister = (opts) => (o) => {
      let needMore = true;
      (async () => {
        const keys = listAllKeys(opts);
        for await (const data of keys) {
          o.next(data);
          if (!needMore) break;
        }
        o.complete();
      })();
      return () => (needMore = false);
    };
    

    Use this observable function with RxJS:

    // Using Rxjs
    
    const { Observable } = require("rxjs");
    const { flatMap } = require("rxjs/operators");
    
    function listAll() {
      return Observable.create(lister(opts))
        .pipe(flatMap((v) => v.Contents))
        .subscribe(console.log);
    }
    
    listAll();
    

    Or use this observable function with the Node.js EventEmitter:

    const EventEmitter = require("events");
    
    const _eve = new EventEmitter();
    
    async function onData(data) {
      // will be called for each set of data
      console.log(data);
    }
    async function onError(error) {
      // will be called if any error
      console.log(error);
    }
    async function onComplete() {
      // will be called when data completely received
    }
    _eve.on("next", onData);
    _eve.on("error", onError);
    _eve.on("complete", onComplete);
    
    const stop = lister(opts)({
      next: (v) => _eve.emit("next", v),
      error: (e) => _eve.emit("error", e),
      complete: (v) => _eve.emit("complete", v),
    });
    
  • 2020-12-02 14:53

    Although @Meekohi's answer does technically work, I've had enough heartache with the S3 portion of the AWS SDK for Node.js. After all the previous struggles with modules such as aws-sdk, s3, and knox, I decided to install s3cmd via the OS package manager and shell out to it using child_process.

    Something like:

        var s3cmd = new cmd_exec('s3cmd', ['ls', filepath, 's3://'+inputBucket],
                function (me, data) {me.stdout += data.toString();}, // accumulate stdout
                function (me) {                                      // runs when s3cmd exits
                    me.exit = 1;
                    response.send(me.stdout); // send only after the child process has finished
                }
        );
    

    (Using the cmd_exec implementation from this question)

    This approach just works really well, including for other problematic things like file upload.
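
    The linked cmd_exec helper isn't reproduced here, so as a minimal sketch of the same idea using Node's built-in child_process.spawn (the shape of the helper is an assumption, not the original implementation):

        var spawn = require('child_process').spawn;

        function cmd_exec(cmd, args, onData, onExit) {
            var me = { stdout: '', exit: 0 };
            var child = spawn(cmd, args);
            child.stdout.on('data', function (data) { onData(me, data); });
            child.on('close', function () { onExit(me); });
            return me; // `new cmd_exec(...)` also yields this object
        }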

  • 2020-12-02 14:54

    This is an old question and I guess the AWS JS SDK has changed a lot since it was asked. Here's yet another way to do it these days:

    s3.listObjects({Bucket:'mybucket', Prefix:'some-pfx'}).
    on('success', function handlePage(r) {
        //... handle page of contents r.data.Contents
    
        if(r.hasNextPage()) {
            // There's another page; handle it
            r.nextPage().on('success', handlePage).send();
        } else {
            // Finished!
        }
    }).
    on('error', function(r) {
        // Error!
    }).
    send();
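
    The SDK's request object also offers eachPage(), which drives the same success/nextPage loop for you. A brief sketch with the same parameters (after the last page the callback receives null data):

    s3.listObjects({Bucket:'mybucket', Prefix:'some-pfx'}).eachPage(function(err, data) {
        if (err) return false; // Error! returning false stops paging
        if (data) {
            // ... handle page of contents data.Contents
        } else {
            // Finished! no more pages
        }
        return true;           // keep paging
    });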
    
  • 2020-12-02 14:58

    Meekohi provided a very good answer, but the (new) documentation states that NextMarker is only returned when a Delimiter is specified, so it can be undefined. When that is the case, you should use the last key of the page as the marker.

    So his codesample can be changed into:

    var allKeys = [];
    function listAllKeys(marker, cb) {
      s3.listObjects({Bucket: s3bucket, Marker: marker}, function(err, data){
        if (err) return cb(err);
        allKeys = allKeys.concat(data.Contents);
        if (data.IsTruncated)
          // fall back to the last key of this page when NextMarker is absent
          listAllKeys(data.NextMarker || data.Contents[data.Contents.length-1].Key, cb);
        else
          cb();
      });
    }
    

    Couldn't comment on the original answer since I don't have the required reputation. Apologies for the bad mark-up btw.
