Save DataFrame as CSV directly to S3 in Python

2020-11-28 02:02

I have a pandas DataFrame that I want to upload to a new CSV file. The problem is that I don't want to save the file locally before transferring it to S3. Is there any method like to_csv for writing directly to S3?

10 Answers
  • 2020-11-28 02:43

    I read a CSV with two columns from an S3 bucket and load its contents into a pandas DataFrame.

    Example:

    config.json

    {
      "credential": {
        "access_key": "xxxxxx",
        "secret_key": "xxxxxx"
      },
      "s3": {
        "bucket": "mybucket",
        "key": "csv/user.csv"
      }
    }
    

    cls_config.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import os
    import json
    
    class cls_config(object):
    
        def __init__(self,filename):
    
            self.filename = filename
    
    
        def getConfig(self):
    
            # resolve the config file relative to this module's directory
            fileName = os.path.join(os.path.dirname(__file__), self.filename)
            with open(fileName) as f:
                config = json.load(f)
            return config
    

    cls_pandas.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import pandas as pd
    import io
    
    class cls_pandas(object):
    
        def __init__(self):
            pass
    
        def read(self,stream):
    
            df = pd.read_csv(io.StringIO(stream), sep = ",")
            return df
    

    cls_s3.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    import boto3
    import json
    
    class cls_s3(object):
    
        def __init__(self, access_key, secret_key):
    
            self.s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
    
        def getObject(self,bucket,key):
    
            read_file = self.s3.get_object(Bucket=bucket, Key=key)
            body = read_file['Body'].read().decode('utf-8')
            return body
    

    test.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    from cls_config import *
    from cls_s3 import *
    from cls_pandas import *
    
    class test(object):
    
        def __init__(self):
            self.conf = cls_config('config.json')
    
        def process(self):
    
            conf = self.conf.getConfig()
    
            bucket = conf['s3']['bucket']
            key = conf['s3']['key']
    
            access_key = conf['credential']['access_key']
            secret_key = conf['credential']['secret_key']
    
            s3 = cls_s3(access_key,secret_key)
            ob = s3.getObject(bucket,key)
    
            pa = cls_pandas()
            df = pa.read(ob)
    
            print(df)
    
    if __name__ == '__main__':
        test = test()
        test.process()
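
    For comparison: if the s3fs package is installed, recent pandas versions can read the same object in one line, with no boto3 plumbing. A minimal sketch, assuming the bucket/key values from the config.json above and AWS credentials available through the standard environment variables or ~/.aws/credentials:

    import pandas as pd

    # pandas hands s3:// URLs off to s3fs, which resolves credentials from
    # the usual AWS environment variables or config files
    df = pd.read_csv("s3://mybucket/csv/user.csv")
    print(df)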
    
  • 2020-11-28 02:44

    You can use:

    from io import StringIO # python3; python2: BytesIO 
    import boto3
    
    bucket = 'my_bucket_name' # already created on S3
    csv_buffer = StringIO()
    df.to_csv(csv_buffer)
    s3_resource = boto3.resource('s3')
    s3_resource.Object(bucket, 'df.csv').put(Body=csv_buffer.getvalue())
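
    Note that recent pandas versions can skip the buffer entirely when s3fs is installed: to_csv accepts an S3 URL directly. A minimal sketch, assuming default AWS credentials are configured:

    # s3fs must be installed; pandas streams the CSV straight to the object
    df.to_csv('s3://my_bucket_name/df.csv', index=False)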
    
  • 2020-11-28 02:44

    Since you are using boto3.client(), try:

    import boto3
    from io import StringIO #python3 
    s3 = boto3.client('s3', aws_access_key_id='key', aws_secret_access_key='secret_key')
    def copy_to_s3(client, df, bucket, filepath):
        csv_buf = StringIO()
        df.to_csv(csv_buf, header=True, index=False)
        csv_buf.seek(0)
        client.put_object(Bucket=bucket, Body=csv_buf.getvalue(), Key=filepath)
        print(f'Copied {df.shape[0]} rows to s3://{bucket}/{filepath}')
    
    copy_to_s3(client=s3, df=df_to_upload, bucket='abc', filepath='def/test.csv')
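
    If you want the helper to verify the upload rather than print unconditionally, put_object returns the API response, and boto3 exposes the HTTP status under ResponseMetadata. A sketch of that variation:

    def copy_to_s3_checked(client, df, bucket, filepath):
        csv_buf = StringIO()
        df.to_csv(csv_buf, header=True, index=False)
        response = client.put_object(Bucket=bucket, Body=csv_buf.getvalue(), Key=filepath)
        # raise if S3 did not acknowledge the PutObject call with a 200
        status = response['ResponseMetadata']['HTTPStatusCode']
        if status != 200:
            raise RuntimeError(f'Upload to s3://{bucket}/{filepath} returned status {status}')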
    
  • 2020-11-28 02:48

    I found this can also be done with the client, not just the resource:

    from io import StringIO
    import boto3
    s3 = boto3.client("s3",\
                      region_name=region_name,\
                      aws_access_key_id=aws_access_key_id,\
                      aws_secret_access_key=aws_secret_access_key)
    csv_buf = StringIO()
    df.to_csv(csv_buf, header=True, index=False)
    csv_buf.seek(0)
    s3.put_object(Bucket=bucket, Body=csv_buf.getvalue(), Key='path/test.csv')
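
    For large DataFrames it may be worth switching to upload_fileobj, which streams a file-like object and handles multipart uploads automatically; getvalue() above copies the whole CSV into a single string first. A minimal sketch reusing the same client:

    from io import BytesIO

    # upload_fileobj expects a binary file-like object, so encode the CSV
    csv_bytes = BytesIO(df.to_csv(index=False).encode('utf-8'))
    s3.upload_fileobj(csv_bytes, bucket, 'path/test.csv')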
    