How to Create Dataframe from AWS Athena using Boto3 get_query_results method

前端 未结 5 2196
广开言路
广开言路 2021-02-13 05:37

I'm using AWS Athena to query raw data from S3. Since Athena writes the query output to an S3 output bucket, I used to do:

df = pd.read_csv(OutputLocation)
         


        
5条回答
  •  忘掉有多难
    2021-02-13 06:41

    get_query_results only returns 1000 rows. How can I use it to get two million rows into a Pandas dataframe?

    If you try to add:

    client.get_query_results(QueryExecutionId=res['QueryExecutionId'], MaxResults=2000)
    

    You will get the following error:

    An error occurred (InvalidRequestException) when calling the GetQueryResults operation: MaxResults is more than maximum allowed length 1000.

    You can get millions of rows by reading the result file directly from your S3 bucket (the next example loads it into a pandas DataFrame):

    def obtain_data_from_s3(self):
        """Download the query's CSV result file from S3 and parse it with pandas.

        Reads ``self.region_name``, ``self.aws_access_key_id``,
        ``self.aws_secret_access_key``, ``self.bucket``, ``self.folder`` and
        ``self.filename``; stores the created S3 resource on ``self.resource``.

        Returns:
            The DataFrame produced by ``pd.read_csv`` from the object at
            ``<folder><filename>.csv`` in the bucket.
        """
        self.resource = boto3.resource(
            's3',
            region_name=self.region_name,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
        )

        bucket = self.resource.Bucket(self.bucket)
        # The '.csv' extension is appended here, so self.filename is the bare name.
        s3_object = bucket.Object(key=self.folder + self.filename + '.csv')
        response = s3_object.get()

        # The whole body is read into memory and wrapped in a file-like buffer.
        raw_bytes = response['Body'].read()
        return pd.read_csv(io.BytesIO(raw_bytes), encoding='utf8')
    

    The self.filename can be:

    # Store only the execution ID: obtain_data_from_s3 appends '.csv' itself,
    # so adding the extension here would produce a key ending in '.csv.csv'.
    self.filename = response['QueryExecutionId']
    

    This works because Athena names the result file after the QueryExecutionId. Here is my complete code, which takes a query and returns a DataFrame with all the rows and columns.

    import time
    import boto3
    import pandas as pd
    import io
    
    class QueryAthena:
        """Run one Athena query and load its CSV result from S3 into pandas.

        The query is started with ``start_query_execution``, polled with
        ``get_query_execution`` until it leaves the QUEUED/RUNNING states, and
        the object ``<folder><QueryExecutionId>.csv`` is then downloaded from
        the output bucket and parsed with ``pd.read_csv``.
        """

        def __init__(self, query, database, *,
                     folder='my_folder/',
                     bucket='my_bucket',
                     region_name='us-east-1',
                     aws_access_key_id="my_aws_access_key_id",
                     aws_secret_access_key="my_aws_secret_access_key",
                     poll_interval=10):
            """Store the query and the AWS settings used to run it.

            Args:
                query: SQL text passed to Athena as ``QueryString``.
                database: Athena database used as the query execution context.
                folder: Key prefix inside ``bucket`` for query results; it is
                    concatenated directly with the file name, so it should end
                    with ``/``.
                bucket: Name of the S3 bucket that receives the query output.
                region_name: AWS region for both the Athena and S3 clients.
                aws_access_key_id: Access key handed to boto3. The default is
                    a placeholder; per the boto3 documentation, passing
                    ``None`` falls back to boto3's default credential chain.
                aws_secret_access_key: Secret key handed to boto3 (same note
                    as ``aws_access_key_id``).
                poll_interval: Seconds to wait between status checks while
                    the query is still queued or running.
            """
            self.database = database
            self.folder = folder
            self.bucket = bucket
            self.s3_input = 's3://' + self.bucket + '/my_folder_input'
            self.s3_output = 's3://' + self.bucket + '/' + self.folder
            self.region_name = region_name
            self.aws_access_key_id = aws_access_key_id
            self.aws_secret_access_key = aws_secret_access_key
            self.poll_interval = poll_interval
            self.query = query

        def load_conf(self, q):
            """Create the Athena client and start executing the query ``q``.

            Sets ``self.client`` and, on success, ``self.filename`` (the
            QueryExecutionId, which is the base name of the result file).

            Args:
                q: SQL text to execute.

            Returns:
                The ``start_query_execution`` response dictionary.

            Raises:
                Exception: whatever boto3 raised while creating the client or
                    starting the query. It is printed and re-raised, because
                    there is no response to return in that case.
            """
            try:
                self.client = boto3.client(
                    'athena',
                    region_name=self.region_name,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )
                response = self.client.start_query_execution(
                    QueryString=q,
                    QueryExecutionContext={
                        'Database': self.database
                    },
                    ResultConfiguration={
                        'OutputLocation': self.s3_output,
                    }
                )
            except Exception as e:
                print(e)
                # Propagate the real error instead of falling through to a
                # return of an unbound local variable.
                raise

            self.filename = response['QueryExecutionId']
            print('Execution ID: ' + response['QueryExecutionId'])
            return response

        def run_query(self):
            """Execute ``self.query`` and return its result as a DataFrame.

            Returns:
                The DataFrame from ``obtain_data()``, or ``None`` when an
                error was caught (and printed) while polling or downloading,
                including the query ending as FAILED or CANCELLED.

            Raises:
                Exception: errors from ``load_conf`` (starting the query) are
                    not caught here and propagate to the caller.
            """
            res = self.load_conf(self.query)
            try:
                execution_id = res["QueryExecutionId"]
                while True:
                    execution = self.client.get_query_execution(
                        QueryExecutionId=execution_id)
                    query_status = execution['QueryExecution']['Status']['State']
                    print(query_status)
                    if query_status in ('FAILED', 'CANCELLED'):
                        raise Exception('Athena query with the string "{}" failed or was cancelled'.format(self.query))
                    if query_status not in ('QUEUED', 'RUNNING'):
                        # Terminal state reached: skip the pointless final sleep.
                        break
                    time.sleep(self.poll_interval)
                print('Query "{}" finished.'.format(self.query))

                return self.obtain_data()

            except Exception as e:
                print(e)
                return None

        def obtain_data(self):
            """Download ``<folder><filename>.csv`` from the bucket as a DataFrame.

            Sets ``self.resource`` to the S3 resource it creates.

            Returns:
                The parsed DataFrame, or ``None`` if any error occurred (the
                error is printed).
            """
            try:
                self.resource = boto3.resource(
                    's3',
                    region_name=self.region_name,
                    aws_access_key_id=self.aws_access_key_id,
                    aws_secret_access_key=self.aws_secret_access_key,
                )

                response = self.resource \
                    .Bucket(self.bucket) \
                    .Object(key=self.folder + self.filename + '.csv') \
                    .get()

                return pd.read_csv(io.BytesIO(response['Body'].read()), encoding='utf8')
            except Exception as e:
                print(e)
                return None
    
    # Example entry point: build a QueryAthena for a placeholder query and run it.
    if __name__ == "__main__":       
        query = "SELECT * FROM bucket.folder"
        qa = QueryAthena(query=query, database='myAthenaDb')
        # run_query() returns the DataFrame, or None when it catches and prints
        # an error while polling the query status or downloading the result.
        dataframe = qa.run_query()
    

提交回复
热议问题