Upload data to the Azure ADLS Gen2 from on-premise using Python or Java

后端 未结 1 1767
误落风尘
误落风尘 2021-01-07 14:02

I have an Azure Storage account with Data Lake Gen2. I would like to upload data from on-premise to the Lake Gen2 file systems using Python (or Java).

I have found e

相关标签:
1条回答
  • 2021-01-07 14:35

    According to the official tutorial Quickstart: Upload, download, and list blobs with Python, you cannot directly use the Azure Storage SDK for Python to perform any operations on Azure Data Lake Store Gen 2 unless you have enrolled in the public preview of multi-protocol access on Data Lake Storage, as the note below explains.

    Note

    The features described in this article are available to accounts that have a hierarchical namespace only if you enroll in the public preview of multi-protocol access on Data Lake Storage. To review limitations, see the known issues article.

    So the only solution for uploading data to ADLS Gen2 is to use the ADLS Gen2 REST APIs; please refer to the Azure Data Lake Store REST API reference.

    Here is my sample code to upload data to ADLS Gen2 in Python, and it works fine.

    import requests
    import json
    
    def auth(tenant_id, client_id, client_secret):
        """Request an OAuth2 token for Azure Storage via client credentials.

        Posts the given client id and secret as a form-encoded body to the
        tenant's ``oauth2/v2.0/token`` endpoint, asking for the
        ``https://storage.azure.com/.default`` scope.

        Returns:
            A ``(status_code, body)`` tuple where ``body`` is the response
            text parsed as JSON.

        Raises:
            json.JSONDecodeError: if the response text is not valid JSON.
        """
        print('auth')
        token_url = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"
        form_fields = {
            "client_id": client_id,
            "client_secret": client_secret,
            "scope" : "https://storage.azure.com/.default",
            "grant_type" : "client_credentials"
        }
        response = requests.post(
            token_url,
            headers={"Content-Type": "application/x-www-form-urlencoded"},
            data=form_fields,
        )
        payload = json.loads(response.text)
        return (response.status_code, payload)
    
    def mkfs(account_name, fs_name, access_token):
        """Create a filesystem in the given ADLS Gen2 account.

        Issues a PUT with ``resource=filesystem`` against the account's
        ``dfs.core.windows.net`` endpoint, authorised with the bearer token.

        Returns:
            A ``(status_code, response_text)`` tuple.
        """
        print('mkfs')
        url = f"https://{account_name}.dfs.core.windows.net/{fs_name}?resource=filesystem"
        bearer = {"Authorization": f"Bearer {access_token}"}
        response = requests.put(url, headers=bearer)
        return (response.status_code, response.text)
    
    def mkdir(account_name, fs_name, dir_name, access_token):
        """Create a directory inside an existing filesystem.

        Issues a PUT with ``resource=directory`` for ``fs_name/dir_name``,
        authorised with the bearer token.

        Returns:
            A ``(status_code, response_text)`` tuple.
        """
        print('mkdir')
        url = f"https://{account_name}.dfs.core.windows.net/{fs_name}/{dir_name}?resource=directory"
        bearer = {"Authorization": f"Bearer {access_token}"}
        response = requests.put(url, headers=bearer)
        return (response.status_code, response.text)
        
    def touch_file(account_name, fs_name, dir_name, file_name, access_token):
        """Create a file resource at ``fs_name/dir_name/file_name``.

        Issues a PUT with ``resource=file`` and no request body; data is
        uploaded separately by the append and flush calls.

        Returns:
            A ``(status_code, response_text)`` tuple.
        """
        print('touch_file')
        url = f"https://{account_name}.dfs.core.windows.net/{fs_name}/{dir_name}/{file_name}?resource=file"
        bearer = {"Authorization": f"Bearer {access_token}"}
        response = requests.put(url, headers=bearer)
        return (response.status_code, response.text)
    
    def append_file(account_name, fs_name, path, content, position, access_token):
        """Send ``content`` to the file at ``fs_name/path`` as an append.

        Issues a PATCH with ``action=append`` and the given ``position``,
        sending ``content`` as the request body with a ``text/plain``
        content type.

        Returns:
            A ``(status_code, response_text)`` tuple.
        """
        print('append_file')
        url = f"https://{account_name}.dfs.core.windows.net/{fs_name}/{path}?action=append&position={position}"
        request_headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "text/plain",
            "Content-Length": str(len(content)),
        }
        response = requests.patch(url, headers=request_headers, data=content)
        return (response.status_code, response.text)
        
    def flush_file(account_name, fs_name, path, position, access_token):
        """Flush the file at ``fs_name/path`` up to ``position``.

        Issues a PATCH with ``action=flush`` and the given ``position`` and
        no request body.

        Returns:
            A ``(status_code, response_text)`` tuple.
        """
        print('flush_file')
        url = f"https://{account_name}.dfs.core.windows.net/{fs_name}/{path}?action=flush&position={position}"
        bearer = {"Authorization": f"Bearer {access_token}"}
        response = requests.patch(url, headers=bearer)
        return (response.status_code, response.text)
    
    def mkfile(account_name, fs_name, dir_name, file_name, local_file_name, access_token):
        """Upload a local file to ``fs_name/dir_name/file_name``.

        The upload is three REST calls: create the file resource, append the
        local file's bytes at position 0, then flush at the total length.
        Each step runs only if the previous one succeeded; on failure the
        error is printed and the function returns without raising.

        The whole local file is read into memory and sent in one append.

        Args:
            account_name: Storage account name used to build the endpoint URL.
            fs_name: Target filesystem name.
            dir_name: Target directory inside the filesystem.
            file_name: Name of the remote file to create.
            local_file_name: Path of the local file to read (opened in binary mode).
            access_token: Bearer token sent with every request.

        Returns:
            None.

        Raises:
            OSError: if the local file cannot be opened or read.
        """
        print('mkfile')
        status_code, result = touch_file(account_name, fs_name, dir_name, file_name, access_token)
        if status_code != 201:
            print(result)
            return

        path = f"{dir_name}/{file_name}"
        with open(local_file_name, 'rb') as local_file:
            content = local_file.read()

        # Previously the append result was ignored, so a rejected upload went
        # unnoticed and a flush was attempted anyway.
        status_code, result = append_file(account_name, fs_name, path, content, 0, access_token)
        if not 200 <= status_code < 300:
            print(status_code, result)
            return

        # The flush position is the total number of bytes appended.
        status_code, result = flush_file(account_name, fs_name, path, len(content), access_token)
        if not 200 <= status_code < 300:
            print(status_code, result)
            
        
    if __name__ == '__main__':
        # Fill in the placeholders below before running the sample.
        tenant_id = '<your tenant id>'
        client_id = '<your client id>'
        client_secret = '<your client secret>'
        
        account_name = '<your adls account name>'
        fs_name = '<your filesystem name>'
        dir_name = '<your directory name>'
        file_name = '<your file name>'
        local_file_name = '<your local file name>'
        
        # Acquire an access token
        auth_status_code, auth_result = auth(tenant_id, client_id, client_secret)
        if auth_status_code != 200:
            # Stop here instead of sending every later request with an empty
            # bearer token.
            print(auth_status_code, auth_result)
            raise SystemExit(1)
        access_token = auth_result['access_token']
        # The token is a credential, so it is deliberately not printed.
        print('access token acquired')
        
        # Create a filesystem
        mkfs_status_code, mkfs_result = mkfs(account_name, fs_name, access_token)
        print(mkfs_status_code, mkfs_result)
        
        # Create a directory
        mkdir_status_code, mkdir_result = mkdir(account_name, fs_name, dir_name, access_token)
        print(mkdir_status_code, mkdir_result)
        
        # Create a file from local file
        mkfile(account_name, fs_name, dir_name, file_name, local_file_name, access_token)
    

    Hope it helps.

    0 讨论(0)
提交回复
热议问题