Paramiko fails to download large files (>1GB)

一向 2020-12-13 10:44
def download():
    if not os.path.exists(dst_dir_path):
        logger.error("Cannot access destination folder %s. Please check path and permissions." % dst_dir_path)
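
For context, the rest of the routine presumably looks something like this minimal sketch (the host, credentials, and paths below are placeholders, not values from the question):

import os
import logging
import paramiko

logger = logging.getLogger(__name__)
dst_dir_path = "/local/destination"  # placeholder -- not shown in the question

def download():
    if not os.path.exists(dst_dir_path):
        logger.error("Cannot access destination folder %s. Please check path and permissions." % dst_dir_path)
        return
    transport = paramiko.Transport(("sftp.example.com", 22))  # placeholder host
    try:
        transport.connect(username="user", password="secret")  # placeholder credentials
        sftp = paramiko.SFTPClient.from_transport(transport)
        # A plain get() is what typically hangs or raises EOFError/SSHException
        # on multi-GB transfers -- the failure this question describes.
        sftp.get("/remote/bigfile.bin", os.path.join(dst_dir_path, "bigfile.bin"))
    finally:
        transport.close()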


        
7 Answers
  • 2020-12-13 11:25

    I use this type of script with paramiko for larger files. You can play around with the window_size / packet size to see what works best for you. If you want more throughput, you can run parallel processes that read different chunks of the file concurrently using the second method (see http://docs.paramiko.org/en/latest/api/sftp.html#paramiko.sftp_file.SFTPFile.readv).

    import time, paramiko
    
    MAX_RETRIES = 10
    
    ftp_server = "ftp.someserver.com"
    port = 22
    sftp_file = "/somefolder/somefile.txt"
    local_file = "/somefolder/somewhere/here.txt"
    ssh_conn = sftp_client = None
    username = "username"
    password = "password"
    
    start_time = time.time()
    
    for retry in range(MAX_RETRIES):
        try:
            ssh_conn = paramiko.Transport((ftp_server, port))
            ssh_conn.connect(username=username, password=password)
            # method 1 using sftpfile.get and settings window_size, max_packet_size
            window_size = pow(4, 12)  # about 16 MB
            max_packet_size = pow(4, 12)
            sftp_client = paramiko.SFTPClient.from_transport(ssh_conn, window_size=window_size, max_packet_size=max_packet_size)
            sftp_client.get(sftp_file, local_file)
            # method 2 breaking up file into chunks to read in parallel
            sftp_client.close()
            sftp_client = paramiko.SFTPClient.from_transport(ssh_conn)
            filesize = sftp_client.stat(sftp_file).st_size
            chunksize = pow(4, 12)  # <-- adjust this and benchmark speed
            chunks = [(offset, chunksize) for offset in range(0, filesize, chunksize)]
            with sftp_client.open(sftp_file, "rb") as infile:
                with open(local_file, "wb") as outfile:
                    for chunk in infile.readv(chunks):
                        outfile.write(chunk)
            break
        except (EOFError, paramiko.ssh_exception.SSHException, OSError) as x:
            print("%s %s - > retrying %s..." % (type(x), x, retry + 1))
            # back off in steps of 10, 20, ... seconds
            time.sleep((retry + 1) * 10)
        finally:
            if sftp_client is not None:
                sftp_client.close()
            if ssh_conn is not None:
                ssh_conn.close()
    
    
    print("Loading File %s Took %d seconds " % (sftp_file, time.time() - start_time))
    

    If you are really concerned about performance, you can take the second method further and split the work across multiple processes or threads. Here's a code sample using multi-threading that writes multiple file parts and then joins them into one file.

    import threading, os, time, paramiko
    
    # connection settings -- same values as the first example
    ftp_server = "ftp.someserver.com"
    port = 22
    sftp_file = "/somefolder/somefile.txt"
    local_file = "/somefolder/somewhere/here.txt"
    username = "username"
    password = "password"
    
    # you could make the number of threads relative to file size
    NUM_THREADS = 4
    MAX_RETRIES = 10
    
    def make_filepart_path(file_path, part_number):
        """creates filepart path from filepath"""
        return "%s.filepart.%s" % (file_path, part_number+1)
    
    def write_chunks(chunks, tnum, local_file_part, username, password, ftp_server, max_retries):
        ssh_conn = sftp_client = None
        for retry in range(max_retries):
            try:
                ssh_conn = paramiko.Transport((ftp_server, port))
                ssh_conn.connect(username=username, password=password)
                sftp_client = paramiko.SFTPClient.from_transport(ssh_conn)
                with sftp_client.open(sftp_file, "rb") as infile:
                    with open(local_file_part, "wb") as outfile:
                        for chunk in infile.readv(chunks):
                            outfile.write(chunk)
                break
            except (EOFError, paramiko.ssh_exception.SSHException, OSError) as x:
                print("%s %s Thread %s - > retrying %s..." % (type(x), x, tnum, retry + 1))
                # back off in steps of 10, 20, ... seconds
                time.sleep((retry + 1) * 10)
            finally:
                if sftp_client is not None:
                    sftp_client.close()
                if ssh_conn is not None:
                    ssh_conn.close()
    
    
    
    start_time = time.time()
    
    ssh_conn = sftp_client = None
    for retry in range(MAX_RETRIES):
        try:
            ssh_conn = paramiko.Transport((ftp_server, port))
            ssh_conn.connect(username=username, password=password)
            sftp_client = paramiko.SFTPClient.from_transport(ssh_conn)
            # connect to get the file's size in order to calculate chunks
            filesize = sftp_client.stat(sftp_file).st_size
            sftp_client.close()
            ssh_conn.close()
            chunksize = pow(4, 12)
            chunks = [(offset, chunksize) for offset in range(0, filesize, chunksize)]
            thread_chunk_size = (len(chunks) // NUM_THREADS) + 1
            # break the chunks into sub lists to hand off to threads
            thread_chunks = [chunks[i:i+thread_chunk_size] for i in range(0, len(chunks), thread_chunk_size)]
            threads = []
            fileparts = []
            for thread_num in range(len(thread_chunks)):
                local_file_part = make_filepart_path(local_file, thread_num) 
                args = (thread_chunks[thread_num], thread_num, local_file_part, username, password, ftp_server, MAX_RETRIES)
                threads.append(threading.Thread(target=write_chunks, args=args))
                fileparts.append(local_file_part)
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
            # join file parts into one file, remove fileparts
            with open(local_file, "wb") as outfile:
                for filepart in fileparts:
                    with open(filepart, "rb") as infile:
                        outfile.write(infile.read())
                    os.remove(filepart)
            break
        except (EOFError, paramiko.ssh_exception.SSHException, OSError) as x:
            print("%s %s - > retrying %s..." % (type(x), x, retry + 1))
            # back off in steps of 10, 20, ... seconds
            time.sleep((retry + 1) * 10)
        finally:
            if sftp_client is not None:
                sftp_client.close()
            if ssh_conn is not None:
                ssh_conn.close()
    
    
    print("Loading File %s Took %d seconds " % (sftp_file, time.time() - start_time))
    