I am looking for a way to unzip nested zip files in Python. For example, consider the following structure (hypothetical names for ease):
Unfortunately, decompressing zip files requires random access to the archive, and the ZipFile
methods (not to mention the DEFLATE algorithm itself) only provide streams. It is therefore impossible to decompress nested zip files without first extracting the inner archive, either to disk or into an in-memory buffer.
This works for me. Place this script in the same directory as the nested zip and run it. It also counts the total number of files within the nested zip.
import os
from zipfile import ZipFile
def unzip(path, total_count):
    """Extract every ``.zip`` archive under *path* in place and count files.

    Each archive ``name.zip`` is extracted into a sibling directory ``name``
    and then deleted. Archives that appear inside extracted content are
    handled the same way, to any depth.

    Args:
        path: Directory to scan.
        total_count: Running count to add to (pass 0 for a fresh count).

    Returns:
        ``total_count`` plus the number of non-``.zip`` files found under
        *path* once every archive has been extracted.
    """
    for root, dirs, files in os.walk(path):
        for name in files:
            # splitext keeps a bare ".zip" dotfile as a regular file; it has
            # no stem to name an extraction directory after.
            stem, ext = os.path.splitext(name)
            if ext != '.zip':
                total_count += 1
                continue

            archive = os.path.join(root, name)
            target = os.path.join(root, stem)
            if not os.path.isdir(target):
                os.makedirs(target)
            with ZipFile(archive) as zip_obj:
                zip_obj.extractall(target)
            os.remove(archive)

            # Let the top-down walk descend into the extraction directory
            # instead of recursing. A directory that already existed is
            # already in ``dirs``, so it is visited exactly once and its
            # files are not counted twice.
            if stem not in dirs:
                dirs.append(stem)
    return total_count
# Extract every archive found under the current directory, then print how
# many non-zip files were counted.
total_count = unzip ('.', 0)
print(total_count)
ZipFile needs a file-like object, so you can use StringIO to turn the data you read from the nested zip into such an object. The caveat is that you'll be loading the full (still compressed) inner zip into memory.
# Python 2 variant: cStringIO does not exist on Python 3.
import cStringIO
import zipfile

# Read a file from an archive stored inside another archive without writing
# the inner archive to disk.
with zipfile.ZipFile('foo.zip') as z:
    # The whole (still compressed) inner archive is held in memory.
    z2_filedata = cStringIO.StringIO(z.read('nested.zip'))
    with zipfile.ZipFile(z2_filedata) as nested_zip:
        # ZipFile.read opens and closes the member itself, so no handle leaks.
        print(nested_zip.read('data.txt'))
I use Python 3.7.3, where `io.BytesIO` takes the place of `cStringIO.StringIO`:
import zipfile
import io
# Read a file from an archive stored inside another archive without writing
# the inner archive to disk.
with zipfile.ZipFile('all.zip') as z:
    # The whole (still compressed) inner archive is held in memory.
    z2_filedata = io.BytesIO(z.read('nested.zip'))
    with zipfile.ZipFile(z2_filedata) as nested_zip:
        # ZipFile.read opens and closes the member itself, so no handle leaks.
        print(nested_zip.read('readme.md'))
For those looking for a function that extracts a nested zip file (any level of nesting) and cleans up the original zip files:
import zipfile, re, os
def extract_nested_zip(zippedFile, toFolder):
    """Unzip a zip file and its contents, including nested zip files.

    Delete the zip file(s) after extraction.

    Only archives that came out of ``zippedFile`` (directly or through
    further nesting) are extracted and removed; unrelated ``.zip`` files that
    already exist under ``toFolder`` are left alone.

    Args:
        zippedFile: Path of the archive to extract. It is deleted afterwards.
        toFolder: Directory to extract into. Each nested archive is extracted
            into the directory it was itself extracted to.
    """
    with zipfile.ZipFile(zippedFile, 'r') as zfile:
        # extract() returns the sanitised on-disk path of each member, so
        # there is no need to re-scan the folder to find what was written.
        extracted = [zfile.extract(info, toFolder) for info in zfile.infolist()]
    os.remove(zippedFile)

    for path in extracted:
        # isfile() skips directory entries and members that a previous
        # iteration already consumed (e.g. duplicate names in the archive).
        if path.endswith('.zip') and os.path.isfile(path):
            extract_nested_zip(path, os.path.dirname(path))
Here's a function I came up with.
def extract_nested_zipfile(path, parent_zip=None):
    """Returns a ZipFile specified by path, even if the path contains
    intermediary ZipFiles.  For example, /root/gparent.zip/parent.zip/child.zip
    will return a ZipFile that represents child.zip

    Nested archives are read fully into memory; nothing is written to disk.
    Intermediate archives opened along the way are closed before returning.

    Args:
        path: Filesystem path, optionally continuing into one or more nested
            archives after a component ending in ``.zip``.
        parent_zip: Already-open ZipFile that ``path`` is relative to, or
            None when ``path`` starts on disk. A ZipFile passed in by the
            caller is never closed here.

    Returns:
        An open zipfile.ZipFile for the innermost archive named by ``path``.
    """
    import io

    def extract_inner_zipfile(parent_zip, child_zip_path):
        """Returns a ZipFile specified by child_zip_path that exists inside
        parent_zip.
        """
        # Member names inside an archive always use '/', whatever os.sep is.
        member = child_zip_path.replace(os.sep, '/')
        # ZipFile.read closes the member handle; BytesIO is the binary
        # in-memory buffer on both Python 2 and 3.
        return zipfile.ZipFile(io.BytesIO(parent_zip.read(member)))

    # Normalise separators so the ".zip<sep>" marker is found reliably.
    path = os.path.normpath(path)
    marker = '.zip' + os.sep

    if marker not in path:
        if parent_zip is None:
            # If there is no nesting, it's easy!
            return zipfile.ZipFile(path)
        return extract_inner_zipfile(parent_zip, path)

    parent_zip_path, child_zip_path = path.split(marker, 1)
    parent_zip_path += '.zip'
    if parent_zip is None:
        # This is the top-level, so read from disk
        outer = zipfile.ZipFile(parent_zip_path)
    else:
        # We're already in a zip, so pull it out and recurse
        outer = extract_inner_zipfile(parent_zip, parent_zip_path)

    # The result is backed by its own in-memory copy, so the intermediate
    # archive opened above can be closed as soon as the recursion returns.
    with outer:
        return extract_nested_zipfile(child_zip_path, outer)
Here's how I tested it:
# Build three levels of nesting: wrap3.zip > wrap2.zip > wrap1.zip > hi.txt
echo hello world > hi.txt
zip wrap1.zip hi.txt
zip wrap2.zip wrap1.zip
zip wrap3.zip wrap2.zip
# Each call should print the contents of hi.txt ("hello world"), whatever the
# nesting depth. print(...) with one argument works on Python 2 and 3 alike.
print(extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap1.zip').read('hi.txt'))
print(extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap2.zip/wrap1.zip').read('hi.txt'))
print(extract_nested_zipfile('/Users/mattfaus/dev/dev-git/wrap3.zip/wrap2.zip/wrap1.zip').read('hi.txt'))