I was searching for a way to strip out pictures from these file types and this is the solution I came up with. It iterates through a given directory structure, copies any files
Excel (.xlsx) files are ZIP archives, so it is easy to extract images from an Excel or .docx file:
import io
import os
import zipfile

from PIL import Image, ImageFilter
# Gaussian blur filter constructed with the argument 40 (the blur radius).
# NOTE(review): nothing else in the visible code references this filter.
blur = ImageFilter.GaussianBlur(40)
# Archive member suffixes treated as images (compared case-insensitively).
_IMAGE_EXTENSIONS = (".png", ".jpeg", ".jpg", ".gif")


def redact_images(filename: str, FilePath: str) -> None:
    """Copy an Office ZIP archive and export every image embedded in it.

    The archive at ``filename`` is read member by member and rewritten to a
    sibling file whose name has ``_redacted`` inserted before the extension
    (``book.xlsx`` -> ``book_redacted.xlsx``). Members whose name ends in
    ``.png``, ``.jpeg``, ``.jpg`` or ``.gif`` are decoded with Pillow, saved
    into ``FilePath`` under their base name, and re-encoded into the output
    archive. All other members are copied unchanged.

    Note: the images are only re-encoded, not visually altered; no filter
    is applied to them here.

    Args:
        filename: Path of the input archive (.xlsx, .docx, ...).
        FilePath: Directory that receives the extracted images. It is
            created if missing; an empty string means the current
            working directory.

    Raises:
        zipfile.BadZipFile: If ``filename`` is not a valid ZIP archive.
        OSError: If the input cannot be read or an output cannot be written.
    """
    # Derive the output name from the real extension. A plain
    # str.replace(".xlsx", ...) returns the input path unchanged for any
    # other extension, and opening that path in "w" mode would truncate the
    # archive while it is still being read.
    root, ext = os.path.splitext(filename)
    outfile = root + "_redacted" + ext

    if FilePath:
        os.makedirs(FilePath, exist_ok=True)

    with zipfile.ZipFile(filename) as inzip, \
            zipfile.ZipFile(outfile, "w") as outzip:
        for info in inzip.infolist():
            name = info.filename
            content = inzip.read(info)
            if name.lower().endswith(_IMAGE_EXTENSIONS):
                # Pillow registers the format as "jpeg"; "jpg" is only a
                # file extension, so map it before using it as a format name.
                fmt = name.rsplit(".", 1)[-1].lower()
                if fmt == "jpg":
                    fmt = "jpeg"
                image_name = name.rsplit("/", 1)[-1]
                img = Image.open(io.BytesIO(content))
                # os.path.join works whether or not FilePath ends with a
                # path separator.
                img.save(os.path.join(FilePath, image_name))
                buffer = io.BytesIO()
                img.save(buffer, fmt)
                content = buffer.getvalue()
            # writestr() recomputes file_size and CRC from the data it is
            # given, so the reused ZipInfo needs no manual patching.
            outzip.writestr(info, content)
filename : Path of the input Excel file
FilePath : Directory in which to save the extracted images (include the trailing path separator)