I am processing many text files, some of which contain uuencoded data (the payload can be a .jpg, .pdf, .zip, .xlsx, etc.). I don't care about the embedded uuencoded data, so I would like to strip it out.
I coded up what was supposed to be a rather simple generator. Because the spec is slightly tedious (why two separate end markers, on different lines?), it came out rather bulky, but here goes. It should also work as a uuencode validator, but I have only tested it in very limited settings.
import re

def unuuencode(iterator, collector=None, ignore_length_errors=False):
    """
    Yield lines from iterator, except when they are inside a uuencode blob.
    If collector is not None, append the uuencoded blobs to it, each blob
    as a list of its raw lines.
    """
    state = None  # one of { None, 'in_blob', 'closing', 'closed' }
    collectitem = None
    regex = re.compile(r'^begin\s+[0-7]{3,6}\s+.*?(?:\r?\n)?$')
    for line in iterator:
        if state is None:
            if regex.match(line):
                # "begin <octal mode> <filename>" opens a blob
                if collector is not None:
                    collectitem = [line]
                state = 'in_blob'
                continue
            else:
                yield line
        else:
            stripped = line.rstrip('\r\n')
            if state == 'in_blob' and line.startswith('`'):
                state = 'closing'
            if state == 'closing':
                # first end marker: a line containing only "`"
                if stripped != '`':
                    raise ValueError('Expected "`" but got "%s"' % line)
                state = 'closed'
            elif state == 'closed':
                # second end marker: a line containing only "end"
                if stripped != 'end':
                    raise ValueError('Expected "end" but got "%s"' % line)
                state = None
            else:
                # data line: the first character encodes the decoded byte count
                expect = ord(line[0:1]) - 32
                actual = len(stripped)
                seen = (len(stripped) - 1) * 6 // 8
                if seen != expect:
                    if not ignore_length_errors:
                        raise ValueError(
                            'Wrong prefix on line: %s '
                            '(indicated %i, 6/8 %i, actual length %i)' % (
                                line, expect, seen, actual))
                if line[0:1] != 'M':
                    # anything shorter than a full 45-byte line should be the last data line
                    state = 'closing'
            if collectitem:
                collectitem.append(line)
            if state is None:
                if collectitem:
                    collector.append(collectitem)
                collectitem = None
                continue
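A quick in-memory sanity check (the sample text and the file name sample.bin are made up, not from your data) shows both the framing with the two separate end markers and what ends up where:

import binascii
import io

payload = b'Hello world!'                      # 12 bytes, a multiple of 3, so the
data_line = binascii.b2a_uu(payload).decode()  # generator's length check is exact
sample = (
    'some text before\n'
    'begin 644 sample.bin\n'
    + data_line +
    '`\n'
    'end\n'
    'some text after\n'
)
blobs = []
kept = list(unuuencode(io.StringIO(sample), collector=blobs))
print(kept)           # ['some text before\n', 'some text after\n']
print(len(blobs[0]))  # 4 -- the begin line, one data line, "`" and "end"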
Use it like this:
with open(file, 'r') as f:
    lines = [x for x in unuuencode(f)]
or like this:
with open(file, 'r') as f:
    blobs = []
    lines = [x for x in unuuencode(f, collector=blobs)]
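The collected items are just the raw lines of each blob, including the begin line and both end markers. If you ever do want the embedded data after all, a sketch along these lines (binascii is in the standard library) would decode the first collected blob; it assumes blobs was filled as above:

import binascii

blob = blobs[0]
name = blob[0].split(None, 2)[2].rstrip('\r\n')           # file name from the begin line
data = b''.join(binascii.a2b_uu(l) for l in blob[1:-2])   # skip begin, "`" and "end"
print(name, len(data))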
Or, if you already have your content as a list of lines rather than an open file:
with open(file, 'r') as f:
    lines = f.read().split('\n')
    # ... or whichever way you obtained your content as an array of lines
lines = [x for x in unuuencode(lines)]
or in the case of the code you seem to be using:
import sys
from bs4 import BeautifulSoup

for fi in sys.argv[1:]:
    with open(fi) as markup:
        soup = BeautifulSoup(''.join(unuuencode(markup, ignore_length_errors=True)))
    with open("strip_" + fi, "wb") as f:
        f.write(soup.get_text().encode('utf-8'))
The sample you linked to had an invalid length indicator in the second uuencoded blob, so I added an option to ignore that.
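For what it's worth, the length check works like this: the first character of a data line encodes the number of decoded bytes, and four encoded characters carry three bytes, so for a full 45-byte line (prefix M) the arithmetic comes out as:

line = 'M' + 'X' * 60            # hypothetical full data line: prefix plus 60 encoded chars
expect = ord(line[0]) - 32       # 'M' -> 45 decoded bytes
seen = (len(line) - 1) * 6 // 8  # 60 chars * 6 bits / 8 bits per byte = 45 bytes
assert expect == seen == 45

The 6/8 heuristic is only exact when the byte count is a multiple of three, so a legitimate short final line can also trip it; ignore_length_errors=True skips the check in either case.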