I have a string that kind of looks like this:
\"stuff . // : /// more-stuff .. .. ...$%$% stuff -> DD\"
and I want to strip off all p
This works in Python 3. It collapses each run of one repeated whitespace character down to a single copy of that same character, so a tab and a space next to each other won't collapse into a single character.
def collapse_whitespace_characters(raw_text):
    """Collapse each run of one repeated whitespace character to a single copy.

    Only *identical* adjacent whitespace characters are merged: two tabs
    become one tab, but a tab followed by a space is left untouched.

    Args:
        raw_text: The string to clean up.

    Returns:
        A new string in which every run of the same whitespace character is
        squeezed to one occurrence. Empty and one-character strings come
        back unchanged.
    """
    # Pair every character with its predecessor. The first character has no
    # predecessor and is therefore always kept; slicing with [:1] also copes
    # with the empty string.
    kept = (
        cur_char
        for prev_char, cur_char in zip(raw_text, raw_text[1:])
        if not cur_char.isspace() or cur_char != prev_char
    )
    # join() builds the result in one pass instead of repeated `+=`.
    return raw_text[:1] + ''.join(kept)
This one collapses each run of mixed whitespace into the first whitespace character of that run:
def collapse_whitespace(raw_text):
    """Collapse every run of whitespace to the first character of that run.

    Unlike a same-character squeeze, any mix of whitespace characters
    (spaces, tabs, newlines, non-breaking spaces, ...) counts as one run;
    only the character that starts the run survives.

    Args:
        raw_text: The string to clean up.

    Returns:
        A new string in which no two whitespace characters are adjacent.
        Empty and one-character strings come back unchanged.
    """
    # A character is dropped only when both it and its predecessor are
    # whitespace. The first character has no predecessor and is always kept;
    # slicing with [:1] also copes with the empty string.
    kept = (
        cur_char
        for prev_char, cur_char in zip(raw_text, raw_text[1:])
        if not (cur_char.isspace() and prev_char.isspace())
    )
    # join() builds the result in one pass instead of repeated `+=`.
    return raw_text[:1] + ''.join(kept)
>>> collapse_whitespace_characters('we like spaces and\t\t TABS\tAND WHATEVER\xa0\xa0IS')
'we like spaces and\t TABS\tAND WHATEVER\xa0IS'
>>> collapse_whitespace('we like spaces and\t\t TABS\tAND WHATEVER\xa0\xa0IS')
'we like spaces and\tTABS\tAND WHATEVER\xa0IS'
For punctuation:
def collapse_punctuation(raw_text):
    """Collapse each run of one repeated non-alphanumeric character.

    Alphanumeric characters are always kept, even when repeated ("aa" stays
    "aa"). Any other character is dropped when it is identical to the
    character right before it, so "..." becomes "." while ".,." is left
    alone.

    Note that the test is ``str.isalnum``, not "is punctuation": repeated
    identical whitespace characters and underscores are collapsed as well.

    Args:
        raw_text: The string to clean up.

    Returns:
        A new string with runs of the same non-alphanumeric character
        squeezed to one occurrence. Empty and one-character strings come
        back unchanged.
    """
    # Pair every character with its predecessor. The first character has no
    # predecessor and is therefore always kept; slicing with [:1] also copes
    # with the empty string.
    kept = (
        cur_char
        for prev_char, cur_char in zip(raw_text, raw_text[1:])
        if cur_char.isalnum() or cur_char != prev_char
    )
    # join() builds the result in one pass instead of repeated `+=`.
    return raw_text[:1] + ''.join(kept)
To actually answer the question:
# Upper-case the alphanumeric characters, turn everything else into a space,
# then squeeze the resulting runs of spaces down to a single space each.
orig = 'stuff . // : /// more-stuff .. .. ...$%$% stuff -> DD'
collapse_whitespace(''.join(
    ' ' if not c.isalnum() else c.upper()
    for c in orig
))
As said, the regexp would be something like:
# Replace every run of non-word characters with one space, then upper-case.
# The pattern is a raw string so that \W reaches the regex engine verbatim;
# in a plain literal '\W' is an invalid string escape and triggers a warning.
# Note: \W does not match '_', so underscores are left in place.
re.sub(r'\W+', ' ', orig).upper()