I can do basic regex alright, but this is slightly different, namely I don\'t know what the pattern is going to be.
For example, I have a list of similar strings:
<
This solution finds the two longest common substrings and uses them to delimit the input strings:
def an_answer_to_stackoverflow_question_1914394(lst):
"""
>>> lst = ['asometxt0moretxt', 'bsometxt1moretxt', 'aasometxt10moretxt', 'zzsometxt999moretxt']
>>> an_answer_to_stackoverflow_question_1914394(lst)
(['sometxt', 'moretxt'], [('a', '0'), ('b', '1'), ('aa', '10'), ('zz', '999')])
"""
delimiters = find_delimiters(lst)
return delimiters, list(split_strings(lst, delimiters))
find_delimiters
and friends finds the delimiters:
import itertools
def find_delimiters(lst):
"""
>>> lst = ['asometxt0moretxt', 'bsometxt1moretxt', 'aasometxt10moretxt', 'zzsometxt999moretxt']
>>> find_delimiters(lst)
['sometxt', 'moretxt']
"""
candidates = list(itertools.islice(find_longest_common_substrings(lst), 3))
if len(candidates) == 3 and len(candidates[1]) == len(candidates[2]):
raise ValueError("Unable to find useful delimiters")
if candidates[1] in candidates[0]:
raise ValueError("Unable to find useful delimiters")
return candidates[0:2]
def find_longest_common_substrings(lst):
"""
>>> lst = ['asometxt0moretxt', 'bsometxt1moretxt', 'aasometxt10moretxt', 'zzsometxt999moretxt']
>>> list(itertools.islice(find_longest_common_substrings(lst), 3))
['sometxt', 'moretxt', 'sometx']
"""
for i in xrange(min_length(lst), 0, -1):
for substring in common_substrings(lst, i):
yield substring
def min_length(lst):
return min(len(item) for item in lst)
def common_substrings(lst, length):
"""
>>> list(common_substrings(["hello", "world"], 2))
[]
>>> list(common_substrings(["aabbcc", "dbbrra"], 2))
['bb']
"""
assert length <= min_length(lst)
returned = set()
for i, item in enumerate(lst):
for substring in all_substrings(item, length):
in_all_others = True
for j, other_item in enumerate(lst):
if j == i:
continue
if substring not in other_item:
in_all_others = False
if in_all_others:
if substring not in returned:
returned.add(substring)
yield substring
def all_substrings(item, length):
"""
>>> list(all_substrings("hello", 2))
['he', 'el', 'll', 'lo']
"""
for i in range(len(item) - length + 1):
yield item[i:i+length]
split_strings
splits the strings using the delimiters:
import re
def split_strings(lst, delimiters):
"""
>>> lst = ['asometxt0moretxt', 'bsometxt1moretxt', 'aasometxt10moretxt', 'zzsometxt999moretxt']
>>> list(split_strings(lst, find_delimiters(lst)))
[('a', '0'), ('b', '1'), ('aa', '10'), ('zz', '999')]
"""
for item in lst:
parts = re.split("|".join(delimiters), item)
yield tuple(part for part in parts if part != '')