I just started learning Python, and I have a sorted list of protein sequences (59,000 sequences in total), some of which overlap. Here is a toy list as an example:
# Toy input for the prefix filter; the real list is assumed to be sorted.
pattern = [
    "ABCDE",
    "ABCDEFG",
    "ABCDEFGH",
    "ABCDEFGHIJKLMNO",
    "CEST",
    "DBTSFDE",
    "DBTSFDEO",
    "EOEUDNBNUW",
    "EAEUDNBNUW",
    "FG",
    "FGH",
]
# Flip the list so that pop() from the end hands the sequences back
# in their original (front-to-back) order.
pattern.reverse()
def iterate_patterns(patterns=None):
    """Yield every sequence that is not a prefix of a sequence still queued.

    Sequences are taken one at a time from the END of the working list.
    A sequence is dropped when any sequence still in the list starts with
    it (an exact duplicate counts as such a match); otherwise it is
    yielded. Only sequences that have not been popped yet are compared,
    so the result depends on the order of the list.

    Args:
        patterns: Optional iterable of strings to filter. It is copied
            into a new list, so the caller's object is left untouched.
            When omitted, the module-level ``pattern`` list is used and
            consumed in place (it is empty once the generator has been
            exhausted).

    Yields:
        The surviving sequences, in the order they are popped (last
        element of the working list first).
    """
    remaining = pattern if patterns is None else list(patterns)
    while remaining:
        candidate = remaining.pop()
        # Each candidate is checked against every sequence still queued,
        # stopping at the first one that extends it.
        if not any(other.startswith(candidate) for other in remaining):
            yield candidate
# Drain the generator into a list and show the surviving sequences.
print([*iterate_patterns()])
Output:
['ABCDEFGHIJKLMNO', 'CEST', 'DBTSFDEO', 'EOEUDNBNUW', 'EAEUDNBNUW', 'FGH']