I just started learning Python, and I have a sorted list of protein sequences (59,000 sequences in total), some of which overlap. I have made a toy list here as an example:
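You can use a binary search tree whose insertion step looks for an existing node whose value is a prefix of the new value and, if it finds one, replaces that node's value with the longer sequence: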
class Tree:
    """Binary search tree of strings that keeps only the longest sequences.

    When a value is inserted, any stored value that is a prefix of it is
    replaced by the longer value, and a value that is itself a prefix of
    (or equal to) a stored value is discarded.  ``flatten`` returns the
    surviving values in ascending order.

    Both operations are iterative: sorted input turns the tree into a long
    right-leaning chain, and a recursive walk over such a chain exceeds
    Python's recursion limit after roughly a thousand levels.
    """

    def __init__(self, val=None):
        """Create a node holding ``val``; ``None`` marks an empty tree."""
        self.left, self.value, self.right = None, val, None

    def insert_val(self, _val):
        """Insert ``_val``, merging it with any prefix-related stored value.

        Args:
            _val: The string to insert.
        """
        node = self
        while True:
            if node.value is None or _val.startswith(node.value):
                # Empty node, or the stored value is a prefix of the new
                # one: keep the longer sequence in place.
                node.value = _val
                return
            if node.value.startswith(_val):
                # The new value is already covered by a longer stored one.
                return
            if _val < node.value:
                if node.left is None:
                    node.left = Tree(_val)
                    return
                node = node.left
            else:
                if node.right is None:
                    node.right = Tree(_val)
                    return
                node = node.right

    def flatten(self):
        """Return the stored values as a list in ascending order.

        An empty tree yields ``[]``.
        """
        ordered = []
        pending = []  # ancestors whose value and right subtree are still due
        node = self
        while pending or node is not None:
            while node is not None:
                pending.append(node)
                node = node.left
            node = pending.pop()
            if node.value is not None:  # skip the placeholder of an empty tree
                ordered.append(node.value)
            node = node.right
        return ordered
# Build the tree from a file holding one sequence per line, then print the
# sequences that survive prefix merging.
t = Tree()
# The context manager closes the file even if an insertion raises.
with open('filename.txt') as sequence_file:
    for line in sequence_file:
        sequence = line.strip('\n')
        if sequence:  # a blank line is not a sequence
            t.insert_val(sequence)
print(t.flatten())
Output:
['ABCDEFGHIJKLMNO', 'CEST', 'DBTSFDEO', 'EAEUDNBNUW', 'EOEUDNBNUW', 'FGH']