Python string pattern recognition/compression

前端 未结 6 906
隐瞒了意图╮
隐瞒了意图╮ 2021-02-15 14:45

I can do basic regex all right, but this is slightly different: I don't know in advance what the pattern is going to be.

For example, I have a list of similar strings:

<
6条回答
  •  夕颜
    夕颜 (楼主)
    2021-02-15 15:42

    This solution finds the two longest common substrings and uses them to delimit the input strings:

    def an_answer_to_stackoverflow_question_1914394(lst):
        """
        Return the shared delimiters of lst and the per-string variable parts.

        The result is a pair: the list of delimiters reported by
        find_delimiters, and a list holding one tuple of non-delimiter
        pieces per input string, as produced by split_strings.

        >>> lst = ['asometxt0moretxt', 'bsometxt1moretxt', 'aasometxt10moretxt', 'zzsometxt999moretxt']
        >>> an_answer_to_stackoverflow_question_1914394(lst)
        (['sometxt', 'moretxt'], [('a', '0'), ('b', '1'), ('aa', '10'), ('zz', '999')])
        """
        separators = find_delimiters(lst)
        # split_strings is lazy; materialize it so callers get a real list.
        variable_parts = list(split_strings(lst, separators))
        return separators, variable_parts
    

    find_delimiters and its helper functions find the delimiters:

    import itertools
    
    def find_delimiters(lst):
        """
        Return the two longest substrings common to every string in lst.

        The first three results of find_longest_common_substrings are
        inspected; the first two are returned as the delimiters.

        Raises ValueError ("Unable to find useful delimiters") when:
          * fewer than two common substrings exist,
          * the second and third candidates have the same length, or
          * the second candidate is contained in the first.

        >>> lst = ['asometxt0moretxt', 'bsometxt1moretxt', 'aasometxt10moretxt', 'zzsometxt999moretxt']
        >>> find_delimiters(lst)
        ['sometxt', 'moretxt']
        """
        candidates = list(itertools.islice(find_longest_common_substrings(lst), 3))
        # Without at least two candidates the indexing below would fail with
        # an IndexError; report it as the same ValueError as the other
        # "no usable delimiters" cases.
        if len(candidates) < 2:
            raise ValueError("Unable to find useful delimiters")
        # A length tie between the 2nd and 3rd candidate is rejected
        # (presumably because the choice of second delimiter is ambiguous).
        if len(candidates) == 3 and len(candidates[1]) == len(candidates[2]):
            raise ValueError("Unable to find useful delimiters")
        # The second delimiter must not be a piece of the first one.
        if candidates[1] in candidates[0]:
            raise ValueError("Unable to find useful delimiters")
        return candidates[0:2]
    
    def find_longest_common_substrings(lst):
        """
        Yield the substrings common to every string in lst, longest first.

        Candidate lengths run from the length of the shortest string down
        to 1; for each length, the results of common_substrings are yielded
        in the order that function produces them.

        >>> lst = ['asometxt0moretxt', 'bsometxt1moretxt', 'aasometxt10moretxt', 'zzsometxt999moretxt']
        >>> list(itertools.islice(find_longest_common_substrings(lst), 3))
        ['sometxt', 'moretxt', 'sometx']
        """
        # range() exists on both Python 2 and 3; xrange() is Python 2 only.
        for length in range(min_length(lst), 0, -1):
            for substring in common_substrings(lst, length):
                yield substring
    
    
    def min_length(lst):
        """Return the length of the shortest item in lst.

        Raises ValueError (from min) when lst is empty.
        """
        return min(map(len, lst))
    
    def common_substrings(lst, length):
        """
        Yield every distinct substring of exactly `length` characters that
        occurs in all items of lst, each one only once.

        Substrings are produced in order of first discovery: items are
        scanned in list order and each item left to right.

        >>> list(common_substrings(["hello", "world"], 2))
        []
        >>> list(common_substrings(["aabbcc", "dbbrra"], 2))
        ['bb']
        """
        assert length <= min_length(lst)
        already_yielded = set()
        for index, source in enumerate(lst):
            for candidate in all_substrings(source, length):
                # Never report the same substring twice.
                if candidate in already_yielded:
                    continue
                # The candidate came from `source`, so only the other
                # items need to be checked for it.
                shared_by_rest = all(
                    candidate in other
                    for position, other in enumerate(lst)
                    if position != index
                )
                if shared_by_rest:
                    already_yielded.add(candidate)
                    yield candidate
    
    def all_substrings(item, length):
        """
        Yield, left to right, every slice of item that is `length` long.

        Nothing is yielded when length exceeds len(item).

        >>> list(all_substrings("hello", 2))
        ['he', 'el', 'll', 'lo']
        """
        total = len(item)
        start = 0
        stop = length
        # Slide a fixed-width window until its right edge passes the end.
        while stop <= total:
            yield item[start:stop]
            start += 1
            stop += 1
    

    split_strings splits the strings using the delimiters:

    import re
    
    def split_strings(lst, delimiters):
        """
        Split each string in lst on the delimiters and yield, per string,
        a tuple of the non-empty pieces between them.

        Delimiters are treated as literal text: they are regex-escaped
        before being combined into one alternation pattern, so characters
        such as '.', '+' or '(' in a delimiter match only themselves.

        >>> lst = ['asometxt0moretxt', 'bsometxt1moretxt', 'aasometxt10moretxt', 'zzsometxt999moretxt']
        >>> list(split_strings(lst, find_delimiters(lst)))
        [('a', '0'), ('b', '1'), ('aa', '10'), ('zz', '999')]
        >>> list(split_strings(['a.b+c'], ['.b+']))
        [('a', 'c')]
        """
        # Build the pattern once; it does not depend on the item being split.
        pattern = re.compile(
            "|".join(re.escape(delimiter) for delimiter in delimiters)
        )
        for item in lst:
            # re.split leaves '' where a delimiter touches an edge or
            # another delimiter; drop those empty pieces.
            yield tuple(part for part in pattern.split(item) if part != '')
    

提交回复
热议问题