List of strings, get common substring of n elements, Python

旧街凉风 提交于 2021-02-07 08:38:07

问题


My problem may be similar to this one, but the situation is different. Consider this list as input:

['ACCCACCCGTGG','AATCCC','CCCTGAGG']

The other input is n, a number giving the length of the substring that must be common to every element of the list. The output has to be the substring with the maximum number of occurrences, together with that number of occurrences, similar to this:

{'CCC' : 4}

4 because it occurs twice in the first element of the list and once in each of the other two strings. CCC because it is the longest substring with 3 elements that repeats at least once in every string. I started this way:

def get_n_repeats_list(n, seq_list):
    """Return the most frequent length-``n`` substring shared by all sequences.

    A substring qualifies only if it occurs at least once in every string of
    ``seq_list``. Occurrences are counted with a sliding window, so
    overlapping matches each count.

    Args:
        n: Length of the substrings to consider.
        seq_list: Iterable of strings to search.

    Returns:
        A one-entry dict ``{substring: total_occurrences}`` for the qualifying
        substring with the highest total count, or an empty dict when ``n`` is
        not positive, ``seq_list`` is empty, or no length-``n`` substring is
        common to every string. Ties go to the substring that appears first
        in the first string.
    """
    from collections import Counter

    list_seq = list(seq_list)
    if n < 1 or not list_seq:
        return {}

    # One window-count table per string; a string shorter than n yields an
    # empty table, which correctly rules out every candidate.
    per_seq = [
        Counter(seq[i:i + n] for i in range(len(seq) - n + 1))
        for seq in list_seq
    ]
    first, rest = per_seq[0], per_seq[1:]

    # Only substrings of the first string can be common to all strings.
    totals = {
        sub: first[sub] + sum(counts[sub] for counts in rest)
        for sub in first
        if all(sub in counts for counts in rest)
    }
    if not totals:
        return {}

    best = max(totals, key=totals.get)
    return {best: totals[best]}

Maybe here a solution


回答1:


So this is my take on it. It is definitely not the prettiest thing on the planet but it should work just fine.

# Sample input for the demo call to found_str below; the first string has a
# 'W' where the question's first string has an 'A'.
a = ['ACCCWCCCGTGG', 'AATCCC', 'CCCTGAGG']

def occur(the_list, a_substr):
    """Count every occurrence of ``a_substr`` across the strings in ``the_list``.

    Each start position is tested independently, so overlapping matches are
    all counted (``'AA'`` occurs three times in ``'AAAA'``).

    Args:
        the_list: Iterable of strings to scan.
        a_substr: Substring to look for.

    Returns:
        The total number of start positions, over all strings, at which
        ``a_substr`` matches.
    """
    width = len(a_substr)
    return sum(
        1
        for text in the_list
        for start in range(len(text) - width + 1)
        if text.startswith(a_substr, start)
    )

def found_str(original_List, n):
    """Map each length-``n`` substring common to all strings to its total count.

    A substring is reported only if it occurs at least once in every string
    of ``original_List``. Its value is the number of occurrences summed over
    all strings, counting overlapping matches.

    Windows are taken from each string separately, so the last window of
    every string is considered and no separator character is reserved.

    Args:
        original_List: Sequence of strings to search.
        n: Length of the substrings to look for.

    Returns:
        A dict ``{substring: total_occurrences}``; empty when nothing is
        common (a message is printed in that case) or when the list is empty.

    Raises:
        SystemExit: If ``n`` is longer than the shortest string. The message
            is printed first; this matches the original ``exit()`` behaviour.
    """
    from collections import Counter

    # The emptiness guard keeps min() from raising on an empty list.
    if original_List and n > min(map(len, original_List)):
        print("The substring has to be shorter than the shortest string!")
        raise SystemExit

    # One table of sliding-window counts per input string.
    window_counts = [
        Counter(text[i:i + n] for i in range(len(text) - n + 1))
        for text in original_List
    ]

    result_dict = {}
    if window_counts:
        first, rest = window_counts[0], window_counts[1:]
        # Only substrings of the first string can be common to all strings.
        for sub in first:
            if all(sub in counts for counts in rest):
                result_dict[sub] = first[sub] + sum(counts[sub] for counts in rest)

    if result_dict == {}:
        print("No common substings of length {:} were found".format(n))

    return result_dict

# Demo: look up the length-3 substrings common to every string in `a` and
# print the resulting {substring: count} dict.
end = found_str(a, 3)
print(end)

returns: {'CCC': 4}




回答2:


import operator
# Sample input: the three sequences from the question.
LL = ['ACCCACCCGTGG','AATCCC','CCCTGAGG']

def createLenList(n,LL):
    """Return the most frequent length-``n`` substring over all strings in ``LL``.

    Counts are pooled across the strings; a substring does not have to occur
    in every string to win. Overlapping windows are counted separately.

    Args:
        n: Length of the substrings to count.
        LL: Iterable of strings to scan.

    Returns:
        A two-item list ``[substring, count]`` for the substring with the
        highest pooled count. Ties go to the substring seen first.

    Raises:
        ValueError: If no string in ``LL`` contains a window of length ``n``.
    """
    stubs = {}
    for seq in LL:
        for start in range(len(seq)):
            stub = seq[start:start + n]
            # Slices near the end of the string are shorter than n; skip them.
            if len(stub) == n:
                stubs[stub] = stubs.get(stub, 0) + 1

    if not stubs:
        raise ValueError(
            "no substring of length {} found in the input".format(n))

    # Iterating the dict directly works on both Python 2 and Python 3,
    # unlike dict.iteritems().
    maxKey = max(stubs, key=stubs.get)
    return [maxKey, stubs[maxKey]]

# Demo: most frequent length-3 substring across the sample sequences.
maxStub = createLenList(3, LL)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(maxStub)



回答3:


def long_substr(data):
    """Return the longest substring of ``data[0]`` found in every string of ``data``.

    Candidates are the substrings of the first string; a candidate replaces
    the current best only when it is strictly longer, so the earliest
    longest match wins.

    Args:
        data: Sequence of strings.

    Returns:
        The longest common substring, or ``''`` when ``data`` has fewer than
        two strings, the first string is empty, or nothing is common.
    """
    best = ''
    if len(data) <= 1 or len(data[0]) == 0:
        return best

    reference = data[0]
    for start in range(len(reference)):
        for length in range(len(reference) - start + 1):
            # Anything not strictly longer than the current best cannot win.
            if length <= len(best):
                continue
            candidate = reference[start:start + length]
            if is_substr(candidate, data):
                best = candidate
    return best

def is_substr(find, data):
    """Tell whether ``find`` occurs in every string of ``data``.

    Args:
        find: Substring to look for.
        data: Sequence of strings to check.

    Returns:
        ``False`` when both ``data`` and ``find`` are empty, or when some
        string in ``data`` does not contain ``find``; ``True`` otherwise.

    NOTE(review): the guard only fires when *both* inputs are empty, so an
    empty ``data`` with a non-empty ``find`` yields ``True``. That looks like
    it may have been meant as ``or`` -- confirm before changing.
    """
    if not data and not find:
        return False
    return all(find in text for text in data)

# Demo: find the longest substring common to all sample strings and report
# how many times it occurs in total.
input_list = ['A', 'ACCCACCCGTGG','AATCCC','CCCTGAGG']
longest_common_str = long_substr(input_list)

if not longest_common_str:
    print ("nothing common")
else:
    # str.count counts non-overlapping occurrences within each string.
    frequency = sum(seq.count(longest_common_str) for seq in input_list)
    print (longest_common_str, frequency)

Output

A 6



来源:https://stackoverflow.com/questions/37527585/list-of-strings-get-common-substring-of-n-elements-python

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!