python3, difflib SequenceMatcher

房东的猫 提交于 2019-12-11 16:05:24

问题


the following takes in two strings, compares differences and return them both as identicals as well as their differences, separated by spaces (maintaining the length of the longest sting.

The commented area in the code, are the 4 strings that should be returned.

from difflib import SequenceMatcher




t1 = 'betty:  backstreetvboysareback"give.jpg"LAlarrygarryhannyhref="ang"_self'

t2 = 'bettyv:  backstreetvboysareback"lifeislike"LAlarrygarryhannyhref="in.php"_self'


#t1 = 'betty :  backstreetvboysareback" i e      "LAlarrygarryhannyhref=" n    "_self'
#t2 = 'betty :  backstreetvboysareback" i e      "LAlarrygarryhannyhref=" n    "_self'

#o1 = '                                g v .jpg                          g           '
#o2 = '     v                          l f islike                        i .php      '



matcher = SequenceMatcher(None, t1, t2)
blocks = matcher.get_matching_blocks()

bla1 = []
bla2 = []

for i in range(len(blocks)):
    if i != len(blocks)-1:
        bla1.append([t1[blocks[i].a + blocks[i].size:blocks[i+1].a], blocks[i].a + blocks[i].size, blocks[i+1].a])
        bla2.append([t2[blocks[i].b + blocks[i].size:blocks[i+1].b], blocks[i].b + blocks[i].size, blocks[i+1].b])



cnt = 0
for i in range(len(bla1)):


    if bla1[i][1] < bla2[i][1]:
        num = bla2[i][1] - bla1[i][1]
        t2 = t2[0:bla2[i][1]] + ' '*num + t2[bla2[i][1]:len(t2)]
        bla2[i][0] = ' '*num + bla2[i][0]
        bla2[i][1] = bla1[i][1]

    if bla2[i][1] < bla1[i][1]:
        num = bla1[i][1] - bla2[i][1]
        t1 = t1[0:bla1[i][1]] + ' '*num + t1[bla1[i][1]:len(t1)]
        bla1[i][0] = ' '*num + bla1[i][0]
        bla1[i][1] = bla2[i][1]

    if bla1[i][2] > bla2[i][2]:
        num = bla1[i][2] - bla2[i][2]
        t2 = t2[0:bla2[i][2]] + ' '*num + t2[bla2[i][2]:len(t2)]
        bla2[i][0] = bla2[i][0] + ' '*num
        bla2[i][2] = bla1[i][2]

    if bla2[i][2] > bla1[i][2]:
        num = bla2[i][2] - bla1[i][2]
        t1 = t1[0:bla1[i][2]] + ' '*num + t1[bla1[i][2]:len(t1)]
        bla1[i][0] = bla1[i][0] + ' '*num
        bla1[i][2] = bla2[i][2]




t11 = []
t11 = t1[0:bla1[0][1]]
t11 += t1[bla1[0][2]:bla1[1][1]]
t11 += t1[bla1[1][2]:bla1[2][1]]
t11 += t1[bla1[2][2]:bla1[3][1]]
t11 += t1[bla1[3][2]:bla1[4][1]]
t11 += t1[bla1[5][2]:bla1[6][1]]
t11 += t1[bla1[6][2]:len(t1)]

t12 = []
t12 = t2[0:bla1[0][1]]
t12 += t2[bla1[0][2]:bla1[1][1]]
t12 += t2[bla1[1][2]:bla1[2][1]]
t12 += t2[bla1[2][2]:bla1[3][1]]
t12 += t2[bla1[3][2]:bla1[4][1]]
t12 += t2[bla1[5][2]:bla1[6][1]]
t12 += t2[bla1[6][2]:len(t2)]

After ranging the blocks into an organised format bla1, bla2 where each difference is stored as a string with its start and end position eg ['v', 33, 34] for each separate string. After this, I attempt to insert spaces to match the length and separation factors necessary and this is where the code starts to break.

Please if someone could take a look!


回答1:


I have worked through resolving this, and since no one has posted a response I will post the progress and solution. The following code is progress ... it worked well when dealing with variations that had less offset but began to break when getting into larger differences, specifically in maintaining spacing (offset) in matching up the two.

from difflib import SequenceMatcher
import pdb


t1 = 'betty:  backstreetvboysareback"give.jpg"LAlarrygarryhannyhref="ang"_self'

t2 = 'betty:  backstreetvboysareback"lol.jpg"LAlarrygarryhannyhref="ang"_self'

#t2 = 'bettyv:  backstreetvboysareback"lifeislike"LAlarrygarryhannyhref="in.php"_selff'

#t2 = 'LA'
#t2 = 'c give.'
#t2 = 'give.'




#t1 = 'betty :  backstreetvboysareback" i e      "LAlarrygarryhannyhref=" n    "_self'
#t2 = 'betty :  backstreetvboysareback" i e      "LAlarrygarryhannyhref=" n    "_self'

#o1 = '                                g v .jpg                          g           '
#o2 = '     v                          l f islike                        i .php      '



matcher = SequenceMatcher(None, t1, t2)
blocks = matcher.get_matching_blocks()

#print(len(blocks))

bla1 = []
bla2 = []

#bla = (string), (first pos), (second pos), (pos1 + pos2), (pos + pos2 total positions added togeather)
dnt = False
for i in range(len(blocks)):

    if i == 0:
      if blocks[i].a != 0 and dnt == False:
        bla1.append([t1[blocks[i].a:blocks[i].b], 0, blocks[i].a, 0, 0])
        bla2.append([t2[blocks[i].a:blocks[i].b], 0, blocks[i].b, 0, 0])
        dnt = True

      if blocks[i].b != 0 and dnt == False:
        bla2.append([t2[blocks[i].a:blocks[i].b], 0, blocks[i].b, 0, 0])
        bla1.append([t1[blocks[i].a:blocks[i].b], 0, blocks[i].a, 0, 0])
        dnt = True

    if i != len(blocks)-1:
        print(blocks[i])

        bla1.append([t1[blocks[i].a + blocks[i].size:blocks[i+1].a], blocks[i].a + blocks[i].size, blocks[i+1].a, 0, 0])
        bla2.append([t2[blocks[i].b + blocks[i].size:blocks[i+1].b], blocks[i].b + blocks[i].size, blocks[i+1].b, 0, 0])

#pdb.set_trace()

ttl = 0
for i in range(len(bla1)):
  cnt = bla1[i][2] - bla1[i][1]
  if cnt != 0:
    bla1[i][3] = cnt
  ttl = ttl + cnt
  bla1[i][4] = ttl

ttl = 0
for i in range(len(bla2)):
  cnt = bla2[i][2] - bla2[i][1]
  if cnt != 0:
    bla2[i][3] = cnt
  ttl = ttl + cnt
  bla2[i][4] = ttl

print(bla1)
print(bla2)

tt1 = ''
dif = 0
i = 0
while True:

  if i == 0:
    if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
    if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]  
    tt1 += t1[:bla1[i][1]] + '_'*dif

  if i <= len(bla1) -1:

    if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
    if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]

    if len(bla1) != 1:
      if i == 0: tt1 += t1[bla1[i][1] + bla1[i][3]:bla1[i+1][1]]
      if i != 0 and i != len(bla1)-1: tt1 += '_'*dif + t1[bla1[i][1] + bla1[i][3]:bla1[i+1][1]]
      if i == len(bla1)-1: tt1 += '_'*dif + t1[bla1[i][1] + bla1[i][3]:len(t1)]

    i = i+1
    print('t1 = ' + tt1)

  else:
    break

tt2 = ''
i = 0
dif = 0
while True:

  if i == 0:

    if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
    if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]   
    tt2 += t2[:bla2[i][1]] + '_'*dif

  if i <= len(bla2) -1:

    if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
    if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]    

    if len(bla2) != 1:
      if i == 0: tt2 += t2[bla2[i][1] + bla2[i][3]:bla2[i+1][1]]
      if i != 0 and i != len(bla1)-1: tt2 += '_'*dif + t2[bla2[i][1] + bla2[i][3]:bla2[i+1][1]]
      if i == len(bla2)-1: tt2 += '_'*dif + t2[bla2[i][1] + bla2[i][3]:len(t2)]

    i = i+1
    print('t2 = ' + tt2)

  else:
    break

  print()

Solution:

Unfortunately I have been too busy to continue coding this and have resorted to sub-processing diffutils ... this is a wonderful alternative to a lot of painstaking coding!



来源:https://stackoverflow.com/questions/48859026/python3-difflib-sequencematcher

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!