Damerau–Levenshtein distance (Edit Distance with Transposition) c implementation

后端 未结 2 1426
攒了一身酷
攒了一身酷 2021-02-13 18:29

I implemented the Damerau–Levenshtein distance in C++, but it does not give the correct output for the input (pantera, aorta): the correct output is 4, but my code gives 5.



        
相关标签:
2条回答
  • 2021-02-13 19:00

    The algorithm in the post does not compute the Damerau–Levenshtein distance. The Wikipedia article defines this algorithm as the optimal string alignment (OSA) distance.

    A java implementation of DL distance algorithm can be found in another SO post.

    To get the correct values of the OSA distance, please replace the lines marked with - below with the lines marked with +.

     int  editdist(string s,string t,int n,int m) 
     {
         int d1,d2,d3,cost;
         int i,j;
         for(i=0;i<=n;i++) 
         {
             for(j=0;j<=m;j++)
             {
    -          if(s[i+1]==t[j+1]) 
    +          if(s[i]==t[j]) 
                  cost=0;
               else
                  cost=1;
               d1=d[i][j+1]+1;
               d2=d[i+1][j]+1;
               d3=d[i][j]+cost;
               d[i+1][j+1]=minimum(d1,d2,d3);
    -          if(i>0 && j>0 && s[i+1]==t[j] && s[i]==t[j+1] )   //transposition
    +          if(i>0 && j>0 && s[i]==t[j-1] && s[i-1]==t[j] )   //transposition
               {
                   d[i+1][j+1]=min(d[i+1][j+1],d[i-1][j-1]+cost);
               }
             }
         }
         return d[n+1][m+1]; 
     }
    

    It looks as if the code was copied from a program written in a programming language where array indices start at 1 by default. Therefore all references to the elements of the distance array d were incremented. However the references to the characters within the strings are references to 0-based arrays, therefore they should not be updated.

    To compute the distance the distance array has to be properly initialized:

    for( i = 0; i < n + 1; i++)
          d[i][0] = i;
    for( j = 1; j < m + 1; j++)
          d[0][j] = j;
    

    Since you have got the answer 5, you probably have your distance array already initialized correctly.

    Since the above algorithm does not compute the DL distance, here is a sketch of a C implementation of the DL algorithm (derived from the SO post with a java impl. derived from an ActionScript impl. in the Wikipedia article).

    /* d(row,col): cell (row,col) of the flat row-major table `dd`, whose rows
       are m+2 ints wide. `dd` and `m` must be in scope at the point of use. */
    #define d(row,col) dd[(row) * (m+2) + (col) ]
    /* Smallest of 2, 3 or 4 values. Arguments may be evaluated more than
       once, so do not pass expressions with side effects. */
    #define min(lhs,rhs) ((lhs) < (rhs) ? (lhs) : (rhs))
    #define min3(p,q,r) ((p)< (q) ? min((p),(r)) : min((q),(r)))
    #define min4(p,q,r,s) ((p)< (q) ? min3((p),(r),(s)) : min3((q),(r),(s)))
    
    /*
     * Debug helper: writes the (n+2) x (m+2) table stored row-major in `dd`
     * to stdout. Each cell is printed with "%02d " (zero-padded to at least
     * two digits, followed by a space), each row ends with a newline, and a
     * blank line follows the table. Always returns 0.
     */
    int dprint(int* dd, int n,int m){
        const int rows = n + 2;
        const int cols = m + 2;
        int r, c;

        for (r = 0; r < rows; ++r) {
            /* Start of row r in the flat table. */
            const int *line = dd + r * cols;
            for (c = 0; c < cols; ++c) {
                printf("%02d ", line[c]);
            }
            printf("\n");
        }
        printf("\n");
        return 0;
    }
    
    /* Smaller of two ints; private helper for dldist2. */
    static int dl_min2(int a, int b) {
        return a < b ? a : b;
    }

    /*
     * Unrestricted Damerau-Levenshtein distance between the byte strings
     * s[0..n-1] and t[0..m-1]: the minimum number of insertions, deletions,
     * substitutions and transpositions of adjacent characters that turns s
     * into t. Only the first n bytes of s and the first m bytes of t are
     * read, so the inputs need not be NUL-terminated.
     *
     * Returns the distance, or -1 if n or m is negative or the work table
     * cannot be allocated.
     */
    int dldist2(char *s, char* t, int n, int m) {
        int *dd;
        int i, j, cost, i1, j1, best, result;
        int inf;
        size_t stride;
        /* last_row[c]: last (1-based) row of s in which byte value c occurred,
           0 if it has not occurred yet. One slot per unsigned char value. */
        int last_row[256];

        if (n < 0 || m < 0) {
            return -1;
        }

        /* Larger than any real distance; fills the sentinel border. */
        inf = n + m;
        /* The table has (n+2) rows of (m+2) ints; sizes are computed in size_t
           so the product is not evaluated in int. */
        stride = (size_t)m + 2;

        dd = (int*) malloc(((size_t)n + 2) * stride * sizeof(int));
        if (!dd) {
            return -1;
        }
        memset(last_row, 0, sizeof(last_row));

        /* Row 0 and column 0 are sentinels; row 1 and column 1 hold the
           distances against the empty prefix. */
        dd[0] = inf;
        for (i = 0; i <= n; i++) {
            dd[((size_t)i + 1) * stride + 1] = i;
            dd[((size_t)i + 1) * stride] = inf;
        }
        for (j = 0; j <= m; j++) {
            dd[stride + (size_t)j + 1] = j;
            dd[(size_t)j + 1] = inf;
        }

        for (i = 1; i <= n; i++) {
            const int *prev = dd + (size_t)i * stride;        /* table row i   */
            int *cur = dd + ((size_t)i + 1) * stride;         /* table row i+1 */
            int last_col = 0;  /* last column j in this row where s[i-1] == t[j-1] */

            for (j = 1; j <= m; j++) {
                /* Index through unsigned char: plain char may be signed, and a
                   negative byte value must not be used as an array index. */
                i1 = last_row[(unsigned char)t[j-1]];
                j1 = last_col;
                cost = (s[i-1] == t[j-1]) ? 0 : 1;
                if (cost == 0) {
                    last_col = j;
                }

                best = dl_min2(prev[j] + cost,   /* match / substitution */
                               cur[j] + 1);      /* insertion            */
                best = dl_min2(best, prev[j+1] + 1);  /* deletion        */
                /* Transposition of s[i1-1] and s[i-1], plus the characters
                   that have to be removed/added between the swapped pair. */
                best = dl_min2(best,
                               dd[(size_t)i1 * stride + (size_t)j1]
                                   + (i - i1 - 1) + 1 + (j - j1 - 1));
                cur[j+1] = best;
            }
            last_row[(unsigned char)s[i-1]] = i;
        }

        result = dd[((size_t)n + 1) * stride + (size_t)m + 1];
        free(dd);
        return result;
    }
    
    0 讨论(0)
  • 2021-02-13 19:13

    Here is my C++ version of this algorithm:

    #include <algorithm>
    #include <cstddef>
    #include <string>
    #include <utility>
    #include <vector>

    /// @brief Edit distance with adjacent transpositions between two strings.
    ///
    /// Counts insertions, deletions, substitutions and swaps of two adjacent
    /// characters. This is the restricted variant (optimal string alignment):
    /// a transposition is only recognised when the two characters are adjacent
    /// in both strings, so e.g. the distance between "ca" and "abc" is 3.
    ///
    /// Only three rows of the dynamic-programming table are kept at a time,
    /// and they live on the heap, so long inputs do not exhaust the stack.
    ///
    /// @param p_string1 Source string (compared byte by byte).
    /// @param p_string2 Target string (compared byte by byte).
    /// @return The distance between the two strings.
    int damerau_levenshtein_distance(const std::string& p_string1, const std::string& p_string2)
    {
        const std::size_t l_string_length1 = p_string1.length();
        const std::size_t l_string_length2 = p_string2.length();

        std::vector<int> l_row_prev2(l_string_length2 + 1);  // row i-2
        std::vector<int> l_row_prev(l_string_length2 + 1);   // row i-1
        std::vector<int> l_row_curr(l_string_length2 + 1);   // row i

        // Row 0: turning the empty prefix into the first j characters takes j inserts.
        for (std::size_t j = 0; j <= l_string_length2; ++j)
        {
            l_row_prev[j] = static_cast<int>(j);
        }

        for (std::size_t i = 1; i <= l_string_length1; ++i)
        {
            // Column 0: removing the first i characters takes i deletes.
            l_row_curr[0] = static_cast<int>(i);

            for (std::size_t j = 1; j <= l_string_length2; ++j)
            {
                const int l_cost = (p_string1[i - 1] == p_string2[j - 1]) ? 0 : 1;

                int l_best = std::min(
                    l_row_prev[j] + 1,                        // delete
                    std::min(l_row_curr[j - 1] + 1,           // insert
                             l_row_prev[j - 1] + l_cost));    // substitution

                const bool l_is_swapped_pair =
                    i > 1 && j > 1 &&
                    p_string1[i - 1] == p_string2[j - 2] &&
                    p_string1[i - 2] == p_string2[j - 1];
                if (l_is_swapped_pair)
                {
                    l_best = std::min(l_best, l_row_prev2[j - 2] + l_cost);  // transposition
                }

                l_row_curr[j] = l_best;
            }

            // Rotate the rows: (i-1) becomes (i-2), (i) becomes (i-1), and the
            // old (i-2) buffer is reused as scratch for the next row.
            std::swap(l_row_prev2, l_row_prev);
            std::swap(l_row_prev, l_row_curr);
        }

        // After the rotation the last computed row (or row 0 when the first
        // string is empty) is in l_row_prev.
        return l_row_prev[l_string_length2];
    }
    
    0 讨论(0)
提交回复
热议问题