All Common Substrings Between Two Strings

前端 未结 3 1884
星月不相逢
星月不相逢 2021-01-02 09:42

I am working on C# to find all the common substrings between two strings. For instance, if the input is: S1= \"need asssitance with email\" S2= \"email assistance needed\"<

相关标签:
3条回答
  • 2021-01-02 09:56
        public static string [] CommonString(string left, string right)
        {
            List<string> result = new List<string>();
            string [] rightArray = right.Split();
            string [] leftArray = left.Split();
    
            result.AddRange(rightArray.Where(r => leftArray.Any(l => l.StartsWith(r))));
    
            // must check other way in case left array contains smaller words than right array
            result.AddRange(leftArray.Where(l => rightArray.Any(r => r.StartsWith(l))));
    
            return result.Distinct().ToArray();
        }
    
    0 讨论(0)
  • 2021-01-02 10:00

    Use Set-Intersections

    Start with a routine to find all possible substrings of a string. Here it is in Python, it's an 'exercise for the reader' to translate it to C#':

    def allSubstr(instring):
      retset = set()
      retset.add(instring)
      totlen = len(instring)
      for thislen in range(0, totlen):
        for startpos in range(0, totlen):
          # print "startpos: %s, thislen: %s" % (startpos, thislen)
          addStr = instring[startpos:startpos+thislen]
          # print "addstr: %s" % (addStr)
          retset.add(addStr)
      print "retset total: %s" % (retset)
      return retset
    
    set1 = allSubstr('abcdefg')
    set2 = allSubstr('cdef')
    print set1.intersection(set2)
    

    Here's the output:

    >>> set1 = allSubstr('abcdefg')
    retset total: set(['', 'cde', 'ab', 'ef', 'cd', 'abcdef', 'abc', 'efg', 'bcde', 'cdefg', 'bc', 'de',   'bcdef', 'abcd', 'defg', 'fg', 'cdef', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'bcd', 'abcde', 'abcdefg', 'bcdefg', 'def'])
    >>> set2 = allSubstr('cdef')
    retset total: set(['', 'cde', 'c', 'ef', 'e', 'd', 'f', 'de', 'cd', 'cdef', 'def'])
    >>> 
    >>> set1.intersection(set2)
    set(['', 'cde', 'c', 'de', 'e', 'd', 'f', 'ef', 'cd', 'cdef', 'def'])
    

    No, you're not interested in subsets of length 1. But, you can always add a limit to length before you do the set.add() call.

    0 讨论(0)
  • 2021-01-02 10:14

    A different approach: you could use levenshtein distance to find the similarity of two words. If the distance is less than a specified value you could consider two strings as equal. Then you can use the levenshtein comparer for Enumerable.Intersect.

    Then it's simple as:

    string S1= "need asssitance with email" ;
    string S2 = "email assistance needed";
    string[] words1 = S1.Split();
    string[] words2 = S2.Split();
    var wordsIntersecting = words1.Intersect(words2, new LevenshteinComparer());
    string output = string.Join(" ", wordsIntersecting);
    

    output: need asssitance email

    Here is the custom comparer:

    class LevenshteinComparer : IEqualityComparer<string>
    {
        public int MaxDistance { get; set; }
        private Levenshtein _Levenshtein = new Levenshtein();
    
        public LevenshteinComparer() : this(50) { }
    
        public LevenshteinComparer(int maxDistance)
        {
            this.MaxDistance = maxDistance;
        }
    
        public bool Equals(string x, string y)
        {
            int distance = _Levenshtein.iLD(x, y);
            return distance <= MaxDistance;
        }
    
        public int GetHashCode(string obj)
        {
            return 0; 
        }
    }
    

    and here is an implementation of the levenshtein algoritm:

    public class Levenshtein
    {
        ///*****************************
        /// Compute Levenshtein distance 
        /// Memory efficient version
        ///*****************************
        public int iLD(String sRow, String sCol)
        {
            int RowLen = sRow.Length;  // length of sRow
            int ColLen = sCol.Length;  // length of sCol
            int RowIdx;                // iterates through sRow
            int ColIdx;                // iterates through sCol
            char Row_i;                // ith character of sRow
            char Col_j;                // jth character of sCol
            int cost;                   // cost
    
            /// Test string length
            if (Math.Max(sRow.Length, sCol.Length) > Math.Pow(2, 31))
                throw (new Exception("\nMaximum string length in Levenshtein.iLD is " + Math.Pow(2, 31) + ".\nYours is " + Math.Max(sRow.Length, sCol.Length) + "."));
    
            // Step 1
    
            if (RowLen == 0)
            {
                return ColLen;
            }
    
            if (ColLen == 0)
            {
                return RowLen;
            }
    
            /// Create the two vectors
            int[] v0 = new int[RowLen + 1];
            int[] v1 = new int[RowLen + 1];
            int[] vTmp;
    
    
    
            /// Step 2
            /// Initialize the first vector
            for (RowIdx = 1; RowIdx <= RowLen; RowIdx++)
            {
                v0[RowIdx] = RowIdx;
            }
    
            // Step 3
    
            /// Fore each column
            for (ColIdx = 1; ColIdx <= ColLen; ColIdx++)
            {
                /// Set the 0'th element to the column number
                v1[0] = ColIdx;
    
                Col_j = sCol[ColIdx - 1];
    
    
                // Step 4
    
                /// Fore each row
                for (RowIdx = 1; RowIdx <= RowLen; RowIdx++)
                {
                    Row_i = sRow[RowIdx - 1];
    
    
                    // Step 5
    
                    if (Row_i == Col_j)
                    {
                        cost = 0;
                    }
                    else
                    {
                        cost = 1;
                    }
    
                    // Step 6
    
                    /// Find minimum
                    int m_min = v0[RowIdx] + 1;
                    int b = v1[RowIdx - 1] + 1;
                    int c = v0[RowIdx - 1] + cost;
    
                    if (b < m_min)
                    {
                        m_min = b;
                    }
                    if (c < m_min)
                    {
                        m_min = c;
                    }
    
                    v1[RowIdx] = m_min;
                }
    
                /// Swap the vectors
                vTmp = v0;
                v0 = v1;
                v1 = vTmp;
    
            }
    
    
            // Step 7
    
            /// Value between 0 - 100
            /// 0==perfect match 100==totaly different
            /// 
            /// The vectors where swaped one last time at the end of the last loop,
            /// that is why the result is now in v0 rather than in v1
            //System.Console.WriteLine("iDist=" + v0[RowLen]);
            int max = System.Math.Max(RowLen, ColLen);
            return ((100 * v0[RowLen]) / max);
        }
    
    
    
    
    
        ///*****************************
        /// Compute the min
        ///*****************************
    
        private int Minimum(int a, int b, int c)
        {
            int mi = a;
    
            if (b < mi)
            {
                mi = b;
            }
            if (c < mi)
            {
                mi = c;
            }
    
            return mi;
        }
    
        ///*****************************
        /// Compute Levenshtein distance         
        ///*****************************
    
        public int LD(String sNew, String sOld)
        {
            int[,] matrix;              // matrix
            int sNewLen = sNew.Length;  // length of sNew
            int sOldLen = sOld.Length;  // length of sOld
            int sNewIdx; // iterates through sNew
            int sOldIdx; // iterates through sOld
            char sNew_i; // ith character of sNew
            char sOld_j; // jth character of sOld
            int cost; // cost
    
            /// Test string length
            if (Math.Max(sNew.Length, sOld.Length) > Math.Pow(2, 31))
                throw (new Exception("\nMaximum string length in Levenshtein.LD is " + Math.Pow(2, 31) + ".\nYours is " + Math.Max(sNew.Length, sOld.Length) + "."));
    
            // Step 1
    
            if (sNewLen == 0)
            {
                return sOldLen;
            }
    
            if (sOldLen == 0)
            {
                return sNewLen;
            }
    
            matrix = new int[sNewLen + 1, sOldLen + 1];
    
            // Step 2
    
            for (sNewIdx = 0; sNewIdx <= sNewLen; sNewIdx++)
            {
                matrix[sNewIdx, 0] = sNewIdx;
            }
    
            for (sOldIdx = 0; sOldIdx <= sOldLen; sOldIdx++)
            {
                matrix[0, sOldIdx] = sOldIdx;
            }
    
            // Step 3
    
            for (sNewIdx = 1; sNewIdx <= sNewLen; sNewIdx++)
            {
                sNew_i = sNew[sNewIdx - 1];
    
                // Step 4
    
                for (sOldIdx = 1; sOldIdx <= sOldLen; sOldIdx++)
                {
                    sOld_j = sOld[sOldIdx - 1];
    
                    // Step 5
    
                    if (sNew_i == sOld_j)
                    {
                        cost = 0;
                    }
                    else
                    {
                        cost = 1;
                    }
    
                    // Step 6
    
                    matrix[sNewIdx, sOldIdx] = Minimum(matrix[sNewIdx - 1, sOldIdx] + 1, matrix[sNewIdx, sOldIdx - 1] + 1, matrix[sNewIdx - 1, sOldIdx - 1] + cost);
    
                }
            }
    
            // Step 7
    
            /// Value between 0 - 100
            /// 0==perfect match 100==totaly different
            System.Console.WriteLine("Dist=" + matrix[sNewLen, sOldLen]);
            int max = System.Math.Max(sNewLen, sOldLen);
            return (100 * matrix[sNewLen, sOldLen]) / max;
        }
    }
    

    Credits to the Levenshtein class to: http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm

    0 讨论(0)
提交回复
热议问题