Is it possible to compare rows for similar data in SQL server

后端 未结 2 1476
失恋的感觉
失恋的感觉 2021-01-16 19:50

Is it possible to compare rows for similar data in SQL Server? I have a company name column in a table where company names could be somewhat similar. Here is an example of t

相关标签:
2条回答
  • 2021-01-16 20:11

    I have a couple of UDFs, converted from some VB code some time ago, that take two varchar() values and return an int between 0 and 100 (0 = not similar, 100 = same), if you're interested.

    -- Description: Upper-cases the input and strips every character that is
    --   not a digit (0-9) or a letter (A-Z).
    -- Parameters:
    --   @String - text to normalize
    -- Returns: the upper-cased letters and digits of @String, in their
    --   original order; an empty string when nothing survives (this
    --   includes a NULL input, because the scan loop never runs).
    -- Deploy note: CREATE FUNCTION must be the only statement in its batch,
    --   so run each function of this script separately (e.g. split with GO).
    CREATE FUNCTION [dbo].[SimReplaceSpecial]
    (
        @String varchar(max)
    )
    RETURNS varchar(max)
    AS
    BEGIN
        DECLARE @Upper varchar(max) = UPPER(@String);
        -- LEN ignores trailing spaces; those would be discarded anyway.
        DECLARE @Length int = LEN(@Upper);
        DECLARE @Index int = 0;
        DECLARE @Code int;
        DECLARE @Kept varchar(max) = '';
    
        WHILE @Index < @Length
        BEGIN
            SET @Index += 1;
            SET @Code = ASCII(SUBSTRING(@Upper, @Index, 1));
    
            -- 48-57 are the codes for '0'-'9', 65-90 for 'A'-'Z'
            IF @Code BETWEEN 48 AND 57 OR @Code BETWEEN 65 AND 90
                SET @Kept += CHAR(@Code);
        END;
    
        RETURN @Kept;
    END
    
    -- Description: DO NOT CALL DIRECTLY - helper for the Similar function.
    --   Locates the longest substring (two or more characters) shared by
    --   @String1 and @String2 and adds its length to @Score. It then calls
    --   itself on the text to the left of that match and on the text to
    --   the right of it, so further matches keep adding to the score.
    -- Parameters:
    --   @String1, @String2 - the strings to compare
    --   @Score             - running total carried through the recursion
    -- Returns: @Score plus the lengths of the common substrings found.
    -- NOTE(review): the function recurses for each further matched segment,
    --   and SQL Server caps nested calls at 32 levels - long, heavily
    --   fragmented inputs may hit that limit; confirm against real data.
    CREATE FUNCTION [dbo].[SimFindCommon]
    (
        @String1 varchar(max),
        @String2 varchar(max),
        @Score int
    )
    RETURNS int
    AS
    BEGIN
        DECLARE @BestLen int = 0;    -- length of the longest match so far
        DECLARE @BestPos1 int = 0;   -- 1-based start of that match in @String1
        DECLARE @BestPos2 int = 0;   -- 1-based start of that match in @String2
        DECLARE @Hit int;            -- CHARINDEX result for the current candidate
        DECLARE @Tail varchar(max) = @String2;
        DECLARE @Candidate varchar(max);
        DECLARE @Part1 varchar(max);
        DECLARE @Part2 varchar(max);
    
        -- Walk through the suffixes of @String2. Once the remaining suffix
        -- is no longer than the best match, nothing longer can be found.
        WHILE LEN(@Tail) > @BestLen
        BEGIN
            SET @Candidate = @Tail;
            SET @Hit = 0;
    
            -- Shrink the candidate from the right until it occurs in
            -- @String1 or only a single character is left.
            WHILE LEN(@Candidate) > 1
            BEGIN
                SET @Hit = CHARINDEX(@Candidate, @String1);
                IF @Hit > 0
                    BREAK;
                SET @Candidate = LEFT(@Candidate, LEN(@Candidate) - 1);
            END;
    
            -- Keep the candidate only when it beats the previous best.
            IF @Hit > 0 AND LEN(@Candidate) > @BestLen
            BEGIN
                SET @BestLen = LEN(@Candidate);
                SET @BestPos1 = @Hit;
                SET @BestPos2 = CHARINDEX(@Candidate, @String2);
            END;
    
            -- Drop the first character and try the next suffix.
            SET @Tail = RIGHT(@Tail, LEN(@Tail) - 1);
        END;
    
        -- Credit the length of the longest common substring.
        SET @Score += @BestLen;
    
        -- Left-hand remainders: only examined when the match starts after
        -- position 3 in both strings.
        IF @BestPos1 > 3 AND @BestPos2 > 3
        BEGIN
            SET @Part1 = LEFT(@String1, @BestPos1 - 1);
            SET @Part2 = LEFT(@String2, @BestPos2 - 1);
    
            IF LTRIM(RTRIM(@Part1)) <> '' AND LTRIM(RTRIM(@Part2)) <> ''
                SET @Score = dbo.SimFindCommon(@Part1, @Part2, @Score);
        END;
    
        -- Right-hand remainders: only exist when a match was found.
        IF @BestLen > 0
        BEGIN
            SET @Part1 = SUBSTRING(@String1, @BestPos1 + @BestLen, LEN(@String1));
            SET @Part2 = SUBSTRING(@String2, @BestPos2 + @BestLen, LEN(@String2));
    
            IF LTRIM(RTRIM(@Part1)) <> '' AND LTRIM(RTRIM(@Part2)) <> ''
                SET @Score = dbo.SimFindCommon(@Part1, @Part2, @Score);
        END;
    
        RETURN @Score;
    END
    
    -- Description: Scores how alike two strings are, ignoring case and any
    --   character that is not a letter or digit.
    -- Parameters:
    --   @String1, @String2 - the strings to compare
    -- Returns: an int where 0 = not at all similar and 100 = identical after
    --   normalization. 0 is also returned when either input is NULL, empty,
    --   or has no letters or digits.
    CREATE FUNCTION [dbo].[Similar]
    (
        @String1 varchar(max),
        @String2 varchar(max)
    )
    RETURNS int
    AS
    BEGIN
        DECLARE @Clean1 varchar(max);
        DECLARE @Clean2 varchar(max);
        DECLARE @Common int;
    
        -- Nothing to compare when either input is missing.
        IF ISNULL(@String1, '') = '' OR ISNULL(@String2, '') = ''
            RETURN 0;
    
        -- Reduce each string to its simplest form (letters and digits
        -- only, all upper case).
        SET @Clean1 = dbo.SimReplaceSpecial(@String1);
        SET @Clean2 = dbo.SimReplaceSpecial(@String2);
    
        -- One or both strings had no usable characters.
        IF LTRIM(RTRIM(@Clean1)) = '' OR LTRIM(RTRIM(@Clean2)) = ''
            RETURN 0;
    
        -- Identical once normalized.
        IF @Clean1 = @Clean2
            RETURN 100;
    
        -- Total length of all common substrings.
        SET @Common = dbo.SimFindCommon(@Clean1, @Clean2, 0);
    
        -- Express the total as a percentage of the maximum score, which is
        -- the average length of the two strings (integer division).
        RETURN @Common * 200 / (LEN(@Clean1) + LEN(@Clean2));
    END
    
    
    
    --Usage--------------------------------------------------------------------
    --Call the "Similar" function only; SimReplaceSpecial and SimFindCommon are
    --helpers it uses internally. Each score below is
    --(common characters * 200) / (sum of both normalized lengths), truncated.
    SELECT dbo.Similar('ANDORRA WOODS','ANDORRA WOODS HEALTHCARE CENTER')
    --Result = 60   (12 * 200 / (12 + 28))
    SELECT dbo.Similar('ABC HEALTHCARE, JOB #31181','ABC HEALTHCARE, JOB #31251')
    --Result = 85   (18 * 200 / (21 + 21))
    SELECT dbo.Similar('ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT','ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT COMPANY')
    --Result = 92   (46 * 200 / (46 + 53))
    SELECT dbo.Similar('APEX SYSTEMS','APEX SYSTEMS, INC')
    --Result = 88   (11 * 200 / (11 + 14))
    
    0 讨论(0)
  • 2021-01-16 20:21

    SSIS/Data Tools has a Fuzzy Grouping transformation that is very helpful in situations like this. It doesn't actually group your data, rather it gives you similarity scores that you can use to determine when items should be grouped together.

    There are plenty of tutorials out there; here's one: The Fuzzy Grouping Transformation

    0 讨论(0)
提交回复
热议问题