Is it possible to compare rows for similar data in SQL Server?

后端 未结 2 1478
失恋的感觉
失恋的感觉 2021-01-16 19:50

Is it possible to compare rows for similar data in SQL Server? I have a company name column in a table where company names could be somewhat similar. Here is an example of t

2条回答
  •  臣服心动
    2021-01-16 20:11

    I have a couple of UDFs that I converted from some VB code some time ago. They take in two varchar() values and return an int between 0 and 100 (0 = not similar, 100 = same), if you're interested.

    -- Description: Normalizes a string for comparison. The input is
    --   upper-cased and only the digits 0-9 and the letters A-Z are kept;
    --   every other character (spaces, punctuation, symbols) is dropped.
    -- Parameters:
    --   @String - text to normalize
    -- Returns:
    --   The upper-cased alphanumeric characters of @String in their original
    --   order; an empty string when nothing qualifies or @String is NULL.
    CREATE FUNCTION [dbo].[SimReplaceSpecial] 
    (
        @String varchar(max)
    )
    RETURNS varchar(max)
    AS
    BEGIN
        DECLARE @Upper varchar(max) = upper(@String)
        DECLARE @Kept varchar(max) = ''
        DECLARE @Idx int = 0
        DECLARE @Code int

        -- Walk the string one character at a time. A NULL input makes the
        -- loop condition UNKNOWN, so the body never runs and '' is returned.
        WHILE @Idx < LEN(@Upper)
        BEGIN
            SET @Idx = @Idx + 1
            SET @Code = ascii(substring(@Upper, @Idx, 1))

            -- 48-57 are the ASCII codes for '0'-'9', 65-90 for 'A'-'Z'
            IF (@Code BETWEEN 48 AND 57) OR (@Code BETWEEN 65 AND 90)
                SET @Kept = @Kept + Char(@Code)
        END

        RETURN @Kept
    END
    
    -- Description: DO NOT CALL DIRECTLY - Used by the Similar function
    --   Finds the longest common substring (two or more characters) of
    --   @String1 and @String2, adds its length to @Score, then recurses
    --   into the portions to the left and to the right of that match.
    -- Parameters:
    --   @String1 - first string to compare
    --   @String2 - second string to compare
    --   @Score   - cumulative score carried in from the caller
    -- Returns:
    --   @Score plus the lengths of all common substrings found.
    -- Notes:
    --   * LEN ignores trailing spaces, so the length arithmetic below assumes
    --     the inputs contain no spaces. Similar guarantees this by passing
    --     both strings through SimReplaceSpecial first.
    --   * The function calls itself; SQL Server limits nested function calls
    --     to 32 levels, so inputs with very many separate matching fragments
    --     can hit that limit.
    CREATE FUNCTION [dbo].[SimFindCommon] 
    (
        @String1 varchar(max),
        @String2 varchar(max),
        @Score int
    )
    RETURNS int
    AS
    BEGIN
        DECLARE @Longest Int = 0      -- length of the best match so far
        DECLARE @StartPos1 Int = 0    -- 1-based position of the match in @String1
        DECLARE @StartPos2 Int = 0    -- 1-based position of the match in @String2
        DECLARE @J Int = 0

        DECLARE @HoldStr varchar(max)
        DECLARE @TestStr varchar(max)
        DECLARE @LeftStr1 varchar(max) = ''
        DECLARE @LeftStr2 varchar(max) = ''
        DECLARE @RightStr1 varchar(max) = ''
        DECLARE @RightStr2 varchar(max) = ''

        -- Try every suffix of @String2 (outer loop) and, for each suffix,
        -- every prefix of it from longest to shortest (inner loop).
        SET @HoldStr = @String2
        WHILE LEN(@HoldStr) > @Longest 
        BEGIN
            SET @TestStr = @HoldStr
            -- Single characters never count as a match, and a candidate that
            -- is not longer than the current best cannot improve on it, so
            -- stop shrinking as soon as either bound is reached.
            WHILE LEN(@TestStr) > 1 AND LEN(@TestStr) > @Longest
            BEGIN
                SET @J = CHARINDEX(@TestStr,@String1)
                If @J > 0
                BEGIN
                    --Test string occurs in the other string and is longer
                    --than the previous best. Store its length and position.
                    SET @Longest = Len(@TestStr)
                    SET @StartPos1 = @J
                    SET @StartPos2 = CHARINDEX(@TestStr,@String2)
                    --Shorter prefixes of this suffix cannot do better
                    BREAK
                END

                --Not found. Discard final character of test string and
                --try again.
                SET @TestStr = Left(@TestStr, LEN(@TestStr) - 1)
            END 
            --Now discard first char of the suffix and repeat the process.
            SET @HoldStr = Right(@HoldStr, LEN(@HoldStr) - 1)
        END 

        --Update the cumulative score with the length of
        --the common sub-string.
        SET @Score = @Score + @Longest

        --Left-hand portions. A portion needs at least two characters to
        --hold a countable match, i.e. the match must start at position 3
        --or later in both strings. This mirrors the right-hand side, which
        --also considers two-character remainders.
        If @StartPos1 > 2 And @StartPos2 > 2 
        BEGIN
            SET @LeftStr1 = Left(@String1, @StartPos1 - 1)
            SET @LeftStr2 = Left(@String2, @StartPos2 - 1)
            If RTRIM(LTRIM(@LeftStr1)) <> '' And RTRIM(LTRIM(@LeftStr2)) <> ''
            BEGIN
                --Get longest common substring from left strings
                SET @Score = dbo.SimFindCommon(@LeftStr1, @LeftStr2,@Score)
            END
        END

        --Right-hand portions: only exist when a match was found.
        If @Longest > 0
        BEGIN
            SET @RightStr1 = substring(@String1, @StartPos1 + @Longest, LEN(@String1))
            SET @RightStr2 = substring(@String2, @StartPos2 + @Longest, LEN(@String2))

            If RTRIM(LTRIM(@RightStr1)) <> '' And RTRIM(LTRIM(@RightStr2)) <> ''
            BEGIN
                --Get longest common substring from right strings
                SET @Score = dbo.SimFindCommon(@RightStr1, @RightStr2,@Score)
            END
        END

        -- Return the cumulative score
        RETURN @Score
    END
    
    -- Description: Case-insensitive similarity of two strings.
    --   Both inputs are reduced to upper-case letters and digits with
    --   SimReplaceSpecial, then the total length of their common
    --   sub-strings (from SimFindCommon) is expressed as a percentage of
    --   the average length of the two reduced strings.
    -- Parameters:
    --   @String1, @String2 - the strings to compare
    -- Returns:
    --   0   when either input is NULL/empty or reduces to an empty string,
    --   100 when the reduced strings are equal,
    --   otherwise an integer (integer division, so truncated) on the
    --   0-100 scale.
    CREATE FUNCTION [dbo].[Similar] 
    (
        @String1 varchar(max),
        @String2 varchar(max)
    )
    RETURNS int
    AS
    BEGIN
        -- Start from "not similar"; only the branches below raise the value
        DECLARE @Similarity int = 0
        DECLARE @Clean1 varchar(max)
        DECLARE @Clean2 varchar(max)
        DECLARE @Matched int

        IF isnull(@String1,'') <> '' AND isnull(@String2,'') <> ''
        BEGIN
            --Reduce each string to its simplest form (letters
            --and digits only, all upper case)
            SET @Clean1 = dbo.SimReplaceSpecial(@String1)
            SET @Clean2 = dbo.SimReplaceSpecial(@String2)

            --Carry on only if both strings still have content
            IF RTRIM(LTRIM(@Clean1)) <> '' AND RTRIM(LTRIM(@Clean2)) <> ''
            BEGIN
                IF @Clean1 = @Clean2
                    SET @Similarity = 100
                ELSE
                BEGIN
                    --Total length of all common sub-strings
                    SET @Matched = dbo.SimFindCommon(@Clean1, @Clean2,0)

                    --Scale against the average length of the two strings:
                    --matched / ((len1 + len2) / 2) * 100
                    SET @Similarity = @Matched * 200 / (Len(@Clean1) + Len(@Clean2))
                END
            END
        END

        RETURN @Similarity

    END
    
    
    
    -- Usage -------------------------------------------------------------------
    -- Call only dbo.Similar; the other two functions are its helpers.
    -- Scores are computed on the strings reduced to letters and digits.

    -- 12 common characters, lengths 12 and 28: 12 * 200 / 40 = 60
    SELECT dbo.Similar(
        'ANDORRA WOODS',
        'ANDORRA WOODS HEALTHCARE CENTER'
    );

    -- 18 common characters, lengths 21 and 21: 18 * 200 / 42 = 85
    SELECT dbo.Similar(
        'ABC HEALTHCARE, JOB #31181',
        'ABC HEALTHCARE, JOB #31251'
    );

    -- 46 common characters, lengths 46 and 53: 46 * 200 / 99 = 92
    SELECT dbo.Similar(
        'ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT',
        'ACTION SERVICE  SALES, A SUBSIDIARY OF SINGER EQUIPMENT COMPANY'
    );

    -- 11 common characters, lengths 11 and 14: 11 * 200 / 25 = 88
    SELECT dbo.Similar(
        'APEX SYSTEMS',
        'APEX SYSTEMS, INC'
    );
    

提交回复
热议问题