Is it possible to compare rows for similar data in SQL Server? I have a company name column in a table where company names could be somewhat similar. Here is an example of t
I have a couple of UDFs that I converted from some VB code some time ago. They take in two varchar() values and return an int between 0 and 100 (0 = not similar, 100 = same), if you're interested.
-- Description: Upper-cases the input and strips every character that is
--              not a digit (0-9) or a letter (A-Z).
-- Parameters:  @String - text to clean.
-- Returns:     The remaining characters in their original order; an empty
--              string when nothing is kept (including for NULL input).
CREATE FUNCTION [dbo].[SimReplaceSpecial]
(
    @String varchar(max)
)
RETURNS varchar(max)
AS
BEGIN
    DECLARE @Upper varchar(max) = UPPER(@String)
    DECLARE @Kept varchar(max) = ''
    DECLARE @Code int
    DECLARE @Idx int = 0
    -- The upper-cased copy never changes, so its length is read once.
    DECLARE @Total bigint = LEN(@Upper)

    WHILE @Idx < @Total
    BEGIN
        SET @Idx = @Idx + 1
        SET @Code = ASCII(SUBSTRING(@Upper, @Idx, 1))

        -- 48-57 are the digits '0'-'9', 65-90 are the letters 'A'-'Z'.
        IF (@Code BETWEEN 48 AND 57) OR (@Code BETWEEN 65 AND 90)
            SET @Kept = @Kept + CHAR(@Code)
    END

    RETURN @Kept
END
-- Description: DO NOT CALL DIRECTLY - Used by the Similar function.
--              Finds the longest substring (two characters or more) that
--              @String1 and @String2 have in common, adds its length to
--              @Score, then calls itself on the text to the left and on
--              the text to the right of that match.
-- Parameters:  @String1, @String2 - the strings to compare.
--              @Score             - cumulative score so far (Similar
--                                   starts it at 0).
-- Returns:     @Score plus the combined length of the common substrings
--              found at this level and in the recursive calls.
-- NOTE:        The function is recursive. SQL Server limits nested
--              function calls to 32 levels, so inputs that break into a
--              long chain of fragments can hit that limit.
CREATE FUNCTION [dbo].[SimFindCommon]
(
    @String1 varchar(max),
    @String2 varchar(max),
    @Score int
)
RETURNS int
AS
BEGIN
    DECLARE @Longest int = 0
    DECLARE @StartPos1 int = 0
    DECLARE @StartPos2 int = 0
    DECLARE @J int = 0
    DECLARE @HoldStr varchar(max) = @String2
    DECLARE @TestStr varchar(max)
    DECLARE @LeftStr1 varchar(max) = ''
    DECLARE @LeftStr2 varchar(max) = ''
    DECLARE @RightStr1 varchar(max) = ''
    DECLARE @RightStr2 varchar(max) = ''

    -- Outer loop: try every suffix of @String2 that is still long enough
    -- to beat the best match found so far.
    WHILE LEN(@HoldStr) > @Longest
    BEGIN
        SET @TestStr = @HoldStr

        -- Inner loop: shorten the candidate from the right until it occurs
        -- in @String1. A candidate that is not longer than @Longest could
        -- never be recorded, so probing stops there instead of running
        -- further CHARINDEX searches whose result would be discarded.
        WHILE LEN(@TestStr) > 1 AND LEN(@TestStr) > @Longest
        BEGIN
            SET @J = CHARINDEX(@TestStr, @String1)
            IF @J > 0
            BEGIN
                -- New longest common substring: store length and positions.
                SET @Longest = LEN(@TestStr)
                SET @StartPos1 = @J
                SET @StartPos2 = CHARINDEX(@TestStr, @String2)
                -- Shorter prefixes of this candidate cannot do better.
                BREAK
            END

            -- Not found: discard the final character and try again.
            SET @TestStr = LEFT(@TestStr, LEN(@TestStr) - 1)
        END

        -- Discard the first character and repeat from the next position.
        SET @HoldStr = RIGHT(@HoldStr, LEN(@HoldStr) - 1)
    END

    -- Add the length of the longest common substring (0 when none).
    SET @Score = @Score + @Longest

    -- Left-hand portions: only examined when the match starts after the
    -- third character in BOTH strings.
    -- NOTE(review): a threshold of 3 skips left-hand portions that are
    -- exactly two characters long, although a two-character match counts
    -- everywhere else; kept as-is so existing scores do not change -
    -- confirm whether "> 2" was intended.
    IF @StartPos1 > 3 AND @StartPos2 > 3
    BEGIN
        SET @LeftStr1 = LEFT(@String1, @StartPos1 - 1)
        SET @LeftStr2 = LEFT(@String2, @StartPos2 - 1)
        IF RTRIM(LTRIM(@LeftStr1)) <> '' AND RTRIM(LTRIM(@LeftStr2)) <> ''
        BEGIN
            SET @Score = dbo.SimFindCommon(@LeftStr1, @LeftStr2, @Score)
        END
    END

    -- Right-hand portions: everything after the match in each string.
    IF @Longest > 0
    BEGIN
        SET @RightStr1 = SUBSTRING(@String1, @StartPos1 + @Longest, LEN(@String1))
        SET @RightStr2 = SUBSTRING(@String2, @StartPos2 + @Longest, LEN(@String2))
        IF RTRIM(LTRIM(@RightStr1)) <> '' AND RTRIM(LTRIM(@RightStr2)) <> ''
        BEGIN
            SET @Score = dbo.SimFindCommon(@RightStr1, @RightStr2, @Score)
        END
    END

    RETURN @Score
END
-- Description: Scores how alike two strings are, ignoring case and any
--              character that is not a letter or a digit.
-- Parameters:  @String1, @String2 - the strings to compare.
-- Returns:     0   - either input is NULL or empty, or has no letters or
--                    digits left after clean-up;
--              100 - the cleaned strings are equal;
--              otherwise the common-substring score as a percentage of
--              the average cleaned length (integer division).
CREATE FUNCTION [dbo].[Similar]
(
    @String1 varchar(max),
    @String2 varchar(max)
)
RETURNS int
AS
BEGIN
    DECLARE @Clean1 varchar(max)
    DECLARE @Clean2 varchar(max)
    DECLARE @Percent int

    -- Nothing to compare.
    IF isnull(@String1,'') = '' OR isnull(@String2,'') = ''
        RETURN 0

    -- Reduce both strings to upper-case letters and digits only.
    SET @Clean1 = dbo.SimReplaceSpecial(@String1)
    SET @Clean2 = dbo.SimReplaceSpecial(@String2)

    -- One or both strings contained nothing but special characters.
    IF LTRIM(RTRIM(@Clean1)) = '' OR LTRIM(RTRIM(@Clean2)) = ''
        RETURN 0

    -- Equal after clean-up.
    IF @Clean1 = @Clean2
        RETURN 100

    -- Cumulative length of all common substrings ...
    SET @Percent = dbo.SimFindCommon(@Clean1, @Clean2, 0)
    -- ... expressed as a percentage of the maximum score, which is the
    -- average length of the two cleaned strings.
    SET @Percent = @Percent * 200 / (LEN(@Clean1) + LEN(@Clean2))

    RETURN @Percent
END
--Usage--------------------------------------------------------------------
--Call the "Similar" function only; SimReplaceSpecial and SimFindCommon are
--helpers it uses internally. Each score below is
--(common characters * 200) / (sum of the two cleaned lengths), rounded down.
SELECT dbo.Similar('ANDORRA WOODS','ANDORRA WOODS HEALTHCARE CENTER')
--Result = 60   (12 * 200 / (12 + 28))
SELECT dbo.Similar('ABC HEALTHCARE, JOB #31181','ABC HEALTHCARE, JOB #31251')
--Result = 85   (18 * 200 / (21 + 21))
SELECT dbo.Similar('ACTION SERVICE SALES, A SUBSIDIARY OF SINGER EQUIPMENT','ACTION SERVICE SALES, A SUBSIDIARY OF SINGER EQUIPMENT COMPANY')
--Result = 92   (46 * 200 / (46 + 53))
SELECT dbo.Similar('APEX SYSTEMS','APEX SYSTEMS, INC')
--Result = 88   (11 * 200 / (11 + 14))
SSIS/Data Tools has a Fuzzy Grouping transformation that is very helpful in situations like this. It doesn't actually group your data, rather it gives you similarity scores that you can use to determine when items should be grouped together.
Plenty of tutorials out there, here's one: The Fuzzy Grouping Transformation