Word matching in SQL Server

后端 未结 1 1602
予麋鹿
予麋鹿 2021-01-06 01:47

I have a requirement to provide a suggested match between data in two database tables. The basic requirement is; - A \"match\" should be suggested for the highest number of

相关标签:
1条回答
  • 2021-01-06 02:13

    I've used sys.dm_fts_parser below to split the sentences into words. There are plenty of TSQL split functions around if you are not on SQL Server 2008 or find this isn't suitable for some reason.

    The requirement that each A.id can only be paired with a B.id that hadn't been used previously and vice-versa isn't one I could think of an efficient set based solution for.

    ;WITH A(Id, sentence) As
    (
      SELECT 1,'What other text in here'    UNION ALL
      SELECT 2,'What am I doing here'       UNION ALL 
      SELECT 3,'I need to find another job' UNION ALL 
      SELECT 4,'Other text in here'         
    ),
    B(Id, sentence) As
    (
     SELECT  5,'Other text in here'          UNION ALL
     SELECT  6,'I am doing what here'        UNION ALL
     SELECT  7,'Purple unicorns'             UNION ALL
     SELECT  8,'What are you doing in here'
    ),  A_Split
         AS (SELECT Id AS A_Id,
                    display_term,
                    COUNT(*) OVER (PARTITION BY Id) AS A_Cnt
             FROM   A
                    CROSS APPLY 
                       sys.dm_fts_parser('"' + REPLACE(sentence, '"', '""')+'"',1033, 0,0)),
    
         B_Split
         AS (SELECT Id AS B_Id,
                    display_term,
                    COUNT(*) OVER (PARTITION BY Id) AS B_Cnt
             FROM   B
                    CROSS APPLY 
                       sys.dm_fts_parser('"' + REPLACE(sentence, '"', '""')+'"',1033, 0,0)),
         Joined
         As (SELECT A_Id,
                    B_Id,
                    B_Cnt,
                    Cnt = COUNT(*),
                    CAST(COUNT(*) as FLOAT)/B_Cnt AS PctMatchBToA,
                    CAST(COUNT(*) as FLOAT)/A_Cnt AS PctMatchAToB
             from   A_Split A
                    JOIN B_Split B
                      ON A.display_term = B.display_term
             GROUP  BY A_Id,
                       B_Id,
                       B_Cnt,
                       A_Cnt)
    SELECT IDENTITY(int, 1, 1) as id, *
    INTO   #IntermediateResults
    FROM   Joined
    ORDER  BY PctMatchBToA DESC,
              PctMatchAToB DESC
    
    DECLARE @A_Id INT,
            @B_Id INT,
            @Cnt  INT
    
    DECLARE @Results TABLE (
      A_Id INT,
      B_Id INT,
      Cnt  INT)
    
    SELECT TOP(1) @A_Id = A_Id,
                  @B_Id = B_Id,
                  @Cnt = Cnt
    FROM   #IntermediateResults
    ORDER  BY id
    
    WHILE ( @@ROWCOUNT > 0 )
      BEGIN
    
          INSERT INTO @Results
          SELECT @A_Id,
                 @B_Id,
                 @Cnt
    
          DELETE FROM #IntermediateResults
          WHERE  A_Id = @A_Id
                  OR B_Id = @B_Id
    
          SELECT TOP(1) @A_Id = A_Id,
                        @B_Id = B_Id,
                        @Cnt = Cnt
          FROM   #IntermediateResults
          ORDER  BY id
      END
    
    DROP TABLE #IntermediateResults
    
    SELECT *
    FROM   @Results
    ORDER  BY A_Id  
    

    Returns

    A_Id        B_Id        Cnt
    ----------- ----------- -----------
    1           8           3
    2           6           5
    4           5           4
    
    0 讨论(0)
提交回复
热议问题