What is the best way to create and populate a numbers table?

后端 未结 11 2534
悲&欢浪女
悲&欢浪女 2020-11-21 13:28

I've seen many different ways to create and populate a numbers table. However, what is the best way to create and populate one? With "best" being defined from most to l…

11条回答
  •  暗喜
    暗喜 (楼主)
    2020-11-21 13:39

    This is a repackaging of the accepted answer, arranged so that you can compare the methods against each other yourself. The top 3 algorithms are compared (comments explain why other methods are excluded), and you can run the script against your own setup to see how each one performs for the sequence size you want.

    -- Suppress the "N rows affected" messages so the only output is the final report.
    SET NOCOUNT ON;

    --
    -- Set the count of numbers that you want in your sequence ...
    --
    DECLARE @NumberOfNumbers int = 10000000;
    --
    --  Some notes on choosing a useful length for your sequence ...
    --      For a sequence of  100 numbers -- winner depends on preference of min/max/avg runtime ... (I prefer PhilKelley algo here - edit the algo so RowSet2 is max RowSet CTE)
    --      For a sequence of   1k numbers -- winner depends on preference of min/max/avg runtime ... (Sadly PhilKelley algo is generally lowest ranked in this bucket, but could be tweaked to perform better)
    --      For a sequence of  10k numbers -- a clear winner emerges for this bucket
    --      For a sequence of 100k numbers -- do not test any looping methods at this size or above ...
    --                                        the previous winner fails, a different method is needed to guarantee the full sequence desired
    --      For a sequence of  1MM numbers -- the statistics aren't changing much between the algorithms - choose one based on your own goals or tweaks
    --      For a sequence of 10MM numbers -- only one of the methods yields the desired sequence, and the numbers are much closer than for smaller sequences

    -- Loop control: every method is run once per iteration, @MaxIterations times in total.
    DECLARE @TestIteration int = 0;
    DECLARE @MaxIterations int = 10;
    -- Label of the method currently being timed; keys the rows written to #TimingTest.
    DECLARE @MethodName varchar(128);

    -- Every method builds its sequence in [Numbers].[Test]; create the schema when it is
    -- missing so that CREATE TABLE / SELECT INTO cannot fail on it.
    -- CREATE SCHEMA must be the only statement in its batch, hence the EXEC.
    IF SCHEMA_ID(N'Numbers') IS NULL
        EXEC (N'CREATE SCHEMA [Numbers];');

    -- DROP TABLE IF EXISTS requires SQL Server 2016 or later
    DROP TABLE IF EXISTS #TimingTest;

    -- One row per (method, iteration). Only the first three columns are supplied when a
    -- run starts; the remaining columns are filled in by later UPDATEs, so they must be nullable.
    CREATE TABLE #TimingTest (
        MethodName    varchar(128)  NOT NULL,
        TestIteration int           NOT NULL,
        StartDate     DateTime2     NOT NULL,
        EndDate       DateTime2     NULL,
        ElapsedTime   decimal(38,0) NULL,   -- microseconds between StartDate and EndDate
        ItemCount     decimal(38,0) NULL,   -- rows actually generated by the method
        MaxNumber     decimal(38,0) NULL,
        MinNumber     decimal(38,0) NULL
    );
    
    --
    --  Run every method once per pass, for @MaxIterations passes ...
    --
    WHILE @MaxIterations > @TestIteration
    BEGIN
        -- Advance the pass counter up front so the loop always terminates
        SET @TestIteration = @TestIteration + 1;
    
    /*  -- This method has been removed, as it is BY FAR the slowest method
        -- This test shows that looping should be avoided, likely at all costs, if one places a premium on speed of execution ...
    
        --
        -- METHOD - Fast looping
        --
    
        -- Prep for the test
        DROP TABLE IF EXISTS [Numbers].[Test];
        CREATE TABLE [Numbers].[Test] (Number INT NOT NULL);
    
        -- Method information
        SET @MethodName = 'FastLoop';
    
        -- Record the start of the test
        INSERT INTO #TimingTest(MethodName, TestIteration, StartDate)
        SELECT @MethodName, @TestIteration, GETDATE()
    
        -- Run the algorithm
        DECLARE @i INT = 1;
        WHILE @i <= @NumberOfNumbers
        BEGIN
            INSERT INTO [Numbers].[Test](Number) VALUES (@i);
            SELECT @i = @i + 1;
        END;
    
        ALTER TABLE [Numbers].[Test] ADD CONSTRAINT PK_Numbers_Test_Number PRIMARY KEY CLUSTERED (Number)
    
        -- Record the end of the test
        UPDATE tt
            SET 
                EndDate = GETDATE()
        FROM #TimingTest tt
        WHERE tt.MethodName = @MethodName
        and tt.TestIteration = @TestIteration
    
        -- And the stats about the numbers in the sequence
        UPDATE tt
            SET 
                ItemCount = results.ItemCount,
                MaxNumber = results.MaxNumber,
                MinNumber = results.MinNumber
        FROM #TimingTest tt
        CROSS JOIN (
            SELECT COUNT(Number) as ItemCount, MAX(Number) as MaxNumber, MIN(Number) as MinNumber FROM [Numbers].[Test]
        ) results
        WHERE tt.MethodName = @MethodName
        and tt.TestIteration = @TestIteration
    */
    
    /*  -- This method requires GO statements, which would break the script, also - this answer does not appear to be the fastest *AND* seems to perform "magic"
        --
        -- METHOD - "Semi-Looping"
        --
    
        -- Prep for the test
        DROP TABLE IF EXISTS [Numbers].[Test];
        CREATE TABLE [Numbers].[Test] (Number INT NOT NULL);
    
        -- Method information
        SET @MethodName = 'SemiLoop';
    
        -- Record the start of the test
        INSERT INTO #TimingTest(MethodName, TestIteration, StartDate)
        SELECT @MethodName, @TestIteration, GETDATE()
    
        -- Run the algorithm 
        INSERT [Numbers].[Test] values (1);
    --    GO --required
    
        INSERT [Numbers].[Test] SELECT Number + (SELECT COUNT(*) FROM [Numbers].[Test]) FROM [Numbers].[Test]
    --    GO 14 --will create 16384 total rows
    
        ALTER TABLE [Numbers].[Test] ADD CONSTRAINT PK_Numbers_Test_Number PRIMARY KEY CLUSTERED (Number)
    
        -- Record the end of the test
        UPDATE tt
            SET 
                EndDate = GETDATE()
        FROM #TimingTest tt
        WHERE tt.MethodName = @MethodName
        and tt.TestIteration = @TestIteration
    
        -- And the stats about the numbers in the sequence
        UPDATE tt
            SET 
                ItemCount = results.ItemCount,
                MaxNumber = results.MaxNumber,
                MinNumber = results.MinNumber
        FROM #TimingTest tt
        CROSS JOIN (
            SELECT COUNT(Number) as ItemCount, MAX(Number) as MaxNumber, MIN(Number) as MinNumber FROM [Numbers].[Test]
        ) results
        WHERE tt.MethodName = @MethodName
        and tt.TestIteration = @TestIteration
    */
        --
        -- METHOD - Philip Kelley's algo
        --          (needs tweaking to match the desired length of sequence in order to optimize its performance, relies more on the coder to properly tweak the algorithm)
        --

        -- Prep for the test: start from an empty heap so every iteration does the same work
        DROP TABLE IF EXISTS [Numbers].[Test];
        CREATE TABLE [Numbers].[Test] (Number INT NOT NULL);

        -- Method information
        SET @MethodName = 'PhilKelley';

        -- Record the start of the test.
        -- SYSDATETIME() returns datetime2(7), matching the DateTime2 columns; GETDATE() returns
        -- datetime, whose ~3.33 ms rounding would quantize the microsecond timings.
        INSERT INTO #TimingTest (MethodName, TestIteration, StartDate)
        VALUES (@MethodName, @TestIteration, SYSDATETIME());

        -- Run the algorithm: each RowSetN cross joins the previous row set with itself,
        -- squaring the number of available rows, and Tally numbers the final set.
        WITH
        RowSet0 AS (SELECT 1 AS Item UNION ALL SELECT 1),                        --          2 rows   -- We only have to name the column in the first select, the second/union select inherits the column name
        RowSet1 AS (SELECT 1 AS Item FROM RowSet0 AS A CROSS JOIN RowSet0 AS B), --          4 rows
        RowSet2 AS (SELECT 1 AS Item FROM RowSet1 AS A CROSS JOIN RowSet1 AS B), --         16 rows
        RowSet3 AS (SELECT 1 AS Item FROM RowSet2 AS A CROSS JOIN RowSet2 AS B), --        256 rows
        RowSet4 AS (SELECT 1 AS Item FROM RowSet3 AS A CROSS JOIN RowSet3 AS B), --      65536 rows (65k)
        RowSet5 AS (SELECT 1 AS Item FROM RowSet4 AS A CROSS JOIN RowSet4 AS B), -- 4294967296 rows (4BB)
        -- Add more RowSetX to get higher and higher numbers of rows
        -- Each successive RowSetX results in squaring the previously available number of rows
        Tally   AS (SELECT ROW_NUMBER() OVER (ORDER BY Item) AS Number FROM RowSet5) -- This is what gives us the sequence of integers, always select from the terminal CTE expression
        -- Note: testing of this specific use case has shown that making Tally as a sub-query instead of a terminal CTE expression is slower (always) - be sure to follow this pattern closely for max performance
        INSERT INTO [Numbers].[Test] (Number)
        SELECT t.Number
        FROM Tally AS t
        WHERE t.Number <= @NumberOfNumbers;

        -- Building the clustered primary key is part of the timed work
        ALTER TABLE [Numbers].[Test] ADD CONSTRAINT PK_Numbers_Test_Number PRIMARY KEY CLUSTERED (Number);

        -- Record the end of the test (before gathering stats, so the stats query is not timed)
        UPDATE tt
        SET EndDate = SYSDATETIME()
        FROM #TimingTest AS tt
        WHERE tt.MethodName = @MethodName
          AND tt.TestIteration = @TestIteration;

        -- And the stats about the numbers in the sequence; these show whether the method
        -- really produced the full 1..@NumberOfNumbers range
        UPDATE tt
        SET ItemCount = results.ItemCount,
            MaxNumber = results.MaxNumber,
            MinNumber = results.MinNumber
        FROM #TimingTest AS tt
        CROSS JOIN (
            SELECT
                COUNT(Number) AS ItemCount,
                MAX(Number)   AS MaxNumber,
                MIN(Number)   AS MinNumber
            FROM [Numbers].[Test]
        ) AS results
        WHERE tt.MethodName = @MethodName
          AND tt.TestIteration = @TestIteration;
    
        --
        -- METHOD - Mladen Prajdic answer
        --

        -- Prep for the test: start from an empty heap so every iteration does the same work
        DROP TABLE IF EXISTS [Numbers].[Test];
        CREATE TABLE [Numbers].[Test] (Number INT NOT NULL);

        -- Method information
        SET @MethodName = 'MladenPrajdic';

        -- Record the start of the test.
        -- SYSDATETIME() returns datetime2(7), matching the DateTime2 columns; GETDATE() returns
        -- datetime, whose ~3.33 ms rounding would quantize the microsecond timings.
        INSERT INTO #TimingTest (MethodName, TestIteration, StartDate)
        VALUES (@MethodName, @TestIteration, SYSDATETIME());

        -- Run the algorithm: number the rows of master..spt_values cross joined with itself.
        -- The cross join can supply at most (row count of spt_values) squared rows; when that is
        -- below @NumberOfNumbers, TOP simply returns fewer rows and the sequence comes up short
        -- (visible in the ItemCount / MaxNumber stats gathered below).
        INSERT INTO [Numbers].[Test] (Number)
        SELECT TOP (@NumberOfNumbers)
            ROW_NUMBER() OVER (ORDER BY t1.number) AS N
        FROM master..spt_values AS t1
        CROSS JOIN master..spt_values AS t2;

        -- Building the clustered primary key is part of the timed work
        ALTER TABLE [Numbers].[Test] ADD CONSTRAINT PK_Numbers_Test_Number PRIMARY KEY CLUSTERED (Number);

        -- Record the end of the test (before gathering stats, so the stats query is not timed)
        UPDATE tt
        SET EndDate = SYSDATETIME()
        FROM #TimingTest AS tt
        WHERE tt.MethodName = @MethodName
          AND tt.TestIteration = @TestIteration;

        -- And the stats about the numbers in the sequence
        UPDATE tt
        SET ItemCount = results.ItemCount,
            MaxNumber = results.MaxNumber,
            MinNumber = results.MinNumber
        FROM #TimingTest AS tt
        CROSS JOIN (
            SELECT
                COUNT(Number) AS ItemCount,
                MAX(Number)   AS MaxNumber,
                MIN(Number)   AS MinNumber
            FROM [Numbers].[Test]
        ) AS results
        WHERE tt.MethodName = @MethodName
          AND tt.TestIteration = @TestIteration;
    
        --
        -- METHOD - Single INSERT
        --

        -- Prep for the test
        DROP TABLE IF EXISTS [Numbers].[Test];
        -- The Table creation is part of this algorithm (SELECT ... INTO creates it) ...

        -- Method information
        SET @MethodName = 'SingleInsert';

        -- Record the start of the test.
        -- SYSDATETIME() returns datetime2(7), matching the DateTime2 columns; GETDATE() returns
        -- datetime, whose ~3.33 ms rounding would quantize the microsecond timings.
        INSERT INTO #TimingTest (MethodName, TestIteration, StartDate)
        VALUES (@MethodName, @TestIteration, SYSDATETIME());

        -- Run the algorithm: SELECT ... INTO creates the table and IDENTITY() numbers the rows.
        -- TOP returns fewer rows than requested when the cross join cannot supply enough, so the
        -- sequence may come up short (visible in the ItemCount / MaxNumber stats gathered below).
        SELECT TOP (@NumberOfNumbers) IDENTITY(int, 1, 1) AS Number
        INTO [Numbers].[Test]
        FROM sys.objects AS s1        -- use sys.columns if you don't get enough rows returned to generate all the numbers you need
        CROSS JOIN sys.objects AS s2; -- use sys.columns if you don't get enough rows returned to generate all the numbers you need

        -- Building the clustered primary key is part of the timed work
        ALTER TABLE [Numbers].[Test] ADD CONSTRAINT PK_Numbers_Test_Number PRIMARY KEY CLUSTERED (Number);

        -- Record the end of the test (before gathering stats, so the stats query is not timed)
        UPDATE tt
        SET EndDate = SYSDATETIME()
        FROM #TimingTest AS tt
        WHERE tt.MethodName = @MethodName
          AND tt.TestIteration = @TestIteration;

        -- And the stats about the numbers in the sequence
        UPDATE tt
        SET ItemCount = results.ItemCount,
            MaxNumber = results.MaxNumber,
            MinNumber = results.MinNumber
        FROM #TimingTest AS tt
        CROSS JOIN (
            SELECT
                COUNT(Number) AS ItemCount,
                MAX(Number)   AS MaxNumber,
                MIN(Number)   AS MinNumber
            FROM [Numbers].[Test]
        ) AS results
        WHERE tt.MethodName = @MethodName
          AND tt.TestIteration = @TestIteration;
    END -- WHILE: one pass over all methods
    
    -- Calculate the timespan, in microseconds, for each of the runs.
    -- Deliberately no WHERE clause: every recorded run gets its elapsed time.
    -- DATEDIFF_BIG returns bigint; plain DATEDIFF returns int, which overflows once a run
    -- lasts longer than 2,147,483,647 microseconds (about 35 min 47 s).
    -- Runs whose EndDate was never set keep a NULL ElapsedTime and are ignored by the aggregates below.
    UPDATE tt
    SET ElapsedTime = DATEDIFF_BIG(MICROSECOND, tt.StartDate, tt.EndDate)
    FROM #TimingTest AS tt;

    --
    -- Report the results: one row per method, fastest per-record time first ...
    --
    SELECT
        tt.MethodName,
        -- NULLIF avoids a divide-by-zero error when a method produced no rows at all
        AVG(tt.ElapsedTime) / NULLIF(AVG(tt.ItemCount), 0) AS TimePerRecord,
        CAST(AVG(tt.ItemCount) AS bigint) AS SequenceLength,
        MAX(tt.ElapsedTime) AS MaxTime,
        MIN(tt.ElapsedTime) AS MinTime,
        -- Compare these against 1 and @NumberOfNumbers to confirm the full sequence was produced
        MAX(tt.MaxNumber) AS MaxNumber,
        MIN(tt.MinNumber) AS MinNumber
    FROM #TimingTest AS tt
    GROUP BY tt.MethodName
    ORDER BY TimePerRecord ASC, MaxTime ASC, MinTime ASC;
    

提交回复
热议问题