Recursive SQL statement (PostgreSQL 9.1.4)

余生分开走 2021-02-20 13:31

PostgreSQL 9.1

Business situation

Every month, there is a new batch of accounts given to a specific process. Every batch can be described by mon

2 Answers
  • 2021-02-20 14:18
            --
            -- rank the dates.
            -- , also fetch the fields that seem to depend on them.
            -- (this should have been done in the data model)
            --
    CREATE VIEW date_rank AS (
            SELECT uniq.granularity,uniq.entry_accounts,uniq.entry_amount
            , row_number() OVER(ORDER BY uniq.granularity) AS zrank -- rank must follow month order; the recursion below treats zrank+1 as the next month
            FROM ( SELECT DISTINCT granularity, entry_accounts, entry_amount FROM vintage_data)
                 AS uniq
            );
    
    -- SELECT * FROM date_rank ORDER BY granularity;
            --
            -- transform to an x*y matrix, avoiding the date key and the slack columns
            --
    CREATE VIEW matrix_data AS (
            SELECT vd.distance_in_months AS xxx
            , dr.zrank AS yyy
            , vd.recovery_amount AS val
            FROM vintage_data vd
            JOIN date_rank dr ON dr.granularity = vd.granularity
            );
    -- SELECT * FROM matrix_data;
    
            --
            -- In order to perform the reverse transformation:
            -- make the view insertable.
            -- INSERTS to matrix_data will percolate back into the vintage_data table
            -- (don't try this at home ;-)
            --
    CREATE RULE magic_by_the_plasser AS
            ON INSERT TO matrix_data
            DO INSTEAD (
            INSERT INTO vintage_data (granularity,distance_in_months,entry_accounts,entry_amount,recovery_amount)
            SELECT dr.granularity, new.xxx, dr.entry_accounts, dr.entry_amount, new.val
            FROM date_rank dr
            WHERE dr.zrank = new.yyy
                    ;
            );
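            --
            -- For illustration: a hypothetical
            --   INSERT INTO matrix_data(yyy,xxx,val) VALUES (2, 6, 2381.25);
            -- is rewritten into an INSERT on vintage_data, with granularity,
            -- entry_accounts and entry_amount looked up from date_rank for
            -- zrank = new.yyy = 2.
            --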
    
            --
            -- This CTE creates the weights for a Pascal-triangle
            --
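            -- Base case: every known matrix cell maps to itself with weight 1.
            -- Recursive case: a mapping is extended to an adjacent empty cell
            -- (one step down in yyy or one step right in xxx) at half the weight.
            -- Every distinct lattice path through empty cells yields its own row,
            -- so a known cell n steps away contributes val * (number of paths) / 2^n
            -- to the final SUM, hence the Pascal-triangle weights.
            --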
    -- EXPLAIN -- ANALYZE
    WITH RECURSIVE pascal AS (
            WITH empty AS (
                    --
                    -- "cart" is a cathesian product of X*Y
                    -- its function is similar to a "calendar table":
                    -- filling in the missing X,Y pairs, making the matrix "square".
                    -- (well: rectangular, but in the given case nX==nY)
                    --
                    WITH cart AS (
                            WITH mmx AS (
                                    WITH xx AS ( SELECT MIN(xxx) AS x0 , MAX(xxx) AS x1 FROM matrix_data)
                                    SELECT generate_series(xx.x0,xx.x1) AS xxx
                                    FROM xx
                                    )
                            , mmy AS (
                                    WITH yy AS ( SELECT MIN(yyy) AS y0 , MAX(yyy) AS y1 FROM matrix_data)
                                    SELECT generate_series(yy.y0,yy.y1) AS yyy
                                    FROM yy
                                    )
                            SELECT * FROM mmx
                            JOIN mmy ON (1=1) -- Cartesian product here!
                            )
                    --
                    -- The (x,y) pairs that are not present in the current matrix
                    --
                    SELECT * FROM cart ca
                    WHERE NOT EXISTS (
                            SELECT *
                            FROM matrix_data nx
                            WHERE nx.xxx = ca.xxx
                            AND nx.yyy = ca.yyy
                            )
                    )
            SELECT md.yyy AS src_y
                    , md.xxx AS src_x
                    , md.yyy AS dst_y
                    , md.xxx AS dst_x
                    -- The filled-in matrix cells have weight 1
                    , 1::numeric AS weight
            FROM matrix_data md
            UNION ALL
            SELECT pa.src_y AS src_y
                    , pa.src_x AS src_x
                    , em.yyy AS dst_y
                    , em.xxx AS dst_x
                    -- the derived matrix cells inherit weight/2 from both their parents
                    , (pa.weight/2) AS weight
            FROM pascal pa
            JOIN empty em
                    ON ( em.yyy = pa.dst_y+1 AND em.xxx = pa.dst_x)
                    OR ( em.yyy = pa.dst_y AND em.xxx = pa.dst_x+1 )
            )
    INSERT INTO matrix_data(yyy,xxx,val)
    SELECT pa.dst_y,pa.dst_x
            ,SUM(ma.val*pa.weight)
    FROM pascal pa
    JOIN matrix_data ma ON pa.src_y = ma.yyy AND pa.src_x = ma.xxx
            -- avoid the filled-in matrix cells (which map to themselves)
    WHERE NOT (pa.src_y = pa.dst_y AND pa.src_x = pa.dst_x)
    GROUP BY pa.dst_y,pa.dst_x
            ;
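            --
            -- The INSERT above fires the rule on matrix_data, so the 15 derived
            -- cells are written back into vintage_data
            -- (21 original rows + 15 derived rows = 36 rows in the final SELECT).
            --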
    
            --
            -- This will also get rid of the matrix_data view and the rule.
            --
    DROP VIEW date_rank CASCADE;
    -- SELECT * FROM matrix_data ;
    
    SELECT * FROM vintage_data ORDER BY granularity, distance_in_months;
    

    RESULT:

    NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index "vintage_data_pkey" for table "vintage_data"
    CREATE TABLE
    NOTICE:  ALTER TABLE / ADD UNIQUE will create implicit index "mx_xy" for table "vintage_data"
    ALTER TABLE
    INSERT 0 21
    VACUUM
    CREATE VIEW
    CREATE VIEW
    CREATE RULE
    INSERT 0 15
    NOTICE:  drop cascades to view matrix_data
    DROP VIEW
     granularity | distance_in_months | entry_accounts | entry_amount |      recovery_amount      
    -------------+--------------------+----------------+--------------+---------------------------
     2012-01-31  |                  1 |            200 |       100000 |                      1000
     2012-01-31  |                  2 |            200 |       100000 |                      2000
     2012-01-31  |                  3 |            200 |       100000 |                      3000
     2012-01-31  |                  4 |            200 |       100000 |                      3500
     2012-01-31  |                  5 |            200 |       100000 |                      3400
     2012-01-31  |                  6 |            200 |       100000 |                      3300
     2012-02-28  |                  1 |            250 |       150000 |                      1200
     2012-02-28  |                  2 |            250 |       150000 |                      1600
     2012-02-28  |                  3 |            250 |       150000 |                      1800
     2012-02-28  |                  4 |            250 |       150000 |                      1200
     2012-02-28  |                  5 |            250 |       150000 |                      1600
     2012-02-28  |                  6 |            250 |       150000 | 2381.25000000000000000000
     2012-03-31  |                  1 |            200 |        90000 |                      1300
     2012-03-31  |                  2 |            200 |        90000 |                      1200
     2012-03-31  |                  3 |            200 |        90000 |                      1400
     2012-03-31  |                  4 |            200 |        90000 |                      1000
     2012-03-31  |                  5 |            200 |        90000 | 2200.00000000000000000000
     2012-03-31  |                  6 |            200 |        90000 | 2750.00000000000000000000
     2012-04-30  |                  1 |            300 |       180000 |                      1600
     2012-04-30  |                  2 |            300 |       180000 |                      1500
     2012-04-30  |                  3 |            300 |       180000 |                      4000
     2012-04-30  |                  4 |            300 |       180000 | 2500.00000000000000000000
     2012-04-30  |                  5 |            300 |       180000 | 2350.00000000000000000000
     2012-04-30  |                  6 |            300 |       180000 | 2550.00000000000000000000
     2012-05-31  |                  1 |            400 |       225000 |                      2200
     2012-05-31  |                  2 |            400 |       225000 |                      6000
     2012-05-31  |                  3 |            400 |       225000 | 5000.00000000000000000000
     2012-05-31  |                  4 |            400 |       225000 | 3750.00000000000000000000
     2012-05-31  |                  5 |            400 |       225000 | 3050.00000000000000000000
     2012-05-31  |                  6 |            400 |       225000 | 2800.00000000000000000000
     2012-06-30  |                  1 |            100 |        60000 |                      1000
     2012-06-30  |                  2 |            100 |        60000 | 3500.00000000000000000000
     2012-06-30  |                  3 |            100 |        60000 | 4250.00000000000000000000
     2012-06-30  |                  4 |            100 |        60000 | 4000.00000000000000000000
     2012-06-30  |                  5 |            100 |        60000 | 3525.00000000000000000000
     2012-06-30  |                  6 |            100 |        60000 | 3162.50000000000000000000
    (36 rows)
    
  • 2021-02-20 14:22

    It's a big task; split it up to make it more manageable. I would put it in a plpgsql function with RETURNS TABLE:

    1. Create a temporary table for your "Calculation Process" matrix using a crosstab query. You need the tablefunc module installed for that. Run once per database:

      CREATE EXTENSION tablefunc;
      
    2. Update the temp table field by field.

    3. Return the table.

    The following demo is fully functional and tested with PostgreSQL 9.1.4. It builds on the table definition provided in the question:

    -- DROP FUNCTION f_forcast();
    
    CREATE OR REPLACE FUNCTION f_forcast()
      RETURNS TABLE (
      granularity date
     ,entry_accounts numeric
     ,entry_amount numeric
     ,d1 numeric
     ,d2 numeric
     ,d3 numeric
     ,d4 numeric
     ,d5 numeric
     ,d6 numeric) AS
    $BODY$
    BEGIN
    
    --== Create temp table with result of crosstab() ==--
    
    CREATE TEMP TABLE matrix ON COMMIT DROP AS
    SELECT *
    FROM   crosstab (
            'SELECT granularity, entry_accounts, entry_amount
                   ,distance_in_months, recovery_amount
             FROM   vintage_data
             ORDER  BY 1, 2',
    
            'SELECT DISTINCT distance_in_months
             FROM   vintage_data
             ORDER  BY 1')
    AS tbl (
      granularity date
     ,entry_accounts numeric
     ,entry_amount numeric
     ,d1 numeric
     ,d2 numeric
     ,d3 numeric
     ,d4 numeric
     ,d5 numeric
     ,d6 numeric
     );
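    -- In this two-parameter form, crosstab(source_sql, category_sql) expects the
    -- source query to return the row name (granularity), any extra columns
    -- (entry_accounts, entry_amount), then the category (distance_in_months) and
    -- the value (recovery_amount); the category query lists the distances that
    -- become the output columns d1 .. d6.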
    
    ANALYZE matrix; -- update statistics to help calculations
    
    
    --== Calculations ==--
    
    -- I implemented the first calculation for X1 and leave the rest to you.
    -- Can probably be generalized in a loop or even a single statement.
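    -- The subquery below takes the three months preceding m.granularity and
    -- computes the ratio (sum(d1..d4) / sum(d1..d3)) - 1, i.e. the month-4
    -- recovery relative to the first three months; that ratio is applied to
    -- the current row's d1 + d2 + d3 to fill in the missing d4.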
    
    UPDATE matrix m
    SET    d4 = (
        SELECT (sum(x.d1) + sum(x.d2) + sum(x.d3) + sum(x.d4))
                /(sum(x.d1) + sum(x.d2) + sum(x.d3)) - 1
                -- removed redundant sum(entry_amount) from equation
        FROM  (
            SELECT *
            FROM   matrix a
            WHERE  a.granularity < m.granularity
            ORDER  BY a.granularity DESC
            LIMIT  3
            ) x
        ) * (m.d1 + m.d2 + m.d3)
    WHERE m.granularity = '2012-04-30';
    
    -- Next, update X2 ..
    
    
    --== Return results ==--
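    -- TABLE matrix is shorthand for SELECT * FROM matrix.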
    
    RETURN QUERY
    TABLE  matrix
    ORDER  BY 1;
    
    END;
    $BODY$ LANGUAGE plpgsql;
    

    Call:

    SELECT * FROM f_forcast();
    

    I have simplified quite a bit, removing some redundant steps in the calculation.
    The solution employs a variety of advanced techniques. You need to know your way around PostgreSQL to work with this.
