SQL query to answer: If an event occurs at timepoint A, does activity occur in time period B-C?

前端 未结 2 1364
感动是毒
感动是毒 2021-01-25 04:05

I'm querying a large data set to figure out whether a bunch of campaign events (e.g. event 1, 2, ...) at different timepoints result in user activity (active, inactive) dur

相关标签:
2条回答
  • 2021-01-25 04:31

    You are almost there. A range frame over a per-user partition is the right way to go. BigQuery only supports numeric sort keys in such a frame, so we need to convert the date to a number; since your dates have no time component, UNIX_DATE() comes to mind:

    -- Named window covering the same user's rows dated 1 to 3 days after the
    -- current row. UNIX_DATE() turns the date into an integer day number so that
    -- a numeric RANGE offset can be applied to it.
    -- The window name starts with a digit, so it is backtick-quoted; references
    -- to it (OVER `3d_later`) need the same quoting.
    -- NOTE(review): cm is presumably a table alias from the asker's query - confirm.
    WINDOW `3d_later` AS (
        PARTITION BY user
        ORDER BY UNIX_DATE(cm.date)
        RANGE BETWEEN 1 FOLLOWING AND 3 FOLLOWING
    )
    
    0 讨论(0)
  • 2021-01-25 04:35

    The example below is for BigQuery Standard SQL.

    #standardSQL
    -- For every row that carries an event, count the same user's 'active' rows
    -- dated 1 to 3 days later; rows without an event get 0.
    -- UNIX_DATE() supplies the integer sort key that the numeric RANGE frame needs.
    SELECT
      *,
      CASE
        WHEN events IS NULL THEN 0
        ELSE COUNTIF(day_activity = 'active') OVER (next_three_days)
      END AS three_day_activity
    FROM `project.dataset.table`
    WINDOW next_three_days AS (
      PARTITION BY user
      ORDER BY UNIX_DATE(date)
      RANGE BETWEEN 1 FOLLOWING AND 3 FOLLOWING
    )
    

    You can test and play with the above using the sample data from your question, as in the example below.

    #standardSQL
    -- Inline sample data standing in for the real table: two users, four days.
    WITH `project.dataset.table` AS (
      SELECT DATE '2020-01-01' AS date, 1 AS user, 'event1' AS events, 'active' AS day_activity
      UNION ALL SELECT DATE '2020-01-01', 2, 'event1', 'inactive'
      UNION ALL SELECT DATE '2020-01-02', 1, NULL, 'inactive'
      UNION ALL SELECT DATE '2020-01-02', 2, NULL, 'active'
      UNION ALL SELECT DATE '2020-01-03', 1, NULL, 'inactive'
      UNION ALL SELECT DATE '2020-01-03', 2, NULL, 'active'
      UNION ALL SELECT DATE '2020-01-04', 1, NULL, 'active'
      UNION ALL SELECT DATE '2020-01-04', 2, NULL, 'active'
    )
    -- For every row that carries an event, count the same user's 'active' rows
    -- dated 1 to 3 days later; rows without an event get 0.
    SELECT
      *,
      CASE
        WHEN events IS NULL THEN 0
        ELSE COUNTIF(day_activity = 'active') OVER (next_three_days)
      END AS three_day_activity
    FROM `project.dataset.table`
    WINDOW next_three_days AS (
      PARTITION BY user
      ORDER BY UNIX_DATE(date)
      RANGE BETWEEN 1 FOLLOWING AND 3 FOLLOWING
    )
    ORDER BY date, user
    

    with output

    Row date        user    events  day_activity    three_day_activity   
    1   2020-01-01  1       event1  active          1    
    2   2020-01-01  2       event1  inactive        3    
    3   2020-01-02  1       null    inactive        0    
    4   2020-01-02  2       null    active          0    
    5   2020-01-03  1       null    inactive        0    
    6   2020-01-03  2       null    active          0    
    7   2020-01-04  1       null    active          0    
    8   2020-01-04  2       null    active          0       
    

    Update, in response to the follow-up question: how to avoid registering the same user as active multiple times in one day (and tallying those up to a huge sum)?

    If you want to avoid counting every activity row for a user on the same day, use the adjusted version below (note the extra entry in the sample data, which gives one user multiple activity rows on the same day).

    #standardSQL
    -- Inline sample data; user 1 has two rows on 2020-01-04 to exercise de-duplication.
    WITH `project.dataset.table` AS (
      SELECT DATE '2020-01-01' AS date, 1 AS user, 'event1' AS events, 'active' AS day_activity
      UNION ALL SELECT DATE '2020-01-01', 2, 'event1', 'inactive'
      UNION ALL SELECT DATE '2020-01-02', 1, NULL, 'inactive'
      UNION ALL SELECT DATE '2020-01-02', 2, NULL, 'active'
      UNION ALL SELECT DATE '2020-01-03', 1, NULL, 'inactive'
      UNION ALL SELECT DATE '2020-01-03', 2, NULL, 'active'
      UNION ALL SELECT DATE '2020-01-04', 1, NULL, 'active'
      UNION ALL SELECT DATE '2020-01-04', 1, NULL, 'active'
      UNION ALL SELECT DATE '2020-01-04', 2, NULL, 'active'
    ),
    -- Collapse to one row per user per day so a day is counted at most once.
    -- MAX(events) keeps a non-NULL event when any row of that day has one.
    -- MIN(day_activity) yields 'active' whenever the day has an 'active' row,
    -- because 'active' sorts before 'inactive'.
    daily_activity AS (
      SELECT
        date,
        user,
        MAX(events) AS events,
        MIN(day_activity) AS day_activity
      FROM `project.dataset.table`
      GROUP BY date, user
    )
    -- For every day that carries an event, count the same user's 'active' days
    -- dated 1 to 3 days later; days without an event get 0.
    SELECT
      *,
      CASE
        WHEN events IS NULL THEN 0
        ELSE COUNTIF(day_activity = 'active') OVER (next_three_days)
      END AS three_day_activity
    FROM daily_activity
    WINDOW next_three_days AS (
      PARTITION BY user
      ORDER BY UNIX_DATE(date)
      RANGE BETWEEN 1 FOLLOWING AND 3 FOLLOWING
    )
    ORDER BY date, user
    
    0 讨论(0)
提交回复
热议问题