I'm querying a large data set to figure out whether a series of campaign events (i.e. event 1, 2, ...) at different time points results in user activity (active, inactive) dur
You are almost there. A window with a RANGE frame is the right way to go. BigQuery only supports a numeric ordering key in such a frame, so we need to convert the date to a number; since your dates have no time component, UNIX_DATE()
comes to mind:
-- Named window: for each row, the same user's rows dated 1 to 3 days later.
-- NOTE(review): BigQuery unquoted identifiers must begin with a letter or
-- underscore, so the name 3d_later likely needs backticks or a rename - confirm
-- against the full query (the OVER clauses that reference it are not shown here).
WINDOW 3d_later AS (
PARTITION BY user
-- The RANGE frame needs a numeric ordering key, so the date is converted to a
-- day number; the cm alias is defined in the part of the query not shown here.
ORDER BY UNIX_DATE(cm.date)
-- Offsets are in days; starting at 1 FOLLOWING leaves the current day out.
RANGE BETWEEN 1 FOLLOWING AND 3 FOLLOWING
)
The example below is for BigQuery Standard SQL:
#standardSQL
-- For every row that carries a campaign event, count the same user's 'active'
-- rows dated 1 to 3 days later; rows without an event report 0.
SELECT
  *,
  CASE
    WHEN events IS NULL THEN 0
    ELSE COUNTIF(day_activity = 'active') OVER (
      PARTITION BY user
      -- A RANGE frame needs a numeric ordering key, hence the day number.
      ORDER BY UNIX_DATE(date)
      -- Starting at 1 FOLLOWING keeps the current day out of the count.
      RANGE BETWEEN 1 FOLLOWING AND 3 FOLLOWING
    )
  END AS three_day_activity
FROM `project.dataset.table`
You can test and play with the above using the sample data from your question, as in the example below:
#standardSQL
-- Self-contained demo: the CTE stands in for `project.dataset.table` and holds
-- the sample rows, so the query runs without access to the real table.
WITH `project.dataset.table` AS (
  SELECT DATE '2020-01-01' AS date, 1 AS user, 'event1' AS events, 'active' AS day_activity
  UNION ALL SELECT '2020-01-01', 2, 'event1', 'inactive'
  UNION ALL SELECT '2020-01-02', 1, NULL, 'inactive'
  UNION ALL SELECT '2020-01-02', 2, NULL, 'active'
  UNION ALL SELECT '2020-01-03', 1, NULL, 'inactive'
  UNION ALL SELECT '2020-01-03', 2, NULL, 'active'
  UNION ALL SELECT '2020-01-04', 1, NULL, 'active'
  UNION ALL SELECT '2020-01-04', 2, NULL, 'active'
)
SELECT
  date,
  user,
  events,
  day_activity,
  -- Only rows with a campaign event get a count; every other row reports 0.
  CASE
    WHEN events IS NULL THEN 0
    ELSE COUNTIF(day_activity = 'active') OVER next_three_days
  END AS three_day_activity
FROM `project.dataset.table`
-- Per user, the rows dated 1 to 3 days after the current row (current day excluded).
WINDOW next_three_days AS (
  PARTITION BY user
  -- A RANGE frame needs a numeric ordering key, hence the day number.
  ORDER BY UNIX_DATE(date)
  RANGE BETWEEN 1 FOLLOWING AND 3 FOLLOWING
)
ORDER BY
  date,
  user
with output
Row date user events day_activity three_day_activity
1 2020-01-01 1 event1 active 1
2 2020-01-01 2 event1 inactive 3
3 2020-01-02 1 null inactive 0
4 2020-01-02 2 null active 0
5 2020-01-03 1 null inactive 0
6 2020-01-03 2 null active 0
7 2020-01-04 1 null active 0
8 2020-01-04 2 null active 0
Update, in response to the follow-up question: how can I avoid registering the same user as active multiple times in one day (and tallying those up to a huge sum)?
If you want to avoid counting every activity row for a user on the same day, use the adjusted version below (note the extra entry in the sample data, which introduces multiple activity rows for one user on the same day):
#standardSQL
-- Demo of the de-duplicated variant: activity is first collapsed to one row per
-- (date, user), so a user with several rows on one day is counted once for it.
WITH `project.dataset.table` AS (
  -- Sample rows; user 1 appears twice on 2020-01-04 to exercise the collapse.
  SELECT DATE '2020-01-01' AS DATE, 1 AS user, 'event1' AS events, 'active' AS day_activity
  UNION ALL SELECT '2020-01-01', 2, 'event1', 'inactive'
  UNION ALL SELECT '2020-01-02', 1, NULL, 'inactive'
  UNION ALL SELECT '2020-01-02', 2, NULL, 'active'
  UNION ALL SELECT '2020-01-03', 1, NULL, 'inactive'
  UNION ALL SELECT '2020-01-03', 2, NULL, 'active'
  UNION ALL SELECT '2020-01-04', 1, NULL, 'active'
  UNION ALL SELECT '2020-01-04', 1, NULL, 'active'
  UNION ALL SELECT '2020-01-04', 2, NULL, 'active'
),
daily_activity AS (
  SELECT
    date,
    user,
    -- MAX skips NULLs, so the day keeps an event label if any of its rows has one.
    MAX(events) AS events,
    -- 'active' sorts before 'inactive', so MIN yields 'active' when any row is active.
    MIN(day_activity) AS day_activity
  FROM `project.dataset.table`
  GROUP BY
    date,
    user
)
SELECT
  *,
  -- Only rows with a campaign event get a count; every other row reports 0.
  CASE
    WHEN events IS NULL THEN 0
    ELSE COUNTIF(day_activity = 'active') OVER next_three_days
  END AS three_day_activity
FROM daily_activity
-- Per user, the rows dated 1 to 3 days after the current row (current day excluded).
WINDOW next_three_days AS (
  PARTITION BY user
  -- A RANGE frame needs a numeric ordering key, hence the day number.
  ORDER BY UNIX_DATE(date)
  RANGE BETWEEN 1 FOLLOWING AND 3 FOLLOWING
)
ORDER BY
  date,
  user