From ebda48ad1de0ac97a6d713fed726e7839e2c65c6 Mon Sep 17 00:00:00 2001 From: desmond-hui <97470747+desmond-hui@users.noreply.github.com> Date: Wed, 12 Feb 2025 07:02:24 -0800 Subject: [PATCH] use row_number to dedupe (#798) --- ..._transactions_and_votes_missing_7_days.sql | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/tests/test_silver__transactions_and_votes_missing_7_days.sql b/tests/test_silver__transactions_and_votes_missing_7_days.sql index 5d49c1f5..5ce0f116 100644 --- a/tests/test_silver__transactions_and_votes_missing_7_days.sql +++ b/tests/test_silver__transactions_and_votes_missing_7_days.sql @@ -21,21 +21,28 @@ silver_counts AS ( count(tx_id) AS transaction_count FROM ( - SELECT - block_id, - tx_id - FROM - {{ ref('silver__transactions') }} t - WHERE - block_timestamp BETWEEN current_date - 8 AND current_timestamp - INTERVAL '12 HOUR' - UNION - SELECT - block_id, - tx_id - FROM - solana.silver.votes t - WHERE - block_timestamp BETWEEN current_date - 8 AND current_timestamp - INTERVAL '12 HOUR' + -- Deduplicate using row_number() instead of `UNION` or `count(DISTINCT)` since it is faster for large datasets + SELECT + * + FROM ( + SELECT + block_id, + tx_id + FROM + {{ ref('silver__transactions') }} t + WHERE + block_timestamp BETWEEN current_date - 8 AND current_timestamp - INTERVAL '12 HOUR' + UNION ALL + SELECT + block_id, + tx_id + FROM + solana.silver.votes t + WHERE + block_timestamp BETWEEN current_date - 8 AND current_timestamp - INTERVAL '12 HOUR' + ) + QUALIFY + row_number() OVER (PARTITION BY tx_id ORDER BY block_id) = 1 ) GROUP BY 1