Skip to content

Commit

Permalink
feat: metrics by collection (#929)
Browse files Browse the repository at this point in the history
* feat: onchain metrics by collection

* feat: github metrics by collection
  • Loading branch information
ccerv1 authored Feb 24, 2024
1 parent 29373f0 commit 842e6e6
Show file tree
Hide file tree
Showing 4 changed files with 244 additions and 5 deletions.
101 changes: 101 additions & 0 deletions dbt/models/marts/github_metrics/github_metrics_by_collection.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
{#
Summary GitHub metrics for a collection:
- first_commit_date: The date of the first commit to the collection
- last_commit_date: The date of the last commit to the collection
- repos: The number of repositories in the collection
- stars: The number of stars the collection has
- forks: The number of forks the collection has
- contributors: The number of contributors to the collection
- contributors_6_months: The number of contributors to the collection in the last 6 months
- new_contributors_6_months: The number of new contributors to the collection in the last 6 months
- avg_active_devs_6_months: The average number of active developers in the last 6 months
- commits_6_months: The number of commits to the collection in the last 6 months
- issues_opened_6_months: The number of issues opened in the collection in the last 6 months
- issues_closed_6_months: The number of issues closed in the collection in the last 6 months
- pull_requests_opened_6_months: The number of pull requests opened in the collection in the last 6 months
- pull_requests_merged_6_months: The number of pull requests merged in the collection in the last 6 months
#}

-- CTE for calculating the first and last commit date for each collection, ignoring forked repos
WITH collection_commit_dates AS (
SELECT
pbc.collection_id,
MIN(e.time) AS first_commit_date,
MAX(e.time) AS last_commit_date
FROM {{ ref('int_events_to_project') }} AS e
JOIN {{ ref('stg_ossd__repositories_by_project') }} AS r ON e.project_id = r.project_id
JOIN {{ ref('stg_ossd__projects_by_collection') }} AS pbc ON r.project_id = pbc.project_id
WHERE e.event_type = 'COMMIT_CODE'
AND r.is_fork = false
GROUP BY pbc.collection_id
),
-- CTE for aggregating stars, forks, and repository counts by collection
collection_stars_forks_repos AS (
SELECT
pbc.collection_id,
COUNT(DISTINCT r.id) AS repos,
SUM(r.star_count) AS stars,
SUM(r.fork_count) AS forks
FROM {{ ref('stg_ossd__repositories_by_project') }} AS r
JOIN {{ ref('stg_ossd__projects_by_collection') }} AS pbc ON r.project_id = pbc.project_id
WHERE r.is_fork = false
GROUP BY pbc.collection_id
),
-- CTE for calculating contributor counts and new contributors in the last 6 months at collection level
collection_contributors AS (
SELECT
pbc.collection_id,
COUNT(DISTINCT d.from_id) AS contributors,
COUNT(DISTINCT CASE WHEN d.bucket_month >= DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) THEN d.from_id END) AS contributors_6_months,
COUNT(DISTINCT CASE WHEN d.bucket_month >= DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) AND d.user_segment_type IN ('FULL_TIME_DEV', 'PART_TIME_DEV') THEN CONCAT(d.from_id, '_', d.bucket_month) END) / 6 AS avg_active_devs_6_months,
COUNT(DISTINCT CASE WHEN d.first_contribution_date >= DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH) THEN d.from_id END) AS new_contributors_6_months
FROM (
SELECT
d.from_id,
pbc.collection_id,
d.bucket_month,
d.user_segment_type,
MIN(d.bucket_month) OVER (PARTITION BY d.from_id, pbc.collection_id) AS first_contribution_date
FROM {{ ref('int_devs') }} AS d
JOIN {{ ref('stg_ossd__projects_by_collection') }} AS pbc ON d.project_id = pbc.project_id
) AS d
GROUP BY pbc.collection_id
),
-- CTE for summarizing collection activity metrics over the past 6 months
collection_activity AS (
SELECT
pbc.collection_id,
SUM(CASE WHEN e.event_type = 'COMMIT_CODE' THEN e.amount END) AS commits_6_months,
SUM(CASE WHEN e.event_type = 'ISSUE_OPENED' THEN e.amount END) AS issues_opened_6_months,
SUM(CASE WHEN e.event_type = 'ISSUE_CLOSED' THEN e.amount END) AS issues_closed_6_months,
SUM(CASE WHEN e.event_type = 'PULL_REQUEST_OPENED' THEN e.amount END) AS pull_requests_opened_6_months,
SUM(CASE WHEN e.event_type = 'PULL_REQUEST_MERGED' THEN e.amount END) AS pull_requests_merged_6_months
FROM {{ ref('int_events_to_project') }} AS e
JOIN {{ ref('stg_ossd__projects_by_collection') }} AS pbc ON e.project_id = pbc.project_id
WHERE e.time >= DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH)
GROUP BY pbc.collection_id
)

-- Final query to join all the metrics together for collections
SELECT
c.collection_id,
c.collection_name,
ccd.first_commit_date,
ccd.last_commit_date,
csfr.repos,
csfr.stars,
csfr.forks,
cc.contributors,
cc.contributors_6_months,
cc.new_contributors_6_months,
cc.avg_active_devs_6_months,
ca.commits_6_months,
ca.issues_opened_6_months,
ca.issues_closed_6_months,
ca.pull_requests_opened_6_months,
ca.pull_requests_merged_6_months
FROM {{ ref('collections') }} AS c
INNER JOIN collection_commit_dates AS ccd ON c.collection_id = ccd.collection_id
INNER JOIN collection_stars_forks_repos AS csfr ON c.collection_id = csfr.collection_id
INNER JOIN collection_contributors AS cc ON c.collection_id = cc.collection_id
INNER JOIN collection_activity AS ca ON c.collection_id = ca.collection_id
8 changes: 4 additions & 4 deletions dbt/models/marts/github_metrics/github_metrics_by_project.sql
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ SELECT
act.pull_requests_opened_6_months,
act.pull_requests_merged_6_months
FROM {{ ref('projects') }} AS p
JOIN project_commit_dates AS pcd ON p.project_id = pcd.project_id
JOIN stars_forks_repos AS sfr ON p.project_id = sfr.project_id
JOIN contributors_cte AS c ON p.project_id = c.project_id
JOIN activity_cte AS act ON p.project_id = act.project_id
INNER JOIN project_commit_dates AS pcd ON p.project_id = pcd.project_id
INNER JOIN stars_forks_repos AS sfr ON p.project_id = sfr.project_id
INNER JOIN contributors_cte AS c ON p.project_id = c.project_id
INNER JOIN activity_cte AS act ON p.project_id = act.project_id
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
{#
Arbitrum Onchain Metrics
Summary onchain metrics for a collection:
- num_projects: The number of projects in the collection
- num_contracts: The number of contracts in the collection
- first_txn_date: The date of the first transaction to the collection
- total_txns: The total number of transactions to the collection
- total_l2_gas: The total L2 gas used by the collection
- total_users: The number of unique users interacting with the collection
- txns_6_months: The total number of transactions to the collection in the last 6 months
- l2_gas_6_months: The total L2 gas used by the collection in the last 6 months
- users_6_months: The number of unique users interacting with the collection in the last 6 months
- new_users: The number of users interacting with the collection for the first time in the last 3 months
- active_users: The number of active users interacting with the collection in the last 3 months
- high_frequency_users: The number of users who have made 1000+ transactions with the collection in the last 3 months
- more_active_users: The number of users who have made 10-999 transactions with the collection in the last 3 months
- less_active_users: The number of users who have made 1-9 transactions with the collection in the last 3 months
#}


-- CTE for grabbing the onchain transaction data we care about, including project-collection mapping
WITH txns AS (
SELECT
pbc.collection_id,
a.project_id,
c.from_source_id AS from_id,
DATE(TIMESTAMP_TRUNC(c.time, MONTH)) AS bucket_month,
l2_gas,
tx_count
FROM {{ ref('stg_dune__arbitrum_contract_invocation') }} AS c
JOIN {{ ref('stg_ossd__artifacts_by_project') }} AS a ON c.to_source_id = a.artifact_source_id
JOIN {{ ref('stg_ossd__projects_by_collection') }} AS pbc ON a.project_id = pbc.project_id
),

-- CTEs for calculating all-time and 6-month collection metrics across all contracts
metrics_all_time AS (
SELECT
collection_id,
COUNT(DISTINCT project_id) AS total_projects,
MIN(bucket_month) AS first_txn_date,
COUNT(DISTINCT from_id) AS total_users,
SUM(l2_gas) AS total_l2_gas,
SUM(tx_count) AS total_txns
FROM txns
GROUP BY collection_id
),
metrics_6_months AS (
SELECT
collection_id,
COUNT(DISTINCT from_id) AS users_6_months,
SUM(l2_gas) AS l2_gas_6_months,
SUM(tx_count) AS txns_6_months
FROM txns
WHERE bucket_month >= DATE_ADD(CURRENT_DATE(), INTERVAL -6 MONTH)
GROUP BY collection_id
),

-- CTE for identifying new users to the collection in the last 3 months
new_users AS (
SELECT
collection_id,
SUM(is_new_user) AS new_user_count
FROM (
SELECT
collection_id,
from_id,
CASE WHEN MIN(bucket_month) >= DATE_ADD(CURRENT_DATE(), INTERVAL -3 MONTH) THEN 1 ELSE 0 END AS is_new_user
FROM txns
GROUP BY collection_id, from_id
)
GROUP BY collection_id
),

-- CTEs for segmenting different types of active users based on txn volume at collection level
user_txns_aggregated AS (
SELECT
collection_id,
from_id,
SUM(tx_count) AS total_tx_count
FROM txns
WHERE bucket_month >= DATE_ADD(CURRENT_DATE(), INTERVAL -3 MONTH)
GROUP BY collection_id, from_id
),
user_segments AS (
SELECT
collection_id,
COUNT(DISTINCT CASE WHEN user_segment = 'HIGH_FREQUENCY_USER' THEN from_id END) AS high_frequency_users,
COUNT(DISTINCT CASE WHEN user_segment = 'MORE_ACTIVE_USER' THEN from_id END) AS more_active_users,
COUNT(DISTINCT CASE WHEN user_segment = 'LESS_ACTIVE_USER' THEN from_id END) AS less_active_users
FROM (
SELECT
collection_id,
from_id,
CASE
WHEN total_tx_count >= 1000 THEN 'HIGH_FREQUENCY_USER'
WHEN total_tx_count >= 10 THEN 'MORE_ACTIVE_USER'
ELSE 'LESS_ACTIVE_USER'
END AS user_segment
FROM user_txns_aggregated
)
GROUP BY collection_id
),

-- CTE to count the number of contracts deployed by projects in a collection
contracts AS (
SELECT
pbc.collection_id,
COUNT(DISTINCT a.artifact_source_id) AS num_contracts
FROM {{ ref('stg_ossd__artifacts_by_project') }} AS a
JOIN {{ ref('stg_ossd__projects_by_collection') }} AS pbc ON a.project_id = pbc.project_id
GROUP BY pbc.collection_id
)

-- Final query to join all the metrics together for collections
SELECT
c.collection_id,
c.collection_name,
co.num_contracts,
ma.total_projects,
ma.first_txn_date,
ma.total_txns,
ma.total_l2_gas,
ma.total_users,
m6.txns_6_months,
m6.l2_gas_6_months,
m6.users_6_months,
nu.new_user_count AS new_users,
(us.high_frequency_users + us.more_active_users + us.less_active_users) AS active_users,
us.high_frequency_users,
us.more_active_users,
us.less_active_users

FROM {{ ref('collections') }} AS c
INNER JOIN metrics_all_time AS ma ON c.collection_id = ma.collection_id
INNER JOIN metrics_6_months AS m6 on c.collection_id = m6.collection_id
INNER JOIN new_users AS nu on c.collection_id = nu.collection_id
INNER JOIN user_segments AS us on c.collection_id = us.collection_id
INNER JOIN contracts AS co on c.collection_id = co.collection_id
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ SELECT
us.less_active_users

FROM {{ ref('projects') }} AS p
JOIN metrics_all_time AS ma ON p.project_id = ma.project_id
INNER JOIN metrics_all_time AS ma ON p.project_id = ma.project_id
INNER JOIN metrics_6_months AS m6 on p.project_id = m6.project_id
INNER JOIN new_users AS nu on p.project_id = nu.project_id
INNER JOIN user_segments AS us on p.project_id = us.project_id
Expand Down

0 comments on commit 842e6e6

Please sign in to comment.