From f8adb52ef932b16f92e3cabb8a042d876124d1a1 Mon Sep 17 00:00:00 2001 From: Fabian Beuke Date: Sat, 30 Mar 2024 09:57:52 +0100 Subject: [PATCH] replace outdated BQ lang table with GH Archive PR lang extract The table [bigquery-public-data:github_repos.languages] was last updated in Nov 2022. This is a significant issue: without further updates, we can only count events for the repositories in that outdated list. Hence, we need a new method to obtain a large enough sample of repository primary-language metadata. Fortunately, we can extract the language directly from PullRequest events, because their payload includes a language field. So, whenever there is a PullRequest event for any of the repos we want to include in our ranking, we are able to determine its language. These amount to many millions. The drawback is that we cannot include repositories that had no pull request in the current quarter. I think this is a fair trade-off for now, until a better solution is available. 
--- scripts/query.js | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/query.js b/scripts/query.js index 8e3de88..52fccfc 100755 --- a/scripts/query.js +++ b/scripts/query.js @@ -62,9 +62,13 @@ const queryBuilder = (tables) => { FROM ${tables} WHERE NOT LOWER(actor.login) LIKE "%bot%") a JOIN ( SELECT repo_name as name, lang FROM ( SELECT * FROM ( SELECT *, ROW_NUMBER() OVER (PARTITION BY repo_name ORDER BY lang) as num FROM ( - SELECT repo_name, FIRST_VALUE(language.name) OVER ( - partition by repo_name order by language.bytes DESC) AS lang - FROM [bigquery-public-data:github_repos.languages])) + SELECT + JSON_EXTRACT_SCALAR(payload, "$.pull_request.base.repo.language") as lang, + repo.name as repo_name + FROM ${tables} + WHERE + JSON_EXTRACT_SCALAR(payload, "$.pull_request.base.repo.language") IS NOT NULL + )) WHERE num = 1 order by repo_name) WHERE lang != 'null') b ON a.name = b.name) GROUP by type, language, year, quarter, actor.login