From 33ad00fc6263aaadc0361fbd71c6e5ddc33fc15c Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Mon, 25 Oct 2021 17:30:53 +0100 Subject: [PATCH] Third party 2021 query amends (#2404) * Add limit to 50 * Linting fixes * More ranking queries * Fix blocking query and linting * Update sql/2021/third-parties/third_parties_blocking_main_thread.sql * Update sql/2021/third-parties/third_parties_blocking_main_thread.sql Co-authored-by: Rick Viscomi --- ...distribution_of_3XX_response_body_size.sql | 15 +++- ...tion_of_size_and_time_by_third_parties.sql | 14 +++- ...of_third_parties_by_number_of_websites.sql | 15 +++- ...f_websites_by_number_of_third_parties.sql} | 14 +++- .../number_of_third_parties_by_rank.sql | 77 +++++++++++++++++ ..._of_third_parties_by_rank_and_category.sql | 83 +++++++++++++++++++ ...rcent_of_third_parties_by_content_type.sql | 15 +++- .../percent_of_third_party_cache.sql | 17 +++- ...d_party_loaded_before_DOMContentLoaded.sql | 13 ++- ...and_bytes_by_category_and_content_type.sql | 23 +++-- ...t_of_third_party_with_security_headers.sql | 15 +++- .../percent_of_websites_with_third_party.sql | 14 +++- ...f_websites_with_third_party_by_ranking.sql | 26 ++++-- sql/2021/third-parties/tao_by_third_party.sql | 21 +++-- .../third_parties_blocking_main_thread.sql | 8 +- ...rties_blocking_main_thread_percentiles.sql | 11 ++- .../third_parties_blocking_rendering.sql | 10 ++- ...parties_blocking_rendering_percentiles.sql | 4 +- ...d_parties_by_median_body_size_and_time.sql | 15 +++- ...00_third_parties_by_number_of_websites.sql | 15 +++- 20 files changed, 371 insertions(+), 54 deletions(-) rename sql/2021/third-parties/{distribution_of_websites_by number_of_third_parties.sql => distribution_of_websites_by_number_of_third_parties.sql} (76%) create mode 100644 sql/2021/third-parties/number_of_third_parties_by_rank.sql create mode 100644 sql/2021/third-parties/number_of_third_parties_by_rank_and_category.sql diff --git a/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql b/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql index c8d715bf9ec..89572407bca 100644 --- a/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql +++ b/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql @@ -5,6 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, + pageid AS page, url, status, respBodySize AS body_size @@ -14,12 +15,22 @@ WITH requests AS ( third_party AS ( SELECT - domain + domain, + category, + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 ), base AS ( diff --git a/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql b/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql index ae6d4379908..79ff1e40110 100644 --- a/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql +++ b/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql @@ -4,6 +4,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, + pageid AS page, url, respBodySize AS body_size, time @@ -13,13 +14,22 @@ WITH requests AS ( third_party AS ( SELECT + domain, category, - domain + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 ), base AS ( diff --git a/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql b/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql index f5c1171e2d0..c71de4abdf1 100644 --- a/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql +++ b/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql @@ -13,12 +13,23 @@ WITH requests AS ( third_party AS ( SELECT domain, - canonicalDomain + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 ), base AS ( diff --git a/sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql b/sql/2021/third-parties/distribution_of_websites_by_number_of_third_parties.sql similarity index 76% rename from sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql rename to sql/2021/third-parties/distribution_of_websites_by_number_of_third_parties.sql index a96662e38b7..ecd7db3a8ee 100644 --- a/sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql +++ b/sql/2021/third-parties/distribution_of_websites_by_number_of_third_parties.sql @@ -12,12 +12,22 @@ WITH requests AS ( third_party AS ( SELECT - domain + domain, + category, + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 ), base AS ( diff --git a/sql/2021/third-parties/number_of_third_parties_by_rank.sql b/sql/2021/third-parties/number_of_third_parties_by_rank.sql new file mode 100644 index 00000000000..51e3dc734c0 --- /dev/null +++ b/sql/2021/third-parties/number_of_third_parties_by_rank.sql @@ -0,0 +1,77 @@ +#standardSQL +# Number of third-parties per websites by rank +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + rank + FROM + `httparchive.summary_pages.2021_07_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2021-07-01' AND + category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + page, + rank, + COUNT(domain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + INNER JOIN + pages + USING + (client, page) + GROUP BY + client, + page, + rank +) + +SELECT + client, + rank_grouping, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping diff --git a/sql/2021/third-parties/number_of_third_parties_by_rank_and_category.sql b/sql/2021/third-parties/number_of_third_parties_by_rank_and_category.sql new file mode 100644 index 00000000000..ec485767567 --- /dev/null +++ b/sql/2021/third-parties/number_of_third_parties_by_rank_and_category.sql @@ -0,0 +1,83 @@ +#standardSQL +# Number of third-parties per websites by rank and category + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + url + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + rank + FROM + `httparchive.summary_pages.2021_07_01_*` +), + +third_party AS ( + SELECT + domain, + category, + COUNT(DISTINCT page) AS page_usage + FROM + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) + WHERE + date = '2021-07-01' AND + category NOT IN ('hosting') + GROUP BY + domain, + category + HAVING + page_usage >= 50 +), + +base AS ( + SELECT + client, + category, + page, + rank, + COUNT(domain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.url) = NET.HOST(third_party.domain) + INNER JOIN + pages + USING + (client, page) + GROUP BY + client, + category, + page, + rank +) + +SELECT + client, + category, + rank_grouping, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page +FROM + base, + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + category, + rank_grouping +ORDER BY + client, + category, + rank_grouping diff --git a/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql b/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql index 46835de28fa..f4bcbc32b15 100644 --- a/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql +++ b/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql @@ -4,6 +4,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, + pageid AS page, url, type AS contentType FROM @@ -12,12 +13,22 @@ WITH requests AS ( third_party AS ( SELECT - domain + domain, + category, + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 ) SELECT diff --git a/sql/2021/third-parties/percent_of_third_party_cache.sql b/sql/2021/third-parties/percent_of_third_party_cache.sql index 7294c55342c..14235555ddf 100644 --- a/sql/2021/third-parties/percent_of_third_party_cache.sql +++ b/sql/2021/third-parties/percent_of_third_party_cache.sql @@ -10,19 +10,30 @@ WITH requests AS ( respOtherHeaders, reqOtherHeaders, type, - url + url, + pageid AS page FROM `httparchive.summary_requests.2021_07_01_*` ), third_party AS ( SELECT - domain + domain, + category, + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 ), base AS ( diff --git a/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql b/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql index 638dcb6924b..b1a4d170295 100644 --- a/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql +++ b/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql @@ -22,13 +22,22 @@ pages AS ( third_party AS ( SELECT + domain, category, - domain + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 ), base AS ( diff --git a/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql b/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql index 365092b19e9..1532663655a 100644 --- a/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql +++ b/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql @@ -14,13 +14,22 @@ WITH requests AS ( third_party AS ( SELECT + domain, category, - domain + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 ), base AS ( @@ -62,11 +71,11 @@ SELECT category, contentType, SUM(requests) AS requests, - AVG(requests) AS avg_requests_per_page, - SAFE_DIVIDE(SUM(requests), SUM(total_page_requests)) AS avg_pct_requests_per_page, - AVG(body_size) AS avg_body_size_per_page, - SAFE_DIVIDE(SUM(body_size), SUM(total_page_size)) AS avg_pct_body_size_per_page -FROM requests_per_page_and_category + SAFE_DIVIDE(SUM(requests), SUM(SUM(requests)) OVER (PARTITION BY client, category)) AS pct_requests, + SUM(body_size) AS body_size, + SAFE_DIVIDE(SUM(body_size), SUM(SUM(body_size)) OVER (PARTITION BY client, category)) AS pct_body_size +FROM + requests_per_page_and_category GROUP BY client, category, diff --git a/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql b/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql index de683c34b2d..c48431cd54a 100644 --- a/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql +++ b/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql @@ -4,6 +4,8 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, + pageid AS page, + url, RTRIM(urlShort, '/') AS origin, respOtherHeaders FROM @@ -12,13 +14,22 @@ WITH requests AS ( third_party AS ( SELECT + domain, category, - domain + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 ), headers AS ( diff --git a/sql/2021/third-parties/percent_of_websites_with_third_party.sql b/sql/2021/third-parties/percent_of_websites_with_third_party.sql index d8826ef6403..52cf0dacfa0 100644 --- a/sql/2021/third-parties/percent_of_websites_with_third_party.sql +++ b/sql/2021/third-parties/percent_of_websites_with_third_party.sql @@ -12,12 +12,22 @@ WITH requests AS ( third_party AS ( SELECT - domain + domain, + category, + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage > 50 ) SELECT diff --git a/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql b/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql index 6210a3ad4a7..2f133119869 100644 --- a/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql +++ b/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql @@ -4,7 +4,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, - pageid, + pageid AS page, url FROM `httparchive.summary_requests.2021_07_01_*` @@ -12,18 +12,28 @@ WITH requests AS ( third_party AS ( SELECT - domain + domain, + category, + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 ), pages AS ( SELECT _TABLE_SUFFIX AS client, - pageid, + pageid AS page, rank FROM `httparchive.summary_pages.2021_07_01_*` @@ -32,14 +42,14 @@ pages AS ( SELECT client, rank_grouping, - COUNT(DISTINCT IF(domain IS NOT NULL, pageid, NULL)) AS pages_with_third_party, - COUNT(DISTINCT pageid) AS total_pages, - COUNT(DISTINCT IF(domain IS NOT NULL, pageid, NULL)) / COUNT(DISTINCT pageid) AS pct_pages_with_third_party + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) AS pages_with_third_party, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) / COUNT(DISTINCT page) AS pct_pages_with_third_party FROM pages JOIN requests -USING (client, pageid) +USING (client, page) LEFT JOIN third_party ON NET.HOST(requests.url) = NET.HOST(third_party.domain), diff --git a/sql/2021/third-parties/tao_by_third_party.sql b/sql/2021/third-parties/tao_by_third_party.sql index 3106ac25fde..281091443ab 100644 --- a/sql/2021/third-parties/tao_by_third_party.sql +++ b/sql/2021/third-parties/tao_by_third_party.sql @@ -17,7 +17,8 @@ RETURNS STRING LANGUAGE js AS ''' WITH requests AS ( SELECT _TABLE_SUFFIX AS client, - pageid, + pageid AS page, + url, RTRIM(urlShort, '/') AS origin, respOtherHeaders FROM @@ -27,7 +28,8 @@ WITH requests AS ( pages AS ( SELECT _TABLE_SUFFIX AS client, - pageid, + url, + pageid AS page, RTRIM(urlShort, '/') AS origin FROM `httparchive.summary_pages.2021_07_01_*` @@ -35,13 +37,22 @@ pages AS ( third_party AS ( SELECT + domain, category, - domain + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + category + HAVING + page_usage >= 50 ), headers AS ( @@ -53,7 +64,7 @@ headers AS ( third_party.category AS req_category FROM requests LEFT JOIN pages - USING (client, pageid) + USING (client, page) INNER JOIN third_party ON NET.HOST(requests.origin) = NET.HOST(third_party.domain) ), diff --git a/sql/2021/third-parties/third_parties_blocking_main_thread.sql b/sql/2021/third-parties/third_parties_blocking_main_thread.sql index b57feb4b1d1..b8cde89bf43 100644 --- a/sql/2021/third-parties/third_parties_blocking_main_thread.sql +++ b/sql/2021/third-parties/third_parties_blocking_main_thread.sql @@ -10,9 +10,11 @@ SELECT domain, category, - COUNT(0) AS total_pages, + COUNT(DISTINCT page) AS total_pages, COUNTIF(blocking > 0) AS blocking_pages, + COUNT(DISTINCT page) - COUNTIF(blocking > 0) AS non_blocking_pages, COUNTIF(blocking > 0) / COUNT(0) AS blocking_pages_pct, + (COUNT(DISTINCT page) - COUNTIF(blocking > 0)) / COUNT(0) AS non_blocking_pages_pct, APPROX_QUANTILES(transfer_size_kib, 1000)[OFFSET(500)] AS p50_transfer_size_kib, APPROX_QUANTILES(blocking_time, 1000)[OFFSET(500)] AS p50_blocking_time FROM ( @@ -20,7 +22,7 @@ FROM ( JSON_VALUE(third_party_items, "$.entity.url") AS domain, page, JSON_VALUE(third_party_items, "$.entity.text") AS category, - COUNTIF(JSON_VALUE(third_party_items, "$.blockingTime") != "0") AS blocking, + COUNTIF(SAFE_CAST(JSON_VALUE(third_party_items, "$.blockingTime") AS FLOAT64) > 250) AS blocking, SUM(SAFE_CAST(JSON_VALUE(third_party_items, "$.blockingTime") AS FLOAT64)) AS blocking_time, SUM(SAFE_CAST(JSON_VALUE(third_party_items, "$.transferSize") AS FLOAT64) / 1024) AS transfer_size_kib FROM @@ -40,6 +42,8 @@ FROM ( GROUP BY domain, category +HAVING + total_pages >= 50 ORDER BY total_pages DESC LIMIT 200 diff --git a/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql b/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql index e7d7117fc84..1de228cf958 100644 --- a/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql +++ b/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql @@ -10,16 +10,17 @@ SELECT domain, category, - COUNT(0) AS total_pages, + COUNT(DISTINCT page) AS total_pages, + COUNTIF(blocking > 0) AS blocking_pages, percentile, - APPROX_QUANTILES(transfer_size_kib, 1000)[OFFSET(percentile * 10)] AS transfer_size_kib, - APPROX_QUANTILES(blocking_time, 1000)[OFFSET(percentile * 10)] AS blocking_time + APPROX_QUANTILES(transfer_size_kib, 1000)[OFFSET(percentile * 10)] AS p50_transfer_size_kib, + APPROX_QUANTILES(blocking_time, 1000)[OFFSET(percentile * 10)] AS p50_blocking_time FROM ( SELECT JSON_VALUE(third_party_items, "$.entity.url") AS domain, page, JSON_VALUE(third_party_items, "$.entity.text") AS category, - COUNTIF(JSON_VALUE(third_party_items, "$.blockingTime") != "0") AS blocking, + COUNTIF(SAFE_CAST(JSON_VALUE(third_party_items, "$.blockingTime") AS FLOAT64) > 250) AS blocking, SUM(SAFE_CAST(JSON_VALUE(third_party_items, "$.blockingTime") AS FLOAT64)) AS blocking_time, SUM(SAFE_CAST(JSON_VALUE(third_party_items, "$.transferSize") AS FLOAT64) / 1024) AS transfer_size_kib FROM @@ -41,6 +42,8 @@ GROUP BY domain, category, percentile +HAVING + total_pages >= 50 ORDER BY total_pages DESC, category, diff --git a/sql/2021/third-parties/third_parties_blocking_rendering.sql b/sql/2021/third-parties/third_parties_blocking_rendering.sql index c9f5f832595..babb1725e9a 100644 --- a/sql/2021/third-parties/third_parties_blocking_rendering.sql +++ b/sql/2021/third-parties/third_parties_blocking_rendering.sql @@ -29,16 +29,18 @@ total_third_party_usage AS ( GROUP BY canonicalDomain, category + HAVING + total_pages >= 50 ) SELECT canonicalDomain, category, total_pages, - COUNT(0) AS blocking_pages, - total_pages - COUNT(0) AS non_blocking_pages, - COUNT(0) / total_pages AS blocking_pages_pct, - 1 - (COUNT(0) / total_pages) AS non_blocking_pages_pct, + COUNT(DISTINCT page) AS blocking_pages, + total_pages - COUNT(DISTINCT page) AS non_blocking_pages, + COUNT(DISTINCT page) / total_pages AS blocking_pages_pct, + (total_pages - COUNT(DISTINCT page)) / total_pages AS non_blocking_pages_pct, APPROX_QUANTILES(wasted_ms, 1000)[OFFSET(500)] AS p50_wastedMs, APPROX_QUANTILES(total_bytes_kib, 1000)[OFFSET(500)] AS p50_total_bytes_kib FROM ( diff --git a/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql b/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql index a103488b368..27dead60974 100644 --- a/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql +++ b/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql @@ -29,13 +29,15 @@ total_third_party_usage AS ( GROUP BY canonicalDomain, category + HAVING + total_pages >= 50 ) SELECT canonicalDomain, category, total_pages, - COUNT(0) AS blocking_pages, + COUNT(DISTINCT page) AS blocking_pages, percentile, APPROX_QUANTILES(wasted_ms, 1000)[OFFSET(percentile * 10)] AS wasted_ms, APPROX_QUANTILES(total_bytes_kib, 1000)[OFFSET(percentile * 10)] AS total_bytes_kib diff --git a/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql b/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql index 809a6ac33d4..04eab0a90d2 100644 --- a/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql +++ b/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql @@ -5,6 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, url, + pageid AS page, respBodySize AS body_size, time FROM @@ -14,13 +15,23 @@ WITH requests AS ( third_party AS ( SELECT domain, + category, canonicalDomain, - category + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 ), base AS ( diff --git a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql index 7d4a1e3db5d..e1ad033079a 100644 --- a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql +++ b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql @@ -22,12 +22,23 @@ totals AS ( third_party AS ( SELECT domain, - canonicalDomain + canonicalDomain, + category, + COUNT(DISTINCT page) AS page_usage FROM - `httparchive.almanac.third_parties` + `httparchive.almanac.third_parties` tp + JOIN + requests r + ON NET.HOST(r.url) = NET.HOST(tp.domain) WHERE date = '2021-07-01' AND category != 'hosting' + GROUP BY + domain, + canonicalDomain, + category + HAVING + page_usage >= 50 ) SELECT