From 9263fb3957128c0693d20e0acded527fc8725d7d Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 30 Sep 2021 11:36:03 +0100 Subject: [PATCH 01/21] Conversion of 2021 queries --- sql/2021/third-parties/README.md | 0 ...distribution_of_3XX_response_body_size.sql | 52 ++++++++++++ ...tion_of_size_and_time_by_third_parties.sql | 51 ++++++++++++ ...of_third_parties_by_number_of_websites.sql | 53 ++++++++++++ ...of_websites_by number_of_third_parties.sql | 50 +++++++++++ ...rcent_of_third_parties_by_content_type.sql | 51 ++++++++++++ .../percent_of_third_party_cache.sql | 61 ++++++++++++++ ...d_party_loaded_before_DOMContentLoaded.sql | 55 +++++++++++++ ...and_bytes_by_category_and_content_type.sql | 69 ++++++++++++++++ ...t_of_third_party_with_security_headers.sql | 53 ++++++++++++ .../percent_of_websites_with_third_party.sql | 31 +++++++ sql/2021/third-parties/tao_by_third_party.sql | 82 +++++++++++++++++++ ...d_parties_by_median_body_size_and_time.sql | 76 +++++++++++++++++ ...00_third_parties_by_number_of_websites.sql | 60 ++++++++++++++ .../zero_third_party_websites.sql | 67 +++++++++++++++ 15 files changed, 811 insertions(+) create mode 100644 sql/2021/third-parties/README.md create mode 100644 sql/2021/third-parties/distribution_of_3XX_response_body_size.sql create mode 100644 sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql create mode 100644 sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql create mode 100644 sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql create mode 100644 sql/2021/third-parties/percent_of_third_parties_by_content_type.sql create mode 100644 sql/2021/third-parties/percent_of_third_party_cache.sql create mode 100644 sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql create mode 100644 sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql create mode 100644 
sql/2021/third-parties/percent_of_third_party_with_security_headers.sql create mode 100644 sql/2021/third-parties/percent_of_websites_with_third_party.sql create mode 100644 sql/2021/third-parties/tao_by_third_party.sql create mode 100644 sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql create mode 100644 sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql create mode 100644 sql/2021/third-parties/zero_third_party_websites.sql diff --git a/sql/2021/third-parties/README.md b/sql/2021/third-parties/README.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql b/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql new file mode 100644 index 00000000000..67cd1a665e7 --- /dev/null +++ b/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql @@ -0,0 +1,52 @@ +#standardSQL +# Distribution of response body size by redirected third parties +# HTTP status codes documentation: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + req_host AS host, + status, + respBodySize AS body_size + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +third_party AS ( + SELECT + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +base AS ( + SELECT + client, + domain, + IF(status BETWEEN 300 AND 399, 1, 0) AS redirected, + body_size + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.host) = NET.HOST(third_party.domain) +) + +SELECT + client, + percentile, + APPROX_QUANTILES(body_size, 1000)[OFFSET(percentile * 10)] AS approx_redirect_body_size +FROM + base, + UNNEST([10, 25, 50, 75, 90]) AS percentile +WHERE + redirected = 1 +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql 
b/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql new file mode 100644 index 00000000000..e17c8be4667 --- /dev/null +++ b/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql @@ -0,0 +1,51 @@ +#standardSQL +# Distribution of third party requests size and time by category + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + req_host AS host, + respBodySize AS body_size, + time + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +third_party AS ( + SELECT + category, + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +base AS ( + SELECT + client, + category, + body_size, + time + FROM + requests + INNER JOIN + third_party + ON + NET.HOST(requests.host) = NET.HOST(third_party.domain) +) + +SELECT + category, + percentile, + APPROX_QUANTILES(body_size, 1000)[OFFSET(percentile * 10)] AS body_size, + APPROX_QUANTILES(time, 1000)[OFFSET(percentile * 10)] AS time +FROM + base, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + category, + percentile +ORDER BY + category, + percentile diff --git a/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql b/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql new file mode 100644 index 00000000000..34788925ccf --- /dev/null +++ b/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql @@ -0,0 +1,53 @@ +#standardSQL +# Distribution of third parties by number of websites + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + req_host AS host + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +base AS ( + SELECT + client, + canonicalDomain, + COUNT(DISTINCT page) AS pages_per_third_party + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.host) = NET.HOST(third_party.domain) 
+ WHERE + canonicalDomain IS NOT NULL + GROUP BY + client, + canonicalDomain +) + +SELECT + client, + percentile, + APPROX_QUANTILES(pages_per_third_party, 1000)[OFFSET(percentile * 10)] AS approx_pages_per_third_party +FROM + base, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql b/sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql new file mode 100644 index 00000000000..944d823df66 --- /dev/null +++ b/sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql @@ -0,0 +1,50 @@ +#standardSQL +# Distribution of websites by number of third party + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + req_host AS host + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +third_party AS ( + SELECT + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +base AS ( + SELECT + client, + page, + COUNT(domain) AS third_parties_per_page + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.host) = NET.HOST(third_party.domain) + GROUP BY + client, + page +) + +SELECT + client, + percentile, + APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(percentile * 10)] AS approx_third_parties_per_page +FROM + base, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + client, + percentile +ORDER BY + client, + percentile diff --git a/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql b/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql new file mode 100644 index 00000000000..213437d5edd --- /dev/null +++ b/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql @@ -0,0 +1,51 @@ +#standardSQL +# Percent of third party requests by content type. 
+ +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + req_host AS host, + type AS contentType + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +third_party AS ( + SELECT + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +base AS ( + SELECT + client, + contentType, + COUNT(0) OVER (PARTITION BY client) AS total_requests + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.host) = NET.HOST(third_party.domain) + WHERE + domain IS NOT NULL +) + +SELECT + client, + contentType, + total_requests, + COUNT(0) AS requests, + COUNT(0) / total_requests AS pct_requests +FROM + base +GROUP BY + client, + contentType, + total_requests +ORDER BY + client, + contentType diff --git a/sql/2021/third-parties/percent_of_third_party_cache.sql b/sql/2021/third-parties/percent_of_third_party_cache.sql new file mode 100644 index 00000000000..69ebdcffcb9 --- /dev/null +++ b/sql/2021/third-parties/percent_of_third_party_cache.sql @@ -0,0 +1,61 @@ +#standardSQL +# Percent of third party requests cached +# Cache-Control documentation: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Cache-Control#Directives + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + resp_cache_control, + status, + respOtherHeaders, + reqOtherHeaders, + type, + req_host AS host + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +third_party AS ( + SELECT + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +base AS ( + SELECT + client, + type, + IF( + ( + status IN (301, 302, 307, 308, 410) AND + NOT REGEXP_CONTAINS(resp_cache_control, r'(?i)private|no-store') AND + NOT REGEXP_CONTAINS(reqOtherHeaders, r'Authorization') + ) OR + ( + status IN (301, 302, 307, 308, 410) OR + REGEXP_CONTAINS(resp_cache_control, r'public|max-age|s-maxage') OR + REGEXP_CONTAINS(respOtherHeaders, r'Expires') + ), 1, 0) AS cached + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.host) = 
NET.HOST(third_party.domain) + WHERE + domain IS NOT NULL +) + +SELECT + client, + type, + COUNT(0) AS total_requests, + SUM(cached) / COUNT(0) AS pct_cached_requests +FROM + base +GROUP BY + client, + type diff --git a/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql b/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql new file mode 100644 index 00000000000..951d8ef65ed --- /dev/null +++ b/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql @@ -0,0 +1,55 @@ +#standardSQL +# Percent of third-party requests loaded before DOM Content Loaded event + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + page, + url, + SAFE_CAST(JSON_EXTRACT_SCALAR(payload, '$._load_end') AS INT64) AS load_end + FROM + `httparchive.requests.2021_07_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + url, + onContentLoaded + FROM + `httparchive.summary_pages.2021_07_01_*` +), + +third_party AS ( + SELECT + category, + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +base AS ( + SELECT + requests.client AS client, + third_party.domain AS request_domain, + IF(requests.load_end < pages.onContentLoaded, 1, 0) AS early_request, + third_party.category AS request_category + FROM requests + INNER JOIN third_party + ON NET.HOST(requests.url) = NET.HOST(third_party.domain) + LEFT JOIN pages + ON requests.page = pages.url AND requests.client = pages.client +) + +SELECT + client, + request_category, + COUNT(0) AS total_requests, + SUM(early_request) / COUNT(0) AS pct_early_requests +FROM + base +GROUP BY + client, + request_category diff --git a/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql b/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql new file mode 100644 index 00000000000..271b0e27c4a --- /dev/null +++ 
b/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql @@ -0,0 +1,69 @@ +#standardSQL +# Percent of third party requests and bytes by category and content type. + +WITH requests AS ( + SELECT + pageid AS page, + req_host AS host, + type AS contentType, + respBodySize AS body_size + FROM + `httparchive.summary_requests.2021_07_01_mobile` +), + +third_party AS ( + SELECT + category, + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +base AS ( + SELECT + page, + category, + contentType, + body_size + FROM + requests + INNER JOIN + third_party + ON + NET.HOST(requests.host) = NET.HOST(third_party.domain) +), + +requests_per_page_and_category AS ( + SELECT + page, + category, + contentType, + SUM(SUM(body_size)) OVER (PARTITION BY page) AS total_page_size, + SUM(body_size) AS body_size, + SUM(COUNT(0)) OVER (PARTITION BY page) AS total_page_requests, + COUNT(0) AS requests + FROM + base + GROUP BY + page, + category, + contentType +) + +SELECT + category, + contentType, + SUM(requests) AS requests, + AVG(requests) AS avg_requests_per_page, + SAFE_DIVIDE(SUM(requests), SUM(total_page_requests)) AS avg_pct_requests_per_page, + AVG(body_size) AS avg_body_size_per_page, + SAFE_DIVIDE(SUM(body_size), SUM(total_page_size)) AS avg_pct_body_size_per_page +FROM requests_per_page_and_category +GROUP BY + category, + contentType +ORDER BY + category, + contentType diff --git a/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql b/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql new file mode 100644 index 00000000000..4685a2d27df --- /dev/null +++ b/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql @@ -0,0 +1,53 @@ +#standardSQL +# Percent of third-party requests with security headers + +WITH requests AS ( + SELECT + RTRIM(urlShort, '/') AS origin, + respOtherHeaders + FROM + `httparchive.summary_requests.2021_07_01_mobile` +), + 
+third_party AS ( + SELECT + category, + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +headers AS ( + SELECT + requests.origin AS req_origin, + LOWER(respOtherHeaders) AS respOtherHeaders, + third_party.category AS req_category + FROM requests + INNER JOIN third_party + ON NET.HOST(requests.origin) = NET.HOST(third_party.domain) +), + +base AS ( + SELECT + req_origin, + req_category, + IF(STRPOS(respOtherHeaders, "strict-transport-security") > 0, 1, 0) AS hsts_header, + IF(STRPOS(respOtherHeaders, "x-content-type-options") > 0, 1, 0) AS x_content_type_options_header, + IF(STRPOS(respOtherHeaders, "x-frame-options") > 0, 1, 0) AS x_frame_options_header, + IF(STRPOS(respOtherHeaders, "x-xss-protection") > 0, 1, 0) AS x_xss_protection_header + FROM headers +) + +SELECT + req_category, + COUNT(0) AS total_requests, + SUM(hsts_header) / COUNT(0) AS pct_hsts_header_requests, + SUM(x_content_type_options_header) / COUNT(0) AS pct_x_content_type_options_header_requests, + SUM(x_frame_options_header) / COUNT(0) AS pct_x_frame_options_header_requests, + SUM(x_xss_protection_header) / COUNT(0) AS pct_x_xss_protection_header_requests +FROM + base +GROUP BY + req_category diff --git a/sql/2021/third-parties/percent_of_websites_with_third_party.sql b/sql/2021/third-parties/percent_of_websites_with_third_party.sql new file mode 100644 index 00000000000..7ede2f491ab --- /dev/null +++ b/sql/2021/third-parties/percent_of_websites_with_third_party.sql @@ -0,0 +1,31 @@ +#standardSQL +# Percent of websites with third parties + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + req_host AS host + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +third_party AS ( + SELECT + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +) + +SELECT + client, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) / COUNT(DISTINCT page) AS 
pct_pages_with_third_party +FROM + requests +LEFT JOIN third_party +ON NET.HOST(requests.host) = NET.HOST(third_party.domain) +GROUP BY + client diff --git a/sql/2021/third-parties/tao_by_third_party.sql b/sql/2021/third-parties/tao_by_third_party.sql new file mode 100644 index 00000000000..552eb804b48 --- /dev/null +++ b/sql/2021/third-parties/tao_by_third_party.sql @@ -0,0 +1,82 @@ +#standardSQL +# Percent of third-party requests with "Timing-Allow-Origin" headers +# Header reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Timing-Allow-Origin + +CREATE TEMP FUNCTION get_tao(headers STRING) +RETURNS STRING LANGUAGE js AS ''' + try { + const regex = /timing-allow-origin = (\\*|(http.*?,? )+)/gm; + output = regex.exec(headers)[1]+", "; + output = output.replace(/, , $/, ", "); + return output; + } catch (e) { + return false; + } +'''; + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid, + RTRIM(urlShort, '/') AS origin, + respOtherHeaders + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid, + RTRIM(urlShort, '/') AS origin + FROM + `httparchive.summary_pages.2021_07_01_*` +), + +third_party AS ( + SELECT + category, + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +headers AS ( + SELECT + requests.client AS client, + requests.origin AS req_origin, + pages.origin AS page_origin, + get_tao(LOWER(respOtherHeaders)) AS timing_allow_origin, + third_party.category AS req_category + FROM requests + LEFT JOIN pages + USING (client, pageid) + INNER JOIN third_party + ON NET.HOST(requests.origin) = NET.HOST(third_party.domain) +), + +base AS ( + SELECT + client, + req_origin, + page_origin, + timing_allow_origin, + req_category, + IF( + page_origin = req_origin OR + timing_allow_origin = "*, " OR + STRPOS(timing_allow_origin, CONCAT(page_origin, ", ")) > 0, + 1, 0) AS timing_allowed + FROM headers +) + +SELECT + client, + COUNT(0) AS 
total_requests, + SUM(timing_allowed) / COUNT(0) AS pct_timing_allowed_requests +FROM + base +GROUP BY + client diff --git a/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql b/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql new file mode 100644 index 00000000000..e837776768c --- /dev/null +++ b/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql @@ -0,0 +1,76 @@ +#standardSQL +# Top 100 third parties by median response body size, time + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + req_host AS host, + respBodySize AS body_size, + time + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain, + category + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +base AS ( + SELECT + client, + category, + canonicalDomain, + APPROX_QUANTILES(body_size, 1000)[OFFSET(500)] / 1024 AS median_body_size_kb, + APPROX_QUANTILES(time, 1000)[OFFSET(500)] / 1000 AS median_time_s + FROM + requests + INNER JOIN + third_party + ON + NET.HOST(requests.host) = NET.HOST(third_party.domain) + GROUP BY + client, + category, + canonicalDomain +) + +SELECT + ranking, + client, + category, + canonicalDomain, + metric, + rank +FROM ( + SELECT + 'median_body_size_kb' AS ranking, + client, + category, + canonicalDomain, + median_body_size_kb AS metric, + DENSE_RANK() OVER (PARTITION BY client ORDER BY median_body_size_kb DESC) AS rank + FROM base + UNION ALL ( + SELECT + 'median_time_s' AS ranking, + client, + category, + canonicalDomain, + median_time_s AS metric, + DENSE_RANK() OVER (PARTITION BY client ORDER BY median_time_s DESC) AS rank + FROM base + ) +) +WHERE + rank <= 100 +ORDER BY + ranking, + client, + metric DESC diff --git a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql new file mode 100644 index 
00000000000..fca1fe9333e --- /dev/null +++ b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql @@ -0,0 +1,60 @@ +#standardSQL +# Top 100 third parties by number of websites + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + req_host AS host + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +third_party AS ( + SELECT + domain, + canonicalDomain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +base AS ( + SELECT + client, + canonicalDomain, + COUNT(DISTINCT page) AS total_pages, + COUNT(DISTINCT page) / COUNT(0) OVER () AS pct_pages + FROM + requests + LEFT JOIN + third_party + ON + NET.HOST(requests.host) = NET.HOST(third_party.domain) + WHERE + canonicalDomain IS NOT NULL + GROUP BY + client, + canonicalDomain +) + + +SELECT + canonicalDomain, + total_pages, + pct_pages +FROM ( + SELECT + client, + canonicalDomain, + total_pages, + pct_pages, + DENSE_RANK() OVER (PARTITION BY client ORDER BY total_pages DESC) AS rank + FROM + base +) +WHERE + rank <= 100 +ORDER BY + total_pages DESC diff --git a/sql/2021/third-parties/zero_third_party_websites.sql b/sql/2021/third-parties/zero_third_party_websites.sql new file mode 100644 index 00000000000..f4d061d1dbc --- /dev/null +++ b/sql/2021/third-parties/zero_third_party_websites.sql @@ -0,0 +1,67 @@ +#standardSQL +# Websites with no third party requests + +# Provides incorrect information in some cases, e.g. 
pageid = 140607555 + +WITH requests AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid AS page, + crawlid, + req_host AS host + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid, + crawlid, + wptid, + reqTotal, + url + FROM + `httparchive.summary_pages.2021_07_01_*` +), + +base AS ( + SELECT + requests.client, + LOGICAL_AND(NET.HOST(host) = NET.HOST(url)) AS zero_third_party, + url, + requests.crawlid AS requests_crawl, + pages.crawlid AS pages_crawl, + wptid, + reqTotal + FROM + requests + JOIN + pages + ON + requests.page = pages.pageid AND + requests.client = pages.client + GROUP BY + client, + url, + requests_crawl, + pages_crawl, + wptid, + reqTotal + HAVING + zero_third_party = TRUE + +) + +SELECT + client, + url, + requests_crawl, + pages_crawl, + wptid, + reqTotal +FROM + base +ORDER BY + reqTotal DESC +LIMIT 1000 From 300703ae7cada5f2fd8882d3977c2e58da8f7f28 Mon Sep 17 00:00:00 2001 From: Barry Date: Sun, 3 Oct 2021 23:59:20 +0100 Subject: [PATCH 02/21] Add Markdown file back --- src/content/en/2021/third-parties.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 src/content/en/2021/third-parties.md diff --git a/src/content/en/2021/third-parties.md b/src/content/en/2021/third-parties.md new file mode 100644 index 00000000000..4eb1dff3c5e --- /dev/null +++ b/src/content/en/2021/third-parties.md @@ -0,0 +1,20 @@ +--- +#See https://github.com/HTTPArchive/almanac.httparchive.org/wiki/Authors'-Guide#metadata-to-add-at-the-top-of-your-chapters +title: Third Parties +description: TODO +authors: [] +reviewers: [] +analysts: [] +editors: [] +translators: [] +results: https://docs.google.com/spreadsheets/d/1tf4RMF8SYr6he9tbqt61yuFJ_QK-F-i7XPxaPkpKSDI/ +featured_quote: TODO +featured_stat_1: TODO +featured_stat_label_1: TODO +featured_stat_2: TODO +featured_stat_label_2: TODO +featured_stat_3: TODO +featured_stat_label_3: TODO +--- + +## TODO From 
ce31c2410de0d83e3eb8a3045f3b45e124302734 Mon Sep 17 00:00:00 2001 From: Barry Date: Mon, 11 Oct 2021 09:48:54 +0100 Subject: [PATCH 03/21] Add client --- .../distribution_of_size_and_time_by_third_parties.sql | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql b/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql index e17c8be4667..624832893de 100644 --- a/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql +++ b/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql @@ -36,6 +36,7 @@ base AS ( ) SELECT + client, category, percentile, APPROX_QUANTILES(body_size, 1000)[OFFSET(percentile * 10)] AS body_size, @@ -44,8 +45,10 @@ FROM base, UNNEST([10, 25, 50, 75, 90]) AS percentile GROUP BY + client, category, percentile ORDER BY + client, category, percentile From db1fd1ae66ff6a88823af81705643bd3316b2aa1 Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 13 Oct 2021 00:28:52 +0100 Subject: [PATCH 04/21] Fix queries --- ...rcent_of_third_parties_by_content_type.sql | 30 +++------ ...d_parties_by_median_body_size_and_time.sql | 8 +-- ...00_third_parties_by_number_of_websites.sql | 65 +++++++++---------- 3 files changed, 46 insertions(+), 57 deletions(-) diff --git a/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql b/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql index 213437d5edd..c4afe3cee91 100644 --- a/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql +++ b/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql @@ -17,35 +17,25 @@ third_party AS ( `httparchive.almanac.third_parties` WHERE date = '2021-07-01' -), - -base AS ( - SELECT - client, - contentType, - COUNT(0) OVER (PARTITION BY client) AS total_requests - FROM - requests - LEFT JOIN - third_party - ON - NET.HOST(requests.host) = NET.HOST(third_party.domain) - WHERE - domain IS NOT NULL ) SELECT client, 
contentType, - total_requests, COUNT(0) AS requests, - COUNT(0) / total_requests AS pct_requests + SUM(COUNT(0)) OVER (PARTITION BY client) AS total_requests, + COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY client) AS pct_requests FROM - base + requests +LEFT JOIN + third_party +ON + NET.HOST(requests.host) = NET.HOST(third_party.domain) +WHERE + domain IS NOT NULL GROUP BY client, - contentType, - total_requests + contentType ORDER BY client, contentType diff --git a/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql b/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql index e837776768c..76d36029c71 100644 --- a/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql +++ b/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql @@ -47,7 +47,7 @@ SELECT category, canonicalDomain, metric, - rank + sorted_order FROM ( SELECT 'median_body_size_kb' AS ranking, @@ -55,7 +55,7 @@ FROM ( category, canonicalDomain, median_body_size_kb AS metric, - DENSE_RANK() OVER (PARTITION BY client ORDER BY median_body_size_kb DESC) AS rank + DENSE_RANK() OVER (PARTITION BY client ORDER BY median_body_size_kb DESC) AS sorted_order FROM base UNION ALL ( SELECT @@ -64,12 +64,12 @@ FROM ( category, canonicalDomain, median_time_s AS metric, - DENSE_RANK() OVER (PARTITION BY client ORDER BY median_time_s DESC) AS rank + DENSE_RANK() OVER (PARTITION BY client ORDER BY median_time_s DESC) AS sorted_order FROM base ) ) WHERE - rank <= 100 + sorted_order <= 100 ORDER BY ranking, client, diff --git a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql index fca1fe9333e..3c48cbb3187 100644 --- a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql +++ b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql @@ -10,6 +10,15 @@ WITH requests AS ( 
`httparchive.summary_requests.2021_07_01_*` ), +totals AS ( + SELECT + _TABLE_SUFFIX AS client, + COUNT(DISTINCT pageid) as total_pages + FROM + `httparchive.summary_requests.2021_07_01_*` + GROUP BY _TABLE_SUFFIX +), + third_party AS ( SELECT domain, @@ -18,43 +27,33 @@ third_party AS ( `httparchive.almanac.third_parties` WHERE date = '2021-07-01' -), - -base AS ( - SELECT - client, - canonicalDomain, - COUNT(DISTINCT page) AS total_pages, - COUNT(DISTINCT page) / COUNT(0) OVER () AS pct_pages - FROM - requests - LEFT JOIN - third_party - ON - NET.HOST(requests.host) = NET.HOST(third_party.domain) - WHERE - canonicalDomain IS NOT NULL - GROUP BY - client, - canonicalDomain ) - SELECT + client, canonicalDomain, + COUNT(DISTINCT page) AS pages, total_pages, - pct_pages -FROM ( - SELECT - client, - canonicalDomain, - total_pages, - pct_pages, - DENSE_RANK() OVER (PARTITION BY client ORDER BY total_pages DESC) AS rank - FROM - base -) + COUNT(DISTINCT page) / total_pages AS pct_pages, + DENSE_RANK() OVER (PARTITION BY client ORDER BY COUNT(DISTINCT page) DESC) AS sorted_order +FROM + requests +LEFT JOIN + third_party +ON + NET.HOST(requests.host) = NET.HOST(third_party.domain) +JOIN + totals +USING (client) WHERE - rank <= 100 + canonicalDomain IS NOT NULL +GROUP BY + client, + total_pages, + canonicalDomain +QUALIFY + sorted_order <= 100 ORDER BY - total_pages DESC + pct_pages DESC, + client + From 148b26d3b320535a1b8e618bc05ddd05b43acb6b Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 13 Oct 2021 00:36:20 +0100 Subject: [PATCH 05/21] Linting fixes --- .../top100_third_parties_by_number_of_websites.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql index 3c48cbb3187..2fd5928b13d 100644 --- a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql +++ 
b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql @@ -13,7 +13,7 @@ WITH requests AS ( totals AS ( SELECT _TABLE_SUFFIX AS client, - COUNT(DISTINCT pageid) as total_pages + COUNT(DISTINCT pageid) AS total_pages FROM `httparchive.summary_requests.2021_07_01_*` GROUP BY _TABLE_SUFFIX From 8b64d954577728d1e853e00b36b68ba0435fd2ac Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 13 Oct 2021 01:10:31 +0100 Subject: [PATCH 06/21] More query fixes --- ...distribution_of_3XX_response_body_size.sql | 2 +- ...tion_of_size_and_time_by_third_parties.sql | 2 +- .../zero_third_party_websites.sql | 67 ------------------- 3 files changed, 2 insertions(+), 69 deletions(-) delete mode 100644 sql/2021/third-parties/zero_third_party_websites.sql diff --git a/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql b/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql index 67cd1a665e7..5f8ecbf4c47 100644 --- a/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql +++ b/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql @@ -41,7 +41,7 @@ SELECT APPROX_QUANTILES(body_size, 1000)[OFFSET(percentile * 10)] AS approx_redirect_body_size FROM base, - UNNEST([10, 25, 50, 75, 90]) AS percentile + UNNEST(GENERATE_ARRAY(1, 100)) AS percentile WHERE redirected = 1 GROUP BY diff --git a/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql b/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql index 624832893de..869f8bfab09 100644 --- a/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql +++ b/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql @@ -43,7 +43,7 @@ SELECT APPROX_QUANTILES(time, 1000)[OFFSET(percentile * 10)] AS time FROM base, - UNNEST([10, 25, 50, 75, 90]) AS percentile + UNNEST(GENERATE_ARRAY(1, 100)) AS percentile GROUP BY client, category, diff --git a/sql/2021/third-parties/zero_third_party_websites.sql 
b/sql/2021/third-parties/zero_third_party_websites.sql deleted file mode 100644 index f4d061d1dbc..00000000000 --- a/sql/2021/third-parties/zero_third_party_websites.sql +++ /dev/null @@ -1,67 +0,0 @@ -#standardSQL -# Websites with no third party requests - -# Provides incorrect information in some cases, e.g. pageid = 140607555 - -WITH requests AS ( - SELECT - _TABLE_SUFFIX AS client, - pageid AS page, - crawlid, - req_host AS host - FROM - `httparchive.summary_requests.2021_07_01_*` -), - -pages AS ( - SELECT - _TABLE_SUFFIX AS client, - pageid, - crawlid, - wptid, - reqTotal, - url - FROM - `httparchive.summary_pages.2021_07_01_*` -), - -base AS ( - SELECT - requests.client, - LOGICAL_AND(NET.HOST(host) = NET.HOST(url)) AS zero_third_party, - url, - requests.crawlid AS requests_crawl, - pages.crawlid AS pages_crawl, - wptid, - reqTotal - FROM - requests - JOIN - pages - ON - requests.page = pages.pageid AND - requests.client = pages.client - GROUP BY - client, - url, - requests_crawl, - pages_crawl, - wptid, - reqTotal - HAVING - zero_third_party = TRUE - -) - -SELECT - client, - url, - requests_crawl, - pages_crawl, - wptid, - reqTotal -FROM - base -ORDER BY - reqTotal DESC -LIMIT 1000 From 41bc062bf7b9621e4f0dd64a6106a881ea8fb20d Mon Sep 17 00:00:00 2001 From: Barry Date: Wed, 13 Oct 2021 01:21:48 +0100 Subject: [PATCH 07/21] Add ranking query --- ...f_websites_with_third_party_by_ranking.sql | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql diff --git a/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql b/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql new file mode 100644 index 00000000000..8b609c66ec6 --- /dev/null +++ b/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql @@ -0,0 +1,52 @@ +#standardSQL +# Percent of websites with third parties by ranking + +WITH requests AS ( + SELECT 
+ _TABLE_SUFFIX AS client, + pageid, + req_host AS host + FROM + `httparchive.summary_requests.2021_07_01_*` +), + +third_party AS ( + SELECT + domain + FROM + `httparchive.almanac.third_parties` + WHERE + date = '2021-07-01' +), + +pages AS ( + SELECT + _TABLE_SUFFIX AS client, + pageid, + rank + FROM + `httparchive.summary_pages.2021_07_01_*` +) + +SELECT + client, + rank_grouping, + COUNT(DISTINCT pageid) AS total_pages, + COUNT(DISTINCT IF(domain IS NOT NULL, pageid, NULL)) / COUNT(DISTINCT pageid) AS pct_pages_with_third_party +FROM + pages +JOIN + requests +USING (client, pageid) +LEFT JOIN + third_party +ON NET.HOST(requests.host) = NET.HOST(third_party.domain), + UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping +WHERE + rank <= rank_grouping +GROUP BY + client, + rank_grouping +ORDER BY + client, + rank_grouping From e074b8da36c4c65ac581fa51a031a7985ab4f347 Mon Sep 17 00:00:00 2001 From: Barry Date: Sun, 17 Oct 2021 19:56:56 +0100 Subject: [PATCH 08/21] Add final queries --- .../third_parties_blocking_main_thread.sql | 42 +++++++++++ ...rties_blocking_main_thread_percentiles.sql | 45 +++++++++++ .../third_parties_blocking_rendering.sql | 75 +++++++++++++++++++ ...parties_blocking_rendering_percentiles.sql | 75 +++++++++++++++++++ 4 files changed, 237 insertions(+) create mode 100644 sql/2021/third-parties/third_parties_blocking_main_thread.sql create mode 100644 sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql create mode 100644 sql/2021/third-parties/third_parties_blocking_rendering.sql create mode 100644 sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql diff --git a/sql/2021/third-parties/third_parties_blocking_main_thread.sql b/sql/2021/third-parties/third_parties_blocking_main_thread.sql new file mode 100644 index 00000000000..3c964cad9b8 --- /dev/null +++ b/sql/2021/third-parties/third_parties_blocking_main_thread.sql @@ -0,0 +1,42 @@ +#standardSQL +# Third-Party domains which block the 
main thread +# Based heavily on research by Houssein Djirdeh: +# https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +SELECT + domain, + category, + COUNT(0) AS total_pages, + COUNTIF(blocking > 0) AS blocking_pages, + COUNTIF(blocking > 0) / COUNT(0) AS blocking_pages_pct, + APPROX_QUANTILES(transfer_size_kib, 1000)[OFFSET(500)] AS p50_transfer_size_kib, + APPROX_QUANTILES(blocking_time, 1000)[OFFSET(500)] AS p50_blocking_time +FROM ( + SELECT + JSON_VALUE(third_party_items, "$.entity.url") AS domain, + page, + JSON_VALUE(third_party_items, "$.entity.text") AS category, + COUNTIF(JSON_VALUE(third_party_items, "$.blockingTime") != "0") AS blocking, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, "$.blockingTime") AS FLOAT64)) AS blocking_time, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, "$.transferSize") AS FLOAT64) / 1024) AS transfer_size_kib + FROM + ( + SELECT + url AS page, + report + FROM + `httparchive.lighthouse.2021_07_01_mobile` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.third-party-summary.details.items')) AS third_party_items + GROUP BY + domain, + page, + category + ) +GROUP BY + domain, + category +ORDER BY + total_pages DESC +LIMIT + 200 diff --git a/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql b/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql new file mode 100644 index 00000000000..855aae953bd --- /dev/null +++ b/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql @@ -0,0 +1,45 @@ +#standardSQL +# Third-Party domains which block the main thread by percentile +# Based heavily on research by Houssein Djirdeh: +# https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +SELECT + domain, + category, + COUNT(0) AS total_pages, + percentile, + APPROX_QUANTILES(transfer_size_kib, 
1000)[OFFSET(percentile * 10)] AS transfer_size_kib, + APPROX_QUANTILES(blocking_time, 1000)[OFFSET(percentile * 10)] AS blocking_time +FROM ( + SELECT + JSON_VALUE(third_party_items, "$.entity.url") AS domain, + page, + JSON_VALUE(third_party_items, "$.entity.text") AS category, + COUNTIF(JSON_VALUE(third_party_items, "$.blockingTime") != "0") AS blocking, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, "$.blockingTime") AS FLOAT64)) AS blocking_time, + SUM(SAFE_CAST(JSON_VALUE(third_party_items, "$.transferSize") AS FLOAT64) / 1024) AS transfer_size_kib + FROM + ( + SELECT + url AS page, + report + FROM + `httparchive.lighthouse.2021_07_01_mobile` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.third-party-summary.details.items')) AS third_party_items + GROUP BY + domain, + page, + category + ), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + domain, + category, + percentile +ORDER BY + total_pages DESC, + category, + percentile +LIMIT + 200 diff --git a/sql/2021/third-parties/third_parties_blocking_rendering.sql b/sql/2021/third-parties/third_parties_blocking_rendering.sql new file mode 100644 index 00000000000..dc70d5297e1 --- /dev/null +++ b/sql/2021/third-parties/third_parties_blocking_rendering.sql @@ -0,0 +1,75 @@ +#standardSQL +# Third-Party domains which render block paint +# Based heavily on research by Houssein Djirdeh: +# https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +WITH +total_third_party_usage AS ( + SELECT + canonicalDomain, + category, + COUNT(DISTINCT sp.url) AS total_pages + FROM + `httparchive.summary_pages.2021_07_01_mobile` sp + INNER JOIN + `httparchive.summary_requests.2021_07_01_mobile` sr + USING (pageid) + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(sr.url) = NET.HOST(domain) AND + date = '2021-07-01' + GROUP BY + canonicalDomain, + category +) + +SELECT + canonicalDomain, + category, + total_pages, + 
COUNT(0) AS blocking_pages, + total_pages - COUNT(0) AS non_blocking_pages, + COUNT(0) / total_pages AS blocking_pages_pct, + 1 - (COUNT(0) / total_pages) AS non_blocking_pages_pct, + APPROX_QUANTILES(wasted_ms, 1000)[OFFSET(500)] AS p50_wastedMs, + APPROX_QUANTILES(total_bytes_kib, 1000)[OFFSET(500)] AS p50_total_bytes_kib +FROM ( + SELECT + canonicalDomain, + domain, + page, + category, + SUM(SAFE_CAST(JSON_VALUE(renderBlockingItems, "$.wastedMs") AS FLOAT64)) AS wasted_ms, + SUM(SAFE_CAST(JSON_VALUE(renderBlockingItems, "$.totalBytes") AS FLOAT64) / 1024) AS total_bytes_kib + FROM + ( + SELECT + url AS page, + report + FROM + `httparchive.lighthouse.2021_07_01_mobile` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.render-blocking-resources.details.items')) AS renderBlockingItems + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(JSON_VALUE(renderBlockingItems, "$.url")) = domain + GROUP BY + canonicalDomain, + domain, + page, + category + ) +INNER JOIN + total_third_party_usage +USING (canonicalDomain, category) +GROUP BY + canonicalDomain, + category, + total_pages +ORDER BY + total_pages DESC, + category +LIMIT + 200 diff --git a/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql b/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql new file mode 100644 index 00000000000..6503f72efba --- /dev/null +++ b/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql @@ -0,0 +1,75 @@ +#standardSQL +# Third-Party domains which render block paint by percentile +# Based heavily on research by Houssein Djirdeh: +# https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 + +WITH +total_third_party_usage AS ( + SELECT + canonicalDomain, + category, + COUNT(DISTINCT sp.url) AS total_pages + FROM + `httparchive.summary_pages.2021_07_01_mobile` sp + INNER JOIN + `httparchive.summary_requests.2021_07_01_mobile` sr + USING 
(pageid) + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(sr.url) = NET.HOST(domain) AND + date = '2021-07-01' + GROUP BY + canonicalDomain, + category +) + +SELECT + canonicalDomain, + category, + total_pages, + COUNT(0) AS blocking_pages, + percentile, + APPROX_QUANTILES(wasted_ms, 1000)[OFFSET(percentile * 10)] AS wasted_ms, + APPROX_QUANTILES(total_bytes_kib, 1000)[OFFSET(percentile * 10)] AS total_bytes_kib +FROM ( + SELECT + canonicalDomain, + page, + category, + SUM(SAFE_CAST(JSON_VALUE(render_blocking_items, "$.wastedMs") AS FLOAT64)) AS wasted_ms, + SUM(SAFE_CAST(JSON_VALUE(render_blocking_items, "$.totalBytes") AS FLOAT64) / 1024) AS total_bytes_kib + FROM + ( + SELECT + url AS page, + report + FROM + `httparchive.lighthouse.2021_07_01_mobile` + ), + UNNEST(JSON_QUERY_ARRAY(report, '$.audits.render-blocking-resources.details.items')) AS render_blocking_items + INNER JOIN + `httparchive.almanac.third_parties` + ON + NET.HOST(JSON_VALUE(render_blocking_items, "$.url")) = domain AND + date = '2021-07-01' + GROUP BY + canonicalDomain, + page, + category + ) +INNER JOIN + total_third_party_usage +USING (canonicalDomain, category), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile +GROUP BY + canonicalDomain, + category, + total_pages, + percentile +ORDER BY + total_pages DESC, + category, + percentile +LIMIT + 200 From f82c8e79e9b484c7705359cc01b127e3f8adde49 Mon Sep 17 00:00:00 2001 From: Barry Date: Sun, 17 Oct 2021 20:02:29 +0100 Subject: [PATCH 09/21] Add documentation --- sql/2021/third-parties/README.md | 18 ++++++++++++++++++ .../third_parties_blocking_main_thread.sql | 4 ++++ ...arties_blocking_main_thread_percentiles.sql | 4 ++++ .../third_parties_blocking_rendering.sql | 6 ++++++ ..._parties_blocking_rendering_percentiles.sql | 6 ++++++ 5 files changed, 38 insertions(+) diff --git a/sql/2021/third-parties/README.md b/sql/2021/third-parties/README.md index e69de29bb2d..148a8f53629 100644 --- a/sql/2021/third-parties/README.md +++ 
b/sql/2021/third-parties/README.md @@ -0,0 +1,18 @@ +# 2021 Third-Party queries + + + +Resources: + +- [Chapter issue](https://github.com/HTTPArchive/almanac.httparchive.org/issues/2145) +- [Planning doc](https://docs.google.com/document/d/164HhV76iVT2qVfFY2kzyr44eIcGAw3fWqNTbuoRfRvE/edit?usp=sharing) +- [Results sheet](https://docs.google.com/spreadsheets/d/1tf4RMF8SYr6he9tbqt61yuFJ_QK-F-i7XPxaPkpKSDI/edit?usp=sharing/) +- [2019 chapter](https://almanac.httparchive.org/en/2019/third-parties) +- [2020 chapter](https://almanac.httparchive.org/en/2020/third-parties) diff --git a/sql/2021/third-parties/third_parties_blocking_main_thread.sql b/sql/2021/third-parties/third_parties_blocking_main_thread.sql index 3c964cad9b8..d8b52c3de09 100644 --- a/sql/2021/third-parties/third_parties_blocking_main_thread.sql +++ b/sql/2021/third-parties/third_parties_blocking_main_thread.sql @@ -1,5 +1,9 @@ #standardSQL # Third-Party domains which block the main thread +# +# As Lighthouse measures all impact there is no need to do a separate total +# Lighthouse also gives a useable category. So no need to use almanac.third-parties table +# # Based heavily on research by Houssein Djirdeh: # https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 diff --git a/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql b/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql index 855aae953bd..be70995bb47 100644 --- a/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql +++ b/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql @@ -1,5 +1,9 @@ #standardSQL # Third-Party domains which block the main thread by percentile +# +# As Lighthouse measures all impact there is no need to do a separate total +# Lighthouse also gives a useable category. 
So no need to use almanac.third-parties table +# # Based heavily on research by Houssein Djirdeh: # https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 diff --git a/sql/2021/third-parties/third_parties_blocking_rendering.sql b/sql/2021/third-parties/third_parties_blocking_rendering.sql index dc70d5297e1..bda8560f8d4 100644 --- a/sql/2021/third-parties/third_parties_blocking_rendering.sql +++ b/sql/2021/third-parties/third_parties_blocking_rendering.sql @@ -1,5 +1,11 @@ #standardSQL # Third-Party domains which render block paint +# +# Unlike the blocking main thread queries, Lighthouse only contains details if the +# third-party is render blocking (i.e. wastedMs/total_bytes are never 0) +# And also there are no categories given to each third-party +# So we join to the usual almanac.third_parties table to get those totals and categories +# # Based heavily on research by Houssein Djirdeh: # https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 diff --git a/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql b/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql index 6503f72efba..45df8a20500 100644 --- a/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql +++ b/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql @@ -1,5 +1,11 @@ #standardSQL # Third-Party domains which render block paint by percentile +# +# Unlike the blocking main thread queries, Lighthouse only contains details if the +# third-party is render blocking (i.e.
wastedMs/total_bytes are never 0) +# And also there are no categories given to each third-party +# So we join to the usual almanac.third_parties table to get those totals and categories +# # Based heavily on research by Houssein Djirdeh: # https://docs.google.com/spreadsheets/d/1Td-4qFjuBzxp8af_if5iBC0Lkqm_OROb7_2OcbxrU_g/edit?resourcekey=0-ZCfve5cngWxF0-sv5pLRzg#gid=1628564987 From 3bd6c5eae73d3723178cf656bbd4ccc3f6eb2f03 Mon Sep 17 00:00:00 2001 From: Barry Date: Sun, 17 Oct 2021 20:12:14 +0100 Subject: [PATCH 10/21] Linting --- sql/2021/third-parties/third_parties_blocking_main_thread.sql | 3 +-- .../third_parties_blocking_main_thread_percentiles.sql | 3 +-- sql/2021/third-parties/third_parties_blocking_rendering.sql | 3 +-- .../third_parties_blocking_rendering_percentiles.sql | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/sql/2021/third-parties/third_parties_blocking_main_thread.sql b/sql/2021/third-parties/third_parties_blocking_main_thread.sql index d8b52c3de09..b57feb4b1d1 100644 --- a/sql/2021/third-parties/third_parties_blocking_main_thread.sql +++ b/sql/2021/third-parties/third_parties_blocking_main_thread.sql @@ -42,5 +42,4 @@ GROUP BY category ORDER BY total_pages DESC -LIMIT - 200 +LIMIT 200 diff --git a/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql b/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql index be70995bb47..e7d7117fc84 100644 --- a/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql +++ b/sql/2021/third-parties/third_parties_blocking_main_thread_percentiles.sql @@ -45,5 +45,4 @@ ORDER BY total_pages DESC, category, percentile -LIMIT - 200 +LIMIT 200 diff --git a/sql/2021/third-parties/third_parties_blocking_rendering.sql b/sql/2021/third-parties/third_parties_blocking_rendering.sql index bda8560f8d4..2f120954b69 100644 --- a/sql/2021/third-parties/third_parties_blocking_rendering.sql +++ 
b/sql/2021/third-parties/third_parties_blocking_rendering.sql @@ -77,5 +77,4 @@ GROUP BY ORDER BY total_pages DESC, category -LIMIT - 200 +LIMIT 200 diff --git a/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql b/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql index 45df8a20500..7861dc3722d 100644 --- a/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql +++ b/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql @@ -77,5 +77,4 @@ ORDER BY total_pages DESC, category, percentile -LIMIT - 200 +LIMIT 200 From 09dfff4a29bf1123ee0a02ec6f5e51c551250ef2 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Thu, 21 Oct 2021 00:22:37 +0100 Subject: [PATCH 11/21] Update sql/2021/third-parties/percent_of_third_party_cache.sql Co-authored-by: Rick Viscomi --- sql/2021/third-parties/percent_of_third_party_cache.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/2021/third-parties/percent_of_third_party_cache.sql b/sql/2021/third-parties/percent_of_third_party_cache.sql index 69ebdcffcb9..cfc6c6c9ef6 100644 --- a/sql/2021/third-parties/percent_of_third_party_cache.sql +++ b/sql/2021/third-parties/percent_of_third_party_cache.sql @@ -52,6 +52,7 @@ base AS ( SELECT client, type, + SUM(cached) AS cached_requests, COUNT(0) AS total_requests, SUM(cached) / COUNT(0) AS pct_cached_requests FROM From acb9b3d69616a998bc137c642dd17ccc5f37715b Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Thu, 21 Oct 2021 00:24:57 +0100 Subject: [PATCH 12/21] Update sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql Co-authored-by: Rick Viscomi --- .../percent_of_third_party_loaded_before_DOMContentLoaded.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql b/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql index 951d8ef65ed..a99f97a4076 100644 --- 
a/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql +++ b/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql @@ -46,6 +46,7 @@ base AS ( SELECT client, request_category, + SUM(early_request) AS early_requests, COUNT(0) AS total_requests, SUM(early_request) / COUNT(0) AS pct_early_requests FROM From 7d6e2990f6560e51b71820f1d19d3d2b8b091214 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 21 Oct 2021 00:31:43 +0100 Subject: [PATCH 13/21] Add client --- ...requests_and_bytes_by_category_and_content_type.sql | 9 ++++++++- .../percent_of_third_party_with_security_headers.sql | 10 +++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql b/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql index 271b0e27c4a..53437bdbaee 100644 --- a/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql +++ b/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql @@ -3,12 +3,13 @@ WITH requests AS ( SELECT + _TABLE_SUFFIX as client, pageid AS page, req_host AS host, type AS contentType, respBodySize AS body_size FROM - `httparchive.summary_requests.2021_07_01_mobile` + `httparchive.summary_requests.2021_07_01_*` ), third_party AS ( @@ -23,6 +24,7 @@ third_party AS ( base AS ( SELECT + client, page, category, contentType, @@ -37,6 +39,7 @@ base AS ( requests_per_page_and_category AS ( SELECT + client, page, category, contentType, @@ -47,12 +50,14 @@ requests_per_page_and_category AS ( FROM base GROUP BY + client, page, category, contentType ) SELECT + client, category, contentType, SUM(requests) AS requests, @@ -62,8 +67,10 @@ SELECT SAFE_DIVIDE(SUM(body_size), SUM(total_page_size)) AS avg_pct_body_size_per_page FROM requests_per_page_and_category GROUP BY + client, category, contentType 
ORDER BY + client, category, contentType diff --git a/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql b/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql index 4685a2d27df..004127ba779 100644 --- a/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql +++ b/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql @@ -3,10 +3,11 @@ WITH requests AS ( SELECT + _TABLE_SUFFIX as client, RTRIM(urlShort, '/') AS origin, respOtherHeaders FROM - `httparchive.summary_requests.2021_07_01_mobile` + `httparchive.summary_requests.2021_07_01_*` ), third_party AS ( @@ -21,6 +22,7 @@ third_party AS ( headers AS ( SELECT + client, requests.origin AS req_origin, LOWER(respOtherHeaders) AS respOtherHeaders, third_party.category AS req_category @@ -31,6 +33,7 @@ headers AS ( base AS ( SELECT + client, req_origin, req_category, IF(STRPOS(respOtherHeaders, "strict-transport-security") > 0, 1, 0) AS hsts_header, @@ -41,6 +44,7 @@ base AS ( ) SELECT + client, req_category, COUNT(0) AS total_requests, SUM(hsts_header) / COUNT(0) AS pct_hsts_header_requests, @@ -50,4 +54,8 @@ SELECT FROM base GROUP BY + client, + req_category +ORDER BY + client, req_category From bc1e7d203003d6fe5adb239ad84ad67875da9451 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Thu, 21 Oct 2021 00:32:23 +0100 Subject: [PATCH 14/21] Update sql/2021/third-parties/percent_of_websites_with_third_party.sql Co-authored-by: Rick Viscomi --- sql/2021/third-parties/percent_of_websites_with_third_party.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/2021/third-parties/percent_of_websites_with_third_party.sql b/sql/2021/third-parties/percent_of_websites_with_third_party.sql index 7ede2f491ab..fd4915e285b 100644 --- a/sql/2021/third-parties/percent_of_websites_with_third_party.sql +++ b/sql/2021/third-parties/percent_of_websites_with_third_party.sql @@ -21,6 +21,7 @@ third_party AS ( SELECT client, + COUNT(DISTINCT IF(domain IS NOT NULL, 
page, NULL)) AS pages_with_third_party, COUNT(DISTINCT page) AS total_pages, COUNT(DISTINCT IF(domain IS NOT NULL, page, NULL)) / COUNT(DISTINCT page) AS pct_pages_with_third_party FROM From c34685adba5b95983172878cc7aac7512c933fbe Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Thu, 21 Oct 2021 00:32:37 +0100 Subject: [PATCH 15/21] Update sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql Co-authored-by: Rick Viscomi --- .../percent_of_websites_with_third_party_by_ranking.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql b/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql index 8b609c66ec6..49d807f1577 100644 --- a/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql +++ b/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql @@ -31,6 +31,7 @@ pages AS ( SELECT client, rank_grouping, + COUNT(DISTINCT IF(domain IS NOT NULL, pageid, NULL)) AS pages_with_third_party, COUNT(DISTINCT pageid) AS total_pages, COUNT(DISTINCT IF(domain IS NOT NULL, pageid, NULL)) / COUNT(DISTINCT pageid) AS pct_pages_with_third_party FROM From 3849c51cd62a2c3c3a221a75c5a002aee534ecaa Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Thu, 21 Oct 2021 00:32:59 +0100 Subject: [PATCH 16/21] Update sql/2021/third-parties/tao_by_third_party.sql Co-authored-by: Rick Viscomi --- sql/2021/third-parties/tao_by_third_party.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/2021/third-parties/tao_by_third_party.sql b/sql/2021/third-parties/tao_by_third_party.sql index 552eb804b48..24b5e4e4bf5 100644 --- a/sql/2021/third-parties/tao_by_third_party.sql +++ b/sql/2021/third-parties/tao_by_third_party.sql @@ -74,6 +74,7 @@ base AS ( SELECT client, + SUM(timing_allowed) AS timing_allowed_requests, COUNT(0) AS total_requests, SUM(timing_allowed) / COUNT(0) AS pct_timing_allowed_requests FROM From 
f5dfb63ab44513a3661be5863a2e746c01865d42 Mon Sep 17 00:00:00 2001 From: Barry Pollard Date: Thu, 21 Oct 2021 00:33:37 +0100 Subject: [PATCH 17/21] Update sql/2021/third-parties/tao_by_third_party.sql Co-authored-by: Rick Viscomi --- sql/2021/third-parties/tao_by_third_party.sql | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sql/2021/third-parties/tao_by_third_party.sql b/sql/2021/third-parties/tao_by_third_party.sql index 24b5e4e4bf5..f5da6e7e29a 100644 --- a/sql/2021/third-parties/tao_by_third_party.sql +++ b/sql/2021/third-parties/tao_by_third_party.sql @@ -60,10 +60,6 @@ headers AS ( base AS ( SELECT client, - req_origin, - page_origin, - timing_allow_origin, - req_category, IF( page_origin = req_origin OR timing_allow_origin = "*, " OR From dc932117c7d4934b73e881c76b0b22309510316a Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 21 Oct 2021 00:38:14 +0100 Subject: [PATCH 18/21] Linting fixes --- ...rd_party_requests_and_bytes_by_category_and_content_type.sql | 2 +- .../percent_of_third_party_with_security_headers.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql b/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql index 53437bdbaee..bd48b01047c 100644 --- a/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql +++ b/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql @@ -3,7 +3,7 @@ WITH requests AS ( SELECT - _TABLE_SUFFIX as client, + _TABLE_SUFFIX AS client, pageid AS page, req_host AS host, type AS contentType, diff --git a/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql b/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql index 004127ba779..284f501f2b3 100644 --- a/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql +++ 
b/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql @@ -3,7 +3,7 @@ WITH requests AS ( SELECT - _TABLE_SUFFIX as client, + _TABLE_SUFFIX AS client, RTRIM(urlShort, '/') AS origin, respOtherHeaders FROM From fc1f9bf1cb4c21b242b286e5f40bad433c722fd7 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 21 Oct 2021 01:42:50 +0100 Subject: [PATCH 19/21] Change NET.HOST and removing hosting category --- .../distribution_of_3XX_response_body_size.sql | 7 ++++--- ...istribution_of_size_and_time_by_third_parties.sql | 7 ++++--- ...bution_of_third_parties_by_number_of_websites.sql | 7 ++++--- ...bution_of_websites_by number_of_third_parties.sql | 7 ++++--- .../percent_of_third_parties_by_content_type.sql | 7 ++++--- .../third-parties/percent_of_third_party_cache.sql | 7 ++++--- ...of_third_party_loaded_before_DOMContentLoaded.sql | 3 ++- ...quests_and_bytes_by_category_and_content_type.sql | 7 ++++--- .../percent_of_third_party_with_security_headers.sql | 3 ++- .../percent_of_websites_with_third_party.sql | 7 ++++--- ...rcent_of_websites_with_third_party_by_ranking.sql | 7 ++++--- sql/2021/third-parties/tao_by_third_party.sql | 3 ++- .../third_parties_blocking_rendering.sql | 3 ++- .../third_parties_blocking_rendering_percentiles.sql | 3 ++- ...00_third_parties_by_median_body_size_and_time.sql | 7 ++++--- .../top100_third_parties_by_number_of_websites.sql | 12 ++++++------ 16 files changed, 56 insertions(+), 41 deletions(-) diff --git a/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql b/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql index 5f8ecbf4c47..c8d715bf9ec 100644 --- a/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql +++ b/sql/2021/third-parties/distribution_of_3XX_response_body_size.sql @@ -5,7 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, - req_host AS host, + url, status, respBodySize AS body_size FROM @@ -18,7 +18,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE 
- date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), base AS ( @@ -32,7 +33,7 @@ base AS ( LEFT JOIN third_party ON - NET.HOST(requests.host) = NET.HOST(third_party.domain) + NET.HOST(requests.url) = NET.HOST(third_party.domain) ) SELECT diff --git a/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql b/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql index 869f8bfab09..ae6d4379908 100644 --- a/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql +++ b/sql/2021/third-parties/distribution_of_size_and_time_by_third_parties.sql @@ -4,7 +4,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, - req_host AS host, + url, respBodySize AS body_size, time FROM @@ -18,7 +18,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), base AS ( @@ -32,7 +33,7 @@ base AS ( INNER JOIN third_party ON - NET.HOST(requests.host) = NET.HOST(third_party.domain) + NET.HOST(requests.url) = NET.HOST(third_party.domain) ) SELECT diff --git a/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql b/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql index 34788925ccf..f5c1171e2d0 100644 --- a/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql +++ b/sql/2021/third-parties/distribution_of_third_parties_by_number_of_websites.sql @@ -5,7 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, pageid AS page, - req_host AS host + url FROM `httparchive.summary_requests.2021_07_01_*` ), @@ -17,7 +17,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), base AS ( @@ -30,7 +31,7 @@ base AS ( LEFT JOIN third_party ON - NET.HOST(requests.host) = NET.HOST(third_party.domain) + NET.HOST(requests.url) = NET.HOST(third_party.domain) WHERE canonicalDomain IS NOT 
NULL GROUP BY diff --git a/sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql b/sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql index 944d823df66..a96662e38b7 100644 --- a/sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql +++ b/sql/2021/third-parties/distribution_of_websites_by number_of_third_parties.sql @@ -5,7 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, pageid AS page, - req_host AS host + url FROM `httparchive.summary_requests.2021_07_01_*` ), @@ -16,7 +16,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), base AS ( @@ -29,7 +30,7 @@ base AS ( LEFT JOIN third_party ON - NET.HOST(requests.host) = NET.HOST(third_party.domain) + NET.HOST(requests.url) = NET.HOST(third_party.domain) GROUP BY client, page diff --git a/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql b/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql index c4afe3cee91..46835de28fa 100644 --- a/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql +++ b/sql/2021/third-parties/percent_of_third_parties_by_content_type.sql @@ -4,7 +4,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, - req_host AS host, + url, type AS contentType FROM `httparchive.summary_requests.2021_07_01_*` @@ -16,7 +16,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ) SELECT @@ -30,7 +31,7 @@ FROM LEFT JOIN third_party ON - NET.HOST(requests.host) = NET.HOST(third_party.domain) + NET.HOST(requests.url) = NET.HOST(third_party.domain) WHERE domain IS NOT NULL GROUP BY diff --git a/sql/2021/third-parties/percent_of_third_party_cache.sql b/sql/2021/third-parties/percent_of_third_party_cache.sql index cfc6c6c9ef6..7294c55342c 100644 --- a/sql/2021/third-parties/percent_of_third_party_cache.sql +++ 
b/sql/2021/third-parties/percent_of_third_party_cache.sql @@ -10,7 +10,7 @@ WITH requests AS ( respOtherHeaders, reqOtherHeaders, type, - req_host AS host + url FROM `httparchive.summary_requests.2021_07_01_*` ), @@ -21,7 +21,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), base AS ( @@ -44,7 +45,7 @@ base AS ( LEFT JOIN third_party ON - NET.HOST(requests.host) = NET.HOST(third_party.domain) + NET.HOST(requests.url) = NET.HOST(third_party.domain) WHERE domain IS NOT NULL ) diff --git a/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql b/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql index a99f97a4076..638dcb6924b 100644 --- a/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql +++ b/sql/2021/third-parties/percent_of_third_party_loaded_before_DOMContentLoaded.sql @@ -27,7 +27,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), base AS ( diff --git a/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql b/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql index bd48b01047c..365092b19e9 100644 --- a/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql +++ b/sql/2021/third-parties/percent_of_third_party_requests_and_bytes_by_category_and_content_type.sql @@ -5,7 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, pageid AS page, - req_host AS host, + url, type AS contentType, respBodySize AS body_size FROM @@ -19,7 +19,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), base AS ( @@ -34,7 +35,7 @@ base AS ( INNER JOIN third_party ON - NET.HOST(requests.host) = 
NET.HOST(third_party.domain) + NET.HOST(requests.url) = NET.HOST(third_party.domain) ), requests_per_page_and_category AS ( diff --git a/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql b/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql index 284f501f2b3..de683c34b2d 100644 --- a/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql +++ b/sql/2021/third-parties/percent_of_third_party_with_security_headers.sql @@ -17,7 +17,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), headers AS ( diff --git a/sql/2021/third-parties/percent_of_websites_with_third_party.sql b/sql/2021/third-parties/percent_of_websites_with_third_party.sql index fd4915e285b..d8826ef6403 100644 --- a/sql/2021/third-parties/percent_of_websites_with_third_party.sql +++ b/sql/2021/third-parties/percent_of_websites_with_third_party.sql @@ -5,7 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, pageid AS page, - req_host AS host + url FROM `httparchive.summary_requests.2021_07_01_*` ), @@ -16,7 +16,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ) SELECT @@ -27,6 +28,6 @@ SELECT FROM requests LEFT JOIN third_party -ON NET.HOST(requests.host) = NET.HOST(third_party.domain) +ON NET.HOST(requests.url) = NET.HOST(third_party.domain) GROUP BY client diff --git a/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql b/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql index 49d807f1577..6210a3ad4a7 100644 --- a/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql +++ b/sql/2021/third-parties/percent_of_websites_with_third_party_by_ranking.sql @@ -5,7 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, pageid, - req_host AS host + url FROM `httparchive.summary_requests.2021_07_01_*` 
), @@ -16,7 +16,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), pages AS ( @@ -41,7 +42,7 @@ JOIN USING (client, pageid) LEFT JOIN third_party -ON NET.HOST(requests.host) = NET.HOST(third_party.domain), +ON NET.HOST(requests.url) = NET.HOST(third_party.domain), UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping WHERE rank <= rank_grouping diff --git a/sql/2021/third-parties/tao_by_third_party.sql b/sql/2021/third-parties/tao_by_third_party.sql index f5da6e7e29a..3106ac25fde 100644 --- a/sql/2021/third-parties/tao_by_third_party.sql +++ b/sql/2021/third-parties/tao_by_third_party.sql @@ -40,7 +40,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), headers AS ( diff --git a/sql/2021/third-parties/third_parties_blocking_rendering.sql b/sql/2021/third-parties/third_parties_blocking_rendering.sql index 2f120954b69..c9f5f832595 100644 --- a/sql/2021/third-parties/third_parties_blocking_rendering.sql +++ b/sql/2021/third-parties/third_parties_blocking_rendering.sql @@ -24,7 +24,8 @@ total_third_party_usage AS ( `httparchive.almanac.third_parties` ON NET.HOST(sr.url) = NET.HOST(domain) AND - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' GROUP BY canonicalDomain, category diff --git a/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql b/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql index 7861dc3722d..a103488b368 100644 --- a/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql +++ b/sql/2021/third-parties/third_parties_blocking_rendering_percentiles.sql @@ -24,7 +24,8 @@ total_third_party_usage AS ( `httparchive.almanac.third_parties` ON NET.HOST(sr.url) = NET.HOST(domain) AND - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' GROUP BY canonicalDomain, category diff 
--git a/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql b/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql index 76d36029c71..809a6ac33d4 100644 --- a/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql +++ b/sql/2021/third-parties/top100_third_parties_by_median_body_size_and_time.sql @@ -4,7 +4,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, - req_host AS host, + url, respBodySize AS body_size, time FROM @@ -19,7 +19,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ), base AS ( @@ -34,7 +35,7 @@ base AS ( INNER JOIN third_party ON - NET.HOST(requests.host) = NET.HOST(third_party.domain) + NET.HOST(requests.url) = NET.HOST(third_party.domain) GROUP BY client, category, diff --git a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql index 2fd5928b13d..d9fd6fa5951 100644 --- a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql +++ b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql @@ -5,7 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, pageid AS page, - req_host AS host + NET.HOST(url) AS host FROM `httparchive.summary_requests.2021_07_01_*` ), @@ -13,9 +13,9 @@ WITH requests AS ( totals AS ( SELECT _TABLE_SUFFIX AS client, - COUNT(DISTINCT pageid) AS total_pages + COUNT(0) AS total_pages FROM - `httparchive.summary_requests.2021_07_01_*` + `httparchive.summary_pages.2021_07_01_*` GROUP BY _TABLE_SUFFIX ), @@ -26,7 +26,8 @@ third_party AS ( FROM `httparchive.almanac.third_parties` WHERE - date = '2021-07-01' + date = '2021-07-01' AND + category != 'hosting' ) SELECT @@ -41,7 +42,7 @@ FROM LEFT JOIN third_party ON - NET.HOST(requests.host) = NET.HOST(third_party.domain) + NET.HOST(requests.url) = NET.HOST(third_party.domain) JOIN totals USING 
(client) @@ -56,4 +57,3 @@ QUALIFY ORDER BY pct_pages DESC, client - From 1eac213303e1274b083ce84f8c2e5a68d6451495 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 21 Oct 2021 01:45:15 +0100 Subject: [PATCH 20/21] Linting fixes --- .../top100_third_parties_by_number_of_websites.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql index d9fd6fa5951..cded78bb877 100644 --- a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql +++ b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql @@ -5,7 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, pageid AS page, - NET.HOST(url) AS host + NET.HOST(url) AS host FROM `httparchive.summary_requests.2021_07_01_*` ), From 8d58136d713b89170fbc645b2f17264217667c39 Mon Sep 17 00:00:00 2001 From: Barry Date: Thu, 21 Oct 2021 01:48:02 +0100 Subject: [PATCH 21/21] Forgot to hit save --- .../top100_third_parties_by_number_of_websites.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql index cded78bb877..7d4a1e3db5d 100644 --- a/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql +++ b/sql/2021/third-parties/top100_third_parties_by_number_of_websites.sql @@ -5,7 +5,7 @@ WITH requests AS ( SELECT _TABLE_SUFFIX AS client, pageid AS page, - NET.HOST(url) AS host + url FROM `httparchive.summary_requests.2021_07_01_*` ),