Skip to content

Commit

Permalink
Third party 2021 query amends (#2404)
Browse files Browse the repository at this point in the history
* Add limit to 50

* Linting fixes

* More ranking queries

* Fix blocking query and linting

* Update sql/2021/third-parties/third_parties_blocking_main_thread.sql

* Update sql/2021/third-parties/third_parties_blocking_main_thread.sql

Co-authored-by: Rick Viscomi <[email protected]>
  • Loading branch information
tunetheweb and rviscomi authored Oct 25, 2021
1 parent e80bb31 commit 33ad00f
Show file tree
Hide file tree
Showing 20 changed files with 371 additions and 54 deletions.
15 changes: 13 additions & 2 deletions sql/2021/third-parties/distribution_of_3XX_response_body_size.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
WITH requests AS (
SELECT
_TABLE_SUFFIX AS client,
pageid AS page,
url,
status,
respBodySize AS body_size
Expand All @@ -14,12 +15,22 @@ WITH requests AS (

third_party AS (
SELECT
domain
domain,
category,
COUNT(DISTINCT page) AS page_usage
FROM
`httparchive.almanac.third_parties`
`httparchive.almanac.third_parties` tp
JOIN
requests r
ON NET.HOST(r.url) = NET.HOST(tp.domain)
WHERE
date = '2021-07-01' AND
category != 'hosting'
GROUP BY
domain,
category
HAVING
page_usage >= 50
),

base AS (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
WITH requests AS (
SELECT
_TABLE_SUFFIX AS client,
pageid AS page,
url,
respBodySize AS body_size,
time
Expand All @@ -13,13 +14,22 @@ WITH requests AS (

third_party AS (
SELECT
domain,
category,
domain
COUNT(DISTINCT page) AS page_usage
FROM
`httparchive.almanac.third_parties`
`httparchive.almanac.third_parties` tp
JOIN
requests r
ON NET.HOST(r.url) = NET.HOST(tp.domain)
WHERE
date = '2021-07-01' AND
category != 'hosting'
GROUP BY
domain,
category
HAVING
page_usage >= 50
),

base AS (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,23 @@ WITH requests AS (
third_party AS (
SELECT
domain,
canonicalDomain
canonicalDomain,
category,
COUNT(DISTINCT page) AS page_usage
FROM
`httparchive.almanac.third_parties`
`httparchive.almanac.third_parties` tp
JOIN
requests r
ON NET.HOST(r.url) = NET.HOST(tp.domain)
WHERE
date = '2021-07-01' AND
category != 'hosting'
GROUP BY
domain,
canonicalDomain,
category
HAVING
page_usage >= 50
),

base AS (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,22 @@ WITH requests AS (

third_party AS (
SELECT
domain
domain,
category,
COUNT(DISTINCT page) AS page_usage
FROM
`httparchive.almanac.third_parties`
`httparchive.almanac.third_parties` tp
JOIN
requests r
ON NET.HOST(r.url) = NET.HOST(tp.domain)
WHERE
date = '2021-07-01' AND
category != 'hosting'
GROUP BY
domain,
category
HAVING
page_usage >= 50
),

base AS (
Expand Down
77 changes: 77 additions & 0 deletions sql/2021/third-parties/number_of_third_parties_by_rank.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#standardSQL
# Number of third-parties per websites by rank
#
# Output: one row per (client, rank_grouping) holding the median (p50) number
# of distinct third-party domains requested by a page.

# All requests of the 2021-07-01 crawl; the wildcard table suffix is exposed as `client`.
WITH requests AS (
  SELECT
    _TABLE_SUFFIX AS client,
    pageid AS page,
    url
  FROM
    `httparchive.summary_requests.2021_07_01_*`
),

# One row per crawled page with its rank.
pages AS (
  SELECT
    _TABLE_SUFFIX AS client,
    pageid AS page,
    rank
  FROM
    `httparchive.summary_pages.2021_07_01_*`
),

# Third-party domains (excluding the 'hosting' category) whose host was
# requested by at least 50 distinct pages in this crawl.
third_party AS (
  SELECT
    tp.domain,
    tp.category,
    COUNT(DISTINCT r.page) AS page_usage
  FROM
    `httparchive.almanac.third_parties` AS tp
  INNER JOIN
    requests AS r
  ON
    NET.HOST(r.url) = NET.HOST(tp.domain)
  WHERE
    tp.date = '2021-07-01' AND
    tp.category != 'hosting'
  GROUP BY
    tp.domain,
    tp.category
  HAVING
    page_usage >= 50
),

# Per-page third-party count. The LEFT JOIN keeps requests that match no
# third party (their `domain` is NULL and is ignored by COUNT), so a page
# with no third-party requests still contributes a 0.
base AS (
  SELECT
    client,
    page,
    rank,
    # DISTINCT: several requests to the same third-party domain count once,
    # otherwise this would measure third-party *requests* per page.
    COUNT(DISTINCT domain) AS third_parties_per_page
  FROM
    requests
  LEFT JOIN
    third_party
  ON
    NET.HOST(requests.url) = NET.HOST(third_party.domain)
  INNER JOIN
    pages
  USING
    (client, page)
  GROUP BY
    client,
    page,
    rank
)

# Rank groupings are cumulative: a page is counted in every grouping whose
# threshold is greater than or equal to its rank.
SELECT
  client,
  rank_grouping,
  APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page
FROM
  base,
  UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
WHERE
  rank <= rank_grouping
GROUP BY
  client,
  rank_grouping
ORDER BY
  client,
  rank_grouping
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#standardSQL
# Number of third-parties per websites by rank and category
#
# Output: one row per (client, category, rank_grouping) holding the median
# (p50) number of distinct third-party domains of that category requested by
# a page.

# All requests of the 2021-07-01 crawl; the wildcard table suffix is exposed as `client`.
WITH requests AS (
  SELECT
    _TABLE_SUFFIX AS client,
    pageid AS page,
    url
  FROM
    `httparchive.summary_requests.2021_07_01_*`
),

# One row per crawled page with its rank.
pages AS (
  SELECT
    _TABLE_SUFFIX AS client,
    pageid AS page,
    rank
  FROM
    `httparchive.summary_pages.2021_07_01_*`
),

# Third-party domains (excluding the 'hosting' category) whose host was
# requested by at least 50 distinct pages in this crawl.
third_party AS (
  SELECT
    tp.domain,
    tp.category,
    COUNT(DISTINCT r.page) AS page_usage
  FROM
    `httparchive.almanac.third_parties` AS tp
  INNER JOIN
    requests AS r
  ON
    NET.HOST(r.url) = NET.HOST(tp.domain)
  WHERE
    tp.date = '2021-07-01' AND
    tp.category != 'hosting'
  GROUP BY
    tp.domain,
    tp.category
  HAVING
    page_usage >= 50
),

# Per-page, per-category third-party count.
# NOTE: because of the LEFT JOIN, requests that match no third party have a
# NULL `category` and `domain`; they form a NULL-category group whose count
# is 0 (COUNT ignores NULLs). Those rows flow through to the final output.
base AS (
  SELECT
    client,
    category,
    page,
    rank,
    # DISTINCT: several requests to the same third-party domain count once,
    # otherwise this would measure third-party *requests* per page.
    COUNT(DISTINCT domain) AS third_parties_per_page
  FROM
    requests
  LEFT JOIN
    third_party
  ON
    NET.HOST(requests.url) = NET.HOST(third_party.domain)
  INNER JOIN
    pages
  USING
    (client, page)
  GROUP BY
    client,
    category,
    page,
    rank
)

# Rank groupings are cumulative: a page is counted in every grouping whose
# threshold is greater than or equal to its rank.
SELECT
  client,
  category,
  rank_grouping,
  APPROX_QUANTILES(third_parties_per_page, 1000)[OFFSET(500)] AS p50_third_parties_per_page
FROM
  base,
  UNNEST([1000, 10000, 100000, 1000000, 10000000]) AS rank_grouping
WHERE
  rank <= rank_grouping
GROUP BY
  client,
  category,
  rank_grouping
ORDER BY
  client,
  category,
  rank_grouping
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
WITH requests AS (
SELECT
_TABLE_SUFFIX AS client,
pageid AS page,
url,
type AS contentType
FROM
Expand All @@ -12,12 +13,22 @@ WITH requests AS (

third_party AS (
SELECT
domain
domain,
category,
COUNT(DISTINCT page) AS page_usage
FROM
`httparchive.almanac.third_parties`
`httparchive.almanac.third_parties` tp
JOIN
requests r
ON NET.HOST(r.url) = NET.HOST(tp.domain)
WHERE
date = '2021-07-01' AND
category != 'hosting'
GROUP BY
domain,
category
HAVING
page_usage >= 50
)

SELECT
Expand Down
17 changes: 14 additions & 3 deletions sql/2021/third-parties/percent_of_third_party_cache.sql
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,30 @@ WITH requests AS (
respOtherHeaders,
reqOtherHeaders,
type,
url
url,
pageid AS page
FROM
`httparchive.summary_requests.2021_07_01_*`
),

third_party AS (
SELECT
domain
domain,
category,
COUNT(DISTINCT page) AS page_usage
FROM
`httparchive.almanac.third_parties`
`httparchive.almanac.third_parties` tp
JOIN
requests r
ON NET.HOST(r.url) = NET.HOST(tp.domain)
WHERE
date = '2021-07-01' AND
category != 'hosting'
GROUP BY
domain,
category
HAVING
page_usage >= 50
),

base AS (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,22 @@ pages AS (

third_party AS (
SELECT
domain,
category,
domain
COUNT(DISTINCT page) AS page_usage
FROM
`httparchive.almanac.third_parties`
`httparchive.almanac.third_parties` tp
JOIN
requests r
ON NET.HOST(r.url) = NET.HOST(tp.domain)
WHERE
date = '2021-07-01' AND
category != 'hosting'
GROUP BY
domain,
category
HAVING
page_usage >= 50
),

base AS (
Expand Down
Loading

0 comments on commit 33ad00f

Please sign in to comment.