Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CSS 2020 queries #1281

Merged
merged 17 commits into from
Sep 30, 2020
21 changes: 21 additions & 0 deletions sql/2020/01_CSS/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# CSS Queries

## Query size warning

The 2020 data in the [`parsed_css`](https://console.cloud.google.com/bigquery?p=httparchive&d=almanac&t=parsed_css&page=table) table is 9.7 TB, so a single query over the full table costs approximately $50.

When prototyping queries, use the [`parsed_css_1k`](https://console.cloud.google.com/bigquery?p=httparchive&d=almanac&t=parsed_css_1k&page=table) table instead; it contains only 1,000 rows, which makes testing fast and cheap. Remember to switch back to the full table before saving the results for analysis.

## [CSS utils](../../lib/css-utils.js)

- **Source**: https://github.com/LeaVerou/rework-utils/tree/master/src
- **Playground**: https://projects.verou.me/rework-utils/
- **Docs**: https://projects.verou.me/rework-utils/docs/

This file provides JS utility functions to be used by the queries that depend on the `parsed_css` table.

## Related resources

- [Tracking issue](https://github.com/HTTPArchive/almanac.httparchive.org/issues/898)
- [Draft doc](https://docs.google.com/document/d/1Cy9acip1ZQScoQEeds5-6l1FFFBJTJr4SheZiQxbj-Q/edit?usp=sharing)
- [Results sheet](https://docs.google.com/spreadsheets/d/1sMWXWjMujqfAREYxNbG_t1fOJKYCA6ASLwtz4pBQVTw/edit?usp=sharing)
36 changes: 36 additions & 0 deletions sql/2020/01_CSS/box_sizing.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#standardSQL
# 1. Distribution of the number of occurrences of box-sizing:border-box per page.
# 2. Percent of pages with that style.

# countBorderBoxDeclarations(css)
#   css: JSON-serialized stylesheet AST; the code reads `stylesheet.rules` from it.
#   Returns whatever the library helper `countDeclarations` reports for
#   declarations whose property matches `box-sizing` (optionally prefixed with
#   -o-, -moz-, -webkit- or -ms-) and whose value is `border-box`.
#   Returns NULL when the JSON cannot be parsed or the helper throws.
#
# `countDeclarations` comes from the shared CSS utilities bundle, the same
# library file the other parsed_css queries load.
CREATE TEMPORARY FUNCTION countBorderBoxDeclarations(css STRING) RETURNS NUMERIC LANGUAGE js AS '''
try {
  const ast = JSON.parse(css);
  const borderBox = {
    properties: /^(-(o|moz|webkit|ms)-)?box-sizing$/,
    values: 'border-box'
  };
  return countDeclarations(ast.stylesheet.rules, borderBox);
} catch (e) {
  // Unparseable or unexpected input: report NULL rather than failing the query.
  return null;
}
'''
OPTIONS (library="gs://httparchive/lib/css-utils.js");

# For each client and percentile, reports:
#   pages                 - pages with at least one border-box declaration
#   total                 - all pages seen for the client
#   pct_pages             - pages / total
#   declarations_per_page - the given percentile of per-page declaration counts
#
# A page can contribute several rows to parsed_css (the table carries both a
# page and a stylesheet url column), so the counts are summed per page first;
# otherwise the quantiles would describe individual stylesheets, not pages.
SELECT
  percentile,
  client,
  COUNT(DISTINCT IF(declarations > 0, page, NULL)) AS pages,
  COUNT(DISTINCT page) AS total,
  COUNT(DISTINCT IF(declarations > 0, page, NULL)) / COUNT(DISTINCT page) AS pct_pages,
  APPROX_QUANTILES(declarations, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS declarations_per_page
FROM (
  SELECT
    client,
    page,
    # SUM skips NULLs, so stylesheets that failed to parse do not poison the
    # page total; a page whose stylesheets all failed stays NULL.
    SUM(countBorderBoxDeclarations(css)) AS declarations
  FROM
    `httparchive.almanac.parsed_css`
  WHERE
    date = '2020-08-01'
  GROUP BY
    client,
    page),
  # APPROX_QUANTILES is asked for 1000 buckets, hence OFFSET(percentile * 10).
  UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
  percentile,
  client
ORDER BY
  percentile,
  client
19 changes: 19 additions & 0 deletions sql/2020/01_CSS/css_in_js.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#standardSQL
# CSS in JS. WIP

# getCssInJS(payload)
#   payload: JSON string of the page record; its `_css` field is itself a
#            JSON string that is parsed a second time.
#   Returns the `css_in_js` array from the parsed `_css` object when it is a
#   non-empty array, otherwise ['NONE'].
#   If anything throws (bad JSON, missing field), returns a single-element
#   array holding the error message so failures are visible in the output.
CREATE TEMPORARY FUNCTION getCssInJS(payload STRING)
RETURNS ARRAY<STRING> LANGUAGE js AS '''
try {
  var $ = JSON.parse(payload);
  var css = JSON.parse($._css);

  return Array.isArray(css.css_in_js) && css.css_in_js.length > 0 ? css.css_in_js : ['NONE'];
} catch (e) {
  return [e.message];
}
''';

# One output row per (page url, detected CSS-in-JS entry) from the 10k sample.
SELECT
  pages.url,
  cssInJs
FROM
  `httparchive.sample_data.pages_mobile_10k` AS pages,
  UNNEST(getCssInJS(pages.payload)) AS cssInJs
44 changes: 44 additions & 0 deletions sql/2020/01_CSS/custom_property_names.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#standardSQL
# Most popular custom property names as a percent of pages.

# getCustomPropertyNames(payload)
#   payload: JSON string of the page record; its `_css-variables` field is
#            itself a JSON string that is parsed a second time.
#   Returns the keys of the parsed object's `summary` member, or an empty
#   array when parsing fails or `summary` is missing.
CREATE TEMPORARY FUNCTION getCustomPropertyNames(payload STRING) RETURNS ARRAY<STRING> LANGUAGE js AS '''
try {
  const page = JSON.parse(payload);
  const cssVariables = JSON.parse(page['_css-variables']);
  const summary = cssVariables.summary;
  return Object.keys(summary);
} catch (err) {
  return [];
}
''';

# For each client, the 1000 (client, name) pairs with the highest share of
# pages on which the custom property name appears.
WITH totals AS (
  # Distinct pages per client; the denominator for pct.
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(DISTINCT url) AS total
  FROM
    `httparchive.pages.2020_08_01_*`
  GROUP BY
    _TABLE_SUFFIX
),

page_names AS (
  # One row per page with the array of names extracted from its payload.
  SELECT
    _TABLE_SUFFIX AS client,
    url,
    getCustomPropertyNames(payload) AS names
  FROM
    `httparchive.pages.2020_08_01_*`
)

SELECT
  client,
  name,
  COUNT(DISTINCT url) AS freq,
  total,
  COUNT(DISTINCT url) / total AS pct
FROM
  page_names
INNER JOIN
  totals
USING
  (client)
CROSS JOIN
  UNNEST(names) AS name
GROUP BY
  client,
  name,
  total
ORDER BY
  pct DESC
LIMIT
  1000
44 changes: 44 additions & 0 deletions sql/2020/01_CSS/custom_property_values.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#standardSQL
# Most popular custom property values as a percent of pages.

# getCustomPropertyValues(payload)
#   payload: JSON string of the page record; its `_css-variables` field is
#            itself a JSON string that is parsed a second time.
#   Returns the values of the parsed object's `summary` member, or an empty
#   array when parsing fails or `summary` is missing.
#   NOTE(review): the result is declared ARRAY<STRING>, so this presumably
#   expects `summary` values to be strings - confirm against the custom metric.
CREATE TEMPORARY FUNCTION getCustomPropertyValues(payload STRING) RETURNS ARRAY<STRING> LANGUAGE js AS '''
try {
  const page = JSON.parse(payload);
  const cssVariables = JSON.parse(page['_css-variables']);
  const summary = cssVariables.summary;
  return Object.values(summary);
} catch (err) {
  return [];
}
''';

# For each client, the 1000 (client, value) pairs with the highest share of
# pages on which the custom property value appears.
WITH totals AS (
  # Distinct pages per client; the denominator for pct.
  SELECT
    _TABLE_SUFFIX AS client,
    COUNT(DISTINCT url) AS total
  FROM
    `httparchive.pages.2020_08_01_*`
  GROUP BY
    _TABLE_SUFFIX
),

page_values AS (
  # One row per page with the array of values extracted from its payload.
  SELECT
    _TABLE_SUFFIX AS client,
    url,
    getCustomPropertyValues(payload) AS property_values
  FROM
    `httparchive.pages.2020_08_01_*`
)

SELECT
  client,
  value,
  COUNT(DISTINCT url) AS freq,
  total,
  COUNT(DISTINCT url) / total AS pct
FROM
  page_values
INNER JOIN
  totals
USING
  (client)
CROSS JOIN
  UNNEST(property_values) AS value
GROUP BY
  client,
  value,
  total
ORDER BY
  pct DESC
LIMIT
  1000
141 changes: 141 additions & 0 deletions sql/2020/01_CSS/selector_parts_freq_per_page.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#standardSQL

# getSelectorParts(css)
#   css: JSON-serialized stylesheet AST.
#   Walks every selector in the AST (walkSelectors), parses it with parsel, and
#   tallies node names by node type for the five types listed in `ret`.
#   Returns a STRUCT with one ARRAY<STRUCT<name, value>> per type, where
#   `value` is the tally for `name` within this stylesheet. Entries whose
#   tally is NaN are dropped.
#   On any exception, returns a single placeholder entry in `class` carrying
#   the error message with value 0; the other fields are left unset.
#
# walkSelectors, parsel, incrementByKey and sortObject come from the library
# file named in OPTIONS below.
CREATE TEMPORARY FUNCTION getSelectorParts(css STRING)
RETURNS STRUCT<
  class ARRAY<STRUCT<name STRING, value INT64>>,
  id ARRAY<STRUCT<name STRING, value INT64>>,
  attribute ARRAY<STRUCT<name STRING, value INT64>>,
  pseudo_class ARRAY<STRUCT<name STRING, value INT64>>,
  pseudo_element ARRAY<STRUCT<name STRING, value INT64>>
> LANGUAGE js AS '''
try {
  function compute(ast) {
    // Keys match the node.type strings checked below (hyphenated).
    let ret = {
      class: {},
      id: {},
      attribute: {},
      "pseudo-class": {},
      "pseudo-element": {}
    };

    walkSelectors(ast, selector => {
      let sast = parsel.parse(selector, {list: false});

      parsel.walk(sast, node => {
        if (node.type in ret) {
          incrementByKey(ret[node.type], node.name);
        }
      }, {subtree: true});
    });

    for (let type in ret) {
      ret[type] = sortObject(ret[type]);
    }

    return ret;
  }

  // Turns {name: count, ...} into [{name, value}, ...], skipping NaN counts.
  function unzip(obj) {
    return Object.entries(obj).filter(([name, value]) => {
      return !isNaN(value);
    }).map(([name, value]) => ({name, value}));
  }

  const ast = JSON.parse(css);
  let parts = compute(ast);
  // SQL field names use underscores, so the hyphenated keys are remapped here.
  return {
    class: unzip(parts.class),
    id: unzip(parts.id),
    attribute: unzip(parts.attribute),
    pseudo_class: unzip(parts['pseudo-class']),
    pseudo_element: unzip(parts['pseudo-element'])
  }
} catch (e) {
  // `name` is declared STRING, so return the message text, not the Error object.
  return {class: [{name: String(e.message || e), value: 0}]};
}
'''
OPTIONS (library="gs://httparchive/lib/css-utils.js");

# Argmax-by-string helpers, after
# https://www.stevenmoseley.com/blog/tech/high-performance-sql-correlated-scalar-aggregate-reduction-queries
#
# encode(comparator, data): left-pads `comparator` with '0' to 11 characters
# and puts it in front of `data`, so a string MAX() over encoded values orders
# by the padded comparator first. Returns NULL if either argument is NULL.
CREATE TEMPORARY FUNCTION encode(comparator STRING, data STRING) RETURNS STRING AS (
  LPAD(comparator, 11, '0') || data
);

# decode(value): strips the 11-character comparator prefix added by encode()
# and returns the remaining data portion.
CREATE TEMPORARY FUNCTION decode(value STRING) RETURNS STRING AS (
  SUBSTR(value, 12)
);

# For each client, reports the single most frequent name (and its total count)
# for each selector part type: class, id, attribute, pseudo-class and
# pseudo-element.
#
# Each part type is reduced to one row per client BEFORE the joins. Joining
# the raw per-stylesheet rows on client alone would multiply every class row
# by every id row by every attribute row, and so on, for the same answer.
WITH selector_parts AS (
  SELECT
    client,
    getSelectorParts(css) AS parts
  FROM
    `httparchive.almanac.parsed_css`
  WHERE
    date = '2020-08-01' AND
    # Limit the size of the CSS to avoid OOM crashes.
    LENGTH(css) < 0.1 * 1024 * 1024
),

# Each top_* CTE sums the counts per (client, name), then picks the name with
# the largest total via MAX over encode(freq, name); on ties the
# lexicographically greatest name wins.
top_class AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), part_name))) AS class_name,
    MAX(freq) AS class_freq
  FROM (
    SELECT
      client,
      class.name AS part_name,
      SUM(class.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.class) AS class
    GROUP BY
      client,
      part_name)
  GROUP BY
    client
),

top_id AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), part_name))) AS id_name,
    MAX(freq) AS id_freq
  FROM (
    SELECT
      client,
      id.name AS part_name,
      SUM(id.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.id) AS id
    GROUP BY
      client,
      part_name)
  GROUP BY
    client
),

top_attribute AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), part_name))) AS attribute_name,
    MAX(freq) AS attribute_freq
  FROM (
    SELECT
      client,
      attribute.name AS part_name,
      SUM(attribute.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.attribute) AS attribute
    GROUP BY
      client,
      part_name)
  GROUP BY
    client
),

top_pseudo_class AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), part_name))) AS pseudo_class_name,
    MAX(freq) AS pseudo_class_freq
  FROM (
    SELECT
      client,
      pseudo_class.name AS part_name,
      SUM(pseudo_class.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.pseudo_class) AS pseudo_class
    GROUP BY
      client,
      part_name)
  GROUP BY
    client
),

top_pseudo_element AS (
  SELECT
    client,
    decode(MAX(encode(CAST(freq AS STRING), part_name))) AS pseudo_element_name,
    MAX(freq) AS pseudo_element_freq
  FROM (
    SELECT
      client,
      pseudo_element.name AS part_name,
      SUM(pseudo_element.value) AS freq
    FROM
      selector_parts,
      UNNEST(parts.pseudo_element) AS pseudo_element
    GROUP BY
      client,
      part_name)
  GROUP BY
    client
)

# Inner joins: a client appears only if it has at least one entry for every
# part type.
SELECT
  client,
  class_name,
  class_freq,
  id_name,
  id_freq,
  attribute_name,
  attribute_freq,
  pseudo_class_name,
  pseudo_class_freq,
  pseudo_element_name,
  pseudo_element_freq
FROM
  top_class
INNER JOIN
  top_id
USING
  (client)
INNER JOIN
  top_attribute
USING
  (client)
INNER JOIN
  top_pseudo_class
USING
  (client)
INNER JOIN
  top_pseudo_element
USING
  (client)
Loading