From 1f7ae058ba2fdc4ad1c368f33870c9da387f0359 Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Mon, 7 Sep 2020 21:29:01 -0400 Subject: [PATCH 01/17] touch --- sql/2020/01_CSS/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/2020/01_CSS/README.md b/sql/2020/01_CSS/README.md index e69de29bb2d..ae11e9bd61e 100644 --- a/sql/2020/01_CSS/README.md +++ b/sql/2020/01_CSS/README.md @@ -0,0 +1 @@ +Stylesheets parsed by Rework CSS \ No newline at end of file From eab63dcfe7ab39aad2d46d3d6d85124cd76c9e5c Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Fri, 11 Sep 2020 09:32:14 -0400 Subject: [PATCH 02/17] first CSS query and lib --- sql/2020/01_CSS/box_sizing.js | 36 ++++++++ sql/lib/rework-utils.js | 157 ++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 sql/2020/01_CSS/box_sizing.js create mode 100644 sql/lib/rework-utils.js diff --git a/sql/2020/01_CSS/box_sizing.js b/sql/2020/01_CSS/box_sizing.js new file mode 100644 index 00000000000..9c918384591 --- /dev/null +++ b/sql/2020/01_CSS/box_sizing.js @@ -0,0 +1,36 @@ +#standardSQL +# - Distribution of the number of occurrences of box-sizing:border-box per page. +# - Percent of pages with that style. +CREATE TEMPORARY FUNCTION countBorderBoxDeclarations(css STRING) RETURNS NUMERIC LANGUAGE js AS ''' +try { + const ast = JSON.parse(css); + return countDeclarations(ast.stylesheet.rules, {properties: /^(-(o|moz|webkit|ms)-)?box-sizing$/, values: 'border-box'}); +} catch (e) { + return null; +} +''' +OPTIONS (library="gs://httparchive/lib/rework-utils.js"); + +SELECT + percentile, + client, + COUNT(DISTINCT IF(declarations > 0, page, NULL)) AS pages, + COUNT(DISTINCT page) AS total, + COUNT(DISTINCT IF(declarations > 0, page, NULL)) / COUNT(DISTINCT page) AS pct_pages, + APPROX_QUANTILES(declarations, 1000)[OFFSET(percentile * 10)] AS median_declarations +FROM ( + SELECT + client, + page, + countBorderBoxDeclarations(css) AS declarations + FROM + `httparchive.almanac.parsed_css` + WHERE + date = '2020-08-01'), + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client \ No newline at end of file diff --git a/sql/lib/rework-utils.js b/sql/lib/rework-utils.js new file mode 100644 index 00000000000..4522ec81024 --- /dev/null +++ b/sql/lib/rework-utils.js @@ -0,0 +1,157 @@ +/** + * Test whether a value passes a given test. + * The test could be a string, regexp, function, or array of any of these. + * This is at the core of most walkers. + * @param value + * @param {string|RegExp|Function|Array} [test] + * @return {Boolean} true if no test is provided, or test passes, false otherwise. + */ +function matches(value, test) { + if (!test) { + return true; + } + + if (Array.isArray(test)) { + return test.some(t => matches(value, t)); + } + + let type = typeof test; + + if (type === "string") { + return value === test; + } + else if (type === "function") { + return test(value); + } + else if (test instanceof RegExp) { + return test.test(value); + } + + return false; +} + +/** + * Recursively walk all declarations + * @param {Object|Array} rules - AST, array of rules, or single rule + * @param {Function} callback - Callback to be executed on each declaration. Arguments: (declaration, rule) + * @param {Object} [test] - Conditions that need to be satisfied for a declaration to be visited, all optional + * @param {string|RegExp|Function|Array} test.properties - Test for property names + * @param {string|RegExp|Function|Array} test.values - Test for values + * @param {string|RegExp|Function|Array} test.rules - Test for rules + */ +function walkDeclarations(rules, callback, test) { + if (!rules) { + return; + } + + if (rules.stylesheet) { + // AST passed + rules = rules.stylesheet.rules; + } + else if (!Array.isArray(rules)) { + // Single rule + rules = [rules]; + } + + for (let rule of rules) { + if (!matches(rule, test && test.rules)) { + continue; + } + + // Walk declarations directly in rule + if (rule.declarations) { + for (let declaration of rule.declarations) { + if (matches(declaration.property, test && test.properties) && matches(declaration.value, test && test.values)) { + callback(declaration, rule); + } + } + } + + // Walk declarations of nested rules (e.g. @media, @supports have nested rules) + if (rule.rules) { + walkDeclarations(rule.rules, callback, test); + } + } +} + +/** + * Recursively walk all "normal" rules, i.e. rules with selectors + * @param rules {Object|Array} AST or array of CSS rules + * @param callback {Function} Function to be executed for each matching rule. Rule passed as the only argument. + * @param [test] {Object} + * @param test.rules {string|RegExp|Function|Array} Which rules the callback runs on + * @param test.ancestors {string|RegExp|Function|Array} Which rules the walker descends on + * @return The return value of the callback (which also breaks the loop) or undefined. + */ +function walkRules(rules, callback, test) { + if (!rules) { + return; + } + + if (!Array.isArray(rules)) { + // AST passed + rules = rules.stylesheet.rules; + } + + for (let rule of rules) { + if (matches(rule, test && test.rules)) { + let ret = callback(rule); + + if (ret !== undefined) { + // Break loop and return immediately + return ret; + } + } + + if (matches(rule, test && test.ancestors)) { + if (rule.rules) { + walkRules(rule.rules, callback, test); + } + } + } +} + +/** + * Sort an object literal and return the result as a new object literal + * @param {Object} obj + * @param {Function} [f=x=>x] Optional function to pass arguments through, useful if e.g. we are sorting by a property of an object. + */ +function sortObject(obj, f = x => x) { + return Object.fromEntries(Object.entries(obj).sort((a, b) => f(b[1]) - f(a[1]))); +} + +/** + * Sum all values of an object and return the result + * @param {Object} obj + */ +function sumObject(obj) { + Object.values(obj).reduce((a, c) => a + c, 0); +} + +/** + * Count total declarations that pass a given test. + * @see {@link module:walkDeclarations} for arguments + * @returns {number} Declaration count that pass the provided conditions. + */ +function countDeclarations(rules, test) { + let ret = 0; + + walkDeclarations(rules, declaration => ret++, test); + + return ret; +} + +/** + * Count properties that pass a given test, in rules that pass a given test + * @see {@link module:walkDeclarations} for arguments + * @return {Object} Property names and declaration counts. Use `sumObject(ret)` to get total count. + */ +function countDeclarationsByProperty(rules, test) { + let ret = {}; + + walkDeclarations(rules, declaration => { + ret[declaration.property] = (ret[declaration.property] || 0) + 1; + }, test); + + return sortObject(ret); +} From 00d000fafef972b71be0100067549b0229d007d5 Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Fri, 11 Sep 2020 09:40:33 -0400 Subject: [PATCH 03/17] rename, ignore nulls --- sql/2020/01_CSS/{box_sizing.js => box_sizing.sql} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename sql/2020/01_CSS/{box_sizing.js => box_sizing.sql} (90%) diff --git a/sql/2020/01_CSS/box_sizing.js b/sql/2020/01_CSS/box_sizing.sql similarity index 90% rename from sql/2020/01_CSS/box_sizing.js rename to sql/2020/01_CSS/box_sizing.sql index 9c918384591..6f7dc4b95c9 100644 --- a/sql/2020/01_CSS/box_sizing.js +++ b/sql/2020/01_CSS/box_sizing.sql @@ -17,7 +17,7 @@ SELECT COUNT(DISTINCT IF(declarations > 0, page, NULL)) AS pages, COUNT(DISTINCT page) AS total, COUNT(DISTINCT IF(declarations > 0, page, NULL)) / COUNT(DISTINCT page) AS pct_pages, - APPROX_QUANTILES(declarations, 1000)[OFFSET(percentile * 10)] AS median_declarations + APPROX_QUANTILES(declarations, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS declarations_per_page FROM ( SELECT client, From 8097904dcfeb9e9219debd060336de4ffc4ad54d Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Fri, 11 Sep 2020 10:03:30 -0400 Subject: [PATCH 04/17] gcs_copy --- sql/lib/gcs_copy.sh | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100755 sql/lib/gcs_copy.sh diff --git a/sql/lib/gcs_copy.sh b/sql/lib/gcs_copy.sh new file mode 100755 index 00000000000..14e91195eda --- /dev/null +++ b/sql/lib/gcs_copy.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copy each lib script to Google Cloud Storage (GCS). +# Files are available at gs://httparchive/lib/*.js +# And via HTTPS at https://cdn.httparchive.org/lib/*.js (with caching) + +gsutil cp sql/lib/*.js gs://httparchive/lib \ No newline at end of file From 1cd6abfc3bddadeb6d12c45e826c79971efe82ff Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Fri, 11 Sep 2020 10:13:42 -0400 Subject: [PATCH 05/17] css notices --- sql/2020/01_CSS/README.md | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sql/2020/01_CSS/README.md b/sql/2020/01_CSS/README.md index ae11e9bd61e..ff39746defc 100644 --- a/sql/2020/01_CSS/README.md +++ b/sql/2020/01_CSS/README.md @@ -1 +1,21 @@ -Stylesheets parsed by Rework CSS \ No newline at end of file +# CSS Queries + +## Query size warning + +The 2020 data in the [`parsed_css`](https://console.cloud.google.com/bigquery?p=httparchive&d=almanac&t=parsed_css&page=table) table is 9.7 TB, which is approximately $50 per query. + +When prototyping queries, it's advisable to use the [`parsed_css_1k`](https://console.cloud.google.com/bigquery?p=httparchive&d=almanac&t=parsed_css_1k&page=table) table instead, which only contains 1000 rows for easier testing. Make sure to switch this back to the full table when saving the results for analysis. + +## [Rework utils](../../lib/rework-utils.js) + +Source: https://github.com/LeaVerou/rework-utils/tree/master/src +Playground: https://projects.verou.me/rework-utils/ +Docs: https://projects.verou.me/rework-utils/docs/ + +This file provides JS utility functions to be used by the queries that depend on the `parsed_css` table. + +## Related resources + +- [Tracking issue](https://github.com/HTTPArchive/almanac.httparchive.org/issues/898) +- [Draft doc](https://docs.google.com/document/d/1Cy9acip1ZQScoQEeds5-6l1FFFBJTJr4SheZiQxbj-Q/edit?usp=sharing) +- [Results sheet](https://docs.google.com/spreadsheets/d/1sMWXWjMujqfAREYxNbG_t1fOJKYCA6ASLwtz4pBQVTw/edit?usp=sharing) \ No newline at end of file From ca773fbe05d37c8fdfcacf98c364cdcc5840c962 Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Fri, 11 Sep 2020 14:24:21 -0400 Subject: [PATCH 06/17] formatting --- sql/2020/01_CSS/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/2020/01_CSS/README.md b/sql/2020/01_CSS/README.md index ff39746defc..5dc8b34dc55 100644 --- a/sql/2020/01_CSS/README.md +++ b/sql/2020/01_CSS/README.md @@ -8,9 +8,9 @@ When prototyping queries, it's advisable to use the [`parsed_css_1k`](https://co ## [Rework utils](../../lib/rework-utils.js) -Source: https://github.com/LeaVerou/rework-utils/tree/master/src -Playground: https://projects.verou.me/rework-utils/ -Docs: https://projects.verou.me/rework-utils/docs/ +- **Source**: https://github.com/LeaVerou/rework-utils/tree/master/src +- **Playground**: https://projects.verou.me/rework-utils/ +- **Docs**: https://projects.verou.me/rework-utils/docs/ This file provides JS utility functions to be used by the queries that depend on the `parsed_css` table. From a9ca51be631e455cc4146fb917f4c2ac6b27ba7e Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Mon, 14 Sep 2020 12:29:23 -0400 Subject: [PATCH 07/17] sourcemap preprocessors --- sql/2020/01_CSS/box_sizing.sql | 4 ++-- sql/2020/01_CSS/sourcemap_preprocessors.sql | 23 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 sql/2020/01_CSS/sourcemap_preprocessors.sql diff --git a/sql/2020/01_CSS/box_sizing.sql b/sql/2020/01_CSS/box_sizing.sql index 6f7dc4b95c9..efa66ded62a 100644 --- a/sql/2020/01_CSS/box_sizing.sql +++ b/sql/2020/01_CSS/box_sizing.sql @@ -1,6 +1,6 @@ #standardSQL -# - Distribution of the number of occurrences of box-sizing:border-box per page. -# - Percent of pages with that style. +# 1. Distribution of the number of occurrences of box-sizing:border-box per page. +# 2. Percent of pages with that style. CREATE TEMPORARY FUNCTION countBorderBoxDeclarations(css STRING) RETURNS NUMERIC LANGUAGE js AS ''' try { const ast = JSON.parse(css); diff --git a/sql/2020/01_CSS/sourcemap_preprocessors.sql b/sql/2020/01_CSS/sourcemap_preprocessors.sql new file mode 100644 index 00000000000..69bf9cb7e60 --- /dev/null +++ b/sql/2020/01_CSS/sourcemap_preprocessors.sql @@ -0,0 +1,23 @@ +#standardSQL +# Adoption of preprocessors as a percent of pages that use sourcemaps. +CREATE TEMPORARY FUNCTION getSourcemappedExts(payload STRING) RETURNS ARRAY LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var sass = JSON.parse($._sass); + return Object.keys(sass.sourcemaps.ext); +} catch (e) { + return []; +} +'''; + +SELECT + DISTINCT _TABLE_SUFFIX AS client, + ext, + COUNT(DISTINCT url) OVER (PARTITION BY _TABLE_SUFFIX, ext) AS freq, + COUNT(DISTINCT url) OVER (PARTITION BY _TABLE_SUFFIX) AS total, + COUNT(DISTINCT url) OVER (PARTITION BY _TABLE_SUFFIX, ext) / COUNT(DISTINCT url) OVER (PARTITION BY _TABLE_SUFFIX) AS pct +FROM + `httparchive.pages.2020_08_01_*`, + UNNEST(getSourcemappedExts(payload)) AS ext +ORDER BY + pct DESC \ No newline at end of file From 5d55af949067b08faeace1037838ec4d50cbc075 Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Mon, 14 Sep 2020 12:57:13 -0400 Subject: [PATCH 08/17] sourcemap adoption --- sql/2020/01_CSS/sourcemap_adoption.sql | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 sql/2020/01_CSS/sourcemap_adoption.sql diff --git a/sql/2020/01_CSS/sourcemap_adoption.sql b/sql/2020/01_CSS/sourcemap_adoption.sql new file mode 100644 index 00000000000..bf4456a68da --- /dev/null +++ b/sql/2020/01_CSS/sourcemap_adoption.sql @@ -0,0 +1,25 @@ +#standardSQL +# Percent of pages with CSS sourcemaps. +CREATE TEMPORARY FUNCTION countSourcemaps(payload STRING) RETURNS INT64 LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var sass = JSON.parse($._sass); + return sass.sourcemaps.count; +} catch (e) { + return 0; +} +'''; + +SELECT + client, + COUNTIF(has_sourcemap) AS freq, + COUNT(0) AS total, + COUNTIF(has_sourcemap) / COUNT(0) AS pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + countSourcemaps(payload) > 0 AS has_sourcemap + FROM + `httparchive.pages.2020_08_01_*`) +GROUP BY + client \ No newline at end of file From cefded1be137cac0c8efdb80d1fdb427949da515 Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Tue, 15 Sep 2020 15:18:28 -0400 Subject: [PATCH 09/17] custom props --- sql/2020/01_CSS/custom_property_names.sql | 44 ++++++++++++++++++++++ sql/2020/01_CSS/custom_property_values.sql | 44 ++++++++++++++++++++++ sql/2020/01_CSS/stylesheet_kbytes.sql | 15 ++++++++ 3 files changed, 103 insertions(+) create mode 100644 sql/2020/01_CSS/custom_property_names.sql create mode 100644 sql/2020/01_CSS/custom_property_values.sql create mode 100644 sql/2020/01_CSS/stylesheet_kbytes.sql diff --git a/sql/2020/01_CSS/custom_property_names.sql b/sql/2020/01_CSS/custom_property_names.sql new file mode 100644 index 00000000000..9887f5e1800 --- /dev/null +++ b/sql/2020/01_CSS/custom_property_names.sql @@ -0,0 +1,44 @@ +#standardSQL +# Most popular custom property names as a percent of pages. +CREATE TEMPORARY FUNCTION getCustomPropertyNames(payload STRING) RETURNS ARRAY LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var vars = JSON.parse($['_css-variables']); + return Object.keys(vars.summary); +} catch (e) { + return []; +} +'''; + +SELECT + client, + name, + COUNT(DISTINCT url) AS freq, + total, + COUNT(DISTINCT url) / total AS pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + url, + getCustomPropertyNames(payload) AS names, + total + FROM + `httparchive.pages.2020_08_01_*` + JOIN ( + SELECT + _TABLE_SUFFIX, + COUNT(DISTINCT url) AS total + FROM + `httparchive.pages.2020_08_01_*` + GROUP BY + _TABLE_SUFFIX) + USING (_TABLE_SUFFIX)), + UNNEST(names) AS name +GROUP BY + client, + name, + total +ORDER BY + pct DESC +LIMIT + 1000 \ No newline at end of file diff --git a/sql/2020/01_CSS/custom_property_values.sql b/sql/2020/01_CSS/custom_property_values.sql new file mode 100644 index 00000000000..9e0d5baad2b --- /dev/null +++ b/sql/2020/01_CSS/custom_property_values.sql @@ -0,0 +1,44 @@ +#standardSQL +# Most popular custom property values as a percent of pages. +CREATE TEMPORARY FUNCTION getCustomPropertyValues(payload STRING) RETURNS ARRAY LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload); + var vars = JSON.parse($['_css-variables']); + return Object.values(vars.summary); +} catch (e) { + return []; +} +'''; + +SELECT + client, + value, + COUNT(DISTINCT url) AS freq, + total, + COUNT(DISTINCT url) / total AS pct +FROM ( + SELECT + _TABLE_SUFFIX AS client, + url, + getCustomPropertyValues(payload) AS values, + total + FROM + `httparchive.pages.2020_08_01_*` + JOIN ( + SELECT + _TABLE_SUFFIX, + COUNT(DISTINCT url) AS total + FROM + `httparchive.pages.2020_08_01_*` + GROUP BY + _TABLE_SUFFIX) + USING (_TABLE_SUFFIX)), + UNNEST(values) AS value +GROUP BY + client, + value, + total +ORDER BY + pct DESC +LIMIT + 1000 \ No newline at end of file diff --git a/sql/2020/01_CSS/stylesheet_kbytes.sql b/sql/2020/01_CSS/stylesheet_kbytes.sql new file mode 100644 index 00000000000..e08d264775e --- /dev/null +++ b/sql/2020/01_CSS/stylesheet_kbytes.sql @@ -0,0 +1,15 @@ +#standardSQL +# Distribution of external stylesheet transfer size (compressed). +SELECT + percentile, + _TABLE_SUFFIX AS client, + APPROX_QUANTILES(bytesCSS / 1024, 1000)[OFFSET(percentile * 10)] AS stylesheet_kbytes +FROM + `httparchive.summary_pages.2020_08_01_*`, + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client \ No newline at end of file From bbc4db1de8fbf875b98bcadb2d950c977058adbe Mon Sep 17 00:00:00 2001 From: Dmitry Pokidov Date: Mon, 21 Sep 2020 21:34:05 +1000 Subject: [PATCH 10/17] Initial commit for CSS in JS --- sql/2020/01_CSS/css_in_js.sql | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 sql/2020/01_CSS/css_in_js.sql diff --git a/sql/2020/01_CSS/css_in_js.sql b/sql/2020/01_CSS/css_in_js.sql new file mode 100644 index 00000000000..671116b55d6 --- /dev/null +++ b/sql/2020/01_CSS/css_in_js.sql @@ -0,0 +1,19 @@ +#standardSQL +# CSS in JS. WIP +CREATE TEMPORARY FUNCTION getCssInJS(payload STRING) +RETURNS ARRAY LANGUAGE js AS '''''' + try { + var $ = JSON.parse(payload); + var css = JSON.parse($._css); + + return Array.isArray(css.css_in_js) && css.css_in_js.length > 0 ? css.css_in_js : [''NONE'']; + } catch (e) { + return [e.message]; + } +''''''; + +SELECT + url, + cssInJs +FROM `httparchive.sample_data.pages_mobile_10k` +CROSS JOIN UNNEST(getCssInJS(payload)) AS cssInJs \ No newline at end of file From 1f765dc2f72bfd2aeb7972e1c13b0cc95b6beb41 Mon Sep 17 00:00:00 2001 From: Lea Verou Date: Fri, 25 Sep 2020 20:35:11 +0300 Subject: [PATCH 11/17] Node script to download & combine CSS utils in one fell swoop --- sql/lib/css-utils.js | 639 +++++++++++++++++++++++++++++++++++++++ sql/lib/get-css-utils.js | 58 ++++ sql/lib/rework-utils.js | 157 ---------- 3 files changed, 697 insertions(+), 157 deletions(-) create mode 100644 sql/lib/css-utils.js create mode 100644 sql/lib/get-css-utils.js delete mode 100644 sql/lib/rework-utils.js diff --git a/sql/lib/css-utils.js b/sql/lib/css-utils.js new file mode 100644 index 00000000000..7399d37f5f8 --- /dev/null +++ b/sql/lib/css-utils.js @@ -0,0 +1,639 @@ +(() => { +const TOKENS = { + attribute: /\[\s*(?:(?\*|[-\w]*)\|)?(?[-\w\u{0080}-\u{FFFF}]+)\s*(?:(?\W?=)\s*(?.+?)\s*(?i)?\s*)?\]/gu, + id: /#(?(?:[-\w\u{0080}-\u{FFFF}]|\\.)+)/gu, + class: /\.(?(?:[-\w\u{0080}-\u{FFFF}]|\\.)+)/gu, + comma: /\s*,\s*/g, // must be before combinator + combinator: /\s*[\s>+~]\s*/g, // this must be after attribute + "pseudo-element": /::(?[-\w\u{0080}-\u{FFFF}]+)(?:\((?¶+)\))?/gu, // this must be before pseudo-class + "pseudo-class": /:(?[-\w\u{0080}-\u{FFFF}]+)(?:\((?¶+)\))?/gu, + type: /(?:(?\*|[-\w]*)\|)?(?[-\w\u{0080}-\u{FFFF}]+)|\*/gu // this must be last +}; + +const TOKENS_WITH_PARENS = new Set(["pseudo-class", "pseudo-element"]); +const TOKENS_WITH_STRINGS = new Set([...TOKENS_WITH_PARENS, "attribute"]); +const TRIM_TOKENS = new Set(["combinator", "comma"]); +const RECURSIVE_PSEUDO_CLASSES = new Set(["not", "is", "where", "has", "matches", "-moz-any", "-webkit-any"]); + +const TOKENS_FOR_RESTORE = Object.assign({}, TOKENS); +TOKENS_FOR_RESTORE["pseudo-element"] = RegExp(TOKENS["pseudo-element"].source.replace("(?¶+)", "(?.+?)"), "gu") +TOKENS_FOR_RESTORE["pseudo-class"] = RegExp(TOKENS["pseudo-class"].source.replace("(?¶+)", "(?.+)"), "gu") + +function gobbleParens(text, i) { + let str = "", stack = []; + + for (; i < text.length; i++) { + let char = text[i]; + + if (char === "(") { + stack.push(char); + } + else if (char === ")") { + if (stack.length > 0) { + stack.pop(); + } + else { + throw new Error("Closing paren without opening paren at " + i); + } + } + + str += char; + + if (stack.length === 0) { + return str; + } + } + + throw new Error("Opening paren without closing paren"); +} + +function tokenizeBy (text, grammar) { + if (!text) { + return []; + } + + var strarr = [text]; + + tokenloop: for (var token in grammar) { + let pattern = grammar[token]; + + for (var i=0; i < strarr.length; i++) { // Don’t cache length as it changes during the loop + var str = strarr[i]; + + if (typeof str === "string") { + pattern.lastIndex = 0; + + var match = pattern.exec(str); + + if (match) { + let from = match.index - 1; + let args = []; + let content = match[0]; + + let before = str.slice(0, from + 1); + if (before) { + args.push(before); + } + + args.push({ + type: token, + content, + ...match.groups + }); + + let after = str.slice(from + content.length + 1); + if (after) { + args.push(after); + } + + strarr.splice(i, 1, ...args); + } + + } + } + } + + let offset = 0; + for (let i=0; i { + strings.push({str, start}); + return quote + "§".repeat(content.length) + quote; + }); + + // Now that strings are out of the way, extract parens and replace them with parens with whitespace (to preserve offsets) + let parens = [], offset = 0, start; + while ((start = selector.indexOf("(", offset)) > -1) { + let str = gobbleParens(selector, start); + parens.push({str, start}); + selector = selector.substring(0, start) + "(" + "¶".repeat(str.length - 2) + ")" + selector.substring(start + str.length); + offset = start + str.length; + } + + // Now we have no nested structures and we can parse with regexes + let tokens = tokenizeBy(selector, TOKENS); + + // Now restore parens and strings in reverse order + function restoreNested(strings, regex, types) { + for (let str of strings) { + for (let token of tokens) { + if (types.has(token.type) && token.pos[0] < str.start && str.start < token.pos[1]) { + let content = token.content; + token.content = token.content.replace(regex, str.str); + + if (token.content !== content) { // actually changed? + // Re-evaluate groups + TOKENS_FOR_RESTORE[token.type].lastIndex = 0; + let match = TOKENS_FOR_RESTORE[token.type].exec(token.content); + let groups = match.groups; + Object.assign(token, groups); + } + } + } + } + } + + restoreNested(parens, /\(¶+\)/, TOKENS_WITH_PARENS); + restoreNested(strings, /(['"])§+?\1/, TOKENS_WITH_STRINGS); + + return tokens; +} + +// Convert a flat list of tokens into a tree of complex & compound selectors +function nestTokens(tokens, {list = true} = {}) { + if (list && tokens.find(t => t.type === "comma")) { + let selectors = [], temp = []; + + for (let i=0; i=0; i--) { + let token = tokens[i]; + + if (token.type === "combinator") { + let left = tokens.slice(0, i); + let right = tokens.slice(i + 1); + + if (left.length === 0 || right.length === 0) { + throw new Error(`Combinator ${token.content} used in selector ${left.length === 0? "start" : "end"}`); + } + + return { + type: "complex", + combinator: token.content, + left: nestTokens(left), + right: nestTokens(right) + }; + } + } + + // If we're here, there are no combinators, so it's just a list + return tokens.length === 1? tokens[0] : { + type: "compound", + list: [...tokens] // clone to avoid pointers messing up the AST + }; +} + +// Traverse an AST (or part thereof), in depth-first order +function walk(node, callback, o, parent) { + if (node.type === "complex") { + walk(node.left, callback, o, node); + walk(node.right, callback, o, node); + } + else if (node.type === "compound") { + for (let n of node.list) { + walk(n, callback, o, node); + } + } + else if (node.subtree && o && o.subtree) { + walk(node.subtree, callback, o, node); + } + + callback(node, parent); +} + +/** + * Parse a CSS selector + * @param selector {String} The selector to parse + * @param options.recursive {Boolean} Whether to parse the arguments of pseudo-classes like :is(), :has() etc. Defaults to true. + * @param options.list {Boolean} Whether this can be a selector list (A, B, C etc). Defaults to true. + */ +function parse(selector, {recursive = true, list = true} = {}) { + let tokens = tokenize(selector); + + if (!tokens) { + return null; + } + + let ast = nestTokens(tokens, {list}); + + if (recursive) { + walk(ast, node => { + if (node.type === "pseudo-class" && node.argument && RECURSIVE_PSEUDO_CLASSES.has(node.name)) { + node.subtree = parse(node.argument, {recursive: true, list: true}); + } + }); + } + + return ast; +} + +function specificityToNumber(specificity, base) { + base = base || Math.max(...specificity) + 1; + + return specificity[0] * base ** 2 + specificity[1] * base + specificity[0]; +} + +function maxIndexOf(arr) { + let max = arr[0], ret = 0; + + for (let i=0; i max) { + ret = i; + max = arr[i]; + } + } + + return arr.length === 0? -1 : ret; +} + +/** + * Calculate specificity of a selector. + * If the selector is a list, the max specificity is returned. + */ +function specificity(selector, {format = "array"} = {}) { + let ast = typeof selector === "object"? selector : parse(selector, {recursive: true}); + + if (!ast) { + return null; + } + + if (ast.type === "list") { + // Return max specificity + let base = 10; + let specificities = ast.list.map(s => { + let sp = specificity(s); + base = Math.max(base, ...sp); + return sp; + }); + let numbers = specificities.map(s => specificityToNumber(s, base)); + let i = maxIndexOf(numbers); + return specificities[i]; + } + + let ret = [0, 0, 0]; + + walk(ast, node => { + if (node.type === "id") { + ret[0]++; + } + else if (node.type === "class" || node.type === "attribute") { + ret[1]++; + } + else if ((node.type === "type" && node.content !== "*") || node.type === "pseudo-element") { + ret[2]++; + } + else if (node.type === "pseudo-class" && node.name !== "where") { + if (RECURSIVE_PSEUDO_CLASSES.has(node.name) && node.subtree) { + // Max of argument list + let sub = specificity(node.subtree); + sub.forEach((s, i) => ret[i] += s); + } + else { + ret[1]++; + } + } + }); + + return ret; +} + + +self.parsel = {gobbleParens, tokenizeBy, tokenize, nestTokens, walk, parse, specificityToNumber, specificity}; +})(); + +/* countDeclarations.js */ + +/** + * Count total declarations that pass a given test. + * @see {@link module:walkDeclarations} for arguments + * @returns {number} Declaration count that pass the provided conditions. + */ +function countDeclarations(rules, test) { + let ret = 0; + + walkDeclarations(rules, declaration => ret++, test); + + return ret; +} + + +/* countDeclarationsByProperty.js */ + +/** + * Count properties that pass a given test, in rules that pass a given test + * @see {@link module:walkDeclarations} for arguments + * @return {Object} Property names and declaration counts. Use `sumObject(ret)` to get total count. + */ +function countDeclarationsByProperty(rules, test) { + let ret = {}; + + walkDeclarations(rules, declaration => { + ret[declaration.property] = (ret[declaration.property] || 0) + 1; + }, test); + + return sortObject(ret); +} + + +/* extractFunctionCalls.js */ +/** + * Extract all or some function calls from a string + * @param {string} value - The value to extract function calls from. + * Note that this will also extract nested function calls, you can use `pos` to discard those if they are not of interest. + * @param {Object} [test] + * @param {string|RegExp|Function|Array} test.names + * @param {string|RegExp|Function|Array} test.args + * @return {Array} Array of objects, one for each function call with `{name, args, pos}` keys + */ +function extractFunctionCalls(value, test) { + // First, extract all function calls + let ret = []; + + for (let match of value.matchAll(/\b(?[\w-]+)\(/gi)) { + let index = match.index; + let openParen = index + match[0].length; + let rawArgs = parsel.gobbleParens(value, openParen - 1); + let args = rawArgs.slice(1, -1).trim(); + let name = match.groups.name; + + ret.push({name, pos: [index, index + match[0].length + rawArgs.length - 1], args}) + } + + if (test) { + ret = ret.filter(f => { + return matches(f.name, test && test.names) && matches(f.args, test && test.args); + }); + } + + return ret; +} + + +// Get distinct values of properties that pass a given test, in rules that pass a given test +// Returns object of properties with arrays of values. +function getPropertyValues(rules, test) { + let ret = {}; + + walkDeclarations(rules, declaration => { + if (matches(declaration.property, test && test.properties) && matches(declaration.value, test && test.values)) { + ret[declaration.property] = (ret[declaration.property] || new Set()); + ret[declaration.property].add(declaration.value); + } + }, {rules: test && test.rules}); + + return sortObject(ret); +} + + +/* incrementByKey.js */ + +/** + * Increment a value in an object, whether the key exists or not + * @param {Object} obj - The object + * @param {string} key - The object property + * @return {number} The new value + */ +function incrementByKey(obj, key) { + return obj[key] = (obj[key] || 0) + 1; +} + + +/* matches.js */ +/** + * Test whether a value passes a given test. + * The test could be a string, regexp, function, or array of any of these. + * This is at the core of most walkers. + * @param value + * @param {string|RegExp|Function|Array} [test] + * @return {Boolean} true if no test is provided, or test passes, false otherwise. + */ +function matches(value, test, not) { + if (!test) { + return !not; + } + + if (Array.isArray(test)) { + return test.some(t => matches(value, t)); + } + + let type = typeof test; + + if (type === "string") { + return value === test; + } + else if (type === "function") { + return test(value); + } + else if (test instanceof RegExp) { + return test.test(value); + } + + return false; +} + + +/* sortObject.js */ +/** + * Sort an object literal and return the result as a new object literal + * @param {Object} obj + * @param {Function} [f=x=>x] Optional function to pass arguments through, useful if e.g. we are sorting by a property of an object. + */ +function sortObject(obj, f = x => x) { + if (!obj) { + return obj; + } + + return Object.fromEntries(Object.entries(obj).sort((a, b) => f(b[1]) - f(a[1]))); +} + + +/* sumObject.js */ +/** + * Sum all values of an object and return the result + * @param {Object} obj + */ +function sumObject(obj) { + return Object.values(obj).reduce((a, c) => a + c, 0); +} + + +/* walkDeclarations.js */ + +/** + * Recursively walk all declarations + * @param {Object|Array} rules - AST, array of rules, or single rule + * @param {Function} callback - Callback to be executed on each declaration. Arguments: (declaration, rule) + * @param {Object} [test] - Conditions that need to be satisfied for a declaration to be visited, all optional + * @param {string|RegExp|Function|Array} test.properties - Test for property names + * @param {string|RegExp|Function|Array} test.values - Test for values + * @param {string|RegExp|Function|Array} test.rules - Test for rules + */ +function walkDeclarations(rules, callback, test) { + if (!rules) { + return; + } + + if (rules.stylesheet) { + // AST passed + rules = rules.stylesheet.rules; + } + else if (!Array.isArray(rules)) { + // Single rule + rules = [rules]; + } + + for (let rule of rules) { + if (!matches(rule, test && test.rules) || matches(rule, test && test.not && test.not.rules, true)) { + continue; + } + + // Walk declarations directly in rule + if (rule.declarations) { + for (let declaration of rule.declarations) { + if (declaration.type !== "declaration") { + continue; + } + + let {property, value} = declaration; + let important = false; + + value = value.replace(/\s*!important\s*$/, $0 => { + important = true; + return ""; + }); + + if (!test || + matches(property, test.properties) + && matches(value, test.values) + && !matches(property, test.not && test.not.properties, true) + && !matches(value, test.not && test.not.values, true) + ) { + callback({property, value, important}, rule); + } + } + } + + // Walk declarations of nested rules (e.g. @media, @supports have nested rules) + if (rule.rules) { + walkDeclarations(rule.rules, callback, test); + } + } +} + + +/* walkRules.js */ +/** + * Recursively walk all "normal" rules, i.e. rules with selectors + * @param rules {Object|Array} AST or array of CSS rules + * @param callback {Function} Function to be executed for each matching rule. Rule passed as the only argument. + * @param [test] {Object} + * @param test.rules {string|RegExp|Function|Array} Which rules the callback runs on + * @param test.type {string|RegExp|Function|Array} Which rule types the walker runs on + * @param test.ancestors {string|RegExp|Function|Array} Which rules the walker descends on + * @return The return value of the callback (which also breaks the loop) or undefined. + */ +function walkRules(rules, callback, test) { + if (!rules) { + return; + } + + if (!Array.isArray(rules)) { + // AST passed + rules = rules.stylesheet.rules; + } + + for (let rule of rules) { + if (!test || + matches(rule, test && test.rules) + && matches(rule.type, test && test.type) + && !matches(rule, test.not && test.not.rules, true) + && !matches(rule.type, test.not && test.not.type, true) + ) { + let ret = callback(rule); + + if (ret !== undefined) { + // Break loop and return immediately + return ret; + } + } + + if ( + matches(rule, test && test.ancestors) + && !matches(rule, test && test.not && test.not.ancestors, true) + ) { + if (rule.rules) { + walkRules(rule.rules, callback, test); + } + } + } +} + + +/* walkSelectors.js */ + +/** + * Walk all selectors in rules that have selectors + * @param {Object|Array} rules - AST, array of rules, or single rule + * @param {Function} callback - Function to be executed for each matching rule. Rule passed as the only argument. + * @param {Object} [test] + * @param {string|RegExp|Function|Array} test.selectors - Which selectors the callback runs on + * @see {@link module:walkRules} for test properties available that filter the rules inspected + */ +function walkSelectors(rules, callback, test) { + if (rules.stylesheet) { + // AST passed + rules = rules.stylesheet.rules; + } + else if (!Array.isArray(rules)) { + // Single rule + rules = [rules]; + } + + walkRules(rules, rule => { + if (rule.selectors) { + for (let selector of rule.selectors) { + if (matches(selector, test && test.selectors)) { + callback(selector, rule.selectors); + } + } + } + }, test); +} diff --git a/sql/lib/get-css-utils.js b/sql/lib/get-css-utils.js new file mode 100644 index 00000000000..4d9ee71caeb --- /dev/null +++ b/sql/lib/get-css-utils.js @@ -0,0 +1,58 @@ +const fs = require("fs"); +const https = require("https"); + +function readFile(file, enc = "utf8") { + return new Promise((resolve, reject) => { + fs.readFile(file, enc, (err,data) => { + if (err) { + reject(err); + } + + resolve(data); + }); + }); +} + +function writeFile(file, contents, enc) { + return new Promise((resolve, reject) => { + fs.writeFile(file, contents, enc, (err) => { + if (err) { + reject(err); + } + + resolve(); + }); + }); +} + +function downloadFile(url) { + return new Promise((resolve, reject) => { + let data = ""; + let request = https.get(url, response => { + response.on("data", chunk => data += chunk); + response.on("end", () => resolve(data)); + }); + + request.on("error", err => { + reject(err); + }); + }) + +} + +const urls = [ + "https://projects.verou.me/parsel/parsel_nomodule.js", + "https://projects.verou.me/rework-utils/rework-utils.js" +]; + +(async ()=>{ + +let contents = urls.map(async url => await downloadFile(url)); + +contents = await Promise.all(contents); + +contents = contents.join("\n\n"); + +writeFile("./css-utils.js", contents); + +})(); diff --git a/sql/lib/rework-utils.js b/sql/lib/rework-utils.js deleted file mode 100644 index 4522ec81024..00000000000 --- a/sql/lib/rework-utils.js +++ /dev/null @@ -1,157 +0,0 @@ -/** - * Test whether a value passes a given test. - * The test could be a string, regexp, function, or array of any of these. - * This is at the core of most walkers. - * @param value - * @param {string|RegExp|Function|Array} [test] - * @return {Boolean} true if no test is provided, or test passes, false otherwise. - */ -function matches(value, test) { - if (!test) { - return true; - } - - if (Array.isArray(test)) { - return test.some(t => matches(value, t)); - } - - let type = typeof test; - - if (type === "string") { - return value === test; - } - else if (type === "function") { - return test(value); - } - else if (test instanceof RegExp) { - return test.test(value); - } - - return false; -} - -/** - * Recursively walk all declarations - * @param {Object|Array} rules - AST, array of rules, or single rule - * @param {Function} callback - Callback to be executed on each declaration. Arguments: (declaration, rule) - * @param {Object} [test] - Conditions that need to be satisfied for a declaration to be visited, all optional - * @param {string|RegExp|Function|Array} test.properties - Test for property names - * @param {string|RegExp|Function|Array} test.values - Test for values - * @param {string|RegExp|Function|Array} test.rules - Test for rules - */ -function walkDeclarations(rules, callback, test) { - if (!rules) { - return; - } - - if (rules.stylesheet) { - // AST passed - rules = rules.stylesheet.rules; - } - else if (!Array.isArray(rules)) { - // Single rule - rules = [rules]; - } - - for (let rule of rules) { - if (!matches(rule, test && test.rules)) { - continue; - } - - // Walk declarations directly in rule - if (rule.declarations) { - for (let declaration of rule.declarations) { - if (matches(declaration.property, test && test.properties) && matches(declaration.value, test && test.values)) { - callback(declaration, rule); - } - } - } - - // Walk declarations of nested rules (e.g. @media, @supports have nested rules) - if (rule.rules) { - walkDeclarations(rule.rules, callback, test); - } - } -} - -/** - * Recursively walk all "normal" rules, i.e. rules with selectors - * @param rules {Object|Array} AST or array of CSS rules - * @param callback {Function} Function to be executed for each matching rule. Rule passed as the only argument. - * @param [test] {Object} - * @param test.rules {string|RegExp|Function|Array} Which rules the callback runs on - * @param test.ancestors {string|RegExp|Function|Array} Which rules the walker descends on - * @return The return value of the callback (which also breaks the loop) or undefined. - */ -function walkRules(rules, callback, test) { - if (!rules) { - return; - } - - if (!Array.isArray(rules)) { - // AST passed - rules = rules.stylesheet.rules; - } - - for (let rule of rules) { - if (matches(rule, test && test.rules)) { - let ret = callback(rule); - - if (ret !== undefined) { - // Break loop and return immediately - return ret; - } - } - - if (matches(rule, test && test.ancestors)) { - if (rule.rules) { - walkRules(rule.rules, callback, test); - } - } - } -} - -/** - * Sort an object literal and return the result as a new object literal - * @param {Object} obj - * @param {Function} [f=x=>x] Optional function to pass arguments through, useful if e.g. we are sorting by a property of an object. - */ -function sortObject(obj, f = x => x) { - return Object.fromEntries(Object.entries(obj).sort((a, b) => f(b[1]) - f(a[1]))); -} - -/** - * Sum all values of an object and return the result - * @param {Object} obj - */ -function sumObject(obj) { - Object.values(obj).reduce((a, c) => a + c, 0); -} - -/** - * Count total declarations that pass a given test. - * @see {@link module:walkDeclarations} for arguments - * @returns {number} Declaration count that pass the provided conditions. - */ -function countDeclarations(rules, test) { - let ret = 0; - - walkDeclarations(rules, declaration => ret++, test); - - return ret; -} - -/** - * Count properties that pass a given test, in rules that pass a given test - * @see {@link module:walkDeclarations} for arguments - * @return {Object} Property names and declaration counts. Use `sumObject(ret)` to get total count. - */ -function countDeclarationsByProperty(rules, test) { - let ret = {}; - - walkDeclarations(rules, declaration => { - ret[declaration.property] = (ret[declaration.property] || 0) + 1; - }, test); - - return sortObject(ret); -} From 7809af78c06992916d108b5338f16c253f52e440 Mon Sep 17 00:00:00 2001 From: Lea Verou Date: Fri, 25 Sep 2020 21:15:24 +0300 Subject: [PATCH 12/17] Update css-utils.js --- sql/lib/css-utils.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/lib/css-utils.js b/sql/lib/css-utils.js index 7399d37f5f8..4d8171c6373 100644 --- a/sql/lib/css-utils.js +++ b/sql/lib/css-utils.js @@ -1,4 +1,4 @@ -(() => { +var parsel = (() => { const TOKENS = { attribute: /\[\s*(?:(?\*|[-\w]*)\|)?(?[-\w\u{0080}-\u{FFFF}]+)\s*(?:(?\W?=)\s*(?.+?)\s*(?i)?\s*)?\]/gu, id: /#(?(?:[-\w\u{0080}-\u{FFFF}]|\\.)+)/gu, @@ -268,7 +268,7 @@ function parse(selector, {recursive = true, list = true} = {}) { function specificityToNumber(specificity, base) { base = base || Math.max(...specificity) + 1; - return specificity[0] * base ** 2 + specificity[1] * base + specificity[0]; + return specificity[0] * base ** 2 + specificity[1] * base + specificity[2]; } function maxIndexOf(arr) { @@ -336,7 +336,7 @@ function specificity(selector, {format = "array"} = {}) { } -self.parsel = {gobbleParens, tokenizeBy, tokenize, nestTokens, walk, parse, specificityToNumber, specificity}; +return {gobbleParens, tokenizeBy, tokenize, nestTokens, walk, parse, specificityToNumber, specificity}; })(); /* countDeclarations.js */ From fda741096e230934a7e2c1d7ca5ef8190d3e10a4 Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Fri, 25 Sep 2020 15:21:28 -0400 Subject: [PATCH 13/17] css queries --- sql/2020/01_CSS/specificity.sql | 97 ++++++++++++++++++++ sql/2020/01_CSS/specificity_hacks.sql | 124 ++++++++++++++++++++++++++ sql/2020/01_CSS/stylesheet_count.sql | 31 +++++++ 3 files changed, 252 insertions(+) create mode 100644 sql/2020/01_CSS/specificity.sql create mode 100644 sql/2020/01_CSS/specificity_hacks.sql create mode 100644 sql/2020/01_CSS/stylesheet_count.sql diff --git a/sql/2020/01_CSS/specificity.sql b/sql/2020/01_CSS/specificity.sql new file mode 100644 index 00000000000..632232863ea --- /dev/null +++ b/sql/2020/01_CSS/specificity.sql @@ -0,0 +1,97 @@ +#standardSQL +CREATE TEMPORARY FUNCTION getSpecificityInfo(css STRING) +RETURNS STRUCT< + ruleCount NUMERIC, + selectorCount NUMERIC, + selectorsPerRule NUMERIC, + avgSpecificity STRING, + maxSpecificity STRING, + medianSpecificity STRING +> LANGUAGE js AS ''' +try { + function extractSpecificity(ast) { + let ret = { + selectorCount: 0, + ruleCount: 0, + specificityCount: {}, + maxSpecifity: [0, 0, 0] + }; + + let ss = [0, 0, 0]; + + walkRules(ast, rule => { + ret.ruleCount++; + + for (let selector of rule.selectors) { + ret.selectorCount++; + let s = parsel.specificity(selector); + ss = ss.map((a, i) => a + s[i]); + let max = Math.max(...s); + + incrementByKey(ret.specificityCount, max <= 5? s + "" : "higher"); + + let base = Math.max(...ret.maxSpecifity, ...s); + if (parsel.specificityToNumber(s, base) > parsel.specificityToNumber(ret.maxSpecifity, base)) { + ret.maxSpecifity = s; + ret.maxSpecifitySelector = selector; + } + } + }, {type: "rule"}); + + ret.selectorsPerRule = ret.selectorCount / ret.ruleCount; + ret.avgSpecificity = ss.map(s => s / ret.selectorCount); + + return ret; + } + + function getMedian(specificities) { + const total = Object.values(specificities).reduce((sum, value) => sum + value, 0); + if (total == 0) { + return null; + } + let cdf = 0; + const cdfEntries = Object.entries(specificities).map(([specificity, count]) => { + cdf += count; + return [specificity, cdf / total]; + }); + + for ([specificity, cdf] in cdfEntries) { + if (cdf >= 0.5) { + return specificity; + } + } + + return null; + } + + function specificityToString(specificity) { + return specificity && specificity.join(','); + } + + function sanitize(value) { + return isNaN(value) ? null : value; + } + + const ast = JSON.parse(css); + const specificity = extractSpecificity(ast); + + return { + ruleCount: specificity.ruleCount, + selectorCount: specificity.selectorCount, + selectorsPerRule: sanitize(specificity.selectorsPerRule), + avgSpecificity: specificityToString(specificity.avgSpecificity), + maxSpecificity: specificityToString(specificity.maxSpecificity), + medianSpecificity: getMedian(specificity.specificityCount) + }; +} catch (e) { + return {avgSpecificity: e}; +} +''' +OPTIONS (library="gs://httparchive/lib/css-utils.js"); + + + SELECT + client, + getSpecificityInfo(css) AS info + FROM + `httparchive.almanac.parsed_css_1k` \ No newline at end of file diff --git a/sql/2020/01_CSS/specificity_hacks.sql b/sql/2020/01_CSS/specificity_hacks.sql new file mode 100644 index 00000000000..114710dfc83 --- /dev/null +++ b/sql/2020/01_CSS/specificity_hacks.sql @@ -0,0 +1,124 @@ +#standardSQL +CREATE TEMPORARY FUNCTION getSpecificityHacks(css STRING) +RETURNS STRUCT< + bem NUMERIC, + attribute_id NUMERIC, + duplicate_classes NUMERIC, + root_descendant NUMERIC, + html_descendant NUMERIC, + not_id_descendant NUMERIC +> LANGUAGE js AS ''' +try { + +function compute() { + +let ret = { + bem: 0, + attribute_id: 0, + duplicate_classes: 0, + root_descendant: 0, + html_descendant: 0, + not_id_descendant: 0, +}; + +const bem = /^(?=.+--|.+__)[a-z0-9-]+(__[\\w-]+)?(--[\\w-]+)?$/i; + +walkSelectors(ast, selector => { + let sast = parsel.parse(selector, {list: false, recursive: false}); + + parsel.walk(sast, (node, parent) => { + if (node.type === "attribute" && node.name === "id" && node.operator === "=") { + ret.attribute_id++; + } + else if (node.type === "compound") { + // Look for duplicate classes + let classes = new Set(); + + for (let s of node.list) { + if (s.type === "class") { + if (classes.has(s.name)) { + // Found a duplicate class + ret.duplicate_classes++; + break; + } + + classes.add(s.name); + } + } + } + else if (!parent && node.type === "complex") { + let first = node; + // Find the firstmost compound + while ((first = first.left) && first.type === "complex"); + + if (first.combinator === " ") { + first = first.left; + } + + if (first.type === "pseudo-class" && first.name === "root") { + ret.root_descendant++; + } + else if (first.type === "type" && first.name === "html") { + ret.html_descendant++; + } + else if (first.type === "pseudo-class" && first.name === "not" && first.argument.startsWith("#")) { + ret.not_id_descendant++; + } + } + else if (node.type === "class" && (!parent || parent.type === "complex" && parent.combinator === " ")) { + if (bem.test(node.name)) { + ret.bem++; + } + } + }, {subtree: true}); +}); + +return ret; + +} + + const ast = JSON.parse(css); + return compute(ast); +} catch (e) { + return null; +} +''' +OPTIONS (library="gs://httparchive/lib/css-utils.js"); + +SELECT + percentile, + client, + COUNT(DISTINCT page) AS total, + COUNT(DISTINCT IF(hack.bem > 0, page, NULL)) AS bem_pages, + COUNT(DISTINCT IF(hack.bem > 0, page, NULL)) / COUNT(DISTINCT page) AS bem_pages_pct, + APPROX_QUANTILES(hack.bem, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS bem_per_page, + COUNT(DISTINCT IF(hack.attribute_id > 0, page, NULL)) AS attribute_id_pages, + COUNT(DISTINCT IF(hack.attribute_id > 0, page, NULL)) / COUNT(DISTINCT page) AS attribute_id_pages_pct, + APPROX_QUANTILES(hack.attribute_id, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS attribute_id_per_page, + COUNT(DISTINCT IF(hack.duplicate_classes > 0, page, NULL)) AS duplicate_classes_pages, + COUNT(DISTINCT IF(hack.duplicate_classes > 0, page, NULL)) / COUNT(DISTINCT page) AS duplicate_classes_pages_pct, + APPROX_QUANTILES(hack.duplicate_classes, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS duplicate_classes_per_page, + COUNT(DISTINCT IF(hack.root_descendant > 0, page, NULL)) AS root_descendant_pages, + COUNT(DISTINCT IF(hack.root_descendant > 0, page, NULL)) / COUNT(DISTINCT page) AS root_descendant_pages_pct, + APPROX_QUANTILES(hack.root_descendant, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS root_descendantem_per_page, + COUNT(DISTINCT IF(hack.html_descendant > 0, page, NULL)) AS html_descendant_pages, + COUNT(DISTINCT IF(hack.html_descendant > 0, page, NULL)) / COUNT(DISTINCT page) AS html_descendant_pages_pct, + APPROX_QUANTILES(hack.html_descendant, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS html_descendant_per_page, + COUNT(DISTINCT IF(hack.not_id_descendant > 0, page, NULL)) AS not_id_descendant_pages, + COUNT(DISTINCT IF(hack.not_id_descendant > 0, page, NULL)) / COUNT(DISTINCT page) AS not_id_descendant_pages_pct, + APPROX_QUANTILES(hack.not_id_descendant, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS not_id_descendant_per_page, + COUNTIF(hack IS NULL) AS parse_errors +FROM ( + SELECT + client, + page, + getSpecificityHacks(css) AS hack + FROM + `httparchive.almanac.parsed_css`), + UNNEST([10, 25, 50, 75, 90, 95, 99, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client \ No newline at end of file diff --git a/sql/2020/01_CSS/stylesheet_count.sql b/sql/2020/01_CSS/stylesheet_count.sql new file mode 100644 index 00000000000..cdfd7ccbc82 --- /dev/null +++ b/sql/2020/01_CSS/stylesheet_count.sql @@ -0,0 +1,31 @@ +#standardSQL +CREATE TEMPORARY FUNCTION getStylesheets(payload STRING) +RETURNS STRUCT LANGUAGE js AS ''' +try { + var $ = JSON.parse(payload) + var sass = JSON.parse($._sass); + return sass.stylesheets; +} catch (e) { + return null; +} +'''; + +SELECT + percentile, + _TABLE_SUFFIX AS client, + APPROX_QUANTILES(stylesheets.inline, 1000)[OFFSET(percentile * 10)] AS num_inline_stylesheets, + APPROX_QUANTILES(stylesheets.remote, 1000)[OFFSET(percentile * 10)] AS num_remote_stylesheets, +FROM ( + SELECT + _TABLE_SUFFIX, + url, + getStylesheets(payload) AS stylesheets + FROM + `httparchive.pages.2020_08_01_*`), + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client \ No newline at end of file From 86e06119e0a143e68183f16285a85c2593d0a7cb Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Fri, 25 Sep 2020 16:00:45 -0400 Subject: [PATCH 14/17] use partition --- sql/2020/01_CSS/specificity_hacks.sql | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sql/2020/01_CSS/specificity_hacks.sql b/sql/2020/01_CSS/specificity_hacks.sql index 114710dfc83..0e2bcf71d71 100644 --- a/sql/2020/01_CSS/specificity_hacks.sql +++ b/sql/2020/01_CSS/specificity_hacks.sql @@ -114,7 +114,9 @@ FROM ( page, getSpecificityHacks(css) AS hack FROM - `httparchive.almanac.parsed_css`), + `httparchive.almanac.parsed_css` + WHERE + date = '2020-08-01'), UNNEST([10, 25, 50, 75, 90, 95, 99, 100]) AS percentile GROUP BY percentile, From 83a185c85ec5f3611b5a65ba2bbf2b9b24581bb6 Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Sat, 26 Sep 2020 02:27:34 -0400 Subject: [PATCH 15/17] fix hacks grouping --- sql/2020/01_CSS/specificity.sql | 2 +- sql/2020/01_CSS/specificity_hacks.sql | 67 ++++++++++++++++----------- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/sql/2020/01_CSS/specificity.sql b/sql/2020/01_CSS/specificity.sql index 632232863ea..06ed8d77dd8 100644 --- a/sql/2020/01_CSS/specificity.sql +++ b/sql/2020/01_CSS/specificity.sql @@ -73,7 +73,7 @@ try { } const ast = JSON.parse(css); - const specificity = extractSpecificity(ast); + let specificity = extractSpecificity(ast); return { ruleCount: specificity.ruleCount, diff --git a/sql/2020/01_CSS/specificity_hacks.sql b/sql/2020/01_CSS/specificity_hacks.sql index 0e2bcf71d71..22db00ee0a4 100644 --- a/sql/2020/01_CSS/specificity_hacks.sql +++ b/sql/2020/01_CSS/specificity_hacks.sql @@ -88,36 +88,49 @@ OPTIONS (library="gs://httparchive/lib/css-utils.js"); SELECT percentile, client, - COUNT(DISTINCT page) AS total, - COUNT(DISTINCT IF(hack.bem > 0, page, NULL)) AS bem_pages, - COUNT(DISTINCT IF(hack.bem > 0, page, NULL)) / COUNT(DISTINCT page) AS bem_pages_pct, - APPROX_QUANTILES(hack.bem, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS bem_per_page, - COUNT(DISTINCT IF(hack.attribute_id > 0, page, NULL)) AS attribute_id_pages, - COUNT(DISTINCT IF(hack.attribute_id > 0, page, NULL)) / COUNT(DISTINCT page) AS attribute_id_pages_pct, - APPROX_QUANTILES(hack.attribute_id, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS attribute_id_per_page, - COUNT(DISTINCT IF(hack.duplicate_classes > 0, page, NULL)) AS duplicate_classes_pages, - COUNT(DISTINCT IF(hack.duplicate_classes > 0, page, NULL)) / COUNT(DISTINCT page) AS duplicate_classes_pages_pct, - APPROX_QUANTILES(hack.duplicate_classes, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS duplicate_classes_per_page, - COUNT(DISTINCT IF(hack.root_descendant > 0, page, NULL)) AS root_descendant_pages, - COUNT(DISTINCT IF(hack.root_descendant > 0, page, NULL)) / COUNT(DISTINCT page) AS root_descendant_pages_pct, - APPROX_QUANTILES(hack.root_descendant, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS root_descendantem_per_page, - COUNT(DISTINCT IF(hack.html_descendant > 0, page, NULL)) AS html_descendant_pages, - COUNT(DISTINCT IF(hack.html_descendant > 0, page, NULL)) / COUNT(DISTINCT page) AS html_descendant_pages_pct, - APPROX_QUANTILES(hack.html_descendant, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS html_descendant_per_page, - COUNT(DISTINCT IF(hack.not_id_descendant > 0, page, NULL)) AS not_id_descendant_pages, - COUNT(DISTINCT IF(hack.not_id_descendant > 0, page, NULL)) / COUNT(DISTINCT page) AS not_id_descendant_pages_pct, - APPROX_QUANTILES(hack.not_id_descendant, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS not_id_descendant_per_page, - COUNTIF(hack IS NULL) AS parse_errors + COUNT(0) AS total, + COUNTIF(bem > 0) AS bem_pages, + COUNTIF(bem > 0) / COUNT(0) AS bem_pages_pct, + APPROX_QUANTILES(bem, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS bem_per_page, + COUNTIF(attribute_id > 0) AS attribute_id_pages, + COUNTIF(attribute_id > 0) / COUNT(0) AS attribute_id_pages_pct, + APPROX_QUANTILES(attribute_id, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS attribute_id_per_page, + COUNTIF(duplicate_classes > 0) AS duplicate_classes_pages, + COUNTIF(duplicate_classes > 0) / COUNT(0) AS duplicate_classes_pages_pct, + APPROX_QUANTILES(duplicate_classes, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS duplicate_classes_per_page, + COUNTIF(root_descendant > 0) AS root_descendant_pages, + COUNTIF(root_descendant > 0) / COUNT(0) AS root_descendant_pages_pct, + APPROX_QUANTILES(root_descendant, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS root_descendantem_per_page, + COUNTIF(html_descendant > 0) AS html_descendant_pages, + COUNTIF(html_descendant > 0) / COUNT(0) AS html_descendant_pages_pct, + APPROX_QUANTILES(html_descendant, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS html_descendant_per_page, + COUNTIF(not_id_descendant > 0) AS not_id_descendant_pages, + COUNTIF(not_id_descendant > 0) / COUNT(0) AS not_id_descendant_pages_pct, + APPROX_QUANTILES(not_id_descendant, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS not_id_descendant_per_page FROM ( SELECT client, - page, - getSpecificityHacks(css) AS hack - FROM - `httparchive.almanac.parsed_css` - WHERE - date = '2020-08-01'), - UNNEST([10, 25, 50, 75, 90, 95, 99, 100]) AS percentile + SUM(hack.bem) AS bem, + SUM(hack.attribute_id) AS attribute_id, + SUM(hack.duplicate_classes) AS duplicate_classes, + SUM(hack.root_descendant) AS root_descendant, + SUM(hack.html_descendant) AS html_descendant, + SUM(hack.not_id_descendant) AS not_id_descendant + FROM ( + SELECT + client, + page, + getSpecificityHacks(css) AS hack + FROM + `httparchive.almanac.parsed_css` + WHERE + date = '2020-08-01' AND + # Limit the size of the CSS to avoid OOM crashes. + LENGTH(css) < 0.1 * 1024 * 1024) + GROUP BY + client, + page), + UNNEST([10, 25, 50, 75, 90, 100]) AS percentile GROUP BY percentile, client From 5231ab3f3ae1ebda0712cb24aedc2469f7cabc8b Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Sun, 27 Sep 2020 01:50:49 -0400 Subject: [PATCH 16/17] selectors and specificity --- .../01_CSS/selector_parts_freq_per_page.sql | 141 ++++++++++++++++++ sql/2020/01_CSS/selectors.sql | 111 ++++++++++++++ sql/2020/01_CSS/specificity.sql | 100 ++++++++----- sql/2020/01_CSS/top_selector_attributes.sql | 88 +++++++++++ sql/2020/01_CSS/top_selector_classes.sql | 88 +++++++++++ sql/2020/01_CSS/top_selector_ids.sql | 88 +++++++++++ .../01_CSS/top_selector_pseudo_classes.sql | 88 +++++++++++ .../01_CSS/top_selector_pseudo_elements.sql | 88 +++++++++++ 8 files changed, 755 insertions(+), 37 deletions(-) create mode 100644 sql/2020/01_CSS/selector_parts_freq_per_page.sql create mode 100644 sql/2020/01_CSS/selectors.sql create mode 100644 sql/2020/01_CSS/top_selector_attributes.sql create mode 100644 sql/2020/01_CSS/top_selector_classes.sql create mode 100644 sql/2020/01_CSS/top_selector_ids.sql create mode 100644 sql/2020/01_CSS/top_selector_pseudo_classes.sql create mode 100644 sql/2020/01_CSS/top_selector_pseudo_elements.sql diff --git a/sql/2020/01_CSS/selector_parts_freq_per_page.sql b/sql/2020/01_CSS/selector_parts_freq_per_page.sql new file mode 100644 index 00000000000..b1a3c972930 --- /dev/null +++ b/sql/2020/01_CSS/selector_parts_freq_per_page.sql @@ -0,0 +1,141 @@ +#standardSQL +CREATE TEMPORARY FUNCTION getSelectorParts(css STRING) +RETURNS STRUCT< + class ARRAY>, + id ARRAY>, + attribute ARRAY>, + pseudo_class ARRAY>, + pseudo_element ARRAY> +> LANGUAGE js AS ''' +try { + function compute(ast) { + let ret = { + class: {}, + id: {}, + attribute: {}, + "pseudo-class": {}, + "pseudo-element": {} + }; + + walkSelectors(ast, selector => { + let sast = parsel.parse(selector, {list: false}); + + parsel.walk(sast, node => { + if (node.type in ret) { + incrementByKey(ret[node.type], node.name); + } + }, {subtree: true}); + }); + + for (let type in ret) { + ret[type] = sortObject(ret[type]); + } + + return ret; + } + + function unzip(obj) { + return Object.entries(obj).filter(([name, value]) => { + return !isNaN(value); + }).map(([name, value]) => ({name, value})); + } + + const ast = JSON.parse(css); + let parts = compute(ast); + return { + class: unzip(parts.class), + id: unzip(parts.id), + attribute: unzip(parts.attribute), + pseudo_class: unzip(parts['pseudo-class']), + pseudo_element: unzip(parts['pseudo-element']) + } +} catch (e) { + return {class: [{name: e, value: 0}]}; +} +''' +OPTIONS (library="gs://httparchive/lib/css-utils.js"); + +# https://www.stevenmoseley.com/blog/tech/high-performance-sql-correlated-scalar-aggregate-reduction-queries +CREATE TEMPORARY FUNCTION encode(comparator STRING, data STRING) RETURNS STRING AS ( + CONCAT(LPAD(comparator, 11, '0'), data) +); +CREATE TEMPORARY FUNCTION decode(value STRING) RETURNS STRING AS ( + SUBSTR(value, 12) +); + +WITH selector_parts AS ( + SELECT + client, + page, + url, + getSelectorParts(css) AS parts + FROM + `httparchive.almanac.parsed_css` + WHERE + date = '2020-08-01' AND + # Limit the size of the CSS to avoid OOM crashes. + LENGTH(css) < 0.1 * 1024 * 1024 +) + +SELECT + client, + decode(MAX(encode(CAST(class_freq AS STRING), class_name))) AS class_name, + MAX(class_freq) AS class_freq, + decode(MAX(encode(CAST(id_freq AS STRING), id_name))) AS id_name, + MAX(id_freq) AS id_freq, + decode(MAX(encode(CAST(attribute_freq AS STRING), attribute_name))) AS attribute_name, + MAX(attribute_freq) AS attribute_freq, + decode(MAX(encode(CAST(pseudo_class_freq AS STRING), pseudo_class_name))) AS pseudo_class_name, + MAX(pseudo_class_freq) AS pseudo_class_freq, + decode(MAX(encode(CAST(pseudo_element_freq AS STRING), pseudo_element_name))) AS pseudo_element_name, + MAX(pseudo_element_freq) AS pseudo_element_freq +FROM ( + SELECT + client, + class.name AS class_name, + SUM(class.value) OVER (PARTITION BY client, class.name) AS class_freq + FROM + selector_parts, + UNNEST(parts.class) AS class) +JOIN ( + SELECT + client, + id.name AS id_name, + SUM(id.value) OVER (PARTITION BY client, id.name) AS id_freq + FROM + selector_parts, + UNNEST(parts.id) AS id) +USING + (client) +JOIN ( + SELECT + client, + attribute.name AS attribute_name, + SUM(attribute.value) OVER (PARTITION BY client, attribute.name) AS attribute_freq + FROM + selector_parts, + UNNEST(parts.attribute) AS attribute) +USING + (client) +JOIN ( + SELECT + client, + pseudo_class.name AS pseudo_class_name, + SUM(pseudo_class.value) OVER (PARTITION BY client, pseudo_class.name) AS pseudo_class_freq + FROM + selector_parts, + UNNEST(parts.pseudo_class) AS pseudo_class) +USING + (client) +JOIN ( + SELECT + client, + pseudo_element.name AS pseudo_element_name, + SUM(pseudo_element.value) OVER (PARTITION BY client, pseudo_element.name) AS pseudo_element_freq + FROM + selector_parts, + UNNEST(parts.pseudo_element) AS pseudo_element) +USING + (client) +GROUP BY + client \ No newline at end of file diff --git a/sql/2020/01_CSS/selectors.sql b/sql/2020/01_CSS/selectors.sql new file mode 100644 index 00000000000..3c2c4261807 --- /dev/null +++ b/sql/2020/01_CSS/selectors.sql @@ -0,0 +1,111 @@ +#standardSQL +CREATE TEMPORARY FUNCTION getSpecificityInfo(css STRING) +RETURNS STRUCT< + ruleCount NUMERIC, + selectorCount NUMERIC, + distribution ARRAY> +> LANGUAGE js AS ''' +try { + function extractSpecificity(ast) { + let ret = { + selectorCount: 0, + ruleCount: 0, + specificityCount: {}, + maxSpecifity: [0, 0, 0] + }; + + let ss = [0, 0, 0]; + + walkRules(ast, rule => { + ret.ruleCount++; + + for (let selector of rule.selectors) { + ret.selectorCount++; + let s = parsel.specificity(selector); + ss = ss.map((a, i) => a + s[i]); + let max = Math.max(...s); + + incrementByKey(ret.specificityCount, max <= 5? s + "" : "higher"); + + let base = Math.max(...ret.maxSpecifity, ...s); + if (parsel.specificityToNumber(s, base) > parsel.specificityToNumber(ret.maxSpecifity, base)) { + ret.maxSpecifity = s; + ret.maxSpecifitySelector = selector; + } + } + }, {type: "rule"}); + + ret.selectorsPerRule = ret.selectorCount / ret.ruleCount; + ret.avgSpecificity = ss.map(s => s / ret.selectorCount); + + return ret; + } + + function toComparableString(specificity) { + if (!specificity) { + return null; + } + if (specificity.split(',').length !== 3) { + return null; + } + + // The highest unit of specificity is 9398, so we need 5 digits of padding. + // Fun fact: the most specific selector in the dataset is 1065,9398,7851! + return specificity.split(',').map(i => i.padStart(5, '0')).join(''); + } + + const ast = JSON.parse(css); + let specificity = extractSpecificity(ast); + let ruleCount = specificity.ruleCount; + let selectorCount = specificity.selectorCount; + let distribution = Object.entries(specificity.specificityCount).map(([specificity, freq]) => { + return { + specificity, + freq, + specificity_cmp: toComparableString(specificity) + } + }); + + return { + ruleCount, + selectorCount, + distribution + }; +} catch (e) { + return null; +} +''' +OPTIONS (library="gs://httparchive/lib/css-utils.js"); + +SELECT + percentile, + client, + APPROX_QUANTILES(rule_count, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS rule_count, + APPROX_QUANTILES(selector_count, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS selector_count, + APPROX_QUANTILES(SAFE_DIVIDE(selector_count, rule_count), 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS selectors_per_rule +FROM ( + SELECT + client, + SUM(info.ruleCount) AS rule_count, + SUM(info.selectorCount) AS selector_count + FROM ( + SELECT + client, + page, + getSpecificityInfo(css) AS info + FROM + `httparchive.almanac.parsed_css` + WHERE + date = '2020-08-01' AND + # Limit the size of the CSS to avoid OOM crashes. This loses ~20% of stylesheets. + LENGTH(css) < 0.1 * 1024 * 1024) + GROUP BY + client, + page), + UNNEST([10, 25, 50, 75, 90]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client \ No newline at end of file diff --git a/sql/2020/01_CSS/specificity.sql b/sql/2020/01_CSS/specificity.sql index 06ed8d77dd8..9eeb411e0ba 100644 --- a/sql/2020/01_CSS/specificity.sql +++ b/sql/2020/01_CSS/specificity.sql @@ -3,10 +3,7 @@ CREATE TEMPORARY FUNCTION getSpecificityInfo(css STRING) RETURNS STRUCT< ruleCount NUMERIC, selectorCount NUMERIC, - selectorsPerRule NUMERIC, - avgSpecificity STRING, - maxSpecificity STRING, - medianSpecificity STRING + distribution ARRAY> > LANGUAGE js AS ''' try { function extractSpecificity(ast) { @@ -44,54 +41,83 @@ try { return ret; } - function getMedian(specificities) { - const total = Object.values(specificities).reduce((sum, value) => sum + value, 0); - if (total == 0) { + function toComparableString(specificity) { + if (!specificity) { return null; } - let cdf = 0; - const cdfEntries = Object.entries(specificities).map(([specificity, count]) => { - cdf += count; - return [specificity, cdf / total]; - }); - - for ([specificity, cdf] in cdfEntries) { - if (cdf >= 0.5) { - return specificity; - } + if (specificity.split(',').length !== 3) { + return null; } - return null; - } - - function specificityToString(specificity) { - return specificity && specificity.join(','); - } - - function sanitize(value) { - return isNaN(value) ? null : value; + // The highest unit of specificity is 9398, so we need 5 digits of padding. + return specificity.split(',').map(i => i.padStart(5, '0')).join('') + specificity; } const ast = JSON.parse(css); let specificity = extractSpecificity(ast); + let ruleCount = specificity.ruleCount; + let selectorCount = specificity.selectorCount; + let distribution = Object.entries(specificity.specificityCount).map(([specificity, freq]) => { + return { + specificity, + freq, + specificity_cmp: toComparableString(specificity) + } + }); return { - ruleCount: specificity.ruleCount, - selectorCount: specificity.selectorCount, - selectorsPerRule: sanitize(specificity.selectorsPerRule), - avgSpecificity: specificityToString(specificity.avgSpecificity), - maxSpecificity: specificityToString(specificity.maxSpecificity), - medianSpecificity: getMedian(specificity.specificityCount) + ruleCount, + selectorCount, + distribution }; } catch (e) { - return {avgSpecificity: e}; + return null; } ''' OPTIONS (library="gs://httparchive/lib/css-utils.js"); - +# https://www.stevenmoseley.com/blog/tech/high-performance-sql-correlated-scalar-aggregate-reduction-queries +CREATE TEMPORARY FUNCTION extractSpecificity(specificity_cmp STRING) RETURNS STRING AS ( + SUBSTR(specificity_cmp, 16) +); + +SELECT + percentile, + client, + extractSpecificity(APPROX_QUANTILES(max_specificity_cmp, 1000)[OFFSET(percentile * 10)]) AS max_specificity, + extractSpecificity(APPROX_QUANTILES(median_specificity_cmp, 1000)[OFFSET(percentile * 10)]) AS median_specificity +FROM ( SELECT client, - getSpecificityInfo(css) AS info - FROM - `httparchive.almanac.parsed_css_1k` \ No newline at end of file + MAX(specificity_cmp) AS max_specificity_cmp, + MIN(IF(freq_cdf >= 0.5, specificity_cmp, NULL)) AS median_specificity_cmp + FROM ( + SELECT + client, + page, + bin.specificity_cmp, + SUM(bin.freq) OVER (PARTITION BY client, page ORDER BY bin.specificity_cmp) / SUM(bin.freq) OVER (PARTITION BY client, page) AS freq_cdf + FROM ( + SELECT + client, + page, + getSpecificityInfo(css) AS info + FROM + `httparchive.almanac.parsed_css` + WHERE + date = '2020-08-01' AND + # Limit the size of the CSS to avoid OOM crashes. + LENGTH(css) < 0.1 * 1024 * 1024), + UNNEST(info.distribution) AS bin + WHERE + bin.specificity_cmp IS NOT NULL) + GROUP BY + client, + page), + UNNEST([10, 25, 50, 75, 90, 95, 99, 100]) AS percentile +GROUP BY + percentile, + client +ORDER BY + percentile, + client \ No newline at end of file diff --git a/sql/2020/01_CSS/top_selector_attributes.sql b/sql/2020/01_CSS/top_selector_attributes.sql new file mode 100644 index 00000000000..bb21d4cec58 --- /dev/null +++ b/sql/2020/01_CSS/top_selector_attributes.sql @@ -0,0 +1,88 @@ +#standardSQL +CREATE TEMPORARY FUNCTION getSelectorParts(css STRING) +RETURNS STRUCT< + class ARRAY, + id ARRAY, + attribute ARRAY, + pseudo_class ARRAY, + pseudo_element ARRAY +> LANGUAGE js AS ''' +try { + function compute(ast) { + let ret = { + class: {}, + id: {}, + attribute: {}, + "pseudo-class": {}, + "pseudo-element": {} + }; + + walkSelectors(ast, selector => { + let sast = parsel.parse(selector, {list: false}); + + parsel.walk(sast, node => { + if (node.type in ret) { + incrementByKey(ret[node.type], node.name); + } + }, {subtree: true}); + }); + + for (let type in ret) { + ret[type] = sortObject(ret[type]); + } + + return ret; + } + + function unzip(obj) { + return Object.entries(obj).filter(([name, value]) => { + return !isNaN(value); + }).map(([name, value]) => name); + } + + const ast = JSON.parse(css); + let parts = compute(ast); + return { + class: unzip(parts.class), + id: unzip(parts.id), + attribute: unzip(parts.attribute), + pseudo_class: unzip(parts['pseudo-class']), + pseudo_element: unzip(parts['pseudo-element']) + } +} catch (e) { + return null; +} +''' +OPTIONS (library="gs://httparchive/lib/css-utils.js"); + +SELECT + client, + pages, + attribute.value AS attribute, + attribute.count AS freq, + attribute.count / pages AS pct +FROM ( + SELECT + client, + COUNT(DISTINCT page) AS pages, + APPROX_TOP_COUNT(attribute, 100) AS attributes + FROM ( + SELECT DISTINCT + client, + page, + attribute + FROM + `httparchive.almanac.parsed_css` + LEFT JOIN + UNNEST(getSelectorParts(css).attribute) AS attribute + WHERE + date = '2020-08-01' AND + # Limit the size of the CSS to avoid OOM crashes. + LENGTH(css) < 0.1 * 1024 * 1024) + GROUP BY + client), + UNNEST(attributes) AS attribute +WHERE + attribute.value IS NOT NULL +ORDER BY + pct DESC \ No newline at end of file diff --git a/sql/2020/01_CSS/top_selector_classes.sql b/sql/2020/01_CSS/top_selector_classes.sql new file mode 100644 index 00000000000..a8ab57fdefb --- /dev/null +++ b/sql/2020/01_CSS/top_selector_classes.sql @@ -0,0 +1,88 @@ +#standardSQL +CREATE TEMPORARY FUNCTION getSelectorParts(css STRING) +RETURNS STRUCT< + class ARRAY, + id ARRAY, + attribute ARRAY, + pseudo_class ARRAY, + pseudo_element ARRAY +> LANGUAGE js AS ''' +try { + function compute(ast) { + let ret = { + class: {}, + id: {}, + attribute: {}, + "pseudo-class": {}, + "pseudo-element": {} + }; + + walkSelectors(ast, selector => { + let sast = parsel.parse(selector, {list: false}); + + parsel.walk(sast, node => { + if (node.type in ret) { + incrementByKey(ret[node.type], node.name); + } + }, {subtree: true}); + }); + + for (let type in ret) { + ret[type] = sortObject(ret[type]); + } + + return ret; + } + + function unzip(obj) { + return Object.entries(obj).filter(([name, value]) => { + return !isNaN(value); + }).map(([name, value]) => name); + } + + const ast = JSON.parse(css); + let parts = compute(ast); + return { + class: unzip(parts.class), + id: unzip(parts.id), + attribute: unzip(parts.attribute), + pseudo_class: unzip(parts['pseudo-class']), + pseudo_element: unzip(parts['pseudo-element']) + } +} catch (e) { + return null; +} +''' +OPTIONS (library="gs://httparchive/lib/css-utils.js"); + +SELECT + client, + pages, + class.value AS class, + class.count AS freq, + class.count / pages AS pct +FROM ( + SELECT + client, + COUNT(DISTINCT page) AS pages, + APPROX_TOP_COUNT(class, 100) AS classes + FROM ( + SELECT DISTINCT + client, + page, + class + FROM + `httparchive.almanac.parsed_css` + LEFT JOIN + UNNEST(getSelectorParts(css).class) AS class + WHERE + date = '2020-08-01' AND + # Limit the size of the CSS to avoid OOM crashes. + LENGTH(css) < 0.1 * 1024 * 1024) + GROUP BY + client), + UNNEST(classes) AS class +WHERE + class.value IS NOT NULL +ORDER BY + pct DESC \ No newline at end of file diff --git a/sql/2020/01_CSS/top_selector_ids.sql b/sql/2020/01_CSS/top_selector_ids.sql new file mode 100644 index 00000000000..c1f06e7154d --- /dev/null +++ b/sql/2020/01_CSS/top_selector_ids.sql @@ -0,0 +1,88 @@ +#standardSQL +CREATE TEMPORARY FUNCTION getSelectorParts(css STRING) +RETURNS STRUCT< + class ARRAY, + id ARRAY, + attribute ARRAY, + pseudo_class ARRAY, + pseudo_element ARRAY +> LANGUAGE js AS ''' +try { + function compute(ast) { + let ret = { + class: {}, + id: {}, + attribute: {}, + "pseudo-class": {}, + "pseudo-element": {} + }; + + walkSelectors(ast, selector => { + let sast = parsel.parse(selector, {list: false}); + + parsel.walk(sast, node => { + if (node.type in ret) { + incrementByKey(ret[node.type], node.name); + } + }, {subtree: true}); + }); + + for (let type in ret) { + ret[type] = sortObject(ret[type]); + } + + return ret; + } + + function unzip(obj) { + return Object.entries(obj).filter(([name, value]) => { + return !isNaN(value); + }).map(([name, value]) => name); + } + + const ast = JSON.parse(css); + let parts = compute(ast); + return { + class: unzip(parts.class), + id: unzip(parts.id), + attribute: unzip(parts.attribute), + pseudo_class: unzip(parts['pseudo-class']), + pseudo_element: unzip(parts['pseudo-element']) + } +} catch (e) { + return null; +} +''' +OPTIONS (library="gs://httparchive/lib/css-utils.js"); + +SELECT + client, + pages, + id.value AS id, + id.count AS freq, + id.count / pages AS pct +FROM ( + SELECT + client, + COUNT(DISTINCT page) AS pages, + APPROX_TOP_COUNT(id, 100) AS ids + FROM ( + SELECT DISTINCT + client, + page, + id + FROM + `httparchive.almanac.parsed_css` + LEFT JOIN + UNNEST(getSelectorParts(css).id) AS id + WHERE + date = '2020-08-01' AND + # Limit the size of the CSS to avoid OOM crashes. + LENGTH(css) < 0.1 * 1024 * 1024) + GROUP BY + client), + UNNEST(ids) AS id +WHERE + id.value IS NOT NULL +ORDER BY + pct DESC \ No newline at end of file diff --git a/sql/2020/01_CSS/top_selector_pseudo_classes.sql b/sql/2020/01_CSS/top_selector_pseudo_classes.sql new file mode 100644 index 00000000000..c29d6e24712 --- /dev/null +++ b/sql/2020/01_CSS/top_selector_pseudo_classes.sql @@ -0,0 +1,88 @@ +#standardSQL +CREATE TEMPORARY FUNCTION getSelectorParts(css STRING) +RETURNS STRUCT< + class ARRAY, + id ARRAY, + attribute ARRAY, + pseudo_class ARRAY, + pseudo_element ARRAY +> LANGUAGE js AS ''' +try { + function compute(ast) { + let ret = { + class: {}, + id: {}, + attribute: {}, + "pseudo-class": {}, + "pseudo-element": {} + }; + + walkSelectors(ast, selector => { + let sast = parsel.parse(selector, {list: false}); + + parsel.walk(sast, node => { + if (node.type in ret) { + incrementByKey(ret[node.type], node.name); + } + }, {subtree: true}); + }); + + for (let type in ret) { + ret[type] = sortObject(ret[type]); + } + + return ret; + } + + function unzip(obj) { + return Object.entries(obj).filter(([name, value]) => { + return !isNaN(value); + }).map(([name, value]) => name); + } + + const ast = JSON.parse(css); + let parts = compute(ast); + return { + class: unzip(parts.class), + id: unzip(parts.id), + attribute: unzip(parts.attribute), + pseudo_class: unzip(parts['pseudo-class']), + pseudo_element: unzip(parts['pseudo-element']) + } +} catch (e) { + return null; +} +''' +OPTIONS (library="gs://httparchive/lib/css-utils.js"); + +SELECT + client, + pages, + pseudo_class.value AS pseudo_class, + pseudo_class.count AS freq, + pseudo_class.count / pages AS pct +FROM ( + SELECT + client, + COUNT(DISTINCT page) AS pages, + APPROX_TOP_COUNT(pseudo_class, 100) AS pseudo_classes + FROM ( + SELECT DISTINCT + client, + page, + pseudo_class + FROM + `httparchive.almanac.parsed_css` + LEFT JOIN + UNNEST(getSelectorParts(css).pseudo_class) AS pseudo_class + WHERE + date = '2020-08-01' AND + # Limit the size of the CSS to avoid OOM crashes. + LENGTH(css) < 0.1 * 1024 * 1024) + GROUP BY + client), + UNNEST(pseudo_classes) AS pseudo_class +WHERE + pseudo_class.value IS NOT NULL +ORDER BY + pct DESC \ No newline at end of file diff --git a/sql/2020/01_CSS/top_selector_pseudo_elements.sql b/sql/2020/01_CSS/top_selector_pseudo_elements.sql new file mode 100644 index 00000000000..23893dafc52 --- /dev/null +++ b/sql/2020/01_CSS/top_selector_pseudo_elements.sql @@ -0,0 +1,88 @@ +#standardSQL +CREATE TEMPORARY FUNCTION getSelectorParts(css STRING) +RETURNS STRUCT< + class ARRAY, + id ARRAY, + attribute ARRAY, + pseudo_class ARRAY, + pseudo_element ARRAY +> LANGUAGE js AS ''' +try { + function compute(ast) { + let ret = { + class: {}, + id: {}, + attribute: {}, + "pseudo-class": {}, + "pseudo-element": {} + }; + + walkSelectors(ast, selector => { + let sast = parsel.parse(selector, {list: false}); + + parsel.walk(sast, node => { + if (node.type in ret) { + incrementByKey(ret[node.type], node.name); + } + }, {subtree: true}); + }); + + for (let type in ret) { + ret[type] = sortObject(ret[type]); + } + + return ret; + } + + function unzip(obj) { + return Object.entries(obj).filter(([name, value]) => { + return !isNaN(value); + }).map(([name, value]) => name); + } + + const ast = JSON.parse(css); + let parts = compute(ast); + return { + class: unzip(parts.class), + id: unzip(parts.id), + attribute: unzip(parts.attribute), + pseudo_class: unzip(parts['pseudo-class']), + pseudo_element: unzip(parts['pseudo-element']) + } +} catch (e) { + return null; +} +''' +OPTIONS (library="gs://httparchive/lib/css-utils.js"); + +SELECT + client, + pages, + pseudo_element.value AS pseudo_element, + pseudo_element.count AS freq, + pseudo_element.count / pages AS pct +FROM ( + SELECT + client, + COUNT(DISTINCT page) AS pages, + APPROX_TOP_COUNT(pseudo_element, 100) AS pseudo_elements + FROM ( + SELECT DISTINCT + client, + page, + pseudo_element + FROM + `httparchive.almanac.parsed_css` + LEFT JOIN + UNNEST(getSelectorParts(css).pseudo_element) AS pseudo_element + WHERE + date = '2020-08-01' AND + # Limit the size of the CSS to avoid OOM crashes. + LENGTH(css) < 0.1 * 1024 * 1024) + GROUP BY + client), + UNNEST(pseudo_elements) AS pseudo_element +WHERE + pseudo_element.value IS NOT NULL +ORDER BY + pct DESC \ No newline at end of file From 6320287f8624b50434d72702893da237a11cc54c Mon Sep 17 00:00:00 2001 From: Rick Viscomi Date: Wed, 30 Sep 2020 00:44:47 -0400 Subject: [PATCH 17/17] Update README.md --- sql/2020/01_CSS/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/2020/01_CSS/README.md b/sql/2020/01_CSS/README.md index 5dc8b34dc55..d98095c1a87 100644 --- a/sql/2020/01_CSS/README.md +++ b/sql/2020/01_CSS/README.md @@ -6,7 +6,7 @@ The 2020 data in the [`parsed_css`](https://console.cloud.google.com/bigquery?p= When prototyping queries, it's advisable to use the [`parsed_css_1k`](https://console.cloud.google.com/bigquery?p=httparchive&d=almanac&t=parsed_css_1k&page=table) table instead, which only contains 1000 rows for easier testing. Make sure to switch this back to the full table when saving the results for analysis. -## [Rework utils](../../lib/rework-utils.js) +## [CSS utils](../../lib/css-utils.js) - **Source**: https://github.com/LeaVerou/rework-utils/tree/master/src - **Playground**: https://projects.verou.me/rework-utils/ @@ -18,4 +18,4 @@ This file provides JS utility functions to be used by the queries that depend on - [Tracking issue](https://github.com/HTTPArchive/almanac.httparchive.org/issues/898) - [Draft doc](https://docs.google.com/document/d/1Cy9acip1ZQScoQEeds5-6l1FFFBJTJr4SheZiQxbj-Q/edit?usp=sharing) -- [Results sheet](https://docs.google.com/spreadsheets/d/1sMWXWjMujqfAREYxNbG_t1fOJKYCA6ASLwtz4pBQVTw/edit?usp=sharing) \ No newline at end of file +- [Results sheet](https://docs.google.com/spreadsheets/d/1sMWXWjMujqfAREYxNbG_t1fOJKYCA6ASLwtz4pBQVTw/edit?usp=sharing)