From 863c4b724d794b7fbde1e5e34247bbd9f214f1d0 Mon Sep 17 00:00:00 2001 From: Pavel Kvach Date: Tue, 16 Apr 2024 07:59:55 +0300 Subject: [PATCH] utils/html: Provide full control of allowed HTML elements - Added new configuration option "strictly-allowed-html-elements" to specify only allowed HTML tags in the generated output. - Allowed "mark" and "u" elements for "highlight" and "underline" Markup extensions. - Updated "allowed-elements" in configuration files to include "tr". Fixes https://github.com/isso-comments/isso/issues/751 --- CHANGES.rst | 2 ++ contrib/isso-dev.cfg | 1 + docs/docs/reference/server-config.rst | 15 ++++++++--- isso/isso.cfg | 6 ++++- isso/tests/test_html.py | 26 ++++++++++++++---- isso/utils/html.py | 39 +++++++++++++++++++-------- 6 files changed, 69 insertions(+), 20 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e8e4ee2c..925b5365 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -50,6 +50,7 @@ Bugfixes & Improvements - Make 'text' field in 'comments' table NOT NULL and handling data migration (`#1019`_, pkvach) - Python 3.12 support (`#1015`_, ix5) - Disable Postbox submit button on click, enable after response (`#993`_, pkvach) +- Provide full control of allowed HTML elements via the configuration file (`#1007`_, pkvach) .. _#951: https://github.com/posativ/isso/pull/951 .. _#967: https://github.com/posativ/isso/pull/967 @@ -64,6 +65,7 @@ Bugfixes & Improvements .. _#1019: https://github.com/isso-comments/isso/pull/1019 .. _#1015: https://github.com/isso-comments/isso/pull/1015 .. _#993: https://github.com/isso-comments/isso/pull/993 +.. 
_#1007: https://github.com/isso-comments/isso/pull/1007 0.13.1.dev0 (2023-02-05) ------------------------ diff --git a/contrib/isso-dev.cfg b/contrib/isso-dev.cfg index 760494b4..ce1cd9a1 100644 --- a/contrib/isso-dev.cfg +++ b/contrib/isso-dev.cfg @@ -39,6 +39,7 @@ reply-to-self = true options = autolink, fenced-code, no-intra-emphasis, strikethrough, superscript flags = allowed-elements = +strictly-allowed-html-elements = allowed-attributes = [hash] diff --git a/docs/docs/reference/server-config.rst b/docs/docs/reference/server-config.rst index ea3c91d8..04c8385a 100644 --- a/docs/docs/reference/server-config.rst +++ b/docs/docs/reference/server-config.rst @@ -432,7 +432,7 @@ allowed-elements By default, only ``a``, ``blockquote``, ``br``, ``code``, ``del``, ``em``, ``h1``, ``h2``, ``h3``, ``h4``, ``h5``, ``h6``, ``hr``, ``ins``, ``li``, - ``ol``, ``p``, ``pre``, ``strong``, ``table``, ``tbody``, ``td``, ``th``, + ``ol``, ``p``, ``pre``, ``strong``, ``table``, ``tbody``, ``tr``, ``td``, ``th``, ``thead`` and ``ul`` are allowed. For a more detailed explanation, see :doc:`/docs/reference/markdown-config`. @@ -444,11 +444,20 @@ allowed-elements mean that ``br, code, del, ...`` and all other default allowed tags are still allowed. You can only add *additional* elements here. - It is planned to change this behavior, see - `this issue `_. + To specify a list of *only* allowed elements, use the + ``strictly-allowed-html-elements`` option instead. Default: (empty) +strictly-allowed-html-elements + + **Only** allow the specified HTML tags in the generated output, comma-separated. + If this option is set, the ``allowed-elements`` option is ignored. + + Default: (empty) + + .. versionadded:: 0.13.1 + allowed-attributes **Additional** HTML attributes (independent from elements) to allow in the generated output, comma-separated. 
diff --git a/isso/isso.cfg b/isso/isso.cfg index fec23829..d9d09d45 100644 --- a/isso/isso.cfg +++ b/isso/isso.cfg @@ -213,9 +213,13 @@ flags = # Additional HTML tags to allow in the generated output, comma-separated. By # default, only a, blockquote, br, code, del, em, h1, h2, h3, h4, h5, h6, hr, -# ins, li, ol, p, pre, strong, table, tbody, td, th, thead and ul are allowed. +# ins, li, ol, p, pre, strong, table, tbody, tr, td, th, thead and ul are allowed. allowed-elements = +# Only allow the specified HTML tags in the generated output, comma-separated. +# If this option is set, the "allowed-elements" option is ignored. +strictly-allowed-html-elements = + # Additional HTML attributes (independent from elements) to allow in the # generated output, comma-separated. By default, only align and href are # allowed. diff --git a/isso/tests/test_html.py b/isso/tests/test_html.py index 5d5f87a3..e12d6478 100644 --- a/isso/tests/test_html.py +++ b/isso/tests/test_html.py @@ -60,7 +60,7 @@ def test_github_flavoured_markdown(self): """) def test_sanitizer(self): - sanitizer = html.Sanitizer(elements=[], attributes=[]) + sanitizer = html.Sanitizer(elements=["p", "a", "code"], attributes=["href"]) examples = [ ('Look: ', 'Look: '), ('Ha', @@ -94,8 +94,9 @@ def test_render(self): "markup": { "options": "autolink", "flags": "", - "allowed-elements": "", - "allowed-attributes": "" + "allowed-elements": "a, p", + "allowed-attributes": "href", + "strictly-allowed-html-elements": "" } }) renderer = html.Markup(conf.section("markup")).render @@ -103,14 +104,29 @@ def test_render(self): ['

http://example.org/ and sms:+1234567890

', '

http://example.org/ and sms:+1234567890

']) + def test_render_with_strictly_allowed_elements(self): + conf = config.new({ + "markup": { + "options": "autolink", + "flags": "", + "allowed-elements": "a, p", + "strictly-allowed-html-elements": "p", + "allowed-attributes": "href" + } + }) + renderer = html.Markup(conf.section("markup")).render + self.assertEqual(renderer("http://example.org/ and sms:+1234567890"), + '

http://example.org/ and sms:+1234567890

') + def test_sanitized_render_extensions(self): """Options should be normalized from both dashed-case or snake_case (legacy)""" conf = config.new({ "markup": { "options": "no_intra_emphasis", # Deliberately snake_case "flags": "", - "allowed-elements": "", - "allowed-attributes": "" + "allowed-elements": "p", + "allowed-attributes": "", + "strictly-allowed-html-elements": "" } }) renderer = html.Markup(conf.section("markup")).render diff --git a/isso/utils/html.py b/isso/utils/html.py index c1aafad1..b11e1ea7 100644 --- a/isso/utils/html.py +++ b/isso/utils/html.py @@ -17,16 +17,7 @@ def allow_attribute_class(tag, name, value): return name == "class" and bool(Sanitizer.code_language_pattern.match(value)) def __init__(self, elements, attributes): - # attributes found in Sundown's HTML serializer [1] - # - except for <img> tag, because images are not generated anyways. - # - sub and sup added - # - # [1] https://github.com/vmg/sundown/blob/master/html/html.c - self.elements = ["a", "p", "hr", "br", "ol", "ul", "li", - "pre", "code", "blockquote", - "del", "ins", "strong", "em", - "h1", "h2", "h3", "h4", "h5", "h6", "sub", "sup", - "table", "thead", "tbody", "th", "td"] + elements + self.elements = elements # allowed attributes for tags self.attributes = { @@ -108,12 +99,38 @@ def __init__(self, conf): parser = Markdown(extensions=self.extensions, flags=self.flags) # Filter out empty strings: - allowed_elements = [x for x in conf.getlist("allowed-elements") if x] + strictly_allowed_html_elements = [x for x in conf.getlist("strictly-allowed-html-elements") if x] allowed_attributes = [x for x in conf.getlist("allowed-attributes") if x] + # if "strictly-allowed-html-elements" option is set, use it instead of "allowed-elements" + if strictly_allowed_html_elements: + allowed_elements = strictly_allowed_html_elements + else: + allowed_elements = [x for x in conf.getlist("allowed-elements") if x] + + # attributes found in Sundown's HTML serializer [1] + # - except for <img> tag, 
because images are not generated anyways. + # - sub and sup added + # + # [1] https://github.com/vmg/sundown/blob/master/html/html.c + allowed_elements = ["a", "p", "hr", "br", "ol", "ul", "li", + "pre", "code", "blockquote", + "del", "ins", "strong", "em", + "h1", "h2", "h3", "h4", "h5", "h6", "sub", "sup", + "table", "thead", "tbody", "tr", "th", "td"] + allowed_elements + # If images are allowed, source element should be allowed as well if 'img' in allowed_elements and 'src' not in allowed_attributes: allowed_attributes.append('src') + + # If 'highlight' extension is enabled, allow 'mark' element + if 'highlight' in self.extensions and 'mark' not in allowed_elements: + allowed_elements.append('mark') + + # If 'underline' extension is enabled, allow 'u' element + if 'underline' in self.extensions and 'u' not in allowed_elements: + allowed_elements.append('u') + sanitizer = Sanitizer(allowed_elements, allowed_attributes) self._render = lambda text: sanitizer.sanitize(parser(text))