diff --git a/se/commands/find_unusual_characters.py b/se/commands/find_unusual_characters.py index 9ca06774..0eb5f422 100644 --- a/se/commands/find_unusual_characters.py +++ b/se/commands/find_unusual_characters.py @@ -53,66 +53,70 @@ def find_unusual_characters(plain_output: bool) -> int: se.print_error(str(ex) + f" File: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output) return_code = ex.code - # Create a list of unusual characters. - # We start with every character, and remove ones we know are valid in SE productions + # Create a regex for unusual characters. + # The result is a series of Unicode ranges that cover the characters + # we _don’t_ expect to see in a standard SE production. The comments + # indicate the Unicode ranges we exclude from the regex as they’re + # normal characters we don’t want to flag. + unusual_character_set = "[" - # Ignore basic ASCII u0000-u007e + # OK: basic ASCII u0000-u007e unusual_character_set += "\u007f-\u009f" - # Ignore NO BREAK SPACE u00a0 + # OK: NO BREAK SPACE u00a0 unusual_character_set += "\u00a1" - # Ignore CENT SIGN and POUND SIGN u00a2-u00a3 + # OK: CENT SIGN and POUND SIGN u00a2-u00a3 unusual_character_set += "\u00a4-\u00af" - # Ignore DEGREE SYMBOL u00b0 + # OK: DEGREE SYMBOL u00b0 unusual_character_set += "\u00b1-\u00b6" - # Ignore MIDDLE DOT u00b7 (used for Morse code) + # OK: MIDDLE DOT u00b7 (used for Morse code) unusual_character_set += "\u00b8-\u00bb" - # Ignore vulgar fractions u00bc-u00be + # OK: vulgar fractions u00bc-u00be unusual_character_set += "\u00bf" - # Ignore standard accented letters u00c0-u00ff + # OK: standard accented letters u00c0-u00ff unusual_character_set += "\u0100-\u0151" - # Ignore œ / Œ u0152-u0153 + # OK: œ / Œ u0152-u0153 unusual_character_set += "\u0154-\u02ba" - # Ignore MODIFIER LETTER TURNED COMMA u02bb (used for glottal stops) + # OK: MODIFIER LETTER TURNED COMMA u02bb (used for glottal stops) unusual_character_set += "\u02bc" - # Ignore MODIFIER LETTER REVERSED COMMA u02bd (used for Greek / Chinese) + # OK: MODIFIER LETTER REVERSED COMMA u02bd (used for Greek / Chinese) unusual_character_set += "\u02be-\u030c" - # Ignore COMBINING VERTICAL LINE ABOVE u030d + # OK: COMBINING VERTICAL LINE ABOVE u030d unusual_character_set += "\u030e-\u036f" - # Ignore basic Greek characters u0370-u03ff + # OK: basic Greek characters u0370-u03ff unusual_character_set += "\u0400-\u1eff" - # Ignore extended Greek characters u1f00-u1fff + # OK: extended Greek characters u1f00-u1fff unusual_character_set += "\u2000-\u2009" - # Ignore HAIR SPACE u200a + # OK: HAIR SPACE u200a unusual_character_set += "\u200b-\u2010" - # Ignore valid dashes u2011-u2014 + # OK: valid dashes u2011-u2014 unusual_character_set += "\u2015-\u2017" - # Ignore valid single quotes u2018-u2019 + # OK: valid single quotes u2018-u2019 unusual_character_set += "\u201a-\u201b" - # Ignore valid double quotes u201c-u201d + # OK: valid double quotes u201c-u201d unusual_character_set += "\u201e-\u2025" - # Ignore HORIZONTAL ELLIPSIS u2026 + # OK: HORIZONTAL ELLIPSIS u2026 unusual_character_set += "\u2027-\u2031" - # Ignore single/double prime marks u2032-u2033 + # OK: single/double prime marks u2032-u2033 unusual_character_set += "\u2034-\u203d" - # Ignore OVERLINE u203e (used in MathML) + # OK: OVERLINE u203e (used in MathML) unusual_character_set += "\u203f-\u2043" - # Ignore FRACTION SLASH u2044 + # OK: FRACTION SLASH u2044 unusual_character_set += "\u2045-\u205f" - # Ignore WORD JOINER u2060 + # OK: WORD JOINER u2060 unusual_character_set += "\u2061-\u21a8" - # Ignore LEFTWARDS ARROW WITH HOOK u21a9 (used in endquotes) + # OK: LEFTWARDS ARROW WITH HOOK u21a9 (used in endquotes) unusual_character_set += "\u21aa-\u2211" - # Ignore MINUS SIGN u2212 + # OK: MINUS SIGN u2212 unusual_character_set += "\u2213-\u2235" - # Ignore RATIO u2236 + # OK: RATIO u2236 unusual_character_set += "\u2237-\u2260" - # Ignore IDENTICAL TO u2261 + # OK: IDENTICAL TO u2261 unusual_character_set += "\u2262-\u22ed" - # Ignore VERTICAL ELLIPSIS u22ee + # OK: VERTICAL ELLIPSIS u22ee unusual_character_set += "\u22ef-\u2e39" - # Ignore two-/three-em dashes u2e3a-u2e3b + # OK: two-/three-em dashes u2e3a-u2e3b unusual_character_set += "\u2e3c-\ufefe" - # Ignore no-break hyphen + # OK: no-break hyphen unusual_character_set += "\u2011" unusual_character_set += "]"