Skip to content

Commit

Permalink
Add better documentation for how the unusual characters regex is built
Browse files Browse the repository at this point in the history
  • Loading branch information
robinwhittleton authored and acabal committed Nov 28, 2023
1 parent 4b7035e commit 401b199
Showing 1 changed file with 35 additions and 31 deletions.
66 changes: 35 additions & 31 deletions se/commands/find_unusual_characters.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,66 +53,70 @@ def find_unusual_characters(plain_output: bool) -> int:
se.print_error(str(ex) + f" File: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output)
return_code = ex.code

# Create a list of unusual characters.
# We start with every character, and remove ones we know are valid in SE productions
# Create a regex for unusual characters.
# The result is a series of Unicode ranges that cover the characters
# we _don’t_ expect to see in a standard SE production. The comments
# indicate the Unicode ranges we exclude from the regex as they’re
# normal characters we don’t want to flag.

unusual_character_set = "["
# Ignore basic ASCII u0000-u007e
# OK: basic ASCII u0000-u007e
unusual_character_set += "\u007f-\u009f"
# Ignore NO-BREAK SPACE u00a0
# OK: NO-BREAK SPACE u00a0
unusual_character_set += "\u00a1"
# Ignore CENT SIGN and POUND SIGN u00a2-u00a3
# OK: CENT SIGN and POUND SIGN u00a2-u00a3
unusual_character_set += "\u00a4-\u00af"
# Ignore DEGREE SIGN u00b0
# OK: DEGREE SIGN u00b0
unusual_character_set += "\u00b1-\u00b6"
# Ignore MIDDLE DOT u00b7 (used for Morse code)
# OK: MIDDLE DOT u00b7 (used for Morse code)
unusual_character_set += "\u00b8-\u00bb"
# Ignore vulgar fractions u00bc-u00be
# OK: vulgar fractions u00bc-u00be
unusual_character_set += "\u00bf"
# Ignore standard accented letters u00c0-u00ff
# OK: standard accented letters u00c0-u00ff
unusual_character_set += "\u0100-\u0151"
# Ignore œ / Œ u0152-u0153
# OK: œ / Œ u0152-u0153
unusual_character_set += "\u0154-\u02ba"
# Ignore MODIFIER LETTER TURNED COMMA u02bb (used for glottal stops)
# OK: MODIFIER LETTER TURNED COMMA u02bb (used for glottal stops)
unusual_character_set += "\u02bc"
# Ignore MODIFIER LETTER REVERSED COMMA u02bd (used for Greek / Chinese)
# OK: MODIFIER LETTER REVERSED COMMA u02bd (used for Greek / Chinese)
unusual_character_set += "\u02be-\u030c"
# Ignore COMBINING VERTICAL LINE ABOVE u030d
# OK: COMBINING VERTICAL LINE ABOVE u030d
unusual_character_set += "\u030e-\u036f"
# Ignore basic Greek characters u0370-u03ff
# OK: basic Greek characters u0370-u03ff
unusual_character_set += "\u0400-\u1eff"
# Ignore extended Greek characters u1f00-u1fff
# OK: extended Greek characters u1f00-u1fff
unusual_character_set += "\u2000-\u2009"
# Ignore HAIR SPACE u200a
# OK: HAIR SPACE u200a
unusual_character_set += "\u200b-\u2010"
# Ignore valid dashes u2011-u2014
# OK: valid dashes u2011-u2014
unusual_character_set += "\u2015-\u2017"
# Ignore valid single quotes u2018-u2019
# OK: valid single quotes u2018-u2019
unusual_character_set += "\u201a-\u201b"
# Ignore valid double quotes u201c-u201d
# OK: valid double quotes u201c-u201d
unusual_character_set += "\u201e-\u2025"
# Ignore HORIZONTAL ELLIPSIS u2026
# OK: HORIZONTAL ELLIPSIS u2026
unusual_character_set += "\u2027-\u2031"
# Ignore single/double prime marks u2032-u2033
# OK: single/double prime marks u2032-u2033
unusual_character_set += "\u2034-\u203d"
# Ignore OVERLINE u203e (used in MathML)
# OK: OVERLINE u203e (used in MathML)
unusual_character_set += "\u203f-\u2043"
# Ignore FRACTION SLASH u2044
# OK: FRACTION SLASH u2044
unusual_character_set += "\u2045-\u205f"
# Ignore WORD JOINER u2060
# OK: WORD JOINER u2060
unusual_character_set += "\u2061-\u21a8"
# Ignore LEFTWARDS ARROW WITH HOOK u21a9 (used in endnotes)
# OK: LEFTWARDS ARROW WITH HOOK u21a9 (used in endnotes)
unusual_character_set += "\u21aa-\u2211"
# Ignore MINUS SIGN u2212
# OK: MINUS SIGN u2212
unusual_character_set += "\u2213-\u2235"
# Ignore RATIO u2236
# OK: RATIO u2236
unusual_character_set += "\u2237-\u2260"
# Ignore IDENTICAL TO u2261
# OK: IDENTICAL TO u2261
unusual_character_set += "\u2262-\u22ed"
# Ignore VERTICAL ELLIPSIS u22ee
# OK: VERTICAL ELLIPSIS u22ee
unusual_character_set += "\u22ef-\u2e39"
# Ignore two-/three-em dashes u2e3a-u2e3b
# OK: two-/three-em dashes u2e3a-u2e3b
unusual_character_set += "\u2e3c-\ufefe"
# Ignore no-break hyphen
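# Flagged: NON-BREAKING HYPHEN u2011 is added to the set here even though it lies in the u2011-u2014 dash range skipped earlier (NOTE(review): confirm this is intended)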
unusual_character_set += "\u2011"
unusual_character_set += "]"

Expand Down

0 comments on commit 401b199

Please sign in to comment.