Add better documentation for how the unusual characters regex is built

standardebooks · Nov 28, 2023 · 401b199 · 401b199
1 parent 4b7035e
commit 401b199
Showing 1 changed file with 35 additions and 31 deletions.
diff --git a/se/commands/find_unusual_characters.py b/se/commands/find_unusual_characters.py
@@ -53,66 +53,70 @@ def find_unusual_characters(plain_output: bool) -> int:
 			se.print_error(str(ex) + f" File: [path][link=file://{filename}]{filename}[/][/].", plain_output=plain_output)
 			return_code = ex.code
 
-	# Create a list of unusual characters.
-	# We start with every character, and remove ones we know are valid in SE productions
+	# Create a regex for unusual characters.
+	# The result is a series of Unicode ranges that cover the characters
+	# we _don’t_ expect to see in a standard SE production. The comments
+	# indicate the Unicode ranges we exclude from the regex as they’re
+	# normal characters we don’t want to flag.
+
 	unusual_character_set = "["
-	# Ignore basic ASCII u0000-u007e
+	# OK: basic ASCII u0000-u007e
 	unusual_character_set += "\u007f-\u009f"
-	# Ignore NO BREAK SPACE u00a0
+	# OK: NO BREAK SPACE u00a0
 	unusual_character_set += "\u00a1"
-	# Ignore CENT SIGN and POUND SIGN u00a2-u00a3
+	# OK: CENT SIGN and POUND SIGN u00a2-u00a3
 	unusual_character_set += "\u00a4-\u00af"
-	# Ignore DEGREE SYMBOL u00b0
+	# OK: DEGREE SYMBOL u00b0
 	unusual_character_set += "\u00b1-\u00b6"
-	# Ignore MIDDLE DOT u00b7 (used for Morse code)
+	# OK: MIDDLE DOT u00b7 (used for Morse code)
 	unusual_character_set += "\u00b8-\u00bb"
-	# Ignore vulgar fractions u00bc-u00be
+	# OK: vulgar fractions u00bc-u00be
 	unusual_character_set += "\u00bf"
-	# Ignore standard accented letters u00c0-u00ff
+	# OK: standard accented letters u00c0-u00ff
 	unusual_character_set += "\u0100-\u0151"
-	# Ignore œ / Œ u0152-u0153
+	# OK: œ / Œ u0152-u0153
 	unusual_character_set += "\u0154-\u02ba"
-	# Ignore MODIFIER LETTER TURNED COMMA u02bb (used for glottal stops)
+	# OK: MODIFIER LETTER TURNED COMMA u02bb (used for glottal stops)
 	unusual_character_set += "\u02bc"
-	# Ignore MODIFIER LETTER REVERSED COMMA u02bd (used for Greek / Chinese)
+	# OK: MODIFIER LETTER REVERSED COMMA u02bd (used for Greek / Chinese)
 	unusual_character_set += "\u02be-\u030c"
-	# Ignore COMBINING VERTICAL LINE ABOVE u030d
+	# OK: COMBINING VERTICAL LINE ABOVE u030d
 	unusual_character_set += "\u030e-\u036f"
-	# Ignore basic Greek characters u0370-u03ff
+	# OK: basic Greek characters u0370-u03ff
 	unusual_character_set += "\u0400-\u1eff"
-	# Ignore extended Greek characters u1f00-u1fff
+	# OK: extended Greek characters u1f00-u1fff
 	unusual_character_set += "\u2000-\u2009"
-	# Ignore HAIR SPACE u200a
+	# OK: HAIR SPACE u200a
 	unusual_character_set += "\u200b-\u2010"
-	# Ignore valid dashes u2011-u2014
+	# OK: valid dashes u2011-u2014
 	unusual_character_set += "\u2015-\u2017"
-	# Ignore valid single quotes u2018-u2019
+	# OK: valid single quotes u2018-u2019
 	unusual_character_set += "\u201a-\u201b"
-	# Ignore valid double quotes u201c-u201d
+	# OK: valid double quotes u201c-u201d
 	unusual_character_set += "\u201e-\u2025"
-	# Ignore HORIZONTAL ELLIPSIS u2026
+	# OK: HORIZONTAL ELLIPSIS u2026
 	unusual_character_set += "\u2027-\u2031"
-	# Ignore single/double prime marks u2032-u2033
+	# OK: single/double prime marks u2032-u2033
 	unusual_character_set += "\u2034-\u203d"
-	# Ignore OVERLINE u203e (used in MathML)
+	# OK: OVERLINE u203e (used in MathML)
 	unusual_character_set += "\u203f-\u2043"
-	# Ignore FRACTION SLASH u2044
+	# OK: FRACTION SLASH u2044
 	unusual_character_set += "\u2045-\u205f"
-	# Ignore WORD JOINER u2060
+	# OK: WORD JOINER u2060
 	unusual_character_set += "\u2061-\u21a8"
-	# Ignore LEFTWARDS ARROW WITH HOOK u21a9 (used in endquotes)
+	# OK: LEFTWARDS ARROW WITH HOOK u21a9 (used in endnotes)
 	unusual_character_set += "\u21aa-\u2211"
-	# Ignore MINUS SIGN u2212
+	# OK: MINUS SIGN u2212
 	unusual_character_set += "\u2213-\u2235"
-	# Ignore RATIO u2236
+	# OK: RATIO u2236
 	unusual_character_set += "\u2237-\u2260"
-	# Ignore IDENTICAL TO u2261
+	# OK: IDENTICAL TO u2261
 	unusual_character_set += "\u2262-\u22ed"
-	# Ignore VERTICAL ELLIPSIS u22ee
+	# OK: VERTICAL ELLIPSIS u22ee
 	unusual_character_set += "\u22ef-\u2e39"
-	# Ignore two-/three-em dashes u2e3a-u2e3b
+	# OK: two-/three-em dashes u2e3a-u2e3b
 	unusual_character_set += "\u2e3c-\ufefe"
-	# Ignore no-break hyphen
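+	# NOTE(review): the next line adds NO-BREAK HYPHEN u2011 to the flagged set, although the ranges above skip u2011-u2014 as OK; confirm intent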
 	unusual_character_set += "\u2011"
 	unusual_character_set += "]"