Improve codec and site handlers

Remove non-closing brackets. Strip all parsed strings.
dreulavelle · Jul 19, 2024 · 30e4ae5
1 parent 97e9639
commit 30e4ae5
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 7 deletions.
diff --git a/PTT/handlers.py b/PTT/handlers.py
@@ -130,7 +130,7 @@ def handle_bit_depth(context):
 
     # Codec
     parser.add_handler("codec", regex.compile(r"\b[xh][\. \-]?264\b", regex.IGNORECASE), lowercase, {"remove": True})
-    parser.add_handler("codec", regex.compile(r"\bHEVC10(bit)?|[xh][\. \-]?265\b", regex.IGNORECASE), value("x265"), {"remove": True})
+    parser.add_handler("codec", regex.compile(r"\bHEVC10(bit)?\b|\b[xh][\. \-]?265\b", regex.IGNORECASE), value("x265"), {"remove": True})
     parser.add_handler("codec", regex.compile(r"\bhevc(?:\s?10)?\b", regex.IGNORECASE), value("x265"), {"remove": True, "skipIfAlreadyFound": False})
     parser.add_handler("codec", regex.compile(r"\b(?:dvix|mpeg2|divx|xvid|avc|av1)\b", regex.IGNORECASE), lowercase, {"remove": True, "skipIfAlreadyFound": False})
     def handle_space_in_codec(context):
@@ -264,8 +264,8 @@ def handle_episodes(context):
     parser.add_handler("episodes", handle_episodes)
 
     # Site before languages to get rid of domain name with country code.
-    parser.add_handler("site", regex.compile(r"^(www\.[\w-]+\.[\w-]+(?:\.[\w-]+)?)\s+-\s*", regex.IGNORECASE), options={"skipFromTitle": True, "remove": True, "skipIfAlreadyFound": False})
-    parser.add_handler("site", regex.compile(r"^((?:www\.)?[\w-]+\.[\w-]+(?:\.[\w-]+)*?)\s+-\s*", regex.IGNORECASE), options={"skipIfAlreadyFound": False})
+    parser.add_handler("site", regex.compile(r"^(www?[\.,][\w-]+\.[\w-]+(?:\.[\w-]+)?)\s+-\s*", regex.IGNORECASE), options={"skipFromTitle": True, "remove": True, "skipIfAlreadyFound": False})
+    parser.add_handler("site", regex.compile(r"^((?:www?[\.,])?[\w-]+\.[\w-]+(?:\.[\w-]+)*?)\s+-\s*", regex.IGNORECASE), options={"skipIfAlreadyFound": False})
 
     # Languages
     parser.add_handler("languages", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:su?$|sub\w*|dub\w*)\b|msub", regex.IGNORECASE), uniq_concat(value("multi subs")), {"skipIfAlreadyFound": False, "remove": True})
@@ -385,7 +385,7 @@ def infer_language_based_on_naming(context):
                     regex.search(r"dublado", title, regex.IGNORECASE):
                 result["languages"] = result.get("languages", []) + ["portuguese"]
 
-        return {"match_index": 0}
+        return None
     parser.add_handler("languages", infer_language_based_on_naming)
 
     # Subbed
@@ -399,7 +399,7 @@ def handle_dubbed(context):
         result = context["result"]
         if "languages" in result and any(lang in ["multi audio", "dual audio"] for lang in result["languages"]):
             result["dubbed"] = True
-        return {"match_index": 0}
+        return None
     parser.add_handler("dubbed", handle_dubbed)
 
     # Group
@@ -415,7 +415,7 @@ def handle_group(context):
             # Check if there's any overlap with other matched elements
             if any(key != "group" and matched[key]["match_index"] < end_index for key in matched if "match_index" in matched[key]) and "group" in result:
                 del result["group"]
-        return {"match_index": 0}
+        return None
 
     parser.add_handler("group", handle_group)
 

diff --git a/PTT/parse.py b/PTT/parse.py
@@ -20,13 +20,18 @@
     "\u0e00-\u0e7f"  # Thai characters
 )
 
+CURLY_BRACKETS = ["{", "}"]
+SQUARE_BRACKETS = ["[", "]"]
+PARENTHESES = ["(", ")"]
+BRACKETS = [CURLY_BRACKETS, SQUARE_BRACKETS, PARENTHESES]
+
 RUSSIAN_CAST_REGEX = regex.compile(r"\([^)]*[\u0400-\u04ff][^)]*\)$|(?<=\/.*)\(.*\)$")
 ALT_TITLES_REGEX = regex.compile(rf"[^/|(]*[{NON_ENGLISH_CHARS}][^/|]*[/|]|[/|][^/|(]*[{NON_ENGLISH_CHARS}][^/|]*")
 NOT_ONLY_NON_ENGLISH_REGEX = regex.compile(rf"(?<=[a-zA-Z][^{NON_ENGLISH_CHARS}]+)[{NON_ENGLISH_CHARS}].*[{NON_ENGLISH_CHARS}]|[{NON_ENGLISH_CHARS}].*[{NON_ENGLISH_CHARS}](?=[^{NON_ENGLISH_CHARS}]+[a-zA-Z])")
 NOT_ALLOWED_SYMBOLS_AT_START_AND_END = regex.compile(rf"^[^\w{NON_ENGLISH_CHARS}#[【★]+|[ \-:/\\[|{{(#$&^]+$")
 REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END = regex.compile(rf"^[^\w{NON_ENGLISH_CHARS}#]+|]$")
 
-DEBUG_HANDLER = "seasons"
+DEBUG_HANDLER = False
 
 
 def extend_options(options: Dict[str, Any] = None) -> Dict[str, Any]:
@@ -75,6 +80,8 @@ def handler(context: Dict[str, Any]) -> Union[Dict[str, Any], None]:
             sig = inspect.signature(transformer)
             param_count = len(sig.parameters)
             transformed = transformer(clean_match or raw_match, *([result.get(name)] if param_count > 1 else []))
+            if type(transformed) is str:
+                transformed = transformed.strip()
 
             before_title_match = regex.match(r"^\[([^[\]]+)]", title)
             is_before_title = before_title_match is not None and raw_match in before_title_match.group(1)
@@ -115,6 +122,13 @@ def clean_title(raw_title: str) -> str:
     cleaned_title = NOT_ONLY_NON_ENGLISH_REGEX.sub("", cleaned_title)
     cleaned_title = REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END.sub("", cleaned_title)
 
+    # Remove brackets if only one is present
+    for brackets in BRACKETS:
+        only_one_bracket = not all(bracket in cleaned_title for bracket in brackets)
+        if only_one_bracket:
+            for bracket in brackets:
+                cleaned_title = cleaned_title.replace(bracket, "")
+
     if " " not in cleaned_title and "." in cleaned_title:
         cleaned_title = regex.sub(r"\.", " ", cleaned_title)
 
@@ -184,6 +198,9 @@ def parse(self, title: str) -> Dict[str, Any]:
         for handler in self.handlers:
             match_result = handler({"title": title, "result": result, "matched": matched})
 
+            if DEBUG_HANDLER is True or (type(DEBUG_HANDLER) is str and DEBUG_HANDLER in handler.handler_name):
+                print(handler.handler_name, match_result, title)
+
             if match_result is None:
                 continue