Improve title parse (#10)

* Add support for site & add docs & refactors & cleanup
* Format with black
* Refactor condition
* Update import all
* Fix docs typo
* Add support for new language patterns & fix language being detected from the domain name & fmt
* Replace source with quality, similar to PTN
* Add HQ audio pattern
* Add support for size
* Refactor parser
* Fix parse and parse_title conflict
* Add fan dub & clean up redundant chars in title & remove empty brackets in titles
* Move the site handlers first so they do not overlap with other handlers
* Enhance title cleanup and ensure at least one character is available for the torrent title
* fmt
dreulavelle · Jul 20, 2024 · 4ec4a0a · 4ec4a0a
1 parent 55882fb
commit 4ec4a0a
Show file tree

Hide file tree

Showing 7 changed files with 92 additions and 86 deletions.
diff --git a/PTT/__init__.py b/PTT/__init__.py
@@ -5,7 +5,7 @@
 add_defaults(_parser)
 
 
-def parse(raw_title: str) -> dict:
+def parse_title(raw_title: str) -> dict:
     """
     Parse the given input string using the initialized parser instance.
     :param raw_title: The input raw torrent title to parse.

diff --git a/PTT/handlers.py b/PTT/handlers.py
@@ -1,18 +1,7 @@
 import regex
 
 from PTT.parse import Parser
-from PTT.transformers import (
-    array,
-    boolean,
-    date,
-    integer,
-    lowercase,
-    none,
-    range_func,
-    uniq_concat,
-    uppercase,
-    value
-)
+from PTT.transformers import array, boolean, date, integer, lowercase, none, range_func, uniq_concat, uppercase, value
 
 
 def add_defaults(parser: Parser):
@@ -26,6 +15,10 @@ def add_defaults(parser: Parser):
     # Torrent extension
     parser.add_handler("torrent", regex.compile(r"\.torrent$"), boolean, {"remove": True})
 
+    # Site before languages to get rid of domain name with country code.
+    parser.add_handler("site", regex.compile(r"^(www?[\.,][\w-]+\.[\w-]+(?:\.[\w-]+)?)\s+-\s*", regex.IGNORECASE), options={"skipFromTitle": True, "remove": True, "skipIfAlreadyFound": False})
+    parser.add_handler("site", regex.compile(r"^((?:www?[\.,])?[\w-]+\.[\w-]+(?:\.[\w-]+)*?)\s+-\s*", regex.IGNORECASE), options={"skipIfAlreadyFound": False})
+
     # Episode code
     parser.add_handler("episode_code", regex.compile(r"[[(]([a-zA-Z0-9]{8})[\])](?=\.[a-zA-Z0-9]{1,5}$|$)"), uppercase, {"remove": True})
     parser.add_handler("episode_code", regex.compile(r"\[([A-Z0-9]{8})]"), uppercase, {"remove": True})
@@ -316,10 +309,6 @@ def handle_episodes(context):
 
     parser.add_handler("episodes", handle_episodes)
 
-    # Site before languages to get rid of domain name with country code.
-    parser.add_handler("site", regex.compile(r"^(www?[\.,][\w-]+\.[\w-]+(?:\.[\w-]+)?)\s+-\s*", regex.IGNORECASE), options={"skipFromTitle": True, "remove": True, "skipIfAlreadyFound": False})
-    parser.add_handler("site", regex.compile(r"^((?:www?[\.,])?[\w-]+\.[\w-]+(?:\.[\w-]+)*?)\s+-\s*", regex.IGNORECASE), options={"skipIfAlreadyFound": False})
-
     # Languages
     parser.add_handler("languages", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:su?$|sub\w*|dub\w*)\b|msub", regex.IGNORECASE), uniq_concat(value("multi subs")), {"skipIfAlreadyFound": False, "remove": True})
     parser.add_handler("languages", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:lang(?:uages?)?|audio|VF2)?\b", regex.IGNORECASE), uniq_concat(value("multi audio")), {"skipIfAlreadyFound": False})
@@ -445,6 +434,7 @@ def infer_language_based_on_naming(context):
     parser.add_handler("subbed", regex.compile(r"\bsub(s|bed)?\b", regex.IGNORECASE), boolean)
 
     # Dubbed
+    parser.add_handler("dubbed", regex.compile(r"\b(fan\s?dub)\b", regex.IGNORECASE), boolean, {"remove": True, "skipFromTitle": True})
     parser.add_handler("dubbed", regex.compile(r"\b(Fan.*)?(?:DUBBED|dublado|dubbing|DUBS?)\b", regex.IGNORECASE), boolean, {"remove": True})
     parser.add_handler("dubbed", regex.compile(r"\b(?!.*\bsub(s|bed)?\b)([ _\-\[(\.])?(dual|multi)([ _\-\[(\.])?(audio)?\b", regex.IGNORECASE), boolean, {"remove": True})
     parser.add_handler("dubbed", regex.compile(r"\b(JAP?(anese)?|ZH)\+ENG?(lish)?|ENG?(lish)?\+(JAP?(anese)?|ZH)\b", regex.IGNORECASE), boolean, {"remove": True})

diff --git a/PTT/parse.py b/PTT/parse.py
@@ -30,6 +30,8 @@
 NOT_ONLY_NON_ENGLISH_REGEX = regex.compile(rf"(?<=[a-zA-Z][^{NON_ENGLISH_CHARS}]+)[{NON_ENGLISH_CHARS}].*[{NON_ENGLISH_CHARS}]|[{NON_ENGLISH_CHARS}].*[{NON_ENGLISH_CHARS}](?=[^{NON_ENGLISH_CHARS}]+[a-zA-Z])")
 NOT_ALLOWED_SYMBOLS_AT_START_AND_END = regex.compile(rf"^[^\w{NON_ENGLISH_CHARS}#[【★]+|[ \-:/\\[|{{(#$&^]+$")
 REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END = regex.compile(rf"^[^\w{NON_ENGLISH_CHARS}#]+|]$")
+REDUNDANT_SYMBOLS_AT_END = regex.compile(r"[ \-:./\\]+$")
+EMPTY_BRACKETS_REGEX = regex.compile(r"\(\s*\)|\[\s*\]|\{\s*\}")
 
 DEBUG_HANDLER = False
 
@@ -121,17 +123,17 @@ def clean_title(raw_title: str) -> str:
     cleaned_title = ALT_TITLES_REGEX.sub("", cleaned_title)
     cleaned_title = NOT_ONLY_NON_ENGLISH_REGEX.sub("", cleaned_title)
     cleaned_title = REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END.sub("", cleaned_title)
+    cleaned_title = EMPTY_BRACKETS_REGEX.sub("", cleaned_title)
 
     # Remove brackets if only one is present
-    for brackets in BRACKETS:
-        only_one_bracket = not all(bracket in cleaned_title for bracket in brackets)
-        if only_one_bracket:
-            for bracket in brackets:
-                cleaned_title = cleaned_title.replace(bracket, "")
+    for open_bracket, close_bracket in BRACKETS:
+        if cleaned_title.count(open_bracket) != cleaned_title.count(close_bracket):
+            cleaned_title = cleaned_title.replace(open_bracket, "").replace(close_bracket, "")
 
     if " " not in cleaned_title and "." in cleaned_title:
         cleaned_title = regex.sub(r"\.", " ", cleaned_title)
 
+    cleaned_title = REDUNDANT_SYMBOLS_AT_END.sub("", cleaned_title)
     cleaned_title = cleaned_title.strip()
     return cleaned_title
 
@@ -211,7 +213,7 @@ def parse(self, title: str) -> Dict[str, Any]:
 
             if remove:
                 title = title[:match_index] + title[match_index + len(raw_match) :]
-            if not skip_from_title and match_index and match_index < end_of_title:
+            if not skip_from_title and match_index and 1 < match_index < end_of_title:
                 end_of_title = match_index
             if remove and skip_from_title and match_index < end_of_title:
                 end_of_title -= len(raw_match)

diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -10,6 +10,7 @@ def parser():
     add_defaults(p)
     return p
 
+
 def test_parsed_output(parser):
     test_case = "[Golumpa] Fairy Tail - 214 [FuniDub 720p x264 AAC] [5E46AC39]"
     result = parser.parse(test_case)
@@ -20,6 +21,7 @@ def test_parsed_output(parser):
     assert "codec" in result
     assert "audio" in result
 
+
 def test_basic_parsed(parser):
     test_case = "The.Matrix.1999.1080p.BluRay.x264"
     result = parser.parse(test_case)
@@ -30,6 +32,7 @@ def test_basic_parsed(parser):
     assert result["quality"] == "BluRay"
     assert result["codec"] == "x264"
 
+
 def test_season_parser(parser):
     test_cases = [
         ("Archer.S02.1080p.BluRay.DTSMA.AVC.Remux", [2]),
@@ -42,19 +45,21 @@ def test_season_parser(parser):
         ("Dragon Ball Z Movie - 09 - Bojack Unbound - 1080p BluRay x264 DTS 5.1 -DDR", []),  # Correct. This should not match, its a movie.
         ("BoJack Horseman [06x01-08 of 16] (2019-2020) WEB-DLRip 720p", [6]),
         ("[HR] Boku no Hero Academia 87 (S4-24) [1080p HEVC Multi-Subs] HR-GZ", [4]),
-        ("The Simpsons S28E21 720p HDTV x264-AVS", [28])
+        ("The Simpsons S28E21 720p HDTV x264-AVS", [28]),
     ]
 
     for test_case, expected in test_cases:
         result = parser.parse(test_case)
         assert isinstance(result, dict)
         assert result["seasons"] == expected, f"Failed for {test_case}"
 
+
 def test_episode_code(parser):
     test_case = "[Golumpa] Fairy Tail - 214 [FuniDub 720p x264 AAC] [5E46AC39]"
     result = parser.parse(test_case)
     assert result["episode_code"] == "5E46AC39"
 
+
 # def test_languages_parser(parser):
 #     test_cases = [
 #         ("Deadpool 2016 1080p BluRay DTS Rus Ukr 3xEng HDCL", ["ukrainian", "russian"]),
@@ -67,4 +72,4 @@ def test_episode_code(parser):
 #     for test_case, expected in test_cases:
 #         result = parser.parse(test_case)
 #         assert isinstance(result, dict)
-#         assert result["languages"] == expected
+#         assert result["languages"] == expected
diff --git a/tests/test_site.py b/tests/test_site.py
@@ -1,21 +1,24 @@
 import pytest
 
-from PTT import parse
+import PTT
 
 
-@pytest.mark.parametrize("release_name, expected_site", [
-    ("The.Expanse.S05E02.1080p.AMZN.WEB.DDP5.1.x264-NTb[eztv.re].mp4", "eztv.re"),
-    ("www.1TamilBlasters.lat - Thuritham (2023) [Tamil - 2K QHD AVC UNTOUCHED - x264 - AAC - 3.4GB - ESub].mkv", "www.1TamilBlasters.lat"),
-    ("www.1TamilMV.world - Raja Vikramarka (2024) Tamil HQ HDRip - 400MB - x264 - AAC - ESub.mkv", "www.1TamilMV.world"),
-    ("Anatomia De Grey - Temporada 19 [HDTV][Cap.1905][Castellano][www.AtomoHD.nu].avi", "www.AtomoHD.nu"),
-    ("[HD-ELITE.NET] -  The.Art.Of.The.Steal.2014.DVDRip.XviD.Dual.Aud", "HD-ELITE.NET"),
-    ("[ Torrent9.cz ] The.InBetween.S01E10.FiNAL.HDTV.XviD-EXTREME.avi", "Torrent9.cz"),
-    ("Jurassic.World.Dominion.CUSTOM.EXTENDED.2022.2160p.MULTi.VF2.UHD.Blu-ray.REMUX.HDR.DoVi.HEVC.DTS-X.DTS-HDHRA.7.1-MOONLY.mkv", None),
-    ("Last.Call.for.Istanbul.2023.1080p.NF.WEB-DL.DDP5.1.H.264.MKV.torrent", None),
-])
+@pytest.mark.parametrize(
+    "release_name, expected_site",
+    [
+        ("The.Expanse.S05E02.1080p.AMZN.WEB.DDP5.1.x264-NTb[eztv.re].mp4", "eztv.re"),
+        ("www.1TamilBlasters.lat - Thuritham (2023) [Tamil - 2K QHD AVC UNTOUCHED - x264 - AAC - 3.4GB - ESub].mkv", "www.1TamilBlasters.lat"),
+        ("www.1TamilMV.world - Raja Vikramarka (2024) Tamil HQ HDRip - 400MB - x264 - AAC - ESub.mkv", "www.1TamilMV.world"),
+        ("Anatomia De Grey - Temporada 19 [HDTV][Cap.1905][Castellano][www.AtomoHD.nu].avi", "www.AtomoHD.nu"),
+        ("[HD-ELITE.NET] -  The.Art.Of.The.Steal.2014.DVDRip.XviD.Dual.Aud", "HD-ELITE.NET"),
+        ("[ Torrent9.cz ] The.InBetween.S01E10.FiNAL.HDTV.XviD-EXTREME.avi", "Torrent9.cz"),
+        ("Jurassic.World.Dominion.CUSTOM.EXTENDED.2022.2160p.MULTi.VF2.UHD.Blu-ray.REMUX.HDR.DoVi.HEVC.DTS-X.DTS-HDHRA.7.1-MOONLY.mkv", None),
+        ("Last.Call.for.Istanbul.2023.1080p.NF.WEB-DL.DDP5.1.H.264.MKV.torrent", None),
+    ],
+)
 def test_group_detection(release_name, expected_site):
-    result = parse(release_name)
+    result = PTT.parse_title(release_name)
     if expected_site:
         assert result.get("site") == expected_site, f"Incorrect site detected for {release_name}"
     else:
-        assert "site" not in result, f"Incorrectly detected site for {release_name}"
+        assert "site" not in result, f"Incorrectly detected site for {release_name}"
diff --git a/tests/test_size.py b/tests/test_size.py
@@ -13,8 +13,8 @@
     ],
 )
 def test_group_detection(release_name, expected_size):
-    result = PTT.parse(release_name)
+    result = PTT.parse_title(release_name)
     if expected_size:
         assert result.get("size") == expected_size, f"Incorrect site detected for {release_name}"
     else:
-        assert "size" not in result, f"Incorrectly detected size for {release_name}"
+        assert "size" not in result, f"Incorrectly detected size for {release_name}"