feat: add adult parsing
dreulavelle committed Nov 16, 2024
1 parent 9d99da7 commit 70be90e
Showing 13 changed files with 2,322 additions and 145 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
.vscode
*.txt
data.json
!*/keywords/*.txt

# Byte-compiled / optimized / DLL files
__pycache__/
3 changes: 3 additions & 0 deletions Makefile
@@ -24,6 +24,9 @@ clean:
@find . -type d -name '.pytest_cache' -exec rm -rf {} +
@find . -type d -name '.ruff_cache' -exec rm -rf {} +

keywords:
@python cli.py combine ./PTT/keywords/

format:
@poetry run black $(SRC_DIR)

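The new keywords target runs cli.py's combine command over PTT/keywords/. As a rough illustration only (the combine command itself is not shown in this commit), it is assumed here to merge the per-category keyword lists into the combined-keywords.txt file that PTT/adult.py reads:

import sys
from pathlib import Path

def combine_keywords(keywords_dir: str, out_name: str = "combined-keywords.txt") -> None:
    """Merge every per-category keyword list into one deduplicated file (hypothetical sketch)."""
    directory = Path(keywords_dir)
    merged = set()
    for txt_file in sorted(directory.glob("*.txt")):
        if txt_file.name == out_name:
            continue  # skip a previously generated combined file
        for line in txt_file.read_text(encoding="utf-8").splitlines():
            keyword = line.strip()
            if keyword:
                merged.add(keyword)
    (directory / out_name).write_text("\n".join(sorted(merged)) + "\n", encoding="utf-8")

if __name__ == "__main__":
    combine_keywords(sys.argv[1] if len(sys.argv) > 1 else "./PTT/keywords/")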
5 changes: 0 additions & 5 deletions PTT/__init__.py
@@ -11,12 +11,7 @@ def parse_title(raw_title: str, translate_languages: bool = False) -> dict:
:param raw_title: The input raw torrent title to parse.
:param translate_languages: Whether to translate language codes to language names or short codes (default: False returns short codes)
:param parse_anime: Whether to parse anime title (default: False)
:return: A dictionary with the parsed results.
Note:
If `parse_anime` is True, the anime handlers will be added to the parser instance.
This can add more time to the parsing process.
"""
return _parser.parse(raw_title, translate_languages)
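
A minimal usage sketch of the public entry point shown above (not part of the diff); the exact keys in the returned dict depend on which handlers are registered:

from PTT import parse_title

# translate_languages=True is assumed to return full language names rather than short codes
info = parse_title("Some.Show.S02E05.1080p.WEB-DL.x264", translate_languages=True)
print(info)  # a dict with fields such as "seasons", "episodes", "languages"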

25 changes: 25 additions & 0 deletions PTT/adult.py
@@ -0,0 +1,25 @@
import regex
from pathlib import Path
from typing import Set


def load_adult_keywords(filename: str = "combined-keywords.txt") -> Set[str]:
"""Load adult keywords from the keywords file."""
keywords_file = Path(__file__).parent / "keywords" / filename
keywords = set()

with open(keywords_file, "r") as f:
for line in f:
keyword = line.strip()
if keyword and not keyword.isspace():
keywords.add(regex.escape(keyword))

return keywords

def create_adult_pattern() -> regex.Pattern:
"""Create a compiled regex pattern for adult content detection."""
keywords = load_adult_keywords()
return regex.compile(
r'\b(' + '|'.join(keywords) + r')\b',
regex.IGNORECASE
)
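
A usage sketch for the new module (not part of the commit): create_adult_pattern compiles the escaped keywords into a single case-insensitive alternation that can be searched directly:

from PTT.adult import create_adult_pattern

adult_pattern = create_adult_pattern()  # reads PTT/keywords/combined-keywords.txt
if adult_pattern.search("Some.Raw.Title.2023.1080p"):
    print("adult keyword found")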
1 change: 1 addition & 0 deletions PTT/anime.py
@@ -19,6 +19,7 @@ def anime_handler(parser: Parser):
parser.add_handler("anime", regex.compile(r"\[Spark\]|-Spark\b"), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\b(HorribleRips)\b"), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\b(HorribleSubs)\b"), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\b(SubsPlease)\b", regex.IGNORECASE), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\[EMBER\]|-EMBER\b"), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\[Judas\]|-Judas"), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\[Tsundere\]|-Tsundere(?!-)\b"), boolean, {"remove": False, "skipIfAlreadyFound": True})
22 changes: 16 additions & 6 deletions PTT/handlers.py
@@ -1,6 +1,7 @@
import regex

# from PTT.anime import anime_handler
from PTT.adult import create_adult_pattern
from PTT.anime import anime_handler
from PTT.parse import Parser
from PTT.transformers import (
array,
@@ -27,6 +28,12 @@ def add_defaults(parser: Parser):
"""
# Torrent extension
parser.add_handler("torrent", regex.compile(r"\.torrent$"), boolean, {"remove": True})

# Adult
parser.add_handler("adult", create_adult_pattern(), boolean, {"remove": True, "skipFromTitle": True})

# Anime
# anime_handler(parser) # adds too much time to overall parsing

# Scene
parser.add_handler("scene", regex.compile(r"^(?=.*(\b\d{3,4}p\b).*([_. ]WEB[_. ])(?!DL)\b)|\b(-CAKES|-GGEZ|-GGWP|-GLHF|-GOSSIP|-NAISU|-KOGI|-PECULATE|-SLOT|-EDITH|-ETHEL|-ELEANOR|-B2B|-SPAMnEGGS|-FTP|-DiRT|-SYNCOPY|-BAE|-SuccessfulCrab|-NHTFS|-SURCODE|-B0MBARDIERS)"), boolean, {"remove": False})
@@ -312,7 +319,7 @@ def handle_volumes(context)
parser.add_handler("seasons", regex.compile(r"[Сс]езон:?[. _]?№?(\d{1,2})(?!\d)", regex.IGNORECASE), array(integer))
parser.add_handler("seasons", regex.compile(r"(?:\D|^)(\d{1,2})Â?[°ºªa]?[. ]*temporada", regex.IGNORECASE), array(integer), {"remove": True})
parser.add_handler("seasons", regex.compile(r"t(\d{1,3})(?:[ex]+|$)", regex.IGNORECASE), array(integer), {"remove": True})
parser.add_handler("seasons", regex.compile(r"(?:(?:\bthe\W)?\bcomplete)?(?:\W|^)s(\d{1,3})(?:[\Wex]|\d{2}\b|$)", regex.IGNORECASE), array(integer), {"skipIfAlreadyFound": False})
parser.add_handler("seasons", regex.compile(r"(?:(?:\bthe\W)?\bcomplete)?s(\d{1,3})(?:[\Wex]|\d{2}\b|$)", regex.IGNORECASE), array(integer), {"remove": False, "skipIfAlreadyFound": False})
parser.add_handler("seasons", regex.compile(r"(?:(?:\bthe\W)?\bcomplete\W)?(?:\W|^)(\d{1,2})[. ]?(?:st|nd|rd|th)[. ]*season", regex.IGNORECASE), array(integer))
parser.add_handler("seasons", regex.compile(r"(?<=S)\d{2}(?=E\d+)"), array(integer))
parser.add_handler("seasons", regex.compile(r"(?:\D|^)(\d{1,2})[xх]\d{1,3}(?:\D|$)"), array(integer))
@@ -331,7 +338,7 @@ def handle_volumes(context)
parser.add_handler("episodes", regex.compile(r"(?:[\W\d]|^)(?:episodes?|[Сс]ерии:?)[ .]?[([]?(\d{1,3}(?:[ .+]*[&+][ .]?\d{1,3})+)(?:\W|$)", regex.IGNORECASE), range_func)
parser.add_handler("episodes", regex.compile(r"[([]?(?:\D|^)(\d{1,3}[ .]?ao[ .]?\d{1,3})[)\]]?(?:\W|$)", regex.IGNORECASE), range_func)
parser.add_handler("episodes", regex.compile(r"(?:[\W\d]|^)(?:e|eps?|episodes?|[Сс]ерии:?|\d+[xх])[ .]*[([]?(\d{1,3}(?:-\d{1,3})+)(?:\W|$)", regex.IGNORECASE), range_func)
parser.add_handler("episodes", regex.compile(r"(?:\W|^)[st]\d{1,2}[. ]?[xх-]?[. ]?(?:e|x|х|ep|-|\.)[. ]?(\d{1,4})(?:[abc]|v0?[1-4]|\D|$)", regex.IGNORECASE), array(integer))
parser.add_handler("episodes", regex.compile(r"[st]\d{1,2}[. ]?[xх-]?[. ]?(?:e|x|х|ep|-|\.)[. ]?(\d{1,4})(?:[abc]|v0?[1-4]|\D|$)", regex.IGNORECASE), array(integer), {"remove": True})
parser.add_handler("episodes", regex.compile(r"\b[st]\d{2}(\d{2})\b", regex.IGNORECASE), array(integer))
parser.add_handler("episodes", regex.compile(r"(?:\W|^)(\d{1,3}(?:[ .]*~[ .]*\d{1,3})+)(?:\W|$)", regex.IGNORECASE), range_func)
parser.add_handler("episodes", regex.compile(r"-\s(\d{1,3}[ .]*-[ .]*\d{1,3})(?!-\d)(?:\W|$)", regex.IGNORECASE), range_func)
@@ -405,11 +412,11 @@ def handle_episodes(context)
parser.add_handler("languages", regex.compile(r"\bzh-hans\b", regex.IGNORECASE), uniq_concat(value("zh")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\bFR(?:ench|a|e|anc[eê]s)?\b", regex.IGNORECASE), uniq_concat(value("fr")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(VOST(?:FR?|A)?)\b", regex.IGNORECASE), uniq_concat(value("fr")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(VF[FQIB2]?|(TRUE|SUB)?.?FRENCH|(VOST)?FR2?)\b", regex.IGNORECASE), uniq_concat(value("fr")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(VF[FQIB2]?|(TRUE|SUB)?.?FRENCH|(VOST)?FR2?)\b", regex.IGNORECASE), uniq_concat(value("fr")), {"remove": True, "skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\bspanish\W?latin|american\W*(?:spa|esp?)", regex.IGNORECASE), uniq_concat(value("la")), {"skipFromTitle": True, "skipIfAlreadyFound": False, "remove": True})
parser.add_handler("languages", regex.compile(r"\b(?:\bla\b.+(?:cia\b))", regex.IGNORECASE), uniq_concat(value("es")), {"skipFromTitle": True, "skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(?:audio.)?lat(?:in?|ino)?\b", regex.IGNORECASE), uniq_concat(value("la")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(?:audio.)?(?:ESP|spa|(en[ .]+)?espa[nñ]ola?|castellano)\b", regex.IGNORECASE), uniq_concat(value("es")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(?:audio.)?(?:ESP?|spa|(en[ .]+)?espa[nñ]ola?|castellano)\b", regex.IGNORECASE), uniq_concat(value("es")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\bes(?=[ .,/-]+(?:[A-Z]{2}[ .,/-]+){2,})\b", regex.IGNORECASE), uniq_concat(value("es")), {"skipFromTitle": True, "skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(?<=[ .,/-]+(?:[A-Z]{2}[ .,/-]+){2,})es\b", regex.IGNORECASE), uniq_concat(value("es")), {"skipFromTitle": True, "skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(?<=[ .,/-]+[A-Z]{2}[ .,/-]+)es(?=[ .,/-]+[A-Z]{2}[ .,/-]+)\b", regex.IGNORECASE), uniq_concat(value("es")), {"skipFromTitle": True, "skipIfAlreadyFound": False})
@@ -524,7 +531,7 @@ def infer_language_based_on_naming(context)
parser.add_handler("subbed", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:su?$|sub\w*|dub\w*)\b|msub", regex.IGNORECASE), boolean, {"skipIfAlreadyFound": False, "remove": True})

# Dubbed
parser.add_handler("dubbed", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:lang(?:uages?)?|audio|VF2)?\b", regex.IGNORECASE), boolean, {"skipIfAlreadyFound": False})
parser.add_handler("dubbed", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:lang(?:uages?)?|audio|VF2)?\b", regex.IGNORECASE), boolean, {"remove": True, "skipIfAlreadyFound": False})
parser.add_handler("dubbed", regex.compile(r"\btri(?:ple)?[ .-]*(?:audio|dub\w*)\b", regex.IGNORECASE), boolean, {"skipIfAlreadyFound": False})
parser.add_handler("dubbed", regex.compile(r"\bdual[ .-]*(?:au?$|[aá]udio|line)\b", regex.IGNORECASE), boolean, {"skipIfAlreadyFound": False})
parser.add_handler("dubbed", regex.compile(r"\bdual\b(?![ .-]*sub)", regex.IGNORECASE), boolean, {"skipIfAlreadyFound": False})
@@ -602,3 +609,6 @@ def handle_group_exclusion(context)
# Title (hardcoded cleanup)
parser.add_handler("title", regex.compile(r"\b100[ .-]*years?[ .-]*quest\b", regex.IGNORECASE), none, {"remove": True}) # episode title
parser.add_handler("title", regex.compile(r"\b(?:INTEGRALE?|INTÉGRALE?|INTERNAL|HFR)\b", regex.IGNORECASE), none, {"remove": True})

# Adult (post-processing)
parser.add_handler("adult", regex.compile(r"\b(?:adult|porn|sex|xxx|xx)\b", regex.IGNORECASE), boolean, {"remove": True, "skipIfAlreadyFound": True, "skipFromTitle": True})
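
An illustrative sketch, not from the diff: with the two adult handlers registered above, a matching release name should come back with a boolean adult field and the keyword stripped from the title; the exact output depends on the rest of the handler chain:

from PTT import parse_title

info = parse_title("Example.XXX.2023.1080p")
print(info.get("adult"))  # expected: True when an adult handler matches
print(info.get("title"))  # the matched keyword should be removed from the title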