feat: add adult parsing
dreulavelle committed Nov 16, 2024
1 parent 9d99da7 commit 70be90e
Showing 13 changed files with 2,322 additions and 145 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
.vscode
*.txt
data.json
!*/keywords/*.txt

# Byte-compiled / optimized / DLL files
__pycache__/
3 changes: 3 additions & 0 deletions Makefile
@@ -24,6 +24,9 @@ clean:
@find . -type d -name '.pytest_cache' -exec rm -rf {} +
@find . -type d -name '.ruff_cache' -exec rm -rf {} +

keywords:
@python cli.py combine ./PTT/keywords/

format:
@poetry run black $(SRC_DIR)

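The new keywords target runs cli.py's combine command over PTT/keywords/. As a rough illustration only (the combine command itself is not shown in this commit), it is assumed here to merge the per-category keyword lists into the combined-keywords.txt file that PTT/adult.py reads:

import sys
from pathlib import Path

def combine_keywords(keywords_dir: str, out_name: str = "combined-keywords.txt") -> None:
    """Merge every per-category keyword list into one deduplicated file (hypothetical sketch)."""
    directory = Path(keywords_dir)
    merged = set()
    for txt_file in sorted(directory.glob("*.txt")):
        if txt_file.name == out_name:
            continue  # skip a previously generated combined file
        for line in txt_file.read_text(encoding="utf-8").splitlines():
            keyword = line.strip()
            if keyword:
                merged.add(keyword)
    (directory / out_name).write_text("\n".join(sorted(merged)) + "\n", encoding="utf-8")

if __name__ == "__main__":
    combine_keywords(sys.argv[1] if len(sys.argv) > 1 else "./PTT/keywords/")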
5 changes: 0 additions & 5 deletions PTT/__init__.py
@@ -11,12 +11,7 @@ def parse_title(raw_title: str, translate_languages: bool = False) -> dict:
:param raw_title: The input raw torrent title to parse.
:param translate_languages: Whether to translate language codes to language names or short codes (default: False returns short codes)
:param parse_anime: Whether to parse anime title (default: False)
:return: A dictionary with the parsed results.
Note:
If `parse_anime` is True, the anime handlers will be added to the parser instance.
This can add more time to the parsing process.
"""
return _parser.parse(raw_title, translate_languages)
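
A minimal usage sketch of the public entry point shown above (not part of the diff); the exact keys in the returned dict depend on which handlers are registered:

from PTT import parse_title

# translate_languages=True is assumed to return full language names rather than short codes
info = parse_title("Some.Show.S02E05.1080p.WEB-DL.x264", translate_languages=True)
print(info)  # a dict with fields such as "seasons", "episodes", "languages"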

25 changes: 25 additions & 0 deletions PTT/adult.py
@@ -0,0 +1,25 @@
import regex
from pathlib import Path
from typing import Set


def load_adult_keywords(filename: str = "combined-keywords.txt") -> Set[str]:
"""Load adult keywords from the keywords file."""
keywords_file = Path(__file__).parent / "keywords" / filename
keywords = set()

with open(keywords_file, "r") as f:
for line in f:
keyword = line.strip()
if keyword and not keyword.isspace():
keywords.add(regex.escape(keyword))

return keywords

def create_adult_pattern() -> regex.Pattern:
"""Create a compiled regex pattern for adult content detection."""
keywords = load_adult_keywords()
return regex.compile(
r'\b(' + '|'.join(keywords) + r')\b',
regex.IGNORECASE
)
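
A usage sketch for the new module (not part of the commit): create_adult_pattern compiles the escaped keywords into a single case-insensitive alternation that can be searched directly:

from PTT.adult import create_adult_pattern

adult_pattern = create_adult_pattern()  # reads PTT/keywords/combined-keywords.txt
if adult_pattern.search("Some.Raw.Title.2023.1080p"):
    print("adult keyword found")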
1 change: 1 addition & 0 deletions PTT/anime.py
@@ -19,6 +19,7 @@ def anime_handler(parser: Parser):
parser.add_handler("anime", regex.compile(r"\[Spark\]|-Spark\b"), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\b(HorribleRips)\b"), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\b(HorribleSubs)\b"), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\b(SubsPlease)\b", regex.IGNORECASE), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\[EMBER\]|-EMBER\b"), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\[Judas\]|-Judas"), boolean, {"remove": False, "skipIfAlreadyFound": True})
parser.add_handler("anime", regex.compile(r"\[Tsundere\]|-Tsundere(?!-)\b"), boolean, {"remove": False, "skipIfAlreadyFound": True})
22 changes: 16 additions & 6 deletions PTT/handlers.py
@@ -1,6 +1,7 @@
import regex

# from PTT.anime import anime_handler
from PTT.adult import create_adult_pattern
from PTT.anime import anime_handler
from PTT.parse import Parser
from PTT.transformers import (
array,
@@ -27,6 +28,12 @@ def add_defaults(parser: Parser):
"""
# Torrent extension
parser.add_handler("torrent", regex.compile(r"\.torrent$"), boolean, {"remove": True})

# Adult
parser.add_handler("adult", create_adult_pattern(), boolean, {"remove": True, "skipFromTitle": True})

# Anime
# anime_handler(parser) # adds too much time to overall parsing

# Scene
parser.add_handler("scene", regex.compile(r"^(?=.*(\b\d{3,4}p\b).*([_. ]WEB[_. ])(?!DL)\b)|\b(-CAKES|-GGEZ|-GGWP|-GLHF|-GOSSIP|-NAISU|-KOGI|-PECULATE|-SLOT|-EDITH|-ETHEL|-ELEANOR|-B2B|-SPAMnEGGS|-FTP|-DiRT|-SYNCOPY|-BAE|-SuccessfulCrab|-NHTFS|-SURCODE|-B0MBARDIERS)"), boolean, {"remove": False})
@@ -312,7 +319,7 @@ def handle_volumes(context)
parser.add_handler("seasons", regex.compile(r"[Сс]езон:?[. _]?№?(\d{1,2})(?!\d)", regex.IGNORECASE), array(integer))
parser.add_handler("seasons", regex.compile(r"(?:\D|^)(\d{1,2})Â?[°ºªa]?[. ]*temporada", regex.IGNORECASE), array(integer), {"remove": True})
parser.add_handler("seasons", regex.compile(r"t(\d{1,3})(?:[ex]+|$)", regex.IGNORECASE), array(integer), {"remove": True})
parser.add_handler("seasons", regex.compile(r"(?:(?:\bthe\W)?\bcomplete)?(?:\W|^)s(\d{1,3})(?:[\Wex]|\d{2}\b|$)", regex.IGNORECASE), array(integer), {"skipIfAlreadyFound": False})
parser.add_handler("seasons", regex.compile(r"(?:(?:\bthe\W)?\bcomplete)?s(\d{1,3})(?:[\Wex]|\d{2}\b|$)", regex.IGNORECASE), array(integer), {"remove": False, "skipIfAlreadyFound": False})
parser.add_handler("seasons", regex.compile(r"(?:(?:\bthe\W)?\bcomplete\W)?(?:\W|^)(\d{1,2})[. ]?(?:st|nd|rd|th)[. ]*season", regex.IGNORECASE), array(integer))
parser.add_handler("seasons", regex.compile(r"(?<=S)\d{2}(?=E\d+)"), array(integer))
parser.add_handler("seasons", regex.compile(r"(?:\D|^)(\d{1,2})[xх]\d{1,3}(?:\D|$)"), array(integer))
@@ -331,7 +338,7 @@ def handle_volumes(context)
parser.add_handler("episodes", regex.compile(r"(?:[\W\d]|^)(?:episodes?|[Сс]ерии:?)[ .]?[([]?(\d{1,3}(?:[ .+]*[&+][ .]?\d{1,3})+)(?:\W|$)", regex.IGNORECASE), range_func)
parser.add_handler("episodes", regex.compile(r"[([]?(?:\D|^)(\d{1,3}[ .]?ao[ .]?\d{1,3})[)\]]?(?:\W|$)", regex.IGNORECASE), range_func)
parser.add_handler("episodes", regex.compile(r"(?:[\W\d]|^)(?:e|eps?|episodes?|[Сс]ерии:?|\d+[xх])[ .]*[([]?(\d{1,3}(?:-\d{1,3})+)(?:\W|$)", regex.IGNORECASE), range_func)
parser.add_handler("episodes", regex.compile(r"(?:\W|^)[st]\d{1,2}[. ]?[xх-]?[. ]?(?:e|x|х|ep|-|\.)[. ]?(\d{1,4})(?:[abc]|v0?[1-4]|\D|$)", regex.IGNORECASE), array(integer))
parser.add_handler("episodes", regex.compile(r"[st]\d{1,2}[. ]?[xх-]?[. ]?(?:e|x|х|ep|-|\.)[. ]?(\d{1,4})(?:[abc]|v0?[1-4]|\D|$)", regex.IGNORECASE), array(integer), {"remove": True})
parser.add_handler("episodes", regex.compile(r"\b[st]\d{2}(\d{2})\b", regex.IGNORECASE), array(integer))
parser.add_handler("episodes", regex.compile(r"(?:\W|^)(\d{1,3}(?:[ .]*~[ .]*\d{1,3})+)(?:\W|$)", regex.IGNORECASE), range_func)
parser.add_handler("episodes", regex.compile(r"-\s(\d{1,3}[ .]*-[ .]*\d{1,3})(?!-\d)(?:\W|$)", regex.IGNORECASE), range_func)
@@ -405,11 +412,11 @@ def handle_episodes(context)
parser.add_handler("languages", regex.compile(r"\bzh-hans\b", regex.IGNORECASE), uniq_concat(value("zh")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\bFR(?:ench|a|e|anc[eê]s)?\b", regex.IGNORECASE), uniq_concat(value("fr")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(VOST(?:FR?|A)?)\b", regex.IGNORECASE), uniq_concat(value("fr")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(VF[FQIB2]?|(TRUE|SUB)?.?FRENCH|(VOST)?FR2?)\b", regex.IGNORECASE), uniq_concat(value("fr")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(VF[FQIB2]?|(TRUE|SUB)?.?FRENCH|(VOST)?FR2?)\b", regex.IGNORECASE), uniq_concat(value("fr")), {"remove": True, "skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\bspanish\W?latin|american\W*(?:spa|esp?)", regex.IGNORECASE), uniq_concat(value("la")), {"skipFromTitle": True, "skipIfAlreadyFound": False, "remove": True})
parser.add_handler("languages", regex.compile(r"\b(?:\bla\b.+(?:cia\b))", regex.IGNORECASE), uniq_concat(value("es")), {"skipFromTitle": True, "skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(?:audio.)?lat(?:in?|ino)?\b", regex.IGNORECASE), uniq_concat(value("la")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(?:audio.)?(?:ESP|spa|(en[ .]+)?espa[nñ]ola?|castellano)\b", regex.IGNORECASE), uniq_concat(value("es")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(?:audio.)?(?:ESP?|spa|(en[ .]+)?espa[nñ]ola?|castellano)\b", regex.IGNORECASE), uniq_concat(value("es")), {"skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\bes(?=[ .,/-]+(?:[A-Z]{2}[ .,/-]+){2,})\b", regex.IGNORECASE), uniq_concat(value("es")), {"skipFromTitle": True, "skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(?<=[ .,/-]+(?:[A-Z]{2}[ .,/-]+){2,})es\b", regex.IGNORECASE), uniq_concat(value("es")), {"skipFromTitle": True, "skipIfAlreadyFound": False})
parser.add_handler("languages", regex.compile(r"\b(?<=[ .,/-]+[A-Z]{2}[ .,/-]+)es(?=[ .,/-]+[A-Z]{2}[ .,/-]+)\b", regex.IGNORECASE), uniq_concat(value("es")), {"skipFromTitle": True, "skipIfAlreadyFound": False})
@@ -524,7 +531,7 @@ def infer_language_based_on_naming(context)
parser.add_handler("subbed", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:su?$|sub\w*|dub\w*)\b|msub", regex.IGNORECASE), boolean, {"skipIfAlreadyFound": False, "remove": True})

# Dubbed
parser.add_handler("dubbed", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:lang(?:uages?)?|audio|VF2)?\b", regex.IGNORECASE), boolean, {"skipIfAlreadyFound": False})
parser.add_handler("dubbed", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:lang(?:uages?)?|audio|VF2)?\b", regex.IGNORECASE), boolean, {"remove": True, "skipIfAlreadyFound": False})
parser.add_handler("dubbed", regex.compile(r"\btri(?:ple)?[ .-]*(?:audio|dub\w*)\b", regex.IGNORECASE), boolean, {"skipIfAlreadyFound": False})
parser.add_handler("dubbed", regex.compile(r"\bdual[ .-]*(?:au?$|[aá]udio|line)\b", regex.IGNORECASE), boolean, {"skipIfAlreadyFound": False})
parser.add_handler("dubbed", regex.compile(r"\bdual\b(?![ .-]*sub)", regex.IGNORECASE), boolean, {"skipIfAlreadyFound": False})
@@ -602,3 +609,6 @@ def handle_group_exclusion(context)
# Title (hardcoded cleanup)
parser.add_handler("title", regex.compile(r"\b100[ .-]*years?[ .-]*quest\b", regex.IGNORECASE), none, {"remove": True}) # episode title
parser.add_handler("title", regex.compile(r"\b(?:INTEGRALE?|INTÉGRALE?|INTERNAL|HFR)\b", regex.IGNORECASE), none, {"remove": True})

# Adult (post-processing)
parser.add_handler("adult", regex.compile(r"\b(?:adult|porn|sex|xxx|xx)\b", regex.IGNORECASE), boolean, {"remove": True, "skipIfAlreadyFound": True, "skipFromTitle": True})
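
An illustrative sketch, not from the diff: with the two adult handlers registered above, a matching release name should come back with a boolean adult field and the keyword stripped from the title; the exact output depends on the rest of the handler chain:

from PTT import parse_title

info = parse_title("Example.XXX.2023.1080p")
print(info.get("adult"))  # expected: True when an adult handler matches
print(info.get("title"))  # the matched keyword should be removed from the title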