Skip to content

Commit

Permalink
Improve title parse (#10)
Browse files Browse the repository at this point in the history
* Add support for site, add docs, refactor, and clean up

* format with black

* refactor condition

* update import all

* Fix docs typo

* Add support for new language patterns, fix language detection from the domain name, and format

* replace source with quality similar to PTN

* Add hq audio pattern

* Add support for size

* refactor parser

* Fix parse and parse_title conflict

* Add fan dub, clean up redundant characters in titles, and remove empty brackets from titles

* Register the site handlers first so they do not overlap with other handlers

* Enhance title cleanup and ensure at least one character remains for the torrent title

* fmt
  • Loading branch information
mhdzumair authored Jul 20, 2024
1 parent 55882fb commit 4ec4a0a
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 86 deletions.
2 changes: 1 addition & 1 deletion PTT/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
add_defaults(_parser)


def parse(raw_title: str) -> dict:
def parse_title(raw_title: str) -> dict:
"""
Parse the given input string using the initialized parser instance.
:param raw_title: The input raw torrent title to parse.
Expand Down
22 changes: 6 additions & 16 deletions PTT/handlers.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,7 @@
import regex

from PTT.parse import Parser
from PTT.transformers import (
array,
boolean,
date,
integer,
lowercase,
none,
range_func,
uniq_concat,
uppercase,
value
)
from PTT.transformers import array, boolean, date, integer, lowercase, none, range_func, uniq_concat, uppercase, value


def add_defaults(parser: Parser):
Expand All @@ -26,6 +15,10 @@ def add_defaults(parser: Parser):
# Torrent extension
parser.add_handler("torrent", regex.compile(r"\.torrent$"), boolean, {"remove": True})

# Site before languages to get rid of domain name with country code.
parser.add_handler("site", regex.compile(r"^(www?[\.,][\w-]+\.[\w-]+(?:\.[\w-]+)?)\s+-\s*", regex.IGNORECASE), options={"skipFromTitle": True, "remove": True, "skipIfAlreadyFound": False})
parser.add_handler("site", regex.compile(r"^((?:www?[\.,])?[\w-]+\.[\w-]+(?:\.[\w-]+)*?)\s+-\s*", regex.IGNORECASE), options={"skipIfAlreadyFound": False})

# Episode code
parser.add_handler("episode_code", regex.compile(r"[[(]([a-zA-Z0-9]{8})[\])](?=\.[a-zA-Z0-9]{1,5}$|$)"), uppercase, {"remove": True})
parser.add_handler("episode_code", regex.compile(r"\[([A-Z0-9]{8})]"), uppercase, {"remove": True})
Expand Down Expand Up @@ -316,10 +309,6 @@ def handle_episodes(context):

parser.add_handler("episodes", handle_episodes)

# Site before languages to get rid of domain name with country code.
parser.add_handler("site", regex.compile(r"^(www?[\.,][\w-]+\.[\w-]+(?:\.[\w-]+)?)\s+-\s*", regex.IGNORECASE), options={"skipFromTitle": True, "remove": True, "skipIfAlreadyFound": False})
parser.add_handler("site", regex.compile(r"^((?:www?[\.,])?[\w-]+\.[\w-]+(?:\.[\w-]+)*?)\s+-\s*", regex.IGNORECASE), options={"skipIfAlreadyFound": False})

# Languages
parser.add_handler("languages", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:su?$|sub\w*|dub\w*)\b|msub", regex.IGNORECASE), uniq_concat(value("multi subs")), {"skipIfAlreadyFound": False, "remove": True})
parser.add_handler("languages", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:lang(?:uages?)?|audio|VF2)?\b", regex.IGNORECASE), uniq_concat(value("multi audio")), {"skipIfAlreadyFound": False})
Expand Down Expand Up @@ -445,6 +434,7 @@ def infer_language_based_on_naming(context):
parser.add_handler("subbed", regex.compile(r"\bsub(s|bed)?\b", regex.IGNORECASE), boolean)

# Dubbed
parser.add_handler("dubbed", regex.compile(r"\b(fan\s?dub)\b", regex.IGNORECASE), boolean, {"remove": True, "skipFromTitle": True})
parser.add_handler("dubbed", regex.compile(r"\b(Fan.*)?(?:DUBBED|dublado|dubbing|DUBS?)\b", regex.IGNORECASE), boolean, {"remove": True})
parser.add_handler("dubbed", regex.compile(r"\b(?!.*\bsub(s|bed)?\b)([ _\-\[(\.])?(dual|multi)([ _\-\[(\.])?(audio)?\b", regex.IGNORECASE), boolean, {"remove": True})
parser.add_handler("dubbed", regex.compile(r"\b(JAP?(anese)?|ZH)\+ENG?(lish)?|ENG?(lish)?\+(JAP?(anese)?|ZH)\b", regex.IGNORECASE), boolean, {"remove": True})
Expand Down
14 changes: 8 additions & 6 deletions PTT/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
NOT_ONLY_NON_ENGLISH_REGEX = regex.compile(rf"(?<=[a-zA-Z][^{NON_ENGLISH_CHARS}]+)[{NON_ENGLISH_CHARS}].*[{NON_ENGLISH_CHARS}]|[{NON_ENGLISH_CHARS}].*[{NON_ENGLISH_CHARS}](?=[^{NON_ENGLISH_CHARS}]+[a-zA-Z])")
NOT_ALLOWED_SYMBOLS_AT_START_AND_END = regex.compile(rf"^[^\w{NON_ENGLISH_CHARS}#[【★]+|[ \-:/\\[|{{(#$&^]+$")
REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END = regex.compile(rf"^[^\w{NON_ENGLISH_CHARS}#]+|]$")
REDUNDANT_SYMBOLS_AT_END = regex.compile(r"[ \-:./\\]+$")
EMPTY_BRACKETS_REGEX = regex.compile(r"\(\s*\)|\[\s*\]|\{\s*\}")

DEBUG_HANDLER = False

Expand Down Expand Up @@ -121,17 +123,17 @@ def clean_title(raw_title: str) -> str:
cleaned_title = ALT_TITLES_REGEX.sub("", cleaned_title)
cleaned_title = NOT_ONLY_NON_ENGLISH_REGEX.sub("", cleaned_title)
cleaned_title = REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END.sub("", cleaned_title)
cleaned_title = EMPTY_BRACKETS_REGEX.sub("", cleaned_title)

# Remove brackets if only one is present
for brackets in BRACKETS:
only_one_bracket = not all(bracket in cleaned_title for bracket in brackets)
if only_one_bracket:
for bracket in brackets:
cleaned_title = cleaned_title.replace(bracket, "")
for open_bracket, close_bracket in BRACKETS:
if cleaned_title.count(open_bracket) != cleaned_title.count(close_bracket):
cleaned_title = cleaned_title.replace(open_bracket, "").replace(close_bracket, "")

if " " not in cleaned_title and "." in cleaned_title:
cleaned_title = regex.sub(r"\.", " ", cleaned_title)

cleaned_title = REDUNDANT_SYMBOLS_AT_END.sub("", cleaned_title)
cleaned_title = cleaned_title.strip()
return cleaned_title

Expand Down Expand Up @@ -211,7 +213,7 @@ def parse(self, title: str) -> Dict[str, Any]:

if remove:
title = title[:match_index] + title[match_index + len(raw_match) :]
if not skip_from_title and match_index and match_index < end_of_title:
if not skip_from_title and match_index and 1 < match_index < end_of_title:
end_of_title = match_index
if remove and skip_from_title and match_index < end_of_title:
end_of_title -= len(raw_match)
Expand Down
9 changes: 7 additions & 2 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def parser():
add_defaults(p)
return p


def test_parsed_output(parser):
test_case = "[Golumpa] Fairy Tail - 214 [FuniDub 720p x264 AAC] [5E46AC39]"
result = parser.parse(test_case)
Expand All @@ -20,6 +21,7 @@ def test_parsed_output(parser):
assert "codec" in result
assert "audio" in result


def test_basic_parsed(parser):
test_case = "The.Matrix.1999.1080p.BluRay.x264"
result = parser.parse(test_case)
Expand All @@ -30,6 +32,7 @@ def test_basic_parsed(parser):
assert result["quality"] == "BluRay"
assert result["codec"] == "x264"


def test_season_parser(parser):
test_cases = [
("Archer.S02.1080p.BluRay.DTSMA.AVC.Remux", [2]),
Expand All @@ -42,19 +45,21 @@ def test_season_parser(parser):
("Dragon Ball Z Movie - 09 - Bojack Unbound - 1080p BluRay x264 DTS 5.1 -DDR", []), # Correct. This should not match, its a movie.
("BoJack Horseman [06x01-08 of 16] (2019-2020) WEB-DLRip 720p", [6]),
("[HR] Boku no Hero Academia 87 (S4-24) [1080p HEVC Multi-Subs] HR-GZ", [4]),
("The Simpsons S28E21 720p HDTV x264-AVS", [28])
("The Simpsons S28E21 720p HDTV x264-AVS", [28]),
]

for test_case, expected in test_cases:
result = parser.parse(test_case)
assert isinstance(result, dict)
assert result["seasons"] == expected, f"Failed for {test_case}"


# Verify that parsing the sample release name yields the trailing bracketed
# code "5E46AC39" under the "episode_code" key of the result.
def test_episode_code(parser):
test_case = "[Golumpa] Fairy Tail - 214 [FuniDub 720p x264 AAC] [5E46AC39]"
result = parser.parse(test_case)
assert result["episode_code"] == "5E46AC39"


# def test_languages_parser(parser):
# test_cases = [
# ("Deadpool 2016 1080p BluRay DTS Rus Ukr 3xEng HDCL", ["ukrainian", "russian"]),
Expand All @@ -67,4 +72,4 @@ def test_episode_code(parser):
# for test_case, expected in test_cases:
# result = parser.parse(test_case)
# assert isinstance(result, dict)
# assert result["languages"] == expected
# assert result["languages"] == expected
29 changes: 16 additions & 13 deletions tests/test_site.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
import pytest

from PTT import parse
import PTT


@pytest.mark.parametrize("release_name, expected_site", [
("The.Expanse.S05E02.1080p.AMZN.WEB.DDP5.1.x264-NTb[eztv.re].mp4", "eztv.re"),
("www.1TamilBlasters.lat - Thuritham (2023) [Tamil - 2K QHD AVC UNTOUCHED - x264 - AAC - 3.4GB - ESub].mkv", "www.1TamilBlasters.lat"),
("www.1TamilMV.world - Raja Vikramarka (2024) Tamil HQ HDRip - 400MB - x264 - AAC - ESub.mkv", "www.1TamilMV.world"),
("Anatomia De Grey - Temporada 19 [HDTV][Cap.1905][Castellano][www.AtomoHD.nu].avi", "www.AtomoHD.nu"),
("[HD-ELITE.NET] - The.Art.Of.The.Steal.2014.DVDRip.XviD.Dual.Aud", "HD-ELITE.NET"),
("[ Torrent9.cz ] The.InBetween.S01E10.FiNAL.HDTV.XviD-EXTREME.avi", "Torrent9.cz"),
("Jurassic.World.Dominion.CUSTOM.EXTENDED.2022.2160p.MULTi.VF2.UHD.Blu-ray.REMUX.HDR.DoVi.HEVC.DTS-X.DTS-HDHRA.7.1-MOONLY.mkv", None),
("Last.Call.for.Istanbul.2023.1080p.NF.WEB-DL.DDP5.1.H.264.MKV.torrent", None),
])
@pytest.mark.parametrize(
"release_name, expected_site",
[
("The.Expanse.S05E02.1080p.AMZN.WEB.DDP5.1.x264-NTb[eztv.re].mp4", "eztv.re"),
("www.1TamilBlasters.lat - Thuritham (2023) [Tamil - 2K QHD AVC UNTOUCHED - x264 - AAC - 3.4GB - ESub].mkv", "www.1TamilBlasters.lat"),
("www.1TamilMV.world - Raja Vikramarka (2024) Tamil HQ HDRip - 400MB - x264 - AAC - ESub.mkv", "www.1TamilMV.world"),
("Anatomia De Grey - Temporada 19 [HDTV][Cap.1905][Castellano][www.AtomoHD.nu].avi", "www.AtomoHD.nu"),
("[HD-ELITE.NET] - The.Art.Of.The.Steal.2014.DVDRip.XviD.Dual.Aud", "HD-ELITE.NET"),
("[ Torrent9.cz ] The.InBetween.S01E10.FiNAL.HDTV.XviD-EXTREME.avi", "Torrent9.cz"),
("Jurassic.World.Dominion.CUSTOM.EXTENDED.2022.2160p.MULTi.VF2.UHD.Blu-ray.REMUX.HDR.DoVi.HEVC.DTS-X.DTS-HDHRA.7.1-MOONLY.mkv", None),
("Last.Call.for.Istanbul.2023.1080p.NF.WEB-DL.DDP5.1.H.264.MKV.torrent", None),
],
)
def test_group_detection(release_name, expected_site):
result = parse(release_name)
result = PTT.parse_title(release_name)
if expected_site:
assert result.get("site") == expected_site, f"Incorrect site detected for {release_name}"
else:
assert "site" not in result, f"Incorrectly detected site for {release_name}"
assert "site" not in result, f"Incorrectly detected site for {release_name}"
4 changes: 2 additions & 2 deletions tests/test_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
],
)
def test_group_detection(release_name, expected_size):
result = PTT.parse(release_name)
result = PTT.parse_title(release_name)
if expected_size:
assert result.get("size") == expected_size, f"Incorrect site detected for {release_name}"
else:
assert "size" not in result, f"Incorrectly detected size for {release_name}"
assert "size" not in result, f"Incorrectly detected size for {release_name}"
Loading

0 comments on commit 4ec4a0a

Please sign in to comment.