Skip to content

Commit

Permalink
Improve codec and site handlers
Browse files Browse the repository at this point in the history
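- Remove unmatched brackets (a bracket whose partner is missing) from the cleaned title
- Strip leading and trailing whitespace from all parsed string values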
  • Loading branch information
davidemarcoli committed Jul 19, 2024
1 parent 97e9639 commit 30e4ae5
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 7 deletions.
12 changes: 6 additions & 6 deletions PTT/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def handle_bit_depth(context):

# Codec
parser.add_handler("codec", regex.compile(r"\b[xh][\. \-]?264\b", regex.IGNORECASE), lowercase, {"remove": True})
parser.add_handler("codec", regex.compile(r"\bHEVC10(bit)?|[xh][\. \-]?265\b", regex.IGNORECASE), value("x265"), {"remove": True})
parser.add_handler("codec", regex.compile(r"\bHEVC10(bit)?\b|\b[xh][\. \-]?265\b", regex.IGNORECASE), value("x265"), {"remove": True})
parser.add_handler("codec", regex.compile(r"\bhevc(?:\s?10)?\b", regex.IGNORECASE), value("x265"), {"remove": True, "skipIfAlreadyFound": False})
parser.add_handler("codec", regex.compile(r"\b(?:dvix|mpeg2|divx|xvid|avc|av1)\b", regex.IGNORECASE), lowercase, {"remove": True, "skipIfAlreadyFound": False})
def handle_space_in_codec(context):
Expand Down Expand Up @@ -264,8 +264,8 @@ def handle_episodes(context):
parser.add_handler("episodes", handle_episodes)

# Handle the site before languages so that a domain name's country code is removed before language detection runs.
parser.add_handler("site", regex.compile(r"^(www\.[\w-]+\.[\w-]+(?:\.[\w-]+)?)\s+-\s*", regex.IGNORECASE), options={"skipFromTitle": True, "remove": True, "skipIfAlreadyFound": False})
parser.add_handler("site", regex.compile(r"^((?:www\.)?[\w-]+\.[\w-]+(?:\.[\w-]+)*?)\s+-\s*", regex.IGNORECASE), options={"skipIfAlreadyFound": False})
parser.add_handler("site", regex.compile(r"^(www?[\.,][\w-]+\.[\w-]+(?:\.[\w-]+)?)\s+-\s*", regex.IGNORECASE), options={"skipFromTitle": True, "remove": True, "skipIfAlreadyFound": False})
parser.add_handler("site", regex.compile(r"^((?:www?[\.,])?[\w-]+\.[\w-]+(?:\.[\w-]+)*?)\s+-\s*", regex.IGNORECASE), options={"skipIfAlreadyFound": False})

# Languages
parser.add_handler("languages", regex.compile(r"\bmulti(?:ple)?[ .-]*(?:su?$|sub\w*|dub\w*)\b|msub", regex.IGNORECASE), uniq_concat(value("multi subs")), {"skipIfAlreadyFound": False, "remove": True})
Expand Down Expand Up @@ -385,7 +385,7 @@ def infer_language_based_on_naming(context):
regex.search(r"dublado", title, regex.IGNORECASE):
result["languages"] = result.get("languages", []) + ["portuguese"]

return {"match_index": 0}
return None
parser.add_handler("languages", infer_language_based_on_naming)

# Subbed
Expand All @@ -399,7 +399,7 @@ def handle_dubbed(context):
result = context["result"]
if "languages" in result and any(lang in ["multi audio", "dual audio"] for lang in result["languages"]):
result["dubbed"] = True
return {"match_index": 0}
return None
parser.add_handler("dubbed", handle_dubbed)

# Group
Expand All @@ -415,7 +415,7 @@ def handle_group(context):
# Check if there's any overlap with other matched elements
if any(key != "group" and matched[key]["match_index"] < end_index for key in matched if "match_index" in matched[key]) and "group" in result:
del result["group"]
return {"match_index": 0}
return None

parser.add_handler("group", handle_group)

Expand Down
19 changes: 18 additions & 1 deletion PTT/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,18 @@
"\u0e00-\u0e7f" # Thai characters
)

CURLY_BRACKETS = ["{", "}"]
SQUARE_BRACKETS = ["[", "]"]
PARENTHESES = ["(", ")"]
BRACKETS = [CURLY_BRACKETS, SQUARE_BRACKETS, PARENTHESES]

RUSSIAN_CAST_REGEX = regex.compile(r"\([^)]*[\u0400-\u04ff][^)]*\)$|(?<=\/.*)\(.*\)$")
ALT_TITLES_REGEX = regex.compile(rf"[^/|(]*[{NON_ENGLISH_CHARS}][^/|]*[/|]|[/|][^/|(]*[{NON_ENGLISH_CHARS}][^/|]*")
NOT_ONLY_NON_ENGLISH_REGEX = regex.compile(rf"(?<=[a-zA-Z][^{NON_ENGLISH_CHARS}]+)[{NON_ENGLISH_CHARS}].*[{NON_ENGLISH_CHARS}]|[{NON_ENGLISH_CHARS}].*[{NON_ENGLISH_CHARS}](?=[^{NON_ENGLISH_CHARS}]+[a-zA-Z])")
NOT_ALLOWED_SYMBOLS_AT_START_AND_END = regex.compile(rf"^[^\w{NON_ENGLISH_CHARS}#[【★]+|[ \-:/\\[|{{(#$&^]+$")
REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END = regex.compile(rf"^[^\w{NON_ENGLISH_CHARS}#]+|]$")

DEBUG_HANDLER = "seasons"
DEBUG_HANDLER = False


def extend_options(options: Dict[str, Any] = None) -> Dict[str, Any]:
Expand Down Expand Up @@ -75,6 +80,8 @@ def handler(context: Dict[str, Any]) -> Union[Dict[str, Any], None]:
sig = inspect.signature(transformer)
param_count = len(sig.parameters)
transformed = transformer(clean_match or raw_match, *([result.get(name)] if param_count > 1 else []))
if type(transformed) is str:
transformed = transformed.strip()

before_title_match = regex.match(r"^\[([^[\]]+)]", title)
is_before_title = before_title_match is not None and raw_match in before_title_match.group(1)
Expand Down Expand Up @@ -115,6 +122,13 @@ def clean_title(raw_title: str) -> str:
cleaned_title = NOT_ONLY_NON_ENGLISH_REGEX.sub("", cleaned_title)
cleaned_title = REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END.sub("", cleaned_title)

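# Remove unmatched brackets: for each bracket pair, if the title does not contain both the opening and the closing character, strip whichever one is present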
for brackets in BRACKETS:
only_one_bracket = not all(bracket in cleaned_title for bracket in brackets)
if only_one_bracket:
for bracket in brackets:
cleaned_title = cleaned_title.replace(bracket, "")

if " " not in cleaned_title and "." in cleaned_title:
cleaned_title = regex.sub(r"\.", " ", cleaned_title)

Expand Down Expand Up @@ -184,6 +198,9 @@ def parse(self, title: str) -> Dict[str, Any]:
for handler in self.handlers:
match_result = handler({"title": title, "result": result, "matched": matched})

if DEBUG_HANDLER is True or (type(DEBUG_HANDLER) is str and DEBUG_HANDLER in handler.handler_name):
print(handler.handler_name, match_result, title)

if match_result is None:
continue

Expand Down

0 comments on commit 30e4ae5

Please sign in to comment.