Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Subtitle text guesser (resolves #287) #288

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions mnamer/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,13 @@ class MnamerNetworkException(MnamerException):

class MnamerNotFoundException(MnamerException):
"""Raised when a lookup or search works as expected yet yields no results."""


class MnamerFailedLangGuesserInstantiation(MnamerException):
    """
    Raised when a requested text language guesser failed to instantiate.

    The guesser name was recognised, but constructing the guesser object
    raised an error.
    """


class MnamerNoSuchLangGuesser(MnamerException):
    """
    Raised when a requested text language guesser name does not match any
    known guessers.
    """
17 changes: 17 additions & 0 deletions mnamer/setting_store.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import dataclasses
import json
from functools import cached_property
from pathlib import Path
from typing import Any, Callable

Expand All @@ -11,6 +12,7 @@
from mnamer.setting_spec import SettingSpec
from mnamer.types import MediaType, ProviderType, SettingType
from mnamer.utils import crawl_out, json_loads, normalize_containers
from mnamer import text_lang_guesser


@dataclasses.dataclass
Expand Down Expand Up @@ -106,6 +108,15 @@ class SettingStore:
help="--language=<LANG>: specify the search language",
).as_dict(),
)
# Name of the text language guesser used for subtitle files, or None to
# disable guessing. This is a guesser *name* (one of the keys of
# text_lang_guesser.available_guessers), not a Language, hence `str | None`.
subtitle_lang_guesser: str | None = dataclasses.field(
    default=None,
    metadata=SettingSpec(
        flags=["--subtitle-lang-guesser"],
        group=SettingType.PARAMETER,
        # Only guessers whose backing library could be imported are offered.
        choices=list(text_lang_guesser.available_guessers),
        help="--subtitle-lang-guesser=<GUESSER>: subtitle file text language guesser (must be installed)",
    ).as_dict(),
)
mask: list[str] = dataclasses.field(
default_factory=lambda: [
"avi",
Expand Down Expand Up @@ -367,6 +378,12 @@ def specifications(cls) -> list[SettingSpec]:
def _resolve_path(path: str | Path) -> Path:
    """Return ``path`` converted to a ``Path`` and passed through ``Path.resolve()``."""
    return Path(path).resolve()

@cached_property
def text_lang_guesser(self):
    """
    Return the configured subtitle text language guesser, or None.

    The guesser is only built when the ``subtitle_lang_guesser`` setting is
    set; it is created for every language returned by ``Language.all()``.
    Being a cached property, the result is computed once per instance.
    """
    guesser_name = self.subtitle_lang_guesser
    return (
        text_lang_guesser.guesser(guesser_name, Language.all())
        if guesser_name
        else None
    )

def __setattr__(self, key: str, value: Any):
converter_map: dict[str, Callable] = {
"episode_api": ProviderType,
Expand Down
14 changes: 13 additions & 1 deletion mnamer/target.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ def destination(self) -> Path:

def _parse(self, file_path: Path):
path_data: dict[str, Any] = {"language": self._settings.language}
if is_subtitle(self.source):
source_is_subtitle = is_subtitle(self.source)
if source_is_subtitle:
try:
path_data["language"] = Language.parse(self.source.stem[-2:])
file_path = Path(self.source.parent, self.source.stem[:-2])
Expand Down Expand Up @@ -176,6 +177,17 @@ def _parse(self, file_path: Path):
self.metadata.language_sub = path_data.get("subtitle_language")
except MnamerException:
pass
if (
source_is_subtitle
and not self.metadata.language_sub
and self._settings.subtitle_lang_guesser
):
try:
self.metadata.language_sub = (
self._settings.text_lang_guesser.guess_language(self.source)
)
except MnamerException:
pass
if isinstance(self.metadata, MetadataMovie):
self.metadata.name = path_data.get("title")
self.metadata.year = path_data.get("year")
Expand Down
54 changes: 54 additions & 0 deletions mnamer/text_lang_guesser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import logging
from typing import Dict
from mnamer.exceptions import (
MnamerFailedLangGuesserInstantiation,
MnamerNoSuchLangGuesser,
)
from mnamer.language import Language
from importlib import import_module


def _import_module(dotted_module_name: str):
try:
return import_module(dotted_module_name)
except ImportError as e:
logging.debug(f"Failed to import {dotted_module_name}: {e}", exc_info=e)
return None


# Every guesser mnamer knows about: (setting name, dotted path to its class).
possible_guessers = (
    ("lingua", "mnamer.text_lang_guesser.lingua.LinguaGuesser"),
    ("langdetect", "mnamer.text_lang_guesser.langdetect.LangdetectGuesser"),
    ("fasttext", "mnamer.text_lang_guesser.fasttext.FasttextGuesser"),
    ("langid", "mnamer.text_lang_guesser.langid.LangidGuesser"),
)

# Guessers that could actually be loaded, keyed by setting name. A guesser
# whose module fails to import, or whose class is missing, is left out.
available_guessers = {}
for name, module_class in possible_guessers:
    module_name, classname = module_class.rsplit(".", maxsplit=1)
    mod = _import_module(module_name)
    if not mod:
        continue
    try:
        cls = getattr(mod, classname)
    except AttributeError as e:
        logging.debug(
            f"Failed to load class {classname} from module {mod}: {e}", exc_info=e
        )
    else:
        available_guessers[name] = cls


def guesser(name: str, guess_languages: list[Language]):
    """
    Instantiate the text language guesser registered under ``name``.

    :param name: guesser name; must be a key of ``available_guessers``.
    :param guess_languages: languages passed to the guesser's constructor
        as its ``guess_languages`` argument.
    :return: the new guesser instance.
    :raises MnamerNoSuchLangGuesser: ``name`` is not an available guesser.
    :raises MnamerFailedLangGuesserInstantiation: the guesser's constructor
        raised; the original error is chained as ``__cause__``.
    """
    if name not in available_guessers:
        raise MnamerNoSuchLangGuesser("Unrecognized language guesser")
    guesser_class = available_guessers[name]
    try:
        return guesser_class(guess_languages=guess_languages)
    except Exception as e:
        class_name = guesser_class.__name__
        logging.debug(
            f"Error trying to instantiate {class_name}",
            exc_info=e,
        )
        # Chain the cause so the underlying failure is not lost.
        raise MnamerFailedLangGuesserInstantiation(
            f"Failed creating guesser {class_name}"
        ) from e
174 changes: 174 additions & 0 deletions mnamer/text_lang_guesser/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
from abc import ABC, abstractmethod
from pathlib import Path
import logging
import os
import re
from typing import List, Optional
from chardet.universaldetector import UniversalDetector
from mnamer.language import Language


class TextLanguageGuesser(ABC):
    """
    Base class for guessing the language of a text (subtitle) file.

    This class detects the file encoding, samples text lines from the file
    and maps the guessed language code to a Language. Subclasses supply the
    actual identification through _initialize_identifier() and
    guess_language_from_text().
    """

    def __init__(self, guess_languages: List[Language], min_probability: float = 0.9):
        """
        :param guess_languages: languages a guess can be mapped to.
        :param min_probability: probability threshold stored on the instance
            for subclasses to apply to their guesses.
        """
        self.guess_languages = guess_languages
        # The language map is built before the identifier so that
        # _initialize_identifier() implementations can rely on it.
        self.language_map = self._language_map(guess_languages)
        self.min_probability = min_probability
        self.identifier = self._initialize_identifier()

        exp_only_nums = r"^\d+$"
        exp_timeframe = r"^[\s0-9:.,>-]+$"
        skip_patterns = [exp_only_nums, exp_timeframe]
        # The same patterns are compiled twice: once for decoded text and
        # once for the raw bytes fed to the encoding detector.
        self.skip_line_expressions_str = [re.compile(exp) for exp in skip_patterns]
        self.skip_line_expressions_bytes = [
            re.compile(exp.encode("ascii")) for exp in skip_patterns
        ]
        self.encoding_detector = UniversalDetector()

    @abstractmethod
    def guess_language_from_text(self, text: str) -> Optional[str]:
        """
        Guess the language, based on the text in the file.

        :return: a key of self.language_map, or None when there is no guess.
        """
        pass

    def _language_map(self, lang_list: List[Language]):
        """
        Returns a dict that will be used to map an identification result to a Language.

        By default the key is each language's two-letter code (``a2``).
        """
        return {lang.a2: lang for lang in lang_list}

    @abstractmethod
    def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None):
        """
        Set up the language identifier, and return it.
        It will be available in self.identifier.

        If restrict_to_langs is present, the identifier should restrict
        its identification efforts to the given languages.

        Note that restricting the languages used is usually not a good idea
        because it increases the possibility of false positives.

        :param restrict_to_langs: a list of two-letter language codes.
        """
        pass

    def _skip_line(self, line, skip_expressions) -> bool:
        """
        Tell whether a line carries no language information.

        :param line: a str or bytes line; must match the type the
            expressions were compiled for.
        :param skip_expressions: compiled patterns of lines to ignore.
        :return: True for blank lines and lines matching any pattern.
        """
        stripped = line.strip()
        if not stripped:
            return True
        return any(exp.match(stripped) for exp in skip_expressions)

    def _detect_file_encoding(self, filepath) -> dict:
        """
        Tries to guess the encoding (utf-8, iso-8859-1, etc).

        The returned dict has these fields of interest:
        {
            "encoding": str,
            "confidence": float between 0 and 1
        }
        """
        self.encoding_detector.reset()
        # The context manager closes the file even when detection stops early.
        with open(filepath, "rb") as raw_file:
            for line in raw_file:
                if self._skip_line(line, self.skip_line_expressions_bytes):
                    continue
                self.encoding_detector.feed(line)
                if self.encoding_detector.done:
                    break
        self.encoding_detector.close()

        result = dict(self.encoding_detector.result)
        # ASCII is a subset of UTF-8; reading as UTF-8 also copes with
        # non-ASCII characters beyond the sampled part of the file.
        if result["encoding"] == "ascii":
            result["encoding"] = "utf-8"
        return result

    def _read_lines_from_file(
        self, filepath, encoding: str, lines=200, skip_first_lines=10
    ) -> str:
        """
        Read a certain number of lines from the file, returning a unicode string.

        Lines that are subtitle control lines (only numbers, or time ranges)
        are filtered out, and do not count towards the number of lines.

        By default, the 10 first lines are skipped. The reasoning behind
        that is that perhaps the first lines contain subtitle credits
        (e.g. a little advertisement for the subtitle creator), which may
        not correspond to the principal language of the file.

        :param lines: maximum number of text lines to return.
        :param skip_first_lines: number of leading text lines to discard.
        :return: at most ``lines`` text lines joined into one string.
        """
        if lines <= 0:
            return ""
        stop_count = skip_first_lines + lines
        kept = []
        content_count = 0
        with open(filepath, mode="r", encoding=encoding) as text_file:
            for line in text_file:
                if self._skip_line(line, self.skip_line_expressions_str):
                    continue

                content_count += 1
                if content_count <= skip_first_lines:
                    continue

                kept.append(line)
                # Stop once exactly `lines` lines have been kept.
                if content_count >= stop_count:
                    break
        return "".join(kept)

    def _get_file_text(self, filepath) -> Optional[str]:
        """
        Tries to determine the file encoding and read some lines from the file.

        If the confidence for the encoding is not high enough, or an error
        occurs while reading the file, the return value is None.
        """
        try:
            encoding = self._detect_file_encoding(filepath)
        except OSError as e:
            logging.warning(
                f"Unable to read file {filepath} to detect its encoding. Error: {e}"
            )
            return None

        text = None
        if encoding["confidence"] >= 0.6:
            try:
                text = self._read_lines_from_file(
                    filepath, encoding=encoding["encoding"]
                )
            except Exception as e:
                # Deliberately broad: guessing is best effort, so a decoding
                # or I/O problem must not abort the caller.
                logging.warning(
                    f"Unable to read file {filepath} with encoding {encoding['encoding']}. "
                    f"Error: {e}"
                )
        return text

    @staticmethod
    def boolean_env_var(env_var, default=None) -> Optional[bool]:
        """
        Read an environment variable as a boolean.

        :param env_var: name of the environment variable.
        :param default: value returned when the variable is not set.
        :return: True for "true", "yes" or "1" (case-insensitive, surrounding
            whitespace ignored), False for any other value that is set.
        """
        value = os.getenv(env_var)
        if value is None:
            return default
        return value.strip().lower() in ("true", "yes", "1")

    def guess_language(self, filepath: Path) -> Optional[Language]:
        """
        Reads text from the file and passes it the implementation-specific
        guess_language_from_text() method.

        If a matching mnamer.Language exists, it is returned, otherwise None.
        """
        text = self._get_file_text(filepath)

        if not text:
            return None

        guessed_language = None
        try:
            guessed_language = self.guess_language_from_text(text)
        except Exception as e:
            # Best effort: a failing backend results in "no guess".
            logging.warning(
                "Unexpected error while guessing language from file text. "
                f"File: {filepath}, Error: {e}"
            )

        if not guessed_language:
            return None

        return self.language_map.get(guessed_language, None)
35 changes: 35 additions & 0 deletions mnamer/text_lang_guesser/fasttext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from typing import Optional, Dict, Union
from ftlangdetect.detect import get_or_load_model
from mnamer.text_lang_guesser.base import TextLanguageGuesser


class FasttextGuesser(TextLanguageGuesser):
    """
    Text language guesser that uses the model loaded through ftlangdetect.

    Installation note: a modern g++ version is required for building fasttext.
    """

    def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None):
        """
        Load and return the model used for prediction.

        The FASTTEXT_LOW_MEMORY environment variable (default: off) is passed
        on as the ``low_memory`` argument. ``restrict_to_langs`` is ignored.
        """
        # Note: It seems there is no way to restrict languages for fasttext.
        return get_or_load_model(
            low_memory=self.boolean_env_var("FASTTEXT_LOW_MEMORY", False)
        )

    def detect(self, text: str) -> Optional[Dict[str, Union[str, float]]]:
        """
        Modified version of ftlangdetect.detect.detect, that specifies the threshold.

        :return: ``{"lang": ..., "score": ...}`` for the first predicted
            label, or None when no label reaches ``self.min_probability``.
        """
        labels, scores = self.identifier.predict(text, threshold=self.min_probability)
        if labels:
            return {
                "lang": labels[0].replace("__label__", ""),
                # Scores are capped at 1.0.
                "score": min(float(scores[0]), 1.0),
            }
        return None

    def guess_language_from_text(self, text: str) -> Optional[str]:
        """Return the detected language label for ``text``, or None."""
        # The text is flattened to a single line before prediction.
        single_line = text.replace("\n", " ").replace("\r", "")
        detection = self.detect(single_line)
        return detection["lang"] if detection else None
47 changes: 47 additions & 0 deletions mnamer/text_lang_guesser/langdetect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import logging
from pathlib import Path
from typing import Optional, List
from langdetect.detector_factory import DetectorFactory, PROFILES_DIRECTORY
from mnamer.language import Language
from mnamer.text_lang_guesser.base import TextLanguageGuesser


class LangdetectGuesser(TextLanguageGuesser):
    """Text language guesser backed by the langdetect library."""

    def _language_map(self, lang_list: List[Language]):
        """
        Build the result-to-Language map, adapted to langdetect's codes.

        The base map is keyed by two-letter code; its "zh" entry, if any, is
        replaced by "zh-cn" and "zh-tw" entries pointing to the same Language.
        """
        lang_map = super()._language_map(lang_list)
        zh = lang_map.pop("zh", None)
        if zh:
            # lang-detect has zh-cn and zh-tw. Map them both to mnamer's zh.
            lang_map["zh-cn"] = zh
            lang_map["zh-tw"] = zh
        return lang_map

    def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None):
        """
        Create the DetectorFactory and load its language profiles.

        :param restrict_to_langs: optional two-letter codes; when given, only
            the profiles of mapped languages in this list are loaded,
            otherwise every profile in PROFILES_DIRECTORY is loaded.
        :return: the configured DetectorFactory.
        """
        # Be deterministic. Without this, langdetect could guess different
        # languages for the same short text.
        DetectorFactory.seed = 0

        identifier = DetectorFactory()
        if not restrict_to_langs:
            identifier.load_profile(PROFILES_DIRECTORY)
            return identifier

        wanted = set(restrict_to_langs)
        profiles_root = Path(PROFILES_DIRECTORY)
        json_profiles = []
        # Iterate the map rather than the raw codes: its keys are the
        # identifier-side names (e.g. "zh-cn"/"zh-tw" for "zh"), which are
        # used here as the profile file names.
        for lang, language in self.language_map.items():
            if lang not in wanted and language.a2 not in wanted:
                continue
            profile = profiles_root / lang
            if profile.is_file():
                json_profiles.append(profile.read_text(encoding="utf-8"))
            else:
                logging.warning(f"Language profile not found for language '{lang}'")
        identifier.load_json_profile(json_profiles)
        return identifier

    def guess_language_from_text(self, text: str) -> Optional[str]:
        """
        Return the code of the first reported language, or None.

        None is returned when nothing is reported or when the first
        candidate's probability is below ``self.min_probability``.
        """
        detector = self.identifier.create()
        detector.append(text)
        guessed_languages = detector.get_probabilities()
        if not guessed_languages:
            return None
        # NOTE(review): assumes get_probabilities() lists the most probable
        # language first - confirm against langdetect.
        lang = guessed_languages[0]
        if lang.prob >= self.min_probability:
            return lang.lang
        return None
Loading