# SettingStore dataclass field.
#
# Holds the *name* of the text language guesser used for subtitle files, or
# None when guessing is disabled. The accepted names are the keys of
# text_lang_guesser.available_guessers, i.e. plain strings, and the value is
# passed as the `name` argument of text_lang_guesser.guesser(). It is
# therefore annotated as `str | None` rather than `Language | None`.
subtitle_lang_guesser: str | None = dataclasses.field(
    default=None,
    metadata=SettingSpec(
        flags=["--subtitle-lang-guesser"],
        group=SettingType.PARAMETER,
        # Only guessers whose backing package could be imported are offered.
        choices=list(text_lang_guesser.available_guessers),
        help="--subtitle-lang-guesser=: subtitle file text language guesser (must be installed)",
    ).as_dict(),
)
def _import_module(dotted_module_name: str):
    """
    Import a module by its dotted name without propagating import failures.

    :param dotted_module_name: fully qualified module name to import.
    :return: the imported module, or None when an ImportError occurred
        (the failure is logged at debug level).
    """
    loaded = None
    try:
        loaded = import_module(dotted_module_name)
    except ImportError as import_error:
        message = f"Failed to import {dotted_module_name}: {import_error}"
        logging.debug(message, exc_info=import_error)
    return loaded
class TextLanguageGuesser(ABC):
    """
    Base class for guessing the language of a subtitle file from its text.

    Subclasses provide the identification backend by implementing
    _initialize_identifier() and guess_language_from_text(). This class takes
    care of detecting the file encoding, reading a sample of the text lines
    and mapping the backend's answer to a Language through self.language_map.
    """

    def __init__(self, guess_languages: "List[Language]", min_probability: float = 0.9):
        """
        :param guess_languages: the languages a guess can be mapped to.
        :param min_probability: threshold stored as self.min_probability for
            use by the subclass implementation.
        """
        self.guess_languages = guess_languages
        # The map and the threshold are assigned before the identifier is
        # built so that _initialize_identifier() implementations can use them.
        self.language_map = self._language_map(guess_languages)
        self.min_probability = min_probability
        self.identifier = self._initialize_identifier()

        exp_only_nums = r"^\d+$"
        exp_timeframe = r"^[\s0-9:.,>-]+$"
        skip_patterns = [exp_only_nums, exp_timeframe]
        # The same patterns are compiled twice: for decoded text and for the
        # raw bytes that are fed to the encoding detector.
        self.skip_line_expressions_str = [re.compile(exp) for exp in skip_patterns]
        self.skip_line_expressions_bytes = [
            re.compile(exp.encode("ascii")) for exp in skip_patterns
        ]
        self.encoding_detector = UniversalDetector()

    @abstractmethod
    def guess_language_from_text(self, text: str) -> Optional[str]:
        """
        Guess the language, based on the text in the file.

        :return: a key of self.language_map, or None when there is no guess.
        """

    def _language_map(self, lang_list: "List[Language]"):
        """
        Returns a dict that will be used to map an identification result to a Language.

        The default implementation keys each language by its `a2` attribute.
        """
        return {lang.a2: lang for lang in lang_list}

    @abstractmethod
    def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None):
        """
        Set up the language identifier, and return it.
        It will be available in self.identifier.

        If restrict_to_langs is present, the identifier should restrict
        its identification efforts to the given languages.

        Note that restricting the languages used is usually not a good idea
        because it increases the possibility of false positives.

        :param restrict_to_langs: a list of two-letter language codes.
        """

    def _skip_line(self, line, skip_expressions) -> bool:
        """
        Tell whether a line carries no language information.

        :param line: the line, as str or bytes (matching skip_expressions).
        :param skip_expressions: compiled patterns; a line is skipped when any
            of them matches its stripped content.
        :return: True for blank lines and lines matching a skip expression.
        """
        stripped = line.strip()
        if not stripped:
            return True
        return any(exp.match(stripped) for exp in skip_expressions)

    def _detect_file_encoding(self, filepath) -> dict:
        """
        Tries to guess the encoding (utf-8, iso-8859-1, etc).

        The returned dict has these fields of interest:
        {
            "encoding": str,
            "confidence": float between 0 and 1
        }

        A detected "ascii" encoding is reported as "utf-8".
        """
        detector = self.encoding_detector
        detector.reset()
        # The context manager closes the file even when detection stops early
        # or feeding the detector raises.
        with open(filepath, "rb") as handle:
            for line in handle:
                if self._skip_line(line, self.skip_line_expressions_bytes):
                    continue
                detector.feed(line)
                if detector.done:
                    break
        detector.close()

        result = dict(detector.result)
        if result["encoding"] == "ascii":
            result["encoding"] = "utf-8"
        return result

    def _read_lines_from_file(
        self, filepath, encoding: str, lines=200, skip_first_lines=10
    ) -> str:
        """
        Read a certain number of lines from the file, returning a unicode string.

        Lines that are subtitle control lines (only numbers, or time ranges)
        are filtered out, and do not count towards the number of lines.

        By default, the 10 first lines are skipped. The reasoning behind
        that is that perhaps the first lines contain subtitle credits
        (e.g. a little advertisement for the subtitle creator), which may
        not correspond to the principal language of the file.

        :param lines: maximum number of text lines to return; at most this
            many are returned, and none when it is zero or negative.
        :param skip_first_lines: number of leading text lines to discard.
        """
        if lines <= 0:
            return ""

        collected = []
        text_lines_seen = 0
        with open(filepath, mode="r", encoding=encoding) as handle:
            for line in handle:
                if self._skip_line(line, self.skip_line_expressions_str):
                    continue

                text_lines_seen += 1
                if text_lines_seen <= skip_first_lines:
                    continue

                collected.append(line)
                # Stop as soon as the quota is met, so exactly `lines` lines
                # are returned rather than one more.
                if len(collected) >= lines:
                    break
        return "".join(collected)

    def _get_file_text(self, filepath) -> Optional[str]:
        """
        Tries to determine the file encoding and read some lines from the file.

        If no encoding was detected, the confidence for the encoding is not
        high enough, or an error occurs while reading lines from the file,
        the return value is None.
        """
        detected = self._detect_file_encoding(filepath)
        encoding_name = detected.get("encoding")
        confidence = detected.get("confidence") or 0.0
        # Without an encoding name open() would silently fall back to the
        # platform default, so treat that case as "not confident".
        if not encoding_name or confidence < 0.6:
            return None

        try:
            return self._read_lines_from_file(filepath, encoding=encoding_name)
        except (OSError, UnicodeError, LookupError) as e:
            # OSError: the file cannot be opened or read.
            # UnicodeError: the content does not decode with the encoding.
            # LookupError: the encoding name is unknown to Python.
            logging.warning(
                f"Unable to read file {filepath} with encoding {encoding_name}. "
                f"Error: {e}"
            )
            return None

    @staticmethod
    def boolean_env_var(env_var, default=None) -> Optional[bool]:
        """
        Read an environment variable as a boolean.

        :param env_var: name of the environment variable.
        :param default: value returned when the variable is not set.
        :return: True when the value, stripped and lower-cased, is one of
            "true", "yes" or "1"; False for any other value; `default` when
            the variable is not set.
        """
        value = os.getenv(env_var)
        if value is None:
            return default
        return value.strip().lower() in ("true", "yes", "1")

    def guess_language(self, filepath: Path) -> "Optional[Language]":
        """
        Reads text from the file and passes it the implementation-specific
        guess_language_from_text() method.

        If a matching mnamer.Language exists, it is returned, otherwise None.
        """
        text = self._get_file_text(filepath)
        if not text:
            return None

        guessed_language = None
        try:
            guessed_language = self.guess_language_from_text(text)
        except Exception as e:
            # Deliberately broad: guessing is best-effort and the exception
            # types depend on the backend used by the subclass.
            logging.warning(
                "Unexpected error while guessing language from file text. "
                f"File: {filepath}, Error: {e}"
            )

        if not guessed_language:
            return None

        return self.language_map.get(guessed_language, None)
class LangdetectGuesser(TextLanguageGuesser):
    """
    Text language guesser backed by the langdetect package.
    """

    def _language_map(self, lang_list: List[Language]):
        """
        Returns the base language map, with "zh" replaced by the two Chinese
        codes that langdetect reports.
        """
        lang_map = super()._language_map(lang_list)
        zh = lang_map.pop("zh", None)
        if zh:
            # lang-detect has zh-cn and zh-tw. Map them both to mnamer's zh.
            lang_map["zh-cn"] = zh
            lang_map["zh-tw"] = zh
        return lang_map

    def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None):
        """
        Build the langdetect DetectorFactory.

        Without restrict_to_langs every profile in PROFILES_DIRECTORY is
        loaded. With it, only the profiles of the requested languages are
        loaded; a requested language without a profile file is logged and
        skipped.

        :param restrict_to_langs: a list of two-letter language codes.
        """
        # Be deterministic. Without this, langdetect could guess different
        # languages for the same short text.
        DetectorFactory.seed = 0

        identifier = DetectorFactory()
        if not restrict_to_langs:
            identifier.load_profile(PROFILES_DIRECTORY)
            return identifier

        # Honour the requested codes instead of loading every mapped
        # language. "zh" is expanded because langdetect has zh-cn and zh-tw.
        profile_names = []
        for code in restrict_to_langs:
            profile_names.extend(("zh-cn", "zh-tw") if code == "zh" else (code,))

        profiles_root = Path(PROFILES_DIRECTORY)
        json_profiles = []
        for lang in profile_names:
            profile = profiles_root / lang
            if profile.is_file():
                json_profiles.append(profile.read_text(encoding="utf-8"))
            else:
                logging.warning(f"Language profile not found for language '{lang}'")
        identifier.load_json_profile(json_profiles)
        return identifier

    def guess_language_from_text(self, text: str) -> Optional[str]:
        """
        Return the code of the first language reported by langdetect when its
        probability reaches self.min_probability, otherwise None.
        """
        detector = self.identifier.create()
        detector.append(text)
        guessed_languages = detector.get_probabilities()
        if not guessed_languages:
            return None
        best = guessed_languages[0]
        if best.prob < self.min_probability:
            return None
        return best.lang
class LinguaGuesser(TextLanguageGuesser):
    """
    Text language guesser backed by the lingua package.

    Unlike the base implementation, the language map is keyed by lingua
    Language objects, which is also what guess_language_from_text() returns.
    """

    def _language_map(self, lang_list: List[Language]):
        """
        Returns a dict that will be used to map an identification result to a Language.

        A lingua language is paired with the mnamer language whose
        upper-cased name equals the lingua language's name.
        """
        upcase_map = {lang.name.upper(): lang for lang in lang_list}

        return {
            lingua_lang: upcase_map[lingua_lang.name]
            for lingua_lang in LinguaLanguage.all()
            if lingua_lang.name in upcase_map
        }

    def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None):
        """
        Build the lingua language detector.

        :param restrict_to_langs: a list of two-letter language codes. When
            given, only mapped languages whose mnamer `a2` code is in the
            list are used; otherwise all lingua languages are used.
        """
        if restrict_to_langs:
            wanted_codes = set(restrict_to_langs)
            # Honour the requested codes rather than using every mapped language.
            language_list = [
                lingua_lang
                for lingua_lang, mnamer_lang in self.language_map.items()
                if mnamer_lang.a2 in wanted_codes
            ]
        else:
            language_list = LinguaLanguage.all()

        # NOTE(review): min_probability is passed as lingua's minimum relative
        # distance, which is a different measure — confirm the threshold is intended.
        return (
            LanguageDetectorBuilder.from_languages(*language_list)
            .with_minimum_relative_distance(self.min_probability)
            .build()
        )

    def guess_language_from_text(self, text: str) -> Optional[LinguaLanguage]:
        """
        Return the result of the lingua detector for the text, which is used
        as a key of self.language_map.
        """
        return self.identifier.detect_language_of(text)
b/requirements-guess-fasttext.txt new file mode 100644 index 00000000..b04d0dca --- /dev/null +++ b/requirements-guess-fasttext.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +fasttext-langdetect ~= 1.0.5 \ No newline at end of file diff --git a/requirements-guess-langdetect.txt b/requirements-guess-langdetect.txt new file mode 100644 index 00000000..7cfaec7e --- /dev/null +++ b/requirements-guess-langdetect.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +langdetect ~= 1.0.9 \ No newline at end of file diff --git a/requirements-guess-langid.txt b/requirements-guess-langid.txt new file mode 100644 index 00000000..e140d37a --- /dev/null +++ b/requirements-guess-langid.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +py3langid ~= 0.2.2 \ No newline at end of file diff --git a/requirements-guess-lingua.txt b/requirements-guess-lingua.txt new file mode 100644 index 00000000..197bd646 --- /dev/null +++ b/requirements-guess-lingua.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +lingua-language-detector ~= 2.0.2 \ No newline at end of file