jkwill87 · big-eater · Dec 31, 2023 · Dec 31, 2023 · Dec 31, 2023 · Dec 31, 2023
diff --git a/mnamer/exceptions.py b/mnamer/exceptions.py
@@ -19,3 +19,13 @@ class MnamerNetworkException(MnamerException):
 
 class MnamerNotFoundException(MnamerException):
     """Raised when a lookup or search works as expected yet yields no results."""
+
+
+class MnamerFailedLangGuesserInstantiation(MnamerException):
+    """
+    Raised when a recognized text language guesser fails to instantiate.
+    """
+
+
+class MnamerNoSuchLangGuesser(MnamerException):
+    """Raised when a requested text language guesser name is not a known guesser."""
diff --git a/mnamer/setting_store.py b/mnamer/setting_store.py
@@ -1,5 +1,6 @@
 import dataclasses
 import json
+from functools import cached_property
 from pathlib import Path
 from typing import Any, Callable
 
@@ -11,6 +12,7 @@
 from mnamer.setting_spec import SettingSpec
 from mnamer.types import MediaType, ProviderType, SettingType
 from mnamer.utils import crawl_out, json_loads, normalize_containers
+from mnamer import text_lang_guesser
 
 
 @dataclasses.dataclass
@@ -106,6 +108,15 @@ class SettingStore:
             help="--language=<LANG>: specify the search language",
         ).as_dict(),
     )
+    subtitle_lang_guesser: str | None = dataclasses.field(  # a guesser name
+        default=None,
+        metadata=SettingSpec(
+            flags=["--subtitle-lang-guesser"],
+            group=SettingType.PARAMETER,
+            choices=list(text_lang_guesser.available_guessers),
+            help="--subtitle-lang-guesser=<GUESSER>: subtitle file text language guesser (must be installed)",
+        ).as_dict(),
+    )
     mask: list[str] = dataclasses.field(
         default_factory=lambda: [
             "avi",
@@ -367,6 +378,12 @@ def specifications(cls) -> list[SettingSpec]:
     def _resolve_path(path: str | Path) -> Path:
         return Path(path).resolve()
 
+    @cached_property
+    def text_lang_guesser(self):
+        """Lazily built guesser for subtitle_lang_guesser, or None if unset."""
+        name = self.subtitle_lang_guesser
+        return text_lang_guesser.guesser(name, Language.all()) if name else None
+
     def __setattr__(self, key: str, value: Any):
         converter_map: dict[str, Callable] = {
             "episode_api": ProviderType,

diff --git a/mnamer/target.py b/mnamer/target.py
@@ -116,7 +116,8 @@ def destination(self) -> Path:
 
     def _parse(self, file_path: Path):
         path_data: dict[str, Any] = {"language": self._settings.language}
-        if is_subtitle(self.source):
+        source_is_subtitle = is_subtitle(self.source)
+        if source_is_subtitle:
             try:
                 path_data["language"] = Language.parse(self.source.stem[-2:])
                 file_path = Path(self.source.parent, self.source.stem[:-2])
@@ -176,6 +177,17 @@ def _parse(self, file_path: Path):
             self.metadata.language_sub = path_data.get("subtitle_language")
         except MnamerException:
             pass
+        if (
+            source_is_subtitle
+            and not self.metadata.language_sub
+            and self._settings.subtitle_lang_guesser
+        ):
+            try:
+                self.metadata.language_sub = (
+                    self._settings.text_lang_guesser.guess_language(self.source)
+                )
+            except MnamerException:
+                pass
         if isinstance(self.metadata, MetadataMovie):
             self.metadata.name = path_data.get("title")
             self.metadata.year = path_data.get("year")

diff --git a/mnamer/text_lang_guesser/__init__.py b/mnamer/text_lang_guesser/__init__.py
@@ -0,0 +1,54 @@
+import logging
+from typing import Dict
+from mnamer.exceptions import (
+    MnamerFailedLangGuesserInstantiation,
+    MnamerNoSuchLangGuesser,
+)
+from mnamer.language import Language
+from importlib import import_module
+
+
+def _import_module(dotted_module_name: str):
+    """Import the named module; return None (debug-logged) on ImportError."""
+    try:
+        return import_module(dotted_module_name)
+    except ImportError as err:
+        logging.debug(f"Failed to import {dotted_module_name}: {err}", exc_info=err)
+
+
+possible_guessers = (  # (guesser name, dotted path to its class)
+    ("lingua", "mnamer.text_lang_guesser.lingua.LinguaGuesser"),
+    ("langdetect", "mnamer.text_lang_guesser.langdetect.LangdetectGuesser"),
+    ("fasttext", "mnamer.text_lang_guesser.fasttext.FasttextGuesser"),
+    ("langid", "mnamer.text_lang_guesser.langid.LangidGuesser"),
+)
+
+available_guessers = {}  # guesser name -> class, for backends that loaded
+for name, module_class in possible_guessers:
+    module_name, classname = module_class.rsplit(".", 1)
+    if not (mod := _import_module(module_name)):
+        continue  # the backend module could not be imported
+    try:
+        cls = getattr(mod, classname)
+    except AttributeError as e:
+        logging.debug(
+            f"Failed to load class {classname} from module {mod}: {e}", exc_info=e
+        )
+    else:
+        available_guessers[name] = cls  # register only classes that resolved
+
+
+def guesser(name: str, guess_languages: list[Language]):
+    """Instantiate the guesser registered under ``name`` for ``guess_languages``.
+
+    Raises MnamerNoSuchLangGuesser for an unknown name, or a chained
+    MnamerFailedLangGuesserInstantiation if the constructor raises."""
+    if name not in available_guessers:
+        raise MnamerNoSuchLangGuesser("Unrecognized language guesser")
+    try:
+        return available_guessers[name](guess_languages=guess_languages)
+    except Exception as e:
+        class_name = available_guessers[name].__name__
+        logging.debug(f"Error trying to instantiate {class_name}", exc_info=e)
+        message = f"Failed creating guesser {class_name}"
+        raise MnamerFailedLangGuesserInstantiation(message) from e
diff --git a/mnamer/text_lang_guesser/base.py b/mnamer/text_lang_guesser/base.py
@@ -0,0 +1,174 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+import logging
+import os
+import re
+from typing import List, Optional
+from chardet.universaldetector import UniversalDetector
+from mnamer.language import Language
+
+
+class TextLanguageGuesser(ABC):
+    """Base class for guessing the language of a subtitle file from its text."""
+
+    def __init__(self, guess_languages: List[Language], min_probability: float = 0.9):
+        """Store the languages, then set up the identifier and the line filters."""
+        self.guess_languages = guess_languages
+        self.language_map = self._language_map(guess_languages)
+        self.min_probability = min_probability
+        # Subclass hook; language_map and min_probability are already set here.
+        self.identifier = self._initialize_identifier()
+        # Subtitle control lines: lines of only digits, and time ranges.
+        skip_patterns = [r"^\d+$", r"^[\s0-9:.,>-]+$"]
+        self.skip_line_expressions_str = [re.compile(exp) for exp in skip_patterns]
+        self.skip_line_expressions_bytes = [
+            re.compile(exp.encode("ascii")) for exp in skip_patterns
+        ]
+        self.encoding_detector = UniversalDetector()
+
+    @abstractmethod
+    def guess_language_from_text(self, text: str) -> Optional[str]:
+        """
+        Guess the language, based on the text in the file.
+
+        Return a key of self.language_map, or None when there is no guess.
+        """
+
+    def _language_map(self, lang_list: List[Language]):
+        """Return {lang.a2: lang}, mapping an identification result to a Language."""
+        return {lang.a2: lang for lang in lang_list}
+
+    @abstractmethod
+    def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None):
+        """
+        Set up the language identifier, and return it.
+        __init__ calls this without arguments and stores the result in
+        self.identifier.
+
+        If restrict_to_langs is present, the identifier should restrict
+        its identification efforts to the given languages.
+
+        Note that restricting the languages used is usually not a good idea
+        because it increases the possibility of false positives.
+
+        :param restrict_to_langs: a list of two-letter language codes.
+        """
+
+    def _skip_line(self, line, skip_expressions) -> bool:
+        """Return True for a blank line or a line matching any skip expression."""
+        stripped = line.strip()
+        return not stripped or any(exp.match(stripped) for exp in skip_expressions)
+
+    def _detect_file_encoding(self, filepath) -> dict:
+        """
+        Tries to guess the encoding (utf-8, iso-8859-1, etc).
+
+        The returned dict has these fields of interest:
+            {
+                "encoding": str,
+                "confidence": float between 0 and 1
+            }
+        """
+        self.encoding_detector.reset()
+        # The context manager closes the file even when the loop breaks early.
+        with open(filepath, "rb") as raw_file:
+            for line in raw_file:
+                if self._skip_line(line, self.skip_line_expressions_bytes):
+                    continue
+                self.encoding_detector.feed(line)
+                if self.encoding_detector.done:
+                    break
+        self.encoding_detector.close()
+
+        result = dict(self.encoding_detector.result)
+        if result["encoding"] == "ascii":
+            # ASCII is a subset of UTF-8, so report the more general encoding.
+            result["encoding"] = "utf-8"
+        return result
+
+    def _read_lines_from_file(
+        self, filepath, encoding: str, lines=200, skip_first_lines=10
+    ) -> str:
+        """
+        Read up to `lines` lines from the file, returning a unicode string.
+
+        Lines that are subtitle control lines (only numbers, or time ranges)
+        are filtered out, and do not count towards the number of lines.
+
+        By default, the 10 first lines are skipped. The reasoning behind
+        that is that perhaps the first lines contain subtitle credits
+        (e.g. a little advertisement for the subtitle creator), which may
+        not correspond to the principal language of the file.
+        """
+        stop_count = lines + skip_first_lines
+        kept = []
+        with open(filepath, mode="r", encoding=encoding) as text_file:
+            content_lines = (
+                line
+                for line in text_file
+                if not self._skip_line(line, self.skip_line_expressions_str)
+            )
+            for index, line in enumerate(content_lines):
+                if index >= stop_count:
+                    break  # the requested number of lines has been collected
+                if index >= skip_first_lines:
+                    kept.append(line)
+        return "".join(kept)
+
+    def _get_file_text(self, filepath) -> Optional[str]:
+        """
+        Tries to determine the file encoding and read some lines from the file.
+
+        If the confidence for the encoding is not high enough, or an error
+        occurs while reading lines from the file, the return value is None.
+        """
+        encoding = self._detect_file_encoding(filepath)
+        if encoding["confidence"] >= 0.6:
+            try:
+                return self._read_lines_from_file(
+                    filepath, encoding=encoding["encoding"]
+                )
+            except Exception as e:
+                logging.warning(
+                    f"Unable to read file {filepath} with encoding {encoding['encoding']}. "
+                    f"Error: {e}"
+                )
+        return None
+
+    @staticmethod
+    def boolean_env_var(env_var, default=None) -> Optional[bool]:
+        """
+        Interpret an environment variable as a boolean.
+
+        Returns default when the variable is unset; otherwise True only for
+        "true", "yes" or "1" (case-insensitive, surrounding whitespace ignored).
+        """
+        value = os.getenv(env_var)
+        if value is None:
+            return default
+        return value.strip().lower() in ("true", "yes", "1")
+
+    def guess_language(self, filepath: Path) -> Optional[Language]:
+        """
+        Reads text from the file and passes it to the implementation-specific
+        guess_language_from_text() method.
+
+        If a matching mnamer.Language exists, it is returned, otherwise None.
+        None is also returned if no text could be read or the guesser raised.
+        """
+        text = self._get_file_text(filepath)
+        if not text:
+            return None
+
+        try:
+            guessed_language = self.guess_language_from_text(text)
+        except Exception as e:
+            # Best effort: a failing backend must not abort the caller.
+            logging.warning(
+                "Unexpected error while guessing language from file text. "
+                f"File: {filepath}, Error: {e}"
+            )
+            return None
+        if not guessed_language:
+            return None
+        return self.language_map.get(guessed_language)
diff --git a/mnamer/text_lang_guesser/fasttext.py b/mnamer/text_lang_guesser/fasttext.py
@@ -0,0 +1,35 @@
+from typing import Optional, Dict, Union
+from ftlangdetect.detect import get_or_load_model
+from mnamer.text_lang_guesser.base import TextLanguageGuesser
+
+
+class FasttextGuesser(TextLanguageGuesser):
+    """
+    Language guesser that uses the fasttext model loaded via ftlangdetect.
+
+    Installation note: a modern g++ version is required for building fasttext.
+    """
+
+    def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None):
+        # Note: It seems there is no way to restrict languages for fasttext.
+        use_low_memory = self.boolean_env_var("FASTTEXT_LOW_MEMORY", False)
+        return get_or_load_model(low_memory=use_low_memory)
+
+    def detect(self, text: str) -> Optional[Dict[str, Union[str, float]]]:
+        """
+        Modified version of ftlangdetect.detect.detect, that specifies the threshold.
+
+        Returns the first label and its score, or None if no label is predicted.
+        """
+        labels, scores = self.identifier.predict(text, threshold=self.min_probability)
+        if labels:
+            best_label = labels[0].replace("__label__", "")
+            return {"lang": best_label, "score": min(float(scores[0]), 1.0)}
+        return None
+
+    def guess_language_from_text(self, text: str) -> Optional[str]:
+        """Return the "lang" reported by detect() for the text, or None."""
+        # Newlines become spaces (and CRs vanish) so predict() gets a single line.
+        single_line = text.replace("\n", " ").replace("\r", "")
+        detection = self.detect(single_line)
+        return detection["lang"] if detection else None
diff --git a/mnamer/text_lang_guesser/langdetect.py b/mnamer/text_lang_guesser/langdetect.py
@@ -0,0 +1,47 @@
+import logging
+from pathlib import Path
+from typing import Optional, List
+from langdetect.detector_factory import DetectorFactory, PROFILES_DIRECTORY
+from mnamer.language import Language
+from mnamer.text_lang_guesser.base import TextLanguageGuesser
+
+
+class LangdetectGuesser(TextLanguageGuesser):
+    """Guesser backed by langdetect's DetectorFactory."""
+
+    def _language_map(self, lang_list: List[Language]):
+        mapping = super()._language_map(lang_list)
+        chinese = mapping.pop("zh", None)
+        if chinese:
+            # lang-detect has zh-cn and zh-tw. Map them both to mnamer's zh.
+            mapping.update({"zh-cn": chinese, "zh-tw": chinese})
+        return mapping
+
+    def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None):
+        # Be deterministic. Without this, langdetect could guess different
+        # languages for the same short text.
+        DetectorFactory.seed = 0
+        factory = DetectorFactory()
+        if not restrict_to_langs:
+            factory.load_profile(PROFILES_DIRECTORY)
+            return factory
+        # NOTE(review): iterates self.language_map, not restrict_to_langs -- confirm.
+        profiles_root = Path(PROFILES_DIRECTORY)
+        json_profiles = []
+        for lang in self.language_map:
+            profile = profiles_root / lang
+            if profile.is_file():
+                json_profiles.append(profile.read_text(encoding="utf-8"))
+            else:
+                logging.warning(f"Language profile not found for language '{lang}'")
+        factory.load_json_profile(json_profiles)
+        return factory
+
+    def guess_language_from_text(self, text: str) -> Optional[str]:
+        """Return langdetect's top language if it reaches min_probability."""
+        detector = self.identifier.create()
+        detector.append(text)
+        candidates = detector.get_probabilities()
+        if candidates and candidates[0].prob >= self.min_probability:
+            return candidates[0].lang
+        return None