Fix metrics submodules, enforce utf-8 encoding

speechmatics · Dec 13, 2023 · bee43e1
1 parent 848f249
commit bee43e1
Show file tree

Hide file tree

Showing 5 changed files with 15 additions and 7 deletions.
diff --git a/metrics/README.md b/metrics/README.md
@@ -6,7 +6,7 @@ We provide some additional tooling to help benchmark transcription and diarizati
 
 ### CLI
 
-The `sm-metrics` binary is built after installing with PyPI or running `python3 setup.py` from the source code. To see the options from the command-line, use the following:
+The `sm-metrics` binary is built after installing with PyPI or running `python3 setup.py install` from the source code. To see the options from the command-line, use the following:
 ``` bash
 sm-metrics -h
 ```

diff --git a/metrics/cli.py b/metrics/cli.py
@@ -10,7 +10,7 @@ def main():
 
     # Create subparsers
     subparsers = parser.add_subparsers(
-        dest="mode", help="Metrics mode. Choose from 'wer' or 'diarization"
+        dest="mode", help="Metrics mode. Choose from 'wer' or 'diarization'"
     )
     subparsers.required = True  # Make sure a subparser is always provided
 

diff --git a/metrics/wer/README.md b/metrics/wer/README.md
@@ -71,6 +71,8 @@ To see all the commands, run:
 sm-metrics wer -h
 ```
 
+You must ensure that both the reference and hypothesis files are encoded in UTF-8.
+
 ## Read More
 
 - [The Future of Word Error Rate](https://www.speechmatics.com/company/articles-and-news/the-future-of-word-error-rate?utm_source=facebook&utm_medium=social&fbclid=IwAR1z7ZU4WowgDBs91MNKFTwPACD9gb7dkrQpkr1HmfsgXPv-Ndt5PeySjIk&restored=1676632411598)

diff --git a/metrics/wer/__main__.py b/metrics/wer/__main__.py
@@ -22,8 +22,13 @@ def load_file(path: Path, file_type: str) -> str:
 
 
 def load_text(path: Path) -> str:
-    with open(path, "r", encoding="utf-8") as input_path:
-        return input_path.read()
+    try:
+        with open(path, "r", encoding="utf-8") as input_path:
+            return input_path.read()
+    except UnicodeDecodeError as error:
+        raise ValueError(
+            f"Error reading file {path}: {error}. Ensure the file is UTF-8 encoded."
+        )
 
 
 def load_sm_json(path: Path) -> str:

diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 import os
 import logging
 
-from setuptools import setup
+from setuptools import setup, find_packages
 
 
 def read(fname):
@@ -55,11 +55,12 @@ def get_version(fname):
 
 
 logging.basicConfig(level=logging.INFO)
-
+print(f"Packages to install: {find_packages(exclude=['tests'])}")
 setup(
     name="speechmatics-python",
     version=os.getenv("VERSION", get_version("VERSION")),
-    packages=["speechmatics", "metrics"],
+    packages=find_packages(exclude=["tests"]),
+    package_data={"metrics": ["wer/normalizers/english.yaml"]},
     url="https://github.com/speechmatics/speechmatics-python/",
     license="MIT",
     author="Speechmatics",