manage occurrence of full stops in a better way (#229)

* manage occurrence of full stops in a better way * bump version * cleanup
ssciwr · Dec 2, 2024 · e12929a · e12929a
1 parent 403525a
commit e12929a
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 8 deletions.
diff --git a/ammico/test/test_text.py b/ammico/test/test_text.py
@@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted):
         tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted)
 
 
+def test_check_add_space_after_full_stop(accepted):
+    test_obj = tt.TextDetector({}, accept_privacy=accepted)
+    test_obj.subdict["text"] = "I like cats. I like dogs."
+    test_obj._check_add_space_after_full_stop()
+    assert test_obj.subdict["text"] == "I like cats. I like dogs."
+    test_obj.subdict["text"] = "I like cats."
+    test_obj._check_add_space_after_full_stop()
+    assert test_obj.subdict["text"] == "I like cats."
+    test_obj.subdict["text"] = "www.icanhascheezburger.com"
+    test_obj._check_add_space_after_full_stop()
+    assert test_obj.subdict["text"] == "www. icanhascheezburger. com"
+
+
 @pytest.mark.gcv
 def test_analyse_image(set_testdict, set_environ, accepted):
     for item in set_testdict:

diff --git a/ammico/text.py b/ammico/text.py
@@ -4,6 +4,7 @@
 import spacy
 import io
 import os
+import re
 from ammico.utils import AnalysisMethod
 import grpc
 import pandas as pd
@@ -225,6 +226,39 @@ def _initialize_spacy(self):
             spacy.cli.download("en_core_web_md")
             self.nlp = spacy.load("en_core_web_md")
 
+    def _check_add_space_after_full_stop(self):
+        """Add a space after a full stop. Required by googletrans."""
+        # we have found text, now we check for full stops
+        index_stop = [
+            i.start() for i in re.finditer("\.", self.subdict["text"])  # noqa
+        ]
+        if not index_stop:  # no full stops found
+            return
+        # check if this includes the last string item
+        end_of_list = False
+        if len(self.subdict["text"]) <= (index_stop[-1] + 1):
+            # the last found full stop is at the end of the string
+            # but we can include all others
+            if len(index_stop) == 1:
+                end_of_list = True
+            else:
+                index_stop.pop()
+        if end_of_list:  # only one full stop at end of string
+            return
+        # if this is not the end of the list, check if there is a space after the full stop
+        no_space = [i for i in index_stop if self.subdict["text"][i + 1] != " "]
+        if not no_space:  # all full stops have a space after them
+            return
+        # else, amend the text
+        add_one = 1
+        for i in no_space:
+            self.subdict["text"] = (
+                self.subdict["text"][: i + add_one]
+                + " "
+                + self.subdict["text"][i + add_one :]
+            )
+            add_one += 1
+
     def analyse_image(self) -> dict:
         """Perform text extraction and analysis of the text.
 
@@ -239,13 +273,7 @@ def analyse_image(self) -> dict:
         else:
             # make sure all full stops are followed by whitespace
             # otherwise googletrans breaks
-            index_stop = self.subdict["text"].find(".")
-            if self.subdict["text"][index_stop + 1] != " ":
-                self.subdict["text"] = (
-                    self.subdict["text"][: index_stop + 1]
-                    + " "
-                    + self.subdict["text"][index_stop + 1 :]
-                )
+            self._check_add_space_after_full_stop()
             self.translate_text()
             self.remove_linebreaks()
             if self.analyse_text:

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "ammico"
-version = "0.2.3"
+version = "0.2.4"
 description = "AI Media and Misinformation Content Analysis Tool"
 readme = "README.md"
 maintainers = [