Skip to content

Commit

Permalink
manage occurrence of full stops in a better way (#229)
Browse files Browse the repository at this point in the history
* manage occurrence of full stops in a better way

* bump version

* cleanup
  • Loading branch information
iulusoy authored Dec 2, 2024
1 parent 403525a commit e12929a
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 8 deletions.
13 changes: 13 additions & 0 deletions ammico/test/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted):
tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted)


def test_check_add_space_after_full_stop(accepted):
    """Check that a space is inserted after full stops that lack one.

    The same detector instance is reused for every case; only its
    ``subdict["text"]`` entry is replaced before each call.
    """
    detector = tt.TextDetector({}, accept_privacy=accepted)
    # (input text, text expected after the call)
    expectations = (
        # full stops already followed by a space, or ending the string
        ("I like cats. I like dogs.", "I like cats. I like dogs."),
        # a single full stop at the very end of the string
        ("I like cats.", "I like cats."),
        # full stops directly followed by text each receive a space
        ("www.icanhascheezburger.com", "www. icanhascheezburger. com"),
    )
    for raw_text, expected_text in expectations:
        detector.subdict["text"] = raw_text
        detector._check_add_space_after_full_stop()
        assert detector.subdict["text"] == expected_text


@pytest.mark.gcv
def test_analyse_image(set_testdict, set_environ, accepted):
for item in set_testdict:
Expand Down
42 changes: 35 additions & 7 deletions ammico/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import spacy
import io
import os
import re
from ammico.utils import AnalysisMethod
import grpc
import pandas as pd
Expand Down Expand Up @@ -225,6 +226,39 @@ def _initialize_spacy(self):
spacy.cli.download("en_core_web_md")
self.nlp = spacy.load("en_core_web_md")

def _check_add_space_after_full_stop(self):
    """Add a space after each full stop that is directly followed by text.

    Required by googletrans. Reads ``self.subdict["text"]`` and writes the
    amended string back to the same key.

    A full stop is left untouched when it is already followed by a space
    or when it is the last character of the text. Any other following
    character (including another full stop or a line break) causes a
    single space to be inserted right after the full stop, e.g.
    ``"www.example.com"`` becomes ``"www. example. com"``.

    Returns:
        None. The text is modified in place in ``self.subdict``.
    """
    # The lookahead requires one more character after the full stop, so a
    # full stop at the very end of the string never matches; ``[^ ]``
    # excludes full stops that already have a space after them. Because the
    # lookahead does not consume the next character, consecutive full stops
    # ("..") are each examined individually.
    self.subdict["text"] = re.sub(r"\.(?=[^ ])", ". ", self.subdict["text"])

def analyse_image(self) -> dict:
"""Perform text extraction and analysis of the text.
Expand All @@ -239,13 +273,7 @@ def analyse_image(self) -> dict:
else:
# make sure all full stops are followed by whitespace
# otherwise googletrans breaks
index_stop = self.subdict["text"].find(".")
if self.subdict["text"][index_stop + 1] != " ":
self.subdict["text"] = (
self.subdict["text"][: index_stop + 1]
+ " "
+ self.subdict["text"][index_stop + 1 :]
)
self._check_add_space_after_full_stop()
self.translate_text()
self.remove_linebreaks()
if self.analyse_text:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "ammico"
version = "0.2.3"
version = "0.2.4"
description = "AI Media and Misinformation Content Analysis Tool"
readme = "README.md"
maintainers = [
Expand Down

0 comments on commit e12929a

Please sign in to comment.