Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

manage occurrence of full stops in a better way #229

Merged
merged 3 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions ammico/test/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,19 @@ def test_init_revision_numbers_and_models(accepted):
tt.TextDetector({}, revision_numbers=["something"], accept_privacy=accepted)


def test_check_add_space_after_full_stop(accepted):
    """Check that a space is inserted after full stops that lack one."""
    detector = tt.TextDetector({}, accept_privacy=accepted)
    # Each pair is (text before the call, text expected after the call).
    cases = (
        # every inner full stop already has a space; trailing stop untouched
        ("I like cats. I like dogs.", "I like cats. I like dogs."),
        # a single full stop at the very end is left alone
        ("I like cats.", "I like cats."),
        # full stops inside the text without a following space get one
        ("www.icanhascheezburger.com", "www. icanhascheezburger. com"),
    )
    for raw_text, expected in cases:
        detector.subdict["text"] = raw_text
        detector._check_add_space_after_full_stop()
        assert detector.subdict["text"] == expected


@pytest.mark.gcv
def test_analyse_image(set_testdict, set_environ, accepted):
for item in set_testdict:
Expand Down
42 changes: 35 additions & 7 deletions ammico/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import spacy
import io
import os
import re
from ammico.utils import AnalysisMethod
import grpc
import pandas as pd
Expand Down Expand Up @@ -225,6 +226,39 @@ def _initialize_spacy(self):
spacy.cli.download("en_core_web_md")
self.nlp = spacy.load("en_core_web_md")

def _check_add_space_after_full_stop(self):
    """Add a space after every full stop that lacks one. Required by googletrans.

    Rewrites ``self.subdict["text"]`` in place and returns ``None``.

    A single space is inserted after each full stop whose next character is
    not a space. A full stop that is the very last character of the text is
    left untouched. The rule applies to every full stop, so
    ``"www.example.com"`` becomes ``"www. example. com"``.
    """
    # Raw string avoids the invalid "\." escape sequence of a plain literal.
    # The negative lookahead skips full stops already followed by a space and
    # a full stop at the absolute end of the text. ``\Z`` is used rather than
    # ``$`` because ``$`` would also match before a trailing newline and so
    # skip a full stop that previously received a space.
    self.subdict["text"] = re.sub(r"\.(?! |\Z)", ". ", self.subdict["text"])

def analyse_image(self) -> dict:
"""Perform text extraction and analysis of the text.

Expand All @@ -239,13 +273,7 @@ def analyse_image(self) -> dict:
else:
# make sure all full stops are followed by whitespace
# otherwise googletrans breaks
index_stop = self.subdict["text"].find(".")
if self.subdict["text"][index_stop + 1] != " ":
self.subdict["text"] = (
self.subdict["text"][: index_stop + 1]
+ " "
+ self.subdict["text"][index_stop + 1 :]
)
self._check_add_space_after_full_stop()
self.translate_text()
self.remove_linebreaks()
if self.analyse_text:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "ammico"
version = "0.2.3"
version = "0.2.4"
description = "AI Media and Misinformation Content Analysis Tool"
readme = "README.md"
maintainers = [
Expand Down