Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/TLDR-849-txtlayerreader-bboxes-fix' into TLDR-849-txtlayerreader-bboxes-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander1999-hub committed Nov 7, 2024
2 parents 54f5c64 + 6cdb2ca commit 81787e9
Showing 1 changed file with 37 additions and 2 deletions.
39 changes: 37 additions & 2 deletions scripts/test_words_bbox_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from dedoc.api.dedoc_api import config
from dedoc.utils.image_utils import rotate_image
from dedoc.utils.pdf_utils import get_page_image
from dedoc.utils.pdf_utils import get_page_image, get_pdf_page_count
from tests.api_tests.abstract_api_test import AbstractTestApiDocReader

BboxWithConfsType = namedtuple("WordWithConf", ["start", "end", "bbox", "confs", "text_type"])
Expand Down Expand Up @@ -61,12 +61,15 @@ def __extract_texttype_annotation(self, anns_type: List[dict], ann_bbox: dict, t

return text_type

def __get_words_annotation(self, structure: dict) -> List[BboxWithConfsType]:
def __get_words_annotation(self, structure: dict, page_id: int = 0) -> List[BboxWithConfsType]:
stack = [structure]
words_annotation = []

while len(stack) > 0:
node = stack.pop()
if node["metadata"]["page_id"] != page_id:
stack.extend(node["subparagraphs"])
continue

anns_bbox = [annotation for annotation in node["annotations"] if annotation["name"] == "bounding box"]
anns_conf = [annotation for annotation in node["annotations"] if annotation["name"] == "confidence"]
Expand Down Expand Up @@ -171,6 +174,38 @@ def test_pdf_documents(self) -> None:
image = self.__draw_tables_words(tables, image)
cv2.imwrite(os.path.join(output_path, f"{os.path.split(file_name)[1]}.png"), image)

def test_gost_frame_documents(self) -> None:
    """Dump per-page images of a GOST-frame PDF with word boxes drawn on them.

    The same multi-page PDF is sent twice with ``need_gost_frame_analysis="true"``,
    once with ``pdf_with_text_layer="true"`` and once with ``"false"``. For every
    page the page image is loaded, rescaled to the ``page_width`` x ``page_height``
    stored in the first available bounding-box annotation, decorated with the word
    and table annotations and written to
    ``<self.output_path>/<outputdir>/<pdf name>_<page_id>.png``.

    The method makes no assertions; the written images are its only output.
    """

    def resize_to_page(page_image, bbox: dict):
        # Bounding boxes carry the page size they were computed for; bring the
        # rendered page to that size before anything is drawn on it.
        return cv2.resize(page_image, dsize=(bbox["page_width"], bbox["page_height"]), interpolation=cv2.INTER_CUBIC)

    filename_parameters_outputdir = [
        ["tables/gost_multipage_table_2.pdf", dict(pdf_with_text_layer="true", need_gost_frame_analysis="true"), "gost_frame_true"],
        ["tables/gost_multipage_table_2.pdf", dict(pdf_with_text_layer="false", need_gost_frame_analysis="true"), "gost_frame_false"]
    ]

    for file_name, parameters, outputdir in filename_parameters_outputdir:
        output_path = os.path.join(self.output_path, outputdir)
        os.makedirs(output_path, exist_ok=True)
        result = self._send_request(file_name, data=parameters)
        structure = result["content"]["structure"]
        tables = result["content"]["tables"]

        # The absolute path and the base name do not depend on the page,
        # so they are resolved once per document rather than once per page.
        abs_path = self._get_abs_path(file_name)
        base_name = os.path.basename(file_name)
        page_count = get_pdf_page_count(abs_path)

        for page_id in range(page_count):
            image = np.asarray(get_page_image(abs_path, page_id))
            word_annotations = self.__get_words_annotation(structure, page_id=page_id)

            if word_annotations:
                ann = word_annotations[0]
                # NOTE(review): the None guard is kept from the original; whether the
                # annotation collector can actually yield None is not visible here.
                if ann is not None:
                    image = resize_to_page(image, json.loads(ann.bbox))
                image = self.__draw_word_annotations(image, word_annotations)

            if tables:
                if not word_annotations:
                    # No text words on this page: take the page size from the first
                    # line of the first cell of the first table instead.
                    cell_line = tables[0]["cells"][0][0]["lines"][0]
                    ann_bbox = [annotation for annotation in cell_line["annotations"] if annotation["name"] == "bounding box"][0]
                    image = resize_to_page(image, json.loads(ann_bbox["value"]))
                # NOTE(review): every table is passed for every page; confirm that the
                # drawing helper is meant to be called without page filtering.
                image = self.__draw_tables_words(tables, image)

            cv2.imwrite(os.path.join(output_path, f"{base_name}_{page_id}.png"), image)

def test_table_word_extraction(self) -> None:
output_path = os.path.join(self.output_path, "tables")
os.makedirs(output_path, exist_ok=True)
Expand Down

0 comments on commit 81787e9

Please sign in to comment.