Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-440: Add cell properties to tabby #317

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docker/Dockerfile → Dockerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
ARG REPOSITORY="docker.io"
FROM dedocproject/baseimg
FROM dedocproject/dedoc_p3.9_base:version_2023_08_28

ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root"
ENV RESOURCES_PATH "/dedoc_root/resources"

ADD requirements.txt .
RUN pip3 install -r requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

RUN mkdir /dedoc_root
ADD dedoc /dedoc_root/dedoc
Expand All @@ -17,4 +17,4 @@ RUN python3 /dedoc_root/dedoc/download_models.py
ADD tests /dedoc_root/tests
ADD resources /dedoc_root/resources

CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
6 changes: 6 additions & 0 deletions dedoc/api/api_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,9 @@ def __table2html(table: Table, table2id: Dict[str, int]) -> str:
text += "</tr>\n"
text += "</tbody>\n</table>"
return text


def json2txt(paragraph: "TreeNode") -> str:
    """
    Render a document tree as plain text.

    Nodes are visited depth-first in document order. Every node contributes its text followed by a line break,
    and one extra line break separates the rendered subtrees of sibling nodes.

    :param paragraph: root node of the (sub)tree; its `text` and `subparagraphs` attributes are read
    :return: text of the node concatenated with the text of all its descendants
    """
    sibling_break = object()  # marker: emit the "\n" that separates two sibling subtrees
    chunks = []
    # explicit stack instead of recursion, so a very deep tree cannot hit the interpreter recursion limit
    pending = [paragraph]
    while pending:
        item = pending.pop()
        if item is sibling_break:
            chunks.append("\n")
            continue
        chunks.append(f"{item.text}\n")
        # push children in reverse so the first child is popped (and rendered) first
        for position, child in reversed(list(enumerate(item.subparagraphs))):
            pending.append(child)
            if position > 0:
                pending.append(sibling_break)
    return "".join(chunks)
5 changes: 4 additions & 1 deletion dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import dedoc
from dedoc.api.api_args import QueryParameters
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.common.exceptions.missing_file_error import MissingFileError
from dedoc.config import get_config
Expand Down Expand Up @@ -76,6 +76,9 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
if return_format == "html":
html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0)
return HTMLResponse(content=html_content, status_code=200)
elif return_format == "plain_text":
txt_content = json2txt(paragraph=document_tree.content.structure)
return PlainTextResponse(content=txt_content, status_code=200)
elif return_format == "tree":
html_content = json2tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content, status_code=200)
Expand Down
1 change: 1 addition & 0 deletions dedoc/api/static/html_eng/form_input.html
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ <h2>Structure Document Recognition</h2>
<select name="return_format">
<option value="html" selected>html</option>
<option value="pretty_json">pretty_json</option>
<option value="plain_text">plain_text</option>
<option value="tree">tree</option>
<option value="json">json</option>
<option value="collapsed_tree">collapsed_tree</option>
Expand Down
1 change: 1 addition & 0 deletions dedoc/api/static/html_rus/form_input.html
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ <h2>Распознавание структуры документа</h2>
<select name="return_format">
<option value="html" selected>html</option>
<option value="pretty_json">pretty_json</option>
<option value="plain_text">plain_text</option>
<option value="tree">tree</option>
<option value="json">json</option>
<option value="collapsed_tree">collapsed_tree</option>
Expand Down
56 changes: 44 additions & 12 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
import math
import os
import subprocess
from collections import namedtuple
from typing import List, Optional, Tuple

import numpy as np

from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError
from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError
from dedoc.data_structures.bbox import BBox
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation
from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
Expand All @@ -33,6 +35,8 @@
from dedoc.utils.parameter_utils import get_param_page_slice
from dedoc.utils.utils import calculate_file_hash

# Span and visibility attributes of one table cell, filled from the entries of a table's "cell_properties" JSON.
# The typename equals the bound name so that repr() and pickling refer to the right class.
CellPropertyInfo = namedtuple("CellPropertyInfo", "colspan, rowspan, invisible")


class PdfTabbyReader(PdfBaseReader):
"""
Expand Down Expand Up @@ -76,7 +80,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
lines, scan_tables = self.__extract(path=path)
lines, scan_tables, tables_cell_properties = self.__extract(path=path)
warnings = []
document_metadata = None

Expand All @@ -93,10 +97,11 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

lines = self.linker.link_objects(lines=lines, tables=scan_tables, images=[])
tables = []
for scan_table in scan_tables:
for scan_table, table_cells_property in zip(scan_tables, tables_cell_properties):
cell_properties = [[cellp for cellp in row] for row in table_cells_property]
metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name)
cells = [[cell for cell in row] for row in scan_table.matrix_cells]
table = Table(metadata=metadata, cells=cells)
table = Table(metadata=metadata, cells=cells, cells_properties=cell_properties)
tables.append(table)

attachments = []
Expand All @@ -111,23 +116,26 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

return self._postprocess(result)

def __extract(self, path: str, start_page: Optional[int] = None, end_page: Optional[int] = None) \
        -> Tuple[List[LineWithMeta], List[ScanTable], List[List[List[CellPropertyInfo]]]]:
    """
    Process the PDF and gather lines, tables and table cell properties from every page of the result.

    :param path: path to the PDF file; also used to compute the file hash passed to the per-page helpers
    :param start_page: forwarded to `__process_pdf`; None when not given
    :param end_page: forwarded to `__process_pdf`; None when not given
    :return: all lines, all tables, and for each table a matrix (rows of cells) of cell properties
    """
    file_hash = calculate_file_hash(path=path)
    document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
    all_lines = []
    all_tables = []
    all_cell_properties = []
    for page in document.get("pages", []):
        lines = self.__get_lines_with_location(page, file_hash)
        if lines:
            all_lines.extend(lines)
        tables, cell_properties = self.__get_tables(page, file_hash)
        if tables:
            # both lists are extended together, so tables and their cell properties stay index-aligned
            all_tables.extend(tables)
            all_cell_properties.extend(cell_properties)

    return all_lines, all_tables, all_cell_properties

def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
tables = []
cell_properties = []
page_number = page["number"]
i = 0
for table in page["tables"]:
Expand All @@ -138,26 +146,44 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
y_bottom_right = y_top_left + table["height"]
order = table["order"]
rows = table["rows"]
cell_properties_json = table["cell_properties"]
cell_property_list = []

for cell_properties_row in cell_properties_json:
cell_property_row_list = []

for cell_property in cell_properties_row:
cell_property_info = CellPropertyInfo(cell_property["col_span"],
cell_property["row_span"],
bool(cell_property["invisible"]))

cell_property_row_list.append(cell_property_info)

cell_property_list.append(cell_property_row_list)

cells = [row for row in rows]
bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))

tables.append(ScanTable(matrix_cells=cells, page_number=page_number, bbox=bbox, name=file_hash + str(page_number) + str(i), order=order))
cell_properties.append(cell_property_list)

return tables
return tables, cell_properties

def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
lines = []
page_number = page["number"]
page_width = int(page["width"])
page_height = int(page["height"])
prev_line = None

for block in page["blocks"]:
annotations = []
order = block["order"]
block_text = block["text"]
bx_top_left = block["x_top_left"]
by_top_left = block["y_top_left"]
bx_bottom_right = bx_top_left + block["width"]
by_bottom_right = by_top_left + block["height"]
bx_top_left = int(block["x_top_left"])
by_top_left = int(block["y_top_left"])
bx_bottom_right = bx_top_left + int(block["width"])
by_bottom_right = by_top_left + int(block["height"])
indent = block["indent"]
spacing = block["spacing"]
len_block = len(block_text)
Expand All @@ -173,7 +199,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
url = annotation["url"]
start = annotation["start"]
end = annotation["end"]

# Bounding box of the annotated span: the bottom-right corner is offset from the span's own top-left corner,
# not from the enclosing block's, so spans that do not start at the block origin keep their real extent.
# NOTE(review): assumes "width"/"height" in the annotation describe the span itself - confirm against the tabby output.
x_top_left = int(annotation["x_top_left"])
y_top_left = int(annotation["y_top_left"])
x_bottom_right = x_top_left + int(annotation["width"])
y_bottom_right = y_top_left + int(annotation["height"])
box = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))
annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height))
annotations.append(SizeAnnotation(start, end, str(font_size)))
annotations.append(StyleAnnotation(start, end, font_name))

Expand All @@ -189,6 +220,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
meta = block["metadata"].lower()
uid = f"txt_{file_hash}_{order}"
bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right))
annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))

metadata = LineMetadata(page_id=page_number, line_id=order)
line_with_location = LineWithLocation(line=block_text,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.extractor_pdf_textlayer import ExtractorPdfTextLayer
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor
from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox


Expand All @@ -25,7 +25,7 @@ def __init__(self, *, config: dict) -> None:
:param config: configuration of the reader, e.g. logger for logging
"""
super().__init__(config=config)
self.extractor_layer = ExtractorPdfTextLayer(config=config)
self.extractor_layer = PdfminerExtractor(config=config)

def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
"""
Expand Down
Empty file.
Loading
Loading