diff --git a/docker/Dockerfile b/Dockerfile
similarity index 79%
rename from docker/Dockerfile
rename to Dockerfile
index 43e4be14..a191bd42 100644
--- a/docker/Dockerfile
+++ b/Dockerfile
@@ -1,11 +1,11 @@
 ARG REPOSITORY="docker.io"
-FROM dedocproject/baseimg
+FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
 
 ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root"
 ENV RESOURCES_PATH "/dedoc_root/resources"
 
 ADD requirements.txt .
-RUN pip3 install -r requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
 
 RUN mkdir /dedoc_root
 ADD dedoc /dedoc_root/dedoc
@@ -17,4 +17,4 @@ RUN python3 /dedoc_root/dedoc/download_models.py
 ADD tests /dedoc_root/tests
 ADD resources /dedoc_root/resources
 
-CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
\ No newline at end of file
+CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py
index bd765535..a772c95b 100644
--- a/dedoc/api/api_utils.py
+++ b/dedoc/api/api_utils.py
@@ -219,3 +219,9 @@ def __table2html(table: Table, table2id: Dict[str, int]) -> str:
         text += "</tr>\n"
     text += "</tbody>\n</table>"
     return text
+
+
+def json2txt(paragraph: TreeNode) -> str:
+    """Return the node's text followed, after a newline, by the newline-joined texts of its subparagraphs (built recursively)."""
+    nested_text = "\n".join(json2txt(child) for child in paragraph.subparagraphs)
+    return f"{paragraph.text}\n{nested_text}"
diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py
index 370a3cc1..7e295c7a 100644
--- a/dedoc/api/dedoc_api.py
+++ b/dedoc/api/dedoc_api.py
@@ -10,7 +10,7 @@
 
 import dedoc
 from dedoc.api.api_args import QueryParameters
-from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree
+from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
 from dedoc.common.exceptions.dedoc_error import DedocError
 from dedoc.common.exceptions.missing_file_error import MissingFileError
 from dedoc.config import get_config
@@ -76,6 +76,9 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
     if return_format == "html":
         html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0)
         return HTMLResponse(content=html_content, status_code=200)
+    elif return_format == "plain_text":
+        txt_content = json2txt(paragraph=document_tree.content.structure)
+        return PlainTextResponse(content=txt_content, status_code=200)
     elif return_format == "tree":
         html_content = json2tree(paragraph=document_tree.content.structure)
         return HTMLResponse(content=html_content, status_code=200)
diff --git a/dedoc/api/static/html_eng/form_input.html b/dedoc/api/static/html_eng/form_input.html
index cdb39042..e8ab3081 100644
--- a/dedoc/api/static/html_eng/form_input.html
+++ b/dedoc/api/static/html_eng/form_input.html
@@ -32,6 +32,7 @@ <h2>Structure Document Recognition</h2>
                         <select name="return_format">
                             <option value="html" selected>html</option>
                             <option value="pretty_json">pretty_json</option>
+                            <option value="plain_text">plain_text</option>
                             <option value="tree">tree</option>
                             <option value="json">json</option>
                             <option value="collapsed_tree">collapsed_tree</option>
diff --git a/dedoc/api/static/html_rus/form_input.html b/dedoc/api/static/html_rus/form_input.html
index 8619c169..603b9afa 100644
--- a/dedoc/api/static/html_rus/form_input.html
+++ b/dedoc/api/static/html_rus/form_input.html
@@ -33,6 +33,7 @@ <h2>Распознавание структуры документа</h2>
                         <select name="return_format">
                             <option value="html" selected>html</option>
                             <option value="pretty_json">pretty_json</option>
+                            <option value="plain_text">plain_text</option>
                             <option value="tree">tree</option>
                             <option value="json">json</option>
                             <option value="collapsed_tree">collapsed_tree</option>
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
index 6bf31bd2..15e264cd 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -3,6 +3,7 @@
 import math
 import os
 import subprocess
+from collections import namedtuple
 from typing import List, Optional, Tuple
 
 import numpy as np
@@ -10,6 +11,7 @@
 from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError
 from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError
 from dedoc.data_structures.bbox import BBox
+from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
 from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
 from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation
 from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
@@ -33,6 +35,8 @@
 from dedoc.utils.parameter_utils import get_param_page_slice
 from dedoc.utils.utils import calculate_file_hash
 
+CellPropertyInfo = namedtuple("CellPropertyInfo", "colspan, rowspan, invisible")  # span and visibility info of one table cell (filled in __get_tables)
+
 
 class PdfTabbyReader(PdfBaseReader):
     """
@@ -76,7 +80,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
         parameters = {} if parameters is None else parameters
-        lines, scan_tables = self.__extract(path=path)
+        lines, scan_tables, tables_cell_properties = self.__extract(path=path)
         warnings = []
         document_metadata = None
 
@@ -93,10 +97,11 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
 
         lines = self.linker.link_objects(lines=lines, tables=scan_tables, images=[])
         tables = []
-        for scan_table in scan_tables:
+        for scan_table, table_cells_property in zip(scan_tables, tables_cell_properties):
+            cell_properties = [[cellp for cellp in row] for row in table_cells_property]
             metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name)
             cells = [[cell for cell in row] for row in scan_table.matrix_cells]
-            table = Table(metadata=metadata, cells=cells)
+            table = Table(metadata=metadata, cells=cells, cells_properties=cell_properties)
             tables.append(table)
 
         attachments = []
@@ -111,23 +116,26 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
 
         return self._postprocess(result)
 
-    def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable]]:
+    def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable], List[List[CellPropertyInfo]]]:
         file_hash = calculate_file_hash(path=path)
         document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
         all_lines = []
         all_tables = []
+        all_cell_properties = []
         for page in document.get("pages", []):
             lines = self.__get_lines_with_location(page, file_hash)
             if lines:
                 all_lines.extend(lines)
-            tables = self.__get_tables(page, file_hash)
+            tables, cell_properties = self.__get_tables(page, file_hash)
             if tables:
                 all_tables.extend(tables)
+                all_cell_properties.extend(cell_properties)
 
-        return all_lines, all_tables
+        return all_lines, all_tables, all_cell_properties
 
     def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
         tables = []
+        cell_properties = []
         page_number = page["number"]
         i = 0
         for table in page["tables"]:
@@ -138,26 +146,44 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
             y_bottom_right = y_top_left + table["height"]
             order = table["order"]
             rows = table["rows"]
+            cell_properties_json = table["cell_properties"]
+            cell_property_list = []
+
+            for cell_properties_row in cell_properties_json:
+                cell_property_row_list = []
+
+                for cell_property in cell_properties_row:
+                    cell_property_info = CellPropertyInfo(cell_property["col_span"],
+                                                          cell_property["row_span"],
+                                                          bool(cell_property["invisible"]))
+
+                    cell_property_row_list.append(cell_property_info)
+
+                cell_property_list.append(cell_property_row_list)
+
             cells = [row for row in rows]
             bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))
 
             tables.append(ScanTable(matrix_cells=cells, page_number=page_number, bbox=bbox, name=file_hash + str(page_number) + str(i), order=order))
+            cell_properties.append(cell_property_list)
 
-        return tables
+        return tables, cell_properties
 
     def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
         lines = []
         page_number = page["number"]
+        page_width = int(page["width"])
+        page_height = int(page["height"])
         prev_line = None
 
         for block in page["blocks"]:
             annotations = []
             order = block["order"]
             block_text = block["text"]
-            bx_top_left = block["x_top_left"]
-            by_top_left = block["y_top_left"]
-            bx_bottom_right = bx_top_left + block["width"]
-            by_bottom_right = by_top_left + block["height"]
+            bx_top_left = int(block["x_top_left"])
+            by_top_left = int(block["y_top_left"])
+            bx_bottom_right = bx_top_left + int(block["width"])
+            by_bottom_right = by_top_left + int(block["height"])
             indent = block["indent"]
             spacing = block["spacing"]
             len_block = len(block_text)
@@ -173,7 +199,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
                 url = annotation["url"]
                 start = annotation["start"]
                 end = annotation["end"]
-
+                # the annotation's box is built from its own top-left corner and size, not from the enclosing block's corner
+                x_top_left, y_top_left = int(annotation["x_top_left"]), int(annotation["y_top_left"])
+                x_bottom_right = x_top_left + int(annotation["width"])
+                y_bottom_right = y_top_left + int(annotation["height"])
+                box = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))
+                annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height))
                 annotations.append(SizeAnnotation(start, end, str(font_size)))
                 annotations.append(StyleAnnotation(start, end, font_name))
 
@@ -189,6 +220,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
             meta = block["metadata"].lower()
             uid = f"txt_{file_hash}_{order}"
             bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right))
+            annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))
 
             metadata = LineMetadata(page_id=page_number, line_id=order)
             line_with_location = LineWithLocation(line=block_text,
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
index 21bf6943..0f17ce23 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -8,7 +8,7 @@
 from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
 from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
-from dedoc.readers.pdf_reader.pdf_txtlayer_reader.extractor_pdf_textlayer import ExtractorPdfTextLayer
+from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor
 from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox
 
 
@@ -25,7 +25,7 @@ def __init__(self, *, config: dict) -> None:
         :param config: configuration of the reader, e.g. logger for logging
         """
         super().__init__(config=config)
-        self.extractor_layer = ExtractorPdfTextLayer(config=config)
+        self.extractor_layer = PdfminerExtractor(config=config)
 
     def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
         """
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/__init__.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/extractor_pdf_textlayer.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
similarity index 65%
rename from dedoc/readers/pdf_reader/pdf_txtlayer_reader/extractor_pdf_textlayer.py
rename to dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
index 5f742f0a..d91d8439 100644
--- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/extractor_pdf_textlayer.py
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py
@@ -2,23 +2,23 @@
 import itertools
 import logging
 import os
-import re
 import uuid
 from collections import namedtuple
-from typing import IO, List, Match, Optional, Tuple
+from typing import List, Optional, Tuple
 
 import cv2
 import numpy as np
 from PIL import Image
 from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTAnno, LTChar, LTContainer, LTCurve, LTFigure, LTImage, LTRect, LTTextBox, LTTextBoxHorizontal, LTTextLineHorizontal
+from pdfminer.layout import LAParams, LTAnno, LTChar, LTContainer, LTCurve, LTFigure, LTImage, LTRect
+from pdfminer.layout import LTTextBox, LTTextBoxHorizontal, LTTextContainer, LTTextLineHorizontal
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.pdfinterp import PDFResourceManager
 from pdfminer.pdfpage import PDFPage
 
 from dedoc.common.exceptions.bad_file_error import BadFileFormatError
 from dedoc.data_structures.annotation import Annotation
-from dedoc.data_structures.bbox import BBox
+from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
 from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
 from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
 from dedoc.data_structures.concrete_annotations.size_annotation import SizeAnnotation
@@ -27,13 +27,14 @@
 from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
 from dedoc.readers.pdf_reader.data_classes.tables.location import Location
 from dedoc.readers.pdf_reader.data_classes.text_with_bbox import TextWithBBox
+from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_utils import cleaning_text_from_hieroglyphics, create_bbox, draw_annotation
 from dedoc.utils.pdf_utils import get_page_image
 
-StyleLine = namedtuple("StyleLine", ["begin", "end", "bold", "italic", "font_size", "font_style", "table_name"])
 logging.getLogger("pdfminer").setLevel(logging.ERROR)
+WordObj = namedtuple("WordObj", ["start", "end", "value"])  # character offsets of a word and the container collecting its chars
 
 
-class ExtractorPdfTextLayer(object):
+class PdfminerExtractor(object):
     """
     Class extarcts text with style from pdf with help pdfminer.six
     """
@@ -68,15 +69,19 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str, is_one_colum
         image_page = self.__get_image(path=path, page_num=page_number)
         image_height, image_width, *_ = image_page.shape
 
-        height = page.mediabox[3]
-        width = page.mediabox[2]
+        height = int(page.mediabox[3])
+        width = int(page.mediabox[2])
         if height > 0 and width > 0:
-            k_w, k_h = image_width / width, image_height / height
+            k_w, k_h = image_width / page.mediabox[2], image_height / page.mediabox[3]
             page_broken = False
         else:
             page_broken = True
             k_w, k_h = None, None
-        # 1. extract only textline object
+
+        if self.config.get("debug_mode", False):
+            self.__debug_extract_layout(image_page, layout, page_number, k_w, k_h, page, width, height)
+
+        # 1. extract textline objects and image (as LTImage)
         images = []
         layout_objects = [lobj for lobj in layout]
         lobjs_textline = []
@@ -87,19 +92,20 @@ def __handle_page(self, page: PDFPage, page_number: int, path: str, is_one_colum
                 lobjs_textline.extend(lines)
             elif isinstance(lobj, LTTextLineHorizontal):
                 lobjs_textline.append(lobj)
+
             elif isinstance(lobj, LTFigure) and not page_broken:
                 attachment = self.__extract_image(directory, height, image_page, k_h, k_w, lobj, page_number)
-
                 if attachment is not None:
                     images.append(attachment)
 
         bboxes = []
         for line_num, lobj in enumerate(lobjs_textline):
-            bbox = self.get_info_layout_object(lobj, page_num=page_number, line_num=line_num, k_w=k_w, k_h=k_h, height=height)
+            text_with_bbox = self.get_info_layout_object(lobj, page_num=page_number, line_num=line_num, k_w=k_w, k_h=k_h, height=height, width=width)
+            if text_with_bbox.bbox.width * text_with_bbox.bbox.height > 0:
+                bboxes.append(text_with_bbox)
 
-            if bbox.bbox.width * bbox.bbox.height > 0:
-                bboxes.append(bbox)
         attachments = images if len(images) < 10 else []
+
         return PageWithBBox(bboxes=bboxes, image=image_page, page_num=page_number, attachments=attachments)
 
     def __extract_image(self,
@@ -111,14 +117,13 @@ def __extract_image(self,
                         lobj: LTContainer,
                         page_number: int) -> Optional[PdfImageAttachment]:
         try:
-            bbox = self._create_bbox(k_h=k_h, k_w=k_w, height=height, lobj=lobj)
+            bbox = create_bbox(k_h=k_h, k_w=k_w, height=height, lobj=lobj)
             location = Location(bbox=bbox, page_number=page_number)
             cropped = image_page[bbox.y_top_left: bbox.y_bottom_right, bbox.x_top_left: bbox.x_bottom_right]
             uid = f"fig_{uuid.uuid1()}"
             file_name = f"{uid}.png"
             path_out = os.path.join(directory, file_name)
             Image.fromarray(cropped).save(path_out)
-            image_page[bbox.y_top_left: bbox.y_bottom_right, bbox.x_top_left: bbox.x_bottom_right] = 255
             attachment = PdfImageAttachment(original_name=file_name, tmp_file_path=path_out, need_content_analysis=False, uid=uid, location=location)
         except Exception as ex:
             self.logger.error(ex)
@@ -144,175 +149,157 @@ def __get_interpreter(self, is_one_column_document: bool) -> Tuple[PDFPageAggreg
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         return device, interpreter
 
-    def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, page_num: int, k_w: float, k_h: float, page: PDFPage) -> None:
-        """
-        Function for debugging of pdfminer.six layout
-        :param layout: container of layout element
-        :return: None
-        """
-        tmp_dir = os.path.join(self.config["path_debug"], "pdfminer")
-        if not os.path.exists(tmp_dir):
-            os.mkdir(tmp_dir)
-
-        file_text = open(os.path.join(tmp_dir, f"text_{page_num}.txt"), "wt")
-
-        # 1. extract layout objects
-        lobjs = [lobj for lobj in layout]
-        lobjs_textline = []
-        lobjs_box = []
-        lobjs_words = []
-        lobjs_figures = []
-        lobjs_images = []
-        lobjs_curves = []
-
-        for lobj in lobjs:
-            if isinstance(lobj, LTTextBoxHorizontal):
-                lobjs_textline.extend(lobj)
-            elif isinstance(lobj, LTTextLineHorizontal):
-                lobjs_textline.append(lobj)
-            elif isinstance(lobj, LTRect):
-                lobjs_box.append(lobj)
-            elif isinstance(lobj, LTFigure):
-                lobjs_figures.append(lobj)
-            elif isinstance(lobj, LTImage):
-                lobjs_images.append(lobj)
-            elif isinstance(lobj, LTCurve):
-                lobjs_curves.append(lobj)
-            elif isinstance(lobj, LTTextBox):
-                lobjs_words.append(lobj)
-
-        # 3. print information
-        self.__draw_layout_element(image_src, lobjs_textline, file_text, k_w, k_h, page, (0, 255, 0))
-        self.__draw_layout_element(image_src, lobjs_words, file_text, k_w, k_h, page, (0, 255, 0))
-        self.__draw_layout_element(image_src, lobjs_box, file_text, k_w, k_h, page, (0, 0, 255), text="LTRect")
-        self.__draw_layout_element(image_src, lobjs_figures, file_text, k_w, k_h, page, (255, 0, 0), text="LTFigure")
-        self.__draw_layout_element(image_src, lobjs_images, file_text, k_w, k_h, page, (0, 255, 255), text="LTImage")
-        self.__draw_layout_element(image_src, lobjs_curves, file_text, k_w, k_h, page, (0, 255, 255), text="LTCurve")
-
-        cv2.imwrite(os.path.join(tmp_dir, f"img_page_{page_num}.png"), image_src)
-        file_text.close()
-
-    def __draw_layout_element(self,
-                              image_src: np.ndarray,
-                              lobjs: List,
-                              file: IO,
-                              k_w: float,
-                              k_h: float,
-                              page: PDFPage,
-                              color: Tuple[int, int, int],
-                              text: Optional[str] = None) -> None:
-        for lobj in lobjs:
-            # converting coordinate from pdf format into image
-            box_lobj = ExtractorPdfTextLayer.convert_coordinates_pdf_to_image(lobj, k_w, k_h, page.mediabox[3])
-
-            cv2.rectangle(image_src, (box_lobj.x_top_left, box_lobj.y_top_left), (box_lobj.x_bottom_right, box_lobj.y_bottom_right), color)
-
-            if text is not None:
-                cv2.putText(image_src, text, (box_lobj.x_top_left, box_lobj.y_top_left), cv2.FONT_HERSHEY_SIMPLEX, 1, color)
-            else:
-                file.write(lobj.get_text())
-
-    @staticmethod
-    def convert_coordinates_pdf_to_image(lobj: LTContainer, k_w: float, k_h: float, height_page: int) -> BBox:
-        x0_new = int(lobj.x0 * k_w)
-        x1_new = int(lobj.x1 * k_w)
-        y0_new = int((height_page - lobj.y1) * k_h)
-        y1_new = int((height_page - lobj.y0) * k_h)
-
-        return BBox(x0_new, y0_new, x1_new - x0_new, y1_new - y0_new)
-
-    def get_info_layout_object(self, lobj: LTContainer, page_num: int, line_num: int, k_w: float, k_h: float, height: int) -> TextWithBBox:
+    def get_info_layout_object(self,
+                               lobj: LTContainer,
+                               page_num: int,
+                               line_num: int,
+                               k_w: float,
+                               k_h: float,
+                               height: int,
+                               width: int) -> TextWithBBox:
         # 1 - converting coordinate from pdf format into image
-        bbox = self._create_bbox(height, k_h, k_w, lobj)
+        bbox = create_bbox(height, k_h, k_w, lobj)
         # 2 - extract text and text annotations from current object
-        text, text_anns = self._get_style_and_text_from_layout_object(lobj)
-        return TextWithBBox(bbox=bbox, page_num=page_num, text=text, line_num=line_num, annotations=text_anns)
-
-    def _create_bbox(self, height: int, k_h: float, k_w: float, lobj: LTContainer) -> BBox:
-        curr_box_line = ExtractorPdfTextLayer.convert_coordinates_pdf_to_image(lobj, k_w, k_h, height)
-        bbox = BBox.from_two_points((curr_box_line.x_top_left, curr_box_line.y_top_left), (curr_box_line.x_bottom_right, curr_box_line.y_bottom_right))
-        return bbox
-
-    def _get_style_and_text_from_layout_object(self, lobj: LTContainer) -> [str, List[Annotation]]:
-
+        text = ""
+        annotations = []
         if isinstance(lobj, LTTextLineHorizontal):
             # cleaning text from (cid: *)
-            text = self._cleaning_text_from_hieroglyphics(lobj.get_text())
-            # get line's style
-            anns = self._get_line_style(lobj)
+            text = cleaning_text_from_hieroglyphics(lobj.get_text())
+            # get line's annotations
+            annotations = self.__get_line_annotations(lobj, k_w, k_h, height, width)
 
-            return text, anns
-        else:
-            return "", None
+        return TextWithBBox(bbox=bbox, page_num=page_num, text=text, line_num=line_num, annotations=annotations)
 
-    def _get_line_style(self, lobj: LTTextLineHorizontal) -> List[Annotation]:
-        # 1 - prepare data for groupby name
+    def __get_line_annotations(self, lobj: LTTextLineHorizontal, k_w: float, k_h: float, height: int, width: int) -> List[Annotation]:
+        # 1 - prepare data for group by name
         chars_with_style = []
         rand_weight = self._get_new_weight()
         prev_style = ""
+
         for lobj_char in lobj:
             if isinstance(lobj_char, LTChar) or isinstance(lobj_char, LTAnno):
+                # get styles
                 if len(chars_with_style) > 0:
                     # check next char different from previously then we fresh rand_weight
                     prev_style, prev_size = chars_with_style[-1].split("_rand_")
-                if isinstance(lobj_char, LTChar):
+
+                if isinstance(lobj_char, LTChar) and lobj_char.get_text() not in (" ", "\n", "\t"):
                     curr_style = f"{lobj_char.fontname}_{round(lobj_char.size, 0)}"
 
                     if curr_style != prev_style:
                         rand_weight = self._get_new_weight()
 
                     chars_with_style.append(f"{curr_style}_rand_{rand_weight}")
-                elif isinstance(lobj_char, LTAnno) and lobj_char.get_text() in (" ", "\n") and len(chars_with_style) > 0:
-                    # check on the space or \n (in pdfminer is type LTAnno)
+                elif lobj_char.get_text() in (" ", "\n", "\t") and len(chars_with_style) > 0:
+                    # check on the space or \n
                     # duplicated previous style
                     chars_with_style.append(chars_with_style[-1])
 
-        styles = []
-
-        # 2 - extract diapasons from the style char array (chars_with_style)
-        pointer_into_string = 0
+        annotations = self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width)
+        # 3 - extract range from chars_with_style array
+        char_pointer = 0
 
         for key, group in itertools.groupby(chars_with_style, lambda x: x):
             count_chars = len(list(group))
-            styles.extend(self.__parse_style_string(key, pointer_into_string, pointer_into_string + count_chars - 1))
-            pointer_into_string += count_chars
-
-        return styles
+            annotations.extend(self.__parse_style_string(key, char_pointer, char_pointer + count_chars - 1))
+            char_pointer += count_chars
 
-    def _cleaning_text_from_hieroglyphics(self, text_str: str) -> str:
-        """
-        replace all cid-codecs into ascii symbols. cid-encoding - hieroglyphic fonts
-        :param text_str: text
-        :return: text wo cids-chars
-        """
-        return re.sub(r"\(cid:(\d)*\)", self.cid_recognized, text_str)
+        return annotations
 
-    def cid_recognized(self, m: Match) -> str:
-        v = m.group(0)
-        v = v.strip("(")
-        v = v.strip(")")
-        ascii_num = v.split(":")[-1]
-        ascii_num = int(ascii_num)
-        text_val = chr(ascii_num)
-
-        return text_val
+    def __extract_words_bbox_annotation(self, lobj: LTTextContainer, k_w: float, k_h: float, height: int, width: int) -> List[Annotation]:
+        """Return one BBoxAnnotation per run of non-whitespace LTChar objects (a word) found in the given text line or container of lines."""
+        words: List[WordObj] = []
+        word: WordObj = WordObj(start=0, end=0, value=LTTextContainer())
+        if isinstance(lobj, LTTextLineHorizontal):
+            lobj = [lobj]  # a single line is wrapped so that the loop below always iterates over lines
+        for text_line in lobj:
+            for item, lobj_char in enumerate(text_line):
+                if isinstance(lobj_char, LTChar) and lobj_char.get_text() not in (" ", "\n", "\t"):
+                    word = word._replace(end=word.end + 1)  # namedtuples are immutable: rebuild with the end moved one character forward
+                    word.value.add(lobj_char)
+                elif lobj_char.get_text() in (" ", "\n", "\t"):
+                    if word.value._objs:  # keep the word only if it collected at least one char
+                        words.append(word)
+                    word = WordObj(start=item + 1, end=item + 1, value=LTTextContainer())  # the next word starts right after this whitespace
+        # the box of every word is computed from the container that holds its chars
+        annotations = [BBoxAnnotation(start=word.start,
+                                      end=word.end,
+                                      value=create_bbox(height=height, k_h=k_h, k_w=k_w, lobj=word.value),
+                                      page_width=width,
+                                      page_height=height) for word in words]
+        return annotations
 
     def _get_new_weight(self) -> str:
         return binascii.hexlify(os.urandom(8)).decode("ascii")
 
     def __parse_style_string(self, chars_with_meta: str, begin: int, end: int) -> List[Annotation]:
         # style parsing
-        line_anns = []
+        annotations = []
         prev_style, _ = chars_with_meta.split("_rand_")
         font, size, *_ = prev_style.split("_")
         fontname_wo_rand = font.split("+")[-1]
         styles = fontname_wo_rand.split("-")[-1]
+        annotations.append(StyleAnnotation(begin, end, value=fontname_wo_rand))
+
         if "Bold" in styles:
-            line_anns.append(BoldAnnotation(begin, end, value="True"))
+            annotations.append(BoldAnnotation(begin, end, value="True"))
         if "Italic" in styles:
-            line_anns.append(ItalicAnnotation(begin, end, value="True"))
-        line_anns.append(StyleAnnotation(begin, end, value=fontname_wo_rand))
+            annotations.append(ItalicAnnotation(begin, end, value="True"))
+
         if size.replace(".", "", 1).isnumeric():
-            line_anns.append(SizeAnnotation(begin, end, value=size))
+            annotations.append(SizeAnnotation(begin, end, value=size))
+
+        return annotations
+
+    def __debug_extract_layout(self, image_src: np.ndarray, layout: LTContainer, page_num: int, k_w: float, k_h: float, page: PDFPage,
+                               width: int, height: int) -> None:
+        """
+        Function for debugging of pdfminer.six layout
+        :param layout: container of layout element
+        :return: None
+        """
+        tmp_dir = os.path.join(self.config.get("path_debug"), "pdfminer")
+        os.makedirs(tmp_dir, exist_ok=True)
+
+        file_text = open(os.path.join(tmp_dir, f"text_{page_num}.txt"), "wt")
+
+        # 1. extract layout objects
+        lobjs = [lobj for lobj in layout]
+        lobjs_textline = []
+        lobjs_box = []
+        lobjs_words = []
+        lobjs_figures = []
+        lobjs_images = []
+        lobjs_curves = []
+        annotations = []
 
-        return line_anns
+        for lobj in lobjs:
+            if isinstance(lobj, LTTextBoxHorizontal):
+                annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width))
+                lobjs_textline.extend(lobj)
+            elif isinstance(lobj, LTTextLineHorizontal):
+                annotations.extend(self.__extract_words_bbox_annotation(lobj, k_w, k_h, height, width))
+                lobjs_textline.append(lobj)
+            elif isinstance(lobj, LTRect):
+                lobjs_box.append(lobj)
+            elif isinstance(lobj, LTFigure):
+                lobjs_figures.append(lobj)
+            elif isinstance(lobj, LTImage):
+                lobjs_images.append(lobj)
+            elif isinstance(lobj, LTCurve):
+                lobjs_curves.append(lobj)
+            elif isinstance(lobj, LTTextBox):
+                lobjs_words.append(lobj)
+        # 3. print information
+        draw_annotation(image_src, annotations)
+        """
+        Call for debugging other LT elements:
+        self.__draw_layout_element(image_src, lobjs_textline, file_text, k_w, k_h, page, (0, 255, 0))
+        self.__draw_layout_element(image_src, lobjs_words, file_text, k_w, k_h, page, (0, 255, 0))
+        self.__draw_layout_element(image_src, lobjs_box, file_text, k_w, k_h, page, (0, 0, 255), text="LTRect")
+        self.__draw_layout_element(image_src, lobjs_figures, file_text, k_w, k_h, page, (255, 0, 0), text="LTFigure")
+        self.__draw_layout_element(image_src, lobjs_images, file_text, k_w, k_h, page, (0, 255, 255), text="LTImage")
+        self.__draw_layout_element(image_src, lobjs_curves, file_text, k_w, k_h, page, (0, 255, 255), text="LTCurve")
+        """
+        cv2.imwrite(os.path.join(tmp_dir, f"img_page_{page_num}.png"), image_src)
+        file_text.close()
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_utils.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_utils.py
new file mode 100644
index 00000000..cc10c1af
--- /dev/null
+++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_utils.py
@@ -0,0 +1,73 @@
+import json
+import re
+from typing import IO, List, Match, Optional, Tuple
+
+import cv2
+import numpy as np
+from pdfminer.layout import LTContainer
+from pdfminer.pdfpage import PDFPage
+
+from dedoc.data_structures.bbox import BBox
+from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
+
+
+def draw_layout_element(image_src: np.ndarray, lobjs: List, file: IO, k_w: float, k_h: float, page: PDFPage, color: Tuple[int, int, int],
+                        text: Optional[str] = None) -> None:
+    """
+    Debug helper: draw the bounding box of every layout object from `lobjs` on `image_src`.
+
+    If `text` is given, it is drawn as a label at the box's top-left corner; otherwise the object's text is written to `file`.
+    Boxes are obtained with `convert_coordinates_pdf_to_image`, where `int(page.mediabox[3])` is passed as the page height.
+    """
+    for layout_object in lobjs:
+        # pdf coordinates -> image coordinates
+        bbox = convert_coordinates_pdf_to_image(layout_object, k_w, k_h, int(page.mediabox[3]))
+        top_left = (bbox.x_top_left, bbox.y_top_left)
+        bottom_right = (bbox.x_bottom_right, bbox.y_bottom_right)
+        cv2.rectangle(image_src, top_left, bottom_right, color)
+        if text is None:
+            file.write(layout_object.get_text())
+        else:
+            cv2.putText(image_src, text, top_left, cv2.FONT_HERSHEY_SIMPLEX, 1, color)
+
+
+def draw_annotation(image: np.ndarray, annotations: List[BBoxAnnotation]) -> None:
+    """Draw a green rectangle on `image` for each annotation; `value` is a JSON bbox with coordinates relative to the page size."""
+    for box in (json.loads(annotation.value) for annotation in annotations):
+        page_w, page_h, left, top = box["page_width"], box["page_height"], box["x_top_left"], box["y_top_left"]
+        right, bottom = left + box["width"], top + box["height"]
+        cv2.rectangle(image, (int(left * page_w), int(top * page_h)), (int(right * page_w), int(bottom * page_h)), (0, 255, 0))
+
+
+def convert_coordinates_pdf_to_image(lobj: LTContainer, k_w: float, k_h: float, height_page: int) -> BBox:
+    """
+    Build a BBox for `lobj`: x is scaled by `k_w`; y is flipped against `height_page` and then scaled by `k_h`.
+    """
+    left, right = int(lobj.x0 * k_w), int(lobj.x1 * k_w)
+    top, bottom = int((height_page - lobj.y1) * k_h), int((height_page - lobj.y0) * k_h)
+    return BBox(left, top, right - left, bottom - top)
+
+
+def create_bbox(height: int, k_h: float, k_w: float, lobj: LTContainer) -> BBox:
+    """Convert `lobj` to image coordinates and rebuild the BBox from its top-left and bottom-right corners."""
+    box = convert_coordinates_pdf_to_image(lobj, k_w, k_h, height)
+    return BBox.from_two_points((box.x_top_left, box.y_top_left), (box.x_bottom_right, box.y_bottom_right))
+
+
+def cleaning_text_from_hieroglyphics(text_str: str) -> str:
+    """
+    Replace each "(cid:<digits>)" marker with the character of that code; a marker without digits, e.g. "(cid:)", is left as is.
+    :param text_str: text that may contain cid markers
+    :return: text with the numbered cid markers replaced
+    """
+    return re.sub(r"\(cid:(\d+)\)", cid_to_ascii_text, text_str)
+
+
+def cid_to_ascii_text(m: Match) -> str:
+    """Turn a matched "(cid:<number>)" marker into chr(number); if the number is missing or not a valid code point, keep the marker."""
+    marker = m.group(0)
+    try:
+        # int("") raises ValueError; chr() raises ValueError outside range(0x110000) and OverflowError for huge ints
+        return chr(int(marker.strip("(").strip(")").split(":")[-1]))
+    except (ValueError, OverflowError):
+        return marker
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar
index 7899e232..b3d5eae8 100644
Binary files a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar differ
diff --git a/dedoc/scripts/benchmark_tl_correctness.py b/dedoc/scripts/benchmark_tl_correctness.py
index b51fdc6e..ae379c78 100644
--- a/dedoc/scripts/benchmark_tl_correctness.py
+++ b/dedoc/scripts/benchmark_tl_correctness.py
@@ -5,7 +5,7 @@
 
 import requests
 import wget
-from config import get_config
+from dedoc.config import get_config
 from tqdm import tqdm
 
 from dedoc.utils.utils import send_file
diff --git a/dedoc/scripts/calc_tesseract_benchmarks.py b/dedoc/scripts/calc_tesseract_benchmarks.py
index 0db13299..69f569c7 100644
--- a/dedoc/scripts/calc_tesseract_benchmarks.py
+++ b/dedoc/scripts/calc_tesseract_benchmarks.py
@@ -1,7 +1,5 @@
-import argparse
 import os
 import re
-import shutil
 import zipfile
 from tempfile import TemporaryDirectory
 from typing import Dict, List
@@ -9,17 +7,15 @@
 import cv2
 import numpy as np
 import pytesseract
+import wget
 from texttable import Texttable
 
-parser = argparse.ArgumentParser()
-parser.add_argument("--input_path", "-i", type=str, default="../../resources/benchmarks/data_tesseract_benchmarks.zip")
-parser.add_argument("--output_path", "-o", type=str, default="../../resources/benchmarks/")
-parser.add_argument("--log_path", "-l", type=str, default="/tmp/dedoc/benchamarks/tesseract/")
+from dedoc.config import get_config
 
 
 def _call_tesseract(image: np.ndarray, language: str, psm: int = 3) -> str:
-    config = "--psm {}".format(psm)
-    text = pytesseract.image_to_string(image, lang=language, output_type=pytesseract.Output.DICT, config=config)['text']
+    config = f"--psm {psm}"
+    text = pytesseract.image_to_string(image, lang=language, output_type=pytesseract.Output.DICT, config=config)["text"]
     return text
 
 
@@ -53,20 +49,14 @@ def _update_statistics_by_dataset(statistics: Dict, dataset: str, accuracy_path:
         matched = [line for line in lines if "Accuracy After Correction" in line]
         if not matched:
             matched = [line for line in lines if "Accuracy\n" in line]
-        acc_percent = re.findall(r'\d+\.\d+', matched[0])[0][:-1]
+        acc_percent = re.findall(r"\d+\.\d+", matched[0])[0][:-1]
         statistic["Accuracy"].append(float(acc_percent))
         statistic["Amount of words"].append(word_cnt)
 
-        statistic["ASCII_Spacing_Characters"] = _update_statistics_by_symbol_kind(statistic["ASCII_Spacing_Characters"],
-                                                                                  "ASCII Spacing Characters",
-                                                                                  lines)
-        statistic["ASCII_Special_Symbols"] = _update_statistics_by_symbol_kind(statistic["ASCII_Special_Symbols"],
-                                                                               "ASCII Special Symbols",
-                                                                               lines)
+        statistic["ASCII_Spacing_Characters"] = _update_statistics_by_symbol_kind(statistic["ASCII_Spacing_Characters"], "ASCII Spacing Characters", lines)
+        statistic["ASCII_Special_Symbols"] = _update_statistics_by_symbol_kind(statistic["ASCII_Special_Symbols"], "ASCII Special Symbols", lines)
         statistic["ASCII_Digits"] = _update_statistics_by_symbol_kind(statistic["ASCII_Digits"], "ASCII Digits", lines)
-        statistic["ASCII_Spacing_Characters"] = _update_statistics_by_symbol_kind(statistic["ASCII_Spacing_Characters"],
-                                                                                  "ASCII Spacing Characters",
-                                                                                  lines)
+        statistic["ASCII_Spacing_Characters"] = _update_statistics_by_symbol_kind(statistic["ASCII_Spacing_Characters"], "ASCII Spacing Characters", lines)
         statistic["Cyrillic"] = _update_statistics_by_symbol_kind(statistic["Cyrillic"], "Cyrillic", lines)
 
     statistics[dataset] = statistic
@@ -90,21 +80,27 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List:
 
 
 if __name__ == "__main__":
-    args = parser.parse_args()
+    base_zip = "data_tesseract_benchmarks"
+    output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks"))
+    cache_dir = os.path.join(get_config()["intermediate_data_path"], "tesseract_data")
+    os.makedirs(cache_dir, exist_ok=True)
+    benchmark_data_path = os.path.join(cache_dir, f"{base_zip}.zip")
+
+    if not os.path.isfile(benchmark_data_path):
+        wget.download("https://at.ispras.ru/owncloud/index.php/s/HqKt53BWmR8nCVG/download", benchmark_data_path)
+        print(f"Benchmark data downloaded to {benchmark_data_path}")
+    else:
+        print(f"Use cached benchmark data from {benchmark_data_path}")
+    assert os.path.isfile(benchmark_data_path)
+
     accs = [["Dataset", "Image name", "--psm", "Amount of words", "Accuracy OCR"]]
     accs_common = [["Dataset", "ASCII_Spacing_Chars", "ASCII_Special_Symbols", "ASCII_Digits",
                     "ASCII_Uppercase_Chars", "Latin1_Special_Symbols", "Cyrillic", "Amount of words", "AVG Accuracy"]]
-    base_zip = "data_tesseract_benchmarks"
-
     statistics = {}
 
-    if os.path.exists(args.log_path):
-        shutil.rmtree(args.log_path)
-    os.makedirs(args.log_path)
-
-    with zipfile.ZipFile(args.input_path, 'r') as arch_file:
+    with zipfile.ZipFile(benchmark_data_path, "r") as arch_file:
         names_dirs = [member.filename for member in arch_file.infolist() if member.file_size > 0]
-        abs_paths_to_files = [name.split('/')[:] for name in names_dirs]
+        abs_paths_to_files = [name.split("/")[:] for name in names_dirs]
 
         datasets = set([paths[1] for paths in abs_paths_to_files])
 
@@ -114,21 +110,19 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List:
 
             for img_name in sorted(imgs):
                 base_name, ext = os.path.splitext(img_name)
-                if ext not in ['.txt', '.png', '.tiff', '.tif', '.jpg']:
+                if ext not in [".txt", ".png", ".tiff", ".tif", ".jpg"]:
                     continue
 
-                gt_path = os.path.join(base_zip, dataset_name, "gts", base_name + ".txt")
+                gt_path = os.path.join(base_zip, dataset_name, "gts", f"{base_name}.txt")
                 imgs_path = os.path.join(base_zip, dataset_name, "imgs", img_name)
-                accuracy_path = os.path.join(args.log_path, dataset_name + "_" + base_name + "_accuracy.txt")
+                accuracy_path = os.path.join(cache_dir, f"{dataset_name}_{base_name}_accuracy.txt")
 
                 with TemporaryDirectory() as tmpdir:
                     tmp_gt_path = os.path.join(tmpdir, "tmp_gt.txt")
                     tmp_ocr_path = os.path.join(tmpdir, "tmp_ocr.txt")
 
                     try:
-                        with arch_file.open(gt_path) as gt_file, \
-                                open(tmp_gt_path, "wb") as tmp_gt_file,\
-                                open(tmp_ocr_path, "w") as tmp_ocr_file:
+                        with arch_file.open(gt_path) as gt_file, open(tmp_gt_path, "wb") as tmp_gt_file, open(tmp_ocr_path, "w") as tmp_ocr_file:
 
                             gt_text = gt_file.read().decode("utf-8")
                             word_cnt = len(gt_text.split())
@@ -146,7 +140,8 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List:
                             tmp_ocr_file.flush()
 
                             # calculation accuracy build for Ubuntu from source https://github.com/eddieantonio/ocreval
-                            command = "accuracy {} {} >> {}".format(tmp_gt_path, tmp_ocr_path, accuracy_path)
+                            accuracy_script_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "accuracy"))
+                            command = f"{accuracy_script_path} {tmp_gt_path} {tmp_ocr_path} >> {accuracy_path}"
                             os.system(command)
 
                             statistics = _update_statistics_by_dataset(statistics, dataset_name, accuracy_path, word_cnt)
@@ -154,6 +149,7 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List:
 
                     except Exception as ex:
                         print(ex)
+                        print("If you have problems with libutf8proc.so.2, try the command: `apt install -y libutf8proc-dev`")
 
     table_aacuracy_per_image = Texttable()
     table_aacuracy_per_image.add_rows(accs)
@@ -167,13 +163,12 @@ def _get_avg_by_dataset(statistics: Dict, dataset: str) -> List:
         accs_common.append(row)
     table_common.add_rows(accs_common)
 
-    with open(os.path.join(args.output_path, "tesseract.benchmark"), "w") as res_file:
-        res_file.write(
-            "Tesseract version is {}\nTable 1 - Accuracy for each file\n".format(pytesseract.get_tesseract_version()))
+    with open(os.path.join(output_dir, "tesseract_benchmark.txt"), "w") as res_file:
+        res_file.write(f"Tesseract version is {pytesseract.get_tesseract_version()}\nTable 1 - Accuracy for each file\n")
         res_file.write(table_aacuracy_per_image.draw())
-        res_file.write("\n\nTable 2 - AVG by each type of symbols:\n")
+        res_file.write(f"\n\nTable 2 - AVG by each type of symbols:\n")
         res_file.write(table_common.draw())
 
-    print("Tesseract version is {}".format(pytesseract.get_tesseract_version()))
+    print(f"Tesseract version is {pytesseract.get_tesseract_version()}")
     print(table_aacuracy_per_image.draw())
     print(table_common.draw())
diff --git a/docker-compose.yml b/docker-compose.yml
index ec6bab7c..3cfe4b62 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -5,7 +5,7 @@ services:
     mem_limit: 16G
     build:
       context: .
-      dockerfile: docker/Dockerfile
+      dockerfile: Dockerfile
     restart: always
     tty: true
     ports:
@@ -19,7 +19,7 @@ services:
       - dedoc
     build:
           context: .
-          dockerfile: docker/Dockerfile
+          dockerfile: Dockerfile
     tty: true
     environment:
       DOC_READER_HOST: "dedoc"
diff --git a/docker/DockerfileBaseimg b/docker/DockerfileBaseimg
deleted file mode 100644
index 3f05f37f..00000000
--- a/docker/DockerfileBaseimg
+++ /dev/null
@@ -1,59 +0,0 @@
-ARG REPOSITORY="docker.io"
-FROM ubuntu:bionic-20210118
-
-
-RUN apt-get update && apt-get install -y software-properties-common locales && locale-gen en_US.UTF-8
-RUN locale-gen ru_RU.UTF-8
-ENV TZ=Europe/Moscow
-RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-ENV LANG ru_RU.utf8
-ENV LANGUAGE ru_RU:ru
-ENV LC_ALL ru_RU.UTF-8
-
-# --------------------------------------------------PYTHON INSTALLATION-------------------------------------------------
-RUN apt-get update && \
-    apt-get -y install curl git unzip wget build-essential gcc-multilib g++-multilib git clang zlib1g-dev \
-                       pkg-config libglib2.0-dev python3 python3-pip libtool binutils-dev
-RUN curl https://repo.anaconda.com/archive/Anaconda3-2022.10-Linux-x86_64.sh --output anaconda.sh
-RUN bash anaconda.sh -b -p /anaconda3
-ENV PATH=/anaconda3/bin:$PATH
-RUN conda init bash
-RUN bash
-
-RUN apt-get install -y libreoffice
-
-# -----------------------------------------------TESSERACT INSTALLATION-------------------------------------------------
-# the commands below are used to install tesseract
-
-RUN add-apt-repository -y ppa:alex-p/tesseract-ocr-devel \
-    && apt update --allow-releaseinfo-change \
-    && apt-get install -y djvulibre-bin unrtf poppler-utils pstotext tesseract-ocr libjpeg-dev swig \
-     libtesseract-dev libleptonica-dev unrar python-poppler automake ca-certificates g++ libtool libleptonica-dev \
-     make pkg-config libpango1.0-dev
-
-RUN git clone --depth 1 --branch 5.0.0-beta-20210916 https://github.com/tesseract-ocr/tesseract/
-RUN cd tesseract && ./autogen.sh && ./configure &&  make &&  make install && ldconfig
-
-RUN apt update --allow-releaseinfo-change \
-    && apt-get install -y tesseract-ocr-rus build-essential libcairo2  \
-    libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 libffi-dev shared-mime-info
-ENV TESSDATA_PREFIX /usr/share/tesseract-ocr/5/tessdata/
-ENV PATH=/tesseract:$PATH
-
-# for reading j2k
-ENV OPENCV_IO_ENABLE_JASPER "true"
-
-# --------------------------------------------------DOCTR INSTALLATION--------------------------------------------------
-# ATTENTION: don't change an order of pip's package install here, otherwise you get conflicts
-# RUN pip install setuptools==60.10.0 cffi==1.15.0
-# RUN pip install python-doctr==0.5.1
-# We decided to stop using Doctr. If you need it, uncomment two lines above and comment one line below to make docker image with Doctr.
-
-RUN pip install pyclipper==1.3.0.post4 shapely==2.0.1 Pillow==9.2.0
-
-# ----------------------------------------SECURE TORCH & TORCHVISION INSTALLATION---------------------------------------
-RUN wget -O torch-1.11.0a0+git1911a63-cp39-cp39-linux_x86_64.whl https://at.ispras.ru/owncloud/index.php/s/gGZa46pboBlVZ7t/download
-RUN pip install torch-1.11.0a0+git1911a63-cp39-cp39-linux_x86_64.whl
-RUN wget -O torchvision-0.12.0a0+9b5a3fe-cp39-cp39-linux_x86_64.whl https://at.ispras.ru/owncloud/index.php/s/doFEAhID6OhNCkp/download
-RUN pip install torchvision-0.12.0a0+9b5a3fe-cp39-cp39-linux_x86_64.whl
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/anaconda3/lib/
diff --git a/docker/DockerfilePytorch b/docker/DockerfilePytorch
deleted file mode 100644
index 8e0ba6f7..00000000
--- a/docker/DockerfilePytorch
+++ /dev/null
@@ -1,45 +0,0 @@
-ARG REPOSITORY="docker.io"
-FROM ubuntu:bionic-20210118
-
-RUN apt-get update && \
-    apt-get install -y curl wget git vim clang python3 python3-pip \
-    build-essential gcc-multilib g++-multilib unzip
-
-RUN wget https://github.com/ninja-build/ninja/releases/download/v1.10.2/ninja-linux.zip \
-    && unzip ninja-linux.zip && mv ninja /usr/bin && rm ninja-linux.zip
-
-RUN curl -L -O https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-x86_64.sh && \
-    mkdir /cmake && \
-    bash cmake-3.22.1-linux-x86_64.sh --prefix=/cmake --exclude-subdir --skip-license && \
-    ln -s /cmake/bin/cmake /bin/cmake && \
-    rm cmake-3.22.1-linux-x86_64.sh
-
-# Clone target from GitHub.
-RUN pip3 install --upgrade pip && \
-    pip3 install scikit-build astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions
-
-ADD pytorch.tar.gz / # put the file pytorch.tar.gz to the root of the repository
-
-WORKDIR /
-RUN curl https://repo.anaconda.com/archive/Anaconda3-2022.10-Linux-x86_64.sh --output anaconda.sh
-RUN bash anaconda.sh -b -p /anaconda3
-ENV PATH=/anaconda3/bin:$PATH
-RUN conda init bash
-RUN bash
-
-RUN conda install astunparse numpy ninja pyyaml setuptools cmake cffi typing_extensions future six requests dataclasses mkl mkl-include
-
-WORKDIR /pytorch
-ENV CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
-RUN git config --global --add safe.directory '*'
-RUN python3 setup.py build
-RUN python3 setup.py bdist_wheel
-RUN pip3 install dist/torch-1.11.0a0+git1911a63-cp39-cp39-linux_x86_64.whl
-
-WORKDIR /
-RUN git clone https://github.com/pytorch/vision.git torchvision && cd /torchvision && \
-    git checkout v0.12.0
-WORKDIR /torchvision
-RUN python3 setup.py build
-RUN python3 setup.py bdist_wheel
-RUN pip3 install dist/torchvision-0.12.0a0+9b5a3fe-cp39-cp39-linux_x86_64.whl
diff --git a/docker/README.md b/docker/README.md
deleted file mode 100644
index b76b1183..00000000
--- a/docker/README.md
+++ /dev/null
@@ -1,28 +0,0 @@
-
-
-# How to change base image version for building dedoc using docker
-## Change the DockerfileBaseimg file
-
-This file is used for building an image with tesseract-ocr, libreoffice, secure pytorch and python tools in order to 
-reduce time for its building in the main docker/Dockerfile
-
-## Build the new baseimg image locally 
-
-Run the command below from the project root
-
-```shell
-export VERSION_TAG=$(date '+%Y_%m_%d')
-docker build -t dedocproject/baseimg:version_$VERSION_TAG -f docker/DockerfileBaseimg .
-```
-
-## Push the built image to the remote repository
-
-The commands below allow to push the image to the [docker-hub](https://hub.docker.com).
-You need login and password for this purpose. 
-
-```shell
-docker login -u dedocproject -p <password>
-docker tag dedocproject/baseimg:version_$VERSION_TAG dedocproject/baseimg:latest
-docker push dedocproject/baseimg:version_$VERSION_TAG
-docker push dedocproject/baseimg:latest
-```
\ No newline at end of file
diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
index 5fcf9363..ef7f5bf1 100644
--- a/docs/source/dedoc_api_usage/api.rst
+++ b/docs/source/dedoc_api_usage/api.rst
@@ -95,7 +95,7 @@ Api parameters description
         This type is used for choosing a specific structure constructor after document structure extraction.
 
     * - return_format
-      - json, pretty_json, html, tree
+      - json, pretty_json, html, plain_text, tree
       - json
       - The output format of the result data.
         The document structure from a structure constructor (see :class:`~dedoc.data_structures.ParsedDocument`)
@@ -107,6 +107,8 @@ Api parameters description
 
         * **html** -- :class:`~dedoc.data_structures.ParsedDocument` is transformed into html file with styles and headers according to the extracted annotations and structure;
 
+        * **plain_text** -- simple textual lines of the document;
+
         * **tree** -- simple document tree representation in html format (useful for structure visualization).
 
     * - :cspan:`3` **Attachments handling**
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index c835fbf8..42ae2ea8 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -73,7 +73,7 @@ If you don't need converters, you can skip this step.
 
 
 2. Install `Tesseract OCR 5` framework.
-You can try any tutorial for this purpose or look `here <https://github.com/ispras/dedoc/blob/master/docker/DockerfileBaseimg>`_
+You can try any tutorial for this purpose or look `here <https://github.com/ispras/dedockerfiles/blob/master/dedoc_p3.9_base.Dockerfile>`_
 to get the example of Tesseract installing for dedoc container.
 
 
diff --git a/resources/benchmarks/tesseract.benchmark b/resources/benchmarks/tesseract_benchmark.txt
similarity index 89%
rename from resources/benchmarks/tesseract.benchmark
rename to resources/benchmarks/tesseract_benchmark.txt
index 864ecaed..6a59d51a 100644
--- a/resources/benchmarks/tesseract.benchmark
+++ b/resources/benchmarks/tesseract_benchmark.txt
@@ -1,17 +1,16 @@
-Tesseract version is 5.0.0-alpha-20210401-94-ga968
-Table 1 - Accuracy for each file
+Tesseract version is 5.0.0
 +---------------+---------------------+-------+-----------------+--------------+
 |    Dataset    |     Image name      | --psm | Amount of words | Accuracy OCR |
 +===============+=====================+=======+=================+==============+
-| english-words | Kaspersky           | 6     | 111             | 99.400       |
+| english-words | Kaspersky           | 6     | 111             | 99.600       |
 +---------------+---------------------+-------+-----------------+--------------+
-| english-words | USB                 | 6     | 4               | 80.900       |
+| english-words | USB                 | 6     | 4               | 85.700       |
 +---------------+---------------------+-------+-----------------+--------------+
-| english-words | words1              | 6     | 19              | 99.200       |
+| english-words | words1              | 6     | 19              | 100          |
 +---------------+---------------------+-------+-----------------+--------------+
-| english-words | words2              | 6     | 9               | 98.400       |
+| english-words | words2              | 6     | 9               | 100          |
 +---------------+---------------------+-------+-----------------+--------------+
-| english-words | words3              | 6     | 9               | 98.100       |
+| english-words | words3              | 6     | 9               | 100          |
 +---------------+---------------------+-------+-----------------+--------------+
 | others        | Zaklyuchenie_nevrol | 4     | 525             | 83.800       |
 |               | oga_00              |       |                 |              |
@@ -19,45 +18,45 @@ Table 1 - Accuracy for each file
 | others        | Zaklyuchenie_nevrol | 4     | 241             | 88.800       |
 |               | oga_01              |       |                 |              |
 +---------------+---------------------+-------+-----------------+--------------+
-| others        | napalm_doc_2_2_6    | 4     | 124             | 86           |
+| others        | napalm_doc_2_2_6    | 4     | 124             | 85.500       |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | 1.620e+14           | 4     | 695             | 99.700       |
+| tz-npa        | 1.620e+14           | 4     | 695             | 99.800       |
 +---------------+---------------------+-------+-----------------+--------------+
 | tz-npa        | 1.620e+14           | 4     | 696             | 99.700       |
 +---------------+---------------------+-------+-----------------+--------------+
 | tz-npa        | 1.620e+14           | 4     | 699             | 99.800       |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | article_multiline   | 4     | 471             | 99.900       |
+| tz-npa        | article_multiline   | 4     | 471             | 100          |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | fstek17_00          | 4     | 192             | 95.200       |
+| tz-npa        | fstek17_00          | 4     | 192             | 95.300       |
 +---------------+---------------------+-------+-----------------+--------------+
 | tz-npa        | fstek17_01          | 4     | 332             | 99.700       |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | law_image           | 4     | 182             | 99.500       |
+| tz-npa        | law_image           | 4     | 182             | 99.600       |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | napalm_doc_13_2     | 4     | 243             | 97.500       |
+| tz-npa        | napalm_doc_13_2     | 4     | 243             | 97.600       |
 +---------------+---------------------+-------+-----------------+--------------+
 | tz-npa        | ukaz_prezidenta_1   | 4     | 264             | 99.800       |
 +---------------+---------------------+-------+-----------------+--------------+
 | tz-npa        | ukodeksrf_00        | 4     | 287             | 99.900       |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | ukodeksrf_01        | 4     | 340             | 99.500       |
+| tz-npa        | ukodeksrf_01        | 4     | 340             | 99.600       |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | with_applications_0 | 4     | 146             | 95.600       |
+| tz-npa        | with_applications_0 | 4     | 146             | 95.700       |
 |               | 0                   |       |                 |              |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | with_applications_0 | 4     | 276             | 99.500       |
+| tz-npa        | with_applications_0 | 4     | 276             | 99.600       |
 |               | 1                   |       |                 |              |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | with_applications_0 | 4     | 165             | 98.700       |
+| tz-npa        | with_applications_0 | 4     | 165             | 98.800       |
 |               | 2                   |       |                 |              |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | with_applications_0 | 4     | 90              | 99.400       |
+| tz-npa        | with_applications_0 | 4     | 90              | 99.600       |
 |               | 3                   |       |                 |              |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | ТЗ_00               | 4     | 78              | 97.700       |
+| tz-npa        | ТЗ_00               | 4     | 78              | 97.900       |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | ТЗ_01               | 4     | 296             | 98.200       |
+| tz-npa        | ТЗ_01               | 4     | 296             | 98.300       |
 +---------------+---------------------+-------+-----------------+--------------+
 | tz-npa        | ТЗ_02               | 4     | 309             | 98.800       |
 +---------------+---------------------+-------+-----------------+--------------+
@@ -69,27 +68,25 @@ Table 1 - Accuracy for each file
 +---------------+---------------------+-------+-----------------+--------------+
 | tz-npa        | ТЗ_06               | 4     | 219             | 93.500       |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | ТЗ_07               | 4     | 233             | 98.500       |
+| tz-npa        | ТЗ_07               | 4     | 233             | 98.600       |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | ТЗ_08               | 4     | 284             | 97.100       |
+| tz-npa        | ТЗ_08               | 4     | 284             | 97.200       |
 +---------------+---------------------+-------+-----------------+--------------+
-| tz-npa        | ТЗ_09               | 4     | 154             | 97.400       |
+| tz-npa        | ТЗ_09               | 4     | 154             | 97.500       |
 +---------------+---------------------+-------+-----------------+--------------+
-
-Table 2 - AVG by each type of symbols:
 +--------+--------+--------+--------+--------+--------+--------+-------+-------+
 | Datase | ASCII_ | ASCII_ | ASCII_ | ASCII_ | Latin1 | Cyrill | Amoun | AVG A |
 |   t    | Spacin | Specia | Digits | Upperc | _Speci |   ic   | t of  | ccura |
 |        | g_Char | l_Symb |        | ase_Ch | al_Sym |        | words |  cy   |
 |        |   s    |  ols   |        |  ars   |  bols  |        |       |       |
 +========+========+========+========+========+========+========+=======+=======+
-| englis | 89.280 | 99.333 | 100    | 0      | 0      | 94.540 | 152   | 95.20 |
+| englis | 100    | 99.333 | 100    | 0      | 0      | 94.540 | 152   | 97.06 |
 | h-     |        |        |        |        |        |        |       | 0     |
 | words  |        |        |        |        |        |        |       |       |
 +--------+--------+--------+--------+--------+--------+--------+-------+-------+
-| others | 90.567 | 77.400 | 89.533 | 0      | 0      | 86.433 | 890   | 86.20 |
-|        |        |        |        |        |        |        |       | 0     |
+| others | 90.967 | 79.867 | 89.533 | 0      | 0      | 86.133 | 890   | 86.03 |
+|        |        |        |        |        |        |        |       | 3     |
 +--------+--------+--------+--------+--------+--------+--------+-------+-------+
-| tz-npa | 98.824 | 91.064 | 92.076 | 0      | 0      | 99.480 | 7483  | 98.32 |
-|        |        |        |        |        |        |        |       | 8     |
+| tz-npa | 99.268 | 91.064 | 92.076 | 0      | 0      | 99.480 | 7483  | 98.39 |
+|        |        |        |        |        |        |        |       | 6     |
 +--------+--------+--------+--------+--------+--------+--------+-------+-------+
\ No newline at end of file
diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py
index 67d66b5a..d9d046e9 100644
--- a/tests/api_tests/test_api_format_pdf_tabby_reader.py
+++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py
@@ -2,6 +2,9 @@
 import unittest
 from typing import List
 
+from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
+from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
+from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
 from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
 
 
@@ -267,3 +270,13 @@ def test_pdf_with_tables(self) -> None:
         node = self._get_by_tree_path(tree, "0.4.2")
         self.assertEqual("list_item", node["metadata"]["paragraph_type"])
         self.assertEqual("3. В соответствии с полученной", node["text"].strip()[:30])
+
+    def test_pdf_annotations(self) -> None:
+        """Request Document635.pdf with the tabby reader and check the annotation names of the first top-level subparagraph."""
+        response = self._send_request("Document635.pdf", data=dict(pdf_with_text_layer="tabby"))
+        paragraphs = response["content"]["structure"]["subparagraphs"]
+        found_names = set()
+        for item in paragraphs[0]["annotations"]:
+            found_names.add(item["name"])
+        for annotation_class in (BoldAnnotation, SpacingAnnotation, BBoxAnnotation):
+            self.assertIn(annotation_class.name, found_names)
diff --git a/tests/api_tests/test_api_format_pdf_with_text.py b/tests/api_tests/test_api_format_pdf_with_text.py
index 407d97af..1f5c51e1 100644
--- a/tests/api_tests/test_api_format_pdf_with_text.py
+++ b/tests/api_tests/test_api_format_pdf_with_text.py
@@ -1,5 +1,4 @@
 import os
-import unittest
 from typing import List
 
 from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
@@ -13,24 +12,30 @@ def _get_abs_path(self, file_name: str) -> str:
     def __filter_by_name(self, annotations: List[dict], name: str) -> List[dict]:
         return [annotation for annotation in annotations if annotation["name"] == name]
 
-    @unittest.skip("TODO")
+    def __get_annotation_names(self, annotations: List[dict]) -> List[str]:
+        return [entry["name"] for entry in annotations]  # one name per annotation, input order and duplicates preserved
+
     def test_pdf_with_text_style(self) -> None:
         file_name = "diff_styles.pdf"
         result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", need_pdf_table_analysis="false"))
         tree = result["content"]["structure"]
         self._check_tree_sanity(tree)
 
-        node = self._get_by_tree_path(tree, "0.0")
+        node = self._get_by_tree_path(tree, "0.0.0")
         self.assertEqual("1.1TimesNewRomanItalicBold20\n", node["text"])
         self.assertIn({"start": 0, "end": 28, "name": "size", "value": "20.0"}, node["annotations"])
+        annotation_names = self.__get_annotation_names(node["annotations"])
+        self.assertListEqual(["bounding box", "style", "size", "color_annotation", "spacing"], annotation_names)
 
-        node = self._get_by_tree_path(tree, "0.1")
+        node = self._get_by_tree_path(tree, "0.0.0.0")
         annotations_size = self.__filter_by_name(name="size", annotations=node["annotations"])
         self.assertIn({"start": 0, "end": 26, "name": "size", "value": "16.0"}, annotations_size)
-        self.assertEqual(len(node["annotations"]), 5)
+        self.assertEqual(len(node["annotations"]), 6)
+        annotation_names = self.__get_annotation_names(node["annotations"])
         self.assertEqual("Different styles(Arial16):\n", node["text"])
+        self.assertListEqual(["bounding box", "bounding box", "style", "size", "color_annotation", "spacing"], annotation_names)
 
-        node = self._get_by_tree_path(tree, "0.2.2")
+        node = self._get_by_tree_path(tree, "0.1.2")
         self.assertEqual("3. TimesNewRomanItalic14, Calibri18, Tahoma16\n", node["text"])
         self.assertEqual("3. ", node["text"][0:3])
         self.assertIn({"start": 0, "end": 36, "name": "style", "value": "TimesNewRomanPSMT"}, node["annotations"])
@@ -44,7 +49,14 @@ def test_pdf_with_text_style(self) -> None:
         self.assertEqual("Tahoma16\n", node["text"][37:46])
         self.assertIn({"start": 37, "end": 45, "value": "Tahoma", "name": "style"}, node["annotations"])
         self.assertIn({"start": 37, "end": 45, "name": "size", "value": "16.0"}, node["annotations"])
-        self.assertEqual(9, len(node["annotations"]))
+        self.assertEqual(12, len(node["annotations"]))
+
+        word_bboxes = self.__filter_by_name(node["annotations"], "bounding box")
+        self.assertEqual(len(word_bboxes), 4)
+        self.assertEqual("3.", node["text"][word_bboxes[0]["start"]:word_bboxes[0]["end"]])
+        self.assertEqual("TimesNewRomanItalic14,", node["text"][word_bboxes[1]["start"]:word_bboxes[1]["end"]])
+        self.assertEqual("Calibri18,", node["text"][word_bboxes[2]["start"]:word_bboxes[2]["end"]])
+        self.assertEqual("Tahoma16", node["text"][word_bboxes[3]["start"]:word_bboxes[3]["end"]])
 
     def test_pdf_with_text_style_2(self) -> None:
         file_name = "2-column-state.pdf"
@@ -65,7 +77,6 @@ def test_pdf_with_text_style_2(self) -> None:
 
         self.assertIn("Pere Manils, Abdelberi Chaabane, Stevens Le Blond,", self._get_by_tree_path(tree, "0.1")["text"])
 
-    @unittest.skip("TODO")
     def test_pdf_with_2_columns_text(self) -> None:
         file_name = "2-column-state.pdf"
         result = self._send_request(file_name, dict(pdf_with_text_layer="true", document_type="", need_pdf_table_analysis="false"))
@@ -75,17 +86,17 @@ def test_pdf_with_2_columns_text(self) -> None:
         self.assertIn("Privacy of users in P2P networks goes far beyond their\n"
                       "current usage and is a fundamental requirement to the adop-\n"
                       "tion of P2P protocols for legal usage. In a climate of cold",
-                      self._get_by_tree_path(tree, "0.5")["text"])
+                      self._get_by_tree_path(tree, "0.4.1.2")["text"])
 
-        self.assertIn("Keywords", self._get_by_tree_path(tree, "0.6")["text"])
-        self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.7")["text"])
+        self.assertIn("Keywords", self._get_by_tree_path(tree, "0.4.1.3")["text"])
+        self.assertIn("Anonymizing Networks, Privacy, Tor, BitTorrent", self._get_by_tree_path(tree, "0.4.1.4")["text"])
 
-        self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.8.0.0")["text"])
+        self.assertIn("INTRODUCTION\n", self._get_by_tree_path(tree, "0.5.0.0")["text"])
         self.assertIn("The Tor network was designed to provide freedom\n"
                       "of speech by guaranteeing anonymous communications.\n"
                       "Whereas the cryptographic foundations of Tor, based on\n"
                       "onion-routing [3, 9, 22, 24], are known to be robust, identity",
-                      self._get_by_tree_path(tree, "0.8.0.1")["text"])
+                      self._get_by_tree_path(tree, "0.5.0.1")["text"])
 
     def test_pdf_with_2_columns_text_2(self) -> None:
         file_name = "liters_state.pdf"
diff --git a/tests/data/pdf_with_text_layer/big_table_with_merged_cells.pdf b/tests/data/pdf_with_text_layer/big_table_with_merged_cells.pdf
new file mode 100644
index 00000000..c1298ab4
Binary files /dev/null and b/tests/data/pdf_with_text_layer/big_table_with_merged_cells.pdf differ