From ecad5ca5c24188d79ea09c40d3360867ac7bb2e1 Mon Sep 17 00:00:00 2001
From: Ceceliachenen <chenanyu.cay@alibaba-inc.com>
Date: Tue, 17 Dec 2024 16:35:51 +0800
Subject: [PATCH] add image ocr (#307)

* add image ocr

* fix typo error
---
 magic-pdf.gpu.template.json                   |   2 +-
 magic-pdf.template.json                       |   2 +-
 .../readers/utils/pai_image_ocr.py            | 149 ++++++++++++++++++
 3 files changed, 151 insertions(+), 2 deletions(-)
 create mode 100644 src/pai_rag/integrations/readers/utils/pai_image_ocr.py

diff --git a/magic-pdf.gpu.template.json b/magic-pdf.gpu.template.json
index dd5c2756..0723d54f 100644
--- a/magic-pdf.gpu.template.json
+++ b/magic-pdf.gpu.template.json
@@ -16,7 +16,7 @@
   },
   "table-config": {
     "model": "rapid_table",
-    "enable": false,
+    "enable": true,
     "max_time": 400
   },
   "config_version": "1.0.0"
diff --git a/magic-pdf.template.json b/magic-pdf.template.json
index e6d5c3e2..85d825e7 100644
--- a/magic-pdf.template.json
+++ b/magic-pdf.template.json
@@ -16,7 +16,7 @@
   },
   "table-config": {
     "model": "rapid_table",
-    "enable": false,
+    "enable": true,
     "max_time": 400
   },
   "config_version": "1.0.0"
diff --git a/src/pai_rag/integrations/readers/utils/pai_image_ocr.py b/src/pai_rag/integrations/readers/utils/pai_image_ocr.py
new file mode 100644
index 00000000..fbe90f18
--- /dev/null
+++ b/src/pai_rag/integrations/readers/utils/pai_image_ocr.py
@@ -0,0 +1,149 @@
+import os
+import cv2
+from loguru import logger
+from paddleocr import PPStructure
+from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
+
+
+def check_merge_method(in_region):
+    """Pick the paragraph-merging function for one layout region.
+
+    The choice depends on how far the first recognized line is indented from
+    the left edge of the region bbox, measured against that line's height.
+
+    Args:
+        in_region: Layout element with a "bbox" and a "res" list of OCR lines.
+
+    Returns:
+        convert_text_space_head when the indent exceeds the line height,
+        otherwise convert_text_space_tail. None when "res" is empty or is not
+        a list of lines, so callers can skip regions without line results
+        (NOTE(review): PPStructure table regions presumably carry a dict here).
+    """
+    lines = in_region["res"]
+    # Indexing a dict-shaped "res" with [0] would raise KeyError, so bail out.
+    if not isinstance(lines, (list, tuple)) or not lines:
+        return None
+    first_line_box = lines[0]["text_region"]
+    first_pt, opposite_pt = first_line_box[0], first_line_box[2]
+    first_line_height = abs(opposite_pt[1] - first_pt[1])
+    # Indent of the first line relative to the left edge of the whole region.
+    x1_distance = first_pt[0] - in_region["bbox"][0]
+    use_head = x1_distance > first_line_height
+    return convert_text_space_head if use_head else convert_text_space_tail
+
+
+def convert_text_space_head(in_region):
+    """Merge the OCR lines of a region whose paragraphs open with an indent.
+
+    Line texts are concatenated without a separator, and two newlines are
+    inserted before every line judged to open a new paragraph. The judgement
+    compares the left x of a line with the left x of the line before it,
+    using the height of the current line as the tolerance:
+
+    * previous line opened a paragraph: a left x within tolerance opens
+      another paragraph, a different left x continues the current one;
+    * previous line continued a paragraph: a left x within tolerance keeps
+      continuing, a different left x opens a new paragraph.
+
+    The first line is appended as is and counts as a paragraph opener.
+
+    Args:
+        in_region: Layout element whose "res" is a list of lines, each with
+            a "text" string and a "text_region" whose items 0 and 2 are
+            read as (x, y) points.
+
+    Returns:
+        The merged text of the region; an empty string when "res" is empty.
+    """
+    pieces = []
+    prev_left = None
+    # The first line is treated as the start of a paragraph.
+    opens_paragraph = True
+    for index, line in enumerate(in_region["res"]):
+        first_pt = line["text_region"][0]
+        opposite_pt = line["text_region"][2]
+        # The line height doubles as the tolerance for "same left edge".
+        tolerance = opposite_pt[1] - first_pt[1]
+        left = first_pt[0]
+        if index > 0:
+            aligned = abs(prev_left - left) < tolerance
+            # A break happens when "aligned" matches the previous state:
+            # opener followed by an aligned line, or body followed by a shift.
+            opens_paragraph = opens_paragraph == aligned
+            if opens_paragraph:
+                pieces.append("\n\n")
+        pieces.append(line["text"])
+        # Each line is compared only with its immediate predecessor.
+        prev_left = left
+    return "".join(pieces)
+
+
+def convert_text_space_tail(in_region):
+    """Merge the OCR lines of a region whose paragraphs end with a short line.
+
+    A line counts as full when its width reaches the region width minus the
+    line's own height. Two newlines are inserted before the first line and
+    before every line that follows a non-full line; all other lines are
+    appended directly to the preceding text.
+
+    Args:
+        in_region: Layout element with a "bbox" and a "res" list of lines,
+            each holding a "text" string and a "text_region" whose items 0
+            and 2 are read as (x, y) points.
+
+    Returns:
+        The merged text of the region; an empty string when "res" is empty.
+    """
+    region_box = in_region["bbox"]
+    region_width = region_box[2] - region_box[0]
+    pieces = []
+    starts_paragraph = True
+    for line in in_region["res"]:
+        first_pt = line["text_region"][0]
+        opposite_pt = line["text_region"][2]
+        line_width = opposite_pt[0] - first_pt[0]
+        line_height = opposite_pt[1] - first_pt[1]
+        if starts_paragraph:
+            pieces.append("\n\n")
+        pieces.append(line["text"])
+        # A line narrower than the full-width threshold ends its paragraph.
+        starts_paragraph = not (line_width >= region_width - line_height)
+    return "".join(pieces)
+
+
+def convert_info_to_text(res, image_name):
+    """Merge the text of every mergeable layout region into one string.
+
+    Nothing is written to disk; the merged text is returned to the caller.
+
+    Args:
+        res: Iterable of layout regions; each one is passed to
+            check_merge_method and then to the merge function it returns.
+        image_name: Image name, used only in the completion log message.
+
+    Returns:
+        The per-region texts joined by two newlines. Regions for which
+        check_merge_method returns a falsy value contribute nothing.
+    """
+    # Pair each region with its merge function; the generator keeps the
+    # original order: select for a region, merge it, then move to the next.
+    selected = ((check_merge_method(region), region) for region in res)
+    text_string = "\n\n".join(
+        merge_func(region) for merge_func, region in selected if merge_func
+    )
+
+    logger.info(f"finished processing image {image_name}")
+    return text_string
+
+
+def plain_image_ocr(image_path):
+    """Run PPStructure layout OCR on one image file and return its merged text."""
+    img = cv2.imread(image_path)
+    if img is None:  # imread returns None instead of raising when it cannot load
+        raise ValueError(f"cannot read image: {image_path}")
+    res = sorted_layout_boxes(PPStructure(recovery=True)(img), img.shape[1])
+    return convert_info_to_text(res, os.path.basename(image_path).split(".")[0])
+
+if __name__ == "__main__":  # smoke test; must not run OCR when imported
+    print(plain_image_ocr("tests/testdata/data/image_data/用户故事.jpg"))