# Provenance: recovered from git patch ecad5ca5c24188d79ea09c40d3360867ac7bb2e1
# "add image ocr (#307)" by Ceceliachenen, Tue, 17 Dec 2024, which creates
# src/pai_rag/integrations/readers/utils/pai_image_ocr.py.
# The same patch also changes "table-config" -> "enable" from false to true in
# magic-pdf.template.json and magic-pdf.gpu.template.json; those two JSON hunks
# cannot be expressed in this Python module and must be applied separately.
"""Plain-image OCR: run PP-Structure on an image and return its text.

The paragraph-merging helpers decide where to insert paragraph breaks
("\\n\\n") between the OCR lines of one layout region, based purely on the
geometry of each line's ``text_region`` relative to the region ``bbox``.
"""
import os

import cv2
from loguru import logger
from paddleocr import PPStructure
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes


def check_merge_method(in_region):
    """Select the function used to merge a text region into paragraphs.

    The choice depends on how far the first recognized line is indented
    from the left edge of the region's bounding box.

    Args:
        in_region: A text-type element of the layout result. Must provide
            ``"bbox"`` (indexable, x1 at index 0) and ``"res"``, a sequence
            of line dicts each holding a ``"text_region"`` whose points 0
            and 2 are read as opposite corners.

    Returns:
        ``convert_text_space_head`` when the first line starts more than one
        line-height to the right of the region's left edge, otherwise
        ``convert_text_space_tail``. ``None`` when the region has no lines.
    """
    lines = in_region["res"]
    if len(lines) == 0:
        return None

    region_left = in_region["bbox"][0]
    first_box = lines[0]["text_region"]
    corner_a, corner_b = first_box[0], first_box[2]
    first_line_height = abs(corner_b[1] - corner_a[1])
    indent = corner_a[0] - region_left

    if indent > first_line_height:
        return convert_text_space_head
    return convert_text_space_tail


def convert_text_space_head(in_region):
    """Merge the lines of a region whose paragraphs start with an indent.

    A paragraph boundary is detected from a change in the left x coordinate
    between consecutive lines (compared against the current line's height).

    Args:
        in_region: A text-type element of the layout result; only
            ``in_region["res"]`` is read.

    Returns:
        The concatenated line texts, with ``"\\n\\n"`` inserted before each
        line that starts a new paragraph. Empty string when there are no
        lines.
    """
    pieces = []
    prev_x = None
    # Whether the previously emitted line was the first line of a paragraph.
    prev_starts_paragraph = True

    for index, line in enumerate(in_region["res"]):
        top_left = line["text_region"][0]
        bottom_right = line["text_region"][2]
        x1 = top_left[0]

        if index > 0:
            height = bottom_right[1] - top_left[1]
            aligned = abs(prev_x - x1) < height
            # After an (indented) first line, an equally indented line is
            # another paragraph start; after a body line, a line whose x
            # differs is a paragraph start. Both cases reduce to equality.
            starts_paragraph = aligned == prev_starts_paragraph
            if starts_paragraph:
                pieces.append("\n\n")
            prev_starts_paragraph = starts_paragraph

        pieces.append(line["text"])
        prev_x = x1

    return "".join(pieces)


def convert_text_space_tail(in_region):
    """Merge the lines of a region whose paragraphs end with a short line.

    A line is considered "full" when its width reaches the region width
    minus one line height; the line following a non-full line starts a new
    paragraph.

    Args:
        in_region: A text-type element of the layout result; ``"bbox"``
            (x1 at index 0, x2 at index 2) and ``"res"`` are read.

    Returns:
        The concatenated line texts. Every paragraph, including the very
        first one, is preceded by ``"\\n\\n"``, so a non-empty result always
        begins with a blank-line separator.
    """
    region_box = in_region["bbox"]
    region_width = region_box[2] - region_box[0]

    pieces = []
    starts_paragraph = True
    for line in in_region["res"]:
        top_left = line["text_region"][0]
        bottom_right = line["text_region"][2]
        line_width = bottom_right[0] - top_left[0]
        line_height = bottom_right[1] - top_left[1]

        if starts_paragraph:
            pieces.append("\n\n")
        pieces.append(line["text"])

        # A short (non-full) line closes the paragraph.
        starts_paragraph = line_width < region_width - line_height

    return "".join(pieces)


def convert_info_to_text(res, image_name):
    """Convert a sorted layout result into a single text string.

    Args:
        res: Iterable of layout regions (dicts with at least ``"res"`` and,
            for text regions, ``"bbox"``).
        image_name: Name of the processed image; used only in the log line.

    Returns:
        The text of all non-empty regions joined by ``"\\n\\n"``.
    """
    text_list = []

    for region in res:
        content = region["res"]
        if isinstance(content, dict):
            # check_merge_method only supports text-type regions (a list of
            # line dicts); indexing a dict with [0] would raise KeyError.
            # NOTE(review): PP-Structure table regions are presumed to be
            # dicts carrying an "html" field, as consumed by PaddleOCR's own
            # recovery code — confirm against the installed version.
            html = content.get("html")
            if html:
                text_list.append(html)
            continue

        merge_func = check_merge_method(region)
        if merge_func:
            text_list.append(merge_func(region))

    text_string = "\n\n".join(text_list)

    logger.info(f"finished processing image {image_name}")
    return text_string


def plain_image_ocr(image_path):
    """Run layout analysis + OCR on an image file and return its text.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.

    Returns:
        The recognized text as produced by ``convert_info_to_text``.

    Raises:
        FileNotFoundError: If ``image_path`` does not point to a file.
        ValueError: If the file exists but cannot be decoded as an image.
    """
    # splitext keeps dots inside the stem ("a.b.jpg" -> "a.b").
    image_name = os.path.splitext(os.path.basename(image_path))[0]

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"image file not found: {image_path}")
        raise ValueError(f"failed to decode image: {image_path}")

    # NOTE(review): a new PPStructure engine is built on every call; consider
    # reusing one instance if this is called for many images.
    result = PPStructure(recovery=True)(img)
    width = img.shape[1]
    res = sorted_layout_boxes(result, width)
    return convert_info_to_text(res, image_name)


if __name__ == "__main__":
    # Manual smoke test; guarded so importing the module does not run OCR.
    print(plain_image_ocr("tests/testdata/data/image_data/用户故事.jpg"))