Personal/ranxia/fix markdown reader and parser (#283)

* fix_markdown_parser * markdown reader and parser * markdown reader and parser * markdown reader and parser
aigc-apps · Nov 28, 2024 · 224648a · 224648a
1 parent e67599e
commit 224648a
Show file tree

Hide file tree

Showing 4 changed files with 148 additions and 110 deletions.
diff --git a/src/pai_rag/integrations/nodeparsers/base.py b/src/pai_rag/integrations/nodeparsers/base.py
@@ -176,7 +176,10 @@ def _build_nodes_from_split(
 
         for section_parts in self._cut(raw_section_without_image):
             section_image_urls_positions = []
-            node_text = f"{current_header}: {section_parts}"
+            if len(current_header) > 0:
+                node_text = f"{current_header}: {section_parts}"
+            else:
+                node_text = section_parts
             cur_chunk_end_position = cur_chunk_start_position + len(section_parts)
 
             for img_info in image_urls_positions:

diff --git a/src/pai_rag/integrations/readers/markdown_reader.py b/src/pai_rag/integrations/readers/markdown_reader.py
diff --git a/src/pai_rag/integrations/readers/pai/pai_data_reader.py b/src/pai_rag/integrations/readers/pai/pai_data_reader.py
@@ -2,7 +2,6 @@
 from typing import List, Any
 import os
 import pathlib
-from pai_rag.integrations.readers.markdown_reader import MarkdownReader
 from pai_rag.integrations.readers.pai_image_reader import PaiImageReader
 from pai_rag.integrations.readers.pai_pdf_reader import PaiPDFReader
 from pai_rag.integrations.readers.pai_html_reader import PaiHtmlReader
@@ -11,6 +10,7 @@
 from pai_rag.integrations.readers.pai_jsonl_reader import PaiJsonLReader
 from pai_rag.integrations.readers.pai_docx_reader import PaiDocxReader
 from pai_rag.integrations.readers.pai_pptx_reader import PaiPptxReader
+from pai_rag.integrations.readers.pai_markdown_reader import PaiMarkdownReader
 
 from llama_index.core.readers.base import BaseReader
 from llama_index.core.readers import SimpleDirectoryReader
@@ -52,6 +52,10 @@ def get_file_readers(reader_config: BaseDataReaderConfig = None, oss_store: Any
             enable_table_summary=reader_config.enable_table_summary,
             oss_cache=oss_store,  # Storing pptx images
         ),
+        ".md": PaiMarkdownReader(
+            enable_table_summary=reader_config.enable_table_summary,
+            oss_cache=oss_store,  # Storing markdown images
+        ),
         ".csv": PaiPandasCSVReader(
             concat_rows=reader_config.concat_csv_rows,
             format_sheet_data_to_json=reader_config.format_sheet_data_to_json,
@@ -67,7 +71,6 @@ def get_file_readers(reader_config: BaseDataReaderConfig = None, oss_store: Any
             format_sheet_data_to_json=reader_config.format_sheet_data_to_json,
             sheet_column_filters=reader_config.sheet_column_filters,
         ),
-        ".md": MarkdownReader(),
         ".jsonl": PaiJsonLReader(),
         ".jpg": image_reader,
         ".jpeg": image_reader,

diff --git a/src/pai_rag/integrations/readers/pai_markdown_reader.py b/src/pai_rag/integrations/readers/pai_markdown_reader.py
@@ -0,0 +1,139 @@
+"""Read markdown files.
+
+"""
+from pathlib import Path
+from PIL import Image
+from typing import Dict, List, Optional, Union, Any
+import re
+import time
+import os
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+from pai_rag.utils.markdown_utils import transform_local_to_oss
+
+from loguru import logger
+
+REGEX_H1 = "===+"
+REGEX_H2 = "---+"
+REGEX_USELESS_PHRASE = "\{#[0-9a-z]+\}"  # Only for aliyun docs
+IMAGE_URL_PATTERN = r"!\[(?P<alt_text>.*?)\]\((?P<url>(?:https?://[^\s()]+|[^\s()]+)\.(?P<image_type>jpg|jpeg|png|gif|bmp))\)"
+
+
+class PaiMarkdownReader(BaseReader):
+    def __init__(
+        self,
+        enable_table_summary: bool = False,
+        oss_cache: Any = None,
+    ) -> None:
+        self.enable_table_summary = enable_table_summary
+        self._oss_cache = oss_cache
+        logger.info(
+            f"PaiMarkdownReader created with enable_table_summary : {self.enable_table_summary}"
+        )
+
+    def replace_image_paths(self, markdown_name: str, content: str):
+        image_pattern = IMAGE_URL_PATTERN
+        matches = re.findall(image_pattern, content)
+        for alt_text, local_url, image_type in matches:
+            if self._oss_cache:
+                time_tag = int(time.time())
+                oss_url = self._transform_local_to_oss(markdown_name, local_url)
+                updated_alt_text = f"pai_rag_image_{time_tag}_{alt_text}"
+                if oss_url:
+                    content = content.replace(
+                        f"![{alt_text}]({local_url})",
+                        f"![{updated_alt_text}]({oss_url})",
+                    )
+            else:
+                content = content.replace(f"![{alt_text}]({local_url})", "")
+
+        return content
+
+    def _transform_local_to_oss(self, markdown_name: str, local_url: str):
+        try:
+            image = Image.open(local_url)
+            return transform_local_to_oss(self._oss_cache, image, markdown_name)
+        except Exception as e:
+            logger.error(f"read markdown local image failed: {e}")
+            return None
+
+    def parse_markdown(self, markdown_path):
+        markdown_name = os.path.basename(markdown_path).split(".")[0]
+        markdown_name = markdown_name.replace(" ", "_")
+        text = ""
+        pre_line = ""
+        with open(markdown_path) as fp:
+            line = fp.readline()
+            is_code = False
+            while line:
+                striped_line = re.sub(REGEX_USELESS_PHRASE, "", line)
+                if striped_line.startswith("```"):
+                    is_code = not is_code
+
+                if not striped_line:
+                    text += pre_line
+                    pre_line = "\n"
+                    line = fp.readline()
+                elif re.match(REGEX_H1, striped_line):
+                    text += f"# {pre_line}"
+                    pre_line = ""
+                    line = fp.readline()
+                elif re.match(REGEX_H2, striped_line):
+                    text += f"## {pre_line}"
+                    pre_line = ""
+                    line = fp.readline()
+                else:
+                    text += pre_line
+                    pre_line = striped_line
+                    if is_code or line.startswith("#") or line.endswith("  \n"):
+                        pre_line = f"{striped_line}\n"
+                    line = fp.readline()
+
+        text += pre_line
+        md_content = self.replace_image_paths(markdown_name, text)
+        return md_content
+
+    def load_data(
+        self,
+        file_path: Union[Path, str],
+        metadata: bool = True,
+        extra_info: Optional[Dict] = None,
+    ) -> List[Document]:
+        """Loads list of documents from Markdown file and also accepts extra information in dict format."""
+        return self.load(file_path, metadata=metadata, extra_info=extra_info)
+
+    def load(
+        self,
+        file_path: Union[Path, str],
+        metadata: bool = True,
+        extra_info: Optional[Dict] = None,
+    ) -> List[Document]:
+        """Loads list of documents from Markdown file and also accepts extra information in dict format.
+
+        Args:
+            file_path (Union[Path, str]): file path of Markdown file (accepts string or Path).
+            metadata (bool, optional): if metadata to be included or not. Defaults to True.
+            extra_info (Optional[Dict], optional): extra information related to each document in dict format. Defaults to None.
+
+        Raises:
+            TypeError: if extra_info is not a dictionary.
+            TypeError: if file_path is not a string or Path.
+
+        Returns:
+            List[Document]: list of documents.
+        """
+        md_content = self.parse_markdown(file_path)
+
+        logger.info(
+            f"[PaiMarkdownReader] successfully processed markdown file {file_path}."
+        )
+        docs = []
+        if metadata and extra_info:
+            extra_info = extra_info
+        else:
+            extra_info = dict()
+            logger.info(f"processed markdown file {file_path} without metadata")
+        doc = Document(text=md_content, extra_info=extra_info)
+        docs.append(doc)
+        logger.info(f"[PaiMarkdownReader] successfully loaded {len(docs)} nodes.")
+        return docs