Personal/ranxia/pptx reader (#266)

* pai ppt reader * pai ppt reader & fix oss cache * fix poetry * pptx reader
aigc-apps · Nov 12, 2024 · f42c565 · f42c565
1 parent 4a7287f
commit f42c565
Show file tree

Hide file tree

Showing 12 changed files with 323 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -55,6 +55,12 @@ PAI-RAG is an easy-to-use opensource framework for modular RAG (Retrieval-Augmen
    conda activate rag_env
    ```
 
+   If you use macOS and need to process PPTX files, install the required dependency with the following command:
+
+   ```bash
+   brew install mono-libgdiplus
+   ```
+
 ### (1) CPU
 
 Use poetry to install project dependency packages directly:
@@ -328,3 +334,15 @@ You can use data analysis based on database or sheet file in PAI-RAG, please ref
 For more customization options, please refer to the documentation:
 
 [Parameter Configuration Instruction](./docs/config_guide_en.md)
+
+# Supported File Types
+
+| File Type    | File Format                            |
+| ------------ | -------------------------------------- |
+| Unstructured | .txt, .docx, .pdf, .html, .pptx, .md   |
+| Images       | .gif, .jpg, .png, .jpeg, .webp         |
+| Structured   | .csv, .xls, .xlsx, .jsonl              |
+| Others       | .epub, .mbox, .ipynb                   |
+
+1. .doc files need to be converted to .docx files.
+2. .ppt and .pptm files need to be converted to .pptx files.
diff --git a/README_zh.md b/README_zh.md
@@ -50,6 +50,12 @@ PAI-RAG 是一个易于使用的模块化 RAG（检索增强生成）开源框
    conda activate rag_env
    ```
 
+   如果使用macOS且需要处理PPTX文件，需要下载依赖库处理PPTX文件
+
+   ```bash
+   brew install mono-libgdiplus
+   ```
+
    ### (1) CPU环境
 
    直接使用poetry安装项目依赖包：
@@ -281,3 +287,15 @@ curl -X 'POST' http://127.0.0.1:8000/service/query -H "Content-Type: application
 如需实现更多个性化配置，请参考文档：
 
 [参数配置说明](./docs/config_guide_cn.md)
+
+# 支持文件类型
+
+| 文件类型 | 文件格式                               |
+| -------- | -------------------------------------- |
+| 非结构化 | .txt, .docx, .pdf, .html, .pptx, .md   |
+| 图片     | .gif, .jpg, .png, .jpeg, .webp         |
+| 结构化   | .csv, .xls, .xlsx, .jsonl              |
+| 其他     | .epub, .mbox, .ipynb                   |
+
+1. .doc格式文档需转化为.docx格式
+2. .ppt和.pptm格式需转化为.pptx格式
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -103,6 +103,8 @@ peft = "^0.12.0"
 duckduckgo-search = "6.2.12"
 aliyun-bootstrap = "^1.0.1"
 docx = "^0.2.4"
+python-pptx = "^1.0.2"
+aspose-slides = "^24.10.0"
 
 [tool.poetry.scripts]
 pai_rag = "pai_rag.main:run"

diff --git a/pyproject_gpu.toml b/pyproject_gpu.toml
@@ -95,6 +95,9 @@ magic-pdf = {version = "0.7.0b1", extras = ["full"]}
 llama-index-callbacks-arize-phoenix = "0.1.6"
 peft = "^0.12.0"
 aliyun-bootstrap = "^1.0.1"
+docx = "^0.2.4"
+python-pptx = "^1.0.2"
+aspose-slides = "^24.10.0"
 
 [tool.poetry.scripts]
 pai_rag = "pai_rag.main:run"

diff --git a/src/pai_rag/integrations/nodeparsers/base.py b/src/pai_rag/integrations/nodeparsers/base.py
@@ -156,6 +156,24 @@ def _build_nodes_from_split(
                 )
         nodes = []
         cur_chunk_start_position = 0
+        if len(self._cut(raw_section_without_image)) == 0 and self.enable_multimodal:
+            for img_info in image_urls_positions:
+                image_node = ImageNode(
+                    embedding=node.embedding,
+                    image_url=img_info["image_url"],
+                    excluded_embed_metadata_keys=node.excluded_embed_metadata_keys,
+                    excluded_llm_metadata_keys=node.excluded_llm_metadata_keys,
+                    metadata_seperator=node.metadata_seperator,
+                    metadata_template=node.metadata_template,
+                    text_template=node.text_template,
+                    metadata={
+                        "image_url": img_info["image_url"],
+                        **node.extra_info,
+                    },
+                    relationships=relationships,
+                )
+                nodes.append(image_node)
+
         for section_parts in self._cut(raw_section_without_image):
             section_image_urls_positions = []
             node_text = f"{current_header}: {section_parts}"

diff --git a/src/pai_rag/integrations/nodeparsers/pai/pai_node_parser.py b/src/pai_rag/integrations/nodeparsers/pai/pai_node_parser.py
@@ -39,7 +39,7 @@ class NodeParserConfig(BaseModel):
 
 
 DOC_TYPES_DO_NOT_NEED_CHUNKING = set([".csv", ".xlsx", ".xls", ".jsonl"])
-DOC_TYPES_CONVERT_TO_MD = set([".md", ".pdf", ".docx", ".htm", ".html"])
+DOC_TYPES_CONVERT_TO_MD = set([".md", ".pdf", ".docx", ".htm", ".html", ".pptx"])
 IMAGE_FILE_TYPES = set([".jpg", ".jpeg", ".png"])
 
 IMAGE_URL_REGEX = re.compile(

diff --git a/src/pai_rag/integrations/readers/pai/pai_data_reader.py b/src/pai_rag/integrations/readers/pai/pai_data_reader.py
@@ -9,6 +9,7 @@
 from pai_rag.integrations.readers.pai_excel_reader import PaiPandasExcelReader
 from pai_rag.integrations.readers.pai_jsonl_reader import PaiJsonLReader
 from pai_rag.integrations.readers.pai_docx_reader import PaiDocxReader
+from pai_rag.integrations.readers.pai_pptx_reader import PaiPptxReader
 
 from llama_index.core.readers.base import BaseReader
 from llama_index.core.readers import SimpleDirectoryReader
@@ -46,6 +47,10 @@ def get_file_readers(reader_config: BaseDataReaderConfig = None, oss_store: Any
             enable_table_summary=reader_config.enable_table_summary,
             oss_cache=oss_store,  # Storing pdf images
         ),
+        ".pptx": PaiPptxReader(
+            enable_table_summary=reader_config.enable_table_summary,
+            oss_cache=oss_store,  # Storing pptx images
+        ),
         ".csv": PaiPandasCSVReader(
             concat_rows=reader_config.concat_csv_rows,
             format_sheet_data_to_json=reader_config.format_sheet_data_to_json,

diff --git a/src/pai_rag/integrations/readers/pai_docx_reader.py b/src/pai_rag/integrations/readers/pai_docx_reader.py
@@ -122,6 +122,7 @@ def _parse_row(self, row, doc_name, total_cols):
                 break
             cell_content = self._parse_cell(cell, doc_name).strip()
             row_cells[col_index] = cell_content
+            col_index += 1
         return row_cells
 
     def _parse_cell(self, cell, doc_name):
@@ -144,7 +145,7 @@ def _parse_cell_paragraph(self, paragraph, doc_name):
                     if not image_id:
                         continue
                     image_part = paragraph.part.rels.get(image_id, None)
-                    if image_id:
+                    if image_id and self._oss_cache:
                         image_blob = image_part.blob
                         image_filename = os.path.basename(image_part.partname)
                         image_url = self._transform_local_to_oss(
@@ -195,7 +196,7 @@ def convert_document_to_markdown(self, doc_path):
                                     embed_id = blip.get(
                                         "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
                                     )
-                                    if embed_id:
+                                    if embed_id and self._oss_cache:
                                         image_part = document.part.related_parts.get(
                                             embed_id
                                         )
@@ -209,7 +210,7 @@ def convert_document_to_markdown(self, doc_path):
                                         time_tag = int(time.time())
                                         alt_text = f"pai_rag_image_{time_tag}_"
                                         image_content = f"![{alt_text}]({image_url})"
-                                markdown.append(f"{image_content}\n\n")
+                                        markdown.append(f"{image_content}\n\n")
                     markdown.append(self._convert_paragraph(paragraph))
 
             elif isinstance(element.tag, str) and element.tag.endswith("tbl"):  # 表格

diff --git a/src/pai_rag/integrations/readers/pai_html_reader.py b/src/pai_rag/integrations/readers/pai_html_reader.py
@@ -157,13 +157,15 @@ def _replace_image_paths(self, html_name: str, content: str):
         image_pattern = IMAGE_URL_PATTERN
         matches = re.findall(image_pattern, content)
         for alt_text, image_url, image_type in matches:
-            time_tag = int(time.time())
-            oss_url = self._transform_local_to_oss(html_name, image_url)
-            updated_alt_text = f"pai_rag_image_{time_tag}_{alt_text}"
-            content = content.replace(
-                f"![{alt_text}]({image_url})", f"![{updated_alt_text}]({oss_url})"
-            )
-
+            if self._oss_cache:
+                time_tag = int(time.time())
+                oss_url = self._transform_local_to_oss(html_name, image_url)
+                updated_alt_text = f"pai_rag_image_{time_tag}_{alt_text}"
+                content = content.replace(
+                    f"![{alt_text}]({image_url})", f"![{updated_alt_text}]({oss_url})"
+                )
+            else:
+                content = content.replace(f"![{alt_text}]({image_url})", "")
         return content
 
     def convert_html_to_markdown(self, html_path):

diff --git a/src/pai_rag/integrations/readers/pai_pdf_reader.py b/src/pai_rag/integrations/readers/pai_pdf_reader.py
@@ -69,12 +69,15 @@ def replace_image_paths(self, pdf_name: str, content: str):
         local_image_pattern = IMAGE_LOCAL_PATTERN
         matches = re.findall(local_image_pattern, content)
         for alt_text, local_url, image_type in matches:
-            time_tag = int(time.time())
-            oss_url = self._transform_local_to_oss(pdf_name, local_url)
-            updated_alt_text = f"pai_rag_image_{time_tag}_{alt_text}"
-            content = content.replace(
-                f"![{alt_text}]({local_url})", f"![{updated_alt_text}]({oss_url})"
-            )
+            if self._oss_cache:
+                time_tag = int(time.time())
+                oss_url = self._transform_local_to_oss(pdf_name, local_url)
+                updated_alt_text = f"pai_rag_image_{time_tag}_{alt_text}"
+                content = content.replace(
+                    f"![{alt_text}]({local_url})", f"![{updated_alt_text}]({oss_url})"
+                )
+            else:
+                content = content.replace(f"![{alt_text}]({local_url})", "")
 
         return content