Skip to content

Commit

Permalink
Personal/ranxia/pptx reader (#266)
Browse files Browse the repository at this point in the history
* pai ppt reader

* pai ppt reader & fix oss cache

* fix poetry

* pptx reader
  • Loading branch information
Ceceliachenen authored Nov 12, 2024
1 parent 4a7287f commit f42c565
Show file tree
Hide file tree
Showing 12 changed files with 323 additions and 19 deletions.
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ PAI-RAG is an easy-to-use opensource framework for modular RAG (Retrieval-Augmen
conda activate rag_env
```

If you use macOS and need to process PPTX files, run the following command to install the required dependency:

```bash
brew install mono-libgdiplus
```

### (1) CPU

Use poetry to install project dependency packages directly:
Expand Down Expand Up @@ -328,3 +334,15 @@ You can use data analysis based on database or sheet file in PAI-RAG, please ref
For more customization options, please refer to the documentation:
[Parameter Configuration Instruction](./docs/config_guide_en.md)
# Supported File Types
| File Type | File Format |
| ------------ | -------------------------------------- |
| Unstructured | .txt, .docx, .pdf, .html, .pptx, .md |
| Images | .gif, .jpg, .png, .jpeg, .webp |
| Structured | .csv, .xls, .xlsx, .jsonl |
| Others | .epub, .mbox, .ipynb |
1. .doc files need to be converted to .docx files.
2. .ppt and .pptm files need to be converted to .pptx files.
18 changes: 18 additions & 0 deletions README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ PAI-RAG 是一个易于使用的模块化 RAG(检索增强生成)开源框
conda activate rag_env
```

如果使用 macOS 且需要处理 PPTX 文件，请先执行以下命令安装所需的依赖库：

```bash
brew install mono-libgdiplus
```

### (1) CPU环境

直接使用poetry安装项目依赖包:
Expand Down Expand Up @@ -281,3 +287,15 @@ curl -X 'POST' http://127.0.0.1:8000/service/query -H "Content-Type: application
如需实现更多个性化配置,请参考文档:

[参数配置说明](./docs/config_guide_cn.md)

# 支持文件类型

| 文件类型 | 文件格式 |
| -------- | -------------------------------------- |
| 非结构化 | .txt, .docx, .pdf, .html, .pptx, .md |
| 图片 | .gif, .jpg, .png, .jpeg, .webp |
| 结构化 | .csv, .xls, .xlsx, .jsonl |
| 其他 | .epub, .mbox, .ipynb |

1. .doc格式文档需转化为.docx格式
2. .ppt和.pptm格式需转化为.pptx格式
46 changes: 44 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ peft = "^0.12.0"
duckduckgo-search = "6.2.12"
aliyun-bootstrap = "^1.0.1"
docx = "^0.2.4"
python-pptx = "^1.0.2"
aspose-slides = "^24.10.0"

[tool.poetry.scripts]
pai_rag = "pai_rag.main:run"
Expand Down
3 changes: 3 additions & 0 deletions pyproject_gpu.toml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ magic-pdf = {version = "0.7.0b1", extras = ["full"]}
llama-index-callbacks-arize-phoenix = "0.1.6"
peft = "^0.12.0"
aliyun-bootstrap = "^1.0.1"
docx = "^0.2.4"
python-pptx = "^1.0.2"
aspose-slides = "^24.10.0"

[tool.poetry.scripts]
pai_rag = "pai_rag.main:run"
Expand Down
18 changes: 18 additions & 0 deletions src/pai_rag/integrations/nodeparsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,24 @@ def _build_nodes_from_split(
)
nodes = []
cur_chunk_start_position = 0
if len(self._cut(raw_section_without_image)) == 0 and self.enable_multimodal:
for img_info in image_urls_positions:
image_node = ImageNode(
embedding=node.embedding,
image_url=img_info["image_url"],
excluded_embed_metadata_keys=node.excluded_embed_metadata_keys,
excluded_llm_metadata_keys=node.excluded_llm_metadata_keys,
metadata_seperator=node.metadata_seperator,
metadata_template=node.metadata_template,
text_template=node.text_template,
metadata={
"image_url": img_info["image_url"],
**node.extra_info,
},
relationships=relationships,
)
nodes.append(image_node)

for section_parts in self._cut(raw_section_without_image):
section_image_urls_positions = []
node_text = f"{current_header}: {section_parts}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class NodeParserConfig(BaseModel):


DOC_TYPES_DO_NOT_NEED_CHUNKING = set([".csv", ".xlsx", ".xls", ".jsonl"])
DOC_TYPES_CONVERT_TO_MD = set([".md", ".pdf", ".docx", ".htm", ".html"])
DOC_TYPES_CONVERT_TO_MD = set([".md", ".pdf", ".docx", ".htm", ".html", ".pptx"])
IMAGE_FILE_TYPES = set([".jpg", ".jpeg", ".png"])

IMAGE_URL_REGEX = re.compile(
Expand Down
5 changes: 5 additions & 0 deletions src/pai_rag/integrations/readers/pai/pai_data_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pai_rag.integrations.readers.pai_excel_reader import PaiPandasExcelReader
from pai_rag.integrations.readers.pai_jsonl_reader import PaiJsonLReader
from pai_rag.integrations.readers.pai_docx_reader import PaiDocxReader
from pai_rag.integrations.readers.pai_pptx_reader import PaiPptxReader

from llama_index.core.readers.base import BaseReader
from llama_index.core.readers import SimpleDirectoryReader
Expand Down Expand Up @@ -46,6 +47,10 @@ def get_file_readers(reader_config: BaseDataReaderConfig = None, oss_store: Any
enable_table_summary=reader_config.enable_table_summary,
oss_cache=oss_store, # Storing pdf images
),
".pptx": PaiPptxReader(
enable_table_summary=reader_config.enable_table_summary,
oss_cache=oss_store, # Storing pptx images
),
".csv": PaiPandasCSVReader(
concat_rows=reader_config.concat_csv_rows,
format_sheet_data_to_json=reader_config.format_sheet_data_to_json,
Expand Down
7 changes: 4 additions & 3 deletions src/pai_rag/integrations/readers/pai_docx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def _parse_row(self, row, doc_name, total_cols):
break
cell_content = self._parse_cell(cell, doc_name).strip()
row_cells[col_index] = cell_content
col_index += 1
return row_cells

def _parse_cell(self, cell, doc_name):
Expand All @@ -144,7 +145,7 @@ def _parse_cell_paragraph(self, paragraph, doc_name):
if not image_id:
continue
image_part = paragraph.part.rels.get(image_id, None)
if image_id:
if image_id and self._oss_cache:
image_blob = image_part.blob
image_filename = os.path.basename(image_part.partname)
image_url = self._transform_local_to_oss(
Expand Down Expand Up @@ -195,7 +196,7 @@ def convert_document_to_markdown(self, doc_path):
embed_id = blip.get(
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
)
if embed_id:
if embed_id and self._oss_cache:
image_part = document.part.related_parts.get(
embed_id
)
Expand All @@ -209,7 +210,7 @@ def convert_document_to_markdown(self, doc_path):
time_tag = int(time.time())
alt_text = f"pai_rag_image_{time_tag}_"
image_content = f"![{alt_text}]({image_url})"
markdown.append(f"{image_content}\n\n")
markdown.append(f"{image_content}\n\n")
markdown.append(self._convert_paragraph(paragraph))

elif isinstance(element.tag, str) and element.tag.endswith("tbl"): # 表格
Expand Down
16 changes: 9 additions & 7 deletions src/pai_rag/integrations/readers/pai_html_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,13 +157,15 @@ def _replace_image_paths(self, html_name: str, content: str):
image_pattern = IMAGE_URL_PATTERN
matches = re.findall(image_pattern, content)
for alt_text, image_url, image_type in matches:
time_tag = int(time.time())
oss_url = self._transform_local_to_oss(html_name, image_url)
updated_alt_text = f"pai_rag_image_{time_tag}_{alt_text}"
content = content.replace(
f"![{alt_text}]({image_url})", f"![{updated_alt_text}]({oss_url})"
)

if self._oss_cache:
time_tag = int(time.time())
oss_url = self._transform_local_to_oss(html_name, image_url)
updated_alt_text = f"pai_rag_image_{time_tag}_{alt_text}"
content = content.replace(
f"![{alt_text}]({image_url})", f"![{updated_alt_text}]({oss_url})"
)
else:
content = content.replace(f"![{alt_text}]({image_url})", "")
return content

def convert_html_to_markdown(self, html_path):
Expand Down
15 changes: 9 additions & 6 deletions src/pai_rag/integrations/readers/pai_pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,15 @@ def replace_image_paths(self, pdf_name: str, content: str):
local_image_pattern = IMAGE_LOCAL_PATTERN
matches = re.findall(local_image_pattern, content)
for alt_text, local_url, image_type in matches:
time_tag = int(time.time())
oss_url = self._transform_local_to_oss(pdf_name, local_url)
updated_alt_text = f"pai_rag_image_{time_tag}_{alt_text}"
content = content.replace(
f"![{alt_text}]({local_url})", f"![{updated_alt_text}]({oss_url})"
)
if self._oss_cache:
time_tag = int(time.time())
oss_url = self._transform_local_to_oss(pdf_name, local_url)
updated_alt_text = f"pai_rag_image_{time_tag}_{alt_text}"
content = content.replace(
f"![{alt_text}]({local_url})", f"![{updated_alt_text}]({oss_url})"
)
else:
content = content.replace(f"![{alt_text}]({local_url})", "")

return content

Expand Down
Loading

0 comments on commit f42c565

Please sign in to comment.