Skip to content

Commit

Permalink
tabular reader
Browse files Browse the repository at this point in the history
  • Loading branch information
Ceceliachenen committed Jun 5, 2024
1 parent 4cd4176 commit 8929e29
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 5 deletions.
54 changes: 49 additions & 5 deletions src/pai_rag/integrations/readers/pai_excel_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pathlib import Path
from typing import Any, Dict, List, Optional
from fsspec import AbstractFileSystem
from openpyxl import load_workbook

import pandas as pd
from llama_index.core.readers.base import BaseReader
Expand Down Expand Up @@ -48,18 +49,61 @@ def __init__(
self._row_joiner = row_joiner
self._pandas_config = pandas_config

def read_xlsx(
    self,
    file: Path,
    fs: Optional[AbstractFileSystem] = None,
) -> pd.DataFrame:
    """Read the first sheet of an Excel file and fill in merged cells.

    The first worksheet is parsed with pandas using ``self._pandas_config``.
    Every merged range in that sheet is then filled with the value of the
    range's top-left cell.

    Args:
        file: Path of the workbook to read.
        fs: Optional filesystem used to open ``file``; when omitted the
            path is handed directly to ``load_workbook``.

    Returns:
        The parsed DataFrame with merged data cells filled in.

    Note:
        Only the ``header`` option is taken into account when mapping sheet
        coordinates to DataFrame positions. Other options that shift rows or
        columns (e.g. ``skiprows``, ``index_col``, ``usecols``) are not
        compensated for here -- TODO confirm whether callers use them.
    """
    if fs:
        with fs.open(file) as f:
            workbook = load_workbook(f)
    else:
        workbook = load_workbook(file)
    excel = pd.ExcelFile(workbook, engine="openpyxl")
    sheet_name = excel.sheet_names[0]
    sheet = excel.book[sheet_name]
    df = excel.parse(sheet_name, **self._pandas_config)

    # Number of leading sheet rows consumed as column headers. When the
    # "header" key is absent the first row is treated as the header, and an
    # explicit None means there is no header row at all.
    header = self._pandas_config.get("header", 0)
    if header is None:
        header_rows = 0
    elif isinstance(header, list):
        header_rows = max(header) + 1
    elif isinstance(header, int):
        header_rows = header + 1
    else:
        header_rows = 1

    for merged in sheet.merged_cells:
        # bounds are 1-based and inclusive: (min_col, min_row, max_col, max_row).
        min_col, min_row, max_col, max_row = merged.bounds
        fill_value = merged.start_cell.value

        # Convert to 0-based half-open DataFrame positions, shifted past the
        # header rows.
        row_start = min_row - 1 - header_rows
        row_stop = max_row - header_rows

        # A range entirely inside the header has no data rows to fill.
        if row_stop <= 0:
            continue
        # A range that starts in the header must be clamped: a negative start
        # would make iloc count from the end of the frame and overwrite
        # unrelated trailing rows.
        row_start = max(row_start, 0)

        df.iloc[row_start:row_stop, min_col - 1 : max_col] = fill_value
    return df

def load_data(
self,
file: Path,
extra_info: Optional[Dict] = None,
fs: Optional[AbstractFileSystem] = None,
) -> List[Document]:
"""Parse Excel file. only process the first sheet"""
if fs:
with fs.open(file) as f:
df = pd.read_excel(f, sheet_name=0, **self._pandas_config)
else:
df = pd.read_excel(file, sheet_name=0, **self._pandas_config)

df = self.read_xlsx(file, fs)

text_list = df.apply(
lambda row: str(dict(zip(df.columns, row.astype(str)))), axis=1
Expand Down
Binary file not shown.

0 comments on commit 8929e29

Please sign in to comment.