Skip to content

Commit

Permalink
Fix xls parsing
Browse files: browse the repository at this point in the history
  • Loading branch information
moria97 committed Dec 23, 2024
1 parent a3171b9 commit f39e8b8
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 23 deletions.
2 changes: 1 addition & 1 deletion src/pai_rag/integrations/readers/pai_csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def __init__(
def load_data(
self,
file: Path,
chunk_size=800,
chunk_size=3000,
extra_info: Optional[Dict] = None,
fs: Optional[AbstractFileSystem] = None,
) -> List[Document]:
Expand Down
62 changes: 40 additions & 22 deletions src/pai_rag/integrations/readers/utils/pai_parse_workbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def split_row_group(row_group, headers=[], splitter=None, form_title=None):
Split a row group into smaller pieces.
"""
raw_text = ""
form_title = form_title + "\n"
form_title = form_title + "\n\n"
title_text = ""
if len(row_group) == 0:
return []
Expand All @@ -46,12 +46,18 @@ def split_row_group(row_group, headers=[], splitter=None, form_title=None):
is_same_value = False
break

is_outline_column.append(is_same_value)
if is_same_value:
if len(headers) == 0:
title_text += f"{first_value}\n\n"
column_text = f"{first_value}\n\n\n"
else:
column_text = f"{headers[j]}: {first_value}\n\n\n"

if len(column_text) <= 30:
title_text += column_text
else:
title_text += f"{headers[j]}: {first_value}\n\n"
is_same_value = False

is_outline_column.append(is_same_value)

for i in range(len(row_group)):
for j in range(len(row_group[0])):
Expand All @@ -66,7 +72,7 @@ def split_row_group(row_group, headers=[], splitter=None, form_title=None):
else:
raw_text += f"{headers[j]}: {row_group[i][j]}\n"

raw_text += "\n"
raw_text += "\n\n"

image_url_list = re.findall(IMAGE_REGEX, raw_text) + re.findall(
IMAGE_REGEX, title_text
Expand All @@ -79,19 +85,20 @@ def split_row_group(row_group, headers=[], splitter=None, form_title=None):
raw_text = re.sub(IMAGE_REGEX, "", raw_text)
title_text = re.sub(IMAGE_REGEX, "", title_text)

if len(raw_text) < 800 and len(title_text) < 800:
if len(raw_text) < 3000:
return [
Document(
text=form_title + raw_text + title_text,
text=form_title + title_text + raw_text,
extra_info={"image_info_list": image_info_list},
)
]
else:
return [
Document(
text=form_title + split, extra_info={"image_info_list": image_info_list}
text=form_title + title_text + split,
extra_info={"image_info_list": image_info_list},
)
for split in splitter.split_text(title_text + raw_text)
for split in splitter.split_text(raw_text)
]


Expand Down Expand Up @@ -137,13 +144,17 @@ def chunk_form(form_title, form_data, header_row=-1, splitter=None):
# Probe whether the next row can be merged into the current row group
while i + 1 < values.shape[0]:
should_merge = False
if values.shape[1] <= 2:
if values.shape[1] <= 1:
should_merge = True
else:
for j in range(values.shape[1]):
if values[i + 1][j] is not None:
if values[i + 1][j] == values[i][j]:
should_merge = True
if (
values[i + 1][j] is not None
and values[i + 1][j] != ""
and values[i + 1][j] == values[i][j]
):
should_merge = True
break

if should_merge:
row_group_values.append(values[i + 1])
Expand Down Expand Up @@ -360,15 +371,22 @@ def split_sheet_v2(sheet, oss_client, splitter):
anchor = image.anchor
i = anchor._from.row
j = anchor._from.col
image_url = oss_client.put_object_if_not_exists(
data=image._data(),
file_ext=f".{image.format}",
headers={
"x-oss-object-acl": "public-read"
}, # set public read to make image accessible
path_prefix="pairag/images/",
)
logger.info(f"Uploaded image to {image_url}.")

image_url = None
try:
image_url = oss_client.put_object_if_not_exists(
data=image._data(),
file_ext=f".{image.format}",
headers={
"x-oss-object-acl": "public-read"
}, # set public read to make image accessible
path_prefix="pairag/images/",
)
logger.info(f"Uploaded image to {image_url}.")
except Exception as ex:
logger.warning(f"Error occurred when upload image to OSS: {ex}")
continue

if len(sheet_data) == 0:
sheet_data = [[]]
if len(sheet_data[0]) == 0:
Expand Down

0 comments on commit f39e8b8

Please sign in to comment.