Skip to content

Commit

Permalink
Personal/ranxia/fix html reader (#297)
Browse files Browse the repository at this point in the history
* fix html reader

* fix html reader

* fix html reader
  • Loading branch information
Ceceliachenen authored Dec 5, 2024
1 parent 65a5a1d commit 51dc01e
Show file tree
Hide file tree
Showing 8 changed files with 579 additions and 11 deletions.
18 changes: 11 additions & 7 deletions src/pai_rag/integrations/readers/pai_html_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@ def _convert_table_to_pai_table(self, table):
row_cells = [""] * max_cols
if current_row_index >= max_rows:
table_matrix.append(row_cells)
max_rows += 1
for cell in row.find_all(["th", "td"]):
if cell.name != "th":
row_header_flag = False
else:
elif cell.name == "th" and current_row_index != 0:
col_header_index_max = max(col_header_index_max, current_col_index)
cell_content = self._parse_cell_content(cell)
col_span = int(cell.get("colspan", 1))
Expand All @@ -96,16 +97,19 @@ def _convert_table_to_pai_table(self, table):
table_matrix[current_row_index][
current_col_index + i
] = cell_content

if current_row_index == 0:
max_cols += col_span
for i in range(1, row_span):
if current_row_index + i > max_rows:
if current_row_index + i >= max_rows:
row_cells = [""] * max_cols
table_matrix.append(row_cells)
table_matrix[current_row_index + i][
current_col_index
] = cell_content
max_rows += 1
table_matrix[current_row_index + i][
current_col_index
] = cell_content
max_rows = max(current_row_index + row_span, max_rows)
current_col_index += col_span
if current_row_index == 0:
max_cols += col_span
if row_header_flag:
row_headers_index.append(current_row_index)
current_row_index += 1
Expand Down
10 changes: 6 additions & 4 deletions src/pai_rag/utils/markdown_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,17 @@ def get_rows(self):
def get_column_headers(self):
    """Return the header columns of the table, one list per header column.

    Each entry of the result is built by taking, for one index in
    ``self.column_headers_index``, the value at that index from every row
    of ``self.data`` (i.e. the result is column-major).

    Returns:
        A list of columns (each a list of cell values), or an empty list
        when ``self.column_headers_index`` is unset or empty.
    """
    if not self.column_headers_index:
        return []
    # Walk column-by-column so every inner list is a full header column,
    # not a per-row slice of the header cells.
    return [[row[col] for row in self.data] for col in self.column_headers_index]

def get_columns(self):
    """Return the non-header columns of the table, one list per column.

    Data columns start right after the largest index listed in
    ``self.column_headers_index``; when no header columns are declared,
    every column from index 0 up to ``self.get_col_numbers()`` is returned.

    Returns:
        A list of columns, each holding that column's value from every row
        of ``self.data``.
    """
    if self.column_headers_index:
        # Skip past the right-most header column.
        data_col_start_index = max(self.column_headers_index) + 1
    else:
        data_col_start_index = 0
    return [
        [row[col] for row in self.data]
        for col in range(data_col_start_index, self.get_col_numbers())
    ]


Expand Down Expand Up @@ -113,6 +114,7 @@ def convert_table_to_markdown(table: PaiTable, total_cols: int) -> str:
if len(table.get_column_headers()) > 0:
headers = table.get_column_headers()
rows = table.get_columns()
total_cols = table.get_row_numbers()
else:
headers = table.get_row_headers()
rows = table.get_rows()
Expand Down
23 changes: 23 additions & 0 deletions tests/data_readers/test_html_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import os
from pathlib import Path
from pai_rag.core.rag_config_manager import RagConfigManager
from pai_rag.core.rag_module import resolve
from pai_rag.integrations.readers.pai.pai_data_reader import PaiDataReader
from pai_rag.integrations.readers.pai_html_reader import PaiHtmlReader

BASE_DIR = Path(__file__).parent.parent.parent


def test_pai_html_reader():
    """Load the HTML fixture directory through PaiHtmlReader and check the document count."""
    config_file = os.path.join(BASE_DIR, "src/pai_rag/config/settings.toml")
    config = RagConfigManager.from_file(config_file).get_value()
    directory_reader = resolve(
        cls=PaiDataReader,
        reader_config=config.data_reader,
    )
    # Anchor the fixture path on BASE_DIR (like config_file above) so the
    # test does not depend on the directory pytest is launched from.
    input_dir = os.path.join(BASE_DIR, "tests/testdata/data/html_data")

    # Route ".html" files to the reader under test.
    directory_reader.file_readers[".html"] = PaiHtmlReader()

    documents = directory_reader.load_data(file_path_or_directory=input_dir)
    assert len(documents) == 5

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions tests/testdata/data/html_data/AI写真计费说明.html

Large diffs are not rendered by default.

97 changes: 97 additions & 0 deletions tests/testdata/data/html_data/EAS计费说明.html

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions tests/testdata/data/html_data/多媒体分析计费说明.html

Large diffs are not rendered by default.

143 changes: 143 additions & 0 deletions tests/testdata/data/html_data/聚类模型评估.html

Large diffs are not rendered by default.

0 comments on commit 51dc01e

Please sign in to comment.