Skip to content

Commit

Permalink
Fix fetching schema ranges in wide tables with object columns (posit-…
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Mar 28, 2024
1 parent 98ba2e3 commit 374bcc3
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def _get_schema(self, column_start: int, num_columns: int) -> TableSchema:
for i, (c, dtype) in enumerate(zip(columns_slice, dtypes_slice)):
if dtype == object:
column_index = i + column_start
if i not in self._inferred_dtypes:
if column_index not in self._inferred_dtypes:
self._inferred_dtypes[column_index] = infer_dtype(
self.table.iloc[:, column_index]
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,27 @@ def test_pandas_get_schema(pandas_fixture: PandasFixture):
assert result["columns"] == _wrap_json(ColumnSchema, bigger_schema[10:20])


def test_pandas_wide_schemas(pandas_fixture: PandasFixture):
    """Check schema slices of a wide, all-object table at every offset.

    Builds a 10000-column DataFrame whose columns all share one
    object-dtype array and registers it as ``wide_df``. For each
    consecutive 100-column window, the same slice of the frame is
    registered as its own table; the schema requested from ``wide_df``
    at that column offset must equal the schema requested from the
    standalone slice starting at column 0.
    """
    num_columns = 10000
    window = 100

    values = np.arange(10).astype(object)
    wide = pd.DataFrame({f"col_{i}": values for i in range(num_columns)})
    pandas_fixture.register_table("wide_df", wide)

    for chunk_index in range(num_columns // window):
        lo = chunk_index * window
        hi = lo + window
        chunk_name = f"wide_df_{chunk_index}"

        # Register the window as an independent table so its schema is
        # computed with column positions starting from zero.
        pandas_fixture.register_table(chunk_name, wide.iloc[:, lo:hi])

        actual = pandas_fixture.get_schema("wide_df", lo, window)
        reference = pandas_fixture.get_schema(chunk_name, 0, window)
        assert actual["columns"] == reference["columns"]


def _trim_whitespace(columns):
return [[x.strip() for x in column] for column in columns]

Expand Down

0 comments on commit 374bcc3

Please sign in to comment.