diff --git a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py index 2e05290eed9..881b773f0ff 100644 --- a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py +++ b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py @@ -48,6 +48,9 @@ SearchSchemaResult, SetRowFiltersRequest, SetSortColumnsRequest, + SummaryStatsBoolean, + SummaryStatsNumber, + SummaryStatsString, TableData, TableSchema, TableShape, @@ -104,9 +107,7 @@ def _recompute_if_needed(self) -> bool: return False def get_schema(self, request: GetSchemaRequest): - return self._get_schema( - request.params.start_index, request.params.num_columns - ).dict() + return self._get_schema(request.params.start_index, request.params.num_columns).dict() def search_schema(self, request: SearchSchemaRequest): return self._search_schema( @@ -294,9 +295,11 @@ def __init__( # search term changes, we discard the last search result. We # might add an LRU cache here or something if it helps # performance. - self._search_schema_last_result: Optional[ - Tuple[str, List[ColumnSchema]] - ] = None + self._search_schema_last_result: Optional[Tuple[str, List[ColumnSchema]]] = None + + # squelch a warning from pandas 2.2.0 about the use below of + # fillna + pd_.set_option("future.no_silent_downcasting", True) def invalidate_computations(self): self.filtered_indices = self.view_indices = None @@ -341,12 +344,7 @@ def _get_schema(self, column_start: int, num_columns: int) -> TableSchema: column_start, min(column_start + num_columns, len(self.table.columns)), ): - column_raw_name = self.table.columns[column_index] - column_name = str(column_raw_name) - - col_schema = self._get_single_column_schema( - column_index, column_name - ) + col_schema = self._get_single_column_schema(column_index) column_schemas.append(col_schema) return TableSchema(columns=column_schemas) @@ -372,9 +370,7 @@ def _search_schema( total_num_matches=len(matches), ) - def _search_schema_get_matches( - self, search_term: str - ) -> List[ColumnSchema]: + def _search_schema_get_matches(self, search_term: str) -> List[ColumnSchema]: matches = [] for column_index in range(len(self.table.columns)): column_raw_name = self.table.columns[column_index] @@ -384,9 +380,7 @@ def _search_schema_get_matches( if search_term not in column_name.lower(): continue - col_schema = self._get_single_column_schema( - column_index, column_name - ) + col_schema = self._get_single_column_schema(column_index) matches.append(col_schema) return matches @@ -395,12 +389,13 @@ def _get_inferred_dtype(self, column_index: int): from pandas.api.types import infer_dtype if column_index not in self._inferred_dtypes: - self._inferred_dtypes[column_index] = infer_dtype( - self.table.iloc[:, column_index] - ) + self._inferred_dtypes[column_index] = infer_dtype(self.table.iloc[:, column_index]) return self._inferred_dtypes[column_index] - def _get_single_column_schema(self, column_index: int, column_name: str): + def _get_single_column_schema(self, column_index: int): + column_raw_name = self.table.columns[column_index] + column_name = str(column_raw_name) + # TODO: pandas MultiIndex columns # TODO: time zone for datetimetz datetime64[ns] types dtype = self.dtypes.iloc[column_index] @@ -443,16 +438,15 @@ def _get_data_values( if self.view_indices is not None: # If the table is either filtered or sorted, use a slice - # the view_indices to select the virtual range of values for the grid + # the view_indices to select the virtual range of values + # for the grid view_slice = self.view_indices[row_start : row_start + num_rows] columns = [col.take(view_slice) for col in columns] indices = self.table.index.take(view_slice) else: # No filtering or sorting, just slice directly indices = self.table.index[row_start : row_start + num_rows] - columns = [ - col.iloc[row_start : row_start + num_rows] for col in columns - ] + columns = [col.iloc[row_start : row_start + num_rows] for col in columns] formatted_columns = [_pandas_format_values(col) for col in columns] @@ -560,7 +554,7 @@ def _eval_filter(self, filt: RowFilter): # Nulls are possible in the mask, so we just fill them if any if mask.dtype != bool: - mask = mask.fillna(False) + mask = mask.fillna(False).infer_objects(copy=False) return mask.to_numpy() @@ -585,9 +579,7 @@ def _sort_data(self) -> None: self.view_indices = self.filtered_indices.take(sort_indexer) else: # Data is not filtered - self.view_indices = nargsort( - column, kind="mergesort", ascending=key.ascending - ) + self.view_indices = nargsort(column, kind="mergesort", ascending=key.ascending) elif len(self.sort_keys) > 1: # Multiple sorting keys cols_to_sort = [] @@ -618,7 +610,63 @@ def _prof_null_count(self, column_index: int): return self._get_column(column_index).isnull().sum() def _prof_summary_stats(self, column_index: int): - raise NotImplementedError + col_schema = self._get_single_column_schema(column_index) + col = self._get_column(column_index) + + ui_type = col_schema.type_display + handler = self._SUMMARIZERS.get(ui_type) + + if handler is None: + # Return nothing for types we don't yet know how to summarize + return ColumnSummaryStats(type_display=ui_type) + else: + return handler(col) + + @staticmethod + def _summarize_number(col: "pd.Series"): + min_value = col.min() + max_value = col.max() + mean = col.mean() + median = col.median() + stdev = col.std() + + return ColumnSummaryStats( + type_display=ColumnDisplayType.Number, + number_stats=SummaryStatsNumber( + min_value=str(min_value), + max_value=str(max_value), + mean=str(mean), + median=str(median), + stdev=str(stdev), + ), + ) + + @staticmethod + def _summarize_string(col: "pd.Series"): + num_empty = (col.str.len() == 0).sum() + num_unique = col.nunique() + + return ColumnSummaryStats( + type_display=ColumnDisplayType.String, + string_stats=SummaryStatsString(num_empty=num_empty, num_unique=num_unique), + ) + + @staticmethod + def _summarize_boolean(col: "pd.Series"): + null_count = col.isnull().sum() + true_count = col.sum() + false_count = len(col) - true_count - null_count + + return ColumnSummaryStats( + type_display=ColumnDisplayType.Boolean, + boolean_stats=SummaryStatsBoolean(true_count=true_count, false_count=false_count), + ) + + _SUMMARIZERS = { + ColumnDisplayType.Boolean: _summarize_boolean, + ColumnDisplayType.Number: _summarize_number, + ColumnDisplayType.String: _summarize_string, + } def _prof_freq_table(self, column_index: int): raise NotImplementedError @@ -628,9 +676,7 @@ def _prof_histogram(self, column_index: int): def _get_state(self) -> TableState: return TableState( - table_shape=TableShape( - num_rows=self.table.shape[0], num_columns=self.table.shape[1] - ), + table_shape=TableShape(num_rows=self.table.shape[0], num_columns=self.table.shape[1]), row_filters=self.filters, sort_keys=self.sort_keys, ) @@ -817,9 +863,7 @@ def handle_variable_updated(self, variable_name, new_variable): for comm_id in list(self.path_to_comm_ids[path]): self._update_explorer_for_comm(comm_id, path, new_variable) - def _update_explorer_for_comm( - self, comm_id: str, path: PathKey, new_variable - ): + def _update_explorer_for_comm(self, comm_id: str, path: PathKey, new_variable): """ If a variable is updated, we have to handle the different scenarios: @@ -853,9 +897,7 @@ def _update_explorer_for_comm( # data explorer open for a nested value, then we need to use # the same variables inspection logic to resolve it here. if len(path) > 1: - is_found, new_table = _resolve_value_from_path( - new_variable, path[1:] - ) + is_found, new_table = _resolve_value_from_path(new_variable, path[1:]) if not is_found: raise KeyError(f"Path {', '.join(path)} not found in value") else: @@ -874,9 +916,7 @@ def _fire_data_update(): def _fire_schema_update(discard_state=False): msg = SchemaUpdateParams(discard_state=discard_state) - comm.send_event( - DataExplorerFrontendEvent.SchemaUpdate.value, msg.dict() - ) + comm.send_event(DataExplorerFrontendEvent.SchemaUpdate.value, msg.dict()) if type(new_table) is not type(table_view.table): # noqa: E721 # Data type has changed. For now, we will signal the UI to @@ -921,9 +961,7 @@ def _fire_schema_update(discard_state=False): else: _fire_data_update() - def handle_msg( - self, msg: CommMessage[DataExplorerBackendMessageContent], raw_msg - ): + def handle_msg(self, msg: CommMessage[DataExplorerBackendMessageContent], raw_msg): """ Handle messages received from the client via the positron.data_explorer comm. diff --git a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py index 64d0bab6e3d..3876af048cb 100644 --- a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py +++ b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py @@ -13,11 +13,16 @@ from ..access_keys import encode_access_key from ..data_explorer import COMPARE_OPS, DataExplorerService from ..data_explorer_comm import ( + ColumnDisplayType, + ColumnSummaryStats, RowFilter, ColumnProfileResult, ColumnSchema, ColumnSortKey, FilterResult, + SummaryStatsNumber, + SummaryStatsString, + SummaryStatsBoolean, ) from .conftest import DummyComm, PositronShell from .test_variables import BIG_ARRAY_LENGTH @@ -426,7 +431,7 @@ def compare_tables(self, table_id: str, expected_id: str, table_shape: tuple): @pytest.fixture() -def de_fixture(de_service: DataExplorerService): +def dxf(de_service: DataExplorerService): return DataExplorerFixture(de_service) @@ -434,8 +439,8 @@ def _wrap_json(model: Type[BaseModel], data: JsonRecords): return [model(**d).dict() for d in data] -def test_pandas_get_state(de_fixture: DataExplorerFixture): - result = de_fixture.get_state("simple") +def test_pandas_get_state(dxf: DataExplorerFixture): + result = dxf.get_state("simple") assert result["table_shape"]["num_rows"] == 5 assert result["table_shape"]["num_columns"] == 6 @@ -444,17 +449,15 @@ def test_pandas_get_state(de_fixture: DataExplorerFixture): {"column_index": 1, "ascending": False}, ] filters = [_compare_filter(0, ">", 0), _compare_filter(0, "<", 5)] - de_fixture.set_sort_columns("simple", sort_keys=sort_keys) - de_fixture.set_row_filters("simple", filters=filters) + dxf.set_sort_columns("simple", sort_keys=sort_keys) + dxf.set_row_filters("simple", filters=filters) - result = de_fixture.get_state("simple") + result = dxf.get_state("simple") assert result["sort_keys"] == sort_keys assert result["row_filters"] == [RowFilter(**f) for f in filters] -def test_pandas_get_schema(de_fixture: DataExplorerFixture): - dxf = de_fixture - +def test_pandas_get_schema(dxf: DataExplorerFixture): result = dxf.get_schema("simple", 0, 100) full_schema = [ @@ -524,9 +527,7 @@ def test_pandas_get_schema(de_fixture: DataExplorerFixture): assert result["columns"] == _wrap_json(ColumnSchema, bigger_schema[10:20]) -def test_pandas_wide_schemas(de_fixture: DataExplorerFixture): - dxf = de_fixture - +def test_pandas_wide_schemas(dxf: DataExplorerFixture): arr = np.arange(10).astype(object) ncols = 10000 @@ -550,9 +551,7 @@ def test_pandas_wide_schemas(de_fixture: DataExplorerFixture): assert left == right -def test_pandas_search_schema(de_fixture: DataExplorerFixture): - dxf = de_fixture - +def test_pandas_search_schema(dxf: DataExplorerFixture): # Make a few thousand column names we can search for column_names = [ f"{prefix}_{i}" @@ -591,8 +590,8 @@ def _trim_whitespace(columns): return [[x.strip() for x in column] for column in columns] -def test_pandas_get_data_values(de_fixture: DataExplorerFixture): - result = de_fixture.get_data_values( +def test_pandas_get_data_values(dxf: DataExplorerFixture): + result = dxf.get_data_values( "simple", row_start_index=0, num_rows=20, @@ -621,14 +620,12 @@ def test_pandas_get_data_values(de_fixture: DataExplorerFixture): assert result["row_labels"] == [["0", "1", "2", "3", "4"]] # Edge cases: request beyond end of table - response = de_fixture.get_data_values( - "simple", row_start_index=5, num_rows=10, column_indices=[0] - ) + response = dxf.get_data_values("simple", row_start_index=5, num_rows=10, column_indices=[0]) assert response["columns"] == [[]] # Issue #2149 -- return empty result when requesting non-existent # column indices - response = de_fixture.get_data_values( + response = dxf.get_data_values( "simple", row_start_index=0, num_rows=5, column_indices=[2, 3, 4, 5] ) assert _trim_whitespace(response["columns"]) == expected_columns[2:] @@ -638,7 +635,7 @@ def test_pandas_get_data_values(de_fixture: DataExplorerFixture): # to request non-existent column indices, disable this test # with pytest.raises(IndexError): - # de_fixture.get_data_values( + # dxf.get_data_values( # "simple", row_start_index=0, num_rows=10, column_indices=[4] # ) @@ -690,8 +687,7 @@ def _set_member_filter(column_index, values, inclusive=True): ) -def test_pandas_filter_between(de_fixture: DataExplorerFixture): - dxf = de_fixture +def test_pandas_filter_between(dxf: DataExplorerFixture): df = SIMPLE_PANDAS_DF column = "a" column_index = df.columns.get_loc(column) @@ -719,7 +715,7 @@ def test_pandas_filter_between(de_fixture: DataExplorerFixture): ) -def test_pandas_filter_compare(de_fixture: DataExplorerFixture): +def test_pandas_filter_compare(dxf: DataExplorerFixture): # Just use the 'a' column to smoke test comparison filters on # integers table_name = "simple" @@ -731,23 +727,23 @@ def test_pandas_filter_compare(de_fixture: DataExplorerFixture): for op, op_func in COMPARE_OPS.items(): filt = _compare_filter(column_index, op, str(compare_value)) expected_df = df[op_func(df[column], compare_value)] - de_fixture.check_filter_case(df, [filt], expected_df) + dxf.check_filter_case(df, [filt], expected_df) # TODO(wesm): move these tests to their own test case # Test that passing empty filter set resets to unfiltered state filt = _compare_filter(column_index, "<", str(compare_value)) - _ = de_fixture.set_row_filters(table_name, filters=[filt]) - response = de_fixture.set_row_filters(table_name, filters=[]) + _ = dxf.set_row_filters(table_name, filters=[filt]) + response = dxf.set_row_filters(table_name, filters=[]) assert response == FilterResult(selected_num_rows=len(df)) # register the whole table to make sure the filters are really cleared ex_id = guid() - de_fixture.register_table(ex_id, df) - de_fixture.compare_tables(table_name, ex_id, df.shape) + dxf.register_table(ex_id, df) + dxf.compare_tables(table_name, ex_id, df.shape) -def test_pandas_filter_is_null_not_null(de_fixture: DataExplorerFixture): +def test_pandas_filter_is_null_not_null(dxf: DataExplorerFixture): df = SIMPLE_PANDAS_DF b_is_null = _filter("is_null", 1) b_not_null = _filter("not_null", 1) @@ -760,10 +756,10 @@ def test_pandas_filter_is_null_not_null(de_fixture: DataExplorerFixture): ] for filter_set, expected_df in cases: - de_fixture.check_filter_case(df, filter_set, expected_df) + dxf.check_filter_case(df, filter_set, expected_df) -def test_pandas_filter_set_membership(de_fixture: DataExplorerFixture): +def test_pandas_filter_set_membership(dxf: DataExplorerFixture): df = SIMPLE_PANDAS_DF cases = [ @@ -778,11 +774,10 @@ def test_pandas_filter_set_membership(de_fixture: DataExplorerFixture): ] for filter_set, expected_df in cases: - de_fixture.check_filter_case(df, filter_set, expected_df) + dxf.check_filter_case(df, filter_set, expected_df) -def test_pandas_filter_search(de_fixture: DataExplorerFixture): - dxf = de_fixture +def test_pandas_filter_search(dxf: DataExplorerFixture): df = pd.DataFrame( { "a": ["foo1", "foo2", None, "2FOO", "FOO3", "bar1", "2BAR"], @@ -856,7 +851,7 @@ def test_pandas_filter_search(de_fixture: DataExplorerFixture): ) -def test_pandas_set_sort_columns(de_fixture: DataExplorerFixture): +def test_pandas_set_sort_columns(dxf: DataExplorerFixture): tables = { "df1": SIMPLE_PANDAS_DF, # Just some random data to test multiple keys, different sort @@ -902,18 +897,18 @@ def test_pandas_set_sort_columns(de_fixture: DataExplorerFixture): expected_df = df.sort_values(**expected_params) - de_fixture.check_sort_case(df, wrapped_keys, expected_df) + dxf.check_sort_case(df, wrapped_keys, expected_df) for filter_f, filters in filter_cases.get(df_name, []): expected_filtered = filter_f(df).sort_values(**expected_params) - de_fixture.check_sort_case(df, wrapped_keys, expected_filtered, filters=filters) + dxf.check_sort_case(df, wrapped_keys, expected_filtered, filters=filters) def test_pandas_change_schema_after_sort( shell: PositronShell, de_service: DataExplorerService, variables_comm: DummyComm, - de_fixture: DataExplorerFixture, + dxf: DataExplorerFixture, ): df = pd.DataFrame( { @@ -929,26 +924,32 @@ def test_pandas_change_schema_after_sort( # Sort a column that is out of bounds for the table after the # schema change below - de_fixture.set_sort_columns("df", [{"column_index": 4, "ascending": True}]) + dxf.set_sort_columns("df", [{"column_index": 4, "ascending": True}]) expected_df = df[["a", "b"]] - de_fixture.register_table("expected_df", df) + dxf.register_table("expected_df", df) # Sort last column, and we will then change the schema shell.run_cell("df = df[['a', 'b']]") _check_update_variable(de_service, "df", update_type="schema", discard_state=True) # Call get_data_values and make sure it works - de_fixture.compare_tables("df", "expected_df", expected_df.shape) + dxf.compare_tables("df", "expected_df", expected_df.shape) def _profile_request(column_index, profile_type): return {"column_index": column_index, "type": profile_type} -def test_pandas_profile_null_counts(de_fixture: DataExplorerFixture): - dxf = de_fixture +def _get_null_count(column_index): + return _profile_request(column_index, "null_count") + + +def _get_summary_stats(column_index): + return _profile_request(column_index, "summary_stats") + +def test_pandas_profile_null_counts(dxf: DataExplorerFixture): df1 = pd.DataFrame( { "a": [0, np.nan, 2, np.nan, 4, 5, 6], @@ -962,26 +963,28 @@ def test_pandas_profile_null_counts(de_fixture: DataExplorerFixture): for name, df in tables.items(): dxf.register_table(name, df) - def _null_count(column_index): - return _profile_request(column_index, "null_count") - # tuples like (table_name, [ColumnProfileRequest], [results]) all_profiles = [ - _null_count(0), - _null_count(1), - _null_count(2), - _null_count(3), + _get_null_count(0), + _get_null_count(1), + _get_null_count(2), + _get_null_count(3), ] cases = [ ("df1", [], []), ( "df1", - [_null_count(3)], + [_get_null_count(3)], [0], ), ( "df1", - [_null_count(0), _null_count(1), _null_count(2), _null_count(3)], + [ + _get_null_count(0), + _get_null_count(1), + _get_null_count(2), + _get_null_count(3), + ], [2, 3, 4, 0], ), ] @@ -1010,6 +1013,105 @@ def _null_count(column_index): assert results == ex_results +EPSILON = 1e-7 + + +def _assert_close(expected, actual): + assert np.abs(actual - expected) < EPSILON + + +def _assert_numeric_stats_equal(expected, actual): + for attr, value in expected.items(): + _assert_close(value, float(actual.get(attr))) + + +def _assert_string_stats_equal(expected, actual): + assert expected["num_empty"] == actual["num_empty"] + assert expected["num_unique"] == actual["num_unique"] + + +def _assert_boolean_stats_equal(expected, actual): + assert expected["true_count"] == actual["true_count"] + assert expected["false_count"] == actual["false_count"] + + +def test_pandas_profile_summary_stats(dxf: DataExplorerFixture): + arr = np.random.standard_normal(100) + arr_with_nulls = arr.copy() + arr_with_nulls[::10] = np.nan + + df1 = pd.DataFrame( + { + "a": arr, + "b": arr_with_nulls, + "c": [False, False, False, True, None] * 20, + "d": [ + "foo", + "", + "baz", + "qux", + "foo", + None, + "bar", + "", + "bar", + "zzz", + ] + * 10, + } + ) + dxf.register_table("df1", df1) + + cases = [ + ( + "df1", + 0, + { + "min_value": arr.min(), + "max_value": arr.max(), + "mean": df1["a"].mean(), + "stdev": df1["a"].std(), + "median": df1["a"].median(), + }, + ), + ( + "df1", + 1, + { + "min_value": df1["b"].min(), + "max_value": df1["b"].max(), + "mean": df1["b"].mean(), + "stdev": df1["b"].std(), + "median": df1["b"].median(), + }, + ), + ( + "df1", + 2, + {"true_count": 20, "false_count": 60}, + ), + ( + "df1", + 3, + {"num_empty": 20, "num_unique": 6}, + ), + ] + + for table_name, col_index, ex_result in cases: + profiles = [_get_summary_stats(col_index)] + results = dxf.get_column_profiles(table_name, profiles) + + stats = results[0]["summary_stats"] + ui_type = stats["type_display"] + + if ui_type == ColumnDisplayType.Number: + _assert_numeric_stats_equal(ex_result, stats["number_stats"]) + elif ui_type == ColumnDisplayType.String: + _assert_string_stats_equal(ex_result, stats["string_stats"]) + elif ui_type == ColumnDisplayType.Boolean: + _assert_boolean_stats_equal(ex_result, stats["boolean_stats"]) + + # ---------------------------------------------------------------------- # Test RPCs for polars DataFrame diff --git a/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx b/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx index 5b391a50233..1c49daf3e55 100644 --- a/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx +++ b/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx @@ -75,10 +75,15 @@ export const ColumnSummaryCell = (props: ColumnSummaryCellProps) => { * @returns The profile component. */ const profile = () => { + // Hack just to get things working + props.instance.computeColumnSummaryStats(props.columnIndex); // Determine the alignment based on type. switch (props.columnSchema.type_display) { case ColumnDisplayType.Number: - return ; + return ; case ColumnDisplayType.Boolean: return null; diff --git a/src/vs/workbench/services/positronDataExplorer/browser/components/profileNumber.tsx b/src/vs/workbench/services/positronDataExplorer/browser/components/profileNumber.tsx index 679e9a7cdbe..c726fd20368 100644 --- a/src/vs/workbench/services/positronDataExplorer/browser/components/profileNumber.tsx +++ b/src/vs/workbench/services/positronDataExplorer/browser/components/profileNumber.tsx @@ -8,10 +8,14 @@ import 'vs/css!./profileNumber'; // React. import * as React from 'react'; +import { TableSummaryDataGridInstance } from 'vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance'; + /** * ProfileNumberProps interface. */ interface ProfileNumberProps { + instance: TableSummaryDataGridInstance; + columnIndex: number; } /** @@ -20,33 +24,38 @@ interface ProfileNumberProps { * @returns The rendered component. */ export const ProfileNumber = (props: ProfileNumberProps) => { + // Hack + let stats: any = props.instance.getColumnSummaryStats(props.columnIndex)?.number_stats!; + if (!stats) { + stats = {}; + } return (
NA
-
Median
Mean
+
Median
SD
Min
Max
-
12
-
1
-
4
-
2
-
5
-
102
+
-999999
+
{stats.mean}
+
{stats.median}
+
{stats.stdev}
+
{stats.min_value}
+
{stats.max_value}
-
+ {/*
 
.51
.20
.24
 
.44
-
+
*/}
); diff --git a/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx b/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx index 7021dc051b5..8e7d379c53a 100644 --- a/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx +++ b/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx @@ -9,7 +9,7 @@ import * as React from 'react'; import { Emitter } from 'vs/base/common/event'; import { DataGridInstance } from 'vs/workbench/browser/positronDataGrid/classes/dataGridInstance'; import { DataExplorerCache } from 'vs/workbench/services/positronDataExplorer/common/dataExplorerCache'; -import { ColumnDisplayType } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; +import { ColumnDisplayType, ColumnSummaryStats } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; import { ColumnSummaryCell } from 'vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell'; import { DataExplorerClientInstance } from 'vs/workbench/services/languageRuntime/common/languageRuntimeDataExplorerClient'; @@ -230,6 +230,14 @@ export class TableSummaryDataGridInstance extends DataGridInstance { nullCount * 100 / this._dataExplorerCache.rows); } + getColumnSummaryStats(columnIndex: number): ColumnSummaryStats | undefined { + return this._dataExplorerCache.getColumnSummaryStats(columnIndex); + } + + computeColumnSummaryStats(columnIndex: number) { + this._dataExplorerCache.updateColumnSummaryStats([columnIndex]); + } + //#endregion DataGridInstance Methods //#region Public Events diff --git a/src/vs/workbench/services/positronDataExplorer/common/dataExplorerCache.ts b/src/vs/workbench/services/positronDataExplorer/common/dataExplorerCache.ts index f3c7961ab8d..1696850f065 100644 --- a/src/vs/workbench/services/positronDataExplorer/common/dataExplorerCache.ts +++ b/src/vs/workbench/services/positronDataExplorer/common/dataExplorerCache.ts @@ -4,7 +4,7 @@ import { Emitter } from 'vs/base/common/event'; import { Disposable } from 'vs/base/common/lifecycle'; -import { ColumnProfileRequestType, ColumnSchema, TableData } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; +import { ColumnProfileRequestType, ColumnSchema, ColumnSummaryStats, TableData } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; import { DataExplorerClientInstance } from 'vs/workbench/services/languageRuntime/common/languageRuntimeDataExplorerClient'; /** @@ -72,6 +72,11 @@ export class DataExplorerCache extends Disposable { */ private readonly _columnNullCountCache = new Map(); + /** + * Gets the column summary stats cache. + */ + private readonly _columnSummaryStatsCache = new Map(); + /** * Gets the row label cache. */ @@ -107,6 +112,7 @@ export class DataExplorerCache extends Disposable { // Clear the column schema cache, row label cache, and data cell cache. this._columnSchemaCache.clear(); this._columnNullCountCache.clear(); + this._columnSummaryStatsCache.clear(); this._rowLabelCache.clear(); this._dataCellCache.clear(); })); @@ -117,6 +123,7 @@ export class DataExplorerCache extends Disposable { this._rowLabelCache.clear(); this._dataCellCache.clear(); this._columnNullCountCache.clear(); + this._columnSummaryStatsCache.clear(); })); } @@ -158,6 +165,7 @@ export class DataExplorerCache extends Disposable { this._rowLabelCache.clear(); this._dataCellCache.clear(); this._columnNullCountCache.clear(); + this._columnSummaryStatsCache.clear(); } /** @@ -190,6 +198,35 @@ export class DataExplorerCache extends Disposable { return this._columnNullCountCache.get(columnIndex); } + /** + * Gets the cached summary stats for the specified column index. + * @param columnIndex The column index. + * @returns ColumnSummaryStats in the specified column index. + */ + getColumnSummaryStats(columnIndex: number) { + return this._columnSummaryStatsCache.get(columnIndex); + } + + async updateColumnSummaryStats(columnIndices: Array) { + // Request the profiles + const results = await this._dataExplorerClientInstance.getColumnProfiles( + columnIndices.map(column_index => { + return { + column_index, + type: ColumnProfileRequestType.SummaryStats + }; + }) + ); + + // Update the column schema cache, overwriting any entries we already have cached. + for (let i = 0; i < results.length; i++) { + const stats = results[i].summary_stats; + if (stats !== undefined) { + this._columnSummaryStatsCache.set(columnIndices[i], stats); + } + } + } + /** * Gets the row label for the specified row index. * @param rowIndex The row index.