From 73f12b82b8439bcf2117386e095a6bb7d2532f7c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 2 Apr 2024 13:44:06 -0500 Subject: [PATCH 1/5] Protocol improvements to support different types of summary stats --- .../positron_ipykernel/data_explorer.py | 56 ++++++-- .../positron_ipykernel/data_explorer_comm.py | 82 +++++++++-- .../comms/data_explorer-backend-openrpc.json | 101 +++++++++++--- positron/comms/generate-comms.ts | 132 ++++++++---------- .../addEditRowFilterModalPopup.tsx | 42 +++--- .../utility/columnSchemaUtilities.ts | 20 +-- .../common/positronDataExplorerComm.ts | 76 ++++++++-- .../browser/components/columnSummaryCell.tsx | 38 ++--- .../browser/positronDataExplorerColumn.ts | 20 +-- .../browser/tableSummaryDataGridInstance.tsx | 20 +-- 10 files changed, 383 insertions(+), 204 deletions(-) diff --git a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py index 7fe12776473..2e05290eed9 100644 --- a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py +++ b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py @@ -31,7 +31,7 @@ ColumnProfileRequestType, ColumnProfileResult, ColumnSchema, - ColumnSchemaTypeDisplay, + ColumnDisplayType, ColumnSortKey, DataExplorerBackendMessageContent, DataExplorerFrontendEvent, @@ -104,7 +104,9 @@ def _recompute_if_needed(self) -> bool: return False def get_schema(self, request: GetSchemaRequest): - return self._get_schema(request.params.start_index, request.params.num_columns).dict() + return self._get_schema( + request.params.start_index, request.params.num_columns + ).dict() def search_schema(self, request: SearchSchemaRequest): return self._search_schema( @@ -292,7 +294,9 @@ def __init__( # search term changes, we discard the last search result. We # might add an LRU cache here or something if it helps # performance. 
- self._search_schema_last_result: Optional[Tuple[str, List[ColumnSchema]]] = None + self._search_schema_last_result: Optional[ + Tuple[str, List[ColumnSchema]] + ] = None def invalidate_computations(self): self.filtered_indices = self.view_indices = None @@ -340,7 +344,9 @@ def _get_schema(self, column_start: int, num_columns: int) -> TableSchema: column_raw_name = self.table.columns[column_index] column_name = str(column_raw_name) - col_schema = self._get_single_column_schema(column_index, column_name) + col_schema = self._get_single_column_schema( + column_index, column_name + ) column_schemas.append(col_schema) return TableSchema(columns=column_schemas) @@ -366,7 +372,9 @@ def _search_schema( total_num_matches=len(matches), ) - def _search_schema_get_matches(self, search_term: str) -> List[ColumnSchema]: + def _search_schema_get_matches( + self, search_term: str + ) -> List[ColumnSchema]: matches = [] for column_index in range(len(self.table.columns)): column_raw_name = self.table.columns[column_index] @@ -376,7 +384,9 @@ def _search_schema_get_matches(self, search_term: str) -> List[ColumnSchema]: if search_term not in column_name.lower(): continue - col_schema = self._get_single_column_schema(column_index, column_name) + col_schema = self._get_single_column_schema( + column_index, column_name + ) matches.append(col_schema) return matches @@ -385,7 +395,9 @@ def _get_inferred_dtype(self, column_index: int): from pandas.api.types import infer_dtype if column_index not in self._inferred_dtypes: - self._inferred_dtypes[column_index] = infer_dtype(self.table.iloc[:, column_index]) + self._inferred_dtypes[column_index] = infer_dtype( + self.table.iloc[:, column_index] + ) return self._inferred_dtypes[column_index] def _get_single_column_schema(self, column_index: int, column_name: str): @@ -405,7 +417,7 @@ def _get_single_column_schema(self, column_index: int, column_name: str): column_name=column_name, column_index=column_index, type_name=type_name, - type_display=ColumnSchemaTypeDisplay(type_display), + type_display=ColumnDisplayType(type_display), ) def _get_data_values( @@ -438,7 +450,9 @@ def _get_data_values( else: # No filtering or sorting, just slice directly indices = self.table.index[row_start : row_start + num_rows] - columns = [col.iloc[row_start : row_start + num_rows] for col in columns] + columns = [ + col.iloc[row_start : row_start + num_rows] for col in columns + ] formatted_columns = [_pandas_format_values(col) for col in columns] @@ -571,7 +585,9 @@ def _sort_data(self) -> None: self.view_indices = self.filtered_indices.take(sort_indexer) else: # Data is not filtered - self.view_indices = nargsort(column, kind="mergesort", ascending=key.ascending) + self.view_indices = nargsort( + column, kind="mergesort", ascending=key.ascending + ) elif len(self.sort_keys) > 1: # Multiple sorting keys cols_to_sort = [] @@ -612,7 +628,9 @@ def _prof_histogram(self, column_index: int): def _get_state(self) -> TableState: return TableState( - table_shape=TableShape(num_rows=self.table.shape[0], num_columns=self.table.shape[1]), + table_shape=TableShape( + num_rows=self.table.shape[0], num_columns=self.table.shape[1] + ), row_filters=self.filters, sort_keys=self.sort_keys, ) @@ -799,7 +817,9 @@ def handle_variable_updated(self, variable_name, new_variable): for comm_id in list(self.path_to_comm_ids[path]): self._update_explorer_for_comm(comm_id, path, new_variable) - def _update_explorer_for_comm(self, comm_id: str, path: PathKey, new_variable): + def _update_explorer_for_comm( + self, 
comm_id: str, path: PathKey, new_variable + ): """ If a variable is updated, we have to handle the different scenarios: @@ -833,7 +853,9 @@ def _update_explorer_for_comm(self, comm_id: str, path: PathKey, new_variable): # data explorer open for a nested value, then we need to use # the same variables inspection logic to resolve it here. if len(path) > 1: - is_found, new_table = _resolve_value_from_path(new_variable, path[1:]) + is_found, new_table = _resolve_value_from_path( + new_variable, path[1:] + ) if not is_found: raise KeyError(f"Path {', '.join(path)} not found in value") else: @@ -852,7 +874,9 @@ def _fire_data_update(): def _fire_schema_update(discard_state=False): msg = SchemaUpdateParams(discard_state=discard_state) - comm.send_event(DataExplorerFrontendEvent.SchemaUpdate.value, msg.dict()) + comm.send_event( + DataExplorerFrontendEvent.SchemaUpdate.value, msg.dict() + ) if type(new_table) is not type(table_view.table): # noqa: E721 # Data type has changed. For now, we will signal the UI to @@ -897,7 +921,9 @@ def _fire_schema_update(discard_state=False): else: _fire_data_update() - def handle_msg(self, msg: CommMessage[DataExplorerBackendMessageContent], raw_msg): + def handle_msg( + self, msg: CommMessage[DataExplorerBackendMessageContent], raw_msg + ): """ Handle messages received from the client via the positron.data_explorer comm. diff --git a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer_comm.py b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer_comm.py index 5b832565a3f..de002df2025 100644 --- a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer_comm.py +++ b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer_comm.py @@ -18,9 +18,9 @@ @enum.unique -class ColumnSchemaTypeDisplay(str, enum.Enum): +class ColumnDisplayType(str, enum.Enum): """ - Possible values for TypeDisplay in ColumnSchema + Possible values for ColumnDisplayType """ Number = "number" @@ -202,7 +202,7 @@ class ColumnSchema(BaseModel): description="Exact name of data type used by underlying table", ) - type_display: ColumnSchemaTypeDisplay = Field( + type_display: ColumnDisplayType = Field( description="Canonical Positron display name of data type", ) @@ -387,7 +387,33 @@ class ColumnProfileResult(BaseModel): class ColumnSummaryStats(BaseModel): """ - ColumnSummaryStats in Schemas + Profile result containing summary stats for a column based on the data + type + """ + + type_display: ColumnDisplayType = Field( + description="Canonical Positron display name of data type", + ) + + number_stats: Optional[SummaryStatsNumber] = Field( + default=None, + description="Statistics for a numeric data type", + ) + + string_stats: Optional[SummaryStatsString] = Field( + default=None, + description="Statistics for a string-like data type", + ) + + boolean_stats: Optional[SummaryStatsBoolean] = Field( + default=None, + description="Statistics for a boolean data type", + ) + + +class SummaryStatsNumber(BaseModel): + """ + SummaryStatsNumber in Schemas """ min_value: str = Field( @@ -398,24 +424,44 @@ class ColumnSummaryStats(BaseModel): description="Maximum value as string", ) - mean_value: Optional[str] = Field( - default=None, + mean: str = Field( description="Average value as string", ) - median: Optional[str] = Field( - default=None, + median: str = Field( description="Sample median (50% value) value as string", ) - q25: Optional[str] = Field( - default=None, - description="25th percentile value 
as string", + stdev: str = Field( + description="Sample standard deviation as a string", ) - q75: Optional[str] = Field( - default=None, - description="75th percentile value as string", + +class SummaryStatsBoolean(BaseModel): + """ + SummaryStatsBoolean in Schemas + """ + + true_count: int = Field( + description="The number of non-null true values", + ) + + false_count: int = Field( + description="The number of non-null false values", + ) + + +class SummaryStatsString(BaseModel): + """ + SummaryStatsString in Schemas + """ + + num_empty: int = Field( + description="The number of empty / length-zero values", + ) + + num_unique: int = Field( + description="The exact number of distinct values", ) @@ -560,7 +606,7 @@ class SearchSchemaParams(BaseModel): """ search_term: str = Field( - description="Substring to match for (currently case insensitive", + description="Substring to match for (currently case insensitive)", ) start_index: int = Field( @@ -798,6 +844,12 @@ class SchemaUpdateParams(BaseModel): ColumnSummaryStats.update_forward_refs() +SummaryStatsNumber.update_forward_refs() + +SummaryStatsBoolean.update_forward_refs() + +SummaryStatsString.update_forward_refs() + ColumnHistogram.update_forward_refs() ColumnFrequencyTable.update_forward_refs() diff --git a/positron/comms/data_explorer-backend-openrpc.json b/positron/comms/data_explorer-backend-openrpc.json index 09eb728a433..91d033fb13b 100644 --- a/positron/comms/data_explorer-backend-openrpc.json +++ b/positron/comms/data_explorer-backend-openrpc.json @@ -289,6 +289,22 @@ "components": { "contentDescriptors": {}, "schemas": { + "column_display_type": { + "name": "column_display_type", + "type": "string", + "description": "Canonical Positron display name of data type", + "enum": [ + "number", + "boolean", + "string", + "date", + "datetime", + "time", + "array", + "struct", + "unknown" + ] + }, "column_schema": { "type": "object", "description": "Schema for a column in a table", @@ -312,19 +328,8 @@ "description": "Exact name of data type used by underlying table" }, "type_display": { - "type": "string", "description": "Canonical Positron display name of data type", - "enum": [ - "number", - "boolean", - "string", - "date", - "datetime", - "time", - "array", - "struct", - "unknown" - ] + "$ref": "#/components/schemas/column_display_type" }, "description": { "type": "string", @@ -560,10 +565,38 @@ } }, "column_summary_stats": { + "type": "object", + "description": "Profile result containing summary stats for a column based on the data type", + "required": [ + "type_display" + ], + "properties": { + "type_display": { + "description": "Canonical Positron display name of data type", + "$ref": "#/components/schemas/column_display_type" + }, + "number_stats": { + "description": "Statistics for a numeric data type", + "$ref": "#/components/schemas/summary_stats_number" + }, + "string_stats": { + "description": "Statistics for a string-like data type", + "$ref": "#/components/schemas/summary_stats_string" + }, + "boolean_stats": { + "description": "Statistics for a boolean data type", + "$ref": "#/components/schemas/summary_stats_boolean" + } + } + }, + "summary_stats_number": { "type": "object", "required": [ "min_value", - "max_value" + "max_value", + "mean", + "median", + "stdev" ], "properties": { "min_value": { @@ -574,7 +607,7 @@ "type": "string", "description": "Maximum value as string" }, - "mean_value": { + "mean": { "type": "string", "description": "Average value as string" }, @@ -582,13 +615,43 @@ "type": "string", "description": 
"Sample median (50% value) value as string" }, - "q25": { + "stdev": { "type": "string", - "description": "25th percentile value as string" + "description": "Sample standard deviation as a string" + } + } + }, + "summary_stats_boolean": { + "type": "object", + "required": [ + "true_count", + "false_count" + ], + "properties": { + "true_count": { + "type": "integer", + "description": "The number of non-null true values" }, - "q75": { - "type": "string", - "description": "75th percentile value as string" + "false_count": { + "type": "integer", + "description": "The number of non-null false values" + } + } + }, + "summary_stats_string": { + "type": "object", + "required": [ + "num_empty", + "num_unique" + ], + "properties": { + "num_empty": { + "type": "integer", + "description": "The number of empty / length-zero values" + }, + "num_unique": { + "type": "integer", + "description": "The exact number of distinct values" } } }, diff --git a/positron/comms/generate-comms.ts b/positron/comms/generate-comms.ts index f06dfe97910..c98530a95d0 100644 --- a/positron/comms/generate-comms.ts +++ b/positron/comms/generate-comms.ts @@ -212,10 +212,12 @@ function deriveType(contracts: Array, } } else if (schema.type === 'string' && schema.enum) { if (context.length < 2) { - throw new Error(`missing context (need at least 2 elements): ${context[0]}`); + return snakeCaseToSentenceCase(context[0]); + } else { + // An enum field within another type, we add the context prefix + return snakeCaseToSentenceCase(context[1]) + + snakeCaseToSentenceCase(context[0]); } - return snakeCaseToSentenceCase(context[1]) + - snakeCaseToSentenceCase(context[0]); } else { if (Object.keys(typeMap).includes(schema.type)) { return typeMap[schema.type]; @@ -301,7 +303,7 @@ function* enumVisitor( // and recurse yield* enumVisitor( [contract['name'], ...context], contract[key], callback); - } else if (key === 'properties' || key === 'params') { + } else if (key === 'properties' || key === 'params' || key === 'schemas' || key === 'components') { // If this is a properties or params object, recurse into each // property, but don't push the parent name onto the context yield* enumVisitor( @@ -397,21 +399,6 @@ use serde::Serialize; continue; } - // Create type aliases for all the shared types - if (source.components && source.components.schemas) { - for (const key of Object.keys(source.components.schemas)) { - const schema = source.components.schemas[key]; - if (schema.type !== 'object') { - yield formatComment('/// ', schema.description); - yield `type ${snakeCaseToSentenceCase(key)} = `; - yield deriveType(contracts, RustTypeMap, - [schema.name ? 
schema.name : key], - schema); - yield ';\n\n'; - } - } - } - // Create structs for all object types yield* objectVisitor([], source, function* (context: Array, o: Record) { if (o.description) { @@ -463,12 +450,22 @@ use serde::Serialize; // Create enums for all enum types yield* enumVisitor([], source, function* (context: Array, values: Array) { - yield formatComment(`/// `, - `Possible values for ` + - snakeCaseToSentenceCase(context[0]) + ` in ` + - snakeCaseToSentenceCase(context[1])); - yield '#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]\n'; - yield `pub enum ${snakeCaseToSentenceCase(context[1])}${snakeCaseToSentenceCase(context[0])} {\n`; + if (context.length === 1) { + // Shared enum at the components.schemas level + yield formatComment(`/// `, + `Possible values for ` + + snakeCaseToSentenceCase(context[0])); + yield '#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]\n'; + yield `pub enum ${snakeCaseToSentenceCase(context[0])} {\n`; + } else { + // Enum field within another interface + yield formatComment(`/// `, + `Possible values for ` + + snakeCaseToSentenceCase(context[0]) + ` in ` + + snakeCaseToSentenceCase(context[1])); + yield '#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]\n'; + yield `pub enum ${snakeCaseToSentenceCase(context[1])}${snakeCaseToSentenceCase(context[0])} {\n`; + } for (let i = 0; i < values.length; i++) { const value = values[i]; yield `\t#[serde(rename = "${value}")]\n`; @@ -712,32 +709,28 @@ from ._vendor.pydantic import BaseModel, Field continue; } - // Create type aliases for all the shared types - if (source.components && source.components.schemas) { - for (const key of Object.keys(source.components.schemas)) { - const schema = source.components.schemas[key]; - if (schema.type !== 'object') { - yield formatComment('# ', schema.description); - yield `${snakeCaseToSentenceCase(key)} = `; - yield deriveType(contracts, PythonTypeMap, - [schema.name ? 
schema.name : key], - schema); - yield '\n\n'; - } - } - } - // Create enums for all enum types yield* enumVisitor([], source, function* (context: Array, values: Array) { - yield '@enum.unique\n'; - yield `class ${snakeCaseToSentenceCase(context[1])}`; - yield `${snakeCaseToSentenceCase(context[0])}(str, enum.Enum):\n`; - yield ' """\n'; - yield formatComment(` `, - `Possible values for ` + - snakeCaseToSentenceCase(context[0]) + - ` in ` + - snakeCaseToSentenceCase(context[1])); + if (context.length === 1) { + // Shared enum at the components.schemas level + yield '@enum.unique\n'; + yield `class ${snakeCaseToSentenceCase(context[0])}(str, enum.Enum):\n`; + yield ' """\n'; + yield formatComment(` `, + `Possible values for ` + + snakeCaseToSentenceCase(context[0])); + } else { + // Enum field within another interface + yield '@enum.unique\n'; + yield `class ${snakeCaseToSentenceCase(context[1])}`; + yield `${snakeCaseToSentenceCase(context[0])}(str, enum.Enum):\n`; + yield ' """\n'; + yield formatComment(` `, + `Possible values for ` + + snakeCaseToSentenceCase(context[0]) + + ` in ` + + snakeCaseToSentenceCase(context[1])); + } yield ' """\n'; yield '\n'; for (let i = 0; i < values.length; i++) { @@ -1051,12 +1044,22 @@ import { IRuntimeClientInstance } from 'vs/workbench/services/languageRuntime/co // Create enums for all enum types yield* enumVisitor([], source, function* (context: Array, values: Array) { yield '/**\n'; - yield formatComment(` * `, - `Possible values for ` + - snakeCaseToSentenceCase(context[0]) + ` in ` + - snakeCaseToSentenceCase(context[1])); - yield ' */\n'; - yield `export enum ${snakeCaseToSentenceCase(context[1])}${snakeCaseToSentenceCase(context[0])} {\n`; + if (context.length === 1) { + // Shared enum at the components.schemas level + yield formatComment(` * `, + `Possible values for ` + + snakeCaseToSentenceCase(context[0])); + yield ' */\n'; + yield `export enum ${snakeCaseToSentenceCase(context[0])} {\n`; + } else { + // Enum field within another interface + yield formatComment(` * `, + `Possible values for ` + + snakeCaseToSentenceCase(context[0]) + ` in ` + + snakeCaseToSentenceCase(context[1])); + yield ' */\n'; + yield `export enum ${snakeCaseToSentenceCase(context[1])}${snakeCaseToSentenceCase(context[0])} {\n`; + } for (let i = 0; i < values.length; i++) { const value = values[i]; yield `\t${snakeCaseToSentenceCase(value)} = '${value}'`; @@ -1070,25 +1073,6 @@ import { IRuntimeClientInstance } from 'vs/workbench/services/languageRuntime/co }); } - for (const source of [backend, frontend]) { - if (!source) { - continue; - } - if (source.components && source.components.schemas) { - for (const key of Object.keys(source.components.schemas)) { - const schema = source.components.schemas[key]; - if (schema.type !== 'object') { - yield `/**\n`; - yield formatComment(' * ', schema.description); - yield ' */\n'; - yield `export type ${snakeCaseToSentenceCase(key)} = `; - yield deriveType(contracts, TypescriptTypeMap, [key], schema); - yield ';\n\n'; - } - } - } - } - if (frontend) { const events: string[] = []; const requests: string[] = []; diff --git a/src/vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/components/addEditRowFilterModalPopup/addEditRowFilterModalPopup.tsx b/src/vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/components/addEditRowFilterModalPopup/addEditRowFilterModalPopup.tsx index 7e5113fcd1c..482d66156a6 100644 --- 
a/src/vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/components/addEditRowFilterModalPopup/addEditRowFilterModalPopup.tsx +++ b/src/vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/components/addEditRowFilterModalPopup/addEditRowFilterModalPopup.tsx @@ -18,7 +18,7 @@ import { PositronModalPopup } from 'vs/workbench/browser/positronComponents/posi import { PositronModalReactRenderer } from 'vs/workbench/browser/positronModalReactRenderer/positronModalReactRenderer'; import { DropDownListBoxSeparator } from 'vs/workbench/browser/positronComponents/dropDownListBox/dropDownListBoxSeparator'; import { DataExplorerClientInstance } from 'vs/workbench/services/languageRuntime/common/languageRuntimeDataExplorerClient'; -import { ColumnSchema, ColumnSchemaTypeDisplay } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; +import { ColumnSchema, ColumnDisplayType } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; import { RowFilterParameter } from 'vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/components/addEditRowFilterModalPopup/components/rowFilterParameter'; import { DropDownColumnSelector } from 'vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/components/addEditRowFilterModalPopup/components/dropDownColumnSelector'; import { RangeRowFilter, RowFilter, RowFilterCondition, RowFilterIsBetween, RowFilterIsEmpty, RowFilterIsEqualTo, RowFilterIsGreaterThan, RowFilterIsLessThan, RowFilterIsNotBetween, RowFilterIsNotEmpty, SingleValueRowFilter } from 'vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/components/addEditRowFilterModalPopup/rowFilter'; @@ -51,21 +51,21 @@ const validateRowFilterValue = (columnSchema: ColumnSchema, value: string) => { // Validate the row filter value that was supplied based on the column schema type. switch (columnSchema.type_display) { // Number. - case ColumnSchemaTypeDisplay.Number: + case ColumnDisplayType.Number: return isNumber(); // Boolean. - case ColumnSchemaTypeDisplay.Boolean: + case ColumnDisplayType.Boolean: return isBoolean(); // String. - case ColumnSchemaTypeDisplay.String: + case ColumnDisplayType.String: return true; // TODO: Add more complete validation. - case ColumnSchemaTypeDisplay.Date: - case ColumnSchemaTypeDisplay.Datetime: - case ColumnSchemaTypeDisplay.Time: + case ColumnDisplayType.Date: + case ColumnDisplayType.Datetime: + case ColumnDisplayType.Time: return isDate(); // Can't get here. @@ -160,10 +160,10 @@ export const AddEditRowFilterModalPopup = (props: AddEditRowFilterModalPopupProp // Add is less than / is greater than conditions. switch (selectedColumnSchema.type_display) { - case ColumnSchemaTypeDisplay.Number: - case ColumnSchemaTypeDisplay.Date: - case ColumnSchemaTypeDisplay.Datetime: - case ColumnSchemaTypeDisplay.Time: + case ColumnDisplayType.Number: + case ColumnDisplayType.Date: + case ColumnDisplayType.Datetime: + case ColumnDisplayType.Time: conditionEntries.push(new DropDownListBoxItem({ identifier: RowFilterCondition.CONDITION_IS_LESS_THAN, title: localize( @@ -183,12 +183,12 @@ export const AddEditRowFilterModalPopup = (props: AddEditRowFilterModalPopupProp // Add is equal to condition. 
switch (selectedColumnSchema.type_display) { - case ColumnSchemaTypeDisplay.Number: - case ColumnSchemaTypeDisplay.Boolean: - case ColumnSchemaTypeDisplay.String: - case ColumnSchemaTypeDisplay.Date: - case ColumnSchemaTypeDisplay.Datetime: - case ColumnSchemaTypeDisplay.Time: + case ColumnDisplayType.Number: + case ColumnDisplayType.Boolean: + case ColumnDisplayType.String: + case ColumnDisplayType.Date: + case ColumnDisplayType.Datetime: + case ColumnDisplayType.Time: conditionEntries.push(new DropDownListBoxItem({ identifier: RowFilterCondition.CONDITION_IS_EQUAL_TO, title: localize( @@ -201,10 +201,10 @@ export const AddEditRowFilterModalPopup = (props: AddEditRowFilterModalPopupProp // Add is between / is not between conditions. switch (selectedColumnSchema.type_display) { - case ColumnSchemaTypeDisplay.Number: - case ColumnSchemaTypeDisplay.Date: - case ColumnSchemaTypeDisplay.Datetime: - case ColumnSchemaTypeDisplay.Time: + case ColumnDisplayType.Number: + case ColumnDisplayType.Date: + case ColumnDisplayType.Datetime: + case ColumnDisplayType.Time: conditionEntries.push(new DropDownListBoxSeparator()); conditionEntries.push(new DropDownListBoxItem({ identifier: RowFilterCondition.CONDITION_IS_BETWEEN, diff --git a/src/vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/utility/columnSchemaUtilities.ts b/src/vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/utility/columnSchemaUtilities.ts index 2446d3dce22..5fe5c9f9b5c 100644 --- a/src/vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/utility/columnSchemaUtilities.ts +++ b/src/vs/workbench/browser/positronDataExplorer/components/dataExplorerPanel/utility/columnSchemaUtilities.ts @@ -2,7 +2,7 @@ * Copyright (C) 2024 Posit Software, PBC. All rights reserved. *--------------------------------------------------------------------------------------------*/ -import { ColumnSchema, ColumnSchemaTypeDisplay } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; +import { ColumnSchema, ColumnDisplayType } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; /** * Returns the data type icon for the column schema. @@ -16,31 +16,31 @@ export const columnSchemaDataTypeIcon = (columnSchema?: ColumnSchema) => { // Determine the alignment based on type. switch (columnSchema.type_display) { - case ColumnSchemaTypeDisplay.Number: + case ColumnDisplayType.Number: return 'codicon-positron-data-type-number'; - case ColumnSchemaTypeDisplay.Boolean: + case ColumnDisplayType.Boolean: return 'codicon-positron-data-type-boolean'; - case ColumnSchemaTypeDisplay.String: + case ColumnDisplayType.String: return 'codicon-positron-data-type-string'; - case ColumnSchemaTypeDisplay.Date: + case ColumnDisplayType.Date: return 'codicon-positron-data-type-date'; - case ColumnSchemaTypeDisplay.Datetime: + case ColumnDisplayType.Datetime: return 'codicon-positron-data-type-date-time'; - case ColumnSchemaTypeDisplay.Time: + case ColumnDisplayType.Time: return 'codicon-positron-data-type-time'; - case ColumnSchemaTypeDisplay.Array: + case ColumnDisplayType.Array: return 'codicon-positron-data-type-array'; - case ColumnSchemaTypeDisplay.Struct: + case ColumnDisplayType.Struct: return 'codicon-positron-data-type-struct'; - case ColumnSchemaTypeDisplay.Unknown: + case ColumnDisplayType.Unknown: return 'codicon-positron-data-type-unknown'; // This shouldn't ever happen. 
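Editorial aside, not part of the patch: the reworked summary-stats protocol introduced above is a tagged union keyed by the shared ColumnDisplayType enum, with exactly one of number_stats, string_stats, or boolean_stats populated per result; the next file shows the TypeScript mirror of these types. A minimal Python sketch of how a backend might build such a payload with the pydantic models from data_explorer_comm.py follows; the import path and the concrete values are illustrative assumptions.

from positron_ipykernel.data_explorer_comm import (
    ColumnDisplayType,
    ColumnSummaryStats,
    SummaryStatsNumber,
)

# Only the stats block matching type_display is set; the others default to None.
stats = ColumnSummaryStats(
    type_display=ColumnDisplayType.Number,
    number_stats=SummaryStatsNumber(
        min_value="-2.7",
        max_value="3.1",
        mean="0.02",
        median="0.01",
        stdev="0.98",
    ),
)

# Serialized form sent over the comm, matching the column_summary_stats schema
# declared in data_explorer-backend-openrpc.json.
payload = stats.dict()
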
diff --git a/src/vs/workbench/services/languageRuntime/common/positronDataExplorerComm.ts b/src/vs/workbench/services/languageRuntime/common/positronDataExplorerComm.ts index 8af3e7e49be..ae4f15dc01d 100644 --- a/src/vs/workbench/services/languageRuntime/common/positronDataExplorerComm.ts +++ b/src/vs/workbench/services/languageRuntime/common/positronDataExplorerComm.ts @@ -112,7 +112,7 @@ export interface ColumnSchema { /** * Canonical Positron display name of data type */ - type_display: ColumnSchemaTypeDisplay; + type_display: ColumnDisplayType; /** * Column annotation / description @@ -310,9 +310,36 @@ export interface ColumnProfileResult { } /** - * ColumnSummaryStats in Schemas + * Profile result containing summary stats for a column based on the data + * type */ export interface ColumnSummaryStats { + /** + * Canonical Positron display name of data type + */ + type_display: ColumnDisplayType; + + /** + * Statistics for a numeric data type + */ + number_stats?: SummaryStatsNumber; + + /** + * Statistics for a string-like data type + */ + string_stats?: SummaryStatsString; + + /** + * Statistics for a boolean data type + */ + boolean_stats?: SummaryStatsBoolean; + +} + +/** + * SummaryStatsNumber in Schemas + */ +export interface SummaryStatsNumber { /** * Minimum value as string */ @@ -326,22 +353,49 @@ export interface ColumnSummaryStats { /** * Average value as string */ - mean_value?: string; + mean: string; /** * Sample median (50% value) value as string */ - median?: string; + median: string; /** - * 25th percentile value as string + * Sample standard deviation as a string + */ + stdev: string; + +} + +/** + * SummaryStatsBoolean in Schemas + */ +export interface SummaryStatsBoolean { + /** + * The number of non-null true values + */ + true_count: number; + + /** + * The number of non-null false values + */ + false_count: number; + +} + +/** + * SummaryStatsString in Schemas + */ +export interface SummaryStatsString { + /** + * The number of empty / length-zero values */ - q25?: string; + num_empty: number; /** - * 75th percentile value as string + * The exact number of distinct values */ - q75?: string; + num_unique: number; } @@ -432,9 +486,9 @@ export interface ColumnSortKey { } /** - * Possible values for TypeDisplay in ColumnSchema + * Possible values for ColumnDisplayType */ -export enum ColumnSchemaTypeDisplay { +export enum ColumnDisplayType { Number = 'number', Boolean = 'boolean', String = 'string', @@ -540,7 +594,7 @@ export class PositronDataExplorerComm extends PositronBaseComm { * * Search schema for column names matching a passed substring * - * @param searchTerm Substring to match for (currently case insensitive + * @param searchTerm Substring to match for (currently case insensitive) * @param startIndex Index (starting from zero) of first result to fetch * @param maxResults Maximum number of resulting column schemas to fetch * from the start index diff --git a/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx b/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx index 691003a554b..5b391a50233 100644 --- a/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx +++ b/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx @@ -11,7 +11,7 @@ import * as React from 'react'; // Other dependencies. 
import { ProfileNumber } from 'vs/workbench/services/positronDataExplorer/browser/components/profileNumber'; import { ProfileString } from 'vs/workbench/services/positronDataExplorer/browser/components/profileString'; -import { ColumnSchema, ColumnSchemaTypeDisplay } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; +import { ColumnSchema, ColumnDisplayType } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; import { TableSummaryDataGridInstance } from 'vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance'; /** @@ -37,31 +37,31 @@ export const ColumnSummaryCell = (props: ColumnSummaryCellProps) => { const dataTypeIcon = () => { // Determine the alignment based on type. switch (props.columnSchema.type_display) { - case ColumnSchemaTypeDisplay.Number: + case ColumnDisplayType.Number: return 'codicon-positron-data-type-number'; - case ColumnSchemaTypeDisplay.Boolean: + case ColumnDisplayType.Boolean: return 'codicon-positron-data-type-boolean'; - case ColumnSchemaTypeDisplay.String: + case ColumnDisplayType.String: return 'codicon-positron-data-type-string'; - case ColumnSchemaTypeDisplay.Date: + case ColumnDisplayType.Date: return 'codicon-positron-data-type-date'; - case ColumnSchemaTypeDisplay.Datetime: + case ColumnDisplayType.Datetime: return 'codicon-positron-data-type-date-time'; - case ColumnSchemaTypeDisplay.Time: + case ColumnDisplayType.Time: return 'codicon-positron-data-type-time'; - case ColumnSchemaTypeDisplay.Array: + case ColumnDisplayType.Array: return 'codicon-positron-data-type-array'; - case ColumnSchemaTypeDisplay.Struct: + case ColumnDisplayType.Struct: return 'codicon-positron-data-type-struct'; - case ColumnSchemaTypeDisplay.Unknown: + case ColumnDisplayType.Unknown: return 'codicon-positron-data-type-unknown'; // This shouldn't ever happen. @@ -77,31 +77,31 @@ export const ColumnSummaryCell = (props: ColumnSummaryCellProps) => { const profile = () => { // Determine the alignment based on type. switch (props.columnSchema.type_display) { - case ColumnSchemaTypeDisplay.Number: + case ColumnDisplayType.Number: return ; - case ColumnSchemaTypeDisplay.Boolean: + case ColumnDisplayType.Boolean: return null; - case ColumnSchemaTypeDisplay.String: + case ColumnDisplayType.String: return ; - case ColumnSchemaTypeDisplay.Date: + case ColumnDisplayType.Date: return null; - case ColumnSchemaTypeDisplay.Datetime: + case ColumnDisplayType.Datetime: return null; - case ColumnSchemaTypeDisplay.Time: + case ColumnDisplayType.Time: return null; - case ColumnSchemaTypeDisplay.Array: + case ColumnDisplayType.Array: return null; - case ColumnSchemaTypeDisplay.Struct: + case ColumnDisplayType.Struct: return null; - case ColumnSchemaTypeDisplay.Unknown: + case ColumnDisplayType.Unknown: return null; // This shouldn't ever happen. 
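Editorial aside, not part of the patch: the cell and profile components above dispatch on ColumnDisplayType, and a consumer of the summary-stats union does the same, reading only the block that matches type_display. A minimal sketch under the same assumptions as the previous example (Python-side models; the describe helper name is made up for illustration):

from positron_ipykernel.data_explorer_comm import ColumnDisplayType, ColumnSummaryStats


def describe(stats: ColumnSummaryStats) -> str:
    # Read only the stats block that matches the display type.
    if stats.type_display == ColumnDisplayType.Number and stats.number_stats:
        ns = stats.number_stats
        return f"min={ns.min_value}, max={ns.max_value}, mean={ns.mean}, stdev={ns.stdev}"
    if stats.type_display == ColumnDisplayType.String and stats.string_stats:
        ss = stats.string_stats
        return f"{ss.num_unique} distinct values, {ss.num_empty} empty"
    if stats.type_display == ColumnDisplayType.Boolean and stats.boolean_stats:
        bs = stats.boolean_stats
        return f"{bs.true_count} true / {bs.false_count} false"
    # Other display types carry no stats block in this revision of the protocol.
    return "no summary available"
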
diff --git a/src/vs/workbench/services/positronDataExplorer/browser/positronDataExplorerColumn.ts b/src/vs/workbench/services/positronDataExplorer/browser/positronDataExplorerColumn.ts index ac21df778cf..2eb6dc74e9b 100644 --- a/src/vs/workbench/services/positronDataExplorer/browser/positronDataExplorerColumn.ts +++ b/src/vs/workbench/services/positronDataExplorer/browser/positronDataExplorerColumn.ts @@ -3,7 +3,7 @@ *--------------------------------------------------------------------------------------------*/ import { DataColumnAlignment } from 'vs/workbench/browser/positronDataGrid/interfaces/dataColumn'; -import { ColumnSchema, ColumnSchemaTypeDisplay } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; +import { ColumnSchema, ColumnDisplayType } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; import { IPositronDataExplorerColumn } from 'vs/workbench/services/positronDataExplorer/browser/interfaces/positronDataExplorerColumn'; /** @@ -65,31 +65,31 @@ export class PositronDataExplorerColumn implements IPositronDataExplorerColumn { get alignment() { // Determine the alignment based on type. switch (this.columnSchema.type_display) { - case ColumnSchemaTypeDisplay.Number: + case ColumnDisplayType.Number: return DataColumnAlignment.Right; - case ColumnSchemaTypeDisplay.Boolean: + case ColumnDisplayType.Boolean: return DataColumnAlignment.Left; - case ColumnSchemaTypeDisplay.String: + case ColumnDisplayType.String: return DataColumnAlignment.Left; - case ColumnSchemaTypeDisplay.Date: + case ColumnDisplayType.Date: return DataColumnAlignment.Right; - case ColumnSchemaTypeDisplay.Datetime: + case ColumnDisplayType.Datetime: return DataColumnAlignment.Right; - case ColumnSchemaTypeDisplay.Time: + case ColumnDisplayType.Time: return DataColumnAlignment.Right; - case ColumnSchemaTypeDisplay.Array: + case ColumnDisplayType.Array: return DataColumnAlignment.Left; - case ColumnSchemaTypeDisplay.Struct: + case ColumnDisplayType.Struct: return DataColumnAlignment.Left; - case ColumnSchemaTypeDisplay.Unknown: + case ColumnDisplayType.Unknown: return DataColumnAlignment.Left; } } diff --git a/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx b/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx index 502957a3d54..7021dc051b5 100644 --- a/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx +++ b/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx @@ -9,7 +9,7 @@ import * as React from 'react'; import { Emitter } from 'vs/base/common/event'; import { DataGridInstance } from 'vs/workbench/browser/positronDataGrid/classes/dataGridInstance'; import { DataExplorerCache } from 'vs/workbench/services/positronDataExplorer/common/dataExplorerCache'; -import { ColumnSchemaTypeDisplay } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; +import { ColumnDisplayType } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; import { ColumnSummaryCell } from 'vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell'; import { DataExplorerClientInstance } from 'vs/workbench/services/languageRuntime/common/languageRuntimeDataExplorerClient'; @@ -161,31 +161,31 @@ export class TableSummaryDataGridInstance extends DataGridInstance { // Return the row height. 
switch (columnSchema.type_display) { - case ColumnSchemaTypeDisplay.Number: + case ColumnDisplayType.Number: return rowHeight(6); - case ColumnSchemaTypeDisplay.Boolean: + case ColumnDisplayType.Boolean: return rowHeight(3); - case ColumnSchemaTypeDisplay.String: + case ColumnDisplayType.String: return rowHeight(3); - case ColumnSchemaTypeDisplay.Date: + case ColumnDisplayType.Date: return rowHeight(7); - case ColumnSchemaTypeDisplay.Datetime: + case ColumnDisplayType.Datetime: return rowHeight(7); - case ColumnSchemaTypeDisplay.Time: + case ColumnDisplayType.Time: return rowHeight(7); - case ColumnSchemaTypeDisplay.Array: + case ColumnDisplayType.Array: return rowHeight(2); - case ColumnSchemaTypeDisplay.Struct: + case ColumnDisplayType.Struct: return rowHeight(2); - case ColumnSchemaTypeDisplay.Unknown: + case ColumnDisplayType.Unknown: return rowHeight(2); // This shouldn't ever happen. From d9342e1a8d372afc41db5cd5c8a7e9d569b08292 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 2 Apr 2024 20:06:22 -0500 Subject: [PATCH 2/5] Kludge the UI to get the statistics to show up minimally --- .../positron_ipykernel/data_explorer.py | 130 +++++++---- .../tests/test_data_explorer.py | 210 +++++++++++++----- .../browser/components/columnSummaryCell.tsx | 7 +- .../browser/components/profileNumber.tsx | 27 ++- .../browser/tableSummaryDataGridInstance.tsx | 10 +- .../common/dataExplorerCache.ts | 39 +++- 6 files changed, 311 insertions(+), 112 deletions(-) diff --git a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py index 2e05290eed9..881b773f0ff 100644 --- a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py +++ b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py @@ -48,6 +48,9 @@ SearchSchemaResult, SetRowFiltersRequest, SetSortColumnsRequest, + SummaryStatsBoolean, + SummaryStatsNumber, + SummaryStatsString, TableData, TableSchema, TableShape, @@ -104,9 +107,7 @@ def _recompute_if_needed(self) -> bool: return False def get_schema(self, request: GetSchemaRequest): - return self._get_schema( - request.params.start_index, request.params.num_columns - ).dict() + return self._get_schema(request.params.start_index, request.params.num_columns).dict() def search_schema(self, request: SearchSchemaRequest): return self._search_schema( @@ -294,9 +295,11 @@ def __init__( # search term changes, we discard the last search result. We # might add an LRU cache here or something if it helps # performance. 
- self._search_schema_last_result: Optional[ - Tuple[str, List[ColumnSchema]] - ] = None + self._search_schema_last_result: Optional[Tuple[str, List[ColumnSchema]]] = None + + # squelch a warning from pandas 2.2.0 about the use below of + # fillna + pd_.set_option("future.no_silent_downcasting", True) def invalidate_computations(self): self.filtered_indices = self.view_indices = None @@ -341,12 +344,7 @@ def _get_schema(self, column_start: int, num_columns: int) -> TableSchema: column_start, min(column_start + num_columns, len(self.table.columns)), ): - column_raw_name = self.table.columns[column_index] - column_name = str(column_raw_name) - - col_schema = self._get_single_column_schema( - column_index, column_name - ) + col_schema = self._get_single_column_schema(column_index) column_schemas.append(col_schema) return TableSchema(columns=column_schemas) @@ -372,9 +370,7 @@ def _search_schema( total_num_matches=len(matches), ) - def _search_schema_get_matches( - self, search_term: str - ) -> List[ColumnSchema]: + def _search_schema_get_matches(self, search_term: str) -> List[ColumnSchema]: matches = [] for column_index in range(len(self.table.columns)): column_raw_name = self.table.columns[column_index] @@ -384,9 +380,7 @@ def _search_schema_get_matches( if search_term not in column_name.lower(): continue - col_schema = self._get_single_column_schema( - column_index, column_name - ) + col_schema = self._get_single_column_schema(column_index) matches.append(col_schema) return matches @@ -395,12 +389,13 @@ def _get_inferred_dtype(self, column_index: int): from pandas.api.types import infer_dtype if column_index not in self._inferred_dtypes: - self._inferred_dtypes[column_index] = infer_dtype( - self.table.iloc[:, column_index] - ) + self._inferred_dtypes[column_index] = infer_dtype(self.table.iloc[:, column_index]) return self._inferred_dtypes[column_index] - def _get_single_column_schema(self, column_index: int, column_name: str): + def _get_single_column_schema(self, column_index: int): + column_raw_name = self.table.columns[column_index] + column_name = str(column_raw_name) + # TODO: pandas MultiIndex columns # TODO: time zone for datetimetz datetime64[ns] types dtype = self.dtypes.iloc[column_index] @@ -443,16 +438,15 @@ def _get_data_values( if self.view_indices is not None: # If the table is either filtered or sorted, use a slice - # the view_indices to select the virtual range of values for the grid + # the view_indices to select the virtual range of values + # for the grid view_slice = self.view_indices[row_start : row_start + num_rows] columns = [col.take(view_slice) for col in columns] indices = self.table.index.take(view_slice) else: # No filtering or sorting, just slice directly indices = self.table.index[row_start : row_start + num_rows] - columns = [ - col.iloc[row_start : row_start + num_rows] for col in columns - ] + columns = [col.iloc[row_start : row_start + num_rows] for col in columns] formatted_columns = [_pandas_format_values(col) for col in columns] @@ -560,7 +554,7 @@ def _eval_filter(self, filt: RowFilter): # Nulls are possible in the mask, so we just fill them if any if mask.dtype != bool: - mask = mask.fillna(False) + mask = mask.fillna(False).infer_objects(copy=False) return mask.to_numpy() @@ -585,9 +579,7 @@ def _sort_data(self) -> None: self.view_indices = self.filtered_indices.take(sort_indexer) else: # Data is not filtered - self.view_indices = nargsort( - column, kind="mergesort", ascending=key.ascending - ) + self.view_indices = nargsort(column, 
kind="mergesort", ascending=key.ascending) elif len(self.sort_keys) > 1: # Multiple sorting keys cols_to_sort = [] @@ -618,7 +610,63 @@ def _prof_null_count(self, column_index: int): return self._get_column(column_index).isnull().sum() def _prof_summary_stats(self, column_index: int): - raise NotImplementedError + col_schema = self._get_single_column_schema(column_index) + col = self._get_column(column_index) + + ui_type = col_schema.type_display + handler = self._SUMMARIZERS.get(ui_type) + + if handler is None: + # Return nothing for types we don't yet know how to summarize + return ColumnSummaryStats(type_display=ui_type) + else: + return handler(col) + + @staticmethod + def _summarize_number(col: "pd.Series"): + min_value = col.min() + max_value = col.max() + mean = col.mean() + median = col.median() + stdev = col.std() + + return ColumnSummaryStats( + type_display=ColumnDisplayType.Number, + number_stats=SummaryStatsNumber( + min_value=str(min_value), + max_value=str(max_value), + mean=str(mean), + median=str(median), + stdev=str(stdev), + ), + ) + + @staticmethod + def _summarize_string(col: "pd.Series"): + num_empty = (col.str.len() == 0).sum() + num_unique = col.nunique() + + return ColumnSummaryStats( + type_display=ColumnDisplayType.String, + string_stats=SummaryStatsString(num_empty=num_empty, num_unique=num_unique), + ) + + @staticmethod + def _summarize_boolean(col: "pd.Series"): + null_count = col.isnull().sum() + true_count = col.sum() + false_count = len(col) - true_count - null_count + + return ColumnSummaryStats( + type_display=ColumnDisplayType.Boolean, + boolean_stats=SummaryStatsBoolean(true_count=true_count, false_count=false_count), + ) + + _SUMMARIZERS = { + ColumnDisplayType.Boolean: _summarize_boolean, + ColumnDisplayType.Number: _summarize_number, + ColumnDisplayType.String: _summarize_string, + } def _prof_freq_table(self, column_index: int): raise NotImplementedError @@ -628,9 +676,7 @@ def _prof_histogram(self, column_index: int): def _get_state(self) -> TableState: return TableState( - table_shape=TableShape( - num_rows=self.table.shape[0], num_columns=self.table.shape[1] - ), + table_shape=TableShape(num_rows=self.table.shape[0], num_columns=self.table.shape[1]), row_filters=self.filters, sort_keys=self.sort_keys, ) @@ -817,9 +863,7 @@ def handle_variable_updated(self, variable_name, new_variable): for comm_id in list(self.path_to_comm_ids[path]): self._update_explorer_for_comm(comm_id, path, new_variable) - def _update_explorer_for_comm( - self, comm_id: str, path: PathKey, new_variable - ): + def _update_explorer_for_comm(self, comm_id: str, path: PathKey, new_variable): """ If a variable is updated, we have to handle the different scenarios: @@ -853,9 +897,7 @@ def _update_explorer_for_comm( # data explorer open for a nested value, then we need to use # the same variables inspection logic to resolve it here. if len(path) > 1: - is_found, new_table = _resolve_value_from_path( - new_variable, path[1:] - ) + is_found, new_table = _resolve_value_from_path(new_variable, path[1:]) if not is_found: raise KeyError(f"Path {', '.join(path)} not found in value") else: @@ -874,9 +916,7 @@ def _fire_data_update(): def _fire_schema_update(discard_state=False): msg = SchemaUpdateParams(discard_state=discard_state) - comm.send_event( - DataExplorerFrontendEvent.SchemaUpdate.value, msg.dict() - ) + comm.send_event(DataExplorerFrontendEvent.SchemaUpdate.value, msg.dict()) if type(new_table) is not type(table_view.table): # noqa: E721 # Data type has changed. 
For now, we will signal the UI to @@ -921,9 +961,7 @@ def _fire_schema_update(discard_state=False): else: _fire_data_update() - def handle_msg( - self, msg: CommMessage[DataExplorerBackendMessageContent], raw_msg - ): + def handle_msg(self, msg: CommMessage[DataExplorerBackendMessageContent], raw_msg): """ Handle messages received from the client via the positron.data_explorer comm. diff --git a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py index 64d0bab6e3d..3876af048cb 100644 --- a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py +++ b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py @@ -13,11 +13,16 @@ from ..access_keys import encode_access_key from ..data_explorer import COMPARE_OPS, DataExplorerService from ..data_explorer_comm import ( + ColumnDisplayType, + ColumnSummaryStats, RowFilter, ColumnProfileResult, ColumnSchema, ColumnSortKey, FilterResult, + SummaryStatsNumber, + SummaryStatsString, + SummaryStatsBoolean, ) from .conftest import DummyComm, PositronShell from .test_variables import BIG_ARRAY_LENGTH @@ -426,7 +431,7 @@ def compare_tables(self, table_id: str, expected_id: str, table_shape: tuple): @pytest.fixture() -def de_fixture(de_service: DataExplorerService): +def dxf(de_service: DataExplorerService): return DataExplorerFixture(de_service) @@ -434,8 +439,8 @@ def _wrap_json(model: Type[BaseModel], data: JsonRecords): return [model(**d).dict() for d in data] -def test_pandas_get_state(de_fixture: DataExplorerFixture): - result = de_fixture.get_state("simple") +def test_pandas_get_state(dxf: DataExplorerFixture): + result = dxf.get_state("simple") assert result["table_shape"]["num_rows"] == 5 assert result["table_shape"]["num_columns"] == 6 @@ -444,17 +449,15 @@ def test_pandas_get_state(de_fixture: DataExplorerFixture): {"column_index": 1, "ascending": False}, ] filters = [_compare_filter(0, ">", 0), _compare_filter(0, "<", 5)] - de_fixture.set_sort_columns("simple", sort_keys=sort_keys) - de_fixture.set_row_filters("simple", filters=filters) + dxf.set_sort_columns("simple", sort_keys=sort_keys) + dxf.set_row_filters("simple", filters=filters) - result = de_fixture.get_state("simple") + result = dxf.get_state("simple") assert result["sort_keys"] == sort_keys assert result["row_filters"] == [RowFilter(**f) for f in filters] -def test_pandas_get_schema(de_fixture: DataExplorerFixture): - dxf = de_fixture - +def test_pandas_get_schema(dxf: DataExplorerFixture): result = dxf.get_schema("simple", 0, 100) full_schema = [ @@ -524,9 +527,7 @@ def test_pandas_get_schema(de_fixture: DataExplorerFixture): assert result["columns"] == _wrap_json(ColumnSchema, bigger_schema[10:20]) -def test_pandas_wide_schemas(de_fixture: DataExplorerFixture): - dxf = de_fixture - +def test_pandas_wide_schemas(dxf: DataExplorerFixture): arr = np.arange(10).astype(object) ncols = 10000 @@ -550,9 +551,7 @@ def test_pandas_wide_schemas(de_fixture: DataExplorerFixture): assert left == right -def test_pandas_search_schema(de_fixture: DataExplorerFixture): - dxf = de_fixture - +def test_pandas_search_schema(dxf: DataExplorerFixture): # Make a few thousand column names we can search for column_names = [ f"{prefix}_{i}" @@ -591,8 +590,8 @@ def _trim_whitespace(columns): return [[x.strip() for x in column] for column in columns] -def test_pandas_get_data_values(de_fixture: 
DataExplorerFixture): - result = de_fixture.get_data_values( +def test_pandas_get_data_values(dxf: DataExplorerFixture): + result = dxf.get_data_values( "simple", row_start_index=0, num_rows=20, @@ -621,14 +620,12 @@ def test_pandas_get_data_values(de_fixture: DataExplorerFixture): assert result["row_labels"] == [["0", "1", "2", "3", "4"]] # Edge cases: request beyond end of table - response = de_fixture.get_data_values( - "simple", row_start_index=5, num_rows=10, column_indices=[0] - ) + response = dxf.get_data_values("simple", row_start_index=5, num_rows=10, column_indices=[0]) assert response["columns"] == [[]] # Issue #2149 -- return empty result when requesting non-existent # column indices - response = de_fixture.get_data_values( + response = dxf.get_data_values( "simple", row_start_index=0, num_rows=5, column_indices=[2, 3, 4, 5] ) assert _trim_whitespace(response["columns"]) == expected_columns[2:] @@ -638,7 +635,7 @@ def test_pandas_get_data_values(de_fixture: DataExplorerFixture): # to request non-existent column indices, disable this test # with pytest.raises(IndexError): - # de_fixture.get_data_values( + # dxf.get_data_values( # "simple", row_start_index=0, num_rows=10, column_indices=[4] # ) @@ -690,8 +687,7 @@ def _set_member_filter(column_index, values, inclusive=True): ) -def test_pandas_filter_between(de_fixture: DataExplorerFixture): - dxf = de_fixture +def test_pandas_filter_between(dxf: DataExplorerFixture): df = SIMPLE_PANDAS_DF column = "a" column_index = df.columns.get_loc(column) @@ -719,7 +715,7 @@ def test_pandas_filter_between(de_fixture: DataExplorerFixture): ) -def test_pandas_filter_compare(de_fixture: DataExplorerFixture): +def test_pandas_filter_compare(dxf: DataExplorerFixture): # Just use the 'a' column to smoke test comparison filters on # integers table_name = "simple" @@ -731,23 +727,23 @@ def test_pandas_filter_compare(de_fixture: DataExplorerFixture): for op, op_func in COMPARE_OPS.items(): filt = _compare_filter(column_index, op, str(compare_value)) expected_df = df[op_func(df[column], compare_value)] - de_fixture.check_filter_case(df, [filt], expected_df) + dxf.check_filter_case(df, [filt], expected_df) # TODO(wesm): move these tests to their own test case # Test that passing empty filter set resets to unfiltered state filt = _compare_filter(column_index, "<", str(compare_value)) - _ = de_fixture.set_row_filters(table_name, filters=[filt]) - response = de_fixture.set_row_filters(table_name, filters=[]) + _ = dxf.set_row_filters(table_name, filters=[filt]) + response = dxf.set_row_filters(table_name, filters=[]) assert response == FilterResult(selected_num_rows=len(df)) # register the whole table to make sure the filters are really cleared ex_id = guid() - de_fixture.register_table(ex_id, df) - de_fixture.compare_tables(table_name, ex_id, df.shape) + dxf.register_table(ex_id, df) + dxf.compare_tables(table_name, ex_id, df.shape) -def test_pandas_filter_is_null_not_null(de_fixture: DataExplorerFixture): +def test_pandas_filter_is_null_not_null(dxf: DataExplorerFixture): df = SIMPLE_PANDAS_DF b_is_null = _filter("is_null", 1) b_not_null = _filter("not_null", 1) @@ -760,10 +756,10 @@ def test_pandas_filter_is_null_not_null(de_fixture: DataExplorerFixture): ] for filter_set, expected_df in cases: - de_fixture.check_filter_case(df, filter_set, expected_df) + dxf.check_filter_case(df, filter_set, expected_df) -def test_pandas_filter_set_membership(de_fixture: DataExplorerFixture): +def test_pandas_filter_set_membership(dxf: DataExplorerFixture): df = 
SIMPLE_PANDAS_DF cases = [ @@ -778,11 +774,10 @@ def test_pandas_filter_set_membership(de_fixture: DataExplorerFixture): ] for filter_set, expected_df in cases: - de_fixture.check_filter_case(df, filter_set, expected_df) + dxf.check_filter_case(df, filter_set, expected_df) -def test_pandas_filter_search(de_fixture: DataExplorerFixture): - dxf = de_fixture +def test_pandas_filter_search(dxf: DataExplorerFixture): df = pd.DataFrame( { "a": ["foo1", "foo2", None, "2FOO", "FOO3", "bar1", "2BAR"], @@ -856,7 +851,7 @@ def test_pandas_filter_search(de_fixture: DataExplorerFixture): ) -def test_pandas_set_sort_columns(de_fixture: DataExplorerFixture): +def test_pandas_set_sort_columns(dxf: DataExplorerFixture): tables = { "df1": SIMPLE_PANDAS_DF, # Just some random data to test multiple keys, different sort @@ -902,18 +897,18 @@ def test_pandas_set_sort_columns(de_fixture: DataExplorerFixture): expected_df = df.sort_values(**expected_params) - de_fixture.check_sort_case(df, wrapped_keys, expected_df) + dxf.check_sort_case(df, wrapped_keys, expected_df) for filter_f, filters in filter_cases.get(df_name, []): expected_filtered = filter_f(df).sort_values(**expected_params) - de_fixture.check_sort_case(df, wrapped_keys, expected_filtered, filters=filters) + dxf.check_sort_case(df, wrapped_keys, expected_filtered, filters=filters) def test_pandas_change_schema_after_sort( shell: PositronShell, de_service: DataExplorerService, variables_comm: DummyComm, - de_fixture: DataExplorerFixture, + dxf: DataExplorerFixture, ): df = pd.DataFrame( { @@ -929,26 +924,32 @@ def test_pandas_change_schema_after_sort( # Sort a column that is out of bounds for the table after the # schema change below - de_fixture.set_sort_columns("df", [{"column_index": 4, "ascending": True}]) + dxf.set_sort_columns("df", [{"column_index": 4, "ascending": True}]) expected_df = df[["a", "b"]] - de_fixture.register_table("expected_df", df) + dxf.register_table("expected_df", df) # Sort last column, and we will then change the schema shell.run_cell("df = df[['a', 'b']]") _check_update_variable(de_service, "df", update_type="schema", discard_state=True) # Call get_data_values and make sure it works - de_fixture.compare_tables("df", "expected_df", expected_df.shape) + dxf.compare_tables("df", "expected_df", expected_df.shape) def _profile_request(column_index, profile_type): return {"column_index": column_index, "type": profile_type} -def test_pandas_profile_null_counts(de_fixture: DataExplorerFixture): - dxf = de_fixture +def _get_null_count(column_index): + return _profile_request(column_index, "null_count") + + +def _get_summary_stats(column_index): + return _profile_request(column_index, "summary_stats") + +def test_pandas_profile_null_counts(dxf: DataExplorerFixture): df1 = pd.DataFrame( { "a": [0, np.nan, 2, np.nan, 4, 5, 6], @@ -962,26 +963,28 @@ def test_pandas_profile_null_counts(de_fixture: DataExplorerFixture): for name, df in tables.items(): dxf.register_table(name, df) - def _null_count(column_index): - return _profile_request(column_index, "null_count") - # tuples like (table_name, [ColumnProfileRequest], [results]) all_profiles = [ - _null_count(0), - _null_count(1), - _null_count(2), - _null_count(3), + _get_null_count(0), + _get_null_count(1), + _get_null_count(2), + _get_null_count(3), ] cases = [ ("df1", [], []), ( "df1", - [_null_count(3)], + [_get_null_count(3)], [0], ), ( "df1", - [_null_count(0), _null_count(1), _null_count(2), _null_count(3)], + [ + _get_null_count(0), + _get_null_count(1), + _get_null_count(2), + 
_get_null_count(3), + ], [2, 3, 4, 0], ), ] @@ -1010,6 +1013,105 @@ def _null_count(column_index): assert results == ex_results +EPSILON = 1e-7 + + +def _assert_close(expected, actual): + assert np.abs(actual - expected) < EPSILON + + +def _assert_numeric_stats_equal(expected, actual): + for attr, value in expected.items(): + _assert_close(value, float(actual.get(attr))) + + +def _assert_string_stats_equal(expected, actual): + assert expected["num_empty"] == actual["num_empty"] + assert expected["num_unique"] == actual["num_unique"] + + +def _assert_boolean_stats_equal(expected, actual): + assert expected["true_count"] == actual["true_count"] + assert expected["false_count"] == actual["false_count"] + + +def test_pandas_profile_summary_stats(dxf: DataExplorerFixture): + arr = np.random.standard_normal(100) + arr_with_nulls = arr.copy() + arr_with_nulls[::10] = np.nan + + df1 = pd.DataFrame( + { + "a": arr, + "b": arr_with_nulls, + "c": [False, False, False, True, None] * 20, + "d": [ + "foo", + "", + "baz", + "qux", + "foo", + None, + "bar", + "", + "bar", + "zzz", + ] + * 10, + } + ) + dxf.register_table("df1", df1) + + cases = [ + ( + "df1", + 0, + { + "min_value": arr.min(), + "max_value": arr.max(), + "mean": df1["a"].mean(), + "stdev": df1["a"].std(), + "median": df1["a"].median(), + }, + ), + ( + "df1", + 1, + { + "min_value": df1["b"].min(), + "max_value": df1["b"].max(), + "mean": df1["b"].mean(), + "stdev": df1["b"].std(), + "median": df1["b"].median(), + }, + ), + ( + "df1", + 2, + {"true_count": 20, "false_count": 60}, + ), + ( + "df1", + 3, + {"num_empty": 20, "num_unique": 6}, + ), + ] + + for table_name, col_index, ex_result in cases: + profiles = [_get_summary_stats(col_index)] + results = dxf.get_column_profiles(table_name, profiles) + + stats = results[0]["summary_stats"] + ui_type = stats["type_display"] + + if ui_type == ColumnDisplayType.Number: + _assert_numeric_stats_equal(ex_result, stats["number_stats"]) + elif ui_type == ColumnDisplayType.String: + _assert_string_stats_equal(ex_result, stats["string_stats"]) + elif ui_type == ColumnDisplayType.Boolean: + _assert_boolean_stats_equal(ex_result, stats["boolean_stats"]) + + # ---------------------------------------------------------------------- # Test RPCs for polars DataFrame diff --git a/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx b/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx index 5b391a50233..1c49daf3e55 100644 --- a/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx +++ b/src/vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell.tsx @@ -75,10 +75,15 @@ export const ColumnSummaryCell = (props: ColumnSummaryCellProps) => { * @returns The profile component. */ const profile = () => { + // Hack just to get things working + props.instance.computeColumnSummaryStats(props.columnIndex); // Determine the alignment based on type. 
switch (props.columnSchema.type_display) { case ColumnDisplayType.Number: - return ; + return ; case ColumnDisplayType.Boolean: return null; diff --git a/src/vs/workbench/services/positronDataExplorer/browser/components/profileNumber.tsx b/src/vs/workbench/services/positronDataExplorer/browser/components/profileNumber.tsx index 679e9a7cdbe..c726fd20368 100644 --- a/src/vs/workbench/services/positronDataExplorer/browser/components/profileNumber.tsx +++ b/src/vs/workbench/services/positronDataExplorer/browser/components/profileNumber.tsx @@ -8,10 +8,14 @@ import 'vs/css!./profileNumber'; // React. import * as React from 'react'; +import { TableSummaryDataGridInstance } from 'vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance'; + /** * ProfileNumberProps interface. */ interface ProfileNumberProps { + instance: TableSummaryDataGridInstance; + columnIndex: number; } /** @@ -20,33 +24,38 @@ interface ProfileNumberProps { * @returns The rendered component. */ export const ProfileNumber = (props: ProfileNumberProps) => { + // Hack + let stats: any = props.instance.getColumnSummaryStats(props.columnIndex)?.number_stats!; + if (!stats) { + stats = {}; + } return (
NA
- Median
Mean
+ Median
SD
Min
Max
- 12
- 1
- 4
- 2
- 5
- 102
+ -999999
+ {stats.mean}
+ {stats.median}
+ {stats.stdev}
+ {stats.min_value}
+ {stats.max_value}
-
+ {/*

.51
.20
.24

.44
-
+
*/}
); diff --git a/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx b/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx index 7021dc051b5..8e7d379c53a 100644 --- a/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx +++ b/src/vs/workbench/services/positronDataExplorer/browser/tableSummaryDataGridInstance.tsx @@ -9,7 +9,7 @@ import * as React from 'react'; import { Emitter } from 'vs/base/common/event'; import { DataGridInstance } from 'vs/workbench/browser/positronDataGrid/classes/dataGridInstance'; import { DataExplorerCache } from 'vs/workbench/services/positronDataExplorer/common/dataExplorerCache'; -import { ColumnDisplayType } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; +import { ColumnDisplayType, ColumnSummaryStats } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; import { ColumnSummaryCell } from 'vs/workbench/services/positronDataExplorer/browser/components/columnSummaryCell'; import { DataExplorerClientInstance } from 'vs/workbench/services/languageRuntime/common/languageRuntimeDataExplorerClient'; @@ -230,6 +230,14 @@ export class TableSummaryDataGridInstance extends DataGridInstance { nullCount * 100 / this._dataExplorerCache.rows); } + getColumnSummaryStats(columnIndex: number): ColumnSummaryStats | undefined { + return this._dataExplorerCache.getColumnSummaryStats(columnIndex); + } + + computeColumnSummaryStats(columnIndex: number) { + this._dataExplorerCache.updateColumnSummaryStats([columnIndex]); + } + //#endregion DataGridInstance Methods //#region Public Events diff --git a/src/vs/workbench/services/positronDataExplorer/common/dataExplorerCache.ts b/src/vs/workbench/services/positronDataExplorer/common/dataExplorerCache.ts index f3c7961ab8d..1696850f065 100644 --- a/src/vs/workbench/services/positronDataExplorer/common/dataExplorerCache.ts +++ b/src/vs/workbench/services/positronDataExplorer/common/dataExplorerCache.ts @@ -4,7 +4,7 @@ import { Emitter } from 'vs/base/common/event'; import { Disposable } from 'vs/base/common/lifecycle'; -import { ColumnProfileRequestType, ColumnSchema, TableData } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; +import { ColumnProfileRequestType, ColumnSchema, ColumnSummaryStats, TableData } from 'vs/workbench/services/languageRuntime/common/positronDataExplorerComm'; import { DataExplorerClientInstance } from 'vs/workbench/services/languageRuntime/common/languageRuntimeDataExplorerClient'; /** @@ -72,6 +72,11 @@ export class DataExplorerCache extends Disposable { */ private readonly _columnNullCountCache = new Map(); + /** + * Gets the column summary stats cache. + */ + private readonly _columnSummaryStatsCache = new Map(); + /** * Gets the row label cache. */ @@ -107,6 +112,7 @@ export class DataExplorerCache extends Disposable { // Clear the column schema cache, row label cache, and data cell cache. 
this._columnSchemaCache.clear(); this._columnNullCountCache.clear(); + this._columnSummaryStatsCache.clear(); this._rowLabelCache.clear(); this._dataCellCache.clear(); })); @@ -117,6 +123,7 @@ export class DataExplorerCache extends Disposable { this._rowLabelCache.clear(); this._dataCellCache.clear(); this._columnNullCountCache.clear(); + this._columnSummaryStatsCache.clear(); })); } @@ -158,6 +165,7 @@ export class DataExplorerCache extends Disposable { this._rowLabelCache.clear(); this._dataCellCache.clear(); this._columnNullCountCache.clear(); + this._columnSummaryStatsCache.clear(); } /** @@ -190,6 +198,35 @@ export class DataExplorerCache extends Disposable { return this._columnNullCountCache.get(columnIndex); } + /** + * Gets the cached summary stats for the specified column index. + * @param columnIndex The column index. + * @returns ColumnSummaryStats in the specified column index. + */ + getColumnSummaryStats(columnIndex: number) { + return this._columnSummaryStatsCache.get(columnIndex); + } + + async updateColumnSummaryStats(columnIndices: Array) { + // Request the profiles + const results = await this._dataExplorerClientInstance.getColumnProfiles( + columnIndices.map(column_index => { + return { + column_index, + type: ColumnProfileRequestType.SummaryStats + }; + }) + ); + + // Update the column schema cache, overwriting any entries we already have cached. + for (let i = 0; i < results.length; i++) { + const stats = results[i].summary_stats; + if (stats !== undefined) { + this._columnSummaryStatsCache.set(columnIndices[i], stats); + } + } + } + /** * Gets the row label for the specified row index. * @param rowIndex The row index. From 63ec1b4a0e6539e8f83ac25f25ac6eea1cabde86 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 2 Apr 2024 20:14:12 -0500 Subject: [PATCH 3/5] Fix ruff flakes --- .../positron/positron_ipykernel/tests/test_data_explorer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py index 3876af048cb..6da925b861d 100644 --- a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py +++ b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py @@ -14,15 +14,11 @@ from ..data_explorer import COMPARE_OPS, DataExplorerService from ..data_explorer_comm import ( ColumnDisplayType, - ColumnSummaryStats, RowFilter, ColumnProfileResult, ColumnSchema, ColumnSortKey, FilterResult, - SummaryStatsNumber, - SummaryStatsString, - SummaryStatsBoolean, ) from .conftest import DummyComm, PositronShell from .test_variables import BIG_ARRAY_LENGTH From e08fb96f5b03a6c4351820197151479c9dccf9e7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 2 Apr 2024 20:21:51 -0500 Subject: [PATCH 4/5] Work around python 3.9 fussiness --- .../positron/positron_ipykernel/data_explorer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py index 881b773f0ff..b920b93a76e 100644 --- a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py +++ b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py @@ -301,6 +301,14 @@ def __init__( # fillna 
pd_.set_option("future.no_silent_downcasting", True) + # Putting this here rather than in the class body before + # Python < 3.10 has fussier rules about staticmethods + self._SUMMARIZERS = { + ColumnDisplayType.Boolean: self._summarize_boolean, + ColumnDisplayType.Number: self._summarize_number, + ColumnDisplayType.String: self._summarize_string, + } + def invalidate_computations(self): self.filtered_indices = self.view_indices = None self._need_recompute = True @@ -662,12 +670,6 @@ def _summarize_boolean(col: "pd.Series"): boolean_stats=SummaryStatsBoolean(true_count=true_count, false_count=false_count), ) - _SUMMARIZERS = { - ColumnDisplayType.Boolean: _summarize_boolean, - ColumnDisplayType.Number: _summarize_number, - ColumnDisplayType.String: _summarize_string, - } - def _prof_freq_table(self, column_index: int): raise NotImplementedError From 351ccbb7d901ceb96a0efc348e88f8201407deed Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 3 Apr 2024 10:02:44 -0500 Subject: [PATCH 5/5] Don't fiddle with pandas options, work around warning another way --- .../positron/positron_ipykernel/data_explorer.py | 9 ++++----- .../positron_ipykernel/tests/test_data_explorer.py | 3 ++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py index b920b93a76e..05eccba168b 100644 --- a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py +++ b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/data_explorer.py @@ -297,10 +297,6 @@ def __init__( # performance. self._search_schema_last_result: Optional[Tuple[str, List[ColumnSchema]]] = None - # squelch a warning from pandas 2.2.0 about the use below of - # fillna - pd_.set_option("future.no_silent_downcasting", True) - # Putting this here rather than in the class body before # Python < 3.10 has fussier rules about staticmethods self._SUMMARIZERS = { @@ -560,9 +556,12 @@ def _eval_filter(self, filt: RowFilter): elif params.type == SearchFilterParamsType.EndsWith: mask = col.str.endswith(term) + assert mask is not None + # Nulls are possible in the mask, so we just fill them if any if mask.dtype != bool: - mask = mask.fillna(False).infer_objects(copy=False) + mask[mask.isna()] = False + mask = mask.astype(bool) return mask.to_numpy() diff --git a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py index 6da925b861d..26c04fb157b 100644 --- a/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py +++ b/extensions/positron-python/pythonFiles/positron/positron_ipykernel/tests/test_data_explorer.py @@ -832,7 +832,8 @@ def test_pandas_filter_search(dxf: DataExplorerFixture): ] for search_type, column_index, term, cs, mask in cases: - ex_table = df[mask.fillna(False)] + mask[mask.isna()] = False + ex_table = df[mask.astype(bool)] dxf.check_filter_case( df, [