Skip to content

Commit

Permalink
Refine summary stats backend RPCs for data explorer, add Python imple…
Browse files Browse the repository at this point in the history
…mentation and basic scaffolding (#2617)

* Protocol improvements to support different types of summary stats

* Kludge the UI to get the statistics to show up minimally

* Fix ruff flakes

* Work around python 3.9 fussiness

* Don't fiddle with pandas options, work around warning another way
  • Loading branch information
wesm authored Apr 3, 2024
1 parent 6a0b220 commit 126b5a4
Show file tree
Hide file tree
Showing 13 changed files with 642 additions and 266 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
ColumnProfileRequestType,
ColumnProfileResult,
ColumnSchema,
ColumnSchemaTypeDisplay,
ColumnDisplayType,
ColumnSortKey,
DataExplorerBackendMessageContent,
DataExplorerFrontendEvent,
Expand All @@ -48,6 +48,9 @@
SearchSchemaResult,
SetRowFiltersRequest,
SetSortColumnsRequest,
SummaryStatsBoolean,
SummaryStatsNumber,
SummaryStatsString,
TableData,
TableSchema,
TableShape,
Expand Down Expand Up @@ -294,6 +297,14 @@ def __init__(
# performance.
self._search_schema_last_result: Optional[Tuple[str, List[ColumnSchema]]] = None

# Putting this here rather than in the class body because
# Python < 3.10 has fussier rules about staticmethods
self._SUMMARIZERS = {
ColumnDisplayType.Boolean: self._summarize_boolean,
ColumnDisplayType.Number: self._summarize_number,
ColumnDisplayType.String: self._summarize_string,
}

def invalidate_computations(self):
self.filtered_indices = self.view_indices = None
self._need_recompute = True
Expand Down Expand Up @@ -337,10 +348,7 @@ def _get_schema(self, column_start: int, num_columns: int) -> TableSchema:
column_start,
min(column_start + num_columns, len(self.table.columns)),
):
column_raw_name = self.table.columns[column_index]
column_name = str(column_raw_name)

col_schema = self._get_single_column_schema(column_index, column_name)
col_schema = self._get_single_column_schema(column_index)
column_schemas.append(col_schema)

return TableSchema(columns=column_schemas)
Expand Down Expand Up @@ -376,7 +384,7 @@ def _search_schema_get_matches(self, search_term: str) -> List[ColumnSchema]:
if search_term not in column_name.lower():
continue

col_schema = self._get_single_column_schema(column_index, column_name)
col_schema = self._get_single_column_schema(column_index)
matches.append(col_schema)

return matches
Expand All @@ -388,7 +396,10 @@ def _get_inferred_dtype(self, column_index: int):
self._inferred_dtypes[column_index] = infer_dtype(self.table.iloc[:, column_index])
return self._inferred_dtypes[column_index]

def _get_single_column_schema(self, column_index: int, column_name: str):
def _get_single_column_schema(self, column_index: int):
column_raw_name = self.table.columns[column_index]
column_name = str(column_raw_name)

# TODO: pandas MultiIndex columns
# TODO: time zone for datetimetz datetime64[ns] types
dtype = self.dtypes.iloc[column_index]
Expand All @@ -405,7 +416,7 @@ def _get_single_column_schema(self, column_index: int, column_name: str):
column_name=column_name,
column_index=column_index,
type_name=type_name,
type_display=ColumnSchemaTypeDisplay(type_display),
type_display=ColumnDisplayType(type_display),
)

def _get_data_values(
Expand All @@ -431,7 +442,8 @@ def _get_data_values(

if self.view_indices is not None:
# If the table is either filtered or sorted, use a slice of
# the view_indices to select the virtual range of values for the grid
# the view_indices to select the virtual range of values
# for the grid
view_slice = self.view_indices[row_start : row_start + num_rows]
columns = [col.take(view_slice) for col in columns]
indices = self.table.index.take(view_slice)
Expand Down Expand Up @@ -544,9 +556,12 @@ def _eval_filter(self, filt: RowFilter):
elif params.type == SearchFilterParamsType.EndsWith:
mask = col.str.endswith(term)

assert mask is not None

# Nulls are possible in the mask, so we just fill them if any
if mask.dtype != bool:
mask = mask.fillna(False)
mask[mask.isna()] = False
mask = mask.astype(bool)

return mask.to_numpy()

Expand Down Expand Up @@ -602,7 +617,57 @@ def _prof_null_count(self, column_index: int):
return self._get_column(column_index).isnull().sum()

def _prof_summary_stats(self, column_index: int):
    """
    Compute summary statistics for the column at ``column_index``.

    The column's display type (taken from its schema) selects a
    summarizer from ``self._SUMMARIZERS``. When no summarizer is
    registered for that display type, the result is a
    ``ColumnSummaryStats`` carrying only ``type_display``.
    """
    col_schema = self._get_single_column_schema(column_index)
    col = self._get_column(column_index)

    ui_type = col_schema.type_display
    handler = self._SUMMARIZERS.get(ui_type)

    if handler is None:
        # Return nothing for types we don't yet know how to summarize
        return ColumnSummaryStats(type_display=ui_type)

    return handler(col)

@staticmethod
def _summarize_number(col: "pd.Series"):
    """
    Summarize a numeric column.

    Runs the ``min``, ``max``, ``mean``, ``median`` and ``std``
    reductions on ``col`` and reports each result as a string inside a
    ``SummaryStatsNumber``.
    """
    # Field name -> reduction; evaluated in this order.
    reductions = {
        "min_value": col.min,
        "max_value": col.max,
        "mean": col.mean,
        "median": col.median,
        "stdev": col.std,
    }
    formatted = {field: str(compute()) for field, compute in reductions.items()}

    return ColumnSummaryStats(
        type_display=ColumnDisplayType.Number,
        number_stats=SummaryStatsNumber(**formatted),
    )

@staticmethod
def _summarize_string(col: "pd.Series"):
    """
    Summarize a string column.

    Counts the values whose string length is zero and the number of
    distinct values (``nunique``), and returns both in a
    ``SummaryStatsString``.
    """
    # pandas reductions can hand back NumPy integer scalars; convert to
    # built-in ints so the stats fields hold plain Python values.
    num_empty = int((col.str.len() == 0).sum())
    num_unique = int(col.nunique())

    return ColumnSummaryStats(
        type_display=ColumnDisplayType.String,
        string_stats=SummaryStatsString(num_empty=num_empty, num_unique=num_unique),
    )

@staticmethod
def _summarize_boolean(col: "pd.Series"):
    """
    Summarize a boolean column.

    Counts the true values and the false values, excluding nulls from
    both, and returns them in a ``SummaryStatsBoolean``.
    """
    # Normalise the pandas results to built-in ints: the reductions can
    # return NumPy scalars (or a bare bool for some object columns), and
    # the counts are used in arithmetic and stored in int fields below.
    null_count = int(col.isnull().sum())
    # sum() adds up the True values; this relies on pandas skipping nulls
    # by default.
    true_count = int(col.sum())
    false_count = len(col) - true_count - null_count

    return ColumnSummaryStats(
        type_display=ColumnDisplayType.Boolean,
        boolean_stats=SummaryStatsBoolean(true_count=true_count, false_count=false_count),
    )

def _prof_freq_table(self, column_index: int):
raise NotImplementedError
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@


@enum.unique
class ColumnSchemaTypeDisplay(str, enum.Enum):
class ColumnDisplayType(str, enum.Enum):
"""
Possible values for TypeDisplay in ColumnSchema
Possible values for ColumnDisplayType
"""

Number = "number"
Expand Down Expand Up @@ -202,7 +202,7 @@ class ColumnSchema(BaseModel):
description="Exact name of data type used by underlying table",
)

type_display: ColumnSchemaTypeDisplay = Field(
type_display: ColumnDisplayType = Field(
description="Canonical Positron display name of data type",
)

Expand Down Expand Up @@ -387,7 +387,33 @@ class ColumnProfileResult(BaseModel):

class ColumnSummaryStats(BaseModel):
    """
    Profile result containing summary stats for a column based on the data
    type
    """

    # Display type of the column the stats describe (required).
    type_display: ColumnDisplayType = Field(
        description="Canonical Positron display name of data type",
    )

    # Each of the stats fields below is optional and defaults to None.
    number_stats: Optional[SummaryStatsNumber] = Field(
        default=None,
        description="Statistics for a numeric data type",
    )

    string_stats: Optional[SummaryStatsString] = Field(
        default=None,
        description="Statistics for a string-like data type",
    )

    boolean_stats: Optional[SummaryStatsBoolean] = Field(
        default=None,
        description="Statistics for a boolean data type",
    )


class SummaryStatsNumber(BaseModel):
"""
SummaryStatsNumber in Schemas
"""

min_value: str = Field(
Expand All @@ -398,24 +424,44 @@ class ColumnSummaryStats(BaseModel):
description="Maximum value as string",
)

mean_value: Optional[str] = Field(
default=None,
mean: str = Field(
description="Average value as string",
)

median: Optional[str] = Field(
default=None,
median: str = Field(
description="Sample median (50% value) value as string",
)

q25: Optional[str] = Field(
default=None,
description="25th percentile value as string",
stdev: str = Field(
description="Sample standard deviation as a string",
)

q75: Optional[str] = Field(
default=None,
description="75th percentile value as string",

class SummaryStatsBoolean(BaseModel):
    """
    Summary statistics for a boolean column: the counts of non-null true
    and non-null false values.
    """

    # Both counts are required integers.
    true_count: int = Field(
        description="The number of non-null true values",
    )

    false_count: int = Field(
        description="The number of non-null false values",
    )


class SummaryStatsString(BaseModel):
    """
    Summary statistics for a string column: the count of empty
    (length-zero) values and the exact count of distinct values.
    """

    # Both counts are required integers.
    num_empty: int = Field(
        description="The number of empty / length-zero values",
    )

    num_unique: int = Field(
        description="The exact number of distinct values",
    )


Expand Down Expand Up @@ -560,7 +606,7 @@ class SearchSchemaParams(BaseModel):
"""

search_term: str = Field(
description="Substring to match for (currently case insensitive",
description="Substring to match for (currently case insensitive)",
)

start_index: int = Field(
Expand Down Expand Up @@ -798,6 +844,12 @@ class SchemaUpdateParams(BaseModel):

ColumnSummaryStats.update_forward_refs()

SummaryStatsNumber.update_forward_refs()

SummaryStatsBoolean.update_forward_refs()

SummaryStatsString.update_forward_refs()

ColumnHistogram.update_forward_refs()

ColumnFrequencyTable.update_forward_refs()
Expand Down
Loading

0 comments on commit 126b5a4

Please sign in to comment.