Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refine summary stats backend RPCs for data explorer, add Python implementation and basic scaffolding #2617

Merged
merged 5 commits into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
ColumnProfileRequestType,
ColumnProfileResult,
ColumnSchema,
ColumnSchemaTypeDisplay,
ColumnDisplayType,
ColumnSortKey,
DataExplorerBackendMessageContent,
DataExplorerFrontendEvent,
Expand All @@ -48,6 +48,9 @@
SearchSchemaResult,
SetRowFiltersRequest,
SetSortColumnsRequest,
SummaryStatsBoolean,
SummaryStatsNumber,
SummaryStatsString,
TableData,
TableSchema,
TableShape,
Expand Down Expand Up @@ -294,6 +297,14 @@ def __init__(
# performance.
self._search_schema_last_result: Optional[Tuple[str, List[ColumnSchema]]] = None

# Putting this here rather than in the class body because
# Python < 3.10 has fussier rules about staticmethods
self._SUMMARIZERS = {
ColumnDisplayType.Boolean: self._summarize_boolean,
ColumnDisplayType.Number: self._summarize_number,
ColumnDisplayType.String: self._summarize_string,
}

def invalidate_computations(self):
self.filtered_indices = self.view_indices = None
self._need_recompute = True
Expand Down Expand Up @@ -337,10 +348,7 @@ def _get_schema(self, column_start: int, num_columns: int) -> TableSchema:
column_start,
min(column_start + num_columns, len(self.table.columns)),
):
column_raw_name = self.table.columns[column_index]
column_name = str(column_raw_name)

col_schema = self._get_single_column_schema(column_index, column_name)
col_schema = self._get_single_column_schema(column_index)
column_schemas.append(col_schema)

return TableSchema(columns=column_schemas)
Expand Down Expand Up @@ -376,7 +384,7 @@ def _search_schema_get_matches(self, search_term: str) -> List[ColumnSchema]:
if search_term not in column_name.lower():
continue

col_schema = self._get_single_column_schema(column_index, column_name)
col_schema = self._get_single_column_schema(column_index)
matches.append(col_schema)

return matches
Expand All @@ -388,7 +396,10 @@ def _get_inferred_dtype(self, column_index: int):
self._inferred_dtypes[column_index] = infer_dtype(self.table.iloc[:, column_index])
return self._inferred_dtypes[column_index]

def _get_single_column_schema(self, column_index: int, column_name: str):
def _get_single_column_schema(self, column_index: int):
column_raw_name = self.table.columns[column_index]
column_name = str(column_raw_name)

# TODO: pandas MultiIndex columns
# TODO: time zone for datetimetz datetime64[ns] types
dtype = self.dtypes.iloc[column_index]
Expand All @@ -405,7 +416,7 @@ def _get_single_column_schema(self, column_index: int, column_name: str):
column_name=column_name,
column_index=column_index,
type_name=type_name,
type_display=ColumnSchemaTypeDisplay(type_display),
type_display=ColumnDisplayType(type_display),
)

def _get_data_values(
Expand All @@ -431,7 +442,8 @@ def _get_data_values(

if self.view_indices is not None:
# If the table is either filtered or sorted, use a slice of
# the view_indices to select the virtual range of values for the grid
# the view_indices to select the virtual range of values
# for the grid
view_slice = self.view_indices[row_start : row_start + num_rows]
columns = [col.take(view_slice) for col in columns]
indices = self.table.index.take(view_slice)
Expand Down Expand Up @@ -544,9 +556,12 @@ def _eval_filter(self, filt: RowFilter):
elif params.type == SearchFilterParamsType.EndsWith:
mask = col.str.endswith(term)

assert mask is not None

# Nulls are possible in the mask, so we just fill them if any
if mask.dtype != bool:
mask = mask.fillna(False)
mask[mask.isna()] = False
mask = mask.astype(bool)

return mask.to_numpy()

Expand Down Expand Up @@ -602,7 +617,57 @@ def _prof_null_count(self, column_index: int):
return self._get_column(column_index).isnull().sum()

def _prof_summary_stats(self, column_index: int):
raise NotImplementedError
col_schema = self._get_single_column_schema(column_index)
col = self._get_column(column_index)

ui_type = col_schema.type_display
handler = self._SUMMARIZERS.get(ui_type)

if handler is None:
# Return nothing for types we don't yet know how to summarize
return ColumnSummaryStats(type_display=ui_type)
else:
return handler(col)

@staticmethod
def _summarize_number(col: "pd.Series"):
    """
    Build the summary stats payload for a numeric column.

    Each statistic (min, max, mean, median, standard deviation) is computed
    with the corresponding Series method and converted to a string with
    ``str()`` before being stored.
    """
    stats = SummaryStatsNumber(
        min_value=str(col.min()),
        max_value=str(col.max()),
        mean=str(col.mean()),
        median=str(col.median()),
        stdev=str(col.std()),
    )
    return ColumnSummaryStats(
        type_display=ColumnDisplayType.Number,
        number_stats=stats,
    )

@staticmethod
def _summarize_string(col: "pd.Series"):
    """
    Build the summary stats payload for a string column.

    Reports how many values have length zero and how many distinct values
    ``Series.nunique()`` finds.
    """
    is_empty = col.str.len() == 0
    stats = SummaryStatsString(
        num_empty=is_empty.sum(),
        num_unique=col.nunique(),
    )
    return ColumnSummaryStats(
        type_display=ColumnDisplayType.String,
        string_stats=stats,
    )

@staticmethod
def _summarize_boolean(col: "pd.Series"):
    """
    Build the summary stats payload for a boolean column.

    The true count is the column sum; the false count is whatever remains
    of the column length after subtracting the true and null counts.
    """
    n_null = col.isnull().sum()
    n_true = col.sum()
    # Everything that is neither true nor null is counted as false
    n_false = len(col) - n_true - n_null

    stats = SummaryStatsBoolean(true_count=n_true, false_count=n_false)
    return ColumnSummaryStats(
        type_display=ColumnDisplayType.Boolean,
        boolean_stats=stats,
    )

def _prof_freq_table(self, column_index: int):
raise NotImplementedError
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@


@enum.unique
class ColumnSchemaTypeDisplay(str, enum.Enum):
class ColumnDisplayType(str, enum.Enum):
"""
Possible values for TypeDisplay in ColumnSchema
Possible values for ColumnDisplayType
"""

Number = "number"
Expand Down Expand Up @@ -202,7 +202,7 @@ class ColumnSchema(BaseModel):
description="Exact name of data type used by underlying table",
)

type_display: ColumnSchemaTypeDisplay = Field(
type_display: ColumnDisplayType = Field(
description="Canonical Positron display name of data type",
)

Expand Down Expand Up @@ -387,7 +387,33 @@ class ColumnProfileResult(BaseModel):

class ColumnSummaryStats(BaseModel):
    """
    Profile result containing summary stats for a column based on the data
    type
    """

    # Display type of the summarized column
    type_display: ColumnDisplayType = Field(
        description="Canonical Positron display name of data type",
    )

    # The three type-specific payloads below are all optional and default
    # to None
    number_stats: Optional[SummaryStatsNumber] = Field(
        default=None,
        description="Statistics for a numeric data type",
    )

    string_stats: Optional[SummaryStatsString] = Field(
        default=None,
        description="Statistics for a string-like data type",
    )

    boolean_stats: Optional[SummaryStatsBoolean] = Field(
        default=None,
        description="Statistics for a boolean data type",
    )


class SummaryStatsNumber(BaseModel):
"""
SummaryStatsNumber in Schemas
"""

min_value: str = Field(
Expand All @@ -398,24 +424,44 @@ class ColumnSummaryStats(BaseModel):
description="Maximum value as string",
)

mean_value: Optional[str] = Field(
default=None,
mean: str = Field(
description="Average value as string",
)

median: Optional[str] = Field(
default=None,
median: str = Field(
description="Sample median (50% value) value as string",
)

q25: Optional[str] = Field(
default=None,
description="25th percentile value as string",
stdev: str = Field(
description="Sample standard deviation as a string",
)

q75: Optional[str] = Field(
default=None,
description="75th percentile value as string",

class SummaryStatsBoolean(BaseModel):
    """
    Summary statistics for a boolean column: counts of true and false values
    """

    true_count: int = Field(description="The number of non-null true values")

    false_count: int = Field(description="The number of non-null false values")


class SummaryStatsString(BaseModel):
    """
    Summary statistics for a string column: empty and distinct value counts
    """

    num_empty: int = Field(description="The number of empty / length-zero values")

    num_unique: int = Field(description="The exact number of distinct values")


Expand Down Expand Up @@ -560,7 +606,7 @@ class SearchSchemaParams(BaseModel):
"""

search_term: str = Field(
description="Substring to match for (currently case insensitive",
description="Substring to match for (currently case insensitive)",
)

start_index: int = Field(
Expand Down Expand Up @@ -798,6 +844,12 @@ class SchemaUpdateParams(BaseModel):

ColumnSummaryStats.update_forward_refs()

SummaryStatsNumber.update_forward_refs()

SummaryStatsBoolean.update_forward_refs()

SummaryStatsString.update_forward_refs()

ColumnHistogram.update_forward_refs()

ColumnFrequencyTable.update_forward_refs()
Expand Down
Loading
Loading