Skip to content

Commit

Permalink
Data Explorer: Respect num bins (#4483)
Browse files Browse the repository at this point in the history
Make sure the `num_bins` is respected and corresponds to the maximum
supported number of bins that we can render in the front-end.
  • Loading branch information
dfalbel authored Aug 26, 2024
1 parent ac7fdd2 commit a7d014c
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1749,8 +1749,9 @@ def _prof_histogram(


def _get_histogram_numpy(data, params: ColumnHistogramParams):
assert params.num_bins is not None

if params.method == ColumnHistogramParamsMethod.Fixed:
assert params.num_bins is not None
hist_params = {"bins": params.num_bins}
elif params.method == ColumnHistogramParamsMethod.Sturges:
hist_params = {"bins": "sturges"}
Expand All @@ -1761,27 +1762,30 @@ def _get_histogram_numpy(data, params: ColumnHistogramParams):
else:
raise NotImplementedError

# For integers, we want to make sure the number of bins is smaller
# then than `data.max() - data.min()`, so we don't endup with more bins
# then there's data to display.
if issubclass(data.dtype.type, np_.integer):
width = (data.max() - data.min()).item()
bins = np_.histogram_bin_edges(data, hist_params["bins"])
if len(bins) > width and width > 0:
hist_params = {"bins": width + 1}
else:
# Don't need to recompute the bin edges, so we pass them
hist_params = {"bins": bins.tolist()}

try:
bin_counts, bin_edges = np_.histogram(data, **hist_params)
bin_edges = np_.histogram_bin_edges(data, bins=hist_params["bins"])
except ValueError:
# If there are inf/-inf values in the dataset, ValueError is
# raised. We catch it and try again to avoid paying the
# filtering cost every time
data = data[np_.isfinite(data)]
bin_counts, bin_edges = np_.histogram(data, **hist_params)
bin_edges = np_.histogram_bin_edges(data, bins=hist_params["bins"])

# If the method returns more bins than what the fron-end requested,
# we re-define the bin edges.
if len(bin_edges) > params.num_bins:
bin_edges = np_.histogram_bin_edges(data, bins=params.num_bins)

# For integers, we want to make sure the number of bins is smaller
# then than `data.max() - data.min()`, so we don't endup with more bins
# then there's data to display.
hist_params = {"bins": bin_edges.tolist()}
if issubclass(data.dtype.type, np_.integer):
width = (data.max() - data.min()).item()
if len(bin_edges) > width and width > 0:
hist_params = {"bins": width + 1}

bin_counts, bin_edges = np_.histogram(data, **hist_params)
return bin_counts, bin_edges


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -789,9 +789,8 @@ class ColumnHistogramParams(BaseModel):
description="Method for determining number of bins",
)

num_bins: Optional[StrictInt] = Field(
default=None,
description="Number of bins in the computed histogram",
num_bins: StrictInt = Field(
description="Maximum number of bins in the computed histogram.",
)

quantiles: Optional[List[Union[StrictInt, StrictFloat]]] = Field(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2235,7 +2235,7 @@ def _get_null_count(column_index):
return _profile_request(column_index, [{"profile_type": "null_count"}])


def _get_histogram(column_index, bins=None, method="fixed"):
def _get_histogram(column_index, bins=2000, method="fixed"):
return _profile_request(
column_index,
[
Expand Down Expand Up @@ -2735,6 +2735,13 @@ def test_pandas_polars_profile_histogram(dxf: DataExplorerFixture):
result = dxf.get_column_profiles(name, [profile])
assert result[0]["histogram"] == ex_result

dfl = pd.DataFrame({"x": np.random.exponential(2, 2000)})
dxf.register_table("dfl", dfl)
result = dxf.get_column_profiles(
"dfl", [_get_histogram(0, bins=10, method="freedman_diaconis")]
)
assert len(result[0]["histogram"]["bin_counts"]) == 10


def test_pandas_polars_profile_frequency_table(dxf: DataExplorerFixture):
data = {
Expand Down
5 changes: 3 additions & 2 deletions positron/comms/data_explorer-backend-openrpc.json
Original file line number Diff line number Diff line change
Expand Up @@ -1094,7 +1094,8 @@
"type": "object",
"description": "Parameters for a column histogram profile request",
"required": [
"method"
"method",
"num_bins"
],
"properties": {
"method": {
Expand All @@ -1109,7 +1110,7 @@
},
"num_bins": {
"type": "integer",
"description": "Number of bins in the computed histogram"
"description": "Maximum number of bins in the computed histogram."
},
"quantiles": {
"type": "array",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -668,9 +668,9 @@ export interface ColumnHistogramParams {
method: ColumnHistogramParamsMethod;

/**
* Number of bins in the computed histogram
* Maximum number of bins in the computed histogram.
*/
num_bins?: number;
num_bins: number;

/**
* Sample quantiles (numbers between 0 and 1) to compute along with the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,8 @@ export class TableSummaryCache extends Disposable {
profiles.push({
profile_type: ColumnProfileType.Histogram,
params: {
method: ColumnHistogramParamsMethod.FreedmanDiaconis
method: ColumnHistogramParamsMethod.FreedmanDiaconis,
num_bins: 100
}
});
}
Expand Down Expand Up @@ -389,7 +390,8 @@ export class TableSummaryCache extends Disposable {
columnProfileSpecs.push({
profile_type: ColumnProfileType.Histogram,
params: {
method: ColumnHistogramParamsMethod.FreedmanDiaconis
method: ColumnHistogramParamsMethod.FreedmanDiaconis,
num_bins: 100
}
});
}
Expand Down

0 comments on commit a7d014c

Please sign in to comment.