Skip to content

Commit

Permalink
Feat/improved summary stats (#321)
Browse files Browse the repository at this point in the history
* fixes #319

* styling gallery now based on pandas

* Separate pandas and polars styling galleries

* default tooltips on all string/object columns

* nbstripout
  • Loading branch information
paddymul authored Oct 29, 2024
1 parent bc539e3 commit f1bed29
Show file tree
Hide file tree
Showing 5 changed files with 603 additions and 12 deletions.
36 changes: 28 additions & 8 deletions buckaroo/customizations/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,35 @@ def computed_summary(summary_dict):
distinct_count=len(value_counts)
unique_count = len(value_counts[value_counts==1])

return dict(
unique_count=unique_count,
empty_count=empty_count,
distinct_count=distinct_count,
distinct_per=distinct_count/l,
empty_per=empty_count/l,
unique_per=unique_count/l,
nan_per=summary_dict['nan_count']/l)
try:
empty_count = value_counts.get('', 0)
except:
empty_count = 0
distinct_count=len(value_counts)
unique_count = len(value_counts[value_counts==1])

def vc_nth(pos):
    """Return the label at position ``pos`` of ``value_counts.index``.

    ``value_counts`` is read from the enclosing scope. ``None`` is returned
    when ``pos`` is at or beyond ``len(value_counts)``, so callers can ask
    for the n-th entry without checking how many entries exist.
    """
    # Only index when the position exists; otherwise fall through to None.
    if pos < len(value_counts):
        return value_counts.index[pos]
    return None

return {
'non_null_count':l - summary_dict['nan_count'],
'null_count': summary_dict['nan_count'],
'most_freq':vc_nth(0),
'2nd_freq':vc_nth(1),
'3rd_freq':vc_nth(2),
'4th_freq':vc_nth(3),
'5th_freq':vc_nth(4),
'unique_count':unique_count,
'empty_count':empty_count,
'distinct_count':distinct_count,
'distinct_per':distinct_count/l,
'empty_per':empty_count/l,
'unique_per':unique_count/l,
'nan_per':summary_dict['nan_count']/l
}

class PdCleaningStats(ColAnalysis):
provides_defaults = {'int_parse_fail': 0.0, 'int_parse':0.0}
Expand Down
21 changes: 18 additions & 3 deletions buckaroo/customizations/styling.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def style_column(kls, col:str, column_metadata: Any) -> Any:

digits = 3
t = column_metadata['_type']
base_config = {'col_name':col}
if t == 'integer':
disp = {'displayer': 'float', 'min_fraction_digits':0, 'max_fraction_digits':0}
elif t == 'float':
Expand All @@ -33,20 +34,34 @@ def style_column(kls, col:str, column_metadata: Any) -> Any:
disp = {'displayer': 'datetimeLocaleString','locale': 'en-US', 'args': {}}
elif t == 'string':
disp = {'displayer': 'string', 'max_length': 35}
base_config['tooltip_config'] = {'tooltip_type':'simple', 'val_column': col}
else:
disp = {'displayer': 'obj'}
return {'col_name':col, 'displayer_args': disp }
base_config['tooltip_config'] = {'tooltip_type':'simple', 'val_column': col}
base_config['displayer_args'] = disp
return base_config


class DefaultSummaryStatsStyling(StylingAnalysis):
pinned_rows = [
obj_('dtype'),
float_('min'),
float_('non_null_count', 0),
float_('null_count', 0),
float_('mean'),
float_('std'),
float_('min'),
float_('25th'),
float_('median'),
float_('75th'),
float_('max'),
float_('unique_count', 0),
float_('distinct_count', 0),
float_('empty_count', 0)]
obj_('most_freq'),
obj_('2nd_freq'),
obj_('3rd_freq'),
obj_('4th_freq'),
obj_('5th_freq')
]

df_display_name = "summary"
data_key = "empty"
Expand Down
Loading

0 comments on commit f1bed29

Please sign in to comment.