diff --git a/buckaroo/all_transforms.py b/buckaroo/all_transforms.py index eff7a4dc..9b4613c5 100644 --- a/buckaroo/all_transforms.py +++ b/buckaroo/all_transforms.py @@ -1,8 +1,10 @@ -from .lispy import s -from .configure_utils import configure_buckaroo import pandas as pd import numpy as np +from .lispy import s +from .configure_utils import configure_buckaroo +from .cleaning_commands import (to_bool, to_datetime, to_int, to_float, to_string) + class Command(object): pass @@ -122,7 +124,7 @@ def transform_to_py(df, col, col_spec): -class to_datetime(Command): +class ato_datetime(Command): #argument_names = ["df", "col"] command_default = [s('to_datetime'), s('df'), "col"] command_pattern = [None] @@ -154,6 +156,7 @@ def transform_to_py(df, col): " df.drop('%s', axis=1, inplace=True)" % col, " df.index = old_col.values"]) -DefaultCommandKlsList = [DropCol, to_datetime, SafeInt, FillNA, reindex, OneHot, GroupBy] +DefaultCommandKlsList = [DropCol, SafeInt, FillNA, reindex, OneHot, GroupBy, + to_bool, to_datetime, to_int, to_float, to_string] command_defaults, command_patterns, buckaroo_transform, buckaroo_to_py_core = configure_buckaroo(DefaultCommandKlsList) diff --git a/buckaroo/analysis.py b/buckaroo/analysis.py index 260f6e5b..49350284 100644 --- a/buckaroo/analysis.py +++ b/buckaroo/analysis.py @@ -91,18 +91,107 @@ def summary(sampled_ser, summary_ser, ser): def int_digits(n): + if np.isnan(n): + return 1 if n == 0: return 1 if np.sign(n) == -1: return int(np.floor(np.log10(np.abs(n)))) + 2 return int(np.floor(np.log10(n)+1)) -def histogram(ser): - raw_counts, bins = np.histogram(ser, 10) - scaled_counts = np.round(raw_counts/raw_counts.sum(),2) - return [scaled_counts, bins] +def numeric_histogram_labels(endpoints): + left = endpoints[0] + labels = [] + for edge in endpoints[1:]: + labels.append("{:.0f}-{:.0f}".format(left, edge)) + left = edge + return labels +#histogram_labels(endpoints) + +def numeric_histogram(arr, nan_per): + ret_histo = [] + 
nan_observation = {'name':'NA', 'NA':np.round(nan_per*100, 0)} + if nan_per == 1.0: + return [nan_observation] + + vals = arr.dropna() + low_tail, high_tail = np.quantile(vals, 0.01), np.quantile(vals, 0.99) + low_pass = arr>low_tail + high_pass = arr < high_tail + meat = vals[low_pass & high_pass] + populations, endpoints =np.histogram(meat, 10) + + labels = numeric_histogram_labels(endpoints) + normalized_pop = populations / populations.sum() + low_label = "%r - %r" % (vals.min(), low_tail) + high_label = "%r - %r" % (high_tail, vals.max()) + ret_histo.append({'name': low_label, 'tail':1}) + for label, pop in zip(labels, normalized_pop): + ret_histo.append({'name': label, 'population':np.round(pop * 100, 0)}) + high_label = "%r - %r" % (high_tail, vals.max()) + ret_histo.append({'name': high_label, 'tail':1}) + if nan_per > 0.0: + ret_histo.append(nan_observation) + return ret_histo + + +def histo_format(v, l): + scaled = v/l + + +def categorical_dict(ser, val_counts, top_n_positions=7): + l = len(ser) + top = min(len(val_counts), top_n_positions) + + + top_vals = val_counts.iloc[:top] + #top_percentage = top_vals.sum() / l + #if len(val_counts) > 5 and top_percentage < .05: + + rest_vals = val_counts.iloc[top:] + histogram = top_vals.to_dict() + + + full_long_tail = rest_vals.sum() + unique_count = sum(val_counts == 1) + long_tail = full_long_tail - unique_count + if unique_count > 0: + histogram['unique'] = np.round( (unique_count/l)* 100, 0) + if long_tail > 0: + histogram['longtail'] = np.round((long_tail/l) * 100,0) + return histogram + +def categorical_histogram(ser, val_counts, nan_per, top_n_positions=7): + nan_observation = {'name':'NA', 'NA':np.round(nan_per*100, 0)} + cd = categorical_dict(ser, val_counts, top_n_positions) + + l = len(ser) + histogram = [] + longtail_obs = {'name': 'longtail'} + for k,v in cd.items(): + if k in ["longtail", "unique"]: + longtail_obs[k] = v + continue + histogram.append({'name':k, 'cat_pop': np.round((v/l)*100,0) }) + 
if len(longtail_obs) > 1: + histogram.append(longtail_obs) + if nan_per > 0.0: + histogram.append(nan_observation) + return histogram + + +def histogram(ser, nan_per): + is_numeric = pd.api.types.is_numeric_dtype(ser.dtype) + val_counts = ser.value_counts() + if is_numeric and len(val_counts)>5: + temp_histo = numeric_histogram(ser, nan_per) + if len(temp_histo) > 5: + #if we had basically a categorical variable encoded into an integer.. don't return it + return temp_histo + return categorical_histogram(ser, val_counts, nan_per) + class ColDisplayHints(ColAnalysis): requires_summary = ['min', 'max'] # What summary stats does this analysis provide provided_summary = [] @@ -111,15 +200,16 @@ class ColDisplayHints(ColAnalysis): 'is_numeric', 'is_integer', 'min_digits', 'max_digits', 'histogram'] @staticmethod - def col_hints(sampled_ser, summary_ser, ser): - is_numeric = pd.api.types.is_numeric_dtype(ser.dtype) - if not is_numeric: - return dict(is_numeric=False) - if len(ser) == 0: - return dict(is_numeric=False) + def table_hints(sampled_ser, summary_ser, table_hint_col_dict): + is_numeric = pd.api.types.is_numeric_dtype(sampled_ser.dtype) + # if not is_numeric: + # return dict(is_numeric=False) + # if len(sampled_ser) == 0: + # return dict(is_numeric=False) return dict( - is_numeric=True, - is_integer=pd.api.types.is_integer_dtype(ser), - min_digits=int_digits(summary_ser.loc['min']), - max_digits=int_digits(summary_ser.loc['max']), - histogram=histogram(ser)) + is_numeric=is_numeric, + is_integer=pd.api.types.is_integer_dtype(sampled_ser), + min_digits=(is_numeric and int_digits(summary_ser.loc['min'])) or 0, + max_digits=(is_numeric and int_digits(summary_ser.loc['max'])) or 0, + histogram=histogram(sampled_ser, summary_ser['nan_per'])) + diff --git a/buckaroo/analysis_management.py b/buckaroo/analysis_management.py index b5eff8ed..e707eb63 100644 --- a/buckaroo/analysis_management.py +++ b/buckaroo/analysis_management.py @@ -1,6 +1,6 @@ import numpy as np import 
pandas as pd - +import traceback from buckaroo.pluggable_analysis_framework import ( ColAnalysis, order_analysis, check_solvable, NotProvidedException) @@ -28,11 +28,13 @@ def produce_summary_df(df, ordered_objs, df_name='test_df'): summary_res = a_kls.summary(ser, summary_ser, ser) for k,v in summary_res.items(): summary_ser.loc[k] = v - for k,v in a_kls.table_hints(sampled_ser, summary_ser, table_hint_dict): + th_dict = a_kls.table_hints(sampled_ser, summary_ser, table_hint_dict) + for k,v in th_dict.items(): table_hint_dict[k] = v except Exception as e: print("summary_ser", summary_ser) errs[ser_name] = e, a_kls + traceback.print_exc() continue summary_col_dict[ser_name] = summary_ser table_hint_col_dict[ser_name] = table_hint_dict diff --git a/buckaroo/auto_clean.py b/buckaroo/auto_clean.py new file mode 100644 index 00000000..7cfd7f68 --- /dev/null +++ b/buckaroo/auto_clean.py @@ -0,0 +1,208 @@ +import sys +import math +import warnings + +from datetime import timedelta +from collections import defaultdict + +import pandas as pd +import numpy as np + + +#adapted from https://docs.python.org/3/library/warnings.html + +# this is needed to see if a series that we think should be datetime +# would throw a warning indicating a slow coercion this saves a huge +# (100x) slowdown + +def check_for_datetime_warnings(ser): + "return 1 if to_datetime throws errors for the series" + with warnings.catch_warnings(record=True) as w: + pd.to_datetime(ser, errors='coerce') + if len(w) == 0: + return 0 + if "Could not infer format" in str(w[-1].message): + return 1 + else: + #not sure how we'd get here + return 1 + +default_type_dict = { + 'datetime':0, 'datetime_error':0, + 'int':0, 'int_error':0, + 'float':0, 'float_error':0, + 'bool':0, 'bool_error':0} + + +#pandas parses ints and floats as datetimes in ns, you end up with a +# lot of values around 1970 Unix epoch we use 1971 as a cutoff, a bit +# of a hack, but pragmatic +SEVENTY_ONE_CUTOFF = pd.to_datetime('1971-01-01') + 
+def get_object_typing_metadata(ser): + # this function should just return percentages, to separate + # introspection from action. This way we can pass in a different + # decision weighting. As is this function is complex enough + counts = defaultdict(lambda: 0) + counts.update(default_type_dict) # we always want these keys present + assert pd.api.types.is_object_dtype(ser.dtype) + #this is slow because it goes through python as opposed to vectorized C code + for v in ser.values: + try: + dt = pd.to_datetime(v) + if dt > SEVENTY_ONE_CUTOFF: + counts['datetime'] += 1 + else: + counts['datetime_error'] += 1 + except (pd.core.tools.datetimes.DateParseError, ValueError, TypeError): + counts['datetime_error'] += 1 + try: + int(v) + counts['int'] += 1 + except ValueError: + counts['int_error'] += 1 + try: + float(v) + counts['float'] += 1 + except ValueError: + counts['float_error'] += 1 + + if isinstance(v, bool): + counts['bool'] += 1 + else: + counts['bool_error'] += 1 + + if len(ser) == 0: + return counts + ret_dict = dict([[k, v/len(ser)] for k,v in counts.items()]) + + #this is an ugly hack, but it really speeds things up if there are + #abberant kind of datetime columns + if ret_dict['datetime_error'] < .5: + if check_for_datetime_warnings(ser): + ret_dict['datetime_error'] = 1.0 + if ret_dict['int_error'] < .5: + float_remainder = (pd.to_numeric(ser, errors='coerce').abs() % 1).sum() + if float_remainder > 0.0001: + ret_dict['int_error'] = 1 + return ret_dict + +def get_typing_metadata(ser): + td = type_dict = default_type_dict.copy() + dt = ser.dtype + if not pd.api.types.is_object_dtype(dt): + td['exact_type'] = str(dt) + if pd.api.types.is_datetime64_any_dtype(dt): + #general_type is used as a pass through + td['general_type'] = 'datetime' + elif pd.api.types.is_bool_dtype(dt): + td['general_type'] = 'bool' + elif pd.api.types.is_categorical_dtype(dt): + pass #not sure how to handle this yet, it will end up being handled as an object/string + elif 
pd.api.types.is_float_dtype(dt): + #could still be a float that includes only ints + td['general_type'] = 'float' + td['float'], td['float_error'] = 1, 0 + elif pd.api.types.is_integer_dtype(dt): + td['general_type'] = 'int' + td['int'], td['int_error'] = 1, 0 + return td + else: + return get_object_typing_metadata(ser.dropna()) + +def recommend_type(type_dict): + if type_dict.get('general_type', None) is not None: + return type_dict['general_type'] + if type_dict['datetime_error'] < 0.5: + return 'datetime' + if type_dict['bool_error'] < 0.5: + #bool ends up being stricter than int or float + return 'bool' + if type_dict['float_error'] < 0.5 or type_dict['int_error'] < 0.5: + #numeric type, figure out if float or int float will parse for + # everything that also parses as int + if math.isclose(type_dict['float'], type_dict['int']) or type_dict['int'] > type_dict['float']: + return 'int' + else: + return 'float' + return 'string' + +def smart_to_int(ser): + + if pd.api.types.is_numeric_dtype(ser): + working_ser = ser + lower, upper = ser.min(), ser.max() + else: + working_ser = pd.to_numeric(ser, errors='coerce') + lower, upper = working_ser.min(), working_ser.max() + + + if lower < 0: + if upper < np.iinfo(np.int8).max: + new_type = 'Int8' + elif upper < np.iinfo(np.int16).max: + new_type = 'Int16' + elif upper < np.iinfo(np.int32).max: + new_type = 'Int32' + else: + new_type = 'Int64' + else: + if upper < np.iinfo(np.uint8).max: + new_type = 'UInt8' + elif upper < np.iinfo(np.uint16).max: + new_type = 'UInt16' + elif upper < np.iinfo(np.uint32).max: + new_type = 'UInt32' + else: + new_type = 'UInt64' + base_ser = pd.to_numeric(ser, errors='coerce').dropna() + return base_ser.astype(new_type).reindex(ser.index) + +def coerce_series(ser, new_type): + if new_type == 'bool': + #dropna is key here, otherwise Nan's and errors are treated as true + return pd.to_numeric(ser, errors='coerce').dropna().astype('boolean').reindex(ser.index) + elif new_type == 'datetime': + 
return pd.to_datetime(ser, errors='coerce').reindex(ser.index) + elif new_type == 'int': + # try: + return smart_to_int(ser) + # except: + # #just let pandas figure it out, we recommended the wrong type + # return pd.to_numeric(ser, errors='coerce') + + elif new_type == 'float': + return pd.to_numeric(ser, errors='coerce').dropna().astype('float').reindex(ser.index) + elif new_type == 'string': + return ser.fillna(value="").astype('string').replace("", None).reindex(ser.index) + else: + raise Exception("Unkown type of %s" % new_type) + +def emit_command(col_name, new_type): + # I need a "no-op" new_type that doesn't change a column at all + # also possible meta tags about commands taht will change data, vs just re-typing + return [{"symbol":"to_%s" % new_type , "meta":{"precleaning":True}},{"symbol":"df"}, col_name] + +def auto_type_df(df): + #this is much faster because we only run the slow function on a maximum of 200 rows. + #That's a good size for an estimate + sample_size = min(len(df), 200) + recommended_types = {} + new_data = {} + for c in df.columns: + recommended_types[c] = recommend_type(get_typing_metadata(df[c].sample(sample_size))) + new_data[c] = coerce_series(df[c], recommended_types[c]) + return pd.DataFrame(new_data) + +def get_auto_type_operations(df): + #this is much faster because we only run the slow function on a maximum of 200 rows. 
+ #That's a good size for an estimate + sample_size = min(len(df), 200) + cleaning_commands = [] + for c in df.columns: + new_type = recommend_type(get_typing_metadata(df[c].sample(sample_size))) + cleaning_commands.append(emit_command(c, new_type)) + return cleaning_commands + + + diff --git a/buckaroo/buckaroo_widget.py b/buckaroo/buckaroo_widget.py index 831bd679..8d42d187 100644 --- a/buckaroo/buckaroo_widget.py +++ b/buckaroo/buckaroo_widget.py @@ -8,17 +8,25 @@ TODO: Add module docstring """ import json +import warnings from ipywidgets import DOMWidget from traitlets import Unicode, List, Dict, observe from ._frontend import module_name, module_version from .all_transforms import configure_buckaroo, DefaultCommandKlsList +from .lisp_utils import (lists_match, split_operations) + +from .auto_clean import get_auto_type_operations from .down_sample import sample from .analysis import (TypingStats, DefaultSummaryStats, ColDisplayHints) + + from .analysis_management import DfStats + + from pandas.io.json import dumps as pdumps @@ -51,6 +59,7 @@ def df_to_obj(df, order = None, table_hints=None): obj['schema'] = dict(fields=fields) return obj + FAST_SUMMARY_WHEN_GREATER = 1_000_000 class BuckarooWidget(DOMWidget): """TODO: Add docstring here @@ -64,13 +73,15 @@ class BuckarooWidget(DOMWidget): commandConfig = Dict({}).tag(sync=True) operations = List().tag(sync=True) - + machine_gen_operations = List().tag(sync=True) command_classes = DefaultCommandKlsList origDf = Dict({}).tag(sync=True) summaryDf = Dict({}).tag(sync=True) - operation_results = Dict({}).tag(sync=True) + operation_results = Dict( + {'transformed_df':EMPTY_DF_OBJ, 'generated_py_code':'# instantiation, unused'} + ).tag(sync=True) dfConfig = Dict( { @@ -85,104 +96,119 @@ class BuckarooWidget(DOMWidget): 'showCommands': True, }).tag(sync=True) - - def __init__(self, df, - sampled=True, - summaryStats=False, - reorderdColumns=False, - showTransformed=True, - showCommands=True, - 
really_reorder_columns=False): - super().__init__() + def should_sample(self, df, sampled, reorderdColumns): rows = len(df) cols = len(df.columns) item_count = rows * cols - fast_mode = sampled or reorderdColumns if item_count > FAST_SUMMARY_WHEN_GREATER: fast_mode = True - elif really_reorder_columns: #an override - fast_mode = True if fast_mode: - self.dfConfig['sampled'] = True - - self.stats = DfStats(df, [TypingStats, DefaultSummaryStats, ColDisplayHints]) - self.summaryDf = df_to_obj(self.stats.presentation_sdf, self.stats.col_order) + return True + return False + def get_df_config(self, df, sampled, reorderdColumns, showCommands): tempDfc = self.dfConfig.copy() tempDfc.update(dict( totalRows=len(df), columns=len(df.columns), - showTransformed=showTransformed, + #removing showCommands for now, mirroring showTransformed showCommands=showCommands)) + tempDfc['sampled'] = self.should_sample(df, sampled, reorderdColumns) + return tempDfc + + def __init__(self, df, + sampled=True, + summaryStats=False, + reorderdColumns=False, + showCommands=True): + super().__init__() + warnings.filterwarnings('ignore') + #moving setup_from_command_kls_list early in the init because + #it's relatively benign and not tied to other linked updates - self.df = df - self.dfConfig = tempDfc - #just called to trigger setting origDf properly - self.update_based_on_df_config(3) - self.operation_results = { - 'transformed_df':self.origDf, - 'generated_py_code':'#from py widget init'} self.setup_from_command_kls_list() + self.dfConfig = self.get_df_config(df, sampled, reorderdColumns, showCommands) + #we need dfConfig setup first before we get the proper + #working_df and generate the typed_df + self.raw_df = df - def add_analysis(self, analysis_obj): - self.stats.add_analysis(analysis_obj) - self.summaryDf = df_to_obj(self.stats.presentation_sdf, self.stats.col_order) - #just trigger redisplay - self.update_based_on_df_config(3) + # this will trigger the setting of self.typed_df + 
self.operations = get_auto_type_operations(df) + warnings.filterwarnings('default') @observe('dfConfig') def update_based_on_df_config(self, change): - tdf = self.df_from_dfConfig() - if self.dfConfig['reorderdColumns']: - #ideally this won't require a reserialization. All - #possible col_orders shoudl be serialized once, and the - #frontend should just toggle from them - #self.origDf = df_to_obj(tdf, self.stats.col_order, table_hints=self.stats.table_hints) - self.origDf = df_to_obj(tdf, self.stats.col_order) #, table_hints=self.stats.table_hints) - else: - self.origDf = df_to_obj(tdf) #, table_hints=self.stats.table_hints) - - def df_from_dfConfig(self): - if self.dfConfig['sampled']: - return sample(self.df, self.dfConfig['sampleSize']) - else: - return self.df + if hasattr(self, 'typed_df'): + self.origDf = df_to_obj(self.typed_df, self.typed_df.columns, table_hints=self.stats.table_hints) + #otherwise this is a call before typed_df has been completely setup @observe('operations') - def interpret_operations(self, change): - print("interpret_operations") + def handle_operations(self, change): + if lists_match(change['old'], change['new']): + return + new_ops = change['new'] + split_ops = split_operations(new_ops) + self.machine_gen_operations = split_ops[0] + + user_gen_ops = split_ops[1] + + #if either the user_gen part or the machine_gen part changes, + #we still have to recompute the generated code and + #resulting_df because the input df will be different + results = {} - results['generated_py_code'] = 'before interpreter' try: - operations = [{'symbol': 'begin'}] - operations.extend(change['new']) - #print("interpret_operations", operations) - - if len(operations) == 1: - results['transformed_df'] = self.origDf - results['generated_py_code'] = 'no operations' - #print('exiting early') - return - #generating python code seems slightly less error prone than the transform - results['generated_py_code'] = self.buckaroo_to_py_core(operations[1:]) - #note doesn't 
use df_to_obj - transformed_df = self.buckaroo_transform(operations, self.df) + transformed_df = self.interpret_ops(user_gen_ops, self.typed_df) + #note we call gneerate_py_code based on the full + #self.operations, this makes sure that machine_gen + #cleaning code shows up too + results['generated_py_code'] = self.generate_code(new_ops) results['transformed_df'] = json.loads(transformed_df.to_json(orient='table', indent=2)) results['transform_error'] = False - except Exception as e: results['transformed_df'] = EMPTY_DF_OBJ print(e) results['transform_error'] = str(e) self.operation_results = results + @observe('machine_gen_operations') + def interpret_machine_gen_ops(self, change, force=False): + if (not force) and lists_match(change['old'], change['new']): + return # nothing changed, do no computations + new_ops = change['new'] + + #this won't listen to sampled changes proeprly + if self.dfConfig['sampled']: + working_df = sample(self.raw_df, self.dfConfig['sampleSize']) + else: + working_df = self.raw_df + self.typed_df = self.interpret_ops(new_ops, working_df) + + # stats need to be rerun each time + self.stats = DfStats(self.typed_df, [TypingStats, DefaultSummaryStats, ColDisplayHints]) + self.summaryDf = df_to_obj(self.stats.presentation_sdf, self.stats.col_order) + self.update_based_on_df_config(3) + + def generate_code(self, operations): + if len(operations) == 0: + return 'no operations' + return self.buckaroo_to_py_core(operations) + + def interpret_ops(self, new_ops, df): + operations = [{'symbol': 'begin'}] + operations.extend(new_ops) + if len(operations) == 1: + return df + return self.buckaroo_transform(operations , df) + def setup_from_command_kls_list(self): + #used to initially setup the interpreter, and when a command + #is added interactively command_defaults, command_patterns, self.buckaroo_transform, self.buckaroo_to_py_core = configure_buckaroo( self.command_classes) - self.commandConfig = dict( - argspecs=command_patterns, 
defaultArgs=command_defaults) + self.commandConfig = dict(argspecs=command_patterns, defaultArgs=command_defaults) def add_command(self, incomingCommandKls): @@ -191,5 +217,21 @@ def add_command(self, incomingCommandKls): self.command_classes = without_incoming self.setup_from_command_kls_list() + def add_analysis(self, analysis_obj): + self.stats.add_analysis(analysis_obj) + self.summaryDf = df_to_obj(self.stats.presentation_sdf, self.stats.col_order) + #just trigger redisplay + self.update_based_on_df_config(3) + +class Unused(): + def update_based_on_df_config(self, change): + if self.dfConfig['reorderdColumns']: + #ideally this won't require a reserialization. All + #possible col_orders shoudl be serialized once, and the + #frontend should just toggle from them + #self.origDf = df_to_obj(tdf, self.stats.col_order, table_hints=self.stats.table_hints) + self.origDf = df_to_obj(self.typed_df, self.stats.col_order, table_hints=self.stats.table_hints) + else: + self.origDf = df_to_obj(tdf, tdf.columns, table_hints=self.stats.table_hints) diff --git a/buckaroo/cleaning_commands.py b/buckaroo/cleaning_commands.py new file mode 100644 index 00000000..5a1f6853 --- /dev/null +++ b/buckaroo/cleaning_commands.py @@ -0,0 +1,102 @@ +from .lispy import s +from .configure_utils import configure_buckaroo +from .auto_clean import smart_to_int, get_auto_type_operations +import pandas as pd +import numpy as np + +class Command(object): + pass + +class to_bool(Command): + #argument_names = ["df", "col"] + command_default = [s('to_bool'), s('df'), "col"] + command_pattern = [None] + + @staticmethod + def transform(df, col): + ser = df[col] + df[col] = pd.to_numeric(ser, errors='coerce').dropna().astype('boolean').reindex(ser.index) + return df + + @staticmethod + def transform_to_py(df, col): + return " df['%s'] = pd.to_numeric(df['%s'], errors='coerce').dropna().astype('boolean').reindex(df['%s'].index)" % (col, col, col) + +class to_datetime(Command): + #argument_names = ["df", 
"col"] + command_default = [s('to_datetime'), s('df'), "col"] + command_pattern = [None] + + @staticmethod + def transform(df, col): + ser = df[col] + df[col] = pd.to_datetime(ser, errors='coerce').reindex(ser.index) + return df + + @staticmethod + def transform_to_py(df, col): + return " df['%s'] = pd.to_datetime(df['%s'], errors='coerce').reindex(df['%s'].index)" % (col, col, col) + +class to_int(Command): + #argument_names = ["df", "col"] + command_default = [s('to_int'), s('df'), "col"] + command_pattern = [None] + + @staticmethod + def transform(df, col): + ser = df[col] + try: + df[col] = smart_to_int(ser) + except Exception as e: + #just let pandas figure it out, we recommended the wrong type + df[col] = pd.to_numeric(ser, errors='coerce') + + return df + + @staticmethod + def transform_to_py(df, col): + return " df['%s'] = smart_int(df['%s'])" % (col, col) + +class to_float(Command): + #argument_names = ["df", "col"] + command_default = [s('to_float'), s('df'), "col"] + command_pattern = [None] + + @staticmethod + def transform(df, col): + ser = df[col] + df[col] = pd.to_numeric(ser, errors='coerce').dropna().astype('float').reindex(ser.index) + return df + + @staticmethod + def transform_to_py(df, col): + return " df['%s'] = pd.to_numeric(df['%s'], errors='coerce')" % (col, col) + +class to_string(Command): + #argument_names = ["df", "col"] + command_default = [s('to_string'), s('df'), "col"] + command_pattern = [None] + + @staticmethod + def transform(df, col): + ser = df[col] + df[col] = ser.fillna(value="").astype('string').replace("", None).reindex(ser.index) + return df + + @staticmethod + def transform_to_py(df, col): + return " df['%s'] = df['%s'].fillna(value='').astype('string').replace('', None)" % (col, col) + + + +cleaning_classes = [to_bool, to_datetime, to_int, to_float, to_string,] + +def auto_type_df2(df): + _command_defaults, _command_patterns, transform, buckaroo_to_py_core = configure_buckaroo( + cleaning_classes) + + cleaning_operations 
= get_auto_type_operations(df) + + full_ops = [{'symbol': 'begin'}] + full_ops.extend(cleaning_operations) + return transform(full_ops, df) diff --git a/buckaroo/configure_utils.py b/buckaroo/configure_utils.py index 219874db..b47f9498 100644 --- a/buckaroo/configure_utils.py +++ b/buckaroo/configure_utils.py @@ -17,7 +17,6 @@ def configure_buckaroo(transforms): def buckaroo_transform(instructions, df): df_copy = df.copy() ret_val = buckaroo_eval(instructions, {'df':df_copy}) - #print(ret_val) return ret_val convert_to_python, __unused = make_interpreter(to_py_lisp_primitives) @@ -28,8 +27,6 @@ def buckaroo_to_py(instructions): #interpreter as 'begin'... that way the exact same instructions #could be sent to either interpreter. For now, this will do individual_instructions = [x for x in map(lambda x:convert_to_python(x, {'df':5}), instructions)] - #print("individual_instructions", individual_instructions) code_block = '\n'.join(individual_instructions) - return "def clean(df):\n" + code_block + "\n return df" return command_defaults, command_patterns, buckaroo_transform, buckaroo_to_py diff --git a/buckaroo/down_sample.py b/buckaroo/down_sample.py index 2ebe5c88..b168f845 100644 --- a/buckaroo/down_sample.py +++ b/buckaroo/down_sample.py @@ -2,6 +2,14 @@ import numpy as np +def get_outlier_idxs(ser): + if not pd.api.types.is_numeric_dtype(ser.dtype): + return [] + outlier_idxs = [] + outlier_idxs.extend(ser.nlargest(5).index) + outlier_idxs.extend(ser.nsmallest(5).index) + return outlier_idxs + def sample(df, sample_size=500, include_outliers=True): if len(df) <= sample_size: diff --git a/buckaroo/lisp_utils.py b/buckaroo/lisp_utils.py new file mode 100644 index 00000000..08a0b658 --- /dev/null +++ b/buckaroo/lisp_utils.py @@ -0,0 +1,46 @@ +""" +It would be awesome to have cleaning and verification commands that add new columns with related names + +The new columns are null accept for errored values. 
+ +Could use this to show original values that were removed/cleaned. Combined with conditional styling in the UI + +sick. And still ahve high perfromance properly typed columns + + + +""" + +def is_symbol(obj): + return isinstance(obj, dict) and "symbol" in obj + +def is_generated_symbol(obj): + return is_symbol(obj) and obj.get("meta", False) + +def split_operations(full_operations): + """ + utitlity to split a combined set of operations with machine generated commands and user entered commands into two lists, machine_generated and user_generated + + machine_generated commands have function calls with the symbol token also having a meta key with a value of {"precleaning":True} + """ + + machine_generated, user_entered = [], [] + for command in full_operations: + assert isinstance(command, list) + sym_atom = command[0] + if is_symbol(sym_atom): + if is_generated_symbol(sym_atom): + machine_generated.append(command) + else: + user_entered.append(command) + continue + raise Exception("Unexpected token %r" % command) + return machine_generated, user_entered + +def lists_match(l1, l2): + #https://note.nkmk.me/en/python-list-compare/#checking-the-exact-match-of-lists + if len(l1) != len(l2): + return False + return all(x == y and type(x) == type(y) for x, y in zip(l1, l2)) + + diff --git a/buckaroo/lispy.py b/buckaroo/lispy.py index 27d58939..b5e740c5 100644 --- a/buckaroo/lispy.py +++ b/buckaroo/lispy.py @@ -186,7 +186,6 @@ def eval(x, env=global_env): eval(exp, env) x = x[-1] else: # (proc exp*) - print("exp", x) exps = [eval(exp, env) for exp in x] proc = exps.pop(0) if isa(proc, Procedure): @@ -197,6 +196,22 @@ def eval(x, env=global_env): return proc(*exps) + def is_unparsed_atom_a_symbol(obj): + if isinstance(obj, dict): + if obj.get('symbol', False): + if len(obj) == 1: + return True + elif len(obj) == 2 and obj.get('meta', False) is not False: + #our symbols can have a meta key too + return True + return False + + def is_unparsed_atom_a_quote(obj): + if 
isinstance(obj, dict) and len(obj) == 1: + if obj.get('quote', False) is not False: + return True + return False + def list_parse(lst): ret_list = [] if isinstance(lst, list) == False: @@ -207,15 +222,12 @@ def list_parse(lst): while True: if isinstance(x, list): ret_list.append(list_parse(x)) - elif isinstance(x, dict) and len(x) == 1: #hack to make the aprser easier - if x.get('symbol', False): - ret_list.append(Sym(x['symbol'])) - elif x.get('quote', False): - quote_char = x.get('quote') - quote_func = quotes[quote_char] - ret_list.append([quote_func, list_parse(next(lst))]) - else: - ret_list.append(x) + elif is_unparsed_atom_a_symbol(x): + ret_list.append(Sym(x['symbol'])) + elif is_unparsed_atom_a_quote(x): + quote_char = x.get('quote') + quote_func = quotes[quote_char] + ret_list.append([quote_func, list_parse(next(lst))]) elif isinstance(x, dict): print("x was a dict", x) ret_list.append(x) diff --git a/buckaroo/order_columns.py b/buckaroo/order_columns.py index 0b4ac8d6..f5438346 100644 --- a/buckaroo/order_columns.py +++ b/buckaroo/order_columns.py @@ -82,14 +82,6 @@ def reorder_columns(df): col_order = order_columns(tdf_stats, cpd) return df[col_order] -def get_outlier_idxs(ser): - if not pd.api.types.is_numeric_dtype(ser.dtype): - return [] - outlier_idxs = [] - outlier_idxs.extend(ser.nlargest(5).index) - outlier_idxs.extend(ser.nsmallest(5).index) - return outlier_idxs - def add_col_rankings(df, sdf): sdf.loc['one_distinct'] = 0 diff --git a/buckaroo/widget_utils.py b/buckaroo/widget_utils.py index b33493bf..07db0d15 100644 --- a/buckaroo/widget_utils.py +++ b/buckaroo/widget_utils.py @@ -3,7 +3,7 @@ def _display_as_buckaroo(df): from IPython.display import display - return display(BuckarooWidget(df, showCommands=False, showTransformed=False)) + return display(BuckarooWidget(df, showCommands=False)) def enable(): """ diff --git a/docs/source/_static/embed-bundle.js.LICENSE.txt b/docs/source/_static/embed-bundle.js.LICENSE.txt index cfe1226b..6ef8448c 
100644 --- a/docs/source/_static/embed-bundle.js.LICENSE.txt +++ b/docs/source/_static/embed-bundle.js.LICENSE.txt @@ -1,3 +1,151 @@ +/*! + Copyright (c) 2016 Jed Watson. + Licensed under the MIT License (MIT), see + http://jedwatson.github.io/classnames +*/ + +/*! Conditions:: INITIAL */ + +/*! Production:: css_value : ANGLE */ + +/*! Production:: css_value : CHS */ + +/*! Production:: css_value : EMS */ + +/*! Production:: css_value : EXS */ + +/*! Production:: css_value : FREQ */ + +/*! Production:: css_value : LENGTH */ + +/*! Production:: css_value : PERCENTAGE */ + +/*! Production:: css_value : REMS */ + +/*! Production:: css_value : RES */ + +/*! Production:: css_value : SUB css_value */ + +/*! Production:: css_value : TIME */ + +/*! Production:: css_value : VHS */ + +/*! Production:: css_value : VMAXS */ + +/*! Production:: css_value : VMINS */ + +/*! Production:: css_value : VWS */ + +/*! Production:: css_variable : CSS_VAR LPAREN CSS_CPROP COMMA math_expression RPAREN */ + +/*! Production:: css_variable : CSS_VAR LPAREN CSS_CPROP RPAREN */ + +/*! Production:: expression : math_expression EOF */ + +/*! Production:: math_expression : LPAREN math_expression RPAREN */ + +/*! Production:: math_expression : NESTED_CALC LPAREN math_expression RPAREN */ + +/*! Production:: math_expression : SUB PREFIX SUB NESTED_CALC LPAREN math_expression RPAREN */ + +/*! Production:: math_expression : css_value */ + +/*! Production:: math_expression : css_variable */ + +/*! Production:: math_expression : math_expression ADD math_expression */ + +/*! Production:: math_expression : math_expression DIV math_expression */ + +/*! Production:: math_expression : math_expression MUL math_expression */ + +/*! Production:: math_expression : math_expression SUB math_expression */ + +/*! Production:: math_expression : value */ + +/*! Production:: value : NUMBER */ + +/*! Production:: value : SUB NUMBER */ + +/*! Rule:: $ */ + +/*! Rule:: (--[0-9a-z-A-Z-]*) */ + +/*! 
Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)% */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)Hz\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)ch\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)cm\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)deg\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)dpcm\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)dpi\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)dppx\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)em\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)ex\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)grad\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)in\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)kHz\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)mm\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)ms\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)pc\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)pt\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)px\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)rad\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)rem\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)s\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)turn\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)vh\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)vmax\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)vmin\b */ + +/*! Rule:: ([0-9]+(\.[0-9]*)?|\.[0-9]+)vw\b */ + +/*! Rule:: ([a-z]+) */ + +/*! Rule:: (calc) */ + +/*! Rule:: (var) */ + +/*! Rule:: , */ + +/*! Rule:: - */ + +/*! Rule:: \( */ + +/*! Rule:: \) */ + +/*! Rule:: \* */ + +/*! Rule:: \+ */ + +/*! Rule:: \/ */ + +/*! decimal.js-light v2.5.1 https://github.com/MikeMcl/decimal.js-light/LICENCE */ + /** * @ag-grid-community/all-modules - Advanced Data Grid / Data Table supporting Javascript / Typescript / React / Angular / Vue * @version v29.3.5 * @link https://www.ag-grid.com/ @@ -69,3 +217,12 @@ * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
*/ + +/** @license React v16.13.1 + * react-is.production.min.js + * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ diff --git a/docs/source/articles/histograms.rst b/docs/source/articles/histograms.rst new file mode 100644 index 00000000..4cb9988b --- /dev/null +++ b/docs/source/articles/histograms.rst @@ -0,0 +1,56 @@ +.. _using: + +========== +Histograms +========== + +Buckaroo uses histograms to convey the general shape of a column in the minimum amount of screen real estate. Like the rest of buckaroo, this is an opinionated feature. + + +Simple histograms for numeric columns +===================================== + +Histograms traditionally show the distribution of values in a column allowing different distributions to be identified (normal distribution, bimodal, random, skew). This works well for accurate data without outliers. + +There are a couple of types of outliers that mess up normal histograms. + +Traditional histograms make no provision for NaNs. There are two ways we could deal with NaNs: treating them as another bar, or as an independent bar. We chose a separate bar because NaNs are a property of the entire dataset and the histogram is a function of the relevant values. NaNs are displayed in a different color and pattern. + +Sentinel values. Columns frequently have sentinel values mixed in to convey missing data or some special condition. After dropping NaNs, we then look at the value counts; here is one explanation of a sampling + +imagine a dataset inserts 0 for NaNs, without 0's the range of numbers goes from 55 to 235, 0's account for 10% of observations. 0 is obviously a sentinel here and should be plotted as a categorical. If you disagree you can write your own histogram implementation and plug it in with the pluggable analysis framework. + +Extreme values. 
Buckaroo limits the extents of the set where the histogram is computed so that of the 10 bins, no single bin represents more than 50% of samples, limited to the quantile range from (0.1 to 0.9). The reasoning is that the extreme values represent errant values if they are so far off that they mess up the range of the rest of the histogram. I haven't decided how to convey which quantile range was chosen. + + +Categorical Histograms for everything else +========================================== + +Histograms are generally considered for numeric columns. Most datasets have many categorical or non-numeric values; how can we get a quick overview of them? + +Well, we already know how to plot NaNs; there are three other sentinel values that matter: False, True, and "0". + +Remaining categoricals are plotted as a long tail plot, most frequent on the left with decreasing frequency to the right. The top 7 most frequent values are plotted, with a final bar of "long tail" consisting of the sum of all the remaining values. + + +Objections to this approach +=========================== + +This is not a traditional histogram and should not be read as such. It is the best way to show the most insight about frequency of values in a column that we could come up with. + + +Other research +============== + +https://edwinth.github.io/blog/outlier-bin/ + +https://stackoverflow.com/questions/11882393/matplotlib-disregard-outliers-when-plotting +references + + Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and + Handle Outliers", The ASQC Basic References in Quality Control: + Statistical Techniques, Edward F. Mykytka, Ph.D., Editor. 
+ + + + diff --git a/examples/App.tsx b/examples/App.tsx index 4c382f9d..fa253994 100644 --- a/examples/App.tsx +++ b/examples/App.tsx @@ -1,5 +1,5 @@ /* eslint-disable @typescript-eslint/ban-ts-comment */ -//import 'bootstrap/dist/css/bootstrap.min.css'; +import 'bootstrap/dist/css/bootstrap.min.css'; import React from 'react'; import {HashRouter as Router, Route, Link} from 'react-router-dom'; //import {Button} from 'react-bootstrap'; @@ -16,7 +16,8 @@ const examples = { ColumnsEditorEx: {title: 'ColumnsEditor', file: 'ColumnsEditorEx'}, CommandViewerEx: {title: 'CommandViewer', file: 'CommandViewerEx'}, DFViewerEx: {title: 'DFViewer', file: 'DFViewerEx'}, - StatusBarEx: {title: 'StatusBar', file: 'StatusBarEx'} + StatusBarEx: {title: 'StatusBar', file: 'StatusBarEx'}, + HistogramEx: {title: 'Histogram', file: 'HistogramEx'} }; // The examples use a code-loading technique that I have described in diff --git a/examples/app.css b/examples/app.css index 2802ab69..c464b456 100644 --- a/examples/app.css +++ b/examples/app.css @@ -9,9 +9,10 @@ } .left-menu { - width: 10em; - min-width: 10em; - max-width: 10em; + width: 105; + min-width: 15em; + max-width: 15em; + overflow:hidden } .codeblock { @@ -35,6 +36,9 @@ --color-example: #fff; } +.fluid-container { + min-width:950px +} html,body { @@ -65,3 +69,106 @@ html,body outline:4px solid pink; } +.histogram { + color:black; +} + +.customHeaderRenderer { + height:100%; +} +.customHeaderLabel { + padding:0; +} + + +.histogram-ex { + +} + +.histogram-ex .histogram-wrap {border: 1px solid gray; clear:both} +.histogram-ex span {color:black; width:200px; float:left; text-align:left; } +.histogram-ex .histogram-component {width:105px; float:left} +.patterns { + width: 100vw; + height: 100vw; +} + + +.small-bar { + height:30px; width:10px; +} + +.med-bar { + height:100px; width:100px; +} + +/* +from https://www.magicpattern.design/tools/css-backgrounds +*/ +.pt1 { +background-color: #e5e5f7; +opacity: 0.8; +background-image: 
repeating-linear-gradient(45deg, #444cf7 25%, transparent 25%, transparent 75%, #444cf7 75%, #444cf7), repeating-linear-gradient(45deg, #444cf7 25%, #e5e5f7 25%, #e5e5f7 75%, #444cf7 75%, #444cf7); +background-position: 0 0, 2px 2px; +background-size: 4px 4px; + +} +.pt4 { + background-color: #e5e5f7; + opacity: 0.8; + background: repeating-linear-gradient( 45deg, #444cf7, #444cf7 3px, #e5e5f7 3px, #e5e5f7 6px ); +} + +.pt5 { +background-color: #e5e5f7; +opacity: 0.8; +background-image: linear-gradient(135deg, #444cf7 25%, transparent 25%), linear-gradient(225deg, #444cf7 25%, transparent 25%), linear-gradient(45deg, #444cf7 25%, transparent 25%), linear-gradient(315deg, #444cf7 25%, #e5e5f7 25%); +background-position: 3px 0, 3px 0, 0 0, 0 0; +background-size: 6px 6px; +background-repeat: repeat; +} + +.pt6 { +background-color: #e5e5f7; +opacity: 0.8; +background-image: radial-gradient(#444cf7 1.2px, #e5e5f7 1.2px); +background-size: 4px 4px; +} + +.pt7 { +background-color: #e5e5f7; +opacity: 0.8; +background-image: radial-gradient( ellipse farthest-corner at 4px 4px , #444cf7, #444cf7 50%, #e5e5f7 50%); +background-size: 4px 4px; + +} +/* +body { + + margin: 0; + min-height: 100vh; +} + +html { + background: #fff; +} + +special values + +true +false +n/a +long tail +completely unique + + +*/ + + +.ag-column-hover: { + overflow: visible +} +.histogram-ex { background:#181d1f} +.histogram-ex span { color : white } + + diff --git a/examples/ex/HistogramEx.tsx b/examples/ex/HistogramEx.tsx new file mode 100644 index 00000000..b65c747a --- /dev/null +++ b/examples/ex/HistogramEx.tsx @@ -0,0 +1,77 @@ +import React from 'react'; +import { HistogramCell} from '../../js/components/CustomHeader'; +import {histograms } from '../../js/components/staticData'; + + + +export default function Simple() { + const { + num_histo, bool_histo, NA_Only, simple_catgeorical, categorical_histo, + categorical_histo_lt, all_unique, unique_na, unique_continuous, + unique_continuous_scaled, 
unique_continuous_scaled_50, + start_station_categorical} = histograms; + return