Skip to content

Commit

Permalink
Feat/auto clean (#33)
Browse files Browse the repository at this point in the history
The 0.3.19 release adds histograms and automatic cleaning of dataframes.
  • Loading branch information
paddymul authored Sep 26, 2023
1 parent 094b178 commit 6175d0e
Show file tree
Hide file tree
Showing 42 changed files with 3,563 additions and 587 deletions.
11 changes: 7 additions & 4 deletions buckaroo/all_transforms.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from .lispy import s
from .configure_utils import configure_buckaroo
import pandas as pd
import numpy as np

from .lispy import s
from .configure_utils import configure_buckaroo
from .cleaning_commands import (to_bool, to_datetime, to_int, to_float, to_string)

class Command(object):
    # Marker base class for Buckaroo transform commands.
    # Subclasses (e.g. to_datetime below) define class attributes such as
    # command_default / command_pattern and transform/transform_to_py hooks,
    # and are collected into DefaultCommandKlsList for configure_buckaroo.
    pass

Expand Down Expand Up @@ -122,7 +124,7 @@ def transform_to_py(df, col, col_spec):



class to_datetime(Command):
class ato_datetime(Command):
#argument_names = ["df", "col"]
command_default = [s('to_datetime'), s('df'), "col"]
command_pattern = [None]
Expand Down Expand Up @@ -154,6 +156,7 @@ def transform_to_py(df, col):
" df.drop('%s', axis=1, inplace=True)" % col,
" df.index = old_col.values"])

# Commands exposed by default. The cleaning commands (to_bool, to_datetime,
# to_int, to_float, to_string) come from .cleaning_commands; the rest are the
# Command subclasses defined in this module.
# NOTE: the diff residue previously left a second, stale assignment of this
# list (the pre-commit version) immediately above — removed as dead code.
DefaultCommandKlsList = [DropCol, SafeInt, FillNA, reindex, OneHot, GroupBy,
                         to_bool, to_datetime, to_int, to_float, to_string]
command_defaults, command_patterns, buckaroo_transform, buckaroo_to_py_core = configure_buckaroo(DefaultCommandKlsList)

120 changes: 105 additions & 15 deletions buckaroo/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,18 +91,107 @@ def summary(sampled_ser, summary_ser, ser):


def int_digits(n):
    """Return the print width of the integer part of *n*.

    NaN and 0 both count as a single character; negative numbers get one
    extra position for the '-' sign.
    """
    if np.isnan(n) or n == 0:
        return 1
    width = int(np.floor(np.log10(np.abs(n)))) + 1
    # one extra column for the leading minus sign
    return width + 1 if np.sign(n) == -1 else width

def histogram(ser):
    """Bin *ser* into 10 buckets.

    Returns [fractional counts rounded to 2 decimals, bin edges].
    """
    counts, edges = np.histogram(ser, 10)
    fractions = np.round(counts / counts.sum(), 2)
    return [fractions, edges]


def numeric_histogram_labels(endpoints):
    """Turn a sequence of bin endpoints into "lo-hi" labels, one per bin."""
    return ["{:.0f}-{:.0f}".format(lo, hi)
            for lo, hi in zip(endpoints[:-1], endpoints[1:])]
#histogram_labels(endpoints)

def numeric_histogram(arr, nan_per):
    """Build histogram observations for a numeric pandas Series.

    The 1st/99th percentile tails are reported as separate observations and
    the middle of the distribution is binned into 10 buckets.

    Parameters:
        arr: numeric pandas Series (may contain NaNs).
        nan_per: fraction of NaN values in the column, in [0, 1].

    Returns a list of dicts: a low-tail entry, up to 10 bucket entries with
    'population' percentages, a high-tail entry, and an 'NA' entry when any
    NaNs are present. An all-NaN column returns only the 'NA' entry.
    """
    nan_observation = {'name': 'NA', 'NA': np.round(nan_per * 100, 0)}
    if nan_per == 1.0:
        return [nan_observation]

    vals = arr.dropna()
    low_tail, high_tail = np.quantile(vals, 0.01), np.quantile(vals, 0.99)
    # BUG FIX: the masks were computed on `arr` (NaNs included) but used to
    # index `vals` (NaNs dropped), which misaligns whenever NaNs are present.
    # Both masks must be computed on the NaN-free series.
    low_pass = vals > low_tail
    high_pass = vals < high_tail
    meat = vals[low_pass & high_pass]
    populations, endpoints = np.histogram(meat, 10)

    labels = numeric_histogram_labels(endpoints)
    normalized_pop = populations / populations.sum()
    low_label = "%r - %r" % (vals.min(), low_tail)
    high_label = "%r - %r" % (high_tail, vals.max())

    ret_histo = [{'name': low_label, 'tail': 1}]
    for label, pop in zip(labels, normalized_pop):
        ret_histo.append({'name': label, 'population': np.round(pop * 100, 0)})
    # (the duplicate recomputation of high_label from the original was removed)
    ret_histo.append({'name': high_label, 'tail': 1})
    if nan_per > 0.0:
        ret_histo.append(nan_observation)
    return ret_histo


def histo_format(v, l):
    # NOTE(review): computes v/l but never returns or uses it, so this
    # function is effectively a no-op that returns None. It appears
    # unfinished (no callers visible in this chunk) — confirm intent
    # before relying on it.
    scaled = v/l


def categorical_dict(ser, val_counts, top_n_positions=7):
    """Summarize value counts as {value: count} for the top values.

    Adds 'unique' (percent of rows whose value occurs exactly once) and
    'longtail' (percent of rows beyond the top positions, excluding the
    singletons) when those are positive. Percentages are rounded to whole
    numbers; counts for the top values are left as raw counts.
    """
    total = len(ser)
    cutoff = min(len(val_counts), top_n_positions)

    result = val_counts.iloc[:cutoff].to_dict()

    tail_sum = val_counts.iloc[cutoff:].sum()
    # singletons are counted over the whole column, not just the tail
    singletons = (val_counts == 1).sum()
    repeated_tail = tail_sum - singletons

    if singletons > 0:
        result['unique'] = np.round((singletons / total) * 100, 0)
    if repeated_tail > 0:
        result['longtail'] = np.round((repeated_tail / total) * 100, 0)
    return result

def categorical_histogram(ser, val_counts, nan_per, top_n_positions=7):
    """Turn categorical_dict's summary into a list of histogram observations.

    Each top value becomes {'name', 'cat_pop'} (percent of rows); the
    'longtail'/'unique' percentages are folded into one 'longtail'
    observation; a trailing 'NA' observation is added when NaNs exist.
    """
    nan_observation = {'name': 'NA', 'NA': np.round(nan_per * 100, 0)}
    cd = categorical_dict(ser, val_counts, top_n_positions)
    total = len(ser)

    entries = []
    longtail_obs = {'name': 'longtail'}
    for name, count in cd.items():
        if name in ("longtail", "unique"):
            # aggregate the two long-tail measures into a single observation
            longtail_obs[name] = count
        else:
            entries.append({'name': name,
                            'cat_pop': np.round((count / total) * 100, 0)})
    if len(longtail_obs) > 1:
        entries.append(longtail_obs)
    if nan_per > 0.0:
        entries.append(nan_observation)
    return entries


def histogram(ser, nan_per):
    """Dispatch to a numeric or categorical histogram for *ser*.

    Numeric columns with more than 5 distinct values get a numeric
    histogram — unless it comes back with 5 or fewer observations, which
    indicates a categorical variable encoded as integers; everything else
    falls through to the categorical histogram.
    """
    val_counts = ser.value_counts()
    if pd.api.types.is_numeric_dtype(ser.dtype) and len(val_counts) > 5:
        numeric_histo = numeric_histogram(ser, nan_per)
        if len(numeric_histo) > 5:
            return numeric_histo
    return categorical_histogram(ser, val_counts, nan_per)

class ColDisplayHints(ColAnalysis):
requires_summary = ['min', 'max'] # What summary stats does this analysis provide
provided_summary = []
Expand All @@ -111,15 +200,16 @@ class ColDisplayHints(ColAnalysis):
'is_numeric', 'is_integer', 'min_digits', 'max_digits', 'histogram']

@staticmethod
def table_hints(sampled_ser, summary_ser, table_hint_col_dict):
    """Return display hints for one column.

    NOTE(review): this span of the scraped diff fused the removed
    col_hints lines with the added table_hints lines; this is the
    reconstructed post-commit version, with the commented-out dead
    code dropped.

    Parameters:
        sampled_ser: sampled pandas Series for the column.
        summary_ser: per-column summary stats; 'min', 'max' and
            'nan_per' are read here.
        table_hint_col_dict: hints accumulated so far (unused here,
            part of the ColAnalysis table_hints signature).

    Returns a dict with is_numeric, is_integer, min_digits/max_digits
    (0 for non-numeric columns) and the column histogram.
    """
    is_numeric = pd.api.types.is_numeric_dtype(sampled_ser.dtype)
    return dict(
        is_numeric=is_numeric,
        is_integer=pd.api.types.is_integer_dtype(sampled_ser),
        # digit widths only make sense for numeric columns; int_digits
        # never returns 0, so the `and/or` idiom yields 0 exactly for
        # the non-numeric case
        min_digits=(is_numeric and int_digits(summary_ser.loc['min'])) or 0,
        max_digits=(is_numeric and int_digits(summary_ser.loc['max'])) or 0,
        histogram=histogram(sampled_ser, summary_ser['nan_per']))

6 changes: 4 additions & 2 deletions buckaroo/analysis_management.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
import pandas as pd

import traceback
from buckaroo.pluggable_analysis_framework import (
ColAnalysis, order_analysis, check_solvable, NotProvidedException)

Expand Down Expand Up @@ -28,11 +28,13 @@ def produce_summary_df(df, ordered_objs, df_name='test_df'):
summary_res = a_kls.summary(ser, summary_ser, ser)
for k,v in summary_res.items():
summary_ser.loc[k] = v
for k,v in a_kls.table_hints(sampled_ser, summary_ser, table_hint_dict):
th_dict = a_kls.table_hints(sampled_ser, summary_ser, table_hint_dict)
for k,v in th_dict.items():
table_hint_dict[k] = v
except Exception as e:
print("summary_ser", summary_ser)
errs[ser_name] = e, a_kls
traceback.print_exc()
continue
summary_col_dict[ser_name] = summary_ser
table_hint_col_dict[ser_name] = table_hint_dict
Expand Down
Loading

0 comments on commit 6175d0e

Please sign in to comment.