Commands in polars (#170)

* basic JLisp commands in polars with tests * Fixes #158
paddymul · Nov 22, 2023 · d06bfe1 · d06bfe1
1 parent 4b7bb47
commit d06bfe1
Show file tree

Hide file tree

Showing 4 changed files with 182 additions and 1 deletion.
diff --git a/buckaroo/customizations/polars_commands.py b/buckaroo/customizations/polars_commands.py
@@ -0,0 +1,90 @@
+import polars as pl
+#import numpy as np
+
+from ..jlisp.lispy import s
+#from ..jlisp.configure_utils import configure_buckaroo
+#from ..auto_clean.cleaning_commands import (to_bool, to_datetime, to_int, to_float, to_string)
+
+class Command:
+    """Common base class for the command definitions in this module.
+
+    It carries no behaviour of its own; FillNA, DropCol and GroupBy
+    subclass it.
+    """
+
+class FillNA(Command):
+    """Command that replaces the nulls of one column with a constant.
+
+    ``command_default`` is the default form ``[fillna, df, "col", 8]`` and
+    ``command_pattern`` tags argument index 3 as an integer ``fillVal``.
+    NOTE(review): how those two attributes are consumed is not visible in
+    this module - confirm against ``configure_buckaroo`` / the frontend.
+    """
+    #argument_names = ["df", "col", "fill_val"]
+    command_default = [s('fillna'), s('df'), "col", 8]
+    command_pattern = [[3, 'fillVal', 'type', 'integer']]
+
+    @staticmethod
+    def transform(df, col, val):
+        """Return ``df`` with the nulls in column ``col`` filled with ``val``."""
+        return df.with_columns(pl.col(col).fill_null(val))
+
+    @staticmethod
+    def transform_to_py(df, col, val):
+        """Return one line of Python source equivalent to ``transform``.
+
+        ``df`` is unused. The emitted line starts with four spaces and
+        rebinds a variable called ``df``; ``val`` is rendered with ``%r``.
+        """
+        # NOTE(review): ``col`` is interpolated inside single quotes, so a
+        # column name containing a quote would yield invalid source.
+        return "    df = df.with_columns(pl.col('%s').fill_null(%r))" % (col, val)
+
+class DropCol(Command):
+    """Command that removes a single column from the frame.
+
+    ``command_default`` is the default form ``[dropcol, df, "col"]``;
+    ``command_pattern`` is ``[None]``.
+    """
+    #argument_names = ["df", "col"]
+    command_default = [s('dropcol'), s('df'), "col"]
+    command_pattern = [None]
+
+    @staticmethod
+    def transform(df, col):
+        """Return ``df`` without column ``col``."""
+        return df.drop(col)
+
+    @staticmethod
+    def transform_to_py(df, col):
+        """Return one line of Python source equivalent to ``transform``.
+
+        ``df`` is unused. The emitted line rebinds ``df`` to the result of
+        ``df.drop(...)``, matching what ``transform`` does, rather than
+        mutating the frame in place with ``drop_in_place``.
+        """
+        # NOTE(review): ``col`` is interpolated inside single quotes, so a
+        # column name containing a quote would yield invalid source.
+        return "    df = df.drop('%s')" % col
+
+
+class GroupBy(Command):
+    """Command that groups by one column and aggregates other columns.
+
+    ``col_spec`` maps a column name to one of ``'sum'``, ``'mean'``,
+    ``'median'`` or ``'count'``. Any other value (for example ``'null'``)
+    adds no aggregation for that column. Each aggregated column is named
+    ``"<column>(<aggregation>)"`` and the result is sorted by the grouping
+    column in descending order.
+    """
+    command_default = [s("groupby"), s('df'), 'col', {}]
+    command_pattern = [[3, 'colMap', 'colEnum', ['null', 'sum', 'mean', 'median', 'count']]]
+
+    @staticmethod
+    def transform(df, col, col_spec):
+        """Return the grouped and aggregated frame.
+
+        Args:
+            df: frame to aggregate; it is queried lazily and then collected.
+            col: name of the column to group by.
+            col_spec: mapping of column name to aggregation name.
+        """
+        agg_clauses = []
+        for k, v in col_spec.items():
+            if v == "sum":
+                agg_clauses.append(pl.col(k).sum().alias("%s(sum)" % k))
+            elif v == "mean":
+                agg_clauses.append(pl.col(k).mean().alias("%s(mean)" % k))
+            elif v == "median":
+                # median must be *called*; ``.median.alias`` looked up an
+                # attribute on the bound method and raised AttributeError.
+                agg_clauses.append(pl.col(k).median().alias("%s(median)" % k))
+            elif v == "count":
+                # nulls are dropped first so only non-null values are counted
+                agg_clauses.append(pl.col(k).drop_nulls().count().alias("%s(count)" % k))
+
+        # The key is passed positionally: the ``by=`` keyword is not stable
+        # across polars releases, the positional form is.
+        q = (
+            df
+            .lazy()
+            .group_by(col)
+            .agg(*agg_clauses)
+            .sort(col, descending=True)
+        )
+        return q.collect()
+
+    @staticmethod
+    def transform_to_py(df, col, col_spec):
+        """Return Python source equivalent to ``transform``.
+
+        ``df`` is unused. The emitted snippet builds a lazy query ``q`` and
+        rebinds ``df`` to ``q.collect()``. Aggregations are selected from
+        ``col_spec`` by the same rules as ``transform``.
+        """
+        agg_clauses = []
+        for k, v in col_spec.items():
+            if v == "sum":
+                agg_clauses.append("    pl.col('%s').sum().alias('%s(sum)')"  % (k, k))
+            elif v == "mean":
+                agg_clauses.append("    pl.col('%s').mean().alias('%s(mean)')"  % (k, k))
+            elif v == "median":
+                agg_clauses.append("    pl.col('%s').median().alias('%s(median)')"  % (k, k))
+            elif v == "count":
+                agg_clauses.append("    pl.col('%s').drop_nulls().count().alias('%s(count)')"  % (k, k))
+        full_agg_text = ",\n".join(agg_clauses)
+        # Same positional group_by as in transform, for the same reason.
+        command_template = """
+    q = (
+         df
+        .lazy()
+        .group_by('%s')
+        .agg(%s)
+        .sort('%s', descending=True)
+        )
+    df = q.collect()
+        """
+        return command_template % (col, full_agg_text, col)
diff --git a/buckaroo/jlisp/configure_utils.py b/buckaroo/jlisp/configure_utils.py
@@ -1,3 +1,4 @@
+import pandas as pd
 from .lispy import make_interpreter
 def configure_buckaroo(transforms):
     command_defaults = {}
@@ -14,8 +15,12 @@ def configure_buckaroo(transforms):
         to_py_lisp_primitives[transform_name] = T.transform_to_py
 
     buckaroo_eval, raw_parse = make_interpreter(transform_lisp_primitives)
+
     def buckaroo_transform(instructions, df):
-        df_copy = df.copy()
+        if isinstance(df, pd.DataFrame):
+            df_copy = df.copy()
+        else: # hack we know it's polars here... just getting something working for now
+            df_copy = df.clone()
         ret_val =  buckaroo_eval(instructions, {'df':df_copy})
         return ret_val
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -194,6 +194,7 @@ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
 
 #bare except used in a convience method for developer ergonomics
 "tests/unit/commands/command_tests.py" = ["E722"] #bare except
+"tests/unit/commands/polars_command_tests.py" = ["E722"] #bare except
 
 
 [tool.ruff.format]

diff --git a/tests/unit/commands/polars_command_tests.py b/tests/unit/commands/polars_command_tests.py
@@ -0,0 +1,85 @@
+from collections import OrderedDict
+import polars as pl
+import numpy as np
+
+from buckaroo.jlisp.lispy import s
+from polars.testing import assert_frame_equal
+from buckaroo.jlisp.configure_utils import configure_buckaroo
+from buckaroo.customizations.polars_commands import (
+    DropCol, FillNA, GroupBy #, OneHot, GroupBy, reindex
+)
+
+
+def result_from_exec(code_str, df_input):
+    """Exec generated source and return what its ``clean`` function yields.
+
+    The source is prefixed with the imports it may need and suffixed with a
+    call to ``clean(__test_df)``, whose result is passed back through a
+    one-element list shared with the exec'd namespace.
+
+    Args:
+        code_str: Python source that is expected to define ``clean``.
+        df_input: frame handed to ``clean`` as ``__test_df``.
+
+    Raises:
+        Exception: whatever the exec'd code raises, re-raised after the
+            offending source has been printed.
+    """
+    CODE_PREAMBLE = "import polars as pl\nimport numpy as np\n"
+    CODE_PREAMBLE += "from buckaroo.auto_clean.auto_clean import smart_to_int\n"
+    RETRIEVE_RESULT_STR = '\n__ret_closure[0] = clean(__test_df)'
+    outer_scope_result = [0]
+    full_code_str = CODE_PREAMBLE + code_str + RETRIEVE_RESULT_STR
+    try:
+        exec(full_code_str, {'__test_df':df_input, '__ret_closure':outer_scope_result})
+    except Exception:
+        # Show the generated source for debugging, then surface the real
+        # error instead of returning the placeholder 0.
+        print("Failure calling exec with following code string")
+        print(full_code_str)
+        raise
+    print(full_code_str)
+    return outer_scope_result[0]
+
+def assert_to_py_same_transform_df(command_kls, operations, test_df):
+    """Assert that interpreting and code-generating ``operations`` agree.
+
+    The operations are run through the interpreter built for
+    ``command_kls`` (wrapped in a leading ``begin`` form), and the generated
+    Python source is exec'd against a separate clone of ``test_df``. Both
+    resulting frames must be equal. The interpreter's frame is returned.
+    """
+    _, _, run_transform, emit_py = configure_buckaroo([command_kls])
+    wrapped_ops = [{'symbol': 'begin'}, *operations]
+    interpreted_df = run_transform(wrapped_ops, test_df.clone())
+    generated_src = emit_py(operations)
+
+    executed_df = result_from_exec(generated_src, test_df.clone())
+    assert_frame_equal(interpreted_df, executed_df)
+    return interpreted_df
+
+
+# short alias used by the tests below
+same = assert_to_py_same_transform_df
+
+def test_fillna():
+    """fillna on column ``b`` puts 100 where the first row held a null."""
+    fill_op = [s('fillna'), s('df'), "b", 100]
+    frame = pl.DataFrame({'a':[1,2,3,4,5], 'b': [None, 2, 2, 2, None]})
+
+    filled = same(FillNA, [fill_op], frame)
+    assert filled[0, 'b'] == 100
+
+def test_dropcol():
+    """dropcol removes column ``a`` and leaves ``b`` and ``c`` in place."""
+    base_df = pl.DataFrame({
+        'a':np.random.randint(1, 10, 5), 'b':np.random.randint(1, 10, 5),
+        'c':np.random.randint(1, 10, 5)})
+
+    output_df = same(DropCol, [[s('dropcol'), s('df'), "a"]], base_df)
+    # ``same`` only proves both code paths agree; also check the outcome.
+    assert output_df.columns == ['b', 'c']
+
+def test_groupby():
+    """groupby on ``a`` counts non-null ``b`` and sums ``c`` per group."""
+    groupby_op = [s('groupby'), s('df'), "a", {'b':'count', 'c': 'sum'}]
+    frame = pl.DataFrame({
+        'a':['cc', 'cc', 'cc', 'ee', 'ff'], 'b': [None, 2, 2, 2, None],
+        'c': [10, 20, 30, 40, 50]})
+
+    expected_schema = OrderedDict(
+        [('a', pl.Utf8), ('b(count)', pl.UInt32), ('c(sum)', pl.Int64)])
+    expected = pl.DataFrame(
+        {'a':        ["ff", "ee",  "cc"],
+         'b(count)': [   0,    1,    2],
+         'c(sum)':   [  50,   40,   60]},
+        schema=expected_schema
+    )
+
+    grouped = same(GroupBy, [groupby_op], frame)
+    assert_frame_equal(grouped, expected)
+
+
+'''
+
+
+def test_onehot():
+    base_df = pd.DataFrame({
+        'a':['cc', 'cc', 'dd', 'ee', 'ff'], 'b': [pd.NA, 2, 2, 2, pd.NA]})
+    
+    output_df = same(OneHot, [[s('onehot'), s('df'), "a"]], base_df)
+    assert output_df.columns.to_list() == ['b', 'cc', 'dd', 'ee', 'ff']
+    
+    
+def test_reindex():
+    base_df = pd.DataFrame({
+        'a':['ca', 'cb', 'cd', 'ee', 'ff'], 'b': [pd.NA, 2, 2, 2, pd.NA]})
+    
+    output_df = same(reindex, [[s('reindex'), s('df'), "a"]], base_df)
+    assert output_df.index.to_list() == ['ca', 'cb', 'cd', 'ee', 'ff']
+'''