Skip to content

Commit

Permalink
Commands in polars (#170)
Browse files Browse the repository at this point in the history
* basic JLisp commands in polars with tests
* Fixes #158
  • Loading branch information
paddymul authored Nov 22, 2023
1 parent 4b7bb47 commit d06bfe1
Show file tree
Hide file tree
Showing 4 changed files with 182 additions and 1 deletion.
90 changes: 90 additions & 0 deletions buckaroo/customizations/polars_commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import polars as pl
#import numpy as np

from ..jlisp.lispy import s
#from ..jlisp.configure_utils import configure_buckaroo
#from ..auto_clean.cleaning_commands import (to_bool, to_datetime, to_int, to_float, to_string)

class Command:
    """Common base class for the command classes defined in this module."""

class FillNA(Command):
    """Command that replaces the null values of one column with a fixed value."""

    # argument_names = ["df", "col", "fill_val"]
    command_default = [s('fillna'), s('df'), "col", 8]
    command_pattern = [[3, 'fillVal', 'type', 'integer']]

    @staticmethod
    def transform(df, col, val):
        """Return ``df`` with the nulls of column ``col`` replaced by ``val``."""
        return df.with_columns(pl.col(col).fill_null(val))

    @staticmethod
    def transform_to_py(df, col, val):
        """Return one line of polars source performing the same fill.

        ``df`` is not used; only ``col`` and ``val`` are interpolated.
        """
        # %r (not '%s') so column names containing quotes or backslashes
        # still produce a valid Python string literal.
        return " df = df.with_columns(pl.col(%r).fill_null(%r))" % (col, val)

class DropCol(Command):
    """Command that removes a single column from the frame."""

    # argument_names = ["df", "col"]
    command_default = [s('dropcol'), s('df'), "col"]
    command_pattern = [None]

    @staticmethod
    def transform(df, col):
        """Return the result of ``df.drop(col)``."""
        return df.drop(col)

    @staticmethod
    def transform_to_py(df, col):
        """Return one line of polars source performing the same drop.

        ``df`` is not used; only ``col`` is interpolated.
        """
        # Emit the same non-mutating ``drop`` that ``transform`` uses instead
        # of ``drop_in_place``, so the generated code does not modify the
        # frame it was handed. %r keeps unusual column names valid literals.
        return " df = df.drop(%r)" % (col,)


class GroupBy(Command):
    """Command that groups by one column and aggregates other columns.

    ``col_spec`` maps a column name to an aggregation name. Each recognised
    aggregation produces an output column named ``"<column>(<aggregation>)"``.
    Values that are not in ``_AGGREGATIONS`` (for example ``'null'``)
    contribute no aggregation.
    """

    command_default = [s("groupby"), s('df'), 'col', {}]
    command_pattern = [[3, 'colMap', 'colEnum', ['null', 'sum', 'mean', 'median', 'count']]]

    # aggregation name -> (expression builder, source text of the same chain).
    # Keeping both in one table prevents transform and transform_to_py from
    # drifting apart.
    _AGGREGATIONS = {
        "sum": (lambda expr: expr.sum(), ".sum()"),
        "mean": (lambda expr: expr.mean(), ".mean()"),
        "median": (lambda expr: expr.median(), ".median()"),
        "count": (lambda expr: expr.drop_nulls().count(), ".drop_nulls().count()"),
    }

    @staticmethod
    def transform(df, col, col_spec):
        """Group ``df`` by ``col``, aggregate per ``col_spec``, sort by ``col`` descending.

        The query is built lazily and collected before returning.
        """
        agg_clauses = []
        for k, v in col_spec.items():
            if v in GroupBy._AGGREGATIONS:
                build_expr, _ = GroupBy._AGGREGATIONS[v]
                agg_clauses.append(build_expr(pl.col(k)).alias("%s(%s)" % (k, v)))

        # The grouping key is passed positionally: newer polars releases
        # interpret keyword arguments to group_by as named expressions.
        q = (
            df
            .lazy()
            .group_by(col)
            .agg(*agg_clauses)
            .sort(col, descending=True)
        )
        return q.collect()

    @staticmethod
    def transform_to_py(df, col, col_spec):
        """Return polars source text equivalent to ``transform``.

        ``df`` is not used; only ``col`` and ``col_spec`` are interpolated.
        """
        # NOTE(review): whitespace inside the emitted snippets is reproduced
        # exactly as found (clauses start with one space, template lines start
        # at column 0) -- confirm it matches what the code generator expects.
        agg_clauses = []
        for k, v in col_spec.items():
            if v in GroupBy._AGGREGATIONS:
                _, chain_text = GroupBy._AGGREGATIONS[v]
                # %r keeps column names with quotes/backslashes valid literals.
                agg_clauses.append(
                    " pl.col(%r)%s.alias(%r)" % (k, chain_text, "%s(%s)" % (k, v)))
        full_agg_text = ",\n".join(agg_clauses)
        command_template = """
q = (
df
.lazy()
.group_by(%r)
.agg(%s)
.sort(%r, descending=True)
)
df = q.collect()
"""
        return command_template % (col, full_agg_text, col)
7 changes: 6 additions & 1 deletion buckaroo/jlisp/configure_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pandas as pd
from .lispy import make_interpreter
def configure_buckaroo(transforms):
command_defaults = {}
Expand All @@ -14,8 +15,12 @@ def configure_buckaroo(transforms):
to_py_lisp_primitives[transform_name] = T.transform_to_py

buckaroo_eval, raw_parse = make_interpreter(transform_lisp_primitives)

def buckaroo_transform(instructions, df):
df_copy = df.copy()
if isinstance(df, pd.DataFrame):
df_copy = df.copy()
else: # hack we know it's polars here... just getting something working for now
df_copy = df.clone()
ret_val = buckaroo_eval(instructions, {'df':df_copy})
return ret_val

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"

#bare except used in a convenience method for developer ergonomics
"tests/unit/commands/command_tests.py" = ["E722"] #bare except
"tests/unit/commands/polars_command_tests.py" = ["E722"] #bare except


[tool.ruff.format]
Expand Down
85 changes: 85 additions & 0 deletions tests/unit/commands/polars_command_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from collections import OrderedDict
import polars as pl
import numpy as np

from buckaroo.jlisp.lispy import s
from polars.testing import assert_frame_equal
from buckaroo.jlisp.configure_utils import configure_buckaroo
from buckaroo.customizations.polars_commands import (
DropCol, FillNA, GroupBy #, OneHot, GroupBy, reindex
)


def result_from_exec(code_str, df_input):
    """Execute generated code and return what its ``clean`` function produces.

    ``code_str`` is prefixed with the imports the generated code may need and
    suffixed with a statement that calls ``clean(__test_df)`` and stores the
    result in a one-element list shared with this function.

    If execution fails, the failing source is printed and the placeholder
    value ``0`` is returned.
    """
    preamble = "import polars as pl\nimport numpy as np\n"
    preamble += "from buckaroo.auto_clean.auto_clean import smart_to_int\n"
    retrieve_result = '\n__ret_closure[0] = clean(__test_df)'
    outer_scope_result = [0]
    full_code_str = preamble + code_str + retrieve_result
    try:
        # exec is intentional here: the string is code generated by the
        # commands under test. Never feed this helper untrusted input.
        exec(full_code_str, {'__test_df': df_input, '__ret_closure': outer_scope_result})
    except Exception:
        # Best-effort by design: show the offending source and fall through
        # to the placeholder. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print("Failure calling exec with following code string")
        print(full_code_str)
    return outer_scope_result[0]

def assert_to_py_same_transform_df(command_kls, operations, test_df):
    """Assert that interpreting and code-generating ``operations`` agree.

    Configures buckaroo with ``command_kls`` only, runs ``operations``
    (prefixed with a ``begin`` symbol) through the transform function on a
    clone of ``test_df``, executes the generated Python on another clone, and
    asserts both frames are equal. Returns the interpreted result.
    """
    _unused_a, _unused_b, run_transform, generate_py = configure_buckaroo([command_kls])
    interpreter_ops = [{'symbol': 'begin'}, *operations]
    transformed = run_transform(interpreter_ops, test_df.clone())
    generated_code = generate_py(operations)

    executed = result_from_exec(generated_code, test_df.clone())
    assert_frame_equal(transformed, executed)
    return transformed


# Short alias used by the tests below.
same = assert_to_py_same_transform_df

def test_fillna():
    """FillNA on column 'b' with 100 turns the leading null into 100."""
    columns = {'a': [1, 2, 3, 4, 5], 'b': [None, 2, 2, 2, None]}
    base_df = pl.DataFrame(columns)

    fill_op = [s('fillna'), s('df'), "b", 100]
    output_df = same(FillNA, [fill_op], base_df)
    assert output_df[0, 'b'] == 100

def test_dropcol():
    """DropCol yields the same frame via the interpreter and generated code."""
    columns = {name: np.random.randint(1, 10, 5) for name in ('a', 'b', 'c')}
    base_df = pl.DataFrame(columns)

    drop_op = [s('dropcol'), s('df'), "a"]
    same(DropCol, [drop_op], base_df)

def test_groupby():
    """GroupBy on 'a' with count of 'b' and sum of 'c', sorted descending by 'a'."""
    columns = {
        'a': ['cc', 'cc', 'cc', 'ee', 'ff'],
        'b': [None, 2, 2, 2, None],
        'c': [10, 20, 30, 40, 50],
    }
    base_df = pl.DataFrame(columns)

    groupby_op = [s('groupby'), s('df'), "a", {'b': 'count', 'c': 'sum'}]
    output_df = same(GroupBy, [groupby_op], base_df)

    expected_schema = OrderedDict(
        [('a', pl.Utf8), ('b(count)', pl.UInt32), ('c(sum)', pl.Int64)])
    expected_data = {
        'a': ["ff", "ee", "cc"],
        'b(count)': [0, 1, 2],
        'c(sum)': [50, 40, 60],
    }
    expected = pl.DataFrame(expected_data, schema=expected_schema)
    assert_frame_equal(output_df, expected)


# Disabled tests for OneHot and reindex, kept inside a string literal so they
# are not executed. They still build their input with `pd` (pandas), which
# this module does not import, and reference commands that are commented out
# of the polars_commands import above.
'''
def test_onehot():
base_df = pd.DataFrame({
'a':['cc', 'cc', 'dd', 'ee', 'ff'], 'b': [pd.NA, 2, 2, 2, pd.NA]})
output_df = same(OneHot, [[s('onehot'), s('df'), "a"]], base_df)
assert output_df.columns.to_list() == ['b', 'cc', 'dd', 'ee', 'ff']
def test_reindex():
base_df = pd.DataFrame({
'a':['ca', 'cb', 'cd', 'ee', 'ff'], 'b': [pd.NA, 2, 2, 2, pd.NA]})
output_df = same(reindex, [[s('reindex'), s('df'), "a"]], base_df)
assert output_df.index.to_list() == ['ca', 'cb', 'cd', 'ee', 'ff']
'''

0 comments on commit d06bfe1

Please sign in to comment.