paddymul · paddymul · Nov 18, 2023 · Nov 18, 2023 · Nov 18, 2023 · Nov 18, 2023
diff --git a/buckaroo/auto_clean/cleaning_commands.py b/buckaroo/auto_clean/cleaning_commands.py
@@ -54,7 +54,7 @@ def transform(df, col):
 
     @staticmethod 
     def transform_to_py(df, col):
-        return "    df['%s'] = smart_int(df['%s'])" % (col, col)
+        return "    df['%s'] = smart_to_int(df['%s'])" % (col, col)
 
 class to_float(Command):
     #argument_names = ["df", "col"]

diff --git a/tests/unit/basic_widget_test.py b/tests/unit/basic_widget_test.py
@@ -29,7 +29,7 @@ def test_interpreter():
 
     w = BuckarooWidget(simple_df)
     assert w.operation_results['generated_py_code'] == '''def clean(df):
-    df['int_col'] = smart_int(df['int_col'])
+    df['int_col'] = smart_to_int(df['int_col'])
     df['str_col'] = df['str_col'].fillna(value='').astype('string').replace('', None)
     return df'''
 
@@ -42,7 +42,7 @@ def test_interpreter():
     field_names = [ f['name'] for f in tdf['schema']['fields'] ]
     assert 'str_col' not in field_names
     assert w.operation_results['generated_py_code'] == """def clean(df):
-    df['int_col'] = smart_int(df['int_col'])
+    df['int_col'] = smart_to_int(df['int_col'])
     df['str_col'] = df['str_col'].fillna(value='').astype('string').replace('', None)
     df.drop('str_col', axis=1, inplace=True)
     return df"""

diff --git a/tests/unit/commands/__init__.py b/tests/unit/commands/__init__.py
diff --git a/tests/unit/commands/cleaning_command_test.py b/tests/unit/commands/cleaning_command_test.py
@@ -0,0 +1,58 @@
+import pandas as pd
+import numpy as np
+
+from buckaroo.jlisp.lispy import s
+from .command_tests import assert_to_py_same_transform_df
+from buckaroo.auto_clean.cleaning_commands import (
+    to_bool, to_datetime, to_int, to_float,
+    to_string)
+
+same = assert_to_py_same_transform_df
+
+def test_to_bool():
+    base_df = pd.DataFrame({
+        'mixed_bool':[True, False, 0, 1, pd.NA], 'b': [pd.NA] * 5})
+
+    output_df = same(to_bool, [[s('to_bool'), s('df'), "mixed_bool"]], base_df)
+    assert isinstance(output_df['mixed_bool'].dtype , pd.core.arrays.boolean.BooleanDtype)
+    assert output_df['mixed_bool'].to_list() == [True, False, False, True, pd.NA]
+
+def test_to_datetime():
+    base_df = pd.DataFrame({
+        'mixed_dates':['2021-02-03', '2022-05-07', 'asdf', pd.NA], 'b': [pd.NA] * 4})
+
+    output_df = same(to_datetime, [[s('to_datetime'), s('df'), "mixed_dates"]], base_df)
+    assert pd.api.types.is_datetime64_any_dtype(output_df['mixed_dates'])
+    assert output_df['mixed_dates'].to_list() == [
+        pd.Timestamp('2021-02-03'), pd.Timestamp('2022-05-07'), pd.NaT, pd.NaT]
+
+def test_to_int():
+    base_df = pd.DataFrame({
+        'mixed_ints':['3', '4', '3.', 'asdf', pd.NA], 'b': [pd.NA] * 5})
+
+    output_df = same(to_int, [[s('to_int'), s('df'), "mixed_ints"]], base_df)
+    pd.testing.assert_series_equal(
+        output_df['mixed_ints'],
+        pd.Series([3,4,3, pd.NA, pd.NA], dtype='UInt8', name='mixed_ints'))
+
+def test_to_float():
+    base_df = pd.DataFrame({
+        'mixed_floats':['3', '4', '7.1', 'asdf', np.nan], 'b': [pd.NA] * 5})
+
+    output_df = same(to_float, [[s('to_float'), s('df'), "mixed_floats"]], base_df)
+    pd.testing.assert_series_equal(
+        output_df['mixed_floats'],
+        pd.Series([3, 4, 7.1, np.nan, np.nan], dtype='float64', name='mixed_floats'))
+
+def _test_to_string():
+    """
+    skipping for now.  works on my machine against pandas 2.1.1 fails in CI against pandas 1.3.5
+    """
+    base_df = pd.DataFrame({
+        'mixed_strings':['a', 'b', pd.NA], 'b': [pd.NA] * 3})
+
+    output_df = same(to_string, [[s('to_string'), s('df'), "mixed_strings"]], base_df)
+    pd.testing.assert_series_equal(
+        output_df['mixed_strings'],
+        pd.Series(['a', 'b', pd.NA], dtype='string', name='mixed_strings'))
+
diff --git a/tests/unit/commands/command_tests.py b/tests/unit/commands/command_tests.py
@@ -9,6 +9,7 @@
 
 def result_from_exec(code_str, df_input):
     CODE_PREAMBLE = "import pandas as pd\nimport numpy as np\n"
+    CODE_PREAMBLE += "from buckaroo.auto_clean.auto_clean import smart_to_int\n"
     RETRIEVE_RESULT_STR = '\n__ret_closure[0] = clean(__test_df)'
     outer_scope_result = [0]
     full_code_str = CODE_PREAMBLE + code_str + RETRIEVE_RESULT_STR