diff --git a/README.md b/README.md index aa1bbea..d02147b 100755 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# Pandandic +# pandandic -Pandandic is a library for documenting dataset schemas in code, by inheriting from a base class and assigning attributes for columns and column sets. +pandandic is a library for documenting dataset schemas in code, by inheriting from a base class and assigning attributes for columns and column sets. ## Installation @@ -21,6 +21,8 @@ Consider a project that reads data from several datasets, performs some preproce The preprocessing must act on certain columns and so the team rightfully add constants in order to perform slicing on the input dataframes. Two of these datasets share a column name. One of the datasets consists of time series data, and each time the dataset is refreshed the number of columns changes. +This scenario presents several challenges with how to structure the processing logic in a clear and adaptable manner whilst maintaining clear ownership. +Here is how `pandandic` helps: 1. **Schema ownership**: with `pandandic`, each schema has a corresponding class. 2. **Shared variables**: with `pandandic`, there are no shared constants. Each `BaseFrame` subclass is responsible for its own schema. @@ -37,7 +39,6 @@ For both: `pip install pandandic[all]`, `poetry add "pandandic[all]"` ## What Doesn't It Do? * **Validation**, save for what is built in to pandas. For validation of defined types, please see other libraries such as pandera, dataenforce, strictly-typed-pandas (apologies for any I have missed). -* **Aliasing**: if columns should have different names, it shouldn't be the job of the schema to achieve that; this could lead to unclear behaviour and that's not what we want. Although, some degree of aliasing can be achieved through `ColumnGroup`, it isn't recommended. 
* **Appending columns**: if columns are appended to the object after calling `read_x` or `from_df` that should be captured by a `ColumnSet`, they won't be captured. This can be solved by transforming to a dataframe and back again with `to_df` and `from_df` respectively. * **Dask**: although support may be added in future. @@ -105,7 +106,7 @@ class TemperatureFrame(BaseFrame): df = TemperatureFrame().read_csv("intermediate.csv") -df.set_index(TemperatureFrame.date.name, inplace=True) +df.set_index(TemperatureFrame.date.column_name, inplace=True) # name attribute also works here, but column_name is recommended print(df.temperature) ``` @@ -129,6 +130,7 @@ This can be done as well with non-regex `ColumnSet`, in that case accessing the import datetime from pandandic import BaseFrame, Column, ColumnSet, ColumnGroup + class AdvancedFrame(BaseFrame): """ A Group can be used to group together multiple column groups and columns. @@ -143,13 +145,49 @@ class AdvancedFrame(BaseFrame): numerical = ColumnGroup(members=[temperature, ref]) time_series = ColumnGroup(members=[temperature, door_open]) + df = AdvancedFrame().read_csv("advanced.csv") -df.set_index(AdvancedFrame.date.name, inplace=True) +df.set_index(AdvancedFrame.date.column_name, inplace=True) # name attribute also works here, but column_name is recommended print(df.time_series) ``` `ColumnGroup` and `ColumnSet` attributes can be accessed on the instantiated object, and will return a `DataFrame` view of their members. +```python +# examples/expert_usage.py +import datetime + +from pandandic import BaseFrame, Column, ColumnSet, ColumnGroup, DefinedLater + + +class ExpertFrame(BaseFrame): + """ + Aliasing can be used to dynamically set columns or column set members at runtime. 
+ """ + date = Column(type=datetime.date, alias=DefinedLater) + metadata = ColumnSet(members=DefinedLater) + + temperature = ColumnSet(type=float, members=[r"temperature-\d+"], regex=True) + door_open = ColumnSet(type=bool, members=["door-open-0", "door-open-1", "door-open-2"], regex=False) + + time_series = ColumnGroup(members=[temperature, door_open]) + + +# anything DefinedLater MUST be set before ExpertFrame reads or accesses a Column or ColumnSet via attribute +ExpertFrame.date.alias = "date" +ExpertFrame.metadata.members = ["comment", "ref"] + +df = ExpertFrame().read_csv("advanced.csv") +df.set_index(ExpertFrame.date.column_name, inplace=True) # now sets index with the defined alias +print(df.metadata) + +``` + +A `Column` alias can be set to `DefinedLater` to clearly document that it will be assigned dynamically at runtime. +The same is possible for `ColumnSet` members. This has the benefit of adding a runtime check that the alias or members are set before being used. + +**Warning**: If a `Column` alias is set, it will be used **regardless** of whether it exists in the data. 
+ ## Class Diagram ```mermaid diff --git a/examples/advanced_usage.py b/examples/advanced_usage.py index 5481aa2..cae2386 100644 --- a/examples/advanced_usage.py +++ b/examples/advanced_usage.py @@ -20,5 +20,5 @@ class AdvancedFrame(BaseFrame): df = AdvancedFrame().read_csv(Path(__file__).parent.joinpath("advanced.csv").as_posix()) -df.set_index(AdvancedFrame.date.name, inplace=True) +df.set_index(AdvancedFrame.date.column_name, inplace=True) # name attribute also works here, but column_name is recommended print(df.time_series) diff --git a/examples/expert_usage.py b/examples/expert_usage.py new file mode 100644 index 0000000..1c048f4 --- /dev/null +++ b/examples/expert_usage.py @@ -0,0 +1,26 @@ +import datetime +from pathlib import Path + +from pandandic import BaseFrame, Column, ColumnSet, ColumnGroup, DefinedLater + + +class ExpertFrame(BaseFrame): + """ + Aliasing can be used to dynamically set columns or column set members at runtime. + """ + date = Column(type=datetime.date, alias=DefinedLater) + metadata = ColumnSet(members=DefinedLater) + + temperature = ColumnSet(type=float, members=["temperature-\d+"], regex=True) + door_open = ColumnSet(type=bool, members=["door-open-0", "door-open-1", "door-open-2"], regex=False) + + time_series = ColumnGroup(members=[temperature, door_open]) + + +# anything DefinedLater MUST be set before ExpertFrame reads or accesses a Column or ColumnSet via attribute +ExpertFrame.date.alias = "date" +ExpertFrame.metadata.members = ["comment", "ref"] + +df = ExpertFrame().read_csv(Path(__file__).parent.joinpath("advanced.csv").as_posix()) +df.set_index(ExpertFrame.date.column_name, inplace=True) # now sets index with the defined alias +print(df.metadata) diff --git a/examples/intermediate_usage.py b/examples/intermediate_usage.py index 999f12b..ef929a1 100755 --- a/examples/intermediate_usage.py +++ b/examples/intermediate_usage.py @@ -18,5 +18,5 @@ class TemperatureFrame(BaseFrame): df = 
TemperatureFrame().read_csv(Path(__file__).parent.joinpath("intermediate.csv").as_posix()) -df.set_index(TemperatureFrame.date.name, inplace=True) +df.set_index(TemperatureFrame.date.column_name, inplace=True) # name attribute also works here, but column_name is recommended print(df.temperature) diff --git a/pyproject.toml b/pyproject.toml index 4eff239..ad6af6e 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pandandic" -version = "0.1.1a0" +version = "0.2.0" description = "A typed dataframe helper" license = "MIT" authors = ["Will Martin "] diff --git a/src/pandandic/__init__.py b/src/pandandic/__init__.py index 076d265..605d98a 100755 --- a/src/pandandic/__init__.py +++ b/src/pandandic/__init__.py @@ -3,3 +3,6 @@ from .column_set import ColumnSet from .column_group import ColumnGroup from .column_group_exception import ColumnGroupException +from .defined_later import DefinedLater +from .column_alias_not_yet_defined_exception import ColumnAliasNotYetDefinedException +from .column_set_members_not_yet_defined_exception import ColumnSetMembersNotYetDefinedException diff --git a/src/pandandic/base_frame.py b/src/pandandic/base_frame.py index 3740b19..51cfd27 100755 --- a/src/pandandic/base_frame.py +++ b/src/pandandic/base_frame.py @@ -11,9 +11,12 @@ from pandas import DataFrame from .column import Column -from .column_set import ColumnSet -from .column_group_exception import ColumnGroupException +from .column_alias_not_yet_defined_exception import ColumnAliasNotYetDefinedException from .column_group import ColumnGroup +from .column_group_exception import ColumnGroupException +from .column_set import ColumnSet +from .column_set_members_not_yet_defined_exception import ColumnSetMembersNotYetDefinedException +from .defined_later import DefinedLater if sys.version_info >= (3, 11): from typing import Self @@ -21,6 +24,7 @@ from typing_extensions import Self + class BaseFrame(DataFrame): """ Enabled schema-in-code through subclassing 
of DataFrame. @@ -41,8 +45,8 @@ def __init__(self, *args, **kwargs): self.greedy_column_sets = False self._get_column_map() - self._get_column_sets() - self._get_column_groups() + self._get_column_set_map() + self._get_column_group_map() if len(args) > 0 and isinstance(args[0], DataFrame): self.from_df(args[0]) @@ -50,22 +54,31 @@ def __init__(self, *args, **kwargs): def __getattribute__(self, item): if not item.startswith("_"): if self._column_map is not None and item in self._column_map: - return self[self._column_map[item].name] + column = self._column_map[item] + if column.alias == DefinedLater or isinstance(column.alias, DefinedLater): + raise ColumnAliasNotYetDefinedException(column) + return self[column.alias or column.name] + if self._column_set_map is not None and item in self._column_set_map: - return self[self._column_consumed_map[self._column_set_map[item].name]] + column_set = self._column_set_map[item] + if column_set.members == DefinedLater or isinstance(column_set.members, DefinedLater): + raise ColumnSetMembersNotYetDefinedException(column_set) + return self[self._column_consumed_map[column_set.name]] + if self._column_group_map is not None and item in self._column_group_map: return self[list(itertools.chain.from_iterable( [[column_or_column_group.name] if isinstance(column_or_column_group, Column) else self._column_consumed_map[column_or_column_group.name] for column_or_column_group in self._column_group_map[item].members]))] + return super().__getattribute__(item) def to_df(self) -> DataFrame: return DataFrame(self) - def with_enforced_types(self, validate: bool = True) -> Self: - self.enforce_types = validate + def with_enforced_types(self, enforce_types: bool = True) -> Self: + self.enforce_types = enforce_types return self def with_enforced_columns(self, enforce_typed_columns: bool = True) -> Self: @@ -80,7 +93,7 @@ def with_greedy_column_sets(self, greedy_column_sets: bool = True) -> Self: self.greedy_column_sets = greedy_column_sets return 
self - def read_csv(self, *args, **kwargs) -> DataFrame: + def read_csv(self, *args, **kwargs) -> Self: if self.enforce_columns or self.enforce_types: columns = self.read_csv_columns(*args, **kwargs) column_map = self._compute_column_map(columns) @@ -99,7 +112,7 @@ def read_csv(self, *args, **kwargs) -> DataFrame: super().__init__(df) return self - def read_excel(self, *args, **kwargs) -> DataFrame: + def read_excel(self, *args, **kwargs) -> Self: if self.enforce_columns or self.enforce_types: columns = self.read_excel_columns(*args, **kwargs) column_map = self._compute_column_map(columns) @@ -118,7 +131,7 @@ def read_excel(self, *args, **kwargs) -> DataFrame: super().__init__(df) return self - def read_parquet(self, *args, **kwargs) -> DataFrame: + def read_parquet(self, *args, **kwargs) -> Self: columns = self.read_parquet_columns(*args, **kwargs) column_map = self._compute_column_map(columns) @@ -135,7 +148,7 @@ def read_parquet(self, *args, **kwargs) -> DataFrame: super().__init__(df) return self - def read_avro(self, *args, **kwargs) -> DataFrame: + def read_avro(self, *args, **kwargs) -> Self: from pandavro import read_avro columns = self.read_avro_columns(*args, **kwargs) @@ -154,7 +167,7 @@ def read_avro(self, *args, **kwargs) -> DataFrame: super().__init__(df) return self - def from_df(self, df) -> DataFrame: + def from_df(self, df) -> Self: df = df.copy() columns = df.columns.tolist() @@ -175,16 +188,23 @@ def from_df(self, df) -> DataFrame: def _compute_column_map(self, columns: List[str]) -> Dict[str, type]: self._column_consumed_map.clear() - key_column_map = {column.name: column for column in self._get_column_map().values()} + key_column_map = {(column.alias or column.name): column for column in self._get_column_map().values()} + for alias, column in key_column_map.items(): + if alias == DefinedLater or isinstance(alias, DefinedLater): + raise ColumnAliasNotYetDefinedException(column) - if len(self._get_column_sets()) == 0: + if 
len(self._get_column_set_map()) == 0: return {k: v.type for k, v in key_column_map.items()} column_bag = np.array([key_column_map[c] if c in key_column_map else None for c in columns]) consumed_columns = ~np.equal(column_bag, None) - exact_column_sets = list(filter(lambda column_set: not column_set.regex, self._get_column_sets().values())) - regex_column_sets = list(filter(lambda column_set: column_set.regex, self._get_column_sets().values())) + for key, column_set in self._get_column_set_map().items(): + if column_set.members == DefinedLater or isinstance(column_set.members, DefinedLater): + raise ColumnSetMembersNotYetDefinedException(column_set) + + exact_column_sets = list(filter(lambda column_set: not column_set.regex, self._get_column_set_map().values())) + regex_column_sets = list(filter(lambda column_set: column_set.regex, self._get_column_set_map().values())) for i, column in enumerate(columns): for column_set in exact_column_sets: @@ -233,7 +253,7 @@ def _type_is_castable(column_type_tuple: Tuple[str, type]) -> bool: @staticmethod def _type_is_not_any(column_type_tuple: Tuple[str, type]) -> bool: _, t = column_type_tuple - return t is not Any + return t not in (Any, None) @staticmethod def _type_is_date(column_type_tuple: Tuple[str, type]) -> bool: @@ -297,13 +317,13 @@ def _get_column_map(cls) -> Dict[str, Column]: return cls._column_map @classmethod - def _get_column_sets(cls) -> Dict[str, ColumnSet]: + def _get_column_set_map(cls) -> Dict[str, ColumnSet]: if cls._column_set_map is None: cls._column_set_map = dict(filter(lambda kv: isinstance(kv[1], ColumnSet), cls.__dict__.items())) return cls._column_set_map @classmethod - def _get_column_groups(cls) -> Dict[str, ColumnGroup]: + def _get_column_group_map(cls) -> Dict[str, ColumnGroup]: if cls._column_group_map is None: cls._column_group_map = dict(filter(lambda kv: isinstance(kv[1], ColumnGroup), cls.__dict__.items())) return cls._column_group_map diff --git a/src/pandandic/column.py 
b/src/pandandic/column.py index 6f595c2..06946b4 100755 --- a/src/pandandic/column.py +++ b/src/pandandic/column.py @@ -1,10 +1,17 @@ from dataclasses import dataclass -from typing import Type, Optional +from typing import Type, Any, Optional, Union + +from .defined_later import DefinedLater @dataclass class Column: - type: Type + type: Type = Any + alias: Optional[Union[str, DefinedLater, DefinedLater.__class__]] = None def __set_name__(self, _, name): self.name = name + + @property + def column_name(self) -> Union[str, DefinedLater, DefinedLater.__class__]: + return self.alias or self.name diff --git a/src/pandandic/column_alias_not_yet_defined_exception.py b/src/pandandic/column_alias_not_yet_defined_exception.py new file mode 100644 index 0000000..a6b5377 --- /dev/null +++ b/src/pandandic/column_alias_not_yet_defined_exception.py @@ -0,0 +1,9 @@ +from .column import Column + + +class ColumnAliasNotYetDefinedException(Exception): + def __init__(self, column: Column): + self._column = column + + def __str__(self) -> str: + return f"Error. Column with name {self._column.name} was used with an alias that is not defined." 
diff --git a/src/pandandic/column_group.py b/src/pandandic/column_group.py index 46e7e20..db037b1 100644 --- a/src/pandandic/column_group.py +++ b/src/pandandic/column_group.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List, Union from .column import Column @@ -8,3 +8,6 @@ @dataclass class ColumnGroup: members: List[Union[Column, ColumnSet]] + + def __set_name__(self, _, name): + self.name = name diff --git a/src/pandandic/column_set.py b/src/pandandic/column_set.py index 4f18fc8..18a1c36 100755 --- a/src/pandandic/column_set.py +++ b/src/pandandic/column_set.py @@ -1,11 +1,13 @@ from dataclasses import dataclass -from typing import Type, List +from typing import Type, List, Any, Union + +from .defined_later import DefinedLater @dataclass class ColumnSet: - type: Type # dtype applied to this set - members: List[str] # list of columns matched to this set + members: Union[List[str], DefinedLater, DefinedLater.__class__] # list of columns matched to this set + type: Type = Any # dtype applied to this set regex: bool = False # enables matching members by regex def __set_name__(self, _, name): diff --git a/src/pandandic/column_set_members_not_yet_defined_exception.py b/src/pandandic/column_set_members_not_yet_defined_exception.py new file mode 100644 index 0000000..3fd6402 --- /dev/null +++ b/src/pandandic/column_set_members_not_yet_defined_exception.py @@ -0,0 +1,9 @@ +from .column_set import ColumnSet + + +class ColumnSetMembersNotYetDefinedException(Exception): + def __init__(self, column_set: ColumnSet): + self._column_set = column_set + + def __str__(self) -> str: + return f"Error. ColumnSet with name {self._column_set.name} was used with a member list that is not defined." 
diff --git a/src/pandandic/defined_later.py b/src/pandandic/defined_later.py new file mode 100644 index 0000000..866e555 --- /dev/null +++ b/src/pandandic/defined_later.py @@ -0,0 +1,7 @@ +class DefinedLater: + """ + Denotes that a Column alias or ColumnSet member list will be defined dynamically at runtime. + If a read call is made before the DefinedLater is replaced, a ColumnAliasNotYetDefinedException or + ColumnSetMembersNotYetDefinedException will be thrown. + """ + ... diff --git a/test/unit/test_base_frame.py b/test/unit/test_base_frame.py index b80fee9..5f19593 100755 --- a/test/unit/test_base_frame.py +++ b/test/unit/test_base_frame.py @@ -9,9 +9,11 @@ import pandas as pd from pandas import DataFrame -from pandandic import BaseFrame, ColumnSet +from pandandic import BaseFrame, ColumnSet, DefinedLater from pandandic import Column +from pandandic.column_alias_not_yet_defined_exception import ColumnAliasNotYetDefinedException from pandandic.column_group_exception import ColumnGroupException +from pandandic.column_set_members_not_yet_defined_exception import ColumnSetMembersNotYetDefinedException class FooBarFrame(BaseFrame): @@ -406,3 +408,128 @@ def test_should_peek_cols(self): # assert self.assertListEqual(expected_columns, actual_columns) pd.testing.assert_frame_equal(expected_data, actual_data) + + def test_should_read_dynamic_column(self): + # arrange + class DynamicColumnFrame(BaseFrame): + foo = Column(type=int) + bar = Column() + + foo_column_name = "foob" + bar_column_name = "barb" + + data = pd.DataFrame( + columns=[foo_column_name, bar_column_name], + data=[ + [20, "baz"], + [21, "17.5"], + [21, "36"], + ] + ) + sut = DynamicColumnFrame() + DynamicColumnFrame.foo.alias = foo_column_name + DynamicColumnFrame.bar.alias = bar_column_name + + for name, file_type in self.supported_file_types.items(): + with self.subTest(name): + file_type.save_method(data, file_type.filename(), index=False) + # act + result = file_type.read_method(sut, 
file_type.filename()) + actual = result.to_df() + # assert + pd.testing.assert_frame_equal(data, actual) + pd.testing.assert_series_equal(data[foo_column_name], result.foo) + pd.testing.assert_series_equal(data[bar_column_name], result.bar) + + def test_should_raise_if_column_not_defined_yet(self): + # arrange + class DynamicColumnFrame(BaseFrame): + foo = Column(alias=DefinedLater) + + foo_column_name = "foob" + bar_column_name = "barb" + + data = pd.DataFrame( + columns=[foo_column_name, bar_column_name], + data=[ + [20, "baz"], + [21, "17.5"], + [21, "36"], + ] + ) + sut = DynamicColumnFrame() + + for name, file_type in self.supported_file_types.items(): + with self.subTest(name): + file_type.save_method(data, file_type.filename(), index=False) + # assert + with self.assertRaises(ColumnAliasNotYetDefinedException): + # act + file_type.read_method(sut, file_type.filename()) + # assert + with self.assertRaises(ColumnAliasNotYetDefinedException): + # act + _ = sut.foo + + def test_should_read_dynamic_column_set(self): + # arrange + class DynamicColumnFrame(BaseFrame): + foo = ColumnSet(members=DefinedLater) + bar = ColumnSet(type=int, members=DefinedLater) + + foo_column_names = ["foob", "fooc", "food"] + bar_column_names = ["barb", "barc"] + + data = pd.DataFrame( + columns=foo_column_names + bar_column_names, + data=[ + [20, "baz", 1, 2, 3], + [21, "17.5", 1, 2, 3], + [21, "36", 1, 2, 3], + ] + ) + sut = DynamicColumnFrame() + DynamicColumnFrame.foo.members = foo_column_names + DynamicColumnFrame.bar.members = bar_column_names + + for name, file_type in self.supported_file_types.items(): + with self.subTest(name): + file_type.save_method(data, file_type.filename(), index=False) + # act + result = file_type.read_method(sut, file_type.filename()) + actual = result.to_df() + # assert + pd.testing.assert_frame_equal(data, actual) + pd.testing.assert_frame_equal(data[foo_column_names], result.foo) + pd.testing.assert_frame_equal(data[bar_column_names], result.bar) + + 
def test_should_raise_if_column_set_not_defined_yet(self): + # arrange + class DynamicColumnFrame(BaseFrame): + foo = ColumnSet(members=DefinedLater) + bar = ColumnSet(type=int, members=DefinedLater) + + foo_column_names = ["foob", "fooc", "food"] + bar_column_names = ["barb", "barc"] + + data = pd.DataFrame( + columns=foo_column_names + bar_column_names, + data=[ + [20, "baz", 1, 2, 3], + [21, "17.5", 1, 2, 3], + [21, "36", 1, 2, 3], + ] + ) + sut = DynamicColumnFrame() + + for name, file_type in self.supported_file_types.items(): + with self.subTest(name): + file_type.save_method(data, file_type.filename(), index=False) + # assert + with self.assertRaises(ColumnSetMembersNotYetDefinedException): + # act + file_type.read_method(sut, file_type.filename()) + # assert + with self.assertRaises(ColumnSetMembersNotYetDefinedException): + # act + _ = sut.foo diff --git a/test/unit/test_column.py b/test/unit/test_column.py index 2eb26b2..edc2698 100755 --- a/test/unit/test_column.py +++ b/test/unit/test_column.py @@ -1,3 +1,4 @@ +from typing import Any from unittest import TestCase from pandandic import Column @@ -11,3 +12,20 @@ class Foo: bar = Column(type=str) # assert self.assertEqual("bar", Foo.bar.name) + + def test_should_not_modify_type_class_attribute(self): + # act + foo = Column() + foo.type = float + bar = Column() + # assert + self.assertEqual(bar.type, Any) + + def test_should_not_modify_alias_class_attribute(self): + # act + foo = Column() + foo.alias = "baz" + bar = Column() + # assert + self.assertEqual("baz", foo.alias) + self.assertIsNone(bar.alias)