Adds aliasing of Column and ColumnSet via DefinedLater
w-martin committed Oct 25, 2022
1 parent 0b9202d commit 27b2a7a
Showing 15 changed files with 304 additions and 35 deletions.
48 changes: 43 additions & 5 deletions README.md
@@ -1,6 +1,6 @@
# Pandandic
# pandandic

Pandandic is a library for documenting dataset schemas in code, by inheriting from a base class and assigning attributes for columns and column sets.
pandandic is a library for documenting dataset schemas in code, by inheriting from a base class and assigning attributes for columns and column sets.
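
For orientation, here is a minimal, hypothetical sketch of that pattern (the frame, column, and file names are illustrative; fuller examples appear further down):

```python
# Minimal illustrative sketch of the pandandic pattern described above.
import datetime

from pandandic import BaseFrame, Column, ColumnSet


class WeatherFrame(BaseFrame):
    date = Column(type=datetime.date)  # a single named column
    temperature = ColumnSet(type=float, members=["temp-0", "temp-1"])  # a fixed set of columns


df = WeatherFrame().read_csv("weather.csv")  # hypothetical file; the read returns the typed frame
print(df.temperature)  # DataFrame view of the column set members
```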

## Installation

@@ -21,6 +21,8 @@ Consider a project that reads data from several datasets, performs some preproce
The preprocessing must act on certain columns and so the team rightfully add constants in order to perform slicing on the input dataframes.
Two of these datasets share a column name.
One of the datasets consists of time series data, and each time the dataset is refreshed the number of columns changes.
This scenario presents several challenges: the processing logic must be structured in an adaptable way whilst keeping ownership of each schema clear.
Here is how `pandandic` helps:

1. **Schema ownership**: with `pandandic`, each schema has a corresponding class.
2. **Shared variables**: with `pandandic`, there are no shared constants. Each `BaseFrame` subclass is responsible for its own schema.
@@ -37,7 +39,6 @@ For both: `pip install pandandic[all]`, `poetry add "pandandic[all]"`
## What Doesn't It Do?

* **Validation**, save for what is built into pandas. For validation of defined types, please see other libraries such as pandera, dataenforce, strictly-typed-pandas (apologies for any I have missed).
* **Aliasing**: if columns should have different names, it shouldn't be the job of the schema to achieve that; this could lead to unclear behaviour and that's not what we want. Although, some degree of aliasing can be achieved through `ColumnGroup`, it isn't recommended.
* **Appending columns**: columns that should be captured by a `ColumnSet` but are appended to the object after calling `read_x` or `from_df` won't be captured. This can be solved by transforming to a dataframe and back again with `to_df` and `from_df` respectively (see the sketch after this list).
* **Dask**: although support may be added in future.
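
As referenced in the bullet above, a hedged sketch of the `to_df`/`from_df` workaround (the frame, column, and file names are hypothetical):

```python
# Illustrative sketch: re-capturing appended columns via to_df / from_df.
from pandandic import BaseFrame, Column, ColumnSet


class SensorFrame(BaseFrame):
    ref = Column(type=str)
    readings = ColumnSet(type=float, members=[r"reading-\d+"], regex=True)


df = SensorFrame().read_csv("sensors.csv")  # hypothetical file
df["reading-99"] = 0.0  # appended after the read, so not captured by the ColumnSet
df = SensorFrame().from_df(df.to_df())  # round-trip re-evaluates the schema
print(df.readings)  # now includes reading-99
```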

@@ -105,7 +106,7 @@ class TemperatureFrame(BaseFrame):


df = TemperatureFrame().read_csv("intermediate.csv")
df.set_index(TemperatureFrame.date.name, inplace=True)
df.set_index(TemperatureFrame.date.column_name, inplace=True) # name attribute also works here, but column_name is recommended
print(df.temperature)

```
@@ -129,6 +130,7 @@ This can be done as well with non-regex `ColumnSet`, in that case accessing the
import datetime
from pandandic import BaseFrame, Column, ColumnSet, ColumnGroup


class AdvancedFrame(BaseFrame):
"""
A Group can be used to group together multiple column groups and columns.
@@ -143,13 +145,49 @@ class AdvancedFrame(BaseFrame):
    numerical = ColumnGroup(members=[temperature, ref])
    time_series = ColumnGroup(members=[temperature, door_open])


df = AdvancedFrame().read_csv("advanced.csv")
df.set_index(AdvancedFrame.date.name, inplace=True)
df.set_index(AdvancedFrame.date.column_name, inplace=True) # name attribute also works here, but column_name is recommended
print(df.time_series)
```

`ColumnGroup` and `ColumnSet` attributes can be accessed on the instantiated object, and will return a `DataFrame` view of their members.
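
For instance, continuing the `AdvancedFrame` example above (an illustrative sketch reusing the same file):

```python
# Illustrative: set and group attributes on the instance return DataFrame views.
df = AdvancedFrame().read_csv("advanced.csv")

print(df.temperature)         # view of the columns matched by the temperature ColumnSet
print(df.numerical.columns)   # columns contributed by the temperature and ref members
print(df.time_series.head())  # combined view of the temperature and door_open members
```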

```python
# examples/expert_usage.py
import datetime

from pandandic import BaseFrame, Column, ColumnSet, ColumnGroup, DefinedLater


class ExpertFrame(BaseFrame):
"""
Aliasing can be used to dynamically set columns or column set members at runtime.
"""
date = Column(type=datetime.date, alias=DefinedLater)
metadata = ColumnSet(members=DefinedLater)

temperature = ColumnSet(type=float, members=["temperature-\d+"], regex=True)
door_open = ColumnSet(type=bool, members=["door-open-0", "door-open-1", "door-open-2"], regex=False)

time_series = ColumnGroup(members=[temperature, door_open])


# anything DefinedLater MUST be set before ExpertFrame reads or accesses a Column or ColumnSet via attribute
ExpertFrame.date.alias = "date"
ExpertFrame.metadata.members = ["comment", "ref"]

df = ExpertFrame().read_csv("advanced.csv")
df.set_index(ExpertFrame.date.column_name, inplace=True) # now sets index with the defined alias
print(df.metadata)

```

A `Column` alias can be set to `DefinedLater` to clearly document that it will be assigned dynamically at runtime.
The same is possible for `ColumnSet` members. This also adds a runtime check that the alias or members have been set before they are used.

**Warning**: if a `Column` alias is set, it will be used **regardless** of whether it exists in the data.
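
To illustrate that runtime check, a hedged sketch (the frame and member names are hypothetical; the exception classes are the ones added in this commit):

```python
# Illustrative: using a ColumnSet before resolving DefinedLater raises an exception.
from pandandic import (
    BaseFrame,
    Column,
    ColumnSet,
    ColumnSetMembersNotYetDefinedException,
    DefinedLater,
)


class GuardedFrame(BaseFrame):
    date = Column(alias=DefinedLater)
    metadata = ColumnSet(members=DefinedLater)


frame = GuardedFrame()
try:
    _ = frame.metadata  # members are still DefinedLater
except ColumnSetMembersNotYetDefinedException as e:
    print(e)  # explains that the member list has not yet been defined

GuardedFrame.metadata.members = ["comment", "ref"]  # resolve before use
GuardedFrame.date.alias = "date"  # likewise for the aliased column, which would otherwise raise ColumnAliasNotYetDefinedException on use
```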

## Class Diagram

```mermaid
2 changes: 1 addition & 1 deletion examples/advanced_usage.py
@@ -20,5 +20,5 @@ class AdvancedFrame(BaseFrame):


df = AdvancedFrame().read_csv(Path(__file__).parent.joinpath("advanced.csv").as_posix())
df.set_index(AdvancedFrame.date.name, inplace=True)
df.set_index(AdvancedFrame.date.column_name, inplace=True) # name attribute also works here, but column_name is recommended
print(df.time_series)
26 changes: 26 additions & 0 deletions examples/expert_usage.py
@@ -0,0 +1,26 @@
import datetime
from pathlib import Path

from pandandic import BaseFrame, Column, ColumnSet, ColumnGroup, DefinedLater


class ExpertFrame(BaseFrame):
"""
Aliasing can be used to dynamically set columns or column set members at runtime.
"""
date = Column(type=datetime.date, alias=DefinedLater)
metadata = ColumnSet(members=DefinedLater)

temperature = ColumnSet(type=float, members=["temperature-\d+"], regex=True)
door_open = ColumnSet(type=bool, members=["door-open-0", "door-open-1", "door-open-2"], regex=False)

time_series = ColumnGroup(members=[temperature, door_open])


# anything DefinedLater MUST be set before ExpertFrame reads or accesses a Column or ColumnSet via attribute
ExpertFrame.date.alias = "date"
ExpertFrame.metadata.members = ["comment", "ref"]

df = ExpertFrame().read_csv(Path(__file__).parent.joinpath("advanced.csv").as_posix())
df.set_index(ExpertFrame.date.column_name, inplace=True) # now sets index with the defined alias
print(df.metadata)
2 changes: 1 addition & 1 deletion examples/intermediate_usage.py
@@ -18,5 +18,5 @@ class TemperatureFrame(BaseFrame):


df = TemperatureFrame().read_csv(Path(__file__).parent.joinpath("intermediate.csv").as_posix())
df.set_index(TemperatureFrame.date.name, inplace=True)
df.set_index(TemperatureFrame.date.column_name, inplace=True) # name attribute also works here, but column_name is recommended
print(df.temperature)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pandandic"
version = "0.1.1a0"
version = "0.2.0"
description = "A typed dataframe helper"
license = "MIT"
authors = ["Will Martin <[email protected]>"]
3 changes: 3 additions & 0 deletions src/pandandic/__init__.py
@@ -3,3 +3,6 @@
from .column_set import ColumnSet
from .column_group import ColumnGroup
from .column_group_exception import ColumnGroupException
from .defined_later import DefinedLater
from .column_alias_not_yet_defined_exception import ColumnAliasNotYetDefinedException
from .column_set_members_not_yet_defined_exception import ColumnSetMembersNotYetDefinedException
60 changes: 40 additions & 20 deletions src/pandandic/base_frame.py
@@ -11,16 +11,20 @@
from pandas import DataFrame

from .column import Column
from .column_set import ColumnSet
from .column_group_exception import ColumnGroupException
from .column_alias_not_yet_defined_exception import ColumnAliasNotYetDefinedException
from .column_group import ColumnGroup
from .column_group_exception import ColumnGroupException
from .column_set import ColumnSet
from .column_set_members_not_yet_defined_exception import ColumnSetMembersNotYetDefinedException
from .defined_later import DefinedLater

if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self



class BaseFrame(DataFrame):
"""
Enabled schema-in-code through subclassing of DataFrame.
@@ -41,31 +45,40 @@ def __init__(self, *args, **kwargs):
        self.greedy_column_sets = False

        self._get_column_map()
        self._get_column_sets()
        self._get_column_groups()
        self._get_column_set_map()
        self._get_column_group_map()

        if len(args) > 0 and isinstance(args[0], DataFrame):
            self.from_df(args[0])

    def __getattribute__(self, item):
        if not item.startswith("_"):
            if self._column_map is not None and item in self._column_map:
                return self[self._column_map[item].name]
                column = self._column_map[item]
                if column.alias == DefinedLater or isinstance(column.alias, DefinedLater):
                    raise ColumnAliasNotYetDefinedException(column)
                return self[column.alias or column.name]

            if self._column_set_map is not None and item in self._column_set_map:
                return self[self._column_consumed_map[self._column_set_map[item].name]]
                column_set = self._column_set_map[item]
                if column_set.members == DefinedLater or isinstance(column_set.members, DefinedLater):
                    raise ColumnSetMembersNotYetDefinedException(column_set)
                return self[self._column_consumed_map[column_set.name]]

            if self._column_group_map is not None and item in self._column_group_map:
                return self[list(itertools.chain.from_iterable(
                    [[column_or_column_group.name]
                     if isinstance(column_or_column_group, Column)
                     else self._column_consumed_map[column_or_column_group.name]
                     for column_or_column_group in self._column_group_map[item].members]))]

        return super().__getattribute__(item)

    def to_df(self) -> DataFrame:
        return DataFrame(self)

    def with_enforced_types(self, validate: bool = True) -> Self:
        self.enforce_types = validate
    def with_enforced_types(self, enforce_types: bool = True) -> Self:
        self.enforce_types = enforce_types
        return self

    def with_enforced_columns(self, enforce_typed_columns: bool = True) -> Self:
@@ -80,7 +93,7 @@ def with_greedy_column_sets(self, greedy_column_sets: bool = True) -> Self:
        self.greedy_column_sets = greedy_column_sets
        return self

    def read_csv(self, *args, **kwargs) -> DataFrame:
    def read_csv(self, *args, **kwargs) -> Self:
        if self.enforce_columns or self.enforce_types:
            columns = self.read_csv_columns(*args, **kwargs)
            column_map = self._compute_column_map(columns)
@@ -99,7 +112,7 @@ def read_csv(self, *args, **kwargs) -> DataFrame:
        super().__init__(df)
        return self

    def read_excel(self, *args, **kwargs) -> DataFrame:
    def read_excel(self, *args, **kwargs) -> Self:
        if self.enforce_columns or self.enforce_types:
            columns = self.read_excel_columns(*args, **kwargs)
            column_map = self._compute_column_map(columns)
@@ -118,7 +131,7 @@ def read_excel(self, *args, **kwargs) -> DataFrame:
        super().__init__(df)
        return self

    def read_parquet(self, *args, **kwargs) -> DataFrame:
    def read_parquet(self, *args, **kwargs) -> Self:
        columns = self.read_parquet_columns(*args, **kwargs)
        column_map = self._compute_column_map(columns)

@@ -135,7 +148,7 @@ def read_parquet(self, *args, **kwargs) -> DataFrame:
        super().__init__(df)
        return self

    def read_avro(self, *args, **kwargs) -> DataFrame:
    def read_avro(self, *args, **kwargs) -> Self:
        from pandavro import read_avro

        columns = self.read_avro_columns(*args, **kwargs)
@@ -154,7 +167,7 @@ def read_avro(self, *args, **kwargs) -> DataFrame:
        super().__init__(df)
        return self

    def from_df(self, df) -> DataFrame:
    def from_df(self, df) -> Self:
        df = df.copy()

        columns = df.columns.tolist()
@@ -175,16 +188,23 @@ def from_df(self, df) -> DataFrame:
    def _compute_column_map(self, columns: List[str]) -> Dict[str, type]:
        self._column_consumed_map.clear()

        key_column_map = {column.name: column for column in self._get_column_map().values()}
        key_column_map = {(column.alias or column.name): column for column in self._get_column_map().values()}
        for alias, column in key_column_map.items():
            if alias == DefinedLater or isinstance(alias, DefinedLater):
                raise ColumnAliasNotYetDefinedException(column)

        if len(self._get_column_sets()) == 0:
        if len(self._get_column_set_map()) == 0:
            return {k: v.type for k, v in key_column_map.items()}

        column_bag = np.array([key_column_map[c] if c in key_column_map else None for c in columns])
        consumed_columns = ~np.equal(column_bag, None)

        exact_column_sets = list(filter(lambda column_set: not column_set.regex, self._get_column_sets().values()))
        regex_column_sets = list(filter(lambda column_set: column_set.regex, self._get_column_sets().values()))
        for key, column_set in self._get_column_set_map().items():
            if column_set.members == DefinedLater or isinstance(column_set.members, DefinedLater):
                raise ColumnSetMembersNotYetDefinedException(column_set)

        exact_column_sets = list(filter(lambda column_set: not column_set.regex, self._get_column_set_map().values()))
        regex_column_sets = list(filter(lambda column_set: column_set.regex, self._get_column_set_map().values()))

        for i, column in enumerate(columns):
            for column_set in exact_column_sets:
@@ -233,7 +253,7 @@ def _type_is_castable(column_type_tuple: Tuple[str, type]) -> bool:
    @staticmethod
    def _type_is_not_any(column_type_tuple: Tuple[str, type]) -> bool:
        _, t = column_type_tuple
        return t is not Any
        return t not in (Any, None)

    @staticmethod
    def _type_is_date(column_type_tuple: Tuple[str, type]) -> bool:
@@ -297,13 +317,13 @@ def _get_column_map(cls) -> Dict[str, Column]:
        return cls._column_map

    @classmethod
    def _get_column_sets(cls) -> Dict[str, ColumnSet]:
    def _get_column_set_map(cls) -> Dict[str, ColumnSet]:
        if cls._column_set_map is None:
            cls._column_set_map = dict(filter(lambda kv: isinstance(kv[1], ColumnSet), cls.__dict__.items()))
        return cls._column_set_map

    @classmethod
    def _get_column_groups(cls) -> Dict[str, ColumnGroup]:
    def _get_column_group_map(cls) -> Dict[str, ColumnGroup]:
        if cls._column_group_map is None:
            cls._column_group_map = dict(filter(lambda kv: isinstance(kv[1], ColumnGroup), cls.__dict__.items()))
        return cls._column_group_map
11 changes: 9 additions & 2 deletions src/pandandic/column.py
@@ -1,10 +1,17 @@
from dataclasses import dataclass
from typing import Type, Optional
from typing import Type, Any, Optional, Union

from .defined_later import DefinedLater


@dataclass
class Column:
    type: Type
    type: Type = Any
    alias: Optional[Union[str, DefinedLater, DefinedLater.__class__]] = None

    def __set_name__(self, _, name):
        self.name = name

    @property
    def column_name(self) -> Union[str, DefinedLater, DefinedLater.__class__]:
        return self.alias or self.name
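
For reference, a brief illustrative check of the new `column_name` property (the frame and alias names below are hypothetical):

```python
# Illustrative: column_name resolves to the alias when set, otherwise the attribute name.
import datetime

from pandandic import BaseFrame, Column


class SensorFrame(BaseFrame):
    date = Column(type=datetime.date, alias="measurement_date")
    ref = Column(type=str)


print(SensorFrame.date.name)         # "date" (attribute name, set via __set_name__)
print(SensorFrame.date.column_name)  # "measurement_date" (alias takes precedence)
print(SensorFrame.ref.column_name)   # "ref" (falls back to the attribute name)
```
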
9 changes: 9 additions & 0 deletions src/pandandic/column_alias_not_yet_defined_exception.py
@@ -0,0 +1,9 @@
from .column import Column


class ColumnAliasNotYetDefinedException(Exception):
    def __init__(self, column: Column):
        self._column = column

    def __str__(self) -> str:
        return f"Error. Column with name {self._column.name} was used with an alias that is not defined."
5 changes: 4 additions & 1 deletion src/pandandic/column_group.py
@@ -1,4 +1,4 @@
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import List, Union

from .column import Column
@@ -8,3 +8,6 @@
@dataclass
class ColumnGroup:
    members: List[Union[Column, ColumnSet]]

    def __set_name__(self, _, name):
        self.name = name
8 changes: 5 additions & 3 deletions src/pandandic/column_set.py
@@ -1,11 +1,13 @@
from dataclasses import dataclass
from typing import Type, List
from typing import Type, List, Any, Union

from .defined_later import DefinedLater


@dataclass
class ColumnSet:
    type: Type # dtype applied to this set
    members: List[str] # list of columns matched to this set
    members: Union[List[str], DefinedLater, DefinedLater.__class__] # list of columns matched to this set
    type: Type = Any # dtype applied to this set
    regex: bool = False # enables matching members by regex

    def __set_name__(self, _, name):
9 changes: 9 additions & 0 deletions src/pandandic/column_set_members_not_yet_defined_exception.py
@@ -0,0 +1,9 @@
from .column_set import ColumnSet


class ColumnSetMembersNotYetDefinedException(Exception):
    def __init__(self, column_set: ColumnSet):
        self._column_set = column_set

    def __str__(self) -> str:
        return f"Error. ColumnSet with name {self._column_set.name} was used with a member list that is not defined."
7 changes: 7 additions & 0 deletions src/pandandic/defined_later.py
@@ -0,0 +1,7 @@
class DefinedLater:
"""
Denotes that a Column alias or ColumnSet member list will be defined dynamically at runtime.
If a read call is made before the DefinedLater is replaced, a ColumnAliasNotYetDefinedException or
ColumnSetMembersNotYetDefinedException will be thrown.
"""
...
