[FEAT] Add support for exogenous variables in utils.aggregate (#297)
Co-authored-by: Olivier Sprangers <[email protected]>
3 people authored Oct 21, 2024
1 parent a9a5866 commit b5245d0
Showing 3 changed files with 130 additions and 26 deletions.
9 changes: 1 addition & 8 deletions CONTRIBUTING.md
@@ -1,12 +1,5 @@
# How to contribute

-## How to get started
-
-Before anything else, please install the git hooks that run automatic scripts during each commit and merge to strip the notebooks of superfluous metadata (and avoid merge conflicts). After cloning the repository, run the following command inside it:
-```
-nbdev_install_hooks
-```
-
## Did you find a bug?

* Ensure the bug was not already reported by searching on GitHub under Issues.
@@ -39,7 +32,7 @@ The repo comes with an `environment.yml` file which contains the libraries neede

Once you have `conda` go to the top level directory of the repository and run:
```
-conda env create -f environment.yml
+conda env update -f environment.yml
```

#### Install the library
33 changes: 30 additions & 3 deletions hierarchicalforecast/utils.py
@@ -8,8 +8,7 @@
import timeit
import warnings
from itertools import chain
-from typing import Callable, Dict, List, Optional, Iterable
-from collections.abc import Sequence
+from typing import Callable, Dict, List, Optional, Iterable, Union, Sequence

import matplotlib.pyplot as plt
import numpy as np
@@ -154,6 +153,7 @@ def join_upper(bottom_value):
def aggregate(
df: pd.DataFrame,
spec: List[List[str]],
exog_vars: Optional[Dict[str, Union[str, List[str]]]] = None,
is_balanced: bool = False,
sparse_s: bool = False,
):
@@ -167,6 +167,8 @@
Dataframe with columns `['ds', 'y']` and columns to aggregate.
spec : list of list of str
List of levels. Each element of the list should contain a list of columns of `df` to aggregate.
exog_vars : dict of str to str or list of str, optional (default=None)
Dictionary whose keys are column names of `df` and whose values are the aggregation function(s) applied to each column. Accepted values are the names of pandas aggregation functions; see the pandas documentation for guidance.
is_balanced : bool (default=False)
Deprecated.
sparse_s : bool (default=False)
@@ -190,14 +192,39 @@
"Don't set this argument to suppress this warning.",
category=DeprecationWarning,
)


# compute aggregations and tags
spec = sorted(spec, key=len)
bottom = spec[-1]
aggs = []
tags = {}
# Prepare the aggregation dictionary
agg_dict = {
"y": ("y", "sum")
}


# If exog_vars is provided, validate that its columns exist in df and add them to the aggregation dictionary
if exog_vars is not None:
missing_vars = [var for var in exog_vars.keys() if var not in df.columns]
if missing_vars:
raise ValueError(f"The following exogenous variables are not present in the DataFrame: {', '.join(missing_vars)}")
else:
# Update agg_dict to handle multiple aggregations for each exog_vars key
for key, agg_func in exog_vars.items():
# Ensure agg_func is a list
if isinstance(agg_func, str): # If it's a single string, convert to list
agg_func = [agg_func]
elif not isinstance(agg_func, list): # Raise an error if it's neither
raise ValueError(f"Aggregation functions for '{key}' must be a string or a list of strings.")

for func in agg_func:
agg_dict[f"{key}_{func}"] = (key, func) # Update the agg_dict with the new naming structure

# Perform the aggregation
for levels in spec:
-agg = df.groupby(levels + ['ds'], observed=True)['y'].sum()
+agg = df.groupby(levels + ['ds'], observed=True).agg(**agg_dict)
if not agg.index.is_monotonic_increasing:
agg = agg.sort_index()
agg = agg.reset_index('ds')
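To make the new aggregation path concrete, here is a minimal standalone sketch (plain pandas, not the library code itself) of how an `exog_vars` mapping is expanded into the named-aggregation dictionary that `groupby(...).agg(**agg_dict)` receives. The frame and the column names `price` and `promo` are illustrative assumptions, not part of the commit.

```python
import pandas as pd

# Illustrative data; 'price' and 'promo' stand in for real exogenous columns.
df = pd.DataFrame({
    "country": ["A", "A", "B", "B"],
    "ds": ["2020-01-01", "2020-01-02", "2020-01-01", "2020-01-02"],
    "y": [1.0, 2.0, 3.0, 4.0],
    "price": [10.0, 12.0, 20.0, 22.0],
    "promo": [0, 1, 1, 1],
})

exog_vars = {"price": "mean", "promo": ["sum", "max"]}

# Mirrors the logic added in this commit: 'y' is always summed, and each
# (column, function) pair becomes one named aggregation keyed '<column>_<function>'.
agg_dict = {"y": ("y", "sum")}
for col, funcs in exog_vars.items():
    funcs = [funcs] if isinstance(funcs, str) else funcs
    for func in funcs:
        agg_dict[f"{col}_{func}"] = (col, func)

# pandas named aggregation: each keyword argument becomes an output column.
agg = df.groupby(["country", "ds"], observed=True).agg(**agg_dict)
print(agg.columns.tolist())  # ['y', 'price_mean', 'promo_sum', 'promo_max']
```

Each key of `agg_dict` becomes an output column, so a list of functions for an exogenous column yields one `<column>_<function>` column per function.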
114 changes: 99 additions & 15 deletions nbs/utils.ipynb
@@ -43,8 +43,7 @@
"import timeit\n",
"import warnings\n",
"from itertools import chain\n",
"from typing import Callable, Dict, List, Optional, Iterable\n",
"from collections.abc import Sequence\n",
"from typing import Callable, Dict, List, Optional, Iterable, Union, Sequence\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
@@ -267,6 +266,7 @@
"def aggregate(\n",
" df: pd.DataFrame,\n",
" spec: List[List[str]],\n",
" exog_vars: Optional[Dict[str, Union[str, List[str]]]] = None,\n",
" is_balanced: bool = False,\n",
" sparse_s: bool = False,\n",
"):\n",
@@ -280,6 +280,8 @@
" Dataframe with columns `['ds', 'y']` and columns to aggregate.\n",
" spec : list of list of str\n",
" List of levels. Each element of the list should contain a list of columns of `df` to aggregate.\n",
" exog_vars: dictionary of string keys & values that can either be a list of strings or a single string\n",
" keys correspond to column names and the values represent the aggregation(s) that will be applied to each column. Accepted values are those from Pandas aggregation Functions, check the Pandas docs for guidance\n",
" is_balanced : bool (default=False)\n",
" Deprecated.\n",
" sparse_s : bool (default=False)\n",
@@ -303,14 +305,39 @@
" \"Don't set this argument to suppress this warning.\",\n",
" category=DeprecationWarning,\n",
" )\n",
" \n",
" \n",
" # compute aggregations and tags\n",
" spec = sorted(spec, key=len)\n",
" bottom = spec[-1]\n",
" aggs = []\n",
" tags = {}\n",
" # Prepare the aggregation dictionary\n",
" agg_dict = {\n",
" \"y\": (\"y\", \"sum\")\n",
" }\n",
"\n",
"\n",
" # Check if exog_vars are present in df & add to the aggregation dictionary if it is not None\n",
" if exog_vars is not None:\n",
" missing_vars = [var for var in exog_vars.keys() if var not in df.columns]\n",
" if missing_vars:\n",
" raise ValueError(f\"The following exogenous variables are not present in the DataFrame: {', '.join(missing_vars)}\") \n",
" else:\n",
" # Update agg_dict to handle multiple aggregations for each exog_vars key\n",
" for key, agg_func in exog_vars.items():\n",
" # Ensure agg_func is a list\n",
" if isinstance(agg_func, str): # If it's a single string, convert to list\n",
" agg_func = [agg_func]\n",
" elif not isinstance(agg_func, list): # Raise an error if it's neither\n",
" raise ValueError(f\"Aggregation functions for '{key}' must be a string or a list of strings.\")\n",
" \n",
" for func in agg_func:\n",
" agg_dict[f\"{key}_{func}\"] = (key, func) # Update the agg_dict with the new naming structure\n",
"\n",
" # Perform the aggregation\n",
" for levels in spec:\n",
" agg = df.groupby(levels + ['ds'], observed=True)['y'].sum()\n",
" agg = df.groupby(levels + ['ds'], observed=True).agg(**agg_dict)\n",
" if not agg.index.is_monotonic_increasing:\n",
" agg = agg.sort_index()\n",
" agg = agg.reset_index('ds')\n",
@@ -1213,21 +1240,78 @@
{
"cell_type": "code",
"execution_count": null,
"id": "afc421ba",
"id": "ddd3fec9",
"metadata": {},
"outputs": [],
"source": [
"samples_to_quantiles_df(samples, unique_ids, dates, level=level)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a9b5ddbc",
"metadata": {},
"outputs": [],
"source": [
"samples_to_quantiles_df(samples, unique_ids, dates, quantiles=quantiles)"
"#| hide\n",
"\n",
"#Unit test for the aggregate function accounting for exog_vars\n",
"\n",
"df = pd.DataFrame(\n",
" {\n",
" 'cat1': ['a', 'a', 'c'],\n",
" 'cat2': ['1', '2', '3'],\n",
" 'exog1': [4, 5, 6],\n",
" 'exog2': [7, 6, 5],\n",
" 'y': [10, 20, 30],\n",
" 'ds': ['2020-01-01', '2020-02-01', '2020-03-01']\n",
" }\n",
")\n",
"spec = [[\"cat1\"],[\"cat1\",\"cat2\"]]\n",
"\n",
"\n",
"Y_df_check = pd.DataFrame(\n",
" data={\n",
" \n",
" 'ds': ['2020-01-01','2020-02-01','2020-03-01','2020-01-01','2020-02-01','2020-03-01'],\n",
" 'y': [10, 20, 30, 10, 20, 30],\n",
" },\n",
" index=['a', 'a', 'c', 'a/1', 'a/2', 'c/3'],\n",
")\n",
"Y_df_check.index.name = 'unique_id'\n",
"\n",
"S_df_check = pd.DataFrame(\n",
" data={\n",
" 'a/1': np.array([1.0, 0.0, 1.0, 0.0, 0.0], dtype=np.float32),\n",
" 'a/2': np.array([1.0, 0.0, 0.0, 1.0, 0.0], dtype=np.float32),\n",
" 'c/3': np.array([0.0, 1.0, 0.0, 0.0, 1.0], dtype=np.float32)\n",
" },\n",
" index=['a', 'c', 'a/1', 'a/2', 'c/3']\n",
")\n",
"\n",
"\n",
"Y_df_check_exog = pd.DataFrame(\n",
" data = {\n",
" 'ds': ['2020-01-01', '2020-02-01', '2020-03-01', '2020-01-01', '2020-02-01', '2020-03-01'],\n",
" 'y': [10, 20, 30, 10, 20, 30],\n",
" 'exog1_mean': [4.0, 5.0, 6.0, 4.0, 5.0, 6.0],\n",
" 'exog2_sum': [7, 6, 5, 7, 6, 5]\n",
" },\n",
" index = ['a', 'a', 'c', 'a/1', 'a/2', 'c/3']\n",
")\n",
"Y_df_check_exog.index.name = 'unique_id'\n",
"\n",
"Y_df, S_df, tags = aggregate(\n",
" df = df,\n",
" spec = spec,\n",
" exog_vars = None,\n",
")\n",
"\n",
"Y_df_exog, S_df_exog, tags = aggregate(\n",
" df = df,\n",
" spec = spec,\n",
" exog_vars = {'exog1':'mean','exog2':'sum'},\n",
")\n",
"\n",
"test_eq(Y_df, \n",
" Y_df_check)\n",
"\n",
"test_eq(S_df, \n",
" S_df_check)\n",
"\n",
"test_eq(Y_df_exog,\n",
" Y_df_check_exog)"
]
}
],
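As a usage note, here is a hedged sketch of calling `aggregate` with the new argument, reusing the toy frame from the unit test above and additionally requesting a list of functions for `exog1` (the docstring allows a single string or a list of strings per column). Given the `{key}_{func}` naming in the implementation, the output is expected to gain `exog1_mean`, `exog1_max`, and `exog2_sum` columns; the choice of functions here is illustrative only.

```python
import pandas as pd
from hierarchicalforecast.utils import aggregate

# Same toy data as the unit test above.
df = pd.DataFrame({
    "cat1": ["a", "a", "c"],
    "cat2": ["1", "2", "3"],
    "exog1": [4, 5, 6],
    "exog2": [7, 6, 5],
    "y": [10, 20, 30],
    "ds": ["2020-01-01", "2020-02-01", "2020-03-01"],
})
spec = [["cat1"], ["cat1", "cat2"]]

# A single string or a list of strings is accepted per exogenous column.
Y_df, S_df, tags = aggregate(
    df=df,
    spec=spec,
    exog_vars={"exog1": ["mean", "max"], "exog2": "sum"},
)

# Expected columns alongside 'ds' and 'y': 'exog1_mean', 'exog1_max', 'exog2_sum'
print(Y_df.columns.tolist())
```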
