[FEAT] Add support for exogenous variables in utils.aggregate (#297)
Co-authored-by: Olivier Sprangers <[email protected]>
3 people authored Oct 21, 2024
1 parent a9a5866 commit b5245d0
Showing 3 changed files with 130 additions and 26 deletions.
9 changes: 1 addition & 8 deletions CONTRIBUTING.md
@@ -1,12 +1,5 @@
# How to contribute

-## How to get started
-
-Before anything else, please install the git hooks that run automatic scripts during each commit and merge to strip the notebooks of superfluous metadata (and avoid merge conflicts). After cloning the repository, run the following command inside it:
-```
-nbdev_install_hooks
-```
-
## Did you find a bug?

* Ensure the bug was not already reported by searching on GitHub under Issues.
@@ -39,7 +32,7 @@ The repo comes with an `environment.yml` file which contains the libraries neede

Once you have `conda` go to the top level directory of the repository and run:
```
-conda env create -f environment.yml
+conda env update -f environment.yml
```

#### Install the library
33 changes: 30 additions & 3 deletions hierarchicalforecast/utils.py
@@ -8,8 +8,7 @@
import timeit
import warnings
from itertools import chain
-from typing import Callable, Dict, List, Optional, Iterable
-from collections.abc import Sequence
+from typing import Callable, Dict, List, Optional, Iterable, Union, Sequence

import matplotlib.pyplot as plt
import numpy as np
@@ -154,6 +153,7 @@ def join_upper(bottom_value):
def aggregate(
df: pd.DataFrame,
spec: List[List[str]],
exog_vars: Optional[Dict[str, Union[str, List[str]]]] = None,
is_balanced: bool = False,
sparse_s: bool = False,
):
@@ -167,6 +167,8 @@
Dataframe with columns `['ds', 'y']` and columns to aggregate.
spec : list of list of str
List of levels. Each element of the list should contain a list of columns of `df` to aggregate.
exog_vars : dict of str to str or list of str, optional (default=None)
Dictionary whose keys are column names of `df` and whose values are the aggregation function(s) applied to each column. Accepted values are the names of pandas aggregation functions; see the pandas documentation for guidance.
is_balanced : bool (default=False)
Deprecated.
sparse_s : bool (default=False)
@@ -190,14 +192,39 @@
"Don't set this argument to suppress this warning.",
category=DeprecationWarning,
)


# compute aggregations and tags
spec = sorted(spec, key=len)
bottom = spec[-1]
aggs = []
tags = {}
# Prepare the aggregation dictionary
agg_dict = {
"y": ("y", "sum")
}


# If exog_vars is provided, validate that its columns exist in df and add them to the aggregation dictionary
if exog_vars is not None:
missing_vars = [var for var in exog_vars.keys() if var not in df.columns]
if missing_vars:
raise ValueError(f"The following exogenous variables are not present in the DataFrame: {', '.join(missing_vars)}")
else:
# Update agg_dict to handle multiple aggregations for each exog_vars key
for key, agg_func in exog_vars.items():
# Ensure agg_func is a list
if isinstance(agg_func, str): # If it's a single string, convert to list
agg_func = [agg_func]
elif not isinstance(agg_func, list): # Raise an error if it's neither
raise ValueError(f"Aggregation functions for '{key}' must be a string or a list of strings.")

for func in agg_func:
agg_dict[f"{key}_{func}"] = (key, func) # Update the agg_dict with the new naming structure

# Perform the aggregation
for levels in spec:
-agg = df.groupby(levels + ['ds'], observed=True)['y'].sum()
+agg = df.groupby(levels + ['ds'], observed=True).agg(**agg_dict)
if not agg.index.is_monotonic_increasing:
agg = agg.sort_index()
agg = agg.reset_index('ds')
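To make the new aggregation path concrete, here is a minimal standalone sketch (plain pandas, not the library code itself) of how an `exog_vars` mapping is expanded into the named-aggregation dictionary that `groupby(...).agg(**agg_dict)` receives. The frame and the column names `price` and `promo` are illustrative assumptions, not part of the commit.

```python
import pandas as pd

# Illustrative data; 'price' and 'promo' stand in for real exogenous columns.
df = pd.DataFrame({
    "country": ["A", "A", "B", "B"],
    "ds": ["2020-01-01", "2020-01-02", "2020-01-01", "2020-01-02"],
    "y": [1.0, 2.0, 3.0, 4.0],
    "price": [10.0, 12.0, 20.0, 22.0],
    "promo": [0, 1, 1, 1],
})

exog_vars = {"price": "mean", "promo": ["sum", "max"]}

# Mirrors the logic added in this commit: 'y' is always summed, and each
# (column, function) pair becomes one named aggregation keyed '<column>_<function>'.
agg_dict = {"y": ("y", "sum")}
for col, funcs in exog_vars.items():
    funcs = [funcs] if isinstance(funcs, str) else funcs
    for func in funcs:
        agg_dict[f"{col}_{func}"] = (col, func)

# pandas named aggregation: each keyword argument becomes an output column.
agg = df.groupby(["country", "ds"], observed=True).agg(**agg_dict)
print(agg.columns.tolist())  # ['y', 'price_mean', 'promo_sum', 'promo_max']
```

Each key of `agg_dict` becomes an output column, so a list of functions for an exogenous column yields one `<column>_<function>` column per function.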
114 changes: 99 additions & 15 deletions nbs/utils.ipynb
@@ -43,8 +43,7 @@
"import timeit\n",
"import warnings\n",
"from itertools import chain\n",
"from typing import Callable, Dict, List, Optional, Iterable\n",
"from collections.abc import Sequence\n",
"from typing import Callable, Dict, List, Optional, Iterable, Union, Sequence\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
@@ -267,6 +266,7 @@
"def aggregate(\n",
" df: pd.DataFrame,\n",
" spec: List[List[str]],\n",
" exog_vars: Optional[Dict[str, Union[str, List[str]]]] = None,\n",
" is_balanced: bool = False,\n",
" sparse_s: bool = False,\n",
"):\n",
@@ -280,6 +280,8 @@
" Dataframe with columns `['ds', 'y']` and columns to aggregate.\n",
" spec : list of list of str\n",
" List of levels. Each element of the list should contain a list of columns of `df` to aggregate.\n",
" exog_vars: dictionary of string keys & values that can either be a list of strings or a single string\n",
" keys correspond to column names and the values represent the aggregation(s) that will be applied to each column. Accepted values are those from Pandas aggregation Functions, check the Pandas docs for guidance\n",
" is_balanced : bool (default=False)\n",
" Deprecated.\n",
" sparse_s : bool (default=False)\n",
@@ -303,14 +305,39 @@
" \"Don't set this argument to suppress this warning.\",\n",
" category=DeprecationWarning,\n",
" )\n",
" \n",
" \n",
" # compute aggregations and tags\n",
" spec = sorted(spec, key=len)\n",
" bottom = spec[-1]\n",
" aggs = []\n",
" tags = {}\n",
" # Prepare the aggregation dictionary\n",
" agg_dict = {\n",
" \"y\": (\"y\", \"sum\")\n",
" }\n",
"\n",
"\n",
" # Check if exog_vars are present in df & add to the aggregation dictionary if it is not None\n",
" if exog_vars is not None:\n",
" missing_vars = [var for var in exog_vars.keys() if var not in df.columns]\n",
" if missing_vars:\n",
" raise ValueError(f\"The following exogenous variables are not present in the DataFrame: {', '.join(missing_vars)}\") \n",
" else:\n",
" # Update agg_dict to handle multiple aggregations for each exog_vars key\n",
" for key, agg_func in exog_vars.items():\n",
" # Ensure agg_func is a list\n",
" if isinstance(agg_func, str): # If it's a single string, convert to list\n",
" agg_func = [agg_func]\n",
" elif not isinstance(agg_func, list): # Raise an error if it's neither\n",
" raise ValueError(f\"Aggregation functions for '{key}' must be a string or a list of strings.\")\n",
" \n",
" for func in agg_func:\n",
" agg_dict[f\"{key}_{func}\"] = (key, func) # Update the agg_dict with the new naming structure\n",
"\n",
" # Perform the aggregation\n",
" for levels in spec:\n",
" agg = df.groupby(levels + ['ds'], observed=True)['y'].sum()\n",
" agg = df.groupby(levels + ['ds'], observed=True).agg(**agg_dict)\n",
" if not agg.index.is_monotonic_increasing:\n",
" agg = agg.sort_index()\n",
" agg = agg.reset_index('ds')\n",
@@ -1213,21 +1240,78 @@
{
"cell_type": "code",
"execution_count": null,
"id": "afc421ba",
"id": "ddd3fec9",
"metadata": {},
"outputs": [],
"source": [
"samples_to_quantiles_df(samples, unique_ids, dates, level=level)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a9b5ddbc",
"metadata": {},
"outputs": [],
"source": [
"samples_to_quantiles_df(samples, unique_ids, dates, quantiles=quantiles)"
"#| hide\n",
"\n",
"#Unit test for the aggregate function accounting for exog_vars\n",
"\n",
"df = pd.DataFrame(\n",
" {\n",
" 'cat1': ['a', 'a', 'c'],\n",
" 'cat2': ['1', '2', '3'],\n",
" 'exog1': [4, 5, 6],\n",
" 'exog2': [7, 6, 5],\n",
" 'y': [10, 20, 30],\n",
" 'ds': ['2020-01-01', '2020-02-01', '2020-03-01']\n",
" }\n",
")\n",
"spec = [[\"cat1\"],[\"cat1\",\"cat2\"]]\n",
"\n",
"\n",
"Y_df_check = pd.DataFrame(\n",
" data={\n",
" \n",
" 'ds': ['2020-01-01','2020-02-01','2020-03-01','2020-01-01','2020-02-01','2020-03-01'],\n",
" 'y': [10, 20, 30, 10, 20, 30],\n",
" },\n",
" index=['a', 'a', 'c', 'a/1', 'a/2', 'c/3'],\n",
")\n",
"Y_df_check.index.name = 'unique_id'\n",
"\n",
"S_df_check = pd.DataFrame(\n",
" data={\n",
" 'a/1': np.array([1.0, 0.0, 1.0, 0.0, 0.0], dtype=np.float32),\n",
" 'a/2': np.array([1.0, 0.0, 0.0, 1.0, 0.0], dtype=np.float32),\n",
" 'c/3': np.array([0.0, 1.0, 0.0, 0.0, 1.0], dtype=np.float32)\n",
" },\n",
" index=['a', 'c', 'a/1', 'a/2', 'c/3']\n",
")\n",
"\n",
"\n",
"Y_df_check_exog = pd.DataFrame(\n",
" data = {\n",
" 'ds': ['2020-01-01', '2020-02-01', '2020-03-01', '2020-01-01', '2020-02-01', '2020-03-01'],\n",
" 'y': [10, 20, 30, 10, 20, 30],\n",
" 'exog1_mean': [4.0, 5.0, 6.0, 4.0, 5.0, 6.0],\n",
" 'exog2_sum': [7, 6, 5, 7, 6, 5]\n",
" },\n",
" index = ['a', 'a', 'c', 'a/1', 'a/2', 'c/3']\n",
")\n",
"Y_df_check_exog.index.name = 'unique_id'\n",
"\n",
"Y_df, S_df, tags = aggregate(\n",
" df = df,\n",
" spec = spec,\n",
" exog_vars = None,\n",
")\n",
"\n",
"Y_df_exog, S_df_exog, tags = aggregate(\n",
" df = df,\n",
" spec = spec,\n",
" exog_vars = {'exog1':'mean','exog2':'sum'},\n",
")\n",
"\n",
"test_eq(Y_df, \n",
" Y_df_check)\n",
"\n",
"test_eq(S_df, \n",
" S_df_check)\n",
"\n",
"test_eq(Y_df_exog,\n",
" Y_df_check_exog)"
]
}
],
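As a usage note, here is a hedged sketch of calling `aggregate` with the new argument, reusing the toy frame from the unit test above and additionally requesting a list of functions for `exog1` (the docstring allows a single string or a list of strings per column). Given the `{key}_{func}` naming in the implementation, the output is expected to gain `exog1_mean`, `exog1_max`, and `exog2_sum` columns; the choice of functions here is illustrative only.

```python
import pandas as pd
from hierarchicalforecast.utils import aggregate

# Same toy data as the unit test above.
df = pd.DataFrame({
    "cat1": ["a", "a", "c"],
    "cat2": ["1", "2", "3"],
    "exog1": [4, 5, 6],
    "exog2": [7, 6, 5],
    "y": [10, 20, 30],
    "ds": ["2020-01-01", "2020-02-01", "2020-03-01"],
})
spec = [["cat1"], ["cat1", "cat2"]]

# A single string or a list of strings is accepted per exogenous column.
Y_df, S_df, tags = aggregate(
    df=df,
    spec=spec,
    exog_vars={"exog1": ["mean", "max"], "exog2": "sum"},
)

# Expected columns alongside 'ds' and 'y': 'exog1_mean', 'exog1_max', 'exog2_sum'
print(Y_df.columns.tolist())
```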
