fixed multi index in mapping

IndEcol · Feb 13, 2024 · aa026d0 · aa026d0
1 parent 5d11974
commit aa026d0
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 127 deletions.
diff --git a/pymrio/tools/ioutil.py b/pymrio/tools/ioutil.py
@@ -1004,7 +1004,7 @@ def _get_sample():
 
 
 
-def match_and_convert(df_orig, df_map, agg_func):
+def match_and_convert(df_orig, df_map, agg_func='sum'):
     """ Match and convert a DataFrame to a new classification
     
     Parameters
@@ -1031,6 +1031,7 @@ def match_and_convert(df_orig, df_map, agg_func):
         Some additional columns are possible, but not necessary:
 
         agg_func ... the aggregation function to use for multiple impact (summation by default)
+                    If given as a column here, it overrides the agg_func argument (or its default value)
         unit_orig ... the original unit (optional, for double check with an potential unit column in the original df)
         unit_new ... the new unit to be set as the unit column in the new df
 
@@ -1042,45 +1043,6 @@ def match_and_convert(df_orig, df_map, agg_func):
 
     """
 
-    # TESTS
-    to_char = pd.DataFrame(
-        data=5,
-        index=pd.MultiIndex.from_product([["em1", "em2"], ["air", "water"]]),
-        columns=pd.MultiIndex.from_product([["r1", "c1"], ["r2", "c2"]]),
-    )
-    to_char.columns.names = ["reg", "sec"]
-    to_char.index.names = ["em_type", "compart"] 
-
-    mapping = pd.DataFrame(
-        columns=["em_type", "compart", "total__em_type", "factor"],
-        data=[["em.*", "air|water", "total_regex", 2], 
-              ["em1", "air", "total_sum", 2], 
-              ["em1", "water", "total_sum", 2], 
-              ["em2", "air", "total_sum", 2], 
-              ["em2", "water", "total_sum", 2], 
-              ["em1", "air", "all_air", 0.5], 
-              ["em2", "air", "all_air", 0.5]],
-    )
-
-    exp_res = pd.DataFrame(
-        columns = to_char.columns,
-        index = ["total_regex", "total_sum", "all_air"])
-    exp_res.loc['all_air'] = to_char.loc[("em1", "air")] * 0.5 + to_char.loc[("em2", "air")] * 0.5
-    exp_res.loc['total_regex'] = (to_char.sum(axis=1) * 2).values
-    exp_res.loc['total_sum'] = (to_char.sum(axis=1) * 2).values
-    exp_res = exp_res.astype(float)
-    exp_res.sort_index(inplace=True)
-
-    res = match_and_convert(to_char, mapping, agg_func="sum")
-    res.sort_index(inplace=True)
-
-    exp_res.index.names = res.index.names
-    exp_res.columns.names = res.columns.names
-
-    df_map = mapping
-    df_orig = to_char
-    # TEST END
-
     new_col = [col for col in df_map.columns if "__" in col]
     unique_new_index = df_map.loc[:, new_col].value_counts()
 
@@ -1089,96 +1051,27 @@ def match_and_convert(df_orig, df_map, agg_func):
 
     # loop over each new impact/characterized value
     for char in unique_new_index.index:
-        # __import__('pdb').set_trace()
         if len(char) == 1:
             df_cur_map = df_map.loc[[char[0]]]
         else:
             df_cur_map = df_map.loc[[char]]
-        agg_method = df_cur_map.agg_func if 'agg_func' in df_cur_map.columns else 'sum'
+        agg_method = df_cur_map.agg_func if 'agg_func' in df_cur_map.columns else agg_func
         df_agg = pd.DataFrame(columns=df_orig.columns, index=df_cur_map.index, data=0)
         df_agg.index.names = [n.split('__')[0] for n in df_agg.index.names]
         collector = []
 
+        # the loop for getting all (potential) regex matches
         for row in df_cur_map.iterrows():
-            # find via regex match - can be multiple entries defined in one row
             matched_entries = index_fullmatch(df_ix=df_orig, **row[1].to_dict())
             mul_entries = matched_entries * row[1].factor
             aggregated = mul_entries.aggregate(agg_method, axis=0)
             collector.append(aggregated)
 
-        df_collected = pd.concat(collector, axis=0)
-        # FIX: - think about adding the index (right amount for after the aggregation)
-        # CONT:
-        df_collected.index = df_agg.index
 
-        res_collector.append(df_collected.groupby(by=df_collected.index.names).agg(agg_method))
+        df_collected = pd.concat(collector, axis=1).T
+        df_collected.index = np.repeat(df_cur_map.index.unique(), df_collected.shape[0])
 
+        res_collector.append(df_collected.groupby(by=df_collected.index.names).agg(agg_method))
 
     return pd.concat(res_collector, axis=0)
 
-
-
-def match_and_convert_legacy(df, factor=1, **kwargs):
-    """
-    OLD
-    Parameters
-    ----------
-
-    df: pd.DataFrame
-        The DataFrame to process.
-        Index levels must be named, all matching occurs on the index.
-
-    factor: float, optional
-        The factor to multiply the matching values with.
-        Default: 1
-
-    kwargs: One for each index level which should be matched.
-
-
-    """
-
-    factor = 1000
-
-    kwargs = dict(
-    stressor = r"emission_type.*",
-    compartment = r".*",
-    sector = r"food|mining",
-    rename_stressor = "ghg",
-    rename_compartment = "air",
-    rename_sector = None,
-    )
-
-    match_kwargs = {k:v for k,v in kwargs.items() if not k.startswith('rename_')}
-    rename_kwargs = {k:v for k,v in kwargs.items() if k.startswith('rename_')}
-
-    # emission_type1, emission_type2 - emission
-    # emission_type1, emission_type2 - ghg_type1, ghg_type2
-    # match = pymrio.index_match(df_ix=FF, stressor="emission_type.*")
-
-    match = pymrio.index_match(df_ix=FF, **match_kwargs)
-
-    for rename_idx_level, new_index_name in rename_kwargs.items():
-        if new_index_name:
-            idx_level = rename_idx_level.split('rename_')[1]
-            match = match.reset_index(idx_level)
-            match.loc[:, idx_level] = new_index_name
-            match = match.set_index(idx_level, append=True)
-
-    # CONT: find duplicates in index and aggregate
-    multi = match * factor
-    res = multi.agg(func='sum', axis=0) 
-    res
-
-    multi.groupby(level=['compartment', 'sector']).agg(func='sum')
-
-    import re
-
-    # write re.sub which converts: emission_type1, emission_type2 - emission
-    text = "emission_type1, emission_type2"
-    re.sub(r"emission", "ghg", text)
-
-    re.sub(r"\w+", "ghg", text)
-
-
-    pass
-
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -5,7 +5,6 @@
 import sys
 from collections import namedtuple
 from unittest.mock import mock_open, patch
-
 import numpy as np
 import numpy.testing as npt
 import pandas as pd
@@ -366,7 +365,9 @@ def test_char_table():
     to_char.columns.names = ["reg", "sec"]
     to_char.index.names = ["em_type", "compart"] 
 
-    mapping = pd.DataFrame(
+    # TEST1: with only impact (one index level in the result), sum over compartments
+
+    map_test1 = pd.DataFrame(
         columns=["em_type", "compart", "total__em_type", "factor"],
         data=[["em.*", "air|water", "total_regex", 2], 
 
@@ -379,21 +380,54 @@ def test_char_table():
               ["em2", "air", "all_air", 0.5]],
     )
 
-    exp_res = pd.DataFrame(
+    # alternative way to calculate the expected result
+    exp_res1 = pd.DataFrame(
         columns = to_char.columns,
         index = ["total_regex", "total_sum", "all_air"])
-    exp_res.loc['all_air'] = to_char.loc[("em1", "air")] * 0.5 + to_char.loc[("em2", "air")] * 0.5
-    exp_res.loc['total_regex'] = (to_char.sum(axis=1) * 2).values
-    exp_res.loc['total_sum'] = (to_char.sum(axis=1) * 2).values
-    exp_res = exp_res.astype(float)
-    exp_res.sort_index(inplace=True)
+    exp_res1.loc['all_air'] = to_char.loc[("em1", "air")] * 0.5 + to_char.loc[("em2", "air")] * 0.5
+    exp_res1.loc['total_regex'] = (to_char.sum(axis=1) * 2).values
+    exp_res1.loc['total_sum'] = (to_char.sum(axis=1) * 2).values
+    exp_res1 = exp_res1.astype(float)
+    exp_res1.sort_index(inplace=True)
+
+    res1 = match_and_convert(to_char, map_test1)
+    res1.sort_index(inplace=True)
+
+    exp_res1.index.names = res1.index.names
+    exp_res1.columns.names = res1.columns.names
+
+    pdt.assert_frame_equal(res1, exp_res1)
+
+    # TEST2 with impact per compartment (two index levels in the result)
+
+    map_test2 = pd.DataFrame(
+        columns=["em_type", "compart", "total__em_type", "compart__compart", "factor"],
+        data=[["em.*", "air|water", "total_regex", "all", 2], 
+              ["em1", "air", "total_sum", "all", 2], 
+              ["em1", "water", "total_sum", "all",  2], 
+              ["em2", "air", "total_sum", "all", 2], 
+              ["em2", "water", "total_sum", "all", 2], 
+              ["em1", "air", "all_air", "air", 0.5], 
+              ["em2", "air", "all_air", "air", 0.5]],
+    )
+
+    # alternative way to calculate the expected result
+    exp_res2 = pd.DataFrame(
+        columns = to_char.columns,
+        index = pd.MultiIndex.from_tuples(
+            [("total_regex", "all"), ("total_sum", "all"), ("all_air", "air")]))
+    exp_res2.loc[('all_air', 'air')] = to_char.loc[("em1", "air")] * 0.5 + to_char.loc[("em2", "air")] * 0.5
+    exp_res2.loc[('total_regex', 'all')] = (to_char.sum(axis=1) * 2).values
+    exp_res2.loc[('total_sum', 'all')] = (to_char.sum(axis=1) * 2).values
+    exp_res2 = exp_res2.astype(float)
+    exp_res2.sort_index(inplace=True)
 
-    res = match_and_convert(to_char, mapping, agg_func="sum")
-    res.sort_index(inplace=True)
+    res2 = match_and_convert(to_char, map_test2)
+    res2.sort_index(inplace=True)
 
-    exp_res.index.names = res.index.names
-    exp_res.columns.names = res.columns.names
+    exp_res2.index.names = res2.index.names
+    exp_res2.columns.names = res2.columns.names
 
-    pdt.assert_frame_equal(res, exp_res)
+    pdt.assert_frame_equal(res2, exp_res2)