diff --git a/pymrio/tools/ioutil.py b/pymrio/tools/ioutil.py index 6fc25d4e..6a1b7f11 100644 --- a/pymrio/tools/ioutil.py +++ b/pymrio/tools/ioutil.py @@ -1004,7 +1004,7 @@ def _get_sample(): -def match_and_convert(df_orig, df_map, agg_func): +def match_and_convert(df_orig, df_map, agg_func='sum'): """ Match and convert a DataFrame to a new classification Parameters @@ -1031,6 +1031,7 @@ def match_and_convert(df_orig, df_map, agg_func): Some additional columns are possible, but not necessary: agg_func ... the aggregation function to use for multiple impact (summation by default) + If passed as a column here, that overrides the default value/value passed as argument unit_orig ... the original unit (optional, for double check with an potential unit column in the original df) unit_new ... the new unit to be set as the unit column in the new df @@ -1042,45 +1043,6 @@ def match_and_convert(df_orig, df_map, agg_func): """ - # TESTS - to_char = pd.DataFrame( - data=5, - index=pd.MultiIndex.from_product([["em1", "em2"], ["air", "water"]]), - columns=pd.MultiIndex.from_product([["r1", "c1"], ["r2", "c2"]]), - ) - to_char.columns.names = ["reg", "sec"] - to_char.index.names = ["em_type", "compart"] - - mapping = pd.DataFrame( - columns=["em_type", "compart", "total__em_type", "factor"], - data=[["em.*", "air|water", "total_regex", 2], - ["em1", "air", "total_sum", 2], - ["em1", "water", "total_sum", 2], - ["em2", "air", "total_sum", 2], - ["em2", "water", "total_sum", 2], - ["em1", "air", "all_air", 0.5], - ["em2", "air", "all_air", 0.5]], - ) - - exp_res = pd.DataFrame( - columns = to_char.columns, - index = ["total_regex", "total_sum", "all_air"]) - exp_res.loc['all_air'] = to_char.loc[("em1", "air")] * 0.5 + to_char.loc[("em2", "air")] * 0.5 - exp_res.loc['total_regex'] = (to_char.sum(axis=1) * 2).values - exp_res.loc['total_sum'] = (to_char.sum(axis=1) * 2).values - exp_res = exp_res.astype(float) - exp_res.sort_index(inplace=True) - - res = match_and_convert(to_char, 
mapping, agg_func="sum") - res.sort_index(inplace=True) - - exp_res.index.names = res.index.names - exp_res.columns.names = res.columns.names - - df_map = mapping - df_orig = to_char - # TEST END - new_col = [col for col in df_map.columns if "__" in col] unique_new_index = df_map.loc[:, new_col].value_counts() @@ -1089,96 +1051,27 @@ def match_and_convert(df_orig, df_map, agg_func): # loop over each new impact/characterized value for char in unique_new_index.index: - # __import__('pdb').set_trace() if len(char) == 1: df_cur_map = df_map.loc[[char[0]]] else: df_cur_map = df_map.loc[[char]] - agg_method = df_cur_map.agg_func if 'agg_func' in df_cur_map.columns else 'sum' + agg_method = df_cur_map.agg_func if 'agg_func' in df_cur_map.columns else agg_func df_agg = pd.DataFrame(columns=df_orig.columns, index=df_cur_map.index, data=0) df_agg.index.names = [n.split('__')[0] for n in df_agg.index.names] collector = [] + # the loop for getting all (potential) regex matches for row in df_cur_map.iterrows(): - # find via regex match - can be multiple entries defined in one row matched_entries = index_fullmatch(df_ix=df_orig, **row[1].to_dict()) mul_entries = matched_entries * row[1].factor aggregated = mul_entries.aggregate(agg_method, axis=0) collector.append(aggregated) - df_collected = pd.concat(collector, axis=0) - # FIX: - think about adding the index (right amount for after the aggregation) - # CONT: - df_collected.index = df_agg.index - res_collector.append(df_collected.groupby(by=df_collected.index.names).agg(agg_method)) + df_collected = pd.concat(collector, axis=1).T + df_collected.index = np.repeat(df_cur_map.index.unique(), df_collected.shape[0]) + res_collector.append(df_collected.groupby(by=df_collected.index.names).agg(agg_method)) return pd.concat(res_collector, axis=0) - - -def match_and_convert_legacy(df, factor=1, **kwargs): - """ - OLD - Parameters - ---------- - - df: pd.DataFrame - The DataFrame to process. 
- Index levels must be named, all matching occurs on the index. - - factor: float, optional - The factor to multiply the matching values with. - Default: 1 - - kwargs: One for each index level which should be matched. - - - """ - - factor = 1000 - - kwargs = dict( - stressor = r"emission_type.*", - compartment = r".*", - sector = r"food|mining", - rename_stressor = "ghg", - rename_compartment = "air", - rename_sector = None, - ) - - match_kwargs = {k:v for k,v in kwargs.items() if not k.startswith('rename_')} - rename_kwargs = {k:v for k,v in kwargs.items() if k.startswith('rename_')} - - # emission_type1, emission_type2 - emission - # emission_type1, emission_type2 - ghg_type1, ghg_type2 - # match = pymrio.index_match(df_ix=FF, stressor="emission_type.*") - - match = pymrio.index_match(df_ix=FF, **match_kwargs) - - for rename_idx_level, new_index_name in rename_kwargs.items(): - if new_index_name: - idx_level = rename_idx_level.split('rename_')[1] - match = match.reset_index(idx_level) - match.loc[:, idx_level] = new_index_name - match = match.set_index(idx_level, append=True) - - # CONT: find duplicates in index and aggregate - multi = match * factor - res = multi.agg(func='sum', axis=0) - res - - multi.groupby(level=['compartment', 'sector']).agg(func='sum') - - import re - - # write re.sub which converts: emission_type1, emission_type2 - emission - text = "emission_type1, emission_type2" - re.sub(r"emission", "ghg", text) - - re.sub(r"\w+", "ghg", text) - - - pass - diff --git a/tests/test_util.py b/tests/test_util.py index f5b8175e..1d5f1fd3 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -5,7 +5,6 @@ import sys from collections import namedtuple from unittest.mock import mock_open, patch - import numpy as np import numpy.testing as npt import pandas as pd @@ -366,7 +365,9 @@ def test_char_table(): to_char.columns.names = ["reg", "sec"] to_char.index.names = ["em_type", "compart"] - mapping = pd.DataFrame( + # TEST1: with only impact (one index 
level in the result), sum over compartments + + map_test1 = pd.DataFrame( columns=["em_type", "compart", "total__em_type", "factor"], data=[["em.*", "air|water", "total_regex", 2], @@ -379,21 +380,54 @@ def test_char_table(): ["em2", "air", "all_air", 0.5]], ) - exp_res = pd.DataFrame( + # alternative way to calculate the expected result + exp_res1 = pd.DataFrame( columns = to_char.columns, index = ["total_regex", "total_sum", "all_air"]) - exp_res.loc['all_air'] = to_char.loc[("em1", "air")] * 0.5 + to_char.loc[("em2", "air")] * 0.5 - exp_res.loc['total_regex'] = (to_char.sum(axis=1) * 2).values - exp_res.loc['total_sum'] = (to_char.sum(axis=1) * 2).values - exp_res = exp_res.astype(float) - exp_res.sort_index(inplace=True) + exp_res1.loc['all_air'] = to_char.loc[("em1", "air")] * 0.5 + to_char.loc[("em2", "air")] * 0.5 + exp_res1.loc['total_regex'] = (to_char.sum(axis=1) * 2).values + exp_res1.loc['total_sum'] = (to_char.sum(axis=1) * 2).values + exp_res1 = exp_res1.astype(float) + exp_res1.sort_index(inplace=True) + + res1 = match_and_convert(to_char, map_test1) + res1.sort_index(inplace=True) + + exp_res1.index.names = res1.index.names + exp_res1.columns.names = res1.columns.names + + pdt.assert_frame_equal(res1, exp_res1) + + # TEST2 with impact per compartment (two index levels in the result) + + map_test2 = pd.DataFrame( + columns=["em_type", "compart", "total__em_type", "compart__compart", "factor"], + data=[["em.*", "air|water", "total_regex", "all", 2], + ["em1", "air", "total_sum", "all", 2], + ["em1", "water", "total_sum", "all", 2], + ["em2", "air", "total_sum", "all", 2], + ["em2", "water", "total_sum", "all", 2], + ["em1", "air", "all_air", "air", 0.5], + ["em2", "air", "all_air", "air", 0.5]], + ) + + # alternative way to calculate the expected result + exp_res2 = pd.DataFrame( + columns = to_char.columns, + index = pd.MultiIndex.from_tuples( + [("total_regex", "all"), ("total_sum", "all"), ("all_air", "air")])) + exp_res2.loc[('all_air', 'air')] 
= to_char.loc[("em1", "air")] * 0.5 + to_char.loc[("em2", "air")] * 0.5 + exp_res2.loc[('total_regex', 'all')] = (to_char.sum(axis=1) * 2).values + exp_res2.loc[('total_sum', 'all')] = (to_char.sum(axis=1) * 2).values + exp_res2 = exp_res2.astype(float) + exp_res2.sort_index(inplace=True) - res = match_and_convert(to_char, mapping, agg_func="sum") - res.sort_index(inplace=True) + res2 = match_and_convert(to_char, map_test2) + res2.sort_index(inplace=True) - exp_res.index.names = res.index.names - exp_res.columns.names = res.columns.names + exp_res2.index.names = res2.index.names + exp_res2.columns.names = res2.columns.names - pdt.assert_frame_equal(res, exp_res) + pdt.assert_frame_equal(res2, exp_res2)