Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix encoded values map attribute #105

Merged
merged 9 commits into from
Dec 21, 2020
10 changes: 4 additions & 6 deletions src/trousse/dataframe_with_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@
# R = How many times a unique value is repeated in column (in average)
CATEG_COL_THRESHOLD = 300

logger = logging.getLogger(__name__)


def get_df_from_csv(df_filename: str) -> pd.DataFrame:
    """
    Read the CSV file ``df_filename`` into a pandas DataFrame.

    Parameters
    ----------
    df_filename : str
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the data read from the CSV file, or None if
        the file was not found (the error is logged instead of raised).
    """
    # Use a module-named logger instead of the root ``logging`` calls:
    # records stay filterable per module and no implicit basicConfig()
    # is triggered as a side effect of the first log call.
    logger = logging.getLogger(__name__)
    try:
        df = pd.read_csv(df_filename)
        logger.info("Data imported from file successfully")
        return df
    except FileNotFoundError as e:
        # Deliberate best-effort: callers treat a missing file as "no data"
        # and check for a None return value, so log and swallow.
        logger.error(e)
        return None


def check_duplicated_features(self) -> bool:
    """
    Check whether the DataFrame ``self.df`` contains duplicated column names.

    Returns
    -------
    bool
        True if at least two columns share the same name, False otherwise.
    """
    # TODO: Rename to "contains_duplicated_features"
    # TODO: In case there are columns with the same name, check if the
    #   values are the same too and inform the user appropriately
    logger = logging.getLogger(__name__)
    logger.info("Checking duplicated columns")
    # A set keeps unique names only, so a length mismatch reveals duplicates
    has_duplicates = len(self.df.columns) != len(set(self.df.columns))
    if has_duplicates:
        logger.error("There are duplicated columns")
    return has_duplicates
Expand Down
211 changes: 180 additions & 31 deletions src/trousse/feature_fix.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import itertools
import logging
from typing import Tuple
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
Expand All @@ -13,8 +13,6 @@
)
from .feature_enum import ENCODED_COLUMN_SUFFIX, EncodingFunctions, OperationTypeEnum

logger = logging.getLogger(__name__)

NAN_CATEGORY = "Nan"
BIN_SPLIT_COL_SUFFIX = "_bin_id"

Expand All @@ -33,55 +31,206 @@ def convert_maps_from_tuple_to_str(group_id_to_tuple_map):
return gr_id_to_string_map


def split_continuous_column_into_bins(
df_info: DataFrameWithInfo, col_name, bin_threshold
):
def _split_into_bins_infer_range_boundaries(
column: pd.Series,
bin_thresholds: List[float],
extra_padding_ratio: float = 0.4,
) -> Tuple[pd.Series, Dict[int, List[float]]]:
"""
This function adds a column to DataFrame df_info called "[col_name]_bin_id" where we split the "col_name" into bins
:param df_info: DataFrameWithInfo -> DataFrameWithInfo instance containing the 'col_name' column to split
:param col_name: String -> Name of the column to be split into discrete intervals
:param bin_threshold: List -> It contains the thresholds used to separate different groups
(the threshold will be included in the bin with higher values)
:return: pd.DataFrame -> Same "df_info" passed with a new column with the bin_indices
which the column value belongs to
Dict[List] -> Dictionary with the bin_indices as keys and bin_ranges as values
Split the ``column`` values into discrete bins, inferring the range boundaries.

The lower bound of the first bin is inferred based on the ``column`` minimum, and
the upper bound of the last bin is inferred based on the ``column``
maximum value. An ``extra_bin_padding`` is added to both values to include
the values of the future/test set sample values (where maximum and minimum
value could exceed the ones found in ``column``).
Therefore, the ``bin_thresholds`` values are considered as intermediate thresholds.
The intermediate bin thresholds will be included in the bin that uses them as
lower bound. Particularly the values associated to bin "i" are those that have
a ``column`` value "V" as follows:
bin_threshold[i-1] <= V < bin_threshold[i]

Parameters
----------
column : pd.Series
Column to be split into discrete intervals
bin_thresholds : List[float]
List of the thresholds used to split the ``column`` column into bins.
extra_bin_padding : float
Ratio of the total interval size that is added to the lower and upper
bound of the first and last bin respectively. Default set to 0.4.

Returns
-------
pd.Series
Column with the IDs of the computed bins
Dict[int, List[float]]
Dictionary with the bin IDs as keys and bin lower and upper bounds as values
"""
new_col_name = f"{col_name}{BIN_SPLIT_COL_SUFFIX}"
# Initialize the bin <--> id_range map with the min and max value
# Initialize the bin <--> id_range map
bin_id_range_map = {}
# For the BIN 0 choose the column minimum as the bin "lower_value",
# in the other case the "upper_value" of the previous loops is set as "lower_value"
lower_value = min(df_info.df[col_name].unique()) - 1
bin_column = pd.Series([pd.NA] * len(column))

min_value = min(column.unique())
max_value = max(column.unique())
extra_padding = abs(max_value - min_value) * extra_padding_ratio
lower_range_boundary = min_value - extra_padding
upper_range_boundary = max_value + extra_padding
# Set the bin "0" lower bound with extra padding. In the other cases
# the "upper_value" of the previous loops is set as "lower_value"
lower_value = lower_range_boundary
# Loop over the bins (we need to increase by 1 because they are only the separating values)
for i in range(len(bin_threshold) + 1):
for i in range(len(bin_thresholds) + 1):

bin_id_range_map[i] = []
# Append the bin upper and lower value to the "bin_id_range_map"
# For the first and last bin, we set some special values
bin_id_range_map[i].append(lower_value)

# Assign the bin upper value:
# 1. Either to the higher threshold
# 2. Or to the column maximum value (if there is not a higher threshold in list)
try:
upper_value = bin_threshold[i]
upper_value = bin_thresholds[i]
except IndexError:
upper_value = max(df_info.df[col_name].unique())
# In case where ```infer_upper_lower_bounds`` is True, there may be not a
# higher threshold, so the following value is used as the
# highest bound of the last bin
upper_value = upper_range_boundary

# Append the bin upper value to the "bin_id_range_map"
bin_id_range_map[i].append(upper_value)

# Identify the values in the range [lower_value, upper_value] in every row,
# and assign them "i" as the value of the new column "_bin_id"
df_info.df.loc[
(df_info.df[col_name] >= lower_value)
& (df_info.df[col_name] <= upper_value),
new_col_name,
] = i
bin_column.loc[(column >= lower_value) & (column < upper_value)] = i

# Set the upper_value as the lower_value for the next higher bin
lower_value = upper_value

return bin_column, bin_id_range_map


def _split_into_bins_no_inference(
column: pd.Series,
bin_thresholds: List[float],
) -> Tuple[pd.Series, Dict[int, List[float]]]:
"""
Split the ``column`` values into discrete bins, using the bin_thresholds.

The ``bin_thresholds`` first value is considered as lower bound of the first
bin and the last value is considered as upper bound of the last bin.
The intermediate bin thresholds are included in the bin that uses them as
lower bound. Particularly the values associated to bin "i" are those that have
a ``column`` value "V" as follows:
bin_threshold[i] <= V < bin_threshold[i+1]

Parameters
----------
column : pd.Series
Column to be split into discrete intervals
bin_thresholds : List[float]
List of the thresholds used to split the ``column`` column into bins.

Returns
-------
pd.Series
Column with the IDs of the computed bins
Dict[int, List[float]]
Dictionary with the bin IDs as keys and bin lower and upper bounds as values
"""
# Initialize the bin <--> id_range map
bin_id_range_map = {}
bin_column = pd.Series([pd.NA] * len(column))

lower_value = bin_thresholds[0]
# Loop over the bins (we need to increase by 1 because they are only the separating values)
for i in range(len(bin_thresholds) - 1):

bin_id_range_map[i] = []
bin_id_range_map[i].append(lower_value)

# Assign the bin upper value:
# 1. Either to the higher threshold
# 2. Or to the column maximum value (if there is not a higher threshold in list)
upper_value = bin_thresholds[i + 1]
bin_id_range_map[i].append(upper_value)
# Identify the values in the range [lower_value, upper_value] in every row,
# and assign them "i" as the value of the new column "_bin_id"
bin_column.loc[(column >= lower_value) & (column < upper_value)] = i

# Set the upper_value as the lower_value for the next higher bin
lower_value = upper_value

return bin_column, bin_id_range_map


def split_continuous_column_into_bins(
df_info: DataFrameWithInfo,
col_name: str,
bin_thresholds: List[float],
infer_upper_lower_bounds: bool = True,
extra_padding_ratio: float = 0.4,
) -> DataFrameWithInfo:
"""
Split the continuous values from ``df_info`` column ``col_name`` into discrete bins.

When ``infer_upper_lower_bounds`` is True, the lower
bound of the first bin is inferred based on the ``col_name`` minimum, and
the upper bound of the last bin is inferred based on the ``col_name``
maximum value. An ``extra_bin_padding`` is added to both values to include
the values of the future/test set sample values (where maximum and minimum
value could exceed the ones found in ``df_info``).
Therefore, when ``infer_upper_lower_bounds`` is True, the ``bin_thresholds`` values
are considered as intermediate thresholds.
When ``infer_upper_lower_bounds`` is False, the ``bin_thresholds`` first values
is considered as lower bound of the first bin and the last value is considered
as upper bound of the last bin.
In both cases the intermediate bin thresholds will be included in the bin that
uses them as lower bound. So the values associated to bin "i" are those that have
a ``col_name`` value "V" as follows:
bin_threshold[i] <= V < bin_threshold[i+1]

The new column with the computed bin IDs is added ``df_info`` with
the name "[``col_name``]_bin_id"

Parameters
----------
df_info : DataFrameWithInfo
DataFrameWithInfo instance containing the 'col_name' column to split
col_name : str
Name of the column to be split into discrete intervals
bin_thresholds : List[float]
List of the thresholds used to split the ``col_name`` column into bins.
infer_upper_lower_bounds : bool
Option to infer the lower bound of the first bin and the upper bound of the
last bin from the minimum and maximum value of the ``col_name`` column.
extra_bin_padding : float
Ratio of the total interval size that is added to the lower and upper
bound of the first and last bin respectively. Default set to 0.4.

Returns
-------
DataFrameWithInfo
Same "df_info" passed with a new column with the bin_indices
which the column value belongs to
"""
new_col_name = f"{col_name}{BIN_SPLIT_COL_SUFFIX}"

if infer_upper_lower_bounds:
(
df_info.df.loc[:, new_col_name],
bin_id_range_map,
) = _split_into_bins_infer_range_boundaries(
column=df_info.df[col_name],
bin_thresholds=bin_thresholds,
extra_padding_ratio=extra_padding_ratio,
)
else:
(
df_info.df.loc[:, new_col_name],
bin_id_range_map,
) = _split_into_bins_no_inference(
column=df_info.df[col_name], bin_thresholds=bin_thresholds
)

# Cast the new column to int8
df_info.df.loc[:, new_col_name] = df_info.df[new_col_name].astype("Int16")

Expand Down Expand Up @@ -215,7 +364,7 @@ def _one_hot_encode_column(
try:
encoded_categories.remove(NAN_CATEGORY.title())
except ValueError:
logger.debug(f"No NaN values were found in column {column}")
logging.debug(f"No NaN values were found in column {column}")
# Name the new columns after the categories (adding a suffix). Exclude the first which was dropped
new_column_names = [
f"{column}_{col}{ENCODED_COLUMN_SUFFIX}" for col in encoded_categories[1:]
Expand Down
12 changes: 8 additions & 4 deletions src/trousse/feature_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,15 @@ def __init__(
OperationTypeEnum value that describes the type of the operation performed.
original_columns: Union[Tuple[str], str, None], optional
Tuple of the columns that were used as input for the operation performed.
Default set to ().
When set to None, it means that the attribute is not
defined and it can be used when looking for a FeatureOperation, whose
original columns are unknown. Default set to ().
derived_columns: Union[Tuple[str], str, None], optional
Tuple of the columns that were generated as output after performing the
operation. If this tuple is equal to original_columns, it will be
reassigned to (). Default set to ().
reassigned to (). When set to None, it means that the attribute is not
defined and it can be used when looking for a FeatureOperation, whose
derived columns are unknown. Default set to ().
encoded_values_map: Union[Dict[int, Any], None], optional
Map that connects the ``derived_columns`` values, generated by an
encoding operation, to the represented values (of the
Expand Down Expand Up @@ -100,9 +104,9 @@ def __init__(
)

if encoded_values_map is None:
self.encoded_values_map = encoded_values_map
else:
self.encoded_values_map = {}
else:
self.encoded_values_map = encoded_values_map

if derived_columns == original_columns and original_columns is not None:
self.derived_columns = ()
Expand Down
Loading