
Fix encoded values map attribute #105

Merged
merged 9 commits on Dec 21, 2020
10 changes: 4 additions & 6 deletions src/trousse/dataframe_with_info.py
@@ -26,8 +26,6 @@
# R = How many times a unique value is repeated in column (in average)
CATEG_COL_THRESHOLD = 300

logger = logging.getLogger(__name__)


def get_df_from_csv(df_filename: str) -> pd.DataFrame:
"""
@@ -46,10 +44,10 @@ def get_df_from_csv(df_filename: str) -> pd.DataFrame:
"""
try:
df = pd.read_csv(df_filename)
logger.info("Data imported from file successfully")
logging.info("Data imported from file successfully")
return df
except FileNotFoundError as e:
logger.error(e)
logging.error(e)
return None
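
Note for readers: these messages now go through the root logger rather than a module-level one, so nothing is shown until logging is configured. A minimal sketch of how to surface them (the CSV path is hypothetical):

```python
import logging

# Configure the root logger so the module's logging.info / logging.error
# calls are actually emitted to the console.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

from trousse.dataframe_with_info import get_df_from_csv

df = get_df_from_csv("data/sample.csv")  # hypothetical path
if df is None:
    logging.warning("CSV not found; see the error logged above.")
```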


@@ -588,10 +586,10 @@ def check_duplicated_features(self) -> bool:
# TODO: Rename to "contains_duplicated_features"
# TODO: In case there are columns with the same name, check if the
# values are the same too and inform the user appropriately
logger.info("Checking duplicated columns")
logging.info("Checking duplicated columns")
# Check if there are duplicates in the df columns
if len(self.df.columns) != len(set(self.df.columns)):
logger.error("There are duplicated columns")
logging.error("There are duplicated columns")
return True
else:
return False
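
For context, the duplicate check above relies on set() collapsing repeated column names; a tiny self-contained sketch:

```python
import pandas as pd

# Two columns deliberately share the name "a".
df = pd.DataFrame([[1, 2, 3]], columns=["a", "a", "b"])

# Same check as in check_duplicated_features: a set drops duplicates,
# so a length mismatch means at least one column name is repeated.
print(len(df.columns) != len(set(df.columns)))  # True
```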
67 changes: 45 additions & 22 deletions src/trousse/feature_fix.py
@@ -13,8 +13,6 @@
)
from .feature_enum import ENCODED_COLUMN_SUFFIX, EncodingFunctions, OperationTypeEnum

logger = logging.getLogger(__name__)

NAN_CATEGORY = "Nan"
BIN_SPLIT_COL_SUFFIX = "_bin_id"

@@ -34,48 +32,73 @@ def convert_maps_from_tuple_to_str(group_id_to_tuple_map):


def split_continuous_column_into_bins(
df_info: DataFrameWithInfo, col_name, bin_threshold
df_info: DataFrameWithInfo,
col_name,
bin_thresholds,
extra_padding_ratio: float = 0.4,
):
"""
This function adds a column to DataFrame df_info called "[col_name]_bin_id" where we split the "col_name" into bins
:param df_info: DataFrameWithInfo -> DataFrameWithInfo instance containing the 'col_name' column to split
:param col_name: String -> Name of the column to be split into discrete intervals
:param bin_threshold: List -> It contains the thresholds used to separate different groups
(the threshold will be included in the bin with higher values)
:return: pd.DataFrame -> Same "df_info" passed with a new column with the bin_indices
which the column value belongs to
Dict[List] -> Dictionary with the bin_indices as keys and bin_ranges as values
Split the continuous values from ``df_info`` column ``col_name`` into discrete bins.

called "[col_name]_bin_id" where we split the "col_name" into bins
lorenz-gorini marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
df_info : DataFrameWithInfo
DataFrameWithInfo instance containing the 'col_name' column to split
col_name : str
Name of the column to be split into discrete intervals
bin_thresholds : List
The thresholds used to separate the bins
(each threshold is included in the bin with higher values)
extra_padding_ratio : float
Ratio of the column's total value range that is subtracted from the lower
bound of the first bin and added to the upper bound of the last bin.
This widens the outermost bins so that future/test-set samples, whose
minimum or maximum may exceed the values found here, still fall inside
a bin. Default set to 0.4.

Returns
-------
pd.DataFrame
The same "df_info" passed in, with a new column reporting the index of the
bin each "col_name" value belongs to
Dict[int, List]
Dictionary with the bin indices as keys and bin ranges as values
"""
new_col_name = f"{col_name}{BIN_SPLIT_COL_SUFFIX}"
# Initialize the bin <--> id_range map with the min and max value
bin_id_range_map = {}
# For the BIN 0 choose the column minimum as the bin "lower_value",
# in the other case the "upper_value" of the previous loops is set as "lower_value"
lower_value = min(df_info.df[col_name].unique()) - 1

min_value = min(df_info.df[col_name].unique())
max_value = max(df_info.df[col_name].unique())
extra_padding = abs(max_value - min_value) * extra_padding_ratio
# Set the bin "0" lower bound with extra padding. In the other cases
# the "upper_value" of the previous loops is set as "lower_value"
lower_value = min_value - extra_padding
# Loop over the bins (one more bin than the number of separating thresholds)
for i in range(len(bin_threshold) + 1):
for i in range(len(bin_thresholds) + 1):

bin_id_range_map[i] = []
# Append the bin upper and lower value to the "bin_id_range_map"
# For the first and last bin, we set some special values
bin_id_range_map[i].append(lower_value)

# Assign the bin upper value:
# 1. Either to the higher threshold
# 2. Or to the column maximum value plus padding (if there is no higher threshold in the list)
try:
upper_value = bin_threshold[i]
upper_value = bin_thresholds[i]
except IndexError:
upper_value = max(df_info.df[col_name].unique())
# if there is not a higher threshold, the following value is used as the
# highest bound of the last bin
upper_value = max_value + extra_padding

# Append the bin upper value to the "bin_id_range_map"
bin_id_range_map[i].append(upper_value)

# Identify the values in the range [lower_value, upper_value) in every row,
# and assign them "i" as the value of the new column "_bin_id"
df_info.df.loc[
(df_info.df[col_name] >= lower_value)
& (df_info.df[col_name] <= upper_value),
& (df_info.df[col_name] < upper_value),
new_col_name,
] = i
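
To make the padded half-open bins concrete, here is a minimal standalone sketch of the same behaviour on a plain Series, using pd.cut instead of the library's loop (all names here are illustrative, not the trousse API):

```python
import pandas as pd

values = pd.Series([0, 5, 10])
thresholds = [4, 8]
extra_padding_ratio = 0.4

# Pad the outer bounds by 40% of the value range (here 10 - 0 = 10 -> 4.0).
padding = abs(values.max() - values.min()) * extra_padding_ratio
edges = [values.min() - padding] + thresholds + [values.max() + padding]
# edges == [-4.0, 4, 8, 14.0]

# right=False gives half-open [lower, upper) bins, matching the ">= lower"
# and "< upper" conditions above, so each threshold lands in the higher bin.
bin_ids = pd.cut(values, bins=edges, labels=False, right=False)
print(bin_ids.tolist())  # [0, 1, 2]
print({i: [lo, hi] for i, (lo, hi) in enumerate(zip(edges[:-1], edges[1:]))})
```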

Expand Down Expand Up @@ -215,7 +238,7 @@ def _one_hot_encode_column(
try:
encoded_categories.remove(NAN_CATEGORY.title())
except ValueError:
logger.debug(f"No NaN values were found in column {column}")
logging.debug(f"No NaN values were found in column {column}")
# Name the new columns after the categories (adding a suffix). Exclude the first which was dropped
new_column_names = [
f"{column}_{col}{ENCODED_COLUMN_SUFFIX}" for col in encoded_categories[1:]
12 changes: 8 additions & 4 deletions src/trousse/feature_operation.py
@@ -57,11 +57,15 @@ def __init__(
OperationTypeEnum value that describes the type of the operation performed.
original_columns: Union[Tuple[str], str, None], optional
Tuple of the columns that were used as input for the operation performed.
Default set to ().
When set to None, it means that the attribute is not defined; this can
be used when looking for a FeatureOperation whose original columns are
unknown. Default set to ().
derived_columns: Union[Tuple[str], str, None], optional
Tuple of the columns that were generated as output after performing the
operation. If this tuple is equal to original_columns, it will be
reassigned to (). Default set to ().
reassigned to (). When set to None, it means that the attribute is not
defined; this can be used when looking for a FeatureOperation whose
derived columns are unknown. Default set to ().
encoded_values_map: Union[Dict[int, Any], None], optional
Map that connects the ``derived_columns`` values, generated by an
encoding operation, to the represented values (of the
@@ -100,9 +104,9 @@
)

if encoded_values_map is None:
self.encoded_values_map = encoded_values_map
else:
self.encoded_values_map = {}
else:
self.encoded_values_map = encoded_values_map

if derived_columns == original_columns and original_columns is not None:
self.derived_columns = ()
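
This swap is the heart of the PR: previously a caller-provided map was discarded in favour of {} while a literal None was stored when nothing was passed, breaking later lookups. A distilled sketch of the corrected guard (the class name is a hypothetical stand-in for FeatureOperation):

```python
from typing import Any, Dict, Optional


class Op:  # hypothetical stand-in
    def __init__(self, encoded_values_map: Optional[Dict[int, Any]] = None):
        # Fall back to an empty dict only when nothing was passed;
        # otherwise keep the caller's map intact.
        if encoded_values_map is None:
            self.encoded_values_map = {}
        else:
            self.encoded_values_map = encoded_values_map


print(Op().encoded_values_map)                     # {}
print(Op({0: "No", 1: "Yes"}).encoded_values_map)  # {0: 'No', 1: 'Yes'}
```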
30 changes: 14 additions & 16 deletions src/trousse/row_fix.py
@@ -7,8 +7,6 @@

from .dataframe_with_info import DataFrameWithInfo, copy_df_info_with_new_df

logger = logging.getLogger(__name__)


class RowFix:
"""
@@ -97,7 +95,7 @@ def _check_numeric_cols(
lost_values = set(
df_info.df[col][df_info.df[col].notna() & numeric_col_serie.isna()]
)
logger.info(
logging.info(
f"{col} can be converted from String to Mixed. The percentage of "
f"non numeric values is {1- num_valuecount_ratio}.\n"
f"Values: {lost_values}"
@@ -136,14 +134,14 @@ def _convert_out_of_scale_values(self, elem, symbol):
result - self.percentage_to_add_out_of_scale * result
)
else:
logger.error(
f"You end up using the wrong function to convert {elem}."
logging.error(
f"You ended up using the wrong function to convert `{elem}`."
" It will be replaced with NaN."
)
return self.nan_value
except (ValueError, TypeError):
logger.error(
f"You end up using the wrong function to convert {elem}"
logging.error(
f"You ended up using the wrong function to convert `{elem}`."
" It will be replaced with NaN."
)
return self.nan_value
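
For orientation, a standalone sketch of what this conversion intends for out-of-scale readings such as ">120" or "<4", with an illustrative percentage (the real value and the symbol handling live in the full class):

```python
import math

PERCENTAGE = 0.02  # illustrative stand-in for percentage_to_add_out_of_scale


def convert_out_of_scale(elem: str) -> float:
    """Map readings like '>120' or '<4' to a numeric value just past the bound."""
    try:
        if elem.startswith(">"):
            result = float(elem[1:])
            return result + PERCENTAGE * result
        if elem.startswith("<"):
            result = float(elem[1:])
            return result - PERCENTAGE * result
        # Wrong function for this element: replace it with NaN, as above.
        return math.nan
    except (ValueError, TypeError):
        return math.nan


print(convert_out_of_scale(">120"))  # 122.4
print(convert_out_of_scale("<4"))    # 3.92
```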
@@ -231,11 +229,11 @@ def fix_typos(
self._convert_to_float_value, column=c, axis=1
)
# Progress bar
print("=", end="")
print()
logging.info("=", end="")
logging.info()

if verbose:
logger.info(self.count_errors())
logging.info(self.count_errors())

return copy_df_info_with_new_df(df_info=df_info, new_pandas_df=df_converted)

Expand Down Expand Up @@ -269,7 +267,7 @@ def cols_to_correct_dtype(
bool_cols = bool_cols.union(cols_by_type.bool_cols)
df_info.df[list(bool_cols)] = df_info.df[list(bool_cols)].astype(np.bool)
if verbose:
logger.info(
logging.info(
f"Casted to INT32: {int_cols}\n Casted to FLOAT64: {float_cols}\n"
f"Casted to BOOL: {bool_cols}"
)
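
For context, the casting performed here reduces to pandas astype calls per dtype group; a tiny sketch:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"age": [30.0, 41.0], "score": [0.5, 0.7], "flag": [1, 0]})

# Cast each group of columns to its target dtype, as cols_to_correct_dtype does.
df[["age"]] = df[["age"]].astype(np.int32)
df[["score"]] = df[["score"]].astype(np.float64)
df[["flag"]] = df[["flag"]].astype(bool)  # plain bool; np.bool is deprecated in newer NumPy

print(df.dtypes)  # age: int32, score: float64, flag: bool
```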
@@ -320,9 +318,9 @@

def print_errors_per_column(self):
""" This is to print the actual error values, to check the fixes"""
print("The errors per feature are:")
logging.info("The errors per feature are:")
for c in self.errors_before_correction_dict.keys():
print(
logging.info(
f"{c}: {len(self.errors_before_correction_dict[c])} :"
f" {set(self.errors_before_correction_dict[c])}"
f" ---> {len(self.errors_after_correction_dict[c])} :"
@@ -368,7 +366,7 @@ def force_conversion_to_numeric(
dfinfo_num_only = copy_df_info_with_new_df(df_info, df_info.df)
non_num_convertible_count = 0
if verbose:
logger.info("The values non convertible to numbers are:")
logging.info("The values non convertible to numbers are:")
for col in columns:
# Convert every value to number if possible, the other will be NaN
forced_converted_to_num = pd.to_numeric(df_info.df[col], errors="coerce")
@@ -380,15 +378,15 @@
)
non_num_convertible_count += non_numeric_in_column.sum()
if verbose:
logger.info(
logging.info(
f"{col} -> \n"
f"{df_info.df[col][non_numeric_in_column].value_counts()}"
)
# Set to NaN all the non convertible values
if not dry_run:
dfinfo_num_only.df.loc[:, col] = forced_converted_to_num

logger.info(
logging.info(
"The total count of values non convertible to numbers"
f" is: {non_num_convertible_count}"
)
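
The coercion pattern used throughout this method is worth spelling out: pd.to_numeric with errors="coerce" turns every non-convertible value into NaN, and comparing the before/after NaN masks isolates exactly the offending values. A minimal sketch:

```python
import pandas as pd

col = pd.Series(["1", "2.5", "n/a", "three", None])

forced = pd.to_numeric(col, errors="coerce")  # [1.0, 2.5, NaN, NaN, NaN]

# Present before but NaN after coercion -> non-convertible values.
non_numeric = col.notna() & forced.isna()
print(col[non_numeric].tolist())  # ['n/a', 'three']
print(int(non_numeric.sum()))     # 2
```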