Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix encoded values map attribute #105

Merged
merged 9 commits into from
Dec 21, 2020
10 changes: 4 additions & 6 deletions src/trousse/dataframe_with_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@
# R = How many times a unique value is repeated in column (in average)
CATEG_COL_THRESHOLD = 300

logger = logging.getLogger(__name__)


def get_df_from_csv(df_filename: str) -> pd.DataFrame:
    """
    Read the CSV file ``df_filename`` into a pandas DataFrame.

    Parameters
    ----------
    df_filename : str
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        DataFrame containing the data read from the CSV file, or None if
        the file was not found (the error is logged instead of raised).
    """
    # Use a module-named logger instead of the root ``logging`` calls:
    # records stay filterable per module and no implicit basicConfig()
    # is triggered as a side effect of the first log call.
    logger = logging.getLogger(__name__)
    try:
        df = pd.read_csv(df_filename)
        logger.info("Data imported from file successfully")
        return df
    except FileNotFoundError as e:
        # Deliberate best-effort: callers treat a missing file as "no data"
        # and check for a None return value, so log and swallow.
        logger.error(e)
        return None


def check_duplicated_features(self) -> bool:
    """
    Check whether the DataFrame ``self.df`` contains duplicated column names.

    Returns
    -------
    bool
        True if at least two columns share the same name, False otherwise.
    """
    # TODO: Rename to "contains_duplicated_features"
    # TODO: In case there are columns with the same name, check if the
    #   values are the same too and inform the user appropriately
    logger = logging.getLogger(__name__)
    logger.info("Checking duplicated columns")
    # A set keeps unique names only, so a length mismatch reveals duplicates
    has_duplicates = len(self.df.columns) != len(set(self.df.columns))
    if has_duplicates:
        logger.error("There are duplicated columns")
    return has_duplicates
Expand Down
211 changes: 180 additions & 31 deletions src/trousse/feature_fix.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import itertools
import logging
from typing import Tuple
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
Expand All @@ -13,8 +13,6 @@
)
from .feature_enum import ENCODED_COLUMN_SUFFIX, EncodingFunctions, OperationTypeEnum

logger = logging.getLogger(__name__)

NAN_CATEGORY = "Nan"
BIN_SPLIT_COL_SUFFIX = "_bin_id"

Expand All @@ -33,55 +31,206 @@ def convert_maps_from_tuple_to_str(group_id_to_tuple_map):
return gr_id_to_string_map


def split_continuous_column_into_bins(
df_info: DataFrameWithInfo, col_name, bin_threshold
):
def _split_into_bins_infer_range_boundaries(
column: pd.Series,
bin_thresholds: List[float],
extra_padding_ratio: float = 0.4,
) -> Tuple[pd.Series, Dict[int, List[float]]]:
"""
This function adds a column to DataFrame df_info called "[col_name]_bin_id" where we split the "col_name" into bins
:param df_info: DataFrameWithInfo -> DataFrameWithInfo instance containing the 'col_name' column to split
:param col_name: String -> Name of the column to be split into discrete intervals
:param bin_threshold: List -> It contains the thresholds used to separate different groups
(the threshold will be included in the bin with higher values)
:return: pd.DataFrame -> Same "df_info" passed with a new column with the bin_indices
which the column value belongs to
Dict[List] -> Dictionary with the bin_indices as keys and bin_ranges as values
Split the ``column`` values into discrete bins, inferring the range boundaries.

The lower bound of the first bin is inferred based on the ``column`` minimum, and
the upper bound of the last bin is inferred based on the ``column``
maximum value. An ``extra_bin_padding`` is added to both values to include
the values of the future/test set sample values (where maximum and minimum
value could exceed the ones found in ``column``).
Therefore, the ``bin_thresholds`` values are considered as intermediate thresholds.
The intermediate bin thresholds will be included in the bin that uses them as
lower bound. Particularly the values associated to bin "i" are those that have
a ``column`` value "V" as follows:
bin_threshold[i-1] <= V < bin_threshold[i]

Parameters
----------
column : pd.Series
Column to be split into discrete intervals
bin_thresholds : List[float]
List of the thresholds used to split the ``column`` column into bins.
extra_bin_padding : float
Ratio of the total interval size that is added to the lower and upper
bound of the first and last bin respectively. Default set to 0.4.

Returns
-------
pd.Series
Column with the IDs of the computed bins
Dict[int, List[float]]
Dictionary with the bin IDs as keys and bin lower and upper bounds as values
"""
new_col_name = f"{col_name}{BIN_SPLIT_COL_SUFFIX}"
# Initialize the bin <--> id_range map with the min and max value
# Initialize the bin <--> id_range map
bin_id_range_map = {}
# For the BIN 0 choose the column minimum as the bin "lower_value",
# in the other case the "upper_value" of the previous loops is set as "lower_value"
lower_value = min(df_info.df[col_name].unique()) - 1
bin_column = pd.Series([pd.NA] * len(column))

min_value = min(column.unique())
max_value = max(column.unique())
extra_padding = abs(max_value - min_value) * extra_padding_ratio
lower_range_boundary = min_value - extra_padding
upper_range_boundary = max_value + extra_padding
# Set the bin "0" lower bound with extra padding. In the other cases
# the "upper_value" of the previous loops is set as "lower_value"
lower_value = lower_range_boundary
# Loop over the bins (we need to increase by 1 because they are only the separating values)
for i in range(len(bin_threshold) + 1):
for i in range(len(bin_thresholds) + 1):

bin_id_range_map[i] = []
# Append the bin upper and lower value to the "bin_id_range_map"
# For the first and last bin, we set some special values
bin_id_range_map[i].append(lower_value)

# Assign the bin upper value:
# 1. Either to the higher threshold
# 2. Or to the column maximum value (if there is not a higher threshold in list)
try:
upper_value = bin_threshold[i]
upper_value = bin_thresholds[i]
except IndexError:
upper_value = max(df_info.df[col_name].unique())
# In case where ```infer_upper_lower_bounds`` is True, there may be not a
# higher threshold, so the following value is used as the
# highest bound of the last bin
upper_value = upper_range_boundary

# Append the bin upper value to the "bin_id_range_map"
bin_id_range_map[i].append(upper_value)

# Identify the values in the range [lower_value, upper_value] in every row,
# and assign them "i" as the value of the new column "_bin_id"
df_info.df.loc[
(df_info.df[col_name] >= lower_value)
& (df_info.df[col_name] <= upper_value),
new_col_name,
] = i
bin_column.loc[(column >= lower_value) & (column < upper_value)] = i

# Set the upper_value as the lower_value for the next higher bin
lower_value = upper_value

return bin_column, bin_id_range_map


def _split_into_bins_no_inference(
column: pd.Series,
bin_thresholds: List[float],
) -> Tuple[pd.Series, Dict[int, List[float]]]:
"""
Split the ``column`` values into discrete bins, using the bin_thresholds.

The ``bin_thresholds`` first value is considered as lower bound of the first
bin and the last value is considered as upper bound of the last bin.
The intermediate bin thresholds are included in the bin that uses them as
lower bound. Particularly the values associated to bin "i" are those that have
a ``column`` value "V" as follows:
bin_threshold[i] <= V < bin_threshold[i+1]

Parameters
----------
column : pd.Series
Column to be split into discrete intervals
bin_thresholds : List[float]
List of the thresholds used to split the ``column`` column into bins.

Returns
-------
pd.Series
Column with the IDs of the computed bins
Dict[int, List[float]]
Dictionary with the bin IDs as keys and bin lower and upper bounds as values
"""
# Initialize the bin <--> id_range map
bin_id_range_map = {}
bin_column = pd.Series([pd.NA] * len(column))

lower_value = bin_thresholds[0]
# Loop over the bins (we need to increase by 1 because they are only the separating values)
for i in range(len(bin_thresholds) - 1):

bin_id_range_map[i] = []
bin_id_range_map[i].append(lower_value)

# Assign the bin upper value:
# 1. Either to the higher threshold
# 2. Or to the column maximum value (if there is not a higher threshold in list)
upper_value = bin_thresholds[i + 1]
bin_id_range_map[i].append(upper_value)
# Identify the values in the range [lower_value, upper_value] in every row,
# and assign them "i" as the value of the new column "_bin_id"
bin_column.loc[(column >= lower_value) & (column < upper_value)] = i

# Set the upper_value as the lower_value for the next higher bin
lower_value = upper_value

return bin_column, bin_id_range_map


def split_continuous_column_into_bins(
df_info: DataFrameWithInfo,
col_name: str,
bin_thresholds: List[float],
infer_upper_lower_bounds: bool = True,
extra_padding_ratio: float = 0.4,
) -> DataFrameWithInfo:
"""
Split the continuous values from ``df_info`` column ``col_name`` into discrete bins.

When ``infer_upper_lower_bounds`` is True, the lower
bound of the first bin is inferred based on the ``col_name`` minimum, and
the upper bound of the last bin is inferred based on the ``col_name``
maximum value. An ``extra_bin_padding`` is added to both values to include
the values of the future/test set sample values (where maximum and minimum
value could exceed the ones found in ``df_info``).
Therefore, when ``infer_upper_lower_bounds`` is True, the ``bin_thresholds`` values
are considered as intermediate thresholds.
When ``infer_upper_lower_bounds`` is False, the ``bin_thresholds`` first values
is considered as lower bound of the first bin and the last value is considered
as upper bound of the last bin.
In both cases the intermediate bin thresholds will be included in the bin that
uses them as lower bound. So the values associated to bin "i" are those that have
a ``col_name`` value "V" as follows:
bin_threshold[i] <= V < bin_threshold[i+1]

The new column with the computed bin IDs is added ``df_info`` with
the name "[``col_name``]_bin_id"

Parameters
----------
df_info : DataFrameWithInfo
DataFrameWithInfo instance containing the 'col_name' column to split
col_name : str
Name of the column to be split into discrete intervals
bin_thresholds : List[float]
List of the thresholds used to split the ``col_name`` column into bins.
infer_upper_lower_bounds : bool
Option to infer the lower bound of the first bin and the upper bound of the
last bin from the minimum and maximum value of the ``col_name`` column.
extra_bin_padding : float
Ratio of the total interval size that is added to the lower and upper
bound of the first and last bin respectively. Default set to 0.4.

Returns
-------
DataFrameWithInfo
Same "df_info" passed with a new column with the bin_indices
which the column value belongs to
"""
new_col_name = f"{col_name}{BIN_SPLIT_COL_SUFFIX}"

if infer_upper_lower_bounds:
(
df_info.df.loc[:, new_col_name],
bin_id_range_map,
) = _split_into_bins_infer_range_boundaries(
column=df_info.df[col_name],
bin_thresholds=bin_thresholds,
extra_padding_ratio=extra_padding_ratio,
)
else:
(
df_info.df.loc[:, new_col_name],
bin_id_range_map,
) = _split_into_bins_no_inference(
column=df_info.df[col_name], bin_thresholds=bin_thresholds
)

# Cast the new column to int8
df_info.df.loc[:, new_col_name] = df_info.df[new_col_name].astype("Int16")

Expand Down Expand Up @@ -215,7 +364,7 @@ def _one_hot_encode_column(
try:
encoded_categories.remove(NAN_CATEGORY.title())
except ValueError:
logger.debug(f"No NaN values were found in column {column}")
logging.debug(f"No NaN values were found in column {column}")
# Name the new columns after the categories (adding a suffix). Exclude the first which was dropped
new_column_names = [
f"{column}_{col}{ENCODED_COLUMN_SUFFIX}" for col in encoded_categories[1:]
Expand Down
12 changes: 8 additions & 4 deletions src/trousse/feature_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,15 @@ def __init__(
OperationTypeEnum value that describes the type of the operation performed.
original_columns: Union[Tuple[str], str, None], optional
Tuple of the columns that were used as input for the operation performed.
Default set to ().
When set to None, it means that the attribute is not
defined and it can be used when looking for a FeatureOperation, whose
original columns are unknown. Default set to ().
derived_columns: Union[Tuple[str], str, None], optional
Tuple of the columns that were generated as output after performing the
operation. If this tuple is equal to original_columns, it will be
reassigned to (). Default set to ().
reassigned to (). When set to None, it means that the attribute is not
defined and it can be used when looking for a FeatureOperation, whose
derived columns are unknown. Default set to ().
encoded_values_map: Union[Dict[int, Any], None], optional
Map that connects the ``derived_columns`` values, generated by an
encoding operation, to the represented values (of the
Expand Down Expand Up @@ -100,9 +104,9 @@ def __init__(
)

if encoded_values_map is None:
self.encoded_values_map = encoded_values_map
else:
self.encoded_values_map = {}
else:
self.encoded_values_map = encoded_values_map

if derived_columns == original_columns and original_columns is not None:
self.derived_columns = ()
Expand Down
Loading