
Fix encoded values map attribute #105

Merged
merged 9 commits on Dec 21, 2020
10 changes: 4 additions & 6 deletions src/trousse/dataframe_with_info.py
@@ -26,8 +26,6 @@
# R = How many times a unique value is repeated in column (in average)
CATEG_COL_THRESHOLD = 300

logger = logging.getLogger(__name__)


def get_df_from_csv(df_filename: str) -> pd.DataFrame:
"""
@@ -46,10 +44,10 @@ def get_df_from_csv(df_filename: str) -> pd.DataFrame:
"""
try:
df = pd.read_csv(df_filename)
logger.info("Data imported from file successfully")
logging.info("Data imported from file successfully")
return df
except FileNotFoundError as e:
logger.error(e)
logging.error(e)
return None
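
Note for readers: these messages now go through the root logger rather than a module-level one, so nothing is shown until logging is configured. A minimal sketch of how to surface them (the CSV path is hypothetical):

```python
import logging

# Configure the root logger so the module's logging.info / logging.error
# calls are actually emitted to the console.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

from trousse.dataframe_with_info import get_df_from_csv

df = get_df_from_csv("data/sample.csv")  # hypothetical path
if df is None:
    logging.warning("CSV not found; see the error logged above.")
```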


@@ -588,10 +586,10 @@ def check_duplicated_features(self) -> bool:
# TODO: Rename to "contains_duplicated_features"
# TODO: In case there are columns with the same name, check if the
# values are the same too and inform the user appropriately
logger.info("Checking duplicated columns")
logging.info("Checking duplicated columns")
# Check if there are duplicates in the df columns
if len(self.df.columns) != len(set(self.df.columns)):
logger.error("There are duplicated columns")
logging.error("There are duplicated columns")
return True
else:
return False
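
For context, the duplicate check above relies on set() collapsing repeated column names; a tiny self-contained sketch:

```python
import pandas as pd

# Two columns deliberately share the name "a".
df = pd.DataFrame([[1, 2, 3]], columns=["a", "a", "b"])

# Same check as in check_duplicated_features: a set drops duplicates,
# so a length mismatch means at least one column name is repeated.
print(len(df.columns) != len(set(df.columns)))  # True
```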
67 changes: 45 additions & 22 deletions src/trousse/feature_fix.py
@@ -13,8 +13,6 @@
)
from .feature_enum import ENCODED_COLUMN_SUFFIX, EncodingFunctions, OperationTypeEnum

logger = logging.getLogger(__name__)

NAN_CATEGORY = "Nan"
BIN_SPLIT_COL_SUFFIX = "_bin_id"

@@ -34,48 +32,73 @@ def convert_maps_from_tuple_to_str(group_id_to_tuple_map):


def split_continuous_column_into_bins(
df_info: DataFrameWithInfo, col_name, bin_threshold
df_info: DataFrameWithInfo,
col_name,
bin_thresholds,
extra_padding_ratio: float = 0.4,
):
"""
This function adds a column to DataFrame df_info called "[col_name]_bin_id" where we split the "col_name" into bins
:param df_info: DataFrameWithInfo -> DataFrameWithInfo instance containing the 'col_name' column to split
:param col_name: String -> Name of the column to be split into discrete intervals
:param bin_threshold: List -> It contains the thresholds used to separate different groups
(the threshold will be included in the bin with higher values)
:return: pd.DataFrame -> Same "df_info" passed with a new column with the bin_indices
which the column value belongs to
Dict[List] -> Dictionary with the bin_indices as keys and bin_ranges as values
Split the continuous values from ``df_info`` column ``col_name`` into discrete bins.

called "[col_name]_bin_id" where we split the "col_name" into bins
lorenz-gorini marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
df_info : DataFrameWithInfo
DataFrameWithInfo instance containing the 'col_name' column to split
col_name : str
Name of the column to be split into discrete intervals
bin_thresholds : List
The thresholds used to separate the bins
(each threshold is included in the bin with higher values)
extra_padding_ratio : float
Ratio of the column's total value range that is subtracted from the lower
bound of the first bin and added to the upper bound of the last bin.
This widens the outermost bins so that future/test-set samples, whose
minimum or maximum may exceed the values found here, still fall inside
a bin. Default set to 0.4.

Returns
-------
pd.DataFrame
The same "df_info" passed in, with a new column reporting the index of the
bin each "col_name" value belongs to
Dict[int, List]
Dictionary with the bin indices as keys and bin ranges as values
"""
new_col_name = f"{col_name}{BIN_SPLIT_COL_SUFFIX}"
# Initialize the bin <--> id_range map with the min and max value
bin_id_range_map = {}
# For the BIN 0 choose the column minimum as the bin "lower_value",
# in the other case the "upper_value" of the previous loops is set as "lower_value"
lower_value = min(df_info.df[col_name].unique()) - 1

min_value = min(df_info.df[col_name].unique())
max_value = max(df_info.df[col_name].unique())
extra_padding = abs(max_value - min_value) * extra_padding_ratio
# Set the bin "0" lower bound with extra padding. In the other cases
# the "upper_value" of the previous loops is set as "lower_value"
lower_value = min_value - extra_padding
# Loop over the bins (one more bin than the number of separating thresholds)
for i in range(len(bin_threshold) + 1):
for i in range(len(bin_thresholds) + 1):

bin_id_range_map[i] = []
# Append the bin upper and lower value to the "bin_id_range_map"
# For the first and last bin, we set some special values
bin_id_range_map[i].append(lower_value)

# Assign the bin upper value:
# 1. Either to the higher threshold
# 2. Or to the column maximum value plus padding (if there is no higher threshold in the list)
try:
upper_value = bin_threshold[i]
upper_value = bin_thresholds[i]
except IndexError:
upper_value = max(df_info.df[col_name].unique())
# if there is not a higher threshold, the following value is used as the
# highest bound of the last bin
upper_value = max_value + extra_padding

# Append the bin upper value to the "bin_id_range_map"
bin_id_range_map[i].append(upper_value)

# Identify the values in the range [lower_value, upper_value) in every row,
# and assign them "i" as the value of the new column "_bin_id"
df_info.df.loc[
(df_info.df[col_name] >= lower_value)
& (df_info.df[col_name] <= upper_value),
& (df_info.df[col_name] < upper_value),
new_col_name,
] = i
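
To make the padded half-open bins concrete, here is a minimal standalone sketch of the same behaviour on a plain Series, using pd.cut instead of the library's loop (all names here are illustrative, not the trousse API):

```python
import pandas as pd

values = pd.Series([0, 5, 10])
thresholds = [4, 8]
extra_padding_ratio = 0.4

# Pad the outer bounds by 40% of the value range (here 10 - 0 = 10 -> 4.0).
padding = abs(values.max() - values.min()) * extra_padding_ratio
edges = [values.min() - padding] + thresholds + [values.max() + padding]
# edges == [-4.0, 4, 8, 14.0]

# right=False gives half-open [lower, upper) bins, matching the ">= lower"
# and "< upper" conditions above, so each threshold lands in the higher bin.
bin_ids = pd.cut(values, bins=edges, labels=False, right=False)
print(bin_ids.tolist())  # [0, 1, 2]
print({i: [lo, hi] for i, (lo, hi) in enumerate(zip(edges[:-1], edges[1:]))})
```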

Expand Down Expand Up @@ -215,7 +238,7 @@ def _one_hot_encode_column(
try:
encoded_categories.remove(NAN_CATEGORY.title())
except ValueError:
logger.debug(f"No NaN values were found in column {column}")
logging.debug(f"No NaN values were found in column {column}")
# Name the new columns after the categories (adding a suffix). Exclude the first which was dropped
new_column_names = [
f"{column}_{col}{ENCODED_COLUMN_SUFFIX}" for col in encoded_categories[1:]
12 changes: 8 additions & 4 deletions src/trousse/feature_operation.py
@@ -57,11 +57,15 @@ def __init__(
OperationTypeEnum value that describes the type of the operation performed.
original_columns: Union[Tuple[str], str, None], optional
Tuple of the columns that were used as input for the operation performed.
Default set to ().
When set to None, it means that the attribute is not defined; this can
be used when looking for a FeatureOperation whose original columns are
unknown. Default set to ().
derived_columns: Union[Tuple[str], str, None], optional
Tuple of the columns that were generated as output after performing the
operation. If this tuple is equal to original_columns, it will be
reassigned to (). Default set to ().
reassigned to (). When set to None, it means that the attribute is not
defined; this can be used when looking for a FeatureOperation whose
derived columns are unknown. Default set to ().
encoded_values_map: Union[Dict[int, Any], None], optional
Map that connects the ``derived_columns`` values, generated by an
encoding operation, to the represented values (of the
@@ -100,9 +104,9 @@
)

if encoded_values_map is None:
self.encoded_values_map = encoded_values_map
else:
self.encoded_values_map = {}
else:
self.encoded_values_map = encoded_values_map

if derived_columns == original_columns and original_columns is not None:
self.derived_columns = ()
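
This swap is the heart of the PR: previously a caller-provided map was discarded in favour of {} while a literal None was stored when nothing was passed, breaking later lookups. A distilled sketch of the corrected guard (the class name is a hypothetical stand-in for FeatureOperation):

```python
from typing import Any, Dict, Optional


class Op:  # hypothetical stand-in
    def __init__(self, encoded_values_map: Optional[Dict[int, Any]] = None):
        # Fall back to an empty dict only when nothing was passed;
        # otherwise keep the caller's map intact.
        if encoded_values_map is None:
            self.encoded_values_map = {}
        else:
            self.encoded_values_map = encoded_values_map


print(Op().encoded_values_map)                     # {}
print(Op({0: "No", 1: "Yes"}).encoded_values_map)  # {0: 'No', 1: 'Yes'}
```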
30 changes: 14 additions & 16 deletions src/trousse/row_fix.py
@@ -7,8 +7,6 @@

from .dataframe_with_info import DataFrameWithInfo, copy_df_info_with_new_df

logger = logging.getLogger(__name__)


class RowFix:
"""
@@ -97,7 +95,7 @@ def _check_numeric_cols(
lost_values = set(
df_info.df[col][df_info.df[col].notna() & numeric_col_serie.isna()]
)
logger.info(
logging.info(
f"{col} can be converted from String to Mixed. The percentage of "
f"non numeric values is {1- num_valuecount_ratio}.\n"
f"Values: {lost_values}"
@@ -136,14 +134,14 @@ def _convert_out_of_scale_values(self, elem, symbol):
result - self.percentage_to_add_out_of_scale * result
)
else:
logger.error(
f"You end up using the wrong function to convert {elem}."
logging.error(
f"You ended up using the wrong function to convert `{elem}`."
" It will be replaced with NaN."
)
return self.nan_value
except (ValueError, TypeError):
logger.error(
f"You end up using the wrong function to convert {elem}"
logging.error(
f"You ended up using the wrong function to convert `{elem}`."
" It will be replaced with NaN."
)
return self.nan_value
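
For orientation, a standalone sketch of what this conversion intends for out-of-scale readings such as ">120" or "<4", with an illustrative percentage (the real value and the symbol handling live in the full class):

```python
import math

PERCENTAGE = 0.02  # illustrative stand-in for percentage_to_add_out_of_scale


def convert_out_of_scale(elem: str) -> float:
    """Map readings like '>120' or '<4' to a numeric value just past the bound."""
    try:
        if elem.startswith(">"):
            result = float(elem[1:])
            return result + PERCENTAGE * result
        if elem.startswith("<"):
            result = float(elem[1:])
            return result - PERCENTAGE * result
        # Wrong function for this element: replace it with NaN, as above.
        return math.nan
    except (ValueError, TypeError):
        return math.nan


print(convert_out_of_scale(">120"))  # 122.4
print(convert_out_of_scale("<4"))    # 3.92
```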
@@ -231,11 +229,11 @@ def fix_typos(
self._convert_to_float_value, column=c, axis=1
)
# Progress bar
print("=", end="")
print()
logging.info("=", end="")
logging.info()

if verbose:
logger.info(self.count_errors())
logging.info(self.count_errors())

return copy_df_info_with_new_df(df_info=df_info, new_pandas_df=df_converted)

Expand Down Expand Up @@ -269,7 +267,7 @@ def cols_to_correct_dtype(
bool_cols = bool_cols.union(cols_by_type.bool_cols)
df_info.df[list(bool_cols)] = df_info.df[list(bool_cols)].astype(np.bool)
if verbose:
logger.info(
logging.info(
f"Casted to INT32: {int_cols}\n Casted to FLOAT64: {float_cols}\n"
f"Casted to BOOL: {bool_cols}"
)
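
For context, the casting performed here reduces to pandas astype calls per dtype group; a tiny sketch:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"age": [30.0, 41.0], "score": [0.5, 0.7], "flag": [1, 0]})

# Cast each group of columns to its target dtype, as cols_to_correct_dtype does.
df[["age"]] = df[["age"]].astype(np.int32)
df[["score"]] = df[["score"]].astype(np.float64)
df[["flag"]] = df[["flag"]].astype(bool)  # plain bool; np.bool is deprecated in newer NumPy

print(df.dtypes)  # age: int32, score: float64, flag: bool
```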
@@ -320,9 +318,9 @@

def print_errors_per_column(self):
""" This is to print the actual error values, to check the fixes"""
print("The errors per feature are:")
logging.info("The errors per feature are:")
for c in self.errors_before_correction_dict.keys():
print(
logging.info(
f"{c}: {len(self.errors_before_correction_dict[c])} :"
f" {set(self.errors_before_correction_dict[c])}"
f" ---> {len(self.errors_after_correction_dict[c])} :"
@@ -368,7 +366,7 @@ def force_conversion_to_numeric(
dfinfo_num_only = copy_df_info_with_new_df(df_info, df_info.df)
non_num_convertible_count = 0
if verbose:
logger.info("The values non convertible to numbers are:")
logging.info("The values non convertible to numbers are:")
for col in columns:
# Convert every value to number if possible, the other will be NaN
forced_converted_to_num = pd.to_numeric(df_info.df[col], errors="coerce")
@@ -380,15 +378,15 @@
)
non_num_convertible_count += non_numeric_in_column.sum()
if verbose:
logger.info(
logging.info(
f"{col} -> \n"
f"{df_info.df[col][non_numeric_in_column].value_counts()}"
)
# Set to NaN all the non convertible values
if not dry_run:
dfinfo_num_only.df.loc[:, col] = forced_converted_to_num

logger.info(
logging.info(
"The total count of values non convertible to numbers"
f" is: {non_num_convertible_count}"
)
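
The coercion pattern used throughout this method is worth spelling out: pd.to_numeric with errors="coerce" turns every non-convertible value into NaN, and comparing the before/after NaN masks isolates exactly the offending values. A minimal sketch:

```python
import pandas as pd

col = pd.Series(["1", "2.5", "n/a", "three", None])

forced = pd.to_numeric(col, errors="coerce")  # [1.0, 2.5, NaN, NaN, NaN]

# Present before but NaN after coercion -> non-convertible values.
non_numeric = col.notna() & forced.isna()
print(col[non_numeric].tolist())  # ['n/a', 'three']
print(int(non_numeric.sum()))     # 2
```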