diff --git a/DIRECTORY.md b/DIRECTORY.md index 92691b6..8d7bd0a 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -1 +1 @@ -/home/runner/work/_temp/fcbe3165-95cc-4878-8e84-ef18a16fbf3a.sh: line 1: scripts/build_directory_md.py: Permission denied +/home/runner/work/_temp/8260a9d1-eff9-42f4-b1a3-1bd90558f43e.sh: line 1: scripts/build_directory_md.py: Permission denied diff --git a/pysnippets/Data_preprocessing/data_cleaning.py b/pysnippets/Data_preprocessing/data_cleaning.py index 1f9ec5a..9326729 100644 --- a/pysnippets/Data_preprocessing/data_cleaning.py +++ b/pysnippets/Data_preprocessing/data_cleaning.py @@ -1,13 +1,61 @@ import pandas as pd def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame: + """ + Remove duplicate rows from the DataFrame. + + Args: + - df (pd.DataFrame): The input DataFrame. + + Returns: + - pd.DataFrame: A new DataFrame with duplicates removed. + """ + if df.empty: + print("Warning: The DataFrame is empty.") return df.drop_duplicates() -def replace_missing_with_mean(df: pd.DataFrame, column: str) -> pd.DataFrame: - mean_value = df[column].mean() - df[column].fillna(mean_value, inplace=True) +def replace_missing_with_mean(df: pd.DataFrame, column: str, default_value: float = None) -> pd.DataFrame: + """ + Replace missing values in a specified column with the column's mean or a provided default value. + + Args: + - df (pd.DataFrame): The input DataFrame. + - column (str): The column name where missing values need to be replaced. + - default_value (float, optional): If provided, will replace missing values with this value. + + Returns: + - pd.DataFrame: A new DataFrame with missing values replaced. + + Raises: + - ValueError: If the column does not exist in the DataFrame. + """ + if column not in df.columns: + raise ValueError(f"Column '{column}' does not exist in the DataFrame.") + + if default_value is not None: + df[column].fillna(default_value, inplace=True) + else: + mean_value = df[column].mean() + df[column].fillna(mean_value, inplace=True) + return df def standardize_text(df: pd.DataFrame, column: str) -> pd.DataFrame: + """ + Standardize the text in a specified column by converting it to lowercase and stripping whitespace. + + Args: + - df (pd.DataFrame): The input DataFrame. + - column (str): The column to standardize. + + Returns: + - pd.DataFrame: A new DataFrame with standardized text in the specified column. + + Raises: + - ValueError: If the column does not exist in the DataFrame. + """ + if column not in df.columns: + raise ValueError(f"Column '{column}' does not exist in the DataFrame.") + df[column] = df[column].str.lower().str.strip() - return df \ No newline at end of file + return df diff --git a/pysnippets/Data_preprocessing/data_transformation.py b/pysnippets/Data_preprocessing/data_transformation.py index 662bda5..9934239 100644 --- a/pysnippets/Data_preprocessing/data_transformation.py +++ b/pysnippets/Data_preprocessing/data_transformation.py @@ -2,13 +2,69 @@ import numpy as np def log_transform(df: pd.DataFrame, column: str) -> pd.DataFrame: + """ + Apply a logarithmic transformation (log1p) to a specified column. + The transformation is log(1 + x) to handle zero and positive values. + + Args: + - df (pd.DataFrame): The input DataFrame. + - column (str): The column name to transform. + + Returns: + - pd.DataFrame: A new DataFrame with the transformed column. + + Raises: + - ValueError: If the column does not exist in the DataFrame or contains non-positive values. + """ + if column not in df.columns: + raise ValueError(f"Column '{column}' does not exist in the DataFrame.") + + # Ensure that the values are positive before applying log transformation + if (df[column] <= 0).any(): + raise ValueError(f"Log transformation cannot be applied to non-positive values in column '{column}'.") + df[column] = np.log1p(df[column]) return df def power_transform(df: pd.DataFrame, column: str, power: float = 2.0) -> pd.DataFrame: + """ + Apply a power transformation to a specified column. + + Args: + - df (pd.DataFrame): The input DataFrame. + - column (str): The column name to transform. + - power (float): The power to raise the values to (default is 2.0). + + Returns: + - pd.DataFrame: A new DataFrame with the transformed column. + + Raises: + - ValueError: If the column does not exist in the DataFrame. + """ + if column not in df.columns: + raise ValueError(f"Column '{column}' does not exist in the DataFrame.") + df[column] = np.power(df[column], power) return df def binarize(df: pd.DataFrame, column: str, threshold: float) -> pd.DataFrame: + """ + Binarize a specified column based on a threshold. Values greater than the threshold + are set to 1, and values less than or equal to the threshold are set to 0. + + Args: + - df (pd.DataFrame): The input DataFrame. + - column (str): The column name to binarize. + - threshold (float): The threshold for binarization. + + Returns: + - pd.DataFrame: A new DataFrame with the binarized column. + + Raises: + - ValueError: If the column does not exist in the DataFrame. + """ + if column not in df.columns: + raise ValueError(f"Column '{column}' does not exist in the DataFrame.") + df[column] = (df[column] > threshold).astype(int) - return df \ No newline at end of file + return df diff --git a/pysnippets/Data_preprocessing/encoding.py b/pysnippets/Data_preprocessing/encoding.py index 058d1b1..66e396b 100644 --- a/pysnippets/Data_preprocessing/encoding.py +++ b/pysnippets/Data_preprocessing/encoding.py @@ -1,13 +1,48 @@ import pandas as pd from sklearn.preprocessing import OneHotEncoder, LabelEncoder -def one_hot_encode(df: pd.DataFrame, column: str) -> pd.DataFrame: - encoder = OneHotEncoder(sparse=False, drop='first') +def one_hot_encode(df: pd.DataFrame, column: str, drop_first: bool = True) -> pd.DataFrame: + """ + Perform one-hot encoding on a specified column. + + Args: + - df (pd.DataFrame): The input DataFrame. + - column (str): The column name to encode. + - drop_first (bool): Whether to drop the first category to avoid multicollinearity (default is True). + + Returns: + - pd.DataFrame: A new DataFrame with the one-hot encoded column(s). + + Raises: + - ValueError: If the column does not exist in the DataFrame. + """ + if column not in df.columns: + raise ValueError(f"Column '{column}' does not exist in the DataFrame.") + + encoder = OneHotEncoder(sparse=False, drop='first' if drop_first else None) encoded = encoder.fit_transform(df[[column]]) encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([column])) + + # Concatenate the original dataframe without the encoded column and the encoded DataFrame return pd.concat([df.drop(column, axis=1), encoded_df], axis=1) def label_encode(df: pd.DataFrame, column: str) -> pd.DataFrame: + """ + Perform label encoding on a specified column (converting categories to integer labels). + + Args: + - df (pd.DataFrame): The input DataFrame. + - column (str): The column name to encode. + + Returns: + - pd.DataFrame: A new DataFrame with the label encoded column. + + Raises: + - ValueError: If the column does not exist in the DataFrame. + """ + if column not in df.columns: + raise ValueError(f"Column '{column}' does not exist in the DataFrame.") + encoder = LabelEncoder() df[column] = encoder.fit_transform(df[column]) - return df \ No newline at end of file + return df \ No newline at end of file diff --git a/pysnippets/Data_preprocessing/feature_scaling.py b/pysnippets/Data_preprocessing/feature_scaling.py index fd42f54..cbfe7d3 100644 --- a/pysnippets/Data_preprocessing/feature_scaling.py +++ b/pysnippets/Data_preprocessing/feature_scaling.py @@ -2,11 +2,49 @@ import pandas as pd def standardize_features(df: pd.DataFrame, columns: list) -> pd.DataFrame: + """ + Standardize the features (scale to zero mean and unit variance) for specified columns. + + Args: + - df (pd.DataFrame): The input DataFrame. + - columns (list): List of column names to standardize. + + Returns: + - pd.DataFrame: A new DataFrame with standardized features. + + Raises: + - ValueError: If any of the columns do not exist in the DataFrame. + """ + # Check if columns exist in the DataFrame + missing_cols = [col for col in columns if col not in df.columns] + if missing_cols: + raise ValueError(f"Columns {missing_cols} do not exist in the DataFrame.") + + # Apply StandardScaler scaler = StandardScaler() df[columns] = scaler.fit_transform(df[columns]) return df def min_max_scale_features(df: pd.DataFrame, columns: list) -> pd.DataFrame: + """ + Apply Min-Max scaling (scale to a [0, 1] range) for specified columns. + + Args: + - df (pd.DataFrame): The input DataFrame. + - columns (list): List of column names to scale. + + Returns: + - pd.DataFrame: A new DataFrame with Min-Max scaled features. + + Raises: + - ValueError: If any of the columns do not exist in the DataFrame. + """ + # Check if columns exist in the DataFrame + missing_cols = [col for col in columns if col not in df.columns] + if missing_cols: + raise ValueError(f"Columns {missing_cols} do not exist in the DataFrame.") + + # Apply MinMaxScaler scaler = MinMaxScaler() df[columns] = scaler.fit_transform(df[columns]) - return df \ No newline at end of file + return df diff --git a/pysnippets/Data_preprocessing/missing_values.py b/pysnippets/Data_preprocessing/missing_values.py index 1bca590..c01c362 100644 --- a/pysnippets/Data_preprocessing/missing_values.py +++ b/pysnippets/Data_preprocessing/missing_values.py @@ -2,14 +2,59 @@ from sklearn.impute import SimpleImputer def impute_numeric_with_mean(df: pd.DataFrame, columns: list) -> pd.DataFrame: + """ + Impute missing numeric values in specified columns with the mean value of each column. + + Args: + - df (pd.DataFrame): The input DataFrame. + - columns (list): List of column names to impute. + + Returns: + - pd.DataFrame: A new DataFrame with missing numeric values imputed with the mean. + + Raises: + - ValueError: If any of the columns do not exist in the DataFrame. + """ + missing_cols = [col for col in columns if col not in df.columns] + if missing_cols: + raise ValueError(f"Columns {missing_cols} do not exist in the DataFrame.") + imputer = SimpleImputer(strategy='mean') df[columns] = imputer.fit_transform(df[columns]) return df def impute_categorical_with_mode(df: pd.DataFrame, columns: list) -> pd.DataFrame: + """ + Impute missing categorical values in specified columns with the most frequent value (mode). + + Args: + - df (pd.DataFrame): The input DataFrame. + - columns (list): List of column names to impute. + + Returns: + - pd.DataFrame: A new DataFrame with missing categorical values imputed with the mode. + + Raises: + - ValueError: If any of the columns do not exist in the DataFrame. + """ + missing_cols = [col for col in columns if col not in df.columns] + if missing_cols: + raise ValueError(f"Columns {missing_cols} do not exist in the DataFrame.") + imputer = SimpleImputer(strategy='most_frequent') df[columns] = imputer.fit_transform(df[columns]) return df def drop_missing(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame: - return df.dropna(thresh=int(df.shape[1] * threshold)) \ No newline at end of file + """ + Drop rows with missing values if the number of missing values exceeds a given threshold. + + Args: + - df (pd.DataFrame): The input DataFrame. + - threshold (float): Proportion of non-null values required in a row to keep it. Default is 0.5. + + Returns: + - pd.DataFrame: A new DataFrame with rows dropped based on the missing value threshold. + """ + # Drop rows where the number of non-null values is less than the threshold + return df.dropna(thresh=int(df.shape[1] * threshold)) diff --git a/pysnippets/Data_preprocessing/outlier_detection.py b/pysnippets/Data_preprocessing/outlier_detection.py index aee16ad..d63d55a 100644 --- a/pysnippets/Data_preprocessing/outlier_detection.py +++ b/pysnippets/Data_preprocessing/outlier_detection.py @@ -2,15 +2,52 @@ import numpy as np def remove_outliers_iqr(df: pd.DataFrame, column: str) -> pd.DataFrame: + """ + Remove outliers from a specified column using the Interquartile Range (IQR) method. + + Args: + - df (pd.DataFrame): The input DataFrame. + - column (str): The column name to remove outliers from. + + Returns: + - pd.DataFrame: A new DataFrame with outliers removed based on the IQR method. + + Raises: + - ValueError: If the column does not exist in the DataFrame. + """ + if column not in df.columns: + raise ValueError(f"Column '{column}' does not exist in the DataFrame.") + Q1 = df[column].quantile(0.25) Q3 = df[column].quantile(0.75) IQR = Q3 - Q1 lower_bound = Q1 - 1.5 * IQR upper_bound = Q3 + 1.5 * IQR + + # Remove rows where the column's value is outside the IQR bounds return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)] def remove_outliers_zscore(df: pd.DataFrame, column: str, threshold: float = 3.0) -> pd.DataFrame: + """ + Remove outliers from a specified column using the Z-Score method. + + Args: + - df (pd.DataFrame): The input DataFrame. + - column (str): The column name to remove outliers from. + - threshold (float): The Z-score threshold to identify outliers (default is 3.0). + + Returns: + - pd.DataFrame: A new DataFrame with outliers removed based on the Z-Score method. + + Raises: + - ValueError: If the column does not exist in the DataFrame. + """ + if column not in df.columns: + raise ValueError(f"Column '{column}' does not exist in the DataFrame.") + mean = df[column].mean() std = df[column].std() z_scores = (df[column] - mean) / std - return df[np.abs(z_scores) <= threshold] \ No newline at end of file + + # Remove rows where the absolute Z-Score exceeds the threshold + return df[np.abs(z_scores) <= threshold] diff --git a/pysnippets/Data_preprocessing/test_data_cleaning.py b/pysnippets/Data_preprocessing/test_data_cleaning.py index 25a507f..143575a 100644 --- a/pysnippets/Data_preprocessing/test_data_cleaning.py +++ b/pysnippets/Data_preprocessing/test_data_cleaning.py @@ -4,6 +4,7 @@ class TestDataCleaning(unittest.TestCase): def setUp(self): + # Set up a sample DataFrame for testing self.df = pd.DataFrame({ 'Name': ['Alice', 'Bob', 'Alice', 'Charlie'], 'Age': [25, 30, 25, None], @@ -11,16 +12,41 @@ def setUp(self): }) def test_remove_duplicates(self): + # Test that duplicates are removed cleaned_df = remove_duplicates(self.df) - self.assertEqual(len(cleaned_df), 3) + self.assertEqual(len(cleaned_df), 3, "Duplicates were not removed correctly.") + self.assertTrue('Alice' in cleaned_df['Name'].values, "Duplicate for 'Alice' was not removed.") def test_replace_missing_with_mean(self): + # Test replacing missing values with the mean cleaned_df = replace_missing_with_mean(self.df, 'Age') - self.assertEqual(cleaned_df.loc[3, 'Age'], 26.666666666666668) - + expected_mean = self.df['Age'].mean() + self.assertEqual(cleaned_df.loc[3, 'Age'], expected_mean, "Missing value was not replaced correctly with the mean.") + def test_standardize_text(self): + # Test standardizing text (lowercase and stripped) standardized_df = standardize_text(self.df, 'Name') - self.assertTrue(standardized_df['Name'].str.islower().all()) + self.assertTrue(standardized_df['Name'].str.islower().all(), "Text was not standardized to lowercase.") + self.assertFalse(standardized_df['Name'].str.contains(' ').any(), "Text was not stripped of whitespace.") + + def test_replace_missing_with_mean_no_missing_values(self): + # Test replacing missing values when there are no missing values + df_no_missing = self.df.copy() + df_no_missing['Age'].iloc[3] = 29 # No missing value in 'Age' + cleaned_df = replace_missing_with_mean(df_no_missing, 'Age') + self.assertEqual(cleaned_df.loc[3, 'Age'], 29, "The value should remain unchanged when there is no missing value.") + + def test_remove_duplicates_empty_df(self): + # Test that no error is raised when removing duplicates from an empty DataFrame + empty_df = pd.DataFrame(columns=self.df.columns) + cleaned_df = remove_duplicates(empty_df) + self.assertEqual(len(cleaned_df), 0, "Removing duplicates from an empty DataFrame failed.") + + def test_standardize_text_empty_column(self): + # Test standardizing text for an empty column + empty_df = pd.DataFrame({'Name': []}) + cleaned_df = standardize_text(empty_df, 'Name') + self.assertEqual(len(cleaned_df), 0, "Standardizing text on an empty column failed.") if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/pysnippets/Data_preprocessing/test_data_transformation.py b/pysnippets/Data_preprocessing/test_data_transformation.py index 188b9cd..efcdc00 100644 --- a/pysnippets/Data_preprocessing/test_data_transformation.py +++ b/pysnippets/Data_preprocessing/test_data_transformation.py @@ -5,24 +5,65 @@ class TestDataTransformation(unittest.TestCase): def setUp(self): + # Set up a sample DataFrame for testing self.df = pd.DataFrame({ 'Views': [100, 150, 200, 250, 300] }) def test_log_transform(self): + # Test Log Transformation (log1p applied) transformed_df = log_transform(self.df.copy(), 'Views') expected = np.log1p([100, 150, 200, 250, 300]) - pd.testing.assert_series_equal(transformed_df['Views'], pd.Series(expected)) + pd.testing.assert_series_equal(transformed_df['Views'], pd.Series(expected), check_dtype=False) def test_power_transform(self): + # Test Power Transformation (square by default) transformed_df = power_transform(self.df.copy(), 'Views', power=2) expected = [100**2, 150**2, 200**2, 250**2, 300**2] - pd.testing.assert_series_equal(transformed_df['Views'], pd.Series(expected)) + pd.testing.assert_series_equal(transformed_df['Views'], pd.Series(expected), check_dtype=False) def test_binarize(self): + # Test Binarization (threshold at 200) transformed_df = binarize(self.df.copy(), 'Views', threshold=200) expected = [0, 0, 1, 1, 1] - pd.testing.assert_series_equal(transformed_df['Views'], pd.Series(expected)) + pd.testing.assert_series_equal(transformed_df['Views'], pd.Series(expected), check_dtype=False) + + def test_log_transform_edge_case(self): + # Test Log Transformation with 0 (should handle log(0) = -inf gracefully) + df_zero = pd.DataFrame({'Views': [0, 1, 2]}) + transformed_df = log_transform(df_zero, 'Views') + expected = np.log1p([0, 1, 2]) + pd.testing.assert_series_equal(transformed_df['Views'], pd.Series(expected), check_dtype=False) + + def test_power_transform_negative_power(self): + # Test Power Transformation with negative power (should work with fractional values) + transformed_df = power_transform(self.df.copy(), 'Views', power=-2) + expected = [100**-2, 150**-2, 200**-2, 250**-2, 300**-2] + pd.testing.assert_series_equal(transformed_df['Views'], pd.Series(expected), check_dtype=False) + + def test_binarize_edge_case(self): + # Test Binarization with threshold lower than all values + transformed_df = binarize(self.df.copy(), 'Views', threshold=50) + expected = [1, 1, 1, 1, 1] # All values should be 1 since threshold is 50 + pd.testing.assert_series_equal(transformed_df['Views'], pd.Series(expected), check_dtype=False) + + def test_empty_dataframe(self): + # Test transformations on an empty DataFrame + empty_df = pd.DataFrame({'Views': []}) + transformed_df = log_transform(empty_df, 'Views') + self.assertTrue(transformed_df.empty, "The transformed DataFrame should be empty.") + + transformed_df = power_transform(empty_df, 'Views', power=2) + self.assertTrue(transformed_df.empty, "The transformed DataFrame should be empty.") + + transformed_df = binarize(empty_df, 'Views', threshold=200) + self.assertTrue(transformed_df.empty, "The transformed DataFrame should be empty.") + + def test_binarize_non_numeric(self): + # Test Binarization on a non-numeric column (should raise error) + df_non_numeric = pd.DataFrame({'Views': ['a', 'b', 'c', 'd', 'e']}) + with self.assertRaises(TypeError): + binarize(df_non_numeric, 'Views', threshold=200) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/pysnippets/Data_preprocessing/test_encoding.py b/pysnippets/Data_preprocessing/test_encoding.py index d4fc8c8..772f334 100644 --- a/pysnippets/Data_preprocessing/test_encoding.py +++ b/pysnippets/Data_preprocessing/test_encoding.py @@ -4,20 +4,58 @@ class TestEncoding(unittest.TestCase): def setUp(self): + # Set up a sample DataFrame for testing self.df = pd.DataFrame({ 'Color': ['Red', 'Blue', 'Green', 'Blue'] }) def test_one_hot_encode(self): + # Test One-Hot Encoding encoded_df = one_hot_encode(self.df, 'Color') - self.assertIn('Color_Green', encoded_df.columns) - self.assertIn('Color_Red', encoded_df.columns) - self.assertNotIn('Color', encoded_df.columns) + self.assertIn('Color_Green', encoded_df.columns, "Encoded DataFrame should contain 'Color_Green'.") + self.assertIn('Color_Red', encoded_df.columns, "Encoded DataFrame should contain 'Color_Red'.") + self.assertNotIn('Color', encoded_df.columns, "Original 'Color' column should be dropped.") + self.assertEqual(encoded_df.shape[1], 3, "The number of columns after one-hot encoding is incorrect.") def test_label_encode(self): + # Test Label Encoding encoded_df = label_encode(self.df.copy(), 'Color') - self.assertTrue(encoded_df['Color'].dtype == int) - self.assertEqual(encoded_df['Color'].min(), 0) + self.assertTrue(encoded_df['Color'].dtype == int, "Encoded column should have integer dtype.") + self.assertEqual(encoded_df['Color'].min(), 0, "The minimum label value should be 0.") + self.assertEqual(encoded_df['Color'].max(), 2, "The maximum label value should be 2 (for 'Red', 'Blue', 'Green').") + + def test_one_hot_encode_empty_df(self): + # Test One-Hot Encoding with an empty DataFrame + empty_df = pd.DataFrame(columns=['Color']) + encoded_df = one_hot_encode(empty_df, 'Color') + self.assertTrue(encoded_df.empty, "The encoded DataFrame should be empty when the input is empty.") + + def test_label_encode_empty_df(self): + # Test Label Encoding with an empty DataFrame + empty_df = pd.DataFrame(columns=['Color']) + encoded_df = label_encode(empty_df, 'Color') + self.assertTrue(encoded_df.empty, "The encoded DataFrame should be empty when the input is empty.") + + def test_one_hot_encode_single_unique_value(self): + # Test One-Hot Encoding when there is only one unique value in the column + single_value_df = pd.DataFrame({'Color': ['Red', 'Red', 'Red', 'Red']}) + encoded_df = one_hot_encode(single_value_df, 'Color') + self.assertIn('Color_Red', encoded_df.columns, "Encoded DataFrame should contain 'Color_Red'.") + self.assertEqual(encoded_df.shape[1], 1, "The number of columns after one-hot encoding with a single value should be 1.") + + def test_label_encode_single_unique_value(self): + # Test Label Encoding when there is only one unique value in the column + single_value_df = pd.DataFrame({'Color': ['Red', 'Red', 'Red', 'Red']}) + encoded_df = label_encode(single_value_df, 'Color') + self.assertEqual(encoded_df['Color'].min(), 0, "The encoded value should be 0 for a single unique value.") + self.assertEqual(encoded_df['Color'].max(), 0, "The encoded value should be 0 for a single unique value.") + + def test_label_encode_non_string_column(self): + # Test Label Encoding with a non-string column (e.g., numeric) + numeric_df = pd.DataFrame({'Color': [1, 2, 1, 3]}) + encoded_df = label_encode(numeric_df, 'Color') + self.assertTrue(encoded_df['Color'].dtype == int, "Encoded column should have integer dtype.") + self.assertEqual(encoded_df['Color'].min(), 0, "The minimum label value should be 0.") if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/pysnippets/Data_preprocessing/test_feature_scaling.py b/pysnippets/Data_preprocessing/test_feature_scaling.py index 77db021..619cd29 100644 --- a/pysnippets/Data_preprocessing/test_feature_scaling.py +++ b/pysnippets/Data_preprocessing/test_feature_scaling.py @@ -4,20 +4,64 @@ class TestFeatureScaling(unittest.TestCase): def setUp(self): + # Set up a sample DataFrame for testing self.df = pd.DataFrame({ 'Height': [150, 160, 170, 180], 'Weight': [50, 60, 70, 80] }) def test_standardize_features(self): + # Test Standardization (mean should be 0 and std should be 1) scaled_df = standardize_features(self.df.copy(), ['Height', 'Weight']) - self.assertAlmostEqual(scaled_df['Height'].mean(), 0.0, places=5) - self.assertAlmostEqual(scaled_df['Weight'].mean(), 0.0, places=5) + self.assertAlmostEqual(scaled_df['Height'].mean(), 0.0, places=5, msg="Height column mean is not close to 0 after standardization.") + self.assertAlmostEqual(scaled_df['Weight'].mean(), 0.0, places=5, msg="Weight column mean is not close to 0 after standardization.") + self.assertAlmostEqual(scaled_df['Height'].std(), 1.0, places=5, msg="Height column standard deviation is not 1 after standardization.") + self.assertAlmostEqual(scaled_df['Weight'].std(), 1.0, places=5, msg="Weight column standard deviation is not 1 after standardization.") def test_min_max_scale_features(self): + # Test Min-Max Scaling (values should be in the range [0, 1]) scaled_df = min_max_scale_features(self.df.copy(), ['Height', 'Weight']) - self.assertEqual(scaled_df['Height'].min(), 0.0) - self.assertEqual(scaled_df['Weight'].max(), 1.0) + self.assertEqual(scaled_df['Height'].min(), 0.0, "Min value of Height after Min-Max scaling should be 0.") + self.assertEqual(scaled_df['Height'].max(), 1.0, "Max value of Height after Min-Max scaling should be 1.") + self.assertEqual(scaled_df['Weight'].min(), 0.0, "Min value of Weight after Min-Max scaling should be 0.") + self.assertEqual(scaled_df['Weight'].max(), 1.0, "Max value of Weight after Min-Max scaling should be 1.") + + def test_standardize_empty_df(self): + # Test Standardization with an empty DataFrame + empty_df = pd.DataFrame(columns=['Height', 'Weight']) + scaled_df = standardize_features(empty_df, ['Height', 'Weight']) + self.assertTrue(scaled_df.empty, "The scaled DataFrame should be empty for an empty input.") + + def test_min_max_scale_empty_df(self): + # Test Min-Max Scaling with an empty DataFrame + empty_df = pd.DataFrame(columns=['Height', 'Weight']) + scaled_df = min_max_scale_features(empty_df, ['Height', 'Weight']) + self.assertTrue(scaled_df.empty, "The scaled DataFrame should be empty for an empty input.") + + def test_standardize_single_value_column(self): + # Test Standardization with a column that has only one unique value + single_value_df = pd.DataFrame({'Height': [170, 170, 170, 170], 'Weight': [70, 70, 70, 70]}) + scaled_df = standardize_features(single_value_df, ['Height', 'Weight']) + self.assertEqual(scaled_df['Height'].std(), 0.0, "The standard deviation of a single-value column should be 0.") + self.assertEqual(scaled_df['Weight'].std(), 0.0, "The standard deviation of a single-value column should be 0.") + self.assertEqual(scaled_df['Height'].mean(), 0.0, "The mean should be 0 for a column with a single unique value after standardization.") + self.assertEqual(scaled_df['Weight'].mean(), 0.0, "The mean should be 0 for a column with a single unique value after standardization.") + + def test_min_max_scale_single_value_column(self): + # Test Min-Max Scaling with a column that has only one unique value + single_value_df = pd.DataFrame({'Height': [170, 170, 170, 170], 'Weight': [70, 70, 70, 70]}) + scaled_df = min_max_scale_features(single_value_df, ['Height', 'Weight']) + self.assertEqual(scaled_df['Height'].min(), 0.0, "Min value should be 0 when there is only one unique value.") + self.assertEqual(scaled_df['Height'].max(), 0.0, "Max value should be 0 when there is only one unique value.") + self.assertEqual(scaled_df['Weight'].min(), 0.0, "Min value should be 0 when there is only one unique value.") + self.assertEqual(scaled_df['Weight'].max(), 0.0, "Max value should be 0 when there is only one unique value.") + + def test_standardize_with_negative_values(self): + # Test Standardization with negative values + negative_df = pd.DataFrame({'Height': [-150, -160, -170, -180], 'Weight': [-50, -60, -70, -80]}) + scaled_df = standardize_features(negative_df, ['Height', 'Weight']) + self.assertAlmostEqual(scaled_df['Height'].mean(), 0.0, places=5, msg="Mean of negative values after standardization should be 0.") + self.assertAlmostEqual(scaled_df['Weight'].mean(), 0.0, places=5, msg="Mean of negative values after standardization should be 0.") if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/pysnippets/Data_preprocessing/test_missing_values.py b/pysnippets/Data_preprocessing/test_missing_values.py index f8224da..ec53c77 100644 --- a/pysnippets/Data_preprocessing/test_missing_values.py +++ b/pysnippets/Data_preprocessing/test_missing_values.py @@ -24,5 +24,51 @@ def test_drop_missing(self): cleaned_df = drop_missing(self.df.copy(), threshold=0.67) self.assertEqual(len(cleaned_df), 3) + def test_impute_numeric_with_mean_all_missing(self): + # All values are missing in the column 'Income', check imputation with mean. + df_all_missing = pd.DataFrame({ + 'Age': [25, np.nan, 30, 22], + 'Gender': ['M', 'F', np.nan, 'F'], + 'Income': [np.nan, np.nan, np.nan, np.nan] + }) + imputed_df = impute_numeric_with_mean(df_all_missing, ['Income']) + self.assertAlmostEqual(imputed_df['Income'].iloc[0], np.nan) + + def test_drop_missing_empty_df(self): + # Test for an empty DataFrame + empty_df = pd.DataFrame(columns=['Age', 'Gender', 'Income']) + cleaned_df = drop_missing(empty_df, threshold=0.5) + self.assertTrue(cleaned_df.empty, "Empty DataFrame should remain empty.") + + def test_impute_categorical_with_mode_empty(self): + # Test impute categorical with mode for empty column 'Gender' + df_empty_cat = pd.DataFrame({ + 'Age': [25, np.nan, 30, 22], + 'Gender': [np.nan, np.nan, np.nan, np.nan], + 'Income': [50000, 60000, np.nan, 52000] + }) + imputed_df = impute_categorical_with_mode(df_empty_cat, ['Gender']) + self.assertEqual(imputed_df['Gender'].isna().sum(), 0) + + def test_drop_missing_threshold_1(self): + # Test with a threshold of 1 (i.e., drop rows with any missing values) + df_with_missing = pd.DataFrame({ + 'Age': [25, np.nan, 30, 22], + 'Gender': ['M', 'F', np.nan, 'F'], + 'Income': [50000, 60000, np.nan, 52000] + }) + cleaned_df = drop_missing(df_with_missing, threshold=1) + self.assertEqual(len(cleaned_df), 0, "All rows should be dropped when threshold is 1.0.") + + def test_drop_missing_no_missing(self): + # Test for no missing values in the DataFrame + df_no_missing = pd.DataFrame({ + 'Age': [25, 30, 30, 22], + 'Gender': ['M', 'F', 'M', 'F'], + 'Income': [50000, 60000, 65000, 52000] + }) + cleaned_df = drop_missing(df_no_missing, threshold=0.67) + self.assertEqual(len(cleaned_df), 4, "No rows should be dropped when there are no missing values.") + if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/pysnippets/Data_preprocessing/test_outlier_detection.py b/pysnippets/Data_preprocessing/test_outlier_detection.py index 15831cd..0d4e7c5 100644 --- a/pysnippets/Data_preprocessing/test_outlier_detection.py +++ b/pysnippets/Data_preprocessing/test_outlier_detection.py @@ -4,17 +4,64 @@ class TestOutlierDetection(unittest.TestCase): def setUp(self): + # A sample DataFrame with salary data, including an outlier (120000) self.df = pd.DataFrame({ 'Salary': [50000, 60000, 55000, 120000, 58000, 59000, 61000] }) def test_remove_outliers_iqr(self): + # Remove outliers using IQR method cleaned_df = remove_outliers_iqr(self.df, 'Salary') + + # Ensure that the outlier (120000) is removed self.assertFalse((cleaned_df['Salary'] == 120000).any()) + + # Check if the length of the DataFrame is reduced by one (1 outlier should be removed) + self.assertEqual(len(cleaned_df), len(self.df) - 1, "One row should be removed when outlier is detected.") def test_remove_outliers_zscore(self): + # Remove outliers using Z-score method (with a threshold of 2) cleaned_df = remove_outliers_zscore(self.df, 'Salary', threshold=2) + + # Ensure that the outlier (120000) is removed self.assertFalse((cleaned_df['Salary'] == 120000).any()) + + # Check if the length of the DataFrame is reduced by one (1 outlier should be removed) + self.assertEqual(len(cleaned_df), len(self.df) - 1, "One row should be removed when outlier is detected.") + + def test_no_outliers(self): + # A DataFrame with no outliers + df_no_outliers = pd.DataFrame({ + 'Salary': [50000, 60000, 55000, 58000, 59000, 61000] + }) + + # Check if the length remains the same + cleaned_df = remove_outliers_iqr(df_no_outliers, 'Salary') + self.assertEqual(len(cleaned_df), len(df_no_outliers), "No rows should be removed if there are no outliers.") + + # Also test using Z-score method with a threshold of 2 + cleaned_df = remove_outliers_zscore(df_no_outliers, 'Salary', threshold=2) + self.assertEqual(len(cleaned_df), len(df_no_outliers), "No rows should be removed if there are no outliers.") + + def test_multiple_outliers(self): + # Data with multiple outliers + df_multiple_outliers = pd.DataFrame({ + 'Salary': [50000, 60000, 55000, 120000, 130000, 140000, 61000] + }) + + # Remove outliers using IQR method + cleaned_df = remove_outliers_iqr(df_multiple_outliers, 'Salary') + self.assertNotIn(120000, cleaned_df['Salary'].values) + self.assertNotIn(130000, cleaned_df['Salary'].values) + self.assertNotIn(140000, cleaned_df['Salary'].values) + self.assertEqual(len(cleaned_df), len(df_multiple_outliers) - 3, "Three outliers should be removed.") + + # Remove outliers using Z-score method + cleaned_df = remove_outliers_zscore(df_multiple_outliers, 'Salary', threshold=2) + self.assertNotIn(120000, cleaned_df['Salary'].values) + self.assertNotIn(130000, cleaned_df['Salary'].values) + self.assertNotIn(140000, cleaned_df['Salary'].values) + self.assertEqual(len(cleaned_df), len(df_multiple_outliers) - 3, "Three outliers should be removed.") if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main()