Merge pull request #474 from smog-root/data_processing

Enhance Outlier Detection Test Coverage and Edge Case Handling
UTSAVS26 · Jan 7, 2025 · 13718c9 · 13718c9
2 parents b7b352b + 5f7df50
commit 13718c9
Show file tree

Hide file tree

Showing 13 changed files with 535 additions and 34 deletions.
diff --git a/DIRECTORY.md b/DIRECTORY.md
@@ -1 +1 @@
-/home/runner/work/_temp/fcbe3165-95cc-4878-8e84-ef18a16fbf3a.sh: line 1: scripts/build_directory_md.py: Permission denied
+/home/runner/work/_temp/8260a9d1-eff9-42f4-b1a3-1bd90558f43e.sh: line 1: scripts/build_directory_md.py: Permission denied
diff --git a/pysnippets/Data_preprocessing/data_cleaning.py b/pysnippets/Data_preprocessing/data_cleaning.py
@@ -1,13 +1,61 @@
 import pandas as pd
 
 def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Remove duplicate rows from the DataFrame.
+    
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    
+    Returns:
+    - pd.DataFrame: A new DataFrame with duplicates removed.
+    """
+    if df.empty:
+        print("Warning: The DataFrame is empty.")
     return df.drop_duplicates()
 
-def replace_missing_with_mean(df: pd.DataFrame, column: str) -> pd.DataFrame:
-    mean_value = df[column].mean()
-    df[column].fillna(mean_value, inplace=True)
+def replace_missing_with_mean(df: pd.DataFrame, column: str, default_value: float = None) -> pd.DataFrame:
+    """
+    Replace missing values in a specified column with the column's mean or a provided default value.
+    
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - column (str): The column name where missing values need to be replaced.
+    - default_value (float, optional): If provided, will replace missing values with this value.
+    
+    Returns:
+    - pd.DataFrame: The same DataFrame object that was passed in (no copy is made), with the column filled via an in-place fillna.
+    
+    Raises:
+    - ValueError: If the column does not exist in the DataFrame.
+    """
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
+    if default_value is not None:
+        df[column].fillna(default_value, inplace=True)
+    else:
+        mean_value = df[column].mean()
+        df[column].fillna(mean_value, inplace=True)
+
     return df
 
 def standardize_text(df: pd.DataFrame, column: str) -> pd.DataFrame:
+    """
+    Standardize the text in a specified column by converting it to lowercase and stripping whitespace.
+    
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - column (str): The column to standardize.
+    
+    Returns:
+    - pd.DataFrame: The input DataFrame itself (modified in place, not a copy) with standardized text in the specified column.
+    
+    Raises:
+    - ValueError: If the column does not exist in the DataFrame.
+    """
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
     df[column] = df[column].str.lower().str.strip()
-    return df 
+    return df
diff --git a/pysnippets/Data_preprocessing/data_transformation.py b/pysnippets/Data_preprocessing/data_transformation.py
@@ -2,13 +2,69 @@
 import numpy as np
 
 def log_transform(df: pd.DataFrame, column: str) -> pd.DataFrame:
+    """
+    Apply a logarithmic transformation (log1p) to a specified column.
+    The transformation is log(1 + x); every value in the column must be strictly positive (zeros and negatives raise ValueError).
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - column (str): The column name to transform.
+
+    Returns:
+    - pd.DataFrame: The input DataFrame itself (modified in place, not a copy) with the transformed column.
+
+    Raises:
+    - ValueError: If the column does not exist in the DataFrame or contains non-positive values.
+    """
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
+    # Ensure that the values are positive before applying log transformation
+    if (df[column] <= 0).any():
+        raise ValueError(f"Log transformation cannot be applied to non-positive values in column '{column}'.")
+
     df[column] = np.log1p(df[column])
     return df
 
 def power_transform(df: pd.DataFrame, column: str, power: float = 2.0) -> pd.DataFrame:
+    """
+    Apply a power transformation to a specified column.
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - column (str): The column name to transform.
+    - power (float): The power to raise the values to (default is 2.0).
+
+    Returns:
+    - pd.DataFrame: The input DataFrame itself (modified in place, not a copy) with the transformed column.
+
+    Raises:
+    - ValueError: If the column does not exist in the DataFrame.
+    """
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
     df[column] = np.power(df[column], power)
     return df
 
 def binarize(df: pd.DataFrame, column: str, threshold: float) -> pd.DataFrame:
+    """
+    Binarize a specified column based on a threshold. Values greater than the threshold
+    are set to 1, and values less than or equal to the threshold are set to 0.
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - column (str): The column name to binarize.
+    - threshold (float): The threshold for binarization.
+
+    Returns:
+    - pd.DataFrame: The input DataFrame itself (modified in place, not a copy) with the binarized column.
+
+    Raises:
+    - ValueError: If the column does not exist in the DataFrame.
+    """
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
     df[column] = (df[column] > threshold).astype(int)
-    return df 
+    return df
diff --git a/pysnippets/Data_preprocessing/encoding.py b/pysnippets/Data_preprocessing/encoding.py
@@ -1,13 +1,48 @@
 import pandas as pd
 from sklearn.preprocessing import OneHotEncoder, LabelEncoder
 
-def one_hot_encode(df: pd.DataFrame, column: str) -> pd.DataFrame:
-    encoder = OneHotEncoder(sparse=False, drop='first')
+def one_hot_encode(df: pd.DataFrame, column: str, drop_first: bool = True) -> pd.DataFrame:
+    """
+    Perform one-hot encoding on a specified column.
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - column (str): The column name to encode.
+    - drop_first (bool): Whether to drop the first category to avoid multicollinearity (default is True).
+
+    Returns:
+    - pd.DataFrame: A new DataFrame with the one-hot encoded column(s).
+    
+    Raises:
+    - ValueError: If the column does not exist in the DataFrame.
+    """
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
+    encoder = OneHotEncoder(sparse=False, drop='first' if drop_first else None)
     encoded = encoder.fit_transform(df[[column]])
     encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([column]))
+
+    # Concatenate the original dataframe without the encoded column and the encoded DataFrame
     return pd.concat([df.drop(column, axis=1), encoded_df], axis=1)
 
 def label_encode(df: pd.DataFrame, column: str) -> pd.DataFrame:
+    """
+    Perform label encoding on a specified column (converting categories to integer labels).
+    
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - column (str): The column name to encode.
+    
+    Returns:
+    - pd.DataFrame: The input DataFrame itself (modified in place, not a copy) with the label encoded column.
+    
+    Raises:
+    - ValueError: If the column does not exist in the DataFrame.
+    """
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
     encoder = LabelEncoder()
     df[column] = encoder.fit_transform(df[column])
-    return df 
+    return df
diff --git a/pysnippets/Data_preprocessing/feature_scaling.py b/pysnippets/Data_preprocessing/feature_scaling.py
@@ -2,11 +2,49 @@
 import pandas as pd
 
 def standardize_features(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    """
+    Standardize the features (scale to zero mean and unit variance) for specified columns.
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - columns (list): List of column names to standardize.
+
+    Returns:
+    - pd.DataFrame: The input DataFrame itself (modified in place, not a copy) with standardized features.
+
+    Raises:
+    - ValueError: If any of the columns do not exist in the DataFrame.
+    """
+    # Check if columns exist in the DataFrame
+    missing_cols = [col for col in columns if col not in df.columns]
+    if missing_cols:
+        raise ValueError(f"Columns {missing_cols} do not exist in the DataFrame.")
+
+    # Apply StandardScaler
     scaler = StandardScaler()
     df[columns] = scaler.fit_transform(df[columns])
     return df
 
 def min_max_scale_features(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    """
+    Apply Min-Max scaling (scale to a [0, 1] range) for specified columns.
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - columns (list): List of column names to scale.
+
+    Returns:
+    - pd.DataFrame: The input DataFrame itself (modified in place, not a copy) with Min-Max scaled features.
+
+    Raises:
+    - ValueError: If any of the columns do not exist in the DataFrame.
+    """
+    # Check if columns exist in the DataFrame
+    missing_cols = [col for col in columns if col not in df.columns]
+    if missing_cols:
+        raise ValueError(f"Columns {missing_cols} do not exist in the DataFrame.")
+
+    # Apply MinMaxScaler
     scaler = MinMaxScaler()
     df[columns] = scaler.fit_transform(df[columns])
-    return df 
+    return df
diff --git a/pysnippets/Data_preprocessing/missing_values.py b/pysnippets/Data_preprocessing/missing_values.py
@@ -2,14 +2,59 @@
 from sklearn.impute import SimpleImputer
 
 def impute_numeric_with_mean(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    """
+    Impute missing numeric values in specified columns with the mean value of each column.
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - columns (list): List of column names to impute.
+
+    Returns:
+    - pd.DataFrame: The input DataFrame itself (modified in place, not a copy) with missing numeric values imputed with the mean.
+
+    Raises:
+    - ValueError: If any of the columns do not exist in the DataFrame.
+    """
+    missing_cols = [col for col in columns if col not in df.columns]
+    if missing_cols:
+        raise ValueError(f"Columns {missing_cols} do not exist in the DataFrame.")
+
     imputer = SimpleImputer(strategy='mean')
     df[columns] = imputer.fit_transform(df[columns])
     return df
 
 def impute_categorical_with_mode(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    """
+    Impute missing categorical values in specified columns with the most frequent value (mode).
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - columns (list): List of column names to impute.
+
+    Returns:
+    - pd.DataFrame: The input DataFrame itself (modified in place, not a copy) with missing categorical values imputed with the mode.
+
+    Raises:
+    - ValueError: If any of the columns do not exist in the DataFrame.
+    """
+    missing_cols = [col for col in columns if col not in df.columns]
+    if missing_cols:
+        raise ValueError(f"Columns {missing_cols} do not exist in the DataFrame.")
+
     imputer = SimpleImputer(strategy='most_frequent')
     df[columns] = imputer.fit_transform(df[columns])
     return df
 
 def drop_missing(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
-    return df.dropna(thresh=int(df.shape[1] * threshold)) 
+    """
+    Drop rows that have too few non-null values, i.e. fewer than int(number_of_columns * threshold).
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - threshold (float): Proportion of non-null values required in a row to keep it. Default is 0.5.
+
+    Returns:
+    - pd.DataFrame: A new DataFrame with rows dropped based on the missing value threshold.
+    """
+    # Keep only rows that have at least int(n_columns * threshold) non-null values
+    return df.dropna(thresh=int(df.shape[1] * threshold))
diff --git a/pysnippets/Data_preprocessing/outlier_detection.py b/pysnippets/Data_preprocessing/outlier_detection.py
@@ -2,15 +2,52 @@
 import numpy as np
 
 def remove_outliers_iqr(df: pd.DataFrame, column: str) -> pd.DataFrame:
+    """
+    Remove outliers from a specified column using the Interquartile Range (IQR) method.
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - column (str): The column name to remove outliers from.
+
+    Returns:
+    - pd.DataFrame: A new DataFrame with outliers removed based on the IQR method.
+    
+    Raises:
+    - ValueError: If the column does not exist in the DataFrame.
+    """
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
     Q1 = df[column].quantile(0.25)
     Q3 = df[column].quantile(0.75)
     IQR = Q3 - Q1
     lower_bound = Q1 - 1.5 * IQR
     upper_bound = Q3 + 1.5 * IQR
+
+    # Remove rows where the column's value is outside the IQR bounds
     return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
 
 def remove_outliers_zscore(df: pd.DataFrame, column: str, threshold: float = 3.0) -> pd.DataFrame:
+    """
+    Remove outliers from a specified column using the Z-Score method.
+
+    Args:
+    - df (pd.DataFrame): The input DataFrame.
+    - column (str): The column name to remove outliers from.
+    - threshold (float): The Z-score threshold to identify outliers (default is 3.0).
+
+    Returns:
+    - pd.DataFrame: A new DataFrame with outliers removed based on the Z-Score method.
+    
+    Raises:
+    - ValueError: If the column does not exist in the DataFrame.
+    """
+    if column not in df.columns:
+        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
+
     mean = df[column].mean()
     std = df[column].std()
     z_scores = (df[column] - mean) / std
-    return df[np.abs(z_scores) <= threshold] 
+
+    # Remove rows where the absolute Z-Score exceeds the threshold
+    return df[np.abs(z_scores) <= threshold]
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		/home/runner/work/_temp/fcbe3165-95cc-4878-8e84-ef18a16fbf3a.sh: line 1: scripts/build_directory_md.py: Permission denied
		/home/runner/work/_temp/8260a9d1-eff9-42f4-b1a3-1bd90558f43e.sh: line 1: scripts/build_directory_md.py: Permission denied