Skip to content

Commit

Permalink
Merge pull request #474 from smog-root/data_processing
Browse files Browse the repository at this point in the history
Enhance Outlier Detection Test Coverage and Edge Case Handling
  • Loading branch information
UTSAVS26 authored Jan 7, 2025
2 parents b7b352b + 5f7df50 commit 13718c9
Show file tree
Hide file tree
Showing 13 changed files with 535 additions and 34 deletions.
2 changes: 1 addition & 1 deletion DIRECTORY.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
/home/runner/work/_temp/fcbe3165-95cc-4878-8e84-ef18a16fbf3a.sh: line 1: scripts/build_directory_md.py: Permission denied
/home/runner/work/_temp/8260a9d1-eff9-42f4-b1a3-1bd90558f43e.sh: line 1: scripts/build_directory_md.py: Permission denied
56 changes: 52 additions & 4 deletions pysnippets/Data_preprocessing/data_cleaning.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,61 @@
import pandas as pd

def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop repeated rows from a DataFrame.

    An empty input is not treated as an error: a warning is printed and the
    (still empty) de-duplicated frame is returned.

    Args:
    - df (pd.DataFrame): The input DataFrame.

    Returns:
    - pd.DataFrame: The result of ``df.drop_duplicates()``; ``df`` itself is left untouched.
    """
    is_empty = df.empty
    if is_empty:
        print("Warning: The DataFrame is empty.")

    unique_rows = df.drop_duplicates()
    return unique_rows

def replace_missing_with_mean(df: pd.DataFrame, column: str, default_value: float = None) -> pd.DataFrame:
    """
    Replace missing values in a column with a default value or the column's mean.

    The filled column is assigned back onto ``df``, so the caller's DataFrame
    is updated and that same object is returned.

    Args:
    - df (pd.DataFrame): The input DataFrame (modified in place).
    - column (str): The column name where missing values need to be replaced.
    - default_value (float, optional): If provided, missing values are replaced with
      this value; otherwise the mean of the column is used.

    Returns:
    - pd.DataFrame: The same DataFrame, with missing values in ``column`` replaced.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    fill_value = default_value if default_value is not None else df[column].mean()

    # Assign the result back instead of calling fillna(inplace=True) on df[column]:
    # the chained in-place form acts on a temporary Series, which emits a
    # FutureWarning on pandas 2.x and leaves df unchanged under copy-on-write.
    df[column] = df[column].fillna(fill_value)
    return df

def standardize_text(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Standardize the text in a column by lowercasing it and stripping surrounding whitespace.

    The cleaned column is assigned back onto ``df``, so the caller's DataFrame
    is updated and that same object is returned.

    Args:
    - df (pd.DataFrame): The input DataFrame (modified in place).
    - column (str): The column to standardize.

    Returns:
    - pd.DataFrame: The same DataFrame, with standardized text in ``column``.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    df[column] = df[column].str.lower().str.strip()
    return df
58 changes: 57 additions & 1 deletion pysnippets/Data_preprocessing/data_transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,69 @@
import numpy as np

def log_transform(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Apply a logarithmic transformation (log1p) to a specified column.

    The transformation is log(1 + x), so zeros are valid input and map to 0.
    The transformed column is assigned back onto ``df``, and that same object
    is returned.

    Args:
    - df (pd.DataFrame): The input DataFrame (modified in place).
    - column (str): The column name to transform.

    Returns:
    - pd.DataFrame: The same DataFrame, with ``column`` transformed.

    Raises:
    - ValueError: If the column does not exist in the DataFrame or contains negative values.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    # Only negatives are rejected: log1p(0) == 0 is well defined, and supporting
    # zeros is the reason log1p is used instead of a plain log.
    if (df[column] < 0).any():
        raise ValueError(f"Log transformation cannot be applied to negative values in column '{column}'.")

    df[column] = np.log1p(df[column])
    return df

def power_transform(df: pd.DataFrame, column: str, power: float = 2.0) -> pd.DataFrame:
    """
    Raise every value of a column to a given power.

    The result is written back into ``df[column]`` and ``df`` is returned.

    Args:
    - df (pd.DataFrame): The input DataFrame (its column is overwritten).
    - column (str): The column name to transform.
    - power (float): The exponent applied to each value (default is 2.0).

    Returns:
    - pd.DataFrame: The same DataFrame, with ``column`` transformed.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column in df.columns:
        raised_values = np.power(df[column], power)
        df[column] = raised_values
        return df

    raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

def binarize(df: pd.DataFrame, column: str, threshold: float) -> pd.DataFrame:
    """
    Binarize a specified column based on a threshold.

    Values strictly greater than the threshold become 1; all other values
    (including those equal to the threshold) become 0. The result is assigned
    back onto ``df``, and that same object is returned.

    Args:
    - df (pd.DataFrame): The input DataFrame (modified in place).
    - column (str): The column name to binarize.
    - threshold (float): The threshold for binarization.

    Returns:
    - pd.DataFrame: The same DataFrame, with ``column`` binarized.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    df[column] = (df[column] > threshold).astype(int)
    return df
41 changes: 38 additions & 3 deletions pysnippets/Data_preprocessing/encoding.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,48 @@
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

def one_hot_encode(df: pd.DataFrame, column: str, drop_first: bool = True) -> pd.DataFrame:
    """
    Perform one-hot encoding on a specified column.

    Args:
    - df (pd.DataFrame): The input DataFrame (not modified).
    - column (str): The column name to encode.
    - drop_first (bool): Whether to drop the first category to avoid multicollinearity (default is True).

    Returns:
    - pd.DataFrame: A new DataFrame in which ``column`` is replaced by its one-hot encoded column(s).

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    drop = 'first' if drop_first else None
    try:
        encoder = OneHotEncoder(sparse_output=False, drop=drop)
    except TypeError:
        # Older scikit-learn releases only accept the previous `sparse` keyword.
        encoder = OneHotEncoder(sparse=False, drop=drop)

    encoded = encoder.fit_transform(df[[column]])

    # Reuse df's index so the concat below lines rows up one-to-one; a fresh
    # RangeIndex would misalign (and NaN-pad) any frame that was filtered or re-indexed.
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out([column]),
        index=df.index,
    )

    # Concatenate the original dataframe without the encoded column and the encoded DataFrame
    return pd.concat([df.drop(column, axis=1), encoded_df], axis=1)

def label_encode(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Perform label encoding on a specified column (converting categories to integer labels).

    The encoded column is assigned back onto ``df``, and that same object is returned.

    Args:
    - df (pd.DataFrame): The input DataFrame (modified in place).
    - column (str): The column name to encode.

    Returns:
    - pd.DataFrame: The same DataFrame, with ``column`` label encoded.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    return df
40 changes: 39 additions & 1 deletion pysnippets/Data_preprocessing/feature_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,49 @@
import pandas as pd

def standardize_features(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Scale the given columns to zero mean and unit variance with a StandardScaler.

    The scaled values overwrite the listed columns of ``df``, which is then returned.

    Args:
    - df (pd.DataFrame): The input DataFrame (its columns are overwritten).
    - columns (list): List of column names to standardize.

    Returns:
    - pd.DataFrame: The same DataFrame, with the listed columns standardized.

    Raises:
    - ValueError: If any of the columns do not exist in the DataFrame.
    """
    # Collect every unknown name first so the error reports all of them at once
    absent = []
    for name in columns:
        if name not in df.columns:
            absent.append(name)
    if absent:
        raise ValueError(f"Columns {absent} do not exist in the DataFrame.")

    scaled = StandardScaler().fit_transform(df[columns])
    df[columns] = scaled
    return df

def min_max_scale_features(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Apply Min-Max scaling (scale to a [0, 1] range) for specified columns.

    The scaled values are assigned back onto ``df``, and that same object is returned.

    Args:
    - df (pd.DataFrame): The input DataFrame (modified in place).
    - columns (list): List of column names to scale.

    Returns:
    - pd.DataFrame: The same DataFrame, with the listed columns Min-Max scaled.

    Raises:
    - ValueError: If any of the columns do not exist in the DataFrame.
    """
    # Check if columns exist in the DataFrame
    missing_cols = [col for col in columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Columns {missing_cols} do not exist in the DataFrame.")

    # Apply MinMaxScaler
    scaler = MinMaxScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df
47 changes: 46 additions & 1 deletion pysnippets/Data_preprocessing/missing_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,59 @@
from sklearn.impute import SimpleImputer

def impute_numeric_with_mean(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Fill missing values in the given columns using a mean-strategy SimpleImputer.

    The imputed values overwrite the listed columns of ``df``, which is then returned.

    Args:
    - df (pd.DataFrame): The input DataFrame (its columns are overwritten).
    - columns (list): List of column names to impute.

    Returns:
    - pd.DataFrame: The same DataFrame, with the listed columns imputed.

    Raises:
    - ValueError: If any of the columns do not exist in the DataFrame.
    """
    absent = [name for name in columns if name not in df.columns]
    if absent:
        raise ValueError(f"Columns {absent} do not exist in the DataFrame.")

    mean_imputer = SimpleImputer(strategy='mean')
    filled = mean_imputer.fit_transform(df[columns])
    df[columns] = filled
    return df

def impute_categorical_with_mode(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Fill missing values in the given columns using a most-frequent-strategy SimpleImputer.

    The imputed values overwrite the listed columns of ``df``, which is then returned.

    Args:
    - df (pd.DataFrame): The input DataFrame (its columns are overwritten).
    - columns (list): List of column names to impute.

    Returns:
    - pd.DataFrame: The same DataFrame, with the listed columns imputed.

    Raises:
    - ValueError: If any of the columns do not exist in the DataFrame.
    """
    absent = [name for name in columns if name not in df.columns]
    if absent:
        raise ValueError(f"Columns {absent} do not exist in the DataFrame.")

    mode_imputer = SimpleImputer(strategy='most_frequent')
    filled = mode_imputer.fit_transform(df[columns])
    df[columns] = filled
    return df

def drop_missing(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    """
    Drop rows that do not contain enough non-null values.

    A row is kept when it has at least ``int(number_of_columns * threshold)``
    non-null values; the product is truncated toward zero, not rounded.

    Args:
    - df (pd.DataFrame): The input DataFrame (not modified).
    - threshold (float): Proportion of non-null values required in a row to keep it. Default is 0.5.

    Returns:
    - pd.DataFrame: A new DataFrame with rows dropped based on the missing value threshold.
    """
    # Minimum count of non-null cells a row needs in order to survive
    min_non_null = int(df.shape[1] * threshold)
    return df.dropna(thresh=min_non_null)
39 changes: 38 additions & 1 deletion pysnippets/Data_preprocessing/outlier_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,52 @@
import numpy as np

def remove_outliers_iqr(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Filter out rows whose value in a column falls outside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR; values exactly on a
    fence are kept.

    Args:
    - df (pd.DataFrame): The input DataFrame.
    - column (str): The column name to remove outliers from.

    Returns:
    - pd.DataFrame: The rows of ``df`` whose ``column`` value lies within the fences.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    # between() is inclusive on both ends, matching a >= lower and <= upper test
    within_fences = values.between(lower_fence, upper_fence)
    return df[within_fences]

def remove_outliers_zscore(df: pd.DataFrame, column: str, threshold: float = 3.0) -> pd.DataFrame:
    """
    Remove outliers from a specified column using the Z-Score method.

    Rows are kept when the absolute z-score of their ``column`` value is at most
    ``threshold``. Rows whose value is missing are dropped. When the column has
    no spread (constant values, or too few values for a standard deviation),
    every non-missing value is treated as having a z-score of 0.

    Args:
    - df (pd.DataFrame): The input DataFrame (not modified).
    - column (str): The column name to remove outliers from.
    - threshold (float): The Z-score threshold to identify outliers (default is 3.0).

    Returns:
    - pd.DataFrame: A new DataFrame with outliers removed based on the Z-Score method.

    Raises:
    - ValueError: If the column does not exist in the DataFrame.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    values = df[column]
    mean = values.mean()
    std = values.std()

    if pd.isna(std) or std == 0:
        # Dividing by a zero/NaN std would make every z-score NaN and silently
        # drop all rows. With no spread, each present value sits on the mean,
        # so its z-score is 0; missing values stay NaN and are still dropped.
        z_scores = pd.Series(0.0, index=df.index).where(values.notna())
    else:
        z_scores = (values - mean) / std

    # Remove rows where the absolute Z-Score exceeds the threshold
    return df[np.abs(z_scores) <= threshold]
Loading

0 comments on commit 13718c9

Please sign in to comment.