Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cluster Branch - Implementation of manifold models to reduce dimensionality on data for visualization #67

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
3 changes: 2 additions & 1 deletion bibmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
from ._esn import ESN
from ._sbm import SBM
from ._sklearn_regressor import sklearnRegressor
from ._sklearn_manifold import sklearnManifold
from ._preprocess import PreProcess
from ._load_data import load_tennessee_eastman, load_real_data
from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows

__all__ = ['Autoencoder','PCA','ESN','SBM',
'sklearnRegressor', 'PreProcess',
'sklearnRegressor','sklearnManifold' , 'PreProcess',
'load_tennessee_eastman', 'load_real_data',
'train_val_test_split', 'complete_analysis', 'comparative_table',
'spearmanr_dendrogram', 'create_df_with_dates',
Expand Down
156 changes: 156 additions & 0 deletions bibmon/_sklearn_manifold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-
"""
Created on Tue 8 08:14:01 2024

@author: leovo
"""


import matplotlib.pyplot as plt
import pandas as pd

from ._generic_model import GenericModel

###############################################################################

class sklearnManifold(GenericModel):
    """
    Interface for sklearn manifold learning models.

    Parameters
    ----------
    manifold_model: any manifold model that uses the sklearn interface.
        For example:
            * sklearn.manifold.MDS,
            * sklearn.manifold.Isomap,
            * sklearn.manifold.TSNE,
            * sklearn.manifold.LocallyLinearEmbedding,
            * etc....
    """

    ###########################################################################

    def __init__(self, manifold_model):
        # Manifold algorithms are unsupervised: no target variable is required
        self.has_Y = False
        self.manifold_model = manifold_model

        self.name = self.manifold_model.__class__.__name__

    ###########################################################################

    @staticmethod
    def _as_array(X):
        """
        Returns the underlying numpy array when X is a pandas DataFrame;
        any other input is returned unchanged.
        """
        return X.values if isinstance(X, pd.DataFrame) else X

    ###########################################################################

    def train_core(self):
        """
        Fits the manifold model using the training data.

        Reads `self.X_train` and stores the resulting embedding in
        `self.transformed_data`.
        """
        self.transformed_data = self.manifold_model.fit_transform(
            self._as_array(self.X_train))

    ###########################################################################

    def fit_transform(self, X):
        """
        Fits the manifold model and returns the transformed data.

        Parameters
        ----------
        X: array-like or DataFrame
            The data to fit and transform.

        Returns
        -------
        transformed_data: array-like
            The embedding computed by the manifold model.
        """
        # train_core reads its input from self.X_train
        self.X_train = X
        self.train_core()

        # Returning the transformed data for visualization
        return self.transformed_data

    ###########################################################################

    def map_from_X(self, X_test):
        """
        Applies the transformation to a new dataset. Note that some manifold
        models, like TSNE, may not have a direct `transform` method.

        Parameters
        ----------
        X_test: array-like or DataFrame
            The new data to transform.

        Returns
        -------
        transformed_data: array-like
            The transformed data.

        Raises
        ------
        NotImplementedError
            If the wrapped model has no `transform` method.
        """
        if not hasattr(self.manifold_model, 'transform'):
            raise NotImplementedError("This manifold model does not support transformation on new data.")

        # Unwrap DataFrames exactly as train_core does, so that the model
        # receives the same kind of input at fit time and at transform time
        return self.manifold_model.transform(self._as_array(X_test))

    ###########################################################################

    def set_hyperparameters(self, params_dict):
        """
        Sets the hyperparameters for the manifold model.

        Parameters
        ----------
        params_dict: dict
            Maps attribute names of the wrapped model to their new values.
        """
        for key, value in params_dict.items():
            setattr(self.manifold_model, key, value)

    ###########################################################################

    def transform(self, X_test):
        """
        Transforms the input data using the trained manifold model by
        calling map_from_X.

        Parameters
        ----------
        X_test: array-like or DataFrame
            The new data to transform.

        Returns
        -------
        transformed_data: array-like
            The transformed data.
        """
        return self.map_from_X(X_test)

    ###########################################################################

    def plot_embedding(self):
        """
        Plots the 2D or 3D embedding resulting from the manifold model.

        The figure is drawn but not shown; for any other dimensionality
        a message is printed instead of plotting.

        Raises
        ------
        AttributeError
            If no embedding has been computed yet.
        """
        embedding = getattr(self, 'transformed_data', None)
        if embedding is None:
            raise AttributeError(
                "No embedding available; call fit_transform before plotting.")

        n_dims = embedding.shape[1]

        # No `cmap` is passed to scatter: without the `c` argument there is
        # nothing to colormap, and matplotlib would ignore it with a warning
        if n_dims == 2:
            plt.scatter(embedding[:, 0], embedding[:, 1], s=50)
            plt.title(f"{self.name} 2D Embedding")
            plt.xlabel("Component 1")
            plt.ylabel("Component 2")
        elif n_dims == 3:
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(embedding[:, 0], embedding[:, 1], embedding[:, 2], s=50)
            ax.set_title(f"{self.name} 3D Embedding")
            ax.set_xlabel("Component 1")
            ax.set_ylabel("Component 2")
            ax.set_zlabel("Component 3")
        else:
            print("Embedding dimensionality is not 2D or 3D; custom plotting is required.")

    ###########################################################################

    def clusters_visualization(self, X):
        """
        Fits the manifold model, transforms the data, and plots the
        resulting 2D or 3D embedding.

        Parameters
        ----------
        X: array-like or DataFrame
            The data to fit and transform.
        """
        # fit_transform stores the embedding in self.transformed_data,
        # which is what plot_embedding draws
        self.fit_transform(X)
        self.plot_embedding()

        plt.show()
78 changes: 78 additions & 0 deletions test/test_manifold_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 8 09:34:01 2024

@author: Leonardo Voltolini
"""

import bibmon
from sklearn.preprocessing import StandardScaler
import numpy as np



SC = StandardScaler()

# Loading the data from TEP
df_train, df_test = bibmon.load_tennessee_eastman(train_id=0,
                                                  test_id=1)

# Transforming training and testing data using StandardScaler
X_train = SC.fit_transform(df_train)
X_test = SC.transform(df_test)

# Concatenating train and test, because manifold models normally
# don't require a separation between training and testing folds
X = np.concatenate((X_train, X_test), axis=0)

# The model must be publicly exported and derive from GenericModel.
# Checking this explicitly makes the script fail loudly, instead of
# silently doing nothing, if the class is missing from bibmon.__all__
assert 'sklearnManifold' in bibmon.__all__
assert issubclass(bibmon.sklearnManifold,
                  bibmon._generic_model.GenericModel)

from sklearn.manifold import TSNE

# Creating the model
model = bibmon.sklearnManifold(TSNE(n_components=2))

# Computing the embedded data with fit_transform and subsequently
# plotting it in the appropriate dimension
embedded_data = model.fit_transform(X)

# One 2D point is expected per input sample
assert embedded_data.shape == (X.shape[0], 2)

model.plot_embedding()

#%%

# This cell does the same as the previous one, but it applies a distinct
# model (MDS with 3 components) and relies on clusters_visualization to
# run fit_transform and plot the result in a single call.

# Fail loudly, instead of silently doing nothing, if the class is not
# exported or does not derive from GenericModel
assert 'sklearnManifold' in bibmon.__all__
assert issubclass(bibmon.sklearnManifold,
                  bibmon._generic_model.GenericModel)

from sklearn.manifold import MDS

model = bibmon.sklearnManifold(MDS(n_components=3))

# Transforms the data and presents the graph for cluster visualization
model.clusters_visualization(X)

# One 3D point is expected per input sample
assert model.transformed_data.shape == (X.shape[0], 3)