From 04995543aacc7613548f00d5539d7437a5664987 Mon Sep 17 00:00:00 2001 From: Leonardo Voltolini Date: Tue, 12 Nov 2024 15:48:46 +0100 Subject: [PATCH 1/9] Data visualization functionality Creating a data visualization functionality for BibMon using sklearn.manifold clustering algorithms. --- bibmon/__init__.py | 3 +- bibmon/_sklearn_manifold.py | 90 ++++++++++++++++++++++++++++++++++++ test/test_manifold_models.py | 43 +++++++++++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 bibmon/_sklearn_manifold.py create mode 100644 test/test_manifold_models.py diff --git a/bibmon/__init__.py b/bibmon/__init__.py index 3dd3ee7..750fc31 100644 --- a/bibmon/__init__.py +++ b/bibmon/__init__.py @@ -3,12 +3,13 @@ from ._esn import ESN from ._sbm import SBM from ._sklearn_regressor import sklearnRegressor +from ._sklearn_manifold import sklearnManifold from ._preprocess import PreProcess from ._load_data import load_tennessee_eastman, load_real_data from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows __all__ = ['Autoencoder','PCA','ESN','SBM', - 'sklearnRegressor', 'PreProcess', + 'sklearnRegressor','sklearnManifold' , 'PreProcess', 'load_tennessee_eastman', 'load_real_data', 'train_val_test_split', 'complete_analysis', 'comparative_table', 'spearmanr_dendrogram', 'create_df_with_dates', diff --git a/bibmon/_sklearn_manifold.py b/bibmon/_sklearn_manifold.py new file mode 100644 index 0000000..205efc0 --- /dev/null +++ b/bibmon/_sklearn_manifold.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +""" +Created on Tue 8 08:14:01 2024 + +@author: leovo +""" + + +import matplotlib.pyplot as plt + +from ._generic_model import GenericModel + +############################################################################### + +class sklearnManifold(GenericModel): + """ + Interface for sklearn manifold learning models. + + Parameters + ---------- + manifold_model: any manifold model that uses the sklearn interface. + For example: + * sklearn.manifold.MDS, + * sklearn.manifold.Isomap, + * sklearn.manifold.TSNE, + * sklearn.manifold.LocallyLinearEmbedding, + * etc.... + """ + + ########################################################################### + + def __init__(self, manifold_model): + self.has_Y = False # Manifold models generally don't use a target variable + self.manifold_model = manifold_model + + self.name = self.manifold_model.__class__.__name__ + + ########################################################################### + + def train_core(self): + """ + Fits the manifold model using the training data. + """ + # Manifold models often apply dimensionality reduction directly to the input data + self.transformed_data = self.manifold_model.fit_transform(self.X_train.values) + + ########################################################################### + + def map_from_X(self, X): + """ + Applies the transformation to a new dataset. Note that some manifold + models, like TSNE, may not have a direct `transform` method. + """ + if hasattr(self.manifold_model, 'transform'): + return self.manifold_model.transform(X) + else: + raise NotImplementedError("This manifold model does not support transformation on new data.") + + ########################################################################### + + def set_hyperparameters(self, params_dict): + """ + Sets the hyperparameters for the manifold model. + """ + for key, value in params_dict.items(): + setattr(self.manifold_model, key, value) + + ########################################################################### + + def plot_embedding(self): + """ + Plots the 2D or 3D embedding resulting from the manifold model. + """ + if self.transformed_data.shape[1] == 2: + plt.scatter(self.transformed_data[:, 0], self.transformed_data[:, 1], s=50, cmap='viridis') + plt.title(f"{self.name} 2D Embedding") + plt.xlabel("Component 1") + plt.ylabel("Component 2") + elif self.transformed_data.shape[1] == 3: + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + ax.scatter(self.transformed_data[:, 0], self.transformed_data[:, 1], self.transformed_data[:, 2], s=50, cmap='viridis') + ax.set_title(f"{self.name} 3D Embedding") + ax.set_xlabel("Component 1") + ax.set_ylabel("Component 2") + ax.set_zlabel("Component 3") + else: + print("Embedding dimensionality is not 2D or 3D; custom plotting is required.") + + plt.show() \ No newline at end of file diff --git a/test/test_manifold_models.py b/test/test_manifold_models.py new file mode 100644 index 0000000..bd49d54 --- /dev/null +++ b/test/test_manifold_models.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Nov 8 09:34:01 2024 + +@author: Leonardo Voltolini +""" + +import bibmon +from sklearn.preprocessing import StandardScaler +import numpy as np + +SC=StandardScaler() + +# loading the data from TEP +df_train, df_test = bibmon.load_tennessee_eastman(train_id = 0, + test_id = 1) + + +X_train=SC.fit_transform(df_train) + +X_test=SC.transform(df_test) + +X=np.concatenate( (X_train, X_test),axis=0) + + +for attr in bibmon.__all__: + a = getattr(bibmon,attr) + if isinstance(a, type): + if a.__base__ == bibmon._generic_model.GenericModel: + if a == bibmon.sklearnManifold: + from sklearn.manifold import TSNE + m = a(TSNE(n_components=2)) + else: + m = a() + + # TRAINING + + X_embedded=m.fit_transform(X) + X_embedded.plot_embedding() + + + + \ No newline at end of file From 9717bc579a1b277c8944006ecc2021324c04b59e Mon Sep 17 00:00:00 2001 From: Leonardo Voltolini Date: Tue, 12 Nov 2024 16:51:27 +0100 Subject: [PATCH 2/9] Modifications to the cluster tool for visualization --- bibmon/_sklearn_manifold.py | 8 ++++---- test/test_manifold_models.py | 15 ++++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/bibmon/_sklearn_manifold.py b/bibmon/_sklearn_manifold.py index 205efc0..16ed219 100644 --- a/bibmon/_sklearn_manifold.py +++ b/bibmon/_sklearn_manifold.py @@ -37,14 +37,14 @@ def __init__(self, manifold_model): ########################################################################### - def train_core(self): + def train_core(self,X_train): """ Fits the manifold model using the training data. """ # Manifold models often apply dimensionality reduction directly to the input data - self.transformed_data = self.manifold_model.fit_transform(self.X_train.values) - - ########################################################################### + self.transformed_data=self.manifold_model.fit_transform(self.X_train.values) + + ########################################################################### def map_from_X(self, X): """ diff --git a/test/test_manifold_models.py b/test/test_manifold_models.py index bd49d54..a9fc76b 100644 --- a/test/test_manifold_models.py +++ b/test/test_manifold_models.py @@ -9,6 +9,8 @@ from sklearn.preprocessing import StandardScaler import numpy as np + + SC=StandardScaler() # loading the data from TEP @@ -29,14 +31,13 @@ if a.__base__ == bibmon._generic_model.GenericModel: if a == bibmon.sklearnManifold: from sklearn.manifold import TSNE - m = a(TSNE(n_components=2)) - else: - m = a() - - # TRAINING + model = a(TSNE(n_components=2)) + #else: + #m = a() + + model.train_core() + - X_embedded=m.fit_transform(X) - X_embedded.plot_embedding() From fb2776141d52c39e0844e4e9a3560df9dc0440e2 Mon Sep 17 00:00:00 2001 From: Leonardo Voltolini Date: Tue, 12 Nov 2024 17:35:53 +0100 Subject: [PATCH 3/9] Modifications to manifold methods --- bibmon/_sklearn_manifold.py | 23 +++++++++++++++++++---- test/test_manifold_models.py | 2 +- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/bibmon/_sklearn_manifold.py b/bibmon/_sklearn_manifold.py index 16ed219..842a482 100644 --- a/bibmon/_sklearn_manifold.py +++ b/bibmon/_sklearn_manifold.py @@ -30,14 +30,14 @@ class sklearnManifold(GenericModel): ########################################################################### def __init__(self, manifold_model): - self.has_Y = False # Manifold models generally don't use a target variable + self.has_Y = False # Default set to False, because Manifold algorithms don't require a target variable self.manifold_model = manifold_model self.name = self.manifold_model.__class__.__name__ ########################################################################### - def train_core(self,X_train): + def train_core(self): """ Fits the manifold model using the training data. """ @@ -45,14 +45,29 @@ def train_core(self,X_train): self.transformed_data=self.manifold_model.fit_transform(self.X_train.values) ########################################################################### + + def fit_transform(self,X): + """ + Fits the clustering method and returns the transformed data + + """ + + self.X_train=X #Attributing training data to variable X passed in the m + self.train_core() #Training the method with train_core + + """ + Returning the transformed data for visualization + """ + return self.transformed_data + - def map_from_X(self, X): + def transform(self,X_test): """ Applies the transformation to a new dataset. Note that some manifold models, like TSNE, may not have a direct `transform` method. """ if hasattr(self.manifold_model, 'transform'): - return self.manifold_model.transform(X) + return self.manifold_model.transform(X_test) else: raise NotImplementedError("This manifold model does not support transformation on new data.") diff --git a/test/test_manifold_models.py b/test/test_manifold_models.py index a9fc76b..f00a2e1 100644 --- a/test/test_manifold_models.py +++ b/test/test_manifold_models.py @@ -35,7 +35,7 @@ #else: #m = a() - model.train_core() + model.train_core(X) From 347ae2d9e928239bee57f380c96f6c061927e802 Mon Sep 17 00:00:00 2001 From: Leonardo Voltolini Date: Tue, 12 Nov 2024 17:39:23 +0100 Subject: [PATCH 4/9] Update test_manifold_models.py --- test/test_manifold_models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_manifold_models.py b/test/test_manifold_models.py index f00a2e1..271b3d7 100644 --- a/test/test_manifold_models.py +++ b/test/test_manifold_models.py @@ -35,7 +35,9 @@ #else: #m = a() - model.train_core(X) + embedded_data=model.fit_transform(X) + embedded_data.plot_embedding() + From 5a72c88fed54f609cf9be2c995aaf882cb14dc4f Mon Sep 17 00:00:00 2001 From: Leonardo Voltolini Date: Wed, 13 Nov 2024 09:40:05 +0100 Subject: [PATCH 5/9] Update test_manifold_models.py --- test/test_manifold_models.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_manifold_models.py b/test/test_manifold_models.py index 271b3d7..2af915a 100644 --- a/test/test_manifold_models.py +++ b/test/test_manifold_models.py @@ -32,9 +32,8 @@ if a == bibmon.sklearnManifold: from sklearn.manifold import TSNE model = a(TSNE(n_components=2)) - #else: - #m = a() - + + embedded_data=model.fit_transform(X) embedded_data.plot_embedding() From c0f846098543832d079adce75ac7d7042973059e Mon Sep 17 00:00:00 2001 From: Leonardo Voltolini Date: Wed, 13 Nov 2024 12:40:39 +0100 Subject: [PATCH 6/9] Update _sklearn_manifold.py --- bibmon/_sklearn_manifold.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/bibmon/_sklearn_manifold.py b/bibmon/_sklearn_manifold.py index 842a482..feee8e0 100644 --- a/bibmon/_sklearn_manifold.py +++ b/bibmon/_sklearn_manifold.py @@ -61,7 +61,7 @@ def fit_transform(self,X): return self.transformed_data - def transform(self,X_test): + def map_from_X(self,X_test): """ Applies the transformation to a new dataset. Note that some manifold models, like TSNE, may not have a direct `transform` method. @@ -81,6 +81,22 @@ def set_hyperparameters(self, params_dict): setattr(self.manifold_model, key, value) ########################################################################### + + def transform(self, X_test): + """ + Transforms the input data using the trained manifold model by calling map_from_X. + + Parameters + ---------- + X_test: array-like or DataFrame + The new data to transform. + + Returns + ------- + transformed_data: array-like + The transformed data. + """ + return self.map_from_X(X_test) def plot_embedding(self): """ From ae4740d83c2e402443de608727afb4b519d0de18 Mon Sep 17 00:00:00 2001 From: Leonardo Voltolini Date: Wed, 13 Nov 2024 12:51:12 +0100 Subject: [PATCH 7/9] Update _sklearn_manifold.py --- bibmon/_sklearn_manifold.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bibmon/_sklearn_manifold.py b/bibmon/_sklearn_manifold.py index feee8e0..cf55caf 100644 --- a/bibmon/_sklearn_manifold.py +++ b/bibmon/_sklearn_manifold.py @@ -7,6 +7,7 @@ import matplotlib.pyplot as plt +import pandas as pd from ._generic_model import GenericModel @@ -41,8 +42,13 @@ def train_core(self): """ Fits the manifold model using the training data. """ - # Manifold models often apply dimensionality reduction directly to the input data - self.transformed_data=self.manifold_model.fit_transform(self.X_train.values) + ## Check if the input is a pandas DataFrame + if isinstance(self.X_train, pd.DataFrame): + # If it's a DataFrame, use the `.values` attribute to extract numpy array + self.transformed_data = self.manifold_model.fit_transform(self.X_train.values) + else: + # If it's already a numpy array, use it directly + self.transformed_data = self.manifold_model.fit_transform(self.X_train) ########################################################################### From bb5e68339eeaa5f2c65b565400c0d7e8d278e06a Mon Sep 17 00:00:00 2001 From: Leonardo Voltolini Date: Wed, 13 Nov 2024 13:16:52 +0100 Subject: [PATCH 8/9] changes to improve clusters visualization --- bibmon/_sklearn_manifold.py | 29 +++++++++++++++++++++++++++++ test/test_manifold_models.py | 7 +++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/bibmon/_sklearn_manifold.py b/bibmon/_sklearn_manifold.py index cf55caf..4d69a77 100644 --- a/bibmon/_sklearn_manifold.py +++ b/bibmon/_sklearn_manifold.py @@ -123,5 +123,34 @@ def plot_embedding(self): ax.set_zlabel("Component 3") else: print("Embedding dimensionality is not 2D or 3D; custom plotting is required.") + + def clusters_visualization(self, X): + """ + Fits the manifold model, transforms the data, and plots the resulting 2D or 3D embedding. + + Parameters + ---------- + X: array-like or DataFrame + The data to fit and transform. + """ + # Perform fit_transform and store the transformed data + transformed_data = self.fit_transform(X) + + # Plot the 2D or 3D embedding based on the transformed data + if transformed_data.shape[1] == 2: + plt.scatter(transformed_data[:, 0], transformed_data[:, 1], s=50, cmap='viridis') + plt.title(f"{self.name} 2D Embedding") + plt.xlabel("Component 1") + plt.ylabel("Component 2") + elif transformed_data.shape[1] == 3: + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + ax.scatter(transformed_data[:, 0], transformed_data[:, 1], transformed_data[:, 2], s=50, cmap='viridis') + ax.set_title(f"{self.name} 3D Embedding") + ax.set_xlabel("Component 1") + ax.set_ylabel("Component 2") + ax.set_zlabel("Component 3") + else: + print("Embedding dimensionality is not 2D or 3D; custom plotting is required.") plt.show() \ No newline at end of file diff --git a/test/test_manifold_models.py b/test/test_manifold_models.py index 2af915a..43fcd09 100644 --- a/test/test_manifold_models.py +++ b/test/test_manifold_models.py @@ -33,9 +33,12 @@ from sklearn.manifold import TSNE model = a(TSNE(n_components=2)) - embedded_data=model.fit_transform(X) - embedded_data.plot_embedding() + model.plot_embedding() + + model.clusters_visualization(X) + + From e48456dd00eccb262d30b084faf7f3b2c9fe2e9b Mon Sep 17 00:00:00 2001 From: Leonardo Voltolini Date: Wed, 13 Nov 2024 18:12:22 +0100 Subject: [PATCH 9/9] Update test_manifold_models.py --- test/test_manifold_models.py | 38 ++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/test/test_manifold_models.py b/test/test_manifold_models.py index 43fcd09..e4c2afa 100644 --- a/test/test_manifold_models.py +++ b/test/test_manifold_models.py @@ -17,25 +17,55 @@ df_train, df_test = bibmon.load_tennessee_eastman(train_id = 0, test_id = 1) - +#Transforming training and testing data using StandardScaler X_train=SC.fit_transform(df_train) - X_test=SC.transform(df_test) +#Concatenating train and the test, because manifold models normally +#don't require a separation between training and testing folds X=np.concatenate( (X_train, X_test),axis=0) for attr in bibmon.__all__: a = getattr(bibmon,attr) if isinstance(a, type): + ''' + Verifying if the attribute a is generic model from sklearn manifold + and then applying the adequate model as wanted + ''' if a.__base__ == bibmon._generic_model.GenericModel: - if a == bibmon.sklearnManifold: + if a == bibmon.sklearnManifold: from sklearn.manifold import TSNE - model = a(TSNE(n_components=2)) + model = a(TSNE(n_components=2)) #Creating the model + ''' + Computing the embeeding data from fit_transform function + and subsequently plotting the clustering in the appropriate + dimension + ''' embedded_data=model.fit_transform(X) model.plot_embedding() + +#%% + +''' +This implementation does the same as previous cell, but, it applies +a distinct model and automatically computes fit_transform and clusters +visualization +''' + +for attr in bibmon.__all__: + a = getattr(bibmon,attr) + if isinstance(a, type): + if a.__base__ == bibmon._generic_model.GenericModel: + if a == bibmon.sklearnManifold: + from sklearn.manifold import MDS + model = a(MDS(n_components=3)) + ''' + The below code transforms the data and presents the + graph for cluster visualization + ''' model.clusters_visualization(X)