Merge pull request #186 from jameschapman19/dev

Dev
jameschapman19 · Oct 18, 2023 · d54b16a · d54b16a
2 parents 7d1950e + 54c289c
commit d54b16a
Show file tree

Hide file tree

Showing 92 changed files with 1,128 additions and 896 deletions.
diff --git a/README.md b/README.md
@@ -76,7 +76,7 @@ For citing our work, please use the following BibTeX entry:
 author = {Chapman, James and Wang, Hao-Ting and Wells, Lennie and Wiesner, Johannes},
 doi = {10.5281/zenodo.4382739},
 month = aug,
-title = {{CCA-Zoo}},
+title = {{CCALoss-Zoo}},
 url = {https://github.com/jameschapman19/cca_zoo},
 version = {2.3.0},
 year = {2023}

diff --git a/benchmark/CCA_Speed_Benchmark.svg b/benchmark/CCA_Speed_Benchmark.svg
diff --git a/benchmark/PLS_Speed_Benchmark.svg b/benchmark/PLS_Speed_Benchmark.svg
diff --git a/benchmark/cca_high_dimensions.py b/benchmark/cca_high_dimensions.py
@@ -1,5 +1,5 @@
 """
-Benchmarking CCA on high dimensional data. Using CCA-Zoo and Scikit-learn.
+Benchmarking CCALoss on high dimensional data. Using CCALoss-Zoo and Scikit-learn.
 
 Use different dimensionalities and produce a nice seaborn plot of the runtimes.
 """
@@ -34,14 +34,14 @@
         X = np.random.rand(n_samples, dim)
         Y = np.random.rand(n_samples, dim)
 
-        # CCA-Zoo
+        # CCALoss-Zoo
         start_time = time.time()
         cca_zoo = CCA(latent_dimensions=latent_dimensions)
         cca_zoo.fit((X, Y))
         cca_zoo_time = time.time() - start_time
 
         # Record results
-        results.append({"Dimension": dim, "Time": cca_zoo_time, "Method": "CCA-Zoo"})
+        results.append({"Dimension": dim, "Time": cca_zoo_time, "Method": "CCALoss-Zoo"})
 
         # Scikit-learn
         start_time = time.time()
@@ -60,7 +60,7 @@
 # Seaborn Plot
 plt.figure(figsize=(10, 6))
 sns.lineplot(data=df, x="Dimension", y="Time", hue="Method", marker="o", errorbar="sd")
-plt.title("CCA Performance comparison with Uncertainty")
+plt.title("CCALoss Performance comparison with Uncertainty")
 plt.xlabel("Dimension")
 plt.ylabel("Average Execution Time (seconds)")
 plt.tight_layout()

diff --git a/benchmark/gradient_benchmark.py b/benchmark/gradient_benchmark.py
@@ -1,5 +1,5 @@
 """
-Benchmarking CCA on high dimensional data. Using CCA-Zoo and Scikit-learn.
+Benchmarking CCA on high dimensional data. Comparing CCA-Zoo's CCA against its iterative CCA_EYLoss solver.
 
 Use different dimensionalities and produce a nice seaborn plot of the runtimes.
 """
@@ -8,7 +8,7 @@
 import pandas as pd
 import numpy as np
 from cca_zoo.linear import CCA
-from cca_zoo.linear import CCA_EY
+from cca_zoo.linear import CCA_EYLoss
 import seaborn as sns
 import matplotlib.pyplot as plt
 
@@ -34,34 +34,34 @@
         X = np.random.rand(n_samples, dim)
         Y = np.random.rand(n_samples, dim)
 
-        # CCA-Zoo
+        # CCALoss-Zoo
         start_time = time.time()
         cca_zoo = CCA(latent_dimensions=latent_dimensions)
         cca_zoo.fit((X, Y))
         cca_zoo_time = time.time() - start_time
 
         # Record results
-        results.append({"Dimension": dim, "Time": cca_zoo_time, "Method": "CCA-Zoo"})
+        results.append({"Dimension": dim, "Time": cca_zoo_time, "Method": "CCALoss-Zoo"})
 
         # Iterative EY solver (trained for a fixed number of epochs)
         start_time = time.time()
-        sk_cca = CCA_EY(latent_dimensions=latent_dimensions, epochs=200)
+        sk_cca = CCA_EYLoss(latent_dimensions=latent_dimensions, epochs=200)
         sk_cca.fit((X, Y))
         sklearn_time = time.time() - start_time
 
         score = cca_zoo.score((X, Y))
         sk_score = sk_cca.score((X, Y))
 
         # Record results
-        results.append({"Dimension": dim, "Time": sklearn_time, "Method": "CCA-EY"})
+        results.append({"Dimension": dim, "Time": sklearn_time, "Method": "CCALoss-EY"})
 
 # Convert to DataFrame
 df = pd.DataFrame(results)
 
 # Seaborn Plot
 plt.figure(figsize=(10, 6))
 sns.lineplot(data=df, x="Dimension", y="Time", hue="Method", marker="o", errorbar="sd")
-plt.title("CCA Performance comparison with Uncertainty")
+plt.title("CCALoss Performance comparison with Uncertainty")
 plt.xlabel("Dimension")
 plt.ylabel("Average Execution Time (seconds)")
 plt.tight_layout()

diff --git a/benchmark/pls_high_dimension.py b/benchmark/pls_high_dimension.py
@@ -1,5 +1,5 @@
 """
-Benchmarking CCA on high dimensional data. Using CCA-Zoo and Scikit-learn.
+Benchmarking PLS on high dimensional data. Using CCA-Zoo and Scikit-learn.
 
 Use different dimensionalities and produce a nice seaborn plot of the runtimes.
 """
@@ -34,14 +34,14 @@
         X = np.random.rand(n_samples, dim)
         Y = np.random.rand(n_samples, dim)
 
-        # CCA-Zoo
+        # CCALoss-Zoo
         start_time = time.time()
         cca_zoo = PLS(latent_dimensions=latent_dimensions)
         cca_zoo.fit((X, Y))
         cca_zoo_time = time.time() - start_time
 
         # Record results
-        results.append({"Dimension": dim, "Time": cca_zoo_time, "Method": "CCA-Zoo"})
+        results.append({"Dimension": dim, "Time": cca_zoo_time, "Method": "CCALoss-Zoo"})
 
         # Scikit-learn
         start_time = time.time()

diff --git a/cca_zoo/_base.py b/cca_zoo/_base.py
@@ -17,7 +17,7 @@ class BaseModel(BaseEstimator, MultiOutputMixin, RegressorMixin):
     A base class for multivariate latent variable models.
 
     This class implements common methods and attributes for fitting and transforming
-    multiple views of data using latent variable linear. It inherits from scikit-learn's
+    multiple representations of data using latent variable models. It inherits from scikit-learn's
     BaseEstimator, MultiOutputMixin and RegressorMixin classes.
 
     Parameters
@@ -34,7 +34,7 @@ class BaseModel(BaseEstimator, MultiOutputMixin, RegressorMixin):
     Attributes
     ----------
     n_views_ : int
-        Number of views.
+        Number of representations.
     n_features_ : list of int
         Number of features for each view.
 
@@ -56,14 +56,14 @@ def __init__(
 
     def _validate_data(self, views: Iterable[np.ndarray]):
         if not all(view.shape[0] == views[0].shape[0] for view in views):
-            raise ValueError("All views must have the same number of samples")
+            raise ValueError("All representations must have the same number of samples")
         if not all(view.ndim == 2 for view in views):
-            raise ValueError("All views must have 2 dimensions")
+            raise ValueError("All representations must have 2 dimensions")
         if not all(view.dtype in self.dtypes for view in views):
-            raise ValueError("All views must have dtype of {}.".format(self.dtypes))
+            raise ValueError("All representations must have dtype of {}.".format(self.dtypes))
         if not all(view.shape[1] >= self.latent_dimensions for view in views):
             raise ValueError(
-                "All views must have at least {} features.".format(
+                "All representations must have at least {} features.".format(
                     self.latent_dimensions
                 )
             )
@@ -121,7 +121,7 @@ def transform(self, views: Iterable[np.ndarray], **kwargs) -> List[np.ndarray]:
 
     def fit_transform(self, views: Iterable[np.ndarray], **kwargs) -> List[np.ndarray]:
         """
-        Fits the model to the given data and returns the transformed views
+        Fits the model to the given data and returns the transformed representations
 
         Parameters
         ----------
@@ -139,7 +139,7 @@ def pairwise_correlations(
         self, views: Iterable[np.ndarray], **kwargs
     ) -> np.ndarray:
         """
-        Calculate pairwise correlations between views in each dimension.
+        Calculate pairwise correlations between representations in each dimension.
 
         Parameters
         ----------
@@ -163,7 +163,7 @@ def average_pairwise_correlations(
         self, views: Iterable[np.ndarray], **kwargs
     ) -> np.ndarray:
         """
-        Calculate the average pairwise correlations between views in each dimension.
+        Calculate the average pairwise correlations between representations in each dimension.
 
         Parameters
         ----------
@@ -175,7 +175,7 @@ def average_pairwise_correlations(
         average_pairwise_correlations: numpy array of shape (latent_dimensions, )
         """
         pair_corrs = self.pairwise_correlations(views, **kwargs)
-        # Sum all the pairwise correlations for each dimension, subtract self-correlations, and divide by the number of views
+        # Sum all the pairwise correlations for each dimension, subtract self-correlations, and divide by the number of representations
         dim_corrs = np.sum(pair_corrs, axis=(0, 1)) - pair_corrs.shape[0]
         # Number of pairs is n_views choose 2
         num_pairs = (self.n_views_ * (self.n_views_ - 1)) / 2
@@ -186,7 +186,7 @@ def score(
         self, views: Iterable[np.ndarray], y: Optional[Any] = None, **kwargs
     ) -> float:
         """
-        Calculate the sum of average pairwise correlations between views.
+        Calculate the sum of average pairwise correlations between representations.
 
         Parameters
         ----------
@@ -197,7 +197,7 @@ def score(
         Returns
         -------
         score : float
-            Sum of average pairwise correlations between views.
+            Sum of average pairwise correlations between representations.
         """
         return self.average_pairwise_correlations(views, **kwargs).sum()
 
@@ -212,8 +212,8 @@ def canonical_loadings(
         linear combinations of the original variables formed to maximize the correlation
         with canonical variates from another view.
 
-        Mathematically, given two views \(X_i\), canonical variates
-        from the views are:
+        Mathematically, given two representations \(X_i\), canonical variates
+        from the representations are:
 
             \(Z_i = w_i^T X_i\)
 
@@ -223,7 +223,7 @@ def canonical_loadings(
         Parameters
         ----------
         views : list/tuple of numpy arrays
-            Each array corresponds to a view. All views must have the same number of rows (observations).
+            Each array corresponds to a view. All representations must have the same number of rows (observations).
 
         Returns
         -------
@@ -281,12 +281,12 @@ def explained_variance(self, views: Iterable[np.ndarray]) -> List[np.ndarray]:
         """
         check_is_fitted(self, attributes=["weights"])
 
-        # Transform the views using the loadings
+        # Transform the representations using the loadings
         transformed_views = [
             view @ loading for view, loading in zip(views, self.loadings)
         ]
 
-        # Calculate the variance of each latent dimension in the transformed views
+        # Calculate the variance of each latent dimension in the transformed representations
         transformed_vars = [
             np.var(transformed, axis=0) for transformed in transformed_views
         ]
@@ -334,7 +334,7 @@ def explained_variance_cumulative(
 
     def _compute_covariance(self, views: Iterable[np.ndarray]) -> np.ndarray:
         """
-        Computes the covariance matrix for the given views.
+        Computes the covariance matrix for the given representations.
 
         Parameters
         ----------
@@ -364,7 +364,7 @@ def explained_covariance(self, views: Iterable[np.ndarray]) -> np.ndarray:
         """
         check_is_fitted(self, attributes=["weights"])
 
-        # Transform the views using the loadings
+        # Transform the representations using the loadings
         transformed_views = [
             view @ loading for view, loading in zip(views, self.loadings)
         ]
@@ -412,7 +412,7 @@ def explained_covariance_cumulative(
 
     def predict(self, views: Iterable[np.ndarray]) -> List[np.ndarray]:
         """
-        Predicts the missing view from the given views.
+        Predicts the missing view from the given representations.
 
 
         Parameters
@@ -422,30 +422,30 @@ def predict(self, views: Iterable[np.ndarray]) -> List[np.ndarray]:
         Returns
         -------
         predicted_views : list of numpy arrays. None if the view is missing.
-            Predicted views.
+            Predicted representations.
 
         Examples
         --------
         >>> import numpy as np
         >>> X1 = np.random.rand(100, 5)
         >>> X2 = np.random.rand(100, 5)
-        >>> cca = CCA()
+        >>> cca = CCALoss()
         >>> cca.fit([X1, X2])
         >>> X1_pred, X2_pred = cca.predict([X1, None])
 
         """
         check_is_fitted(self, attributes=["weights"])
-        # check if views is same length as weights
+        # check that the number of representations matches the number of weights
         if len(views) != len(self.weights):
             raise ValueError(
-                "The number of views must be the same as the number of weights. Put None for missing views."
+                "The number of representations must be the same as the number of weights. Put None for missing representations."
             )
         transformed_views = []
         for i, view in enumerate(views):
             if view is not None:
                 transformed_view = view @ self.weights[i]
                 transformed_views.append(transformed_view)
-        # average the transformed views
+        # average the transformed representations
         average_score = np.mean(transformed_views, axis=0)
         # return the average score transformed back to the original space
         reconstucted_views = []

diff --git a/cca_zoo/data/__init__.py b/cca_zoo/data/__init__.py
@@ -1,3 +1,6 @@
-from . import deep, simulated
+from .simulated import JointDataGenerator, LatentVariableDataGenerator
 
-__all__ = ["simulated", "deep"]
+__all__ = [
+    "JointDataGenerator",
+    "LatentVariableDataGenerator",
+]