Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
jameschapman19 committed Oct 18, 2023
2 parents 80696e6 + 8cb6bec commit 577187d
Show file tree
Hide file tree
Showing 92 changed files with 1,168 additions and 903 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ For citing our work, please use the following BibTeX entry:
author = {Chapman, James and Wang, Hao-Ting and Wells, Lennie and Wiesner, Johannes},
doi = {10.5281/zenodo.4382739},
month = aug,
title = {{CCA-Zoo}},
title = {{CCALoss-Zoo}},
url = {https://github.com/jameschapman19/cca_zoo},
version = {2.3.0},
year = {2023}
Expand Down
4 changes: 2 additions & 2 deletions benchmark/CCA_Speed_Benchmark.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion benchmark/PLS_Speed_Benchmark.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
10 changes: 6 additions & 4 deletions benchmark/cca_high_dimensions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Benchmarking CCA on high dimensional data. Using CCA-Zoo and Scikit-learn.
Benchmarking CCALoss on high dimensional data. Using CCALoss-Zoo and Scikit-learn.
Use different dimensionalities and produce a nice seaborn plot of the runtimes.
"""
Expand Down Expand Up @@ -34,14 +34,16 @@
X = np.random.rand(n_samples, dim)
Y = np.random.rand(n_samples, dim)

# CCA-Zoo
# CCALoss-Zoo
start_time = time.time()
cca_zoo = CCA(latent_dimensions=latent_dimensions)
cca_zoo.fit((X, Y))
cca_zoo_time = time.time() - start_time

# Record results
results.append({"Dimension": dim, "Time": cca_zoo_time, "Method": "CCA-Zoo"})
results.append(
{"Dimension": dim, "Time": cca_zoo_time, "Method": "CCALoss-Zoo"}
)

# Scikit-learn
start_time = time.time()
Expand All @@ -60,7 +62,7 @@
# Seaborn Plot
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x="Dimension", y="Time", hue="Method", marker="o", errorbar="sd")
plt.title("CCA Performance comparison with Uncertainty")
plt.title("CCALoss Performance comparison with Uncertainty")
plt.xlabel("Dimension")
plt.ylabel("Average Execution Time (seconds)")
plt.tight_layout()
Expand Down
16 changes: 9 additions & 7 deletions benchmark/gradient_benchmark.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Benchmarking CCA on high dimensional data. Using CCA-Zoo and Scikit-learn.
Benchmarking CCALoss on high dimensional data. Using CCALoss-Zoo and Scikit-learn.
Use different dimensionalities and produce a nice seaborn plot of the runtimes.
"""
Expand All @@ -8,7 +8,7 @@
import pandas as pd
import numpy as np
from cca_zoo.linear import CCA
from cca_zoo.linear import CCA_EY
from cca_zoo.linear import CCA_EYLoss
import seaborn as sns
import matplotlib.pyplot as plt

Expand All @@ -34,34 +34,36 @@
X = np.random.rand(n_samples, dim)
Y = np.random.rand(n_samples, dim)

# CCA-Zoo
# CCALoss-Zoo
start_time = time.time()
cca_zoo = CCA(latent_dimensions=latent_dimensions)
cca_zoo.fit((X, Y))
cca_zoo_time = time.time() - start_time

# Record results
results.append({"Dimension": dim, "Time": cca_zoo_time, "Method": "CCA-Zoo"})
results.append(
{"Dimension": dim, "Time": cca_zoo_time, "Method": "CCALoss-Zoo"}
)

# CCA-Zoo EY variant (trained for 200 epochs; not scikit-learn)
start_time = time.time()
sk_cca = CCA_EY(latent_dimensions=latent_dimensions, epochs=200)
sk_cca = CCA_EYLoss(latent_dimensions=latent_dimensions, epochs=200)
sk_cca.fit((X, Y))
sklearn_time = time.time() - start_time

score = cca_zoo.score((X, Y))
sk_score = sk_cca.score((X, Y))

# Record results
results.append({"Dimension": dim, "Time": sklearn_time, "Method": "CCA-EY"})
results.append({"Dimension": dim, "Time": sklearn_time, "Method": "CCALoss-EY"})

# Convert to DataFrame
df = pd.DataFrame(results)

# Seaborn Plot
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x="Dimension", y="Time", hue="Method", marker="o", errorbar="sd")
plt.title("CCA Performance comparison with Uncertainty")
plt.title("CCALoss Performance comparison with Uncertainty")
plt.xlabel("Dimension")
plt.ylabel("Average Execution Time (seconds)")
plt.tight_layout()
Expand Down
8 changes: 5 additions & 3 deletions benchmark/pls_high_dimension.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Benchmarking CCA on high dimensional data. Using CCA-Zoo and Scikit-learn.
Benchmarking CCALoss on high dimensional data. Using CCALoss-Zoo and Scikit-learn.
Use different dimensionalities and produce a nice seaborn plot of the runtimes.
"""
Expand Down Expand Up @@ -34,14 +34,16 @@
X = np.random.rand(n_samples, dim)
Y = np.random.rand(n_samples, dim)

# CCA-Zoo
# CCALoss-Zoo
start_time = time.time()
cca_zoo = PLS(latent_dimensions=latent_dimensions)
cca_zoo.fit((X, Y))
cca_zoo_time = time.time() - start_time

# Record results
results.append({"Dimension": dim, "Time": cca_zoo_time, "Method": "CCA-Zoo"})
results.append(
{"Dimension": dim, "Time": cca_zoo_time, "Method": "CCALoss-Zoo"}
)

# Scikit-learn
start_time = time.time()
Expand Down
52 changes: 27 additions & 25 deletions cca_zoo/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class BaseModel(BaseEstimator, MultiOutputMixin, RegressorMixin):
A base class for multivariate latent variable linear models.
This class implements common methods and attributes for fitting and transforming
multiple views of data using latent variable linear. It inherits from scikit-learn's
multiple representations of data using latent variable linear. It inherits from scikit-learn's
BaseEstimator, MultiOutputMixin and RegressorMixin classes.
Parameters
Expand All @@ -34,7 +34,7 @@ class BaseModel(BaseEstimator, MultiOutputMixin, RegressorMixin):
Attributes
----------
n_views_ : int
Number of views.
Number of representations.
n_features_ : list of int
Number of features for each view.
Expand All @@ -56,14 +56,16 @@ def __init__(

def _validate_data(self, views: Iterable[np.ndarray]):
if not all(view.shape[0] == views[0].shape[0] for view in views):
raise ValueError("All views must have the same number of samples")
raise ValueError("All representations must have the same number of samples")
if not all(view.ndim == 2 for view in views):
raise ValueError("All views must have 2 dimensions")
raise ValueError("All representations must have 2 dimensions")
if not all(view.dtype in self.dtypes for view in views):
raise ValueError("All views must have dtype of {}.".format(self.dtypes))
raise ValueError(
"All representations must have dtype of {}.".format(self.dtypes)
)
if not all(view.shape[1] >= self.latent_dimensions for view in views):
raise ValueError(
"All views must have at least {} features.".format(
"All representations must have at least {} features.".format(
self.latent_dimensions
)
)
Expand Down Expand Up @@ -121,7 +123,7 @@ def transform(self, views: Iterable[np.ndarray], **kwargs) -> List[np.ndarray]:

def fit_transform(self, views: Iterable[np.ndarray], **kwargs) -> List[np.ndarray]:
"""
Fits the model to the given data and returns the transformed views
Fits the model to the given data and returns the transformed representations
Parameters
----------
Expand All @@ -139,7 +141,7 @@ def pairwise_correlations(
self, views: Iterable[np.ndarray], **kwargs
) -> np.ndarray:
"""
Calculate pairwise correlations between views in each dimension.
Calculate pairwise correlations between representations in each dimension.
Parameters
----------
Expand All @@ -163,7 +165,7 @@ def average_pairwise_correlations(
self, views: Iterable[np.ndarray], **kwargs
) -> np.ndarray:
"""
Calculate the average pairwise correlations between views in each dimension.
Calculate the average pairwise correlations between representations in each dimension.
Parameters
----------
Expand All @@ -175,7 +177,7 @@ def average_pairwise_correlations(
average_pairwise_correlations: numpy array of shape (latent_dimensions, )
"""
pair_corrs = self.pairwise_correlations(views, **kwargs)
# Sum all the pairwise correlations for each dimension, subtract self-correlations, and divide by the number of views
# Sum all the pairwise correlations for each dimension, subtract self-correlations, and divide by the number of representations
dim_corrs = np.sum(pair_corrs, axis=(0, 1)) - pair_corrs.shape[0]
# Number of pairs is n_views choose 2
num_pairs = (self.n_views_ * (self.n_views_ - 1)) / 2
Expand All @@ -186,7 +188,7 @@ def score(
self, views: Iterable[np.ndarray], y: Optional[Any] = None, **kwargs
) -> float:
"""
Calculate the sum of average pairwise correlations between views.
Calculate the sum of average pairwise correlations between representations.
Parameters
----------
Expand All @@ -197,7 +199,7 @@ def score(
Returns
-------
score : float
Sum of average pairwise correlations between views.
Sum of average pairwise correlations between representations.
"""
return self.average_pairwise_correlations(views, **kwargs).sum()

Expand All @@ -212,8 +214,8 @@ def canonical_loadings(
linear combinations of the original variables formed to maximize the correlation
with canonical variates from another view.
Mathematically, given two views \(X_i\), canonical variates
from the views are:
Mathematically, given two representations \(X_i\), canonical variates
from the representations are:
\(Z_i = w_i^T X_i\)
Expand All @@ -223,7 +225,7 @@ def canonical_loadings(
Parameters
----------
views : list/tuple of numpy arrays
Each array corresponds to a view. All views must have the same number of rows (observations).
Each array corresponds to a view. All representations must have the same number of rows (observations).
Returns
-------
Expand Down Expand Up @@ -281,12 +283,12 @@ def explained_variance(self, views: Iterable[np.ndarray]) -> List[np.ndarray]:
"""
check_is_fitted(self, attributes=["weights"])

# Transform the views using the loadings
# Transform the representations using the loadings
transformed_views = [
view @ loading for view, loading in zip(views, self.loadings)
]

# Calculate the variance of each latent dimension in the transformed views
# Calculate the variance of each latent dimension in the transformed representations
transformed_vars = [
np.var(transformed, axis=0) for transformed in transformed_views
]
Expand Down Expand Up @@ -334,7 +336,7 @@ def explained_variance_cumulative(

def _compute_covariance(self, views: Iterable[np.ndarray]) -> np.ndarray:
"""
Computes the covariance matrix for the given views.
Computes the covariance matrix for the given representations.
Parameters
----------
Expand Down Expand Up @@ -364,7 +366,7 @@ def explained_covariance(self, views: Iterable[np.ndarray]) -> np.ndarray:
"""
check_is_fitted(self, attributes=["weights"])

# Transform the views using the loadings
# Transform the representations using the loadings
transformed_views = [
view @ loading for view, loading in zip(views, self.loadings)
]
Expand Down Expand Up @@ -412,7 +414,7 @@ def explained_covariance_cumulative(

def predict(self, views: Iterable[np.ndarray]) -> List[np.ndarray]:
"""
Predicts the missing view from the given views.
Predicts the missing view from the given representations.
Parameters
Expand All @@ -422,30 +424,30 @@ def predict(self, views: Iterable[np.ndarray]) -> List[np.ndarray]:
Returns
-------
predicted_views : list of numpy arrays. None if the view is missing.
Predicted views.
Predicted representations.
Examples
--------
>>> import numpy as np
>>> X1 = np.random.rand(100, 5)
>>> X2 = np.random.rand(100, 5)
>>> cca = CCA()
>>> cca = CCALoss()
>>> cca.fit([X1, X2])
>>> X1_pred, X2_pred = cca.predict([X1, None])
"""
check_is_fitted(self, attributes=["weights"])
# check if views is same length as weights
# check if representations is same length as weights
if len(views) != len(self.weights):
raise ValueError(
"The number of views must be the same as the number of weights. Put None for missing views."
"The number of representations must be the same as the number of weights. Put None for missing representations."
)
transformed_views = []
for i, view in enumerate(views):
if view is not None:
transformed_view = view @ self.weights[i]
transformed_views.append(transformed_view)
# average the transformed views
# average the transformed representations
average_score = np.mean(transformed_views, axis=0)
# return the average score transformed back to the original space
reconstucted_views = []
Expand Down
7 changes: 5 additions & 2 deletions cca_zoo/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from . import deep, simulated
from .simulated import JointDataGenerator, LatentVariableDataGenerator

__all__ = ["simulated", "deep"]
__all__ = [
"JointDataGenerator",
"LatentVariableDataGenerator",
]
Loading

0 comments on commit 577187d

Please sign in to comment.