Validation set for Gradient Based Models woohoo
jameschapman19 committed Oct 19, 2023
1 parent 9d65f6f commit ec9d0a2
Showing 20 changed files with 255 additions and 140 deletions.
24 changes: 15 additions & 9 deletions benchmark/gradient_benchmark.py
@@ -1,5 +1,5 @@
"""
-Benchmarking CCALoss on high dimensional data. Using CCALoss-Zoo and Scikit-learn.
+Benchmarking CCA on high dimensional data. Using CCA-Zoo and CCA-EY
Use different dimensionalities and produce a nice seaborn plot of the runtimes.
"""
@@ -8,10 +8,11 @@
import pandas as pd
import numpy as np
from cca_zoo.linear import CCA
-from cca_zoo.linear import CCA_EYLoss
+from cca_zoo.linear import CCA_EY
import seaborn as sns
import matplotlib.pyplot as plt


# Initialize empty list to hold the benchmarking results
results = []

@@ -33,6 +34,8 @@
    # Generate synthetic data
    X = np.random.rand(n_samples, dim)
    Y = np.random.rand(n_samples, dim)
+    X -= X.mean(axis=0)
+    Y -= Y.mean(axis=0)

    # CCALoss-Zoo
    start_time = time.time()
@@ -41,21 +44,24 @@
    cca_zoo_time = time.time() - start_time

    # Record results
-    results.append(
-        {"Dimension": dim, "Time": cca_zoo_time, "Method": "CCALoss-Zoo"}
-    )
+    results.append({"Dimension": dim, "Time": cca_zoo_time, "Method": "CCA-Zoo"})

    # Scikit-learn
    start_time = time.time()
-    sk_cca = CCA_EYLoss(latent_dimensions=latent_dimensions, epochs=200)
-    sk_cca.fit((X, Y))
+    cca_ey = CCA_EY(
+        latent_dimensions=latent_dimensions,
+        epochs=100,
+        learning_rate=1e-1,
+        early_stopping=True,
+    )
+    cca_ey.fit((X, Y))
    sklearn_time = time.time() - start_time

    score = cca_zoo.score((X, Y))
-    sk_score = sk_cca.score((X, Y))
+    score_ey = cca_ey.score((X, Y))

    # Record results
-    results.append({"Dimension": dim, "Time": sklearn_time, "Method": "CCALoss-EY"})
+    results.append({"Dimension": dim, "Time": sklearn_time, "Method": "CCA-EY"})

# Convert to DataFrame
df = pd.DataFrame(results)
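The plotting tail of this script is collapsed in the diff. A minimal sketch of what it plausibly does with the `df` built above (column names come from the dicts appended in the loop; the styling and filename are assumptions, and the exact figure code in the commit may differ):

```python
# Hypothetical plotting tail for gradient_benchmark.py; `df` is the
# DataFrame built above. Styling and filename are assumptions.
import matplotlib.pyplot as plt
import seaborn as sns

sns.lineplot(data=df, x="Dimension", y="Time", hue="Method", marker="o")
plt.ylabel("Runtime (s)")
plt.title("CCA runtime vs. dimensionality")
plt.tight_layout()
plt.savefig("gradient_benchmark.png")
```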
4 changes: 2 additions & 2 deletions cca_zoo/__init__.py
@@ -1,5 +1,5 @@
from . import (
-    data,
+    datasets,
    deep,
    linear,
    model_selection,
@@ -9,7 +9,7 @@
)

__all__ = [
-    "data",
+    "datasets",
    "deep",
    "linear",
    "model_selection",
6 changes: 0 additions & 6 deletions cca_zoo/data/__init__.py

This file was deleted.

16 changes: 16 additions & 0 deletions cca_zoo/datasets/__init__.py
@@ -0,0 +1,16 @@
+from .simulated import JointData, LatentVariableData
+from .toy import (
+    load_breast_data,
+    load_split_cifar10_data,
+    load_mfeat_data,
+    load_split_mnist_data,
+)
+
+__all__ = [
+    "JointData",
+    "LatentVariableData",
+    "load_breast_data",
+    "load_split_cifar10_data",
+    "load_mfeat_data",
+    "load_split_mnist_data",
+]
6 changes: 3 additions & 3 deletions cca_zoo/data/simulated.py → cca_zoo/datasets/simulated.py
@@ -10,7 +10,7 @@
from cca_zoo.utils import _process_parameter


-class BaseDataGenerator(ABC):
+class BaseData(ABC):
    def __init__(
        self,
        view_features: List[int],
@@ -26,7 +26,7 @@ def sample(self, n_samples: int):
        pass


-class LatentVariableDataGenerator(BaseDataGenerator):
+class LatentVariableData(BaseData):
    def __init__(
        self,
        view_features: List[int],
@@ -106,7 +106,7 @@ def joint_cov(self):
        return cov


-class JointDataGenerator(BaseDataGenerator):
+class JointData(BaseData):
    """
    Class for generating simulated data for a linear model with multiple representations.
    """
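For orientation, a hedged sketch of the renamed simulated-data API. Only `view_features` in `__init__` and a `sample(n_samples)` method are visible in this diff; the `latent_dims` and `random_state` arguments, and the one-array-per-view return convention, are assumptions:

```python
# Sketch of the renamed generator API (JointData, formerly JointDataGenerator).
# latent_dims, random_state, and the return convention of sample() are assumed.
from cca_zoo.datasets import JointData

data = JointData(view_features=[10, 12], latent_dims=2, random_state=0)
X, Y = data.sample(n_samples=500)  # one array per view
print(X.shape, Y.shape)            # expected: (500, 10) (500, 12)
```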
128 changes: 128 additions & 0 deletions cca_zoo/datasets/toy.py
@@ -0,0 +1,128 @@
+import os
+
+import numpy as np
+from sklearn.utils import Bunch
+
+from cca_zoo.utils.check_values import check_rdata_support
+
+DATA_MODULE = "cca_zoo.datasets.data"
+
+
+def load_breast_data():
+    # Describe File
+    fdescr = ""
+    check_rdata_support("load_breast_data")
+    import rdata
+
+    url = "https://tibshirani.su.domains/PMA/breastdata.rda"
+    data_file_name = "breastdata.rda"
+
+    # Download the data file
+    tmpdir = os.path.join(os.getcwd(), "tmpdir")
+    os.makedirs(tmpdir, exist_ok=True)
+    filepath = os.path.join(tmpdir, data_file_name)
+
+    if not os.path.exists(filepath):
+        import urllib.request
+
+        urllib.request.urlretrieve(url, filepath)
+
+    parsed = rdata.parser.parse_file(filepath)
+    converted = rdata.conversion.convert(parsed)["breastdata"]
+    return Bunch(
+        views=[converted["dna"], converted["rna"]],
+        view_names=["dna", "rna"],
+        chrom=converted["chrom"],
+        nuc=converted["nuc"],
+        gene=converted["gene"],
+        genenames=converted["genenames"],
+        genechr=converted["genechr"],
+        genedesc=converted["genedesc"],
+        genepos=converted["genepos"],
+        DESCR=fdescr,
+        filename=data_file_name,
+        data_module=DATA_MODULE,
+    )
+
+
+def load_split_cifar10_data(data_home=None, cache=True):
+    from sklearn.datasets import fetch_openml
+
+    # Download CIFAR-10
+    cifar_data = fetch_openml(name="CIFAR_10", data_home=data_home, cache=cache)
+
+    # Split into left and right halves
+    X = cifar_data.data.values
+
+    # X is a 60000 x 3072 matrix. First 1024 columns are red, next 1024 are green,
+    # last 1024 are blue. The image is stored in row-major order, so the first 32
+    # entries of the array are the red channel values of the first row of the
+    # image. We reshape to 60000 x 32 x 32 x 3 to get the RGB images.
+    X_R = X[:, :1024].reshape((60000, 32, 32))
+    X_G = X[:, 1024:2048].reshape((60000, 32, 32))
+    X_B = X[:, 2048:].reshape((60000, 32, 32))
+    X = np.stack((X_R, X_G, X_B), axis=3)
+    X1 = X[:, :, :16, :]
+    X2 = X[:, :, 16:, :]
+    X1_R = X1[:, :, :, 0].reshape((60000, -1))
+    X1_G = X1[:, :, :, 1].reshape((60000, -1))
+    X1_B = X1[:, :, :, 2].reshape((60000, -1))
+    X2_R = X2[:, :, :, 0].reshape((60000, -1))
+    X2_G = X2[:, :, :, 1].reshape((60000, -1))
+    X2_B = X2[:, :, :, 2].reshape((60000, -1))
+    X1 = np.hstack((X1_R, X1_G, X1_B))
+    X2 = np.hstack((X2_R, X2_G, X2_B))
+    cifar_data.views = [X1, X2]
+    return cifar_data
+
+
+def load_mfeat_data(features=None):
+    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mfeat/mfeat.tar"
+    data_file_name = "mfeat.tar"
+    # Download the data file
+    tmpdir = os.path.join(os.getcwd(), "tmpdir")
+    os.makedirs(tmpdir, exist_ok=True)
+    filepath = os.path.join(tmpdir, data_file_name)
+    from torchvision.datasets.utils import download_and_extract_archive
+
+    if not os.path.exists(filepath):
+        download_and_extract_archive(url, download_root=tmpdir, filename=data_file_name)
+    if features is None:
+        features = ["fac", "fou", "kar", "mor", "pix", "zer"]
+    views = [
+        np.genfromtxt(os.path.join(tmpdir, f"mfeat/mfeat-{feature}"))
+        for feature in features
+    ]
+    # first 200 patterns are of class `0', followed by sets of 200 patterns
+    # for each of the classes `1' - `9'.
+    targets = np.array(
+        [0] * 200
+        + [1] * 200
+        + [2] * 200
+        + [3] * 200
+        + [4] * 200
+        + [5] * 200
+        + [6] * 200
+        + [7] * 200
+        + [8] * 200
+        + [9] * 200
+    )
+    return Bunch(
+        views=views,
+        target=targets,
+        DESCR="MFeat Dataset",
+        data_module=DATA_MODULE,
+    )


+def load_split_mnist_data():
+    from sklearn.datasets import fetch_openml
+
+    # Download MNIST
+    mnist_data = fetch_openml(name="mnist_784")
+
+    # Split each 28x28 digit into top and bottom halves (the first 392 row-major pixels are the top 14 rows)
+    X = mnist_data.data.values
+    X1 = X[:, :392]
+    X2 = X[:, 392:]
+    mnist_data.views = [X1, X2]
+    return mnist_data
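A usage sketch for the new loaders, inferred from the `Bunch` fields populated above; each loader downloads its data on first call, so network access is required:

```python
# Usage sketch based on the fields set in toy.py above; views holds the
# top and bottom halves of each 28x28 digit (392 pixels apiece).
from cca_zoo.datasets import load_split_mnist_data

mnist = load_split_mnist_data()
X1, X2 = mnist.views
print(X1.shape, X2.shape)  # expected: (70000, 392) (70000, 392)
```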
3 changes: 1 addition & 2 deletions cca_zoo/deep/__init__.py
@@ -1,4 +1,4 @@
-from . import architectures, callbacks, objectives
+from . import architectures, objectives
from ._discriminative import (
    DCCA,
    DCCA_EY,
@@ -24,7 +24,6 @@
    "SplitAE",
    "architectures",
    "objectives",
-    "callbacks",
]

classes = [
64 changes: 0 additions & 64 deletions cca_zoo/deep/callbacks.py

This file was deleted.

14 changes: 6 additions & 8 deletions cca_zoo/deep/objectives.py
@@ -251,12 +251,14 @@ def get_AB(self, representations):
class CCA_GHALoss(CCA_EYLoss):
    def loss(self, representations, independent_representations=None):
        A, B = self.get_AB(representations)
-        rewards = torch.trace(2 * A)
+        rewards = torch.trace(A)
        if independent_representations is None:
-            penalties = torch.trace(A.detach() @ B)
+            rewards += torch.trace(A)
+            penalties = torch.trace(A @ B)
        else:
            independent_A, independent_B = self.get_AB(independent_representations)
-            penalties = torch.trace(independent_A.detach() @ B)
+            rewards += torch.trace(independent_A)
+            penalties = torch.trace(independent_A @ B)
        return {
            "objective": -rewards + penalties,
            "rewards": rewards,
@@ -275,11 +277,7 @@ def loss(self, representations, independent_representations=None):
        if independent_representations is None:
            Cyy = C[latent_dims:, latent_dims:]
        else:
-            Cyy = cross_cov(
-                independent_representations[1],
-                independent_representations[1],
-                rowvar=False,
-            )
+            Cyy = torch.cov(independent_representations[1].T)

        rewards = torch.trace(2 * Cxy)
        penalties = torch.trace(Cxx @ Cyy)
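The second hunk swaps the removed `cross_cov` helper for `torch.cov`, which treats each row as a variable; the `(batch, latent_dims)` representations therefore need transposing. A quick self-contained check of the equivalence:

```python
# torch.cov treats rows as variables, hence the transpose of the
# (batch, latent_dims) representation matrix in the hunk above.
import torch

Z = torch.randn(100, 5)                    # batch of 100, 5 latent dims
Cyy = torch.cov(Z.T)                       # (5, 5) covariance across the batch
Zc = Z - Z.mean(dim=0)
Cyy_manual = Zc.T @ Zc / (Z.shape[0] - 1)  # Bessel-corrected, like torch.cov
assert torch.allclose(Cyy, Cyy_manual, atol=1e-5)
```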
5 changes: 3 additions & 2 deletions cca_zoo/linear/_gradient/_base.py
@@ -3,7 +3,7 @@
import numpy as np
import pytorch_lightning as pl
import torch
-from torch.utils import data
+from pytorch_lightning.callbacks import EarlyStopping
from torch.utils.data import DataLoader

from cca_zoo._base import BaseModel
@@ -38,6 +38,7 @@ def __init__(
        initialization: Union[str, callable] = "random",
        trainer_kwargs=None,
        optimizer_kwargs=None,
+        early_stopping=False,
    ):
        BaseModel.__init__(
            self,
@@ -61,6 +62,7 @@
        self.dataloader_kwargs = dataloader_kwargs or DEFAULT_LOADER_KWARGS
        self.trainer_kwargs = trainer_kwargs or DEFAULT_TRAINER_KWARGS
        self.optimizer_kwargs = optimizer_kwargs or DEFAULT_OPTIMIZER_KWARGS
+        self.early_stopping = early_stopping

    def fit(self, views: Iterable[np.ndarray], y=None, validation_views=None, **kwargs):
        views = self._validate_data(views)
@@ -78,7 +80,6 @@ def _fit(self, views: Iterable[np.ndarray], validation_views=None):
            for weight in self.weights
        ]
        self.torch_weights = torch.nn.ParameterList(self.torch_weights)
-        # make a trainer
        trainer = pl.Trainer(
            max_epochs=self.epochs,
            **self.trainer_kwargs,
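Putting the commit together, a hedged end-to-end sketch of the new validation path, using only the signatures visible in this diff (`early_stopping` in `__init__`, `validation_views` in `fit`); the data, dimensions, and hyperparameters are illustrative filler:

```python
# End-to-end sketch of the validation/early-stopping path added here.
# early_stopping and validation_views appear in this diff; the data and
# hyperparameters are assumptions for illustration only.
import numpy as np
from cca_zoo.linear import CCA_EY

rng = np.random.default_rng(0)
X, Y = rng.standard_normal((500, 20)), rng.standard_normal((500, 20))
X_val, Y_val = rng.standard_normal((100, 20)), rng.standard_normal((100, 20))

model = CCA_EY(latent_dimensions=2, epochs=100, early_stopping=True)
model.fit((X, Y), validation_views=(X_val, Y_val))
print(model.score((X_val, Y_val)))  # correlation captured on the validation views
```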