From 6dbf870fe4aeddf98c5b19616b22fbb14d8297a5 Mon Sep 17 00:00:00 2001 From: Austin Talbot Date: Wed, 6 Sep 2023 19:50:31 -0400 Subject: [PATCH] Feature/ppca (#235) This implements the basic maximum likelihood PPCA model along with a base class for all future PPCA/factor analysis models. --- python/python/bystro/_template_npr.py | 18 +- python/python/bystro/_template_sgd_np.py | 34 +- .../covariance/{test => tests}/__init__.py | 0 .../{test => tests}/test_base_covariance.py | 0 .../{test => tests}/test_base_precision.py | 0 .../{test => tests}/test_covariance_np.py | 0 python/python/bystro/supervised_ppca/_base.py | 552 ++++++++++++++++++ .../supervised_ppca/ppca_analytic_np.py | 115 ++++ .../bystro/supervised_ppca/tests/__init__.py | 0 .../supervised_ppca/tests/test_analytic.py | 23 + 10 files changed, 719 insertions(+), 23 deletions(-) rename python/python/bystro/covariance/{test => tests}/__init__.py (100%) rename python/python/bystro/covariance/{test => tests}/test_base_covariance.py (100%) rename python/python/bystro/covariance/{test => tests}/test_base_precision.py (100%) rename python/python/bystro/covariance/{test => tests}/test_covariance_np.py (100%) create mode 100644 python/python/bystro/supervised_ppca/_base.py create mode 100644 python/python/bystro/supervised_ppca/ppca_analytic_np.py create mode 100644 python/python/bystro/supervised_ppca/tests/__init__.py create mode 100644 python/python/bystro/supervised_ppca/tests/test_analytic.py diff --git a/python/python/bystro/_template_npr.py b/python/python/bystro/_template_npr.py index af87ed4ae..effee1b30 100644 --- a/python/python/bystro/_template_npr.py +++ b/python/python/bystro/_template_npr.py @@ -10,7 +10,7 @@ Objects ------- -_BaseNPRModel(object) +BaseNumpyroModel(mcmc_options=None,hp_options=None) This is the template Numpyro model """ @@ -19,7 +19,11 @@ import numpyro # type: ignore -class _BaseNumpyroModel: +class BaseNumpyroModel(abc.ABC): + """ + The template for a numpyro-based model + """ + def __init__(self, mcmc_options=None, hp_options=None): """ @@ -75,11 +79,12 @@ def pickle(self, path): ------- """ - assert self.samples is not None, "Fit model first" + mydict = {"model": self} with open(path, "wb") as f: - cloudpickle.dump(self.samples, f) + cloudpickle.dump(mydict, f) - def unpickle(self, path): + @classmethod + def unpickle(cls, path): """ This loads samples from a previously saved model @@ -91,7 +96,8 @@ def unpickle(self, path): """ with open(path, "rb") as f: - self.samples = cloudpickle.load(f) + myDict = cloudpickle.load(f) + return myDict["model"] def _fill_mcmc_options(self, mcmc_options): """ diff --git a/python/python/bystro/_template_sgd_np.py b/python/python/bystro/_template_sgd_np.py index 7fa4dc9c9..e9eb0c660 100644 --- a/python/python/bystro/_template_sgd_np.py +++ b/python/python/bystro/_template_sgd_np.py @@ -4,7 +4,8 @@ Objects ------- -_BaseSGDModel(object): +BaseSGDModel(training_options=None) + Methods ------- @@ -14,12 +15,13 @@ import cloudpickle # type: ignore -class _BaseSGDModel(object): +class BaseSGDModel(abc.ABC): + """ + The base class of a model relying on stochastic gradient descent for + inference + """ + def __init__(self, training_options=None): - """ - The base class of a model relying on stochastic gradient descent for - inference - """ if training_options is None: training_options = {} self.training_options = self._fill_training_options(training_options) @@ -51,8 +53,8 @@ def pickle(self, path): with open(path, "wb") as f: cloudpickle.dump(mydict, f) - @abc.abstractmethod 
- def unpickle(self, path): + @classmethod + def unpickle(cls, path): """ Method for loading the model @@ -61,6 +63,9 @@ def unpickle(self, path): path : str The directory to load the model from """ + with open(path, "rb") as f: + myDict = cloudpickle.load(f) + return myDict["model"] def _fill_training_options(self, training_options): """ @@ -79,16 +84,15 @@ def _fill_training_options(self, training_options): return training_opts @abc.abstractmethod - def _save_variables(self, training_variables): + def _store_instance_variables(self, trainable_variables): """ - This saves the final parameter values after training + Saves the learned variables Parameters ---------- - training_variables :list - The variables trained + trainable_variables : list + List of variables to save """ - raise NotImplementedError("_save_variables") @abc.abstractmethod def _initialize_save_losses(self): @@ -99,7 +103,6 @@ def _initialize_save_losses(self): Parameters ---------- """ - raise NotImplementedError("_initialize_save_losses") @abc.abstractmethod def _save_losses(self, *args): @@ -109,7 +112,6 @@ def _save_losses(self, *args): Parameters ---------- """ - raise NotImplementedError("_save_losses") @abc.abstractmethod def _test_inputs(self, *args): @@ -119,7 +121,6 @@ def _test_inputs(self, *args): Parameters ---------- """ - raise NotImplementedError("_transform_training_data") @abc.abstractmethod def _transform_training_data(self, *args): @@ -129,4 +130,3 @@ def _transform_training_data(self, *args): Parameters ---------- """ - raise NotImplementedError("_transform_training_data") diff --git a/python/python/bystro/covariance/test/__init__.py b/python/python/bystro/covariance/tests/__init__.py similarity index 100% rename from python/python/bystro/covariance/test/__init__.py rename to python/python/bystro/covariance/tests/__init__.py diff --git a/python/python/bystro/covariance/test/test_base_covariance.py b/python/python/bystro/covariance/tests/test_base_covariance.py similarity index 100% rename from python/python/bystro/covariance/test/test_base_covariance.py rename to python/python/bystro/covariance/tests/test_base_covariance.py diff --git a/python/python/bystro/covariance/test/test_base_precision.py b/python/python/bystro/covariance/tests/test_base_precision.py similarity index 100% rename from python/python/bystro/covariance/test/test_base_precision.py rename to python/python/bystro/covariance/tests/test_base_precision.py diff --git a/python/python/bystro/covariance/test/test_covariance_np.py b/python/python/bystro/covariance/tests/test_covariance_np.py similarity index 100% rename from python/python/bystro/covariance/test/test_covariance_np.py rename to python/python/bystro/covariance/tests/test_covariance_np.py diff --git a/python/python/bystro/supervised_ppca/_base.py b/python/python/bystro/supervised_ppca/_base.py new file mode 100644 index 000000000..897ab5837 --- /dev/null +++ b/python/python/bystro/supervised_ppca/_base.py @@ -0,0 +1,552 @@ +""" +This provides the base class of any Gaussian factor model, such as +(probabilistic) PCA, supervised PCA, and factor analysis, as well as +supervised and adversarial derivatives. 
+
+Implementing an extension model requires that the following methods be
+implemented:
+    fit - Learns the model given data (and, optionally, supervised/adversarial
+          labels)
+    get_covariance - Given a fitted model, returns the covariance matrix
+
+For the remaining shared methods (computing likelihoods etc.) there are two
+options: compute the covariance matrix and then use the default methods from
+the covariance module, or use the Sherman-Woodbury matrix identity to
+invert the matrix more efficiently. Currently only the first is implemented,
+but the second is left as an option for future implementation.
+
+Objects
+-------
+BaseGaussianFactorModel(BaseSGDModel)
+    Base class of all factor analysis models implementing any shared
+    Gaussian methods.
+
+BasePCASGDModel(BaseGaussianFactorModel)
+    This is the base class of models that use stochastic
+    gradient descent for inference. This reduces boilerplate code
+    associated with some of the standard methods etc.
+
+Methods
+-------
+None
+"""
+from abc import abstractmethod, ABC
+from datetime import datetime as dt
+
+import numpy as np
+from numpy import linalg as la
+import pytz
+
+from bystro.covariance._base_covariance import (  # type: ignore
+    _get_stable_rank,  # type: ignore
+    _conditional_score,  # type: ignore
+    _conditional_score_samples,  # type: ignore
+    _marginal_score,  # type: ignore
+    _marginal_score_samples,  # type: ignore
+    _score,  # type: ignore
+    _score_samples,  # type: ignore
+    _entropy,  # type: ignore
+    _entropy_subset,  # type: ignore
+    _mutual_information,  # type: ignore
+)  # type: ignore
+from bystro._template_sgd_np import BaseSGDModel  # type: ignore
+
+
+class BaseGaussianFactorModel(BaseSGDModel, ABC):
+    def __init__(self, n_components=2):
+        """
+        This is the base class of the model. Will never be called directly.
+
+        Parameters
+        ----------
+        n_components : int,default=2
+            The latent dimensionality
+
+        Sets
+        ----
+        creationDate : datetime
+            The date/time that the object was created
+        """
+        self.n_components = int(n_components)
+        self.creationDate = dt.now(pytz.utc)
+
+    @abstractmethod
+    def fit(self, *args):
+        """
+        Fits a model given covariates X, as well as optional labels y in the
+        supervised methods
+
+        Parameters
+        ----------
+        X : np.array-like,(n_samples,n_covariates)
+            The data
+
+        other arguments
+
+        Returns
+        -------
+        self : object
+            The model
+        """
+
+    @abstractmethod
+    def get_covariance(self):
+        """
+        Gets the covariance matrix defined by the model parameters
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        covariance : np.array-like(p,p)
+            The covariance matrix
+        """
+
+    def get_precision(self):
+        """
+        Gets the precision matrix, defined as the inverse of the covariance
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        precision : np.array-like(p,p)
+            The inverse of the covariance matrix
+        """
+        covariance = self.get_covariance()
+        precision = la.inv(covariance)
+        return precision
+
+    def get_stable_rank(self):
+        """
+        Returns the stable rank, defined as
+        ||A||_F^2/||A||^2
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        srank : float
+            The stable rank. See Vershynin, High-Dimensional Probability,
+            for discussion; this is a statistically stable approximation
+            to the rank
+        """
+        covariance = self.get_covariance()
+        srank = _get_stable_rank(covariance)
+        return srank
+
+    def transform(self, X):
+        """
+        This returns the latent variable estimates given X
+
+        Parameters
+        ----------
+        X : np.array-like,(N_samples,p)
+            The data to transform.
+
+        Returns
+        -------
+        S : np.array-like,(N_samples,n_components)
+            The factor estimates
+        """
+        prec = self.get_precision()
+        # Posterior mean of the factors: E[z|x] = W Sigma^{-1} x
+        coefs = np.dot(self.W_, prec)
+        S = np.dot(X, coefs.T)
+        return S
+
+    def conditional_score(
+        self, X, observed_feature_idxs, weights=None, sherman_woodbury=False
+    ):
+        """
+        Returns the predictive log-likelihood of a subset of data.
+
+        mean(log p(X[idx==1]|X[idx==0],covariance))
+
+        Parameters
+        ----------
+        X : np.array-like,(N,p)
+            The data
+
+        observed_feature_idxs: np.array-like,(p,)
+            The observation locations
+
+        weights : np.array-like,(N,),default=None
+            The optional weights on the samples
+
+        sherman_woodbury : bool,default=False
+            Whether to use the Sherman-Woodbury identity rather than
+            computing the covariance explicitly (not yet implemented)
+
+        Returns
+        -------
+        avg_score : float
+            Average log likelihood
+        """
+        if not sherman_woodbury:
+            covariance = self.get_covariance()
+            avg_score = _conditional_score(
+                covariance, X, observed_feature_idxs, weights=weights
+            )
+        else:
+            raise NotImplementedError(
+                "Sherman-Woodbury matrix inversion not implemented yet"
+            )
+        return avg_score
+
+    def conditional_score_samples(
+        self, X, observed_feature_idxs, sherman_woodbury=False
+    ):
+        """
+        Return the conditional log likelihood of each sample, that is
+
+        log p(X[idx==1]|X[idx==0],covariance)
+
+        Parameters
+        ----------
+        X : np.array-like,(N,p)
+            The data
+
+        observed_feature_idxs: np.array-like,(p,)
+            The observation locations
+
+        sherman_woodbury : bool,default=False
+            Whether to use the Sherman-Woodbury identity rather than
+            computing the covariance explicitly (not yet implemented)
+
+        Returns
+        -------
+        scores : np.array-like,(N,)
+            Log likelihood for each sample
+        """
+        if not sherman_woodbury:
+            covariance = self.get_covariance()
+            scores = _conditional_score_samples(
+                covariance, X, observed_feature_idxs
+            )
+        else:
+            raise NotImplementedError(
+                "Sherman-Woodbury matrix inversion not implemented yet"
+            )
+        return scores
+
+    def marginal_score(
+        self, X, observed_feature_idxs, weights=None, sherman_woodbury=False
+    ):
+        """
+        Returns the average marginal log-likelihood of a subset of data
+
+        Parameters
+        ----------
+        X : np.array-like,(N,sum(observed_feature_idxs))
+            The data
+
+        observed_feature_idxs: np.array-like,(p,)
+            The observation locations
+
+        weights : np.array-like,(N,),default=None
+            The optional weights on the samples
+
+        sherman_woodbury : bool,default=False
+            Whether to use the Sherman-Woodbury identity rather than
+            computing the covariance explicitly (not yet implemented)
+
+        Returns
+        -------
+        avg_score : float
+            Average log likelihood
+        """
+        if not sherman_woodbury:
+            covariance = self.get_covariance()
+            avg_score = _marginal_score(
+                covariance, X, observed_feature_idxs, weights=weights
+            )
+        else:
+            raise NotImplementedError(
+                "Sherman-Woodbury matrix inversion not implemented yet"
+            )
+        return avg_score
+
+    def marginal_score_samples(
+        self, X, observed_feature_idxs, sherman_woodbury=False
+    ):
+        """
+        Returns the marginal log-likelihood of a subset of data for each
+        sample
+
+        Parameters
+        ----------
+        X : np.array-like,(N,sum(observed_feature_idxs))
+            The data
+
+        observed_feature_idxs: np.array-like,(p,)
+            The observation locations
+
+        sherman_woodbury : bool,default=False
+            Whether to use the Sherman-Woodbury identity rather than
+            computing the covariance explicitly (not yet implemented)
+
+        Returns
+        -------
+        scores : np.array-like,(N,)
+            Log likelihood for each sample
+        """
+        if not sherman_woodbury:
+            covariance = self.get_covariance()
+            scores = _marginal_score_samples(
+                covariance, X, observed_feature_idxs
+            )
+        else:
+            raise NotImplementedError(
+                "Sherman-Woodbury matrix inversion not implemented yet"
+            )
+        return scores
+
+    def score(self, X, weights=None, sherman_woodbury=False):
+        """
+        Returns the average log likelihood of the data.
+
+        Parameters
+        ----------
+        X : np.array-like,(N,p)
+            The data
+
+        weights : np.array-like,(N,),default=None
+            The optional weights on the samples
+
+        sherman_woodbury : bool,default=False
+            Whether to use the Sherman-Woodbury identity rather than
+            computing the covariance explicitly (not yet implemented)
+
+        Returns
+        -------
+        avg_score : float
+            Average log likelihood
+        """
+        if not sherman_woodbury:
+            covariance = self.get_covariance()
+            avg_score = _score(covariance, X, weights=weights)
+        else:
+            raise NotImplementedError(
+                "Sherman-Woodbury matrix inversion not implemented yet"
+            )
+        return avg_score
+
+    def score_samples(self, X, sherman_woodbury=False):
+        """
+        Return the log likelihood of each sample
+
+        Parameters
+        ----------
+        X : np.array-like,(N,p)
+            The data
+
+        sherman_woodbury : bool,default=False
+            Whether to use the Sherman-Woodbury identity rather than
+            computing the covariance explicitly (not yet implemented)
+
+        Returns
+        -------
+        scores : np.array-like,(N,)
+            Log likelihood for each sample
+        """
+        if not sherman_woodbury:
+            covariance = self.get_covariance()
+            scores = _score_samples(covariance, X)
+        else:
+            raise NotImplementedError(
+                "Sherman-Woodbury matrix inversion not implemented yet"
+            )
+        return scores
+
+    def get_entropy(self):
+        """
+        Computes the entropy of a Gaussian distribution parameterized by
+        covariance.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        entropy : float
+            The differential entropy of the distribution
+        """
+        covariance = self.get_covariance()
+        entropy = _entropy(covariance)
+        return entropy
+
+    def get_entropy_subset(self, observed_feature_idxs):
+        """
+        Computes the entropy of a subset of the Gaussian distribution
+        parameterized by covariance.
+
+        Parameters
+        ----------
+        observed_feature_idxs: np.array-like,(p,)
+            The observation locations
+
+        Returns
+        -------
+        entropy : float
+            The differential entropy of the distribution
+        """
+        covariance = self.get_covariance()
+        entropy = _entropy_subset(covariance, observed_feature_idxs)
+        return entropy
+
+    def mutual_information(
+        self, observed_feature_idxs1, observed_feature_idxs2
+    ):
+        """
+        This computes the mutual information between the two sets of
+        covariates based on the model.
+
+        Parameters
+        ----------
+        observed_feature_idxs1 : np.array-like,(p,)
+            First group of variables
+
+        observed_feature_idxs2 : np.array-like,(p,)
+            Second group of variables
+
+        Returns
+        -------
+        mutual_information : float
+            The mutual information between the two sets of variables
+        """
+        covariance = self.get_covariance()
+        mutual_information = _mutual_information(
+            covariance, observed_feature_idxs1, observed_feature_idxs2
+        )
+        return mutual_information
+
+
+class BasePCASGDModel(BaseGaussianFactorModel, ABC):
+    def __init__(
+        self, n_components=2, training_options=None, prior_options=None
+    ):
+        """
+        This is the base class of models that use stochastic
+        gradient descent for inference. This reduces boilerplate code
+        associated with some of the standard methods etc.
+
+        Parameters
+        ----------
+        n_components : int,default=2
+            The latent dimensionality
+
+        training_options : dict,default={}
+            The options for gradient descent
+
+        prior_options : dict,default={}
+            The options for priors on model parameters
+
+        Sets
+        ----
+        creationDate : datetime
+            The date/time that the object was created
+        """
+        super().__init__(n_components=n_components)
+
+        if training_options is None:
+            training_options = {}
+        if prior_options is None:
+            prior_options = {}
+
+        self.training_options = self._fill_training_options(training_options)
+        self.prior_options = self._fill_prior_options(prior_options)
+
+    def _fill_training_options(self, training_options):
+        """
+        This sets the default parameters for stochastic gradient descent,
+        our inference strategy for the model.
+
+        Parameters
+        ----------
+        training_options : dict
+            The original options set by the user, passed as a dictionary
+
+        Options
+        -------
+        n_iterations : int, default=3000
+            Number of iterations to train using stochastic gradient descent
+
+        learning_rate : float, default=1e-2
+            Learning rate of gradient descent
+
+        method : string {'Nadam'}, default='Nadam'
+            The learning algorithm
+
+        batch_size : int, default=100
+            The number of observations to use at each iteration. If None,
+            this corresponds to full-batch learning
+
+        momentum : float, default=0.9
+            The momentum parameter of gradient descent
+
+        gpu_memory : int, default=1024
+            The amount of memory you wish to use during training
+        """
+        default_options = {
+            "n_iterations": 3000,
+            "learning_rate": 1e-2,
+            "gpu_memory": 1024,
+            "method": "Nadam",
+            "batch_size": 100,
+            "momentum": 0.9,
+        }
+        tops = {**default_options, **training_options}
+
+        default_keys = set(default_options.keys())
+        final_keys = set(tops.keys())
+
+        expected_but_missing_keys = default_keys - final_keys
+        unexpected_but_present_keys = final_keys - default_keys
+        if expected_but_missing_keys:
+            raise ValueError(
+                "the following training options were expected but not "
+                f"found: {expected_but_missing_keys}"
+            )
+        if unexpected_but_present_keys:
+            raise ValueError(
+                "the following training options were unrecognized but "
+                f"provided: {unexpected_but_present_keys}"
+            )
+        return tops
+
+    def _initialize_save_losses(self):
+        """
+        This method initializes the arrays to track relevant variables
+        during training at each iteration
+
+        Sets
+        ----
+        losses_likelihood : np.array(n_iterations)
+            The log likelihood
+
+        losses_prior : np.array(n_iterations)
+            The log prior
+
+        losses_posterior : np.array(n_iterations)
+            The log posterior
+        """
+        n_iterations = self.training_options["n_iterations"]
+        self.losses_likelihood = np.zeros(n_iterations)
+        self.losses_prior = np.zeros(n_iterations)
+        self.losses_posterior = np.zeros(n_iterations)
+
+    def _save_losses(self, i, log_likelihood, log_prior, log_posterior):
+        """
+        Saves the values of the losses at each iteration
+
+        Parameters
+        ----------
+        i : int
+            Current training iteration
+
+        log_likelihood : tf.Float
+            The log likelihood
+
+        log_prior : tf.Float or np.ndarray
+            The log prior
+
+        log_posterior : tf.Float
+            The log posterior
+        """
+        self.losses_likelihood[i] = log_likelihood.numpy()
+        if isinstance(log_prior, np.ndarray):
+            self.losses_prior[i] = log_prior
+        else:
+            self.losses_prior[i] = log_prior.numpy()
+        self.losses_posterior[i] = log_posterior.numpy()
+
+    @abstractmethod
+    def _create_prior(self):
+        """
+        Creates a prior on the parameters, taking the trainable variable
+        dictionary as input
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        log_prior : function
+            The function representing the negative log density of the prior
+        """
+
+    @abstractmethod
+    def _fill_prior_options(self, prior_options):
+        """
+        Fills in the default options for the prior on the model parameters.
+        Called in __init__, so it must be implemented by subclasses.
+
+        Parameters
+        ----------
+        prior_options : dict
+            The original prior options set by the user
+
+        Returns
+        -------
+        prior_options : dict
+            The prior options with defaults filled in
+        """
diff --git a/python/python/bystro/supervised_ppca/ppca_analytic_np.py b/python/python/bystro/supervised_ppca/ppca_analytic_np.py
new file mode 100644
index 000000000..b689a0cbc
--- /dev/null
+++ b/python/python/bystro/supervised_ppca/ppca_analytic_np.py
@@ -0,0 +1,115 @@
+"""
+This implements the standard version of probabilistic PCA found in Chapter
+12 of Bishop (2006). It has an analytic solution, making this method very
+quick to fit, and it will serve as a baseline comparison for many of our
+methods.
+
+Objects
+-------
+PPCAanalytic(n_components=2)
+    The plain vanilla probabilistic PCA model, estimated via maximum
+    likelihood with an analytic formula.
+
+Methods
+-------
+None
+"""
+import numpy as np
+import numpy.linalg as la
+from bystro.supervised_ppca._base import BaseGaussianFactorModel
+
+
+class PPCAanalytic(BaseGaussianFactorModel):
+    """
+    Analytic PPCA solution as described by Bishop
+
+    Parameters
+    ----------
+    n_components : int,default=2
+        The latent dimensionality
+    """
+
+    def __init__(self, n_components=2):
+        super().__init__(n_components=n_components)
+
+    def __repr__(self):
+        return f"PPCAanalytic(n_components={self.n_components})"
+
+    def fit(self, X):
+        """
+        Fits a model given covariates X
+
+        Parameters
+        ----------
+        X : np.array-like,(n_samples,n_covariates)
+            The data
+
+        Returns
+        -------
+        self : object
+            The model
+        """
+        N, self.p = X.shape
+        L, p = self.n_components, self.p
+
+        # Eigenvalues of the sample covariance, via the SVD of the data
+        _, s, V = la.svd(X, full_matrices=False)
+        eigenvals = s ** 2 / (N - 1)
+
+        # MLE of the isotropic noise: the average discarded eigenvalue
+        var = 1.0 / (p - L) * (np.sum(eigenvals) - np.sum(eigenvals[:L]))
+
+        # MLE of the loadings: W = V_L (Lambda_L - sigma^2 I)^{1/2}
+        L_m = np.diag((eigenvals[:L] - np.ones(L) * var) ** 0.5)
+        W = np.dot(V[:L].T, L_m)
+        self._store_instance_variables([W, var])
+        return self
+
+    def get_covariance(self):
+        """
+        Gets the covariance matrix
+
+        Sigma = W^TW + sigma2*I
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        covariance : np.array-like(p,p)
+            The covariance matrix
+        """
+        covariance = np.dot(self.W_.T, self.W_) + self.sigma2_ * np.eye(self.p)
+        return covariance
+
+    def _store_instance_variables(self, trainable_variables):
+        """
+        Saves the learned variables
+
+        Parameters
+        ----------
+        trainable_variables : list
+            List of variables saved
+
+        Sets
+        ----
+        W_ : np.array-like,(n_components,p)
+            The loadings
+
+        sigma2_ : float
+            The isotropic variance
+        """
+        self.W_ = trainable_variables[0].T
+        self.sigma2_ = trainable_variables[1]
+
+    # The remaining abstract methods of BaseSGDModel are irrelevant for an
+    # analytic solution, so they are stubbed out
+    def _initialize_save_losses(self):
+        pass
+
+    def _save_losses(self):
+        pass
+
+    def _test_inputs(self):
+        pass
+
+    def _transform_training_data(self):
+        pass
diff --git a/python/python/bystro/supervised_ppca/tests/__init__.py b/python/python/bystro/supervised_ppca/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/python/bystro/supervised_ppca/tests/test_analytic.py b/python/python/bystro/supervised_ppca/tests/test_analytic.py
new file mode 100644
index 000000000..32ae269c1
--- /dev/null
+++ b/python/python/bystro/supervised_ppca/tests/test_analytic.py
@@ -0,0 +1,23 @@
+import numpy as np
+import numpy.linalg as la
+import scipy.stats as st  # type: ignore
+from bystro.supervised_ppca.ppca_analytic_np import PPCAanalytic
+
+
+def test_methods():
+    rng = np.random.default_rng(2021)
+    N, p = 100000, 10
+    L = 2
+    W = rng.normal(size=(L, p))
+    sigma2 = 0.2
+    cov = np.dot(W.T, W) + sigma2 * np.eye(p)
+    X = st.multivariate_normal.rvs(
+        mean=np.zeros(p), cov=cov, size=N, random_state=1993
+    )
+    model = PPCAanalytic(n_components=L)
+    model.fit(X)
+    cov_est = model.get_covariance()
+    cov_emp = np.dot(X.T, X) / X.shape[0]
+    # The fitted covariance should recover the truth about as well as the
+    # empirical covariance does
+    s1 = la.norm(cov_emp - cov)
+    s2 = la.norm(cov_est - cov)
+    assert np.abs(s2 - s1) <= 0.01
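---

Usage sketch (illustrative, not part of the patch): the snippet below exercises only the API added in this PR — PPCAanalytic.fit, transform, and get_covariance — on data simulated from the PPCA generative model. The sizes, seed, and noise scale are arbitrary choices for the example.

    import numpy as np

    from bystro.supervised_ppca.ppca_analytic_np import PPCAanalytic

    rng = np.random.default_rng(42)
    N, p, L = 5000, 10, 2

    # Simulate from the PPCA generative model: x = W^T z + noise
    W_true = rng.normal(size=(L, p))
    Z = rng.normal(size=(N, L))
    X = Z @ W_true + rng.normal(scale=0.5, size=(N, p))

    model = PPCAanalytic(n_components=L)
    model.fit(X)

    S = model.transform(X)          # (N, L) latent factor estimates
    Sigma = model.get_covariance()  # (p, p) model covariance W^T W + sigma2 * I
    print(S.shape, Sigma.shape, model.sigma2_)  # sigma2_ should be near 0.25

Since the solution is analytic (a single SVD), no training options are involved for this model; the inherited likelihood methods (score, score_samples, etc.) then operate on the covariance returned by get_covariance.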
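---

Extension sketch (hypothetical, not part of the patch): per the _base.py module docstring, an extension model only needs fit and get_covariance; the remaining BaseSGDModel abstract methods can be stubbed out exactly as PPCAanalytic does. The class below, PPCA with a user-fixed noise variance, is invented purely to illustrate the contract.

    import numpy as np
    import numpy.linalg as la

    from bystro.supervised_ppca._base import BaseGaussianFactorModel


    class PCAFixedNoise(BaseGaussianFactorModel):
        """Hypothetical extension: PPCA loadings with a fixed noise variance."""

        def __init__(self, n_components=2, sigma2=1.0):
            super().__init__(n_components=n_components)
            self.sigma2_ = float(sigma2)

        def fit(self, X):
            N, self.p = X.shape
            L = self.n_components
            _, s, V = la.svd(X, full_matrices=False)
            eigenvals = s ** 2 / (N - 1)
            # Same loadings formula as PPCAanalytic, but sigma2 is held fixed
            scale = np.sqrt(np.maximum(eigenvals[:L] - self.sigma2_, 0))
            self._store_instance_variables(
                [np.dot(V[:L].T, np.diag(scale)), self.sigma2_]
            )
            return self

        def get_covariance(self):
            return np.dot(self.W_.T, self.W_) + self.sigma2_ * np.eye(self.p)

        def _store_instance_variables(self, trainable_variables):
            self.W_ = trainable_variables[0].T
            self.sigma2_ = trainable_variables[1]

        # Closed-form fit: the SGD bookkeeping hooks are not needed
        def _initialize_save_losses(self):
            pass

        def _save_losses(self):
            pass

        def _test_inputs(self):
            pass

        def _transform_training_data(self):
            pass

Every shared method inherited from BaseGaussianFactorModel (transform, score, get_entropy, mutual_information, ...) then works with no further code, since each one is computed from get_covariance.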