diff --git a/python/python/bystro/_template_npr.py b/python/python/bystro/_template_npr.py new file mode 100644 index 000000000..af87ed4ae --- /dev/null +++ b/python/python/bystro/_template_npr.py @@ -0,0 +1,130 @@ +""" +This provides a basic template for any model that uses numpyro as an +inference method. It has several methods that should be filled by any +object extending the template, namely + fit + _fill_hp_options + +These are the methods for running the samples given data and providing +hyperameter selections respectively. + +Objects +------- +_BaseNPRModel(object) + This is the template Numpyro model + +""" +import abc +import cloudpickle # type: ignore +import numpyro # type: ignore + + +class _BaseNumpyroModel: + def __init__(self, mcmc_options=None, hp_options=None): + """ + + Parameters + ---------- + + Returns + ------- + + """ + if mcmc_options is None: + mcmc_options = {} + if hp_options is None: + hp_options = {} + self.mcmc_options = self._fill_mcmc_options(mcmc_options) + self.hp_options = self._fill_hp_options(hp_options) + self.samples = None + + @abc.abstractmethod + def fit(self, *args): + """ + + Parameters + ---------- + + Returns + ------- + + """ + + def render_model(self): + """ + This provides a graphical representation of the model + + Parameters + ---------- + + Returns + ------- + + """ + assert self._model is not None, "Fit model first" + return numpyro.render_model(self._model, model_kwargs=self.model_kwargs) + + def pickle(self, path): + """ + This saves samples from a fit model + + Parameters + ---------- + + Returns + ------- + + """ + assert self.samples is not None, "Fit model first" + with open(path, "wb") as f: + cloudpickle.dump(self.samples, f) + + def unpickle(self, path): + """ + This loads samples from a previously saved model + + Parameters + ---------- + + Returns + ------- + + """ + with open(path, "rb") as f: + self.samples = cloudpickle.load(f) + + def _fill_mcmc_options(self, mcmc_options): + """ + This fills in default MCMC options of the sampler. Further methods + might override these but these are common/basic enough to leave in + as an implemented method. + + Parameters + ---------- + + Returns + ------- + + """ + default_options = { + "num_chains": 1, + "num_warmup": 500, + "num_samples": 2000, + } + mopts = {**default_options, **mcmc_options} + return mopts + + @abc.abstractmethod + def _fill_hp_options(self, hp_options): + """ + This fills in default hyperparameters of the model. Since these are + not conserved between models we leave this as an abstract method + to be filled in per model. + + Parameters + ---------- + + Returns + ------- + + """ diff --git a/python/python/bystro/_template_sgd_np.py b/python/python/bystro/_template_sgd_np.py new file mode 100644 index 000000000..7fa4dc9c9 --- /dev/null +++ b/python/python/bystro/_template_sgd_np.py @@ -0,0 +1,132 @@ +""" +This implements a base class for any model using stochastic gradient +descent-based techniques for inference. 
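As a usage illustration of the `_BaseNumpyroModel` template above: a subclass is expected to supply `fit` and `_fill_hp_options`, and to set `_model`, `model_kwargs`, and `samples` so that `render_model` and `pickle` work. The sketch below is hypothetical (the `NormalMeanModel` class and its `prior_scale` option are not part of this patch) and only indicates the intended pattern:

import jax.random as random
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS

from bystro._template_npr import _BaseNumpyroModel


class NormalMeanModel(_BaseNumpyroModel):
    def fit(self, X):
        def model(data):
            # simple Gaussian mean model as a stand-in for a real likelihood
            mu = numpyro.sample("mu", dist.Normal(0.0, self.hp_options["prior_scale"]))
            numpyro.sample("obs", dist.Normal(mu, 1.0), obs=data)

        self._model = model
        self.model_kwargs = {"data": X}
        mcmc = MCMC(
            NUTS(model),
            num_warmup=self.mcmc_options["num_warmup"],
            num_samples=self.mcmc_options["num_samples"],
            num_chains=self.mcmc_options["num_chains"],
        )
        mcmc.run(random.PRNGKey(0), X)
        self.samples = mcmc.get_samples()
        return self

    def _fill_hp_options(self, hp_options):
        return {**{"prior_scale": 10.0}, **hp_options}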
+
+Objects
+-------
+_BaseSGDModel(object):
+
+Methods
+-------
+None
+"""
+import abc
+import cloudpickle  # type: ignore
+
+
+class _BaseSGDModel(object):
+    def __init__(self, training_options=None):
+        """
+        The base class of a model relying on stochastic gradient descent
+        for inference.
+        """
+        if training_options is None:
+            training_options = {}
+        self.training_options = self._fill_training_options(training_options)
+
+    @abc.abstractmethod
+    def fit(self, *args, **kwargs):
+        """
+        Method for fitting
+
+        Parameters
+        ----------
+        *args:
+            Positional arguments
+
+        **kwargs:
+            Keyword arguments
+        """
+
+    def pickle(self, path):
+        """
+        Method for saving the model
+
+        Parameters
+        ----------
+        path : str
+            The path to save the model to
+        """
+        mydict = {"model": self}
+        with open(path, "wb") as f:
+            cloudpickle.dump(mydict, f)
+
+    @abc.abstractmethod
+    def unpickle(self, path):
+        """
+        Method for loading the model
+
+        Parameters
+        ----------
+        path : str
+            The path to load the model from
+        """
+
+    def _fill_training_options(self, training_options):
+        """
+        This fills any relevant parameters for the learning algorithm
+
+        Parameters
+        ----------
+        training_options : dict
+
+        Returns
+        -------
+        training_opts : dict
+        """
+        default_options = {"n_iterations": 5000}
+        training_opts = {**default_options, **training_options}
+        return training_opts
+
+    @abc.abstractmethod
+    def _save_variables(self, training_variables):
+        """
+        This saves the final parameter values after training
+
+        Parameters
+        ----------
+        training_variables : list
+            The variables trained
+        """
+        raise NotImplementedError("_save_variables")
+
+    @abc.abstractmethod
+    def _initialize_save_losses(self):
+        """
+        This method initializes the arrays to track relevant variables
+        during training
+
+        Parameters
+        ----------
+        """
+        raise NotImplementedError("_initialize_save_losses")
+
+    @abc.abstractmethod
+    def _save_losses(self, *args):
+        """
+        This saves the respective losses at each iteration
+
+        Parameters
+        ----------
+        """
+        raise NotImplementedError("_save_losses")
+
+    @abc.abstractmethod
+    def _test_inputs(self, *args):
+        """
+        This performs error checking on inputs for fit
+
+        Parameters
+        ----------
+        """
+        raise NotImplementedError("_test_inputs")
+
+    @abc.abstractmethod
+    def _transform_training_data(self, *args):
+        """
+        This converts training data to adequate format
+
+        Parameters
+        ----------
+        """
+        raise NotImplementedError("_transform_training_data")
diff --git a/python/python/bystro/ancestry/tests/test_train.py b/python/python/bystro/ancestry/tests/test_train.py
index 0ec81ce02..0497ab7f1 100644
--- a/python/python/bystro/ancestry/tests/test_train.py
+++ b/python/python/bystro/ancestry/tests/test_train.py
@@ -13,7 +13,9 @@
     superpop_predictions_from_pop_probs,
     superpop_probs_from_pop_probs,
     convert_1kgp_vcf_to_dosage,
-    process_vcf_for_pc_transformation
+    process_vcf_for_pc_transformation,
+    restrict_loadings_variants_to_vcf,
+    apply_pca_transform
 )
 from bystro.vcf_utils.simulate_random_vcf import (
     generate_simulated_vcf,
@@ -286,3 +288,67 @@ def test_1kgp_vcf_to_dosage():
     for column in processed_vcf.columns:
         for value in processed_vcf[column]:
             assert value in valid_values
+
+
+def test_restrict_loadings_variants_to_vcf():
+    """Tests restriction of gnomad loadings variants to vcf variants."""
+    num_samples = 10
+    num_vars = 10
+    simulated_vcf = generate_simulated_vcf(num_samples, num_vars)
+    sim_vcf_df = convert_sim_vcf_to_df(simulated_vcf)
+    loaded_vcf = convert_1kgp_vcf_to_dosage(sim_vcf_df)
processed_sim_vcf = process_vcf_for_pc_transformation(loaded_vcf) + # Using same indices, generate sim loadings + num_pcs = 30 + sim_pcs = np.random.random((num_samples, num_pcs)) + sim_loadings = pd.DataFrame( + data=sim_pcs, index=processed_sim_vcf.columns, columns=[f"PC{i+1}" for i in range(num_pcs)] + ) + # Run restrict_loadings with sim data + pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf( + sim_loadings, processed_sim_vcf + ) + # Check for expected columns/indices + expected_columns_loadings = [f"PC{i+1}" for i in range(num_pcs)] + assert set(expected_columns_loadings) == set(pc_loadings_overlap.columns) + expected_index_genos_transpose = ["SampleID1", "SampleID2"] + for sample in expected_index_genos_transpose: + assert sample in genos_overlap_transpose.index + # Check that the variant IDs match up to sorting + assert set(genos_overlap_transpose.columns) == set(pc_loadings_overlap.index) + + +def test_apply_pca_transform(): + # Prepare test data same as before + num_samples = 10 + num_vars = 10 + simulated_vcf = generate_simulated_vcf(num_samples, num_vars) + sim_vcf_df = convert_sim_vcf_to_df(simulated_vcf) + loaded_vcf = convert_1kgp_vcf_to_dosage(sim_vcf_df) + processed_sim_vcf = process_vcf_for_pc_transformation(loaded_vcf) + # Using same indices, generate sim loadings + num_pcs = 30 + sim_pcs = np.random.random((num_samples, num_pcs)) + sim_loadings = pd.DataFrame( + data=sim_pcs, index=processed_sim_vcf.columns, columns=[f"PC{i+1}" for i in range(num_pcs)] + ) + pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf( + sim_loadings, processed_sim_vcf + ) + # Call function to test + transformed_data = apply_pca_transform(pc_loadings_overlap, genos_overlap_transpose) + # Check for correct data type and columns + assert isinstance(transformed_data, pd.DataFrame) + expected_columns = ["PC" + str(i) for i in range(1, 31)] + assert transformed_data.columns.tolist() == expected_columns + # Check index of transformed_data matches genos_overlap_transpose.index + assert transformed_data.index.equals(genos_overlap_transpose.index) + # Check shape of transformed_data matches expected shape + expected_shape = (genos_overlap_transpose.shape[0], 30) + assert transformed_data.shape == expected_shape + + # Check all values in transformed_data are numeric + assert transformed_data.dtypes.apply(pd.api.types.is_numeric_dtype).all() + + # Check that transformed_data does not contain any NaN or missing values + assert not transformed_data.isna().to_numpy().any() diff --git a/python/python/bystro/ancestry/train.py b/python/python/bystro/ancestry/train.py index 211c98655..4739dc015 100644 --- a/python/python/bystro/ancestry/train.py +++ b/python/python/bystro/ancestry/train.py @@ -513,13 +513,16 @@ def convert_1kgp_vcf_to_dosage(vcf_with_header: pd.DataFrame) -> pd.DataFrame: def process_vcf_for_pc_transformation(dosage_vcf: pd.DataFrame) -> pd.DataFrame: - """Process dosage_vcf so that it only includes genotypes for analysis.""" + """Process dosage_vcf and transpose so that it only includes + genotypes in correct configuration for analysis.""" genos = dosage_vcf.iloc[:, len(HEADER_COLS) :] genos = genos.set_index(dosage_vcf.index) genos = genos.sort_index() # Check that not all genotypes are the same for QC assert len(set(genos.to_numpy().flatten())) > 1, "All genotypes are the same" - return genos + #Transpose genos_overlap for analysis + genos_transpose = genos.T + return genos_transpose def _load_pca_loadings() -> pd.DataFrame: @@ 
-560,6 +563,35 @@ def get_variant(x): return pc_loadings +def restrict_loadings_variants_to_vcf( + pc_loadings: pd.DataFrame, genos_transpose: pd.DataFrame +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Restrict variant list to overlap between gnomad loadings and transposed + reference vcf and return versions only including the overlapping variants.""" + # IGSR version of 1kgp is current reference vcf + var_overlap = pc_loadings.index.intersection(genos_transpose.columns) + pc_loadings_overlap = pc_loadings[pc_loadings.index.isin(var_overlap)] + genos_overlap_transpose = genos_transpose[genos_transpose.columns.isin(var_overlap)] + # Ensure that genos transpose and pc_loadings have corresponding shape and vars + assert genos_overlap_transpose.shape[1] == pc_loadings_overlap.shape[0] + assert set(genos_overlap_transpose.columns) == set(pc_loadings_overlap.index) + assert (genos_overlap_transpose.index == genos_transpose.index).all() + # Record amount of overlap + num_var_overlap = len(var_overlap) + logger.info("Number of overlapping variants: %d", num_var_overlap) + return pc_loadings_overlap, genos_overlap_transpose + + +def apply_pca_transform( + pc_loadings_overlap: pd.DataFrame, + genos_overlap_transpose: pd.DataFrame +) -> pd.DataFrame: + """Transform vcf with genotypes in dosage format with PCs loadings from gnomad PCA.""" + # Dot product + transformed_data = genos_overlap_transpose @ pc_loadings_overlap + return transformed_data + + def _perform_pca(train_X: pd.DataFrame, test_X: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, PCA]: """Perform PCA, checking for good compression.""" logger.info("Beginning PCA") diff --git a/python/python/bystro/covariance/__init__.py b/python/python/bystro/covariance/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/python/bystro/covariance/_base_covariance.py b/python/python/bystro/covariance/_base_covariance.py new file mode 100644 index 000000000..601de605d --- /dev/null +++ b/python/python/bystro/covariance/_base_covariance.py @@ -0,0 +1,532 @@ +""" +This provides the base class for covariance estimation. While there are +many methods for estimating covariance matrices, once such a matrix has +been estimated most of the procedures for evaulating fit, comparing +quantities etc are shared between models. This implementation allows for +us to implement these methods a single time and for most code being +centered on the fit method and a small number of auxiliary methods. This +also will provide the basis for factor analysis and probablistic PCA. As +a practical design choice, these methods are implemented as separate +functions, which are simply converted into object methods by replacing +covariance with self.covariance. Documentation is provided in the +functional method and omitted from the object methods. + +Objects +------- +BaseCovariance + + get_precision() + Gets the precision matrix defined as the inverse of the covariance + + get_stable_rank() + Returns the stable rank defined as + ||A||_F^2/||A||^2 + + predict(Xobs,idxs) + Predicts missing data using observed data. 
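The two new ancestry helpers above are meant to be chained: restrict the gnomAD loadings and the transposed reference genotypes to their shared variants, then project. A rough usage sketch, assuming `dosage_vcf` is the output of `convert_1kgp_vcf_to_dosage`:

loadings = _load_pca_loadings()  # gnomAD PC loadings, variants x PCs
genos_transpose = process_vcf_for_pc_transformation(dosage_vcf)  # samples x variants
pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf(
    loadings, genos_transpose
)
pcs = apply_pca_transform(pc_loadings_overlap, genos_overlap_transpose)  # samples x PCs (30 for these loadings)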
+ + -------------------- + conditional_score(X,idxs,weights=None) + mean(log p(X[idx==1]|X[idx==0],covariance)) + + conditional_score_samples(X,idxs) + log p(X[idx==1]|X[idx==0],covariance) + + marginal_score(X,idxs,weights=None) + mean(log p(X[idx==1]|covariance)) + + marginal_score_samples(X,idxs) + log p(X[idx==1]|covariance) + + score(X,weights=None): + mean log p(X) + + score_samples(X) + log p(X) + + -------------------- + entropy() + Computes the entropy of a Gaussian distribution parameterized by + covariance. + + entropy_subset(idxs) + Computes the entropy of a subset of the covariates + + mutual_information(covariance,idxs1,idxs2): + This computes the mutual information bewteen the two sets of + covariates based on the model. + +Methods +------- +_get_precision(covariance) + +_get_stable_rank(covariance) + +_predict(covariance,Xobs,idxs) + +_get_conditional_parameters(covariance,idxs) + Computes the distribution parameters p(X_miss|X_obs) + +---------------------------- + +_conditional_score_samples(covariance,X,idxs,weights=None) + +_conditional_score(covariance,X,idxs) + +_marginal_score(covariance,X,idxs,weights=None) + +_marginal_score_samples(covariance,X,idxs) + +_score(covariance,X,weights=None) + +_score_samples(covariance,X) + +---------------------------- + +_entropy(covariance) + +_entropy_subset(covariance,idxs) + +_mutual_information(covariance,idxs1,idxs2): + +""" +import numpy as np +from numpy import linalg as la +from datetime import datetime as dt +import pytz + + +class BaseCovariance(object): + """ + This object basically contains all the methods asides for fitting + a covariance matrix. + """ + + def __init__(self): + self.creationDate = dt.now(pytz.timezone("US/Pacific")) + + ################# + # Miscellaneous # + ################# + def get_precision(self): + return _get_precision(self.covariance) + + def get_stable_rank(self): + return _get_stable_rank(self.covariance) + + def predict(self, Xobs, idxs): + return _predict(self.covariance, Xobs, idxs) + + ##################################### + # Gaussian Likelihood-based methods # + ##################################### + def conditional_score(self, X, idxs, weights=None): + return _conditional_score(self.covariance, X, idxs, weights=weights) + + def conditional_score_samples(self, X, idxs): + return _conditional_score_samples(self.covariance, X, idxs) + + def marginal_score(self, X, idxs, weights=None): + return _marginal_score(self.covariance, X, idxs, weights=weights) + + def marginal_score_samples(self, X, idxs): + return _marginal_score_samples(self.covariance, X, idxs) + + def score(self, X, weights=None): + return _score(self.covariance, X, weights=weights) + + def score_samples(self, X): + return _score_samples(self.covariance, X) + + ################################# + # Information-theoretic methods # + ################################# + def entropy(self): + return _entropy(self.covariance) + + def entropy_subset(self, idxs): + return _entropy_subset(self.covariance, idxs) + + def mutual_information(self, idxs1, idxs2): + return _mutual_information(self.covariance, idxs1, idxs2) + + +########################################### +########################################### +########################################### +### ### +### ### +### Methods for covariance matrices ### +### ### +### ### +########################################### +########################################### +########################################### + + +def _get_precision(covariance): + """ + Gets the precision matrix 
defined as the inverse of the covariance + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + Returns + ------- + precision : np.array-like(sum(p),sum(p)) + The inverse of the covariance matrix + """ + precision = la.inv(covariance) + return precision + + +def _get_stable_rank(covariance): + """ + Returns the stable rank defined as + ||A||_F^2/||A||^2 + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + Returns + ------- + srank : float + The stable rank. See Vershynin High dimensional probability for + discussion, but this is a statistically stable approximation to rank + """ + singular_values = la.svd(covariance, compute_uv=False) + srank = np.sum(singular_values) / singular_values[0] + return srank + + +def _predict(covariance, Xobs, idxs): + """ + Predicts missing data using observed data. This uses the conditional + Gaussian formula (see wikipedia multivariate gaussian). Solve allows + us to avoid an explicit matrix inversion. + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + Xobs : np array-like,(N_samples,\sum idxs) + The observed data + + idxs: np.array-like,(sum(p),) + The observation locations + + Returns + ------- + preds : np.array-like,(N_samples,p-\sum idxs) + The predicted values + """ + covariance_sub = covariance[idxs == 1] + covariance_22 = covariance_sub[:, idxs == 1] + covariance_21 = covariance_sub[:, idxs == 0] + + beta_bar = la.solve(covariance_22, covariance_21) + + preds = np.dot(Xobs[:, idxs == 1], beta_bar) + return preds + + +def _conditional_score(covariance, X, idxs, weights=None): + """ + Returns the predictive log-likelihood of a subset of data. + + mean(log p(X[idx==1]|X[idx==0],covariance)) + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,sum(idxs)) + The centered data + + idxs: np.array-like,(sum(p),) + The observation locations + + weights : np.array-like,(N,),default=None + The optional weights on the samples. Don't have negative values. + Average value forced to 1. 
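A small self-contained sketch of `_predict` above, which imputes the unobserved coordinates from the observed ones via the conditional Gaussian mean (illustrative only; the data here are simulated):

import numpy as np

from bystro.covariance._base_covariance import _predict

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
cov = X.T @ X / X.shape[0]
idxs = np.array([1, 1, 1, 0, 0])  # first three coordinates observed
preds = _predict(cov, X, idxs)    # conditional means for the last two coordinates
assert preds.shape == (500, 2)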
+ + Returns + ------- + avg_score : float + Average log likelihood + """ + weights = ( + np.ones(X.shape[0]) if weights is None else weights / np.mean(weights) + ) + avg_score = np.mean( + weights * _conditional_score_samples(covariance, X, idxs) + ) + return avg_score + + +def _conditional_score_samples(covariance, X, idxs): + """ + Return the conditional log likelihood of each sample, that is + + log p(X[idx==1]|X[idx==0],covariance) = N(Sigma_10Sigma_00^{-1}x_0, + Sigma_11-Sigma_10Sigma_00^{-1}Sigma_01) + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,p) + The centered data + + idxs: np.array-like,(p,) + The observation locations + + Returns + ------- + scores : float + Log likelihood for each sample + """ + covariance_sub = covariance[idxs == 1] + covariance_22 = covariance_sub[:, idxs == 1] + covariance_21 = covariance_sub[:, idxs == 0] + + covariance_nonsub = covariance[idxs == 0] + covariance_11 = covariance_nonsub[:, idxs == 0] + + beta_bar = la.solve(covariance_22, covariance_21) + Second_part = np.dot(covariance_21.T, beta_bar) + + covariance_bar = covariance_11 - Second_part + + mu_ = np.dot(X[:, idxs == 1], beta_bar) + scores = _score_samples(covariance_bar, X[:, idxs == 0] - mu_) + + return scores + + +def _get_conditional_parameters(covariance, idxs): + """ + Computes the distribution parameters p(X_miss|X_obs) + + Parameters + ---------- + covariance : np.array-like,(p,p) + The covariance matrix + + idxs: np.array-like,(p,) + The observed covariates + + Returns + ------- + beta_bar : array-like + The predictive covariates + + covariance_bar : array-like + Conditional covariance + """ + + covariance_sub = covariance[idxs == 1] + covariance_22 = covariance_sub[:, idxs == 1] + covariance_21 = covariance_sub[:, idxs == 0] + covariance_nonsub = covariance[idxs == 0] + covariance_11 = covariance_nonsub[:, idxs == 0] + beta_bar = la.solve(covariance_22, covariance_21) + Second_part = np.dot(covariance_21.T, beta_bar) + + covariance_bar = covariance_11 - Second_part + return beta_bar.T, covariance_bar + + +def _marginal_score(covariance, X, idxs, weights=None): + """ + Returns the marginal log-likelihood of a subset of data + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,sum(idxs)) + The centered data + + idxs: np.array-like,(sum(p),) + The observation locations + + weights : np.array-like,(N,),default=None + The optional weights on the samples + + Returns + ------- + avg_score : float + Average log likelihood + """ + if weights is None: + weights = np.ones(X.shape[0]) + avg_score = np.mean(weights * _marginal_score_samples(covariance, X, idxs)) + return avg_score + + +def _marginal_score_samples(covariance, X, idxs): + """ + Returns the marginal log-likelihood of a subset of data + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,sum(idxs)) + The centered data + + idxs: np.array-like,(sum(p),) + The observation locations + + Returns + ------- + scores : float + Average log likelihood + """ + cov1 = covariance[idxs == 1] + cov_sub = cov1[:, idxs == 1] + scores = _score_samples(cov_sub, X) + return scores + + +def _score(covariance, X, weights=None): + """ + Returns the average log liklihood of data. 
+ + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,sum(p)) + The centered data + + weights : np.array-like,(N,),default=None + The optional weights on the samples + + Returns + ------- + avg_score : float + Average log likelihood + """ + if weights is None: + weights = np.ones(X.shape[0]) + avg_score = np.mean(weights * _score_samples(covariance, X)) + return avg_score + + +def _score_samples(covariance, X): + """ + Return the log likelihood of each sample + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,sum(p)) + The centered data + + Returns + ------- + scores : float + Log likelihood for each sample + """ + p = covariance.shape[0] + term1 = -p / 2 * np.log(2 * np.pi) + + _, logdet = la.slogdet(covariance) + term2 = -0.5 * logdet + + quad_init = la.solve(covariance, np.transpose(X)) + difference = X * np.transpose(quad_init) + term3 = np.sum(difference, axis=1) + + scores = term1 + term2 - 0.5 * term3 + return scores + + +def _entropy(covariance): + """ + Computes the entropy of a Gaussian distribution parameterized by + covariance. + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + Returns + ------- + entropy : float + The differential entropy of the distribution + """ + cov_new = 2 * np.pi * np.e * covariance + _, logdet = la.slogdet(cov_new) + entropy = 0.5 * logdet + return entropy + + +def _entropy_subset(covariance, idxs): + """ + Computes the entropy of a subset of the Gaussian distribution + parameterized by covariance. + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + idxs: np.array-like,(sum(p),) + The observation locations + + Returns + ------- + entropy : float + The differential entropy of the distribution + """ + cov1 = covariance[idxs == 1] + cov_sub = cov1[:, idxs == 1] + entropy = _entropy(cov_sub) + return entropy + + +def _mutual_information(covariance, idxs1, idxs2): + """ + This computes the mutual information bewteen the two sets of + covariates based on the model. + + Parameters + ---------- + idxs1 : np.array-like,(p,) + First group of variables + + idxs2 : np.array-like,(p,) + Second group of variables + + Returns + ------- + mutual_information : float + The mutual information between the two variables + """ + idxs = idxs1 + idxs2 + cov_sub = covariance[np.ix_(idxs == 1, idxs == 1)] + idxs1_sub = idxs1[idxs == 1] + Hy = _entropy(cov_sub[np.ix_(idxs1_sub == 1, idxs1_sub == 1)]) + + _, covariance_conditional = _get_conditional_parameters( + cov_sub, 1 - idxs1_sub + ) + H_y_given_x = _entropy(covariance_conditional) + mutual_information = Hy - H_y_given_x + return mutual_information diff --git a/python/python/bystro/covariance/_base_precision.py b/python/python/bystro/covariance/_base_precision.py new file mode 100644 index 000000000..8b462a273 --- /dev/null +++ b/python/python/bystro/covariance/_base_precision.py @@ -0,0 +1,159 @@ +""" +This provides the base class for covariance estimation in terms of the +precision matrix. While in the futre I'll rewrite separate methods for some +of these to take advantage of the fact that we estimated the precision +matrix rather than covariance matrix, for now I just invert precision and +use the covariance matrix methods. Yuck right? 
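For jointly Gaussian variables, the quantity computed by `_mutual_information` above reduces to

    I(X_1; X_2) = H(X_1) - H(X_1 | X_2)
                = 0.5 * (log|Sigma_11| + log|Sigma_22| - log|Sigma|),

which is the identity that the unit test `test_mutual_information` further down verifies numerically.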
+ +Objects +------- +BasePrecision + + get_covariance() + Gets the precision matrix defined as the inverse of the covariance + + get_stable_rank() + Returns the stable rank defined as + ||A||_F^2/||A||^2 + + predict(Xobs,idxs) + Predicts missing data using observed data. + + -------------------- + conditional_score(X,idxs) + mean(log p(X[idx==1]|X[idx==0],covariance)) + + conditional_score_samples(X,idxs) + log p(X[idx==1]|X[idx==0],covariance) + + marginal_score(X,idxs) + mean(log p(X[idx==1]|covariance)) + + marginal_score_samples(X,idxs) + log p(X[idx==1]|covariance) + + score(X): + mean log p(X) + + score_samples(X) + log p(X) + + -------------------- + entropy() + Computes the entropy of a Gaussian distribution parameterized by + covariance. + + entropy_subset(idxs) + Computes the entropy of a subset of the covariates + + mutual_information(covariance,idxs1,idxs2): + This computes the mutual information bewteen the two sets of + covariates based on the model. + +Methods +------- +_get_covariance(covariance) +""" +from numpy import linalg as la +from datetime import datetime as dt +from ._base_covariance import BaseCovariance +import pytz + + +class BasePrecision(BaseCovariance): + """ + This object basically contains all the methods asides for fitting + a precision matrix. + """ + + def __init__(self): + self.creationDate = dt.now(pytz.timezone("US/Pacific")) + + ################# + # Miscellaneous # + ################# + def get_covariance(self): + return _get_covariance(self.precision) + + def get_stable_rank(self): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).get_stable_rank() + + def predict(self, Xobs, idxs): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).predict(Xobs, idxs) + + ##################################### + # Gaussian Likelihood-based methods # + ##################################### + def conditional_score(self, X, idxs, weights=None): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).conditional_score( + X, idxs, weights=weights + ) + + def conditional_score_samples(self, X, idxs): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).conditional_score_samples(X, idxs) + + def marginal_score(self, X, idxs, weights=None): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).marginal_score( + X, idxs, weights=weights + ) + + def marginal_score_samples(self, X, idxs): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).marginal_score_samples(X, idxs) + + def score(self, X, weights=None): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).score(X, weights=weights) + + def score_samples(self, X): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).score_samples(X) + + ################################# + # Information-theoretic methods # + ################################# + def entropy(self): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).entropy() + + def entropy_subset(self, idxs): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return 
super(BasePrecision, self).entropy_subset(idxs) + + def mutual_information(self, idxs1, idxs2): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).mutual_information(idxs1, idxs2) + + +def _get_covariance(precision): + """ + Gets the covariance matrix defined as the inverse of the precision + + Parameters + ---------- + precision : np.array-like(p,p) + The precision matrix + + Returns + ------- + covariance : np.array-like(sum(p),sum(p)) + The inverse of the precision matrix + """ + covariance = la.inv(precision) + return covariance diff --git a/python/python/bystro/covariance/_covariance_np.py b/python/python/bystro/covariance/_covariance_np.py new file mode 100644 index 000000000..f96fd7747 --- /dev/null +++ b/python/python/bystro/covariance/_covariance_np.py @@ -0,0 +1,90 @@ +""" +This implements two objects, the empirical covariance estimator used as a +baseline comparison method. It also implements BayesianCovariance, which +implements MAP estimation using several common priors. + +Objects +------- +EmpiricalCovariance(BaseCovariance) + This object just fits the covariance matrix as the standard sample + covariance matrix + +BayesianCovariance(BaseCovariance): + This object fits the covariance matrix as the MAP estimator using + user-defined priors. +""" +import numpy as np +from bystro.covariance._base_covariance import BaseCovariance + + +class EmpiricalCovariance(BaseCovariance): + def __init__(self): + """ + This object just fits the covariance matrix as the standard sample + covariance matrix + """ + super().__init__() + + def fit(self, X): + """ + This fits a covariance matrix using samples X. + + Parameters + ---------- + X : np.array-like,(n_samples,n_covariates) + The centered data + """ + self.N, self.p = X.shape + XTX = np.dot(X.T, X) + self.covariance = XTX / self.N + + +class BayesianCovariance(BaseCovariance): + def __init__(self, prior_options=None): + """ + This object fits the covariance matrix as the MAP estimator using + user-defined priors. + """ + super().__init__() + if prior_options is None: + prior_options = {} + self.prior_options = self._fill_prior_options(prior_options) + + def fit(self, X): + """ + This fits a covariance matrix using samples X with MAP estimation. 
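Written out, the MAP estimate this fit method produces is the mode of the inverse-Wishart posterior,

    Sigma_hat = (sigma * I_p + X^T X) / (nu + N + p + 1),    with nu = p + pnu,

where sigma and pnu come from prior_options["iw_params"]; larger sigma shrinks the estimate toward a scaled identity.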
+ + Parameters + ---------- + X : np.array-like,(n_samples,n_covariates) + The data + """ + self.N, self.p = X.shape + p_opts = self.prior_options + covariance_empirical = np.dot(X.T, X) + nu = p_opts["iw_params"]["pnu"] + self.p + cov_prior = p_opts["iw_params"]["sigma"] * np.eye(self.p) + posterior_cov = cov_prior + covariance_empirical + posterior_nu = nu + self.N + self.covariance = posterior_cov / (posterior_nu + self.p + 1) + + def _fill_prior_options(self, prior_options): + """ + This sets the prior options for our inference scheme + + Parameters + ---------- + prior_options : dict + The original prior options passed as a dictionary + + Options + ------- + iw_params : dict,default={'pnu':2,'sigma':1.0} + pnu : int - nu = p + pnu + sigma : float>0 - cov = sigma*I_p + """ + default_options = { + "iw_params": {"pnu": 2, "sigma": 1.0}, + } + pops = {**default_options, **prior_options} + return pops diff --git a/python/python/bystro/covariance/_precision_np.py b/python/python/bystro/covariance/_precision_np.py new file mode 100644 index 000000000..46535be2e --- /dev/null +++ b/python/python/bystro/covariance/_precision_np.py @@ -0,0 +1,24 @@ +""" + +""" +import numpy as np +import numpy.linalg as la +from ._base_precision import BasePrecision + + +class EmpiricalPrecision(BasePrecision): + def __init__(self): + super().__init__() + + def fit(self, X): + """ + This fits a precision matrix using samples X. + + Parameters + ---------- + X : np.array-like,(n_samples,n_covariates) + The data + """ + self.N, self.p = X.shape + XTX = np.dot(X.T, X) + self.precision = la.inv(XTX / self.N) diff --git a/python/python/bystro/covariance/test/__init__.py b/python/python/bystro/covariance/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/python/bystro/covariance/test/test_base_covariance.py b/python/python/bystro/covariance/test/test_base_covariance.py new file mode 100644 index 000000000..9cb1822e3 --- /dev/null +++ b/python/python/bystro/covariance/test/test_base_covariance.py @@ -0,0 +1,164 @@ +import numpy as np +import numpy.linalg as la +import scipy.stats as st # type: ignore +from bystro.covariance._base_covariance import BaseCovariance, _score_samples + + +def test_get_stable_rank(): + model = BaseCovariance() + + model.covariance = np.eye(10) + assert model.get_stable_rank() == 10 + + model.covariance = np.zeros((10, 10)) + model.covariance[0, 0] = 1 + assert model.get_stable_rank() == 1 + + +def test_predict(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + cov = 1 / X.shape[0] * np.dot(X.T, X) + model.covariance = cov + idxs = np.ones(10) + idxs[8:] = 0 + preds = model.predict(X, idxs) + csub = cov[idxs == 1] + c22 = csub[:, idxs == 1] + c21 = csub[:, idxs == 0] + csub = cov[idxs == 0] + beta_bar = la.solve(c22, c21) + mu = np.dot(X[:, idxs == 1], beta_bar) + assert np.all(np.abs(mu - preds) < 1e-8) + + +def test_conditional_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + cov = 1 / X.shape[0] * np.dot(X.T, X) + model.covariance = cov + idxs = np.ones(10) + idxs[8:] = 0 + score_samples = model.conditional_score_samples(X, idxs) + csub = cov[idxs == 1] + c22 = csub[:, idxs == 1] + c21 = csub[:, idxs == 0] + csub = cov[idxs == 0] + c11 = csub[:, idxs == 0] + beta_bar = la.solve(c22, c21) + second = np.dot(c21.T, beta_bar) + cov_bar = c11 - second + mu = np.dot(X[:, idxs == 1], beta_bar) + score_samples2 = _score_samples(cov_bar, X[:, idxs 
== 0] - mu) + assert np.all(np.abs(score_samples - score_samples2) < 1e-8) + + +def test_conditional_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + cov = 1 / X.shape[0] * np.dot(X.T, X) + model.covariance = cov + idxs = np.ones(10) + idxs[8:] = 0 + score = model.conditional_score(X, idxs) + csub = cov[idxs == 1] + c22 = csub[:, idxs == 1] + c21 = csub[:, idxs == 0] + csub = cov[idxs == 0] + c11 = csub[:, idxs == 0] + beta_bar = la.solve(c22, c21) + second = np.dot(c21.T, beta_bar) + cov_bar = c11 - second + mu = np.dot(X[:, idxs == 1], beta_bar) + score_samples2 = _score_samples(cov_bar, X[:, idxs == 0] - mu) + assert np.abs(np.mean(score_samples2) - score) < 1e-8 + + +def test_marginal_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(150, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + idxs = np.ones(10) + idxs[8:] = 0 + score_samples = model.marginal_score_samples(X[:, idxs == 1], idxs) + mvn = st.multivariate_normal(mean=np.zeros(8), cov=model.covariance[:8, :8]) + logpdf = mvn.logpdf(X[:, :8]) + assert np.all(np.abs(logpdf - score_samples) < 1e-8) + + +def test_marginal_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(15, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + idxs = np.ones(10) + idxs[8:] = 0 + score = model.marginal_score(X[:, idxs == 1], idxs) + mvn = st.multivariate_normal(mean=np.zeros(8), cov=model.covariance[:8, :8]) + logpdf = mvn.logpdf(X[:, :8]) + assert np.mean(logpdf) == score + + +def test_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + score = model.score(X, weights=1.0 * np.ones(1000)) + mvn = st.multivariate_normal(mean=np.zeros(10), cov=model.covariance) + logpdf = mvn.logpdf(X) + assert np.mean(logpdf) == score + + +def test_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + score_samples = model.score_samples(X) + mvn = st.multivariate_normal(mean=np.zeros(10), cov=model.covariance) + logpdf = mvn.logpdf(X) + assert np.all(np.abs(logpdf - score_samples) < 1e-8) + + +def test_entropy(): + model = BaseCovariance() + model.covariance = np.eye(10) + _, logdet = la.slogdet(model.covariance) + ent = 10 / 2 * np.log(2 * np.pi * np.exp(1)) + 1 / 2 * logdet + entropy = model.entropy() + assert np.abs(ent - entropy) < 1e-8 + + +def test_entropy_subset(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + idxs = np.ones(10) + idxs[8:] = 0 + ent_sub = model.entropy_subset(idxs) + _, logdet = la.slogdet(model.covariance[:8, :8]) + ent = 8 / 2 * np.log(2 * np.pi * np.exp(1)) + 1 / 2 * logdet + assert np.abs(ent_sub - ent) < 1e-8 + + +def test_mutual_information(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + idxs1 = np.ones(10) + idxs2 = np.ones(10) + idxs1[5:] = 0 + idxs2[:5] = 0 + mi = model.mutual_information(idxs1, idxs2) + _, ldet1 = la.slogdet(model.covariance[:5, :5]) + _, ldet2 = la.slogdet(model.covariance[5:, 5:]) + _, ldet = la.slogdet(model.covariance) + mi_est = 0.5 * (ldet1 + ldet2 - ldet) + assert np.abs(mi_est - mi) < 1e-5 diff --git 
a/python/python/bystro/covariance/test/test_base_precision.py b/python/python/bystro/covariance/test/test_base_precision.py new file mode 100644 index 000000000..7df3ad3e9 --- /dev/null +++ b/python/python/bystro/covariance/test/test_base_precision.py @@ -0,0 +1,107 @@ +import numpy as np +import numpy.linalg as la +from bystro.covariance._base_precision import BasePrecision + + +def test_get_stable_rank(): + model = BasePrecision() + model.precision = np.eye(10) + assert model.get_stable_rank() == 10 + + +def test_predict(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.predict(X, idxs) + + +def test_conditional_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.conditional_score_samples(X, idxs) + + +def test_conditional_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.conditional_score(X, idxs) + + +def test_marginal_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.marginal_score(X[:, idxs == 1], idxs) + + +def test_marginal_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.marginal_score(X[:, idxs == 1], idxs, weights=0.5 * np.ones(1000)) + + +def test_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + model.score(X, weights=0.5 * np.ones(1000)) + + +def test_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + model.score_samples(X) + + +def test_entropy(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + model.entropy() + + +def test_entropy_subset(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.entropy_subset(idxs) + + +def test_mutual_information(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs1 = np.ones(10) + idxs2 = np.ones(10) + idxs1[5:] = 0 + idxs1 = np.ones(10) + idxs2[:5] = 0 + idxs2[9] = 0 + model.mutual_information(idxs1, idxs2) diff --git a/python/python/bystro/covariance/test/test_covariance_np.py b/python/python/bystro/covariance/test/test_covariance_np.py new file mode 100644 index 000000000..45c41f965 --- /dev/null +++ b/python/python/bystro/covariance/test/test_covariance_np.py @@ -0,0 +1,28 @@ +import numpy as np +import numpy.linalg as la +from bystro.covariance._covariance_np import ( + EmpiricalCovariance, + BayesianCovariance, +) + + +def 
test_empirical_covariance(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(100000, 10)) + + model = EmpiricalCovariance() + model.fit(X) + + s_vals = la.svd(model.covariance, compute_uv=False) + assert np.abs(1 - s_vals[0]) <= 5e-2 + + +def test_bayesian_covariance(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(100000, 10)) + + model = BayesianCovariance() + model.fit(X) + + s_vals = la.svd(model.covariance, compute_uv=False) + assert np.abs(1 - s_vals[0]) <= 5e-2 diff --git a/python/python/bystro/proteomics/fragpipe_tandem_mass_tag.py b/python/python/bystro/proteomics/fragpipe_tandem_mass_tag.py new file mode 100644 index 000000000..75954e8b4 --- /dev/null +++ b/python/python/bystro/proteomics/fragpipe_tandem_mass_tag.py @@ -0,0 +1,53 @@ +"""Load and prep fragpipe tandem mass Tag datasets.""" + +from dataclasses import dataclass + +import pandas as pd + +ABUNDANCE_COLS = ["Index", "NumberPSM", "ProteinID", "MaxPepProb", "ReferenceIntensity"] +ANNOTATION_COLS = ["plex", "channel", "sample", "sample_name", "condition", "replicate"] + + +@dataclass(frozen=True) +class TandemMassTagDataset: + """Represent a Fragpipe Tandem Mass Tag dataset.""" + + abundance_df: pd.DataFrame + annotation_df: pd.DataFrame + + +def _check_df_cols(df: pd.DataFrame, expected_cols: list[str]) -> None: + actual_cols = df.columns + if not all(x == y for x, y in zip(expected_cols, actual_cols, strict=False)): + err_msg = ( + f"expected dataframe to begin with cols: {expected_cols}, got cols: {actual_cols} instead." + ) + raise ValueError(err_msg) + + +def _prep_abundance_df(abundance_df: pd.DataFrame) -> pd.DataFrame: + """Prep abundance_df, setting index and normalizing abundances by ReferenceIntensity.""" + _check_df_cols(abundance_df, ABUNDANCE_COLS) + abundance_df = abundance_df.set_index("Index") + first_sample_column_idx = abundance_df.columns.to_list().index("ReferenceIntensity") + 1 + sample_columns = abundance_df.columns[first_sample_column_idx:] + for sample_column in sample_columns: + abundance_df[sample_column] -= abundance_df.ReferenceIntensity + return abundance_df + + +def _prep_annotation_df(annotation_df: pd.DataFrame) -> pd.DataFrame: + """Prep annotation df, setting index.""" + _check_df_cols(annotation_df, ANNOTATION_COLS) + return annotation_df.set_index("sample") + + +def load_tandem_mass_tag_dataset( + abundance_filename: str, annotation_filename: str +) -> TandemMassTagDataset: + """Load and prep Fragpipe tandem mass tag datasets.""" + raw_abundance_df = pd.read_csv(abundance_filename, sep="\t") + raw_annotation_df = pd.read_csv(annotation_filename, sep="\t") + abundance_df = _prep_abundance_df(raw_abundance_df) + annotation_df = _prep_annotation_df(raw_annotation_df) + return TandemMassTagDataset(abundance_df, annotation_df) diff --git a/python/python/bystro/proteomics/tests/test_fragpipe_tandem_mass_tag.py b/python/python/bystro/proteomics/tests/test_fragpipe_tandem_mass_tag.py new file mode 100644 index 000000000..e6f4b2dfd --- /dev/null +++ b/python/python/bystro/proteomics/tests/test_fragpipe_tandem_mass_tag.py @@ -0,0 +1,110 @@ +from io import StringIO +import pytest +import re + +import pandas as pd +from bystro.proteomics.fragpipe_tandem_mass_tag import ( + load_tandem_mass_tag_dataset, + _check_df_cols, + ABUNDANCE_COLS, +) +from pandas.testing import assert_frame_equal + +raw_abundance_df = pd.DataFrame( + { + "Index": {0: "A1BG", 1: "A1CF", 2: "A2M"}, + "NumberPSM": {0: 324, 1: 94, 2: 1418}, + "ProteinID": {0: "P04217", 1: "Q9NQ94", 2: "P01023"}, + 
"MaxPepProb": {0: 1.0, 1: 1.0, 2: 1.0}, + "ReferenceIntensity": {0: 30.0, 1: 26.1, 2: 30.8}, + "CPT0088900003": {0: 30.7, 1: 26.2, 2: 31.7}, + "CPT0079270003": {0: 30.2, 1: 26.3, 2: 30.5}, + "CPT0088920001": {0: 30.0, 1: 26.7, 2: 30.8}, + "CPT0079300001": {0: 30.6, 1: 25.1, 2: 31.6}, + "CPT0088550004": {0: 29.0, 1: 24.4, 2: 30.1}, + } +) + +raw_annotation_df = pd.DataFrame( + { + "plex": {0: 16, 1: 16, 2: 16}, + "channel": {0: "126", 1: "127N", 2: "127C"}, + "sample": {0: "CPT0088900003", 1: "CPT0079270003", 2: "CPT0088920001"}, + "sample_name": {0: "C3N-01179-T", 1: "C3L-00606-T", 2: "C3N-01179-N"}, + "condition": {0: "Tumor", 1: "Tumor", 2: "NAT"}, + "replicate": {0: 1, 1: 1, 2: 1}, + } +) + +expected_abundance_df = pd.DataFrame( + { + "NumberPSM": {"A1BG": 324, "A1CF": 94, "A2M": 1418}, + "ProteinID": {"A1BG": "P04217", "A1CF": "Q9NQ94", "A2M": "P01023"}, + "MaxPepProb": {"A1BG": 1.0, "A1CF": 1.0, "A2M": 1.0}, + "ReferenceIntensity": { + "A1BG": 30.0, + "A1CF": 26.1, + "A2M": 30.8, + }, + "CPT0088900003": { + "A1BG": 0.7, + "A1CF": 0.1, + "A2M": 0.9, + }, + "CPT0079270003": { + "A1BG": 0.2, + "A1CF": 0.2, + "A2M": -0.3, + }, + "CPT0088920001": { + "A1BG": -0.0, + "A1CF": 0.6, + "A2M": 0.0, + }, + "CPT0079300001": { + "A1BG": 0.6, + "A1CF": -1.0, + "A2M": 0.8, + }, + "CPT0088550004": { + "A1BG": -1.0, + "A1CF": -1.7, + "A2M": -0.7, + }, + } +) +expected_abundance_df.index.name = "Index" +expected_annotation_df = pd.DataFrame( + { + "plex": {"CPT0088900003": 16, "CPT0079270003": 16, "CPT0088920001": 16}, + "channel": {"CPT0088900003": "126", "CPT0079270003": "127N", "CPT0088920001": "127C"}, + "sample_name": { + "CPT0088900003": "C3N-01179-T", + "CPT0079270003": "C3L-00606-T", + "CPT0088920001": "C3N-01179-N", + }, + "condition": {"CPT0088900003": "Tumor", "CPT0079270003": "Tumor", "CPT0088920001": "NAT"}, + "replicate": {"CPT0088900003": 1, "CPT0079270003": 1, "CPT0088920001": 1}, + } +) +expected_annotation_df.index.name = "sample" + + +def test_load_tandem_mass_tag_dataset(): + abundance_handle = StringIO(raw_abundance_df.to_csv(index=None, sep="\t")) + annotation_handle = StringIO(raw_annotation_df.to_csv(index=None, sep="\t")) + tandem_mass_tag_dataset = load_tandem_mass_tag_dataset(abundance_handle, annotation_handle) + assert_frame_equal(expected_abundance_df, tandem_mass_tag_dataset.abundance_df) + assert_frame_equal(expected_annotation_df, tandem_mass_tag_dataset.annotation_df) + + +def test__check_df_cols(): + err_msg = re.escape( + r"expected dataframe to begin with cols: " + r"['Index', 'NumberPSM', 'ProteinID', 'MaxPepProb', 'ReferenceIntensity']," + r" got cols: " + r"Index(['plex', 'channel', 'sample', 'sample_name', 'condition', 'replicate']," + r" dtype='object') instead." 
+ ) + with pytest.raises(ValueError, match=err_msg): + _check_df_cols(raw_annotation_df, ABUNDANCE_COLS) diff --git a/python/python/bystro/supervised_ppca/__init__.py b/python/python/bystro/supervised_ppca/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/python/bystro/tada_mixture/__init__.py b/python/python/bystro/tada_mixture/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/python/bystro/tada_mixture/mixture_tada_em.py b/python/python/bystro/tada_mixture/mixture_tada_em.py new file mode 100644 index 000000000..e9ee9883f --- /dev/null +++ b/python/python/bystro/tada_mixture/mixture_tada_em.py @@ -0,0 +1,395 @@ +""" +This uses the expectation maximization algorithm to fit a latent variable +model for TADA, with two flavors, a Zero-Inflated Poisson (ZIP) and a +standard Poisson. The generative formulation is + +p(z) = Dir(alpha_k) +p(x_ij|z=k) = (ZI)Poisson(Lambda_{jk}) + +Obviously the ZIP will have a parameter pi for probability of 0. + +ZIP does not have an analytic maximum likelihood formula. This is not a +problem for EM, the M step only requires that we be able to maximize +the likelihood, not that it has an analytic solution. So we just use +standard gradient descent with automatic differentiation in Torch. This +could be naughty if the objective was multimodal, but I strongly doubt that +this is the case. + +Objects +------- +MVTadaPoissonEM(K=4,training_options={}) + This fits a standard Poisson latent variable model. + +MVTadaZipEM(K=4,training_options={}) + This fits a standard ZIP latent variable model. + +Methods +------- +None + +""" +import numpy as np +import scipy.stats as st # type: ignore +from sklearn.mixture import GaussianMixture # type: ignore +from tqdm import trange # type: ignore + +import torch +from torch import nn +from torch.nn import PoissonNLLLoss + + +class MVTadaPoissonEM: + def __init__(self, K=4, training_options=None): + """ + This is a Product Poisson latent variable model. + + Parameters + ---------- + K : int,default=4 + The number of clusters, i.e. number of different types of + genes + + training_options : dict,default={} + The parameters for the inference scheme + """ + self.K = int(K) + if training_options is None: + training_options = {} + self.training_options = self._fill_training_options(training_options) + + def fit(self, X, progress_bar=True, Lamb_init=None, pi_init=None): + """ + Fits a model given count data X. + + Parameters + ---------- + X : np.array-like,shape=(N,p) + The count data + + progress_bar : bool,default=True + Whether to print a progress bar while fitting + + Lamb_init : np.array-like,shape=(k,p),default=None + Initialize the loadings of the model. Defaults to fitting a + GMM on the data and using those. + + pi_init : np.array-like,shape=(k),default=None + Initialize the class weights of the model. 
+ + Returns + ------- + self + """ + td = self.training_options + K = self.K + + N, p = X.shape + + # Initialize variables + if Lamb_init is None: + model = GaussianMixture(K) + model.fit(X) + Lambda_ = model.means_ + else: + Lambda_ = Lamb_init + + rng = np.random.default_rng(2021) + pi_ = rng.dirichlet(np.ones(K)) if pi_init is None else pi_init + + myrange = trange if progress_bar else range + + for i in myrange(td["n_iterations"]): + # E step compute most likely categories + z_probs = np.zeros((N, K)) + for j in range(K): + log_pmf = st.poisson.logpmf(X, Lambda_[j]) + log_pmf_sum = np.sum(log_pmf, axis=1) + z_probs[:, j] = log_pmf_sum + np.log(pi_[j]) + Ez = np.argmax(z_probs, axis=1) + + # M step, maximize parameters + for j in range(K): + pi_[j] = np.mean(Ez == j) + Lambda_[j] = np.mean(X[Ez == j], axis=0) + + eps = 0.001 + pi_ = pi_ * (1 - K * eps) + np.ones(K) * eps + + self.Lambda = Lambda_ + self.pi = pi_ + return self + + def predict(self, data): + """ + This predicts the latent cluster assignments given a fitted model + + Parameters + ---------- + data : np.array-like,shape=(N_variants,n_phenotypes) + The data of counts, variants x phenotypes + + Returns + ------- + z_hat : np.array-like,shape=(N_samples,) + The cluster identities + """ + log_proba = self.predict_logproba(data) + z_hat = np.argmax(log_proba, axis=1) + return z_hat + + def predict_logproba(self, data): + """ + Predicts the log probability a datapoint being assigned a specific + cluster. + + Parameters + ---------- + data : np.array-like,shape=(N_variants,n_phenotypes) + The data of counts, variants x phenotypes + + Returns + ------- + z_hat : np.array-like,shape=(N_samples,) + The cluster identities + """ + N = data.shape[0] + log_proba = np.zeros((N, self.K)) + for i in range(N): + for k in range(self.K): + log_proba[i, k] = np.sum( + st.poisson.logpmf(data[i], self.Lambda[k]) + ) + return log_proba + + def _fill_training_options(self, training_options): + """ + This fills any relevant parameters for the learning algorithm + + Parameters + ---------- + training_options : dict + + Returns + ------- + tops : dict + """ + default_options = {"n_iterations": 100} + tops = {**default_options, **training_options} + return tops + + +class MVTadaZipEM: + def __init__(self, K=4, training_options=None): + """ + This is a Product ZIP latent variable model. + + Parameters + ---------- + K : int,default=4 + The number of clusters, i.e. number of different types of + genes + + training_options : dict,default={} + The parameters for the inference scheme + """ + self.K = int(K) + if training_options is None: + training_options = {} + self.training_options = self._fill_training_options(training_options) + + def fit(self, X, progress_bar=True): + """ + Fits a model given count data X. 
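As a usage illustration of the Poisson EM model defined above (simulated counts, not data from the package):

import numpy as np

from bystro.tada_mixture.mixture_tada_em import MVTadaPoissonEM

rng = np.random.default_rng(0)
# two synthetic clusters of counts over three phenotypes
X = np.vstack(
    [rng.poisson(2.0, size=(200, 3)), rng.poisson(8.0, size=(200, 3))]
)
model = MVTadaPoissonEM(K=2, training_options={"n_iterations": 50})
model.fit(X, progress_bar=False)
z_hat = model.predict(X)  # cluster assignment per variant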
+ + Parameters + ---------- + X : np.array-like,shape=(N,p) + The count data + Returns + ------- + self + """ + td = self.training_options + K = self.K + + N, p = X.shape + + model = GaussianMixture(K) + model.fit(X) + Lambda_list_l = [] + for i in range(K): + Lambda_list_l.append( + torch.tensor( + 1.2 * model.means_[i].astype(np.float32) + 0.1, + requires_grad=True, + ) + ) + + myrange = trange if progress_bar else range + + pct_0 = np.mean(X == 0, axis=0) + pct_0 = np.clip(pct_0, 0.01, 0.99) + pct_0_inv = np.log(pct_0 / (1 - pct_0)) + + alpha_ls = [ + torch.tensor(pct_0_inv, requires_grad=True) for i in range(K) + ] + + sigmoid = nn.Sigmoid() + + data = torch.tensor(X.astype(np.float32)) + pi_ = 1 / K * np.ones(K) + myloss = PoissonNLLLoss(log_input=False, full=True, reduction="none") + + for i in myrange(td["n_iterations"]): + # E step compute most likely categories + z_probs = np.zeros((N, K)) + for k in range(K): + Lambda_k = Lambda_list_l[k] + alpha_k = sigmoid(alpha_ls[k]) # Prob of being 0 + + log_likelihood_poisson = -1 * myloss(Lambda_k, data) + log_likelihood_poisson_w = log_likelihood_poisson + torch.log( + 1 - alpha_k + ) + + log_likelihood_0 = torch.log( + alpha_k + (1 - alpha_k) * torch.exp(-1 * Lambda_k) + ) + log_likelihood_0_b = torch.broadcast_to( + log_likelihood_0, data.shape + ) + + log_likelihood_point = torch.where( + data != 0, log_likelihood_poisson_w, log_likelihood_0_b + ) + + log_marginal = torch.sum(log_likelihood_point, axis=1) + + z_probs_k = log_marginal + np.log(pi_[k]) + z_probs[:, k] = z_probs_k.detach().numpy() + + Ez = np.argmax(z_probs, axis=1) + + # M step, maximize parameters + for k in range(K): + pi_[k] = np.mean(Ez == k) + data_sub = data[Ez == k] # Selects relevant data + + trainable_variables = [Lambda_list_l[k], alpha_ls[k]] + + optimizer = torch.optim.SGD( + trainable_variables, lr=td["learning_rate"], momentum=0.8 + ) + + for j in range(td["n_inner_iterations"]): + Lambda_k = Lambda_list_l[k] + alpha_k = sigmoid(alpha_ls[k]) + + log_likelihood_poisson = -1 * myloss(Lambda_k, data_sub) + log_likelihood_poisson_w = ( + log_likelihood_poisson + torch.log(1 - alpha_k) + ) + + log_likelihood_0 = torch.log( + alpha_k + (1 - alpha_k) * torch.exp(-1 * Lambda_k) + ) + log_likelihood_0_b = torch.broadcast_to( + log_likelihood_0, data_sub.shape + ) + + log_likelihood_point = torch.where( + data_sub != 0, + log_likelihood_poisson_w, + log_likelihood_0_b, + ) + + log_marginal = torch.sum(log_likelihood_point, axis=1) + loss = -1 * torch.mean(log_marginal) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + Lambda_k.requires_grad_(False) + Lambda_k[Lambda_k < 0] = 0 + Lambda_k.requires_grad_(True) + + self.pi = pi_ + self.Lambda = -1000 * np.ones((K, p)) + self.Alpha = -1000 * np.ones((K, p)) + for k in range(self.K): + Lambda_k = Lambda_list_l[k] + alpha_k = sigmoid(alpha_ls[k]) + + self.Lambda[k] = Lambda_k.detach().numpy() + self.Alpha[k] = alpha_k.detach().numpy() + + return self + + def predict(self, data): + """ + This predicts the latent cluster assignments given a fitted model + + Parameters + ---------- + data : np.array-like,shape=(N_variants,n_phenotypes) + The data of counts, variants x phenotypes + + Returns + ------- + z_hat : np.array-like,shape=(N_samples,) + The cluster identities + """ + log_proba = self.predict_logproba(data) + z_hat = np.argmax(log_proba, axis=1) + return z_hat + + def predict_logproba(self, data): + """ + Predicts the log probability a datapoint being assigned a specific + cluster. 
+
+        Parameters
+        ----------
+        data : np.array-like,shape=(N_variants,n_phenotypes)
+            The data of counts, variants x phenotypes
+
+        Returns
+        -------
+        log_proba : np.array-like,shape=(N_samples,K)
+            The log likelihood of each sample under each cluster
+        """
+        N = data.shape[0]
+        log_proba = np.zeros((N, self.K))
+        for i in range(N):
+            for k in range(self.K):
+                log_prob_poisson = st.poisson.logpmf(data[i], self.Lambda[k])
+                log_prob_0 = np.log(
+                    self.Alpha[k]
+                    + (1 - self.Alpha[k]) * np.exp(-1 * self.Lambda[k])
+                )
+                probs = log_prob_poisson.copy()
+                probs[data[i] == 0] = log_prob_0[data[i] == 0]
+                log_proba[i, k] = np.sum(probs)
+        return log_proba
+
+    def _fill_training_options(self, training_options):
+        """
+        This fills any relevant parameters for the learning algorithm
+
+        Parameters
+        ----------
+        training_options : dict
+
+        Returns
+        -------
+        tops : dict
+        """
+        default_options = {
+            "n_iterations": 2000,
+            "n_inner_iterations": 50,
+            "learning_rate": 5e-4,
+        }
+        tops = {**default_options, **training_options}
+        return tops
diff --git a/python/python/bystro/tada_mixture/mixture_tada_marginal.py b/python/python/bystro/tada_mixture/mixture_tada_marginal.py
new file mode 100644
index 000000000..1643cd0af
--- /dev/null
+++ b/python/python/bystro/tada_mixture/mixture_tada_marginal.py
@@ -0,0 +1,220 @@
+"""
+This maximizes the marginal likelihood of a mixture model on count data;
+for now only a Poisson observation model is implemented. The generative
+model is
+
+pi ~ Dirichlet(alpha)
+p(z_i = k) = pi_k
+p(x_ij | z_i = k) = Poisson(Lambda_{kj})
+
+so the marginal log likelihood of a single sample is
+
+log p(x_i) = log(\sum_k pi_k \prod_j p(x_ij | z_i = k)).
+
+There is no closed-form maximizer, but the marginal log likelihood can be
+computed stably with the logsumexp trick and maximized directly with
+stochastic gradient descent, which is what this module does.
+
+Objects
+-------
+MVTadaPoissonML(K=4,training_options={})
+    This fits a standard Poisson latent variable model.
+
+Methods
+-------
+None
+
+"""
+import numpy as np
+import torch
+from tqdm import trange  # type: ignore
+from torch import nn
+from torch.distributions import Dirichlet
+import scipy.stats as st  # type: ignore
+from sklearn.mixture import GaussianMixture  # type: ignore
+from torch.nn import PoissonNLLLoss
+
+
+class MVTadaPoissonML:
+    def __init__(self, K=4, training_options=None):
+        """
+        This is a product Poisson latent variable model.
+
+        Parameters
+        ----------
+        K : int,default=4
+            The number of clusters, i.e. number of different types of
+            genes
+
+        training_options : dict,default={}
+            The parameters for the inference scheme
+        """
+        self.K = int(K)
+        if training_options is None:
+            training_options = {}
+        self.training_options = self._fill_training_options(training_options)
+
+    def fit(self, data, progress_bar=True, Lamb_init=None, pi_init=None):
+        """
+        Fits a model given count data.
+
+        Parameters
+        ----------
+        data : np.array-like,shape=(N,p)
+            The count data
+
+        progress_bar : bool,default=True
+            Whether to print a progress bar while fitting
+
+        Lamb_init : np.array-like,shape=(k,p),default=None
+            Initialize the loadings of the model. Defaults to fitting a
+            GMM on the data and using those.
+
+        pi_init : np.array-like,shape=(k),default=None
+            Initialize the class weights of the model.
+
+        Returns
+        -------
+        self
+
+        """
+        td = self.training_options
+        K = self.K
+
+        N, p = data.shape
+        X = torch.tensor(data)
+        rng = np.random.default_rng(2021)
+
+        # Initialize variables
+        if Lamb_init is None:
+            model = GaussianMixture(K)
+            model.fit(data)
+            Lamb_init = model.means_.astype(np.float32)
+        if pi_init is None:
+            pi_init = rng.dirichlet(np.ones(K)).astype(np.float32)
+        lambda_latent = torch.tensor(Lamb_init, requires_grad=True)
+        pi_logits = torch.tensor(pi_init, requires_grad=True)
+
+        trainable_variables = [lambda_latent, pi_logits]
+
+        m_d = Dirichlet(torch.tensor(np.ones(self.K) * self.K))
+
+        optimizer = torch.optim.SGD(
+            trainable_variables, lr=td["learning_rate"], momentum=td["momentum"]
+        )
+
+        nll = PoissonNLLLoss(full=True, log_input=False, reduction="none")
+        mse = nn.MSELoss()
+        smax = nn.Softmax(dim=0)
+        softplus = nn.Softplus()
+
+        myrange = trange if progress_bar else range
+
+        self.losses_likelihoods = np.zeros(td["n_iterations"])
+
+        for i in myrange(td["n_iterations"]):
+            idx = rng.choice(N, td["batch_size"], replace=False)
+            X_batch = X[idx]  # Minibatch of shape (batch_size, p)
+
+            Lambda_ = softplus(lambda_latent)  # Keep the rates positive
+            # Smooth the mixture weights slightly toward uniform so that
+            # log(pi_) stays finite
+            mu = 0.01
+            pi_ = smax(pi_logits) * (1 - mu) + mu / K
+
+            loss_logits = 0.001 * mse(pi_logits, torch.zeros(K))
+            loss_prior_pi = -1.0 * m_d.log_prob(pi_) / N
+
+            # PoissonNLLLoss with log_input=False expects the rate as its
+            # first argument and the observed counts as its second
+            loglikelihood_each = [
+                -1 * nll(Lambda_[k], X_batch) for k in range(K)
+            ]  # List of K N x p log likelihoods
+
+            loglikelihood_sum = [
+                torch.sum(mat, axis=1) for mat in loglikelihood_each
+            ]  # List of K length-N log likelihoods
+
+            loglikelihood_stack = torch.stack(loglikelihood_sum)
+            # K x N matrix of per-cluster log likelihoods
+            loglikelihood_components = torch.transpose(
+                loglikelihood_stack, 0, 1
+            ) + torch.log(
+                pi_
+            )  # N x K matrix of unnormalized log posteriors
+            loglikelihood_marg = torch.logsumexp(
+                loglikelihood_components, dim=1
+            )  # Length-N vector of log marginal likelihoods
+            loss_likelihood = -1 * torch.mean(loglikelihood_marg)
+            # Average negative marginal log likelihood
+
+            loss = loss_logits + loss_prior_pi + loss_likelihood
+            self.losses_likelihoods[i] = loss_likelihood.detach().numpy()
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        self.Lambda = Lambda_.detach().numpy()
+        self.pi = pi_.detach().numpy()
+        return self
+
+    def predict(self, data):
+        """
+        This predicts the latent cluster assignments given a fitted model
+
+        Parameters
+        ----------
+        data : np.array-like,shape=(N_variants,n_phenotypes)
+            The data of counts, variants x phenotypes
+
+        Returns
+        -------
+        z_hat : np.array-like,shape=(N_samples,)
+            The cluster identities
+        """
+        log_proba = self.predict_logproba(data)
+        z_hat = np.argmax(log_proba, axis=1)
+        return z_hat
+
+    def predict_logproba(self, data):
+        """
+        Predicts the log probability of a datapoint being assigned to each
+        cluster.
+ + Parameters + ---------- + data : np.array-like,shape=(N_variants,n_phenotypes) + The data of counts, variants x phenotypes + + Returns + ------- + z_hat : np.array-like,shape=(N_samples,) + The cluster identities + """ + N = data.shape[0] + log_proba = np.zeros((N, self.K)) + for i in range(N): + for k in range(self.K): + log_proba[i, k] = np.sum( + st.poisson.logpmf(data[i], self.Lambda[k]) + ) + return log_proba + + def _fill_training_options(self, training_options): + """ + This fills any relevant parameters for the learning algorithm + + Parameters + ---------- + training_options : dict + + Returns + ------- + tops : dict + """ + default_options = { + "n_iterations": 30000, + "batch_size": 200, + "learning_rate": 5e-5, + "momentum": 0.99, + } + tops = {**default_options, **training_options} + return tops diff --git a/python/python/bystro/tada_mixture/tests/__init__.py b/python/python/bystro/tada_mixture/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/python/bystro/tada_mixture/tests/test_em.py b/python/python/bystro/tada_mixture/tests/test_em.py new file mode 100644 index 000000000..7c3fc03cc --- /dev/null +++ b/python/python/bystro/tada_mixture/tests/test_em.py @@ -0,0 +1,33 @@ +import numpy as np +from ..mixture_tada_em import MVTadaPoissonEM, MVTadaZipEM + + +def test_fit(): + N = 1000 + p = 10 + K = 2 + + rng = np.random.default_rng(2021) + + Z_true = rng.integers(0, high=K, size=N) + Lambda = rng.integers(0, high=50, size=(K, p)) + Alphas = 0.2 * np.ones((K, p)) + + Lambda = Lambda * 1.0 + Lambda[0, int(p / 2) :] = 0 + Lambda[1, : int(p / 2)] = 0 + for k in range(K): + Lambda[k] = Lambda[k] / np.mean(Lambda[k]) * 10 + + X = np.zeros((N, p)) + for i in range(N): + idx = int(Z_true[i]) + X[i] = rng.poisson(Lambda[idx]) + zerod_out = rng.binomial(1, Alphas[idx]) + X[i, zerod_out == 1] = 0 + + model = MVTadaZipEM(K=K) + model.fit(X) + + model = MVTadaPoissonEM(K=K) + model.fit(X) diff --git a/python/python/bystro/tada_mixture/tests/test_marginal.py b/python/python/bystro/tada_mixture/tests/test_marginal.py new file mode 100644 index 000000000..f9b224434 --- /dev/null +++ b/python/python/bystro/tada_mixture/tests/test_marginal.py @@ -0,0 +1,27 @@ +import numpy as np +from ..mixture_tada_marginal import MVTadaPoissonML + + +def test_fit(): + N = 1000 + p = 10 + K = 2 + + rng = np.random.default_rng(2021) + + Z_true = rng.integers(0, high=K, size=N) + Lambda = rng.integers(0, high=50, size=(K, p)) + + Lambda = Lambda * 1.0 + Lambda[0, int(p / 2) :] = 0 + Lambda[1, : int(p / 2)] = 0 + for k in range(K): + Lambda[k] = Lambda[k] / np.mean(Lambda[k]) * 10 + + X = np.zeros((N, p)) + for i in range(N): + idx = int(Z_true[i]) + X[i] = rng.poisson(Lambda[idx]) + + model = MVTadaPoissonML(K=K) + model.fit(X) diff --git a/python/python/bystro/tadarrr/tests/__init__.py b/python/python/bystro/tadarrr/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/requirements.txt b/python/requirements.txt index e1bdee576..1bf3829ba 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -14,3 +14,7 @@ scikit-allel==1.3.6 scikit-learn==1.2.2 skops==0.7.post0 tqdm==4.65.0 +cvxpy==1.2.1 +cloudpickle==2.0.* +tensorflow==2.11.0 +torch==2.0.1
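
For orientation, here is a minimal end-to-end sketch of how the three mixture models added in this diff are meant to be used, mirroring the simulation in tests/test_em.py. It assumes MVTadaPoissonEM's constructor takes (K, training_options) like the other two classes, and the iteration count shown is illustrative only, not a recommended setting.

import numpy as np

from bystro.tada_mixture.mixture_tada_em import MVTadaPoissonEM, MVTadaZipEM
from bystro.tada_mixture.mixture_tada_marginal import MVTadaPoissonML

rng = np.random.default_rng(2021)
N, p, K = 1000, 10, 2

# Simulate counts from two well-separated Poisson clusters, as in the tests
Z_true = rng.integers(0, high=K, size=N)
Lambda = rng.integers(0, high=50, size=(K, p)) * 1.0
Lambda[0, p // 2:] = 0
Lambda[1, :p // 2] = 0
for k in range(K):
    Lambda[k] = Lambda[k] / np.mean(Lambda[k]) * 10
X = np.array([rng.poisson(Lambda[int(z)]) for z in Z_true], dtype=float)

# Fit each model and recover hard cluster assignments
for Model in (MVTadaPoissonEM, MVTadaZipEM, MVTadaPoissonML):
    model = Model(K=K, training_options={"n_iterations": 100})
    model.fit(X)
    z_hat = model.predict(X)
    print(Model.__name__, np.bincount(z_hat, minlength=K))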
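
The module docstring of mixture_tada_marginal.py leans on the logsumexp trick to evaluate log p(x_i) = log sum_k pi_k p(x_i|z=k). A small self-contained sketch shows why the naive evaluation fails while logsumexp (the same computation torch.logsumexp performs in the fit loop) stays finite; the numbers are made up for illustration.

import numpy as np
from scipy.special import logsumexp

# Per-cluster values of log(pi_k) + log p(x_i | z=k); once many phenotypes
# are summed per sample, magnitudes like these are routine
log_joint = np.array([-800.0, -805.0])

naive = np.log(np.sum(np.exp(log_joint)))  # exp() underflows, giving log(0) = -inf
stable = logsumexp(log_joint)              # max + log(sum(exp(log_joint - max))), about -799.99

print(naive, stable)  # -inf versus a finite, correct value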
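
The zero-inflated Poisson likelihood used by MVTadaZipEM (in both the torch fit loop and predict_logproba) treats a zero count as coming either from the zero-inflation component or from the Poisson itself. Below is a standalone NumPy sketch of that per-element log likelihood; the function and variable names are illustrative, not attributes of the class.

import numpy as np
import scipy.stats as st


def zip_loglike(x, lam, alpha):
    """Elementwise log likelihood of a zero-inflated Poisson.

    x     : counts, shape (p,)
    lam   : Poisson rates, shape (p,)
    alpha : probability of a structural zero, shape (p,)
    """
    # Zero counts: either a structural zero (alpha) or a Poisson zero
    log_zero = np.log(alpha + (1 - alpha) * np.exp(-lam))
    # Nonzero counts: must come from the Poisson branch
    log_pois = np.log(1 - alpha) + st.poisson.logpmf(x, lam)
    return np.where(x == 0, log_zero, log_pois)


x = np.array([0, 0, 3, 7])
lam = np.array([5.0, 0.5, 4.0, 6.0])
alpha = np.full(4, 0.2)
print(zip_loglike(x, lam, alpha).sum())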
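
Both training loops rely on torch.nn.PoissonNLLLoss with log_input=False, whose first argument is the Poisson rate and whose second is the observed counts. A quick sanity check against scipy.stats.poisson follows; note that full=True uses a Stirling approximation of log(x!), so agreement is close rather than exact.

import torch
from torch.nn import PoissonNLLLoss
import scipy.stats as st

rate = torch.tensor([2.0, 7.5, 11.0])    # Poisson rates (the "input")
counts = torch.tensor([1.0, 9.0, 10.0])  # observed counts (the "target")

nll = PoissonNLLLoss(log_input=False, full=True, reduction="none")
torch_nll = nll(rate, counts)

scipy_nll = -st.poisson.logpmf(counts.numpy(), rate.numpy())
print(torch_nll.numpy())
print(scipy_nll)  # should agree up to the Stirling approximation of log(x!)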