diff --git a/python/python/bystro/_template_npr.py b/python/python/bystro/_template_npr.py new file mode 100644 index 000000000..af87ed4ae --- /dev/null +++ b/python/python/bystro/_template_npr.py @@ -0,0 +1,130 @@ +""" +This provides a basic template for any model that uses numpyro as an +inference method. It has several methods that should be filled by any +object extending the template, namely + fit + _fill_hp_options + +These are the methods for running the samples given data and providing +hyperameter selections respectively. + +Objects +------- +_BaseNPRModel(object) + This is the template Numpyro model + +""" +import abc +import cloudpickle # type: ignore +import numpyro # type: ignore + + +class _BaseNumpyroModel: + def __init__(self, mcmc_options=None, hp_options=None): + """ + + Parameters + ---------- + + Returns + ------- + + """ + if mcmc_options is None: + mcmc_options = {} + if hp_options is None: + hp_options = {} + self.mcmc_options = self._fill_mcmc_options(mcmc_options) + self.hp_options = self._fill_hp_options(hp_options) + self.samples = None + + @abc.abstractmethod + def fit(self, *args): + """ + + Parameters + ---------- + + Returns + ------- + + """ + + def render_model(self): + """ + This provides a graphical representation of the model + + Parameters + ---------- + + Returns + ------- + + """ + assert self._model is not None, "Fit model first" + return numpyro.render_model(self._model, model_kwargs=self.model_kwargs) + + def pickle(self, path): + """ + This saves samples from a fit model + + Parameters + ---------- + + Returns + ------- + + """ + assert self.samples is not None, "Fit model first" + with open(path, "wb") as f: + cloudpickle.dump(self.samples, f) + + def unpickle(self, path): + """ + This loads samples from a previously saved model + + Parameters + ---------- + + Returns + ------- + + """ + with open(path, "rb") as f: + self.samples = cloudpickle.load(f) + + def _fill_mcmc_options(self, mcmc_options): + """ + This fills in default MCMC options of the sampler. Further methods + might override these but these are common/basic enough to leave in + as an implemented method. + + Parameters + ---------- + + Returns + ------- + + """ + default_options = { + "num_chains": 1, + "num_warmup": 500, + "num_samples": 2000, + } + mopts = {**default_options, **mcmc_options} + return mopts + + @abc.abstractmethod + def _fill_hp_options(self, hp_options): + """ + This fills in default hyperparameters of the model. Since these are + not conserved between models we leave this as an abstract method + to be filled in per model. + + Parameters + ---------- + + Returns + ------- + + """ diff --git a/python/python/bystro/_template_sgd_np.py b/python/python/bystro/_template_sgd_np.py new file mode 100644 index 000000000..7fa4dc9c9 --- /dev/null +++ b/python/python/bystro/_template_sgd_np.py @@ -0,0 +1,132 @@ +""" +This implements a base class for any model using stochastic gradient +descent-based techniques for inference. 
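As a usage illustration of the `_BaseNumpyroModel` template above: a subclass is expected to supply `fit` and `_fill_hp_options`, and to set `_model`, `model_kwargs`, and `samples` so that `render_model` and `pickle` work. The sketch below is hypothetical (the `NormalMeanModel` class and its `prior_scale` option are not part of this patch) and only indicates the intended pattern:

import jax.random as random
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS

from bystro._template_npr import _BaseNumpyroModel


class NormalMeanModel(_BaseNumpyroModel):
    def fit(self, X):
        def model(data):
            # simple Gaussian mean model as a stand-in for a real likelihood
            mu = numpyro.sample("mu", dist.Normal(0.0, self.hp_options["prior_scale"]))
            numpyro.sample("obs", dist.Normal(mu, 1.0), obs=data)

        self._model = model
        self.model_kwargs = {"data": X}
        mcmc = MCMC(
            NUTS(model),
            num_warmup=self.mcmc_options["num_warmup"],
            num_samples=self.mcmc_options["num_samples"],
            num_chains=self.mcmc_options["num_chains"],
        )
        mcmc.run(random.PRNGKey(0), X)
        self.samples = mcmc.get_samples()
        return self

    def _fill_hp_options(self, hp_options):
        return {**{"prior_scale": 10.0}, **hp_options}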
+
+Objects
+-------
+_BaseSGDModel(object):
+
+Methods
+-------
+None
+"""
+import abc
+import cloudpickle  # type: ignore
+
+
+class _BaseSGDModel(object):
+    def __init__(self, training_options=None):
+        """
+        The base class of a model relying on stochastic gradient descent
+        for inference.
+        """
+        if training_options is None:
+            training_options = {}
+        self.training_options = self._fill_training_options(training_options)
+
+    @abc.abstractmethod
+    def fit(self, *args, **kwargs):
+        """
+        Method for fitting
+
+        Parameters
+        ----------
+        *args:
+            Positional arguments
+
+        **kwargs:
+            Keyword arguments
+        """
+
+    def pickle(self, path):
+        """
+        Method for saving the model
+
+        Parameters
+        ----------
+        path : str
+            The path to save the model to
+        """
+        mydict = {"model": self}
+        with open(path, "wb") as f:
+            cloudpickle.dump(mydict, f)
+
+    @abc.abstractmethod
+    def unpickle(self, path):
+        """
+        Method for loading the model
+
+        Parameters
+        ----------
+        path : str
+            The path to load the model from
+        """
+
+    def _fill_training_options(self, training_options):
+        """
+        This fills any relevant parameters for the learning algorithm
+
+        Parameters
+        ----------
+        training_options : dict
+
+        Returns
+        -------
+        training_opts : dict
+        """
+        default_options = {"n_iterations": 5000}
+        training_opts = {**default_options, **training_options}
+        return training_opts
+
+    @abc.abstractmethod
+    def _save_variables(self, training_variables):
+        """
+        This saves the final parameter values after training
+
+        Parameters
+        ----------
+        training_variables : list
+            The variables trained
+        """
+        raise NotImplementedError("_save_variables")
+
+    @abc.abstractmethod
+    def _initialize_save_losses(self):
+        """
+        This method initializes the arrays to track relevant variables
+        during training
+
+        Parameters
+        ----------
+        """
+        raise NotImplementedError("_initialize_save_losses")
+
+    @abc.abstractmethod
+    def _save_losses(self, *args):
+        """
+        This saves the respective losses at each iteration
+
+        Parameters
+        ----------
+        """
+        raise NotImplementedError("_save_losses")
+
+    @abc.abstractmethod
+    def _test_inputs(self, *args):
+        """
+        This performs error checking on inputs for fit
+
+        Parameters
+        ----------
+        """
+        raise NotImplementedError("_test_inputs")
+
+    @abc.abstractmethod
+    def _transform_training_data(self, *args):
+        """
+        This converts training data to adequate format
+
+        Parameters
+        ----------
+        """
+        raise NotImplementedError("_transform_training_data")
diff --git a/python/python/bystro/ancestry/tests/test_train.py b/python/python/bystro/ancestry/tests/test_train.py
index 0ec81ce02..0497ab7f1 100644
--- a/python/python/bystro/ancestry/tests/test_train.py
+++ b/python/python/bystro/ancestry/tests/test_train.py
@@ -13,7 +13,9 @@
     superpop_predictions_from_pop_probs,
     superpop_probs_from_pop_probs,
     convert_1kgp_vcf_to_dosage,
-    process_vcf_for_pc_transformation
+    process_vcf_for_pc_transformation,
+    restrict_loadings_variants_to_vcf,
+    apply_pca_transform
 )
 from bystro.vcf_utils.simulate_random_vcf import (
     generate_simulated_vcf,
@@ -286,3 +288,67 @@ def test_1kgp_vcf_to_dosage():
     for column in processed_vcf.columns:
         for value in processed_vcf[column]:
             assert value in valid_values
+
+
+def test_restrict_loadings_variants_to_vcf():
+    """Tests restriction of gnomad loadings variants to vcf variants."""
+    num_samples = 10
+    num_vars = 10
+    simulated_vcf = generate_simulated_vcf(num_samples, num_vars)
+    sim_vcf_df = convert_sim_vcf_to_df(simulated_vcf)
+    loaded_vcf = convert_1kgp_vcf_to_dosage(sim_vcf_df)
processed_sim_vcf = process_vcf_for_pc_transformation(loaded_vcf) + # Using same indices, generate sim loadings + num_pcs = 30 + sim_pcs = np.random.random((num_samples, num_pcs)) + sim_loadings = pd.DataFrame( + data=sim_pcs, index=processed_sim_vcf.columns, columns=[f"PC{i+1}" for i in range(num_pcs)] + ) + # Run restrict_loadings with sim data + pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf( + sim_loadings, processed_sim_vcf + ) + # Check for expected columns/indices + expected_columns_loadings = [f"PC{i+1}" for i in range(num_pcs)] + assert set(expected_columns_loadings) == set(pc_loadings_overlap.columns) + expected_index_genos_transpose = ["SampleID1", "SampleID2"] + for sample in expected_index_genos_transpose: + assert sample in genos_overlap_transpose.index + # Check that the variant IDs match up to sorting + assert set(genos_overlap_transpose.columns) == set(pc_loadings_overlap.index) + + +def test_apply_pca_transform(): + # Prepare test data same as before + num_samples = 10 + num_vars = 10 + simulated_vcf = generate_simulated_vcf(num_samples, num_vars) + sim_vcf_df = convert_sim_vcf_to_df(simulated_vcf) + loaded_vcf = convert_1kgp_vcf_to_dosage(sim_vcf_df) + processed_sim_vcf = process_vcf_for_pc_transformation(loaded_vcf) + # Using same indices, generate sim loadings + num_pcs = 30 + sim_pcs = np.random.random((num_samples, num_pcs)) + sim_loadings = pd.DataFrame( + data=sim_pcs, index=processed_sim_vcf.columns, columns=[f"PC{i+1}" for i in range(num_pcs)] + ) + pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf( + sim_loadings, processed_sim_vcf + ) + # Call function to test + transformed_data = apply_pca_transform(pc_loadings_overlap, genos_overlap_transpose) + # Check for correct data type and columns + assert isinstance(transformed_data, pd.DataFrame) + expected_columns = ["PC" + str(i) for i in range(1, 31)] + assert transformed_data.columns.tolist() == expected_columns + # Check index of transformed_data matches genos_overlap_transpose.index + assert transformed_data.index.equals(genos_overlap_transpose.index) + # Check shape of transformed_data matches expected shape + expected_shape = (genos_overlap_transpose.shape[0], 30) + assert transformed_data.shape == expected_shape + + # Check all values in transformed_data are numeric + assert transformed_data.dtypes.apply(pd.api.types.is_numeric_dtype).all() + + # Check that transformed_data does not contain any NaN or missing values + assert not transformed_data.isna().to_numpy().any() diff --git a/python/python/bystro/ancestry/train.py b/python/python/bystro/ancestry/train.py index 211c98655..4739dc015 100644 --- a/python/python/bystro/ancestry/train.py +++ b/python/python/bystro/ancestry/train.py @@ -513,13 +513,16 @@ def convert_1kgp_vcf_to_dosage(vcf_with_header: pd.DataFrame) -> pd.DataFrame: def process_vcf_for_pc_transformation(dosage_vcf: pd.DataFrame) -> pd.DataFrame: - """Process dosage_vcf so that it only includes genotypes for analysis.""" + """Process dosage_vcf and transpose so that it only includes + genotypes in correct configuration for analysis.""" genos = dosage_vcf.iloc[:, len(HEADER_COLS) :] genos = genos.set_index(dosage_vcf.index) genos = genos.sort_index() # Check that not all genotypes are the same for QC assert len(set(genos.to_numpy().flatten())) > 1, "All genotypes are the same" - return genos + #Transpose genos_overlap for analysis + genos_transpose = genos.T + return genos_transpose def _load_pca_loadings() -> pd.DataFrame: @@ 
-560,6 +563,35 @@ def get_variant(x): return pc_loadings +def restrict_loadings_variants_to_vcf( + pc_loadings: pd.DataFrame, genos_transpose: pd.DataFrame +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Restrict variant list to overlap between gnomad loadings and transposed + reference vcf and return versions only including the overlapping variants.""" + # IGSR version of 1kgp is current reference vcf + var_overlap = pc_loadings.index.intersection(genos_transpose.columns) + pc_loadings_overlap = pc_loadings[pc_loadings.index.isin(var_overlap)] + genos_overlap_transpose = genos_transpose[genos_transpose.columns.isin(var_overlap)] + # Ensure that genos transpose and pc_loadings have corresponding shape and vars + assert genos_overlap_transpose.shape[1] == pc_loadings_overlap.shape[0] + assert set(genos_overlap_transpose.columns) == set(pc_loadings_overlap.index) + assert (genos_overlap_transpose.index == genos_transpose.index).all() + # Record amount of overlap + num_var_overlap = len(var_overlap) + logger.info("Number of overlapping variants: %d", num_var_overlap) + return pc_loadings_overlap, genos_overlap_transpose + + +def apply_pca_transform( + pc_loadings_overlap: pd.DataFrame, + genos_overlap_transpose: pd.DataFrame +) -> pd.DataFrame: + """Transform vcf with genotypes in dosage format with PCs loadings from gnomad PCA.""" + # Dot product + transformed_data = genos_overlap_transpose @ pc_loadings_overlap + return transformed_data + + def _perform_pca(train_X: pd.DataFrame, test_X: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, PCA]: """Perform PCA, checking for good compression.""" logger.info("Beginning PCA") diff --git a/python/python/bystro/covariance/__init__.py b/python/python/bystro/covariance/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/python/bystro/covariance/_base_covariance.py b/python/python/bystro/covariance/_base_covariance.py new file mode 100644 index 000000000..601de605d --- /dev/null +++ b/python/python/bystro/covariance/_base_covariance.py @@ -0,0 +1,532 @@ +""" +This provides the base class for covariance estimation. While there are +many methods for estimating covariance matrices, once such a matrix has +been estimated most of the procedures for evaulating fit, comparing +quantities etc are shared between models. This implementation allows for +us to implement these methods a single time and for most code being +centered on the fit method and a small number of auxiliary methods. This +also will provide the basis for factor analysis and probablistic PCA. As +a practical design choice, these methods are implemented as separate +functions, which are simply converted into object methods by replacing +covariance with self.covariance. Documentation is provided in the +functional method and omitted from the object methods. + +Objects +------- +BaseCovariance + + get_precision() + Gets the precision matrix defined as the inverse of the covariance + + get_stable_rank() + Returns the stable rank defined as + ||A||_F^2/||A||^2 + + predict(Xobs,idxs) + Predicts missing data using observed data. 
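The two new ancestry helpers above are meant to be chained: restrict the gnomAD loadings and the transposed reference genotypes to their shared variants, then project. A rough usage sketch, assuming `dosage_vcf` is the output of `convert_1kgp_vcf_to_dosage`:

loadings = _load_pca_loadings()  # gnomAD PC loadings, variants x PCs
genos_transpose = process_vcf_for_pc_transformation(dosage_vcf)  # samples x variants
pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf(
    loadings, genos_transpose
)
pcs = apply_pca_transform(pc_loadings_overlap, genos_overlap_transpose)  # samples x PCs (30 for these loadings)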
+ + -------------------- + conditional_score(X,idxs,weights=None) + mean(log p(X[idx==1]|X[idx==0],covariance)) + + conditional_score_samples(X,idxs) + log p(X[idx==1]|X[idx==0],covariance) + + marginal_score(X,idxs,weights=None) + mean(log p(X[idx==1]|covariance)) + + marginal_score_samples(X,idxs) + log p(X[idx==1]|covariance) + + score(X,weights=None): + mean log p(X) + + score_samples(X) + log p(X) + + -------------------- + entropy() + Computes the entropy of a Gaussian distribution parameterized by + covariance. + + entropy_subset(idxs) + Computes the entropy of a subset of the covariates + + mutual_information(covariance,idxs1,idxs2): + This computes the mutual information bewteen the two sets of + covariates based on the model. + +Methods +------- +_get_precision(covariance) + +_get_stable_rank(covariance) + +_predict(covariance,Xobs,idxs) + +_get_conditional_parameters(covariance,idxs) + Computes the distribution parameters p(X_miss|X_obs) + +---------------------------- + +_conditional_score_samples(covariance,X,idxs,weights=None) + +_conditional_score(covariance,X,idxs) + +_marginal_score(covariance,X,idxs,weights=None) + +_marginal_score_samples(covariance,X,idxs) + +_score(covariance,X,weights=None) + +_score_samples(covariance,X) + +---------------------------- + +_entropy(covariance) + +_entropy_subset(covariance,idxs) + +_mutual_information(covariance,idxs1,idxs2): + +""" +import numpy as np +from numpy import linalg as la +from datetime import datetime as dt +import pytz + + +class BaseCovariance(object): + """ + This object basically contains all the methods asides for fitting + a covariance matrix. + """ + + def __init__(self): + self.creationDate = dt.now(pytz.timezone("US/Pacific")) + + ################# + # Miscellaneous # + ################# + def get_precision(self): + return _get_precision(self.covariance) + + def get_stable_rank(self): + return _get_stable_rank(self.covariance) + + def predict(self, Xobs, idxs): + return _predict(self.covariance, Xobs, idxs) + + ##################################### + # Gaussian Likelihood-based methods # + ##################################### + def conditional_score(self, X, idxs, weights=None): + return _conditional_score(self.covariance, X, idxs, weights=weights) + + def conditional_score_samples(self, X, idxs): + return _conditional_score_samples(self.covariance, X, idxs) + + def marginal_score(self, X, idxs, weights=None): + return _marginal_score(self.covariance, X, idxs, weights=weights) + + def marginal_score_samples(self, X, idxs): + return _marginal_score_samples(self.covariance, X, idxs) + + def score(self, X, weights=None): + return _score(self.covariance, X, weights=weights) + + def score_samples(self, X): + return _score_samples(self.covariance, X) + + ################################# + # Information-theoretic methods # + ################################# + def entropy(self): + return _entropy(self.covariance) + + def entropy_subset(self, idxs): + return _entropy_subset(self.covariance, idxs) + + def mutual_information(self, idxs1, idxs2): + return _mutual_information(self.covariance, idxs1, idxs2) + + +########################################### +########################################### +########################################### +### ### +### ### +### Methods for covariance matrices ### +### ### +### ### +########################################### +########################################### +########################################### + + +def _get_precision(covariance): + """ + Gets the precision matrix 
defined as the inverse of the covariance + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + Returns + ------- + precision : np.array-like(sum(p),sum(p)) + The inverse of the covariance matrix + """ + precision = la.inv(covariance) + return precision + + +def _get_stable_rank(covariance): + """ + Returns the stable rank defined as + ||A||_F^2/||A||^2 + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + Returns + ------- + srank : float + The stable rank. See Vershynin High dimensional probability for + discussion, but this is a statistically stable approximation to rank + """ + singular_values = la.svd(covariance, compute_uv=False) + srank = np.sum(singular_values) / singular_values[0] + return srank + + +def _predict(covariance, Xobs, idxs): + """ + Predicts missing data using observed data. This uses the conditional + Gaussian formula (see wikipedia multivariate gaussian). Solve allows + us to avoid an explicit matrix inversion. + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + Xobs : np array-like,(N_samples,\sum idxs) + The observed data + + idxs: np.array-like,(sum(p),) + The observation locations + + Returns + ------- + preds : np.array-like,(N_samples,p-\sum idxs) + The predicted values + """ + covariance_sub = covariance[idxs == 1] + covariance_22 = covariance_sub[:, idxs == 1] + covariance_21 = covariance_sub[:, idxs == 0] + + beta_bar = la.solve(covariance_22, covariance_21) + + preds = np.dot(Xobs[:, idxs == 1], beta_bar) + return preds + + +def _conditional_score(covariance, X, idxs, weights=None): + """ + Returns the predictive log-likelihood of a subset of data. + + mean(log p(X[idx==1]|X[idx==0],covariance)) + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,sum(idxs)) + The centered data + + idxs: np.array-like,(sum(p),) + The observation locations + + weights : np.array-like,(N,),default=None + The optional weights on the samples. Don't have negative values. + Average value forced to 1. 
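A small self-contained sketch of `_predict` above, which imputes the unobserved coordinates from the observed ones via the conditional Gaussian mean (illustrative only; the data here are simulated):

import numpy as np

from bystro.covariance._base_covariance import _predict

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
cov = X.T @ X / X.shape[0]
idxs = np.array([1, 1, 1, 0, 0])  # first three coordinates observed
preds = _predict(cov, X, idxs)    # conditional means for the last two coordinates
assert preds.shape == (500, 2)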
+ + Returns + ------- + avg_score : float + Average log likelihood + """ + weights = ( + np.ones(X.shape[0]) if weights is None else weights / np.mean(weights) + ) + avg_score = np.mean( + weights * _conditional_score_samples(covariance, X, idxs) + ) + return avg_score + + +def _conditional_score_samples(covariance, X, idxs): + """ + Return the conditional log likelihood of each sample, that is + + log p(X[idx==1]|X[idx==0],covariance) = N(Sigma_10Sigma_00^{-1}x_0, + Sigma_11-Sigma_10Sigma_00^{-1}Sigma_01) + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,p) + The centered data + + idxs: np.array-like,(p,) + The observation locations + + Returns + ------- + scores : float + Log likelihood for each sample + """ + covariance_sub = covariance[idxs == 1] + covariance_22 = covariance_sub[:, idxs == 1] + covariance_21 = covariance_sub[:, idxs == 0] + + covariance_nonsub = covariance[idxs == 0] + covariance_11 = covariance_nonsub[:, idxs == 0] + + beta_bar = la.solve(covariance_22, covariance_21) + Second_part = np.dot(covariance_21.T, beta_bar) + + covariance_bar = covariance_11 - Second_part + + mu_ = np.dot(X[:, idxs == 1], beta_bar) + scores = _score_samples(covariance_bar, X[:, idxs == 0] - mu_) + + return scores + + +def _get_conditional_parameters(covariance, idxs): + """ + Computes the distribution parameters p(X_miss|X_obs) + + Parameters + ---------- + covariance : np.array-like,(p,p) + The covariance matrix + + idxs: np.array-like,(p,) + The observed covariates + + Returns + ------- + beta_bar : array-like + The predictive covariates + + covariance_bar : array-like + Conditional covariance + """ + + covariance_sub = covariance[idxs == 1] + covariance_22 = covariance_sub[:, idxs == 1] + covariance_21 = covariance_sub[:, idxs == 0] + covariance_nonsub = covariance[idxs == 0] + covariance_11 = covariance_nonsub[:, idxs == 0] + beta_bar = la.solve(covariance_22, covariance_21) + Second_part = np.dot(covariance_21.T, beta_bar) + + covariance_bar = covariance_11 - Second_part + return beta_bar.T, covariance_bar + + +def _marginal_score(covariance, X, idxs, weights=None): + """ + Returns the marginal log-likelihood of a subset of data + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,sum(idxs)) + The centered data + + idxs: np.array-like,(sum(p),) + The observation locations + + weights : np.array-like,(N,),default=None + The optional weights on the samples + + Returns + ------- + avg_score : float + Average log likelihood + """ + if weights is None: + weights = np.ones(X.shape[0]) + avg_score = np.mean(weights * _marginal_score_samples(covariance, X, idxs)) + return avg_score + + +def _marginal_score_samples(covariance, X, idxs): + """ + Returns the marginal log-likelihood of a subset of data + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,sum(idxs)) + The centered data + + idxs: np.array-like,(sum(p),) + The observation locations + + Returns + ------- + scores : float + Average log likelihood + """ + cov1 = covariance[idxs == 1] + cov_sub = cov1[:, idxs == 1] + scores = _score_samples(cov_sub, X) + return scores + + +def _score(covariance, X, weights=None): + """ + Returns the average log liklihood of data. 
+ + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,sum(p)) + The centered data + + weights : np.array-like,(N,),default=None + The optional weights on the samples + + Returns + ------- + avg_score : float + Average log likelihood + """ + if weights is None: + weights = np.ones(X.shape[0]) + avg_score = np.mean(weights * _score_samples(covariance, X)) + return avg_score + + +def _score_samples(covariance, X): + """ + Return the log likelihood of each sample + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + X : np.array-like,(N,sum(p)) + The centered data + + Returns + ------- + scores : float + Log likelihood for each sample + """ + p = covariance.shape[0] + term1 = -p / 2 * np.log(2 * np.pi) + + _, logdet = la.slogdet(covariance) + term2 = -0.5 * logdet + + quad_init = la.solve(covariance, np.transpose(X)) + difference = X * np.transpose(quad_init) + term3 = np.sum(difference, axis=1) + + scores = term1 + term2 - 0.5 * term3 + return scores + + +def _entropy(covariance): + """ + Computes the entropy of a Gaussian distribution parameterized by + covariance. + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + Returns + ------- + entropy : float + The differential entropy of the distribution + """ + cov_new = 2 * np.pi * np.e * covariance + _, logdet = la.slogdet(cov_new) + entropy = 0.5 * logdet + return entropy + + +def _entropy_subset(covariance, idxs): + """ + Computes the entropy of a subset of the Gaussian distribution + parameterized by covariance. + + Parameters + ---------- + covariance : np.array-like(p,p) + The covariance matrix + + idxs: np.array-like,(sum(p),) + The observation locations + + Returns + ------- + entropy : float + The differential entropy of the distribution + """ + cov1 = covariance[idxs == 1] + cov_sub = cov1[:, idxs == 1] + entropy = _entropy(cov_sub) + return entropy + + +def _mutual_information(covariance, idxs1, idxs2): + """ + This computes the mutual information bewteen the two sets of + covariates based on the model. + + Parameters + ---------- + idxs1 : np.array-like,(p,) + First group of variables + + idxs2 : np.array-like,(p,) + Second group of variables + + Returns + ------- + mutual_information : float + The mutual information between the two variables + """ + idxs = idxs1 + idxs2 + cov_sub = covariance[np.ix_(idxs == 1, idxs == 1)] + idxs1_sub = idxs1[idxs == 1] + Hy = _entropy(cov_sub[np.ix_(idxs1_sub == 1, idxs1_sub == 1)]) + + _, covariance_conditional = _get_conditional_parameters( + cov_sub, 1 - idxs1_sub + ) + H_y_given_x = _entropy(covariance_conditional) + mutual_information = Hy - H_y_given_x + return mutual_information diff --git a/python/python/bystro/covariance/_base_precision.py b/python/python/bystro/covariance/_base_precision.py new file mode 100644 index 000000000..8b462a273 --- /dev/null +++ b/python/python/bystro/covariance/_base_precision.py @@ -0,0 +1,159 @@ +""" +This provides the base class for covariance estimation in terms of the +precision matrix. While in the futre I'll rewrite separate methods for some +of these to take advantage of the fact that we estimated the precision +matrix rather than covariance matrix, for now I just invert precision and +use the covariance matrix methods. Yuck right? 
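For jointly Gaussian variables, the quantity computed by `_mutual_information` above reduces to

    I(X_1; X_2) = H(X_1) - H(X_1 | X_2)
                = 0.5 * (log|Sigma_11| + log|Sigma_22| - log|Sigma|),

which is the identity that the unit test `test_mutual_information` further down verifies numerically.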
+ +Objects +------- +BasePrecision + + get_covariance() + Gets the precision matrix defined as the inverse of the covariance + + get_stable_rank() + Returns the stable rank defined as + ||A||_F^2/||A||^2 + + predict(Xobs,idxs) + Predicts missing data using observed data. + + -------------------- + conditional_score(X,idxs) + mean(log p(X[idx==1]|X[idx==0],covariance)) + + conditional_score_samples(X,idxs) + log p(X[idx==1]|X[idx==0],covariance) + + marginal_score(X,idxs) + mean(log p(X[idx==1]|covariance)) + + marginal_score_samples(X,idxs) + log p(X[idx==1]|covariance) + + score(X): + mean log p(X) + + score_samples(X) + log p(X) + + -------------------- + entropy() + Computes the entropy of a Gaussian distribution parameterized by + covariance. + + entropy_subset(idxs) + Computes the entropy of a subset of the covariates + + mutual_information(covariance,idxs1,idxs2): + This computes the mutual information bewteen the two sets of + covariates based on the model. + +Methods +------- +_get_covariance(covariance) +""" +from numpy import linalg as la +from datetime import datetime as dt +from ._base_covariance import BaseCovariance +import pytz + + +class BasePrecision(BaseCovariance): + """ + This object basically contains all the methods asides for fitting + a precision matrix. + """ + + def __init__(self): + self.creationDate = dt.now(pytz.timezone("US/Pacific")) + + ################# + # Miscellaneous # + ################# + def get_covariance(self): + return _get_covariance(self.precision) + + def get_stable_rank(self): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).get_stable_rank() + + def predict(self, Xobs, idxs): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).predict(Xobs, idxs) + + ##################################### + # Gaussian Likelihood-based methods # + ##################################### + def conditional_score(self, X, idxs, weights=None): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).conditional_score( + X, idxs, weights=weights + ) + + def conditional_score_samples(self, X, idxs): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).conditional_score_samples(X, idxs) + + def marginal_score(self, X, idxs, weights=None): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).marginal_score( + X, idxs, weights=weights + ) + + def marginal_score_samples(self, X, idxs): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).marginal_score_samples(X, idxs) + + def score(self, X, weights=None): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).score(X, weights=weights) + + def score_samples(self, X): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).score_samples(X) + + ################################# + # Information-theoretic methods # + ################################# + def entropy(self): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).entropy() + + def entropy_subset(self, idxs): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return 
super(BasePrecision, self).entropy_subset(idxs) + + def mutual_information(self, idxs1, idxs2): + if not hasattr(self, "covariance"): + self.covariance = la.inv(self.precision) + return super(BasePrecision, self).mutual_information(idxs1, idxs2) + + +def _get_covariance(precision): + """ + Gets the covariance matrix defined as the inverse of the precision + + Parameters + ---------- + precision : np.array-like(p,p) + The precision matrix + + Returns + ------- + covariance : np.array-like(sum(p),sum(p)) + The inverse of the precision matrix + """ + covariance = la.inv(precision) + return covariance diff --git a/python/python/bystro/covariance/_covariance_np.py b/python/python/bystro/covariance/_covariance_np.py new file mode 100644 index 000000000..f96fd7747 --- /dev/null +++ b/python/python/bystro/covariance/_covariance_np.py @@ -0,0 +1,90 @@ +""" +This implements two objects, the empirical covariance estimator used as a +baseline comparison method. It also implements BayesianCovariance, which +implements MAP estimation using several common priors. + +Objects +------- +EmpiricalCovariance(BaseCovariance) + This object just fits the covariance matrix as the standard sample + covariance matrix + +BayesianCovariance(BaseCovariance): + This object fits the covariance matrix as the MAP estimator using + user-defined priors. +""" +import numpy as np +from bystro.covariance._base_covariance import BaseCovariance + + +class EmpiricalCovariance(BaseCovariance): + def __init__(self): + """ + This object just fits the covariance matrix as the standard sample + covariance matrix + """ + super().__init__() + + def fit(self, X): + """ + This fits a covariance matrix using samples X. + + Parameters + ---------- + X : np.array-like,(n_samples,n_covariates) + The centered data + """ + self.N, self.p = X.shape + XTX = np.dot(X.T, X) + self.covariance = XTX / self.N + + +class BayesianCovariance(BaseCovariance): + def __init__(self, prior_options=None): + """ + This object fits the covariance matrix as the MAP estimator using + user-defined priors. + """ + super().__init__() + if prior_options is None: + prior_options = {} + self.prior_options = self._fill_prior_options(prior_options) + + def fit(self, X): + """ + This fits a covariance matrix using samples X with MAP estimation. 
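Written out, the MAP estimate this fit method produces is the mode of the inverse-Wishart posterior,

    Sigma_hat = (sigma * I_p + X^T X) / (nu + N + p + 1),    with nu = p + pnu,

where sigma and pnu come from prior_options["iw_params"]; larger sigma shrinks the estimate toward a scaled identity.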
+ + Parameters + ---------- + X : np.array-like,(n_samples,n_covariates) + The data + """ + self.N, self.p = X.shape + p_opts = self.prior_options + covariance_empirical = np.dot(X.T, X) + nu = p_opts["iw_params"]["pnu"] + self.p + cov_prior = p_opts["iw_params"]["sigma"] * np.eye(self.p) + posterior_cov = cov_prior + covariance_empirical + posterior_nu = nu + self.N + self.covariance = posterior_cov / (posterior_nu + self.p + 1) + + def _fill_prior_options(self, prior_options): + """ + This sets the prior options for our inference scheme + + Parameters + ---------- + prior_options : dict + The original prior options passed as a dictionary + + Options + ------- + iw_params : dict,default={'pnu':2,'sigma':1.0} + pnu : int - nu = p + pnu + sigma : float>0 - cov = sigma*I_p + """ + default_options = { + "iw_params": {"pnu": 2, "sigma": 1.0}, + } + pops = {**default_options, **prior_options} + return pops diff --git a/python/python/bystro/covariance/_precision_np.py b/python/python/bystro/covariance/_precision_np.py new file mode 100644 index 000000000..46535be2e --- /dev/null +++ b/python/python/bystro/covariance/_precision_np.py @@ -0,0 +1,24 @@ +""" + +""" +import numpy as np +import numpy.linalg as la +from ._base_precision import BasePrecision + + +class EmpiricalPrecision(BasePrecision): + def __init__(self): + super().__init__() + + def fit(self, X): + """ + This fits a precision matrix using samples X. + + Parameters + ---------- + X : np.array-like,(n_samples,n_covariates) + The data + """ + self.N, self.p = X.shape + XTX = np.dot(X.T, X) + self.precision = la.inv(XTX / self.N) diff --git a/python/python/bystro/covariance/test/__init__.py b/python/python/bystro/covariance/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/python/bystro/covariance/test/test_base_covariance.py b/python/python/bystro/covariance/test/test_base_covariance.py new file mode 100644 index 000000000..9cb1822e3 --- /dev/null +++ b/python/python/bystro/covariance/test/test_base_covariance.py @@ -0,0 +1,164 @@ +import numpy as np +import numpy.linalg as la +import scipy.stats as st # type: ignore +from bystro.covariance._base_covariance import BaseCovariance, _score_samples + + +def test_get_stable_rank(): + model = BaseCovariance() + + model.covariance = np.eye(10) + assert model.get_stable_rank() == 10 + + model.covariance = np.zeros((10, 10)) + model.covariance[0, 0] = 1 + assert model.get_stable_rank() == 1 + + +def test_predict(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + cov = 1 / X.shape[0] * np.dot(X.T, X) + model.covariance = cov + idxs = np.ones(10) + idxs[8:] = 0 + preds = model.predict(X, idxs) + csub = cov[idxs == 1] + c22 = csub[:, idxs == 1] + c21 = csub[:, idxs == 0] + csub = cov[idxs == 0] + beta_bar = la.solve(c22, c21) + mu = np.dot(X[:, idxs == 1], beta_bar) + assert np.all(np.abs(mu - preds) < 1e-8) + + +def test_conditional_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + cov = 1 / X.shape[0] * np.dot(X.T, X) + model.covariance = cov + idxs = np.ones(10) + idxs[8:] = 0 + score_samples = model.conditional_score_samples(X, idxs) + csub = cov[idxs == 1] + c22 = csub[:, idxs == 1] + c21 = csub[:, idxs == 0] + csub = cov[idxs == 0] + c11 = csub[:, idxs == 0] + beta_bar = la.solve(c22, c21) + second = np.dot(c21.T, beta_bar) + cov_bar = c11 - second + mu = np.dot(X[:, idxs == 1], beta_bar) + score_samples2 = _score_samples(cov_bar, X[:, idxs 
== 0] - mu) + assert np.all(np.abs(score_samples - score_samples2) < 1e-8) + + +def test_conditional_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + cov = 1 / X.shape[0] * np.dot(X.T, X) + model.covariance = cov + idxs = np.ones(10) + idxs[8:] = 0 + score = model.conditional_score(X, idxs) + csub = cov[idxs == 1] + c22 = csub[:, idxs == 1] + c21 = csub[:, idxs == 0] + csub = cov[idxs == 0] + c11 = csub[:, idxs == 0] + beta_bar = la.solve(c22, c21) + second = np.dot(c21.T, beta_bar) + cov_bar = c11 - second + mu = np.dot(X[:, idxs == 1], beta_bar) + score_samples2 = _score_samples(cov_bar, X[:, idxs == 0] - mu) + assert np.abs(np.mean(score_samples2) - score) < 1e-8 + + +def test_marginal_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(150, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + idxs = np.ones(10) + idxs[8:] = 0 + score_samples = model.marginal_score_samples(X[:, idxs == 1], idxs) + mvn = st.multivariate_normal(mean=np.zeros(8), cov=model.covariance[:8, :8]) + logpdf = mvn.logpdf(X[:, :8]) + assert np.all(np.abs(logpdf - score_samples) < 1e-8) + + +def test_marginal_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(15, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + idxs = np.ones(10) + idxs[8:] = 0 + score = model.marginal_score(X[:, idxs == 1], idxs) + mvn = st.multivariate_normal(mean=np.zeros(8), cov=model.covariance[:8, :8]) + logpdf = mvn.logpdf(X[:, :8]) + assert np.mean(logpdf) == score + + +def test_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + score = model.score(X, weights=1.0 * np.ones(1000)) + mvn = st.multivariate_normal(mean=np.zeros(10), cov=model.covariance) + logpdf = mvn.logpdf(X) + assert np.mean(logpdf) == score + + +def test_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + score_samples = model.score_samples(X) + mvn = st.multivariate_normal(mean=np.zeros(10), cov=model.covariance) + logpdf = mvn.logpdf(X) + assert np.all(np.abs(logpdf - score_samples) < 1e-8) + + +def test_entropy(): + model = BaseCovariance() + model.covariance = np.eye(10) + _, logdet = la.slogdet(model.covariance) + ent = 10 / 2 * np.log(2 * np.pi * np.exp(1)) + 1 / 2 * logdet + entropy = model.entropy() + assert np.abs(ent - entropy) < 1e-8 + + +def test_entropy_subset(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + idxs = np.ones(10) + idxs[8:] = 0 + ent_sub = model.entropy_subset(idxs) + _, logdet = la.slogdet(model.covariance[:8, :8]) + ent = 8 / 2 * np.log(2 * np.pi * np.exp(1)) + 1 / 2 * logdet + assert np.abs(ent_sub - ent) < 1e-8 + + +def test_mutual_information(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BaseCovariance() + model.covariance = 1 / X.shape[0] * np.dot(X.T, X) + idxs1 = np.ones(10) + idxs2 = np.ones(10) + idxs1[5:] = 0 + idxs2[:5] = 0 + mi = model.mutual_information(idxs1, idxs2) + _, ldet1 = la.slogdet(model.covariance[:5, :5]) + _, ldet2 = la.slogdet(model.covariance[5:, 5:]) + _, ldet = la.slogdet(model.covariance) + mi_est = 0.5 * (ldet1 + ldet2 - ldet) + assert np.abs(mi_est - mi) < 1e-5 diff --git 
a/python/python/bystro/covariance/test/test_base_precision.py b/python/python/bystro/covariance/test/test_base_precision.py new file mode 100644 index 000000000..7df3ad3e9 --- /dev/null +++ b/python/python/bystro/covariance/test/test_base_precision.py @@ -0,0 +1,107 @@ +import numpy as np +import numpy.linalg as la +from bystro.covariance._base_precision import BasePrecision + + +def test_get_stable_rank(): + model = BasePrecision() + model.precision = np.eye(10) + assert model.get_stable_rank() == 10 + + +def test_predict(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.predict(X, idxs) + + +def test_conditional_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.conditional_score_samples(X, idxs) + + +def test_conditional_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.conditional_score(X, idxs) + + +def test_marginal_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.marginal_score(X[:, idxs == 1], idxs) + + +def test_marginal_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.marginal_score(X[:, idxs == 1], idxs, weights=0.5 * np.ones(1000)) + + +def test_score(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + model.score(X, weights=0.5 * np.ones(1000)) + + +def test_score_samples(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + model.score_samples(X) + + +def test_entropy(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + model.entropy() + + +def test_entropy_subset(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs = np.ones(10) + idxs[8:] = 0 + model.entropy_subset(idxs) + + +def test_mutual_information(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(1000, 10)) + model = BasePrecision() + model.precision = la.inv(1 / X.shape[0] * np.dot(X.T, X)) + idxs1 = np.ones(10) + idxs2 = np.ones(10) + idxs1[5:] = 0 + idxs1 = np.ones(10) + idxs2[:5] = 0 + idxs2[9] = 0 + model.mutual_information(idxs1, idxs2) diff --git a/python/python/bystro/covariance/test/test_covariance_np.py b/python/python/bystro/covariance/test/test_covariance_np.py new file mode 100644 index 000000000..45c41f965 --- /dev/null +++ b/python/python/bystro/covariance/test/test_covariance_np.py @@ -0,0 +1,28 @@ +import numpy as np +import numpy.linalg as la +from bystro.covariance._covariance_np import ( + EmpiricalCovariance, + BayesianCovariance, +) + + +def 
test_empirical_covariance(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(100000, 10)) + + model = EmpiricalCovariance() + model.fit(X) + + s_vals = la.svd(model.covariance, compute_uv=False) + assert np.abs(1 - s_vals[0]) <= 5e-2 + + +def test_bayesian_covariance(): + rng = np.random.default_rng(2021) + X = rng.normal(size=(100000, 10)) + + model = BayesianCovariance() + model.fit(X) + + s_vals = la.svd(model.covariance, compute_uv=False) + assert np.abs(1 - s_vals[0]) <= 5e-2 diff --git a/python/python/bystro/proteomics/fragpipe_tandem_mass_tag.py b/python/python/bystro/proteomics/fragpipe_tandem_mass_tag.py new file mode 100644 index 000000000..75954e8b4 --- /dev/null +++ b/python/python/bystro/proteomics/fragpipe_tandem_mass_tag.py @@ -0,0 +1,53 @@ +"""Load and prep fragpipe tandem mass Tag datasets.""" + +from dataclasses import dataclass + +import pandas as pd + +ABUNDANCE_COLS = ["Index", "NumberPSM", "ProteinID", "MaxPepProb", "ReferenceIntensity"] +ANNOTATION_COLS = ["plex", "channel", "sample", "sample_name", "condition", "replicate"] + + +@dataclass(frozen=True) +class TandemMassTagDataset: + """Represent a Fragpipe Tandem Mass Tag dataset.""" + + abundance_df: pd.DataFrame + annotation_df: pd.DataFrame + + +def _check_df_cols(df: pd.DataFrame, expected_cols: list[str]) -> None: + actual_cols = df.columns + if not all(x == y for x, y in zip(expected_cols, actual_cols, strict=False)): + err_msg = ( + f"expected dataframe to begin with cols: {expected_cols}, got cols: {actual_cols} instead." + ) + raise ValueError(err_msg) + + +def _prep_abundance_df(abundance_df: pd.DataFrame) -> pd.DataFrame: + """Prep abundance_df, setting index and normalizing abundances by ReferenceIntensity.""" + _check_df_cols(abundance_df, ABUNDANCE_COLS) + abundance_df = abundance_df.set_index("Index") + first_sample_column_idx = abundance_df.columns.to_list().index("ReferenceIntensity") + 1 + sample_columns = abundance_df.columns[first_sample_column_idx:] + for sample_column in sample_columns: + abundance_df[sample_column] -= abundance_df.ReferenceIntensity + return abundance_df + + +def _prep_annotation_df(annotation_df: pd.DataFrame) -> pd.DataFrame: + """Prep annotation df, setting index.""" + _check_df_cols(annotation_df, ANNOTATION_COLS) + return annotation_df.set_index("sample") + + +def load_tandem_mass_tag_dataset( + abundance_filename: str, annotation_filename: str +) -> TandemMassTagDataset: + """Load and prep Fragpipe tandem mass tag datasets.""" + raw_abundance_df = pd.read_csv(abundance_filename, sep="\t") + raw_annotation_df = pd.read_csv(annotation_filename, sep="\t") + abundance_df = _prep_abundance_df(raw_abundance_df) + annotation_df = _prep_annotation_df(raw_annotation_df) + return TandemMassTagDataset(abundance_df, annotation_df) diff --git a/python/python/bystro/proteomics/tests/test_fragpipe_tandem_mass_tag.py b/python/python/bystro/proteomics/tests/test_fragpipe_tandem_mass_tag.py new file mode 100644 index 000000000..e6f4b2dfd --- /dev/null +++ b/python/python/bystro/proteomics/tests/test_fragpipe_tandem_mass_tag.py @@ -0,0 +1,110 @@ +from io import StringIO +import pytest +import re + +import pandas as pd +from bystro.proteomics.fragpipe_tandem_mass_tag import ( + load_tandem_mass_tag_dataset, + _check_df_cols, + ABUNDANCE_COLS, +) +from pandas.testing import assert_frame_equal + +raw_abundance_df = pd.DataFrame( + { + "Index": {0: "A1BG", 1: "A1CF", 2: "A2M"}, + "NumberPSM": {0: 324, 1: 94, 2: 1418}, + "ProteinID": {0: "P04217", 1: "Q9NQ94", 2: "P01023"}, + 
"MaxPepProb": {0: 1.0, 1: 1.0, 2: 1.0}, + "ReferenceIntensity": {0: 30.0, 1: 26.1, 2: 30.8}, + "CPT0088900003": {0: 30.7, 1: 26.2, 2: 31.7}, + "CPT0079270003": {0: 30.2, 1: 26.3, 2: 30.5}, + "CPT0088920001": {0: 30.0, 1: 26.7, 2: 30.8}, + "CPT0079300001": {0: 30.6, 1: 25.1, 2: 31.6}, + "CPT0088550004": {0: 29.0, 1: 24.4, 2: 30.1}, + } +) + +raw_annotation_df = pd.DataFrame( + { + "plex": {0: 16, 1: 16, 2: 16}, + "channel": {0: "126", 1: "127N", 2: "127C"}, + "sample": {0: "CPT0088900003", 1: "CPT0079270003", 2: "CPT0088920001"}, + "sample_name": {0: "C3N-01179-T", 1: "C3L-00606-T", 2: "C3N-01179-N"}, + "condition": {0: "Tumor", 1: "Tumor", 2: "NAT"}, + "replicate": {0: 1, 1: 1, 2: 1}, + } +) + +expected_abundance_df = pd.DataFrame( + { + "NumberPSM": {"A1BG": 324, "A1CF": 94, "A2M": 1418}, + "ProteinID": {"A1BG": "P04217", "A1CF": "Q9NQ94", "A2M": "P01023"}, + "MaxPepProb": {"A1BG": 1.0, "A1CF": 1.0, "A2M": 1.0}, + "ReferenceIntensity": { + "A1BG": 30.0, + "A1CF": 26.1, + "A2M": 30.8, + }, + "CPT0088900003": { + "A1BG": 0.7, + "A1CF": 0.1, + "A2M": 0.9, + }, + "CPT0079270003": { + "A1BG": 0.2, + "A1CF": 0.2, + "A2M": -0.3, + }, + "CPT0088920001": { + "A1BG": -0.0, + "A1CF": 0.6, + "A2M": 0.0, + }, + "CPT0079300001": { + "A1BG": 0.6, + "A1CF": -1.0, + "A2M": 0.8, + }, + "CPT0088550004": { + "A1BG": -1.0, + "A1CF": -1.7, + "A2M": -0.7, + }, + } +) +expected_abundance_df.index.name = "Index" +expected_annotation_df = pd.DataFrame( + { + "plex": {"CPT0088900003": 16, "CPT0079270003": 16, "CPT0088920001": 16}, + "channel": {"CPT0088900003": "126", "CPT0079270003": "127N", "CPT0088920001": "127C"}, + "sample_name": { + "CPT0088900003": "C3N-01179-T", + "CPT0079270003": "C3L-00606-T", + "CPT0088920001": "C3N-01179-N", + }, + "condition": {"CPT0088900003": "Tumor", "CPT0079270003": "Tumor", "CPT0088920001": "NAT"}, + "replicate": {"CPT0088900003": 1, "CPT0079270003": 1, "CPT0088920001": 1}, + } +) +expected_annotation_df.index.name = "sample" + + +def test_load_tandem_mass_tag_dataset(): + abundance_handle = StringIO(raw_abundance_df.to_csv(index=None, sep="\t")) + annotation_handle = StringIO(raw_annotation_df.to_csv(index=None, sep="\t")) + tandem_mass_tag_dataset = load_tandem_mass_tag_dataset(abundance_handle, annotation_handle) + assert_frame_equal(expected_abundance_df, tandem_mass_tag_dataset.abundance_df) + assert_frame_equal(expected_annotation_df, tandem_mass_tag_dataset.annotation_df) + + +def test__check_df_cols(): + err_msg = re.escape( + r"expected dataframe to begin with cols: " + r"['Index', 'NumberPSM', 'ProteinID', 'MaxPepProb', 'ReferenceIntensity']," + r" got cols: " + r"Index(['plex', 'channel', 'sample', 'sample_name', 'condition', 'replicate']," + r" dtype='object') instead." 
+ ) + with pytest.raises(ValueError, match=err_msg): + _check_df_cols(raw_annotation_df, ABUNDANCE_COLS) diff --git a/python/python/bystro/supervised_ppca/__init__.py b/python/python/bystro/supervised_ppca/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/python/bystro/tada_mixture/__init__.py b/python/python/bystro/tada_mixture/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/python/bystro/tada_mixture/mixture_tada_em.py b/python/python/bystro/tada_mixture/mixture_tada_em.py new file mode 100644 index 000000000..e9ee9883f --- /dev/null +++ b/python/python/bystro/tada_mixture/mixture_tada_em.py @@ -0,0 +1,395 @@ +""" +This uses the expectation maximization algorithm to fit a latent variable +model for TADA, with two flavors, a Zero-Inflated Poisson (ZIP) and a +standard Poisson. The generative formulation is + +p(z) = Dir(alpha_k) +p(x_ij|z=k) = (ZI)Poisson(Lambda_{jk}) + +Obviously the ZIP will have a parameter pi for probability of 0. + +ZIP does not have an analytic maximum likelihood formula. This is not a +problem for EM, the M step only requires that we be able to maximize +the likelihood, not that it has an analytic solution. So we just use +standard gradient descent with automatic differentiation in Torch. This +could be naughty if the objective was multimodal, but I strongly doubt that +this is the case. + +Objects +------- +MVTadaPoissonEM(K=4,training_options={}) + This fits a standard Poisson latent variable model. + +MVTadaZipEM(K=4,training_options={}) + This fits a standard ZIP latent variable model. + +Methods +------- +None + +""" +import numpy as np +import scipy.stats as st # type: ignore +from sklearn.mixture import GaussianMixture # type: ignore +from tqdm import trange # type: ignore + +import torch +from torch import nn +from torch.nn import PoissonNLLLoss + + +class MVTadaPoissonEM: + def __init__(self, K=4, training_options=None): + """ + This is a Product Poisson latent variable model. + + Parameters + ---------- + K : int,default=4 + The number of clusters, i.e. number of different types of + genes + + training_options : dict,default={} + The parameters for the inference scheme + """ + self.K = int(K) + if training_options is None: + training_options = {} + self.training_options = self._fill_training_options(training_options) + + def fit(self, X, progress_bar=True, Lamb_init=None, pi_init=None): + """ + Fits a model given count data X. + + Parameters + ---------- + X : np.array-like,shape=(N,p) + The count data + + progress_bar : bool,default=True + Whether to print a progress bar while fitting + + Lamb_init : np.array-like,shape=(k,p),default=None + Initialize the loadings of the model. Defaults to fitting a + GMM on the data and using those. + + pi_init : np.array-like,shape=(k),default=None + Initialize the class weights of the model. 
+ + Returns + ------- + self + """ + td = self.training_options + K = self.K + + N, p = X.shape + + # Initialize variables + if Lamb_init is None: + model = GaussianMixture(K) + model.fit(X) + Lambda_ = model.means_ + else: + Lambda_ = Lamb_init + + rng = np.random.default_rng(2021) + pi_ = rng.dirichlet(np.ones(K)) if pi_init is None else pi_init + + myrange = trange if progress_bar else range + + for i in myrange(td["n_iterations"]): + # E step compute most likely categories + z_probs = np.zeros((N, K)) + for j in range(K): + log_pmf = st.poisson.logpmf(X, Lambda_[j]) + log_pmf_sum = np.sum(log_pmf, axis=1) + z_probs[:, j] = log_pmf_sum + np.log(pi_[j]) + Ez = np.argmax(z_probs, axis=1) + + # M step, maximize parameters + for j in range(K): + pi_[j] = np.mean(Ez == j) + Lambda_[j] = np.mean(X[Ez == j], axis=0) + + eps = 0.001 + pi_ = pi_ * (1 - K * eps) + np.ones(K) * eps + + self.Lambda = Lambda_ + self.pi = pi_ + return self + + def predict(self, data): + """ + This predicts the latent cluster assignments given a fitted model + + Parameters + ---------- + data : np.array-like,shape=(N_variants,n_phenotypes) + The data of counts, variants x phenotypes + + Returns + ------- + z_hat : np.array-like,shape=(N_samples,) + The cluster identities + """ + log_proba = self.predict_logproba(data) + z_hat = np.argmax(log_proba, axis=1) + return z_hat + + def predict_logproba(self, data): + """ + Predicts the log probability a datapoint being assigned a specific + cluster. + + Parameters + ---------- + data : np.array-like,shape=(N_variants,n_phenotypes) + The data of counts, variants x phenotypes + + Returns + ------- + z_hat : np.array-like,shape=(N_samples,) + The cluster identities + """ + N = data.shape[0] + log_proba = np.zeros((N, self.K)) + for i in range(N): + for k in range(self.K): + log_proba[i, k] = np.sum( + st.poisson.logpmf(data[i], self.Lambda[k]) + ) + return log_proba + + def _fill_training_options(self, training_options): + """ + This fills any relevant parameters for the learning algorithm + + Parameters + ---------- + training_options : dict + + Returns + ------- + tops : dict + """ + default_options = {"n_iterations": 100} + tops = {**default_options, **training_options} + return tops + + +class MVTadaZipEM: + def __init__(self, K=4, training_options=None): + """ + This is a Product ZIP latent variable model. + + Parameters + ---------- + K : int,default=4 + The number of clusters, i.e. number of different types of + genes + + training_options : dict,default={} + The parameters for the inference scheme + """ + self.K = int(K) + if training_options is None: + training_options = {} + self.training_options = self._fill_training_options(training_options) + + def fit(self, X, progress_bar=True): + """ + Fits a model given count data X. 
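As a usage illustration of the Poisson EM model defined above (simulated counts, not data from the package):

import numpy as np

from bystro.tada_mixture.mixture_tada_em import MVTadaPoissonEM

rng = np.random.default_rng(0)
# two synthetic clusters of counts over three phenotypes
X = np.vstack(
    [rng.poisson(2.0, size=(200, 3)), rng.poisson(8.0, size=(200, 3))]
)
model = MVTadaPoissonEM(K=2, training_options={"n_iterations": 50})
model.fit(X, progress_bar=False)
z_hat = model.predict(X)  # cluster assignment per variant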
+ + Parameters + ---------- + X : np.array-like,shape=(N,p) + The count data + Returns + ------- + self + """ + td = self.training_options + K = self.K + + N, p = X.shape + + model = GaussianMixture(K) + model.fit(X) + Lambda_list_l = [] + for i in range(K): + Lambda_list_l.append( + torch.tensor( + 1.2 * model.means_[i].astype(np.float32) + 0.1, + requires_grad=True, + ) + ) + + myrange = trange if progress_bar else range + + pct_0 = np.mean(X == 0, axis=0) + pct_0 = np.clip(pct_0, 0.01, 0.99) + pct_0_inv = np.log(pct_0 / (1 - pct_0)) + + alpha_ls = [ + torch.tensor(pct_0_inv, requires_grad=True) for i in range(K) + ] + + sigmoid = nn.Sigmoid() + + data = torch.tensor(X.astype(np.float32)) + pi_ = 1 / K * np.ones(K) + myloss = PoissonNLLLoss(log_input=False, full=True, reduction="none") + + for i in myrange(td["n_iterations"]): + # E step compute most likely categories + z_probs = np.zeros((N, K)) + for k in range(K): + Lambda_k = Lambda_list_l[k] + alpha_k = sigmoid(alpha_ls[k]) # Prob of being 0 + + log_likelihood_poisson = -1 * myloss(Lambda_k, data) + log_likelihood_poisson_w = log_likelihood_poisson + torch.log( + 1 - alpha_k + ) + + log_likelihood_0 = torch.log( + alpha_k + (1 - alpha_k) * torch.exp(-1 * Lambda_k) + ) + log_likelihood_0_b = torch.broadcast_to( + log_likelihood_0, data.shape + ) + + log_likelihood_point = torch.where( + data != 0, log_likelihood_poisson_w, log_likelihood_0_b + ) + + log_marginal = torch.sum(log_likelihood_point, axis=1) + + z_probs_k = log_marginal + np.log(pi_[k]) + z_probs[:, k] = z_probs_k.detach().numpy() + + Ez = np.argmax(z_probs, axis=1) + + # M step, maximize parameters + for k in range(K): + pi_[k] = np.mean(Ez == k) + data_sub = data[Ez == k] # Selects relevant data + + trainable_variables = [Lambda_list_l[k], alpha_ls[k]] + + optimizer = torch.optim.SGD( + trainable_variables, lr=td["learning_rate"], momentum=0.8 + ) + + for j in range(td["n_inner_iterations"]): + Lambda_k = Lambda_list_l[k] + alpha_k = sigmoid(alpha_ls[k]) + + log_likelihood_poisson = -1 * myloss(Lambda_k, data_sub) + log_likelihood_poisson_w = ( + log_likelihood_poisson + torch.log(1 - alpha_k) + ) + + log_likelihood_0 = torch.log( + alpha_k + (1 - alpha_k) * torch.exp(-1 * Lambda_k) + ) + log_likelihood_0_b = torch.broadcast_to( + log_likelihood_0, data_sub.shape + ) + + log_likelihood_point = torch.where( + data_sub != 0, + log_likelihood_poisson_w, + log_likelihood_0_b, + ) + + log_marginal = torch.sum(log_likelihood_point, axis=1) + loss = -1 * torch.mean(log_marginal) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + Lambda_k.requires_grad_(False) + Lambda_k[Lambda_k < 0] = 0 + Lambda_k.requires_grad_(True) + + self.pi = pi_ + self.Lambda = -1000 * np.ones((K, p)) + self.Alpha = -1000 * np.ones((K, p)) + for k in range(self.K): + Lambda_k = Lambda_list_l[k] + alpha_k = sigmoid(alpha_ls[k]) + + self.Lambda[k] = Lambda_k.detach().numpy() + self.Alpha[k] = alpha_k.detach().numpy() + + return self + + def predict(self, data): + """ + This predicts the latent cluster assignments given a fitted model + + Parameters + ---------- + data : np.array-like,shape=(N_variants,n_phenotypes) + The data of counts, variants x phenotypes + + Returns + ------- + z_hat : np.array-like,shape=(N_samples,) + The cluster identities + """ + log_proba = self.predict_logproba(data) + z_hat = np.argmax(log_proba, axis=1) + return z_hat + + def predict_logproba(self, data): + """ + Predicts the log probability a datapoint being assigned a specific + cluster. 
+
+        Parameters
+        ----------
+        data : np.array-like,shape=(N_variants,n_phenotypes)
+            The data of counts, variants x phenotypes
+
+        Returns
+        -------
+        log_proba : np.array-like,shape=(N_samples,K)
+            The log likelihood of each sample under each cluster
+        """
+        N = data.shape[0]
+        log_proba = np.zeros((N, self.K))
+        for i in range(N):
+            for k in range(self.K):
+                log_prob_poisson = st.poisson.logpmf(data[i], self.Lambda[k])
+                log_prob_0 = np.log(
+                    self.Alpha[k]
+                    + (1 - self.Alpha[k]) * np.exp(-1 * self.Lambda[k])
+                )
+                probs = log_prob_poisson.copy()
+                probs[data[i] == 0] = log_prob_0[data[i] == 0]
+                log_proba[i, k] = np.sum(probs)
+        return log_proba
+
+    def _fill_training_options(self, training_options):
+        """
+        This fills any relevant parameters for the learning algorithm
+
+        Parameters
+        ----------
+        training_options : dict
+
+        Returns
+        -------
+        tops : dict
+        """
+        default_options = {
+            "n_iterations": 2000,
+            "n_inner_iterations": 50,
+            "learning_rate": 5e-4,
+        }
+        tops = {**default_options, **training_options}
+        return tops
diff --git a/python/python/bystro/tada_mixture/mixture_tada_marginal.py b/python/python/bystro/tada_mixture/mixture_tada_marginal.py
new file mode 100644
index 000000000..1643cd0af
--- /dev/null
+++ b/python/python/bystro/tada_mixture/mixture_tada_marginal.py
@@ -0,0 +1,220 @@
+"""
+This maximizes the marginal likelihood of a mixture model on count data;
+for now only a Poisson observation model is implemented. The generative
+model is
+
+pi ~ Dirichlet(alpha)
+p(z_i = k) = pi_k
+p(x_ij | z_i = k) = Poisson(Lambda_{kj})
+
+so the marginal log likelihood of a single sample is
+
+log p(x_i) = log(\sum_k pi_k \prod_j p(x_ij | z_i = k)).
+
+There is no closed-form maximizer, but the marginal log likelihood can be
+computed stably with the logsumexp trick and maximized directly with
+stochastic gradient descent, which is what this module does.
+
+Objects
+-------
+MVTadaPoissonML(K=4,training_options={})
+    This fits a standard Poisson latent variable model.
+
+Methods
+-------
+None
+
+"""
+import numpy as np
+import torch
+from tqdm import trange  # type: ignore
+from torch import nn
+from torch.distributions import Dirichlet
+import scipy.stats as st  # type: ignore
+from sklearn.mixture import GaussianMixture  # type: ignore
+from torch.nn import PoissonNLLLoss
+
+
+class MVTadaPoissonML:
+    def __init__(self, K=4, training_options=None):
+        """
+        This is a product Poisson latent variable model.
+
+        Parameters
+        ----------
+        K : int,default=4
+            The number of clusters, i.e. number of different types of
+            genes
+
+        training_options : dict,default={}
+            The parameters for the inference scheme
+        """
+        self.K = int(K)
+        if training_options is None:
+            training_options = {}
+        self.training_options = self._fill_training_options(training_options)
+
+    def fit(self, data, progress_bar=True, Lamb_init=None, pi_init=None):
+        """
+        Fits a model given count data.
+
+        Parameters
+        ----------
+        data : np.array-like,shape=(N,p)
+            The count data
+
+        progress_bar : bool,default=True
+            Whether to print a progress bar while fitting
+
+        Lamb_init : np.array-like,shape=(k,p),default=None
+            Initialize the loadings of the model. Defaults to fitting a
+            GMM on the data and using those.
+
+        pi_init : np.array-like,shape=(k),default=None
+            Initialize the class weights of the model.
+
+        Returns
+        -------
+        self
+
+        """
+        td = self.training_options
+        K = self.K
+
+        N, p = data.shape
+        X = torch.tensor(data)
+        rng = np.random.default_rng(2021)
+
+        # Initialize variables
+        if Lamb_init is None:
+            model = GaussianMixture(K)
+            model.fit(data)
+            Lamb_init = model.means_.astype(np.float32)
+        if pi_init is None:
+            pi_init = rng.dirichlet(np.ones(K)).astype(np.float32)
+        lambda_latent = torch.tensor(Lamb_init, requires_grad=True)
+        pi_logits = torch.tensor(pi_init, requires_grad=True)
+
+        trainable_variables = [lambda_latent, pi_logits]
+
+        m_d = Dirichlet(torch.tensor(np.ones(self.K) * self.K))
+
+        optimizer = torch.optim.SGD(
+            trainable_variables, lr=td["learning_rate"], momentum=td["momentum"]
+        )
+
+        nll = PoissonNLLLoss(full=True, log_input=False, reduction="none")
+        mse = nn.MSELoss()
+        smax = nn.Softmax(dim=0)
+        softplus = nn.Softplus()
+
+        myrange = trange if progress_bar else range
+
+        self.losses_likelihoods = np.zeros(td["n_iterations"])
+
+        for i in myrange(td["n_iterations"]):
+            idx = rng.choice(N, td["batch_size"], replace=False)
+            X_batch = X[idx]  # Minibatch of shape (batch_size, p)
+
+            Lambda_ = softplus(lambda_latent)  # Keep the rates positive
+            # Smooth the mixture weights slightly toward uniform so that
+            # log(pi_) stays finite
+            mu = 0.01
+            pi_ = smax(pi_logits) * (1 - mu) + mu / K
+
+            loss_logits = 0.001 * mse(pi_logits, torch.zeros(K))
+            loss_prior_pi = -1.0 * m_d.log_prob(pi_) / N
+
+            # PoissonNLLLoss with log_input=False expects the rate as its
+            # first argument and the observed counts as its second
+            loglikelihood_each = [
+                -1 * nll(Lambda_[k], X_batch) for k in range(K)
+            ]  # List of K N x p log likelihoods
+
+            loglikelihood_sum = [
+                torch.sum(mat, axis=1) for mat in loglikelihood_each
+            ]  # List of K length-N log likelihoods
+
+            loglikelihood_stack = torch.stack(loglikelihood_sum)
+            # K x N matrix of per-cluster log likelihoods
+            loglikelihood_components = torch.transpose(
+                loglikelihood_stack, 0, 1
+            ) + torch.log(
+                pi_
+            )  # N x K matrix of unnormalized log posteriors
+            loglikelihood_marg = torch.logsumexp(
+                loglikelihood_components, dim=1
+            )  # Length-N vector of log marginal likelihoods
+            loss_likelihood = -1 * torch.mean(loglikelihood_marg)
+            # Average negative marginal log likelihood
+
+            loss = loss_logits + loss_prior_pi + loss_likelihood
+            self.losses_likelihoods[i] = loss_likelihood.detach().numpy()
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+        self.Lambda = Lambda_.detach().numpy()
+        self.pi = pi_.detach().numpy()
+        return self
+
+    def predict(self, data):
+        """
+        This predicts the latent cluster assignments given a fitted model
+
+        Parameters
+        ----------
+        data : np.array-like,shape=(N_variants,n_phenotypes)
+            The data of counts, variants x phenotypes
+
+        Returns
+        -------
+        z_hat : np.array-like,shape=(N_samples,)
+            The cluster identities
+        """
+        log_proba = self.predict_logproba(data)
+        z_hat = np.argmax(log_proba, axis=1)
+        return z_hat
+
+    def predict_logproba(self, data):
+        """
+        Predicts the log probability of a datapoint being assigned to each
+        cluster.
+ + Parameters + ---------- + data : np.array-like,shape=(N_variants,n_phenotypes) + The data of counts, variants x phenotypes + + Returns + ------- + z_hat : np.array-like,shape=(N_samples,) + The cluster identities + """ + N = data.shape[0] + log_proba = np.zeros((N, self.K)) + for i in range(N): + for k in range(self.K): + log_proba[i, k] = np.sum( + st.poisson.logpmf(data[i], self.Lambda[k]) + ) + return log_proba + + def _fill_training_options(self, training_options): + """ + This fills any relevant parameters for the learning algorithm + + Parameters + ---------- + training_options : dict + + Returns + ------- + tops : dict + """ + default_options = { + "n_iterations": 30000, + "batch_size": 200, + "learning_rate": 5e-5, + "momentum": 0.99, + } + tops = {**default_options, **training_options} + return tops diff --git a/python/python/bystro/tada_mixture/tests/__init__.py b/python/python/bystro/tada_mixture/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/python/bystro/tada_mixture/tests/test_em.py b/python/python/bystro/tada_mixture/tests/test_em.py new file mode 100644 index 000000000..7c3fc03cc --- /dev/null +++ b/python/python/bystro/tada_mixture/tests/test_em.py @@ -0,0 +1,33 @@ +import numpy as np +from ..mixture_tada_em import MVTadaPoissonEM, MVTadaZipEM + + +def test_fit(): + N = 1000 + p = 10 + K = 2 + + rng = np.random.default_rng(2021) + + Z_true = rng.integers(0, high=K, size=N) + Lambda = rng.integers(0, high=50, size=(K, p)) + Alphas = 0.2 * np.ones((K, p)) + + Lambda = Lambda * 1.0 + Lambda[0, int(p / 2) :] = 0 + Lambda[1, : int(p / 2)] = 0 + for k in range(K): + Lambda[k] = Lambda[k] / np.mean(Lambda[k]) * 10 + + X = np.zeros((N, p)) + for i in range(N): + idx = int(Z_true[i]) + X[i] = rng.poisson(Lambda[idx]) + zerod_out = rng.binomial(1, Alphas[idx]) + X[i, zerod_out == 1] = 0 + + model = MVTadaZipEM(K=K) + model.fit(X) + + model = MVTadaPoissonEM(K=K) + model.fit(X) diff --git a/python/python/bystro/tada_mixture/tests/test_marginal.py b/python/python/bystro/tada_mixture/tests/test_marginal.py new file mode 100644 index 000000000..f9b224434 --- /dev/null +++ b/python/python/bystro/tada_mixture/tests/test_marginal.py @@ -0,0 +1,27 @@ +import numpy as np +from ..mixture_tada_marginal import MVTadaPoissonML + + +def test_fit(): + N = 1000 + p = 10 + K = 2 + + rng = np.random.default_rng(2021) + + Z_true = rng.integers(0, high=K, size=N) + Lambda = rng.integers(0, high=50, size=(K, p)) + + Lambda = Lambda * 1.0 + Lambda[0, int(p / 2) :] = 0 + Lambda[1, : int(p / 2)] = 0 + for k in range(K): + Lambda[k] = Lambda[k] / np.mean(Lambda[k]) * 10 + + X = np.zeros((N, p)) + for i in range(N): + idx = int(Z_true[i]) + X[i] = rng.poisson(Lambda[idx]) + + model = MVTadaPoissonML(K=K) + model.fit(X) diff --git a/python/python/bystro/tadarrr/tests/__init__.py b/python/python/bystro/tadarrr/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/requirements.txt b/python/requirements.txt index e1bdee576..1bf3829ba 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -14,3 +14,7 @@ scikit-allel==1.3.6 scikit-learn==1.2.2 skops==0.7.post0 tqdm==4.65.0 +cvxpy==1.2.1 +cloudpickle==2.0.* +tensorflow==2.11.0 +torch==2.0.1
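
For orientation, here is a minimal end-to-end sketch of how the three mixture models added in this diff are meant to be used, mirroring the simulation in tests/test_em.py. It assumes MVTadaPoissonEM's constructor takes (K, training_options) like the other two classes, and the iteration count shown is illustrative only, not a recommended setting.

import numpy as np

from bystro.tada_mixture.mixture_tada_em import MVTadaPoissonEM, MVTadaZipEM
from bystro.tada_mixture.mixture_tada_marginal import MVTadaPoissonML

rng = np.random.default_rng(2021)
N, p, K = 1000, 10, 2

# Simulate counts from two well-separated Poisson clusters, as in the tests
Z_true = rng.integers(0, high=K, size=N)
Lambda = rng.integers(0, high=50, size=(K, p)) * 1.0
Lambda[0, p // 2:] = 0
Lambda[1, :p // 2] = 0
for k in range(K):
    Lambda[k] = Lambda[k] / np.mean(Lambda[k]) * 10
X = np.array([rng.poisson(Lambda[int(z)]) for z in Z_true], dtype=float)

# Fit each model and recover hard cluster assignments
for Model in (MVTadaPoissonEM, MVTadaZipEM, MVTadaPoissonML):
    model = Model(K=K, training_options={"n_iterations": 100})
    model.fit(X)
    z_hat = model.predict(X)
    print(Model.__name__, np.bincount(z_hat, minlength=K))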
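
The module docstring of mixture_tada_marginal.py leans on the logsumexp trick to evaluate log p(x_i) = log sum_k pi_k p(x_i|z=k). A small self-contained sketch shows why the naive evaluation fails while logsumexp (the same computation torch.logsumexp performs in the fit loop) stays finite; the numbers are made up for illustration.

import numpy as np
from scipy.special import logsumexp

# Per-cluster values of log(pi_k) + log p(x_i | z=k); once many phenotypes
# are summed per sample, magnitudes like these are routine
log_joint = np.array([-800.0, -805.0])

naive = np.log(np.sum(np.exp(log_joint)))  # exp() underflows, giving log(0) = -inf
stable = logsumexp(log_joint)              # max + log(sum(exp(log_joint - max))), about -799.99

print(naive, stable)  # -inf versus a finite, correct value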
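
The zero-inflated Poisson likelihood used by MVTadaZipEM (in both the torch fit loop and predict_logproba) treats a zero count as coming either from the zero-inflation component or from the Poisson itself. Below is a standalone NumPy sketch of that per-element log likelihood; the function and variable names are illustrative, not attributes of the class.

import numpy as np
import scipy.stats as st


def zip_loglike(x, lam, alpha):
    """Elementwise log likelihood of a zero-inflated Poisson.

    x     : counts, shape (p,)
    lam   : Poisson rates, shape (p,)
    alpha : probability of a structural zero, shape (p,)
    """
    # Zero counts: either a structural zero (alpha) or a Poisson zero
    log_zero = np.log(alpha + (1 - alpha) * np.exp(-lam))
    # Nonzero counts: must come from the Poisson branch
    log_pois = np.log(1 - alpha) + st.poisson.logpmf(x, lam)
    return np.where(x == 0, log_zero, log_pois)


x = np.array([0, 0, 3, 7])
lam = np.array([5.0, 0.5, 4.0, 6.0])
alpha = np.full(4, 0.2)
print(zip_loglike(x, lam, alpha).sum())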
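
Both training loops rely on torch.nn.PoissonNLLLoss with log_input=False, whose first argument is the Poisson rate and whose second is the observed counts. A quick sanity check against scipy.stats.poisson follows; note that full=True uses a Stirling approximation of log(x!), so agreement is close rather than exact.

import torch
from torch.nn import PoissonNLLLoss
import scipy.stats as st

rate = torch.tensor([2.0, 7.5, 11.0])    # Poisson rates (the "input")
counts = torch.tensor([1.0, 9.0, 10.0])  # observed counts (the "target")

nll = PoissonNLLLoss(log_input=False, full=True, reduction="none")
torch_nll = nll(rate, counts)

scipy_nll = -st.poisson.logpmf(counts.numpy(), rate.numpy())
print(torch_nll.numpy())
print(scipy_nll)  # should agree up to the Stirling approximation of log(x!)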