Merge branch 'master' of github.com:bystrogenomics/bystro into feature/api
akotlar committed Sep 6, 2023
2 parents 75e5961 + 162775f commit 622917a
Showing 24 changed files with 2,309 additions and 3 deletions.
130 changes: 130 additions & 0 deletions python/python/bystro/_template_npr.py
@@ -0,0 +1,130 @@
"""
This provides a basic template for any model that uses numpyro as an
inference method. It has several methods that should be filled by any
object extending the template, namely
fit
_fill_hp_options
These are the methods for running the samples given data and providing
hyperameter selections respectively.
Objects
-------
_BaseNPRModel(object)
This is the template Numpyro model
"""
import abc
import cloudpickle # type: ignore
import numpyro # type: ignore


class _BaseNumpyroModel:
    def __init__(self, mcmc_options=None, hp_options=None):
        """
        Parameters
        ----------
        mcmc_options : dict | None
            Options for the MCMC sampler; missing keys are filled in
            by _fill_mcmc_options.
        hp_options : dict | None
            Model hyperparameters; missing keys are filled in by
            _fill_hp_options.
        """
        if mcmc_options is None:
            mcmc_options = {}
        if hp_options is None:
            hp_options = {}
        self.mcmc_options = self._fill_mcmc_options(mcmc_options)
        self.hp_options = self._fill_hp_options(hp_options)
        self.samples = None
        # Expected to be set by fit() in subclasses; used by render_model()
        self._model = None
        self.model_kwargs = None

    @abc.abstractmethod
    def fit(self, *args):
        """
        Run MCMC sampling given the data.

        Parameters
        ----------
        *args
            The data and any model-specific arguments.

        Returns
        -------
        The fitted model object.
        """

    def render_model(self):
        """
        This provides a graphical representation of the model.

        Returns
        -------
        The numpyro rendering of the graphical model.
        """
        assert self._model is not None, "Fit model first"
        return numpyro.render_model(self._model, model_kwargs=self.model_kwargs)

    def pickle(self, path):
        """
        This saves samples from a fit model.

        Parameters
        ----------
        path : str
            The path to save the samples to.
        """
        assert self.samples is not None, "Fit model first"
        with open(path, "wb") as f:
            cloudpickle.dump(self.samples, f)

    def unpickle(self, path):
        """
        This loads samples from a previously saved model.

        Parameters
        ----------
        path : str
            The path to load the samples from.
        """
        with open(path, "rb") as f:
            self.samples = cloudpickle.load(f)

    def _fill_mcmc_options(self, mcmc_options):
        """
        This fills in default MCMC options of the sampler. Subclasses
        may override these defaults, but they are common/basic enough
        to leave as an implemented method.

        Parameters
        ----------
        mcmc_options : dict
            User-provided options, which take precedence over defaults.

        Returns
        -------
        mopts : dict
            The full set of MCMC options.
        """
        default_options = {
            "num_chains": 1,
            "num_warmup": 500,
            "num_samples": 2000,
        }
        mopts = {**default_options, **mcmc_options}
        return mopts

    @abc.abstractmethod
    def _fill_hp_options(self, hp_options):
        """
        This fills in default hyperparameters of the model. Since these
        are not shared between models, we leave this as an abstract
        method to be implemented per model.

        Parameters
        ----------
        hp_options : dict
            User-provided hyperparameters, which take precedence over
            defaults.

        Returns
        -------
        dict
            The full set of hyperparameters.
        """
132 changes: 132 additions & 0 deletions python/python/bystro/_template_sgd_np.py
@@ -0,0 +1,132 @@
"""
This implements a base class for any model using stochastic gradient
descent-based techniques for inference.
Objects
-------
_BaseSGDModel(object):
Methods
-------
None
"""
import abc
import cloudpickle # type: ignore


class _BaseSGDModel:
    def __init__(self, training_options=None):
        """
        The base class of a model relying on stochastic gradient
        descent for inference.

        Parameters
        ----------
        training_options : dict | None
            Options for training; missing keys are filled in by
            _fill_training_options.
        """
        if training_options is None:
            training_options = {}
        self.training_options = self._fill_training_options(training_options)

    @abc.abstractmethod
    def fit(self, *args, **kwargs):
        """
        Method for fitting.

        Parameters
        ----------
        *args
            Positional arguments
        **kwargs
            Keyword arguments
        """

    def pickle(self, path):
        """
        Method for saving the model.

        Parameters
        ----------
        path : str
            The path to save the model to.
        """
        mydict = {"model": self}
        with open(path, "wb") as f:
            cloudpickle.dump(mydict, f)

    @abc.abstractmethod
    def unpickle(self, path):
        """
        Method for loading the model.

        Parameters
        ----------
        path : str
            The path to load the model from.
        """

    def _fill_training_options(self, training_options):
        """
        This fills in any relevant parameters for the learning algorithm.

        Parameters
        ----------
        training_options : dict
            User-provided options, which take precedence over defaults.

        Returns
        -------
        training_opts : dict
            The full set of training options.
        """
        default_options = {"n_iterations": 5000}
        training_opts = {**default_options, **training_options}
        return training_opts

    @abc.abstractmethod
    def _save_variables(self, training_variables):
        """
        This saves the final parameter values after training.

        Parameters
        ----------
        training_variables : list
            The variables trained.
        """
        raise NotImplementedError("_save_variables")

    @abc.abstractmethod
    def _initialize_save_losses(self):
        """
        This method initializes the arrays to track relevant variables
        during training.
        """
        raise NotImplementedError("_initialize_save_losses")

    @abc.abstractmethod
    def _save_losses(self, *args):
        """
        This saves the respective losses at each iteration.

        Parameters
        ----------
        *args
            The losses to record at the current iteration.
        """
        raise NotImplementedError("_save_losses")

    @abc.abstractmethod
    def _test_inputs(self, *args):
        """
        This performs error checking on the inputs of fit.

        Parameters
        ----------
        *args
            The inputs to validate.
        """
        raise NotImplementedError("_test_inputs")

    @abc.abstractmethod
    def _transform_training_data(self, *args):
        """
        This converts training data to an adequate format.

        Parameters
        ----------
        *args
            The raw training data.
        """
        raise NotImplementedError("_transform_training_data")
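
As a concrete illustration, a minimal sketch of a subclass that fits ordinary least squares by SGD with NumPy; the class name, the learning_rate option, and the w_ and losses_ attributes are illustrative assumptions, not part of this commit:

import numpy as np


class LeastSquaresSGD(_BaseSGDModel):
    """Illustrative subclass: linear least squares fit by SGD."""

    def fit(self, X, y):
        X, y = self._transform_training_data(X, y)
        self._test_inputs(X, y)
        self._initialize_save_losses()
        rng = np.random.default_rng(0)
        w = np.zeros(X.shape[1])
        lr = self.training_options["learning_rate"]
        for i in range(self.training_options["n_iterations"]):
            idx = rng.integers(0, X.shape[0])
            # Gradient of the squared error on a single random sample
            grad = (X[idx] @ w - y[idx]) * X[idx]
            w -= lr * grad
            self._save_losses(i, float(np.mean((X @ w - y) ** 2)))
        self._save_variables([w])
        return self

    def _fill_training_options(self, training_options):
        default_options = {"n_iterations": 5000, "learning_rate": 1e-3}
        return {**default_options, **training_options}

    def _save_variables(self, training_variables):
        self.w_ = training_variables[0]

    def _initialize_save_losses(self):
        self.losses_ = np.zeros(self.training_options["n_iterations"])

    def _save_losses(self, i, loss):
        self.losses_[i] = loss

    def _test_inputs(self, X, y):
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y have mismatched lengths")

    def _transform_training_data(self, X, y):
        return np.asarray(X, dtype=float), np.asarray(y, dtype=float)

    def unpickle(self, path):
        with open(path, "rb") as f:
            self.__dict__.update(cloudpickle.load(f)["model"].__dict__)

The inherited pickle() works unchanged; unpickle() here simply restores the saved model's attributes.
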
68 changes: 67 additions & 1 deletion python/python/bystro/ancestry/tests/test_train.py
@@ -13,7 +13,9 @@
    superpop_predictions_from_pop_probs,
    superpop_probs_from_pop_probs,
    convert_1kgp_vcf_to_dosage,
    process_vcf_for_pc_transformation,
    restrict_loadings_variants_to_vcf,
    apply_pca_transform,
)
from bystro.vcf_utils.simulate_random_vcf import (
    generate_simulated_vcf,
@@ -286,3 +288,67 @@ def test_1kgp_vcf_to_dosage():
    for column in processed_vcf.columns:
        for value in processed_vcf[column]:
            assert value in valid_values


def test_restrict_loadings_variants_to_vcf():
    """Tests restriction of gnomad loadings variants to vcf variants."""
    num_samples = 10
    num_vars = 10
    simulated_vcf = generate_simulated_vcf(num_samples, num_vars)
    sim_vcf_df = convert_sim_vcf_to_df(simulated_vcf)
    loaded_vcf = convert_1kgp_vcf_to_dosage(sim_vcf_df)
    processed_sim_vcf = process_vcf_for_pc_transformation(loaded_vcf)
    # Using the same variant indices, generate sim loadings
    num_pcs = 30
    sim_pcs = np.random.random((num_vars, num_pcs))
    sim_loadings = pd.DataFrame(
        data=sim_pcs, index=processed_sim_vcf.columns, columns=[f"PC{i+1}" for i in range(num_pcs)]
    )
    # Run restrict_loadings with sim data
    pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf(
        sim_loadings, processed_sim_vcf
    )
    # Check for expected columns/indices
    expected_columns_loadings = [f"PC{i+1}" for i in range(num_pcs)]
    assert set(expected_columns_loadings) == set(pc_loadings_overlap.columns)
    expected_index_genos_transpose = ["SampleID1", "SampleID2"]
    for sample in expected_index_genos_transpose:
        assert sample in genos_overlap_transpose.index
    # Check that the variant IDs match up to sorting
    assert set(genos_overlap_transpose.columns) == set(pc_loadings_overlap.index)


def test_apply_pca_transform():
    # Prepare test data same as before
    num_samples = 10
    num_vars = 10
    simulated_vcf = generate_simulated_vcf(num_samples, num_vars)
    sim_vcf_df = convert_sim_vcf_to_df(simulated_vcf)
    loaded_vcf = convert_1kgp_vcf_to_dosage(sim_vcf_df)
    processed_sim_vcf = process_vcf_for_pc_transformation(loaded_vcf)
    # Using the same variant indices, generate sim loadings
    num_pcs = 30
    sim_pcs = np.random.random((num_vars, num_pcs))
    sim_loadings = pd.DataFrame(
        data=sim_pcs, index=processed_sim_vcf.columns, columns=[f"PC{i+1}" for i in range(num_pcs)]
    )
    pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf(
        sim_loadings, processed_sim_vcf
    )
    # Call function to test
    transformed_data = apply_pca_transform(pc_loadings_overlap, genos_overlap_transpose)
    # Check for correct data type and columns
    assert isinstance(transformed_data, pd.DataFrame)
    expected_columns = [f"PC{i}" for i in range(1, 31)]
    assert transformed_data.columns.tolist() == expected_columns
    # Check index of transformed_data matches genos_overlap_transpose.index
    assert transformed_data.index.equals(genos_overlap_transpose.index)
    # Check shape of transformed_data matches expected shape
    expected_shape = (genos_overlap_transpose.shape[0], num_pcs)
    assert transformed_data.shape == expected_shape
    # Check all values in transformed_data are numeric
    assert transformed_data.dtypes.apply(pd.api.types.is_numeric_dtype).all()
    # Check that transformed_data does not contain any NaN or missing values
    assert not transformed_data.isna().to_numpy().any()
36 changes: 34 additions & 2 deletions python/python/bystro/ancestry/train.py
@@ -513,13 +513,16 @@ def convert_1kgp_vcf_to_dosage(vcf_with_header: pd.DataFrame) -> pd.DataFrame:


def process_vcf_for_pc_transformation(dosage_vcf: pd.DataFrame) -> pd.DataFrame:
    """Process dosage_vcf and transpose so that it only includes
    genotypes in the correct configuration for analysis."""
    genos = dosage_vcf.iloc[:, len(HEADER_COLS) :]
    genos = genos.set_index(dosage_vcf.index)
    genos = genos.sort_index()
    # Check that not all genotypes are the same for QC
    assert len(set(genos.to_numpy().flatten())) > 1, "All genotypes are the same"
    # Transpose genos for analysis
    genos_transpose = genos.T
    return genos_transpose


def _load_pca_loadings() -> pd.DataFrame:
@@ -560,6 +563,35 @@ def get_variant(x):
    return pc_loadings


def restrict_loadings_variants_to_vcf(
    pc_loadings: pd.DataFrame, genos_transpose: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Restrict the variant list to the overlap between gnomad loadings and the
    transposed reference vcf, and return versions containing only the
    overlapping variants."""
    # IGSR version of 1kgp is the current reference vcf
    var_overlap = pc_loadings.index.intersection(genos_transpose.columns)
    pc_loadings_overlap = pc_loadings[pc_loadings.index.isin(var_overlap)]
    # Select the overlapping variant columns (not rows) of the transposed vcf
    genos_overlap_transpose = genos_transpose.loc[:, genos_transpose.columns.isin(var_overlap)]
    # Ensure that genos_transpose and pc_loadings have corresponding shape and vars
    assert genos_overlap_transpose.shape[1] == pc_loadings_overlap.shape[0]
    assert set(genos_overlap_transpose.columns) == set(pc_loadings_overlap.index)
    assert (genos_overlap_transpose.index == genos_transpose.index).all()
    # Record the amount of overlap
    num_var_overlap = len(var_overlap)
    logger.info("Number of overlapping variants: %d", num_var_overlap)
    return pc_loadings_overlap, genos_overlap_transpose


def apply_pca_transform(
    pc_loadings_overlap: pd.DataFrame,
    genos_overlap_transpose: pd.DataFrame,
) -> pd.DataFrame:
    """Transform a vcf with genotypes in dosage format using PC loadings from
    the gnomad PCA."""
    # Dot product: (samples x variants) @ (variants x PCs) -> (samples x PCs)
    transformed_data = genos_overlap_transpose @ pc_loadings_overlap
    return transformed_data
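
Taken together, a short usage sketch of the new helpers, assuming a dosage-format DataFrame dosage_vcf is already in hand (variable names are illustrative):

# Hypothetical end-to-end use of the new helpers
genos_transpose = process_vcf_for_pc_transformation(dosage_vcf)
loadings = _load_pca_loadings()
pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf(
    loadings, genos_transpose
)
pcs = apply_pca_transform(pc_loadings_overlap, genos_overlap_transpose)
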


def _perform_pca(train_X: pd.DataFrame, test_X: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, PCA]:
    """Perform PCA, checking for good compression."""
    logger.info("Beginning PCA")