Merge branch 'master' of github.com:bystrogenomics/bystro into feature/api
akotlar committed Sep 6, 2023
2 parents 75e5961 + 162775f commit 622917a
Showing 24 changed files with 2,309 additions and 3 deletions.
130 changes: 130 additions & 0 deletions python/python/bystro/_template_npr.py
@@ -0,0 +1,130 @@
"""
This provides a basic template for any model that uses numpyro as an
inference method. It has several methods that should be filled by any
object extending the template, namely
fit
_fill_hp_options
These are the methods for running the samples given data and providing
hyperameter selections respectively.
Objects
-------
_BaseNPRModel(object)
This is the template Numpyro model
"""
import abc
import cloudpickle # type: ignore
import numpyro # type: ignore


class _BaseNumpyroModel:
    def __init__(self, mcmc_options=None, hp_options=None):
        """
        Parameters
        ----------
        mcmc_options : dict | None
            Options for the MCMC sampler; missing keys are filled in
            by _fill_mcmc_options.
        hp_options : dict | None
            Model hyperparameters; missing keys are filled in by
            _fill_hp_options.
        """
        if mcmc_options is None:
            mcmc_options = {}
        if hp_options is None:
            hp_options = {}
        self.mcmc_options = self._fill_mcmc_options(mcmc_options)
        self.hp_options = self._fill_hp_options(hp_options)
        self.samples = None
        # Expected to be set by fit() in subclasses; used by render_model()
        self._model = None
        self.model_kwargs = None

    @abc.abstractmethod
    def fit(self, *args):
        """
        Run MCMC sampling given the data.

        Parameters
        ----------
        *args
            The data and any model-specific arguments.

        Returns
        -------
        The fitted model object.
        """

    def render_model(self):
        """
        This provides a graphical representation of the model.

        Returns
        -------
        The numpyro rendering of the graphical model.
        """
        assert self._model is not None, "Fit model first"
        return numpyro.render_model(self._model, model_kwargs=self.model_kwargs)

    def pickle(self, path):
        """
        This saves samples from a fit model.

        Parameters
        ----------
        path : str
            The path to save the samples to.
        """
        assert self.samples is not None, "Fit model first"
        with open(path, "wb") as f:
            cloudpickle.dump(self.samples, f)

    def unpickle(self, path):
        """
        This loads samples from a previously saved model.

        Parameters
        ----------
        path : str
            The path to load the samples from.
        """
        with open(path, "rb") as f:
            self.samples = cloudpickle.load(f)

    def _fill_mcmc_options(self, mcmc_options):
        """
        This fills in default MCMC options of the sampler. Subclasses
        may override these defaults, but they are common/basic enough
        to leave as an implemented method.

        Parameters
        ----------
        mcmc_options : dict
            User-provided options, which take precedence over defaults.

        Returns
        -------
        mopts : dict
            The full set of MCMC options.
        """
        default_options = {
            "num_chains": 1,
            "num_warmup": 500,
            "num_samples": 2000,
        }
        mopts = {**default_options, **mcmc_options}
        return mopts

    @abc.abstractmethod
    def _fill_hp_options(self, hp_options):
        """
        This fills in default hyperparameters of the model. Since these
        are not shared between models, we leave this as an abstract
        method to be implemented per model.

        Parameters
        ----------
        hp_options : dict
            User-provided hyperparameters, which take precedence over
            defaults.

        Returns
        -------
        dict
            The full set of hyperparameters.
        """
132 changes: 132 additions & 0 deletions python/python/bystro/_template_sgd_np.py
@@ -0,0 +1,132 @@
"""
This implements a base class for any model using stochastic gradient
descent-based techniques for inference.
Objects
-------
_BaseSGDModel(object):
Methods
-------
None
"""
import abc
import cloudpickle # type: ignore


class _BaseSGDModel:
    def __init__(self, training_options=None):
        """
        The base class of a model relying on stochastic gradient
        descent for inference.

        Parameters
        ----------
        training_options : dict | None
            Options for training; missing keys are filled in by
            _fill_training_options.
        """
        if training_options is None:
            training_options = {}
        self.training_options = self._fill_training_options(training_options)

    @abc.abstractmethod
    def fit(self, *args, **kwargs):
        """
        Method for fitting.

        Parameters
        ----------
        *args
            Positional arguments
        **kwargs
            Keyword arguments
        """

    def pickle(self, path):
        """
        Method for saving the model.

        Parameters
        ----------
        path : str
            The path to save the model to.
        """
        mydict = {"model": self}
        with open(path, "wb") as f:
            cloudpickle.dump(mydict, f)

    @abc.abstractmethod
    def unpickle(self, path):
        """
        Method for loading the model.

        Parameters
        ----------
        path : str
            The path to load the model from.
        """

    def _fill_training_options(self, training_options):
        """
        This fills in any relevant parameters for the learning algorithm.

        Parameters
        ----------
        training_options : dict
            User-provided options, which take precedence over defaults.

        Returns
        -------
        training_opts : dict
            The full set of training options.
        """
        default_options = {"n_iterations": 5000}
        training_opts = {**default_options, **training_options}
        return training_opts

    @abc.abstractmethod
    def _save_variables(self, training_variables):
        """
        This saves the final parameter values after training.

        Parameters
        ----------
        training_variables : list
            The variables trained.
        """
        raise NotImplementedError("_save_variables")

    @abc.abstractmethod
    def _initialize_save_losses(self):
        """
        This method initializes the arrays to track relevant variables
        during training.
        """
        raise NotImplementedError("_initialize_save_losses")

    @abc.abstractmethod
    def _save_losses(self, *args):
        """
        This saves the respective losses at each iteration.

        Parameters
        ----------
        *args
            The losses to record at the current iteration.
        """
        raise NotImplementedError("_save_losses")

    @abc.abstractmethod
    def _test_inputs(self, *args):
        """
        This performs error checking on the inputs of fit.

        Parameters
        ----------
        *args
            The inputs to validate.
        """
        raise NotImplementedError("_test_inputs")

    @abc.abstractmethod
    def _transform_training_data(self, *args):
        """
        This converts training data to an adequate format.

        Parameters
        ----------
        *args
            The raw training data.
        """
        raise NotImplementedError("_transform_training_data")
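
As a concrete illustration, a minimal sketch of a subclass that fits ordinary least squares by SGD with NumPy; the class name, the learning_rate option, and the w_ and losses_ attributes are illustrative assumptions, not part of this commit:

import numpy as np


class LeastSquaresSGD(_BaseSGDModel):
    """Illustrative subclass: linear least squares fit by SGD."""

    def fit(self, X, y):
        X, y = self._transform_training_data(X, y)
        self._test_inputs(X, y)
        self._initialize_save_losses()
        rng = np.random.default_rng(0)
        w = np.zeros(X.shape[1])
        lr = self.training_options["learning_rate"]
        for i in range(self.training_options["n_iterations"]):
            idx = rng.integers(0, X.shape[0])
            # Gradient of the squared error on a single random sample
            grad = (X[idx] @ w - y[idx]) * X[idx]
            w -= lr * grad
            self._save_losses(i, float(np.mean((X @ w - y) ** 2)))
        self._save_variables([w])
        return self

    def _fill_training_options(self, training_options):
        default_options = {"n_iterations": 5000, "learning_rate": 1e-3}
        return {**default_options, **training_options}

    def _save_variables(self, training_variables):
        self.w_ = training_variables[0]

    def _initialize_save_losses(self):
        self.losses_ = np.zeros(self.training_options["n_iterations"])

    def _save_losses(self, i, loss):
        self.losses_[i] = loss

    def _test_inputs(self, X, y):
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y have mismatched lengths")

    def _transform_training_data(self, X, y):
        return np.asarray(X, dtype=float), np.asarray(y, dtype=float)

    def unpickle(self, path):
        with open(path, "rb") as f:
            self.__dict__.update(cloudpickle.load(f)["model"].__dict__)

The inherited pickle() works unchanged; unpickle() here simply restores the saved model's attributes.
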
68 changes: 67 additions & 1 deletion python/python/bystro/ancestry/tests/test_train.py
@@ -13,7 +13,9 @@
    superpop_predictions_from_pop_probs,
    superpop_probs_from_pop_probs,
    convert_1kgp_vcf_to_dosage,
    process_vcf_for_pc_transformation,
    restrict_loadings_variants_to_vcf,
    apply_pca_transform,
)
from bystro.vcf_utils.simulate_random_vcf import (
    generate_simulated_vcf,
@@ -286,3 +288,67 @@ def test_1kgp_vcf_to_dosage():
    for column in processed_vcf.columns:
        for value in processed_vcf[column]:
            assert value in valid_values


def test_restrict_loadings_variants_to_vcf():
    """Tests restriction of gnomad loadings variants to vcf variants."""
    num_samples = 10
    num_vars = 10
    simulated_vcf = generate_simulated_vcf(num_samples, num_vars)
    sim_vcf_df = convert_sim_vcf_to_df(simulated_vcf)
    loaded_vcf = convert_1kgp_vcf_to_dosage(sim_vcf_df)
    processed_sim_vcf = process_vcf_for_pc_transformation(loaded_vcf)
    # Using the same variant indices, generate sim loadings
    num_pcs = 30
    sim_pcs = np.random.random((num_vars, num_pcs))
    sim_loadings = pd.DataFrame(
        data=sim_pcs, index=processed_sim_vcf.columns, columns=[f"PC{i+1}" for i in range(num_pcs)]
    )
    # Run restrict_loadings with sim data
    pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf(
        sim_loadings, processed_sim_vcf
    )
    # Check for expected columns/indices
    expected_columns_loadings = [f"PC{i+1}" for i in range(num_pcs)]
    assert set(expected_columns_loadings) == set(pc_loadings_overlap.columns)
    expected_index_genos_transpose = ["SampleID1", "SampleID2"]
    for sample in expected_index_genos_transpose:
        assert sample in genos_overlap_transpose.index
    # Check that the variant IDs match up to sorting
    assert set(genos_overlap_transpose.columns) == set(pc_loadings_overlap.index)


def test_apply_pca_transform():
    # Prepare test data same as before
    num_samples = 10
    num_vars = 10
    simulated_vcf = generate_simulated_vcf(num_samples, num_vars)
    sim_vcf_df = convert_sim_vcf_to_df(simulated_vcf)
    loaded_vcf = convert_1kgp_vcf_to_dosage(sim_vcf_df)
    processed_sim_vcf = process_vcf_for_pc_transformation(loaded_vcf)
    # Using the same variant indices, generate sim loadings
    num_pcs = 30
    sim_pcs = np.random.random((num_vars, num_pcs))
    sim_loadings = pd.DataFrame(
        data=sim_pcs, index=processed_sim_vcf.columns, columns=[f"PC{i+1}" for i in range(num_pcs)]
    )
    pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf(
        sim_loadings, processed_sim_vcf
    )
    # Call function to test
    transformed_data = apply_pca_transform(pc_loadings_overlap, genos_overlap_transpose)
    # Check for correct data type and columns
    assert isinstance(transformed_data, pd.DataFrame)
    expected_columns = [f"PC{i}" for i in range(1, 31)]
    assert transformed_data.columns.tolist() == expected_columns
    # Check index of transformed_data matches genos_overlap_transpose.index
    assert transformed_data.index.equals(genos_overlap_transpose.index)
    # Check shape of transformed_data matches expected shape
    expected_shape = (genos_overlap_transpose.shape[0], num_pcs)
    assert transformed_data.shape == expected_shape
    # Check all values in transformed_data are numeric
    assert transformed_data.dtypes.apply(pd.api.types.is_numeric_dtype).all()
    # Check that transformed_data does not contain any NaN or missing values
    assert not transformed_data.isna().to_numpy().any()
36 changes: 34 additions & 2 deletions python/python/bystro/ancestry/train.py
@@ -513,13 +513,16 @@ def convert_1kgp_vcf_to_dosage(vcf_with_header: pd.DataFrame) -> pd.DataFrame:


def process_vcf_for_pc_transformation(dosage_vcf: pd.DataFrame) -> pd.DataFrame:
    """Process dosage_vcf and transpose so that it only includes
    genotypes in the correct configuration for analysis."""
    genos = dosage_vcf.iloc[:, len(HEADER_COLS) :]
    genos = genos.set_index(dosage_vcf.index)
    genos = genos.sort_index()
    # Check that not all genotypes are the same for QC
    assert len(set(genos.to_numpy().flatten())) > 1, "All genotypes are the same"
    # Transpose genos for analysis
    genos_transpose = genos.T
    return genos_transpose


def _load_pca_loadings() -> pd.DataFrame:
@@ -560,6 +563,35 @@ def get_variant(x):
    return pc_loadings


def restrict_loadings_variants_to_vcf(
    pc_loadings: pd.DataFrame, genos_transpose: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Restrict the variant list to the overlap between gnomad loadings and the
    transposed reference vcf, and return versions containing only the
    overlapping variants."""
    # IGSR version of 1kgp is the current reference vcf
    var_overlap = pc_loadings.index.intersection(genos_transpose.columns)
    pc_loadings_overlap = pc_loadings[pc_loadings.index.isin(var_overlap)]
    # Select the overlapping variant columns (not rows) of the transposed vcf
    genos_overlap_transpose = genos_transpose.loc[:, genos_transpose.columns.isin(var_overlap)]
    # Ensure that genos_transpose and pc_loadings have corresponding shape and vars
    assert genos_overlap_transpose.shape[1] == pc_loadings_overlap.shape[0]
    assert set(genos_overlap_transpose.columns) == set(pc_loadings_overlap.index)
    assert (genos_overlap_transpose.index == genos_transpose.index).all()
    # Record the amount of overlap
    num_var_overlap = len(var_overlap)
    logger.info("Number of overlapping variants: %d", num_var_overlap)
    return pc_loadings_overlap, genos_overlap_transpose


def apply_pca_transform(
    pc_loadings_overlap: pd.DataFrame,
    genos_overlap_transpose: pd.DataFrame,
) -> pd.DataFrame:
    """Transform a vcf with genotypes in dosage format using PC loadings from
    the gnomad PCA."""
    # Dot product: (samples x variants) @ (variants x PCs) -> (samples x PCs)
    transformed_data = genos_overlap_transpose @ pc_loadings_overlap
    return transformed_data
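
Taken together, a short usage sketch of the new helpers, assuming a dosage-format DataFrame dosage_vcf is already in hand (variable names are illustrative):

# Hypothetical end-to-end use of the new helpers
genos_transpose = process_vcf_for_pc_transformation(dosage_vcf)
loadings = _load_pca_loadings()
pc_loadings_overlap, genos_overlap_transpose = restrict_loadings_variants_to_vcf(
    loadings, genos_transpose
)
pcs = apply_pca_transform(pc_loadings_overlap, genos_overlap_transpose)
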


def _perform_pca(train_X: pd.DataFrame, test_X: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, PCA]:
    """Perform PCA, checking for good compression."""
    logger.info("Beginning PCA")