Skip to content

Commit

Permalink
Class "Model" improved
Browse files Browse the repository at this point in the history
  • Loading branch information
remytuyeras authored Aug 24, 2023
1 parent b757a95 commit b0e647e
Show file tree
Hide file tree
Showing 7 changed files with 629 additions and 146 deletions.
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ authors:
given-names: "Remy"
orcid: "https://orcid.org/0000-0002-5032-4113"
title: "HaploDynamics: A python library to develop genomic data simulators"
version: 0.3-beta.1
version: 0.4-beta.0
doi: 10.5281/zenodo.8247433
date-released: 2023-08-09
url: "https://github.com/remytuyeras/HaploDynamics"
103 changes: 90 additions & 13 deletions HaploDynamics/Framework/multiproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,29 +16,106 @@ def __init__(self,fname,cpus = 1,system = "unix"):
self.pool = mp.Pool(cpus)
self.fname = fname
self.eol = "\n" if system == "unix" else "\r\n"
self.initial_schema = None
self.landscape = []
self.initiate_landscape()

def initiate_landscape(self,afs=afs_sample,reference=0):
self.initial_schema = (afs,reference)

def initiate_vcf(self,length,Npop,chrom = "23"):
def extend_landscape(self,*schemas):
self.landscape.extend(list(schemas))
return len(self.landscape)

@staticmethod
def standard_schema(npos):
pos_dist = lambda reference: SNP_distribution(reference,npos)
return npos, afs_sample, pos_dist

def genotype_schema(self,alpha=4/30,afs=afs_sample):
p = afs(alpha)
#We use the minor allele frequency as a reference
maf = min(p,1-p)
#Probability for minor homogeneous genotypes
hom = maf**2
#Probability for heterogeneous genotypes
het = maf*(1-maf)
#Interval decomposition simulating the Hardy-Weinberg principle
hwp = [0, hom, hom+het, hom+2*het, 1]
return maf, hwp

def linkage_disequilibrium(self,alpha,beta,gamma,strength=-1,afs=afs_sample):
ub = gamma*math.log2(1/beta)
shift = random.uniform(min(ub,strength*ub),ub)
def LD(p):
def ld(t):
q = afs(alpha)
while not(lb_freq(beta,gamma,p,t,shift) <= q <= ub_freq(beta,gamma,p,t,shift)):
q = afs(alpha)
l = decay(amplifier(beta,p,q),2*gamma,shift)(t)+1
return l, q
return ld
return LD

def cond_genotype_schema(self,previous_maf,distance,alpha,beta,gamma,strength=-1,afs=afs_sample):
#Linkage disequilibrium
ld, maf = self.linkage_disequilibrium(alpha,beta,gamma,strength,afs)(previous_maf)(distance)
while maf > 0.5:
ld, maf = self.linkage_disequilibrium(alpha,beta,gamma,strength,afs)(previous_maf)(distance)
#Conditional Hardy-Weinberg principle encoded as a function
def hwp(previous_genotype):
#Previous genotype has minor alleles
if previous_genotype == 2:
#Probability for minor homogeneous genotypes
hom = (ld**2) * (maf**2)
#Probability for heterogeneous genotypes
het = ld * ref_alt_function(maf,ld) * maf * (1-maf)
#Interval decomposition simulating the conditional Hardy-Weinberg principle
return [0, hom, hom+het,hom+2*het, 1]
#Previous genotype has one minor allele
if abs(previous_genotype) == 1:
#Probability for minor homogeneous genotypes
hom = ld * ref_alt_function(previous_maf,ld) * (maf**2)
#Probability for heterogeneous genotypes
het1 = ld * alt_alt_function(previous_maf,maf,ld) * maf * (1-maf)
het2 = ref_alt_function(previous_maf,ld) * ref_alt_function(maf,ld) * maf * (1-maf)
#Previous genotype consists of a minor-major pairing
if previous_genotype == -1:
return [0, hom, hom+het1,hom+het1+het2, 1]
#Previous genotype consists of a major-minor pairing
elif previous_genotype == 1:
return [0, hom, hom+het2,hom+het2+het1, 1]
#Previous genotype has no minor alleles
if previous_genotype == 0:
#Probability for minor homogeneous genotypes
hom = (ref_alt_function(previous_maf,ld)**2) * (maf**2)
#Probability for heterogeneous genotypes
het = ref_alt_function(previous_maf,ld) * alt_alt_function(previous_maf,maf,ld) * maf * (1-maf)
#Interval decomposition simulating the conditional Hardy-Weinberg principle
return [0, hom, hom+het,hom+2*het, 1]
return maf, hwp, ld

def initiate_vcf(self,Npos,Npop,chrom = "23"):
f = gzip.open(self.fname+".vcf.gz","wt")
order = int(math.log10(Npop))
order = int(math.log10(Npop))+1
write = lambda s : f.write(s+self.eol)
idnum = lambda s: "ID"+"0"*(order-len(str(s)))+str(s)
write("##fileformat=VCFv4.2")
write("##fileDate="+"".join(str(datetime.date.today()).split("-")))
write("##source=HaploDX")
write("##reference=https://github.com/remytuyeras/HaploDynamics/blob/main/README.md")
write("##contig=<ID="+chrom+",length="+str(length)+",species=\"simulated Homo sapiens\">")
write("##contig=<ID="+chrom+",length="+str(Npos)+",species=\"simulated Homo sapiens\">")
write("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">")
write("##INFO=<ID=AP,Number=1,Type=Float,Description=\"Alpha Parameter for Simulation\">")
write("##INFO=<ID=BP,Number=1,Type=Float,Description=\"Beta Parameter for Simulation\">")
write("##INFO=<ID=CP,Number=1,Type=Float,Description=\"Gamma Parameter for Simulation\">")
write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")
INDIV = "\t".join(map(idnum,range(1,Npop)))
INDIV = "\t".join(map(idnum,range(1,Npop+1)))
write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"+INDIV)
f.flush()
return f

def generate_vcf(self,blocks,strength,population,Npop,chrom = "23"):
def generate_vcf(self,strength,population,Npop,chrom = "23"):
#Values needed to make the VCF file
alpha, beta, gamma = population_mld(population)
nuc = ["A","C","G","T"]
Expand All @@ -53,17 +130,17 @@ def write_genotype(f,vector):
#Important parameters to control the memory space
pre_vector = None
vector = None
length = sum(blocks)+1
Npos = sum([npos for npos,_,_ in self.landscape])+1
#For the loop
reference = 0
maf = None
#For the output
speed = None
max_mem = None
cur_mem = None
with self.initiate_vcf(length,Npop,chrom) as f:
with self.initiate_vcf(Npos,Npop,chrom) as f:
####### initiate_block
maf, hwp = genotype_schema(alpha)
afs, reference = self.initial_schema
maf, hwp = self.genotype_schema(alpha,afs)
minor = random.choice([0,2])
vector = [reference]
pre_vector = [genotype(hwp,minor) for i in range(Npop)]
Expand All @@ -72,15 +149,15 @@ def write_genotype(f,vector):
####### end of initiate_block

#Loop: Loading bar on the standard output
loading_frame = Loading("Model.generate_vcf: ",len(blocks))
for i, block in enumerate(blocks):
loading_frame = Loading("Model.generate_vcf: ",len(self.landscape))
for i, schema in enumerate(self.landscape):
#Loading bar progress
speed, max_mem, cur_mem = loading_frame.show(i+1)
positions = SNP_distribution(reference,block)
positions = schema[2](reference)
####### continue_block
for k in range(len(positions)):
distance = abs(positions[k]-reference)
maf, hwp, ld = cond_genotype_schema(maf,distance,alpha,beta,gamma,strength)
maf, hwp, ld = self.cond_genotype_schema(maf,distance,alpha,beta,gamma,strength,schema[1])
minor = random.choice([0,2])
vector = [positions[k]]
pre_vector = [genotype(hwp(gref(pre_vector[i])),minor) for i in range(Npop)]
Expand Down
2 changes: 1 addition & 1 deletion HaploDynamics/utils/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def text(self,d: int,color: bool = False):
m1 = "max. mem (MB): " + str(self.max_mem)
m0 = "cur. mem (MB): " + str(self.cur_mem)
info = "\n".join(["",t,m1,m0,""])
return ">"+c1+"|"*d+" "*(20-d)+c2+"< "+str(int(100*d/20))+"%"+info
return "|"+c1+u"\u2588"*d+" "*(20-d)+c2+"| "+str(int(100*d/20))+"%"+info

def show(self,state: int):
self.current = state
Expand Down
74 changes: 43 additions & 31 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,46 @@
<div style="width: 180px; margin: auto;"><a href="https://surveillance.cancer.gov/genetic-simulation-resources/"><img src="https://surveillance.cancer.gov/gsr/static/img/gsr_tile.jpg" alt="Catalogued on GSR" width="180" height="60" /></a></div>

## Highlights and updates
Release version ```0.3b*```:

1. Module ```Framework``` has been updated:
- Class ```Model``` added. Methods currently available:
* [HaploDynamics.Framework.Model.\_\_init\_\_](docs/source/framework-doc.md#haplodynamicsframeworkmodel__init__) (with a tutorial)
* [HaploDynamics.Framework.Model.initiate_vcf](docs/source/framework-doc.md#haplodynamicsframeworkmodelinitiate_vcf) (with a tutorial)
* [HaploDynamics.Framework.Model.generate_vcf](docs/source/framework-doc.md#haplodynamicsframeworkmodelgenerate_vcf) (with tutorials and performance tests)

2. Module ```HaploDX``` has been updated:
- A loading bar was added to the following processes, with information on time and memory:
* [HaploDynamics.HaploDX.genmatrix](docs/source/haplodx-doc.md#haplodynamicshaplodxgenmatrix)
* [HaploDynamics.HaploDX.create_vcfgz](docs/source/haplodx-doc.md#haplodynamicshaplodxcreate_vcfgz)
- Example:
```shell
$ python myscript.py
genmatrix: >||||||||||||||||||||< 100%
time (sec.): 3.409385681152344e-05
max. mem (MB): 0.0008192062377929688
cur. mem (MB): 0.0007524490356445312
```
- The accuracy of simulations has been improved by using better genotype representations. The <a href="https://www.normalesup.org/~tuyeras/node_diss/blg/home.php?page=blg_stat/stat_1/home.php">tutorial</a> has been updated to reflect these changes. The following functions have been affected by this change:
* [HaploDynamics.HaploDX.genotype](docs/source/haplodx-doc.md#haplodynamicshaplodxgenotype)
* [HaploDynamics.HaploDX.gref](docs/source/haplodx-doc.md#haplodynamicshaplodxgref)
* [HaploDynamics.HaploDX.cond_genotype_schema](docs/source/haplodx-doc.md#haplodynamicshaplodxcond_genotype_schema)
* [HaploDynamics.HaploDX.continue_block](docs/source/haplodx-doc.md#haplodynamicshaplodxcontinue_block)
* [HaploDynamics.HaploDX.minor_haplotype](docs/source/haplodx-doc.md#haplodynamicshaplodxminor_haplotype)

3. [Documentation](#documentation) completed with step-by-step instructions and visualizations.

1. The [documentation](#documentation) has been enhanced with tutorials and performance analyses;

2. Release version ```0.4b0```:
* **Compose your own mutation model:** the class ```Model``` now lets you create your own mutation model and use it with the generative functions of the HaploDX framework.
```python
import HaploDynamics.Framework as fmx
#Start your simulation
model = fmx.Model("tutorial")
#Initialize the genomic landscape
model.initiate_landscape(reference = 1.245)
#Design your own genomic landscape with any allele frequency model
model.extend_landscape(*(fmx.Model.standard_schema(20) for _ in range(6)))
#Population and LD parameters
strength = 1
population = 0.1
Npop = 1000
chrom = "1"
#Generate the simulation in a VCF file
model.generate_vcf(strength,population,Npop,chrom)
```

* [HaploDynamics.Framework.Model.initiate_landscape](docs/source/framework-doc.md#haplodynamicsframeworkmodelinitiate_landscape) added;
* [HaploDynamics.Framework.Model.extend_landscape](docs/source/framework-doc.md#haplodynamicsframeworkmodelextend_landscape) added;
* [HaploDynamics.Framework.Model.standard_schema](docs/source/framework-doc.md#haplodynamicsframeworkmodelstandard_schema) added;
* [HaploDynamics.Framework.Model.genotype_schema](docs/source/framework-doc.md#haplodynamicsframeworkmodelgenotype_schema) added;
* [HaploDynamics.Framework.Model.linkage_disequilibrium](docs/source/framework-doc.md#haplodynamicsframeworkmodellinkage_disequilibrium) added;
* [HaploDynamics.Framework.Model.cond_genotype_schema](docs/source/framework-doc.md#haplodynamicsframeworkmodelcond_genotype_schema) added;
* [Documentation for the Framework module](docs/source/framework-doc.md) polished;
* Various typos and clumsy phrasing have been corrected in the [documentation](#documentation);
* Loading bar appreance changed:
```shell
$ python myscript.py
Model.generate_vcf: |████████████████████| 100%
time (sec.): 0.7510931491851807
max. mem (MB): 0.11163139343261719
cur. mem (MB): 0.0834970474243164
```



## Installation

Expand All @@ -47,7 +59,7 @@ import HaploDynamics.Framework as fmx
```
To upgrade the package to its latest version, use the following command.
```shell
$ pip install --upgrade HaploDynamics==0.3b1
$ pip install --upgrade HaploDynamics==0.4b0
```
### Manual installation
HaploDynamics uses the [SciPy](https://docs.scipy.org/doc/scipy/reference/stats.html) library for certain calculations. To install SciPy, run the following command, or see SciPy's [installation instructions](https://scipy.org/install/) for more options.
Expand All @@ -69,7 +81,7 @@ absolute/path/to/HaploDynamics
To import the modules of the library to your script, you can use the following syntax where the path ```absolute/path/to/HaploDynamics``` should be replaced with the path obtained earlier.
```python
import sys
sys.path.insert(0,"absolute/path/to/HaploDynamics")
sys.path.insert(1,"absolute/path/to/HaploDynamics")
import HaploDynamics.HaploDX as hdx
import HaploDynamics.Framework as fmx
```
Expand Down Expand Up @@ -173,7 +185,7 @@ Correlations | genetic distances to average correlations

## To cite this work

Tuyeras, R. (2023). _HaploDynamics: A python library to develop genomic data simulators_ (Version 0.3-beta.1) [Computer software]. [![DOI](https://zenodo.org/badge/609227235.svg)](https://zenodo.org/badge/latestdoi/609227235)
Tuyeras, R. (2023). _HaploDynamics: A python library to develop genomic data simulators_ (Version 0.4-beta.0) [Computer software]. [![DOI](https://zenodo.org/badge/609227235.svg)](https://zenodo.org/badge/latestdoi/609227235)

<br/>

Expand Down
Loading

0 comments on commit b0e647e

Please sign in to comment.