Class "Model" improved

remytuyeras · Aug 24, 2023 · b0e647e · b0e647e
1 parent b757a95
commit b0e647e
Show file tree

Hide file tree

Showing 7 changed files with 629 additions and 146 deletions.
diff --git a/CITATION.cff b/CITATION.cff
@@ -5,7 +5,7 @@ authors:
   given-names: "Remy"
   orcid: "https://orcid.org/0000-0002-5032-4113"
 title: "HaploDynamics: A python library to develop genomic data simulators"
-version: 0.3-beta.1
+version: 0.4-beta.0
 doi: 10.5281/zenodo.8247433
 date-released: 2023-08-09
 url: "https://github.com/remytuyeras/HaploDynamics"
diff --git a/HaploDynamics/Framework/multiproc.py b/HaploDynamics/Framework/multiproc.py
@@ -16,29 +16,106 @@ def __init__(self,fname,cpus = 1,system = "unix"):
     self.pool = mp.Pool(cpus)
     self.fname = fname
     self.eol = "\n" if system == "unix" else "\r\n"
+    self.initial_schema = None
+    self.landscape = []
+    self.initiate_landscape()
 
+  def initiate_landscape(self,afs=afs_sample,reference=0):
+    self.initial_schema = (afs,reference)
 
-  def initiate_vcf(self,length,Npop,chrom = "23"):
+  def extend_landscape(self,*schemas):
+    self.landscape.extend(list(schemas))
+    return len(self.landscape)
+
+  @staticmethod
+  def standard_schema(npos):
+    pos_dist = lambda reference: SNP_distribution(reference,npos)
+    return npos, afs_sample, pos_dist
+
+  def genotype_schema(self,alpha=4/30,afs=afs_sample):
+    p =  afs(alpha)
+    #We use the minor allele frequency as a reference
+    maf = min(p,1-p)
+    #Probability for minor homogeneous genotypes
+    hom = maf**2
+    #Probability for heterogeneous genotypes
+    het = maf*(1-maf)
+    #Interval decomposition simulating the Hardy-Weinberg principle
+    hwp = [0, hom, hom+het, hom+2*het, 1]
+    return maf, hwp
+
+  def linkage_disequilibrium(self,alpha,beta,gamma,strength=-1,afs=afs_sample):
+    ub = gamma*math.log2(1/beta)
+    shift = random.uniform(min(ub,strength*ub),ub)
+    def LD(p):
+      def ld(t):
+        q = afs(alpha)
+        while not(lb_freq(beta,gamma,p,t,shift) <= q <= ub_freq(beta,gamma,p,t,shift)):
+          q = afs(alpha)
+        l = decay(amplifier(beta,p,q),2*gamma,shift)(t)+1
+        return l, q
+      return ld
+    return LD
+
+  def cond_genotype_schema(self,previous_maf,distance,alpha,beta,gamma,strength=-1,afs=afs_sample):
+    #Linkage disequilibrium
+    ld, maf = self.linkage_disequilibrium(alpha,beta,gamma,strength,afs)(previous_maf)(distance)
+    while maf > 0.5:
+      ld, maf = self.linkage_disequilibrium(alpha,beta,gamma,strength,afs)(previous_maf)(distance)
+    #Conditional Hardy-Weinberg principle encoded as a function
+    def hwp(previous_genotype):
+      #Previous genotype has minor alleles
+      if previous_genotype == 2:
+        #Probability for minor homogeneous genotypes
+        hom = (ld**2) * (maf**2)
+        #Probability for heterogeneous genotypes
+        het = ld * ref_alt_function(maf,ld) * maf * (1-maf)
+        #Interval decomposition simulating the conditional Hardy-Weinberg principle
+        return [0, hom, hom+het,hom+2*het, 1]
+      #Previous genotype has one minor allele
+      if abs(previous_genotype) == 1:
+        #Probability for minor homogeneous genotypes
+        hom = ld * ref_alt_function(previous_maf,ld) * (maf**2)
+        #Probability for heterogeneous genotypes
+        het1 = ld * alt_alt_function(previous_maf,maf,ld) * maf * (1-maf)
+        het2 = ref_alt_function(previous_maf,ld) * ref_alt_function(maf,ld) * maf * (1-maf)
+        #Previous genotype consists of a minor-major pairing
+        if previous_genotype == -1:
+          return [0, hom, hom+het1,hom+het1+het2, 1]
+        #Previous genotype consists of a major-minor pairing
+        elif previous_genotype == 1:
+          return [0, hom, hom+het2,hom+het2+het1, 1]
+      #Previous genotype has no minor alleles
+      if previous_genotype == 0:
+        #Probability for minor homogeneous genotypes
+        hom = (ref_alt_function(previous_maf,ld)**2) * (maf**2)
+        #Probability for heterogeneous genotypes
+        het = ref_alt_function(previous_maf,ld) * alt_alt_function(previous_maf,maf,ld) * maf * (1-maf)
+        #Interval decomposition simulating the conditional Hardy-Weinberg principle
+        return [0, hom, hom+het,hom+2*het, 1]
+    return maf, hwp, ld
+
+  def initiate_vcf(self,Npos,Npop,chrom = "23"):
     f = gzip.open(self.fname+".vcf.gz","wt")
-    order = int(math.log10(Npop))
+    order = int(math.log10(Npop))+1
     write = lambda s : f.write(s+self.eol)
     idnum = lambda s: "ID"+"0"*(order-len(str(s)))+str(s)
     write("##fileformat=VCFv4.2")
     write("##fileDate="+"".join(str(datetime.date.today()).split("-")))
     write("##source=HaploDX")
     write("##reference=https://github.com/remytuyeras/HaploDynamics/blob/main/README.md")
-    write("##contig=<ID="+chrom+",length="+str(length)+",species=\"simulated Homo sapiens\">")
+    write("##contig=<ID="+chrom+",length="+str(Npos)+",species=\"simulated Homo sapiens\">")
     write("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">")
     write("##INFO=<ID=AP,Number=1,Type=Float,Description=\"Alpha Parameter for Simulation\">")
     write("##INFO=<ID=BP,Number=1,Type=Float,Description=\"Beta Parameter for Simulation\">")
     write("##INFO=<ID=CP,Number=1,Type=Float,Description=\"Gamma Parameter for Simulation\">")
     write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")
-    INDIV = "\t".join(map(idnum,range(1,Npop)))
+    INDIV = "\t".join(map(idnum,range(1,Npop+1)))
     write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"+INDIV)
     f.flush()
     return f
 
-  def generate_vcf(self,blocks,strength,population,Npop,chrom = "23"):
+  def generate_vcf(self,strength,population,Npop,chrom = "23"):
     #Values needed to make the VCF file
     alpha, beta, gamma = population_mld(population)
     nuc   = ["A","C","G","T"]
@@ -53,17 +130,17 @@ def write_genotype(f,vector):
     #Important parameters to control the memory space
     pre_vector  = None
     vector      = None
-    length      = sum(blocks)+1
+    Npos        = sum([npos for npos,_,_ in self.landscape])+1
     #For the loop
-    reference   = 0
     maf         = None
     #For the output
     speed = None
     max_mem = None
     cur_mem = None
-    with self.initiate_vcf(length,Npop,chrom) as f:
+    with self.initiate_vcf(Npos,Npop,chrom) as f:
       ####### initiate_block
-      maf, hwp = genotype_schema(alpha)
+      afs, reference = self.initial_schema
+      maf, hwp = self.genotype_schema(alpha,afs)
       minor = random.choice([0,2])
       vector = [reference]
       pre_vector = [genotype(hwp,minor) for i in range(Npop)]
@@ -72,15 +149,15 @@ def write_genotype(f,vector):
       ####### end of initiate_block
 
       #Loop: Loading bar on the standard output
-      loading_frame = Loading("Model.generate_vcf: ",len(blocks))
-      for i, block in enumerate(blocks):
+      loading_frame = Loading("Model.generate_vcf: ",len(self.landscape))
+      for i, schema in enumerate(self.landscape):
         #Loading bar progress
         speed, max_mem, cur_mem = loading_frame.show(i+1)
-        positions = SNP_distribution(reference,block)
+        positions = schema[2](reference)
         ####### continue_block
         for k in range(len(positions)):
           distance = abs(positions[k]-reference)
-          maf, hwp, ld = cond_genotype_schema(maf,distance,alpha,beta,gamma,strength)
+          maf, hwp, ld = self.cond_genotype_schema(maf,distance,alpha,beta,gamma,strength,schema[1])
           minor = random.choice([0,2])
           vector = [positions[k]]
           pre_vector = [genotype(hwp(gref(pre_vector[i])),minor) for i in range(Npop)]

diff --git a/HaploDynamics/utils/load.py b/HaploDynamics/utils/load.py
@@ -33,7 +33,7 @@ def text(self,d: int,color: bool = False):
       m1 = "max. mem (MB): " + str(self.max_mem)
       m0 = "cur. mem (MB): " + str(self.cur_mem)
       info = "\n".join(["",t,m1,m0,""])
-    return ">"+c1+"|"*d+" "*(20-d)+c2+"< "+str(int(100*d/20))+"%"+info
+    return "|"+c1+u"\u2588"*d+" "*(20-d)+c2+"| "+str(int(100*d/20))+"%"+info
 
   def show(self,state: int):
     self.current = state

diff --git a/README.md b/README.md
@@ -4,34 +4,46 @@
 <div style="width: 180px; margin: auto;"><a href="https://surveillance.cancer.gov/genetic-simulation-resources/"><img src="https://surveillance.cancer.gov/gsr/static/img/gsr_tile.jpg" alt="Catalogued on GSR" width="180" height="60" /></a></div>
 
 ## Highlights and updates
-Release version ```0.3b*```:
-
-1. Module ```Framework``` has been updated:
-    - Class ```Model``` added. Methods currently available:
-      * [HaploDynamics.Framework.Model.\_\_init\_\_](docs/source/framework-doc.md#haplodynamicsframeworkmodel__init__) (with a tutorial)
-      * [HaploDynamics.Framework.Model.initiate_vcf](docs/source/framework-doc.md#haplodynamicsframeworkmodelinitiate_vcf) (with a tutorial)
-      * [HaploDynamics.Framework.Model.generate_vcf](docs/source/framework-doc.md#haplodynamicsframeworkmodelgenerate_vcf) (with tutorials and performance tests)
-
-2. Module ```HaploDX``` has been updated:
-    - A loading bar was added to the following processes, with information on time and memory:
-      * [HaploDynamics.HaploDX.genmatrix](docs/source/haplodx-doc.md#haplodynamicshaplodxgenmatrix)
-      * [HaploDynamics.HaploDX.create_vcfgz](docs/source/haplodx-doc.md#haplodynamicshaplodxcreate_vcfgz)
-      - Example:
-          ```shell
-          $ python myscript.py
-          genmatrix: >||||||||||||||||||||< 100%
-          time (sec.): 3.409385681152344e-05
-          max. mem (MB): 0.0008192062377929688
-          cur. mem (MB): 0.0007524490356445312
-          ```
-    - The accuracy of simulations has been improved by using better genotype representations. The <a href="https://www.normalesup.org/~tuyeras/node_diss/blg/home.php?page=blg_stat/stat_1/home.php">tutorial</a> has been updated to reflect these changes. The following functions have been affected by this change:
-        * [HaploDynamics.HaploDX.genotype](docs/source/haplodx-doc.md#haplodynamicshaplodxgenotype)
-        * [HaploDynamics.HaploDX.gref](docs/source/haplodx-doc.md#haplodynamicshaplodxgref)
-        * [HaploDynamics.HaploDX.cond_genotype_schema](docs/source/haplodx-doc.md#haplodynamicshaplodxcond_genotype_schema)
-        * [HaploDynamics.HaploDX.continue_block](docs/source/haplodx-doc.md#haplodynamicshaplodxcontinue_block)
-        * [HaploDynamics.HaploDX.minor_haplotype](docs/source/haplodx-doc.md#haplodynamicshaplodxminor_haplotype)
-
-3. [Documentation](#documentation) completed with step-by-step instructions and visualizations. 
+
+1. The [documentation](#documentation) has been enhanced with tutorials and performance analyses;
+
+2. Release version ```0.4b0```:
+    * **Compose your own mutation model:** the class ```Model``` now lets you create your own mutation model and use it with the generative functions of the HaploDX framework.
+        ```python
+        import HaploDynamics.Framework as fmx
+        #Start your simulation
+        model = fmx.Model("tutorial")
+        #Initialize the genomic landscape
+        model.initiate_landscape(reference = 1.245)
+        #Design your own genomic landscape with any allele frequency model
+        model.extend_landscape(*(fmx.Model.standard_schema(20) for _ in range(6)))
+        #Population and LD parameters
+        strength = 1
+        population = 0.1
+        Npop = 1000
+        chrom = "1"
+        #Generate the simulation in a VCF file
+        model.generate_vcf(strength,population,Npop,chrom)
+        ```
+
+    * [HaploDynamics.Framework.Model.initiate_landscape](docs/source/framework-doc.md#haplodynamicsframeworkmodelinitiate_landscape) added;
+    * [HaploDynamics.Framework.Model.extend_landscape](docs/source/framework-doc.md#haplodynamicsframeworkmodelextend_landscape) added;
+    * [HaploDynamics.Framework.Model.standard_schema](docs/source/framework-doc.md#haplodynamicsframeworkmodelstandard_schema) added;
+    * [HaploDynamics.Framework.Model.genotype_schema](docs/source/framework-doc.md#haplodynamicsframeworkmodelgenotype_schema) added;
+    * [HaploDynamics.Framework.Model.linkage_disequilibrium](docs/source/framework-doc.md#haplodynamicsframeworkmodellinkage_disequilibrium) added;
+    * [HaploDynamics.Framework.Model.cond_genotype_schema](docs/source/framework-doc.md#haplodynamicsframeworkmodelcond_genotype_schema) added;
+    * [Documentation for the Framework module](docs/source/framework-doc.md) polished;
+    * Various typos and clumsy phrasing have been corrected in the [documentation](#documentation);
+    * Loading bar appreance changed:
+        ```shell
+        $ python myscript.py
+        Model.generate_vcf: |████████████████████| 100%
+        time (sec.): 0.7510931491851807
+        max. mem (MB): 0.11163139343261719
+        cur. mem (MB): 0.0834970474243164
+        ```
+
+
 
 ## Installation
 
@@ -47,7 +59,7 @@ import HaploDynamics.Framework as fmx
 ```
 To upgrade the package to its latest version, use the following command.
 ```shell
-$ pip install --upgrade HaploDynamics==0.3b1
+$ pip install --upgrade HaploDynamics==0.4b0
 ```
 ### Manual installation
 HaploDynamics uses the [SciPy](https://docs.scipy.org/doc/scipy/reference/stats.html) library for certain calculations. To install SciPy, run the following command, or see SciPy's [installation instructions](https://scipy.org/install/) for more options.
@@ -69,7 +81,7 @@ absolute/path/to/HaploDynamics
 To import the modules of the library to your script, you can use the following syntax where the path ```absolute/path/to/HaploDynamics``` should be replaced with the path obtained earlier.
 ```python
 import sys
-sys.path.insert(0,"absolute/path/to/HaploDynamics")
+sys.path.insert(1,"absolute/path/to/HaploDynamics")
 import HaploDynamics.HaploDX as hdx
 import HaploDynamics.Framework as fmx
 ```
@@ -173,7 +185,7 @@ Correlations            |  genetic distances to average correlations
 
 ## To cite this work
 
-Tuyeras, R. (2023). _HaploDynamics: A python library to develop genomic data simulators_ (Version 0.3-beta.1) [Computer software]. [![DOI](https://zenodo.org/badge/609227235.svg)](https://zenodo.org/badge/latestdoi/609227235)
+Tuyeras, R. (2023). _HaploDynamics: A python library to develop genomic data simulators_ (Version 0.4-beta.0) [Computer software]. [![DOI](https://zenodo.org/badge/609227235.svg)](https://zenodo.org/badge/latestdoi/609227235)
 
 <br/>