split vcf function added

reneshbedre · May 9, 2020 · 5215303 · 5215303
1 parent 7f62e8f
commit 5215303
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -167,11 +167,11 @@ Correlation matrix plot image in same directory (corr_mat.png)
 
 <b>Merge VCF files</b>
 
-`bioinfokit.analys.mergevcf(file)`
+`bioinfokit.analys.marker.mergevcf(file)`
 
 Parameters | Description
 ------------ | -------------
-`file` | Multiple vcf files and separate them by comma
+`file` | Multiple VCF files, given as a single comma-separated string
 
 Returns:
 
@@ -180,17 +180,21 @@ Merged VCF file (merge_vcf.vcf)
 <a href="https://reneshbedre.github.io/blog/mergevcf.html" target="_blank">Working example</a>
 
 
-<b>Merge VCF files</b>
+<b>Split VCF file</b>
+
+`bioinfokit.analys.marker.splitvcf(file)`
 
-`bioinfokit.analys.mergevcf(file)`
+Split a single VCF file containing variants for all chromosomes into individual files, each containing the variants for one chromosome
 
 Parameters | Description
 ------------ | -------------
-`file` | Multiple vcf files and separate them by comma
+ `file` | VCF file to split
+ `id` | chromosome id column in VCF file [string][default='#CHROM']
+
 
 Returns:
 
-Merged VCF file (merge_vcf.vcf)
+VCF files for each chromosome
 
 <a href="https://reneshbedre.github.io/blog/mergevcf.html" target="_blank">Working example</a>
 
@@ -538,6 +542,8 @@ References:
 - Wes McKinney. Data Structures for Statistical Computing in Python, Proceedings of the 9th Python in Science Conference, 51-56 (2010)
 
 bioinfokit cited by:
+- Jennifer Gribble, Andrea J. Pruijssers, Maria L. Agostini, Jordan Anderson-Daniels, James D. Chappell, Xiaotao Lu, Laura J. Stevens, Andrew L. Routh, Mark R. Denison
+  bioRxiv 2020.04.23.057786; doi: https://doi.org/10.1101/2020.04.23.057786
 - Greaney AM, Adams TS, Raredon MS, Gubbins E, Schupp JC, Engler AJ, Ghaedi M, Yuan Y, Kaminski N, Niklason LE. Platform 
   Effects on Regeneration by Pulmonary Basal Cells as Evaluated by Single-Cell RNA Sequencing. Cell Reports. 2020 Mar 
   24;30(12):4250-65.
diff --git a/VERSIONLOG.md b/VERSIONLOG.md
@@ -1,3 +1,7 @@
+v0.7.2 has the following updates and changes (May 08, 2020)
+- `splitvcf` function added for splitting a VCF file into individual VCF files, one for each chromosome
+- `mergevcf` moved to `analys.marker` class
+
 v0.7.1 has the following updates and changes (April 24, 2020)
 - `reg_lin` function updated for multiple regression
 - degree of freedom fixed for t-test for regression coefficients

diff --git a/bioinfokit/__init__.py b/bioinfokit/__init__.py
@@ -1,5 +1,5 @@
 name = "bioinfokit"
-__version__ = "0.7.1"
+__version__ = "0.7.2"
 __author__ = "Renesh Bedre"
 
 
diff --git a/bioinfokit/analys.py b/bioinfokit/analys.py
@@ -32,25 +32,7 @@ def seqcov(file="fastq_file", gs="genome_size"):
     print("Sequence coverage for", file, "is", cov)
 
 def mergevcf(file="vcf_file_com_sep"):
-    vcf_files = file.split(",")
-    merge_vcf = open("merge_vcf.vcf", "w+")
-    file_count = 0
-    print("merging vcf files...")
-    for f in vcf_files:
-        if file_count == 0:
-            read_file = open(f, "rU")
-            for line in read_file:
-                merge_vcf.write(line)
-            read_file.close()
-        elif file_count > 0:
-            read_file = open(f, "rU")
-            for line in read_file:
-                if not line.startswith("#"):
-                    merge_vcf.write(line)
-            read_file.close()
-        file_count += 1
-    merge_vcf.close()
-
+    """Deprecated module-level entry point for merging VCF files.
+
+    No merging is done here any more: the `file` argument is ignored and the
+    call is forwarded to `general.depr_mes` with the name of the replacement,
+    `bioinfokit.analys.marker.mergevcf`.
+    NOTE(review): `general.depr_mes` is defined outside this view; presumably
+    it reports a deprecation message -- confirm.
+    """
+    general.depr_mes("bioinfokit.analys.marker.mergevcf")
 
 def pca(table="p_df"):
     d = pd.DataFrame(data=table)
@@ -273,6 +255,54 @@ def split_fastq(file="fastq_file"):
         out_file_name_2.close()
 
 
+class marker:
+    """Namespace for VCF helper functions (merging and splitting VCF files)."""
+
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def mergevcf(file="vcf_file_com_sep"):
+        """Merge several VCF files into `merge_vcf.vcf` in the working directory.
+
+        The first file is copied verbatim. For every following file only the
+        lines that do not start with `#` are appended, so the meta-information
+        and header lines appear once.
+
+        Parameters
+        ----------
+        file : str
+            Paths of the VCF files to merge, separated by commas.
+        """
+        vcf_files = file.split(",")
+        with open("merge_vcf.vcf", "w+") as merge_vcf:
+            print("merging vcf files...")
+            for file_count, f in enumerate(vcf_files):
+                # Plain "r" already gives universal newlines; the legacy "U"
+                # flag was removed in Python 3.11 and raises ValueError there.
+                with open(f, "r") as read_file:
+                    for line in read_file:
+                        if file_count == 0 or not line.startswith("#"):
+                            merge_vcf.write(line)
+
+    @staticmethod
+    def splitvcf(file='vcf_file', id='#CHROM'):
+        """Split one VCF file into one VCF file per chromosome.
+
+        For every distinct value found in the `id` column a file named
+        `<value>.vcf` is written to the working directory. Each output file
+        holds the `##` meta-information lines of the input, the header line
+        and the records belonging to that chromosome.
+
+        Parameters
+        ----------
+        file : str
+            Path of the VCF file to split.
+        id : str
+            Name of the chromosome id column. The header line is recognised
+            as the line starting with this string.
+
+        Raises
+        ------
+        AssertionError
+            If no line of the file starts with `id`.
+        """
+        info_lines, headers = [], []
+        with open(file, 'r') as read_vcf_file:
+            for line in read_vcf_file:
+                if line.startswith(id):
+                    headers = line.strip().split('\t')
+                elif line.startswith('##'):
+                    info_lines.append(line.strip())
+        # Raised explicitly (same exception type as before) so that the check
+        # is not stripped when Python runs with -O.
+        if not headers:
+            raise AssertionError("Non matching id parameter")
+        # Read every field as text: numeric chromosome ids (1, 2, ...) must
+        # stay strings to build the output file names, and the records are
+        # written back without being reformatted by dtype inference.
+        read_vcf_file_df = pd.read_csv(file, sep='\t', comment='#', header=None,
+                                       dtype=str, na_filter=False)
+        read_vcf_file_df.columns = headers
+        # sort=False keeps the chromosomes in order of first appearance.
+        for chrom_id, sub_df in read_vcf_file_df.groupby(id, sort=False):
+            out_name = chrom_id + '.vcf'
+            with open(out_name, 'w') as out_vcf_file:
+                out_vcf_file.writelines(l + '\n' for l in info_lines)
+            # Appended after the handle above is closed; also writes the header row.
+            sub_df.to_csv(out_name, mode='a', sep='\t', index=False)
+
+
 class format:
     def __init__(self):
         pass