rajanil · saketkc · Dec 3, 2018 · Dec 3, 2018 · Dec 3, 2018 · Dec 3, 2018
diff --git a/bam_to_tbi.py b/bam_to_tbi.py
@@ -1,3 +1,5 @@
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
 import numpy as np
 import pysam
 import subprocess
@@ -7,18 +9,20 @@
 
 MIN_MAP_QUAL = 10
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description=" convert bam data format to bigWig data format, "
-                                     " for ribosome profiling and RNA-seq data ")
+    parser = argparse.ArgumentParser(
+        description=" convert bam data format to bigWig data format, "
+        " for ribosome profiling and RNA-seq data ")
 
-    parser.add_argument("--dtype",
-                        choices=("rnaseq","riboseq"),
-                        default="riboseq",
-                        help="specifies the type of assay (default: riboseq)")
+    parser.add_argument(
+        "--dtype",
+        choices=("rnaseq", "riboseq"),
+        default="riboseq",
+        help="specifies the type of assay (default: riboseq)")
 
-    parser.add_argument("bam_file",
-                        action="store",
-                        help="path to bam input file")
+    parser.add_argument(
+        "bam_file", action="store", help="path to bam input file")
 
     options = parser.parse_args()
 
@@ -27,8 +31,8 @@ def parse_args():
 
     return options
 
-def which(program):
 
+def which(program):
     def is_exe(fpath):
         return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
 
@@ -45,14 +49,15 @@ def is_exe(fpath):
 
     return None
 
+
 def convert_rnaseq(options):
 
     # file names and handles
     count_file = os.path.splitext(options.bam_file)[0]
     sam_handle = pysam.Samfile(options.bam_file, "rb")
     count_handle = open(count_file, 'w')
 
-    for cname,clen in zip(sam_handle.references,sam_handle.lengths):
+    for cname, clen in zip(sam_handle.references, sam_handle.lengths):
 
         # fetch reads in chromosome
         sam_iter = sam_handle.fetch(reference=cname)
@@ -80,11 +85,14 @@ def convert_rnaseq(options):
                 counts[site] = 1
 
         # write counts to output file
-        indices = np.sort(counts.keys())
+        indices = np.sort(list(counts.keys()))
         for i in indices:
-            count_handle.write('\t'.join([cname,'%d'%i,'%d'%(i+1),'%d'%counts[i]])+'\n')
+            count_handle.write('\t'.join(
+                [cname, '%d' %
+                 i, '%d' %
+                 (i + 1), '%d' % counts[i]]) + '\n')
 
-        print "completed %s"%cname
+        print("completed %s" % cname)
 
     sam_handle.close()
     count_handle.close()
@@ -98,26 +106,28 @@ def convert_rnaseq(options):
     pipe = subprocess.Popen("%s -f -b 2 -e 3 -0 %s.gz"%(options.tabix, count_file), \
         stdout=subprocess.PIPE, shell=True)
     stdout = pipe.communicate()[0]
-    print "Compressed file with RNA-seq counts is %s"%(count_file+'.gz')
+    print("Compressed file with RNA-seq counts is %s" % (count_file + '.gz'))
 
 
 def convert_riboseq(options):
 
     # file names and handles
-    fwd_count_file = os.path.splitext(options.bam_file)[0]+'_fwd'
-    rev_count_file = os.path.splitext(options.bam_file)[0]+'_rev'
+    fwd_count_file = os.path.splitext(options.bam_file)[0] + '_fwd'
+    rev_count_file = os.path.splitext(options.bam_file)[0] + '_rev'
     sam_handle = pysam.Samfile(options.bam_file, "rb")
-    fwd_handle = dict([(r,open(fwd_count_file+'.%d'%r, 'w')) for r in utils.READ_LENGTHS])
-    rev_handle = dict([(r,open(rev_count_file+'.%d'%r, 'w')) for r in utils.READ_LENGTHS])
+    fwd_handle = dict([(r, open(fwd_count_file + '.%d' % r, 'w'))
+                       for r in utils.READ_LENGTHS])
+    rev_handle = dict([(r, open(rev_count_file + '.%d' % r, 'w'))
+                       for r in utils.READ_LENGTHS])
 
-    for cname,clen in zip(sam_handle.references,sam_handle.lengths):
+    for cname, clen in zip(sam_handle.references, sam_handle.lengths):
 
         # fetch reads in chromosome
         sam_iter = sam_handle.fetch(reference=cname)
 
         # initialize count arrays
-        fwd_counts = dict([(r,dict()) for r in utils.READ_LENGTHS])
-        rev_counts = dict([(r,dict()) for r in utils.READ_LENGTHS])
+        fwd_counts = dict([(r, dict()) for r in utils.READ_LENGTHS])
+        rev_counts = dict([(r, dict()) for r in utils.READ_LENGTHS])
         for read in sam_iter:
 
             # skip reads not of the appropriate length
@@ -131,7 +141,7 @@ def convert_riboseq(options):
             # skip read, if mapping quality is low
             if read.mapq < MIN_MAP_QUAL:
                 continue
-            
+
             if read.is_reverse:
                 asite = int(read.positions[-13])
                 try:
@@ -147,15 +157,23 @@ def convert_riboseq(options):
 
         # write counts to output files
         for r in utils.READ_LENGTHS:
-            indices = np.sort(fwd_counts[r].keys())
+            indices = np.sort(list(fwd_counts[r].keys()))
             for i in indices:
-                fwd_handle[r].write('\t'.join([cname, '%d'%i, '%d'%(i+1), '%d'%fwd_counts[r][i]])+'\n')
+                fwd_handle[r].write('\t'.join(
+                    [cname,
+                     '%d' % i,
+                     '%d' % (i + 1),
+                     '%d' % fwd_counts[r][i]]) + '\n')
 
-            indices = np.sort(rev_counts[r].keys())
+            indices = np.sort(list(rev_counts[r].keys()))
             for i in indices:
-                rev_handle[r].write('\t'.join([cname, '%d'%i, '%d'%(i+1), '%d'%rev_counts[r][i]])+'\n')
+                rev_handle[r].write('\t'.join(
+                    [cname,
+                     '%d' % i,
+                     '%d' % (i + 1),
+                     '%d' % rev_counts[r][i]]) + '\n')
 
-        print "completed %s"%cname
+        print("completed %s" % cname)
 
     sam_handle.close()
     for r in utils.READ_LENGTHS:
@@ -179,16 +197,20 @@ def convert_riboseq(options):
         pipe = subprocess.Popen("%s -f -b 2 -e 3 -0 %s.%d.gz"%(options.tabix, rev_count_file, r), \
             stdout=subprocess.PIPE, shell=True)
         stdout = pipe.communicate()[0]
-        print "Compressed file with ribosome footprint counts on forward strand is %s"%(fwd_count_file+'.%d.gz'%r)
-        print "Compressed file with ribosome footprint counts on reverse strand is %s"%(rev_count_file+'.%d.gz'%r)
+        print(
+            "Compressed file with ribosome footprint counts on forward strand is %s"
+            % (fwd_count_file + '.%d.gz' % r))
+        print(
+            "Compressed file with ribosome footprint counts on reverse strand is %s"
+            % (rev_count_file + '.%d.gz' % r))
+
 
-if __name__=="__main__":
+if __name__ == "__main__":
 
     options = parse_args()
 
-    if options.dtype=="rnaseq":
+    if options.dtype == "rnaseq":
         convert_rnaseq(options)
 
-    elif options.dtype=="riboseq":
+    elif options.dtype == "riboseq":
         convert_riboseq(options)
-
diff --git a/compute_mappability.py b/compute_mappability.py
@@ -1,3 +1,5 @@
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
 import numpy as np
 import subprocess
 import pysam
@@ -6,17 +8,19 @@
 
 MIN_MAP_QUAL = 10
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description=" convert bam data format to tabix data format, "
-                                     " for ribosome profiling and RNA-seq data ")
+    parser = argparse.ArgumentParser(
+        description=" convert bam data format to tabix data format, "
+        " for ribosome profiling and RNA-seq data ")
 
-    parser.add_argument("bam_file",
-                        action="store",
-                        help="path to bam input file")
+    parser.add_argument(
+        "bam_file", action="store", help="path to bam input file")
 
-    parser.add_argument("mappability_file_prefix",
-                        action="store",
-                        help="prefix of mappability file")
+    parser.add_argument(
+        "mappability_file_prefix",
+        action="store",
+        help="prefix of mappability file")
 
     options = parser.parse_args()
 
@@ -25,8 +29,8 @@ def parse_args():
 
     return options
 
-def which(program):
 
+def which(program):
     def is_exe(fpath):
         return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
 
@@ -43,14 +47,15 @@ def is_exe(fpath):
 
     return None
 
+
 def compute_mappability(options):
 
     # file names and handles
     map_file = options.mappability_file_prefix
     map_handle = open(map_file, 'w')
     sam_handle = pysam.Samfile(options.bam_file, "rb")
 
-    for cname,clen in zip(sam_handle.references,sam_handle.lengths):
+    for cname, clen in zip(sam_handle.references, sam_handle.lengths):
 
         # fetch reads in chromosome
         sam_iter = sam_handle.fetch(reference=cname)
@@ -70,32 +75,34 @@ def compute_mappability(options):
             if not read.is_reverse:
                 mapped_site = int(read.positions[0])
                 true_chrom, true_site = read.query_name.split(':')[:2]
-                if read.reference_name==true_chrom and mapped_site==int(true_site):
+                if read.reference_name == true_chrom and mapped_site == int(
+                        true_site):
                     mappable_positions.append(mapped_site)
 
-        if len(mappable_positions)>0:
+        if len(mappable_positions) > 0:
 
             # get boundaries of mappable portions of the genome
             mappable_positions = np.sort(mappable_positions)
 
-            boundaries = mappable_positions[:-1]-mappable_positions[1:]
-            indices = np.where(boundaries<-1)[0]
-            ends = (mappable_positions[indices]+1).tolist()
+            boundaries = mappable_positions[:-1] - mappable_positions[1:]
+            indices = np.where(boundaries < -1)[0]
+            ends = (mappable_positions[indices] + 1).tolist()
             try:
-                ends.append(mappable_positions[-1]+1)
+                ends.append(mappable_positions[-1] + 1)
             except IndexError:
                 pdb.set_trace()
 
-            boundaries = mappable_positions[1:]-mappable_positions[:-1]
-            indices = np.where(boundaries>1)[0]+1
+            boundaries = mappable_positions[1:] - mappable_positions[:-1]
+            indices = np.where(boundaries > 1)[0] + 1
             starts = mappable_positions[indices].tolist()
-            starts.insert(0,mappable_positions[0])
+            starts.insert(0, mappable_positions[0])
 
             # write to file
-            for start,end in zip(starts,ends):
-                map_handle.write('\t'.join([cname, '%d'%start, '%d'%end])+'\n')
+            for start, end in zip(starts, ends):
+                map_handle.write('\t'.join(
+                    [cname, '%d' % start, '%d' % end]) + '\n')
 
-        print "completed %s"%cname
+        print("completed %s" % cname)
 
     sam_handle.close()
     map_handle.close()
@@ -110,9 +117,11 @@ def compute_mappability(options):
         stdout=subprocess.PIPE, shell=True)
     stdout = pipe.communicate()[0]
 
-    print "completed computing mappability from BAM file %s"%options.bam_file
+    print(
+        "completed computing mappability from BAM file %s" % options.bam_file)
+
 
-if __name__=="__main__":
+if __name__ == "__main__":
 
     options = parse_args()