Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fixes and Python 3 compatiblity #4

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 56 additions & 34 deletions bam_to_tbi.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import numpy as np
import pysam
import subprocess
Expand All @@ -7,18 +9,20 @@

MIN_MAP_QUAL = 10


def parse_args():
parser = argparse.ArgumentParser(description=" convert bam data format to bigWig data format, "
" for ribosome profiling and RNA-seq data ")
parser = argparse.ArgumentParser(
description=" convert bam data format to bigWig data format, "
" for ribosome profiling and RNA-seq data ")

parser.add_argument("--dtype",
choices=("rnaseq","riboseq"),
default="riboseq",
help="specifies the type of assay (default: riboseq)")
parser.add_argument(
"--dtype",
choices=("rnaseq", "riboseq"),
default="riboseq",
help="specifies the type of assay (default: riboseq)")

parser.add_argument("bam_file",
action="store",
help="path to bam input file")
parser.add_argument(
"bam_file", action="store", help="path to bam input file")

options = parser.parse_args()

Expand All @@ -27,8 +31,8 @@ def parse_args():

return options

def which(program):

def which(program):
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

Expand All @@ -45,14 +49,15 @@ def is_exe(fpath):

return None


def convert_rnaseq(options):

# file names and handles
count_file = os.path.splitext(options.bam_file)[0]
sam_handle = pysam.Samfile(options.bam_file, "rb")
count_handle = open(count_file, 'w')

for cname,clen in zip(sam_handle.references,sam_handle.lengths):
for cname, clen in zip(sam_handle.references, sam_handle.lengths):

# fetch reads in chromosome
sam_iter = sam_handle.fetch(reference=cname)
Expand Down Expand Up @@ -80,11 +85,14 @@ def convert_rnaseq(options):
counts[site] = 1

# write counts to output file
indices = np.sort(counts.keys())
indices = np.sort(list(counts.keys()))
for i in indices:
count_handle.write('\t'.join([cname,'%d'%i,'%d'%(i+1),'%d'%counts[i]])+'\n')
count_handle.write('\t'.join(
[cname, '%d' %
i, '%d' %
(i + 1), '%d' % counts[i]]) + '\n')

print "completed %s"%cname
print("completed %s" % cname)

sam_handle.close()
count_handle.close()
Expand All @@ -98,26 +106,28 @@ def convert_rnaseq(options):
pipe = subprocess.Popen("%s -f -b 2 -e 3 -0 %s.gz"%(options.tabix, count_file), \
stdout=subprocess.PIPE, shell=True)
stdout = pipe.communicate()[0]
print "Compressed file with RNA-seq counts is %s"%(count_file+'.gz')
print("Compressed file with RNA-seq counts is %s" % (count_file + '.gz'))


def convert_riboseq(options):

# file names and handles
fwd_count_file = os.path.splitext(options.bam_file)[0]+'_fwd'
rev_count_file = os.path.splitext(options.bam_file)[0]+'_rev'
fwd_count_file = os.path.splitext(options.bam_file)[0] + '_fwd'
rev_count_file = os.path.splitext(options.bam_file)[0] + '_rev'
sam_handle = pysam.Samfile(options.bam_file, "rb")
fwd_handle = dict([(r,open(fwd_count_file+'.%d'%r, 'w')) for r in utils.READ_LENGTHS])
rev_handle = dict([(r,open(rev_count_file+'.%d'%r, 'w')) for r in utils.READ_LENGTHS])
fwd_handle = dict([(r, open(fwd_count_file + '.%d' % r, 'w'))
for r in utils.READ_LENGTHS])
rev_handle = dict([(r, open(rev_count_file + '.%d' % r, 'w'))
for r in utils.READ_LENGTHS])

for cname,clen in zip(sam_handle.references,sam_handle.lengths):
for cname, clen in zip(sam_handle.references, sam_handle.lengths):

# fetch reads in chromosome
sam_iter = sam_handle.fetch(reference=cname)

# initialize count arrays
fwd_counts = dict([(r,dict()) for r in utils.READ_LENGTHS])
rev_counts = dict([(r,dict()) for r in utils.READ_LENGTHS])
fwd_counts = dict([(r, dict()) for r in utils.READ_LENGTHS])
rev_counts = dict([(r, dict()) for r in utils.READ_LENGTHS])
for read in sam_iter:

# skip reads not of the appropriate length
Expand All @@ -131,7 +141,7 @@ def convert_riboseq(options):
# skip read, if mapping quality is low
if read.mapq < MIN_MAP_QUAL:
continue

if read.is_reverse:
asite = int(read.positions[-13])
try:
Expand All @@ -147,15 +157,23 @@ def convert_riboseq(options):

# write counts to output files
for r in utils.READ_LENGTHS:
indices = np.sort(fwd_counts[r].keys())
indices = np.sort(list(fwd_counts[r].keys()))
for i in indices:
fwd_handle[r].write('\t'.join([cname, '%d'%i, '%d'%(i+1), '%d'%fwd_counts[r][i]])+'\n')
fwd_handle[r].write('\t'.join(
[cname,
'%d' % i,
'%d' % (i + 1),
'%d' % fwd_counts[r][i]]) + '\n')

indices = np.sort(rev_counts[r].keys())
indices = np.sort(list(rev_counts[r].keys()))
for i in indices:
rev_handle[r].write('\t'.join([cname, '%d'%i, '%d'%(i+1), '%d'%rev_counts[r][i]])+'\n')
rev_handle[r].write('\t'.join(
[cname,
'%d' % i,
'%d' % (i + 1),
'%d' % rev_counts[r][i]]) + '\n')

print "completed %s"%cname
print("completed %s" % cname)

sam_handle.close()
for r in utils.READ_LENGTHS:
Expand All @@ -179,16 +197,20 @@ def convert_riboseq(options):
pipe = subprocess.Popen("%s -f -b 2 -e 3 -0 %s.%d.gz"%(options.tabix, rev_count_file, r), \
stdout=subprocess.PIPE, shell=True)
stdout = pipe.communicate()[0]
print "Compressed file with ribosome footprint counts on forward strand is %s"%(fwd_count_file+'.%d.gz'%r)
print "Compressed file with ribosome footprint counts on reverse strand is %s"%(rev_count_file+'.%d.gz'%r)
print(
"Compressed file with ribosome footprint counts on forward strand is %s"
% (fwd_count_file + '.%d.gz' % r))
print(
"Compressed file with ribosome footprint counts on reverse strand is %s"
% (rev_count_file + '.%d.gz' % r))


if __name__=="__main__":
if __name__ == "__main__":

options = parse_args()

if options.dtype=="rnaseq":
if options.dtype == "rnaseq":
convert_rnaseq(options)

elif options.dtype=="riboseq":
elif options.dtype == "riboseq":
convert_riboseq(options)

57 changes: 33 additions & 24 deletions compute_mappability.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import numpy as np
import subprocess
import pysam
Expand All @@ -6,17 +8,19 @@

MIN_MAP_QUAL = 10


def parse_args():
parser = argparse.ArgumentParser(description=" convert bam data format to tabix data format, "
" for ribosome profiling and RNA-seq data ")
parser = argparse.ArgumentParser(
description=" convert bam data format to tabix data format, "
" for ribosome profiling and RNA-seq data ")

parser.add_argument("bam_file",
action="store",
help="path to bam input file")
parser.add_argument(
"bam_file", action="store", help="path to bam input file")

parser.add_argument("mappability_file_prefix",
action="store",
help="prefix of mappability file")
parser.add_argument(
"mappability_file_prefix",
action="store",
help="prefix of mappability file")

options = parser.parse_args()

Expand All @@ -25,8 +29,8 @@ def parse_args():

return options

def which(program):

def which(program):
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

Expand All @@ -43,14 +47,15 @@ def is_exe(fpath):

return None


def compute_mappability(options):

# file names and handles
map_file = options.mappability_file_prefix
map_handle = open(map_file, 'w')
sam_handle = pysam.Samfile(options.bam_file, "rb")

for cname,clen in zip(sam_handle.references,sam_handle.lengths):
for cname, clen in zip(sam_handle.references, sam_handle.lengths):

# fetch reads in chromosome
sam_iter = sam_handle.fetch(reference=cname)
Expand All @@ -70,32 +75,34 @@ def compute_mappability(options):
if not read.is_reverse:
mapped_site = int(read.positions[0])
true_chrom, true_site = read.query_name.split(':')[:2]
if read.reference_name==true_chrom and mapped_site==int(true_site):
if read.reference_name == true_chrom and mapped_site == int(
true_site):
mappable_positions.append(mapped_site)

if len(mappable_positions)>0:
if len(mappable_positions) > 0:

# get boundaries of mappable portions of the genome
mappable_positions = np.sort(mappable_positions)

boundaries = mappable_positions[:-1]-mappable_positions[1:]
indices = np.where(boundaries<-1)[0]
ends = (mappable_positions[indices]+1).tolist()
boundaries = mappable_positions[:-1] - mappable_positions[1:]
indices = np.where(boundaries < -1)[0]
ends = (mappable_positions[indices] + 1).tolist()
try:
ends.append(mappable_positions[-1]+1)
ends.append(mappable_positions[-1] + 1)
except IndexError:
pdb.set_trace()

boundaries = mappable_positions[1:]-mappable_positions[:-1]
indices = np.where(boundaries>1)[0]+1
boundaries = mappable_positions[1:] - mappable_positions[:-1]
indices = np.where(boundaries > 1)[0] + 1
starts = mappable_positions[indices].tolist()
starts.insert(0,mappable_positions[0])
starts.insert(0, mappable_positions[0])

# write to file
for start,end in zip(starts,ends):
map_handle.write('\t'.join([cname, '%d'%start, '%d'%end])+'\n')
for start, end in zip(starts, ends):
map_handle.write('\t'.join(
[cname, '%d' % start, '%d' % end]) + '\n')

print "completed %s"%cname
print("completed %s" % cname)

sam_handle.close()
map_handle.close()
Expand All @@ -110,9 +117,11 @@ def compute_mappability(options):
stdout=subprocess.PIPE, shell=True)
stdout = pipe.communicate()[0]

print "completed computing mappability from BAM file %s"%options.bam_file
print(
"completed computing mappability from BAM file %s" % options.bam_file)


if __name__=="__main__":
if __name__ == "__main__":

options = parse_args()

Expand Down
Loading