Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add customizable samtools exclude flag #503

Merged
merged 1 commit into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,9 +819,9 @@ def process_bam(bam_filename, bam_chr_loc, output_bam, variantCache, ref_names,
crispresso_cmd_to_write = ' '.join(sys.argv)
sam_out.write('@PG\tID:crispresso2\tPN:crispresso2\tVN:'+CRISPRessoShared.__version__+'\tCL:"'+crispresso_cmd_to_write+'"\n')
if bam_chr_loc != "":
proc = sb.Popen(['samtools', 'view', bam_filename, bam_chr_loc], stdout=sb.PIPE, encoding='utf-8')
proc = sb.Popen(['samtools', 'view', '-F', args.samtools_exclude_flags, bam_filename, bam_chr_loc], stdout=sb.PIPE, encoding='utf-8')
else:
proc = sb.Popen(['samtools', 'view', bam_filename], stdout=sb.PIPE, encoding='utf-8')
proc = sb.Popen(['samtools', 'view', '-F', args.samtools_exclude_flags, bam_filename], stdout=sb.PIPE, encoding='utf-8')
num_reads = 0

# Reading through the bam file and enriching variantCache as a dictionary with the following:
Expand Down
37 changes: 24 additions & 13 deletions CRISPResso2/CRISPRessoPooledCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,22 @@ def get_n_reads_bam(bam_filename):
p = sb.Popen("samtools view -c %s" % bam_filename, shell=True, stdout=sb.PIPE)
return int(p.communicate()[0])

def get_n_aligned_bam(bam_filename):
p = sb.Popen("samtools view -F 0x904 -c %s" % bam_filename, shell=True, stdout=sb.PIPE)
def calculate_aligned_samtools_exclude_flags(samtools_exclude_flags):
"""Calculate the samtools exclude flags for aligned reads.

This function calculates the samtools exclude flags for aligned reads
by filtering 0x900 (not primary alignment and supplementary alignment)
and also including any other user specified filters.
"""
samtools_exclude_flags = int(samtools_exclude_flags, base=16)
return hex(0x900 | samtools_exclude_flags)

def get_n_aligned_bam(bam_filename, samtools_exclude_flags):
p = sb.Popen(f"samtools view -F {calculate_aligned_samtools_exclude_flags(samtools_exclude_flags)} -c {bam_filename}", shell=True, stdout=sb.PIPE)
return int(p.communicate()[0])

def get_n_aligned_bam_region(bam_filename, chr_name, chr_start, chr_end):
p = sb.Popen("samtools view -F 0x904 -c %s %s:%d-%d" %(bam_filename, chr_name, chr_start, chr_end), shell=True, stdout=sb.PIPE)
def get_n_aligned_bam_region(bam_filename, chr_name, chr_start, chr_end, samtools_exclude_flags):
p = sb.Popen(f"samtools view -F {calculate_aligned_samtools_exclude_flags(samtools_exclude_flags)} -c {bam_filename} {chr_name}:{chr_start}-{chr_end}", shell=True, stdout=sb.PIPE)
return int(p.communicate()[0])

def find_overlapping_genes(row, df_genes):
Expand Down Expand Up @@ -786,12 +796,13 @@ def main():
info('Alignment command: ' + aligner_command, {'percent_complete': 15})
sb.call(aligner_command, shell=True)

N_READS_ALIGNED = get_n_aligned_bam(bam_filename_amplicons)
N_READS_ALIGNED = get_n_aligned_bam(bam_filename_amplicons, args.samtools_exclude_flags)

if args.limit_open_files_for_demux:
bam_iter = CRISPRessoShared.get_command_output(
'(samtools sort {bam_file} | samtools view -F 4) 2>> {log_file}'.format(
'(samtools sort {bam_file} | samtools view -F {samtools_exclude_flags}) 2>> {log_file}'.format(
bam_file=bam_filename_amplicons,
samtools_exclude_flags=args.samtools_exclude_flags,
log_file=log_filename,
),
)
Expand Down Expand Up @@ -824,7 +835,7 @@ def main():
if curr_file is not None:
curr_file.close()
else:
s1 = r"samtools view -F 4 %s 2>>%s | grep -v ^'@'" % (bam_filename_amplicons,log_filename)
s1 = rf"samtools view -F {args.samtools_exclude_flags} {bam_filename_amplicons} 2>>{log_filename} | grep -v ^'@'"
s2 = r'''|awk '{ gzip_filename=sprintf("gzip >> OUTPUTPATH%s.fastq.gz",$3);\
print "@"$1"\n"$10"\n+\n"$11 | gzip_filename;}' '''

Expand Down Expand Up @@ -1014,7 +1025,7 @@ def rreplace(s, old, new):
info('Index file for input .bam file does not exist. Generating bam index file.')
sb.call('samtools index %s' % bam_filename_genome, shell=True)

N_READS_ALIGNED = get_n_aligned_bam(bam_filename_genome)
N_READS_ALIGNED = get_n_aligned_bam(bam_filename_genome, args.samtools_exclude_flags)
# save progress up to this point
crispresso2_info['running_info']['finished_steps']['n_reads_aligned_genome'] = N_READS_ALIGNED
CRISPRessoShared.write_crispresso_info(
Expand All @@ -1031,7 +1042,7 @@ def rreplace(s, old, new):

sb.call('samtools index %s' % bam_filename_genome, shell=True)

N_READS_ALIGNED = get_n_aligned_bam(bam_filename_genome)
N_READS_ALIGNED = get_n_aligned_bam(bam_filename_genome, args.samtools_exclude_flags)

# save progress up to this point
crispresso2_info['running_info']['finished_steps']['n_reads_aligned_genome'] = N_READS_ALIGNED
Expand Down Expand Up @@ -1061,7 +1072,7 @@ def rreplace(s, old, new):

# if we should only demultiplex where amplicons aligned... (as opposed to the whole genome)
if RUNNING_MODE=='AMPLICONS_AND_GENOME' and not args.demultiplex_genome_wide:
s1 = r'''samtools view -F 0x0004 %s __REGIONCHR__:__REGIONSTART__-__REGIONEND__ 2>>%s |''' % (bam_filename_genome, log_filename)+\
s1 = rf'''samtools view -F {args.samtools_exclude_flags} {bam_filename_genome} __REGIONCHR__:__REGIONSTART__-__REGIONEND__ 2>>{log_filename} |''' +\
r'''awk 'BEGIN{OFS="\t";num_records=0;fastq_filename="__OUTPUTPATH__REGION___REGIONCHR_____REGIONSTART_____REGIONEND__.fastq";} \
{ \
print "@"$1"\n"$10"\n+\n"$11 > fastq_filename; \
Expand Down Expand Up @@ -1093,7 +1104,7 @@ def rreplace(s, old, new):
else:
# next, create the general demux command
# variables like __CHR__ will be subbed out below for each iteration
s1 = r'''samtools view -F 0x0004 %s __CHR____REGION__ 2>>%s |''' % (bam_filename_genome, log_filename) + \
s1 = rf'''samtools view -F {args.samtools_exclude_flags} {bam_filename_genome} __CHR____REGION__ 2>>{log_filename} |''' + \
r'''awk 'BEGIN {OFS="\t"} {bpstart=$4; bpend=bpstart; split ($6,a,"[MIDNSHP]"); n=0;\
for (i=1; i in a; i++){\
n+=1+length(a[i]);\
Expand Down Expand Up @@ -1166,13 +1177,13 @@ def rreplace(s, old, new):
curr_end = curr_pos + chr_step_size
while curr_end < chr_len:
# make sure there aren't any reads at this breakpoint
n_reads_at_end = get_n_aligned_bam_region(bam_filename_genome, chr_str, curr_end-5, curr_end+5)
n_reads_at_end = get_n_aligned_bam_region(bam_filename_genome, chr_str, curr_end-5, curr_end+5, args.samtools_exclude_flags)
while n_reads_at_end > 0:
curr_end += 500 # look for another place with no reads
if curr_end >= chr_len:
curr_end = chr_len
break
n_reads_at_end = get_n_aligned_bam_region(bam_filename_genome, chr_str, curr_end-5, curr_end+5)
n_reads_at_end = get_n_aligned_bam_region(bam_filename_genome, chr_str, curr_end-5, curr_end+5, args.samtools_exclude_flags)

chr_output_filename = _jp('MAPPED_REGIONS/%s_%s_%s.info' % (chr_str, curr_pos, curr_end))
sub_chr_command = chr_cmd.replace("__REGION__", ":%d-%d "%(curr_pos, curr_end)).replace("__DEMUX_CHR_LOGFILENAME__",chr_output_filename)
Expand Down
11 changes: 6 additions & 5 deletions CRISPResso2/CRISPRessoWGSCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def get_n_reads_fastq(fastq_filename):
n_reads = int(float(p.communicate()[0])/4.0)
return n_reads

def extract_reads(row):
def extract_reads(row, samtools_exclude_flags):
if row.sequence:
#create place-holder fastq files
open(row.fastq_file_trimmed_reads_in_region, 'w+').close()
Expand All @@ -217,7 +217,7 @@ def extract_reads(row):

info('Extracting reads in:%s and creating .bam file: %s' % (region, row.bam_file_with_reads_in_region))

cmd=r'''samtools view -b -F 4 --reference %s %s %s > %s ''' % (row.reference_file, row.original_bam, region, row.bam_file_with_reads_in_region)
cmd = rf'''samtools view -b -F {samtools_exclude_flags} --reference {row.reference_file} {row.original_bam} {region} > {row.bam_file_with_reads_in_region} '''
sb.call(cmd, shell=True)

cmd=r'''samtools index %s ''' % (row.bam_file_with_reads_in_region)
Expand All @@ -232,10 +232,10 @@ def extract_reads(row):

return row

def extract_reads_chunk(df):
def extract_reads_chunk(df, samtools_exclude_flags):
new_df = pd.DataFrame(columns=df.columns)
for i in range(len(df)):
new_df.loc[i] = extract_reads(df.iloc[i].copy())
new_df.loc[i] = extract_reads(df.iloc[i].copy(), samtools_exclude_flags)
new_df.set_index(df.index,inplace=True)
return new_df

Expand Down Expand Up @@ -577,7 +577,8 @@ def set_filenames(row):

else:
#run region extraction here
df_regions = CRISPRessoMultiProcessing.run_pandas_apply_parallel(df_regions, extract_reads_chunk, n_processes_for_wgs)
extract_reads_chunk_partial = lambda x: extract_reads_chunk(x, args.samtools_exclude_flags)
df_regions = CRISPRessoMultiProcessing.run_pandas_apply_parallel(df_regions, extract_reads_chunk_partial, n_processes_for_wgs)
df_regions.sort_values('region_number', inplace=True)
cols_to_print = ["chr_id", "bpstart", "bpend", "sgRNA", "Expected_HDR", "Coding_sequence", "sequence", "n_reads", "bam_file_with_reads_in_region", "fastq_file_trimmed_reads_in_region"]
if args.gene_annotations:
Expand Down
14 changes: 14 additions & 0 deletions CRISPResso2/args.json
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,20 @@
"default": "None",
"tools": ["Core", "Batch", "Pooled", "WGS"]
},
"samtools_exclude_flags": {
"keys": ["--samtools_exclude_flags"],
"help": "Exclude reads with any of the specified flags set in the SAM/BAM file. Flags can be specified in either base 16 (hex) or base 10. Default is 4 (read unmapped).",
"type": "str",
"default": "4",
"tools": ["Pooled", "WGS"]
},
"samtools_exclude_flags_core": {
"keys": ["--samtools_exclude_flags"],
"help": "Exclude reads with any of the specified flags set in the SAM/BAM file. Flags can be specified in either base 16 (hex) or base 10. Default is 0 (no reads filtered).",
"type": "str",
"default": "0",
"tools": ["Core"]
},
"stringent_flash_merging": {
"keys": ["--stringent_flash_merging"],
"help": "DEPRECATED in v2.3.0",
Expand Down
17 changes: 17 additions & 0 deletions tests/unit_tests/test_CRISPRessoPooledCORE.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from CRISPResso2 import CRISPRessoPooledCORE


def test_calculate_aligned_samtools_exclude_flags():
assert CRISPRessoPooledCORE.calculate_aligned_samtools_exclude_flags('0') == hex(0x900)


def test_calculate_aligned_samtools_exclude_flags_4():
assert CRISPRessoPooledCORE.calculate_aligned_samtools_exclude_flags('4') == hex(0x904)


def test_calculate_aligned_samtools_exclude_flags_9():
assert CRISPRessoPooledCORE.calculate_aligned_samtools_exclude_flags('9') == hex(0x909)


def test_calculate_aligned_samtools_exclude_flags_0x100():
assert CRISPRessoPooledCORE.calculate_aligned_samtools_exclude_flags('0x100') == hex(0x900)
Loading