From 11811ea60b8c21815657fea7734441cfdc008879 Mon Sep 17 00:00:00 2001 From: taranewman Date: Thu, 24 Oct 2024 18:22:40 -0700 Subject: [PATCH] add resistance gene coverage per drug --- bin/add_gene_coverage_to_res_csv.py | 97 +++++++++++++++++++++++++++++ main.nf | 2 +- modules/tbprofiler.nf | 12 +++- nextflow.config | 1 + 4 files changed, 110 insertions(+), 2 deletions(-) create mode 100755 bin/add_gene_coverage_to_res_csv.py diff --git a/bin/add_gene_coverage_to_res_csv.py b/bin/add_gene_coverage_to_res_csv.py new file mode 100755 index 0000000..e530216 --- /dev/null +++ b/bin/add_gene_coverage_to_res_csv.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +import csv +import argparse +from pathlib import Path + +def process_coverage(coverage_file: Path, threshold: float) -> dict: + """ + Calculate gene coverage metric info from resistance gene coverage csv + + :param coverage_file: Path to the gene coverage CSV file. + :type coverage_file: pathlib.Path + :param threshold: The percentage threshold to consider a gene with complete coverage (default 100%). + :type threshold: float + :return: Dictionary wtih number of genes and genes with complete coverage above threshold + :rtype: dict + """ + coverage_data = {} + + with coverage_file.open('r') as f: + reader = csv.DictReader(f) + for row in reader: + drug = row['drug'] + percent_coverage = float(row['percent_of_gene_covered_above_depth_threshold']) + + if drug not in coverage_data: + coverage_data[drug] = { + 'total_genes': 0, + 'fully_covered_genes': 0 + } + + coverage_data[drug]['total_genes'] += 1 + if percent_coverage >= threshold: + coverage_data[drug]['fully_covered_genes'] += 1 + + return coverage_data + +def calculate_metrics(coverage_data: dict) -> dict: + """ + Calculate coverage metrics for each drug based on gene coverage data. + + :param coverage_data: A dictionary containing total and fully covered genes for each drug. + :type coverage_data: dict + :return: A dictionary with calculated metrics including the number of genes assessed, + the number of genes with complete coverage, and whether all genes have full coverage. + :rtype: dict + """ + metrics = {} + for drug, data in coverage_data.items(): + num_genes_assessed = data['total_genes'] + num_genes_with_complete_coverage = data['fully_covered_genes'] + all_genes_with_complete_coverage = num_genes_assessed == num_genes_with_complete_coverage + + metrics[drug] = { + 'num_genes_assessed': num_genes_assessed, + 'num_genes_with_complete_coverage': num_genes_with_complete_coverage, + 'all_genes_with_complete_coverage': all_genes_with_complete_coverage + } + + return metrics + +def main(args: argparse.Namespace) -> None: + # Process the coverage data to get gene coverage metrics + threshold = args.threshold + coverage_data = process_coverage(args.coverage, threshold) + coverage_metrics = calculate_metrics(coverage_data) + + # Add metrics to resistance file + output_file = args.output + with args.resistance.open('r') as res_file, output_file.open('w', newline='') as out_file: + res_reader = csv.DictReader(res_file) + fieldnames = res_reader.fieldnames + ['all_genes_with_complete_coverage', 'num_genes_assessed', 'num_genes_with_complete_coverage'] + writer = csv.DictWriter(out_file, fieldnames=fieldnames) + writer.writeheader() + + for row in res_reader: + drug = row['drug'].lower() + if drug in coverage_metrics: + metrics = coverage_metrics[drug] + row['all_genes_with_complete_coverage'] = metrics['all_genes_with_complete_coverage'] + row['num_genes_assessed'] = metrics['num_genes_assessed'] + row['num_genes_with_complete_coverage'] = metrics['num_genes_with_complete_coverage'] + else: + row['all_genes_with_complete_coverage'] = 'NA' + row['num_genes_assessed'] = 'NA' + row['num_genes_with_complete_coverage'] = 'NA' + + writer.writerow(row) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Calculate resistance gene coverage metrics.') + parser.add_argument('--resistance', required=True, type=Path, help='Path for resistance CSV file.') + parser.add_argument('--coverage', required=True, type=Path, help='Path for gene coverage CSV file.') + parser.add_argument('--threshold', type=float, default=100.0, help='Threshold for complete gene coverage (default: 100).') + parser.add_argument('--output', required=True, type=Path, help='Output CSV file name.') + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/main.nf b/main.nf index a62eb60..0cc46e3 100644 --- a/main.nf +++ b/main.nf @@ -75,7 +75,7 @@ workflow { generate_low_coverage_bed(ch_depths) - calculate_gene_coverage(ch_depths.combine(ch_resistance_genes_bed)) + calculate_gene_coverage(ch_depths.combine(ch_resistance_genes_bed).join(tbprofiler.out.resistance_csv)) if (params.collect_outputs) { fastp.out.csv.map{ it -> it[1] }.collectFile( diff --git a/modules/tbprofiler.nf b/modules/tbprofiler.nf index 3fd0f6b..5e2fdc6 100644 --- a/modules/tbprofiler.nf +++ b/modules/tbprofiler.nf @@ -345,12 +345,14 @@ process calculate_gene_coverage { tag { sample_id } publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_resistance_gene_coverage.csv" + publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_resistance_drug_coverage.csv" input: - tuple val(sample_id), path(depths), path(resistance_genes_bed) + tuple val(sample_id), path(depths), path(resistance_genes_bed), path(resistance_csv) output: tuple val(sample_id), path("${sample_id}_resistance_gene_coverage.csv") + tuple val(sample_id), path("${sample_id}_resistance_drug_coverage.csv") script: """ @@ -359,5 +361,13 @@ process calculate_gene_coverage { --depth ${depths} \ --threshold ${params.min_depth} \ --output ${sample_id}_resistance_gene_coverage.csv + + add_gene_coverage_to_res_csv.py \ + --resistance ${resistance_csv} \ + --coverage ${sample_id}_resistance_gene_coverage.csv \ + --threshold ${params.min_gene_coverage} \ + --output ${sample_id}_resistance_drug_coverage.csv + + """ } diff --git a/nextflow.config b/nextflow.config index c18f920..ff7d14d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,6 +28,7 @@ params { pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version')) collect_outputs = false collected_outputs_prefix = 'collected' + min_gene_coverage = 100.0 } def makeFastqSearchPath ( illumina_suffixes, fastq_exts ) {