Merge pull request #173 from nf-core/microshifts
Add parameter for grouping detected circRNAs that are very close
nictru authored Sep 26, 2024
2 parents edced9e + 44caaf3 commit f96729c
Showing 15 changed files with 232 additions and 247 deletions.
2 changes: 1 addition & 1 deletion conf/full.config
@@ -13,5 +13,5 @@

params {
    tools = 'circexplorer2,ciriquant,find_circ,circrna_finder,mapsplice,dcc,segemehl'
-    tool_filter = 2
+    min_tools = 2
}
78 changes: 17 additions & 61 deletions conf/modules.config
@@ -596,63 +596,6 @@ process {
        ]
    }

-    withName: 'MASK_SCORES' {
-        // Take bed file and replace the score column with a dot
-        ext.args = "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \$4, \".\", \$6 }'"
-        ext.suffix = {"${meta.tool}.masked.bed"}
-        publishDir = [
-            path: { "${params.outdir}/bsj_detection/tools/${meta.tool}/masked" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null }
-        ]
-    }
-
-    withName: 'CONCAT_TOOLS_PER_SAMPLE' {
-        // GNU sort by columns 1,2,3,4,6
-        ext.args = "-k1,1 -k2,2n -k3,3n -k4,4 -k6,6"
-        ext.suffix = {"sorted.bed"}
-        publishDir = [
-            path: { "${params.outdir}/bsj_detection/samples/${meta.id}" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> null } // The same data will be published in COUNT_TOOLS in a better format
-        ]
-    }
-
-    withName: 'COUNT_TOOLS' {
-        // Count the number of tools that support each circRNA
-        ext.summary_col = 5
-        ext.args = "-g 1,2,3,4,6 -o count"
-        ext.suffix = {"tool_counts.bed"}
-        publishDir = [
-            path: { "${params.outdir}/bsj_detection/samples/${meta.id}" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null }
-        ]
-    }
-
-    withName: 'FILTER_MIN_TOOLS' {
-        // Keep only rows with at least the minimum number of tools
-        // Replace the score column with a dot
-        ext.args = { "-v FS='\\t' -v OFS='\\t' '{ if (\$6 >= ${params.tool_filter}) { print \$1, \$2, \$3, \$4, \".\", \$5 } }'" }
-        ext.suffix = "filtered.bed"
-        publishDir = [
-            path: { "${params.outdir}/bsj_detection/samples/${meta.id}" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
-    withName: 'CONCAT_SAMPLES' {
-        // GNU sort by columns 1,2,3,4,6
-        ext.args = "-k1,1 -k2,2n -k3,3n -k4,4 -k6,6 -u"
-        ext.suffix = {"combined.bed"}
-        publishDir = [
-            path: { "${params.outdir}/bsj_detection/combined" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
    withName: 'EXTRACT_COUNTS' {
        // Add meta.id as header
        // Keep columns 4,5
@@ -673,17 +616,30 @@ process {
        ]
    }

-    withName: UPSET_SAMPLES {
-        ext.when = { params.tools.split(',').length > 1 }
+    withName: 'BED_ADD_SAMPLE_TOOL' {
+        ext.args = { "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \$4, \$5, \$6, \"${meta.id}\", \"${meta.tool}\" }'" }
+        ext.prefix = { "${meta.id}_${meta.tool}" }
+        ext.suffix = { "meta.bed" }
+        publishDir = [
+            path: { "${params.outdir}/bsj_detection/tools/${meta.tool}/meta" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+    withName: 'COMBINE_TOOLS_PER_SAMPLE' {
+        ext.suffix = "combined.bed"
+
        publishDir = [
            path: { "${params.outdir}/bsj_detection/samples/${meta.id}" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

-    withName: UPSET_ALL {
-        ext.when = { params.tools.split(',').length > 1 }
+    withName: 'COMBINE_SAMPLES' {
+        ext.suffix = "combined.bed"
+
        publishDir = [
            path: { "${params.outdir}/bsj_detection/combined" },
            mode: params.publish_dir_mode,
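For readers who don't speak awk, the `BED_ADD_SAMPLE_TOOL` one-liner above just appends the sample and tool IDs as columns 7 and 8 of each six-column BED record. A rough Python sketch of the same transformation, with made-up file names and `sample1`/`circexplorer2` standing in for `meta.id` and `meta.tool`:

```python
# Rough Python equivalent of the BED_ADD_SAMPLE_TOOL awk call above;
# file names and metadata values are hypothetical.
with open("sample1.circexplorer2.bed") as bed_in, \
     open("sample1_circexplorer2.meta.bed", "w") as bed_out:
    for line in bed_in:
        fields = line.rstrip("\n").split("\t")  # chr, start, end, name, score, strand
        bed_out.write("\t".join(fields + ["sample1", "circexplorer2"]) + "\n")
```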
6 changes: 3 additions & 3 deletions docs/output.md
@@ -140,7 +140,7 @@ The rough workflow for the BSJ detection looks like this:
2. Bring the tool outputs into a common format.
3. Apply a threshold (parameter `bsj_reads`) to the BSJ reads to filter out lowly supported BSJs.
4. Combine all tool-specific BSJ calls per sample into a single file.
-5. Filter out BSJs that are not supported by at least as many tools as specified by `tool_filter`.
+5. Filter out BSJs that are not supported by at least as many tools as specified by `min_tools`.
6. Merge all samples into a single file. This now represents the "circular transcriptome".

### Per tool
@@ -323,7 +323,7 @@ STAR in 2-pass mode is used to identify novel splice junctions in RNA-Seq data.

- `bsj_detection/samples/${sample_id}/`
  - `*.grouped.bed`: Grouped BSJ calls in BED format. Score column represents the number of tools that support the BSJ.
-  - `*.filtered.bed`: Based on `*.grouped.bed`, but filtered for BSJs with at least `tool_filter` supporting tools.
+  - `*.filtered.bed`: Based on `*.grouped.bed`, but filtered for BSJs with at least `min_tools` supporting tools.
  - `*.intersect_gtf.bed`: Intersection of `*.filtered.bed` with the reference GTF file. Intermediate file for annotation.
  - `*.intersect_database.bed`: Intersection of `*.filtered.bed` with the database BED file. Intermediate file for annotation.
  - `*.annotated.bed`: Annotated BSJ calls in BED format, based on `*.filtered.bed`.
@@ -333,7 +333,7 @@ STAR in 2-pass mode is used to identify novel splice junctions in RNA-Seq data.

</details>

-nf-core/circrna produces a sample-specific set of BSJ calls. The BSJ calls are filtered for BSJs with at least `tool_filter` supporting tools. The filtered BSJ calls are then annotated with the reference GTF file and the database BED file. An upset plot is generated to visualise the overlap of BSJ calls across tools.
+nf-core/circrna produces a sample-specific set of BSJ calls. The BSJ calls are filtered for BSJs with at least `min_tools` supporting tools. The filtered BSJ calls are then annotated with the reference GTF file and the database BED file. An upset plot is generated to visualise the overlap of BSJ calls across tools.

### Combined

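The upset plots mentioned above come from the `upsetplot` package (see the new `combine.py` template below). A minimal, self-contained sketch of the same plotting call on invented membership lists:

```python
import matplotlib
matplotlib.use("Agg")  # render without a display, as in a pipeline job
import matplotlib.pyplot as plt
import upsetplot

# Invented toy data: for each BSJ, the list of tools that detected it.
memberships = [
    ["circexplorer2"],
    ["circexplorer2", "ciriquant"],
    ["ciriquant", "segemehl"],
    ["circexplorer2", "ciriquant", "segemehl"],
]
dataset = upsetplot.from_memberships(memberships)
upsetplot.plot(dataset, orientation="horizontal", show_counts=True, subset_size="count")
plt.savefig("tools.upset.png")
```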
2 changes: 1 addition & 1 deletion docs/usage.md
@@ -67,7 +67,7 @@ This part of the pipeline is responsible for the detection of back-splice junctions

The tools to be used can be specified using the `tools` parameter.
Each of the tools also quantifies how many reads support each BSJ. You can specify a cutoff for the minimum number of reads supporting a BSJ using the `bsj_reads` parameter.
-Additionally, the parameter `tool_filter` can be used to specify by how many tools a BSJ has to be detected to be considered a valid hit.
+Additionally, the parameter `min_tools` can be used to specify by how many tools a BSJ has to be detected to be considered a valid hit.

For instructions on how to interpret the output of this section, please check out the [output documentation](https://nf-co.re/circrna/output#bsj-detection).

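As a small illustration of what `min_tools` does downstream (toy data; the actual filter lives in the `combine.py` template below), keeping only BSJs called by at least two tools might look like:

```python
import polars as pl

min_tools = 2  # e.g. min_tools = 2, as set in conf/full.config

# Hypothetical per-BSJ tool counts; in the pipeline these come from
# grouping the per-tool BED files.
df = pl.DataFrame({
    "name":    ["chr1:100-500", "chr1:900-1400", "chr2:50-300"],
    "n_tools": [3, 1, 2],
})
print(df.filter(pl.col("n_tools") >= min_tools))  # chr1:900-1400 is dropped
```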
6 changes: 6 additions & 0 deletions modules/local/combine_beds/environment.yml
@@ -0,0 +1,6 @@
channels:
  - conda-forge
  - bioconda
dependencies:
  - conda-forge::polars=1.8.2
  - conda-forge::upsetplot=0.9.0
26 changes: 26 additions & 0 deletions modules/local/combine_beds/main.nf
@@ -0,0 +1,26 @@
process COMBINE_BEDS {
    tag "$meta.id"
    label "process_low"

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'oras://community.wave.seqera.io/library/polars_upsetplot:0fc26c37f7821606' :
        'community.wave.seqera.io/library/polars_upsetplot:3382b69d3c1f6bf1' }"

    input:
    tuple val(meta), path(beds)
    val(max_shift)
    val(min_tools)
    val(min_samples)

    output:
    tuple val(meta), path("${prefix}.${suffix}"), emit: combined
    path "*.png"        , emit: plots  , optional: true
    path "*.json"       , emit: multiqc, optional: true
    path "versions.yml" , emit: versions

    script:
    prefix = task.ext.prefix ?: "${meta.id}"
    suffix = task.ext.suffix ?: "bed"
    template "combine.py"
}
105 changes: 105 additions & 0 deletions modules/local/combine_beds/templates/combine.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python

import platform
import base64
import json

import polars as pl
import upsetplot
import matplotlib
import matplotlib.pyplot as plt

def format_yaml_like(data: dict, indent: int = 0) -> str:
    """Formats a dictionary to a YAML-like string.
    Args:
        data (dict): The dictionary to format.
        indent (int): The current indentation level.
    Returns:
        str: A string formatted as YAML.
    """
    yaml_str = ""
    for key, value in data.items():
        spaces = " " * indent
        if isinstance(value, dict):
            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
        else:
            yaml_str += f"{spaces}{key}: {value}\\n"
    return yaml_str

max_shift = int("${max_shift}")
min_tools = int("${min_tools}")
min_samples = int("${min_samples}")
meta_id = "${meta.id}"
prefix = "${prefix}"

df = pl.scan_csv("*.bed",
                 separator="\\t",
                 has_header=False,
                 new_columns=["chr", "start", "end", "name", "score", "strand", "sample", "tool"])

for col in ["end", "start"]:
df = df.sort(["chr", col])
df = df.with_columns(**{f"{col}_group": pl.col(col).diff().fill_null(0).gt(max_shift).cum_sum()})

df = (df.group_by(["chr", "start_group", "end_group"])
        .agg(pl.col("start").median().round().cast(int),
             pl.col("end").median().round().cast(int),
             pl.col("sample").unique().alias("samples"),
             pl.col("tool").unique().alias("tools"),
             pl.col("sample").n_unique().alias("n_samples"),
             pl.col("tool").n_unique().alias("n_tools"))
        .with_columns(name=pl.col("chr").cast(str) + ":" + pl.col("start").cast(str) + "-" + pl.col("end").cast(str),
                      score=pl.lit("."),
                      strand=pl.lit(".")))

for col in ["samples", "tools"]:
series = pl.Series(df.select(col).collect())
if series.explode().n_unique() == 1:
continue
memberships = series.to_list()
dataset = upsetplot.from_memberships(memberships)
upsetplot.plot(dataset,
orientation='horizontal',
show_counts=True,
subset_size="count")
plot_file = f"{prefix}_{col}.upset.png"
plt.savefig(plot_file)

image_string = base64.b64encode(open(plot_file, "rb").read()).decode("utf-8")
image_html = f'<div class="mqc-custom-content-image"><img src="data:image/png;base64,{image_string}" /></div>'

multiqc = {
'id': f"{meta_id}_upset_{col}",
'parent_id': "upset_plots",
'parent_name': 'UpSet Plots',
'parent_description': 'UpSet plots showing the overlap between tools for each sample',
'section_name': f'UpSet {col}: {meta_id} ',
'description': f'UpSet plot showing the overlap between {col} for {meta_id}',
'plot_type': 'image',
'data': image_html
}

with open(f"{prefix}_{col}.upset_mqc.json", "w") as f:
f.write(json.dumps(multiqc, indent=4))


df = (df.filter((pl.col("n_tools") >= min_tools) & (pl.col("n_samples") >= min_samples))
.select(["chr", "start", "end", "name", "score", "strand"]))

df.collect().write_csv("${prefix}.${suffix}", separator="\\t", include_header=False)

# Versions

versions = {
    "${task.process}": {
        "python": platform.python_version(),
        "polars": pl.__version__,
        "upsetplot": upsetplot.__version__,
        "matplotlib": matplotlib.__version__
    }
}

with open("versions.yml", "w") as f:
    f.write(format_yaml_like(versions))
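The heart of this commit is the `diff().fill_null(0).gt(max_shift).cum_sum()` idiom above: after sorting, a new group starts wherever the gap to the previous coordinate exceeds `max_shift`, so BSJ calls whose coordinates differ by only a few bases collapse into one group. A minimal demonstration on invented coordinates (not pipeline code):

```python
import polars as pl

max_shift = 2  # maximum coordinate shift for two BSJs to be grouped

# Invented starts: 100/101 are microshifted calls of one junction, 500/502 of another.
df = pl.DataFrame({"chr": ["chr1"] * 4, "start": [100, 101, 500, 502]})

# The gap-to-previous exceeds max_shift exactly where a new cluster begins;
# the cumulative sum of that boolean is the cluster id.
df = df.sort(["chr", "start"]).with_columns(
    start_group=pl.col("start").diff().fill_null(0).gt(max_shift).cum_sum()
)
print(df)  # start_group: 0, 0, 1, 1 -> each group is later collapsed to its median
```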
5 changes: 2 additions & 3 deletions modules/local/fail_on_empty/main.nf
@@ -7,9 +7,8 @@ process FAIL_ON_EMPTY {

    exec:
    if (!bed) {
-        log.error ((params.tool_filter <= 1 ?
-            "No circular RNAs were found by any tool in any sample.\n" :
-            "No circular RNAs were found by at least ${params.tool_filter} tools in any sample.\n") +
+        log.error ((
+            "No circular RNAs were found by at least ${params.min_tools} tools and in at least ${params.min_samples} samples.\n") +
            "Feel free to check the preliminary results in '${params.outdir}'\n" +
            (params.save_intermediates ? "" :
            "You can enable saving intermediate files by setting the parameter 'save_intermediates' to 'true'."))
2 changes: 1 addition & 1 deletion modules/local/majority_vote/main.nf
@@ -19,7 +19,7 @@
    task.ext.when == null || task.ext.when

    script:
-    min_tools = params.mirna_tool_filter
+    min_tools = params.mirna_min_tools
    template 'majority.py'

    stub:
3 changes: 2 additions & 1 deletion modules/local/pygtftk/tabulate/main.nf
@@ -25,7 +25,8 @@ process PYGTFTK_TABULATE {
"""
gtftk tabulate \\
$args \\
-i $gtf > ${outfile}
-i $gtf | \\
grep -v '^tabulate()' > ${outfile}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
22 changes: 0 additions & 22 deletions modules/local/upset/main.nf

This file was deleted.
