Merge pull request #173 from nf-core/microshifts
Add parameter for grouping detected circRNAs that are very close
nictru authored Sep 26, 2024
2 parents edced9e + 44caaf3 commit f96729c
Showing 15 changed files with 232 additions and 247 deletions.
2 changes: 1 addition & 1 deletion conf/full.config
@@ -13,5 +13,5 @@

params {
    tools = 'circexplorer2,ciriquant,find_circ,circrna_finder,mapsplice,dcc,segemehl'
-    tool_filter = 2
+    min_tools = 2
}
78 changes: 17 additions & 61 deletions conf/modules.config
@@ -596,63 +596,6 @@ process {
        ]
    }

-    withName: 'MASK_SCORES' {
-        // Take bed file and replace the score column with a dot
-        ext.args = "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \$4, \".\", \$6 }'"
-        ext.suffix = {"${meta.tool}.masked.bed"}
-        publishDir = [
-            path: { "${params.outdir}/bsj_detection/tools/${meta.tool}/masked" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null }
-        ]
-    }
-
-    withName: 'CONCAT_TOOLS_PER_SAMPLE' {
-        // GNU sort by columns 1,2,3,4,6
-        ext.args = "-k1,1 -k2,2n -k3,3n -k4,4 -k6,6"
-        ext.suffix = {"sorted.bed"}
-        publishDir = [
-            path: { "${params.outdir}/bsj_detection/samples/${meta.id}" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> null } // The same data will be published in COUNT_TOOLS in a better format
-        ]
-    }
-
-    withName: 'COUNT_TOOLS' {
-        // Count the number of tools that support each circRNA
-        ext.summary_col = 5
-        ext.args = "-g 1,2,3,4,6 -o count"
-        ext.suffix = {"tool_counts.bed"}
-        publishDir = [
-            path: { "${params.outdir}/bsj_detection/samples/${meta.id}" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null }
-        ]
-    }
-
-    withName: 'FILTER_MIN_TOOLS' {
-        // Keep only rows with at least the minimum number of tools
-        // Replace the score column with a dot
-        ext.args = { "-v FS='\\t' -v OFS='\\t' '{ if (\$6 >= ${params.tool_filter}) { print \$1, \$2, \$3, \$4, \".\", \$5 } }'" }
-        ext.suffix = "filtered.bed"
-        publishDir = [
-            path: { "${params.outdir}/bsj_detection/samples/${meta.id}" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
-    withName: 'CONCAT_SAMPLES' {
-        // GNU sort by columns 1,2,3,4,6
-        ext.args = "-k1,1 -k2,2n -k3,3n -k4,4 -k6,6 -u"
-        ext.suffix = {"combined.bed"}
-        publishDir = [
-            path: { "${params.outdir}/bsj_detection/combined" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
-        ]
-    }
-
    withName: 'EXTRACT_COUNTS' {
        // Add meta.id as header
        // Keep columns 4,5
@@ -673,17 +616,30 @@ process {
        ]
    }

-    withName: UPSET_SAMPLES {
-        ext.when = { params.tools.split(',').length > 1 }
+    withName: 'BED_ADD_SAMPLE_TOOL' {
+        ext.args = { "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \$4, \$5, \$6, \"${meta.id}\", \"${meta.tool}\" }'" }
+        ext.prefix = { "${meta.id}_${meta.tool}" }
+        ext.suffix = { "meta.bed" }
+        publishDir = [
+            path: { "${params.outdir}/bsj_detection/tools/${meta.tool}/meta" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
+    withName: 'COMBINE_TOOLS_PER_SAMPLE' {
+        ext.suffix = "combined.bed"
+
        publishDir = [
            path: { "${params.outdir}/bsj_detection/samples/${meta.id}" },
            mode: params.publish_dir_mode,
            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
        ]
    }

-    withName: UPSET_ALL {
-        ext.when = { params.tools.split(',').length > 1 }
+    withName: 'COMBINE_SAMPLES' {
+        ext.suffix = "combined.bed"
+
        publishDir = [
            path: { "${params.outdir}/bsj_detection/combined" },
            mode: params.publish_dir_mode,
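For readers who don't speak awk, the `BED_ADD_SAMPLE_TOOL` one-liner above just appends the sample and tool IDs as columns 7 and 8 of each six-column BED record. A rough Python sketch of the same transformation, with made-up file names and `sample1`/`circexplorer2` standing in for `meta.id` and `meta.tool`:

```python
# Rough Python equivalent of the BED_ADD_SAMPLE_TOOL awk call above;
# file names and metadata values are hypothetical.
with open("sample1.circexplorer2.bed") as bed_in, \
     open("sample1_circexplorer2.meta.bed", "w") as bed_out:
    for line in bed_in:
        fields = line.rstrip("\n").split("\t")  # chr, start, end, name, score, strand
        bed_out.write("\t".join(fields + ["sample1", "circexplorer2"]) + "\n")
```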
6 changes: 3 additions & 3 deletions docs/output.md
@@ -140,7 +140,7 @@ The rough workflow for the BSJ detection looks like this:
2. Bring the tool outputs into a common format.
3. Apply a threshold (parameter `bsj_reads`) to the BSJ reads to filter out lowly supported BSJs.
4. Combine all tool-specific BSJ calls per sample into a single file.
-5. Filter out BSJs that are not supported by at least as many tools as specified by `tool_filter`.
+5. Filter out BSJs that are not supported by at least as many tools as specified by `min_tools`.
6. Merge all samples into a single file. This now represents the "circular transcriptome".

### Per tool
@@ -323,7 +323,7 @@ STAR in 2-pass mode is used to identify novel splice junctions in RNA-Seq data.

- `bsj_detection/samples/${sample_id}/`
  - `*.grouped.bed`: Grouped BSJ calls in BED format. Score column represents the number of tools that support the BSJ.
-  - `*.filtered.bed`: Based on `*.grouped.bed`, but filtered for BSJs with at least `tool_filter` supporting tools.
+  - `*.filtered.bed`: Based on `*.grouped.bed`, but filtered for BSJs with at least `min_tools` supporting tools.
  - `*.intersect_gtf.bed`: Intersection of `*.filtered.bed` with the reference GTF file. Intermediate file for annotation.
  - `*.intersect_database.bed`: Intersection of `*.filtered.bed` with the database BED file. Intermediate file for annotation.
  - `*.annotated.bed`: Annotated BSJ calls in BED format, based on `*.filtered.bed`.
@@ -333,7 +333,7 @@ STAR in 2-pass mode is used to identify novel splice junctions in RNA-Seq data.

</details>

-nf-core/circrna produces a sample-specific set of BSJ calls. The BSJ calls are filtered for BSJs with at least `tool_filter` supporting tools. The filtered BSJ calls are then annotated with the reference GTF file and the database BED file. An upset plot is generated to visualise the overlap of BSJ calls across tools.
+nf-core/circrna produces a sample-specific set of BSJ calls. The BSJ calls are filtered for BSJs with at least `min_tools` supporting tools. The filtered BSJ calls are then annotated with the reference GTF file and the database BED file. An upset plot is generated to visualise the overlap of BSJ calls across tools.

### Combined

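The upset plots mentioned above come from the `upsetplot` package (see the new `combine.py` template below). A minimal, self-contained sketch of the same plotting call on invented membership lists:

```python
import matplotlib
matplotlib.use("Agg")  # render without a display, as in a pipeline job
import matplotlib.pyplot as plt
import upsetplot

# Invented toy data: for each BSJ, the list of tools that detected it.
memberships = [
    ["circexplorer2"],
    ["circexplorer2", "ciriquant"],
    ["ciriquant", "segemehl"],
    ["circexplorer2", "ciriquant", "segemehl"],
]
dataset = upsetplot.from_memberships(memberships)
upsetplot.plot(dataset, orientation="horizontal", show_counts=True, subset_size="count")
plt.savefig("tools.upset.png")
```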
2 changes: 1 addition & 1 deletion docs/usage.md
@@ -67,7 +67,7 @@ This part of the pipeline is responsible for the detection of back-splice junctions

The tools to be used can be specified using the `tools` parameter.
Each of the tools also quantifies how many reads support each BSJ. You can specify a cutoff for the minimum number of reads supporting a BSJ using the `bsj_reads` parameter.
-Additionally, the parameter `tool_filter` can be used to specify by how many tools a BSJ has to be detected to be considered a valid hit.
+Additionally, the parameter `min_tools` can be used to specify by how many tools a BSJ has to be detected to be considered a valid hit.

For instructions on how to interpret the output of this section, please check out the [output documentation](https://nf-co.re/circrna/output#bsj-detection).

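As a small illustration of what `min_tools` does downstream (toy data; the actual filter lives in the `combine.py` template below), keeping only BSJs called by at least two tools might look like:

```python
import polars as pl

min_tools = 2  # e.g. min_tools = 2, as set in conf/full.config

# Hypothetical per-BSJ tool counts; in the pipeline these come from
# grouping the per-tool BED files.
df = pl.DataFrame({
    "name":    ["chr1:100-500", "chr1:900-1400", "chr2:50-300"],
    "n_tools": [3, 1, 2],
})
print(df.filter(pl.col("n_tools") >= min_tools))  # chr1:900-1400 is dropped
```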
6 changes: 6 additions & 0 deletions modules/local/combine_beds/environment.yml
@@ -0,0 +1,6 @@
channels:
  - conda-forge
  - bioconda
dependencies:
  - conda-forge::polars=1.8.2
  - conda-forge::upsetplot=0.9.0
26 changes: 26 additions & 0 deletions modules/local/combine_beds/main.nf
@@ -0,0 +1,26 @@
process COMBINE_BEDS {
    tag "$meta.id"
    label "process_low"

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'oras://community.wave.seqera.io/library/polars_upsetplot:0fc26c37f7821606' :
        'community.wave.seqera.io/library/polars_upsetplot:3382b69d3c1f6bf1' }"

    input:
    tuple val(meta), path(beds)
    val(max_shift)
    val(min_tools)
    val(min_samples)

    output:
    tuple val(meta), path("${prefix}.${suffix}"), emit: combined
    path "*.png"        , emit: plots  , optional: true
    path "*.json"       , emit: multiqc, optional: true
    path "versions.yml" , emit: versions

    script:
    prefix = task.ext.prefix ?: "${meta.id}"
    suffix = task.ext.suffix ?: "bed"
    template "combine.py"
}
105 changes: 105 additions & 0 deletions modules/local/combine_beds/templates/combine.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python

import platform
import base64
import json

import polars as pl
import upsetplot
import matplotlib
import matplotlib.pyplot as plt

def format_yaml_like(data: dict, indent: int = 0) -> str:
    """Formats a dictionary to a YAML-like string.
    Args:
        data (dict): The dictionary to format.
        indent (int): The current indentation level.
    Returns:
        str: A string formatted as YAML.
    """
    yaml_str = ""
    for key, value in data.items():
        spaces = " " * indent
        if isinstance(value, dict):
            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
        else:
            yaml_str += f"{spaces}{key}: {value}\\n"
    return yaml_str

max_shift = int("${max_shift}")
min_tools = int("${min_tools}")
min_samples = int("${min_samples}")
meta_id = "${meta.id}"
prefix = "${prefix}"

df = pl.scan_csv("*.bed",
                 separator="\\t",
                 has_header=False,
                 new_columns=["chr", "start", "end", "name", "score", "strand", "sample", "tool"])

for col in ["end", "start"]:
df = df.sort(["chr", col])
df = df.with_columns(**{f"{col}_group": pl.col(col).diff().fill_null(0).gt(max_shift).cum_sum()})

df = (df.group_by(["chr", "start_group", "end_group"])
        .agg(pl.col("start").median().round().cast(int),
             pl.col("end").median().round().cast(int),
             pl.col("sample").unique().alias("samples"),
             pl.col("tool").unique().alias("tools"),
             pl.col("sample").n_unique().alias("n_samples"),
             pl.col("tool").n_unique().alias("n_tools"))
        .with_columns(name=pl.col("chr").cast(str) + ":" + pl.col("start").cast(str) + "-" + pl.col("end").cast(str),
                      score=pl.lit("."),
                      strand=pl.lit(".")))

for col in ["samples", "tools"]:
series = pl.Series(df.select(col).collect())
if series.explode().n_unique() == 1:
continue
memberships = series.to_list()
dataset = upsetplot.from_memberships(memberships)
upsetplot.plot(dataset,
orientation='horizontal',
show_counts=True,
subset_size="count")
plot_file = f"{prefix}_{col}.upset.png"
plt.savefig(plot_file)

image_string = base64.b64encode(open(plot_file, "rb").read()).decode("utf-8")
image_html = f'<div class="mqc-custom-content-image"><img src="data:image/png;base64,{image_string}" /></div>'

multiqc = {
'id': f"{meta_id}_upset_{col}",
'parent_id': "upset_plots",
'parent_name': 'UpSet Plots',
'parent_description': 'UpSet plots showing the overlap between tools for each sample',
'section_name': f'UpSet {col}: {meta_id} ',
'description': f'UpSet plot showing the overlap between {col} for {meta_id}',
'plot_type': 'image',
'data': image_html
}

with open(f"{prefix}_{col}.upset_mqc.json", "w") as f:
f.write(json.dumps(multiqc, indent=4))


df = (df.filter((pl.col("n_tools") >= min_tools) & (pl.col("n_samples") >= min_samples))
.select(["chr", "start", "end", "name", "score", "strand"]))

df.collect().write_csv("${prefix}.${suffix}", separator="\\t", include_header=False)

# Versions

versions = {
    "${task.process}": {
        "python": platform.python_version(),
        "polars": pl.__version__,
        "upsetplot": upsetplot.__version__,
        "matplotlib": matplotlib.__version__
    }
}

with open("versions.yml", "w") as f:
    f.write(format_yaml_like(versions))
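The heart of this commit is the `diff().fill_null(0).gt(max_shift).cum_sum()` idiom above: after sorting, a new group starts wherever the gap to the previous coordinate exceeds `max_shift`, so BSJ calls whose coordinates differ by only a few bases collapse into one group. A minimal demonstration on invented coordinates (not pipeline code):

```python
import polars as pl

max_shift = 2  # maximum coordinate shift for two BSJs to be grouped

# Invented starts: 100/101 are microshifted calls of one junction, 500/502 of another.
df = pl.DataFrame({"chr": ["chr1"] * 4, "start": [100, 101, 500, 502]})

# The gap-to-previous exceeds max_shift exactly where a new cluster begins;
# the cumulative sum of that boolean is the cluster id.
df = df.sort(["chr", "start"]).with_columns(
    start_group=pl.col("start").diff().fill_null(0).gt(max_shift).cum_sum()
)
print(df)  # start_group: 0, 0, 1, 1 -> each group is later collapsed to its median
```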
5 changes: 2 additions & 3 deletions modules/local/fail_on_empty/main.nf
@@ -7,9 +7,8 @@ process FAIL_ON_EMPTY {

    exec:
    if (!bed) {
-        log.error ((params.tool_filter <= 1 ?
-            "No circular RNAs were found by any tool in any sample.\n" :
-            "No circular RNAs were found by at least ${params.tool_filter} tools in any sample.\n") +
+        log.error ((
+            "No circular RNAs were found by at least ${params.min_tools} tools and in at least ${params.min_samples} samples.\n") +
            "Feel free to check the preliminary results in '${params.outdir}'\n" +
            (params.save_intermediates ? "" :
            "You can enable saving intermediate files by setting the parameter 'save_intermediates' to 'true'."))
2 changes: 1 addition & 1 deletion modules/local/majority_vote/main.nf
@@ -19,7 +19,7 @@
    task.ext.when == null || task.ext.when

    script:
-    min_tools = params.mirna_tool_filter
+    min_tools = params.mirna_min_tools
    template 'majority.py'

    stub:
3 changes: 2 additions & 1 deletion modules/local/pygtftk/tabulate/main.nf
@@ -25,7 +25,8 @@ process PYGTFTK_TABULATE {
"""
gtftk tabulate \\
$args \\
-i $gtf > ${outfile}
-i $gtf | \\
grep -v '^tabulate()' > ${outfile}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
22 changes: 0 additions & 22 deletions modules/local/upset/main.nf

This file was deleted.
