Skip to content

Commit

Permalink
Merge pull request #6348 from paulzierep/decontam
Browse files Browse the repository at this point in the history
Add Decontam
  • Loading branch information
bgruening authored Oct 15, 2024
2 parents 78e6d7e + cf9b946 commit 9e0ce6d
Show file tree
Hide file tree
Showing 12 changed files with 988 additions and 0 deletions.
13 changes: 13 additions & 0 deletions tools/decontam/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: decontam
owner: iuc
description: Removes decontamination features (ASVs/OTUs) using control samples
long_description: |
The decontam package provides simple statistical methods to identify
and visualize contaminating DNA features, allowing them to be removed
and a more accurate picture of sampled communities to be constructed
from marker-gene and metagenomics data.
categories:
- Metagenomics
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontam
homepage_url: https://github.com/benjjneb/decontam
type: unrestricted
173 changes: 173 additions & 0 deletions tools/decontam/decontam.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
<tool id="decontam" name="Decontam" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<macros>
<import>macros.xml</import>
</macros>
<expand macro="bio_tools"/>
<expand macro="requirements"/>
<command detect_errors="exit_code"><![CDATA[
Rscript '$rscript'
]]></command>
<configfiles>
<configfile name="rscript"><![CDATA[
library(tidyverse)
library(phyloseq)
library(ggplot2)
library(decontam)
#if $input_type.select_input == 'phyloseq':
ps <- readRDS("$input_type.phyloseq_object")
sample_data(ps)\$control <- as.logical(sample_data(ps)[["$input_type.control_metadata"]])
#else
## get OTU table (first column is the OTU/ASV ID)
otu <- read_tsv("$input_type.otu")
otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1]) #use first column as rownames
OTU <- otu_table(otu2, taxa_are_rows = FALSE)
## get metadata table must have matching OTU/ASV ID in first column
meta <- read_tsv("$input_type.metadata")
meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1]) #use first column as rownames
control_column = as.integer("$input_type.control") - 1 ##remove one index since the dataframe uses the first column as index
## convert 0/1 to bool for the control column and store in control column
meta2\$control <- as.logical(meta2[[control_column]])
sampledata <- sample_data(meta2)
ps <- phyloseq(OTU, FALSE, sampledata)
#end if
## plot library_size_vs_control
df <- as.data.frame(sample_data(ps)) # Put sample_data into a ggplot-friendly data.frame
df\$LibrarySize <- sample_sums(ps)
df <- df[order(df\$LibrarySize),]
df\$Index <- seq(nrow(df))
ggplot(data=df, aes(x=Index, y=LibrarySize, color=control)) + geom_point()
ggsave("$library_size_vs_control", device = "png", width = 10, height = 8, units = "cm")
## plot prevalence
contamdf.prev <- isContaminant(ps, method="prevalence", neg="control", threshold=$threshold)
table(contamdf.prev\$contaminant)
ps.pa <- transform_sample_counts(ps, function(abund) 1*(abund>0))
ps.pa.neg <- prune_samples(sample_data(ps.pa)\$control == TRUE, ps.pa)
ps.pa.pos <- prune_samples(sample_data(ps.pa)\$control == FALSE, ps.pa)
## Make data.frame of prevalence in positive and negative samples
df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg),
contaminant=contamdf.prev\$contaminant)
ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_point() +
xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)")
ggsave("$prevalence", device = "png", width = 10, height = 8, units = "cm")
## remove contamination features from original data
#if $input_type.select_input == 'phyloseq':
id_name <- "SampleID"
#else
id_name <- colnames(otu)[1] ## we use the same name for the ID column as the OTU input
#end if
ps.noncontam <- prune_taxa(!contamdf.prev\$contaminant, ps)
otu_table(ps.noncontam) %>%
as.data.frame() %>%
rownames_to_column(id_name) -> otu
write.table(otu,
file="$decontam_otu",
sep = "\t",
row.names=FALSE,
quote = FALSE)
saveRDS(ps.noncontam, "$decontam_phyloseq")
]]></configfile>
</configfiles>
<inputs>
<conditional name="input_type">
<param name="select_input" type="select" label="Phyloseq or Feature table input" help="This tool can work with phyloseq objects or feature table inputs.">
<option value="phyloseq">Phyloseq</option>
<option value="feature_table">Feature table</option>
</param>
<when value="phyloseq">
<param name="phyloseq_object" type="data" format="phyloseq" label="Phyloseq object"/>
<param name="control_metadata" type="text" label="Control column" help="Column in the phyloseq metadata specifying weather a sample is a negative control (0 for normal samples / 1 for control)"/>
</when>
<when value="feature_table">
<param name="otu" type="data" format="tabular" label="Feature table" help="OTU/ASV or other feature table. The first column must have corresponding IDs to the metadata table."/>
<param name="metadata" type="data" format="tabular" label="Metadata" help="Metadata that contains a column specifying weather a samples is a negativ control (0 for normal samples / 1 for control). The first column must have corresponding IDs to the feature table."/>
<param name="control" type="data_column" data_ref="metadata" use_header_names="true" multiple="false" optional="false" label="Control column" help="Column specifying weather a sample is a negative control (0 for normal samples / 1 for control)."/>
</when>
</conditional>
<param name="threshold" type="float" label="Threshold to detect a contaminant" value="0.1" min="0" max="1" help="Probability of the feature to be a decontaminant in the statistical test being performed." />
</inputs>
<outputs>
<data name="library_size_vs_control" format="png" label="${tool.name} on ${on_string}: Library Size vs Control Plot"/>
<data name="prevalence" format="png" label="${tool.name} on ${on_string}: Prevalence Plot"/>
<data name="decontam_otu" format="tabular" label="${tool.name} on ${on_string}: Removed Contaminants - Feature Table"/>
<data name="decontam_phyloseq" format="phyloseq" label="${tool.name} on ${on_string}: Removed Contaminants - Phyloseq Object"/>
</outputs>
<tests>
<test>
<conditional name="input_type">
<param name="select_input" value="phyloseq"/>
<param name="phyloseq_object" value="phyloseq_input.rds"/>
<param name="control_metadata" value="control"/>
</conditional>
<param name="threshold" value="0.5"/>
<output name="decontam_phyloseq" file="phyloseq_output.rds" ftype="phyloseq"/>
<output name="decontam_otu" file="otu_output.tsv" ftype="tabular"/>
<output name="prevalence" file="Prevalence_Plot.png" ftype="png"/>
<output name="library_size_vs_control" file="Library_Size_vs_Control_Plot.png" ftype="png"/>
</test>
<test>
<conditional name="input_type">
<param name="select_input" value="feature_table"/>
<param name="otu" value="otu_input.tsv"/>
<param name="metadata" value="metadata_input.tsv"/>
<!-- using the index of the column -->
<param name="control" value="8"/>
</conditional>
<param name="threshold" value="0.5"/>
<output name="decontam_phyloseq" file="phyloseq_output2.rds" ftype="phyloseq"/>
<output name="decontam_otu" file="otu_output.tsv" ftype="tabular"/>
<output name="prevalence" file="Prevalence_Plot.png" ftype="png"/>
<output name="library_size_vs_control" file="Library_Size_vs_Control_Plot.png" ftype="png"/>
</test>
</tests>
<help><![CDATA[
Simple Statistical Identification of Contaminating Sequence Features in Marker-Gene or Metagenomics Data
========================================================================================================
This tool identifies contaminating sequence features in marker-gene or
metagenomics datasets. It can be applied to any type of feature derived from
environmental sequencing data (e.g., ASVs, OTUs, taxonomic groups, MAGs,
etc.). The method requires either DNA quantitation data or sequenced negative
control samples.
.. note::
Currently, only the negative control sample method is implemented in this
wrapper.
**Output**
- If a phyloseq object is provided as input, the output will be a phyloseq
object pruned to include only non-contaminant features.
- If only the feature table or metadata is provided, the output will be a
pruned phyloseq object containing only non-contaminant features, without
the TAX table. The output feature table will also be pruned to include only
non-contaminant features.
**Threshold**
The default threshold for identifying a contaminant is a probability of 0.1
in the statistical test being performed.
In the prevalence test, a special threshold value of 0.5 is notable: this
will identify as contaminants any sequences that are more prevalent in
negative controls than in positive samples.
]]>
</help>
<expand macro="citations"/>
</tool>
23 changes: 23 additions & 0 deletions tools/decontam/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<macros>
<token name="@TOOL_VERSION@">1.22</token>
<token name="@VERSION_SUFFIX@">0</token>
<token name="@PROFILE@">22.01</token>
<xml name="bio_tools">
<xrefs>
<xref type="bio.tools">decontam</xref>
<xref type="bioconductor">decontam</xref>
</xrefs>
</xml>
<xml name="requirements">
<requirements>
<requirement type="package" version="@TOOL_VERSION@">bioconductor-decontam</requirement>
<requirement type="package" version="1.46.0" >bioconductor-phyloseq</requirement>
<requirement type="package" version="2.0.0">r-tidyverse</requirement>
</requirements>
</xml>
<xml name="citations">
<citations>
<citation type="doi">10.1186/s40168-018-0605-2</citation>
</citations>
</xml>
</macros>
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tools/decontam/test-data/Prevalence_Plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
176 changes: 176 additions & 0 deletions tools/decontam/test-data/decontam_Rscript.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
---
title: "decontam_docs"
output: html_document
date: "2024-09-10"
---

# This R markdown generates the test data for the wrapper and can be used to test the functions used in the configfile

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Install test env and run studio in this env

```{bash install}
mamba create --name decontam bioconductor-decontam bioconductor-phyloseq r-tidyverse
mamba activate decontam
rstudio
```

### Check correct R home

```{r home}
R.home()
```

## Get test data

Create test data for wrapper, it should be able to use a matrix and vector as well as phyloseq object as input.

```{r store test data}
R.home()
library(phyloseq)
packageVersion("phyloseq")
library(ggplot2)
packageVersion("ggplot2")
library(decontam)
packageVersion("decontam")
ps <- readRDS(system.file("extdata", "MUClite.rds", package = "decontam"))
# Sample from a physeq object with a sampling function.
# ps: physeq object to be sampled
# fun: function to use for sampling (default `sample`)
# ...: parameters to be passed to fun,
# see `help(sample)` for default parameters
sample_ps <- function(ps, fun = sample, ...) {
ids <- sample_names(ps)
sampled_ids <- fun(ids, ...)
ps <- prune_samples(sampled_ids, ps)
return(ps)
}
# the initial object is to big for the test case so we subsample
ps <- sample_ps(ps, size = 200)
## ps
# get otu table
otu <- as(otu_table(ps), "matrix")
head(otu[, 1:10], 10)
# add control column to sample data
sample_data(ps)$control <- sample_data(ps)$Sample_or_Control == "Control Sample"
# store as 0 and 1
sample_data(ps)$control <- as.integer(sample_data(ps)$control)
head(sample_data(ps), 1000)
metadata <- as(sample_data(ps), "matrix")
head(metadata, 1000)
# store test data
# stores the row names as column,
# see https://stackoverflow.com/questions/2478352
# /write-table-writes-unwanted-leading-empty-column-to-header-when-has-rownames
write.table(data.frame("SampleID" = rownames(otu), otu),
file = file.path(getwd(), "otu_input.tsv"),
sep = "\t",
row.names = FALSE,
quote = FALSE
)
write.table(data.frame("SampleID" = rownames(metadata), metadata),
file = file.path(getwd(), "metadata_input.tsv"),
sep = "\t",
row.names = FALSE,
quote = FALSE
)
saveRDS(ps, file.path(getwd(), "phyloseq_input.rds"))
```

## Load test data

```{r load test data}
library(tidyverse)
# get OTU table (first column is the OTU/ASV ID)
otu <- read_tsv(file.path(getwd(), "otu_input.tsv"))
# use first column as colname
otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1])
otu <- otu_table(otu2, taxa_are_rows = FALSE)
# get metadata table must have matching OTU/ASV ID in first column
meta <- read_tsv(file.path(getwd(), "metadata_input.tsv"))
# use first column as colname
meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1])
control_column <- "control"
# convert 0/1 to bool for the control column and store in control column
meta2$control <- as.logical(meta2[[control_column]])
sampledata <- sample_data(meta2)
# create dummy tax table (actually not needed,
# but nice to learn how to load phyloseq objects)
taxmat <- as.data.frame(matrix(sample(letters, 10, replace = TRUE),
nrow = ncol(otu2), ncol = 7
))
rownames(taxmat) <- colnames(otu2)
tax <- tax_table(as.matrix(taxmat))
ps <- phyloseq(otu, tax, sampledata)
```

# plot 1

```{r plot library size vs control}
# Put sample_data into a ggplot-friendly data.frame
df <- as.data.frame(sample_data(ps))
df$LibrarySize <- sample_sums(ps)
df <- df[order(df$LibrarySize), ]
df$Index <- seq_len(nrow(df))
ggplot(data = df, aes(x = Index, y = LibrarySize, color = control)) +
geom_point()
```

# plot 2

```{r plot prevalence}
contamdf_prev <- isContaminant(ps,
method = "prevalence",
neg = "control",
threshold = 0.5
)
table(contamdf_prev$contaminant)
ps_pa <- transform_sample_counts(ps, function(abund) 1 * (abund > 0))
ps_pa_neg <- prune_samples(sample_data(ps_pa)$control == TRUE, ps_pa)
ps_pa_pos <- prune_samples(sample_data(ps_pa)$control == FALSE, ps_pa)
# Make data_frame of prevalence in positive and negative samples
df_pa <- data.frame(
pa_pos = taxa_sums(ps_pa_pos), pa_neg = taxa_sums(ps_pa_neg),
contaminant = contamdf_prev$contaminant
)
ggplot(data = df_pa, aes(x = pa_neg, y = pa_pos, color = contaminant)) +
geom_point() +
xlab("Prevalence (Negative Controls)") +
ylab("Prevalence (True Samples)")
```

# generate output

```{r remove contams}
id_name <- colnames(otu)[1]
ps_noncontam <- prune_taxa(!contamdf_prev$contaminant, ps)
otu_table(ps_noncontam) %>%
as.data.frame() %>%
rownames_to_column(id_name) <- otu
write.table(otu,
file = file.path(getwd(), "otu_output.tsv"),
sep = "\t",
row.names = FALSE,
)
```
Loading

0 comments on commit 9e0ce6d

Please sign in to comment.