-
Notifications
You must be signed in to change notification settings - Fork 441
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6348 from paulzierep/decontam
Add Decontam
- Loading branch information
Showing
12 changed files
with
988 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
name: decontam | ||
owner: iuc | ||
description: Removes decontamination features (ASVs/OTUs) using control samples | ||
long_description: | | ||
The decontam package provides simple statistical methods to identify | ||
and visualize contaminating DNA features, allowing them to be removed | ||
and a more accurate picture of sampled communities to be constructed | ||
from marker-gene and metagenomics data. | ||
categories: | ||
- Metagenomics | ||
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/decontam | ||
homepage_url: https://github.com/benjjneb/decontam | ||
type: unrestricted |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
<tool id="decontam" name="Decontam" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> | ||
<macros> | ||
<import>macros.xml</import> | ||
</macros> | ||
<expand macro="bio_tools"/> | ||
<expand macro="requirements"/> | ||
<command detect_errors="exit_code"><![CDATA[ | ||
Rscript '$rscript' | ||
]]></command> | ||
<configfiles> | ||
<configfile name="rscript"><![CDATA[ | ||
library(tidyverse) | ||
library(phyloseq) | ||
library(ggplot2) | ||
library(decontam) | ||
#if $input_type.select_input == 'phyloseq': | ||
ps <- readRDS("$input_type.phyloseq_object") | ||
sample_data(ps)\$control <- as.logical(sample_data(ps)[["$input_type.control_metadata"]]) | ||
#else | ||
## get OTU table (first column is the OTU/ASV ID) | ||
otu <- read_tsv("$input_type.otu") | ||
otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1]) #use first column as rownames | ||
OTU <- otu_table(otu2, taxa_are_rows = FALSE) | ||
## get metadata table must have matching OTU/ASV ID in first column | ||
meta <- read_tsv("$input_type.metadata") | ||
meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1]) #use first column as rownames | ||
control_column = as.integer("$input_type.control") - 1 ##remove one index since the dataframe uses the first column as index | ||
## convert 0/1 to bool for the control column and store in control column | ||
meta2\$control <- as.logical(meta2[[control_column]]) | ||
sampledata <- sample_data(meta2) | ||
ps <- phyloseq(OTU, FALSE, sampledata) | ||
#end if | ||
## plot library_size_vs_control | ||
df <- as.data.frame(sample_data(ps)) # Put sample_data into a ggplot-friendly data.frame | ||
df\$LibrarySize <- sample_sums(ps) | ||
df <- df[order(df\$LibrarySize),] | ||
df\$Index <- seq(nrow(df)) | ||
ggplot(data=df, aes(x=Index, y=LibrarySize, color=control)) + geom_point() | ||
ggsave("$library_size_vs_control", device = "png", width = 10, height = 8, units = "cm") | ||
## plot prevalence | ||
contamdf.prev <- isContaminant(ps, method="prevalence", neg="control", threshold=$threshold) | ||
table(contamdf.prev\$contaminant) | ||
ps.pa <- transform_sample_counts(ps, function(abund) 1*(abund>0)) | ||
ps.pa.neg <- prune_samples(sample_data(ps.pa)\$control == TRUE, ps.pa) | ||
ps.pa.pos <- prune_samples(sample_data(ps.pa)\$control == FALSE, ps.pa) | ||
## Make data.frame of prevalence in positive and negative samples | ||
df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg), | ||
contaminant=contamdf.prev\$contaminant) | ||
ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_point() + | ||
xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)") | ||
ggsave("$prevalence", device = "png", width = 10, height = 8, units = "cm") | ||
## remove contamination features from original data | ||
#if $input_type.select_input == 'phyloseq': | ||
id_name <- "SampleID" | ||
#else | ||
id_name <- colnames(otu)[1] ## we use the same name for the ID column as the OTU input | ||
#end if | ||
ps.noncontam <- prune_taxa(!contamdf.prev\$contaminant, ps) | ||
otu_table(ps.noncontam) %>% | ||
as.data.frame() %>% | ||
rownames_to_column(id_name) -> otu | ||
write.table(otu, | ||
file="$decontam_otu", | ||
sep = "\t", | ||
row.names=FALSE, | ||
quote = FALSE) | ||
saveRDS(ps.noncontam, "$decontam_phyloseq") | ||
]]></configfile> | ||
</configfiles> | ||
<inputs> | ||
<conditional name="input_type"> | ||
<param name="select_input" type="select" label="Phyloseq or Feature table input" help="This tool can work with phyloseq objects or feature table inputs."> | ||
<option value="phyloseq">Phyloseq</option> | ||
<option value="feature_table">Feature table</option> | ||
</param> | ||
<when value="phyloseq"> | ||
<param name="phyloseq_object" type="data" format="phyloseq" label="Phyloseq object"/> | ||
<param name="control_metadata" type="text" label="Control column" help="Column in the phyloseq metadata specifying weather a sample is a negative control (0 for normal samples / 1 for control)"/> | ||
</when> | ||
<when value="feature_table"> | ||
<param name="otu" type="data" format="tabular" label="Feature table" help="OTU/ASV or other feature table. The first column must have corresponding IDs to the metadata table."/> | ||
<param name="metadata" type="data" format="tabular" label="Metadata" help="Metadata that contains a column specifying weather a samples is a negativ control (0 for normal samples / 1 for control). The first column must have corresponding IDs to the feature table."/> | ||
<param name="control" type="data_column" data_ref="metadata" use_header_names="true" multiple="false" optional="false" label="Control column" help="Column specifying weather a sample is a negative control (0 for normal samples / 1 for control)."/> | ||
</when> | ||
</conditional> | ||
<param name="threshold" type="float" label="Threshold to detect a contaminant" value="0.1" min="0" max="1" help="Probability of the feature to be a decontaminant in the statistical test being performed." /> | ||
</inputs> | ||
<outputs> | ||
<data name="library_size_vs_control" format="png" label="${tool.name} on ${on_string}: Library Size vs Control Plot"/> | ||
<data name="prevalence" format="png" label="${tool.name} on ${on_string}: Prevalence Plot"/> | ||
<data name="decontam_otu" format="tabular" label="${tool.name} on ${on_string}: Removed Contaminants - Feature Table"/> | ||
<data name="decontam_phyloseq" format="phyloseq" label="${tool.name} on ${on_string}: Removed Contaminants - Phyloseq Object"/> | ||
</outputs> | ||
<tests> | ||
<test> | ||
<conditional name="input_type"> | ||
<param name="select_input" value="phyloseq"/> | ||
<param name="phyloseq_object" value="phyloseq_input.rds"/> | ||
<param name="control_metadata" value="control"/> | ||
</conditional> | ||
<param name="threshold" value="0.5"/> | ||
<output name="decontam_phyloseq" file="phyloseq_output.rds" ftype="phyloseq"/> | ||
<output name="decontam_otu" file="otu_output.tsv" ftype="tabular"/> | ||
<output name="prevalence" file="Prevalence_Plot.png" ftype="png"/> | ||
<output name="library_size_vs_control" file="Library_Size_vs_Control_Plot.png" ftype="png"/> | ||
</test> | ||
<test> | ||
<conditional name="input_type"> | ||
<param name="select_input" value="feature_table"/> | ||
<param name="otu" value="otu_input.tsv"/> | ||
<param name="metadata" value="metadata_input.tsv"/> | ||
<!-- using the index of the column --> | ||
<param name="control" value="8"/> | ||
</conditional> | ||
<param name="threshold" value="0.5"/> | ||
<output name="decontam_phyloseq" file="phyloseq_output2.rds" ftype="phyloseq"/> | ||
<output name="decontam_otu" file="otu_output.tsv" ftype="tabular"/> | ||
<output name="prevalence" file="Prevalence_Plot.png" ftype="png"/> | ||
<output name="library_size_vs_control" file="Library_Size_vs_Control_Plot.png" ftype="png"/> | ||
</test> | ||
</tests> | ||
<help><![CDATA[ | ||
Simple Statistical Identification of Contaminating Sequence Features in Marker-Gene or Metagenomics Data | ||
======================================================================================================== | ||
This tool identifies contaminating sequence features in marker-gene or | ||
metagenomics datasets. It can be applied to any type of feature derived from | ||
environmental sequencing data (e.g., ASVs, OTUs, taxonomic groups, MAGs, | ||
etc.). The method requires either DNA quantitation data or sequenced negative | ||
control samples. | ||
.. note:: | ||
Currently, only the negative control sample method is implemented in this | ||
wrapper. | ||
**Output** | ||
- If a phyloseq object is provided as input, the output will be a phyloseq | ||
object pruned to include only non-contaminant features. | ||
- If only the feature table or metadata is provided, the output will be a | ||
pruned phyloseq object containing only non-contaminant features, without | ||
the TAX table. The output feature table will also be pruned to include only | ||
non-contaminant features. | ||
**Threshold** | ||
The default threshold for identifying a contaminant is a probability of 0.1 | ||
in the statistical test being performed. | ||
In the prevalence test, a special threshold value of 0.5 is notable: this | ||
will identify as contaminants any sequences that are more prevalent in | ||
negative controls than in positive samples. | ||
]]> | ||
</help> | ||
<expand macro="citations"/> | ||
</tool> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
<macros> | ||
<token name="@TOOL_VERSION@">1.22</token> | ||
<token name="@VERSION_SUFFIX@">0</token> | ||
<token name="@PROFILE@">22.01</token> | ||
<xml name="bio_tools"> | ||
<xrefs> | ||
<xref type="bio.tools">decontam</xref> | ||
<xref type="bioconductor">decontam</xref> | ||
</xrefs> | ||
</xml> | ||
<xml name="requirements"> | ||
<requirements> | ||
<requirement type="package" version="@TOOL_VERSION@">bioconductor-decontam</requirement> | ||
<requirement type="package" version="1.46.0" >bioconductor-phyloseq</requirement> | ||
<requirement type="package" version="2.0.0">r-tidyverse</requirement> | ||
</requirements> | ||
</xml> | ||
<xml name="citations"> | ||
<citations> | ||
<citation type="doi">10.1186/s40168-018-0605-2</citation> | ||
</citations> | ||
</xml> | ||
</macros> |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
--- | ||
title: "decontam_docs" | ||
output: html_document | ||
date: "2024-09-10" | ||
--- | ||
|
||
# This R markdown generates the test data for the wrapper and can be used to test the functions used in the configfile | ||
|
||
```{r setup, include=FALSE} | ||
knitr::opts_chunk$set(echo = TRUE) | ||
``` | ||
|
||
## Install test env and run studio in this env | ||
|
||
```{bash install} | ||
mamba create --name decontam bioconductor-decontam bioconductor-phyloseq r-tidyverse | ||
mamba activate decontam | ||
rstudio | ||
``` | ||
|
||
### Check correct R home | ||
|
||
```{r home} | ||
R.home() | ||
``` | ||
|
||
## Get test data | ||
|
||
Create test data for wrapper, it should be able to use a matrix and vector as well as phyloseq object as input. | ||
|
||
```{r store test data} | ||
R.home() | ||
library(phyloseq) | ||
packageVersion("phyloseq") | ||
library(ggplot2) | ||
packageVersion("ggplot2") | ||
library(decontam) | ||
packageVersion("decontam") | ||
ps <- readRDS(system.file("extdata", "MUClite.rds", package = "decontam")) | ||
# Sample from a physeq object with a sampling function. | ||
# ps: physeq object to be sampled | ||
# fun: function to use for sampling (default `sample`) | ||
# ...: parameters to be passed to fun, | ||
# see `help(sample)` for default parameters | ||
sample_ps <- function(ps, fun = sample, ...) { | ||
ids <- sample_names(ps) | ||
sampled_ids <- fun(ids, ...) | ||
ps <- prune_samples(sampled_ids, ps) | ||
return(ps) | ||
} | ||
# the initial object is to big for the test case so we subsample | ||
ps <- sample_ps(ps, size = 200) | ||
## ps | ||
# get otu table | ||
otu <- as(otu_table(ps), "matrix") | ||
head(otu[, 1:10], 10) | ||
# add control column to sample data | ||
sample_data(ps)$control <- sample_data(ps)$Sample_or_Control == "Control Sample" | ||
# store as 0 and 1 | ||
sample_data(ps)$control <- as.integer(sample_data(ps)$control) | ||
head(sample_data(ps), 1000) | ||
metadata <- as(sample_data(ps), "matrix") | ||
head(metadata, 1000) | ||
# store test data | ||
# stores the row names as column, | ||
# see https://stackoverflow.com/questions/2478352 | ||
# /write-table-writes-unwanted-leading-empty-column-to-header-when-has-rownames | ||
write.table(data.frame("SampleID" = rownames(otu), otu), | ||
file = file.path(getwd(), "otu_input.tsv"), | ||
sep = "\t", | ||
row.names = FALSE, | ||
quote = FALSE | ||
) | ||
write.table(data.frame("SampleID" = rownames(metadata), metadata), | ||
file = file.path(getwd(), "metadata_input.tsv"), | ||
sep = "\t", | ||
row.names = FALSE, | ||
quote = FALSE | ||
) | ||
saveRDS(ps, file.path(getwd(), "phyloseq_input.rds")) | ||
``` | ||
|
||
## Load test data | ||
|
||
```{r load test data} | ||
library(tidyverse) | ||
# get OTU table (first column is the OTU/ASV ID) | ||
otu <- read_tsv(file.path(getwd(), "otu_input.tsv")) | ||
# use first column as colname | ||
otu2 <- otu %>% tibble::column_to_rownames(colnames(otu)[1]) | ||
otu <- otu_table(otu2, taxa_are_rows = FALSE) | ||
# get metadata table must have matching OTU/ASV ID in first column | ||
meta <- read_tsv(file.path(getwd(), "metadata_input.tsv")) | ||
# use first column as colname | ||
meta2 <- meta %>% tibble::column_to_rownames(colnames(meta)[1]) | ||
control_column <- "control" | ||
# convert 0/1 to bool for the control column and store in control column | ||
meta2$control <- as.logical(meta2[[control_column]]) | ||
sampledata <- sample_data(meta2) | ||
# create dummy tax table (actually not needed, | ||
# but nice to learn how to load phyloseq objects) | ||
taxmat <- as.data.frame(matrix(sample(letters, 10, replace = TRUE), | ||
nrow = ncol(otu2), ncol = 7 | ||
)) | ||
rownames(taxmat) <- colnames(otu2) | ||
tax <- tax_table(as.matrix(taxmat)) | ||
ps <- phyloseq(otu, tax, sampledata) | ||
``` | ||
|
||
# plot 1 | ||
|
||
```{r plot library size vs control} | ||
# Put sample_data into a ggplot-friendly data.frame | ||
df <- as.data.frame(sample_data(ps)) | ||
df$LibrarySize <- sample_sums(ps) | ||
df <- df[order(df$LibrarySize), ] | ||
df$Index <- seq_len(nrow(df)) | ||
ggplot(data = df, aes(x = Index, y = LibrarySize, color = control)) + | ||
geom_point() | ||
``` | ||
|
||
# plot 2 | ||
|
||
```{r plot prevalence} | ||
contamdf_prev <- isContaminant(ps, | ||
method = "prevalence", | ||
neg = "control", | ||
threshold = 0.5 | ||
) | ||
table(contamdf_prev$contaminant) | ||
ps_pa <- transform_sample_counts(ps, function(abund) 1 * (abund > 0)) | ||
ps_pa_neg <- prune_samples(sample_data(ps_pa)$control == TRUE, ps_pa) | ||
ps_pa_pos <- prune_samples(sample_data(ps_pa)$control == FALSE, ps_pa) | ||
# Make data_frame of prevalence in positive and negative samples | ||
df_pa <- data.frame( | ||
pa_pos = taxa_sums(ps_pa_pos), pa_neg = taxa_sums(ps_pa_neg), | ||
contaminant = contamdf_prev$contaminant | ||
) | ||
ggplot(data = df_pa, aes(x = pa_neg, y = pa_pos, color = contaminant)) + | ||
geom_point() + | ||
xlab("Prevalence (Negative Controls)") + | ||
ylab("Prevalence (True Samples)") | ||
``` | ||
|
||
# generate output | ||
|
||
```{r remove contams} | ||
id_name <- colnames(otu)[1] | ||
ps_noncontam <- prune_taxa(!contamdf_prev$contaminant, ps) | ||
otu_table(ps_noncontam) %>% | ||
as.data.frame() %>% | ||
rownames_to_column(id_name) <- otu | ||
write.table(otu, | ||
file = file.path(getwd(), "otu_output.tsv"), | ||
sep = "\t", | ||
row.names = FALSE, | ||
) | ||
``` |
Oops, something went wrong.