Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Added action normalize that can normalize a FeatureTable[Frequency] with common methods. #312

Open
wants to merge 3 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ci/recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ requirements:
- qiime2 {{ qiime2_epoch }}.*
- q2templates {{ qiime2_epoch }}.*
- q2-types {{ qiime2_epoch }}.*
- rnanorm

test:
requires:
Expand Down
4 changes: 2 additions & 2 deletions q2_feature_table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from ._normalize import rarefy
from ._normalize import rarefy, normalize
from ._subsample_ids import subsample_ids
from ._transform import (presence_absence, relative_frequency, transpose)
from ._summarize import (summarize, tabulate_seqs, tabulate_sample_frequencies,
Expand All @@ -31,4 +31,4 @@
'filter_seqs', 'subsample_ids', 'rename_ids',
'filter_features_conditionally', 'split',
'tabulate_feature_frequencies', 'tabulate_sample_frequencies',
'summarize_plus']
'summarize_plus', 'normalize']
98 changes: 98 additions & 0 deletions q2_feature_table/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@

import biom

import os

import pandas as pd
from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat
from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ


def rarefy(table: biom.Table, sampling_depth: int,
with_replacement: bool = False) -> biom.Table:
Expand All @@ -23,3 +29,95 @@ def rarefy(table: biom.Table, sampling_depth: int,
'shallow enough sampling depth.')

return table


def normalize(
table: pd.DataFrame,
method: str,
m_trim: float = None,
a_trim: float = None,
gene_length: SequenceCharacteristicsDirectoryFormat = None,
) -> pd.DataFrame:
# Validate parameter combinations and set trim parameters
m_trim, a_trim = _validate_parameters(
method, m_trim, a_trim, gene_length)

# Process gene_lengths input and define methods that need gene_lengths
# input
if method in ["tpm", "fpkm"]:
lengths = _convert_lengths(table, gene_length)

methods = {
"tpm": TPM(gene_lengths=lengths),
"fpkm": FPKM(gene_lengths=lengths),
}

# Define remaining methods that don't need gene_lengths input
else:
methods = {
"tmm": TMM(m_trim=m_trim, a_trim=a_trim),
"ctf": CTF(m_trim=m_trim, a_trim=a_trim),
"uq": UQ(),
"cuf": CUF(),
"cpm": CPM(),
}

# Run normalization method on frequency table
normalized = methods[method].set_output(
transform="pandas").fit_transform(table)
normalized.index.name = "sample_id"

return normalized


def _validate_parameters(method, m_trim, a_trim, gene_length):
# Raise Error if gene-length is missing when using methods TPM or FPKM
if method in ["tpm", "fpkm"] and not gene_length:
raise ValueError("gene-length input is missing.")

# Raise Error if gene-length is given when using methods TMM, UQ, CUF,
# CPM or CTF
if method in ["tmm", "uq", "cuf", "ctf", "cpm"] and gene_length:
raise ValueError(
"gene-length input can only be used with FPKM and TPM methods."
)

# Raise Error if m_trim or a_trim are given when not using methods
# TMM or CTF
if ((method not in ["tmm", "ctf"])
and (m_trim is not None or a_trim is not None)):
raise ValueError(
"Parameters m-trim and a-trim can only be used with "
"methods TMM and CTF."
)

# Set m_trim and a_trim to their default values for methods
# TMM and CTF
if method in ["tmm", "ctf"]:
m_trim = 0.3 if m_trim is None else m_trim
a_trim = 0.05 if a_trim is None else a_trim

return m_trim, a_trim


def _convert_lengths(table, gene_length):
# Read in table from sequence_characteristics.tsv as a pd.Series
lengths = pd.read_csv(
os.path.join(gene_length.path, "sequence_characteristics.tsv"),
sep="\t",
header=None,
names=["index", "values"],
index_col="index",
skiprows=1,
).squeeze("columns")

# Check if all gene IDs that are present in the table are also
# present in the lengths
if not set(table.columns).issubset(set(lengths.index)):
only_in_counts = set(table.columns) - set(lengths.index)
raise ValueError(
f"There are genes present in the FeatureTable that are "
f"not present in the gene-length input. Missing lengths "
f"for genes: {only_in_counts}"
)
return lengths
9 changes: 9 additions & 0 deletions q2_feature_table/citations.bib
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,12 @@ @article{NCBI-BLAST
year = {2008},
doi = {10.1093/nar/gkn201},
}

@misc{Zmrzlikar_RNAnorm_RNA-seq_data_2023,
author = {Zmrzlikar, Jure and Žganec, Matjaž and Ausec, Luka and Štajdohar, Miha},
month = jul,
title = {{RNAnorm: RNA-seq data normalization in Python}},
url = {https://github.com/genialis/RNAnorm},
version = {2.1.0},
year = {2023}
}
81 changes: 78 additions & 3 deletions q2_feature_table/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from qiime2.core.type import Properties
from qiime2.plugin import (Plugin, Int, Float, Range, Metadata, Str, Bool,
Choices, MetadataColumn, Categorical, List,
Citations, TypeMatch, TypeMap, Collection,
Visualization)

from q2_types.feature_table import (
FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition)
FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition,
Normalized)
from q2_types.feature_data import (
FeatureData, Sequence, Taxonomy, AlignedSequence)
FeatureData, Sequence, Taxonomy, AlignedSequence, SequenceCharacteristics)
from q2_types.metadata import ImmutableMetadata

import q2_feature_table
Expand Down Expand Up @@ -723,3 +724,77 @@
"Tabulate sample and feature frequencies.",
examples={'feature_table_summarize_plus': ex.feature_table_summarize_plus}
)

P_method, T_normalized_table = TypeMap(
{
Str
% Choices("tpm"): (
FeatureTable[Normalized % Properties("TPM")],
),
Str
% Choices("fpkm"): (
FeatureTable[Normalized % Properties("FPKM")],
),
Str
% Choices("tmm"): (
FeatureTable[Normalized % Properties("TMM")],
),
Str
% Choices("uq"): (
FeatureTable[Normalized % Properties("UQ")],
),
Str
% Choices("cuf"): (
FeatureTable[Normalized % Properties("CUF")],
),
Str
% Choices("ctf"): (
FeatureTable[Normalized % Properties("CTF")],
),
Str
% Choices("cpm"): (
FeatureTable[Normalized % Properties("CPM")],
),
}
)

plugin.methods.register_function(
function=q2_feature_table.normalize,
inputs={
"table": FeatureTable[Frequency],
"gene_length": FeatureData[
SequenceCharacteristics % Properties("length")],
},
parameters={
"method": P_method,
"m_trim": Float % Range(
0, 1, inclusive_start=True, inclusive_end=True
),
"a_trim": Float % Range(
0, 1, inclusive_start=True, inclusive_end=True
),
},
outputs=[("normalized_table", T_normalized_table)],
input_descriptions={
"table": "Feature table with gene counts.",
"gene_length": "Gene lengths of all genes in the feature table.",
},
parameter_descriptions={
"method": "Specify the normalization method to be used. Use FPKM or "
"TPM for within comparisons and TMM, UQ, CUF or CTF for between "
"sample camparisons. Check https://www.genialis.com/wp-content/uploads"
"/2023/12/2023-Normalizing-RNA-seq-data-in-Python-with-RNAnorm.pdf "
"for more information on the methods.",
"m_trim": "Two sided cutoff for M-values. Can only be used for "
"methods TMM and CTF. (default = 0.3)",
"a_trim": "Two sided cutoff for A-values. Can only be used for "
"methods TMM and CTF. (default = 0.05)",
},
output_descriptions={
"normalized_table": "Feature table normalized with specified method."
},
name="Normalize FeatureTable",
description="Normalize FeatureTable by gene length, library size and "
"composition with common methods for RNA-seq.",
citations=[citations["Zmrzlikar_RNAnorm_RNA-seq_data_2023"]],
)
110 changes: 109 additions & 1 deletion q2_feature_table/tests/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,20 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
from unittest import TestCase, main
from unittest.mock import MagicMock, patch

import numpy as np
import numpy.testing as npt
import pandas as pd
from biom.table import Table
from pandas._testing import assert_series_equal
from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat

from q2_feature_table import rarefy
from q2_feature_table._normalize import (_validate_parameters,
_convert_lengths, normalize)


class RarefyTests(TestCase):
Expand Down Expand Up @@ -53,5 +59,107 @@ def test_rarefy_depth_error(self):
rarefy(t, 50)


class NormalizeTests(TestCase):

@classmethod
def setUpClass(cls):
cls.lengths = pd.Series(
{
"ARO1": 1356.0,
"ARO2": 1173.0,
},
name="values",
)
cls.lengths.index.name = "index"
cls.table = pd.DataFrame({
'ID': ['sample1', 'sample2'],
'ARO1': [2.0, 2.0],
'ARO2': [0.0, 0.0]
}).set_index('ID')

def test_validate_parameters_uq_with_m_a_trim(self):
# Test Error raised if gene-length is given with UQ method
with self.assertRaisesRegex(
ValueError,
"Parameters m-trim and a-trim can only "
"be used with methods TMM and CTF.",
):
_validate_parameters("uq", 0.2, 0.05, None)

def test_validate_parameters_tpm_missing_gene_length(self):
# Test Error raised if gene-length is missing with TPM method
with self.assertRaisesRegex(
ValueError, "gene-length input is missing."):
_validate_parameters("tpm", None, None, None)

def test_validate_parameters_tmm_gene_length(self):
# Test Error raised if gene-length is given with TMM method
with self.assertRaisesRegex(
ValueError,
"gene-length input can only be used with FPKM and "
"TPM methods."
):
_validate_parameters(
"tmm", None, None, gene_length=MagicMock())

def test_validate_parameters_default_m_a_trim(self):
# Test if m_trim and a_trim get set to default values if None
m_trim, a_trim = _validate_parameters("tmm", None, None, None)
self.assertEqual(m_trim, 0.3)
self.assertEqual(a_trim, 0.05)

def test_validate_parameters_m_a_trim(self):
# Test if m_trim and a_trim are not modified if not None
m_trim, a_trim = _validate_parameters("tmm", 0.1, 0.06, None)
self.assertEqual(m_trim, 0.1)
self.assertEqual(a_trim, 0.06)

def test_convert_lengths_gene_length(self):
# Test _convert_lengths
gene_length = SequenceCharacteristicsDirectoryFormat()
with open(os.path.join(
str(gene_length), "sequence_characteristics.tsv"),
'w') as file:
file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")

obs = _convert_lengths(self.table, gene_length=gene_length)
assert_series_equal(obs, self.lengths)

def test_convert_lengths_short_gene_length(self):
# Test Error raised if gene-length is missing genes
gene_length = SequenceCharacteristicsDirectoryFormat()
with open(os.path.join(
str(gene_length),
"sequence_characteristics.tsv"), 'w') as file:
file.write("id\tlength\nARO1\t1356.0")
with self.assertRaisesRegex(
ValueError,
"There are genes present in the FeatureTable that are "
"not present in the gene-length input. Missing lengths "
"for genes: {'ARO2'}",
):
_convert_lengths(self.table, gene_length=gene_length)

@patch("q2_feature_table._normalize.TPM")
def test_tpm_fpkm_with_valid_inputs(self, mock_tpm):
# Test valid inputs for TPM method
gene_length = SequenceCharacteristicsDirectoryFormat()
with open(os.path.join(
str(gene_length), "sequence_characteristics.tsv"),
'w') as file:
file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")
normalize(table=self.table, gene_length=gene_length, method="tpm")

@patch("q2_feature_table._normalize.TMM")
def test_tmm_uq_cuf_ctf_with_valid_inputs(self, mock_tmm):
# Test valid inputs for TMM method
gene_length = SequenceCharacteristicsDirectoryFormat()
with open(os.path.join(
str(gene_length), "sequence_characteristics.tsv"),
'w') as file:
file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0")
normalize(table=self.table, method="tmm", a_trim=0.06, m_trim=0.4)


if __name__ == "__main__":
main()
Loading