diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml index 79390ce..b810c35 100644 --- a/ci/recipe/meta.yaml +++ b/ci/recipe/meta.yaml @@ -31,6 +31,7 @@ requirements: - qiime2 {{ qiime2_epoch }}.* - q2templates {{ qiime2_epoch }}.* - q2-types {{ qiime2_epoch }}.* + - rnanorm test: requires: diff --git a/q2_feature_table/__init__.py b/q2_feature_table/__init__.py index d038d73..d68a6ac 100644 --- a/q2_feature_table/__init__.py +++ b/q2_feature_table/__init__.py @@ -6,7 +6,7 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -from ._normalize import rarefy +from ._normalize import rarefy, normalize from ._subsample_ids import subsample_ids from ._transform import (presence_absence, relative_frequency, transpose) from ._summarize import (summarize, tabulate_seqs, tabulate_sample_frequencies, @@ -31,4 +31,4 @@ 'filter_seqs', 'subsample_ids', 'rename_ids', 'filter_features_conditionally', 'split', 'tabulate_feature_frequencies', 'tabulate_sample_frequencies', - 'summarize_plus'] + 'summarize_plus', 'normalize'] diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py index d9c2935..ff9e2ce 100644 --- a/q2_feature_table/_normalize.py +++ b/q2_feature_table/_normalize.py @@ -8,6 +8,12 @@ import biom +import os + +import pandas as pd +from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat +from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ + def rarefy(table: biom.Table, sampling_depth: int, with_replacement: bool = False) -> biom.Table: @@ -23,3 +29,95 @@ def rarefy(table: biom.Table, sampling_depth: int, 'shallow enough sampling depth.') return table + + +def normalize( + table: pd.DataFrame, + method: str, + m_trim: float = None, + a_trim: float = None, + gene_length: SequenceCharacteristicsDirectoryFormat = None, +) -> pd.DataFrame: + # Validate parameter combinations and set trim parameters + m_trim, a_trim = _validate_parameters( + method, m_trim, a_trim, gene_length) + + # Process gene_lengths input and define methods that need gene_lengths + # input + if method in ["tpm", "fpkm"]: + lengths = _convert_lengths(table, gene_length) + + methods = { + "tpm": TPM(gene_lengths=lengths), + "fpkm": FPKM(gene_lengths=lengths), + } + + # Define remaining methods that don't need gene_lengths input + else: + methods = { + "tmm": TMM(m_trim=m_trim, a_trim=a_trim), + "ctf": CTF(m_trim=m_trim, a_trim=a_trim), + "uq": UQ(), + "cuf": CUF(), + "cpm": CPM(), + } + + # Run normalization method on frequency table + normalized = methods[method].set_output( + transform="pandas").fit_transform(table) + normalized.index.name = "sample_id" + + return normalized + + +def _validate_parameters(method, m_trim, a_trim, gene_length): + # Raise Error if gene-length is missing when using methods TPM or FPKM + if method in ["tpm", "fpkm"] and not gene_length: + raise ValueError("gene-length input is missing.") + + # Raise Error if gene-length is given when using methods TMM, UQ, CUF, + # CPM or CTF + if method in ["tmm", "uq", "cuf", "ctf", "cpm"] and gene_length: + raise ValueError( + "gene-length input can only be used with FPKM and TPM methods." + ) + + # Raise Error if m_trim or a_trim are given when not using methods + # TMM or CTF + if ((method not in ["tmm", "ctf"]) + and (m_trim is not None or a_trim is not None)): + raise ValueError( + "Parameters m-trim and a-trim can only be used with " + "methods TMM and CTF." + ) + + # Set m_trim and a_trim to their default values for methods + # TMM and CTF + if method in ["tmm", "ctf"]: + m_trim = 0.3 if m_trim is None else m_trim + a_trim = 0.05 if a_trim is None else a_trim + + return m_trim, a_trim + + +def _convert_lengths(table, gene_length): + # Read in table from sequence_characteristics.tsv as a pd.Series + lengths = pd.read_csv( + os.path.join(gene_length.path, "sequence_characteristics.tsv"), + sep="\t", + header=None, + names=["index", "values"], + index_col="index", + skiprows=1, + ).squeeze("columns") + + # Check if all gene IDs that are present in the table are also + # present in the lengths + if not set(table.columns).issubset(set(lengths.index)): + only_in_counts = set(table.columns) - set(lengths.index) + raise ValueError( + f"There are genes present in the FeatureTable that are " + f"not present in the gene-length input. Missing lengths " + f"for genes: {only_in_counts}" + ) + return lengths diff --git a/q2_feature_table/citations.bib b/q2_feature_table/citations.bib index 3177333..82bc7c9 100644 --- a/q2_feature_table/citations.bib +++ b/q2_feature_table/citations.bib @@ -43,3 +43,12 @@ @article{NCBI-BLAST year = {2008}, doi = {10.1093/nar/gkn201}, } + +@misc{Zmrzlikar_RNAnorm_RNA-seq_data_2023, +author = {Zmrzlikar, Jure and Žganec, Matjaž and Ausec, Luka and Štajdohar, Miha}, +month = jul, +title = {{RNAnorm: RNA-seq data normalization in Python}}, +url = {https://github.com/genialis/RNAnorm}, +version = {2.1.0}, +year = {2023} +} diff --git a/q2_feature_table/plugin_setup.py b/q2_feature_table/plugin_setup.py index 569b4d3..fcebd4c 100644 --- a/q2_feature_table/plugin_setup.py +++ b/q2_feature_table/plugin_setup.py @@ -5,16 +5,17 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - +from qiime2.core.type import Properties from qiime2.plugin import (Plugin, Int, Float, Range, Metadata, Str, Bool, Choices, MetadataColumn, Categorical, List, Citations, TypeMatch, TypeMap, Collection, Visualization) from q2_types.feature_table import ( - FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition) + FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition, + Normalized) from q2_types.feature_data import ( - FeatureData, Sequence, Taxonomy, AlignedSequence) + FeatureData, Sequence, Taxonomy, AlignedSequence, SequenceCharacteristics) from q2_types.metadata import ImmutableMetadata import q2_feature_table @@ -723,3 +724,77 @@ "Tabulate sample and feature frequencies.", examples={'feature_table_summarize_plus': ex.feature_table_summarize_plus} ) + +P_method, T_normalized_table = TypeMap( + { + Str + % Choices("tpm"): ( + FeatureTable[Normalized % Properties("TPM")], + ), + Str + % Choices("fpkm"): ( + FeatureTable[Normalized % Properties("FPKM")], + ), + Str + % Choices("tmm"): ( + FeatureTable[Normalized % Properties("TMM")], + ), + Str + % Choices("uq"): ( + FeatureTable[Normalized % Properties("UQ")], + ), + Str + % Choices("cuf"): ( + FeatureTable[Normalized % Properties("CUF")], + ), + Str + % Choices("ctf"): ( + FeatureTable[Normalized % Properties("CTF")], + ), + Str + % Choices("cpm"): ( + FeatureTable[Normalized % Properties("CPM")], + ), + } +) + +plugin.methods.register_function( + function=q2_feature_table.normalize, + inputs={ + "table": FeatureTable[Frequency], + "gene_length": FeatureData[ + SequenceCharacteristics % Properties("length")], + }, + parameters={ + "method": P_method, + "m_trim": Float % Range( + 0, 1, inclusive_start=True, inclusive_end=True + ), + "a_trim": Float % Range( + 0, 1, inclusive_start=True, inclusive_end=True + ), + }, + outputs=[("normalized_table", T_normalized_table)], + input_descriptions={ + "table": "Feature table with gene counts.", + "gene_length": "Gene lengths of all genes in the feature table.", + }, + parameter_descriptions={ + "method": "Specify the normalization method to be used. Use FPKM or " + "TPM for within comparisons and TMM, UQ, CUF or CTF for between " + "sample camparisons. Check https://www.genialis.com/wp-content/uploads" + "/2023/12/2023-Normalizing-RNA-seq-data-in-Python-with-RNAnorm.pdf " + "for more information on the methods.", + "m_trim": "Two sided cutoff for M-values. Can only be used for " + "methods TMM and CTF. (default = 0.3)", + "a_trim": "Two sided cutoff for A-values. Can only be used for " + "methods TMM and CTF. (default = 0.05)", + }, + output_descriptions={ + "normalized_table": "Feature table normalized with specified method." + }, + name="Normalize FeatureTable", + description="Normalize FeatureTable by gene length, library size and " + "composition with common methods for RNA-seq.", + citations=[citations["Zmrzlikar_RNAnorm_RNA-seq_data_2023"]], +) diff --git a/q2_feature_table/tests/test_normalize.py b/q2_feature_table/tests/test_normalize.py index d45ca94..cb4a20f 100644 --- a/q2_feature_table/tests/test_normalize.py +++ b/q2_feature_table/tests/test_normalize.py @@ -5,14 +5,20 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - +import os from unittest import TestCase, main +from unittest.mock import MagicMock, patch import numpy as np import numpy.testing as npt +import pandas as pd from biom.table import Table +from pandas._testing import assert_series_equal +from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat from q2_feature_table import rarefy +from q2_feature_table._normalize import (_validate_parameters, + _convert_lengths, normalize) class RarefyTests(TestCase): @@ -53,5 +59,107 @@ def test_rarefy_depth_error(self): rarefy(t, 50) +class NormalizeTests(TestCase): + + @classmethod + def setUpClass(cls): + cls.lengths = pd.Series( + { + "ARO1": 1356.0, + "ARO2": 1173.0, + }, + name="values", + ) + cls.lengths.index.name = "index" + cls.table = pd.DataFrame({ + 'ID': ['sample1', 'sample2'], + 'ARO1': [2.0, 2.0], + 'ARO2': [0.0, 0.0] + }).set_index('ID') + + def test_validate_parameters_uq_with_m_a_trim(self): + # Test Error raised if gene-length is given with UQ method + with self.assertRaisesRegex( + ValueError, + "Parameters m-trim and a-trim can only " + "be used with methods TMM and CTF.", + ): + _validate_parameters("uq", 0.2, 0.05, None) + + def test_validate_parameters_tpm_missing_gene_length(self): + # Test Error raised if gene-length is missing with TPM method + with self.assertRaisesRegex( + ValueError, "gene-length input is missing."): + _validate_parameters("tpm", None, None, None) + + def test_validate_parameters_tmm_gene_length(self): + # Test Error raised if gene-length is given with TMM method + with self.assertRaisesRegex( + ValueError, + "gene-length input can only be used with FPKM and " + "TPM methods." + ): + _validate_parameters( + "tmm", None, None, gene_length=MagicMock()) + + def test_validate_parameters_default_m_a_trim(self): + # Test if m_trim and a_trim get set to default values if None + m_trim, a_trim = _validate_parameters("tmm", None, None, None) + self.assertEqual(m_trim, 0.3) + self.assertEqual(a_trim, 0.05) + + def test_validate_parameters_m_a_trim(self): + # Test if m_trim and a_trim are not modified if not None + m_trim, a_trim = _validate_parameters("tmm", 0.1, 0.06, None) + self.assertEqual(m_trim, 0.1) + self.assertEqual(a_trim, 0.06) + + def test_convert_lengths_gene_length(self): + # Test _convert_lengths + gene_length = SequenceCharacteristicsDirectoryFormat() + with open(os.path.join( + str(gene_length), "sequence_characteristics.tsv"), + 'w') as file: + file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0") + + obs = _convert_lengths(self.table, gene_length=gene_length) + assert_series_equal(obs, self.lengths) + + def test_convert_lengths_short_gene_length(self): + # Test Error raised if gene-length is missing genes + gene_length = SequenceCharacteristicsDirectoryFormat() + with open(os.path.join( + str(gene_length), + "sequence_characteristics.tsv"), 'w') as file: + file.write("id\tlength\nARO1\t1356.0") + with self.assertRaisesRegex( + ValueError, + "There are genes present in the FeatureTable that are " + "not present in the gene-length input. Missing lengths " + "for genes: {'ARO2'}", + ): + _convert_lengths(self.table, gene_length=gene_length) + + @patch("q2_feature_table._normalize.TPM") + def test_tpm_fpkm_with_valid_inputs(self, mock_tpm): + # Test valid inputs for TPM method + gene_length = SequenceCharacteristicsDirectoryFormat() + with open(os.path.join( + str(gene_length), "sequence_characteristics.tsv"), + 'w') as file: + file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0") + normalize(table=self.table, gene_length=gene_length, method="tpm") + + @patch("q2_feature_table._normalize.TMM") + def test_tmm_uq_cuf_ctf_with_valid_inputs(self, mock_tmm): + # Test valid inputs for TMM method + gene_length = SequenceCharacteristicsDirectoryFormat() + with open(os.path.join( + str(gene_length), "sequence_characteristics.tsv"), + 'w') as file: + file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0") + normalize(table=self.table, method="tmm", a_trim=0.06, m_trim=0.4) + + if __name__ == "__main__": main()