diff --git a/q2_feature_table/__init__.py b/q2_feature_table/__init__.py index d038d73..d68a6ac 100644 --- a/q2_feature_table/__init__.py +++ b/q2_feature_table/__init__.py @@ -6,7 +6,7 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -from ._normalize import rarefy +from ._normalize import rarefy, normalize from ._subsample_ids import subsample_ids from ._transform import (presence_absence, relative_frequency, transpose) from ._summarize import (summarize, tabulate_seqs, tabulate_sample_frequencies, @@ -31,4 +31,4 @@ 'filter_seqs', 'subsample_ids', 'rename_ids', 'filter_features_conditionally', 'split', 'tabulate_feature_frequencies', 'tabulate_sample_frequencies', - 'summarize_plus'] + 'summarize_plus', 'normalize'] diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py index fcde136..ff9e2ce 100644 --- a/q2_feature_table/_normalize.py +++ b/q2_feature_table/_normalize.py @@ -75,19 +75,24 @@ def _validate_parameters(method, m_trim, a_trim, gene_length): if method in ["tpm", "fpkm"] and not gene_length: raise ValueError("gene-length input is missing.") - # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or CTF + # Raise Error if gene-length is given when using methods TMM, UQ, CUF, + # CPM or CTF if method in ["tmm", "uq", "cuf", "ctf", "cpm"] and gene_length: raise ValueError( "gene-length input can only be used with FPKM and TPM methods." ) - # Raise Error if m_trim or a_trim are given when not using methods TMM or CTF - if (method not in ["tmm", "ctf"]) and (m_trim is not None or a_trim is not None): + # Raise Error if m_trim or a_trim are given when not using methods + # TMM or CTF + if ((method not in ["tmm", "ctf"]) + and (m_trim is not None or a_trim is not None)): raise ValueError( - "Parameters m-trim and a-trim can only be used with methods TMM and CTF." + "Parameters m-trim and a-trim can only be used with " + "methods TMM and CTF." ) - # Set m_trim and a_trim to their default values for methods TMM and CTF + # Set m_trim and a_trim to their default values for methods + # TMM and CTF if method in ["tmm", "ctf"]: m_trim = 0.3 if m_trim is None else m_trim a_trim = 0.05 if a_trim is None else a_trim @@ -106,13 +111,13 @@ def _convert_lengths(table, gene_length): skiprows=1, ).squeeze("columns") - # Check if all gene IDs that are present in the table are also present in - # the lengths + # Check if all gene IDs that are present in the table are also + # present in the lengths if not set(table.columns).issubset(set(lengths.index)): only_in_counts = set(table.columns) - set(lengths.index) raise ValueError( - f"There are genes present in the FeatureTable that are not present " - f"in the gene-length input. Missing lengths for genes: " - f"{only_in_counts}" + f"There are genes present in the FeatureTable that are " + f"not present in the gene-length input. Missing lengths " + f"for genes: {only_in_counts}" ) return lengths diff --git a/q2_feature_table/citations.bib b/q2_feature_table/citations.bib index 3177333..82bc7c9 100644 --- a/q2_feature_table/citations.bib +++ b/q2_feature_table/citations.bib @@ -43,3 +43,12 @@ @article{NCBI-BLAST year = {2008}, doi = {10.1093/nar/gkn201}, } + +@misc{Zmrzlikar_RNAnorm_RNA-seq_data_2023, +author = {Zmrzlikar, Jure and Žganec, Matjaž and Ausec, Luka and Štajdohar, Miha}, +month = jul, +title = {{RNAnorm: RNA-seq data normalization in Python}}, +url = {https://github.com/genialis/RNAnorm}, +version = {2.1.0}, +year = {2023} +} diff --git a/q2_feature_table/plugin_setup.py b/q2_feature_table/plugin_setup.py index 569b4d3..fcebd4c 100644 --- a/q2_feature_table/plugin_setup.py +++ b/q2_feature_table/plugin_setup.py @@ -5,16 +5,17 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - +from qiime2.core.type import Properties from qiime2.plugin import (Plugin, Int, Float, Range, Metadata, Str, Bool, Choices, MetadataColumn, Categorical, List, Citations, TypeMatch, TypeMap, Collection, Visualization) from q2_types.feature_table import ( - FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition) + FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Composition, + Normalized) from q2_types.feature_data import ( - FeatureData, Sequence, Taxonomy, AlignedSequence) + FeatureData, Sequence, Taxonomy, AlignedSequence, SequenceCharacteristics) from q2_types.metadata import ImmutableMetadata import q2_feature_table @@ -723,3 +724,77 @@ "Tabulate sample and feature frequencies.", examples={'feature_table_summarize_plus': ex.feature_table_summarize_plus} ) + +P_method, T_normalized_table = TypeMap( + { + Str + % Choices("tpm"): ( + FeatureTable[Normalized % Properties("TPM")], + ), + Str + % Choices("fpkm"): ( + FeatureTable[Normalized % Properties("FPKM")], + ), + Str + % Choices("tmm"): ( + FeatureTable[Normalized % Properties("TMM")], + ), + Str + % Choices("uq"): ( + FeatureTable[Normalized % Properties("UQ")], + ), + Str + % Choices("cuf"): ( + FeatureTable[Normalized % Properties("CUF")], + ), + Str + % Choices("ctf"): ( + FeatureTable[Normalized % Properties("CTF")], + ), + Str + % Choices("cpm"): ( + FeatureTable[Normalized % Properties("CPM")], + ), + } +) + +plugin.methods.register_function( + function=q2_feature_table.normalize, + inputs={ + "table": FeatureTable[Frequency], + "gene_length": FeatureData[ + SequenceCharacteristics % Properties("length")], + }, + parameters={ + "method": P_method, + "m_trim": Float % Range( + 0, 1, inclusive_start=True, inclusive_end=True + ), + "a_trim": Float % Range( + 0, 1, inclusive_start=True, inclusive_end=True + ), + }, + outputs=[("normalized_table", T_normalized_table)], + input_descriptions={ + "table": "Feature table with gene counts.", + "gene_length": "Gene lengths of all genes in the feature table.", + }, + parameter_descriptions={ + "method": "Specify the normalization method to be used. Use FPKM or " + "TPM for within comparisons and TMM, UQ, CUF or CTF for between " + "sample camparisons. Check https://www.genialis.com/wp-content/uploads" + "/2023/12/2023-Normalizing-RNA-seq-data-in-Python-with-RNAnorm.pdf " + "for more information on the methods.", + "m_trim": "Two sided cutoff for M-values. Can only be used for " + "methods TMM and CTF. (default = 0.3)", + "a_trim": "Two sided cutoff for A-values. Can only be used for " + "methods TMM and CTF. (default = 0.05)", + }, + output_descriptions={ + "normalized_table": "Feature table normalized with specified method." + }, + name="Normalize FeatureTable", + description="Normalize FeatureTable by gene length, library size and " + "composition with common methods for RNA-seq.", + citations=[citations["Zmrzlikar_RNAnorm_RNA-seq_data_2023"]], +) diff --git a/q2_feature_table/tests/test_normalize.py b/q2_feature_table/tests/test_normalize.py index d2d1726..66134af 100644 --- a/q2_feature_table/tests/test_normalize.py +++ b/q2_feature_table/tests/test_normalize.py @@ -6,7 +6,6 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import os -import shutil from unittest import TestCase, main from unittest.mock import MagicMock, patch @@ -18,8 +17,8 @@ from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat from q2_feature_table import rarefy -from q2_feature_table._normalize import _validate_parameters, _convert_lengths, \ - normalize +from q2_feature_table._normalize import (_validate_parameters, \ + _convert_lengths, normalize) class RarefyTests(TestCase): @@ -89,16 +88,19 @@ def test_validate_parameters_uq_with_m_a_trim(self): def test_validate_parameters_tpm_missing_gene_length(self): # Test Error raised if gene-length is missing with TPM method - with self.assertRaisesRegex(ValueError, "gene-length input is missing."): + with self.assertRaisesRegex( + ValueError, "gene-length input is missing."): _validate_parameters("tpm", None, None, None) def test_validate_parameters_tmm_gene_length(self): # Test Error raised if gene-length is given with TMM method with self.assertRaisesRegex( ValueError, - "gene-length input can only be used with FPKM and TPM methods." + "gene-length input can only be used with FPKM and " + "TPM methods." ): - _validate_parameters("tmm", None, None, gene_length=MagicMock()) + _validate_parameters( + "tmm", None, None, gene_length=MagicMock()) def test_validate_parameters_default_m_a_trim(self): # Test if m_trim and a_trim get set to default values if None @@ -115,7 +117,8 @@ def test_validate_parameters_m_a_trim(self): def test_convert_lengths_gene_length(self): # Test _convert_lengths gene_length = SequenceCharacteristicsDirectoryFormat() - with open(os.path.join(str(gene_length), "sequence_characteristics.tsv"), + with open(os.path.join( + str(gene_length), "sequence_characteristics.tsv"), 'w') as file: file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0") @@ -125,13 +128,15 @@ def test_convert_lengths_gene_length(self): def test_convert_lengths_short_gene_length(self): # Test Error raised if gene-length is missing genes gene_length = SequenceCharacteristicsDirectoryFormat() - with open(os.path.join(str(gene_length), "sequence_characteristics.tsv"), 'w') as file: + with open(os.path.join( + str(gene_length), + "sequence_characteristics.tsv"), 'w') as file: file.write("id\tlength\nARO1\t1356.0") with self.assertRaisesRegex( ValueError, - "There are genes present in the FeatureTable that are not present " - "in the gene-length input. Missing lengths for genes: " - "{'ARO2'}", + "There are genes present in the FeatureTable that are " + "not present in the gene-length input. Missing lengths " + "for genes: {'ARO2'}", ): _convert_lengths(self.table, gene_length=gene_length) @@ -139,7 +144,8 @@ def test_convert_lengths_short_gene_length(self): def test_tpm_fpkm_with_valid_inputs(self, mock_tpm): # Test valid inputs for TPM method gene_length = SequenceCharacteristicsDirectoryFormat() - with open(os.path.join(str(gene_length), "sequence_characteristics.tsv"), + with open(os.path.join( + str(gene_length), "sequence_characteristics.tsv"), 'w') as file: file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0") normalize(table=self.table, gene_length=gene_length, method="tpm") @@ -148,7 +154,8 @@ def test_tpm_fpkm_with_valid_inputs(self, mock_tpm): def test_tmm_uq_cuf_ctf_with_valid_inputs(self, mock_tmm): # Test valid inputs for TMM method gene_length = SequenceCharacteristicsDirectoryFormat() - with open(os.path.join(str(gene_length), "sequence_characteristics.tsv"), + with open(os.path.join( + str(gene_length), "sequence_characteristics.tsv"), 'w') as file: file.write("id\tlength\nARO1\t1356.0\nARO2\t1173.0") normalize(table=self.table, method="tmm", a_trim=0.06, m_trim=0.4)