Merge pull request #34 from computational-ms/feature/break_tests

Feature/break tests
computational-ms · Mar 7, 2023 · 30e7ebc
2 parents 29af69b + db4c03b
commit 30e7ebc
Show file tree

Hide file tree

Showing 6 changed files with 39 additions and 12 deletions.
diff --git a/pyiohat/parsers/ident/comet_2020_01_4_parser.py b/pyiohat/parsers/ident/comet_2020_01_4_parser.py
@@ -96,10 +96,21 @@ def map_mod_mass(self):
                 if entry_tag.endswith("cvParam"):
                     mod_name = entry.attrib["name"]
                 elif entry_tag.endswith("SearchModification"):
+                    if mod_name == "unknown modification":
+                        potential_mod = self.mod_mapper.mass_to_names(
+                            float(entry.attrib["massDelta"]), decimals=4
+                        )
+                        if len(potential_mod) == 0:
+                            logger.error(
+                                f"Cannot map modification with mass {entry.attrib['massDelta']}."
+                            )
+                            raise ValueError
+                        else:
+                            mod_name = potential_mod[0]
                     mod_mass_map[entry.attrib["massDelta"]] = mod_name
                     if entry.attrib["fixedMod"] == "true":
-                        _key = entry.attrib["residues"]
-                        fixed_mods[_key] = mod_name
+                        residue = entry.attrib["residues"]
+                        fixed_mods[residue] = mod_name
                 elif entry_tag.endswith("ModificationParams"):
                     break
             entry.clear()
@@ -120,9 +131,9 @@ def get_peptide_lookup(self):
         for event, entry in etree.iterparse(self.input_file):
             entry_tag = entry.tag
 
-            if entry_tag.endswith("DBSequence"):
+            if entry_tag.endswith("PeptideSequence"):
                 peptide_information = True
-            elif peptide_information is True:
+            if peptide_information is True:
                 if entry_tag.endswith("PeptideSequence"):
                     sequence = entry.text
                     if len(self.fixed_mods) > 0:
@@ -160,9 +171,9 @@ def get_spec_records(self):
         for event, entry in etree.iterparse(self.input_file):
             entry_tag = entry.tag
 
-            if entry_tag.endswith("Inputs"):
+            if entry_tag.endswith("PeptideEvidenceRef"):
                 spec_information = True
-            elif spec_information is True:
+            if spec_information is True:
                 if entry_tag.endswith("cvParam"):
                     if entry.attrib["name"] in self.mapping_dict:
                         _key = self.mapping_dict[entry.attrib["name"]]

diff --git a/pyiohat/parsers/ident/msgfplus_2021_03_22_parser.py b/pyiohat/parsers/ident/msgfplus_2021_03_22_parser.py
@@ -85,9 +85,9 @@ def get_peptide_lookup(self):
         for event, entry in etree.iterparse(self.input_file):
             entry_tag = entry.tag
 
-            if entry_tag.endswith("DBSequence"):
+            if entry_tag.endswith("PeptideSequence"):
                 peptide_information = True
-            elif peptide_information is True:
+            if peptide_information is True:
                 if entry_tag.endswith("PeptideSequence"):
                     sequence = {"sequence": entry.text}
                 elif entry_tag.endswith("cvParam"):

diff --git a/pyiohat/parsers/misc.py b/pyiohat/parsers/misc.py
@@ -38,9 +38,21 @@ def get_atom_counts(sequences, modifications, compositions):
                 np.char.count(sequences, aa_or_mod), ordered_element_multiplier
             )
         else:
-            atom_counts += np.outer(
-                np.char.count(modifications, aa_or_mod), ordered_element_multiplier
+            mod_counts = []
+            escaped_mod_name = re.escape(aa_or_mod)
+            search_pattern = re.compile(
+                rf"(^{escaped_mod_name}:\d+)(?=;)|(?<=;)({escaped_mod_name}:\d+)(?=;)|(?<=;)({escaped_mod_name}:\d+$)|^({escaped_mod_name}:\d+)$"
             )
+            for mod in modifications:
+                mod_counts.append(
+                    len(
+                        re.findall(
+                            search_pattern,
+                            mod,
+                        )
+                    )
+                )
+            atom_counts += np.outer(mod_counts, ordered_element_multiplier)
     # Remove water (peptide bonds)
     water = np.zeros(shape=(1, len(elements)), dtype=int)
     water[0, elements.index("H")] = 2

diff --git a/tests/data/BSA1_comet_2020_01_4.mzid b/tests/data/BSA1_comet_2020_01_4.mzid
@@ -13,6 +13,7 @@
  </AnalysisSoftwareList>
  <SequenceCollection xmlns="http://psidev.info/psi/pi/mzIdentML/1.2">
  <DBSequence id="sp|P02769|ALBU_BOVIN" accession="sp|P02769|ALBU_BOVIN" searchDatabase_ref="DB0" />
+ <DBSequence id="sp|P02760|ALBU_BOVIN2" accession="sp|P02769|ALBU_BOVIN2" searchDatabase_ref="DB1" />
  <Peptide id="AEFVEVTK;">
   <PeptideSequence>AEFVEVTK</PeptideSequence>
  </Peptide>

diff --git a/tests/data/BSA1_msgfplus_2021_03_22.mzid b/tests/data/BSA1_msgfplus_2021_03_22.mzid
@@ -16,6 +16,9 @@
   <DBSequence length="607" searchDatabase_ref="SearchDB_1" accession="sp|P02769|ALBU_BOVIN" id="DBSeq1">
     <cvParam cvRef="PSI-MS" accession="MS:1001088" name="protein description" value="sp|P02769|ALBU_BOVIN Serum albumin OS=Bos taurus GN=ALB PE=1 SV=4"/>
   </DBSequence>
+  <DBSequence length="608" searchDatabase_ref="SearchDB_1" accession="sp|P02769|ALBU2_BOVIN" id="DBSeq1">
+    <cvParam cvRef="PSI-MS" accession="MS:1001089" name="protein description" value="sp|P02769|ALBU2_BOVIN Serum albumin OS=Bos taurus GN=ALB PE=1 SV=4"/>
+  </DBSequence>
   <Peptide id="Pep_YICDNQDTISSK">
     <PeptideSequence>YICDNQDTISSK</PeptideSequence>
     <Modification location="3" monoisotopicMassDelta="57.021464">

diff --git a/tests/parsers/misc/test_get_atom_counts.py b/tests/parsers/misc/test_get_atom_counts.py
@@ -12,7 +12,7 @@ def test_simple():
         [
             "",
             "",
-            "Magic",
+            "Magic:1",
         ],
         dtype=str,
     )
@@ -43,7 +43,7 @@ def test_negative():
     }
     modifications = np.array(
         [
-            "Magic",
+            "Magic:1",
         ],
         dtype=str,
     )