Minor fixes to glycan parsing (#219)

* Fix CCD code recognition when parsing glycan strings
* Improve logging when tokenization fails
* Glycans should always be tokenized per atom
* Add test
chaidiscovery · Dec 6, 2024 · 3278943 · 3278943
1 parent cf7bf51
commit 3278943
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 3 deletions.
diff --git a/chai_lab/data/dataset/inference_dataset.py b/chai_lab/data/dataset/inference_dataset.py
@@ -187,8 +187,12 @@ def load_chains_from_raw(
                 chain_id=chain_index,
                 sym_id=sym_id,
             )
-        except Exception:
-            logger.exception(f"Failed to tokenize input {entity_data=}  {sym_id=}")
+            if tok is None:
+                logger.exception(f"Failed to tokenize input {entity_data=}  {sym_id=}")
+        except Exception as e:
+            logger.exception(
+                f"Failed to tokenize input {entity_data=}  {sym_id=}", exc_info=e
+            )
             tok = None
         structure_contexts.append(tok)
 

diff --git a/chai_lab/data/dataset/structure/all_atom_residue_tokenizer.py b/chai_lab/data/dataset/structure/all_atom_residue_tokenizer.py
@@ -183,6 +183,7 @@ def tokenize_residue(
             if (
                 residue.name in standard_residue_pdb_codes
                 and entity_type != EntityType.LIGAND
+                and entity_type != EntityType.MANUAL_GLYCAN
             )
             else self._tokenize_per_atom
         )
@@ -388,6 +389,9 @@ def _tokenize_entity(
 
         valid_residues = [x for x in tokenized_residues if x is not None]
         if len(valid_residues) == 0:
+            logger.warning(
+                f"Got no residues for entity {entity_data.entity_id} with residues {entity_data.residues}"
+            )
             return None
 
         tokens = TokenSpan.concatenate(valid_residues)

diff --git a/chai_lab/data/parsing/glycans.py b/chai_lab/data/parsing/glycans.py
@@ -61,7 +61,7 @@ def _glycan_string_to_sugars_and_bonds(
             parent_sugar_idx.pop()  # Remove
             continue
         chunk = glycan_string[i : i + 3]
-        if re.match(r"[A-Z]{3}", chunk):
+        if re.match(r"[0-9A-Z]{3}", chunk):  # Match CCD codes (3 char, alphanumeric)
             sugars.append(chunk)
             parent_sugar_idx.append(len(sugars) - 1)  # latest sugar
         elif re.match(r"[1-6]{1}-[1-6]{1}", chunk):
@@ -81,6 +81,8 @@ def _glycan_string_to_sugars_and_bonds(
 
 def glycan_string_residues(glycan_string: str) -> list[Residue]:
     sugars, _bonds = _glycan_string_to_sugars_and_bonds(glycan_string)
+    if not sugars:
+        raise ValueError(f"No residues parsed from {glycan_string=}")
     return [
         Residue(
             name=sugar,

diff --git a/tests/test_glycans.py b/tests/test_glycans.py
@@ -1,9 +1,18 @@
 # Copyright (c) 2024 Chai Discovery, Inc.
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for details.
+import pytest
+
 from chai_lab.data.parsing.glycans import _glycan_string_to_sugars_and_bonds
 
 
+@pytest.mark.parametrize("ccd_code", ["MAN", "99K", "FUC"])
+def test_parsing_ccd_codes(ccd_code: str):
+    """Test that various single CCD codes are parsed correctly."""
+    res, _ = _glycan_string_to_sugars_and_bonds(ccd_code)
+    assert len(res) == 1
+
+
 def test_complex_parsing():
     glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))"
     sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)