Skip to content

Commit

Permalink
Minor fixes to glycan parsing (#219)
Browse files Browse the repository at this point in the history
* Fix CCD code recognition when parsing glycan strings

* Improve logging when tokenization fails

* Glycans should always be tokenized per atom

* Add test
  • Loading branch information
wukevin authored Dec 6, 2024
1 parent cf7bf51 commit 3278943
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 3 deletions.
8 changes: 6 additions & 2 deletions chai_lab/data/dataset/inference_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,12 @@ def load_chains_from_raw(
chain_id=chain_index,
sym_id=sym_id,
)
except Exception:
logger.exception(f"Failed to tokenize input {entity_data=} {sym_id=}")
if tok is None:
logger.exception(f"Failed to tokenize input {entity_data=} {sym_id=}")
except Exception as e:
logger.exception(
f"Failed to tokenize input {entity_data=} {sym_id=}", exc_info=e
)
tok = None
structure_contexts.append(tok)

Expand Down
4 changes: 4 additions & 0 deletions chai_lab/data/dataset/structure/all_atom_residue_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ def tokenize_residue(
if (
residue.name in standard_residue_pdb_codes
and entity_type != EntityType.LIGAND
and entity_type != EntityType.MANUAL_GLYCAN
)
else self._tokenize_per_atom
)
Expand Down Expand Up @@ -388,6 +389,9 @@ def _tokenize_entity(

valid_residues = [x for x in tokenized_residues if x is not None]
if len(valid_residues) == 0:
logger.warning(
f"Got no residues for entity {entity_data.entity_id} with residues {entity_data.residues}"
)
return None

tokens = TokenSpan.concatenate(valid_residues)
Expand Down
4 changes: 3 additions & 1 deletion chai_lab/data/parsing/glycans.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def _glycan_string_to_sugars_and_bonds(
parent_sugar_idx.pop() # Remove
continue
chunk = glycan_string[i : i + 3]
if re.match(r"[A-Z]{3}", chunk):
if re.match(r"[0-9A-Z]{3}", chunk): # Match CCD codes (3 char, alphanumeric)
sugars.append(chunk)
parent_sugar_idx.append(len(sugars) - 1) # latest sugar
elif re.match(r"[1-6]{1}-[1-6]{1}", chunk):
Expand All @@ -81,6 +81,8 @@ def _glycan_string_to_sugars_and_bonds(

def glycan_string_residues(glycan_string: str) -> list[Residue]:
sugars, _bonds = _glycan_string_to_sugars_and_bonds(glycan_string)
if not sugars:
raise ValueError(f"No residues parsed from {glycan_string=}")
return [
Residue(
name=sugar,
Expand Down
9 changes: 9 additions & 0 deletions tests/test_glycans.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
# Copyright (c) 2024 Chai Discovery, Inc.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for details.
import pytest

from chai_lab.data.parsing.glycans import _glycan_string_to_sugars_and_bonds


@pytest.mark.parametrize("ccd_code", ["MAN", "99K", "FUC"])
def test_parsing_ccd_codes(ccd_code: str):
    """Check that a lone CCD code parses to exactly that one sugar.

    The cases cover purely alphabetic codes as well as one that contains
    digits ("99K").
    """
    res, _ = _glycan_string_to_sugars_and_bonds(ccd_code)
    # Compare the parsed name itself rather than only the count, so that a
    # wrong or truncated code is caught as well as a missing one.
    assert list(res) == [ccd_code]


def test_complex_parsing():
glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))"
sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
Expand Down

0 comments on commit 3278943

Please sign in to comment.