-
Notifications
You must be signed in to change notification settings - Fork 189
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #9 from chaidiscovery/alex/chailab
Warn user about potentially wrong EntityType
- Loading branch information
Showing
7 changed files
with
184 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
""" | ||
Simple heuristics that can help with identification of EntityType | ||
""" | ||
|
||
import string | ||
from string import ascii_letters | ||
|
||
from chai_lab.data.parsing.structure.entity_type import EntityType | ||
|
||
|
||
def constituents_of_modified_fasta(x: str) -> list[str] | None: | ||
""" | ||
Accepts RNA/DNA inputs: 'agtc', 'AGT(ASP)TG', etc. Does not accept SMILES strings. | ||
Returns constituents, e.g, [A, G, T, ASP, T, G] or None if string is incorrect. | ||
Everything in returned list is single character, except for blocks specified in brackets. | ||
""" | ||
x = x.strip().upper() | ||
# it is a bit strange that digits are here, but [NH2] was in one protein | ||
allowed_chars = ascii_letters + "()" + string.digits | ||
if not all(letter in allowed_chars for letter in x): | ||
return None | ||
|
||
current_modified: str | None = None | ||
|
||
constituents = [] | ||
for letter in x: | ||
if letter == "(": | ||
if current_modified is not None: | ||
return None # double open bracket | ||
current_modified = "" | ||
elif letter == ")": | ||
if current_modified is None: | ||
return None # closed without opening | ||
if len(current_modified) <= 1: | ||
return None # empty modification: () or single (K) | ||
constituents.append(current_modified) | ||
current_modified = None | ||
else: | ||
if current_modified is not None: | ||
current_modified += letter | ||
else: | ||
if letter not in ascii_letters: | ||
return None # strange single-letter residue | ||
constituents.append(letter) | ||
if current_modified is not None: | ||
return None # did not close bracket | ||
return constituents | ||
|
||
|
||
def identify_potential_entity_types(sequence: str) -> list[EntityType]:
    """
    Provided FASTA sequence or smiles, lists which entities those could be.
    Returns an empty list if sequence is invalid for all entity types.

    The result is a list of candidates, not a single decision: e.g. 'AAAAAA'
    is reported as DNA, RNA, protein and ligand at once.
    """
    sequence = sequence.strip()
    if len(sequence) == 0:
        return []
    # Every accepted alphabet below is pure ASCII. Reject anything else up front:
    # upper-casing can turn non-ASCII characters into ASCII letters
    # (U+00DF -> "SS", U+FB01 -> "FI"), which would produce false matches.
    if not sequence.isascii():
        return []
    possible_entity_types = []

    constituents = constituents_of_modified_fasta(sequence)
    if constituents is not None:
        # this can be RNA/DNA/protein.
        # Only one-letter residues are inspected; bracketed modifications are ignored here.
        one_letter_constituents = {x for x in constituents if len(x) == 1}
        if one_letter_constituents <= set("AGTC"):
            possible_entity_types.append(EntityType.DNA)
        if one_letter_constituents <= set("AGUC"):
            possible_entity_types.append(EntityType.RNA)
        if "U" not in one_letter_constituents:
            possible_entity_types.append(EntityType.PROTEIN)

    # Characters permitted in a SMILES string; letters are listed in both cases,
    # so the raw sequence is checked without any case conversion.
    ascii_symbols = string.ascii_letters + string.digits + ".-+=#$%:/\\[]()<>@"
    if set(sequence) <= set(ascii_symbols):
        possible_entity_types.append(EntityType.LIGAND)
    return possible_entity_types
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Example inputs used by the input-validation tests, grouped by entity type.

# Ligands are given as SMILES strings.
# Entries containing a backslash (cis/trans bond marker) are raw strings, so
# the backslash is never read as the start of an escape sequence.
example_ligands = [
    "C",
    "O",
    "C(C1C(C(C(C(O1)O)O)O)O)O",
    "[O-]S(=O)(=O)[O-]",
    "CC1=C(C(CCC1)(C)C)/C=C/C(=C/C=C/C(=C/C=O)/C)/C",
    r"CCC1=C(c2cc3c(c(c4n3[Mg]56[n+]2c1cc7n5c8c(c9[n+]6c(c4)C(C9CCC(=O)OC/C=C(\C)/CCC[C@H](C)CCC[C@H](C)CCCC(C)C)C)[C@H](C(=O)c8c7C)C(=O)OC)C)C=C)C=O",
    r"C=CC1=C(C)/C2=C/c3c(C)c(CCC(=O)O)c4n3[Fe@TB16]35<-N2=C1/C=c1/c(C)c(C=C)/c(n13)=C/C1=N->5/C(=C\4)C(CCC(=O)O)=C1C",
    # different ions
    "[Mg+2]",
    "[Na+]",
    "[Cl-]",
]

# Protein sequences: plain one-letter codes, bracketed modified residues, and X.
example_proteins = [
    "AGSHSMRYFSTSVSRPGRGEPRFIAVGYVDDTQFVR",
    "(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)K(NH2)",
    "XDHPX",
]


# RNA sequences (alphabet A, G, U, C).
example_rna = [
    "AGUGGCUA",
    "AAAAAA",
    "AGUC",
]

# DNA sequences (alphabet A, G, T, C).
example_dna = [
    "AGTGGCTA",
    "AAAAAA",
    "AGTC",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
from chai_lab.data.parsing.input_validation import ( | ||
constituents_of_modified_fasta, | ||
identify_potential_entity_types, | ||
) | ||
from chai_lab.data.parsing.structure.entity_type import EntityType | ||
|
||
from .example_inputs import example_dna, example_ligands, example_proteins, example_rna | ||
|
||
|
||
def test_simple_protein_fasta():
    """A plain one-letter protein sequence splits into one item per residue."""
    parts = constituents_of_modified_fasta("RKDES")
    # Compare whole lists: zip() stops at the shorter input, so an element-wise
    # all(...) over zip would pass even if the parser dropped residues.
    assert parts == ["R", "K", "D", "E", "S"]
|
||
|
||
def test_modified_protein_fasta():
    """Bracketed modified residues come back as single multi-character items."""
    parts = constituents_of_modified_fasta("(KCJ)(SEP)(PPN)(B3S)(BAL)(PPN)KX(NH2)")
    expected = ["KCJ", "SEP", "PPN", "B3S", "BAL", "PPN", "K", "X", "NH2"]
    # Compare whole lists: zip() stops at the shorter input, so an element-wise
    # all(...) over zip would pass even if the parser returned too few items.
    assert parts == expected
|
||
|
||
def test_rna_fasta():
    """An unmodified RNA sequence splits into its individual nucleotides."""
    seq = "ACUGACG"
    parts = constituents_of_modified_fasta(seq)
    # Whole-list equality also checks the length, unlike all(...) over zip().
    assert parts == list(seq)
|
||
|
||
def test_dna_fasta():
    """An unmodified DNA sequence splits into its individual nucleotides."""
    seq = "ACGACTAGCAT"
    parts = constituents_of_modified_fasta(seq)
    # Whole-list equality also checks the length, unlike all(...) over zip().
    assert parts == list(seq)
|
||
|
||
def test_parsing():
    """Each example input must be reported as (at least) its own entity type."""
    # Checked in this order: ligands, proteins, DNA, RNA.
    cases = (
        (example_ligands, EntityType.LIGAND),
        (example_proteins, EntityType.PROTEIN),
        (example_dna, EntityType.DNA),
        (example_rna, EntityType.RNA),
    )
    for examples, expected_type in cases:
        for example in examples:
            assert expected_type in identify_potential_entity_types(example)