Skip to content

Commit

Permalink
Merge branch 'residue-gaps'
Browse files Browse the repository at this point in the history
  • Loading branch information
avivrosenberg committed Feb 24, 2024
2 parents 9e3f895 + c04c809 commit d521d52
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 37 deletions.
2 changes: 1 addition & 1 deletion src/pp5/pgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1331,7 +1331,7 @@ def _join(x: Iterable) -> str:
)

query_res = ResidueRecord(
res_id=_join(q.res_id for q in query_residues),
res_seq_idx=query_residues[0].res_seq_idx,
unp_idx=query_residues[0].unp_idx,
rel_loc=query_residues[0].rel_loc,
name=_join(q.name for q in query_residues),
Expand Down
145 changes: 109 additions & 36 deletions src/pp5/prec.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from Bio.PDB import PPBuilder
from Bio.Seq import Seq
from Bio.Align import PairwiseAligner
from Bio.PDB.Chain import Chain
from Bio.SeqRecord import SeqRecord
from Bio.PDB.Residue import Residue
from Bio.PDB.Polypeptide import Polypeptide
Expand All @@ -35,6 +36,7 @@
from pp5.align import BLOSUM80
from pp5.utils import ProteinInitError, filelock_context
from pp5.codons import (
ACIDS_1TO3,
ACIDS_3TO1,
UNKNOWN_AA,
CODON_TABLE,
Expand Down Expand Up @@ -85,6 +87,9 @@
backbone_only=True, unit_cell=None, isotropic=True, scale_as_bfactor=True
)

# Special insertion code to mark a residue that was missing from the structure.
ICODE_MISSING_RESIDUE = "M"


class ResidueRecord(object):
"""
Expand All @@ -93,7 +98,7 @@ class ResidueRecord(object):

def __init__(
self,
res_id: Union[str, int],
res_seq_idx: int,
unp_idx: int,
rel_loc: float,
name: str,
Expand All @@ -104,12 +109,14 @@ def __init__(
num_altlocs: int,
backbone_coords: Optional[Dict[str, Optional[np.ndarray]]] = None,
contacts: Optional[ResidueContacts] = None,
res_icode: str = "",
res_hflag: str = "",
):
"""
:param res_id: identifier of this residue in the sequence, usually an
integer + insertion code if present, which indicates some alteration
compared to the wild-type.
:param res_seq_idx: index of this residue in the sequence.
:param res_icode: insertion code, if present, which indicates some alteration
or a missing residue in the structure if equal to ICODE_MISSING_RESIDUE.
:param res_hflag: hetero flag, if present, which indicates a non-standard AA.
:param unp_idx: index of this residue in the corresponding UNP record.
:param rel_loc: relative location of this residue in the protein sequence,
a number between 0 and 1.
Expand Down Expand Up @@ -137,7 +144,12 @@ def __init__(
codon_score = max_count / total_count if total_count else 0
codon_opts = codon_counts.keys()

self.res_id, self.name = str(res_id), name
self.res_seq_idx, self.res_icode, self.res_hflag = (
res_seq_idx,
res_icode,
res_hflag,
)
self.name = name
self.unp_idx, self.rel_loc = unp_idx, rel_loc
self.codon, self.codon_score = best_codon, codon_score
self.codon_opts = str.join(CODON_OPTS_SEP, codon_opts)
Expand All @@ -147,6 +159,14 @@ def __init__(
self.backbone_coords = backbone_coords or {}
self.contacts = contacts

@property
def res_id(self) -> str:
"""
:return: The residue id, including insertion code and hetero flag.
Similar to biopython's id.
"""
return f"{self.res_hflag}{self.res_seq_idx}{self.res_icode}"

def as_dict(self, dihedral_args: Dict[str, Any] = None):
"""
Creates a dict representation of the data in this residue. The angles object
Expand All @@ -156,9 +176,11 @@ def as_dict(self, dihedral_args: Dict[str, Any] = None):
:return: A dict representation of this residue.
"""
return dict(
res_id=self.res_id,
name=self.name,
pdb_idx=self.res_seq_idx,
unp_idx=self.unp_idx,
res_name=self.name,
res_icode=self.res_icode,
res_hflag=self.res_hflag,
rel_loc=self.rel_loc,
**(
dict(
Expand Down Expand Up @@ -298,7 +320,9 @@ def __getitem__(self, altloc_id: str) -> str:
class AltlocResidueRecord(ResidueRecord):
def __init__(
self,
res_id: Union[str, int],
res_seq_idx: int,
res_icode: str,
res_hflag: str,
unp_idx: int,
rel_loc: float,
name: str,
Expand Down Expand Up @@ -345,7 +369,9 @@ def __init__(
self.altloc_contacts = altloc_contacts or {}

super().__init__(
res_id=res_id,
res_seq_idx=res_seq_idx,
res_icode=res_icode,
res_hflag=res_hflag,
unp_idx=unp_idx,
rel_loc=rel_loc,
name=name,
Expand Down Expand Up @@ -375,8 +401,16 @@ def from_residue(
with_contacts: bool = False,
contacts_assigner: Optional[ContactsAssigner] = None,
):
res_id: str = res_to_id(r_curr)
res_name: str = ACIDS_3TO1.get(r_curr.get_resname(), UNKNOWN_AA)
# Parse residue id
res_seq_idx: int
res_hflag: str
res_icode: str
res_hflag, res_seq_idx, res_icode = r_curr.get_id()
res_seq_idx = int(res_seq_idx)
res_hflag = res_hflag.strip()
res_icode = res_icode.strip()
res_name: str = r_curr.get_resname()
res_name = ACIDS_3TO1.get(res_name, res_name) # keep name for non-standard AAs

# mapping atom_name -> [altloc_id1, altloc_id2, ...]
altloc_ids: Dict[str, Sequence[str]] = {
Expand Down Expand Up @@ -420,7 +454,9 @@ def from_residue(
)

return cls(
res_id=res_id,
res_seq_idx=res_seq_idx,
res_icode=res_icode,
res_hflag=res_hflag,
unp_idx=unp_idx,
rel_loc=rel_loc,
name=res_name,
Expand Down Expand Up @@ -827,37 +863,74 @@ def __init__(
dihedral_est_name, dihedral_est_args
)

# Extract the residues from the PDB structure
pdb_aa_seq, residues = "", []
for i, pp in enumerate(self.polypeptides):
first_res: Residue = pp[0]
_, curr_start_idx, _ = first_res.get_id()

# More than one pp means there are gaps due to non-standard AAs
if i > 0:
# Calculate index gap between this polypeptide and previous
prev_pp_last_res: Residue = self.polypeptides[i - 1][-1]
_, prev_end_idx, _ = prev_pp_last_res.get_id()
gap_len = curr_start_idx - prev_end_idx - 1

# fill in the gaps
pdb_aa_seq += UNKNOWN_AA * gap_len
residues.extend(
[
Residue(("", j, ""), UNKNOWN_AA, i)
for j in range(prev_end_idx + 1, curr_start_idx)
]
)
# Extract the residues from the PDB structure
curr_res: Residue
chain: Chain = self.pdb_rec[0][self.pdb_auth_chain_id]
for curr_res in chain.get_residues():
# Skip water molecules
if curr_res.resname == "HOH":
continue

pdb_aa_seq += str(pp.get_sequence())
residues.extend(pp)
# For non-standard AAs, we place an 'X' in the sequence, but we still
# keep these residues.
res_name_single = ACIDS_3TO1.get(curr_res.get_resname(), UNKNOWN_AA)
pdb_aa_seq += res_name_single
residues.append(curr_res)

assert len(pdb_aa_seq) == len(residues)
n_residues = len(pdb_aa_seq)

# Add un-modelled residues by aligning to UNP:
# In case of non-modelled residues in the structure, they will be missing
# from the pdb_aa_seq and residues list, since they don't appear in
# biopython's structure. We will add them back using alignment to Uniprot.
# Find the alignment between the PDB AA sequence and the Uniprot AA sequence.
pdb_to_unp_idx = self._find_unp_alignment(pdb_aa_seq, self.unp_rec.sequence)

for curr_pdb_idx in pdb_to_unp_idx.keys():
next_pdb_idx = curr_pdb_idx + 1
if next_pdb_idx not in pdb_to_unp_idx:
# Either this is the last one, or the next one is not in the
# alignment (can happen if it's a non-standard AA)
continue

curr_unp_idx = pdb_to_unp_idx[curr_pdb_idx]
next_unp_idx = pdb_to_unp_idx[next_pdb_idx]

# Detect a gap in the alignment to UNP
gap_len = next_unp_idx - curr_unp_idx - 1
if gap_len == 0:
continue

# Here, we have two adjacent residues in the pdb structure are aligned to
# non-adjacent residues in the unp sequence. It means that there are
# missing residues in the structure, i.e. the pdb sequence we created is
# not complete.
curr_residue = residues[curr_pdb_idx]
curr_residue_seq_idx = curr_residue.get_id()[1]
for j, gap_unp_idx in enumerate(range(curr_unp_idx + 1, next_unp_idx)):
missing_res_name_single = self.unp_rec.sequence[gap_unp_idx]
missing_res_name = ACIDS_1TO3[missing_res_name_single]

gap_pdb_idx = next_pdb_idx + j
gap_pdb_seq_idx = curr_residue_seq_idx + j + 1

residues.insert( # inserts BEFORE the given index
gap_pdb_idx,
Residue(
(" ", gap_pdb_seq_idx, ICODE_MISSING_RESIDUE),
missing_res_name,
0,
),
)

# Update PDB sequence to include any missing residues we added
pdb_aa_seq = str.join(
"", [ACIDS_3TO1.get(r.resname, UNKNOWN_AA) for r in residues]
)
assert len(pdb_aa_seq) == len(residues)
n_residues = len(pdb_aa_seq)

# Find the best matching DNA for our AA sequence via pairwise alignment
# between the PDB AA sequence and translated DNA sequences.
self.ena_id = None
Expand Down

0 comments on commit d521d52

Please sign in to comment.