From 99036c1672a55f7287bd1c3d93c69b1d1cbb784b Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 16:21:56 +0200 Subject: [PATCH 01/37] PDBMetadata: Major refactor to work with PDB REST API responses --- src/pp5/external_dbs/pdb.py | 353 ++++++++++++++++++++++++------------ 1 file changed, 241 insertions(+), 112 deletions(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index b3c7f9c..07bbf44 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -7,8 +7,20 @@ from math import cos, sin from math import degrees as deg from math import radians as rad -from typing import Any, Dict, List, Type, Tuple, Union, Optional, Sequence +from typing import ( + Any, + Set, + Dict, + List, + Tuple, + Union, + TypeVar, + Callable, + Optional, + Sequence, +) from pathlib import Path +from datetime import datetime from collections import defaultdict import numpy as np @@ -551,132 +563,249 @@ def from_cache( return super(PDB2UNP, cls).from_cache(cache_dir, filename) -class PDBMetadata(object): +_TC = TypeVar("_TC") + + +class PDBMetadata(object): # TODO: JSONCacheableMixin """ - Extracts metadata from a PDB structure. - Helpful metadata fields: - https://www.rcsb.org/pdb/results/reportField.do + Obtains and parses metadata from a PDB structure using PDB REST API. """ - def __init__(self, pdb_id: str, pdb_source: str = PDB_RCSB, struct_d=None): + def __init__(self, pdb_id: str): """ - :param pdb_id: The PDB ID of the structure. - :param struct_d: Optional dict which will be used if given, instead of - parsing the PDB file. - :param pdb_source: Source from which to obtain the pdb file. + :param pdb_id: The PDB ID of the structure. No chain. """ - pdb_base_id, chain_id = split_id(pdb_id) - struct_d = pdb_dict(pdb_id, pdb_source=pdb_source, struct_d=struct_d) - - # For alphafold structures, default to zero resolution instead of NaN. 
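Note on the shape of the new __init__ above: metadata is now assembled from three levels of PDB REST API queries, entry, then per-entity, then per-chain. A minimal standalone sketch of that cascade, assuming network access and the pdb_api.execute_raw_data_query signature used in this patch ("2WUR" is just an example id; entity ids become strings as of PATCH 08):

    from pp5.external_dbs import pdb_api

    pdb_id = "2WUR"  # arbitrary example structure
    # Entry-level metadata for the whole structure
    meta_struct = pdb_api.execute_raw_data_query(pdb_id)
    entity_ids = meta_struct["rcsb_entry_container_identifiers"]["polymer_entity_ids"]

    meta_entities, meta_chains = {}, {}
    for entity_id in map(str, entity_ids):
        # Entity-level metadata
        meta_entities[entity_id] = pdb_api.execute_raw_data_query(
            pdb_id, entity_id=entity_id
        )
        containers = meta_entities[entity_id][
            "rcsb_polymer_entity_container_identifiers"
        ]
        for chain_id in containers["asym_ids"]:
            # Chain-level metadata
            meta_chains[chain_id] = pdb_api.execute_raw_data_query(
                pdb_id, chain_id=chain_id
            )

One query per entity and per chain keeps each response small, at the cost of several round trips per structure.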
- default_res = 0.0 if pdb_source == PDB_AFLD else None - - def _meta(key: str, convert_to: Type = str, default=None): - val = struct_d.get(key, None) - if not val: - return default - if isinstance(val, list): - val = val[0] - if not val or val == "?": - return default + + self._pdb_id, _ = split_id(pdb_id) + + # Obtain structure-level metadata from the PDB API + self._meta_struct: dict = pdb_api.execute_raw_data_query(self.pdb_id) + self._meta_entities: Dict[int, dict] = {} + self._meta_chains: Dict[str, dict] = {} + entity_ids = self._meta_struct["rcsb_entry_container_identifiers"][ + "polymer_entity_ids" + ] + for entity_id in entity_ids: + entity_id = int(entity_id) + # Obtain entity-level metadata from the PDB API + self._meta_entities[entity_id] = pdb_api.execute_raw_data_query( + self.pdb_id, entity_id=entity_id + ) + + chain_ids = self._meta_entities[entity_id][ + "rcsb_polymer_entity_container_identifiers" + ]["asym_ids"] + for chain_id in chain_ids: + # Obtain chain-level metadata from the PDB API + self._meta_chains[chain_id] = pdb_api.execute_raw_data_query( + self.pdb_id, chain_id=chain_id + ) + + @staticmethod + def _resolve( + meta: dict, key: str, coerce_type: Callable[[Any], _TC] + ) -> Optional[_TC]: + for subkey in key.split("."): + if isinstance(meta, (list, tuple)): + subkey = int(subkey) + elif not isinstance(meta, dict): + raise ValueError(f"Can't resolve {key} in {meta}") + elif subkey not in meta: + return None + + meta = meta[subkey] + + if meta is not None: try: - return convert_to(val) + meta = coerce_type(meta) except ValueError: - return default - - title = _meta("_struct.title") - description = _meta("_entity.pdbx_description") - deposition_date = _meta("_pdbx_database_status.recvd_initial_deposition_date") - - src_org = _meta("_entity_src_nat.pdbx_organism_scientific") - if not src_org: - src_org = _meta("_entity_src_gen.pdbx_gene_src_scientific_name") - - src_org_id = _meta("_entity_src_nat.pdbx_ncbi_taxonomy_id", int) - if not src_org_id: - src_org_id = _meta("_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id", int) - - host_org = _meta("_entity_src_gen.pdbx_host_org_scientific_name") - host_org_id = _meta("_entity_src_gen.pdbx_host_org_ncbi_taxonomy_id", int) - resolution = _meta("_refine.ls_d_res_high", float, default=default_res) - resolution_low = _meta("_refine.ls_d_res_low", float, default=default_res) - r_free = _meta("_refine.ls_R_factor_R_free", float) - r_work = _meta("_refine.ls_R_factor_R_work", float) - space_group = _meta("_symmetry.space_group_name_H-M") - - # Find ligands - ligands = set() - for i, chemical_type in enumerate(struct_d["_chem_comp.id"]): - if chemical_type.lower() == "hoh": - continue - if chemical_type not in STANDARD_ACID_NAMES: - ligands.add(chemical_type) - ligands = str.join(",", ligands) - - # Crystal growth details - cg_ph = _meta("_exptl_crystal_grow.pH", float) - cg_temp = _meta("_exptl_crystal_grow.temp", float) - - # Map each chain to entity id, and entity to 1-letter sequence. 
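The removed block above shows how these fields were previously scraped from the parsed mmCIF dict; in the new class every field instead goes through the _resolve helper, which walks a nested API response along a dotted key path ("a.b.0.c"), indexing lists by integer position and coercing the leaf value. A simplified standalone sketch of the same idea (not the patched implementation):

    from typing import Any, Callable, Optional

    def resolve(meta: Any, key: str, coerce: Callable[[Any], Any]) -> Optional[Any]:
        for subkey in key.split("."):
            if isinstance(meta, (list, tuple)):
                meta = meta[int(subkey)]  # numeric subkeys index into lists
            elif isinstance(meta, dict):
                if subkey not in meta:
                    return None  # missing fields resolve to None
                meta = meta[subkey]
            else:
                raise ValueError(f"Can't resolve {key}")
        return coerce(meta) if meta is not None else None

    meta = {"refine": [{"ls_rfactor_rfree": "0.21"}]}
    assert resolve(meta, "refine.0.ls_rfactor_rfree", float) == 0.21
    assert resolve(meta, "refine.0.missing_key", float) is None

The properties below lean on the None fallback by chaining several such lookups with `or`, so that current RCSB keys are tried first and legacy mmCIF-style keys act as fallbacks.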
- chain_entities, entity_seq = {}, {} - for i, entity_id in enumerate(struct_d["_entity_poly.entity_id"]): - if not struct_d["_entity_poly.type"][i].startswith("polypeptide"): - continue + LOGGER.warning(f"Failed to coerce {meta}@{key} to {coerce_type}") - entity_id = int(entity_id) - chains_str = struct_d["_entity_poly.pdbx_strand_id"][i] - for chain in chains_str.split(","): - chain_entities[chain] = entity_id - - seq_str: str = struct_d["_entity_poly.pdbx_seq_one_letter_code_can"][i] - seq_str = seq_str.replace("\n", "") - entity_seq[entity_id] = seq_str - - self.pdb_id: str = pdb_base_id - self.pdb_source: str = pdb_source - self.title: str = title - self.description: str = description - self.deposition_date: str = deposition_date - self.src_org: str = src_org - self.src_org_id: int = src_org_id - self.host_org: str = host_org - self.host_org_id: int = host_org_id - self.resolution: float = resolution - self.resolution_low: float = resolution_low - self.r_free: float = r_free - self.r_work: float = r_work - self.space_group: str = space_group - self.ligands: str = ligands - self.cg_ph: float = cg_ph # crystal growth pH - self.cg_temp: float = cg_temp # crystal growth temperature - # mapping from chain_id to entity_id - self.chain_entities: Dict[str, int] = chain_entities - # mapping from entity_id to sequence - self.entity_sequence: Dict[int, str] = entity_seq - - def get_chain(self, entity_id: int) -> Optional[str]: - """ - :param entity_id: An ID of one of the entities in this structure. - :return: One of the chains from teh structure belonging to this entity id, - or None if this is not a valid entity if for the given structure. - """ - chains = [c for c, e in self.chain_entities.items() if e == entity_id] - if not chains: - return None - return sorted(chains)[0] + return meta - def as_dict(self) -> Dict[str, Any]: - return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + @property + def pdb_id(self) -> str: + return self._pdb_id + + @property + def title(self) -> Optional[str]: + return self._resolve(self._meta_struct, "struct.title", str) + + @property + def description(self) -> Optional[str]: + # api_meta_entity["rcsb_polymer_entity"]["pdbx_description"] + return self._resolve(self._meta_struct, "struct.pdbx_descriptor", str) + + @property + def entity_description(self) -> Dict[int, Optional[str]]: + return { + entity_id: self._resolve( + meta_entity, "rcsb_polymer_entity.pdbx_description", str + ) + for entity_id, meta_entity in self._meta_entities.items() + } + + @property + def deposition_date(self) -> Optional[datetime]: + return self._resolve( + self._meta_struct, + "pdbx_database_status.recvd_initial_deposition_date", + datetime.fromisoformat, + ) + + @property + def entity_source_org(self) -> Dict[int, Optional[str]]: + return { + entity_id: self._resolve( + meta_entity, "rcsb_entity_source_organism.0.ncbi_scientific_name", str + ) + or self._resolve( + meta_entity, "entity_src_nat.0.pdbx_organism_scientific", str + ) + or self._resolve( + meta_entity, "entity_src_gen.0.pdbx_gene_src_scientific_name", str + ) + for entity_id, meta_entity in self._meta_entities.items() + } + + @property + def entity_source_org_id(self) -> Dict[int, Optional[int]]: + return { + entity_id: self._resolve( + meta_entity, "rcsb_entity_source_organism.0.ncbi_taxonomy_id", int + ) + or self._resolve(meta_entity, "entity_src_nat.0.pdbx_ncbi_taxonomy_id", int) + or self._resolve( + meta_entity, "entity_src_gen.0.pdbx_gene_src_ncbi_taxonomy_id", int + ) + for entity_id, meta_entity 
in self._meta_entities.items() + } + + @property + def entity_host_org(self) -> Dict[int, Optional[str]]: + return { + entity_id: self._resolve( + meta_entity, "rcsb_entity_host_organism.0.ncbi_scientific_name", str + ) + or self._resolve( + meta_entity, "entity_src_gen.0.pdbx_host_org_scientific_name", str + ) + for entity_id, meta_entity in self._meta_entities.items() + } + + @property + def entity_host_org_id(self) -> Dict[int, Optional[int]]: + return { + entity_id: self._resolve( + meta_entity, "rcsb_entity_host_organism.0.ncbi_taxonomy_id", int + ) + or self._resolve( + meta_entity, "entity_src_gen.0.pdbx_host_org_ncbi_taxonomy_id", int + ) + for entity_id, meta_entity in self._meta_entities.items() + } + + @property + def resolution(self) -> Optional[float]: + return self._resolve(self._meta_struct, "reflns.0.d_resolution_high", float) + + @property + def resolution_low(self) -> Optional[float]: + return self._resolve(self._meta_struct, "reflns.0.d_resolution_low", float) + + @property + def r_free(self) -> Optional[float]: + return self._resolve(self._meta_struct, "refine.0.ls_rfactor_rfree", float) + + @property + def r_work(self) -> Optional[float]: + return self._resolve(self._meta_struct, "refine.0.ls_rfactor_rwork", float) + + @property + def space_group(self) -> Optional[str]: + return self._resolve( + self._meta_struct, "symmetry.space_group_name_hm", str + ) or self._resolve(self._meta_struct, "symmetry.space_group_name_H_M", str) + + @property + def cg_ph(self) -> Optional[float]: + return self._resolve(self._meta_struct, "exptl_crystal_grow.0.pH", float) + + @property + def cg_temp(self) -> Optional[float]: + return self._resolve(self._meta_struct, "exptl_crystal_grow.0.temp", float) + + @property + def chain_ligands(self) -> Dict[str, Set[str]]: + return { + chain_id: set( + [ + ld.get("ligand_comp_id") + for ld in meta_chain.get("rcsb_ligand_neighbors", []) + ] + ) + for chain_id, meta_chain in self._meta_chains.items() + } + + @property + def ligands(self) -> str: + return str.join(",", sorted(set.union(*self.chain_ligands.values()))) @property def entity_chains(self) -> Dict[int, Sequence[str]]: """ :return: Mapping from entity id to a list of chains belonging to that entity. """ - entity_chains = defaultdict(list) - for chain, entity in self.chain_entities.items(): - entity_chains[entity].append(chain) - return dict(entity_chains) + return self._entity_chains(author=False) + + @property + def entity_auth_chains(self) -> Dict[int, Sequence[str]]: + """ + :return: Mapping from entity id to a list of chains belonging to that entity, + using the original author's chain ids. + """ + return self._entity_chains(author=True) + + def _entity_chains(self, author: bool = False) -> Dict[int, Sequence[str]]: + """ + :param author: Whether to use author or canonical chain ids. + :return: Mapping from entity id to a list of chains belonging to that entity. + """ + asym_ids_key = "auth_asym_ids" if author else "asym_ids" + key = f"rcsb_polymer_entity_container_identifiers.{asym_ids_key}" + return { + entity_id: self._resolve(meta_entity, key, tuple) + for entity_id, meta_entity in self._meta_entities.items() + } + + @property + def chain_entities(self) -> Dict[str, int]: + """ + :return: Mapping from chain id to its entity id. 
+ """ + chain_to_entity = {} + for entity_id, chain_ids in self.entity_chains.items(): + chain_to_entity = { + **chain_to_entity, + **{chain_id: entity_id for chain_id in chain_ids}, + } + return chain_to_entity + + @property + def entity_sequence(self) -> Dict[int, str]: + return { + entity_id: self._resolve( + meta_entity, "entity_poly.pdbx_seq_one_letter_code_can", str + ) + for entity_id, meta_entity in self._meta_entities.items() + } + + def as_dict(self) -> Dict[str, Any]: + return { + k: getattr(self, k) + for k, v in self.__class__.__dict__.items() + if isinstance(v, property) + } def __repr__(self): return str(self.as_dict()) From 225b7d6e22e03b902b42d10c9db458fe2e4fe672 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 16:22:20 +0200 Subject: [PATCH 02/37] PDB2UNP: Include author chains --- src/pp5/external_dbs/pdb.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 07bbf44..b20efbe 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -466,7 +466,13 @@ def query_entity_uniprot_id_alignments( # Get list of chains and list of Uniprot IDs for this entity entity_containers = entity_data["rcsb_polymer_entity_container_identifiers"] - entity_chains = entity_containers.get("asym_ids", []) + entity_chains = [ + # The same chain can be referred to by different labels, + # the canonical PDB label and another label given by the + # structure author. + *entity_containers.get("asym_ids", []), + *entity_containers.get("auth_asym_ids", []), + ] entity_unp_ids = entity_containers.get("uniprot_ids", []) unp_alignments: Dict[str, List[Tuple[int, int]]] = { From 477bee898a651d7c295503c830c078f1e07cfd6e Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 16:50:41 +0200 Subject: [PATCH 03/37] PDBMetadata: Improve resolution parsing --- src/pp5/external_dbs/pdb.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index b20efbe..614969f 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -713,7 +713,9 @@ def entity_host_org_id(self) -> Dict[int, Optional[int]]: @property def resolution(self) -> Optional[float]: - return self._resolve(self._meta_struct, "reflns.0.d_resolution_high", float) + return self._resolve( + self._meta_struct, "rcsb_entry_info.diffrn_resolution_high.value", float + ) or self._resolve(self._meta_struct, "reflns.0.d_resolution_high", float) @property def resolution_low(self) -> Optional[float]: From 379378bcb9909f56cf9187058136e728e2e7ced9 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 16:51:01 +0200 Subject: [PATCH 04/37] PDBMetadata: Map chains to auth chains --- src/pp5/external_dbs/pdb.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 614969f..37c52d5 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -21,6 +21,7 @@ ) from pathlib import Path from datetime import datetime +from itertools import zip_longest from collections import defaultdict import numpy as np @@ -799,6 +800,28 @@ def chain_entities(self) -> Dict[str, int]: } return chain_to_entity + @property + def chain_to_auth_chain(self) -> Dict[str, str]: + """ + :return: Mapping from PDB chain id to its author's chain id. If there are no + different names for the author chains, the PDB chain names are mapped to + themselves. 
+ """ + entity_auth_chains = self.entity_auth_chains + chain_to_auth_chain = {} + for entity_id, chain_ids in self.entity_chains.items(): + auth_chain_ids = entity_auth_chains[entity_id] + chain_to_auth_chain = { + **chain_to_auth_chain, + **{ + chain_id: auth_chain_id or chain_id + for chain_id, auth_chain_id in zip_longest( + chain_ids, auth_chain_ids + ) + }, + } + return chain_to_auth_chain + @property def entity_sequence(self) -> Dict[int, str]: return { From 55a47364b524a2678af72de9d30ac5c22aea3830 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 17:45:08 +0200 Subject: [PATCH 05/37] prec: Update to work with new metadata --- src/pp5/prec.py | 128 ++++++++++++------------------------------------ 1 file changed, 32 insertions(+), 96 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 95fd576..fda9852 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -570,8 +570,8 @@ def from_pdb( pdb_dict = pdb.pdb_dict( pdb_id, pdb_source=pdb_source, struct_d=pdb_dict ) - meta = pdb.PDBMetadata(pdb_id, pdb_source=pdb_source, struct_d=pdb_dict) - chain_id = meta.get_chain(entity_id) + meta = pdb.PDBMetadata(pdb_id) + chain_id = meta.entity_chains[entity_id][0] if not chain_id: # In rare cases the chain is a number instead of a letter, @@ -667,7 +667,7 @@ def from_unp( def __init__( self, - unp_id: str, + unp_id: str, # TODO: Get this from metadata pdb_id: str, pdb_source: str = PDB_RCSB, pdb_dict: dict = None, @@ -684,17 +684,13 @@ def __init__( contact_radius: float = CONTACT_DEFAULT_RADIUS, ): """ + Don't call this directly. Use class methods from_pdb or from_unp instead. + Initialize a protein record from both Uniprot and PDB ids. - To initialize a protein from Uniprot id or PDB id only, use the - class methods provided for this purpose. :param unp_id: Uniprot id which uniquely identifies the protein. - :param pdb_id: PDB id with or without chain (e.g. '1ABC' or '1ABC:D') - of the specific structure desired. Note that this structure must match - the unp_id, i.e. it must exist in the cross-refs of the given unp_id. - Otherwise an error will be raised (unless strict_unp_xref=False). If no - chain is specified, a chain matching the unp_id will be used, - if it exists. + :param pdb_id: PDB id with chain (e.g. '1ABC:D') of the specific structure chain + desired. :param pdb_source: Source from which to obtain the pdb file. :param dihedral_est_name: Method of dihedral angle estimation. Options are: @@ -747,27 +743,37 @@ class methods provided for this purpose. self.contact_radius = contact_radius self.contact_method = contact_method - # First we must find a matching PDB structure and chain for the - # Uniprot id. If a pdb_id is given we'll try to use that, depending - # on whether there's a Uniprot xref for it and on strict_unp_xref. 
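For reference, the pairing rule in PATCH 04's chain_to_auth_chain above can be shown in isolation: per entity, canonical chain ids are zipped against author chain ids, and a missing author id falls back to the canonical one. The example data here is hypothetical:

    from itertools import zip_longest

    entity_chains = {"1": ("A", "B")}   # canonical (asym) chain ids per entity
    entity_auth_chains = {"1": ("X",)}  # author chain ids; one is missing

    chain_to_auth = {}
    for entity_id, chain_ids in entity_chains.items():
        auth_ids = entity_auth_chains[entity_id]
        for chain_id, auth_id in zip_longest(chain_ids, auth_ids):
            chain_to_auth[chain_id] = auth_id or chain_id  # fall back to canonical

    assert chain_to_auth == {"A": "X", "B": "B"}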
- self.pdb_base_id, self.pdb_chain_id = self._find_pdb_xref(pdb_id) + # Parse the given PDB id + self.pdb_base_id, self.pdb_chain_id, ent_id = pdb.split_id_with_entity(pdb_id) + if numeric_chain: + self.pdb_chain_id = str(ent_id) self.pdb_id = f"{self.pdb_base_id}:{self.pdb_chain_id}" + self.pdb_source = pdb_source if pdb_dict: self._pdb_dict = pdb_dict - self.pdb_meta = pdb.PDBMetadata( - self.pdb_id, pdb_source=self.pdb_source, struct_d=self.pdb_dict - ) + self.pdb_meta = pdb.PDBMetadata(self.pdb_id) if not self.pdb_meta.resolution and self.pdb_source != PDB_AFLD: raise ProteinInitError(f"Unknown resolution for {pdb_id}") + self.pdb_entity_id = self.pdb_meta.chain_entities[self.pdb_chain_id] + self.pdb_auth_chain_id = self.pdb_meta.chain_to_auth_chain[self.pdb_chain_id] + + chain_str = ( + self.pdb_chain_id + if self.pdb_auth_chain_id == self.pdb_chain_id + else f"{self.pdb_chain_id}({self.pdb_auth_chain_id})" + ) LOGGER.info( - f"{self}: {self.pdb_meta.description}, " - f"org={self.pdb_meta.src_org} ({self.pdb_meta.src_org_id}), " - f"expr={self.pdb_meta.host_org} ({self.pdb_meta.host_org_id}), " + f"pdb_id={self.pdb_base_id}, chain={chain_str}, unp_id={self.unp_id}, " + f"entity_id={self.pdb_entity_id}, " f"res={self.pdb_meta.resolution:.2f}Å, " - f"entity_id={self.pdb_meta.chain_entities[self.pdb_chain_id]}" + f"desc={self.pdb_meta.entity_description[self.pdb_entity_id]}, " + f"org={self.pdb_meta.entity_source_org[self.pdb_entity_id]} " + f"({self.pdb_meta.entity_source_org_id[self.pdb_entity_id]}), " + f"expr={self.pdb_meta.entity_host_org[self.pdb_entity_id]} " + f"({self.pdb_meta.entity_host_org_id[self.pdb_entity_id]})" ) # Make sure the structure is sane. See e.g. 1FFK. @@ -994,7 +1000,9 @@ def polypeptides(self) -> List[Polypeptide]: https://proteopedia.org/wiki/index.php/HETATM """ if not self._pp: - chain = self.pdb_rec[0][self.pdb_chain_id] + # Use author chain id to get the polypeptides, as the author chain is + # what's associated with the coordinates in the mmCIF file. + chain = self.pdb_rec[0][self.pdb_auth_chain_id] pp_chains = PPBuilder().build_peptides(chain, aa_only=True) # Sort chain by sequence ID of first residue in the chain, @@ -1234,78 +1242,6 @@ def _find_dna_alignment( return best_ena.id, str(best_ena.seq), idx_to_codons - def _find_pdb_xref(self, ref_pdb_id) -> Tuple[str, str]: - ref_pdb_id, ref_chain_id, ent_id = pdb.split_id_with_entity(ref_pdb_id) - if not ref_chain_id: - if ent_id is not None and self.numeric_chain: - # In rare cases the chain is a number and indistinguishable - # from entity. Handle this case only if explicitly - # requested. - ref_chain_id = ent_id - else: - ref_chain_id = "" - - ref_pdb_id, ref_chain_id = ref_pdb_id.upper(), ref_chain_id.upper() - - xrefs = unp.find_pdb_xrefs(self.unp_rec, method="x-ray") - - # We'll sort the PDB entries according to multiple criteria based on - # the resolution, number of chains and sequence length. - def sort_key(xref: unp.UNPPDBXRef): - id_cmp = xref.pdb_id.upper() != ref_pdb_id - chain_cmp = xref.chain_id.upper() != ref_chain_id - seq_len_diff = abs(xref.seq_len - self.unp_rec.sequence_length) - # The sort key for PDB entries - # First, if we have a matching id to the reference PDB id we take - # it. Otherwise, we take the best match according to seq len and - # resolution. 
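With _find_pdb_xref going away (its removal continues below), chain, entity and author-chain resolution all come straight from PDBMetadata. A usage sketch of the lookups the record now performs; it assumes network access to the PDB REST API ("1MWC" is one of the structures used in the tests):

    from pp5.external_dbs import pdb

    meta = pdb.PDBMetadata("1MWC")  # base id only, no chain
    chain_id = "A"
    entity_id = meta.chain_entities[chain_id]           # chain -> entity id
    auth_chain_id = meta.chain_to_auth_chain[chain_id]  # chain -> author chain id
    print(
        entity_id,
        meta.entity_description[entity_id],
        meta.resolution,
        auth_chain_id,
    )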
- return id_cmp, chain_cmp, seq_len_diff, xref.resolution - - xrefs = sorted(xrefs, key=sort_key) - if not xrefs: - msg = f"No PDB cross-refs for {self.unp_id}" - if self.strict_unp_xref: - raise ProteinInitError(msg) - elif not ref_chain_id: - raise ProteinInitError(f"{msg} and no chain provided in ref") - else: - LOGGER.warning(f"{msg}, using ref {ref_pdb_id}:{ref_chain_id}") - return ref_pdb_id, ref_chain_id - - # Get best match according to sort key and return its id. - xref = xrefs[0] - LOGGER.info(f"{self.unp_id}: PDB XREF = {xref}") - - pdb_id = xref.pdb_id.upper() - chain_id = xref.chain_id.upper() - - # Make sure we have a match with the Uniprot id. Id chain wasn't - # specified, match only PDB ID, otherwise, both must match. - if pdb_id != ref_pdb_id: - msg = ( - f"Reference PDB ID {ref_pdb_id} not found as " - f"cross-reference for protein {self.unp_id}" - ) - if self.strict_unp_xref: - raise ProteinInitError(msg) - else: - LOGGER.warning(msg) - pdb_id = ref_pdb_id - - if ref_chain_id and chain_id != ref_chain_id: - msg = ( - f"Reference chain {ref_chain_id} of PDB ID {ref_pdb_id} not" - f"found as cross-reference for protein {self.unp_id}. " - f"Did you mean chain {chain_id}?" - ) - if self.strict_unp_xref: - raise ProteinInitError(msg) - else: - LOGGER.warning(msg) - chain_id = ref_chain_id - - return pdb_id.upper(), chain_id.upper() - def _get_dihedral_estimators(self, est_name: str, est_args: dict): est_name = est_name.lower() if est_name else est_name est_args = {} if est_args is None else est_args @@ -1349,7 +1285,7 @@ def items(self) -> ItemsView[str, ResidueRecord]: return self._residue_recs.items() def __repr__(self): - return f"({self.unp_id}, {self.pdb_id})" + return f"{self.pdb_id}" def __getstate__(self): # Prevent serializing Bio objects From a6f694cff62054189dc66fca22c0688548219cd0 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 17:46:02 +0200 Subject: [PATCH 06/37] pgroup/collect/align: Update to work with new metadata --- src/pp5/align.py | 2 +- src/pp5/collect.py | 2 +- src/pp5/pgroup.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/pp5/align.py b/src/pp5/align.py index 41d052c..aad2910 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -530,7 +530,7 @@ def pdb(self, query_pdb_id: str, pdb_dict=None) -> pd.DataFrame: ) # Note: no need for pdb_source, we just care about what chains exist - meta = pdb.PDBMetadata(pdb_id, struct_d=pdb_dict) + meta = pdb.PDBMetadata(pdb_id) if chain_id not in meta.chain_entities: raise ValueError(f"Can't find chain {chain_id} in {pdb_id}") diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 279aa10..1468a51 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -1043,7 +1043,7 @@ def _collect_single_structure( pdb_dict = pdb.pdb_dict(pdb_id, pdb_source=pdb_source) pdb2unp = pdb.PDB2UNP.from_pdb(pdb_id, cache=True) - meta = pdb.PDBMetadata(pdb_id, pdb_source=pdb_source, struct_d=pdb_dict) + meta = pdb.PDBMetadata(pdb_id) # Determine all chains we need to collect from the PDB structure chains_to_collect: Sequence[str] diff --git a/src/pp5/pgroup.py b/src/pp5/pgroup.py index cf279f1..3f4602d 100644 --- a/src/pp5/pgroup.py +++ b/src/pp5/pgroup.py @@ -259,9 +259,7 @@ def __init__( ) ref_pdb_dict = pdb.pdb_dict(self.ref_pdb_id, pdb_source=pdb_source) - ref_pdb_meta = pdb.PDBMetadata( - self.ref_pdb_base_id, pdb_source=pdb_source, struct_d=ref_pdb_dict - ) + ref_pdb_meta = pdb.PDBMetadata(self.ref_pdb_base_id) if self.ref_pdb_chain not in ref_pdb_meta.chain_entities: raise 
ProteinInitError(f"Unknown PDB entity for {self.ref_pdb_id}") From 8097401923920c5cac6a1dca91b79b348b998213 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 05:14:19 +0200 Subject: [PATCH 07/37] PDBMetadata: Add uniprot id calculation --- src/pp5/external_dbs/pdb.py | 178 ++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 37c52d5..953f34d 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -831,6 +831,123 @@ def entity_sequence(self) -> Dict[int, str]: for entity_id, meta_entity in self._meta_entities.items() } + @property + def uniprot_ids(self) -> Sequence[str]: + """ + :return: All Uniprot IDs associated with the PDB structure. + """ + all_unp_ids = set() + for chain_id, unp_ids in self.chain_uniprot_ids.items(): + all_unp_ids.update(unp_ids) + return tuple(sorted(all_unp_ids)) + + @property + def chain_uniprot_ids(self) -> Dict[str, Sequence[str]]: + """ + Retrieves all Uniprot IDs associated with a PDB structure chains. + + :return: a map: chain -> [unp1, unp2, ...] + where unp1, unp2, ... are Uniprot IDs associated with the chain. + """ + + # entity -> chain -> unp -> [ (s1,e1), ... ] + entity_map = self.entity_uniprot_id_alignments + + all_chain_map = {} + for entity_id, chain_map in entity_map.items(): + for chain_id, unp_map in chain_map.items(): + # chain -> [unp1, unp2, ...] + all_chain_map[chain_id] = tuple(unp_map.keys()) + + return all_chain_map + + @property + def chain_uniprot_id_alignments( + self, + ) -> Dict[str, Dict[str, List[Tuple[int, int]]]]: + """ + Retrieves all Uniprot IDs associated with a PDB structure chains. + + :return: a map: chain -> unp -> [ (s1,e1), ... ] + where (s1,e1) are alignment start,end indices between the UNP and PDB sequences. + """ + # entity -> chain -> unp -> [ (s1,e1), ... ] + entity_map = self.entity_uniprot_id_alignments + + all_chain_map = {} + for entity_id, chain_map in entity_map.items(): + for chain_id, unp_map in chain_map.items(): + # chain -> unp -> [ (s1,e1), ... ] + all_chain_map[chain_id] = unp_map + + return all_chain_map + + @property + def entity_uniprot_ids(self) -> Dict[str, Dict[str, Sequence[str]]]: + """ + Retrieves all Uniprot IDs associated with a PDB structure entities. + + :return: a map: entity -> chain ->[unp1, unp2, ...] + where unp1, unp2, ... are Uniprot IDs associated with the entity. + """ + # entity -> chain -> unp -> [ (s1,e1), ... ] + entity_map = self.entity_uniprot_id_alignments + + new_entity_map = defaultdict(dict) + for entity_id, chain_map in entity_map.items(): + for chain_id, unp_map in chain_map.items(): + # entity -> chain -> [unp1, unp2, ...] + new_entity_map[entity_id][chain_id] = tuple(unp_map.keys()) + + return dict(new_entity_map) + + @property + def entity_uniprot_id_alignments( + self, + ) -> Dict[int, Dict[str, Dict[str, List[Tuple[int, int]]]]]: + """ + Retrieves all Uniprot IDs associated with a PDB structure entities. + + :return: a map: entity -> chain -> unp -> [ (s1,e1), ...] + where (s1,e1) are alignment start,end indices between the UNP and PDB sequences. 
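To make the (s1, e1) pairs in these maps concrete: each PDB API alignment entry provides a start index and a region length, which get converted into inclusive start/end indices (the actual conversion appears in the property below). A small sketch with a hypothetical aligned_regions entry:

    aligned_regions = [{"entity_beg_seq_id": 5, "length": 100}]  # hypothetical

    pairs = []
    for region in aligned_regions:
        start = region["entity_beg_seq_id"]
        end = start + region["length"] - 1  # inclusive end index
        pairs.append((start, end))

    assert pairs == [(5, 104)]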
+ """ + map_to_unp_ids = {} + + for entity_id, entity_meta in self._meta_entities.items(): + # Get list of chains and list of Uniprot IDs for this entity + entity_containers = entity_meta["rcsb_polymer_entity_container_identifiers"] + entity_unp_ids = entity_containers.get("uniprot_ids", []) + + unp_alignments: Dict[str, List[Tuple[int, int]]] = { + unp_id: [] for unp_id in entity_unp_ids + } + for alignment_entry in entity_meta.get("rcsb_polymer_entity_align", []): + if alignment_entry["reference_database_name"].lower() != "uniprot": + continue + + unp_id = alignment_entry["reference_database_accession"] + if unp_id not in unp_alignments: + continue + + for alignment_region in alignment_entry["aligned_regions"]: + align_start = alignment_region["entity_beg_seq_id"] + align_end = align_start + alignment_region["length"] - 1 + unp_alignments[unp_id].append((align_start, align_end)) + + entity_chains = [ + # The same chain can be referred to by different labels, + # the canonical PDB label and another label given by the + # structure author. + *entity_containers.get("asym_ids", []), + *entity_containers.get("auth_asym_ids", []), + ] + + map_to_unp_ids[entity_id] = { + chain_id: unp_alignments for chain_id in entity_chains + } + + return map_to_unp_ids + def as_dict(self) -> Dict[str, Any]: return { k: getattr(self, k) @@ -841,6 +958,67 @@ def as_dict(self) -> Dict[str, Any]: def __repr__(self): return str(self.as_dict()) + @classmethod + def from_pdb(cls, pdb_id: str, cache=False) -> PDBMetadata: + """ + Create a PDBMetadata object from a given PDB ID. + :param pdb_id: The PDB ID to map for. Chain will be ignored if present. + :param cache: Whether to load a cached mapping if available. + :return: A PDBMetadata object. + """ + pdb_base_id, _ = split_id(pdb_id) + + # TODO: Implement caching + # if cache: + # pdb_meta = cls.from_cache(pdb_base_id) + # if pdb_meta is not None: + # return pdb_meta + + pdb_meta = cls(pdb_id) + # pdb_meta.save() + return pdb_meta + + @classmethod + def pdb_id_to_unp_id(cls, pdb_id: str, strict=True, cache=False) -> str: + """ + Given a PDB ID, returns a single Uniprot id for it. + :param pdb_id: PDB ID, with optional chain. + :param cache: Whether to use cached mapping. + :param strict: Whether to raise an error (True) or just warn (False) + if the PDB ID cannot be uniquely mapped to a single Uniprot ID. + This can happen if: (1) Chain wasn't specified and there are + different Uniprot IDs for different chains (e.g. 4HHB); (2) Chain was + specified but there are multiple Uniprot IDs for the chain + (chimeric entry, e.g. 3SG4:A). + :return: A Uniprot ID. + """ + pdb_base_id, chain_id = split_id(pdb_id) + meta = cls.from_pdb(pdb_id, cache=cache) + + if not meta.uniprot_ids: + raise ValueError(f"No Uniprot entries exist for {pdb_base_id}") + + if not chain_id: + if len(meta.uniprot_ids) > 1: + msg = f"Multiple Uniprot IDs for {pdb_base_id}, no chain specified." 
+ if strict: + raise ValueError(msg) + LOGGER.warning(f"{msg} Returning first ID from the first chain.") + + for chain_id, unp_ids in meta.chain_uniprot_ids.items(): + return unp_ids[0] + + if chain_id not in meta.chain_uniprot_ids: + raise ValueError(f"No Uniprot ID for chain {chain_id} of {pdb_base_id}") + + if len(meta.chain_uniprot_ids[chain_id]) > 1: + msg = f"Multiple Uniprot IDs for {pdb_base_id} chain {chain_id} (chimeric)" + if strict: + raise ValueError(msg) + LOGGER.warning(f"{msg} Returning the first Uniprot ID.") + + return meta.chain_uniprot_ids[chain_id][0] + class PDBUnitCell(object): """ From 36519d91c3d1ee3f8f2eb58c98f3e08b02b26f2b Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 05:21:33 +0200 Subject: [PATCH 08/37] PDBMetadata: Use str for entity id --- src/pp5/external_dbs/pdb.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 953f34d..d55a781 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -573,7 +573,7 @@ def from_cache( _TC = TypeVar("_TC") -class PDBMetadata(object): # TODO: JSONCacheableMixin +class PDBMetadata(JSONCacheableMixin): """ Obtains and parses metadata from a PDB structure using PDB REST API. """ @@ -587,13 +587,13 @@ def __init__(self, pdb_id: str): # Obtain structure-level metadata from the PDB API self._meta_struct: dict = pdb_api.execute_raw_data_query(self.pdb_id) - self._meta_entities: Dict[int, dict] = {} + self._meta_entities: Dict[str, dict] = {} self._meta_chains: Dict[str, dict] = {} entity_ids = self._meta_struct["rcsb_entry_container_identifiers"][ "polymer_entity_ids" ] for entity_id in entity_ids: - entity_id = int(entity_id) + entity_id = str(entity_id) # Obtain entity-level metadata from the PDB API self._meta_entities[entity_id] = pdb_api.execute_raw_data_query( self.pdb_id, entity_id=entity_id @@ -644,7 +644,7 @@ def description(self) -> Optional[str]: return self._resolve(self._meta_struct, "struct.pdbx_descriptor", str) @property - def entity_description(self) -> Dict[int, Optional[str]]: + def entity_description(self) -> Dict[str, Optional[str]]: return { entity_id: self._resolve( meta_entity, "rcsb_polymer_entity.pdbx_description", str @@ -661,7 +661,7 @@ def deposition_date(self) -> Optional[datetime]: ) @property - def entity_source_org(self) -> Dict[int, Optional[str]]: + def entity_source_org(self) -> Dict[str, Optional[str]]: return { entity_id: self._resolve( meta_entity, "rcsb_entity_source_organism.0.ncbi_scientific_name", str @@ -676,7 +676,7 @@ def entity_source_org(self) -> Dict[int, Optional[str]]: } @property - def entity_source_org_id(self) -> Dict[int, Optional[int]]: + def entity_source_org_id(self) -> Dict[str, Optional[int]]: return { entity_id: self._resolve( meta_entity, "rcsb_entity_source_organism.0.ncbi_taxonomy_id", int @@ -689,7 +689,7 @@ def entity_source_org_id(self) -> Dict[int, Optional[int]]: } @property - def entity_host_org(self) -> Dict[int, Optional[str]]: + def entity_host_org(self) -> Dict[str, Optional[str]]: return { entity_id: self._resolve( meta_entity, "rcsb_entity_host_organism.0.ncbi_scientific_name", str @@ -701,7 +701,7 @@ def entity_host_org(self) -> Dict[int, Optional[str]]: } @property - def entity_host_org_id(self) -> Dict[int, Optional[int]]: + def entity_host_org_id(self) -> Dict[str, Optional[int]]: return { entity_id: self._resolve( meta_entity, "rcsb_entity_host_organism.0.ncbi_taxonomy_id", int @@ -761,21 +761,21 @@ 
def ligands(self) -> str:
         return str.join(",", sorted(set.union(*self.chain_ligands.values())))
 
     @property
-    def entity_chains(self) -> Dict[int, Sequence[str]]:
+    def entity_chains(self) -> Dict[str, Sequence[str]]:
         """
         :return: Mapping from entity id to a list of chains belonging to that entity.
         """
         return self._entity_chains(author=False)
 
     @property
-    def entity_auth_chains(self) -> Dict[int, Sequence[str]]:
+    def entity_auth_chains(self) -> Dict[str, Sequence[str]]:
         """
         :return: Mapping from entity id to a list of chains belonging to that entity,
             using the original author's chain ids.
         """
         return self._entity_chains(author=True)
 
-    def _entity_chains(self, author: bool = False) -> Dict[int, Sequence[str]]:
+    def _entity_chains(self, author: bool = False) -> Dict[str, Sequence[str]]:
         """
         :param author: Whether to use author or canonical chain ids.
         :return: Mapping from entity id to a list of chains belonging to that entity.
@@ -788,7 +788,7 @@ def _entity_chains(self, author: bool = False) -> Dict[str, Sequence[str]]:
         }
 
     @property
-    def chain_entities(self) -> Dict[str, int]:
+    def chain_entities(self) -> Dict[str, str]:
         """
         :return: Mapping from chain id to its entity id.
         """
@@ -823,7 +823,7 @@ def chain_to_auth_chain(self) -> Dict[str, str]:
         return chain_to_auth_chain
 
     @property
-    def entity_sequence(self) -> Dict[int, str]:
+    def entity_sequence(self) -> Dict[str, str]:
         return {
             entity_id: self._resolve(
                 meta_entity, "entity_poly.pdbx_seq_one_letter_code_can", str

From 5de6cf58f770e16eb94eb93ce7cf483d22b6ea73 Mon Sep 17 00:00:00 2001
From: Aviv Rosenberg
Date: Mon, 12 Feb 2024 06:30:24 +0200
Subject: [PATCH 09/37] PDBMetadata: Add extra properties

---
 src/pp5/external_dbs/pdb.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py
index d55a781..8d9312d 100644
--- a/src/pp5/external_dbs/pdb.py
+++ b/src/pp5/external_dbs/pdb.py
@@ -640,7 +640,6 @@ def title(self) -> Optional[str]:
 
     @property
     def description(self) -> Optional[str]:
-        # api_meta_entity["rcsb_polymer_entity"]["pdbx_description"]
         return self._resolve(self._meta_struct, "struct.pdbx_descriptor", str)
 
     @property
@@ -760,6 +759,27 @@ def chain_ligands(self) -> Dict[str, Set[str]]:
     def ligands(self) -> str:
         return str.join(",", sorted(set.union(*self.chain_ligands.values())))
 
+    @property
+    def entity_ids(self) -> Sequence[str]:
+        """
+        :return: The entity ids which exist in the structure.
+        """
+        return tuple(self._meta_entities.keys())
+
+    @property
+    def chain_ids(self) -> Sequence[str]:
+        """
+        :return: The chain ids which exist in the structure.
+        """
+        return tuple(self._meta_chains.keys())
+
+    @property
+    def auth_chain_ids(self) -> Sequence[str]:
+        """
+        :return: The author chain ids which exist in the structure.
+ """ + return tuple(self.chain_to_auth_chain[chain_id] for chain_id in self.chain_ids) + @property def entity_chains(self) -> Dict[str, Sequence[str]]: """ From 4f8ab4e72ba23cdc4adcc65f61792d298871343e Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 06:30:51 +0200 Subject: [PATCH 10/37] Replace PDB2UNP with PDBMetadata --- src/pp5/external_dbs/pdb.py | 330 +----------------------------------- 1 file changed, 5 insertions(+), 325 deletions(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 8d9312d..2c095fb 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -7,18 +7,7 @@ from math import cos, sin from math import degrees as deg from math import radians as rad -from typing import ( - Any, - Set, - Dict, - List, - Tuple, - Union, - TypeVar, - Callable, - Optional, - Sequence, -) +from typing import Any, Set, Dict, List, Tuple, TypeVar, Callable, Optional, Sequence from pathlib import Path from datetime import datetime from itertools import zip_longest @@ -32,7 +21,6 @@ from Bio.PDB.Polypeptide import standard_aa_names from Bio.PDB.PDBExceptions import PDBConstructionWarning, PDBConstructionException -import pp5 from pp5 import PDB_DIR, get_resource_path from pp5.utils import JSONCacheableMixin, remote_dl from pp5.external_dbs import pdb_api @@ -116,6 +104,8 @@ def pdb_download(pdb_id: str, pdb_dir=PDB_DIR, pdb_source: str = PDB_RCSB) -> Pa download_url_template = PDB_DOWNLOAD_SOURCES[pdb_source] if "unp_id" in download_url_template: + pdb_meta = PDBMetadata.from_pdb(pdb_id, cache=True) + # The alphafold source requires downloading the data based on the uniprot id unp_ids = None if not chain_id: @@ -123,9 +113,7 @@ def pdb_download(pdb_id: str, pdb_dir=PDB_DIR, pdb_source: str = PDB_RCSB) -> Pa raise ValueError(f"Chain or entity must be specified for {pdb_source=}") # Obtain uniprot ids from entity (entity -> chain -> unp ids) - entity_chains: dict = PDB2UNP.query_entity_uniprot_ids(pdb_id).get( - entity_id, {} - ) + entity_chains: dict = pdb_meta.entity_uniprot_ids.get(entity_id, {}) if not entity_chains: raise ValueError(f"Failed to obtain chain for {pdb_id}:{entity_id}") chain_id = [*entity_chains.keys()][0] # arbitrary chain from the entity @@ -138,7 +126,7 @@ def pdb_download(pdb_id: str, pdb_dir=PDB_DIR, pdb_source: str = PDB_RCSB) -> Pa return filename # Get uniprot id for this chain (only if we didn't get them from entity) - unp_ids = unp_ids or PDB2UNP.query_chain_uniprot_ids(pdb_id).get(chain_id, []) + unp_ids = unp_ids or pdb_meta.chain_uniprot_ids.get(chain_id, []) if len(unp_ids) != 1: raise ValueError( f"Can't determine unique uniprot id for {pdb_id}:{chain_id}, " @@ -262,314 +250,6 @@ def pdb_to_secondary_structure( return ss_dict, keys -class PDB2UNP(JSONCacheableMixin, object): - """ - Maps PDB IDs (in each chain) to one or more Uniprot IDs which correspond - to that chain, and their locations in the PDB sequence. - """ - - def __init__(self, pdb_id: str): - """ - Initialize a PDB to Uniprot mapping. - :param pdb_id: PDB ID, without chain. Chain will be ignored if specified. - """ - pdb_base_id, _ = split_id(pdb_id) - - # Get all chain Uniprot IDs by querying PDB. This gives us the most - # up-to-date IDs and provides the alignment info between the - # PDB structure's sequence and the Uniprot xref sequence. - # Map is chain -> unp -> [ (s1,e1), (s2, e2), ... 
] - self.chain_to_unp_xrefs = self.query_chain_uniprot_id_alignments(pdb_id) - self.pdb_id = pdb_base_id - - def get_unp_id(self, chain_id: str, strict=True) -> str: - """ - :param chain_id: A chain in the PDB structure. - :param strict: Whether to raise an error (True) or just warn (False) - if the chain cannot be uniquely mapped to a single Uniprot ID. - :return: the first unp id matching the given chain. Usually there's - only one unless the entry is chimeric. - """ - if not chain_id or chain_id.upper() not in self.chain_to_unp_xrefs: - raise ValueError(f"No Uniprot ID for chain {chain_id} of" f" {self.pdb_id}") - - if self.is_chimeric(chain_id): - msg = ( - f"{self.pdb_id} is chimeric at chain {chain_id}, " - f"possible Uniprot IDs: " - f"{self.get_all_chain_unp_ids(chain_id)}." - ) - if strict: - raise ValueError(msg) - LOGGER.warning(f"{msg} Returning first ID.") - - for unp_id in self.chain_to_unp_xrefs[chain_id.upper()]: - return unp_id - - def is_chimeric(self, chain_id: str) -> bool: - """ - :param chain_id: A chain in the PDB structure. - :return: Whether the sequence in the given chain is chimeric, - i.e. is composed of regions from different proteins. - """ - return len(self.chain_to_unp_xrefs[chain_id.upper()]) > 1 - - def get_all_chain_unp_ids(self, chain_id) -> tuple: - """ - :param chain_id: A chain in the PDB structure. - :return: All unp ids matching the given chain. - """ - return tuple(self.chain_to_unp_xrefs[chain_id.upper()].keys()) - - def get_all_unp_ids(self) -> set: - """ - :return: All Uniprot IDs for all chains in the PDB structure. - """ - all_unp_ids = set() - for chain in self.chain_to_unp_xrefs: - all_unp_ids.update(self.get_all_chain_unp_ids(chain)) - return all_unp_ids - - def get_chain_to_unp_ids(self) -> Dict[str, Tuple[str]]: - """ - :return: A mapping from chain it to a sequence of uniprot ids for - that chain. - """ - return {c: tuple(u.keys()) for c, u in self.chain_to_unp_xrefs.items()} - - def save(self, out_dir=pp5.PDB2UNP_DIR) -> Path: - """ - Write the current mapping to a human-readable text file (json) which - can also be loaded later using from_cache. - :param out_dir: Output directory. - :return: The path of the written file. - """ - filename = f"{self.pdb_id}.json" - return self.to_cache(out_dir, filename, indent=None) - - def __getitem__(self, chain_id: str): - """ - :param chain_id: The chain. - :return: Uniprot xrefs for a given chain - """ - return self.chain_to_unp_xrefs[chain_id.upper()] - - def __contains__(self, chain_id: str): - """ - :param chain_id: The chain. - :return: Whether this mapping contains the given chain. - """ - return chain_id.upper() in self.chain_to_unp_xrefs - - def __repr__(self): - return f"PDB2UNP({self.pdb_id})={self.get_chain_to_unp_ids()}" - - @classmethod - def query_chain_uniprot_ids(cls, pdb_id: str) -> Dict[str, Sequence[str]]: - """ - Retrieves all Uniprot IDs associated with a PDB structure chains by querying - the PDB database. - - :param pdb_id: The PDB ID to search for. Chain or entity will be ignored. - :return: a map: chain -> [unp1, unp2, ...] - where unp1, unp2, ... are Uniprot IDs associated with the chain. - :raises pdb_api.PDBAPIException: If there's a problem obtaining the data. - """ - - # entity -> chain -> unp -> [ (s1,e1), ... ] - entity_map = cls.query_entity_uniprot_id_alignments(pdb_id) - - all_chain_map = {} - for entity_id, chain_map in entity_map.items(): - for chain_id, unp_map in chain_map.items(): - # chain -> [unp1, unp2, ...] 
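Each removed PDB2UNP query has a direct PDBMetadata counterpart from PATCH 07, which is what makes this deletion safe. A migration sketch (requires network access; the 4HHB values follow the test suite):

    from pp5.external_dbs import pdb

    meta = pdb.PDBMetadata.from_pdb("4HHB", cache=True)

    # old: PDB2UNP.query_chain_uniprot_ids(pdb_id)
    chain_to_unps = meta.chain_uniprot_ids  # chain -> (unp1, unp2, ...)

    # old: PDB2UNP.pdb_id_to_unp_id(pdb_id)
    unp_id = pdb.PDBMetadata.pdb_id_to_unp_id("4HHB:A")
    assert unp_id == "P69905"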
- all_chain_map[chain_id] = tuple(unp_map.keys()) - - return all_chain_map - - @classmethod - def query_chain_uniprot_id_alignments( - cls, pdb_id: str - ) -> Dict[str, Dict[str, List[Tuple[int, int]]]]: - """ - Retrieves all Uniprot IDs associated with a PDB structure chains by querying - the PDB database. - - :param pdb_id: The PDB ID to search for. Chain or entity will be ignored. - :return: a map: chain -> unp -> [ (s1,e1), ... ] - where (s1,e1) are alignment start,end indices between the UNP and PDB sequences. - :raises pdb_api.PDBAPIException: If there's a problem obtaining the data. - """ - # entity -> chain -> unp -> [ (s1,e1), ... ] - entity_map = cls.query_entity_uniprot_id_alignments(pdb_id) - - all_chain_map = {} - for entity_id, chain_map in entity_map.items(): - for chain_id, unp_map in chain_map.items(): - # chain -> unp -> [ (s1,e1), ... ] - all_chain_map[chain_id] = unp_map - - return all_chain_map - - @classmethod - def query_entity_uniprot_ids( - cls, pdb_id: str - ) -> Dict[str, Dict[str, Sequence[str]]]: - """ - Retrieves all Uniprot IDs associated with a PDB structure entities by querying - the PDB database. - - :param pdb_id: The PDB ID to search for. Chain or entity will be ignored. - :return: a map: entity -> chain ->[unp1, unp2, ...] - where unp1, unp2, ... are Uniprot IDs associated with the entity. - :raises pdb_api.PDBAPIException: If there's a problem obtaining the data. - """ - - # entity -> chain -> unp -> [ (s1,e1), ... ] - entity_map = cls.query_entity_uniprot_id_alignments(pdb_id) - - new_entity_map = defaultdict(dict) - for entity_id, chain_map in entity_map.items(): - for chain_id, unp_map in chain_map.items(): - # entity -> chain -> [unp1, unp2, ...] - new_entity_map[entity_id][chain_id] = tuple(unp_map.keys()) - - return dict(new_entity_map) - - @classmethod - def query_entity_uniprot_id_alignments( - cls, pdb_id: str - ) -> Dict[str, Dict[str, Dict[str, List[Tuple[int, int]]]]]: - """ - Retrieves all Uniprot IDs associated with a PDB structure entities by querying - the PDB database. - - :param pdb_id: The PDB ID to search for. Chain or entity will be ignored. - :return: a map: entity -> chain -> unp -> [ (s1,e1), ...] - where (s1,e1) are alignment start,end indices between the UNP and PDB sequences. - :raises pdb_api.PDBAPIException: If there's a problem obtaining the data. - """ - map_to_unp_ids = {} - - # Make sure we have a base id - pdb_id, _, _ = split_id_with_entity(pdb_id) - - # Get all data for the PDB structure - entry_data = pdb_api.execute_raw_data_query(pdb_id) - entry_containers = entry_data["rcsb_entry_container_identifiers"] - - # Find all polymer entities - entity_ids = entry_containers.get("polymer_entity_ids", []) - for entity_id in entity_ids: - entity_id = str(entity_id) - # Get all data about this entity - entity_data = pdb_api.execute_raw_data_query(pdb_id, entity_id=entity_id) - - # Get list of chains and list of Uniprot IDs for this entity - entity_containers = entity_data["rcsb_polymer_entity_container_identifiers"] - entity_chains = [ - # The same chain can be referred to by different labels, - # the canonical PDB label and another label given by the - # structure author. 
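Likewise, the is_chimeric check that PDB2UNP provided is now expressed directly on the metadata, as PATCH 11 below does in collect.py: a chain is chimeric when more than one Uniprot ID aligns to it. A sketch using 3SG4, the chimeric structure from the tests:

    from pp5.external_dbs import pdb

    meta = pdb.PDBMetadata.from_pdb("3SG4", cache=True)
    # old: PDB2UNP.from_pdb("3SG4").is_chimeric("A")
    assert len(meta.chain_uniprot_ids["A"]) > 1  # 3SG4:A is chimeric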
- *entity_containers.get("asym_ids", []), - *entity_containers.get("auth_asym_ids", []), - ] - entity_unp_ids = entity_containers.get("uniprot_ids", []) - - unp_alignments: Dict[str, List[Tuple[int, int]]] = { - unp_id: [] for unp_id in entity_unp_ids - } - for alignment_entry in entity_data.get("rcsb_polymer_entity_align", []): - if alignment_entry["reference_database_name"].lower() != "uniprot": - continue - - unp_id = alignment_entry["reference_database_accession"] - if unp_id not in unp_alignments: - continue - - for alignment_region in alignment_entry["aligned_regions"]: - align_start = alignment_region["entity_beg_seq_id"] - align_end = align_start + alignment_region["length"] - 1 - unp_alignments[unp_id].append((align_start, align_end)) - - map_to_unp_ids[entity_id] = { - chain_id: unp_alignments for chain_id in entity_chains - } - - return map_to_unp_ids - - @classmethod - def pdb_id_to_unp_id( - cls, - pdb_id: str, - strict=True, - cache=False, - ) -> str: - """ - Given a PDB ID, returns a single Uniprot id for it. - :param pdb_id: PDB ID, with optional chain. If provided chain will - be used. - :param cache: Whether to use cached mapping. - :param strict: Whether to raise an error (True) or just warn (False) - if the PDB ID cannot be uniquely mapped to a single Uniprot ID. - This can happen if: (1) Chain wasn't specified and there are - different Uniprot IDs for different chains (e.g. 4HHB); (2) Chain was - specified but there are multiple Uniprot IDs for the chain - (chimeric entry, e.g. 3SG4:A). - :return: A Uniprot ID. - """ - pdb_base_id, chain_id = split_id(pdb_id) - pdb2unp = cls.from_pdb(pdb_id, cache=cache) - - all_unp_ids = pdb2unp.get_all_unp_ids() - if not all_unp_ids: - raise ValueError(f"No Uniprot entries exist for {pdb_base_id}") - - if not chain_id: - if len(all_unp_ids) > 1: - msg = ( - f"Multiple Uniprot IDs exists for {pdb_base_id}, and no " - f"chain specified." - ) - if strict: - raise ValueError(msg) - LOGGER.warning( - f"{msg} Returning the first Uniprot ID " f"from the first chain." - ) - - for chain_id, unp_ids in pdb2unp.get_chain_to_unp_ids().items(): - return unp_ids[0] - - return pdb2unp.get_unp_id(chain_id, strict=strict) - - @classmethod - def from_pdb(cls, pdb_id: str, cache=False) -> PDB2UNP: - """ - Create a PDB2UNP mapping from a given PDB ID. - :param pdb_id: The PDB ID to map for. Chain will be ignored if present. - :param cache: Whether to load a cached mapping if available. - :return: A PDB2UNP mapping object. 
- """ - pdb_base_id, _ = split_id(pdb_id) - - if cache: - pdb2unp = cls.from_cache(pdb_base_id) - if pdb2unp is not None: - return pdb2unp - - pdb2unp = cls(pdb_id) - pdb2unp.save() - return pdb2unp - - @classmethod - def from_cache( - cls, pdb_id, cache_dir: Union[str, Path] = pp5.PDB2UNP_DIR - ) -> Optional[PDB2UNP]: - pdb_id, _ = split_id(pdb_id) - filename = f"{pdb_id}.json" - return super(PDB2UNP, cls).from_cache(cache_dir, filename) - - _TC = TypeVar("_TC") From 1212201755bc4fb858f48b862ed9ab50a316588f Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 06:32:14 +0200 Subject: [PATCH 11/37] Collect: Use PDBMetadata instead of PDB2UNP --- src/pp5/collect.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 1468a51..119279a 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -1042,8 +1042,8 @@ def _collect_single_structure( pdb_base_id, chain_id, entity_id = pdb.split_id_with_entity(pdb_id) pdb_dict = pdb.pdb_dict(pdb_id, pdb_source=pdb_source) - pdb2unp = pdb.PDB2UNP.from_pdb(pdb_id, cache=True) - meta = pdb.PDBMetadata(pdb_id) + meta = pdb.PDBMetadata.from_pdb(pdb_id, cache=True) + chain_to_unp_ids = meta.chain_uniprot_ids # Determine all chains we need to collect from the PDB structure chains_to_collect: Sequence[str] @@ -1079,16 +1079,16 @@ def _collect_single_structure( pdb_id_full = f"{pdb_base_id}:{chain_id}" # Skip chains with no Uniprot ID - if chain_id not in pdb2unp: + if chain_id not in chain_to_unp_ids: LOGGER.warning(f"No Uniprot ID for {pdb_id_full}") continue # Skip chimeric chains - if pdb2unp.is_chimeric(chain_id): + if len(chain_to_unp_ids[chain_id]) > 1: LOGGER.warning(f"Discarding chimeric chain {pdb_id_full}") continue - unp_id = pdb2unp.get_unp_id(chain_id) + unp_id = chain_to_unp_ids[chain_id][0] seq_len = len(meta.entity_sequence[meta.chain_entities[chain_id]]) # Create a ProteinRecord and save it so it's cached for when we @@ -1097,7 +1097,7 @@ def _collect_single_structure( try: nc = chain_id in string.digits prec = ProteinRecord( - unp_id, + unp_id, # TODO: remove unp_ids here pdb_id_full, pdb_source=pdb_source, pdb_dict=pdb_dict, @@ -1148,7 +1148,7 @@ def _collect_single_structure( msg = ( f"Collected {len(chain_data)} chains from {pdb_id} " - f"{pdb2unp.get_chain_to_unp_ids()} ({idx[0] + 1}/{idx[1]})" + f"{chain_to_unp_ids} ({idx[0] + 1}/{idx[1]})" ) LOGGER.log(level=logging.INFO if len(chain_data) else logging.WARNING, msg=msg) From 9a8ab3f5884d823ef5c9f6d2793193783ce1b3d1 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 06:32:53 +0200 Subject: [PATCH 12/37] prec: Use PDBMetadata instead of PDB2UNP --- src/pp5/prec.py | 68 +++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index fda9852..193ec3d 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -562,26 +562,28 @@ def from_pdb( try: # Either chain or entity or none can be provided, but not both pdb_base_id, chain_id, entity_id = pdb.split_id_with_entity(pdb_id) - numeric_chain = False if entity_id: - entity_id = int(entity_id) + entity_id = str(entity_id) - # Discover which chains belong to this entity - pdb_dict = pdb.pdb_dict( - pdb_id, pdb_source=pdb_source, struct_d=pdb_dict - ) - meta = pdb.PDBMetadata(pdb_id) - chain_id = meta.entity_chains[entity_id][0] + meta = pdb.PDBMetadata.from_pdb(pdb_id, cache=cache) + + chain_id = None + if entity_id in meta.entity_ids: + chain_id = 
meta.entity_chains[entity_id][0] if not chain_id: - # In rare cases the chain is a number instead of a letter, - # so there's no way to distinguish between entity id and - # chain except also trying to use our entity as a chain - # and finding the actual entity. See e.g. 4N6V. - if str(entity_id) in meta.chain_entities: + # In rare cases the author chain is a number instead of a letter. + # We check for this, and if it's the case, we use the + # corresponding PDB chain instead. See e.g. 4N6V. + if entity_id in meta.auth_chain_ids: # Chain is number, but use its string representation - chain_id = str(entity_id) - numeric_chain = True + chain_id = next( + iter( + c_id + for c_id, ac_id in (meta.chain_to_auth_chain.items()) + if ac_id == entity_id + ) + ) else: raise ProteinInitError( f"No matching chain found for entity " @@ -600,7 +602,8 @@ def from_pdb( pdb_id, pdb_source=pdb_source, struct_d=pdb_dict ) - unp_id = pdb.PDB2UNP.pdb_id_to_unp_id( + # TODO: Remove need for unp id from init + unp_id = pdb.PDBMetadata.pdb_id_to_unp_id( pdb_id, strict=strict_pdb_xref, cache=cache ) @@ -609,7 +612,6 @@ def from_pdb( pdb_id, pdb_source=pdb_source, pdb_dict=pdb_dict, - numeric_chain=numeric_chain, **kw_for_init, ) if cache_dir: @@ -674,8 +676,6 @@ def __init__( dihedral_est_name: str = None, dihedral_est_args: dict = None, max_ena: int = None, - strict_unp_xref: bool = True, - numeric_chain: bool = False, with_altlocs: bool = True, with_backbone: bool = True, with_contacts: bool = True, @@ -701,10 +701,6 @@ def __init__( :param max_ena: Number of maximal ENA records (containing protein genetic data) to align to the PDB structure of this protein. None means no limit (all cross-refs from Uniprot will be aligned). - :param strict_unp_xref: Whether to require that there exist a PDB - cross-ref for the given Uniprot ID. - :param numeric_chain: Whether the given chain id (if any) is - numeric. In rare cases PDB structures have numbers as chain ids. :param with_altlocs: Whether to include alternate conformations in the protein record. If False, only the default conformation will be used. :param with_backbone: Whether to include backbone atoms in the protein record. @@ -735,25 +731,37 @@ def __init__( if with_altlocs and contact_method == CONTACT_METHOD_ARPEGGIO: raise ValueError(f"Altlocs not supported with {contact_method=}") - self.strict_unp_xref = strict_unp_xref - self.numeric_chain = numeric_chain self.with_altlocs = with_altlocs self.with_backbone = with_backbone self.with_contacts = with_contacts self.contact_radius = contact_radius self.contact_method = contact_method - # Parse the given PDB id - self.pdb_base_id, self.pdb_chain_id, ent_id = pdb.split_id_with_entity(pdb_id) - if numeric_chain: - self.pdb_chain_id = str(ent_id) + # Parse the given PDB id and obtain metadata + self.pdb_base_id, pdb_chain_id, ent_id = pdb.split_id_with_entity(pdb_id) + self.pdb_meta = pdb.PDBMetadata.from_pdb(self.pdb_base_id, cache=True) + + if pdb_chain_id is None: + if ent_id and len(self.pdb_meta.entity_chains.get(ent_id, [])) == 1: + pdb_chain_id = self.pdb_meta.entity_chains[ent_id][0] + elif len(self.pdb_meta.chain_ids) == 1: + pdb_chain_id = next(iter(self.pdb_meta.chain_ids)) + else: + raise ProteinInitError( + f"No chain specified in {pdb_id}, and multiple chains exist." 
+ ) + + self.pdb_chain_id = pdb_chain_id self.pdb_id = f"{self.pdb_base_id}:{self.pdb_chain_id}" + # TODO: Remove need for unp id from init, get it from metadata + if self.unp_id not in self.pdb_meta.chain_uniprot_ids[self.pdb_chain_id]: + raise ProteinInitError(f"Uniprot ID {self.unp_id} not found in {pdb_id}") + self.pdb_source = pdb_source if pdb_dict: self._pdb_dict = pdb_dict - self.pdb_meta = pdb.PDBMetadata(self.pdb_id) if not self.pdb_meta.resolution and self.pdb_source != PDB_AFLD: raise ProteinInitError(f"Unknown resolution for {pdb_id}") From 144278a44fa714ba63b20dea5796b52a65f53456 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 06:33:16 +0200 Subject: [PATCH 13/37] Update tests --- tests/test_pdb.py | 52 +++++++++++++++++++--------------------------- tests/test_prec.py | 20 +++--------------- 2 files changed, 24 insertions(+), 48 deletions(-) diff --git a/tests/test_pdb.py b/tests/test_pdb.py index 3cd0280..cf5948a 100644 --- a/tests/test_pdb.py +++ b/tests/test_pdb.py @@ -2,6 +2,7 @@ import math import random import string +from pprint import pprint from urllib.request import urlopen import pandas as pd @@ -140,7 +141,7 @@ def test_entity_too_long(self): pdb.split_id(invalid_id) -@pytest.fixture(params=["1MWC:A", "2WUR:A", "4N6V:1"]) +@pytest.fixture(params=["1MWC:A", "2WUR:A", "4N6V:1", "1DWI:A"]) def pdb_id(request): return request.param @@ -177,84 +178,73 @@ def test_exception_chimeric_chain(self): pdb.pdb_download("3SG4:A", pdb_source=pdb.PDB_AFLD) +@pytest.mark.skipif(NO_INTERNET, reason="Needs internet") class TestPDBMetadata: - def test_metadata(self, pdb_id, pdb_source): - meta = pdb.PDBMetadata(pdb_id, pdb_source=pdb_source) + def test_metadata_properties(self, pdb_id): + meta = pdb.PDBMetadata(pdb_id) pdb_base_id, pdb_chain = pdb.split_id(pdb_id) assert meta.pdb_id == pdb_base_id + d = meta.as_dict() # evaluates all metadata properties + pprint(d) -@pytest.mark.skipif(NO_INTERNET, reason="Needs internet") -class TestPDB2UNP: @staticmethod - def _check(pdb_id, expected_unp_id): - actual_unp_id = pdb.PDB2UNP.pdb_id_to_unp_id(pdb_id) + def _check_unp(pdb_id, expected_unp_id): + actual_unp_id = pdb.PDBMetadata.pdb_id_to_unp_id(pdb_id) assert actual_unp_id == expected_unp_id def test_no_chain_single_unp(self): - self._check("102L", "P00720") + self._check_unp("102L", "P00720") def test_with_chain_single_unp(self): - self._check("102L:A", "P00720") + self._check_unp("102L:A", "P00720") def test_no_chain_multi_unp_strict(self): test_id = "4HHB" with pytest.raises(ValueError, match="Multiple Uniprot IDs"): - pdb.PDB2UNP.pdb_id_to_unp_id(test_id) + pdb.PDBMetadata.pdb_id_to_unp_id(test_id) def test_no_chain_multi_unp_not_strict(self): test_id = "4HHB" expected_unp_ids = {"P69905", "P68871"} - actual_unp_id = pdb.PDB2UNP.pdb_id_to_unp_id(test_id, strict=False) + actual_unp_id = pdb.PDBMetadata.pdb_id_to_unp_id(test_id, strict=False) assert actual_unp_id in expected_unp_ids @pytest.mark.parametrize("test_id", ["4HHB:A", "4HHB:C"]) def test_with_chain_multi_unp_1(self, test_id): - self._check(test_id, "P69905") + self._check_unp(test_id, "P69905") @pytest.mark.parametrize("test_id", ["4HHB:B", "4HHB:D"]) def test_with_chain_multi_unp_2(self, test_id): - self._check(test_id, "P68871") + self._check_unp(test_id, "P68871") def test_with_invalid_chain(self): with pytest.raises(ValueError, match="chain Z"): - pdb.PDB2UNP.pdb_id_to_unp_id("4HHB:Z") + pdb.PDBMetadata.pdb_id_to_unp_id("4HHB:Z") @pytest.mark.parametrize("test_id", ["5LTR", "5LTR:A"]) def 
test_with_no_xref_in_file(self, test_id):
-        self._check(test_id, "B1PNC0")
+        self._check_unp(test_id, "B1PNC0")

     @pytest.mark.parametrize("test_id", ["5EJU", "4DXP"])
     def test_with_no_xref_in_file_and_pdb(self, test_id):
         with pytest.raises(ValueError, match="No Uniprot entries"):
-            pdb.PDB2UNP.pdb_id_to_unp_id(test_id)
+            pdb.PDBMetadata.pdb_id_to_unp_id(test_id)

     @pytest.mark.parametrize("test_id", ["3G53", "3G53:A"])
     def test_with_no_struct_ref_entry(self, test_id):
-        self._check(test_id, "P02213")
+        self._check_unp(test_id, "P02213")

     @pytest.mark.parametrize(
         ("test_id", "unp_ids"),
         [("3SG4:A", {"P11799", "P42212", "P0DP29"}), ("4IK8:A", {"K4DIE3", "P42212"})],
     )
     def test_multi_unp_for_single_chain_no_strict(self, test_id, unp_ids):
-        actual_unp_id = pdb.PDB2UNP.pdb_id_to_unp_id(test_id, strict=False)
+        actual_unp_id = pdb.PDBMetadata.pdb_id_to_unp_id(test_id, strict=False)
         assert actual_unp_id in unp_ids

     @pytest.mark.parametrize("test_id", ["3SG4:A", "4IK8:A"])
     def test_multi_unp_for_single_chain_strict(self, test_id):
         with pytest.raises(ValueError, match="chimeric"):
-            pdb.PDB2UNP.pdb_id_to_unp_id(test_id)
-
-    @pytest.mark.parametrize(
-        ("pdb_id", "unp_id"),
-        [("5LTR:A", "B1PNC0"), ("3G53:A", "P02213")],
-    )
-    def test_pdb_source(self, pdb_id, unp_id, pdb_source):
-        p2u = pdb.PDB2UNP.from_pdb(pdb_id)
-        pdb_base_id, chain_id = pdb.split_id(pdb_id)
-        actual_unp_id = p2u.get_unp_id(chain_id, strict=False)
-        # assert len(unps) == 1
-        assert actual_unp_id == unp_id
-        assert p2u.pdb_id == pdb_base_id
+            pdb.PDBMetadata.pdb_id_to_unp_id(test_id)
diff --git a/tests/test_prec.py b/tests/test_prec.py
index cdfd126..55b0468 100644
--- a/tests/test_prec.py
+++ b/tests/test_prec.py
@@ -202,7 +202,7 @@ def test_numerical_chain(self):
         pdb_id = "4N6V:9"
         prec = ProteinRecord.from_pdb(pdb_id)
         assert prec.pdb_base_id == "4N6V"
-        assert prec.pdb_chain_id == "9"
+        assert prec.pdb_chain_id == "J"  # auth chain was converted to pdb chain

     def test_ambiguous_numerical_entity_and_chain(self):
         # In this rare case it's impossible to know if entity or chain!
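
A minimal usage sketch of the metadata-based UniProt lookup exercised by the
updated tests (the PDB and UniProt IDs are taken from the tests above; network
access is assumed, since PDBMetadata queries the PDB REST API):

    from pp5.external_dbs import pdb

    # Unambiguous case: one UniProt ID for the chain.
    assert pdb.PDBMetadata.pdb_id_to_unp_id("102L:A") == "P00720"

    # 4HHB maps to two UniProt IDs (hemoglobin alpha and beta), so a strict
    # lookup without a chain raises; the per-chain mapping disambiguates.
    meta = pdb.PDBMetadata.from_pdb("4HHB", cache=True)
    unp_ids = meta.chain_uniprot_ids["A"]  # per-chain sequence of UniProt IDs
    assert "P69905" in unp_ids
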
@@ -213,7 +213,7 @@ def test_ambiguous_numerical_entity_and_chain(self): pdb_id = "4N6V:1" prec = ProteinRecord.from_pdb(pdb_id) assert prec.pdb_base_id == "4N6V" - assert prec.pdb_chain_id == "0" + assert prec.pdb_chain_id == "A" def test_entity_with_invalid_entity(self): with pytest.raises(ProteinInitError): @@ -268,24 +268,10 @@ def test_init_with_mismatching_pdb_id(self): with pytest.raises(ProteinInitError): ProteinRecord("P00720", "4GY3") - def test_no_strict_xref_with_no_xref_in_pdb(self): - prec = ProteinRecord("Q6LDG3", "3SG4:A", strict_unp_xref=False) - assert prec.unp_id == "Q6LDG3" - assert prec.pdb_id == "3SG4:A" - - def test_no_strict_xref_with_no_xref_in_pdb_and_no_chain(self): - with pytest.raises(ProteinInitError, match="and no chain provided"): - ProteinRecord("Q6LDG3", "3SG4", strict_unp_xref=False) - - def test_strict_xref_with_no_matching_xref_in_pdb(self): + def test_no_matching_xref_in_pdb(self): with pytest.raises(ProteinInitError): ProteinRecord("P42212", "2QLE:A") - def test_no_strict_xref_with_no_matching_xref_in_pdb(self): - prec = ProteinRecord("P42212", "2QLE:A", strict_unp_xref=False) - assert prec.unp_id == "P42212" - assert prec.pdb_id == "2QLE:A" - class TestSave: @classmethod From e0b15907e50a4e9c6b66857602ae64c5866035e2 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:07:27 +0200 Subject: [PATCH 14/37] PDBMetadata: support per-chain conversion to dict --- src/pp5/external_dbs/pdb.py | 74 +++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 2c095fb..a06b1b1 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -332,12 +332,17 @@ def entity_description(self) -> Dict[str, Optional[str]]: } @property - def deposition_date(self) -> Optional[datetime]: - return self._resolve( + def deposition_date(self) -> Optional[str]: + dt = self._resolve( self._meta_struct, "pdbx_database_status.recvd_initial_deposition_date", datetime.fromisoformat, ) + if not dt: + return None + + # Keep only date + return dt.strftime("%Y-%m-%d") @property def entity_source_org(self) -> Dict[str, Optional[str]]: @@ -424,20 +429,24 @@ def cg_temp(self) -> Optional[float]: return self._resolve(self._meta_struct, "exptl_crystal_grow.0.temp", float) @property - def chain_ligands(self) -> Dict[str, Set[str]]: + def chain_ligands(self) -> Dict[str, Sequence[str]]: return { - chain_id: set( - [ - ld.get("ligand_comp_id") - for ld in meta_chain.get("rcsb_ligand_neighbors", []) - ] + chain_id: tuple( + sorted( + set( + [ + ld.get("ligand_comp_id") + for ld in meta_chain.get("rcsb_ligand_neighbors", []) + ] + ) + ) ) for chain_id, meta_chain in self._meta_chains.items() } @property def ligands(self) -> str: - return str.join(",", sorted(set.union(*self.chain_ligands.values()))) + return str.join(",", sorted(set.union(set(), *self.chain_ligands.values()))) @property def entity_ids(self) -> Sequence[str]: @@ -648,12 +657,55 @@ def entity_uniprot_id_alignments( return map_to_unp_ids - def as_dict(self) -> Dict[str, Any]: - return { + def as_dict(self, chain_id: Optional[str] = None) -> Dict[str, Any]: + """ + Returns a dictionary containing all the metadata properties. + + :param chain_id: Optional chain id to filter the metadata for. If provided, + only the metadata relevant to the chain will be returned. + :return: A dictionary containing all the metadata properties. 
+ """ + result_dict = { k: getattr(self, k) for k, v in self.__class__.__dict__.items() if isinstance(v, property) } + if not chain_id: + return result_dict + + if chain_id not in self.chain_ids: + raise ValueError(f"Chain {chain_id} not found in {self.pdb_id}") + + entity_id = self.chain_entities[chain_id] + filtered_result_dict = {} + + for key, value in result_dict.items(): + new_value = None + + # If it's a dict, take value corresponding to the chain + if isinstance(value, dict): + if entity_id in value: + new_value = value[entity_id] + elif chain_id in value: + new_value = value[chain_id] + else: + continue + + # If it's a sequence, drop it + elif isinstance(value, (list, tuple)): + continue + + # Append chain to pdb_id + elif value == self.pdb_id: + new_value = f"{self.pdb_id}:{chain_id}" + + # If it's an internal dict, drop it + if isinstance(new_value, dict): + continue + + filtered_result_dict[key] = value if new_value is None else new_value + + return filtered_result_dict def __repr__(self): return str(self.as_dict()) From f27cc72bd2c46142825af2baa50342ba1eec017a Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:08:32 +0200 Subject: [PATCH 15/37] prec: Remove requirement to initialize with uniprot id --- src/pp5/prec.py | 85 ++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 193ec3d..6533366 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -538,7 +538,7 @@ def from_pdb( pdb_dict=None, cache=False, cache_dir=pp5.PREC_DIR, - strict_pdb_xref=True, + strict_pdb_unp_xref=True, **kw_for_init, ) -> ProteinRecord: """ @@ -554,7 +554,7 @@ def from_pdb( :param cache: Whether to load prec from cache if available. :param cache_dir: Where the cache dir is. ProteinRecords will be written to this folder after creation, unless it's None. - :param strict_pdb_xref: Whether to require that the given PDB ID + :param strict_pdb_unp_xref: Whether to require that the given PDB ID maps uniquely to only one Uniprot ID. :param kw_for_init: Extra kwargs for the ProteinRecord initializer. :return: A ProteinRecord. @@ -602,16 +602,11 @@ def from_pdb( pdb_id, pdb_source=pdb_source, struct_d=pdb_dict ) - # TODO: Remove need for unp id from init - unp_id = pdb.PDBMetadata.pdb_id_to_unp_id( - pdb_id, strict=strict_pdb_xref, cache=cache - ) - prec = cls( - unp_id, pdb_id, pdb_source=pdb_source, pdb_dict=pdb_dict, + strict_pdb_unp_xref=strict_pdb_unp_xref, **kw_for_init, ) if cache_dir: @@ -657,19 +652,18 @@ def from_unp( if prec is not None: return prec - prec = cls(unp_id, pdb_id, **kw_for_init) + prec = cls(pdb_id, **kw_for_init) if cache_dir: prec.save(out_dir=cache_dir) return prec except Exception as e: raise ProteinInitError( - f"Failed to create protein record for " f"unp_id={unp_id}" + f"Failed to create protein record for unp_id={unp_id}" ) from e def __init__( self, - unp_id: str, # TODO: Get this from metadata pdb_id: str, pdb_source: str = PDB_RCSB, pdb_dict: dict = None, @@ -680,15 +674,15 @@ def __init__( with_backbone: bool = True, with_contacts: bool = True, with_codons: bool = True, + strict_pdb_unp_xref: bool = True, contact_method: str = CONTACT_METHOD_NEIGHBOR, contact_radius: float = CONTACT_DEFAULT_RADIUS, ): """ Don't call this directly. Use class methods from_pdb or from_unp instead. - Initialize a protein record from both Uniprot and PDB ids. + Initialize a protein record from PDB id. - :param unp_id: Uniprot id which uniquely identifies the protein. 
:param pdb_id: PDB id with chain (e.g. '1ABC:D') of the specific structure chain desired. :param pdb_source: Source from which to obtain the pdb file. @@ -706,39 +700,21 @@ def __init__( :param with_backbone: Whether to include backbone atoms in the protein record. :param with_contacts: Whether to calculate per-residue contacts. :param with_codons: Whether to assign codons to each residue. + :param strict_pdb_unp_xref: Whether to require that the given PDB ID + maps uniquely to only one Uniprot ID. :param contact_method: Method for calculating contacts. Options are: 'ns' for neighbor search; 'arp' for arpeggio. :param contact_radius: Radius for calculating contacts. """ - if not (unp_id and pdb_id): - raise ProteinInitError("Must provide both Uniprot and PDB IDs") + if not pdb_id: + raise ProteinInitError("Must provide PDB ID") - unp_id = unp_id.upper() - LOGGER.info(f"{unp_id}: Initializing protein record...") self.__setstate__({}) - self.unp_id = unp_id - rec_unp_id = self.unp_rec.accessions[0] - if rec_unp_id != unp_id: - LOGGER.warning(f"Replacing outdated UNP ID: {unp_id} -> {rec_unp_id}") - self.unp_id = rec_unp_id - - if contact_method not in CONTACT_METHODS: - raise ValueError( - f"Unknown {contact_method=}, must be one of {CONTACT_METHODS}" - ) - - if with_altlocs and contact_method == CONTACT_METHOD_ARPEGGIO: - raise ValueError(f"Altlocs not supported with {contact_method=}") - - self.with_altlocs = with_altlocs - self.with_backbone = with_backbone - self.with_contacts = with_contacts - self.contact_radius = contact_radius - self.contact_method = contact_method - # Parse the given PDB id and obtain metadata self.pdb_base_id, pdb_chain_id, ent_id = pdb.split_id_with_entity(pdb_id) + + LOGGER.info(f"{self.pdb_base_id}: Obtaining metadata...") self.pdb_meta = pdb.PDBMetadata.from_pdb(self.pdb_base_id, cache=True) if pdb_chain_id is None: @@ -754,9 +730,38 @@ def __init__( self.pdb_chain_id = pdb_chain_id self.pdb_id = f"{self.pdb_base_id}:{self.pdb_chain_id}" - # TODO: Remove need for unp id from init, get it from metadata - if self.unp_id not in self.pdb_meta.chain_uniprot_ids[self.pdb_chain_id]: - raise ProteinInitError(f"Uniprot ID {self.unp_id} not found in {pdb_id}") + LOGGER.info(f"{self.pdb_id}: Constructing protein record...") + + # Obtain UniProt ID for the given PDB chain + chain_unp_ids = self.pdb_meta.chain_uniprot_ids[self.pdb_chain_id] + if not chain_unp_ids: + raise ProteinInitError(f"No Uniprot ID found for chain {self.pdb_chain_id}") + if len(chain_unp_ids) > 1: + msg = f"Multiple UNP IDs for chain {self.pdb_chain_id}: {chain_unp_ids}" + if strict_pdb_unp_xref: + raise ProteinInitError(msg) + else: + LOGGER.warning(msg) + + self.unp_id = chain_unp_ids[0] + rec_unp_id = self.unp_rec.accessions[0] + if rec_unp_id != self.unp_id: + LOGGER.warning(f"Replacing outdated UNP ID: {self.unp_id} -> {rec_unp_id}") + self.unp_id = rec_unp_id + + if contact_method not in CONTACT_METHODS: + raise ValueError( + f"Unknown {contact_method=}, must be one of {CONTACT_METHODS}" + ) + + if with_altlocs and contact_method == CONTACT_METHOD_ARPEGGIO: + raise ValueError(f"Altlocs not supported with {contact_method=}") + + self.with_altlocs = with_altlocs + self.with_backbone = with_backbone + self.with_contacts = with_contacts + self.contact_radius = contact_radius + self.contact_method = contact_method self.pdb_source = pdb_source if pdb_dict: From 6b42eead1a8be3fca99a8a268fe05c95b69964ac Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:09:39 +0200 Subject: [PATCH 
16/37] pgroup: Updates for prec init --- src/pp5/pgroup.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/pp5/pgroup.py b/src/pp5/pgroup.py index 3f4602d..002f63b 100644 --- a/src/pp5/pgroup.py +++ b/src/pp5/pgroup.py @@ -190,8 +190,7 @@ def __init__( angle_aggregation="circ", compare_contacts: bool = False, strict_codons: bool = True, - strict_pdb_xref: bool = True, - strict_unp_xref: bool = False, + strict_pdb_unp_xref: bool = True, parallel: bool = True, ): """ @@ -213,7 +212,7 @@ def __init__( Where A, B are matching residues and X, Y are context residues. - :param ref_pdb_id: Reference structure PDB ID. + :param ref_pdb_id: Reference structure PDB ID with chain. :param query_pdb_ids: List of PDB IDs of query structures. :param pdb_source: Source from which to obtain the pdb file. :param match_len: Number of residues to include in a match. Can be either 1 @@ -243,10 +242,8 @@ def __init__( potential matches. :param strict_codons: Whether to require that a codon assignment for each AA exists and is un-ambiguous. - :param strict_pdb_xref: Whether to require that the given PDB ID + :param strict_pdb_unp_xref: Whether to require that the given PDB ID and chain maps uniquely to only one Uniprot ID. - :param strict_unp_xref: Whether to require that there exist a PDB - cross-ref for the given Uniprot ID. :param parallel: Whether to process query structures in parallel using the global worker process pool. """ @@ -259,7 +256,7 @@ def __init__( ) ref_pdb_dict = pdb.pdb_dict(self.ref_pdb_id, pdb_source=pdb_source) - ref_pdb_meta = pdb.PDBMetadata(self.ref_pdb_base_id) + ref_pdb_meta = pdb.PDBMetadata.from_pdb(self.ref_pdb_base_id, cache=True) if self.ref_pdb_chain not in ref_pdb_meta.chain_entities: raise ProteinInitError(f"Unknown PDB entity for {self.ref_pdb_id}") @@ -281,8 +278,7 @@ def __init__( self.prec_cache = prec_cache self.compare_contacts = compare_contacts self.strict_codons = strict_codons - self.strict_pdb_xref = strict_pdb_xref - self.strict_unp_xref = strict_unp_xref + self.strict_pdb_unp_xref = strict_pdb_unp_xref # Only one of these is relevant if pdb_source == PDB_AFLD: @@ -352,8 +348,7 @@ def sort_key(q_pdb_id: str): self.ref_pdb_id, pdb_source=self.pdb_source, cache=self.prec_cache, - strict_pdb_xref=self.strict_pdb_xref, - strict_unp_xref=self.strict_unp_xref, + strict_pdb_unp_xref=strict_pdb_unp_xref, pdb_dict=ref_pdb_dict, with_contacts=self.compare_contacts, ) @@ -766,8 +761,7 @@ def _align_query_residues_to_ref_inner( q_pdb_id, pdb_source=self.pdb_source, cache=self.prec_cache, - strict_pdb_xref=self.strict_pdb_xref, - strict_unp_xref=self.strict_unp_xref, + strict_pdb_unp_xref=self.strict_pdb_unp_xref, with_contacts=self.compare_contacts, ) except ProteinInitError as e: From 5d7ec62c557e5aa830585b6a1c5e44d1c4a0ea11 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:09:57 +0200 Subject: [PATCH 17/37] pgroup: Improved metadata handling --- src/pp5/pgroup.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/pp5/pgroup.py b/src/pp5/pgroup.py index 002f63b..2d7b8d8 100644 --- a/src/pp5/pgroup.py +++ b/src/pp5/pgroup.py @@ -640,27 +640,15 @@ def to_struct_dataframe(self) -> pd.DataFrame: { "unp_id": q_prec.unp_id, "pdb_id": q_prec.pdb_id, - "resolution": q_prec.pdb_meta.resolution, "struct_rmse": q_alignment.rmse, "n_stars": q_alignment.n_stars, "seq_len": len(q_alignment.ungapped_seq_2), # seq2 is query - "description": q_prec.pdb_meta.description, - "src_org": 
q_prec.pdb_meta.src_org, - "src_org_id": q_prec.pdb_meta.src_org_id, - "host_org": q_prec.pdb_meta.host_org, - "host_org_id": q_prec.pdb_meta.host_org_id, - "ligands": q_prec.pdb_meta.ligands, - "space_group": q_prec.pdb_meta.space_group, - "r_free": q_prec.pdb_meta.r_free, - "r_work": q_prec.pdb_meta.r_work, - "cg_ph": q_prec.pdb_meta.cg_ph, - "cg_temp": q_prec.pdb_meta.cg_temp, + "ref_group": q_prec.unp_id == self.ref_prec.unp_id, + **q_prec.pdb_meta.as_dict(chain_id=q_prec.pdb_chain_id), } ) df = pd.DataFrame(data) - df["ref_group"] = df["unp_id"] == self.ref_prec.unp_id - df = df.astype({"src_org_id": "Int32", "host_org_id": "Int32"}) df.sort_values( by=["ref_group", "unp_id", "struct_rmse"], ascending=[False, True, True], From bdf6759e0c1099998ad411e802eb8a2282fb4c58 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:30:30 +0200 Subject: [PATCH 18/37] collect: Update prec init and metadata handling --- src/pp5/collect.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 119279a..4460f51 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -1050,9 +1050,8 @@ def _collect_single_structure( if chain_id is not None: # If we got a single chain, use only that chains_to_collect = (chain_id,) - elif entity_id is not None: - entity_id = int(entity_id) + elif entity_id is not None: # If we got an entity id, discover all corresponding chains chains_to_collect = tuple( chain_id @@ -1077,9 +1076,11 @@ def _collect_single_structure( chain_data = [] for chain_id in chains_to_collect: pdb_id_full = f"{pdb_base_id}:{chain_id}" + entity_id = meta.chain_entities[chain_id] + seq_len = len(meta.entity_sequence[entity_id]) # Skip chains with no Uniprot ID - if chain_id not in chain_to_unp_ids: + if chain_id not in chain_to_unp_ids or not chain_to_unp_ids[chain_id]: LOGGER.warning(f"No Uniprot ID for {pdb_id_full}") continue @@ -1089,20 +1090,15 @@ def _collect_single_structure( continue unp_id = chain_to_unp_ids[chain_id][0] - seq_len = len(meta.entity_sequence[meta.chain_entities[chain_id]]) # Create a ProteinRecord and save it so it's cached for when we # create the pgroups. Only collect structures for which we can # create a prec (e.g. they must have a DNA sequence). 
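
As a sketch of the per-chain metadata flattening that this patch merges into
each collected record below (via as_dict with seq_to_str=True; the record keys
shown are illustrative, the exact columns are whatever PDBMetadata exposes as
properties):

    from pp5.external_dbs import pdb
    from pp5.collect import COL_PDB_ID, COL_UNP_ID

    meta = pdb.PDBMetadata.from_pdb("102L", cache=True)
    record = {
        COL_UNP_ID: "P00720",
        COL_PDB_ID: "102L:A",
        # dict-valued properties are resolved for chain A; sequence-valued
        # properties are joined into comma-separated strings
        **meta.as_dict(chain_id="A", seq_to_str=True),
    }
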
try: - nc = chain_id in string.digits prec = ProteinRecord( - unp_id, # TODO: remove unp_ids here pdb_id_full, pdb_source=pdb_source, pdb_dict=pdb_dict, - strict_unp_xref=False, - numeric_chain=nc, with_altlocs=with_altlocs, with_backbone=with_backbone, with_contacts=with_contacts, @@ -1118,8 +1114,8 @@ def _collect_single_structure( except Exception as e: LOGGER.warning( - f"Failed to create ProteinRecord for " - f"({unp_id}, {pdb_id}), will not collect: {e}" + f"Failed to create ProteinRecord for {pdb_id} ({unp_id=}), " + f"will not collect: {e}" ) continue @@ -1128,21 +1124,11 @@ def _collect_single_structure( COL_UNP_ID: prec.unp_id, COL_PDB_ID: prec.pdb_id, COL_ENA_ID: prec.ena_id, - COL_RESOLUTION: meta.resolution, COL_SEQ_LEN: seq_len, COL_SEQ_GAPS: str.join(";", [f"{s}-{e}" for (s, e) in prec.seq_gaps]), - COL_DESCRIPTION: meta.description, - COL_DEPOSITION_DATE: meta.deposition_date, - COL_SRC_ORG: meta.src_org, - COL_HOST_ORG: meta.host_org, COL_NUM_ALTLOCS: prec.num_altlocs, - COL_LIGANDS: meta.ligands, - COL_R_FREE: meta.r_free, - COL_R_WORK: meta.r_work, - COL_SPACE_GROUP: meta.space_group, - COL_CG_PH: meta.cg_ph, - COL_CG_TEMP: meta.cg_temp, COL_PDB_SOURCE: pdb_source, + **meta.as_dict(chain_id=chain_id, seq_to_str=True), } ) From 8ab585709e9c750f53cb4100e886de8cd5ca7ada Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:30:55 +0200 Subject: [PATCH 19/37] PDBMetadata: Improve dict conversion --- src/pp5/external_dbs/pdb.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index a06b1b1..73b2aec 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -657,12 +657,16 @@ def entity_uniprot_id_alignments( return map_to_unp_ids - def as_dict(self, chain_id: Optional[str] = None) -> Dict[str, Any]: + def as_dict( + self, chain_id: Optional[str] = None, seq_to_str: bool = False + ) -> Dict[str, Any]: """ Returns a dictionary containing all the metadata properties. :param chain_id: Optional chain id to filter the metadata for. If provided, only the metadata relevant to the chain will be returned. + :param seq_to_str: Whether to convert sequences to a string, joined by ','. + Useful for writing metadata. :return: A dictionary containing all the metadata properties. 
""" result_dict = { @@ -680,9 +684,9 @@ def as_dict(self, chain_id: Optional[str] = None) -> Dict[str, Any]: filtered_result_dict = {} for key, value in result_dict.items(): - new_value = None + new_value = value - # If it's a dict, take value corresponding to the chain + # If original value is a dict, take value corresponding to the chain if isinstance(value, dict): if entity_id in value: new_value = value[entity_id] @@ -691,7 +695,7 @@ def as_dict(self, chain_id: Optional[str] = None) -> Dict[str, Any]: else: continue - # If it's a sequence, drop it + # If original value is a sequence, drop it elif isinstance(value, (list, tuple)): continue @@ -699,11 +703,15 @@ def as_dict(self, chain_id: Optional[str] = None) -> Dict[str, Any]: elif value == self.pdb_id: new_value = f"{self.pdb_id}:{chain_id}" - # If it's an internal dict, drop it + # If internal value is a dict, drop it if isinstance(new_value, dict): continue - filtered_result_dict[key] = value if new_value is None else new_value + # If internal value is a sequence, maybe convert it to a string + elif isinstance(new_value, (list, tuple)) and seq_to_str: + new_value = str.join(",", new_value) + + filtered_result_dict[key] = new_value return filtered_result_dict From bd42b15644eecaf39a676d7cc631a84abf4ded8a Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:31:12 +0200 Subject: [PATCH 20/37] pgroup: Improve metadata writing --- src/pp5/pgroup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pp5/pgroup.py b/src/pp5/pgroup.py index 2d7b8d8..16fdd9a 100644 --- a/src/pp5/pgroup.py +++ b/src/pp5/pgroup.py @@ -644,7 +644,9 @@ def to_struct_dataframe(self) -> pd.DataFrame: "n_stars": q_alignment.n_stars, "seq_len": len(q_alignment.ungapped_seq_2), # seq2 is query "ref_group": q_prec.unp_id == self.ref_prec.unp_id, - **q_prec.pdb_meta.as_dict(chain_id=q_prec.pdb_chain_id), + **q_prec.pdb_meta.as_dict( + chain_id=q_prec.pdb_chain_id, seq_to_str=True + ), } ) From 4a1aafc666ebdaa1d84359171a526c517bc8cf2d Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:31:36 +0200 Subject: [PATCH 21/37] Update tests --- tests/test_pdb.py | 22 +++++++++++++++++----- tests/test_pgroup.py | 4 ++-- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/test_pdb.py b/tests/test_pdb.py index cf5948a..ca23c03 100644 --- a/tests/test_pdb.py +++ b/tests/test_pdb.py @@ -141,7 +141,7 @@ def test_entity_too_long(self): pdb.split_id(invalid_id) -@pytest.fixture(params=["1MWC:A", "2WUR:A", "4N6V:1", "1DWI:A"]) +@pytest.fixture(scope="class", params=["1MWC:A", "2WUR:A", "4N6V:1", "1DWI:A"]) def pdb_id(request): return request.param @@ -180,15 +180,27 @@ def test_exception_chimeric_chain(self): @pytest.mark.skipif(NO_INTERNET, reason="Needs internet") class TestPDBMetadata: - def test_metadata_properties(self, pdb_id): - meta = pdb.PDBMetadata(pdb_id) + @pytest.fixture(scope="class") + def metadata(self, pdb_id): + return pdb.PDBMetadata(pdb_id) + def test_metadata_properties(self, metadata, pdb_id): pdb_base_id, pdb_chain = pdb.split_id(pdb_id) - assert meta.pdb_id == pdb_base_id + assert metadata.pdb_id == pdb_base_id - d = meta.as_dict() # evaluates all metadata properties + def test_as_dict(self, metadata): + d = metadata.as_dict() # evaluates all metadata properties pprint(d) + @pytest.mark.parametrize( + "seq_to_str", [False, True], ids=["seq_to_str=False", "seq_to_str=True"] + ) + def test_as_dict_chain(self, metadata, seq_to_str): + for chain_id in metadata.chain_ids: + d = 
metadata.as_dict(chain_id=chain_id, seq_to_str=seq_to_str) + print(f" === {chain_id=} === ") + pprint(d) + @staticmethod def _check_unp(pdb_id, expected_unp_id): actual_unp_id = pdb.PDBMetadata.pdb_id_to_unp_id(pdb_id) diff --git a/tests/test_pgroup.py b/tests/test_pgroup.py index d394ac3..9b7c9ed 100644 --- a/tests/test_pgroup.py +++ b/tests/test_pgroup.py @@ -1,11 +1,11 @@ import pytest from pp5.pgroup import ProteinGroup -from pp5.external_dbs.pdb import PDB_DOWNLOAD_SOURCES +from pp5.external_dbs.pdb import PDB_AFLD, PDB_RCSB, PDB_REDO class TestFromPDBRef(object): - @pytest.mark.parametrize("pdb_source", PDB_DOWNLOAD_SOURCES.keys()) + @pytest.mark.parametrize("pdb_source", [PDB_RCSB, PDB_REDO]) @pytest.mark.parametrize("match_len", [2, 1]) def test_default(self, match_len, pdb_source): pgroup = ProteinGroup.from_pdb_ref( From 9c04ca98f85cfbb5859cd5d24614294e1bdeca6d Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:56:21 +0200 Subject: [PATCH 22/37] collect: refactor filename constants --- src/pp5/collect.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 4460f51..90ebb41 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -58,6 +58,13 @@ COL_REJECTED_BY = "rejected_by" COL_NUM_ALTLOCS = "num_altlocs" +COLLECTION_METADATA_FILENAME = "meta.json" +ALL_STRUCTS_FILENAME = "meta-structs_all" +FILTERED_STRUCTS_FILENAME = "meta-structs_filtered" +REJECTED_STRUCTS_FILENAME = "meta-structs_rejected" +BLAST_SCORES_FILENAME = "meta-blast_scores" +DATASET_DIRNAME = "data-precs" + @dataclass(repr=False) class CollectorStep: @@ -189,7 +196,7 @@ def _finalize_collection(self, pool: Pool): return # Create a metadata file in the output dir based on the step results - meta_filepath = self.out_dir.joinpath("meta.json") + meta_filepath = self.out_dir.joinpath(COLLECTION_METADATA_FILENAME) meta = self._collection_meta meta["steps"] = [str(s) for s in self._collection_steps] with open(str(meta_filepath), "w", encoding="utf-8") as f: @@ -304,11 +311,6 @@ def __repr__(self): class ProteinRecordCollector(ParallelDataCollector): DEFAULT_PREC_INIT_ARGS = dict() - ALL_STRUCTS_FILENAME = "meta-structs_all" - FILTERED_STRUCTS_FILENAME = "meta-structs_filtered" - REJECTED_STRUCTS_FILENAME = "meta-structs_rejected" - BLAST_SCORES_FILENAME = "meta-blast_scores" - DATASET_DIRNAME = "data-precs" def __init__( self, @@ -425,7 +427,7 @@ def __init__( self.entity_single_chain = entity_single_chain # Unique output dir for this collection run - self.prec_csv_out_dir = self.out_dir / self.DATASET_DIRNAME + self.prec_csv_out_dir = self.out_dir / DATASET_DIRNAME self.prec_csv_out_dir.mkdir(parents=True, exist_ok=True) def __repr__(self): @@ -470,7 +472,7 @@ def _collect_precs(self, pool: Pool): n_collected = len(df_all) self._out_filepaths.append( - _write_df_csv(df_all, self.out_dir, self.ALL_STRUCTS_FILENAME) + _write_df_csv(df_all, self.out_dir, ALL_STRUCTS_FILENAME) ) meta["n_collected"] = n_collected @@ -486,7 +488,7 @@ def _filter_collected(self, pool: Pool) -> dict: Filters collected structures according to conditions on their metadata. """ - df_all: pd.DataFrame = _read_df_csv(self.out_dir, self.ALL_STRUCTS_FILENAME) + df_all: pd.DataFrame = _read_df_csv(self.out_dir, ALL_STRUCTS_FILENAME) # A boolean series representing which structures to keep. 
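
To make the refactor concrete, a short sketch of how the new module-level
constants map onto a collection run's output directory (the run directory
name is hypothetical; the ".csv" suffix is an assumption consistent with the
collector tests added later in this series):

    from pathlib import Path
    from pp5.collect import (
        DATASET_DIRNAME,
        ALL_STRUCTS_FILENAME,
        COLLECTION_METADATA_FILENAME,
    )

    out_dir = Path("out/prec-collected/2024-02-14-tag")  # hypothetical run dir
    precs_dir = out_dir / DATASET_DIRNAME                 # data-precs/
    meta_file = out_dir / COLLECTION_METADATA_FILENAME    # meta.json
    all_structs = out_dir / f"{ALL_STRUCTS_FILENAME}.csv" # meta-structs_all.csv
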
filter_idx = pd.Series(data=[True] * len(df_all), index=df_all.index) rejected_counts = {"total": 0} @@ -516,7 +518,7 @@ def _update_rejected_counts(filter_name: str, idx: pd.Series): # Write the filtered structures df_filtered = df_all[filter_idx] self._out_filepaths.append( - _write_df_csv(df_filtered, self.out_dir, self.FILTERED_STRUCTS_FILENAME) + _write_df_csv(df_filtered, self.out_dir, FILTERED_STRUCTS_FILENAME) ) # Write the rejected structures and specify which filter rejected them @@ -526,7 +528,7 @@ def _update_rejected_counts(filter_name: str, idx: pd.Series): df_rejected.loc[rejected_idx, COL_REJECTED_BY] = filter_name df_rejected = df_rejected[~filter_idx] self._out_filepaths.append( - _write_df_csv(df_rejected, self.out_dir, self.REJECTED_STRUCTS_FILENAME) + _write_df_csv(df_rejected, self.out_dir, REJECTED_STRUCTS_FILENAME) ) return { @@ -582,7 +584,7 @@ def _filter_redundant_unps(self, pool: Pool, df_all: pd.DataFrame) -> pd.Series: ) self._out_filepaths.append( _write_df_csv( - df_blast_scores, self.out_dir, self.BLAST_SCORES_FILENAME, index=True + df_blast_scores, self.out_dir, BLAST_SCORES_FILENAME, index=True ) ) From d005cd7b015ecbf3a0a566fc5eddafc38cc569a1 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:56:49 +0200 Subject: [PATCH 23/37] ResidueContacts: Add comparison operators --- src/pp5/contacts.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/pp5/contacts.py b/src/pp5/contacts.py index 92889b5..5749787 100644 --- a/src/pp5/contacts.py +++ b/src/pp5/contacts.py @@ -366,6 +366,14 @@ def _join(s): return d + def __eq__(self, other): + if not isinstance(other, ResidueContacts): + return False + return self.as_dict() == other.as_dict() + + def __hash__(self): + return hash(tuple(self.as_dict().values())) + class Arpeggio(object): """ From d1517af1fd82661a33d163281b08eb0bce28d375 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:57:23 +0200 Subject: [PATCH 24/37] prec: fix comparison operators --- src/pp5/prec.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 6533366..b88980c 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -201,16 +201,33 @@ def __eq__(self, other): return True if not isinstance(other, ResidueRecord): return False + + def _compare(a, b): + eq = True + if isinstance(a, (float, np.ndarray)): + eq = np.allclose(a, b, equal_nan=True) + + elif isinstance(a, dict): + for key, val in a.items(): + # to handle dict that contains ndarrays + eq = _compare(val, b.get(key)) + if not eq: + break + else: + eq = a == b + + return eq + for k, v in self.__dict__.items(): other_v = other.__dict__.get(k, math.inf) - if isinstance(v, (float, np.ndarray)): - equal = np.allclose(v, other_v, equal_nan=True) - else: - equal = v == other_v + equal = _compare(v, other_v) if not equal: return False return True + def __hash__(self): + return hash(tuple(self.as_dict().items())) + class AltlocNameMap(dict): """ From caf468f881329fb3d498279b5334b273c33f13a6 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:57:43 +0200 Subject: [PATCH 25/37] prec: bugfix in from_cache --- src/pp5/prec.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index b88980c..97dd1c8 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -610,7 +610,9 @@ def from_pdb( pdb_id = f"{pdb_base_id}:{chain_id}" if cache and chain_id: - prec = cls.from_cache(pdb_id, 
cache_dir=cache_dir) + prec = cls.from_cache( + pdb_id, cache_dir=cache_dir, pdb_source=pdb_source + ) if prec is not None: return prec @@ -640,6 +642,7 @@ def from_unp( cls, unp_id: str, cache=False, + pdb_source: str = PDB_RCSB, cache_dir=pp5.PREC_DIR, xref_selector: Callable[[unp.UNPPDBXRef], Any] = None, **kw_for_init, @@ -651,6 +654,7 @@ def from_unp( :param xref_selector: Sort key for PDB cross refs. If None, resolution will be used. :param cache: Whether to load prec from cache if available. + :param pdb_source: Source from which to obtain the pdb file. :param cache_dir: Where the cache dir is. ProteinRecords will be written to this folder after creation, unless it's None. :param kw_for_init: Extra args for the ProteinRecord initializer. @@ -665,7 +669,9 @@ def from_unp( pdb_id = f"{xrefs[0].pdb_id}:{xrefs[0].chain_id}" if cache: - prec = cls.from_cache(pdb_id, cache_dir=cache_dir) + prec = cls.from_cache( + pdb_id, cache_dir=cache_dir, pdb_source=pdb_source + ) if prec is not None: return prec From 2da8957d1b854bbbfbd96d41cc804a69432d7b29 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:57:57 +0200 Subject: [PATCH 26/37] prec: update tests --- tests/test_prec.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/tests/test_prec.py b/tests/test_prec.py index 55b0468..668d73a 100644 --- a/tests/test_prec.py +++ b/tests/test_prec.py @@ -226,14 +226,14 @@ def test_invalid_pdbid(self): def test_multiple_unp_ids_for_same_pdb_chain_no_strict_pdb_xref(self): prec = ProteinRecord.from_pdb( "3SG4:A", - strict_pdb_xref=False, + strict_pdb_unp_xref=False, ) assert prec.unp_id == "P42212" assert prec.pdb_id == "3SG4:A" prec = ProteinRecord.from_pdb( "3SG4", - strict_pdb_xref=False, + strict_pdb_unp_xref=False, ) assert prec.unp_id == "P42212" assert prec.pdb_id == "3SG4:A" @@ -250,27 +250,26 @@ class TestInit: def test_init_no_chain(self): unp_id = "P00720" pdb_id = "102L" - prec = ProteinRecord(unp_id, pdb_id) - assert prec.unp_id == "P00720" + prec = ProteinRecord(pdb_id) + assert prec.unp_id == unp_id assert prec.pdb_id == f"{pdb_id}:A" + def test_init_no_chain_ambiguous(self): + pdb_id = "4HHB" + with pytest.raises(ProteinInitError, match="multiple chains"): + _ = ProteinRecord(pdb_id) + def test_init_with_chain(self): unp_id = "P00720" pdb_id = "102L:A" - prec = ProteinRecord(unp_id, pdb_id) - assert prec.unp_id == "P00720" + prec = ProteinRecord(pdb_id) + assert prec.unp_id == unp_id assert prec.pdb_id == pdb_id - def test_init_with_mismatching_pdb_id(self): - with pytest.raises(ProteinInitError): - ProteinRecord("P00720", "2WUR:A") - - with pytest.raises(ProteinInitError): - ProteinRecord("P00720", "4GY3") - - def test_no_matching_xref_in_pdb(self): - with pytest.raises(ProteinInitError): - ProteinRecord("P42212", "2QLE:A") + def test_init_with_no_pdb_id(self): + for invalid_pdb_id in ["", None]: + with pytest.raises(ProteinInitError, match="provide PDB ID"): + ProteinRecord(pdb_id=invalid_pdb_id) class TestSave: @@ -296,15 +295,13 @@ def cache_dir(self): @pytest.mark.parametrize("pdb_id", ["1MWC:A", "4N6V:1"]) @pytest.mark.parametrize("pdb_source", tuple(PDB_DOWNLOAD_SOURCES)) - def test_from_pdb_with_cache(self, pdb_id, pdb_source, cache_dir, with_altlocs): + def test_from_pdb_with_cache(self, pdb_id, pdb_source, cache_dir): cache_dir = cache_dir / f"{pdb_source}" prec = ProteinRecord.from_pdb( pdb_id, pdb_source=pdb_source, cache=True, cache_dir=cache_dir, - strict_unp_xref=False, - with_altlocs=with_altlocs, 
) filename = f"{prec.pdb_id.replace(':', '_')}-{pdb_source}.prec" From 22f7cefa46e15764bdd54d5181220e518ebb115b Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:58:16 +0200 Subject: [PATCH 27/37] collect: basic tests for prec collector --- tests/test_collect.py | 71 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 tests/test_collect.py diff --git a/tests/test_collect.py b/tests/test_collect.py new file mode 100644 index 0000000..526376a --- /dev/null +++ b/tests/test_collect.py @@ -0,0 +1,71 @@ +from random import randint +from pathlib import Path + +import pytest + +import pp5 +from tests import get_tmp_path +from pp5.collect import ( + DATASET_DIRNAME, + ALL_STRUCTS_FILENAME, + COLLECTION_METADATA_FILENAME, + ProteinRecordCollector, +) + + +class TestPrecCollector(object): + @pytest.fixture(scope="class") + def collection_nproc(self): + return 4 + + @pytest.fixture(scope="class") + def collection_out_dir(self): + return get_tmp_path("prec-collected-tests") + + @pytest.fixture(scope="class") + def collection_out_tag(self): + return f"tag-{randint(0, 1000)}" + + @pytest.fixture(scope="class") + def collection_result( + self, collection_nproc, collection_out_dir, collection_out_tag + ): + pp5.set_config("MAX_PROCESSES", collection_nproc) + + collector = ProteinRecordCollector( + resolution=0.75, + with_altlocs=True, + with_contacts=True, + with_backbone=True, + entity_single_chain=False, + seq_similarity_thresh=1.0, + write_zip=True, + out_dir=collection_out_dir, + out_tag=collection_out_tag, + ) + + return collector.collect() + + def test_collection_result( + self, collection_result, collection_out_dir, collection_out_tag + ): + assert collection_result["n_collected"] > 10 + assert collection_result["n_query_results"] > 10 + assert collection_result["n_entries"] > 2000 + assert collection_result["out_tag"] == collection_out_tag + for step_result in collection_result["steps"]: + assert "SUCCESS" in step_result + + out_dir = Path(collection_result["out_dir"]) + assert out_dir.is_dir() + assert out_dir.is_relative_to(collection_out_dir) + + assert (out_dir / DATASET_DIRNAME).is_dir() + assert (out_dir / COLLECTION_METADATA_FILENAME).is_file() + assert (out_dir / f"{ALL_STRUCTS_FILENAME}.csv").is_file() + + collection_id = out_dir.name + assert (out_dir / f"{collection_id}.zip").is_file() + + csv_files = tuple((out_dir / DATASET_DIRNAME).glob("*.csv")) + assert collection_result["n_collected_filtered"] == len(csv_files) From 4340aee1075b387ad2a93be79d5cf8a8478edd77 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 06:10:13 +0200 Subject: [PATCH 28/37] pdb_api: fix a test --- tests/test_pdb_api.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_pdb_api.py b/tests/test_pdb_api.py index 58b4d90..30a67bc 100644 --- a/tests/test_pdb_api.py +++ b/tests/test_pdb_api.py @@ -151,9 +151,14 @@ def test_search_structure_name(self): query_value=pdb_base_id, return_type=pdb_api.PDBQuery.ReturnType.CHAIN ) results = query.execute() + + # Some additional related structures will be returned + assert len(results) >= 4 + filtered_results = [r for r in results if r.startswith(pdb_base_id)] + # This structure has 4 chains - assert len(results) == 4 - for result in results: + assert len(filtered_results) == 4 + for result in filtered_results: pdb_id, chain_id, entity_id = split_id_with_entity(result) assert pdb_id == pdb_base_id assert chain_id From 6c89f1889400163023293c544396423dcf54d61e 
Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 06:11:04 +0200 Subject: [PATCH 29/37] pyproject: configure pytest traceback length --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3a48e92..5568b99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,9 @@ addopts = [ # Show durations of slowest tests. "--durations=10", # Force colored output even on CI - "--color=yes" + "--color=yes", + # Traceback verbosity + "--tb=short" ] testpaths = [ From 8033d151c9b9eabd97c6b7caf39c60bfec343d36 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 05:52:29 +0200 Subject: [PATCH 30/37] JSONCacheableMixin: Update to automatically generate filenames --- src/pp5/utils.py | 105 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 7 deletions(-) diff --git a/src/pp5/utils.py b/src/pp5/utils.py index 38eac76..3b9525c 100644 --- a/src/pp5/utils.py +++ b/src/pp5/utils.py @@ -2,11 +2,14 @@ import sys import gzip import json +import pickle import random +import hashlib import logging import contextlib +from abc import abstractmethod from json import JSONEncoder -from typing import Any, Union, Callable +from typing import Any, Dict, Union, Callable, Optional from pathlib import Path from datetime import datetime, timedelta from collections.abc import Set, Mapping, Sequence @@ -310,6 +313,28 @@ def sort_key(kv: tuple): return {k: v for k, v in sorted(d.items(), key=sort_key)} +def stable_hash(obj: Any, hash_len: int = 8) -> str: + """ + Generates a stable hash for general python objects, as a hexadecimal string. Stable + means that the exact-same input will produce exactly the same output, even across + machines and processes. The provided object must be pickleable. + + :param obj: A python object. Must be pickle-able. + :param hash_len: Desired length of hash string. + :return: A string of the requested length comprised of hexadecimal digits, + representing a number which is the hash value. + """ + if hash_len < 2: + raise ValueError(f"Invalid {hash_len=}, must be > 1") + + def _hash(bytelike: bytes) -> str: + return hashlib.blake2b(bytelike, digest_size=hash_len // 2).hexdigest() + + obj_bytes: bytes = pickle.dumps(obj) + + return _hash(obj_bytes) + + class JSONCacheableMixin(object): """ Makes a class cacheable to JSON. @@ -321,8 +346,48 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) + @classmethod + @abstractmethod + def cache_dir(cls) -> Path: + """ + :return: The directory to which files will be cached. + """ + pass + + @abstractmethod + def cache_attribs(self) -> Dict[str, Any]: + """ + :return: The attributes which determine the cache filename. + """ + pass + + @classmethod + def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: + """ + Generates the prefix of the cache filename. + :param cache_attribs: Attributes which determine the cache filename. + :return: The prefix of the cache filename. + """ + return cls.__name__.lower() + + @classmethod + def _cache_filename(cls, cache_attribs: Dict[str, Any]) -> str: + """ + Generates the cache filename. + :param cache_attribs: The attributes which determine the cache filename. + :return: The cache filename. 
+ """ + return ( + f"{cls._cache_filename_prefix(cache_attribs=cache_attribs)}" + "-" + f"{stable_hash(sort_dict(cache_attribs,by_value=False))}.json" + ) + def to_cache( - self, cache_dir: Union[str, Path], filename: Union[str, Path], **json_kws + self, + cache_dir: Optional[Union[str, Path]] = None, + filename: Optional[Union[str, Path]] = None, + **json_kws, ) -> Path: """ Write the object to a human-readable text file (json) which @@ -331,24 +396,50 @@ def to_cache( :param filename: Cached file name (without directory). :return: The path of the written file. """ + if cache_dir is None: + cache_dir = self.cache_dir() + if filename is None: + filename = self._cache_filename(self.cache_attribs()) + filepath = pp5.get_resource_path(cache_dir, filename) os.makedirs(str(filepath.parent), exist_ok=True) with filelock_context(filepath): with open(str(filepath), "w", encoding="utf-8") as f: - json.dump(self.__getstate__(), f, **json_kws) - - LOGGER.info(f"Wrote {self} to {filepath}") + json.dump(self.__getstate__(), f, indent=2, **json_kws) + + file_size = os.path.getsize(filepath) + file_size_str = ( + f"{file_size / 1024:.1f}kB" + if file_size < 1024 * 1024 + else f"{file_size / 1024 / 1024:.1f}MB" + ) + LOGGER.info(f"Wrote cache file: {filepath} ({file_size_str})") return filepath @classmethod - def from_cache(cls, cache_dir: Union[str, Path], filename: Union[str, Path]): + def from_cache( + cls, + cache_dir: Optional[Union[str, Path]] = None, + cache_attribs: Optional[Dict[str, Any]] = None, + filename: Optional[Union[str, Path]] = None, + ): """ Load the object from a cached file. :param cache_dir: Directory of cached file. - :param filename: Cached file name (without directory). + :param cache_attribs: Attributes which determine the cache filename. + :param filename: Cached filename (without directory). Won't be used if + cache_attribs is given. :return: The loaded object, or None if the file doesn't exist. 
""" + if not (cache_attribs or filename): + raise ValueError("cache_attribs or filename must be given") + + if cache_dir is None: + cache_dir = cls.cache_dir() + + if filename is None: + filename = cls._cache_filename(cache_attribs) filepath = pp5.get_resource_path(cache_dir, filename) From b9f3aaa97049dd95f1f460557bc16f80c64d54af Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 05:53:56 +0200 Subject: [PATCH 31/37] PDBMetadata: Implement caching --- src/pp5/__init__.py | 6 +++--- src/pp5/external_dbs/pdb.py | 32 +++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/pp5/__init__.py b/src/pp5/__init__.py index e79ea57..b6b8a11 100644 --- a/src/pp5/__init__.py +++ b/src/pp5/__init__.py @@ -27,7 +27,7 @@ ENV_PP5_UNP_DIR = "UNP_DIR" ENV_PP5_ENA_DIR = "ENA_DIR" ENV_PP5_PREC_DIR = "PREC_DIR" -ENV_PP5_PDB2UNP_DIR = "PDB2UNP_DIR" +ENV_PP5_PDB_METADATA_DIR = "PDB_METADATA_DIR" ENV_PP5_ALIGNMENT_DIR = "ALIGNMENT_DIR" ENV_PP5_BLASTDB_DIR = "BLASTDB_DIR" @@ -137,8 +137,8 @@ def set_config(key: str, value: Any): # Directory for ProteinRecords PREC_DIR = Path(os.getenv(ENV_PP5_PREC_DIR, data_subdir("prec"))) -# Directory for PDB to UNP mappings -PDB2UNP_DIR = Path(os.getenv(ENV_PP5_PDB2UNP_DIR, data_subdir("pdb2unp"))) +# Directory for PDB metadata +PDB_METADATA_DIR = Path(os.getenv(ENV_PP5_PDB_METADATA_DIR, data_subdir("pdb_meta"))) # Directory for Structural Alignments ALIGNMENT_DIR = Path(os.getenv(ENV_PP5_ALIGNMENT_DIR, data_subdir("align"))) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 73b2aec..291c1f3 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -21,7 +21,7 @@ from Bio.PDB.Polypeptide import standard_aa_names from Bio.PDB.PDBExceptions import PDBConstructionWarning, PDBConstructionException -from pp5 import PDB_DIR, get_resource_path +from pp5 import PDB_DIR, PDB_METADATA_DIR, get_resource_path from pp5.utils import JSONCacheableMixin, remote_dl from pp5.external_dbs import pdb_api @@ -310,6 +310,24 @@ def _resolve( return meta + @classmethod + def cache_dir(cls) -> Path: + return PDB_METADATA_DIR + + @classmethod + def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: + pdb_id = cache_attribs["pdb_id"] + return f"{super()._cache_filename_prefix(cache_attribs)}-{pdb_id}" + + def cache_attribs(self) -> Dict[str, Any]: + return {"pdb_id": self.pdb_id} + + def __eq__(self, other): + return self.pdb_id == other.pdb_id + + def __hash__(self): + return hash(self.pdb_id) + @property def pdb_id(self) -> str: return self._pdb_id @@ -728,14 +746,14 @@ def from_pdb(cls, pdb_id: str, cache=False) -> PDBMetadata: """ pdb_base_id, _ = split_id(pdb_id) - # TODO: Implement caching - # if cache: - # pdb_meta = cls.from_cache(pdb_base_id) - # if pdb_meta is not None: - # return pdb_meta + if cache: + pdb_meta = cls.from_cache(cache_attribs={"pdb_id": pdb_base_id}) + if pdb_meta is not None: + return pdb_meta pdb_meta = cls(pdb_id) - # pdb_meta.save() + if cache: + pdb_meta.to_cache() return pdb_meta @classmethod From aad46a1f52f750f0c91f859cb7460d3c4aeb2877 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 05:56:23 +0200 Subject: [PATCH 32/37] StructuralAlignment: Update caching --- src/pp5/align.py | 81 ++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 47 deletions(-) diff --git a/src/pp5/align.py b/src/pp5/align.py index aad2910..c11fb34 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -12,7 +12,7 @@ import 
warnings import contextlib import subprocess -from typing import Tuple, Union, Iterable, Optional +from typing import Any, Dict, Tuple, Union, Iterable, Optional from pathlib import Path from datetime import datetime, timedelta @@ -213,36 +213,6 @@ def ungapped_seq_2(self): """ return self.ungap(self.aligned_seq_2) - def save(self, out_dir=pp5.ALIGNMENT_DIR) -> Path: - """ - Write the alignment to a human-readable text file (json) which - can also be loaded later using from_cache. - :param out_dir: Output directory. - :return: The path of the written file. - """ - filename = self._cache_filename( - self.pdb_id_1, - self.pdb_id_2, - self.pdb_source, - self.outlier_rejection_cutoff, - self.backbone_only, - ) - return self.to_cache(out_dir, filename, indent=2) - - @staticmethod - def _cache_filename( - pdb_id_1: str, - pdb_id_2: str, - pdb_source: str, - outlier_rejection_cutoff: float, - backbone_only, - ) -> str: - pdb_ids = f"{pdb_id_1}-{pdb_id_2}".replace(":", "_").upper() - config = f"cutoff={int(outlier_rejection_cutoff*10)}_bb={backbone_only}" - basename = f"{pdb_ids}_{config}" - filename = f"{basename}-{pdb_source}.json" - return filename - @staticmethod def ungap(seq: str) -> str: """ @@ -269,33 +239,50 @@ def __eq__(self, other): return self.__dict__ == other.__dict__ @classmethod - def from_cache( - cls, - pdb_id_1: str, - pdb_id_2: str, - pdb_source: str = PDB_RCSB, - cache_dir: Union[str, Path] = pp5.ALIGNMENT_DIR, - **kw_for_init, - ) -> Optional[StructuralAlignment]: - filename = cls._cache_filename(pdb_id_1, pdb_id_2, pdb_source, **kw_for_init) - return super(StructuralAlignment, cls).from_cache(cache_dir, filename) + def cache_dir(cls) -> Path: + return pp5.ALIGNMENT_DIR + + def cache_attribs(self) -> Dict[str, Any]: + return dict( + pdb_id_1=self.pdb_id_1, + pdb_id_2=self.pdb_id_2, + pdb_source=self.pdb_source, + outlier_rejection_cutoff=self.outlier_rejection_cutoff, + backbone_only=self.backbone_only, + ) + + @classmethod + def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: + pdb_id_1 = cache_attribs["pdb_id_1"] + pdb_id_2 = cache_attribs["pdb_id_2"] + pdb_ids = f"{pdb_id_1}-{pdb_id_2}".replace(":", "_").upper() + return f"{super()._cache_filename_prefix(cache_attribs)}-{pdb_ids}" @classmethod def from_pdb( cls, - pdb_id1: str, - pdb_id2: str, + pdb_id_1: str, + pdb_id_2: str, pdb_source: str = PDB_RCSB, + outlier_rejection_cutoff: float = 2.0, + backbone_only=False, cache=False, - **kw_for_init, ): + kws = dict( + pdb_id_1=pdb_id_1, + pdb_id_2=pdb_id_2, + pdb_source=pdb_source, + outlier_rejection_cutoff=outlier_rejection_cutoff, + backbone_only=backbone_only, + ) if cache: - sa = cls.from_cache(pdb_id1, pdb_id2, pdb_source, **kw_for_init) + sa = cls.from_cache(cache_attribs=kws) if sa is not None: return sa - sa = cls(pdb_id1, pdb_id2, pdb_source, **kw_for_init) - sa.save() + sa = cls(**kws) + if cache: + sa.to_cache() return sa From 4fb69cf2f5ca4149c4a26bf074fa8c00dacf69e7 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 05:56:48 +0200 Subject: [PATCH 33/37] Update tests --- tests/__init__.py | 4 ++-- tests/test_align.py | 27 ++++++++------------------- tests/test_pdb.py | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index f69ed54..dceb1a8 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -5,8 +5,8 @@ from pp5 import ( ENV_PP5_DATA_DIR, ENV_PP5_PREC_DIR, - ENV_PP5_PDB2UNP_DIR, ENV_PP5_ALIGNMENT_DIR, + ENV_PP5_PDB_METADATA_DIR, ) TEST_RESOURCES_PATH 
= pathlib.Path(os.path.dirname(__file__)).joinpath("resources") @@ -37,7 +37,7 @@ def get_tmp_path(name: str, clear=True): # In the tests, we dont want to save data files generated by own code os.environ[ENV_PP5_PREC_DIR] = str(get_tmp_path("data/prec")) -os.environ[ENV_PP5_PDB2UNP_DIR] = str(get_tmp_path("data/pdb2unp")) +os.environ[ENV_PP5_PDB_METADATA_DIR] = str(get_tmp_path("data/pdb_meta")) os.environ[ENV_PP5_ALIGNMENT_DIR] = str(get_tmp_path("data/align")) # Remove imported pp5 so that its init runs again and updates the paths using the diff --git a/tests/test_align.py b/tests/test_align.py index 56720fd..a3bb1c4 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -47,34 +47,23 @@ def test_outlier_rejection_cutoff_example(self): def test_cache(self, backbone_only, outlier_rejection_cutoff, pdb_source): pdb1, pdb2 = "4NE4:A", "5TEU:A" - # Should not exist in cache - sa_cached = StructuralAlignment.from_cache( - pdb1, - pdb2, + kws = dict( + pdb_id_1=pdb1, + pdb_id_2=pdb2, pdb_source=pdb_source, backbone_only=backbone_only, outlier_rejection_cutoff=outlier_rejection_cutoff, ) + + # Should not exist in cache + sa_cached = StructuralAlignment.from_cache(cache_attribs=kws) assert sa_cached is None # Should be created and saved to cache - sa = StructuralAlignment.from_pdb( - pdb1, - pdb2, - cache=True, - pdb_source=pdb_source, - backbone_only=backbone_only, - outlier_rejection_cutoff=outlier_rejection_cutoff, - ) + sa = StructuralAlignment.from_pdb(**kws, cache=True) # Should exist in cache - sa_cached = StructuralAlignment.from_cache( - pdb1, - pdb2, - pdb_source=pdb_source, - backbone_only=backbone_only, - outlier_rejection_cutoff=outlier_rejection_cutoff, - ) + sa_cached = StructuralAlignment.from_cache(cache_attribs=kws) assert sa_cached is not None # Cached version should be the same diff --git a/tests/test_pdb.py b/tests/test_pdb.py index ca23c03..5c51bb6 100644 --- a/tests/test_pdb.py +++ b/tests/test_pdb.py @@ -192,6 +192,21 @@ def test_as_dict(self, metadata): d = metadata.as_dict() # evaluates all metadata properties pprint(d) + def test_cache(self, metadata): + path = metadata.to_cache() + cache_attrs = metadata.cache_attribs() + assert path.exists() + assert path.is_file() + metadata_ = pdb.PDBMetadata.from_cache(cache_attribs=cache_attrs) + + assert metadata == metadata_ + + @pytest.mark.parametrize("cache", [True, False], ids=["cache=True", "cache=False"]) + def test_from_pdb(self, pdb_id, cache): + pdb_base_id, chain_id = pdb.split_id(pdb_id) + metadata = pdb.PDBMetadata.from_pdb(pdb_id, cache=cache) + assert metadata.pdb_id == pdb_base_id + @pytest.mark.parametrize( "seq_to_str", [False, True], ids=["seq_to_str=False", "seq_to_str=True"] ) From 1fbed95caa5e17f97aab49a2a2d00e4348800a74 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 09:45:06 +0200 Subject: [PATCH 34/37] Refactor caching --- src/pp5/align.py | 11 ++- src/pp5/cache.py | 170 ++++++++++++++++++++++++++++++++++++ src/pp5/collect.py | 3 +- src/pp5/external_dbs/pdb.py | 11 ++- src/pp5/utils.py | 144 +----------------------------- 5 files changed, 183 insertions(+), 156 deletions(-) create mode 100644 src/pp5/cache.py diff --git a/src/pp5/align.py b/src/pp5/align.py index c11fb34..a58eb40 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -32,7 +32,8 @@ from Bio.Align.Applications import ClustalOmegaCommandline import pp5 -from pp5.utils import JSONCacheableMixin, out_redirected +from pp5.cache import Cacheable, CacheSettings +from pp5.utils import out_redirected from 
pp5.external_dbs import pdb # Suppress messages from pymol upon import @@ -151,11 +152,13 @@ def multiseq_align( return msa_result -class StructuralAlignment(JSONCacheableMixin, object): +class StructuralAlignment(Cacheable, object): """ Represents a Structural Alignment between two protein structures. """ + _CACHE_SETTINGS = CacheSettings(cache_dir=pp5.ALIGNMENT_DIR) + def __init__( self, pdb_id_1: str, @@ -238,10 +241,6 @@ def __eq__(self, other): return False return self.__dict__ == other.__dict__ - @classmethod - def cache_dir(cls) -> Path: - return pp5.ALIGNMENT_DIR - def cache_attribs(self) -> Dict[str, Any]: return dict( pdb_id_1=self.pdb_id_1, diff --git a/src/pp5/cache.py b/src/pp5/cache.py new file mode 100644 index 0000000..f49639a --- /dev/null +++ b/src/pp5/cache.py @@ -0,0 +1,170 @@ +import os +import json +import logging +from abc import abstractmethod +from json import JSONEncoder +from typing import Any, Dict, Union, Optional +from pathlib import Path +from dataclasses import dataclass + +import pp5 +from pp5.utils import sort_dict, stable_hash, filelock_context + +CACHE_FORMAT_JSON = "json" +CACHE_FORMAT_PICKLE = "pkl" +CACHE_FORMATS = {CACHE_FORMAT_JSON, CACHE_FORMAT_PICKLE} + + +LOGGER = logging.getLogger(__name__) + + +@dataclass +class CacheSettings: + """ + Settings for caching objects to file. + """ + + cache_dir: Path + cache_format: str = CACHE_FORMAT_JSON + cache_compression: bool = False + + def __post_init__(self): + if self.cache_format not in CACHE_FORMATS: + raise ValueError(f"Invalid {self.cache_format=}") + + def __str__(self): + return f"{self.cache_format}{'-compressed' if self.cache_compression else ''}" + + +class Cacheable(object): + """ + Makes a class cacheable to file. + """ + + # Subclasses may override this with the desired settings. + _CACHE_SETTINGS = CacheSettings(cache_dir=pp5.data_subdir("cache")) + + def __getstate__(self): + return self.__dict__.copy() + + def __setstate__(self, state): + self.__dict__.update(state) + + @abstractmethod + def cache_attribs(self) -> Dict[str, Any]: + """ + :return: The attributes which determine the cache filename. + """ + pass + + @classmethod + def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: + """ + Generates the prefix of the cache filename. + :param cache_attribs: Attributes which determine the cache filename. + :return: The prefix of the cache filename. + """ + return cls.__name__.lower() + + @classmethod + def _cache_filename(cls, cache_attribs: Dict[str, Any]) -> str: + """ + Generates the cache filename. + :param cache_attribs: The attributes which determine the cache filename. + :return: The cache filename. + """ + return ( + f"{cls._cache_filename_prefix(cache_attribs=cache_attribs)}" + "-" + f"{stable_hash(sort_dict(cache_attribs,by_value=False))}.json" + ) + + def to_cache( + self, + cache_dir: Optional[Union[str, Path]] = None, + filename: Optional[Union[str, Path]] = None, + **json_kws, + ) -> Path: + """ + Write the object to a human-readable text file (json) which + can also be loaded later using from_cache. + :param cache_dir: Directory of cached files. + :param filename: Cached file name (without directory). + :return: The path of the written file. 
+ """ + if cache_dir is None: + cache_dir = self._CACHE_SETTINGS.cache_dir + if filename is None: + filename = self._cache_filename(self.cache_attribs()) + + filepath = pp5.get_resource_path(cache_dir, filename) + os.makedirs(str(filepath.parent), exist_ok=True) + + with filelock_context(filepath): + with open(str(filepath), "w", encoding="utf-8") as f: + json.dump(self.__getstate__(), f, indent=2, **json_kws) + + file_size = os.path.getsize(filepath) + file_size_str = ( + f"{file_size / 1024:.1f}kB" + if file_size < 1024 * 1024 + else f"{file_size / 1024 / 1024:.1f}MB" + ) + LOGGER.info(f"Wrote cache file: {filepath} ({file_size_str})") + return filepath + + @classmethod + def from_cache( + cls, + cache_dir: Optional[Union[str, Path]] = None, + cache_attribs: Optional[Dict[str, Any]] = None, + filename: Optional[Union[str, Path]] = None, + ): + """ + Load the object from a cached file. + :param cache_dir: Directory of cached file. + :param cache_attribs: Attributes which determine the cache filename. + :param filename: Cached filename (without directory). Won't be used if + cache_attribs is given. + :return: The loaded object, or None if the file doesn't exist. + """ + if not (cache_attribs or filename): + raise ValueError("cache_attribs or filename must be given") + + if cache_dir is None: + cache_dir = cls._CACHE_SETTINGS.cache_dir + + if filename is None: + filename = cls._cache_filename(cache_attribs) + + filepath = pp5.get_resource_path(cache_dir, filename) + + obj = None + + with filelock_context(filepath): + if filepath.is_file(): + try: + with open(str(filepath), "r", encoding="utf-8") as f: + state_dict = json.load(f) + obj = cls.__new__(cls) + obj.__setstate__(state_dict) + except Exception as e: + LOGGER.warning( + f"Failed to load cached {cls.__name__} {filepath} {e}" + ) + return obj + + +class ReprJSONEncoder(JSONEncoder): + """ + A JSONEncoder that converts an object to it's representation string in + case it's not serializable. + """ + + def default(self, o: Any) -> Any: + try: + return repr(o) + except Exception as e: + pass + # Let the base class default method raise the TypeError + return JSONEncoder.default(self, o) diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 90ebb41..942d4ed 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -24,7 +24,8 @@ import pp5.parallel from pp5.prec import ProteinRecord from pp5.align import ProteinBLAST -from pp5.utils import ReprJSONEncoder, ProteinInitError, elapsed_seconds_to_dhms +from pp5.cache import ReprJSONEncoder +from pp5.utils import ProteinInitError, elapsed_seconds_to_dhms from pp5.pgroup import ProteinGroup from pp5.external_dbs import pdb, unp, pdb_api from pp5.external_dbs.pdb import PDB_RCSB diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 291c1f3..9bd1fc6 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -22,7 +22,8 @@ from Bio.PDB.PDBExceptions import PDBConstructionWarning, PDBConstructionException from pp5 import PDB_DIR, PDB_METADATA_DIR, get_resource_path -from pp5.utils import JSONCacheableMixin, remote_dl +from pp5.cache import Cacheable, CacheSettings +from pp5.utils import remote_dl from pp5.external_dbs import pdb_api PDB_ID_PATTERN = re.compile( @@ -253,11 +254,13 @@ def pdb_to_secondary_structure( _TC = TypeVar("_TC") -class PDBMetadata(JSONCacheableMixin): +class PDBMetadata(Cacheable): """ Obtains and parses metadata from a PDB structure using PDB REST API. 
""" + _CACHE_SETTINGS = CacheSettings(cache_dir=PDB_METADATA_DIR) + def __init__(self, pdb_id: str): """ :param pdb_id: The PDB ID of the structure. No chain. @@ -310,10 +313,6 @@ def _resolve( return meta - @classmethod - def cache_dir(cls) -> Path: - return PDB_METADATA_DIR - @classmethod def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: pdb_id = cache_attribs["pdb_id"] diff --git a/src/pp5/utils.py b/src/pp5/utils.py index 3b9525c..93073bd 100644 --- a/src/pp5/utils.py +++ b/src/pp5/utils.py @@ -1,15 +1,12 @@ import os import sys import gzip -import json import pickle import random import hashlib import logging import contextlib -from abc import abstractmethod -from json import JSONEncoder -from typing import Any, Dict, Union, Callable, Optional +from typing import Any, Union, Callable from pathlib import Path from datetime import datetime, timedelta from collections.abc import Set, Mapping, Sequence @@ -335,144 +332,5 @@ def _hash(bytelike: bytes) -> str: return _hash(obj_bytes) -class JSONCacheableMixin(object): - """ - Makes a class cacheable to JSON. - """ - - def __getstate__(self): - return self.__dict__.copy() - - def __setstate__(self, state): - self.__dict__.update(state) - - @classmethod - @abstractmethod - def cache_dir(cls) -> Path: - """ - :return: The directory to which files will be cached. - """ - pass - - @abstractmethod - def cache_attribs(self) -> Dict[str, Any]: - """ - :return: The attributes which determine the cache filename. - """ - pass - - @classmethod - def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: - """ - Generates the prefix of the cache filename. - :param cache_attribs: Attributes which determine the cache filename. - :return: The prefix of the cache filename. - """ - return cls.__name__.lower() - - @classmethod - def _cache_filename(cls, cache_attribs: Dict[str, Any]) -> str: - """ - Generates the cache filename. - :param cache_attribs: The attributes which determine the cache filename. - :return: The cache filename. - """ - return ( - f"{cls._cache_filename_prefix(cache_attribs=cache_attribs)}" - "-" - f"{stable_hash(sort_dict(cache_attribs,by_value=False))}.json" - ) - - def to_cache( - self, - cache_dir: Optional[Union[str, Path]] = None, - filename: Optional[Union[str, Path]] = None, - **json_kws, - ) -> Path: - """ - Write the object to a human-readable text file (json) which - can also be loaded later using from_cache. - :param cache_dir: Directory of cached files. - :param filename: Cached file name (without directory). - :return: The path of the written file. - """ - if cache_dir is None: - cache_dir = self.cache_dir() - if filename is None: - filename = self._cache_filename(self.cache_attribs()) - - filepath = pp5.get_resource_path(cache_dir, filename) - os.makedirs(str(filepath.parent), exist_ok=True) - - with filelock_context(filepath): - with open(str(filepath), "w", encoding="utf-8") as f: - json.dump(self.__getstate__(), f, indent=2, **json_kws) - - file_size = os.path.getsize(filepath) - file_size_str = ( - f"{file_size / 1024:.1f}kB" - if file_size < 1024 * 1024 - else f"{file_size / 1024 / 1024:.1f}MB" - ) - LOGGER.info(f"Wrote cache file: {filepath} ({file_size_str})") - return filepath - - @classmethod - def from_cache( - cls, - cache_dir: Optional[Union[str, Path]] = None, - cache_attribs: Optional[Dict[str, Any]] = None, - filename: Optional[Union[str, Path]] = None, - ): - """ - Load the object from a cached file. - :param cache_dir: Directory of cached file. 
- :param cache_attribs: Attributes which determine the cache filename. - :param filename: Cached filename (without directory). Won't be used if - cache_attribs is given. - :return: The loaded object, or None if the file doesn't exist. - """ - if not (cache_attribs or filename): - raise ValueError("cache_attribs or filename must be given") - - if cache_dir is None: - cache_dir = cls.cache_dir() - - if filename is None: - filename = cls._cache_filename(cache_attribs) - - filepath = pp5.get_resource_path(cache_dir, filename) - - obj = None - - with filelock_context(filepath): - if filepath.is_file(): - try: - with open(str(filepath), "r", encoding="utf-8") as f: - state_dict = json.load(f) - obj = cls.__new__(cls) - obj.__setstate__(state_dict) - except Exception as e: - LOGGER.warning( - f"Failed to load cached {cls.__name__} {filepath} {e}" - ) - return obj - - -class ReprJSONEncoder(JSONEncoder): - """ - A JSONEncoder that converts an object to it's representation string in - case it's not serializable. - """ - - def default(self, o: Any) -> Any: - try: - return repr(o) - except Exception as e: - pass - # Let the base class default method raise the TypeError - return JSONEncoder.default(self, o) - - class ProteinInitError(ValueError): pass From 539aa22a657612ebf59f45006104a6036d4493fb Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 13:36:41 +0200 Subject: [PATCH 35/37] Cacheable: Add support for compression --- src/pp5/cache.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/pp5/cache.py b/src/pp5/cache.py index f49639a..08300a9 100644 --- a/src/pp5/cache.py +++ b/src/pp5/cache.py @@ -5,6 +5,7 @@ from json import JSONEncoder from typing import Any, Dict, Union, Optional from pathlib import Path +from zipfile import ZIP_DEFLATED, ZipFile from dataclasses import dataclass import pp5 @@ -104,6 +105,16 @@ def to_cache( with open(str(filepath), "w", encoding="utf-8") as f: json.dump(self.__getstate__(), f, indent=2, **json_kws) + if self._CACHE_SETTINGS.cache_compression: + zip_filepath = filepath.with_suffix(".zip") + with ZipFile( + zip_filepath, "w", compression=ZIP_DEFLATED, compresslevel=6 + ) as fzip: + fzip.write(str(filepath), arcname=filename) + + filepath.unlink() + filepath = zip_filepath + file_size = os.path.getsize(filepath) file_size_str = ( f"{file_size / 1024:.1f}kB" @@ -142,6 +153,11 @@ def from_cache( obj = None with filelock_context(filepath): + zip_filepath = filepath.with_suffix(".zip") + if cls._CACHE_SETTINGS.cache_compression and zip_filepath.is_file(): + with ZipFile(zip_filepath, "r") as fzip: + fzip.extractall(path=zip_filepath.parent) + if filepath.is_file(): try: with open(str(filepath), "r", encoding="utf-8") as f: @@ -152,6 +168,9 @@ def from_cache( LOGGER.warning( f"Failed to load cached {cls.__name__} {filepath} {e}" ) + finally: + if cls._CACHE_SETTINGS.cache_compression: + filepath.unlink() return obj From 8dd13229f856db4ff6c5a08ffab04c3ce0ead9b3 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 23 Feb 2024 04:25:45 +0200 Subject: [PATCH 36/37] PDBMetadata: Use compression for cache --- src/pp5/external_dbs/pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 9bd1fc6..f1ab574 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -259,7 +259,7 @@ class PDBMetadata(Cacheable): Obtains and parses metadata from a PDB structure using PDB REST API. 
""" - _CACHE_SETTINGS = CacheSettings(cache_dir=PDB_METADATA_DIR) + _CACHE_SETTINGS = CacheSettings(cache_dir=PDB_METADATA_DIR, cache_compression=True) def __init__(self, pdb_id: str): """ From 073f667e031c833817ce3cb21655344c80828a75 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 23 Feb 2024 04:32:32 +0200 Subject: [PATCH 37/37] prec: add TODO about caching --- src/pp5/prec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 97dd1c8..39a0d91 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -528,6 +528,8 @@ def from_cache( :return: Loaded ProteinRecord, or None if the cached prec does not exist. """ + # TODO: Prec should use Cacheable base class instead of this custom approach. + if not isinstance(cache_dir, (str, Path)): cache_dir = pp5.PREC_DIR