From 99036c1672a55f7287bd1c3d93c69b1d1cbb784b Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 16:21:56 +0200 Subject: [PATCH 01/37] PDBMetadata: Major refactor to work with PDB REST API responses --- src/pp5/external_dbs/pdb.py | 353 ++++++++++++++++++++++++------------ 1 file changed, 241 insertions(+), 112 deletions(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index b3c7f9c..07bbf44 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -7,8 +7,20 @@ from math import cos, sin from math import degrees as deg from math import radians as rad -from typing import Any, Dict, List, Type, Tuple, Union, Optional, Sequence +from typing import ( + Any, + Set, + Dict, + List, + Tuple, + Union, + TypeVar, + Callable, + Optional, + Sequence, +) from pathlib import Path +from datetime import datetime from collections import defaultdict import numpy as np @@ -551,132 +563,249 @@ def from_cache( return super(PDB2UNP, cls).from_cache(cache_dir, filename) -class PDBMetadata(object): +_TC = TypeVar("_TC") + + +class PDBMetadata(object): # TODO: JSONCacheableMixin """ - Extracts metadata from a PDB structure. - Helpful metadata fields: - https://www.rcsb.org/pdb/results/reportField.do + Obtains and parses metadata from a PDB structure using PDB REST API. """ - def __init__(self, pdb_id: str, pdb_source: str = PDB_RCSB, struct_d=None): + def __init__(self, pdb_id: str): """ - :param pdb_id: The PDB ID of the structure. - :param struct_d: Optional dict which will be used if given, instead of - parsing the PDB file. - :param pdb_source: Source from which to obtain the pdb file. + :param pdb_id: The PDB ID of the structure. No chain. """ - pdb_base_id, chain_id = split_id(pdb_id) - struct_d = pdb_dict(pdb_id, pdb_source=pdb_source, struct_d=struct_d) - - # For alphafold structures, default to zero resolution instead of NaN. 
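Note on the shape of the new __init__ above: metadata is now assembled from three levels of PDB REST API queries, entry, then per-entity, then per-chain. A minimal standalone sketch of that cascade, assuming network access and the pdb_api.execute_raw_data_query signature used in this patch ("2WUR" is just an example id; entity ids become strings as of PATCH 08):

    from pp5.external_dbs import pdb_api

    pdb_id = "2WUR"  # arbitrary example structure
    # Entry-level metadata for the whole structure
    meta_struct = pdb_api.execute_raw_data_query(pdb_id)
    entity_ids = meta_struct["rcsb_entry_container_identifiers"]["polymer_entity_ids"]

    meta_entities, meta_chains = {}, {}
    for entity_id in map(str, entity_ids):
        # Entity-level metadata
        meta_entities[entity_id] = pdb_api.execute_raw_data_query(
            pdb_id, entity_id=entity_id
        )
        containers = meta_entities[entity_id][
            "rcsb_polymer_entity_container_identifiers"
        ]
        for chain_id in containers["asym_ids"]:
            # Chain-level metadata
            meta_chains[chain_id] = pdb_api.execute_raw_data_query(
                pdb_id, chain_id=chain_id
            )

One query per entity and per chain keeps each response small, at the cost of several round trips per structure.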
- default_res = 0.0 if pdb_source == PDB_AFLD else None - - def _meta(key: str, convert_to: Type = str, default=None): - val = struct_d.get(key, None) - if not val: - return default - if isinstance(val, list): - val = val[0] - if not val or val == "?": - return default + + self._pdb_id, _ = split_id(pdb_id) + + # Obtain structure-level metadata from the PDB API + self._meta_struct: dict = pdb_api.execute_raw_data_query(self.pdb_id) + self._meta_entities: Dict[int, dict] = {} + self._meta_chains: Dict[str, dict] = {} + entity_ids = self._meta_struct["rcsb_entry_container_identifiers"][ + "polymer_entity_ids" + ] + for entity_id in entity_ids: + entity_id = int(entity_id) + # Obtain entity-level metadata from the PDB API + self._meta_entities[entity_id] = pdb_api.execute_raw_data_query( + self.pdb_id, entity_id=entity_id + ) + + chain_ids = self._meta_entities[entity_id][ + "rcsb_polymer_entity_container_identifiers" + ]["asym_ids"] + for chain_id in chain_ids: + # Obtain chain-level metadata from the PDB API + self._meta_chains[chain_id] = pdb_api.execute_raw_data_query( + self.pdb_id, chain_id=chain_id + ) + + @staticmethod + def _resolve( + meta: dict, key: str, coerce_type: Callable[[Any], _TC] + ) -> Optional[_TC]: + for subkey in key.split("."): + if isinstance(meta, (list, tuple)): + subkey = int(subkey) + elif not isinstance(meta, dict): + raise ValueError(f"Can't resolve {key} in {meta}") + elif subkey not in meta: + return None + + meta = meta[subkey] + + if meta is not None: try: - return convert_to(val) + meta = coerce_type(meta) except ValueError: - return default - - title = _meta("_struct.title") - description = _meta("_entity.pdbx_description") - deposition_date = _meta("_pdbx_database_status.recvd_initial_deposition_date") - - src_org = _meta("_entity_src_nat.pdbx_organism_scientific") - if not src_org: - src_org = _meta("_entity_src_gen.pdbx_gene_src_scientific_name") - - src_org_id = _meta("_entity_src_nat.pdbx_ncbi_taxonomy_id", int) - if not src_org_id: - src_org_id = _meta("_entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id", int) - - host_org = _meta("_entity_src_gen.pdbx_host_org_scientific_name") - host_org_id = _meta("_entity_src_gen.pdbx_host_org_ncbi_taxonomy_id", int) - resolution = _meta("_refine.ls_d_res_high", float, default=default_res) - resolution_low = _meta("_refine.ls_d_res_low", float, default=default_res) - r_free = _meta("_refine.ls_R_factor_R_free", float) - r_work = _meta("_refine.ls_R_factor_R_work", float) - space_group = _meta("_symmetry.space_group_name_H-M") - - # Find ligands - ligands = set() - for i, chemical_type in enumerate(struct_d["_chem_comp.id"]): - if chemical_type.lower() == "hoh": - continue - if chemical_type not in STANDARD_ACID_NAMES: - ligands.add(chemical_type) - ligands = str.join(",", ligands) - - # Crystal growth details - cg_ph = _meta("_exptl_crystal_grow.pH", float) - cg_temp = _meta("_exptl_crystal_grow.temp", float) - - # Map each chain to entity id, and entity to 1-letter sequence. 
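The removed block above shows how these fields were previously scraped from the parsed mmCIF dict; in the new class every field instead goes through the _resolve helper, which walks a nested API response along a dotted key path ("a.b.0.c"), indexing lists by integer position and coercing the leaf value. A simplified standalone sketch of the same idea (not the patched implementation):

    from typing import Any, Callable, Optional

    def resolve(meta: Any, key: str, coerce: Callable[[Any], Any]) -> Optional[Any]:
        for subkey in key.split("."):
            if isinstance(meta, (list, tuple)):
                meta = meta[int(subkey)]  # numeric subkeys index into lists
            elif isinstance(meta, dict):
                if subkey not in meta:
                    return None  # missing fields resolve to None
                meta = meta[subkey]
            else:
                raise ValueError(f"Can't resolve {key}")
        return coerce(meta) if meta is not None else None

    meta = {"refine": [{"ls_rfactor_rfree": "0.21"}]}
    assert resolve(meta, "refine.0.ls_rfactor_rfree", float) == 0.21
    assert resolve(meta, "refine.0.missing_key", float) is None

The properties below lean on the None fallback by chaining several such lookups with `or`, so that current RCSB keys are tried first and legacy mmCIF-style keys act as fallbacks.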
- chain_entities, entity_seq = {}, {} - for i, entity_id in enumerate(struct_d["_entity_poly.entity_id"]): - if not struct_d["_entity_poly.type"][i].startswith("polypeptide"): - continue + LOGGER.warning(f"Failed to coerce {meta}@{key} to {coerce_type}") - entity_id = int(entity_id) - chains_str = struct_d["_entity_poly.pdbx_strand_id"][i] - for chain in chains_str.split(","): - chain_entities[chain] = entity_id - - seq_str: str = struct_d["_entity_poly.pdbx_seq_one_letter_code_can"][i] - seq_str = seq_str.replace("\n", "") - entity_seq[entity_id] = seq_str - - self.pdb_id: str = pdb_base_id - self.pdb_source: str = pdb_source - self.title: str = title - self.description: str = description - self.deposition_date: str = deposition_date - self.src_org: str = src_org - self.src_org_id: int = src_org_id - self.host_org: str = host_org - self.host_org_id: int = host_org_id - self.resolution: float = resolution - self.resolution_low: float = resolution_low - self.r_free: float = r_free - self.r_work: float = r_work - self.space_group: str = space_group - self.ligands: str = ligands - self.cg_ph: float = cg_ph # crystal growth pH - self.cg_temp: float = cg_temp # crystal growth temperature - # mapping from chain_id to entity_id - self.chain_entities: Dict[str, int] = chain_entities - # mapping from entity_id to sequence - self.entity_sequence: Dict[int, str] = entity_seq - - def get_chain(self, entity_id: int) -> Optional[str]: - """ - :param entity_id: An ID of one of the entities in this structure. - :return: One of the chains from teh structure belonging to this entity id, - or None if this is not a valid entity if for the given structure. - """ - chains = [c for c, e in self.chain_entities.items() if e == entity_id] - if not chains: - return None - return sorted(chains)[0] + return meta - def as_dict(self) -> Dict[str, Any]: - return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + @property + def pdb_id(self) -> str: + return self._pdb_id + + @property + def title(self) -> Optional[str]: + return self._resolve(self._meta_struct, "struct.title", str) + + @property + def description(self) -> Optional[str]: + # api_meta_entity["rcsb_polymer_entity"]["pdbx_description"] + return self._resolve(self._meta_struct, "struct.pdbx_descriptor", str) + + @property + def entity_description(self) -> Dict[int, Optional[str]]: + return { + entity_id: self._resolve( + meta_entity, "rcsb_polymer_entity.pdbx_description", str + ) + for entity_id, meta_entity in self._meta_entities.items() + } + + @property + def deposition_date(self) -> Optional[datetime]: + return self._resolve( + self._meta_struct, + "pdbx_database_status.recvd_initial_deposition_date", + datetime.fromisoformat, + ) + + @property + def entity_source_org(self) -> Dict[int, Optional[str]]: + return { + entity_id: self._resolve( + meta_entity, "rcsb_entity_source_organism.0.ncbi_scientific_name", str + ) + or self._resolve( + meta_entity, "entity_src_nat.0.pdbx_organism_scientific", str + ) + or self._resolve( + meta_entity, "entity_src_gen.0.pdbx_gene_src_scientific_name", str + ) + for entity_id, meta_entity in self._meta_entities.items() + } + + @property + def entity_source_org_id(self) -> Dict[int, Optional[int]]: + return { + entity_id: self._resolve( + meta_entity, "rcsb_entity_source_organism.0.ncbi_taxonomy_id", int + ) + or self._resolve(meta_entity, "entity_src_nat.0.pdbx_ncbi_taxonomy_id", int) + or self._resolve( + meta_entity, "entity_src_gen.0.pdbx_gene_src_ncbi_taxonomy_id", int + ) + for entity_id, meta_entity 
in self._meta_entities.items() + } + + @property + def entity_host_org(self) -> Dict[int, Optional[str]]: + return { + entity_id: self._resolve( + meta_entity, "rcsb_entity_host_organism.0.ncbi_scientific_name", str + ) + or self._resolve( + meta_entity, "entity_src_gen.0.pdbx_host_org_scientific_name", str + ) + for entity_id, meta_entity in self._meta_entities.items() + } + + @property + def entity_host_org_id(self) -> Dict[int, Optional[int]]: + return { + entity_id: self._resolve( + meta_entity, "rcsb_entity_host_organism.0.ncbi_taxonomy_id", int + ) + or self._resolve( + meta_entity, "entity_src_gen.0.pdbx_host_org_ncbi_taxonomy_id", int + ) + for entity_id, meta_entity in self._meta_entities.items() + } + + @property + def resolution(self) -> Optional[float]: + return self._resolve(self._meta_struct, "reflns.0.d_resolution_high", float) + + @property + def resolution_low(self) -> Optional[float]: + return self._resolve(self._meta_struct, "reflns.0.d_resolution_low", float) + + @property + def r_free(self) -> Optional[float]: + return self._resolve(self._meta_struct, "refine.0.ls_rfactor_rfree", float) + + @property + def r_work(self) -> Optional[float]: + return self._resolve(self._meta_struct, "refine.0.ls_rfactor_rwork", float) + + @property + def space_group(self) -> Optional[str]: + return self._resolve( + self._meta_struct, "symmetry.space_group_name_hm", str + ) or self._resolve(self._meta_struct, "symmetry.space_group_name_H_M", str) + + @property + def cg_ph(self) -> Optional[float]: + return self._resolve(self._meta_struct, "exptl_crystal_grow.0.pH", float) + + @property + def cg_temp(self) -> Optional[float]: + return self._resolve(self._meta_struct, "exptl_crystal_grow.0.temp", float) + + @property + def chain_ligands(self) -> Dict[str, Set[str]]: + return { + chain_id: set( + [ + ld.get("ligand_comp_id") + for ld in meta_chain.get("rcsb_ligand_neighbors", []) + ] + ) + for chain_id, meta_chain in self._meta_chains.items() + } + + @property + def ligands(self) -> str: + return str.join(",", sorted(set.union(*self.chain_ligands.values()))) @property def entity_chains(self) -> Dict[int, Sequence[str]]: """ :return: Mapping from entity id to a list of chains belonging to that entity. """ - entity_chains = defaultdict(list) - for chain, entity in self.chain_entities.items(): - entity_chains[entity].append(chain) - return dict(entity_chains) + return self._entity_chains(author=False) + + @property + def entity_auth_chains(self) -> Dict[int, Sequence[str]]: + """ + :return: Mapping from entity id to a list of chains belonging to that entity, + using the original author's chain ids. + """ + return self._entity_chains(author=True) + + def _entity_chains(self, author: bool = False) -> Dict[int, Sequence[str]]: + """ + :param author: Whether to use author or canonical chain ids. + :return: Mapping from entity id to a list of chains belonging to that entity. + """ + asym_ids_key = "auth_asym_ids" if author else "asym_ids" + key = f"rcsb_polymer_entity_container_identifiers.{asym_ids_key}" + return { + entity_id: self._resolve(meta_entity, key, tuple) + for entity_id, meta_entity in self._meta_entities.items() + } + + @property + def chain_entities(self) -> Dict[str, int]: + """ + :return: Mapping from chain id to its entity id. 
+ """ + chain_to_entity = {} + for entity_id, chain_ids in self.entity_chains.items(): + chain_to_entity = { + **chain_to_entity, + **{chain_id: entity_id for chain_id in chain_ids}, + } + return chain_to_entity + + @property + def entity_sequence(self) -> Dict[int, str]: + return { + entity_id: self._resolve( + meta_entity, "entity_poly.pdbx_seq_one_letter_code_can", str + ) + for entity_id, meta_entity in self._meta_entities.items() + } + + def as_dict(self) -> Dict[str, Any]: + return { + k: getattr(self, k) + for k, v in self.__class__.__dict__.items() + if isinstance(v, property) + } def __repr__(self): return str(self.as_dict()) From 225b7d6e22e03b902b42d10c9db458fe2e4fe672 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 16:22:20 +0200 Subject: [PATCH 02/37] PDB2UNP: Include author chains --- src/pp5/external_dbs/pdb.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 07bbf44..b20efbe 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -466,7 +466,13 @@ def query_entity_uniprot_id_alignments( # Get list of chains and list of Uniprot IDs for this entity entity_containers = entity_data["rcsb_polymer_entity_container_identifiers"] - entity_chains = entity_containers.get("asym_ids", []) + entity_chains = [ + # The same chain can be referred to by different labels, + # the canonical PDB label and another label given by the + # structure author. + *entity_containers.get("asym_ids", []), + *entity_containers.get("auth_asym_ids", []), + ] entity_unp_ids = entity_containers.get("uniprot_ids", []) unp_alignments: Dict[str, List[Tuple[int, int]]] = { From 477bee898a651d7c295503c830c078f1e07cfd6e Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 16:50:41 +0200 Subject: [PATCH 03/37] PDBMetadata: Improve resolution parsing --- src/pp5/external_dbs/pdb.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index b20efbe..614969f 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -713,7 +713,9 @@ def entity_host_org_id(self) -> Dict[int, Optional[int]]: @property def resolution(self) -> Optional[float]: - return self._resolve(self._meta_struct, "reflns.0.d_resolution_high", float) + return self._resolve( + self._meta_struct, "rcsb_entry_info.diffrn_resolution_high.value", float + ) or self._resolve(self._meta_struct, "reflns.0.d_resolution_high", float) @property def resolution_low(self) -> Optional[float]: From 379378bcb9909f56cf9187058136e728e2e7ced9 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 16:51:01 +0200 Subject: [PATCH 04/37] PDBMetadata: Map chains to auth chains --- src/pp5/external_dbs/pdb.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 614969f..37c52d5 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -21,6 +21,7 @@ ) from pathlib import Path from datetime import datetime +from itertools import zip_longest from collections import defaultdict import numpy as np @@ -799,6 +800,28 @@ def chain_entities(self) -> Dict[str, int]: } return chain_to_entity + @property + def chain_to_auth_chain(self) -> Dict[str, str]: + """ + :return: Mapping from PDB chain id to its author's chain id. If there are no + different names for the author chains, the PDB chain names are mapped to + themselves. 
+ """ + entity_auth_chains = self.entity_auth_chains + chain_to_auth_chain = {} + for entity_id, chain_ids in self.entity_chains.items(): + auth_chain_ids = entity_auth_chains[entity_id] + chain_to_auth_chain = { + **chain_to_auth_chain, + **{ + chain_id: auth_chain_id or chain_id + for chain_id, auth_chain_id in zip_longest( + chain_ids, auth_chain_ids + ) + }, + } + return chain_to_auth_chain + @property def entity_sequence(self) -> Dict[int, str]: return { From 55a47364b524a2678af72de9d30ac5c22aea3830 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 17:45:08 +0200 Subject: [PATCH 05/37] prec: Update to work with new metadata --- src/pp5/prec.py | 128 ++++++++++++------------------------------------ 1 file changed, 32 insertions(+), 96 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 95fd576..fda9852 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -570,8 +570,8 @@ def from_pdb( pdb_dict = pdb.pdb_dict( pdb_id, pdb_source=pdb_source, struct_d=pdb_dict ) - meta = pdb.PDBMetadata(pdb_id, pdb_source=pdb_source, struct_d=pdb_dict) - chain_id = meta.get_chain(entity_id) + meta = pdb.PDBMetadata(pdb_id) + chain_id = meta.entity_chains[entity_id][0] if not chain_id: # In rare cases the chain is a number instead of a letter, @@ -667,7 +667,7 @@ def from_unp( def __init__( self, - unp_id: str, + unp_id: str, # TODO: Get this from metadata pdb_id: str, pdb_source: str = PDB_RCSB, pdb_dict: dict = None, @@ -684,17 +684,13 @@ def __init__( contact_radius: float = CONTACT_DEFAULT_RADIUS, ): """ + Don't call this directly. Use class methods from_pdb or from_unp instead. + Initialize a protein record from both Uniprot and PDB ids. - To initialize a protein from Uniprot id or PDB id only, use the - class methods provided for this purpose. :param unp_id: Uniprot id which uniquely identifies the protein. - :param pdb_id: PDB id with or without chain (e.g. '1ABC' or '1ABC:D') - of the specific structure desired. Note that this structure must match - the unp_id, i.e. it must exist in the cross-refs of the given unp_id. - Otherwise an error will be raised (unless strict_unp_xref=False). If no - chain is specified, a chain matching the unp_id will be used, - if it exists. + :param pdb_id: PDB id with chain (e.g. '1ABC:D') of the specific structure chain + desired. :param pdb_source: Source from which to obtain the pdb file. :param dihedral_est_name: Method of dihedral angle estimation. Options are: @@ -747,27 +743,37 @@ class methods provided for this purpose. self.contact_radius = contact_radius self.contact_method = contact_method - # First we must find a matching PDB structure and chain for the - # Uniprot id. If a pdb_id is given we'll try to use that, depending - # on whether there's a Uniprot xref for it and on strict_unp_xref. 
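For reference, the pairing rule in PATCH 04's chain_to_auth_chain above can be shown in isolation: per entity, canonical chain ids are zipped against author chain ids, and a missing author id falls back to the canonical one. The example data here is hypothetical:

    from itertools import zip_longest

    entity_chains = {"1": ("A", "B")}   # canonical (asym) chain ids per entity
    entity_auth_chains = {"1": ("X",)}  # author chain ids; one is missing

    chain_to_auth = {}
    for entity_id, chain_ids in entity_chains.items():
        auth_ids = entity_auth_chains[entity_id]
        for chain_id, auth_id in zip_longest(chain_ids, auth_ids):
            chain_to_auth[chain_id] = auth_id or chain_id  # fall back to canonical

    assert chain_to_auth == {"A": "X", "B": "B"}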
- self.pdb_base_id, self.pdb_chain_id = self._find_pdb_xref(pdb_id) + # Parse the given PDB id + self.pdb_base_id, self.pdb_chain_id, ent_id = pdb.split_id_with_entity(pdb_id) + if numeric_chain: + self.pdb_chain_id = str(ent_id) self.pdb_id = f"{self.pdb_base_id}:{self.pdb_chain_id}" + self.pdb_source = pdb_source if pdb_dict: self._pdb_dict = pdb_dict - self.pdb_meta = pdb.PDBMetadata( - self.pdb_id, pdb_source=self.pdb_source, struct_d=self.pdb_dict - ) + self.pdb_meta = pdb.PDBMetadata(self.pdb_id) if not self.pdb_meta.resolution and self.pdb_source != PDB_AFLD: raise ProteinInitError(f"Unknown resolution for {pdb_id}") + self.pdb_entity_id = self.pdb_meta.chain_entities[self.pdb_chain_id] + self.pdb_auth_chain_id = self.pdb_meta.chain_to_auth_chain[self.pdb_chain_id] + + chain_str = ( + self.pdb_chain_id + if self.pdb_auth_chain_id == self.pdb_chain_id + else f"{self.pdb_chain_id}({self.pdb_auth_chain_id})" + ) LOGGER.info( - f"{self}: {self.pdb_meta.description}, " - f"org={self.pdb_meta.src_org} ({self.pdb_meta.src_org_id}), " - f"expr={self.pdb_meta.host_org} ({self.pdb_meta.host_org_id}), " + f"pdb_id={self.pdb_base_id}, chain={chain_str}, unp_id={self.unp_id}, " + f"entity_id={self.pdb_entity_id}, " f"res={self.pdb_meta.resolution:.2f}Å, " - f"entity_id={self.pdb_meta.chain_entities[self.pdb_chain_id]}" + f"desc={self.pdb_meta.entity_description[self.pdb_entity_id]}, " + f"org={self.pdb_meta.entity_source_org[self.pdb_entity_id]} " + f"({self.pdb_meta.entity_source_org_id[self.pdb_entity_id]}), " + f"expr={self.pdb_meta.entity_host_org[self.pdb_entity_id]} " + f"({self.pdb_meta.entity_host_org_id[self.pdb_entity_id]})" ) # Make sure the structure is sane. See e.g. 1FFK. @@ -994,7 +1000,9 @@ def polypeptides(self) -> List[Polypeptide]: https://proteopedia.org/wiki/index.php/HETATM """ if not self._pp: - chain = self.pdb_rec[0][self.pdb_chain_id] + # Use author chain id to get the polypeptides, as the author chain is + # what's associated with the coordinates in the mmCIF file. + chain = self.pdb_rec[0][self.pdb_auth_chain_id] pp_chains = PPBuilder().build_peptides(chain, aa_only=True) # Sort chain by sequence ID of first residue in the chain, @@ -1234,78 +1242,6 @@ def _find_dna_alignment( return best_ena.id, str(best_ena.seq), idx_to_codons - def _find_pdb_xref(self, ref_pdb_id) -> Tuple[str, str]: - ref_pdb_id, ref_chain_id, ent_id = pdb.split_id_with_entity(ref_pdb_id) - if not ref_chain_id: - if ent_id is not None and self.numeric_chain: - # In rare cases the chain is a number and indistinguishable - # from entity. Handle this case only if explicitly - # requested. - ref_chain_id = ent_id - else: - ref_chain_id = "" - - ref_pdb_id, ref_chain_id = ref_pdb_id.upper(), ref_chain_id.upper() - - xrefs = unp.find_pdb_xrefs(self.unp_rec, method="x-ray") - - # We'll sort the PDB entries according to multiple criteria based on - # the resolution, number of chains and sequence length. - def sort_key(xref: unp.UNPPDBXRef): - id_cmp = xref.pdb_id.upper() != ref_pdb_id - chain_cmp = xref.chain_id.upper() != ref_chain_id - seq_len_diff = abs(xref.seq_len - self.unp_rec.sequence_length) - # The sort key for PDB entries - # First, if we have a matching id to the reference PDB id we take - # it. Otherwise, we take the best match according to seq len and - # resolution. 
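With _find_pdb_xref going away (its removal continues below), chain, entity and author-chain resolution all come straight from PDBMetadata. A usage sketch of the lookups the record now performs; it assumes network access to the PDB REST API ("1MWC" is one of the structures used in the tests):

    from pp5.external_dbs import pdb

    meta = pdb.PDBMetadata("1MWC")  # base id only, no chain
    chain_id = "A"
    entity_id = meta.chain_entities[chain_id]           # chain -> entity id
    auth_chain_id = meta.chain_to_auth_chain[chain_id]  # chain -> author chain id
    print(
        entity_id,
        meta.entity_description[entity_id],
        meta.resolution,
        auth_chain_id,
    )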
- return id_cmp, chain_cmp, seq_len_diff, xref.resolution - - xrefs = sorted(xrefs, key=sort_key) - if not xrefs: - msg = f"No PDB cross-refs for {self.unp_id}" - if self.strict_unp_xref: - raise ProteinInitError(msg) - elif not ref_chain_id: - raise ProteinInitError(f"{msg} and no chain provided in ref") - else: - LOGGER.warning(f"{msg}, using ref {ref_pdb_id}:{ref_chain_id}") - return ref_pdb_id, ref_chain_id - - # Get best match according to sort key and return its id. - xref = xrefs[0] - LOGGER.info(f"{self.unp_id}: PDB XREF = {xref}") - - pdb_id = xref.pdb_id.upper() - chain_id = xref.chain_id.upper() - - # Make sure we have a match with the Uniprot id. Id chain wasn't - # specified, match only PDB ID, otherwise, both must match. - if pdb_id != ref_pdb_id: - msg = ( - f"Reference PDB ID {ref_pdb_id} not found as " - f"cross-reference for protein {self.unp_id}" - ) - if self.strict_unp_xref: - raise ProteinInitError(msg) - else: - LOGGER.warning(msg) - pdb_id = ref_pdb_id - - if ref_chain_id and chain_id != ref_chain_id: - msg = ( - f"Reference chain {ref_chain_id} of PDB ID {ref_pdb_id} not" - f"found as cross-reference for protein {self.unp_id}. " - f"Did you mean chain {chain_id}?" - ) - if self.strict_unp_xref: - raise ProteinInitError(msg) - else: - LOGGER.warning(msg) - chain_id = ref_chain_id - - return pdb_id.upper(), chain_id.upper() - def _get_dihedral_estimators(self, est_name: str, est_args: dict): est_name = est_name.lower() if est_name else est_name est_args = {} if est_args is None else est_args @@ -1349,7 +1285,7 @@ def items(self) -> ItemsView[str, ResidueRecord]: return self._residue_recs.items() def __repr__(self): - return f"({self.unp_id}, {self.pdb_id})" + return f"{self.pdb_id}" def __getstate__(self): # Prevent serializing Bio objects From a6f694cff62054189dc66fca22c0688548219cd0 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Sun, 11 Feb 2024 17:46:02 +0200 Subject: [PATCH 06/37] pgroup/collect/align: Update to work with new metadata --- src/pp5/align.py | 2 +- src/pp5/collect.py | 2 +- src/pp5/pgroup.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/pp5/align.py b/src/pp5/align.py index 41d052c..aad2910 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -530,7 +530,7 @@ def pdb(self, query_pdb_id: str, pdb_dict=None) -> pd.DataFrame: ) # Note: no need for pdb_source, we just care about what chains exist - meta = pdb.PDBMetadata(pdb_id, struct_d=pdb_dict) + meta = pdb.PDBMetadata(pdb_id) if chain_id not in meta.chain_entities: raise ValueError(f"Can't find chain {chain_id} in {pdb_id}") diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 279aa10..1468a51 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -1043,7 +1043,7 @@ def _collect_single_structure( pdb_dict = pdb.pdb_dict(pdb_id, pdb_source=pdb_source) pdb2unp = pdb.PDB2UNP.from_pdb(pdb_id, cache=True) - meta = pdb.PDBMetadata(pdb_id, pdb_source=pdb_source, struct_d=pdb_dict) + meta = pdb.PDBMetadata(pdb_id) # Determine all chains we need to collect from the PDB structure chains_to_collect: Sequence[str] diff --git a/src/pp5/pgroup.py b/src/pp5/pgroup.py index cf279f1..3f4602d 100644 --- a/src/pp5/pgroup.py +++ b/src/pp5/pgroup.py @@ -259,9 +259,7 @@ def __init__( ) ref_pdb_dict = pdb.pdb_dict(self.ref_pdb_id, pdb_source=pdb_source) - ref_pdb_meta = pdb.PDBMetadata( - self.ref_pdb_base_id, pdb_source=pdb_source, struct_d=ref_pdb_dict - ) + ref_pdb_meta = pdb.PDBMetadata(self.ref_pdb_base_id) if self.ref_pdb_chain not in ref_pdb_meta.chain_entities: raise 
ProteinInitError(f"Unknown PDB entity for {self.ref_pdb_id}") From 8097401923920c5cac6a1dca91b79b348b998213 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 05:14:19 +0200 Subject: [PATCH 07/37] PDBMetadata: Add uniprot id calculation --- src/pp5/external_dbs/pdb.py | 178 ++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 37c52d5..953f34d 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -831,6 +831,123 @@ def entity_sequence(self) -> Dict[int, str]: for entity_id, meta_entity in self._meta_entities.items() } + @property + def uniprot_ids(self) -> Sequence[str]: + """ + :return: All Uniprot IDs associated with the PDB structure. + """ + all_unp_ids = set() + for chain_id, unp_ids in self.chain_uniprot_ids.items(): + all_unp_ids.update(unp_ids) + return tuple(sorted(all_unp_ids)) + + @property + def chain_uniprot_ids(self) -> Dict[str, Sequence[str]]: + """ + Retrieves all Uniprot IDs associated with a PDB structure chains. + + :return: a map: chain -> [unp1, unp2, ...] + where unp1, unp2, ... are Uniprot IDs associated with the chain. + """ + + # entity -> chain -> unp -> [ (s1,e1), ... ] + entity_map = self.entity_uniprot_id_alignments + + all_chain_map = {} + for entity_id, chain_map in entity_map.items(): + for chain_id, unp_map in chain_map.items(): + # chain -> [unp1, unp2, ...] + all_chain_map[chain_id] = tuple(unp_map.keys()) + + return all_chain_map + + @property + def chain_uniprot_id_alignments( + self, + ) -> Dict[str, Dict[str, List[Tuple[int, int]]]]: + """ + Retrieves all Uniprot IDs associated with a PDB structure chains. + + :return: a map: chain -> unp -> [ (s1,e1), ... ] + where (s1,e1) are alignment start,end indices between the UNP and PDB sequences. + """ + # entity -> chain -> unp -> [ (s1,e1), ... ] + entity_map = self.entity_uniprot_id_alignments + + all_chain_map = {} + for entity_id, chain_map in entity_map.items(): + for chain_id, unp_map in chain_map.items(): + # chain -> unp -> [ (s1,e1), ... ] + all_chain_map[chain_id] = unp_map + + return all_chain_map + + @property + def entity_uniprot_ids(self) -> Dict[str, Dict[str, Sequence[str]]]: + """ + Retrieves all Uniprot IDs associated with a PDB structure entities. + + :return: a map: entity -> chain ->[unp1, unp2, ...] + where unp1, unp2, ... are Uniprot IDs associated with the entity. + """ + # entity -> chain -> unp -> [ (s1,e1), ... ] + entity_map = self.entity_uniprot_id_alignments + + new_entity_map = defaultdict(dict) + for entity_id, chain_map in entity_map.items(): + for chain_id, unp_map in chain_map.items(): + # entity -> chain -> [unp1, unp2, ...] + new_entity_map[entity_id][chain_id] = tuple(unp_map.keys()) + + return dict(new_entity_map) + + @property + def entity_uniprot_id_alignments( + self, + ) -> Dict[int, Dict[str, Dict[str, List[Tuple[int, int]]]]]: + """ + Retrieves all Uniprot IDs associated with a PDB structure entities. + + :return: a map: entity -> chain -> unp -> [ (s1,e1), ...] + where (s1,e1) are alignment start,end indices between the UNP and PDB sequences. 
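To make the (s1, e1) pairs in these maps concrete: each PDB API alignment entry provides a start index and a region length, which get converted into inclusive start/end indices (the actual conversion appears in the property below). A small sketch with a hypothetical aligned_regions entry:

    aligned_regions = [{"entity_beg_seq_id": 5, "length": 100}]  # hypothetical

    pairs = []
    for region in aligned_regions:
        start = region["entity_beg_seq_id"]
        end = start + region["length"] - 1  # inclusive end index
        pairs.append((start, end))

    assert pairs == [(5, 104)]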
+ """ + map_to_unp_ids = {} + + for entity_id, entity_meta in self._meta_entities.items(): + # Get list of chains and list of Uniprot IDs for this entity + entity_containers = entity_meta["rcsb_polymer_entity_container_identifiers"] + entity_unp_ids = entity_containers.get("uniprot_ids", []) + + unp_alignments: Dict[str, List[Tuple[int, int]]] = { + unp_id: [] for unp_id in entity_unp_ids + } + for alignment_entry in entity_meta.get("rcsb_polymer_entity_align", []): + if alignment_entry["reference_database_name"].lower() != "uniprot": + continue + + unp_id = alignment_entry["reference_database_accession"] + if unp_id not in unp_alignments: + continue + + for alignment_region in alignment_entry["aligned_regions"]: + align_start = alignment_region["entity_beg_seq_id"] + align_end = align_start + alignment_region["length"] - 1 + unp_alignments[unp_id].append((align_start, align_end)) + + entity_chains = [ + # The same chain can be referred to by different labels, + # the canonical PDB label and another label given by the + # structure author. + *entity_containers.get("asym_ids", []), + *entity_containers.get("auth_asym_ids", []), + ] + + map_to_unp_ids[entity_id] = { + chain_id: unp_alignments for chain_id in entity_chains + } + + return map_to_unp_ids + def as_dict(self) -> Dict[str, Any]: return { k: getattr(self, k) @@ -841,6 +958,67 @@ def as_dict(self) -> Dict[str, Any]: def __repr__(self): return str(self.as_dict()) + @classmethod + def from_pdb(cls, pdb_id: str, cache=False) -> PDBMetadata: + """ + Create a PDBMetadata object from a given PDB ID. + :param pdb_id: The PDB ID to map for. Chain will be ignored if present. + :param cache: Whether to load a cached mapping if available. + :return: A PDBMetadata object. + """ + pdb_base_id, _ = split_id(pdb_id) + + # TODO: Implement caching + # if cache: + # pdb_meta = cls.from_cache(pdb_base_id) + # if pdb_meta is not None: + # return pdb_meta + + pdb_meta = cls(pdb_id) + # pdb_meta.save() + return pdb_meta + + @classmethod + def pdb_id_to_unp_id(cls, pdb_id: str, strict=True, cache=False) -> str: + """ + Given a PDB ID, returns a single Uniprot id for it. + :param pdb_id: PDB ID, with optional chain. + :param cache: Whether to use cached mapping. + :param strict: Whether to raise an error (True) or just warn (False) + if the PDB ID cannot be uniquely mapped to a single Uniprot ID. + This can happen if: (1) Chain wasn't specified and there are + different Uniprot IDs for different chains (e.g. 4HHB); (2) Chain was + specified but there are multiple Uniprot IDs for the chain + (chimeric entry, e.g. 3SG4:A). + :return: A Uniprot ID. + """ + pdb_base_id, chain_id = split_id(pdb_id) + meta = cls.from_pdb(pdb_id, cache=cache) + + if not meta.uniprot_ids: + raise ValueError(f"No Uniprot entries exist for {pdb_base_id}") + + if not chain_id: + if len(meta.uniprot_ids) > 1: + msg = f"Multiple Uniprot IDs for {pdb_base_id}, no chain specified." 
+ if strict: + raise ValueError(msg) + LOGGER.warning(f"{msg} Returning first ID from the first chain.") + + for chain_id, unp_ids in meta.chain_uniprot_ids.items(): + return unp_ids[0] + + if chain_id not in meta.chain_uniprot_ids: + raise ValueError(f"No Uniprot ID for chain {chain_id} of {pdb_base_id}") + + if len(meta.chain_uniprot_ids[chain_id]) > 1: + msg = f"Multiple Uniprot IDs for {pdb_base_id} chain {chain_id} (chimeric)" + if strict: + raise ValueError(msg) + LOGGER.warning(f"{msg} Returning the first Uniprot ID.") + + return meta.chain_uniprot_ids[chain_id][0] + class PDBUnitCell(object): """ From 36519d91c3d1ee3f8f2eb58c98f3e08b02b26f2b Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 05:21:33 +0200 Subject: [PATCH 08/37] PDBMetadata: Use str for entity id --- src/pp5/external_dbs/pdb.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 953f34d..d55a781 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -573,7 +573,7 @@ def from_cache( _TC = TypeVar("_TC") -class PDBMetadata(object): # TODO: JSONCacheableMixin +class PDBMetadata(JSONCacheableMixin): """ Obtains and parses metadata from a PDB structure using PDB REST API. """ @@ -587,13 +587,13 @@ def __init__(self, pdb_id: str): # Obtain structure-level metadata from the PDB API self._meta_struct: dict = pdb_api.execute_raw_data_query(self.pdb_id) - self._meta_entities: Dict[int, dict] = {} + self._meta_entities: Dict[str, dict] = {} self._meta_chains: Dict[str, dict] = {} entity_ids = self._meta_struct["rcsb_entry_container_identifiers"][ "polymer_entity_ids" ] for entity_id in entity_ids: - entity_id = int(entity_id) + entity_id = str(entity_id) # Obtain entity-level metadata from the PDB API self._meta_entities[entity_id] = pdb_api.execute_raw_data_query( self.pdb_id, entity_id=entity_id @@ -644,7 +644,7 @@ def description(self) -> Optional[str]: return self._resolve(self._meta_struct, "struct.pdbx_descriptor", str) @property - def entity_description(self) -> Dict[int, Optional[str]]: + def entity_description(self) -> Dict[str, Optional[str]]: return { entity_id: self._resolve( meta_entity, "rcsb_polymer_entity.pdbx_description", str @@ -661,7 +661,7 @@ def deposition_date(self) -> Optional[datetime]: ) @property - def entity_source_org(self) -> Dict[int, Optional[str]]: + def entity_source_org(self) -> Dict[str, Optional[str]]: return { entity_id: self._resolve( meta_entity, "rcsb_entity_source_organism.0.ncbi_scientific_name", str @@ -676,7 +676,7 @@ def entity_source_org(self) -> Dict[int, Optional[str]]: } @property - def entity_source_org_id(self) -> Dict[int, Optional[int]]: + def entity_source_org_id(self) -> Dict[str, Optional[int]]: return { entity_id: self._resolve( meta_entity, "rcsb_entity_source_organism.0.ncbi_taxonomy_id", int @@ -689,7 +689,7 @@ def entity_source_org_id(self) -> Dict[int, Optional[int]]: } @property - def entity_host_org(self) -> Dict[int, Optional[str]]: + def entity_host_org(self) -> Dict[str, Optional[str]]: return { entity_id: self._resolve( meta_entity, "rcsb_entity_host_organism.0.ncbi_scientific_name", str @@ -701,7 +701,7 @@ def entity_host_org(self) -> Dict[int, Optional[str]]: } @property - def entity_host_org_id(self) -> Dict[int, Optional[int]]: + def entity_host_org_id(self) -> Dict[str, Optional[int]]: return { entity_id: self._resolve( meta_entity, "rcsb_entity_host_organism.0.ncbi_taxonomy_id", int @@ -761,21 +761,21 @@ 
def ligands(self) -> str:
         return str.join(",", sorted(set.union(*self.chain_ligands.values())))
 
     @property
-    def entity_chains(self) -> Dict[int, Sequence[str]]:
+    def entity_chains(self) -> Dict[str, Sequence[str]]:
         """
         :return: Mapping from entity id to a list of chains belonging to that entity.
         """
         return self._entity_chains(author=False)
 
     @property
-    def entity_auth_chains(self) -> Dict[int, Sequence[str]]:
+    def entity_auth_chains(self) -> Dict[str, Sequence[str]]:
         """
         :return: Mapping from entity id to a list of chains belonging to that entity,
             using the original author's chain ids.
         """
         return self._entity_chains(author=True)
 
-    def _entity_chains(self, author: bool = False) -> Dict[int, Sequence[str]]:
+    def _entity_chains(self, author: bool = False) -> Dict[str, Sequence[str]]:
         """
         :param author: Whether to use author or canonical chain ids.
         :return: Mapping from entity id to a list of chains belonging to that entity.
@@ -788,7 +788,7 @@ def _entity_chains(self, author: bool = False) -> Dict[str, Sequence[str]]:
         }
 
     @property
-    def chain_entities(self) -> Dict[str, int]:
+    def chain_entities(self) -> Dict[str, str]:
         """
         :return: Mapping from chain id to its entity id.
         """
@@ -823,7 +823,7 @@ def chain_to_auth_chain(self) -> Dict[str, str]:
         return chain_to_auth_chain
 
     @property
-    def entity_sequence(self) -> Dict[int, str]:
+    def entity_sequence(self) -> Dict[str, str]:
         return {
             entity_id: self._resolve(
                 meta_entity, "entity_poly.pdbx_seq_one_letter_code_can", str

From 5de6cf58f770e16eb94eb93ce7cf483d22b6ea73 Mon Sep 17 00:00:00 2001
From: Aviv Rosenberg
Date: Mon, 12 Feb 2024 06:30:24 +0200
Subject: [PATCH 09/37] PDBMetadata: Add extra properties

---
 src/pp5/external_dbs/pdb.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py
index d55a781..8d9312d 100644
--- a/src/pp5/external_dbs/pdb.py
+++ b/src/pp5/external_dbs/pdb.py
@@ -640,7 +640,6 @@ def title(self) -> Optional[str]:
 
     @property
     def description(self) -> Optional[str]:
-        # api_meta_entity["rcsb_polymer_entity"]["pdbx_description"]
         return self._resolve(self._meta_struct, "struct.pdbx_descriptor", str)
 
     @property
@@ -760,6 +759,27 @@ def chain_ligands(self) -> Dict[str, Set[str]]:
     def ligands(self) -> str:
         return str.join(",", sorted(set.union(*self.chain_ligands.values())))
 
+    @property
+    def entity_ids(self) -> Sequence[str]:
+        """
+        :return: The entity ids which exist in the structure.
+        """
+        return tuple(self._meta_entities.keys())
+
+    @property
+    def chain_ids(self) -> Sequence[str]:
+        """
+        :return: The chain ids which exist in the structure.
+        """
+        return tuple(self._meta_chains.keys())
+
+    @property
+    def auth_chain_ids(self) -> Sequence[str]:
+        """
+        :return: The author chain ids which exist in the structure.
+ """ + return tuple(self.chain_to_auth_chain[chain_id] for chain_id in self.chain_ids) + @property def entity_chains(self) -> Dict[str, Sequence[str]]: """ From 4f8ab4e72ba23cdc4adcc65f61792d298871343e Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 06:30:51 +0200 Subject: [PATCH 10/37] Replace PDB2UNP with PDBMetadata --- src/pp5/external_dbs/pdb.py | 330 +----------------------------------- 1 file changed, 5 insertions(+), 325 deletions(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 8d9312d..2c095fb 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -7,18 +7,7 @@ from math import cos, sin from math import degrees as deg from math import radians as rad -from typing import ( - Any, - Set, - Dict, - List, - Tuple, - Union, - TypeVar, - Callable, - Optional, - Sequence, -) +from typing import Any, Set, Dict, List, Tuple, TypeVar, Callable, Optional, Sequence from pathlib import Path from datetime import datetime from itertools import zip_longest @@ -32,7 +21,6 @@ from Bio.PDB.Polypeptide import standard_aa_names from Bio.PDB.PDBExceptions import PDBConstructionWarning, PDBConstructionException -import pp5 from pp5 import PDB_DIR, get_resource_path from pp5.utils import JSONCacheableMixin, remote_dl from pp5.external_dbs import pdb_api @@ -116,6 +104,8 @@ def pdb_download(pdb_id: str, pdb_dir=PDB_DIR, pdb_source: str = PDB_RCSB) -> Pa download_url_template = PDB_DOWNLOAD_SOURCES[pdb_source] if "unp_id" in download_url_template: + pdb_meta = PDBMetadata.from_pdb(pdb_id, cache=True) + # The alphafold source requires downloading the data based on the uniprot id unp_ids = None if not chain_id: @@ -123,9 +113,7 @@ def pdb_download(pdb_id: str, pdb_dir=PDB_DIR, pdb_source: str = PDB_RCSB) -> Pa raise ValueError(f"Chain or entity must be specified for {pdb_source=}") # Obtain uniprot ids from entity (entity -> chain -> unp ids) - entity_chains: dict = PDB2UNP.query_entity_uniprot_ids(pdb_id).get( - entity_id, {} - ) + entity_chains: dict = pdb_meta.entity_uniprot_ids.get(entity_id, {}) if not entity_chains: raise ValueError(f"Failed to obtain chain for {pdb_id}:{entity_id}") chain_id = [*entity_chains.keys()][0] # arbitrary chain from the entity @@ -138,7 +126,7 @@ def pdb_download(pdb_id: str, pdb_dir=PDB_DIR, pdb_source: str = PDB_RCSB) -> Pa return filename # Get uniprot id for this chain (only if we didn't get them from entity) - unp_ids = unp_ids or PDB2UNP.query_chain_uniprot_ids(pdb_id).get(chain_id, []) + unp_ids = unp_ids or pdb_meta.chain_uniprot_ids.get(chain_id, []) if len(unp_ids) != 1: raise ValueError( f"Can't determine unique uniprot id for {pdb_id}:{chain_id}, " @@ -262,314 +250,6 @@ def pdb_to_secondary_structure( return ss_dict, keys -class PDB2UNP(JSONCacheableMixin, object): - """ - Maps PDB IDs (in each chain) to one or more Uniprot IDs which correspond - to that chain, and their locations in the PDB sequence. - """ - - def __init__(self, pdb_id: str): - """ - Initialize a PDB to Uniprot mapping. - :param pdb_id: PDB ID, without chain. Chain will be ignored if specified. - """ - pdb_base_id, _ = split_id(pdb_id) - - # Get all chain Uniprot IDs by querying PDB. This gives us the most - # up-to-date IDs and provides the alignment info between the - # PDB structure's sequence and the Uniprot xref sequence. - # Map is chain -> unp -> [ (s1,e1), (s2, e2), ... 
] - self.chain_to_unp_xrefs = self.query_chain_uniprot_id_alignments(pdb_id) - self.pdb_id = pdb_base_id - - def get_unp_id(self, chain_id: str, strict=True) -> str: - """ - :param chain_id: A chain in the PDB structure. - :param strict: Whether to raise an error (True) or just warn (False) - if the chain cannot be uniquely mapped to a single Uniprot ID. - :return: the first unp id matching the given chain. Usually there's - only one unless the entry is chimeric. - """ - if not chain_id or chain_id.upper() not in self.chain_to_unp_xrefs: - raise ValueError(f"No Uniprot ID for chain {chain_id} of" f" {self.pdb_id}") - - if self.is_chimeric(chain_id): - msg = ( - f"{self.pdb_id} is chimeric at chain {chain_id}, " - f"possible Uniprot IDs: " - f"{self.get_all_chain_unp_ids(chain_id)}." - ) - if strict: - raise ValueError(msg) - LOGGER.warning(f"{msg} Returning first ID.") - - for unp_id in self.chain_to_unp_xrefs[chain_id.upper()]: - return unp_id - - def is_chimeric(self, chain_id: str) -> bool: - """ - :param chain_id: A chain in the PDB structure. - :return: Whether the sequence in the given chain is chimeric, - i.e. is composed of regions from different proteins. - """ - return len(self.chain_to_unp_xrefs[chain_id.upper()]) > 1 - - def get_all_chain_unp_ids(self, chain_id) -> tuple: - """ - :param chain_id: A chain in the PDB structure. - :return: All unp ids matching the given chain. - """ - return tuple(self.chain_to_unp_xrefs[chain_id.upper()].keys()) - - def get_all_unp_ids(self) -> set: - """ - :return: All Uniprot IDs for all chains in the PDB structure. - """ - all_unp_ids = set() - for chain in self.chain_to_unp_xrefs: - all_unp_ids.update(self.get_all_chain_unp_ids(chain)) - return all_unp_ids - - def get_chain_to_unp_ids(self) -> Dict[str, Tuple[str]]: - """ - :return: A mapping from chain it to a sequence of uniprot ids for - that chain. - """ - return {c: tuple(u.keys()) for c, u in self.chain_to_unp_xrefs.items()} - - def save(self, out_dir=pp5.PDB2UNP_DIR) -> Path: - """ - Write the current mapping to a human-readable text file (json) which - can also be loaded later using from_cache. - :param out_dir: Output directory. - :return: The path of the written file. - """ - filename = f"{self.pdb_id}.json" - return self.to_cache(out_dir, filename, indent=None) - - def __getitem__(self, chain_id: str): - """ - :param chain_id: The chain. - :return: Uniprot xrefs for a given chain - """ - return self.chain_to_unp_xrefs[chain_id.upper()] - - def __contains__(self, chain_id: str): - """ - :param chain_id: The chain. - :return: Whether this mapping contains the given chain. - """ - return chain_id.upper() in self.chain_to_unp_xrefs - - def __repr__(self): - return f"PDB2UNP({self.pdb_id})={self.get_chain_to_unp_ids()}" - - @classmethod - def query_chain_uniprot_ids(cls, pdb_id: str) -> Dict[str, Sequence[str]]: - """ - Retrieves all Uniprot IDs associated with a PDB structure chains by querying - the PDB database. - - :param pdb_id: The PDB ID to search for. Chain or entity will be ignored. - :return: a map: chain -> [unp1, unp2, ...] - where unp1, unp2, ... are Uniprot IDs associated with the chain. - :raises pdb_api.PDBAPIException: If there's a problem obtaining the data. - """ - - # entity -> chain -> unp -> [ (s1,e1), ... ] - entity_map = cls.query_entity_uniprot_id_alignments(pdb_id) - - all_chain_map = {} - for entity_id, chain_map in entity_map.items(): - for chain_id, unp_map in chain_map.items(): - # chain -> [unp1, unp2, ...] 
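Each removed PDB2UNP query has a direct PDBMetadata counterpart from PATCH 07, which is what makes this deletion safe. A migration sketch (requires network access; the 4HHB values follow the test suite):

    from pp5.external_dbs import pdb

    meta = pdb.PDBMetadata.from_pdb("4HHB", cache=True)

    # old: PDB2UNP.query_chain_uniprot_ids(pdb_id)
    chain_to_unps = meta.chain_uniprot_ids  # chain -> (unp1, unp2, ...)

    # old: PDB2UNP.pdb_id_to_unp_id(pdb_id)
    unp_id = pdb.PDBMetadata.pdb_id_to_unp_id("4HHB:A")
    assert unp_id == "P69905"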
- all_chain_map[chain_id] = tuple(unp_map.keys()) - - return all_chain_map - - @classmethod - def query_chain_uniprot_id_alignments( - cls, pdb_id: str - ) -> Dict[str, Dict[str, List[Tuple[int, int]]]]: - """ - Retrieves all Uniprot IDs associated with a PDB structure chains by querying - the PDB database. - - :param pdb_id: The PDB ID to search for. Chain or entity will be ignored. - :return: a map: chain -> unp -> [ (s1,e1), ... ] - where (s1,e1) are alignment start,end indices between the UNP and PDB sequences. - :raises pdb_api.PDBAPIException: If there's a problem obtaining the data. - """ - # entity -> chain -> unp -> [ (s1,e1), ... ] - entity_map = cls.query_entity_uniprot_id_alignments(pdb_id) - - all_chain_map = {} - for entity_id, chain_map in entity_map.items(): - for chain_id, unp_map in chain_map.items(): - # chain -> unp -> [ (s1,e1), ... ] - all_chain_map[chain_id] = unp_map - - return all_chain_map - - @classmethod - def query_entity_uniprot_ids( - cls, pdb_id: str - ) -> Dict[str, Dict[str, Sequence[str]]]: - """ - Retrieves all Uniprot IDs associated with a PDB structure entities by querying - the PDB database. - - :param pdb_id: The PDB ID to search for. Chain or entity will be ignored. - :return: a map: entity -> chain ->[unp1, unp2, ...] - where unp1, unp2, ... are Uniprot IDs associated with the entity. - :raises pdb_api.PDBAPIException: If there's a problem obtaining the data. - """ - - # entity -> chain -> unp -> [ (s1,e1), ... ] - entity_map = cls.query_entity_uniprot_id_alignments(pdb_id) - - new_entity_map = defaultdict(dict) - for entity_id, chain_map in entity_map.items(): - for chain_id, unp_map in chain_map.items(): - # entity -> chain -> [unp1, unp2, ...] - new_entity_map[entity_id][chain_id] = tuple(unp_map.keys()) - - return dict(new_entity_map) - - @classmethod - def query_entity_uniprot_id_alignments( - cls, pdb_id: str - ) -> Dict[str, Dict[str, Dict[str, List[Tuple[int, int]]]]]: - """ - Retrieves all Uniprot IDs associated with a PDB structure entities by querying - the PDB database. - - :param pdb_id: The PDB ID to search for. Chain or entity will be ignored. - :return: a map: entity -> chain -> unp -> [ (s1,e1), ...] - where (s1,e1) are alignment start,end indices between the UNP and PDB sequences. - :raises pdb_api.PDBAPIException: If there's a problem obtaining the data. - """ - map_to_unp_ids = {} - - # Make sure we have a base id - pdb_id, _, _ = split_id_with_entity(pdb_id) - - # Get all data for the PDB structure - entry_data = pdb_api.execute_raw_data_query(pdb_id) - entry_containers = entry_data["rcsb_entry_container_identifiers"] - - # Find all polymer entities - entity_ids = entry_containers.get("polymer_entity_ids", []) - for entity_id in entity_ids: - entity_id = str(entity_id) - # Get all data about this entity - entity_data = pdb_api.execute_raw_data_query(pdb_id, entity_id=entity_id) - - # Get list of chains and list of Uniprot IDs for this entity - entity_containers = entity_data["rcsb_polymer_entity_container_identifiers"] - entity_chains = [ - # The same chain can be referred to by different labels, - # the canonical PDB label and another label given by the - # structure author. 
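Likewise, the is_chimeric check that PDB2UNP provided is now expressed directly on the metadata, as PATCH 11 below does in collect.py: a chain is chimeric when more than one Uniprot ID aligns to it. A sketch using 3SG4, the chimeric structure from the tests:

    from pp5.external_dbs import pdb

    meta = pdb.PDBMetadata.from_pdb("3SG4", cache=True)
    # old: PDB2UNP.from_pdb("3SG4").is_chimeric("A")
    assert len(meta.chain_uniprot_ids["A"]) > 1  # 3SG4:A is chimeric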
- *entity_containers.get("asym_ids", []), - *entity_containers.get("auth_asym_ids", []), - ] - entity_unp_ids = entity_containers.get("uniprot_ids", []) - - unp_alignments: Dict[str, List[Tuple[int, int]]] = { - unp_id: [] for unp_id in entity_unp_ids - } - for alignment_entry in entity_data.get("rcsb_polymer_entity_align", []): - if alignment_entry["reference_database_name"].lower() != "uniprot": - continue - - unp_id = alignment_entry["reference_database_accession"] - if unp_id not in unp_alignments: - continue - - for alignment_region in alignment_entry["aligned_regions"]: - align_start = alignment_region["entity_beg_seq_id"] - align_end = align_start + alignment_region["length"] - 1 - unp_alignments[unp_id].append((align_start, align_end)) - - map_to_unp_ids[entity_id] = { - chain_id: unp_alignments for chain_id in entity_chains - } - - return map_to_unp_ids - - @classmethod - def pdb_id_to_unp_id( - cls, - pdb_id: str, - strict=True, - cache=False, - ) -> str: - """ - Given a PDB ID, returns a single Uniprot id for it. - :param pdb_id: PDB ID, with optional chain. If provided chain will - be used. - :param cache: Whether to use cached mapping. - :param strict: Whether to raise an error (True) or just warn (False) - if the PDB ID cannot be uniquely mapped to a single Uniprot ID. - This can happen if: (1) Chain wasn't specified and there are - different Uniprot IDs for different chains (e.g. 4HHB); (2) Chain was - specified but there are multiple Uniprot IDs for the chain - (chimeric entry, e.g. 3SG4:A). - :return: A Uniprot ID. - """ - pdb_base_id, chain_id = split_id(pdb_id) - pdb2unp = cls.from_pdb(pdb_id, cache=cache) - - all_unp_ids = pdb2unp.get_all_unp_ids() - if not all_unp_ids: - raise ValueError(f"No Uniprot entries exist for {pdb_base_id}") - - if not chain_id: - if len(all_unp_ids) > 1: - msg = ( - f"Multiple Uniprot IDs exists for {pdb_base_id}, and no " - f"chain specified." - ) - if strict: - raise ValueError(msg) - LOGGER.warning( - f"{msg} Returning the first Uniprot ID " f"from the first chain." - ) - - for chain_id, unp_ids in pdb2unp.get_chain_to_unp_ids().items(): - return unp_ids[0] - - return pdb2unp.get_unp_id(chain_id, strict=strict) - - @classmethod - def from_pdb(cls, pdb_id: str, cache=False) -> PDB2UNP: - """ - Create a PDB2UNP mapping from a given PDB ID. - :param pdb_id: The PDB ID to map for. Chain will be ignored if present. - :param cache: Whether to load a cached mapping if available. - :return: A PDB2UNP mapping object. 
- """ - pdb_base_id, _ = split_id(pdb_id) - - if cache: - pdb2unp = cls.from_cache(pdb_base_id) - if pdb2unp is not None: - return pdb2unp - - pdb2unp = cls(pdb_id) - pdb2unp.save() - return pdb2unp - - @classmethod - def from_cache( - cls, pdb_id, cache_dir: Union[str, Path] = pp5.PDB2UNP_DIR - ) -> Optional[PDB2UNP]: - pdb_id, _ = split_id(pdb_id) - filename = f"{pdb_id}.json" - return super(PDB2UNP, cls).from_cache(cache_dir, filename) - - _TC = TypeVar("_TC") From 1212201755bc4fb858f48b862ed9ab50a316588f Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 06:32:14 +0200 Subject: [PATCH 11/37] Collect: Use PDBMetadata instead of PDB2UNP --- src/pp5/collect.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 1468a51..119279a 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -1042,8 +1042,8 @@ def _collect_single_structure( pdb_base_id, chain_id, entity_id = pdb.split_id_with_entity(pdb_id) pdb_dict = pdb.pdb_dict(pdb_id, pdb_source=pdb_source) - pdb2unp = pdb.PDB2UNP.from_pdb(pdb_id, cache=True) - meta = pdb.PDBMetadata(pdb_id) + meta = pdb.PDBMetadata.from_pdb(pdb_id, cache=True) + chain_to_unp_ids = meta.chain_uniprot_ids # Determine all chains we need to collect from the PDB structure chains_to_collect: Sequence[str] @@ -1079,16 +1079,16 @@ def _collect_single_structure( pdb_id_full = f"{pdb_base_id}:{chain_id}" # Skip chains with no Uniprot ID - if chain_id not in pdb2unp: + if chain_id not in chain_to_unp_ids: LOGGER.warning(f"No Uniprot ID for {pdb_id_full}") continue # Skip chimeric chains - if pdb2unp.is_chimeric(chain_id): + if len(chain_to_unp_ids[chain_id]) > 1: LOGGER.warning(f"Discarding chimeric chain {pdb_id_full}") continue - unp_id = pdb2unp.get_unp_id(chain_id) + unp_id = chain_to_unp_ids[chain_id][0] seq_len = len(meta.entity_sequence[meta.chain_entities[chain_id]]) # Create a ProteinRecord and save it so it's cached for when we @@ -1097,7 +1097,7 @@ def _collect_single_structure( try: nc = chain_id in string.digits prec = ProteinRecord( - unp_id, + unp_id, # TODO: remove unp_ids here pdb_id_full, pdb_source=pdb_source, pdb_dict=pdb_dict, @@ -1148,7 +1148,7 @@ def _collect_single_structure( msg = ( f"Collected {len(chain_data)} chains from {pdb_id} " - f"{pdb2unp.get_chain_to_unp_ids()} ({idx[0] + 1}/{idx[1]})" + f"{chain_to_unp_ids} ({idx[0] + 1}/{idx[1]})" ) LOGGER.log(level=logging.INFO if len(chain_data) else logging.WARNING, msg=msg) From 9a8ab3f5884d823ef5c9f6d2793193783ce1b3d1 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 06:32:53 +0200 Subject: [PATCH 12/37] prec: Use PDBMetadata instead of PDB2UNP --- src/pp5/prec.py | 68 +++++++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index fda9852..193ec3d 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -562,26 +562,28 @@ def from_pdb( try: # Either chain or entity or none can be provided, but not both pdb_base_id, chain_id, entity_id = pdb.split_id_with_entity(pdb_id) - numeric_chain = False if entity_id: - entity_id = int(entity_id) + entity_id = str(entity_id) - # Discover which chains belong to this entity - pdb_dict = pdb.pdb_dict( - pdb_id, pdb_source=pdb_source, struct_d=pdb_dict - ) - meta = pdb.PDBMetadata(pdb_id) - chain_id = meta.entity_chains[entity_id][0] + meta = pdb.PDBMetadata.from_pdb(pdb_id, cache=cache) + + chain_id = None + if entity_id in meta.entity_ids: + chain_id = 
meta.entity_chains[entity_id][0] if not chain_id: - # In rare cases the chain is a number instead of a letter, - # so there's no way to distinguish between entity id and - # chain except also trying to use our entity as a chain - # and finding the actual entity. See e.g. 4N6V. - if str(entity_id) in meta.chain_entities: + # In rare cases the author chain is a number instead of a letter. + # We check for this, and if it's the case, we use the + # corresponding PDB chain instead. See e.g. 4N6V. + if entity_id in meta.auth_chain_ids: # Chain is number, but use its string representation - chain_id = str(entity_id) - numeric_chain = True + chain_id = next( + iter( + c_id + for c_id, ac_id in (meta.chain_to_auth_chain.items()) + if ac_id == entity_id + ) + ) else: raise ProteinInitError( f"No matching chain found for entity " @@ -600,7 +602,8 @@ def from_pdb( pdb_id, pdb_source=pdb_source, struct_d=pdb_dict ) - unp_id = pdb.PDB2UNP.pdb_id_to_unp_id( + # TODO: Remove need for unp id from init + unp_id = pdb.PDBMetadata.pdb_id_to_unp_id( pdb_id, strict=strict_pdb_xref, cache=cache ) @@ -609,7 +612,6 @@ def from_pdb( pdb_id, pdb_source=pdb_source, pdb_dict=pdb_dict, - numeric_chain=numeric_chain, **kw_for_init, ) if cache_dir: @@ -674,8 +676,6 @@ def __init__( dihedral_est_name: str = None, dihedral_est_args: dict = None, max_ena: int = None, - strict_unp_xref: bool = True, - numeric_chain: bool = False, with_altlocs: bool = True, with_backbone: bool = True, with_contacts: bool = True, @@ -701,10 +701,6 @@ def __init__( :param max_ena: Number of maximal ENA records (containing protein genetic data) to align to the PDB structure of this protein. None means no limit (all cross-refs from Uniprot will be aligned). - :param strict_unp_xref: Whether to require that there exist a PDB - cross-ref for the given Uniprot ID. - :param numeric_chain: Whether the given chain id (if any) is - numeric. In rare cases PDB structures have numbers as chain ids. :param with_altlocs: Whether to include alternate conformations in the protein record. If False, only the default conformation will be used. :param with_backbone: Whether to include backbone atoms in the protein record. @@ -735,25 +731,37 @@ def __init__( if with_altlocs and contact_method == CONTACT_METHOD_ARPEGGIO: raise ValueError(f"Altlocs not supported with {contact_method=}") - self.strict_unp_xref = strict_unp_xref - self.numeric_chain = numeric_chain self.with_altlocs = with_altlocs self.with_backbone = with_backbone self.with_contacts = with_contacts self.contact_radius = contact_radius self.contact_method = contact_method - # Parse the given PDB id - self.pdb_base_id, self.pdb_chain_id, ent_id = pdb.split_id_with_entity(pdb_id) - if numeric_chain: - self.pdb_chain_id = str(ent_id) + # Parse the given PDB id and obtain metadata + self.pdb_base_id, pdb_chain_id, ent_id = pdb.split_id_with_entity(pdb_id) + self.pdb_meta = pdb.PDBMetadata.from_pdb(self.pdb_base_id, cache=True) + + if pdb_chain_id is None: + if ent_id and len(self.pdb_meta.entity_chains.get(ent_id, [])) == 1: + pdb_chain_id = self.pdb_meta.entity_chains[ent_id][0] + elif len(self.pdb_meta.chain_ids) == 1: + pdb_chain_id = next(iter(self.pdb_meta.chain_ids)) + else: + raise ProteinInitError( + f"No chain specified in {pdb_id}, and multiple chains exist." 
+ ) + + self.pdb_chain_id = pdb_chain_id self.pdb_id = f"{self.pdb_base_id}:{self.pdb_chain_id}" + # TODO: Remove need for unp id from init, get it from metadata + if self.unp_id not in self.pdb_meta.chain_uniprot_ids[self.pdb_chain_id]: + raise ProteinInitError(f"Uniprot ID {self.unp_id} not found in {pdb_id}") + self.pdb_source = pdb_source if pdb_dict: self._pdb_dict = pdb_dict - self.pdb_meta = pdb.PDBMetadata(self.pdb_id) if not self.pdb_meta.resolution and self.pdb_source != PDB_AFLD: raise ProteinInitError(f"Unknown resolution for {pdb_id}") From 144278a44fa714ba63b20dea5796b52a65f53456 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Mon, 12 Feb 2024 06:33:16 +0200 Subject: [PATCH 13/37] Update tests --- tests/test_pdb.py | 52 +++++++++++++++++++--------------------------- tests/test_prec.py | 20 +++--------------- 2 files changed, 24 insertions(+), 48 deletions(-) diff --git a/tests/test_pdb.py b/tests/test_pdb.py index 3cd0280..cf5948a 100644 --- a/tests/test_pdb.py +++ b/tests/test_pdb.py @@ -2,6 +2,7 @@ import math import random import string +from pprint import pprint from urllib.request import urlopen import pandas as pd @@ -140,7 +141,7 @@ def test_entity_too_long(self): pdb.split_id(invalid_id) -@pytest.fixture(params=["1MWC:A", "2WUR:A", "4N6V:1"]) +@pytest.fixture(params=["1MWC:A", "2WUR:A", "4N6V:1", "1DWI:A"]) def pdb_id(request): return request.param @@ -177,84 +178,73 @@ def test_exception_chimeric_chain(self): pdb.pdb_download("3SG4:A", pdb_source=pdb.PDB_AFLD) +@pytest.mark.skipif(NO_INTERNET, reason="Needs internet") class TestPDBMetadata: - def test_metadata(self, pdb_id, pdb_source): - meta = pdb.PDBMetadata(pdb_id, pdb_source=pdb_source) + def test_metadata_properties(self, pdb_id): + meta = pdb.PDBMetadata(pdb_id) pdb_base_id, pdb_chain = pdb.split_id(pdb_id) assert meta.pdb_id == pdb_base_id + d = meta.as_dict() # evaluates all metadata properties + pprint(d) -@pytest.mark.skipif(NO_INTERNET, reason="Needs internet") -class TestPDB2UNP: @staticmethod - def _check(pdb_id, expected_unp_id): - actual_unp_id = pdb.PDB2UNP.pdb_id_to_unp_id(pdb_id) + def _check_unp(pdb_id, expected_unp_id): + actual_unp_id = pdb.PDBMetadata.pdb_id_to_unp_id(pdb_id) assert actual_unp_id == expected_unp_id def test_no_chain_single_unp(self): - self._check("102L", "P00720") + self._check_unp("102L", "P00720") def test_with_chain_single_unp(self): - self._check("102L:A", "P00720") + self._check_unp("102L:A", "P00720") def test_no_chain_multi_unp_strict(self): test_id = "4HHB" with pytest.raises(ValueError, match="Multiple Uniprot IDs"): - pdb.PDB2UNP.pdb_id_to_unp_id(test_id) + pdb.PDBMetadata.pdb_id_to_unp_id(test_id) def test_no_chain_multi_unp_not_strict(self): test_id = "4HHB" expected_unp_ids = {"P69905", "P68871"} - actual_unp_id = pdb.PDB2UNP.pdb_id_to_unp_id(test_id, strict=False) + actual_unp_id = pdb.PDBMetadata.pdb_id_to_unp_id(test_id, strict=False) assert actual_unp_id in expected_unp_ids @pytest.mark.parametrize("test_id", ["4HHB:A", "4HHB:C"]) def test_with_chain_multi_unp_1(self, test_id): - self._check(test_id, "P69905") + self._check_unp(test_id, "P69905") @pytest.mark.parametrize("test_id", ["4HHB:B", "4HHB:D"]) def test_with_chain_multi_unp_2(self, test_id): - self._check(test_id, "P68871") + self._check_unp(test_id, "P68871") def test_with_invalid_chain(self): with pytest.raises(ValueError, match="chain Z"): - pdb.PDB2UNP.pdb_id_to_unp_id("4HHB:Z") + pdb.PDBMetadata.pdb_id_to_unp_id("4HHB:Z") @pytest.mark.parametrize("test_id", ["5LTR", "5LTR:A"]) def 
test_with_no_xref_in_file(self, test_id):
-        self._check(test_id, "B1PNC0")
+        self._check_unp(test_id, "B1PNC0")

     @pytest.mark.parametrize("test_id", ["5EJU", "4DXP"])
     def test_with_no_xref_in_file_and_pdb(self, test_id):
         with pytest.raises(ValueError, match="No Uniprot entries"):
-            pdb.PDB2UNP.pdb_id_to_unp_id(test_id)
+            pdb.PDBMetadata.pdb_id_to_unp_id(test_id)

     @pytest.mark.parametrize("test_id", ["3G53", "3G53:A"])
     def test_with_no_struct_ref_entry(self, test_id):
-        self._check(test_id, "P02213")
+        self._check_unp(test_id, "P02213")

     @pytest.mark.parametrize(
         ("test_id", "unp_ids"),
         [("3SG4:A", {"P11799", "P42212", "P0DP29"}), ("4IK8:A", {"K4DIE3", "P42212"})],
     )
     def test_multi_unp_for_single_chain_no_strict(self, test_id, unp_ids):
-        actual_unp_id = pdb.PDB2UNP.pdb_id_to_unp_id(test_id, strict=False)
+        actual_unp_id = pdb.PDBMetadata.pdb_id_to_unp_id(test_id, strict=False)
         assert actual_unp_id in unp_ids

     @pytest.mark.parametrize("test_id", ["3SG4:A", "4IK8:A"])
     def test_multi_unp_for_single_chain_strict(self, test_id):
         with pytest.raises(ValueError, match="chimeric"):
-            pdb.PDB2UNP.pdb_id_to_unp_id(test_id)
-
-    @pytest.mark.parametrize(
-        ("pdb_id", "unp_id"),
-        [("5LTR:A", "B1PNC0"), ("3G53:A", "P02213")],
-    )
-    def test_pdb_source(self, pdb_id, unp_id, pdb_source):
-        p2u = pdb.PDB2UNP.from_pdb(pdb_id)
-        pdb_base_id, chain_id = pdb.split_id(pdb_id)
-        actual_unp_id = p2u.get_unp_id(chain_id, strict=False)
-        # assert len(unps) == 1
-        assert actual_unp_id == unp_id
-        assert p2u.pdb_id == pdb_base_id
+            pdb.PDBMetadata.pdb_id_to_unp_id(test_id)
diff --git a/tests/test_prec.py b/tests/test_prec.py
index cdfd126..55b0468 100644
--- a/tests/test_prec.py
+++ b/tests/test_prec.py
@@ -202,7 +202,7 @@ def test_numerical_chain(self):
         pdb_id = "4N6V:9"
         prec = ProteinRecord.from_pdb(pdb_id)
         assert prec.pdb_base_id == "4N6V"
-        assert prec.pdb_chain_id == "9"
+        assert prec.pdb_chain_id == "J"  # auth chain was converted to pdb chain

     def test_ambiguous_numerical_entity_and_chain(self):
         # In this rare case it's impossible to know if entity or chain!
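
A minimal usage sketch of the metadata-based UniProt lookup exercised by the
updated tests (the PDB and UniProt IDs are taken from the tests above; network
access is assumed, since PDBMetadata queries the PDB REST API):

    from pp5.external_dbs import pdb

    # Unambiguous case: one UniProt ID for the chain.
    assert pdb.PDBMetadata.pdb_id_to_unp_id("102L:A") == "P00720"

    # 4HHB maps to two UniProt IDs (hemoglobin alpha and beta), so a strict
    # lookup without a chain raises; the per-chain mapping disambiguates.
    meta = pdb.PDBMetadata.from_pdb("4HHB", cache=True)
    unp_ids = meta.chain_uniprot_ids["A"]  # per-chain sequence of UniProt IDs
    assert "P69905" in unp_ids
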
@@ -213,7 +213,7 @@ def test_ambiguous_numerical_entity_and_chain(self): pdb_id = "4N6V:1" prec = ProteinRecord.from_pdb(pdb_id) assert prec.pdb_base_id == "4N6V" - assert prec.pdb_chain_id == "0" + assert prec.pdb_chain_id == "A" def test_entity_with_invalid_entity(self): with pytest.raises(ProteinInitError): @@ -268,24 +268,10 @@ def test_init_with_mismatching_pdb_id(self): with pytest.raises(ProteinInitError): ProteinRecord("P00720", "4GY3") - def test_no_strict_xref_with_no_xref_in_pdb(self): - prec = ProteinRecord("Q6LDG3", "3SG4:A", strict_unp_xref=False) - assert prec.unp_id == "Q6LDG3" - assert prec.pdb_id == "3SG4:A" - - def test_no_strict_xref_with_no_xref_in_pdb_and_no_chain(self): - with pytest.raises(ProteinInitError, match="and no chain provided"): - ProteinRecord("Q6LDG3", "3SG4", strict_unp_xref=False) - - def test_strict_xref_with_no_matching_xref_in_pdb(self): + def test_no_matching_xref_in_pdb(self): with pytest.raises(ProteinInitError): ProteinRecord("P42212", "2QLE:A") - def test_no_strict_xref_with_no_matching_xref_in_pdb(self): - prec = ProteinRecord("P42212", "2QLE:A", strict_unp_xref=False) - assert prec.unp_id == "P42212" - assert prec.pdb_id == "2QLE:A" - class TestSave: @classmethod From e0b15907e50a4e9c6b66857602ae64c5866035e2 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:07:27 +0200 Subject: [PATCH 14/37] PDBMetadata: support per-chain conversion to dict --- src/pp5/external_dbs/pdb.py | 74 +++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 2c095fb..a06b1b1 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -332,12 +332,17 @@ def entity_description(self) -> Dict[str, Optional[str]]: } @property - def deposition_date(self) -> Optional[datetime]: - return self._resolve( + def deposition_date(self) -> Optional[str]: + dt = self._resolve( self._meta_struct, "pdbx_database_status.recvd_initial_deposition_date", datetime.fromisoformat, ) + if not dt: + return None + + # Keep only date + return dt.strftime("%Y-%m-%d") @property def entity_source_org(self) -> Dict[str, Optional[str]]: @@ -424,20 +429,24 @@ def cg_temp(self) -> Optional[float]: return self._resolve(self._meta_struct, "exptl_crystal_grow.0.temp", float) @property - def chain_ligands(self) -> Dict[str, Set[str]]: + def chain_ligands(self) -> Dict[str, Sequence[str]]: return { - chain_id: set( - [ - ld.get("ligand_comp_id") - for ld in meta_chain.get("rcsb_ligand_neighbors", []) - ] + chain_id: tuple( + sorted( + set( + [ + ld.get("ligand_comp_id") + for ld in meta_chain.get("rcsb_ligand_neighbors", []) + ] + ) + ) ) for chain_id, meta_chain in self._meta_chains.items() } @property def ligands(self) -> str: - return str.join(",", sorted(set.union(*self.chain_ligands.values()))) + return str.join(",", sorted(set.union(set(), *self.chain_ligands.values()))) @property def entity_ids(self) -> Sequence[str]: @@ -648,12 +657,55 @@ def entity_uniprot_id_alignments( return map_to_unp_ids - def as_dict(self) -> Dict[str, Any]: - return { + def as_dict(self, chain_id: Optional[str] = None) -> Dict[str, Any]: + """ + Returns a dictionary containing all the metadata properties. + + :param chain_id: Optional chain id to filter the metadata for. If provided, + only the metadata relevant to the chain will be returned. + :return: A dictionary containing all the metadata properties. 
+ """ + result_dict = { k: getattr(self, k) for k, v in self.__class__.__dict__.items() if isinstance(v, property) } + if not chain_id: + return result_dict + + if chain_id not in self.chain_ids: + raise ValueError(f"Chain {chain_id} not found in {self.pdb_id}") + + entity_id = self.chain_entities[chain_id] + filtered_result_dict = {} + + for key, value in result_dict.items(): + new_value = None + + # If it's a dict, take value corresponding to the chain + if isinstance(value, dict): + if entity_id in value: + new_value = value[entity_id] + elif chain_id in value: + new_value = value[chain_id] + else: + continue + + # If it's a sequence, drop it + elif isinstance(value, (list, tuple)): + continue + + # Append chain to pdb_id + elif value == self.pdb_id: + new_value = f"{self.pdb_id}:{chain_id}" + + # If it's an internal dict, drop it + if isinstance(new_value, dict): + continue + + filtered_result_dict[key] = value if new_value is None else new_value + + return filtered_result_dict def __repr__(self): return str(self.as_dict()) From f27cc72bd2c46142825af2baa50342ba1eec017a Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:08:32 +0200 Subject: [PATCH 15/37] prec: Remove requirement to initialize with uniprot id --- src/pp5/prec.py | 85 ++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 193ec3d..6533366 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -538,7 +538,7 @@ def from_pdb( pdb_dict=None, cache=False, cache_dir=pp5.PREC_DIR, - strict_pdb_xref=True, + strict_pdb_unp_xref=True, **kw_for_init, ) -> ProteinRecord: """ @@ -554,7 +554,7 @@ def from_pdb( :param cache: Whether to load prec from cache if available. :param cache_dir: Where the cache dir is. ProteinRecords will be written to this folder after creation, unless it's None. - :param strict_pdb_xref: Whether to require that the given PDB ID + :param strict_pdb_unp_xref: Whether to require that the given PDB ID maps uniquely to only one Uniprot ID. :param kw_for_init: Extra kwargs for the ProteinRecord initializer. :return: A ProteinRecord. @@ -602,16 +602,11 @@ def from_pdb( pdb_id, pdb_source=pdb_source, struct_d=pdb_dict ) - # TODO: Remove need for unp id from init - unp_id = pdb.PDBMetadata.pdb_id_to_unp_id( - pdb_id, strict=strict_pdb_xref, cache=cache - ) - prec = cls( - unp_id, pdb_id, pdb_source=pdb_source, pdb_dict=pdb_dict, + strict_pdb_unp_xref=strict_pdb_unp_xref, **kw_for_init, ) if cache_dir: @@ -657,19 +652,18 @@ def from_unp( if prec is not None: return prec - prec = cls(unp_id, pdb_id, **kw_for_init) + prec = cls(pdb_id, **kw_for_init) if cache_dir: prec.save(out_dir=cache_dir) return prec except Exception as e: raise ProteinInitError( - f"Failed to create protein record for " f"unp_id={unp_id}" + f"Failed to create protein record for unp_id={unp_id}" ) from e def __init__( self, - unp_id: str, # TODO: Get this from metadata pdb_id: str, pdb_source: str = PDB_RCSB, pdb_dict: dict = None, @@ -680,15 +674,15 @@ def __init__( with_backbone: bool = True, with_contacts: bool = True, with_codons: bool = True, + strict_pdb_unp_xref: bool = True, contact_method: str = CONTACT_METHOD_NEIGHBOR, contact_radius: float = CONTACT_DEFAULT_RADIUS, ): """ Don't call this directly. Use class methods from_pdb or from_unp instead. - Initialize a protein record from both Uniprot and PDB ids. + Initialize a protein record from PDB id. - :param unp_id: Uniprot id which uniquely identifies the protein. 
:param pdb_id: PDB id with chain (e.g. '1ABC:D') of the specific structure chain desired. :param pdb_source: Source from which to obtain the pdb file. @@ -706,39 +700,21 @@ def __init__( :param with_backbone: Whether to include backbone atoms in the protein record. :param with_contacts: Whether to calculate per-residue contacts. :param with_codons: Whether to assign codons to each residue. + :param strict_pdb_unp_xref: Whether to require that the given PDB ID + maps uniquely to only one Uniprot ID. :param contact_method: Method for calculating contacts. Options are: 'ns' for neighbor search; 'arp' for arpeggio. :param contact_radius: Radius for calculating contacts. """ - if not (unp_id and pdb_id): - raise ProteinInitError("Must provide both Uniprot and PDB IDs") + if not pdb_id: + raise ProteinInitError("Must provide PDB ID") - unp_id = unp_id.upper() - LOGGER.info(f"{unp_id}: Initializing protein record...") self.__setstate__({}) - self.unp_id = unp_id - rec_unp_id = self.unp_rec.accessions[0] - if rec_unp_id != unp_id: - LOGGER.warning(f"Replacing outdated UNP ID: {unp_id} -> {rec_unp_id}") - self.unp_id = rec_unp_id - - if contact_method not in CONTACT_METHODS: - raise ValueError( - f"Unknown {contact_method=}, must be one of {CONTACT_METHODS}" - ) - - if with_altlocs and contact_method == CONTACT_METHOD_ARPEGGIO: - raise ValueError(f"Altlocs not supported with {contact_method=}") - - self.with_altlocs = with_altlocs - self.with_backbone = with_backbone - self.with_contacts = with_contacts - self.contact_radius = contact_radius - self.contact_method = contact_method - # Parse the given PDB id and obtain metadata self.pdb_base_id, pdb_chain_id, ent_id = pdb.split_id_with_entity(pdb_id) + + LOGGER.info(f"{self.pdb_base_id}: Obtaining metadata...") self.pdb_meta = pdb.PDBMetadata.from_pdb(self.pdb_base_id, cache=True) if pdb_chain_id is None: @@ -754,9 +730,38 @@ def __init__( self.pdb_chain_id = pdb_chain_id self.pdb_id = f"{self.pdb_base_id}:{self.pdb_chain_id}" - # TODO: Remove need for unp id from init, get it from metadata - if self.unp_id not in self.pdb_meta.chain_uniprot_ids[self.pdb_chain_id]: - raise ProteinInitError(f"Uniprot ID {self.unp_id} not found in {pdb_id}") + LOGGER.info(f"{self.pdb_id}: Constructing protein record...") + + # Obtain UniProt ID for the given PDB chain + chain_unp_ids = self.pdb_meta.chain_uniprot_ids[self.pdb_chain_id] + if not chain_unp_ids: + raise ProteinInitError(f"No Uniprot ID found for chain {self.pdb_chain_id}") + if len(chain_unp_ids) > 1: + msg = f"Multiple UNP IDs for chain {self.pdb_chain_id}: {chain_unp_ids}" + if strict_pdb_unp_xref: + raise ProteinInitError(msg) + else: + LOGGER.warning(msg) + + self.unp_id = chain_unp_ids[0] + rec_unp_id = self.unp_rec.accessions[0] + if rec_unp_id != self.unp_id: + LOGGER.warning(f"Replacing outdated UNP ID: {self.unp_id} -> {rec_unp_id}") + self.unp_id = rec_unp_id + + if contact_method not in CONTACT_METHODS: + raise ValueError( + f"Unknown {contact_method=}, must be one of {CONTACT_METHODS}" + ) + + if with_altlocs and contact_method == CONTACT_METHOD_ARPEGGIO: + raise ValueError(f"Altlocs not supported with {contact_method=}") + + self.with_altlocs = with_altlocs + self.with_backbone = with_backbone + self.with_contacts = with_contacts + self.contact_radius = contact_radius + self.contact_method = contact_method self.pdb_source = pdb_source if pdb_dict: From 6b42eead1a8be3fca99a8a268fe05c95b69964ac Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:09:39 +0200 Subject: [PATCH 
16/37] pgroup: Updates for prec init --- src/pp5/pgroup.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/pp5/pgroup.py b/src/pp5/pgroup.py index 3f4602d..002f63b 100644 --- a/src/pp5/pgroup.py +++ b/src/pp5/pgroup.py @@ -190,8 +190,7 @@ def __init__( angle_aggregation="circ", compare_contacts: bool = False, strict_codons: bool = True, - strict_pdb_xref: bool = True, - strict_unp_xref: bool = False, + strict_pdb_unp_xref: bool = True, parallel: bool = True, ): """ @@ -213,7 +212,7 @@ def __init__( Where A, B are matching residues and X, Y are context residues. - :param ref_pdb_id: Reference structure PDB ID. + :param ref_pdb_id: Reference structure PDB ID with chain. :param query_pdb_ids: List of PDB IDs of query structures. :param pdb_source: Source from which to obtain the pdb file. :param match_len: Number of residues to include in a match. Can be either 1 @@ -243,10 +242,8 @@ def __init__( potential matches. :param strict_codons: Whether to require that a codon assignment for each AA exists and is un-ambiguous. - :param strict_pdb_xref: Whether to require that the given PDB ID + :param strict_pdb_unp_xref: Whether to require that the given PDB ID and chain maps uniquely to only one Uniprot ID. - :param strict_unp_xref: Whether to require that there exist a PDB - cross-ref for the given Uniprot ID. :param parallel: Whether to process query structures in parallel using the global worker process pool. """ @@ -259,7 +256,7 @@ def __init__( ) ref_pdb_dict = pdb.pdb_dict(self.ref_pdb_id, pdb_source=pdb_source) - ref_pdb_meta = pdb.PDBMetadata(self.ref_pdb_base_id) + ref_pdb_meta = pdb.PDBMetadata.from_pdb(self.ref_pdb_base_id, cache=True) if self.ref_pdb_chain not in ref_pdb_meta.chain_entities: raise ProteinInitError(f"Unknown PDB entity for {self.ref_pdb_id}") @@ -281,8 +278,7 @@ def __init__( self.prec_cache = prec_cache self.compare_contacts = compare_contacts self.strict_codons = strict_codons - self.strict_pdb_xref = strict_pdb_xref - self.strict_unp_xref = strict_unp_xref + self.strict_pdb_unp_xref = strict_pdb_unp_xref # Only one of these is relevant if pdb_source == PDB_AFLD: @@ -352,8 +348,7 @@ def sort_key(q_pdb_id: str): self.ref_pdb_id, pdb_source=self.pdb_source, cache=self.prec_cache, - strict_pdb_xref=self.strict_pdb_xref, - strict_unp_xref=self.strict_unp_xref, + strict_pdb_unp_xref=strict_pdb_unp_xref, pdb_dict=ref_pdb_dict, with_contacts=self.compare_contacts, ) @@ -766,8 +761,7 @@ def _align_query_residues_to_ref_inner( q_pdb_id, pdb_source=self.pdb_source, cache=self.prec_cache, - strict_pdb_xref=self.strict_pdb_xref, - strict_unp_xref=self.strict_unp_xref, + strict_pdb_unp_xref=self.strict_pdb_unp_xref, with_contacts=self.compare_contacts, ) except ProteinInitError as e: From 5d7ec62c557e5aa830585b6a1c5e44d1c4a0ea11 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:09:57 +0200 Subject: [PATCH 17/37] pgroup: Improved metadata handling --- src/pp5/pgroup.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/pp5/pgroup.py b/src/pp5/pgroup.py index 002f63b..2d7b8d8 100644 --- a/src/pp5/pgroup.py +++ b/src/pp5/pgroup.py @@ -640,27 +640,15 @@ def to_struct_dataframe(self) -> pd.DataFrame: { "unp_id": q_prec.unp_id, "pdb_id": q_prec.pdb_id, - "resolution": q_prec.pdb_meta.resolution, "struct_rmse": q_alignment.rmse, "n_stars": q_alignment.n_stars, "seq_len": len(q_alignment.ungapped_seq_2), # seq2 is query - "description": q_prec.pdb_meta.description, - "src_org": 
q_prec.pdb_meta.src_org, - "src_org_id": q_prec.pdb_meta.src_org_id, - "host_org": q_prec.pdb_meta.host_org, - "host_org_id": q_prec.pdb_meta.host_org_id, - "ligands": q_prec.pdb_meta.ligands, - "space_group": q_prec.pdb_meta.space_group, - "r_free": q_prec.pdb_meta.r_free, - "r_work": q_prec.pdb_meta.r_work, - "cg_ph": q_prec.pdb_meta.cg_ph, - "cg_temp": q_prec.pdb_meta.cg_temp, + "ref_group": q_prec.unp_id == self.ref_prec.unp_id, + **q_prec.pdb_meta.as_dict(chain_id=q_prec.pdb_chain_id), } ) df = pd.DataFrame(data) - df["ref_group"] = df["unp_id"] == self.ref_prec.unp_id - df = df.astype({"src_org_id": "Int32", "host_org_id": "Int32"}) df.sort_values( by=["ref_group", "unp_id", "struct_rmse"], ascending=[False, True, True], From bdf6759e0c1099998ad411e802eb8a2282fb4c58 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:30:30 +0200 Subject: [PATCH 18/37] collect: Update prec init and metadata handling --- src/pp5/collect.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 119279a..4460f51 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -1050,9 +1050,8 @@ def _collect_single_structure( if chain_id is not None: # If we got a single chain, use only that chains_to_collect = (chain_id,) - elif entity_id is not None: - entity_id = int(entity_id) + elif entity_id is not None: # If we got an entity id, discover all corresponding chains chains_to_collect = tuple( chain_id @@ -1077,9 +1076,11 @@ def _collect_single_structure( chain_data = [] for chain_id in chains_to_collect: pdb_id_full = f"{pdb_base_id}:{chain_id}" + entity_id = meta.chain_entities[chain_id] + seq_len = len(meta.entity_sequence[entity_id]) # Skip chains with no Uniprot ID - if chain_id not in chain_to_unp_ids: + if chain_id not in chain_to_unp_ids or not chain_to_unp_ids[chain_id]: LOGGER.warning(f"No Uniprot ID for {pdb_id_full}") continue @@ -1089,20 +1090,15 @@ def _collect_single_structure( continue unp_id = chain_to_unp_ids[chain_id][0] - seq_len = len(meta.entity_sequence[meta.chain_entities[chain_id]]) # Create a ProteinRecord and save it so it's cached for when we # create the pgroups. Only collect structures for which we can # create a prec (e.g. they must have a DNA sequence). 
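
As a sketch of the per-chain metadata flattening that this patch merges into
each collected record below (via as_dict with seq_to_str=True; the record keys
shown are illustrative, the exact columns are whatever PDBMetadata exposes as
properties):

    from pp5.external_dbs import pdb
    from pp5.collect import COL_PDB_ID, COL_UNP_ID

    meta = pdb.PDBMetadata.from_pdb("102L", cache=True)
    record = {
        COL_UNP_ID: "P00720",
        COL_PDB_ID: "102L:A",
        # dict-valued properties are resolved for chain A; sequence-valued
        # properties are joined into comma-separated strings
        **meta.as_dict(chain_id="A", seq_to_str=True),
    }
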
try: - nc = chain_id in string.digits prec = ProteinRecord( - unp_id, # TODO: remove unp_ids here pdb_id_full, pdb_source=pdb_source, pdb_dict=pdb_dict, - strict_unp_xref=False, - numeric_chain=nc, with_altlocs=with_altlocs, with_backbone=with_backbone, with_contacts=with_contacts, @@ -1118,8 +1114,8 @@ def _collect_single_structure( except Exception as e: LOGGER.warning( - f"Failed to create ProteinRecord for " - f"({unp_id}, {pdb_id}), will not collect: {e}" + f"Failed to create ProteinRecord for {pdb_id} ({unp_id=}), " + f"will not collect: {e}" ) continue @@ -1128,21 +1124,11 @@ def _collect_single_structure( COL_UNP_ID: prec.unp_id, COL_PDB_ID: prec.pdb_id, COL_ENA_ID: prec.ena_id, - COL_RESOLUTION: meta.resolution, COL_SEQ_LEN: seq_len, COL_SEQ_GAPS: str.join(";", [f"{s}-{e}" for (s, e) in prec.seq_gaps]), - COL_DESCRIPTION: meta.description, - COL_DEPOSITION_DATE: meta.deposition_date, - COL_SRC_ORG: meta.src_org, - COL_HOST_ORG: meta.host_org, COL_NUM_ALTLOCS: prec.num_altlocs, - COL_LIGANDS: meta.ligands, - COL_R_FREE: meta.r_free, - COL_R_WORK: meta.r_work, - COL_SPACE_GROUP: meta.space_group, - COL_CG_PH: meta.cg_ph, - COL_CG_TEMP: meta.cg_temp, COL_PDB_SOURCE: pdb_source, + **meta.as_dict(chain_id=chain_id, seq_to_str=True), } ) From 8ab585709e9c750f53cb4100e886de8cd5ca7ada Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:30:55 +0200 Subject: [PATCH 19/37] PDBMetadata: Improve dict conversion --- src/pp5/external_dbs/pdb.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index a06b1b1..73b2aec 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -657,12 +657,16 @@ def entity_uniprot_id_alignments( return map_to_unp_ids - def as_dict(self, chain_id: Optional[str] = None) -> Dict[str, Any]: + def as_dict( + self, chain_id: Optional[str] = None, seq_to_str: bool = False + ) -> Dict[str, Any]: """ Returns a dictionary containing all the metadata properties. :param chain_id: Optional chain id to filter the metadata for. If provided, only the metadata relevant to the chain will be returned. + :param seq_to_str: Whether to convert sequences to a string, joined by ','. + Useful for writing metadata. :return: A dictionary containing all the metadata properties. 
""" result_dict = { @@ -680,9 +684,9 @@ def as_dict(self, chain_id: Optional[str] = None) -> Dict[str, Any]: filtered_result_dict = {} for key, value in result_dict.items(): - new_value = None + new_value = value - # If it's a dict, take value corresponding to the chain + # If original value is a dict, take value corresponding to the chain if isinstance(value, dict): if entity_id in value: new_value = value[entity_id] @@ -691,7 +695,7 @@ def as_dict(self, chain_id: Optional[str] = None) -> Dict[str, Any]: else: continue - # If it's a sequence, drop it + # If original value is a sequence, drop it elif isinstance(value, (list, tuple)): continue @@ -699,11 +703,15 @@ def as_dict(self, chain_id: Optional[str] = None) -> Dict[str, Any]: elif value == self.pdb_id: new_value = f"{self.pdb_id}:{chain_id}" - # If it's an internal dict, drop it + # If internal value is a dict, drop it if isinstance(new_value, dict): continue - filtered_result_dict[key] = value if new_value is None else new_value + # If internal value is a sequence, maybe convert it to a string + elif isinstance(new_value, (list, tuple)) and seq_to_str: + new_value = str.join(",", new_value) + + filtered_result_dict[key] = new_value return filtered_result_dict From bd42b15644eecaf39a676d7cc631a84abf4ded8a Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:31:12 +0200 Subject: [PATCH 20/37] pgroup: Improve metadata writing --- src/pp5/pgroup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pp5/pgroup.py b/src/pp5/pgroup.py index 2d7b8d8..16fdd9a 100644 --- a/src/pp5/pgroup.py +++ b/src/pp5/pgroup.py @@ -644,7 +644,9 @@ def to_struct_dataframe(self) -> pd.DataFrame: "n_stars": q_alignment.n_stars, "seq_len": len(q_alignment.ungapped_seq_2), # seq2 is query "ref_group": q_prec.unp_id == self.ref_prec.unp_id, - **q_prec.pdb_meta.as_dict(chain_id=q_prec.pdb_chain_id), + **q_prec.pdb_meta.as_dict( + chain_id=q_prec.pdb_chain_id, seq_to_str=True + ), } ) From 4a1aafc666ebdaa1d84359171a526c517bc8cf2d Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Tue, 13 Feb 2024 06:31:36 +0200 Subject: [PATCH 21/37] Update tests --- tests/test_pdb.py | 22 +++++++++++++++++----- tests/test_pgroup.py | 4 ++-- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/test_pdb.py b/tests/test_pdb.py index cf5948a..ca23c03 100644 --- a/tests/test_pdb.py +++ b/tests/test_pdb.py @@ -141,7 +141,7 @@ def test_entity_too_long(self): pdb.split_id(invalid_id) -@pytest.fixture(params=["1MWC:A", "2WUR:A", "4N6V:1", "1DWI:A"]) +@pytest.fixture(scope="class", params=["1MWC:A", "2WUR:A", "4N6V:1", "1DWI:A"]) def pdb_id(request): return request.param @@ -180,15 +180,27 @@ def test_exception_chimeric_chain(self): @pytest.mark.skipif(NO_INTERNET, reason="Needs internet") class TestPDBMetadata: - def test_metadata_properties(self, pdb_id): - meta = pdb.PDBMetadata(pdb_id) + @pytest.fixture(scope="class") + def metadata(self, pdb_id): + return pdb.PDBMetadata(pdb_id) + def test_metadata_properties(self, metadata, pdb_id): pdb_base_id, pdb_chain = pdb.split_id(pdb_id) - assert meta.pdb_id == pdb_base_id + assert metadata.pdb_id == pdb_base_id - d = meta.as_dict() # evaluates all metadata properties + def test_as_dict(self, metadata): + d = metadata.as_dict() # evaluates all metadata properties pprint(d) + @pytest.mark.parametrize( + "seq_to_str", [False, True], ids=["seq_to_str=False", "seq_to_str=True"] + ) + def test_as_dict_chain(self, metadata, seq_to_str): + for chain_id in metadata.chain_ids: + d = 
metadata.as_dict(chain_id=chain_id, seq_to_str=seq_to_str) + print(f" === {chain_id=} === ") + pprint(d) + @staticmethod def _check_unp(pdb_id, expected_unp_id): actual_unp_id = pdb.PDBMetadata.pdb_id_to_unp_id(pdb_id) diff --git a/tests/test_pgroup.py b/tests/test_pgroup.py index d394ac3..9b7c9ed 100644 --- a/tests/test_pgroup.py +++ b/tests/test_pgroup.py @@ -1,11 +1,11 @@ import pytest from pp5.pgroup import ProteinGroup -from pp5.external_dbs.pdb import PDB_DOWNLOAD_SOURCES +from pp5.external_dbs.pdb import PDB_AFLD, PDB_RCSB, PDB_REDO class TestFromPDBRef(object): - @pytest.mark.parametrize("pdb_source", PDB_DOWNLOAD_SOURCES.keys()) + @pytest.mark.parametrize("pdb_source", [PDB_RCSB, PDB_REDO]) @pytest.mark.parametrize("match_len", [2, 1]) def test_default(self, match_len, pdb_source): pgroup = ProteinGroup.from_pdb_ref( From 9c04ca98f85cfbb5859cd5d24614294e1bdeca6d Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:56:21 +0200 Subject: [PATCH 22/37] collect: refactor filename constants --- src/pp5/collect.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 4460f51..90ebb41 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -58,6 +58,13 @@ COL_REJECTED_BY = "rejected_by" COL_NUM_ALTLOCS = "num_altlocs" +COLLECTION_METADATA_FILENAME = "meta.json" +ALL_STRUCTS_FILENAME = "meta-structs_all" +FILTERED_STRUCTS_FILENAME = "meta-structs_filtered" +REJECTED_STRUCTS_FILENAME = "meta-structs_rejected" +BLAST_SCORES_FILENAME = "meta-blast_scores" +DATASET_DIRNAME = "data-precs" + @dataclass(repr=False) class CollectorStep: @@ -189,7 +196,7 @@ def _finalize_collection(self, pool: Pool): return # Create a metadata file in the output dir based on the step results - meta_filepath = self.out_dir.joinpath("meta.json") + meta_filepath = self.out_dir.joinpath(COLLECTION_METADATA_FILENAME) meta = self._collection_meta meta["steps"] = [str(s) for s in self._collection_steps] with open(str(meta_filepath), "w", encoding="utf-8") as f: @@ -304,11 +311,6 @@ def __repr__(self): class ProteinRecordCollector(ParallelDataCollector): DEFAULT_PREC_INIT_ARGS = dict() - ALL_STRUCTS_FILENAME = "meta-structs_all" - FILTERED_STRUCTS_FILENAME = "meta-structs_filtered" - REJECTED_STRUCTS_FILENAME = "meta-structs_rejected" - BLAST_SCORES_FILENAME = "meta-blast_scores" - DATASET_DIRNAME = "data-precs" def __init__( self, @@ -425,7 +427,7 @@ def __init__( self.entity_single_chain = entity_single_chain # Unique output dir for this collection run - self.prec_csv_out_dir = self.out_dir / self.DATASET_DIRNAME + self.prec_csv_out_dir = self.out_dir / DATASET_DIRNAME self.prec_csv_out_dir.mkdir(parents=True, exist_ok=True) def __repr__(self): @@ -470,7 +472,7 @@ def _collect_precs(self, pool: Pool): n_collected = len(df_all) self._out_filepaths.append( - _write_df_csv(df_all, self.out_dir, self.ALL_STRUCTS_FILENAME) + _write_df_csv(df_all, self.out_dir, ALL_STRUCTS_FILENAME) ) meta["n_collected"] = n_collected @@ -486,7 +488,7 @@ def _filter_collected(self, pool: Pool) -> dict: Filters collected structures according to conditions on their metadata. """ - df_all: pd.DataFrame = _read_df_csv(self.out_dir, self.ALL_STRUCTS_FILENAME) + df_all: pd.DataFrame = _read_df_csv(self.out_dir, ALL_STRUCTS_FILENAME) # A boolean series representing which structures to keep. 
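
To make the refactor concrete, a short sketch of how the new module-level
constants map onto a collection run's output directory (the run directory
name is hypothetical; the ".csv" suffix is an assumption consistent with the
collector tests added later in this series):

    from pathlib import Path
    from pp5.collect import (
        DATASET_DIRNAME,
        ALL_STRUCTS_FILENAME,
        COLLECTION_METADATA_FILENAME,
    )

    out_dir = Path("out/prec-collected/2024-02-14-tag")  # hypothetical run dir
    precs_dir = out_dir / DATASET_DIRNAME                 # data-precs/
    meta_file = out_dir / COLLECTION_METADATA_FILENAME    # meta.json
    all_structs = out_dir / f"{ALL_STRUCTS_FILENAME}.csv" # meta-structs_all.csv
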
filter_idx = pd.Series(data=[True] * len(df_all), index=df_all.index) rejected_counts = {"total": 0} @@ -516,7 +518,7 @@ def _update_rejected_counts(filter_name: str, idx: pd.Series): # Write the filtered structures df_filtered = df_all[filter_idx] self._out_filepaths.append( - _write_df_csv(df_filtered, self.out_dir, self.FILTERED_STRUCTS_FILENAME) + _write_df_csv(df_filtered, self.out_dir, FILTERED_STRUCTS_FILENAME) ) # Write the rejected structures and specify which filter rejected them @@ -526,7 +528,7 @@ def _update_rejected_counts(filter_name: str, idx: pd.Series): df_rejected.loc[rejected_idx, COL_REJECTED_BY] = filter_name df_rejected = df_rejected[~filter_idx] self._out_filepaths.append( - _write_df_csv(df_rejected, self.out_dir, self.REJECTED_STRUCTS_FILENAME) + _write_df_csv(df_rejected, self.out_dir, REJECTED_STRUCTS_FILENAME) ) return { @@ -582,7 +584,7 @@ def _filter_redundant_unps(self, pool: Pool, df_all: pd.DataFrame) -> pd.Series: ) self._out_filepaths.append( _write_df_csv( - df_blast_scores, self.out_dir, self.BLAST_SCORES_FILENAME, index=True + df_blast_scores, self.out_dir, BLAST_SCORES_FILENAME, index=True ) ) From d005cd7b015ecbf3a0a566fc5eddafc38cc569a1 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:56:49 +0200 Subject: [PATCH 23/37] ResidueContacts: Add comparison operators --- src/pp5/contacts.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/pp5/contacts.py b/src/pp5/contacts.py index 92889b5..5749787 100644 --- a/src/pp5/contacts.py +++ b/src/pp5/contacts.py @@ -366,6 +366,14 @@ def _join(s): return d + def __eq__(self, other): + if not isinstance(other, ResidueContacts): + return False + return self.as_dict() == other.as_dict() + + def __hash__(self): + return hash(tuple(self.as_dict().values())) + class Arpeggio(object): """ From d1517af1fd82661a33d163281b08eb0bce28d375 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:57:23 +0200 Subject: [PATCH 24/37] prec: fix comparison operators --- src/pp5/prec.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 6533366..b88980c 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -201,16 +201,33 @@ def __eq__(self, other): return True if not isinstance(other, ResidueRecord): return False + + def _compare(a, b): + eq = True + if isinstance(a, (float, np.ndarray)): + eq = np.allclose(a, b, equal_nan=True) + + elif isinstance(a, dict): + for key, val in a.items(): + # to handle dict that contains ndarrays + eq = _compare(val, b.get(key)) + if not eq: + break + else: + eq = a == b + + return eq + for k, v in self.__dict__.items(): other_v = other.__dict__.get(k, math.inf) - if isinstance(v, (float, np.ndarray)): - equal = np.allclose(v, other_v, equal_nan=True) - else: - equal = v == other_v + equal = _compare(v, other_v) if not equal: return False return True + def __hash__(self): + return hash(tuple(self.as_dict().items())) + class AltlocNameMap(dict): """ From caf468f881329fb3d498279b5334b273c33f13a6 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:57:43 +0200 Subject: [PATCH 25/37] prec: bugfix in from_cache --- src/pp5/prec.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index b88980c..97dd1c8 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -610,7 +610,9 @@ def from_pdb( pdb_id = f"{pdb_base_id}:{chain_id}" if cache and chain_id: - prec = cls.from_cache(pdb_id, 
cache_dir=cache_dir) + prec = cls.from_cache( + pdb_id, cache_dir=cache_dir, pdb_source=pdb_source + ) if prec is not None: return prec @@ -640,6 +642,7 @@ def from_unp( cls, unp_id: str, cache=False, + pdb_source: str = PDB_RCSB, cache_dir=pp5.PREC_DIR, xref_selector: Callable[[unp.UNPPDBXRef], Any] = None, **kw_for_init, @@ -651,6 +654,7 @@ def from_unp( :param xref_selector: Sort key for PDB cross refs. If None, resolution will be used. :param cache: Whether to load prec from cache if available. + :param pdb_source: Source from which to obtain the pdb file. :param cache_dir: Where the cache dir is. ProteinRecords will be written to this folder after creation, unless it's None. :param kw_for_init: Extra args for the ProteinRecord initializer. @@ -665,7 +669,9 @@ def from_unp( pdb_id = f"{xrefs[0].pdb_id}:{xrefs[0].chain_id}" if cache: - prec = cls.from_cache(pdb_id, cache_dir=cache_dir) + prec = cls.from_cache( + pdb_id, cache_dir=cache_dir, pdb_source=pdb_source + ) if prec is not None: return prec From 2da8957d1b854bbbfbd96d41cc804a69432d7b29 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:57:57 +0200 Subject: [PATCH 26/37] prec: update tests --- tests/test_prec.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/tests/test_prec.py b/tests/test_prec.py index 55b0468..668d73a 100644 --- a/tests/test_prec.py +++ b/tests/test_prec.py @@ -226,14 +226,14 @@ def test_invalid_pdbid(self): def test_multiple_unp_ids_for_same_pdb_chain_no_strict_pdb_xref(self): prec = ProteinRecord.from_pdb( "3SG4:A", - strict_pdb_xref=False, + strict_pdb_unp_xref=False, ) assert prec.unp_id == "P42212" assert prec.pdb_id == "3SG4:A" prec = ProteinRecord.from_pdb( "3SG4", - strict_pdb_xref=False, + strict_pdb_unp_xref=False, ) assert prec.unp_id == "P42212" assert prec.pdb_id == "3SG4:A" @@ -250,27 +250,26 @@ class TestInit: def test_init_no_chain(self): unp_id = "P00720" pdb_id = "102L" - prec = ProteinRecord(unp_id, pdb_id) - assert prec.unp_id == "P00720" + prec = ProteinRecord(pdb_id) + assert prec.unp_id == unp_id assert prec.pdb_id == f"{pdb_id}:A" + def test_init_no_chain_ambiguous(self): + pdb_id = "4HHB" + with pytest.raises(ProteinInitError, match="multiple chains"): + _ = ProteinRecord(pdb_id) + def test_init_with_chain(self): unp_id = "P00720" pdb_id = "102L:A" - prec = ProteinRecord(unp_id, pdb_id) - assert prec.unp_id == "P00720" + prec = ProteinRecord(pdb_id) + assert prec.unp_id == unp_id assert prec.pdb_id == pdb_id - def test_init_with_mismatching_pdb_id(self): - with pytest.raises(ProteinInitError): - ProteinRecord("P00720", "2WUR:A") - - with pytest.raises(ProteinInitError): - ProteinRecord("P00720", "4GY3") - - def test_no_matching_xref_in_pdb(self): - with pytest.raises(ProteinInitError): - ProteinRecord("P42212", "2QLE:A") + def test_init_with_no_pdb_id(self): + for invalid_pdb_id in ["", None]: + with pytest.raises(ProteinInitError, match="provide PDB ID"): + ProteinRecord(pdb_id=invalid_pdb_id) class TestSave: @@ -296,15 +295,13 @@ def cache_dir(self): @pytest.mark.parametrize("pdb_id", ["1MWC:A", "4N6V:1"]) @pytest.mark.parametrize("pdb_source", tuple(PDB_DOWNLOAD_SOURCES)) - def test_from_pdb_with_cache(self, pdb_id, pdb_source, cache_dir, with_altlocs): + def test_from_pdb_with_cache(self, pdb_id, pdb_source, cache_dir): cache_dir = cache_dir / f"{pdb_source}" prec = ProteinRecord.from_pdb( pdb_id, pdb_source=pdb_source, cache=True, cache_dir=cache_dir, - strict_unp_xref=False, - with_altlocs=with_altlocs, 
) filename = f"{prec.pdb_id.replace(':', '_')}-{pdb_source}.prec" From 22f7cefa46e15764bdd54d5181220e518ebb115b Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 05:58:16 +0200 Subject: [PATCH 27/37] collect: basic tests for prec collector --- tests/test_collect.py | 71 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 tests/test_collect.py diff --git a/tests/test_collect.py b/tests/test_collect.py new file mode 100644 index 0000000..526376a --- /dev/null +++ b/tests/test_collect.py @@ -0,0 +1,71 @@ +from random import randint +from pathlib import Path + +import pytest + +import pp5 +from tests import get_tmp_path +from pp5.collect import ( + DATASET_DIRNAME, + ALL_STRUCTS_FILENAME, + COLLECTION_METADATA_FILENAME, + ProteinRecordCollector, +) + + +class TestPrecCollector(object): + @pytest.fixture(scope="class") + def collection_nproc(self): + return 4 + + @pytest.fixture(scope="class") + def collection_out_dir(self): + return get_tmp_path("prec-collected-tests") + + @pytest.fixture(scope="class") + def collection_out_tag(self): + return f"tag-{randint(0, 1000)}" + + @pytest.fixture(scope="class") + def collection_result( + self, collection_nproc, collection_out_dir, collection_out_tag + ): + pp5.set_config("MAX_PROCESSES", collection_nproc) + + collector = ProteinRecordCollector( + resolution=0.75, + with_altlocs=True, + with_contacts=True, + with_backbone=True, + entity_single_chain=False, + seq_similarity_thresh=1.0, + write_zip=True, + out_dir=collection_out_dir, + out_tag=collection_out_tag, + ) + + return collector.collect() + + def test_collection_result( + self, collection_result, collection_out_dir, collection_out_tag + ): + assert collection_result["n_collected"] > 10 + assert collection_result["n_query_results"] > 10 + assert collection_result["n_entries"] > 2000 + assert collection_result["out_tag"] == collection_out_tag + for step_result in collection_result["steps"]: + assert "SUCCESS" in step_result + + out_dir = Path(collection_result["out_dir"]) + assert out_dir.is_dir() + assert out_dir.is_relative_to(collection_out_dir) + + assert (out_dir / DATASET_DIRNAME).is_dir() + assert (out_dir / COLLECTION_METADATA_FILENAME).is_file() + assert (out_dir / f"{ALL_STRUCTS_FILENAME}.csv").is_file() + + collection_id = out_dir.name + assert (out_dir / f"{collection_id}.zip").is_file() + + csv_files = tuple((out_dir / DATASET_DIRNAME).glob("*.csv")) + assert collection_result["n_collected_filtered"] == len(csv_files) From 4340aee1075b387ad2a93be79d5cf8a8478edd77 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 06:10:13 +0200 Subject: [PATCH 28/37] pdb_api: fix a test --- tests/test_pdb_api.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_pdb_api.py b/tests/test_pdb_api.py index 58b4d90..30a67bc 100644 --- a/tests/test_pdb_api.py +++ b/tests/test_pdb_api.py @@ -151,9 +151,14 @@ def test_search_structure_name(self): query_value=pdb_base_id, return_type=pdb_api.PDBQuery.ReturnType.CHAIN ) results = query.execute() + + # Some additional related structures will be returned + assert len(results) >= 4 + filtered_results = [r for r in results if r.startswith(pdb_base_id)] + # This structure has 4 chains - assert len(results) == 4 - for result in results: + assert len(filtered_results) == 4 + for result in filtered_results: pdb_id, chain_id, entity_id = split_id_with_entity(result) assert pdb_id == pdb_base_id assert chain_id From 6c89f1889400163023293c544396423dcf54d61e 
Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Wed, 14 Feb 2024 06:11:04 +0200 Subject: [PATCH 29/37] pyproject: configure pytest traceback length --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3a48e92..5568b99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,9 @@ addopts = [ # Show durations of slowest tests. "--durations=10", # Force colored output even on CI - "--color=yes" + "--color=yes", + # Traceback verbosity + "--tb=short" ] testpaths = [ From 8033d151c9b9eabd97c6b7caf39c60bfec343d36 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 05:52:29 +0200 Subject: [PATCH 30/37] JSONCacheableMixin: Update to automatically generate filenames --- src/pp5/utils.py | 105 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 7 deletions(-) diff --git a/src/pp5/utils.py b/src/pp5/utils.py index 38eac76..3b9525c 100644 --- a/src/pp5/utils.py +++ b/src/pp5/utils.py @@ -2,11 +2,14 @@ import sys import gzip import json +import pickle import random +import hashlib import logging import contextlib +from abc import abstractmethod from json import JSONEncoder -from typing import Any, Union, Callable +from typing import Any, Dict, Union, Callable, Optional from pathlib import Path from datetime import datetime, timedelta from collections.abc import Set, Mapping, Sequence @@ -310,6 +313,28 @@ def sort_key(kv: tuple): return {k: v for k, v in sorted(d.items(), key=sort_key)} +def stable_hash(obj: Any, hash_len: int = 8) -> str: + """ + Generates a stable hash for general python objects, as a hexadecimal string. Stable + means that the exact-same input will produce exactly the same output, even across + machines and processes. The provided object must be pickleable. + + :param obj: A python object. Must be pickle-able. + :param hash_len: Desired length of hash string. + :return: A string of the requested length comprised of hexadecimal digits, + representing a number which is the hash value. + """ + if hash_len < 2: + raise ValueError(f"Invalid {hash_len=}, must be > 1") + + def _hash(bytelike: bytes) -> str: + return hashlib.blake2b(bytelike, digest_size=hash_len // 2).hexdigest() + + obj_bytes: bytes = pickle.dumps(obj) + + return _hash(obj_bytes) + + class JSONCacheableMixin(object): """ Makes a class cacheable to JSON. @@ -321,8 +346,48 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) + @classmethod + @abstractmethod + def cache_dir(cls) -> Path: + """ + :return: The directory to which files will be cached. + """ + pass + + @abstractmethod + def cache_attribs(self) -> Dict[str, Any]: + """ + :return: The attributes which determine the cache filename. + """ + pass + + @classmethod + def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: + """ + Generates the prefix of the cache filename. + :param cache_attribs: Attributes which determine the cache filename. + :return: The prefix of the cache filename. + """ + return cls.__name__.lower() + + @classmethod + def _cache_filename(cls, cache_attribs: Dict[str, Any]) -> str: + """ + Generates the cache filename. + :param cache_attribs: The attributes which determine the cache filename. + :return: The cache filename. 
+ """ + return ( + f"{cls._cache_filename_prefix(cache_attribs=cache_attribs)}" + "-" + f"{stable_hash(sort_dict(cache_attribs,by_value=False))}.json" + ) + def to_cache( - self, cache_dir: Union[str, Path], filename: Union[str, Path], **json_kws + self, + cache_dir: Optional[Union[str, Path]] = None, + filename: Optional[Union[str, Path]] = None, + **json_kws, ) -> Path: """ Write the object to a human-readable text file (json) which @@ -331,24 +396,50 @@ def to_cache( :param filename: Cached file name (without directory). :return: The path of the written file. """ + if cache_dir is None: + cache_dir = self.cache_dir() + if filename is None: + filename = self._cache_filename(self.cache_attribs()) + filepath = pp5.get_resource_path(cache_dir, filename) os.makedirs(str(filepath.parent), exist_ok=True) with filelock_context(filepath): with open(str(filepath), "w", encoding="utf-8") as f: - json.dump(self.__getstate__(), f, **json_kws) - - LOGGER.info(f"Wrote {self} to {filepath}") + json.dump(self.__getstate__(), f, indent=2, **json_kws) + + file_size = os.path.getsize(filepath) + file_size_str = ( + f"{file_size / 1024:.1f}kB" + if file_size < 1024 * 1024 + else f"{file_size / 1024 / 1024:.1f}MB" + ) + LOGGER.info(f"Wrote cache file: {filepath} ({file_size_str})") return filepath @classmethod - def from_cache(cls, cache_dir: Union[str, Path], filename: Union[str, Path]): + def from_cache( + cls, + cache_dir: Optional[Union[str, Path]] = None, + cache_attribs: Optional[Dict[str, Any]] = None, + filename: Optional[Union[str, Path]] = None, + ): """ Load the object from a cached file. :param cache_dir: Directory of cached file. - :param filename: Cached file name (without directory). + :param cache_attribs: Attributes which determine the cache filename. + :param filename: Cached filename (without directory). Won't be used if + cache_attribs is given. :return: The loaded object, or None if the file doesn't exist. 
""" + if not (cache_attribs or filename): + raise ValueError("cache_attribs or filename must be given") + + if cache_dir is None: + cache_dir = cls.cache_dir() + + if filename is None: + filename = cls._cache_filename(cache_attribs) filepath = pp5.get_resource_path(cache_dir, filename) From b9f3aaa97049dd95f1f460557bc16f80c64d54af Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 05:53:56 +0200 Subject: [PATCH 31/37] PDBMetadata: Implement caching --- src/pp5/__init__.py | 6 +++--- src/pp5/external_dbs/pdb.py | 32 +++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/pp5/__init__.py b/src/pp5/__init__.py index e79ea57..b6b8a11 100644 --- a/src/pp5/__init__.py +++ b/src/pp5/__init__.py @@ -27,7 +27,7 @@ ENV_PP5_UNP_DIR = "UNP_DIR" ENV_PP5_ENA_DIR = "ENA_DIR" ENV_PP5_PREC_DIR = "PREC_DIR" -ENV_PP5_PDB2UNP_DIR = "PDB2UNP_DIR" +ENV_PP5_PDB_METADATA_DIR = "PDB_METADATA_DIR" ENV_PP5_ALIGNMENT_DIR = "ALIGNMENT_DIR" ENV_PP5_BLASTDB_DIR = "BLASTDB_DIR" @@ -137,8 +137,8 @@ def set_config(key: str, value: Any): # Directory for ProteinRecords PREC_DIR = Path(os.getenv(ENV_PP5_PREC_DIR, data_subdir("prec"))) -# Directory for PDB to UNP mappings -PDB2UNP_DIR = Path(os.getenv(ENV_PP5_PDB2UNP_DIR, data_subdir("pdb2unp"))) +# Directory for PDB metadata +PDB_METADATA_DIR = Path(os.getenv(ENV_PP5_PDB_METADATA_DIR, data_subdir("pdb_meta"))) # Directory for Structural Alignments ALIGNMENT_DIR = Path(os.getenv(ENV_PP5_ALIGNMENT_DIR, data_subdir("align"))) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 73b2aec..291c1f3 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -21,7 +21,7 @@ from Bio.PDB.Polypeptide import standard_aa_names from Bio.PDB.PDBExceptions import PDBConstructionWarning, PDBConstructionException -from pp5 import PDB_DIR, get_resource_path +from pp5 import PDB_DIR, PDB_METADATA_DIR, get_resource_path from pp5.utils import JSONCacheableMixin, remote_dl from pp5.external_dbs import pdb_api @@ -310,6 +310,24 @@ def _resolve( return meta + @classmethod + def cache_dir(cls) -> Path: + return PDB_METADATA_DIR + + @classmethod + def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: + pdb_id = cache_attribs["pdb_id"] + return f"{super()._cache_filename_prefix(cache_attribs)}-{pdb_id}" + + def cache_attribs(self) -> Dict[str, Any]: + return {"pdb_id": self.pdb_id} + + def __eq__(self, other): + return self.pdb_id == other.pdb_id + + def __hash__(self): + return hash(self.pdb_id) + @property def pdb_id(self) -> str: return self._pdb_id @@ -728,14 +746,14 @@ def from_pdb(cls, pdb_id: str, cache=False) -> PDBMetadata: """ pdb_base_id, _ = split_id(pdb_id) - # TODO: Implement caching - # if cache: - # pdb_meta = cls.from_cache(pdb_base_id) - # if pdb_meta is not None: - # return pdb_meta + if cache: + pdb_meta = cls.from_cache(cache_attribs={"pdb_id": pdb_base_id}) + if pdb_meta is not None: + return pdb_meta pdb_meta = cls(pdb_id) - # pdb_meta.save() + if cache: + pdb_meta.to_cache() return pdb_meta @classmethod From aad46a1f52f750f0c91f859cb7460d3c4aeb2877 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 05:56:23 +0200 Subject: [PATCH 32/37] StructuralAlignment: Update caching --- src/pp5/align.py | 81 ++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 47 deletions(-) diff --git a/src/pp5/align.py b/src/pp5/align.py index aad2910..c11fb34 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -12,7 +12,7 @@ import 
warnings import contextlib import subprocess -from typing import Tuple, Union, Iterable, Optional +from typing import Any, Dict, Tuple, Union, Iterable, Optional from pathlib import Path from datetime import datetime, timedelta @@ -213,36 +213,6 @@ def ungapped_seq_2(self): """ return self.ungap(self.aligned_seq_2) - def save(self, out_dir=pp5.ALIGNMENT_DIR) -> Path: - """ - Write the alignment to a human-readable text file (json) which - can also be loaded later using from_cache. - :param out_dir: Output directory. - :return: The path of the written file. - """ - filename = self._cache_filename( - self.pdb_id_1, - self.pdb_id_2, - self.pdb_source, - self.outlier_rejection_cutoff, - self.backbone_only, - ) - return self.to_cache(out_dir, filename, indent=2) - - @staticmethod - def _cache_filename( - pdb_id_1: str, - pdb_id_2: str, - pdb_source: str, - outlier_rejection_cutoff: float, - backbone_only, - ) -> str: - pdb_ids = f"{pdb_id_1}-{pdb_id_2}".replace(":", "_").upper() - config = f"cutoff={int(outlier_rejection_cutoff*10)}_bb={backbone_only}" - basename = f"{pdb_ids}_{config}" - filename = f"{basename}-{pdb_source}.json" - return filename - @staticmethod def ungap(seq: str) -> str: """ @@ -269,33 +239,50 @@ def __eq__(self, other): return self.__dict__ == other.__dict__ @classmethod - def from_cache( - cls, - pdb_id_1: str, - pdb_id_2: str, - pdb_source: str = PDB_RCSB, - cache_dir: Union[str, Path] = pp5.ALIGNMENT_DIR, - **kw_for_init, - ) -> Optional[StructuralAlignment]: - filename = cls._cache_filename(pdb_id_1, pdb_id_2, pdb_source, **kw_for_init) - return super(StructuralAlignment, cls).from_cache(cache_dir, filename) + def cache_dir(cls) -> Path: + return pp5.ALIGNMENT_DIR + + def cache_attribs(self) -> Dict[str, Any]: + return dict( + pdb_id_1=self.pdb_id_1, + pdb_id_2=self.pdb_id_2, + pdb_source=self.pdb_source, + outlier_rejection_cutoff=self.outlier_rejection_cutoff, + backbone_only=self.backbone_only, + ) + + @classmethod + def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: + pdb_id_1 = cache_attribs["pdb_id_1"] + pdb_id_2 = cache_attribs["pdb_id_2"] + pdb_ids = f"{pdb_id_1}-{pdb_id_2}".replace(":", "_").upper() + return f"{super()._cache_filename_prefix(cache_attribs)}-{pdb_ids}" @classmethod def from_pdb( cls, - pdb_id1: str, - pdb_id2: str, + pdb_id_1: str, + pdb_id_2: str, pdb_source: str = PDB_RCSB, + outlier_rejection_cutoff: float = 2.0, + backbone_only=False, cache=False, - **kw_for_init, ): + kws = dict( + pdb_id_1=pdb_id_1, + pdb_id_2=pdb_id_2, + pdb_source=pdb_source, + outlier_rejection_cutoff=outlier_rejection_cutoff, + backbone_only=backbone_only, + ) if cache: - sa = cls.from_cache(pdb_id1, pdb_id2, pdb_source, **kw_for_init) + sa = cls.from_cache(cache_attribs=kws) if sa is not None: return sa - sa = cls(pdb_id1, pdb_id2, pdb_source, **kw_for_init) - sa.save() + sa = cls(**kws) + if cache: + sa.to_cache() return sa From 4fb69cf2f5ca4149c4a26bf074fa8c00dacf69e7 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 05:56:48 +0200 Subject: [PATCH 33/37] Update tests --- tests/__init__.py | 4 ++-- tests/test_align.py | 27 ++++++++------------------- tests/test_pdb.py | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index f69ed54..dceb1a8 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -5,8 +5,8 @@ from pp5 import ( ENV_PP5_DATA_DIR, ENV_PP5_PREC_DIR, - ENV_PP5_PDB2UNP_DIR, ENV_PP5_ALIGNMENT_DIR, + ENV_PP5_PDB_METADATA_DIR, ) TEST_RESOURCES_PATH 
= pathlib.Path(os.path.dirname(__file__)).joinpath("resources") @@ -37,7 +37,7 @@ def get_tmp_path(name: str, clear=True): # In the tests, we dont want to save data files generated by own code os.environ[ENV_PP5_PREC_DIR] = str(get_tmp_path("data/prec")) -os.environ[ENV_PP5_PDB2UNP_DIR] = str(get_tmp_path("data/pdb2unp")) +os.environ[ENV_PP5_PDB_METADATA_DIR] = str(get_tmp_path("data/pdb_meta")) os.environ[ENV_PP5_ALIGNMENT_DIR] = str(get_tmp_path("data/align")) # Remove imported pp5 so that its init runs again and updates the paths using the diff --git a/tests/test_align.py b/tests/test_align.py index 56720fd..a3bb1c4 100644 --- a/tests/test_align.py +++ b/tests/test_align.py @@ -47,34 +47,23 @@ def test_outlier_rejection_cutoff_example(self): def test_cache(self, backbone_only, outlier_rejection_cutoff, pdb_source): pdb1, pdb2 = "4NE4:A", "5TEU:A" - # Should not exist in cache - sa_cached = StructuralAlignment.from_cache( - pdb1, - pdb2, + kws = dict( + pdb_id_1=pdb1, + pdb_id_2=pdb2, pdb_source=pdb_source, backbone_only=backbone_only, outlier_rejection_cutoff=outlier_rejection_cutoff, ) + + # Should not exist in cache + sa_cached = StructuralAlignment.from_cache(cache_attribs=kws) assert sa_cached is None # Should be created and saved to cache - sa = StructuralAlignment.from_pdb( - pdb1, - pdb2, - cache=True, - pdb_source=pdb_source, - backbone_only=backbone_only, - outlier_rejection_cutoff=outlier_rejection_cutoff, - ) + sa = StructuralAlignment.from_pdb(**kws, cache=True) # Should exist in cache - sa_cached = StructuralAlignment.from_cache( - pdb1, - pdb2, - pdb_source=pdb_source, - backbone_only=backbone_only, - outlier_rejection_cutoff=outlier_rejection_cutoff, - ) + sa_cached = StructuralAlignment.from_cache(cache_attribs=kws) assert sa_cached is not None # Cached version should be the same diff --git a/tests/test_pdb.py b/tests/test_pdb.py index ca23c03..5c51bb6 100644 --- a/tests/test_pdb.py +++ b/tests/test_pdb.py @@ -192,6 +192,21 @@ def test_as_dict(self, metadata): d = metadata.as_dict() # evaluates all metadata properties pprint(d) + def test_cache(self, metadata): + path = metadata.to_cache() + cache_attrs = metadata.cache_attribs() + assert path.exists() + assert path.is_file() + metadata_ = pdb.PDBMetadata.from_cache(cache_attribs=cache_attrs) + + assert metadata == metadata_ + + @pytest.mark.parametrize("cache", [True, False], ids=["cache=True", "cache=False"]) + def test_from_pdb(self, pdb_id, cache): + pdb_base_id, chain_id = pdb.split_id(pdb_id) + metadata = pdb.PDBMetadata.from_pdb(pdb_id, cache=cache) + assert metadata.pdb_id == pdb_base_id + @pytest.mark.parametrize( "seq_to_str", [False, True], ids=["seq_to_str=False", "seq_to_str=True"] ) From 1fbed95caa5e17f97aab49a2a2d00e4348800a74 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 09:45:06 +0200 Subject: [PATCH 34/37] Refactor caching --- src/pp5/align.py | 11 ++- src/pp5/cache.py | 170 ++++++++++++++++++++++++++++++++++++ src/pp5/collect.py | 3 +- src/pp5/external_dbs/pdb.py | 11 ++- src/pp5/utils.py | 144 +----------------------------- 5 files changed, 183 insertions(+), 156 deletions(-) create mode 100644 src/pp5/cache.py diff --git a/src/pp5/align.py b/src/pp5/align.py index c11fb34..a58eb40 100644 --- a/src/pp5/align.py +++ b/src/pp5/align.py @@ -32,7 +32,8 @@ from Bio.Align.Applications import ClustalOmegaCommandline import pp5 -from pp5.utils import JSONCacheableMixin, out_redirected +from pp5.cache import Cacheable, CacheSettings +from pp5.utils import out_redirected from 
pp5.external_dbs import pdb # Suppress messages from pymol upon import @@ -151,11 +152,13 @@ def multiseq_align( return msa_result -class StructuralAlignment(JSONCacheableMixin, object): +class StructuralAlignment(Cacheable, object): """ Represents a Structural Alignment between two protein structures. """ + _CACHE_SETTINGS = CacheSettings(cache_dir=pp5.ALIGNMENT_DIR) + def __init__( self, pdb_id_1: str, @@ -238,10 +241,6 @@ def __eq__(self, other): return False return self.__dict__ == other.__dict__ - @classmethod - def cache_dir(cls) -> Path: - return pp5.ALIGNMENT_DIR - def cache_attribs(self) -> Dict[str, Any]: return dict( pdb_id_1=self.pdb_id_1, diff --git a/src/pp5/cache.py b/src/pp5/cache.py new file mode 100644 index 0000000..f49639a --- /dev/null +++ b/src/pp5/cache.py @@ -0,0 +1,170 @@ +import os +import json +import logging +from abc import abstractmethod +from json import JSONEncoder +from typing import Any, Dict, Union, Optional +from pathlib import Path +from dataclasses import dataclass + +import pp5 +from pp5.utils import sort_dict, stable_hash, filelock_context + +CACHE_FORMAT_JSON = "json" +CACHE_FORMAT_PICKLE = "pkl" +CACHE_FORMATS = {CACHE_FORMAT_JSON, CACHE_FORMAT_PICKLE} + + +LOGGER = logging.getLogger(__name__) + + +@dataclass +class CacheSettings: + """ + Settings for caching objects to file. + """ + + cache_dir: Path + cache_format: str = CACHE_FORMAT_JSON + cache_compression: bool = False + + def __post_init__(self): + if self.cache_format not in CACHE_FORMATS: + raise ValueError(f"Invalid {self.cache_format=}") + + def __str__(self): + return f"{self.cache_format}{'-compressed' if self.cache_compression else ''}" + + +class Cacheable(object): + """ + Makes a class cacheable to file. + """ + + # Subclasses may override this with the desired settings. + _CACHE_SETTINGS = CacheSettings(cache_dir=pp5.data_subdir("cache")) + + def __getstate__(self): + return self.__dict__.copy() + + def __setstate__(self, state): + self.__dict__.update(state) + + @abstractmethod + def cache_attribs(self) -> Dict[str, Any]: + """ + :return: The attributes which determine the cache filename. + """ + pass + + @classmethod + def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: + """ + Generates the prefix of the cache filename. + :param cache_attribs: Attributes which determine the cache filename. + :return: The prefix of the cache filename. + """ + return cls.__name__.lower() + + @classmethod + def _cache_filename(cls, cache_attribs: Dict[str, Any]) -> str: + """ + Generates the cache filename. + :param cache_attribs: The attributes which determine the cache filename. + :return: The cache filename. + """ + return ( + f"{cls._cache_filename_prefix(cache_attribs=cache_attribs)}" + "-" + f"{stable_hash(sort_dict(cache_attribs,by_value=False))}.json" + ) + + def to_cache( + self, + cache_dir: Optional[Union[str, Path]] = None, + filename: Optional[Union[str, Path]] = None, + **json_kws, + ) -> Path: + """ + Write the object to a human-readable text file (json) which + can also be loaded later using from_cache. + :param cache_dir: Directory of cached files. + :param filename: Cached file name (without directory). + :return: The path of the written file. 
+ """ + if cache_dir is None: + cache_dir = self._CACHE_SETTINGS.cache_dir + if filename is None: + filename = self._cache_filename(self.cache_attribs()) + + filepath = pp5.get_resource_path(cache_dir, filename) + os.makedirs(str(filepath.parent), exist_ok=True) + + with filelock_context(filepath): + with open(str(filepath), "w", encoding="utf-8") as f: + json.dump(self.__getstate__(), f, indent=2, **json_kws) + + file_size = os.path.getsize(filepath) + file_size_str = ( + f"{file_size / 1024:.1f}kB" + if file_size < 1024 * 1024 + else f"{file_size / 1024 / 1024:.1f}MB" + ) + LOGGER.info(f"Wrote cache file: {filepath} ({file_size_str})") + return filepath + + @classmethod + def from_cache( + cls, + cache_dir: Optional[Union[str, Path]] = None, + cache_attribs: Optional[Dict[str, Any]] = None, + filename: Optional[Union[str, Path]] = None, + ): + """ + Load the object from a cached file. + :param cache_dir: Directory of cached file. + :param cache_attribs: Attributes which determine the cache filename. + :param filename: Cached filename (without directory). Won't be used if + cache_attribs is given. + :return: The loaded object, or None if the file doesn't exist. + """ + if not (cache_attribs or filename): + raise ValueError("cache_attribs or filename must be given") + + if cache_dir is None: + cache_dir = cls._CACHE_SETTINGS.cache_dir + + if filename is None: + filename = cls._cache_filename(cache_attribs) + + filepath = pp5.get_resource_path(cache_dir, filename) + + obj = None + + with filelock_context(filepath): + if filepath.is_file(): + try: + with open(str(filepath), "r", encoding="utf-8") as f: + state_dict = json.load(f) + obj = cls.__new__(cls) + obj.__setstate__(state_dict) + except Exception as e: + LOGGER.warning( + f"Failed to load cached {cls.__name__} {filepath} {e}" + ) + return obj + + +class ReprJSONEncoder(JSONEncoder): + """ + A JSONEncoder that converts an object to it's representation string in + case it's not serializable. + """ + + def default(self, o: Any) -> Any: + try: + return repr(o) + except Exception as e: + pass + # Let the base class default method raise the TypeError + return JSONEncoder.default(self, o) diff --git a/src/pp5/collect.py b/src/pp5/collect.py index 90ebb41..942d4ed 100644 --- a/src/pp5/collect.py +++ b/src/pp5/collect.py @@ -24,7 +24,8 @@ import pp5.parallel from pp5.prec import ProteinRecord from pp5.align import ProteinBLAST -from pp5.utils import ReprJSONEncoder, ProteinInitError, elapsed_seconds_to_dhms +from pp5.cache import ReprJSONEncoder +from pp5.utils import ProteinInitError, elapsed_seconds_to_dhms from pp5.pgroup import ProteinGroup from pp5.external_dbs import pdb, unp, pdb_api from pp5.external_dbs.pdb import PDB_RCSB diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 291c1f3..9bd1fc6 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -22,7 +22,8 @@ from Bio.PDB.PDBExceptions import PDBConstructionWarning, PDBConstructionException from pp5 import PDB_DIR, PDB_METADATA_DIR, get_resource_path -from pp5.utils import JSONCacheableMixin, remote_dl +from pp5.cache import Cacheable, CacheSettings +from pp5.utils import remote_dl from pp5.external_dbs import pdb_api PDB_ID_PATTERN = re.compile( @@ -253,11 +254,13 @@ def pdb_to_secondary_structure( _TC = TypeVar("_TC") -class PDBMetadata(JSONCacheableMixin): +class PDBMetadata(Cacheable): """ Obtains and parses metadata from a PDB structure using PDB REST API. 
""" + _CACHE_SETTINGS = CacheSettings(cache_dir=PDB_METADATA_DIR) + def __init__(self, pdb_id: str): """ :param pdb_id: The PDB ID of the structure. No chain. @@ -310,10 +313,6 @@ def _resolve( return meta - @classmethod - def cache_dir(cls) -> Path: - return PDB_METADATA_DIR - @classmethod def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: pdb_id = cache_attribs["pdb_id"] diff --git a/src/pp5/utils.py b/src/pp5/utils.py index 3b9525c..93073bd 100644 --- a/src/pp5/utils.py +++ b/src/pp5/utils.py @@ -1,15 +1,12 @@ import os import sys import gzip -import json import pickle import random import hashlib import logging import contextlib -from abc import abstractmethod -from json import JSONEncoder -from typing import Any, Dict, Union, Callable, Optional +from typing import Any, Union, Callable from pathlib import Path from datetime import datetime, timedelta from collections.abc import Set, Mapping, Sequence @@ -335,144 +332,5 @@ def _hash(bytelike: bytes) -> str: return _hash(obj_bytes) -class JSONCacheableMixin(object): - """ - Makes a class cacheable to JSON. - """ - - def __getstate__(self): - return self.__dict__.copy() - - def __setstate__(self, state): - self.__dict__.update(state) - - @classmethod - @abstractmethod - def cache_dir(cls) -> Path: - """ - :return: The directory to which files will be cached. - """ - pass - - @abstractmethod - def cache_attribs(self) -> Dict[str, Any]: - """ - :return: The attributes which determine the cache filename. - """ - pass - - @classmethod - def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str: - """ - Generates the prefix of the cache filename. - :param cache_attribs: Attributes which determine the cache filename. - :return: The prefix of the cache filename. - """ - return cls.__name__.lower() - - @classmethod - def _cache_filename(cls, cache_attribs: Dict[str, Any]) -> str: - """ - Generates the cache filename. - :param cache_attribs: The attributes which determine the cache filename. - :return: The cache filename. - """ - return ( - f"{cls._cache_filename_prefix(cache_attribs=cache_attribs)}" - "-" - f"{stable_hash(sort_dict(cache_attribs,by_value=False))}.json" - ) - - def to_cache( - self, - cache_dir: Optional[Union[str, Path]] = None, - filename: Optional[Union[str, Path]] = None, - **json_kws, - ) -> Path: - """ - Write the object to a human-readable text file (json) which - can also be loaded later using from_cache. - :param cache_dir: Directory of cached files. - :param filename: Cached file name (without directory). - :return: The path of the written file. - """ - if cache_dir is None: - cache_dir = self.cache_dir() - if filename is None: - filename = self._cache_filename(self.cache_attribs()) - - filepath = pp5.get_resource_path(cache_dir, filename) - os.makedirs(str(filepath.parent), exist_ok=True) - - with filelock_context(filepath): - with open(str(filepath), "w", encoding="utf-8") as f: - json.dump(self.__getstate__(), f, indent=2, **json_kws) - - file_size = os.path.getsize(filepath) - file_size_str = ( - f"{file_size / 1024:.1f}kB" - if file_size < 1024 * 1024 - else f"{file_size / 1024 / 1024:.1f}MB" - ) - LOGGER.info(f"Wrote cache file: {filepath} ({file_size_str})") - return filepath - - @classmethod - def from_cache( - cls, - cache_dir: Optional[Union[str, Path]] = None, - cache_attribs: Optional[Dict[str, Any]] = None, - filename: Optional[Union[str, Path]] = None, - ): - """ - Load the object from a cached file. - :param cache_dir: Directory of cached file. 
- :param cache_attribs: Attributes which determine the cache filename. - :param filename: Cached filename (without directory). Won't be used if - cache_attribs is given. - :return: The loaded object, or None if the file doesn't exist. - """ - if not (cache_attribs or filename): - raise ValueError("cache_attribs or filename must be given") - - if cache_dir is None: - cache_dir = cls.cache_dir() - - if filename is None: - filename = cls._cache_filename(cache_attribs) - - filepath = pp5.get_resource_path(cache_dir, filename) - - obj = None - - with filelock_context(filepath): - if filepath.is_file(): - try: - with open(str(filepath), "r", encoding="utf-8") as f: - state_dict = json.load(f) - obj = cls.__new__(cls) - obj.__setstate__(state_dict) - except Exception as e: - LOGGER.warning( - f"Failed to load cached {cls.__name__} {filepath} {e}" - ) - return obj - - -class ReprJSONEncoder(JSONEncoder): - """ - A JSONEncoder that converts an object to it's representation string in - case it's not serializable. - """ - - def default(self, o: Any) -> Any: - try: - return repr(o) - except Exception as e: - pass - # Let the base class default method raise the TypeError - return JSONEncoder.default(self, o) - - class ProteinInitError(ValueError): pass From 539aa22a657612ebf59f45006104a6036d4493fb Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 16 Feb 2024 13:36:41 +0200 Subject: [PATCH 35/37] Cacheable: Add support for compression --- src/pp5/cache.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/pp5/cache.py b/src/pp5/cache.py index f49639a..08300a9 100644 --- a/src/pp5/cache.py +++ b/src/pp5/cache.py @@ -5,6 +5,7 @@ from json import JSONEncoder from typing import Any, Dict, Union, Optional from pathlib import Path +from zipfile import ZIP_DEFLATED, ZipFile from dataclasses import dataclass import pp5 @@ -104,6 +105,16 @@ def to_cache( with open(str(filepath), "w", encoding="utf-8") as f: json.dump(self.__getstate__(), f, indent=2, **json_kws) + if self._CACHE_SETTINGS.cache_compression: + zip_filepath = filepath.with_suffix(".zip") + with ZipFile( + zip_filepath, "w", compression=ZIP_DEFLATED, compresslevel=6 + ) as fzip: + fzip.write(str(filepath), arcname=filename) + + filepath.unlink() + filepath = zip_filepath + file_size = os.path.getsize(filepath) file_size_str = ( f"{file_size / 1024:.1f}kB" @@ -142,6 +153,11 @@ def from_cache( obj = None with filelock_context(filepath): + zip_filepath = filepath.with_suffix(".zip") + if cls._CACHE_SETTINGS.cache_compression and zip_filepath.is_file(): + with ZipFile(zip_filepath, "r") as fzip: + fzip.extractall(path=zip_filepath.parent) + if filepath.is_file(): try: with open(str(filepath), "r", encoding="utf-8") as f: @@ -152,6 +168,9 @@ def from_cache( LOGGER.warning( f"Failed to load cached {cls.__name__} {filepath} {e}" ) + finally: + if cls._CACHE_SETTINGS.cache_compression: + filepath.unlink() return obj From 8dd13229f856db4ff6c5a08ffab04c3ce0ead9b3 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 23 Feb 2024 04:25:45 +0200 Subject: [PATCH 36/37] PDBMetadata: Use compression for cache --- src/pp5/external_dbs/pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pp5/external_dbs/pdb.py b/src/pp5/external_dbs/pdb.py index 9bd1fc6..f1ab574 100644 --- a/src/pp5/external_dbs/pdb.py +++ b/src/pp5/external_dbs/pdb.py @@ -259,7 +259,7 @@ class PDBMetadata(Cacheable): Obtains and parses metadata from a PDB structure using PDB REST API. 
""" - _CACHE_SETTINGS = CacheSettings(cache_dir=PDB_METADATA_DIR) + _CACHE_SETTINGS = CacheSettings(cache_dir=PDB_METADATA_DIR, cache_compression=True) def __init__(self, pdb_id: str): """ From 073f667e031c833817ce3cb21655344c80828a75 Mon Sep 17 00:00:00 2001 From: Aviv Rosenberg Date: Fri, 23 Feb 2024 04:32:32 +0200 Subject: [PATCH 37/37] prec: add TODO about caching --- src/pp5/prec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pp5/prec.py b/src/pp5/prec.py index 97dd1c8..39a0d91 100644 --- a/src/pp5/prec.py +++ b/src/pp5/prec.py @@ -528,6 +528,8 @@ def from_cache( :return: Loaded ProteinRecord, or None if the cached prec does not exist. """ + # TODO: Prec should use Cacheable base class instead of this custom approach. + if not isinstance(cache_dir, (str, Path)): cache_dir = pp5.PREC_DIR