Merge branch 'crossref-chains'
avivrosenberg committed Feb 23, 2024
2 parents c72711f + 073f667 commit 80f923f
Showing 17 changed files with 988 additions and 729 deletions.
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -28,7 +28,9 @@ addopts = [
     # Show durations of slowest tests.
     "--durations=10",
     # Force colored output even on CI
-    "--color=yes"
+    "--color=yes",
+    # Traceback verbosity
+    "--tb=short"
 ]
 
 testpaths = [
6 changes: 3 additions & 3 deletions src/pp5/__init__.py
@@ -27,7 +27,7 @@
 ENV_PP5_UNP_DIR = "UNP_DIR"
 ENV_PP5_ENA_DIR = "ENA_DIR"
 ENV_PP5_PREC_DIR = "PREC_DIR"
-ENV_PP5_PDB2UNP_DIR = "PDB2UNP_DIR"
+ENV_PP5_PDB_METADATA_DIR = "PDB_METADATA_DIR"
 ENV_PP5_ALIGNMENT_DIR = "ALIGNMENT_DIR"
 ENV_PP5_BLASTDB_DIR = "BLASTDB_DIR"

@@ -137,8 +137,8 @@ def set_config(key: str, value: Any):
 # Directory for ProteinRecords
 PREC_DIR = Path(os.getenv(ENV_PP5_PREC_DIR, data_subdir("prec")))
 
-# Directory for PDB to UNP mappings
-PDB2UNP_DIR = Path(os.getenv(ENV_PP5_PDB2UNP_DIR, data_subdir("pdb2unp")))
+# Directory for PDB metadata
+PDB_METADATA_DIR = Path(os.getenv(ENV_PP5_PDB_METADATA_DIR, data_subdir("pdb_meta")))
 
 # Directory for Structural Alignments
 ALIGNMENT_DIR = Path(os.getenv(ENV_PP5_ALIGNMENT_DIR, data_subdir("align")))
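
A minimal sketch (assumed usage, hypothetical path) of overriding the renamed directory through its environment variable, which pp5 resolves via os.getenv at import time:

import os
os.environ["PDB_METADATA_DIR"] = "/data/pp5/pdb_meta"  # hypothetical path
import pp5
print(pp5.PDB_METADATA_DIR)  # -> /data/pp5/pdb_meta
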
86 changes: 36 additions & 50 deletions src/pp5/align.py
@@ -12,7 +12,7 @@
 import warnings
 import contextlib
 import subprocess
-from typing import Tuple, Union, Iterable, Optional
+from typing import Any, Dict, Tuple, Union, Iterable, Optional
 from pathlib import Path
 from datetime import datetime, timedelta

@@ -32,7 +32,8 @@
 from Bio.Align.Applications import ClustalOmegaCommandline
 
 import pp5
-from pp5.utils import JSONCacheableMixin, out_redirected
+from pp5.cache import Cacheable, CacheSettings
+from pp5.utils import out_redirected
 from pp5.external_dbs import pdb
 
 # Suppress messages from pymol upon import
@@ -151,11 +152,13 @@ def multiseq_align(
     return msa_result
 
 
-class StructuralAlignment(JSONCacheableMixin, object):
+class StructuralAlignment(Cacheable, object):
     """
     Represents a Structural Alignment between two protein structures.
     """
 
+    _CACHE_SETTINGS = CacheSettings(cache_dir=pp5.ALIGNMENT_DIR)
+
     def __init__(
         self,
         pdb_id_1: str,
@@ -213,36 +216,6 @@ def ungapped_seq_2(self):
         """
         return self.ungap(self.aligned_seq_2)
 
-    def save(self, out_dir=pp5.ALIGNMENT_DIR) -> Path:
-        """
-        Write the alignment to a human-readable text file (json) which
-        can also be loaded later using from_cache.
-        :param out_dir: Output directory.
-        :return: The path of the written file.
-        """
-        filename = self._cache_filename(
-            self.pdb_id_1,
-            self.pdb_id_2,
-            self.pdb_source,
-            self.outlier_rejection_cutoff,
-            self.backbone_only,
-        )
-        return self.to_cache(out_dir, filename, indent=2)
-
-    @staticmethod
-    def _cache_filename(
-        pdb_id_1: str,
-        pdb_id_2: str,
-        pdb_source: str,
-        outlier_rejection_cutoff: float,
-        backbone_only,
-    ) -> str:
-        pdb_ids = f"{pdb_id_1}-{pdb_id_2}".replace(":", "_").upper()
-        config = f"cutoff={int(outlier_rejection_cutoff*10)}_bb={backbone_only}"
-        basename = f"{pdb_ids}_{config}"
-        filename = f"{basename}-{pdb_source}.json"
-        return filename
-
     @staticmethod
     def ungap(seq: str) -> str:
         """
@@ -268,34 +241,47 @@ def __eq__(self, other):
             return False
         return self.__dict__ == other.__dict__
 
+    def cache_attribs(self) -> Dict[str, Any]:
+        return dict(
+            pdb_id_1=self.pdb_id_1,
+            pdb_id_2=self.pdb_id_2,
+            pdb_source=self.pdb_source,
+            outlier_rejection_cutoff=self.outlier_rejection_cutoff,
+            backbone_only=self.backbone_only,
+        )
+
     @classmethod
-    def from_cache(
-        cls,
-        pdb_id_1: str,
-        pdb_id_2: str,
-        pdb_source: str = PDB_RCSB,
-        cache_dir: Union[str, Path] = pp5.ALIGNMENT_DIR,
-        **kw_for_init,
-    ) -> Optional[StructuralAlignment]:
-        filename = cls._cache_filename(pdb_id_1, pdb_id_2, pdb_source, **kw_for_init)
-        return super(StructuralAlignment, cls).from_cache(cache_dir, filename)
+    def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str:
+        pdb_id_1 = cache_attribs["pdb_id_1"]
+        pdb_id_2 = cache_attribs["pdb_id_2"]
+        pdb_ids = f"{pdb_id_1}-{pdb_id_2}".replace(":", "_").upper()
+        return f"{super()._cache_filename_prefix(cache_attribs)}-{pdb_ids}"
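
A sketch of the cache filename this override produces, with hypothetical PDB IDs (the hash suffix comes from Cacheable._cache_filename in src/pp5/cache.py below):

from pp5.utils import sort_dict, stable_hash
attribs = dict(pdb_id_1="1abc:A", pdb_id_2="2xyz:B")  # other attribs omitted for brevity
prefix = "structuralalignment-1ABC_A-2XYZ_B"  # class name, then upper-cased IDs with ":" -> "_"
# full name: f"{prefix}-{stable_hash(sort_dict(attribs, by_value=False))}.json"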

     @classmethod
     def from_pdb(
         cls,
-        pdb_id1: str,
-        pdb_id2: str,
+        pdb_id_1: str,
+        pdb_id_2: str,
         pdb_source: str = PDB_RCSB,
+        outlier_rejection_cutoff: float = 2.0,
+        backbone_only=False,
         cache=False,
-        **kw_for_init,
     ):
+        kws = dict(
+            pdb_id_1=pdb_id_1,
+            pdb_id_2=pdb_id_2,
+            pdb_source=pdb_source,
+            outlier_rejection_cutoff=outlier_rejection_cutoff,
+            backbone_only=backbone_only,
+        )
         if cache:
-            sa = cls.from_cache(pdb_id1, pdb_id2, pdb_source, **kw_for_init)
+            sa = cls.from_cache(cache_attribs=kws)
             if sa is not None:
                 return sa
 
-        sa = cls(pdb_id1, pdb_id2, pdb_source, **kw_for_init)
-        sa.save()
+        sa = cls(**kws)
+        if cache:
+            sa.to_cache()
         return sa
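
A usage sketch of the reworked factory (hypothetical PDB IDs); with cache=True it first tries from_cache and writes the result back with to_cache on a miss:

from pp5.align import StructuralAlignment

sa = StructuralAlignment.from_pdb(
    "1abc:A", "2xyz:B",            # hypothetical PDB IDs with chain
    outlier_rejection_cutoff=2.0,
    backbone_only=False,
    cache=True,
)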


@@ -530,7 +516,7 @@ def pdb(self, query_pdb_id: str, pdb_dict=None) -> pd.DataFrame:
         )
 
         # Note: no need for pdb_source, we just care about what chains exist
-        meta = pdb.PDBMetadata(pdb_id, struct_d=pdb_dict)
+        meta = pdb.PDBMetadata(pdb_id)
 
         if chain_id not in meta.chain_entities:
             raise ValueError(f"Can't find chain {chain_id} in {pdb_id}")
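
A sketch (assumed API, hypothetical ID) of the simplified metadata lookup used above; the struct_d argument is gone:

from pp5.external_dbs import pdb

meta = pdb.PDBMetadata("1abc")     # hypothetical PDB ID
print("A" in meta.chain_entities)  # chain membership check, as above
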
189 changes: 189 additions & 0 deletions src/pp5/cache.py
@@ -0,0 +1,189 @@
import os
import json
import logging
from abc import abstractmethod
from json import JSONEncoder
from typing import Any, Dict, Union, Optional
from pathlib import Path
from zipfile import ZIP_DEFLATED, ZipFile
from dataclasses import dataclass

import pp5
from pp5.utils import sort_dict, stable_hash, filelock_context

CACHE_FORMAT_JSON = "json"
CACHE_FORMAT_PICKLE = "pkl"
CACHE_FORMATS = {CACHE_FORMAT_JSON, CACHE_FORMAT_PICKLE}


LOGGER = logging.getLogger(__name__)


@dataclass
class CacheSettings:
    """
    Settings for caching objects to file.
    """

    cache_dir: Path
    cache_format: str = CACHE_FORMAT_JSON
    cache_compression: bool = False

    def __post_init__(self):
        if self.cache_format not in CACHE_FORMATS:
            raise ValueError(f"Invalid {self.cache_format=}")

    def __str__(self):
        return f"{self.cache_format}{'-compressed' if self.cache_compression else ''}"
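
For example (a sketch with a hypothetical directory), a subclass could opt into compressed JSON caching:

from pathlib import Path
from pp5.cache import CACHE_FORMAT_JSON, CacheSettings

settings = CacheSettings(
    cache_dir=Path("/data/pp5/cache"),  # hypothetical directory
    cache_format=CACHE_FORMAT_JSON,
    cache_compression=True,             # wraps the written JSON in a .zip
)
print(settings)  # -> json-compressed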


class Cacheable(object):
    """
    Makes a class cacheable to file.
    """

    # Subclasses may override this with the desired settings.
    _CACHE_SETTINGS = CacheSettings(cache_dir=pp5.data_subdir("cache"))

    def __getstate__(self):
        return self.__dict__.copy()

    def __setstate__(self, state):
        self.__dict__.update(state)

    @abstractmethod
    def cache_attribs(self) -> Dict[str, Any]:
        """
        :return: The attributes which determine the cache filename.
        """
        pass

    @classmethod
    def _cache_filename_prefix(cls, cache_attribs: Dict[str, Any]) -> str:
        """
        Generates the prefix of the cache filename.
        :param cache_attribs: Attributes which determine the cache filename.
        :return: The prefix of the cache filename.
        """
        return cls.__name__.lower()

    @classmethod
    def _cache_filename(cls, cache_attribs: Dict[str, Any]) -> str:
        """
        Generates the cache filename.
        :param cache_attribs: The attributes which determine the cache filename.
        :return: The cache filename.
        """
        return (
            f"{cls._cache_filename_prefix(cache_attribs=cache_attribs)}"
            "-"
            f"{stable_hash(sort_dict(cache_attribs, by_value=False))}.json"
        )

    def to_cache(
        self,
        cache_dir: Optional[Union[str, Path]] = None,
        filename: Optional[Union[str, Path]] = None,
        **json_kws,
    ) -> Path:
        """
        Write the object to a human-readable text file (json) which
        can also be loaded later using from_cache.
        :param cache_dir: Directory of cached files.
        :param filename: Cached file name (without directory).
        :param json_kws: Extra keyword arguments for json.dump.
        :return: The path of the written file.
        """
        if cache_dir is None:
            cache_dir = self._CACHE_SETTINGS.cache_dir
        if filename is None:
            filename = self._cache_filename(self.cache_attribs())

        filepath = pp5.get_resource_path(cache_dir, filename)
        os.makedirs(str(filepath.parent), exist_ok=True)

        with filelock_context(filepath):
            with open(str(filepath), "w", encoding="utf-8") as f:
                json.dump(self.__getstate__(), f, indent=2, **json_kws)

            if self._CACHE_SETTINGS.cache_compression:
                zip_filepath = filepath.with_suffix(".zip")
                with ZipFile(
                    zip_filepath, "w", compression=ZIP_DEFLATED, compresslevel=6
                ) as fzip:
                    fzip.write(str(filepath), arcname=filename)

                filepath.unlink()
                filepath = zip_filepath

        file_size = os.path.getsize(filepath)
        file_size_str = (
            f"{file_size / 1024:.1f}kB"
            if file_size < 1024 * 1024
            else f"{file_size / 1024 / 1024:.1f}MB"
        )
        LOGGER.info(f"Wrote cache file: {filepath} ({file_size_str})")
        return filepath

    @classmethod
    def from_cache(
        cls,
        cache_dir: Optional[Union[str, Path]] = None,
        cache_attribs: Optional[Dict[str, Any]] = None,
        filename: Optional[Union[str, Path]] = None,
    ):
        """
        Load the object from a cached file.
        :param cache_dir: Directory of cached file.
        :param cache_attribs: Attributes which determine the cache filename.
        :param filename: Cached filename (without directory). Won't be used if
            cache_attribs is given.
        :return: The loaded object, or None if the file doesn't exist.
        """
        if not (cache_attribs or filename):
            raise ValueError("cache_attribs or filename must be given")

        if cache_dir is None:
            cache_dir = cls._CACHE_SETTINGS.cache_dir

        if filename is None:
            filename = cls._cache_filename(cache_attribs)

        filepath = pp5.get_resource_path(cache_dir, filename)

        obj = None

        with filelock_context(filepath):
            zip_filepath = filepath.with_suffix(".zip")
            if cls._CACHE_SETTINGS.cache_compression and zip_filepath.is_file():
                with ZipFile(zip_filepath, "r") as fzip:
                    fzip.extractall(path=zip_filepath.parent)

            if filepath.is_file():
                try:
                    with open(str(filepath), "r", encoding="utf-8") as f:
                        state_dict = json.load(f)
                    obj = cls.__new__(cls)
                    obj.__setstate__(state_dict)
                except Exception as e:
                    LOGGER.warning(
                        f"Failed to load cached {cls.__name__} {filepath}: {e}"
                    )
                finally:
                    if cls._CACHE_SETTINGS.cache_compression:
                        filepath.unlink()
        return obj
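
A minimal round-trip sketch with a hypothetical subclass: cache_attribs is the only required override, and the same attribs locate the file again on load:

from typing import Any, Dict
from pp5.cache import Cacheable

class Point(Cacheable):
    def __init__(self, x: float, y: float):
        self.x, self.y = x, y

    def cache_attribs(self) -> Dict[str, Any]:
        # Attributes that uniquely identify this object's cache file.
        return dict(x=self.x, y=self.y)

p = Point(1.0, 2.0)
path = p.to_cache()  # writes point-<hash>.json under the default cache dir
p2 = Point.from_cache(cache_attribs=dict(x=1.0, y=2.0))
assert p2 is not None and p2.__dict__ == p.__dict__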


class ReprJSONEncoder(JSONEncoder):
    """
    A JSONEncoder that converts an object to its representation string in
    case it's not serializable.
    """

    def default(self, o: Any) -> Any:
        try:
            return repr(o)
        except Exception:
            pass
        # Let the base class default method raise the TypeError
        return JSONEncoder.default(self, o)
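
And a quick sketch of the encoder's fallback: non-serializable values are emitted as their repr() string:

import json
from pathlib import Path
from pp5.cache import ReprJSONEncoder

s = json.dumps({"dir": Path("/tmp")}, cls=ReprJSONEncoder)
# on POSIX: '{"dir": "PosixPath(\'/tmp\')"}'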