From e1b3c36233b9a8829371c9c2b706772b2cd13a0f Mon Sep 17 00:00:00 2001
From: Arthur Zalevsky
Date: Thu, 21 Mar 2024 18:30:29 -0700
Subject: [PATCH] oops, the rest of 492f14b
---
ihm_validation/cx.py | 232 ++++++++++++++++++++++++++++-
ihm_validation/report.py | 4 +-
templates/data_quality.html | 2 +
templates/full_validation_pdf.html | 2 +
templates/layout.html | 15 +-
5 files changed, 252 insertions(+), 3 deletions(-)
diff --git a/ihm_validation/cx.py b/ihm_validation/cx.py
index 09897be0..850c0a78 100644
--- a/ihm_validation/cx.py
+++ b/ihm_validation/cx.py
@@ -22,6 +22,9 @@
import json
from bokeh.embed import json_item
from bokeh.io import export_svgs
+import requests
+import pickle
+import pyhmmer
pd.options.mode.chained_assignment = None
NA = 'Not available'
@@ -70,9 +73,10 @@ class CxValidation(GetInputInformation):
ID = None
driver = None
- def __init__(self, mmcif_file):
+ def __init__(self, mmcif_file, cache):
super().__init__(mmcif_file)
self.ID = str(self.get_id())
+ self.cache = cache
self.nos = self.get_number_of_models()
self.dataset = self.get_dataset_comp()
# Only atomic structures are supported so far
@@ -897,3 +901,229 @@ def save_plots(self, plot, title, imgDirname='.'):
svgs = [Path(x).name for x in svgs]
return (imgpath, imgpath_json, svgs)
+
+ @staticmethod
+ def request_pride(url: str) -> dict:
+ ''' pull data from PRIDE using crosslinking PDB-DEV API '''
+ result = None
+ r = requests.get(url)
+
+ if not r.ok:
+ # Wait until cold request completes and go to DB cache
+ logging.info('Retrying pulling data from PRIDE')
+ time.sleep(60)
+ r = requests.get(url)
+
+ if r.ok:
+ try:
+ result = r.json()
+ except JSONDecodeError:
+ pass
+
+ return result
+
+ def get_sequences_pride(self, pid: str) -> dict:
+ '''get sequences from PRIDE entry'''
+ url = f"https://www.ebi.ac.uk/pride/ws/archive/crosslinking/pdbdev/projects/{pid}/sequences"
+ result = self.request_pride(url)['data']
+ return result
+
+ def get_residue_pairs_pride(self, pid: str, page_size: int = 99) -> dict:
+ '''get sequences from PRIDE entry'''
+ url = f"https://www.ebi.ac.uk/pride/ws/archive/crosslinking/pdbdev/projects/{pid}/residue-pairs/based-on-reported-psm-level/passing"
+ page = 1
+ url_ = f"{url}?page={page}&page_size={page_size}"
+
+ result = self.request_pride(url_)
+
+ if result is not None:
+ session = requests.Session()
+
+ max_page = int(result['page']["total_pages"])
+ page_size = int(result['page']["page_size"])
+
+ rps = []
+ for i in range(1, max_page + 1):
+ url_ = f"{url}?page={i}&page_size={page_size}"
+ rps_ = session.get(url_).json()['data']
+ rps.extend(rps_)
+
+ result = rps
+
+ return result
+
+ def get_pride_data(self, code):
+ '''
+ get data from PRIDE
+ '''
+ cache_fn = Path(self.cache, f'{code}.pickle')
+ data = None
+
+ # Check if we already requested the data
+ if Path(cache_fn).is_file():
+ logging.info(f'Found {code} in cache! {cache_fn}')
+ with open(cache_fn, 'rb') as f:
+ data = pickle.load(f)
+ elif not Path(cache_fn).is_file():
+ ms_seqs = self.get_sequences_pride(code)
+ ms_res_pairs = self.get_residue_pairs_pride(code)
+
+ data = {
+ 'pride_id': code,
+ 'sequences': ms_seqs,
+ 'residue_pairs': ms_res_pairs
+ }
+
+ with open(cache_fn, 'wb') as f:
+ pickle.dump(data, f)
+
+ return data
+
+ def get_pride_ids(self) -> list:
+ '''
+ get a list of all PRIDE ids from entry
+ '''
+ ids = []
+ for i, d in enumerate(self.system.orphan_datasets):
+ if isinstance(d, ihm.dataset.MassSpecDataset) or isinstance(d, ihm.dataset.CXMSDataset):
+ if isinstance(d.location, ihm.location.PRIDELocation):
+ try:
+ pid = d.location.access_code
+ ids.append(pid)
+ except AttributeError:
+ pass
+
+ return ids
+
+ def validate_pride_data(self, data: dict) -> dict:
+
+ out = None
+
+ # Unpack pride data
+ pid = data['pride_id']
+ ms_res_pairs = data['residue_pairs']
+ ms_seqs_ = data['sequences']
+
+ # Get sequences from mmcif entry
+ mmcif_seqs = {}
+ for e in self.system.entities:
+ if e.is_polymeric:
+ seq = ''.join([x.code_canonical for x in e.sequence])
+ seq = pyhmmer.easel.TextSequence(sequence=seq, name=e._id.encode('utf-8'), description=e.description.encode('utf-8')).digitize(pyhmmer.easel.Alphabet.amino())
+ mmcif_seqs[e._id] = seq
+
+ # Get sequences from crosslinking-MS data
+ ms_seqs = [
+ pyhmmer.easel.TextSequence(
+ sequence=x['sequence'],
+ name=x['id'].encode('utf-8'),
+ source=x['file'].encode('utf-8')
+ ).digitize(pyhmmer.easel.Alphabet.amino()) for x in ms_seqs_
+ ]
+
+ matches_seqs = {}
+ matches_seqs_ids = {}
+ matches_residue_pairs = {}
+ for k, v in mmcif_seqs.items():
+ best_hit = list(pyhmmer.hmmer.phmmer(v, ms_seqs))[0][0]
+ matches_seqs[k] = best_hit
+ matches_seqs_ids[k] = best_hit.best_domain.hit.name.decode("utf-8")
+
+ matched_ms_seqs = list(matches_seqs_ids.values())
+ matched_mmcif_entities = list(matches_seqs_ids.keys())
+
+ ms_rps_total_ = len(ms_res_pairs)
+ ms_rps_mmcif_entities = 0
+
+ sxls = []
+ for r in ms_res_pairs:
+ if r['prot1'] in matched_ms_seqs and r['prot2'] in matched_ms_seqs:
+ eid1 = r['prot1']
+ rid1 = r['pos1']
+ eid2 = r['prot2']
+ rid2 = r['pos2']
+ sxl = tuple(sorted(((eid1, rid1), (eid2, rid2))))
+ sxls.append(sxl)
+
+ sxls = set(sxls)
+
+ ms_rps_mmcif_entities = len(sxls)
+
+ exls = []
+
+ for restr_ in self.system.restraints:
+ # We are interested only in Chemical crosslinks
+ if type(restr_) != ihm.restraint.CrossLinkRestraint:
+ continue
+
+ # Iterate over all crosslinks in the dataset
+ for xl in restr_.cross_links:
+ eid1 = xl.experimental_cross_link.residue1.entity._id
+ rid1 = xl.experimental_cross_link.residue1.seq_id
+ eid2 = xl.experimental_cross_link.residue1.entity._id
+ rid2 = xl.experimental_cross_link.residue2.seq_id
+ exl = tuple(sorted(((eid1, rid1), (eid2, rid2))))
+ exls.append(exl)
+
+ exls = list(set(exls))
+
+
+ mmcif_rps_ms_entities = 0
+
+ emxls = []
+ for (eid1, rid1), (eid2, rid2) in exls:
+ if eid1 in matched_mmcif_entities and eid2 in matched_mmcif_entities:
+ mmcif_rps_ms_entities += 1
+ eid1_ = matches_seqs_ids[eid1]
+ eid2_ = matches_seqs_ids[eid2]
+ exl = tuple(sorted(((eid1_, rid1), (eid2_, rid2))))
+ emxls.append(exl)
+
+ rps_both = len(set(emxls) & set(sxls))
+ mmcif_rps = len(exls)
+ ms_rps = len(ms_res_pairs)
+
+ out = {
+ 'pride_id': pid,
+ 'entities_ms': len(ms_seqs),
+ 'entities': len(mmcif_seqs),
+ 'matches': [
+ {
+ 'entity': k,
+ 'entity_ms': v.best_domain.hit.name.decode("utf-8"),
+ 'e-value': v.best_domain.c_evalue,
+ 'exact_match': v.best_domain.alignment.target_sequence == v.best_domain.alignment.hmm_sequence.upper()
+ } for k, v in matches_seqs.items()
+ ],
+ 'stats': {
+ 'entry': {
+ 'total': mmcif_rps,
+ 'mapped_entities': mmcif_rps_ms_entities,
+ 'mapped_entities_pct': mmcif_rps_ms_entities / mmcif_rps * 100.,
+ 'matched': rps_both,
+ 'matched_pct': rps_both / mmcif_rps * 100.,
+ },
+ 'ms': {
+ 'total': ms_rps,
+ 'mapped_entities': ms_rps_mmcif_entities,
+ 'mapped_entities_pct': ms_rps_mmcif_entities / ms_rps * 100.,
+ 'matched': rps_both,
+ 'matched_pct': rps_both / ms_rps * 100.,
+ }
+ }
+ }
+
+ return out
+
+ def validate_all_pride_data(self) -> list:
+ '''perform data quality validation for all crosslinking-MS datasets'''
+
+ codes = self.get_pride_ids()
+ outs = []
+ for code in codes:
+ data = self.get_pride_data(code)
+ out = self.validate_pride_data(data)
+ if out is not None:
+ outs.append(out)
+
+ return outs
diff --git a/ihm_validation/report.py b/ihm_validation/report.py
index 769811e7..f4b97dbd 100644
--- a/ihm_validation/report.py
+++ b/ihm_validation/report.py
@@ -397,7 +397,7 @@ def run_cx_validation(self, Template_Dict: dict) -> (dict):
if self.input.check_for_cx(self.input.get_dataset_comp()):
Template_Dict['cx'] = True
- I_cx = cx.CxValidation(self.mmcif_file)
+ I_cx = cx.CxValidation(self.mmcif_file, cache=self.cache)
self.I_cx = I_cx
raw_data = None
@@ -420,6 +420,8 @@ def run_cx_validation(self, Template_Dict: dict) -> (dict):
Template_Dict['cx_stats'] = stats
Template_Dict['cx_stats_per_model'] = I_cx.get_per_model_satifaction_rates()
+ Template_Dict['cx_data_quality'] = I_cx.validate_all_pride_data()
+
output = (Template_Dict, raw_data, raw_ertypes)
return output
diff --git a/templates/data_quality.html b/templates/data_quality.html
index b91c4dc2..0b5f0870 100644
--- a/templates/data_quality.html
+++ b/templates/data_quality.html
@@ -191,4 +191,6 @@
{% endif %}
+
+ {% include 'cx_data_quality.j2' %}
{% for i in range(Unique_dataset|length) %}
diff --git a/templates/layout.html b/templates/layout.html
index 7d575a61..c3ebf8b5 100644
--- a/templates/layout.html
+++ b/templates/layout.html
@@ -110,7 +110,7 @@
- {% if ('SAS DATA' in Data) and enable_sas %}
+ {% if (('SAS DATA' in Data) and enable_sas ) or (('CX-MS DATA' in Data) and enable_cx) %}
Data Quality
{% elif (('SAS DATA' not in Data) or not enable_sas) and Unique_dataset|length > 0 %}
@@ -151,6 +151,19 @@
{% for item in Unique_dataset %}
{{item}}
{% endfor %}
+
+ {% if ('CX-MS DATA' in Data) and enable_cx %}
+ Crosslinking-MS
+ {% if cx_data_quality|length > 0 %}
+
+ {% endif %}
+ {% endif %}