From e1b3c36233b9a8829371c9c2b706772b2cd13a0f Mon Sep 17 00:00:00 2001
From: Arthur Zalevsky <aozalevsky@gmail.com>
Date: Thu, 21 Mar 2024 18:30:29 -0700
Subject: [PATCH] oops, the rest of 492f14b

---
 ihm_validation/cx.py               | 232 ++++++++++++++++++++++++++++-
 ihm_validation/report.py           |   4 +-
 templates/data_quality.html        |   2 +
 templates/full_validation_pdf.html |   2 +
 templates/layout.html              |  15 +-
 5 files changed, 252 insertions(+), 3 deletions(-)

diff --git a/ihm_validation/cx.py b/ihm_validation/cx.py
index 09897be0..850c0a78 100644
--- a/ihm_validation/cx.py
+++ b/ihm_validation/cx.py
@@ -22,6 +22,9 @@
 import json
 from bokeh.embed import json_item
 from bokeh.io import export_svgs
+import requests
+import pickle
+import pyhmmer
 
 pd.options.mode.chained_assignment = None
 NA = 'Not available'
@@ -70,9 +73,10 @@ class CxValidation(GetInputInformation):
     ID = None
     driver = None
 
-    def __init__(self, mmcif_file):
+    def __init__(self, mmcif_file, cache):
         super().__init__(mmcif_file)
         self.ID = str(self.get_id())
+        self.cache = cache
         self.nos = self.get_number_of_models()
         self.dataset = self.get_dataset_comp()
         # Only atomic structures are supported so far
@@ -897,3 +901,229 @@ def save_plots(self, plot, title, imgDirname='.'):
         svgs = [Path(x).name for x in svgs]
 
         return (imgpath, imgpath_json, svgs)
+
+    @staticmethod
+    def request_pride(url: str) -> dict:
+        ''' pull data from PRIDE using crosslinking PDB-DEV API '''
+        result = None
+        r = requests.get(url)
+
+        if not r.ok:
+            # Wait until cold request completes and go to DB cache
+            logging.info('Retrying pulling data from PRIDE')
+            time.sleep(60)
+            r = requests.get(url)
+
+        if r.ok:
+            try:
+                result = r.json()
+            except JSONDecodeError:
+                pass
+
+        return result
+
+    def get_sequences_pride(self, pid: str) -> dict:
+        '''get sequences from PRIDE entry'''
+        url = f"https://www.ebi.ac.uk/pride/ws/archive/crosslinking/pdbdev/projects/{pid}/sequences"
+        result = self.request_pride(url)['data']
+        return result
+
+    def get_residue_pairs_pride(self, pid: str, page_size: int = 99) -> dict:
+        '''get sequences from PRIDE entry'''
+        url = f"https://www.ebi.ac.uk/pride/ws/archive/crosslinking/pdbdev/projects/{pid}/residue-pairs/based-on-reported-psm-level/passing"
+        page = 1
+        url_ = f"{url}?page={page}&page_size={page_size}"
+
+        result = self.request_pride(url_)
+
+        if result is not None:
+            session = requests.Session()
+
+            max_page = int(result['page']["total_pages"])
+            page_size = int(result['page']["page_size"])
+
+            rps = []
+            for i in range(1, max_page + 1):
+                url_ = f"{url}?page={i}&page_size={page_size}"
+                rps_ = session.get(url_).json()['data']
+                rps.extend(rps_)
+
+            result = rps
+
+        return result
+
+    def get_pride_data(self, code):
+        '''
+        get data from PRIDE
+        '''
+        cache_fn = Path(self.cache, f'{code}.pickle')
+        data = None
+
+        # Check if we already requested the data
+        if Path(cache_fn).is_file():
+            logging.info(f'Found {code} in cache! {cache_fn}')
+            with open(cache_fn, 'rb') as f:
+                data = pickle.load(f)
+        elif not Path(cache_fn).is_file():
+            ms_seqs = self.get_sequences_pride(code)
+            ms_res_pairs = self.get_residue_pairs_pride(code)
+
+            data = {
+                'pride_id': code,
+                'sequences': ms_seqs,
+                'residue_pairs': ms_res_pairs
+            }
+
+            with open(cache_fn, 'wb') as f:
+                pickle.dump(data, f)
+
+        return data
+
+    def get_pride_ids(self) -> list:
+        '''
+        get a list of all PRIDE ids from entry
+        '''
+        ids = []
+        for i, d in enumerate(self.system.orphan_datasets):
+            if isinstance(d, ihm.dataset.MassSpecDataset) or isinstance(d, ihm.dataset.CXMSDataset):
+                if isinstance(d.location, ihm.location.PRIDELocation):
+                    try:
+                        pid = d.location.access_code
+                        ids.append(pid)
+                    except AttributeError:
+                        pass
+
+        return ids
+
+    def validate_pride_data(self, data: dict) -> dict:
+
+        out = None
+
+        # Unpack pride data
+        pid = data['pride_id']
+        ms_res_pairs = data['residue_pairs']
+        ms_seqs_ = data['sequences']
+
+        # Get sequences from mmcif entry
+        mmcif_seqs = {}
+        for e in self.system.entities:
+            if e.is_polymeric:
+                seq = ''.join([x.code_canonical for x in e.sequence])
+                seq = pyhmmer.easel.TextSequence(sequence=seq, name=e._id.encode('utf-8'), description=e.description.encode('utf-8')).digitize(pyhmmer.easel.Alphabet.amino())
+                mmcif_seqs[e._id] = seq
+
+        # Get sequences from crosslinking-MS data
+        ms_seqs = [
+            pyhmmer.easel.TextSequence(
+                sequence=x['sequence'],
+                name=x['id'].encode('utf-8'),
+                source=x['file'].encode('utf-8')
+            ).digitize(pyhmmer.easel.Alphabet.amino()) for x in ms_seqs_
+        ]
+
+        matches_seqs = {}
+        matches_seqs_ids = {}
+        matches_residue_pairs = {}
+        for k, v in mmcif_seqs.items():
+            best_hit = list(pyhmmer.hmmer.phmmer(v, ms_seqs))[0][0]
+            matches_seqs[k] = best_hit
+            matches_seqs_ids[k] = best_hit.best_domain.hit.name.decode("utf-8")
+
+        matched_ms_seqs = list(matches_seqs_ids.values())
+        matched_mmcif_entities = list(matches_seqs_ids.keys())
+
+        ms_rps_total_ = len(ms_res_pairs)
+        ms_rps_mmcif_entities = 0
+
+        sxls = []
+        for r in ms_res_pairs:
+            if r['prot1'] in matched_ms_seqs and r['prot2'] in matched_ms_seqs:
+                eid1 = r['prot1']
+                rid1 = r['pos1']
+                eid2 = r['prot2']
+                rid2 = r['pos2']
+                sxl = tuple(sorted(((eid1, rid1), (eid2, rid2))))
+                sxls.append(sxl)
+
+        sxls = set(sxls)
+
+        ms_rps_mmcif_entities = len(sxls)
+
+        exls = []
+
+        for restr_ in self.system.restraints:
+            # We are interested only in Chemical crosslinks
+            if type(restr_) != ihm.restraint.CrossLinkRestraint:
+                continue
+
+            # Iterate over all crosslinks in the dataset
+            for xl in restr_.cross_links:
+                eid1 = xl.experimental_cross_link.residue1.entity._id
+                rid1 = xl.experimental_cross_link.residue1.seq_id
+                eid2 = xl.experimental_cross_link.residue1.entity._id
+                rid2 = xl.experimental_cross_link.residue2.seq_id
+                exl = tuple(sorted(((eid1, rid1), (eid2, rid2))))
+                exls.append(exl)
+
+        exls = list(set(exls))
+
+
+        mmcif_rps_ms_entities = 0
+
+        emxls = []
+        for (eid1, rid1), (eid2, rid2) in exls:
+            if eid1 in matched_mmcif_entities and eid2 in matched_mmcif_entities:
+                mmcif_rps_ms_entities += 1
+                eid1_ = matches_seqs_ids[eid1]
+                eid2_ = matches_seqs_ids[eid2]
+                exl = tuple(sorted(((eid1_, rid1), (eid2_, rid2))))
+                emxls.append(exl)
+
+        rps_both = len(set(emxls) & set(sxls))
+        mmcif_rps = len(exls)
+        ms_rps = len(ms_res_pairs)
+
+        out = {
+            'pride_id': pid,
+            'entities_ms': len(ms_seqs),
+            'entities': len(mmcif_seqs),
+            'matches': [
+                {
+                    'entity': k,
+                    'entity_ms': v.best_domain.hit.name.decode("utf-8"),
+                    'e-value': v.best_domain.c_evalue,
+                    'exact_match': v.best_domain.alignment.target_sequence == v.best_domain.alignment.hmm_sequence.upper()
+                } for k, v in matches_seqs.items()
+            ],
+            'stats': {
+                'entry': {
+                    'total': mmcif_rps,
+                    'mapped_entities': mmcif_rps_ms_entities,
+                    'mapped_entities_pct': mmcif_rps_ms_entities / mmcif_rps * 100.,
+                    'matched': rps_both,
+                    'matched_pct': rps_both / mmcif_rps * 100.,
+                },
+                'ms': {
+                    'total': ms_rps,
+                    'mapped_entities': ms_rps_mmcif_entities,
+                    'mapped_entities_pct': ms_rps_mmcif_entities / ms_rps * 100.,
+                    'matched': rps_both,
+                    'matched_pct': rps_both / ms_rps * 100.,
+                }
+            }
+        }
+
+        return out
+
+    def validate_all_pride_data(self) -> list:
+        '''perform data quality validation for all crosslinking-MS datasets'''
+
+        codes = self.get_pride_ids()
+        outs = []
+        for code in codes:
+            data = self.get_pride_data(code)
+            out = self.validate_pride_data(data)
+            if out is not None:
+                outs.append(out)
+
+        return outs
diff --git a/ihm_validation/report.py b/ihm_validation/report.py
index 769811e7..f4b97dbd 100644
--- a/ihm_validation/report.py
+++ b/ihm_validation/report.py
@@ -397,7 +397,7 @@ def run_cx_validation(self, Template_Dict: dict) -> (dict):
 
         if self.input.check_for_cx(self.input.get_dataset_comp()):
             Template_Dict['cx'] = True
-            I_cx = cx.CxValidation(self.mmcif_file)
+            I_cx = cx.CxValidation(self.mmcif_file, cache=self.cache)
             self.I_cx = I_cx
 
             raw_data = None
@@ -420,6 +420,8 @@ def run_cx_validation(self, Template_Dict: dict) -> (dict):
             Template_Dict['cx_stats'] = stats
             Template_Dict['cx_stats_per_model'] = I_cx.get_per_model_satifaction_rates()
 
+            Template_Dict['cx_data_quality'] = I_cx.validate_all_pride_data()
+
             output = (Template_Dict, raw_data, raw_ertypes)
 
         return output
diff --git a/templates/data_quality.html b/templates/data_quality.html
index b91c4dc2..0b5f0870 100644
--- a/templates/data_quality.html
+++ b/templates/data_quality.html
@@ -191,4 +191,6 @@ <h5 class= ex2 align= center><u><a name=firstG> Guinier analysis </a></u><a clas
       </div>
     </div>
   {% endif %}  
+
+  {% include 'cx_data_quality.j2' %}
 {% endblock %}
diff --git a/templates/full_validation_pdf.html b/templates/full_validation_pdf.html
index 20caf168..05bac7e7 100644
--- a/templates/full_validation_pdf.html
+++ b/templates/full_validation_pdf.html
@@ -553,6 +553,8 @@ <h5 align=center>
               SAS data used in this integrative model could not be validated as the sascif file is currently unavailable.
             </p>
           {% endif %}
+  
+          {% include 'cx_data_quality.j2' %}
           
           <!-- Other datasets -->
           {% for i in range(Unique_dataset|length) %}
diff --git a/templates/layout.html b/templates/layout.html
index 7d575a61..c3ebf8b5 100644
--- a/templates/layout.html
+++ b/templates/layout.html
@@ -110,7 +110,7 @@ <h5>
               
               <!-- Model details -->
               <li class="nav-item mr-4 dropdown">
-                {% if ('SAS DATA' in Data) and enable_sas %}
+                {% if (('SAS DATA' in Data) and enable_sas ) or (('CX-MS DATA' in Data) and enable_cx) %}
                   <a class="dropdown-toggle"  data-toggle="dropdown"><button type="button" class="btn btn-primary" onclick=window.location="data_quality.html";>Data <br> Quality </button></a>
                       
                 {% elif (('SAS DATA' not in Data) or not enable_sas) and Unique_dataset|length > 0 %}
@@ -151,6 +151,19 @@ <h5>
                   {% for item in Unique_dataset %}
                     <a class=dropdown-item-text dropdown-toggle type=button>{{item}}</a>
                   {% endfor %}
+                
+                  {% if ('CX-MS DATA' in Data) and enable_cx %}
+                      <a class="dropdown-item colorclass" dropdown-toggle  href=data_quality.html#crosslinking-ms>Crosslinking-MS</a>
+                      {% if cx_data_quality|length > 0 %}
+                        <ul class=dropdown-submenu>                  
+                          {% for data in cx_data_quality %}
+                            <li>
+                              <a class=dropdown-item2 href=data_quality.html#{{ data["pride_id"] }}>{{ data["pride_id"] }}</a>
+                            </li>
+                          {% endfor %}
+                        </ul>
+                      {% endif %}
+                  {% endif %}
                 </div>
               </li>