aKMT_Lys_pred.py

'''
Described at PyMOL wiki:
http://www.pymolwiki.org/index.php/aAKMT_Lys_pred

Authors : Troels Schwarz-Linnet
Program : aKMT_Lys_pred
Date    : June 2017

aKMT_Lys_pred -- Help predicting lysine methylation
'''
# Internal pymol import
from pymol import cmd
from pymol import stored
# From Pymol-script repo: https://pymolwiki.org/index.php/Findseq
import findseq

aa_1_3 = {'A': 'ALA', 'C': 'CYS', 'D': 'ASP', 'E': 'GLU', 'F': 'PHE', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE', 'K': 'LYS',
            'L': 'LEU', 'M': 'MET', 'N': 'ASN', 'P': 'PRO', 'Q': 'GLN', 'R': 'ARG', 'S': 'SER', 'T': 'THR', 'V': 'VAL',
            'W': 'TRP', 'Y': 'TYR'}

aa_3_1 = {'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
            'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
            'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'}

aa_types = {'A': 'hydrophobic', 'C': 'cysteine', 'D': 'negative', 'E': 'negative', 'F': 'aromatic', 'G': 'glycine', 'H': 'polar',
            'I': 'hydrophobic', 'K': 'positive', 'L': 'hydrophobic', 'M': 'hydrophobic', 'N': 'polar', 'P': 'proline', 'Q': 'polar',
            'R': 'positive', 'S': 'polar', 'T': 'polar', 'V': 'hydrophobic', 'W': 'aromatic', 'Y': 'aromatic'}


def get_resis_from_resn(target_sel="all", resn="lys", atom_name="CA", verb=True):
    # Make uppercase
    resn = resn.upper()
    # Check if one letter residue name
    if len(resn) == 1:
        resn = aa_1_3[resn]
    atom_name = atom_name.upper()

    # Prepare for storing and make expression
    stored.infolist = []
    # resv (int): the residue identifier (residue number), ss (str): secondary structure, name (str): the atom name
    expression = "stored.infolist.append([model, chain, resv, resn, ss, name])"
    # Iterate over selection, storing info
    cmd.iterate(target_sel, expression)

    # Store info
    return_list_resn_resi = []
    return_list_resn_resi_sel = []
    group = "Find_%s_%s"%(resn, target_sel)
    for info in stored.infolist:
        cur_model, cur_chain, cur_resv, cur_resn, cur_ss, cur_name = info
        if cur_resn == resn and cur_name == atom_name:
            # Convert residue name to one letter
            cur_aa_3_1 = aa_3_1[resn]
            # Do selection and group
            #sel_str = "/%s/%s//%s"%(cur_model, cur_chain, cur_resv)
            sel_str = "%s and chain %s and resi %s"%(cur_model, cur_chain, cur_resv)
            resn_resi = "%s%s"%(cur_aa_3_1, cur_resv)
            sel_str_text = "%s_%s"%(group, resn_resi)
            cmd.select(sel_str_text, sel_str)
            # Store
            return_list_resn_resi.append(resn_resi)
            return_list_resn_resi_sel.append(sel_str)
            # If verbose
            if verb:
                print("%s , sel: %s"%(resn_resi, sel_str))
    # Group selections
    cmd.group(group, "%s_*"%group)
    cmd.select("%s_sel"%group, "%s_*"%group)
    cmd.show("lines", group)

    # If verbose
    if verb:
        print("\nThere are %i hits, in target_sel=%s, with resn=%s\n"%(len(return_list_resn_resi), target_sel, resn))
    return return_list_resn_resi, return_list_resn_resi_sel
cmd.extend("get_resis_from_resn", get_resis_from_resn)


def match_peptides(target_sel="all", peptides=[], verb=True):
    if type(peptides) != list:
        print("\nERROR: The peptides should be supplied as a list\n")
        return

    # Store info
    return_list_resn_resi = []
    return_list_resn_resi_sel = []
    group = "Match_%s"%(target_sel)

    for peptide in peptides:
        sequence, modification = peptide
        sequence = sequence.strip()
        modification = modification.strip()
        # Check input
        if modification[0].isdigit() or not modification[1:].isdigit():
            print("\nERROR: The modificaions should be in format of ex: K10\n")
            return
        #sel_str = "findseq"
        sel_str = sequence
        findseq.findseq(needle=sequence, haystack=target_sel, selName=sel_str)
        # Limit the selection to atom name CA
        atom_name = "CA"
        sel_str_atom = "%s_%s"%(sel_str, atom_name)
        # Make a sub selection with atom name, and delete old selection
        cmd.select(sel_str_atom, "%s and name %s"%(sel_str, atom_name))
        cmd.delete(sel_str)
        # Iterate
        stored.infolist = []
        # resv (int): the residue identifier (residue number), ss (str): secondary structure, name (str): the atom name
        expression = "stored.infolist.append([model, chain, resv, resn, ss, name])"
        # Iterate over selection, storing info
        cmd.iterate(sel_str_atom, expression)
        cmd.delete(sel_str_atom)

        # Check for results. Is there any found?
        if len(stored.infolist) == 0:
            print("\n#####################################################################")
            print("ERROR: The following sequence cannot be found: %s     %s"%(sequence, modification))
            print("\n#####################################################################\n")
            continue

        # Find resn and index for search. This is for the peptide
        resn = modification[0].upper()
        index = int(modification[1:]) - 1
        # This is for the mathc selection
        peptide_match_modification = stored.infolist[index]
        peptide_match_modification_model, peptide_match_modification_chain, peptide_match_modification_resv, peptide_match_modification_resn, \
            peptide_match_modification_ss, peptide_match_modification_name =  peptide_match_modification
        # Convert to single aa
        peptide_match_modification_resn = aa_3_1[peptide_match_modification_resn]
        # Convert ss, secondary structure, if ss=S (Sheet), or ss='' (Not Helix or Sheet)
        if peptide_match_modification_ss == '':
            peptide_match_modification_ss = 'L'

        # Check if the residue type match
        if peptide_match_modification_resn != resn:
            print("\n#####################################################################")
            print("ERROR: The match is not equal: %s=%s != %s"%(modification[0], resn, peptide_match_modification_resn))
            print("\n#####################################################################\n")
            continue

        # Do selection and group
        #peptide_match_modification_sel_str = "/%s/%s//%s"%(peptide_match_modification_model, peptide_match_modification_chain, peptide_match_modification_resv)
        peptide_match_modification_sel_str = "%s and chain %s and resi %s"%(peptide_match_modification_model, peptide_match_modification_chain, peptide_match_modification_resv)
        peptide_match_modification_resn_resi = "%s%s"%(peptide_match_modification_resn, peptide_match_modification_resv)
        peptide_match_modification_sel_str_text = "%s_%s_%s_%s"%(group, peptide_match_modification_resn_resi, modification, sequence)
        cmd.select(peptide_match_modification_sel_str_text, peptide_match_modification_sel_str)
        # Store
        if peptide_match_modification_resn_resi not in return_list_resn_resi:
            return_list_resn_resi.append(peptide_match_modification_resn_resi)
            return_list_resn_resi_sel.append(peptide_match_modification_sel_str)

        # Print
        if verb:
            print("The peptide=%s, with modification=%s, corresponds to resi=%s"%(sequence, modification, peptide_match_modification_resn_resi))

    # Group selections
    cmd.group(group, "%s_*"%group)
    cmd.select("%s_sel"%group, "%s_*"%group)
    cmd.show("lines", group)

    # If verbose
    if verb:
        print("\nThere are %i uniq matches, in target_sel=%s\n"%(len(return_list_resn_resi), target_sel))
    return return_list_resn_resi, return_list_resn_resi_sel
cmd.extend("match_peptides", match_peptides)


def get_resi_stats(target_sel="all", residues=[], group_id="X", atom="NZ", atom_dist=8, resi_n_term=8, resi_c_term=8,  verb=True):
    # The distance in angstrom to look for
    var_dist = 12

    if type(residues) != list:
        print("\nERROR: The residues should be supplied as a list\n")
        return

    # Get current setting
    ini_setting = cmd.get("dot_solvent")
    ini_setting2 = cmd.get("dot_density")
    # Increasing dot_density makes calculation slower, but not a big difference
    #cmd.set('dot_density', 3)

    # Make groups
    group = "Stats_%s_%s" % (target_sel, group_id)
    group_atom = "Stats_%s_%s_%s" % (atom, target_sel, group_id)
    group_chain = "Stats_%s_%s_%s" % ("chain", target_sel, group_id)
    group_3dweb = "Stats_%s_%s_%s" % ("3dweb", target_sel, group_id)

    # Make list for storing
    slist = []

    # Make file for writing
    wfileweblogo = open("resi_stats_weblogo_%s_%s.txt" % (target_sel, group_id), 'w')

    for residue in residues:
        residue = residue.strip()
        resn_1 = residue[0].upper()
        resn_3 = aa_1_3[resn_1]
        resi = int(residue[1:])
        # Check input
        if resn_1.isdigit():
            print("\nERROR: The residue should be in format of ex: K10\n")
            return

        # Do selection and group
        sel_str = "%s and resn %s and resi %s" % (target_sel, resn_3, resi)
        resn_resi = "%s%s" % (resn_1, resi)
        sel_str_text = "%s_%s_%s" % (target_sel, group_id, resn_resi)
        cmd.select(sel_str_text, sel_str)

        # Make quick test, to see if the atom is there
        sel_str_atom_test = "%s and name %s" % (sel_str_text, atom)
        test_str = "Test_nr_atoms"
        cmd.select(test_str, sel_str_atom_test)
        nr_test = cmd.count_atoms(test_str)
        if nr_test != 1:
            print("\nERROR: The selection '%s', has only nr of atoms:%s. SKIPPING"%(sel_str_atom_test, nr_test))
            continue

        # MSA = Molecular Surface Area
        cmd.set("dot_solvent", "off")
        MSA = cmd.get_area(sel_str)
        # SASA = Solvent Accessible Surface Area
        cmd.set("dot_solvent", "on")
        SASA = cmd.get_area(sel_str)

        # Get the chain residues
        chain = "."*(resi_n_term + resi_c_term + 1)
        chain_sec = "."*(resi_n_term + resi_c_term + 1)
        resi_sel_min = resi-resi_n_term
        if resi_sel_min < 1:
            resi_sel_min = 1
        resi_sel_max = resi+resi_c_term
        resi_sel = "%i-%i" % (resi_sel_min, resi_sel_max)

        # Make selection
        sel_str_chain = "%s and resi %s and name CA" % (target_sel, resi_sel)
        sel_str_text_chain = "%s_%s_%s_%s" % ("chain", target_sel, group_id, resn_resi)
        cmd.select(sel_str_text_chain, sel_str_chain)
        # Get the chain info
        stored.list_chain = []
        expression_chain="stored.list_chain.append([resi, resn, name, ss])"
        cmd.iterate(sel_str_text_chain, expression_chain)
        for chain_resi_info in stored.list_chain:
            chain_resi, chain_resn, chain_name, chain_ss = chain_resi_info
            # Convert ss, secondary structure, if ss=S (Sheet), or ss='' (Not Helix or Sheet)
            if chain_ss == '':
                chain_ss = 'L'
            chain_resi = int(chain_resi)
            try:
                chain_resn_1 = aa_3_1[chain_resn]
            except KeyError:
                chain_resn_1 = "."
            # Calculate index
            index = resi_n_term - (resi - chain_resi)
            # Replace in string for residue names
            chain = chain[:index] + chain_resn_1 + chain[index + 1:]
            # Replace in string for secondary structyre
            chain_sec = chain_sec[:index] + chain_ss + chain_sec[index + 1:]

        # Get number of neighbour atoms
        # Make selection for NZ atoms
        sel_str_atom = "%s and name %s" % (sel_str_text, atom)
        sel_str_text_atom = "%s_%s_%s_%s" % (atom, target_sel, group_id, resn_resi)
        cmd.select(sel_str_text_atom, sel_str_atom)

        # Make selection around NZ atom for fixed distance, and count
        sel_str_atom_around = "%s around %s and not (%s)" % (sel_str_text_atom, atom_dist, sel_str)
        sel_str_text_atom_around = "%s_around_%s_%s_%s" % (atom, target_sel, group_id, resn_resi)
        cmd.select(sel_str_text_atom_around, sel_str_atom_around)
        # Count around
        stored.list = []
        expression="stored.list.append([resi, resn, name])"
        cmd.iterate(sel_str_text_atom_around, expression)
        nr_atoms_around = len(stored.list)

        # Make selection around NZ atom for variable distance
        #for i in range(2, var_dist+1):
        for i in range(2, var_dist+1, 2):
            dist = i
            dist_pre = dist - 1
            # Select for an angstrom shorter
            sel_str_atom_3dweb_pre = "byres %s around %s" % (sel_str_text_atom, dist_pre)
            sel_str_text_atom_3dweb_pre = "%s_3dweb_pre_%s_%s_%s_%s_%s" % (atom, target_sel, group_id, resn_resi, dist, dist_pre)
            cmd.select(sel_str_text_atom_3dweb_pre, sel_str_atom_3dweb_pre)
            # Select at distance
            sel_str_atom_3dweb_post = "byres %s around %s" % (sel_str_text_atom, dist)
            sel_str_text_atom_3dweb_post = "%s_3dweb_post_%s_%s_%s_%s_%s" % (atom, target_sel, group_id, resn_resi, dist, dist)
            cmd.select(sel_str_text_atom_3dweb_post, sel_str_atom_3dweb_post)
            # Make selection for uniq residues with shell
            sel_str_text_atom_3dweb_sel = "%s_3dweb_sel_%s_%s_%s_%s" % (atom, target_sel, group_id, resn_resi, dist)
            cmd.select(sel_str_text_atom_3dweb_sel, "(%s and not %s) and name CA" % (sel_str_atom_3dweb_post, sel_str_atom_3dweb_pre))
            # delete
            cmd.delete(sel_str_text_atom_3dweb_pre)
            cmd.delete(sel_str_text_atom_3dweb_post)
            # Loop through selecion
            stored.list_3dweb = []
            expression_3dweb="stored.list_3dweb.append([resi, resn, name])"
            cmd.iterate(sel_str_text_atom_3dweb_sel, expression_3dweb)
            for web3d_residues in stored.list_3dweb:
                web3d_resi, web3d_resn, web3d_name = web3d_residues
                try:
                    web3d_resn_1 = aa_3_1[web3d_resn]
                except KeyError:
                    web3d_resn_1 = "."
                # Write http://weblogo.threeplusone.com/ file
                FASTA_text = "> %s %s %s %s %s, dist=%s resi=%s resn=%s %s" %(target_sel, group_id, resi, resn_1, resn_3, dist, web3d_resi, web3d_resn_1, web3d_resn)
                weblogo = "."*(var_dist)
                weblogo = weblogo[:i-1] + web3d_resn_1 + weblogo[i:]
                # Write
                wfileweblogo.write(FASTA_text + "\n")
                wfileweblogo.write(weblogo + "\n")

        # Store info
        slist.append([target_sel, group_id, resn_resi, resn_1, resi, MSA, SASA, nr_atoms_around, chain, chain_sec])

    # Group selections
    cmd.group(group, "%s_%s_*" % (target_sel, group_id))
    cmd.select("%s_sel"%group, "%s_%s_*" % (target_sel, group_id))
    # Group around
    cmd.group(group_chain, "%s_%s_%s*" % ("chain",target_sel, group_id) )
    cmd.group(group_atom, "%s_%s_%s_*" % (atom, target_sel, group_id))
    cmd.group(group_atom, "%s_around_%s_%s_*" % (atom, target_sel, group_id))
    cmd.group(group_3dweb, "%s_3dweb_sel_%s_%s_*" % (atom, target_sel, group_id))

    # Write output
    wfile = open("resi_stats_%s_%s.csv" % (target_sel, group_id), 'w')
    wfile.write("target_sel;group_id;resn_resi;resn;resi;MSA;SASA;nr_atoms_around;chain;chain_sec"+"\n")
    for i in slist:
        wfile.write("%s;%s;%s;%s;%i;%3.0f;%3.0f;%i;%s;%s" % (i[0], i[1], i[2], i[3], i[4], i[5], i[6], i[7], i[8], i[9]) + "\n")
    wfile.close()
    wfileweblogo.close()

    # Back to before
    cmd.set("dot_solvent", ini_setting)
    cmd.set('dot_density', ini_setting2)

cmd.extend("match_peptides", match_peptides)