From 03d57926bfad95b686281831d91d4a525bedefa6 Mon Sep 17 00:00:00 2001
From: David Kastner <kastner@mit.edu>
Date: Sat, 27 Jan 2024 23:47:05 -0500
Subject: [PATCH] New script for cc coupling analysis

---
 pyqmmm/cli.py                  |   8 +-
 pyqmmm/md/cc_coupling.py       | 130 +++++++++++++++++++++++++++++++++
 pyqmmm/md/hbond_analyzer.py    |   2 +-
 pyqmmm/qm/orca_scan_plotter.py |  59 ++++++++-------
 4 files changed, 169 insertions(+), 30 deletions(-)
 create mode 100644 pyqmmm/md/cc_coupling.py

diff --git a/pyqmmm/cli.py b/pyqmmm/cli.py
index ceba7ab..cbb7ef7 100644
--- a/pyqmmm/cli.py
+++ b/pyqmmm/cli.py
@@ -171,7 +171,6 @@ def md(
         import pyqmmm.md.quickcsa
         pyqmmm.md.quickcsa.quick_csa()
 
-
 @cli.command()
 @click.option("--plot_energy", "-pe", is_flag=True, help="Plot the energy of a xyz traj.")
 @click.option("--flip_xyz", "-f", is_flag=True, help="Reverse and xyz trajectory.")
@@ -245,13 +244,14 @@ def qm(
             atom_pairs = [(145, 146), (65, 145), (66, 145), (12, 145), (32, 145), (145, 149)]
             pyqmmm.qm.bond_valence.calculate_bond_valence(atom_pairs, 4)
             pyqmmm.qm.bond_valence.plot_bond_valence()
+            
     if orca_scan:
         import pyqmmm.qm.orca_scan_plotter
         atom_1 = input("   > What is your first atom being scanned? ")
         atom_2 = input("   > What is your second atom being scanned? ")
-
-        df = pyqmmm.qm.orca_scan_plotter.read_orca_output("orca.out")
-        pyqmmm.qm.orca_scan_plotter.plot_energy(df, atom_1, atom_2)
+        distances, relative_energies = pyqmmm.qm.orca_scan_plotter.read_orca_output("orca.out")
+        print(f"   > Start distance: {distances[0]}, End distance: {distances[-1]}\n")
+        pyqmmm.qm.orca_scan_plotter.plot_energy(distances, relative_energies, atom_1, atom_2)
 
 if __name__ == "__main__":
     # Run the command-line interface when this script is executed
diff --git a/pyqmmm/md/cc_coupling.py b/pyqmmm/md/cc_coupling.py
new file mode 100644
index 0000000..487d7b6
--- /dev/null
+++ b/pyqmmm/md/cc_coupling.py
@@ -0,0 +1,130 @@
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from typing import List
+import matplotlib.pyplot as plt
+
+def format_plot() -> None:
+    """
+    General plotting parameters for the Kulik Lab.
+
+    """
+    font = {"family": "sans-serif", "weight": "bold", "size": 10}
+    plt.rc("font", **font)
+    plt.rcParams["xtick.major.pad"] = 5
+    plt.rcParams["ytick.major.pad"] = 5
+    plt.rcParams["axes.linewidth"] = 2
+    plt.rcParams["xtick.major.size"] = 7
+    plt.rcParams["xtick.major.width"] = 2
+    plt.rcParams["ytick.major.size"] = 7
+    plt.rcParams["ytick.major.width"] = 2
+    plt.rcParams["xtick.direction"] = "in"
+    plt.rcParams["ytick.direction"] = "in"
+    plt.rcParams["xtick.top"] = True
+    plt.rcParams["ytick.right"] = True
+    plt.rcParams["svg.fonttype"] = "none"
+
+def heatmap(data: str, residues: List[str], delete: List[int] = [], 
+            out_file: str = "heatmap", v=[-0.4, 0.4]) -> None:
+    
+    """
+    Generates formatted heat maps.
+
+    Uses the Kulik Lab figure formatting standards.
+
+    Parameters
+    ----------
+    data: str
+        The name of the data file, which can be a data or a dat file.
+    delete: List[int]
+        A list of the amino acids you would like removed indexed at zero.
+    out_file: str
+        The name you would like the image saved as.
+
+    Examples
+    --------
+    heatmap(data="cacovar.dat", protein="mc6sa", delete=[0,15,16,27,28,29], out_file="matrix_geom.svg")
+
+    """
+
+    # General styling variables
+    light_gray = "#8e8e8e"
+    dark_gray = "#7e7e7e"
+    
+    # Sort the delete list
+    delete.sort()
+    
+    # Delete residues
+    residues = [item for index, item in enumerate(residues) if index not in delete]
+
+    # Identify matrix format and read in
+    contents = open(data, "r").readlines()
+    contents_joined = " ".join(contents)  # Create a single string for parsing
+    if "," in contents_joined:  # If a csv file
+        matrix = np.genfromtxt(data, delimiter=",")
+    elif " " in contents_joined:  # If a dat file
+        matrix = []
+        for line in contents:
+            matrix.append(line.split())
+        matrix = [[float(j) for j in i] for i in matrix]  # Strings to float
+        matrix = np.array(matrix)
+
+    np.fill_diagonal(matrix, 0)  # Set the diagonal to zero as they are trivial
+
+    df = pd.DataFrame(matrix)
+    # Remove specific rows and columns from non-residues
+
+    # Drop rows and columns
+    if len(delete) > 0:
+        df = df.drop(delete, axis=0)
+        df = df.drop(delete, axis=1)
+
+    df.columns = residues
+    df.index = residues
+
+    # Apply base Kulik plot parameters
+    format_plot()
+
+    # Set cmap
+    cmap = "RdBu" if any(x < 0 for x in v) else "Blues"
+
+    # Generate plot
+    ax = sns.heatmap(
+        df,
+        cmap=cmap,
+        vmin=v[0],
+        vmax=v[1],
+        xticklabels=True,
+        yticklabels=True,
+        linewidth=0.03,
+        linecolor=light_gray,
+    )
+    ax.get_yaxis().set_tick_params(direction="out")
+    plt.ylabel("residues", fontsize=10, weight="bold")
+    plt.xlabel("residues", fontsize=10, weight="bold")
+    plt.tick_params(axis="y", which="major", labelsize=8, rotation=0, length=0)
+    plt.tick_params(axis="x", which="major", labelsize=8, rotation=90, length=0)
+    ax.xaxis.tick_top()  # x axis on top
+    ax.xaxis.set_label_position("top")
+    # Add lines every five residues
+    ax.hlines([5, 10, 15, 20, 25], colors=dark_gray, *ax.get_xlim(), linewidth=1.5)
+    ax.vlines([5, 10, 15, 20, 25], colors=dark_gray, *ax.get_ylim(), linewidth=1.5)
+    # Add broders
+    ax.hlines([0, len(residues)], colors="k", *ax.get_xlim(), linewidth=3.5)
+    ax.vlines([0, len(residues)], colors="k", *ax.get_ylim(), linewidth=3.5)
+
+    extensions = ["png", "svg"]
+    for ext in extensions:
+        plt.savefig(f"{out_file}.{ext}", bbox_inches="tight", format=ext, dpi=300)
+    plt.close()
+
+if __name__ == "__main__":
+    pdb_path = f"../1_cluster/1/rep.c0.pdb"
+    residues = qa.process.get_protein_sequence(pdb_path)
+    delete = []
+    heatmap(
+        data="cacovar.dat",
+        residues=residues,
+        delete=delete,
+        out_file="matrix_geom.png",
+    )
\ No newline at end of file
diff --git a/pyqmmm/md/hbond_analyzer.py b/pyqmmm/md/hbond_analyzer.py
index 91b9b0e..9ebf179 100644
--- a/pyqmmm/md/hbond_analyzer.py
+++ b/pyqmmm/md/hbond_analyzer.py
@@ -247,7 +247,7 @@ def plot(data, file_path):
     df = df[df.ge(0.1).all(axis=1)]  # 10% occurence cutoff
 
     # Sort dataframe by occurrence in descending order and select the top 7 rows
-    df = df.sort_values(by=[df.columns[0]], ascending=False).head(7)
+    df = df.sort_values(by=[df.columns[0]], ascending=False).head(15)
 
     ax = df.plot.bar(
         color="darkgray", figsize=(4, 4)
diff --git a/pyqmmm/qm/orca_scan_plotter.py b/pyqmmm/qm/orca_scan_plotter.py
index 6331daf..23de258 100644
--- a/pyqmmm/qm/orca_scan_plotter.py
+++ b/pyqmmm/qm/orca_scan_plotter.py
@@ -1,5 +1,12 @@
+"""
+Plots the results of an ORCA scan.
+
+This script was difficult to write because of a strange matplotlib issue.
+No matter how I entered the data, maplotlib would autosort the x-axis.
+To fix this, I had to manually set the axes and add padding.
+"""
+
 import matplotlib.pyplot as plt
-import pandas as pd
 
 HARTREE_TO_KCAL_MOL = 627.509
 
@@ -27,8 +34,10 @@ def read_orca_output(file_name):
         lines = file.readlines()
     
     start_reading = False
-    data = []
+    distances = []
+    relative_energies = []
     first_energy_kcal_mol = None
+
     for line in lines:
         if "The Calculated Surface using the 'Actual Energy'" in line:
             start_reading = True
@@ -43,45 +52,45 @@ def read_orca_output(file_name):
                     if first_energy_kcal_mol is None:
                         first_energy_kcal_mol = energy_kcal_mol
                     relative_energy = energy_kcal_mol - first_energy_kcal_mol
-                    data.append((distance, relative_energy))
+                    distances.append(distance)
+                    relative_energies.append(relative_energy)
                 except ValueError:
                     break
             else:
                 break
-    df = pd.DataFrame(data, columns=['Distance', 'Relative Energy'])
 
-    return df
+    return distances, relative_energies
 
-def plot_energy(df, atom_1, atom_2):
+def plot_energy(distances, relative_energies, atom_1, atom_2):
     format_plot()
-    
+
     # Create a figure with adjustable size
-    fig = plt.figure(figsize=(4, 4))  # Start with a larger figure size
-    ax = fig.add_subplot(111)
+    fig, ax = plt.subplots(figsize=(4, 4))
 
-    # Plot the data
-    ax.plot(df['Distance'], df['Relative Energy'], marker='o', color='b')
-    ax.set_xlabel(f"{atom_1}···{atom_2} distance (Å)", weight="bold")
-    ax.set_ylabel("relative energy (kcal/mol)", weight="bold")
+    # Plot the data using lists
+    ax.scatter(distances, relative_energies, marker='o', color='b')
 
-    # Set x-axis and y-axis limits explicitly based on the data range
-    x_min, x_max = df['Distance'].min(), df['Distance'].max()
-    y_min, y_max = df['Relative Energy'].min(), df['Relative Energy'].max()
-    x_range = x_max - x_min
-    y_range = y_max - y_min
-    ax.set_xlim(x_min - 0.05 * x_range, x_max + 0.05 * x_range)
-    ax.set_ylim(y_min - 0.05 * y_range, y_max + 0.05 * y_range)
+    # Set the x-axis limits based on the first and last values in the distances list
+    x_range = max(distances) - min(distances)
+    padding = x_range * 0.06
 
-    # Adjust the aspect ratio of the plot area to be square based on data ranges
-    ratio = x_range / y_range
-    ax.set_aspect(ratio, adjustable='box')
+    # Check if its ascending or descending to add padding
+    if distances[0] < distances[-1]:
+        ax.set_xlim([distances[0] - padding, distances[-1] + padding])
+    else: 
+        ax.set_xlim([distances[0] + padding, distances[-1] - padding])
+    
+    ax.set_xlabel(f"{atom_1}···{atom_2} distance (Å)", weight="bold")
+    ax.set_ylabel("Relative energy (kcal/mol)", weight="bold")
 
     plt.savefig("energy_scan.png", bbox_inches="tight", dpi=600)
     plt.savefig("energy_scan.svg", bbox_inches="tight", format="svg")
 
+# Main execution
 if __name__ == "__main__":
     atom_1 = input("   > What is your first atom being scanned? ")
     atom_2 = input("   > What is your second atom being scanned? ")
 
-    df = read_orca_output("orca.out")
-    plot_energy(df, atom_1, atom_2)
+    distances, relative_energies = read_orca_output("orca.out")
+    print(f"   > Start distance: {distances[0]}, End distance: {distances[-1]}\n")
+    plot_energy(distances, relative_energies, atom_1, atom_2)