Merge pull request #256 from UC-Davis-molecular-computing/255-matplot…

…lib-histogram-of-nearest-neighbor-energies closes #255 matplotlib histogram of nearest neighbor energies
UC-Davis-molecular-computing · Apr 4, 2024 · e3ad89c · e3ad89c
2 parents fb9793c + 1e97db9
commit e3ad89c
Showing 1 changed file with 94 additions and 7 deletions.
diff --git a/nuad/np.py b/nuad/np.py
@@ -522,7 +522,7 @@ def longest_common_substrings_singlea1(a1: np.ndarray, a2s: np.ndarray) \
     idx_longest_raveled = np.argmax(counter_flat, axis=1)
     len_longest = counter_flat[np.arange(counter_flat.shape[0]), idx_longest_raveled]
 
-    idx_longest = np.unravel_index(idx_longest_raveled, dims=(len_a1 + 1, len_a2 + 1))
+    idx_longest = np.unravel_index(idx_longest_raveled, shape=(len_a1 + 1, len_a2 + 1))
     a1idx_longest = idx_longest[0] - len_longest
     a2idx_longest = idx_longest[1] - len_longest
 
@@ -583,7 +583,7 @@ def _longest_common_substrings_pairs(a1s: np.ndarray, a2s: np.ndarray) \
     idx_longest_raveled = np.argmax(counter_flat, axis=1)
     len_longest = counter_flat[np.arange(counter_flat.shape[0]), idx_longest_raveled]
 
-    idx_longest = np.unravel_index(idx_longest_raveled, dims=(len_a1 + 1, len_a2 + 1))
+    idx_longest = np.unravel_index(idx_longest_raveled, shape=(len_a1 + 1, len_a2 + 1))
     a1idx_longest = idx_longest[0] - len_longest
     a2idx_longest = idx_longest[1] - len_longest
 
@@ -669,7 +669,7 @@ def _strongest_common_substrings_all_pairs(a1s: np.ndarray, a2s: np.ndarray, tem
     len_strongest = counter_flat[np.arange(counter_flat.shape[0]), idx_strongest_raveled]
     energy_strongest = energies_flat[np.arange(counter_flat.shape[0]), idx_strongest_raveled]
 
-    idx_strongest = np.unravel_index(idx_strongest_raveled, dims=(len_a1 + 1, len_a2 + 1))
+    idx_strongest = np.unravel_index(idx_strongest_raveled, shape=(len_a1 + 1, len_a2 + 1))
     a1idx_strongest = idx_strongest[0] - len_strongest
     a2idx_strongest = idx_strongest[1] - len_strongest
 
@@ -902,10 +902,6 @@ def write_to_file(self, filename: str) -> None:
             for i in range(self.numseqs):
                 f.write(self.get_seq_str(i) + '\n')
 
-    def wcenergy(self, idx: int, temperature: float) -> float:
-        """Return energy of idx'th sequence binding to its complement."""
-        return wcenergy(self.seqarr[idx], temperature)
-
     def __repr__(self) -> str:
         return 'DNASeqSet(seqs={})'.format(str([self[i] for i in range(self.numseqs)]))
 
@@ -1283,3 +1279,94 @@ def calculate_wc_energies(seqarr: np.ndarray, temperature: float, negate: bool =
 def wc_arr(seqarr: np.ndarray) -> np.ndarray:
     """Return numpy array of reverse complements of sequences in `seqarr`."""
     return (3 - seqarr)[:, ::-1]
+
+
+def energy_hist(length: int | Iterable[int], temperature: float = 37,
+                combine_lengths: bool = False, show: bool = False,
+                num_random_sequences: int = 100_000,
+                figsize: Tuple[int, int] = (15, 6), **kwargs) -> None:
+    """
+    Make a matplotlib histogram of the nearest-neighbor energies (as defined by
+    :meth:`DNASeqList.energies`) of all DNA sequences of the given length(s),
+    or a randomly selected subset if length(s) is too large to enumerate all DNA sequences
+    of that length.
+
+    This is useful, for example, to choose low and high energy values to pass to 
+    :any:`NearestNeighborEnergyFilter`.
+
+    :param length:
+        length of DNA sequences to consider, or an iterable (e.g., list) of lengths
+    :param temperature:
+        temperature in Celsius
+    :param combine_lengths:
+        If True, then `length` should be an iterable, and the histogram will combine all calculated energies
+        from all lengths into one histogram to plot. If False (the default), then different lengths are
+        plotted in different colors in the histogram.
+    :param show:
+        If False, then the histogram plotted, but matplotlib.pyplot.show() is not called.
+        This allows one to tweak aspects of the histogram after it is plotted calling functions in
+        matplotlib.pyplot.
+        If True, then the histogram is displayed using matplotlib.pyplot.show().
+        This is convenient for making several histogram plots in a single notebook cell.
+        (Otherwise the several calls to matplotlib.pyplot.hist will overwrite each other.)
+    :param num_random_sequences:
+        If the length is too large to enumerate all DNA sequences of that length,
+        then this many random sequences are used to estimate the histogram.
+    :param figsize:
+        Size of the figure in inches.
+    :param kwargs:
+        Any keyword arguments given are passed along as keyword arguments to matplotlib.pyplot.hist:
+        https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html
+    """
+    import matplotlib.pyplot as plt
+
+    if combine_lengths and isinstance(length, int):
+        raise ValueError(f'length must be an iterable if combine_lengths is False, '
+                         f'but it is the int {length}')
+
+    plt.figure(figsize=figsize)
+    plt.xlabel(f'Nearest-neighbor energy (kcal/mol)')
+
+    lengths = [length] if isinstance(length, int) else length
+
+    alpha = 1
+    if len(lengths) > 1:
+        alpha = 0.5
+        if 'label' in kwargs:
+            raise ValueError(f'label (={kwargs["label"]}) '
+                             f'should not be specified if multiple lengths are given')
+
+    bins = kwargs.pop('bins', 20)
+
+    all_energies = []
+    for length in lengths:
+        if length < 9:
+            seqs = DNASeqList(length=length)
+        else:
+            seqs = DNASeqList(length=length, num_random_seqs=num_random_sequences)
+        energies = seqs.energies(temperature=temperature)
+
+        if combine_lengths:
+            all_energies.extend(energies)
+        else:
+            label = kwargs['label'] if 'label' in kwargs else f'length {length}'
+            _ = plt.hist(energies, alpha=alpha, label=label, bins=bins, **kwargs)
+
+    if combine_lengths:
+        if 'label' in kwargs:
+            label = kwargs['label']
+            del kwargs['label']
+        else:
+            if len(lengths) == 1:
+                label = f'length {length}'
+            else:
+                lengths_delimited = ', '.join(map(str, lengths))
+                label = f'lengths {lengths_delimited} combined'
+        _ = plt.hist(all_energies, alpha=alpha, label=label, bins=bins, **kwargs)
+
+    plt.legend(loc='upper right')
+    title = kwargs.pop('title', f'Nearest-neighbor energies of DNA sequences at {temperature} C')
+    plt.title(title)
+
+    if show:
+        plt.show()