From af17b86c567265e1660beb708fb7bc700a00d1d5 Mon Sep 17 00:00:00 2001
From: David Doty <doty@ucdavis.edu>
Date: Thu, 4 Apr 2024 15:51:17 -0700
Subject: [PATCH 1/5] closes #255 matplotlib histogram of nearest neighbor
 energies

---
 nuad/np.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 88 insertions(+), 7 deletions(-)

diff --git a/nuad/np.py b/nuad/np.py
index c72b30f2..4ba3282c 100644
--- a/nuad/np.py
+++ b/nuad/np.py
@@ -522,7 +522,7 @@ def longest_common_substrings_singlea1(a1: np.ndarray, a2s: np.ndarray) \
     idx_longest_raveled = np.argmax(counter_flat, axis=1)
     len_longest = counter_flat[np.arange(counter_flat.shape[0]), idx_longest_raveled]
 
-    idx_longest = np.unravel_index(idx_longest_raveled, dims=(len_a1 + 1, len_a2 + 1))
+    idx_longest = np.unravel_index(idx_longest_raveled, shape=(len_a1 + 1, len_a2 + 1))
     a1idx_longest = idx_longest[0] - len_longest
     a2idx_longest = idx_longest[1] - len_longest
 
@@ -583,7 +583,7 @@ def _longest_common_substrings_pairs(a1s: np.ndarray, a2s: np.ndarray) \
     idx_longest_raveled = np.argmax(counter_flat, axis=1)
     len_longest = counter_flat[np.arange(counter_flat.shape[0]), idx_longest_raveled]
 
-    idx_longest = np.unravel_index(idx_longest_raveled, dims=(len_a1 + 1, len_a2 + 1))
+    idx_longest = np.unravel_index(idx_longest_raveled, shape=(len_a1 + 1, len_a2 + 1))
     a1idx_longest = idx_longest[0] - len_longest
     a2idx_longest = idx_longest[1] - len_longest
 
@@ -669,7 +669,7 @@ def _strongest_common_substrings_all_pairs(a1s: np.ndarray, a2s: np.ndarray, tem
     len_strongest = counter_flat[np.arange(counter_flat.shape[0]), idx_strongest_raveled]
     energy_strongest = energies_flat[np.arange(counter_flat.shape[0]), idx_strongest_raveled]
 
-    idx_strongest = np.unravel_index(idx_strongest_raveled, dims=(len_a1 + 1, len_a2 + 1))
+    idx_strongest = np.unravel_index(idx_strongest_raveled, shape=(len_a1 + 1, len_a2 + 1))
     a1idx_strongest = idx_strongest[0] - len_strongest
     a2idx_strongest = idx_strongest[1] - len_strongest
 
@@ -902,10 +902,6 @@ def write_to_file(self, filename: str) -> None:
             for i in range(self.numseqs):
                 f.write(self.get_seq_str(i) + '\n')
 
-    def wcenergy(self, idx: int, temperature: float) -> float:
-        """Return energy of idx'th sequence binding to its complement."""
-        return wcenergy(self.seqarr[idx], temperature)
-
     def __repr__(self) -> str:
         return 'DNASeqSet(seqs={})'.format(str([self[i] for i in range(self.numseqs)]))
 
@@ -1283,3 +1279,88 @@ def calculate_wc_energies(seqarr: np.ndarray, temperature: float, negate: bool =
 def wc_arr(seqarr: np.ndarray) -> np.ndarray:
     """Return numpy array of reverse complements of sequences in `seqarr`."""
     return (3 - seqarr)[:, ::-1]
+
+def energy_hist(length: int | Iterable[int], temperature: float = 37,
+                combine_lengths: bool = False, show: bool = False,
+                num_random_sequences: int = 100_000,
+                figsize: Tuple[int,int] = (15,6), **kwargs) -> None:
+    """
+    Make a matplotlib histogram of the nearest-neighbor energies (as defined by
+    :meth:`DNASeqList.energies`) of all DNA sequences of the given length(s),
+    or a randomly selected subset if length(s) is too large to enumerate all DNA sequences
+    of that length.
+
+    :param length:
+        length of DNA sequences to consider, or an iterable (e.g., list) of lengths
+    :param temperature:
+        temperature in Celsius
+    :param combine_lengths:
+        If True, then `length` should be an iterable, and the histogram will combine all lengths
+        into one. If False (the default), then different lengths are plotted in different colors
+        in the historgram.
+    :param show:
+        If False, then the histogram plotted, but matplotlib.pyplot.show() is not called.
+        This allows one to tweak aspects of the histogram after it is plotted.
+        If True, then the histogram is displayed using matplotlib.pyplot.show().
+        This is convenient for making several histograms plots in a single notebook cell.
+    :param num_random_sequences:
+        If the length is too large to enumerate all DNA sequences of that length,
+        then this many random sequences are used to estimate the histogram.
+    :param figsize:
+        Size of the figure in inches.
+    :param kwargs:
+        Any keyword arguments given are passed along as keyword arguments to matplotlib.pyplot.hist:
+        https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html
+    """
+    import matplotlib.pyplot as plt
+
+    if combine_lengths and isinstance(length, int):
+        raise ValueError(f'length must be an iterable if combine_lengths is False, '
+                         f'but it is the int {length}')
+
+    plt.figure(figsize=figsize)
+    plt.xlabel(f'Nearest-neighbor energy (kcal/mol)')
+
+    lengths = [length] if isinstance(length, int) else length
+
+    alpha = 1
+    if len(lengths) > 1:
+        alpha = 0.5
+        if 'label' in kwargs:
+            raise ValueError(f'label (={kwargs["label"]}) '
+                             f'should not be specified if multiple lengths are given')
+
+    bins = kwargs.pop('bins', 20)
+
+    all_energies = []
+    for length in lengths:
+        if length < 9:
+            seqs = DNASeqList(length=length)
+        else:
+            seqs = DNASeqList(length=length, num_random_seqs=num_random_sequences)
+        energies = seqs.energies(temperature=temperature)
+
+        if combine_lengths:
+            all_energies.extend(energies)
+        else:
+            label = kwargs['label'] if 'label' in kwargs else f'length {length}'
+            _ = plt.hist(energies, alpha=alpha, label=label, bins=bins, **kwargs)
+
+    if combine_lengths:
+        if 'label' in kwargs:
+            label = kwargs['label']
+            del kwargs['label']
+        else:
+            if len(lengths) == 1:
+                label = f'length {length}'
+            else:
+                lengths_delimited = ', '.join(map(str, lengths))
+                label = f'lengths {lengths_delimited} combined'
+        _ = plt.hist(all_energies, alpha=alpha, label=label, bins=bins, **kwargs)
+
+    plt.legend(loc='upper right')
+    title = kwargs.pop('title', f'Nearest-neighbor energies of DNA sequences at {temperature} C')
+    plt.title(title)
+
+    if show:
+        plt.show()
\ No newline at end of file

From aa6bd7091a79e01254d61083700c7e025d00bc79 Mon Sep 17 00:00:00 2001
From: David Doty <doty@ucdavis.edu>
Date: Thu, 4 Apr 2024 15:52:39 -0700
Subject: [PATCH 2/5] Update np.py

---
 nuad/np.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nuad/np.py b/nuad/np.py
index 4ba3282c..19b0a6ed 100644
--- a/nuad/np.py
+++ b/nuad/np.py
@@ -1290,6 +1290,9 @@ def energy_hist(length: int | Iterable[int], temperature: float = 37,
     or a randomly selected subset if length(s) is too large to enumerate all DNA sequences
     of that length.
 
+    This is useful, for example, to choose low and high energy values to pass to 
+    :any:`NearestNeighborEnergyFilter`.
+
     :param length:
         length of DNA sequences to consider, or an iterable (e.g., list) of lengths
     :param temperature:

From eaaabf95762c00ddcd4a0f2d46e69dd37fbe2a9c Mon Sep 17 00:00:00 2001
From: David Doty <doty@ucdavis.edu>
Date: Thu, 4 Apr 2024 15:53:35 -0700
Subject: [PATCH 3/5] Update np.py

---
 nuad/np.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nuad/np.py b/nuad/np.py
index 19b0a6ed..47bbc2bf 100644
--- a/nuad/np.py
+++ b/nuad/np.py
@@ -1300,7 +1300,7 @@ def energy_hist(length: int | Iterable[int], temperature: float = 37,
     :param combine_lengths:
         If True, then `length` should be an iterable, and the histogram will combine all lengths
         into one. If False (the default), then different lengths are plotted in different colors
-        in the historgram.
+        in the histogram.
     :param show:
         If False, then the histogram plotted, but matplotlib.pyplot.show() is not called.
         This allows one to tweak aspects of the histogram after it is plotted.

From 36704acca458a31c6aa764e1c2adb2c4541b65e7 Mon Sep 17 00:00:00 2001
From: David Doty <doty@ucdavis.edu>
Date: Thu, 4 Apr 2024 15:55:36 -0700
Subject: [PATCH 4/5] Update np.py

---
 nuad/np.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/nuad/np.py b/nuad/np.py
index 47bbc2bf..38cac559 100644
--- a/nuad/np.py
+++ b/nuad/np.py
@@ -1298,14 +1298,16 @@ def energy_hist(length: int | Iterable[int], temperature: float = 37,
     :param temperature:
         temperature in Celsius
     :param combine_lengths:
-        If True, then `length` should be an iterable, and the histogram will combine all lengths
-        into one. If False (the default), then different lengths are plotted in different colors
-        in the histogram.
+        If True, then `length` should be an iterable, and the histogram will combine all calculated energies
+        from all lengths into one histogram to plot. If False (the default), then different lengths are
+        plotted in different colors in the histogram.
     :param show:
         If False, then the histogram plotted, but matplotlib.pyplot.show() is not called.
-        This allows one to tweak aspects of the histogram after it is plotted.
+        This allows one to tweak aspects of the histogram after it is plotted calling functions in
+        matplotlib.pyplot.
         If True, then the histogram is displayed using matplotlib.pyplot.show().
-        This is convenient for making several histograms plots in a single notebook cell.
+        This is convenient for making several histogram plots in a single notebook cell.
+        (Otherwise the several calls to matplotlib.pyplot.hist will overwrite each other.)
     :param num_random_sequences:
         If the length is too large to enumerate all DNA sequences of that length,
         then this many random sequences are used to estimate the histogram.

From 1e97db9a356d61a2ea7a9d566a9201e4090d44ea Mon Sep 17 00:00:00 2001
From: David Doty <doty@ucdavis.edu>
Date: Thu, 4 Apr 2024 15:56:41 -0700
Subject: [PATCH 5/5] Update np.py

---
 nuad/np.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/nuad/np.py b/nuad/np.py
index 38cac559..43c53937 100644
--- a/nuad/np.py
+++ b/nuad/np.py
@@ -1280,10 +1280,11 @@ def wc_arr(seqarr: np.ndarray) -> np.ndarray:
     """Return numpy array of reverse complements of sequences in `seqarr`."""
     return (3 - seqarr)[:, ::-1]
 
+
 def energy_hist(length: int | Iterable[int], temperature: float = 37,
                 combine_lengths: bool = False, show: bool = False,
                 num_random_sequences: int = 100_000,
-                figsize: Tuple[int,int] = (15,6), **kwargs) -> None:
+                figsize: Tuple[int, int] = (15, 6), **kwargs) -> None:
     """
     Make a matplotlib histogram of the nearest-neighbor energies (as defined by
     :meth:`DNASeqList.energies`) of all DNA sequences of the given length(s),
@@ -1368,4 +1369,4 @@ def energy_hist(length: int | Iterable[int], temperature: float = 37,
     plt.title(title)
 
     if show:
-        plt.show()
\ No newline at end of file
+        plt.show()