From af17b86c567265e1660beb708fb7bc700a00d1d5 Mon Sep 17 00:00:00 2001 From: David Doty Date: Thu, 4 Apr 2024 15:51:17 -0700 Subject: [PATCH 1/5] closes #255 matplotlib histogram of nearest neighbor energies --- nuad/np.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/nuad/np.py b/nuad/np.py index c72b30f2..4ba3282c 100644 --- a/nuad/np.py +++ b/nuad/np.py @@ -522,7 +522,7 @@ def longest_common_substrings_singlea1(a1: np.ndarray, a2s: np.ndarray) \ idx_longest_raveled = np.argmax(counter_flat, axis=1) len_longest = counter_flat[np.arange(counter_flat.shape[0]), idx_longest_raveled] - idx_longest = np.unravel_index(idx_longest_raveled, dims=(len_a1 + 1, len_a2 + 1)) + idx_longest = np.unravel_index(idx_longest_raveled, shape=(len_a1 + 1, len_a2 + 1)) a1idx_longest = idx_longest[0] - len_longest a2idx_longest = idx_longest[1] - len_longest @@ -583,7 +583,7 @@ def _longest_common_substrings_pairs(a1s: np.ndarray, a2s: np.ndarray) \ idx_longest_raveled = np.argmax(counter_flat, axis=1) len_longest = counter_flat[np.arange(counter_flat.shape[0]), idx_longest_raveled] - idx_longest = np.unravel_index(idx_longest_raveled, dims=(len_a1 + 1, len_a2 + 1)) + idx_longest = np.unravel_index(idx_longest_raveled, shape=(len_a1 + 1, len_a2 + 1)) a1idx_longest = idx_longest[0] - len_longest a2idx_longest = idx_longest[1] - len_longest @@ -669,7 +669,7 @@ def _strongest_common_substrings_all_pairs(a1s: np.ndarray, a2s: np.ndarray, tem len_strongest = counter_flat[np.arange(counter_flat.shape[0]), idx_strongest_raveled] energy_strongest = energies_flat[np.arange(counter_flat.shape[0]), idx_strongest_raveled] - idx_strongest = np.unravel_index(idx_strongest_raveled, dims=(len_a1 + 1, len_a2 + 1)) + idx_strongest = np.unravel_index(idx_strongest_raveled, shape=(len_a1 + 1, len_a2 + 1)) a1idx_strongest = idx_strongest[0] - len_strongest a2idx_strongest = idx_strongest[1] - len_strongest @@ -902,10 +902,6 @@ def write_to_file(self, filename: str) -> None: for i in range(self.numseqs): f.write(self.get_seq_str(i) + '\n') - def wcenergy(self, idx: int, temperature: float) -> float: - """Return energy of idx'th sequence binding to its complement.""" - return wcenergy(self.seqarr[idx], temperature) - def __repr__(self) -> str: return 'DNASeqSet(seqs={})'.format(str([self[i] for i in range(self.numseqs)])) @@ -1283,3 +1279,88 @@ def calculate_wc_energies(seqarr: np.ndarray, temperature: float, negate: bool = def wc_arr(seqarr: np.ndarray) -> np.ndarray: """Return numpy array of reverse complements of sequences in `seqarr`.""" return (3 - seqarr)[:, ::-1] + +def energy_hist(length: int | Iterable[int], temperature: float = 37, + combine_lengths: bool = False, show: bool = False, + num_random_sequences: int = 100_000, + figsize: Tuple[int,int] = (15,6), **kwargs) -> None: + """ + Make a matplotlib histogram of the nearest-neighbor energies (as defined by + :meth:`DNASeqList.energies`) of all DNA sequences of the given length(s), + or a randomly selected subset if length(s) is too large to enumerate all DNA sequences + of that length. + + :param length: + length of DNA sequences to consider, or an iterable (e.g., list) of lengths + :param temperature: + temperature in Celsius + :param combine_lengths: + If True, then `length` should be an iterable, and the histogram will combine all lengths + into one. If False (the default), then different lengths are plotted in different colors + in the historgram. + :param show: + If False, then the histogram plotted, but matplotlib.pyplot.show() is not called. + This allows one to tweak aspects of the histogram after it is plotted. + If True, then the histogram is displayed using matplotlib.pyplot.show(). + This is convenient for making several histograms plots in a single notebook cell. + :param num_random_sequences: + If the length is too large to enumerate all DNA sequences of that length, + then this many random sequences are used to estimate the histogram. + :param figsize: + Size of the figure in inches. + :param kwargs: + Any keyword arguments given are passed along as keyword arguments to matplotlib.pyplot.hist: + https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html + """ + import matplotlib.pyplot as plt + + if combine_lengths and isinstance(length, int): + raise ValueError(f'length must be an iterable if combine_lengths is False, ' + f'but it is the int {length}') + + plt.figure(figsize=figsize) + plt.xlabel(f'Nearest-neighbor energy (kcal/mol)') + + lengths = [length] if isinstance(length, int) else length + + alpha = 1 + if len(lengths) > 1: + alpha = 0.5 + if 'label' in kwargs: + raise ValueError(f'label (={kwargs["label"]}) ' + f'should not be specified if multiple lengths are given') + + bins = kwargs.pop('bins', 20) + + all_energies = [] + for length in lengths: + if length < 9: + seqs = DNASeqList(length=length) + else: + seqs = DNASeqList(length=length, num_random_seqs=num_random_sequences) + energies = seqs.energies(temperature=temperature) + + if combine_lengths: + all_energies.extend(energies) + else: + label = kwargs['label'] if 'label' in kwargs else f'length {length}' + _ = plt.hist(energies, alpha=alpha, label=label, bins=bins, **kwargs) + + if combine_lengths: + if 'label' in kwargs: + label = kwargs['label'] + del kwargs['label'] + else: + if len(lengths) == 1: + label = f'length {length}' + else: + lengths_delimited = ', '.join(map(str, lengths)) + label = f'lengths {lengths_delimited} combined' + _ = plt.hist(all_energies, alpha=alpha, label=label, bins=bins, **kwargs) + + plt.legend(loc='upper right') + title = kwargs.pop('title', f'Nearest-neighbor energies of DNA sequences at {temperature} C') + plt.title(title) + + if show: + plt.show() \ No newline at end of file From aa6bd7091a79e01254d61083700c7e025d00bc79 Mon Sep 17 00:00:00 2001 From: David Doty Date: Thu, 4 Apr 2024 15:52:39 -0700 Subject: [PATCH 2/5] Update np.py --- nuad/np.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nuad/np.py b/nuad/np.py index 4ba3282c..19b0a6ed 100644 --- a/nuad/np.py +++ b/nuad/np.py @@ -1290,6 +1290,9 @@ def energy_hist(length: int | Iterable[int], temperature: float = 37, or a randomly selected subset if length(s) is too large to enumerate all DNA sequences of that length. + This is useful, for example, to choose low and high energy values to pass to + :any:`NearestNeighborEnergyFilter`. + :param length: length of DNA sequences to consider, or an iterable (e.g., list) of lengths :param temperature: From eaaabf95762c00ddcd4a0f2d46e69dd37fbe2a9c Mon Sep 17 00:00:00 2001 From: David Doty Date: Thu, 4 Apr 2024 15:53:35 -0700 Subject: [PATCH 3/5] Update np.py --- nuad/np.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nuad/np.py b/nuad/np.py index 19b0a6ed..47bbc2bf 100644 --- a/nuad/np.py +++ b/nuad/np.py @@ -1300,7 +1300,7 @@ def energy_hist(length: int | Iterable[int], temperature: float = 37, :param combine_lengths: If True, then `length` should be an iterable, and the histogram will combine all lengths into one. If False (the default), then different lengths are plotted in different colors - in the historgram. + in the histogram. :param show: If False, then the histogram plotted, but matplotlib.pyplot.show() is not called. This allows one to tweak aspects of the histogram after it is plotted. From 36704acca458a31c6aa764e1c2adb2c4541b65e7 Mon Sep 17 00:00:00 2001 From: David Doty Date: Thu, 4 Apr 2024 15:55:36 -0700 Subject: [PATCH 4/5] Update np.py --- nuad/np.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/nuad/np.py b/nuad/np.py index 47bbc2bf..38cac559 100644 --- a/nuad/np.py +++ b/nuad/np.py @@ -1298,14 +1298,16 @@ def energy_hist(length: int | Iterable[int], temperature: float = 37, :param temperature: temperature in Celsius :param combine_lengths: - If True, then `length` should be an iterable, and the histogram will combine all lengths - into one. If False (the default), then different lengths are plotted in different colors - in the histogram. + If True, then `length` should be an iterable, and the histogram will combine all calculated energies + from all lengths into one histogram to plot. If False (the default), then different lengths are + plotted in different colors in the histogram. :param show: If False, then the histogram plotted, but matplotlib.pyplot.show() is not called. - This allows one to tweak aspects of the histogram after it is plotted. + This allows one to tweak aspects of the histogram after it is plotted calling functions in + matplotlib.pyplot. If True, then the histogram is displayed using matplotlib.pyplot.show(). - This is convenient for making several histograms plots in a single notebook cell. + This is convenient for making several histogram plots in a single notebook cell. + (Otherwise the several calls to matplotlib.pyplot.hist will overwrite each other.) :param num_random_sequences: If the length is too large to enumerate all DNA sequences of that length, then this many random sequences are used to estimate the histogram. From 1e97db9a356d61a2ea7a9d566a9201e4090d44ea Mon Sep 17 00:00:00 2001 From: David Doty Date: Thu, 4 Apr 2024 15:56:41 -0700 Subject: [PATCH 5/5] Update np.py --- nuad/np.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nuad/np.py b/nuad/np.py index 38cac559..43c53937 100644 --- a/nuad/np.py +++ b/nuad/np.py @@ -1280,10 +1280,11 @@ def wc_arr(seqarr: np.ndarray) -> np.ndarray: """Return numpy array of reverse complements of sequences in `seqarr`.""" return (3 - seqarr)[:, ::-1] + def energy_hist(length: int | Iterable[int], temperature: float = 37, combine_lengths: bool = False, show: bool = False, num_random_sequences: int = 100_000, - figsize: Tuple[int,int] = (15,6), **kwargs) -> None: + figsize: Tuple[int, int] = (15, 6), **kwargs) -> None: """ Make a matplotlib histogram of the nearest-neighbor energies (as defined by :meth:`DNASeqList.energies`) of all DNA sequences of the given length(s), @@ -1368,4 +1369,4 @@ def energy_hist(length: int | Iterable[int], temperature: float = 37, plt.title(title) if show: - plt.show() \ No newline at end of file + plt.show()