-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f512566
commit 1604535
Showing
2 changed files
with
231 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Imports\n", | ||
"import numpy as np\n", | ||
"from scipy import stats\n", | ||
"from scipy.special import comb" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Getting Started\n", | ||
"This notebook provides helpful formulas for computing optimal parameters for the construction of a B-field (described further [here](https://github.com/onecodex/rust-bfield)). It includes a few sections:\n", | ||
"* **Quick Calculator**: Change a few input variables to determine optimal B-field construction parameters\n", | ||
"* **Space Efficiency vs. Error Rate**: Visualize B-field space efficiency vs. error rate for B-fields supporting several different maximum numbers of values ($\\theta$)." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def calculate_nu_and_kappa(max_value, max_nu=64):\n", | ||
"    \"\"\"Pick ν and κ such that (ν choose κ) >= `max_value`, preferring the smallest κ.\n", | ||
"\n", | ||
"    κ is tried in increasing order starting at 1 and, for each κ, ν is tried in\n", | ||
"    increasing order from 1 up to `max_nu`. The first pair whose binomial\n", | ||
"    coefficient reaches `max_value` is returned as `(nu, kappa)`; an `Exception`\n", | ||
"    is raised when no pair within the `max_nu` limit qualifies.\n", | ||
"    \"\"\"\n", | ||
"    candidate_widths = range(1, max_nu + 1)\n", | ||
"    # κ stays strictly below the largest ν considered, but κ = 1 is always attempted\n", | ||
"    for n_set_bits in range(1, max(max_nu, 2)):\n", | ||
"        for width in candidate_widths:\n", | ||
"            if comb(width, n_set_bits) >= max_value:\n", | ||
"                return width, n_set_bits\n", | ||
"    raise Exception(f\"No value of ν choose κ has a value over {max_value}. Consider raising the `max_nu` parameter.\")\n", | ||
" \n", | ||
" \n", | ||
"def calculate_fp_rate(m_over_n, n_hashes):\n", | ||
"    \"\"\"Evaluate (1 - e^(-k / (m/n)))^k for k = `n_hashes` and m/n = `m_over_n`.\n", | ||
"\n", | ||
"    The other helpers in this notebook use the result as a false positive rate.\n", | ||
"    \"\"\"\n", | ||
"    exponent = -n_hashes * 1 / m_over_n\n", | ||
"    base = 1 - np.power(np.e, exponent)\n", | ||
"    return np.power(base, n_hashes)\n", | ||
" \n", | ||
" \n", | ||
"def calculate_m_over_n_and_hashes_from_per_bit_fp(max_per_bit_fp, max_hashes=12):\n", | ||
"    \"\"\"Find a number of hashes, k, and m/n (bits per element), minimizing m/n\n", | ||
"\n", | ||
"    m/n is scanned upward from 2 in integer steps. For each m/n, hash counts\n", | ||
"    1..`max_hashes` are tried in increasing order and the first pair whose\n", | ||
"    `calculate_fp_rate` result is below `max_per_bit_fp` is returned.\n", | ||
"\n", | ||
"    See https://pages.cs.wisc.edu/~cao/papers/summary-cache/node8.html for helpful detail.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        max_per_bit_fp: Exclusive upper bound on the per-bit false positive rate; must be > 0.\n", | ||
"        max_hashes: Largest number of hashes to consider; must be >= 1.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A `(m_over_n, n_hashes)` tuple.\n", | ||
"\n", | ||
"    Raises:\n", | ||
"        ValueError: If `max_per_bit_fp` is not positive or `max_hashes` is below 1,\n", | ||
"            since the search could then never terminate.\n", | ||
"    \"\"\"\n", | ||
"    if max_per_bit_fp <= 0:\n", | ||
"        raise ValueError(f\"`max_per_bit_fp` must be greater than 0, got {max_per_bit_fp}.\")\n", | ||
"    if max_hashes < 1:\n", | ||
"        raise ValueError(f\"`max_hashes` must be at least 1, got {max_hashes}.\")\n", | ||
"\n", | ||
"    m_over_n = 2\n", | ||
"    # The rate for a single hash shrinks towards 0 as m/n grows, so this loop ends for any positive bound\n", | ||
"    while True:\n", | ||
"        for n_hashes in range(1, max_hashes + 1):\n", | ||
"            if calculate_fp_rate(m_over_n, n_hashes) < max_per_bit_fp:\n", | ||
"                return m_over_n, n_hashes\n", | ||
"        m_over_n += 1\n", | ||
"\n", | ||
" \n", | ||
"def calculate_m_over_n_and_hashes_from_alpha(max_alpha, max_hashes=12, marker_width=None, n_marker_bits=None):\n", | ||
"    \"\"\"Find a number of hashes, k, and m/n (bits per element), minimizing m/n\n", | ||
"\n", | ||
"    m/n is scanned upward from 2 in integer steps and, for each m/n, hash counts\n", | ||
"    1..`max_hashes` are tried in increasing order. For every candidate the per-bit\n", | ||
"    rate p comes from `calculate_fp_rate`, and α is taken as\n", | ||
"    `binom.cdf(κ, ν, p) - binom.cdf(κ - 1, ν, p)`. The first candidate with\n", | ||
"    α below `max_alpha` is returned.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        max_alpha: Exclusive upper bound on α; must be > 0.\n", | ||
"        max_hashes: Largest number of hashes to consider; must be >= 1.\n", | ||
"        marker_width: ν. When omitted, the notebook-level variable `nu` is used.\n", | ||
"        n_marker_bits: κ. When omitted, the notebook-level variable `kappa` is used.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A `(m_over_n, n_hashes, alpha)` tuple.\n", | ||
"\n", | ||
"    Raises:\n", | ||
"        ValueError: If `max_alpha` is not positive or `max_hashes` is below 1,\n", | ||
"            since the search could then never terminate.\n", | ||
"        NameError: If ν or κ is not passed and `nu` / `kappa` have not been defined yet.\n", | ||
"    \"\"\"\n", | ||
"    if max_alpha <= 0:\n", | ||
"        raise ValueError(f\"`max_alpha` must be greater than 0, got {max_alpha}.\")\n", | ||
"    if max_hashes < 1:\n", | ||
"        raise ValueError(f\"`max_hashes` must be at least 1, got {max_hashes}.\")\n", | ||
"\n", | ||
"    # Existing callers rely on the notebook-level `nu` / `kappa`; explicit arguments take precedence\n", | ||
"    width = nu if marker_width is None else marker_width\n", | ||
"    n_set_bits = kappa if n_marker_bits is None else n_marker_bits\n", | ||
"\n", | ||
"    m_over_n = 2\n", | ||
"    while True:\n", | ||
"        for n_hashes in range(1, max_hashes + 1):\n", | ||
"            fp_rate = calculate_fp_rate(m_over_n, n_hashes)\n", | ||
"            cdf_at_kappa = stats.binom.cdf(n_set_bits, width, fp_rate)\n", | ||
"\n", | ||
"            # We skip anything where we're in the lefthand side of the CDF\n", | ||
"            if cdf_at_kappa < 0.5:\n", | ||
"                continue\n", | ||
"\n", | ||
"            alpha = cdf_at_kappa - stats.binom.cdf(n_set_bits - 1, width, fp_rate)\n", | ||
"            if alpha < max_alpha:\n", | ||
"                return m_over_n, n_hashes, alpha\n", | ||
"        m_over_n += 1\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Quick Calculator\n", | ||
"Set the following configuration options and then run the cell to compute the required B-field creation parameters:\n", | ||
"* `MAX_VALUE`: The maximum value $y$ you'd like to store (alternatively $\\theta$). Note the `rust-bfield` implementation only supports `u32` integers for values and you should strongly consider remapping values to a complete range of natural numbers $1...\\theta$.\n", | ||
"* `MAX_FALSE_POSITIVE_RATE`: The maximum false positive rate $(\\alpha)$ you'd like to allow in your B-field. Recommended values for many applications are 0.01 or below.\n", | ||
"* `MAX_INDETERMINACY_RATE`: The maximum indeterminacy rate $(\\beta)$ you'd like to allow in your B-field. A value of 0 is recommended.\n", | ||
"* `N_ELEMENTS`: The number of elements $(n)$ you plan to store in your B-field." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"MAX_VALUE = 1e6\n", | ||
"MAX_FALSE_POSITIVE_RATE = 0.001\n", | ||
"MAX_INDETERMINACY_RATE = 0\n", | ||
"N_ELEMENTS = 1e9\n", | ||
"\n", | ||
"# Recommended standard values\n", | ||
"MAX_SCALEDOWN = 0.001\n", | ||
"\n", | ||
"# Heuristic floor for the corrected per-bit error rate of a secondary array (1e-6, i.e. 1 in 1M)\n", | ||
"MIN_CORRECTED_P = 1e-6\n", | ||
"\n", | ||
"# First we find suitable values of nu and kappa\n", | ||
"nu, kappa = calculate_nu_and_kappa(MAX_VALUE)\n", | ||
"\n", | ||
"# Then we compute the bits per element required for the desired false positive rate on a per bit basis\n", | ||
"m_over_n, n_hashes, alpha = calculate_m_over_n_and_hashes_from_alpha(MAX_FALSE_POSITIVE_RATE)\n", | ||
"\n", | ||
"p = calculate_fp_rate(m_over_n, n_hashes)\n", | ||
"bits_per_element = m_over_n * kappa\n", | ||
"\n", | ||
"# Next, we compute the implied indeterminacy error rate and the required number and size of secondary arrays\n", | ||
"uncorrected_beta = stats.binom.cdf(1, nu - kappa, p) - stats.binom.cdf(0, nu - kappa, p)  # this is also the scaledown factor\n", | ||
"n_secondaries = 0\n", | ||
"debug = False\n", | ||
"\n", | ||
"# The first entry of `array_sizes` is the primary array; one entry is appended per secondary array\n", | ||
"secondary_array_size = N_ELEMENTS\n", | ||
"array_sizes = [secondary_array_size * bits_per_element]\n", | ||
"expected_indeterminate_results = int(N_ELEMENTS * uncorrected_beta)\n", | ||
"\n", | ||
"# MAX_INDETERMINACY_RATE expressed as a number of elements, so the loop can compare counts directly\n", | ||
"max_indeterminate_results = MAX_INDETERMINACY_RATE * N_ELEMENTS\n", | ||
"\n", | ||
"# Add secondary arrays until the expected number of indeterminate results is < 0.5\n", | ||
"# or the remaining indeterminacy rate is within MAX_INDETERMINACY_RATE\n", | ||
"while expected_indeterminate_results >= 0.5 and expected_indeterminate_results > max_indeterminate_results:\n", | ||
"    # Scale the secondary array down by the uncorrected 𝛽\n", | ||
"    n_secondaries += 1\n", | ||
"    secondary_array_size = int(secondary_array_size * uncorrected_beta)\n", | ||
"\n", | ||
"    # But never make an array smaller than N_ELEMENTS * MAX_SCALEDOWN\n", | ||
"    if secondary_array_size < N_ELEMENTS * MAX_SCALEDOWN:\n", | ||
"        secondary_array_size = int(N_ELEMENTS * MAX_SCALEDOWN)\n", | ||
"    array_sizes.append(secondary_array_size * bits_per_element)\n", | ||
"\n", | ||
"    if debug:\n", | ||
"        print(f\"The #{n_secondaries} secondary array will be {secondary_array_size:,} elements ({int(expected_indeterminate_results):,} expected elements)\")\n", | ||
"\n", | ||
"    # Now calculate the expected number of indeterminate results flowing *out* of the nth secondary array\n", | ||
"    corrected_m_over_n = (secondary_array_size / expected_indeterminate_results) * m_over_n\n", | ||
"    corrected_p = calculate_fp_rate(corrected_m_over_n, n_hashes)\n", | ||
"\n", | ||
"    # Heuristic: But don't allow p to be set to 0, always use at least MIN_CORRECTED_P\n", | ||
"    corrected_p = max(MIN_CORRECTED_P, corrected_p)\n", | ||
"    corrected_beta = stats.binom.cdf(1, nu - kappa, corrected_p) - stats.binom.cdf(0, nu - kappa, corrected_p)\n", | ||
"    expected_indeterminate_results = expected_indeterminate_results * corrected_beta\n", | ||
"\n", | ||
"    if debug:\n", | ||
"        print(f\"Expect {int(expected_indeterminate_results):,} indeterminate results in next array ({corrected_m_over_n}, corrected p {corrected_p:.10f}), corrected beta {corrected_beta:.4f}\")\n", | ||
"\n", | ||
"# Fraction of elements still expected to be indeterminate after the last array\n", | ||
"calculated_indeterminacy_rate = expected_indeterminate_results / N_ELEMENTS if N_ELEMENTS else 0.0" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Total size of the primary array plus all secondary arrays, in bits\n", | ||
"total_bits = np.sum(array_sizes)\n", | ||
"\n", | ||
"# Sizes in parentheses are bytes / 1024**2 and bytes / 1024**3, hence MiB and GiB\n", | ||
"print(f\"\"\"\n", | ||
"Input configuration requirements are:\n", | ||
"\n", | ||
"`MAX_VALUE` (𝜃) = {int(MAX_VALUE):,}\n", | ||
"`MAX_FALSE_POSITIVE_RATE` (𝛼) = {MAX_FALSE_POSITIVE_RATE}\n", | ||
"`MAX_INDETERMINACY_RATE` (corrected 𝛽) = {MAX_INDETERMINACY_RATE}\n", | ||
"`N_ELEMENTS` (n) = {int(N_ELEMENTS):,}\n", | ||
"`MAX_SCALEDOWN` = {MAX_SCALEDOWN} (recommended standard value)\n", | ||
"\n", | ||
"Recommended parameters are: \n", | ||
"\n", | ||
"`size` (mκ) = {int(N_ELEMENTS * m_over_n * kappa):,}\n", | ||
"`n_hashes` (k) = {n_hashes}\n", | ||
"`marker_width` (ν) = {nu}\n", | ||
"`n_marker_bits` (κ) = {kappa}\n", | ||
"`secondary_scaledown` (uncorrected Array_0 β) = {np.ceil(uncorrected_beta * 1000)/1000:.3f}\n", | ||
"`max_scaledown` (-) = {MAX_SCALEDOWN} (recommended standard value)\n", | ||
"`n_secondaries` (number of Array_x's) = {n_secondaries}\n", | ||
"\n", | ||
"Summary statistics:\n", | ||
"\n", | ||
"* {np.sum(array_sizes, dtype=int):,} total bits ({total_bits / (8 * 1024**2):.2f} MiB, {total_bits / (8 * 1024**3):.2f} GiB)\n", | ||
"* {total_bits / N_ELEMENTS:.2f} bits per element\n", | ||
"* {total_bits / (N_ELEMENTS * 8):.2f} bytes per element\n", | ||
"* Expected false positive rate (𝛼): {alpha:.4f}\n", | ||
"* Expected error rate per bit in the primary array (p): {p:.4f}\n", | ||
"\"\"\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.7" | ||
}, | ||
"vscode": { | ||
"interpreter": { | ||
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
jupyter==1.0.0 | ||
scipy==1.9.3 |