def download(url):
    """Fetch `url` into the working directory unless it is already there.

    The local file name is the last path component of `url`. If a file
    with that name exists, nothing happens; otherwise the URL is retrieved
    and a one-line confirmation is printed.

    url: string URL of the file to fetch

    returns: None
    """
    target = basename(url)
    if exists(target):
        # Already downloaded on a previous run; keep the cell idempotent.
        return

    # Imported lazily so the import only happens when a fetch is needed.
    from urllib.request import urlretrieve

    saved_path, _headers = urlretrieve(url, target)
    print('Downloaded ' + saved_path)
# %%
# Grids for the three quantities: k and n are the integers 0..100,
# p is 101 evenly spaced probabilities from 0 to 1.
ks, ns = np.arange(101), np.arange(101)
ps = np.linspace(0, 1, 101)

# %%
from scipy.stats import binom

# indexing='ij' keeps the axis order (k, n, p), so cube[i, j, m] is the
# binomial PMF evaluated at k=ks[i], n=ns[j], p=ps[m].
grids = np.meshgrid(ks, ns, ps, indexing='ij')
K, N, P = grids
cube = binom.pmf(K, N, P)
cube.shape

# %%
# Both values are positions on the grid: n = 50 trials, and p = 50 selects ps[50].
n, p = 50, 50
pmf_k = cube[:, n, p]
# %%
# A slice along the k axis is already a normalized PMF: it sums to 1 up to
# floating-point rounding.
pmf_k.sum()

# %%
plt.bar(ks, pmf_k)
plt.xlabel('k')
plt.ylabel('PMF');

# %%
# `p` is a position on the `ps` grid, not a probability, so read the
# probability from the grid instead of hard-coding the grid step as p/100.
pmf_binom = binom.pmf(ks, n, ps[p])

# %%
# The slice taken from the cube should match a direct call to binom.pmf.
np.allclose(pmf_k, pmf_binom)
# %%
k = 25
p = 50  # position on the `ps` grid, i.e. probability ps[p]
pmf_n = cube[k, :, p].copy()  # copy: the slice is modified in place below

# %%
# Convert C(n, k) into C(n-1, k-1) by dividing by n/k.
# ns[0] == 0, so dividing the whole array would evaluate 0/0 at n=0 and emit
# "RuntimeWarning: invalid value encountered in divide"; scale only n >= 1.
pmf_n[1:] /= (ns[1:] / k)
pmf_n[0] = 0  # zero trials cannot produce k successes

# %%
pmf_n /= pmf_n.sum()

# %%
# pmf_n is indexed by the number of trials, so the x values are `ns`
# (matching the axis label), not `ks`.
plt.plot(ns, pmf_n)
plt.xlabel('n')
plt.ylabel('PMF');

# %%
from scipy.stats import nbinom

# The first argument passed to nbinom.pmf is ns - k (trials beyond the k
# successes); the probability is read from the grid rather than written as p/100.
# The sum falls slightly short of 1 because the grid stops at n = 100.
pmf_nbinom = nbinom.pmf(ns - k, k, ps[p])
pmf_nbinom.sum()

# %%
np.allclose(pmf_n, pmf_nbinom)
# %%
# Observed data: k successes in n trials. (The earlier `k = 25; n = 50`
# assignments were overwritten immediately and never used, so they are gone.)
k = 6
n = 12
pdf_p = cube[k, n, :].copy()  # copy: normalized in place below

# %%
# A slice along the p axis is a likelihood over p, so it does not sum to 1.
pdf_p.sum()

# %%
pdf_p /= pdf_p.sum()

# %%
plt.plot(ps, pdf_p)
plt.xlabel('p')
plt.ylabel('PMF');

# %%
from scipy.stats import beta

# Change of variables from (k, n) to beta parameters.
a = k + 1
b = n - k + 1
a, b

# %%
# Evaluate the beta PDF on the same grid and normalize it the same way so the
# two discrete distributions are comparable.
pdf_beta = beta.pdf(ps, a, b)
pdf_beta /= pdf_beta.sum()

# %%
np.allclose(pdf_p, pdf_beta)

# %%
# Inverse change of variables, written in terms of a and b only:
# k = a - 1 and n = a + b - 2. This recovers (k, n) without using k itself.
a - 1, a + b - 2
# %%
# Beta prior evaluated on the grid of probabilities.
a, b = 2, 3
prior = beta.pdf(ps, a, b)

# %%
# Data: k successes in n attempts; the likelihood is the binomial PMF
# evaluated at every candidate p.
k, n = 5, 10
like = binom.pmf(k, n, ps)

# %%
# Grid update: prior times likelihood, then normalize.
unnorm_posterior = prior * like
posterior = unnorm_posterior / unnorm_posterior.sum()

# %%
# Conjugate update: a beta with parameters a + k and b + n - k, normalized
# over the same grid.
unnorm_beta = beta.pdf(ps, a + k, b + n - k)
posterior_beta = unnorm_beta / unnorm_beta.sum()
def get_beta(a, b, cube):
    """Look up a discretized beta(a, b) distribution in the binomial cube.

    Inverts the change of variables a = k + 1, b = n - k + 1 to locate the
    slice cube[k, n, :], then normalizes a copy of that slice to sum to 1.

    a: beta parameter, integer >= 1
    b: beta parameter, integer >= 1
    cube: array indexed as [k, n, p]

    returns: new 1-D array along the cube's last axis, normalized to sum to 1

    raises: ValueError if a < 1 or b < 1. Indices past the end of the cube
            still raise NumPy's IndexError.
    """
    if a < 1 or b < 1:
        # With a < 1 or b < 1 the indices below are either negative (NumPy
        # would silently wrap them to the far end of the axis) or have n < k
        # (an all-zero slice that normalizes to NaN). Fail loudly instead.
        raise ValueError(
            f"a and b must be integers >= 1, got a={a!r}, b={b!r}"
        )

    k = a - 1
    n = a + b - 2  # same as b + k - 1

    pdf = cube[k, n, :].copy()  # copy so the cube itself is never modified
    pdf /= pdf.sum()
    return pdf
Downey\n", + "\n", + "License: [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "707d9f6f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}