pyro-ppl · fehiepsi · Oct 27, 2023 · Sep 8, 2023 · Oct 20, 2023 · Oct 25, 2023
diff --git a/numpyro/distributions/continuous.py b/numpyro/distributions/continuous.py
@@ -1276,6 +1276,16 @@ def _batch_solve_triangular(A, B):
     return X
 
 
+def _batch_trace_from_cholesky(L):
+    """Computes trace(X) as the sum of squared entries of L, for X = L L^T.
+
+    :param jnp.ndarray(..., M, M) L: Cholesky factor of X; lower triangular
+        in the last two dimensions.
+    :return: Trace of X, one value per leading (batch) index of L.
+    """
+    return jnp.sum(jnp.square(L), axis=(-2, -1))
+
+
 class MatrixNormal(Distribution):
     """
     Matrix variate normal distribution as described in [1] but with a lower_triangular parametrization,
@@ -1358,9 +1368,9 @@ def log_prob(self, values):
         diff_col_solve = _batch_solve_triangular(
             A=self.scale_tril_column, B=jnp.swapaxes(diff_row_solve, -2, -1)
         )
-        batched_trace_term = jnp.square(
-            diff_col_solve.reshape(diff_col_solve.shape[:-2] + (-1,))
-        ).sum(-1)
+        # The helper already reduces over the last two axes, so pass the
+        # unflattened array (flattening first would also sum a batch axis).
+        batched_trace_term = _batch_trace_from_cholesky(diff_col_solve)
 
         log_prob = -0.5 * batched_trace_term - log_det_term
 

diff --git a/numpyro/distributions/kl.py b/numpyro/distributions/kl.py
@@ -36,8 +36,11 @@
     Dirichlet,
     Gamma,
     Kumaraswamy,
+    MultivariateNormal,
     Normal,
     Weibull,
+    _batch_solve_triangular,
+    _batch_trace_from_cholesky,
 )
 from numpyro.distributions.discrete import CategoricalProbs
 from numpyro.distributions.distribution import (
@@ -134,6 +137,53 @@ def kl_divergence(p, q):
     return 0.5 * (var_ratio + t1 - 1 - jnp.log(var_ratio))
 
 
+@dispatch(MultivariateNormal, MultivariateNormal)
+def kl_divergence(p: MultivariateNormal, q: MultivariateNormal):
+    """Computes KL(p || q) between two multivariate normal distributions.
+
+    Uses the closed form from https://statproofbook.github.io/P/mvn-kl.html;
+    the batch shapes of ``p`` and ``q`` are broadcast against each other.
+
+    :param MultivariateNormal p: First distribution.
+    :param MultivariateNormal q: Second distribution.
+    :return: Array of shape ``broadcast_shapes(p.batch_shape, q.batch_shape)``.
+    :raises ValueError: On differing event or non-broadcastable batch shapes.
+    """
+    if p.event_shape != q.event_shape:
+        raise ValueError(
+            "Distributions must have the same event shape, but are"
+            f" {p.event_shape} and {q.event_shape} for p and q, respectively."
+        )
+
+    # Comparing ``shape[-n:]`` slices is wrong for n == 0 (``x[-0:]`` is all of
+    # ``x``), which rejected an unbatched p or q; use broadcasting rules instead.
+    try:
+        result_batch_shape = jnp.broadcast_shapes(p.batch_shape, q.batch_shape)
+    except ValueError as e:
+        raise ValueError(
+            "Distributions must have broadcastable batch shapes, "
+            f"but are {p.batch_shape} and {q.batch_shape} for p and q, "
+            "respectively."
+        ) from e
+
+    D = p.event_shape[0]
+
+    # log det(Sigma) = 2 * sum(log(diag(L))) for Sigma = L L^T.
+    p_half_log_det = jnp.log(jnp.diagonal(p.scale_tril, axis1=-2, axis2=-1)).sum(-1)
+    q_half_log_det = jnp.log(jnp.diagonal(q.scale_tril, axis1=-2, axis2=-1)).sum(-1)
+    log_det_ratio = 2 * (p_half_log_det - q_half_log_det)
+
+    Lq_inv = _batch_solve_triangular(q.scale_tril, jnp.eye(D))
+    # tr(Sigma_q^-1 Sigma_p) is the sum of squared entries of Lq^-1 Lp.
+    tr = _batch_trace_from_cholesky(Lq_inv @ p.scale_tril)
+    # Squared Mahalanobis distance between the means with respect to Sigma_q.
+    t1 = jnp.square(Lq_inv @ (p.loc - q.loc)[..., jnp.newaxis]).sum((-2, -1))
+
+    kl = 0.5 * (tr + t1 - D - log_det_ratio)
+    # Pin the documented output shape regardless of how the operands broadcast.
+    return jnp.broadcast_to(kl, result_batch_shape)
+
+
 @dispatch(Beta, Beta)
 def kl_divergence(p, q):
     # From https://en.wikipedia.org/wiki/Beta_distribution#Quantities_of_information_(entropy)

diff --git a/test/test_distributions.py b/test/test_distributions.py
@@ -2849,6 +2849,58 @@ def test_kl_expanded_normal(batch_shape, event_shape):
     assert_allclose(actual, expected)
 
 
+@pytest.mark.parametrize(
+    "batch_shape_p, batch_shape_q",
+    [
+        ((), ()),
+        ((1,), (1,)),
+        ((2, 3), (2, 3)),
+        ((5, 2, 3), (2, 3)),
+        ((2, 3), (5, 2, 3)),
+    ],
+    ids=str,
+)
+def test_kl_multivariate_normal_consistency_with_independent_normals(
+    batch_shape_p, batch_shape_q
+):
+    event_shape = (5,)
+
+    def make_dists(batch_shape):
+        shape = batch_shape + event_shape
+        mus = np.random.normal(size=shape)
+        # Keep the random scales so non-identity covariances are exercised.
+        scales = np.exp(np.random.normal(size=shape))
+
+        def diagonalize(v, ignore_axes: int):
+            if ignore_axes == 0:
+                return jnp.diag(v)
+            return vmap(diagonalize, in_axes=(0, None))(v, ignore_axes - 1)
+
+        scale_tril = diagonalize(scales, len(batch_shape))
+        return (
+            dist.Normal(mus, scales).to_event(len(event_shape)),
+            dist.MultivariateNormal(mus, scale_tril=scale_tril),
+        )
+
+    p_uni, p_mvn = make_dists(batch_shape_p)
+    q_uni, q_mvn = make_dists(batch_shape_q)
+
+    actual = kl_divergence(p_mvn, q_mvn)
+    expected = kl_divergence(p_uni, q_uni)
+    assert_allclose(actual, expected, rtol=1e-4, atol=1e-6)
+
+
+def test_kl_multivariate_normal_nondiagonal_covariance():
+    # p = N(0, I), q = N(1, S): KL = 0.5 * (tr(S^-1) + 1^T S^-1 1 - 2 + log det S)
+    cov_q = np.array([[2, 0.8], [0.8, 0.5]])
+    p_mvn = dist.MultivariateNormal(np.zeros(2), covariance_matrix=np.eye(2))
+    q_mvn = dist.MultivariateNormal(np.ones(2), covariance_matrix=cov_q)
+
+    actual = kl_divergence(p_mvn, q_mvn)
+    expected = 3.2113966  # = 0.5 * (2.5 / 0.36 + 0.9 / 0.36 - 2 + log(0.36))
+    assert_allclose(actual, expected, atol=2e-5)
+
+
 @pytest.mark.parametrize("shape", [(), (4,), (2, 3)], ids=str)
 @pytest.mark.parametrize(
     "p_dist, q_dist",