Implement a parametric method for GLM PIs #26

jthaman · 2018-02-23T16:17:35Z

Currently we simulate from the model using a parametric bootstrap. We showed in the GLM vignette that this is valid if the model is "true", and the simulation is still relatively fast, but a parametric method would be more pleasing to me.

I'm not sure if there is any literature available on this.

TimTaylor · 2020-08-25T19:19:49Z

We've been thinking about this recently and are considering using one of two approaches:

Ignore uncertainty in the fitted parameters and use the quantiles of the fitted distribution to create a prediction interval.
Calculate the confidence intervals and then use these with the fitted distribution to get upper and lower bounds of the prediction intervals (following the approach https://fromthebottomoftheheap.net/2017/05/01/glm-prediction-intervals-i/ and https://fromthebottomoftheheap.net/2017/05/01/glm-prediction-intervals-ii/)

I think these two approaches should, in the limit of the simulations, bound your simulation approach. I've implemented these roughly in the somewhat long winded example below and they seem to give reasonable results. It would be interesting to get your feedback and whether this approach would be useful for ciTools to incorporate:

library(MASS)
library(ggplot2)
library(ciTools)
#> ciTools version 0.5.2 (C) Institute for Defense Analyses
library(dplyr)
library(tibble)
library(patchwork)

#  ci function from ciTools (pretty much anyway)
add_ci_glm <- function(dat, fit, alpha){
  out <- predict(fit, dat, se.fit = TRUE, type = "link")

  if (fit$family$family %in% c("binomial", "poisson"))
    crit_val <- qnorm(p = 1 - alpha/2, mean = 0, sd = 1)
  else
    crit_val <- qt(p = 1 - alpha/2, df = fit$df.residual)

  inverselink <- fit$family$linkinv
  pred <- inverselink(out$fit)
  upper <- inverselink(out$fit + crit_val *  out$se.fit)
  lower <- inverselink(out$fit - crit_val * out$se.fit)

  if (fit$family$link %in% c("inverse", "1/mu^2")) {
      ## need to do something like this for any decreasing link
      ## function.
      upr <- lower
      lower <- upper
      upper <- upr
  }

  dat$pred <- pred
  dat$`upper-ci` <- upper
  dat$`lower-ci` <- lower
  as_tibble(dat)
}

add_intervals_glm <- function(dat, fit, alpha = 0.05, uncertain = TRUE) {

  # confidence intervals
  dat <- add_ci_glm(dat, fit, alpha)
  
  # prediction intervals
  fam <- family(fit)$family

  if (uncertain) {
    lower <- dat$`lower-ci`
    upper <- dat$`upper-ci`
  } else {
    lower <- dat$pred
    upper <- dat$pred
  }

  if (inherits(fit, "negbin")) {
    theta <- fit$theta
    setheta <- fit$SE.theta
    dat$`lower-pi` = qnbinom(alpha / 2, mu = lower, size = theta)
    dat$`upper-pi` <- qnbinom(1 - alpha / 2, mu = upper, size = theta)
  } else if (fam == "poisson") {
    dat$`lower-pi` <- qpois(alpha / 2, lambda = lower)
    dat$`upper-pi` <- qpois(1 - alpha / 2, lambda = upper)
  } else if (fam == "quasipoisson") {
    overdispersion <- summary(fit)$dispersion
    dat$`lower-pi` <- qnbinom(alpha / 2, mu = lower, size = lower / (overdispersion - 1))
    dat$`upper-pi` <- qnbinom(1 - alpha / 2, mu = upper, size = upper / (overdispersion - 1))
  } else if (fam == "gamma") {
    overdispersion <- summary(fit)$dispersion
    dat$`lower-pi` <- qgamma(alpha / 2, shape = 1 / overdispersion, rate = 1 / (lower * overdispersion))
    dat$`upper-pi` <- qgamma(1 - alpha / 2, shape = 1 / overdispersion, rate = 1 / (upper * overdispersion))
  } else if (fam == "binomial") {
    dat$`lower-pi` <- qbinom(alpha / 2, size = fit$prior.weights, prob = lower / fit$prior.weights)
    dat$`upper-pi` <- qbinom(1 - alpha / 2, size = fit$prior.weights, prob = upper / fit$prior.weights)
  } else if (fam == "gaussian") {
    sigma_sq <- summary(fit)$dispersion
    se_terms <- pred$se.fit
    t_quant <- qt(p = alpha / 2, df = fit$df.residual, lower.tail = FALSE)
    se_global <- sqrt(sigma_sq + se_terms^2)
    dat$`lower-pi` <- dat$fitted - t_quant * se_global
    dat$`upper-pi` <- dat$fitted + t_quant * se_global
  } else {
    stop("Unsupported glm family type")
  }
  as_tibble(dat)
}

# poisson -----------------------------------------------------------------
x <- rnorm(100, mean = 0)
y <- rpois(n = 100, lambda = exp(1.5 + 0.5*x))
dat <- data.frame(x = x, y = y)
fit <- glm(y ~ x , family = poisson(link = "log"))

dat1 <-
  dat %>%
  add_ci(fit, names = c("lwr", "upr"), alpha = 0.1) %>%
  add_pi(fit, names = c("lpb", "upb"), alpha = 0.1, nSims = 20000)
#> Warning in add_pi.glm(., fit, names = c("lpb", "upb"), alpha = 0.1, nSims =
#> 20000): The response is not continuous, so Prediction Intervals are approximate

dat2 <-
  dat %>%
  add_intervals_glm(fit, alpha = 0.1, uncertain = FALSE)

dat3 <-
  dat %>%
  add_intervals_glm(fit, alpha = 0.1, uncertain = TRUE)

p1 <- ggplot(dat1, aes(x, y)) +
  geom_point(size = 2) +
  geom_line(aes(y = pred), size = 1.2) +
  geom_ribbon(aes(ymin = lpb, ymax = upb), alpha = 0.2) +
  geom_ribbon(aes(ymin = `lower-pi`, ymax = `upper-pi`), data = dat2, alpha = 0.4) +
  ggtitle("Poisson Regression with prediction intervals and no uncertainty in parameters", 
          subtitle = "Model fit (black line), with bootstrap intervals (gray), parametric intervals (dark gray)") +
  coord_cartesian(ylim=c(0, 30))

p2 <- ggplot(dat1, aes(x, y)) +
  geom_point(size = 2) +
  geom_line(aes(y = pred), size = 1.2) +
  geom_ribbon(aes(ymin = lpb, ymax = upb), alpha = 0.4) +
  geom_ribbon(aes(ymin = `lower-pi`, ymax = `upper-pi`), data = dat3, alpha = 0.2) +
  ggtitle("Poisson Regression with prediction intervals and uncertainty in parameters", 
          subtitle = "Model fit (black line), with parametric intervals (gray), bootstrap intervals (dark gray)") +
  coord_cartesian(ylim=c(0, 30))

p1 / p2

# quasipoisson ------------------------------------------------------------
x <- runif(n = 100, min = 0, max = 2)
mu <- exp(1 + x)
y <- rnegbin(n = 100, mu = mu, theta = mu/(5 - 1))
dat <- data.frame(x = x, y = y)
fit <- glm(y ~ x, family = quasipoisson(link = "log"))

dat1 <-
  dat %>%
  add_ci(fit, names = c("lwr", "upr"), alpha = 0.1) %>%
  add_pi(fit, names = c("lpb", "upb"), alpha = 0.1, nSims = 20000)
#> Warning in add_pi.glm(., fit, names = c("lpb", "upb"), alpha = 0.1, nSims =
#> 20000): The response is not continuous, so Prediction Intervals are approximate

dat2 <-
  dat %>%
  add_intervals_glm(fit, alpha = 0.1, uncertain = FALSE)

dat3 <-
  dat %>%
  add_intervals_glm(fit, alpha = 0.1, uncertain = TRUE)

p1 <- ggplot(dat1, aes(x, y)) +
  geom_point(size = 2) +
  geom_line(aes(y = pred), size = 1.2) +
  geom_ribbon(aes(ymin = lpb, ymax = upb), alpha = 0.2) +
  geom_ribbon(aes(ymin = `lower-pi`, ymax = `upper-pi`), data = dat2, alpha = 0.4) +
  ggtitle("Quasipoisson Regression with prediction intervals and no uncertainty in parameters", 
          subtitle = "Model fit (black line), with bootstrap intervals (gray), parametric intervals (dark gray)") +
  coord_cartesian(ylim=c(0, 40))

p2 <- ggplot(dat1, aes(x, y)) +
  geom_point(size = 2) +
  geom_line(aes(y = pred), size = 1.2) +
  geom_ribbon(aes(ymin = lpb, ymax = upb), alpha = 0.4) +
  geom_ribbon(aes(ymin = `lower-pi`, ymax = `upper-pi`), data = dat3, alpha = 0.2) +
  ggtitle("Quasioisson Regression with prediction intervals and uncertainty in parameters", 
          subtitle = "Model fit (black line), with parametric intervals (gray), bootstrap intervals (dark gray)") +
  coord_cartesian(ylim=c(0, 40))

p1 / p2

^{Created on 2020-08-25 by the reprex package (v0.3.0)}

jthaman · 2020-09-22T15:59:28Z

Tim, in our glm vignette, https://cran.r-project.org/web/packages/ciTools/vignettes/ciTools-glm-vignette.html, we showed in a simulation that our Poisson PIs are conservative (at least in the examples we tried). I would hesitate to add a method to ciTools that I know is more conservative than what's already available. For other GLMs, the current parametric bootstrap may be conservative or anticonservative depending on model/sample size.

I do think you are correct about your two methods bounding the current ciTools approach.

Personally, I find add_pi.glm to be pretty fast, at least compared to many other bootstrap methods in ciTools.

It would be interesting to put a sim together that demonstrates the coverage probs on these techniques compared to the one available (And also compared to a Bayesian prediction interval with uninformative priors).

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Implement a parametric method for GLM PIs #26

Implement a parametric method for GLM PIs #26

jthaman commented Feb 23, 2018

TimTaylor commented Aug 25, 2020

jthaman commented Sep 22, 2020

Implement a parametric method for GLM PIs #26

Implement a parametric method for GLM PIs #26

Comments

jthaman commented Feb 23, 2018

TimTaylor commented Aug 25, 2020

jthaman commented Sep 22, 2020