packages

library(ggplot2); theme_set(theme_bw())
library(aods3)

overdispersion

overdispersion

  • more variance than expected based on statistical model
  • e.g. variance > mean for Poisson
  • in general leads to overconfidence
    • overly narrow confidence intervals
    • too-small p-values
    • inflated type I error

Tick example

ticks <- read.table("../../data/Elston2001_tickdata.txt",header=TRUE)
ticks <- transform(ticks,
          YEAR=factor(YEAR),
          scHEIGHT=(HEIGHT-min(HEIGHT))/100)
ggplot(ticks,aes(scHEIGHT,TICKS,colour=YEAR))+
    geom_point()+
    scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis

ticks_glm1 <- glm(TICKS~scHEIGHT*YEAR,ticks,family=poisson)
aods3::gof(ticks_glm1)
## D  = 3008.964, df = 397, P(>D) = 0
## X2 = 4496.887, df = 397, P(>X2) = 0

methods

  • quasi-likelihood models
  • compounded distributions
  • observation-level random effects

quasi-likelihood

  • quantify excess variance
  • e.g. \(\phi\)=sum(residuals(m,type="pearson")^2)/df.residual(m)
  • multiply estimated standard errors by \(\sqrt{phi}\)
  • recompute \(Z\)/\(t\) statistics, \(p\) values
  • family=quasipoisson or family=quasibinomial does this automatically
  • quasilikelihood/quasiAIC? Depends who you ask.

ticks

ticks_QP <- update(ticks_glm1,family=quasipoisson)
summary(ticks_QP)
## 
## Call:
## glm(formula = TICKS ~ scHEIGHT * YEAR, family = quasipoisson, 
##     data = ticks)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.0008     0.2391  16.731  < 2e-16 ***
## scHEIGHT         -5.8198     0.8547  -6.809 3.64e-11 ***
## YEAR96           -0.9831     0.2729  -3.603 0.000355 ***
## YEAR97           -2.9448     0.5057  -5.824 1.19e-08 ***
## scHEIGHT:YEAR96   4.4693     0.8959   4.988 9.12e-07 ***
## scHEIGHT:YEAR97   4.0453     1.2081   3.349 0.000890 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 11.3272)
## 
##     Null deviance: 5847.5  on 402  degrees of freedom
## Residual deviance: 3009.0  on 397  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6

compounded distributions

  • instead of Poisson/binomial/etc., use a compounded distribution
  • Gamma + Poisson = negative binomial (e.g. MASS::glmer.nb)
  • Beta + binomial = beta-binomial (e.g. glmmTMB, bbmle::mle2)
ticks_NB <- MASS::glm.nb(TICKS~scHEIGHT*YEAR,data=ticks)

summary(ticks_NB)
## 
## Call:
## MASS::glm.nb(formula = TICKS ~ scHEIGHT * YEAR, data = ticks, 
##     init.theta = 0.9000852793, link = log)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       3.3829     0.2323  14.559  < 2e-16 ***
## scHEIGHT         -4.1308     0.4033 -10.242  < 2e-16 ***
## YEAR96           -0.2890     0.2829  -1.022 0.307009    
## YEAR97           -2.1926     0.3286  -6.672 2.52e-11 ***
## scHEIGHT:YEAR96   2.6132     0.4824   5.418 6.04e-08 ***
## scHEIGHT:YEAR97   2.0861     0.5571   3.745 0.000181 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.9001) family taken to be 1)
## 
##     Null deviance: 840.71  on 402  degrees of freedom
## Residual deviance: 418.82  on 397  degrees of freedom
## AIC: 1912.6
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.9001 
##           Std. Err.:  0.0867 
## 
##  2 x log-likelihood:  -1898.5880
## or:
library(glmmTMB)
glmmTMB(TICKS~scHEIGHT*YEAR,data=ticks,family=nbinom2)
## Formula:          TICKS ~ scHEIGHT * YEAR
## Data: ticks
##       AIC       BIC    logLik  df.resid 
## 1912.5876 1940.5801 -949.2938       396 
## 
## Number of obs: 403
## 
## Dispersion parameter for nbinom2 family ():  0.9 
## 
## Fixed Effects:
## 
## Conditional model:
##     (Intercept)         scHEIGHT           YEAR96           YEAR97  
##           3.383           -4.131           -0.289           -2.193  
## scHEIGHT:YEAR96  scHEIGHT:YEAR97  
##           2.613            2.086

observation-level random effects

  • use mixed models; add a Normal deviate to each observation
    (on the link-function/linear predictor scale)
  • e.g. logit-Normal-binomial, or log-Normal-Poisson
library(lme4)
## Loading required package: Matrix
ticks <- transform(ticks,
        obs=1:nrow(ticks))
ticks_OR <- glmer(TICKS~scHEIGHT*YEAR+(1|obs),data=ticks,
                  family=poisson)
summary(ticks_OR)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: TICKS ~ scHEIGHT * YEAR + (1 | obs)
##    Data: ticks
## 
##      AIC      BIC   logLik deviance df.resid 
##   1903.0   1931.0   -944.5   1889.0      396 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.29773 -0.50197 -0.06591  0.22414  1.91379 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  obs    (Intercept) 1.132    1.064   
## Number of obs: 403, groups:  obs, 403
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       2.7402     0.2429  11.283  < 2e-16 ***
## scHEIGHT         -4.0492     0.4155  -9.746  < 2e-16 ***
## YEAR96           -0.2069     0.2958  -0.699  0.48435    
## YEAR97           -1.9407     0.3482  -5.573 2.51e-08 ***
## scHEIGHT:YEAR96   2.5381     0.5026   5.050 4.42e-07 ***
## scHEIGHT:YEAR97   1.8683     0.5889   3.173  0.00151 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##                (Intr) scHEIGHT YEAR96 YEAR97 sHEIGHT:YEAR96
## scHEIGHT       -0.830                                      
## YEAR96         -0.818  0.682                               
## YEAR97         -0.693  0.580    0.568                      
## sHEIGHT:YEAR96  0.689 -0.826   -0.835 -0.480               
## sHEIGHT:YEAR97  0.592 -0.704   -0.485 -0.834  0.583

offsets

  • account for differences in sampling effort, exposure
  • count data: typically log(effort), log(area) etc.
  • y ~ x + offset(log(e)): \(\exp(\log(y)) = \exp(\beta_0 + \beta_1 x + \log(e))\) or \[ y = \exp(\beta_0 + \beta_1 x) \cdot \exp(\log(e)) = e \cdot \exp(\beta_0 + \beta_1 x) \]
  • predictions are for 1 unit of effort/area
  • for mortality risk: \(\log(t)\) on complementary log-log link scale (see here)

complete separation

  • what happens when a logistic regression model is too good?
  • some threshold: all below=0, all above=1
  • best slope estimate on logit scale is infinite
  • Wald approximation breaks down (Hauck-Donner effect)
  • symptoms: \(|\beta|>10\), crazy SEs and terrible p-values
  • strong effects, or slicing data too thin

solutions

  • model comparison (anova()) still works
  • profile CI should get lower limit of parameters
  • penalization (brglm, “Firth’s method”)
  • Bayesian approaches: put a prior on parameters (blme, brms)

zero-inflation

zero-inflation [@martin_zero_2005]

  • too many zeros
  • “lots of zeros” can occur just because of low mean
  • mode at zero and away from zero usually does mean Z-I

zero-inflation models

  • zero-inflation: mixture of structural and sampling zeroes
    (not “true” and “false”)
  • hurdle: zeros plus truncated distribution
  • choice depends on meaning of zeros
  • Z-I as well as conditional mean may be modeled (e.g. glmmTMB)

testing for zero-inflation

  • a little tricky
  • easiest (?) to fit Z-I model and then test whether you needed it or not
  • posterior predictive simulation

posterior simulation

Use the simulate() method, if available

data(Salamanders,package="glmmTMB")
ss <- subset(Salamanders,spp=="GP" & mined=="no")
## fit model
ggplot(ss,aes(DOY,count))+stat_sum()

salam_1 <- glm(count~DOY,ss,family=poisson)
## simulate 1000 realizations from the model
sims <- simulate(salam_1,1000)
## count proportions of zeros per simulation
zero_prop <- prop.table(table(colSums(sims==0)))
zero_ind <- as.numeric(names(zero_prop))
obs_zeros <- sum(ss$count==0)
## p-value
sum(zero_prop[zero_ind>=obs_zeros])
## [1] 0.054

zero-inflation plot

plot(zero_prop)
points(obs_zeros,0,col="red",pch=16)

Gamma

  • useful
  • simply log-transforming may be easier: log-Normal very similar to Gamma in some ways
  • choose based on interpretation; behaviour near zero; tail behaviour

beyond the exponential family

beta regression

  • GLMs require counts (denominators), e.g. 40% = 4/10
  • what if data don’t have obvious denominators?
  • e.g. cover scores, activity budgets
  • Beta distribution: betareg, glmmTMB, brms

negative binomial regression

  • for overdispersion
  • standard NB (“NB2”): Gamma+Poisson, variance = \(\mu(1+\mu/k)\), \(k >0\)
  • NB1: variance=\(\mu \phi\), \(\phi>1\)

others …

  • beta-binomial (overdispersed binomial)
  • Tweedie (zeros + continuous)
  • COM-Poisson, generalized Poisson [@lynch_dealing_2014]
  • ordinal models (MASS::polr, ordinal package)