packages

library(ggplot2); theme_set(theme_bw())
library(aods3)

overdispersion

more variance than expected based on statistical model
e.g. variance > mean for Poisson
in general leads to overconfidence
- overly narrow confidence intervals
- too-small p-values
- inflated type I error

Tick example

ticks <- read.table("../../data/Elston2001_tickdata.txt",header=TRUE)
ticks <- transform(ticks,
          YEAR=factor(YEAR),
          scHEIGHT=(HEIGHT-min(HEIGHT))/100)
ggplot(ticks,aes(scHEIGHT,TICKS,colour=YEAR))+
    geom_point()+
    scale_y_log10()

## Warning: Transformation introduced infinite values in continuous y-axis

ticks_glm1 <- glm(TICKS~scHEIGHT*YEAR,ticks,family=poisson)
aods3::gof(ticks_glm1)

## D  = 3008.964, df = 397, P(>D) = 0
## X2 = 4496.887, df = 397, P(>X2) = 0

methods

quasi-likelihood models
compounded distributions
observation-level random effects

quasi-likelihood

quantify excess variance
e.g. \(\phi\)=sum(residuals(m,type="pearson")^2)/df.residual(m)
multiply estimated standard errors by \(\sqrt{phi}\)
recompute \(Z\)/\(t\) statistics, \(p\) values
family=quasipoisson or family=quasibinomial does this automatically
quasilikelihood/quasiAIC? Depends who you ask.

ticks

ticks_QP <- update(ticks_glm1,family=quasipoisson)
summary(ticks_QP)

## 
## Call:
## glm(formula = TICKS ~ scHEIGHT * YEAR, family = quasipoisson, 
##     data = ticks)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.0008     0.2391  16.731  < 2e-16 ***
## scHEIGHT         -5.8198     0.8547  -6.809 3.64e-11 ***
## YEAR96           -0.9831     0.2729  -3.603 0.000355 ***
## YEAR97           -2.9448     0.5057  -5.824 1.19e-08 ***
## scHEIGHT:YEAR96   4.4693     0.8959   4.988 9.12e-07 ***
## scHEIGHT:YEAR97   4.0453     1.2081   3.349 0.000890 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 11.3272)
## 
##     Null deviance: 5847.5  on 402  degrees of freedom
## Residual deviance: 3009.0  on 397  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6

compounded distributions

instead of Poisson/binomial/etc., use a compounded distribution
Gamma + Poisson = negative binomial (e.g. MASS::glmer.nb)
Beta + binomial = beta-binomial (e.g. glmmTMB, bbmle::mle2)

ticks_NB <- MASS::glm.nb(TICKS~scHEIGHT*YEAR,data=ticks)

summary(ticks_NB)

## 
## Call:
## MASS::glm.nb(formula = TICKS ~ scHEIGHT * YEAR, data = ticks, 
##     init.theta = 0.9000852793, link = log)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       3.3829     0.2323  14.559  < 2e-16 ***
## scHEIGHT         -4.1308     0.4033 -10.242  < 2e-16 ***
## YEAR96           -0.2890     0.2829  -1.022 0.307009    
## YEAR97           -2.1926     0.3286  -6.672 2.52e-11 ***
## scHEIGHT:YEAR96   2.6132     0.4824   5.418 6.04e-08 ***
## scHEIGHT:YEAR97   2.0861     0.5571   3.745 0.000181 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.9001) family taken to be 1)
## 
##     Null deviance: 840.71  on 402  degrees of freedom
## Residual deviance: 418.82  on 397  degrees of freedom
## AIC: 1912.6
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.9001 
##           Std. Err.:  0.0867 
## 
##  2 x log-likelihood:  -1898.5880

## or:
library(glmmTMB)
glmmTMB(TICKS~scHEIGHT*YEAR,data=ticks,family=nbinom2)

## Formula:          TICKS ~ scHEIGHT * YEAR
## Data: ticks
##       AIC       BIC    logLik  df.resid 
## 1912.5876 1940.5801 -949.2938       396 
## 
## Number of obs: 403
## 
## Dispersion parameter for nbinom2 family ():  0.9 
## 
## Fixed Effects:
## 
## Conditional model:
##     (Intercept)         scHEIGHT           YEAR96           YEAR97  
##           3.383           -4.131           -0.289           -2.193  
## scHEIGHT:YEAR96  scHEIGHT:YEAR97  
##           2.613            2.086

observation-level random effects

use mixed models; add a Normal deviate to each observation
(on the link-function/linear predictor scale)
e.g. logit-Normal-binomial, or log-Normal-Poisson

library(lme4)

## Loading required package: Matrix

ticks <- transform(ticks,
        obs=1:nrow(ticks))
ticks_OR <- glmer(TICKS~scHEIGHT*YEAR+(1|obs),data=ticks,
                  family=poisson)
summary(ticks_OR)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: TICKS ~ scHEIGHT * YEAR + (1 | obs)
##    Data: ticks
## 
##      AIC      BIC   logLik deviance df.resid 
##   1903.0   1931.0   -944.5   1889.0      396 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.29773 -0.50197 -0.06591  0.22414  1.91379 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  obs    (Intercept) 1.132    1.064   
## Number of obs: 403, groups:  obs, 403
## 
## Fixed effects:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       2.7402     0.2429  11.283  < 2e-16 ***
## scHEIGHT         -4.0492     0.4155  -9.746  < 2e-16 ***
## YEAR96           -0.2069     0.2958  -0.699  0.48435    
## YEAR97           -1.9407     0.3482  -5.573 2.51e-08 ***
## scHEIGHT:YEAR96   2.5381     0.5026   5.050 4.42e-07 ***
## scHEIGHT:YEAR97   1.8683     0.5889   3.173  0.00151 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##                (Intr) scHEIGHT YEAR96 YEAR97 sHEIGHT:YEAR96
## scHEIGHT       -0.830                                      
## YEAR96         -0.818  0.682                               
## YEAR97         -0.693  0.580    0.568                      
## sHEIGHT:YEAR96  0.689 -0.826   -0.835 -0.480               
## sHEIGHT:YEAR97  0.592 -0.704   -0.485 -0.834  0.583

offsets

account for differences in sampling effort, exposure
count data: typically log(effort), log(area) etc.
y ~ x + offset(log(e)): \(\exp(\log(y)) = \exp(\beta_0 + \beta_1 x + \log(e))\) or \[ y = \exp(\beta_0 + \beta_1 x) \cdot \exp(\log(e)) = e \cdot \exp(\beta_0 + \beta_1 x) \]
predictions are for 1 unit of effort/area
for mortality risk: \(\log(t)\) on complementary log-log link scale (see here)

complete separation

what happens when a logistic regression model is too good?
some threshold: all below=0, all above=1
best slope estimate on logit scale is infinite
Wald approximation breaks down (Hauck-Donner effect)
symptoms: \(|\beta|>10\), crazy SEs and terrible p-values
strong effects, or slicing data too thin

solutions

model comparison (anova()) still works
profile CI should get lower limit of parameters
penalization (brglm, “Firth’s method”)
Bayesian approaches: put a prior on parameters (blme, brms)

zero-inflation

zero-inflation [@martin_zero_2005]

too many zeros
“lots of zeros” can occur just because of low mean
mode at zero and away from zero usually does mean Z-I

zero-inflation models

zero-inflation: mixture of structural and sampling zeroes
(not “true” and “false”)
hurdle: zeros plus truncated distribution
choice depends on meaning of zeros
Z-I as well as conditional mean may be modeled (e.g. glmmTMB)

testing for zero-inflation

a little tricky
easiest (?) to fit Z-I model and then test whether you needed it or not
posterior predictive simulation

posterior simulation

Use the simulate() method, if available

data(Salamanders,package="glmmTMB")
ss <- subset(Salamanders,spp=="GP" & mined=="no")
## fit model
ggplot(ss,aes(DOY,count))+stat_sum()

salam_1 <- glm(count~DOY,ss,family=poisson)
## simulate 1000 realizations from the model
sims <- simulate(salam_1,1000)
## count proportions of zeros per simulation
zero_prop <- prop.table(table(colSums(sims==0)))
zero_ind <- as.numeric(names(zero_prop))
obs_zeros <- sum(ss$count==0)
## p-value
sum(zero_prop[zero_ind>=obs_zeros])

## [1] 0.054

zero-inflation plot

plot(zero_prop)
points(obs_zeros,0,col="red",pch=16)

alternative families and links

Gamma

useful
simply log-transforming may be easier: log-Normal very similar to Gamma in some ways
choose based on interpretation; behaviour near zero; tail behaviour

complementary log-log link

useful for discrete-time survival analysis
parameters on the log-hazard scale

beyond the exponential family

beta regression

GLMs require counts (denominators), e.g. 40% = 4/10
what if data don’t have obvious denominators?
e.g. cover scores, activity budgets
Beta distribution: betareg, glmmTMB, brms

negative binomial regression

for overdispersion
standard NB (“NB2”): Gamma+Poisson, variance = \(\mu(1+\mu/k)\), \(k >0\)
NB1: variance=\(\mu \phi\), \(\phi>1\)

others …

beta-binomial (overdispersed binomial)
Tweedie (zeros + continuous)
COM-Poisson, generalized Poisson [@lynch_dealing_2014]
ordinal models (MASS::polr, ordinal package)

intermediate generalized linear models

Ben Bolker

packages

overdispersion

overdispersion

Tick example

methods

quasi-likelihood

ticks

compounded distributions

observation-level random effects

offsets

complete separation

solutions

zero-inflation

zero-inflation [@martin_zero_2005]

zero-inflation models

testing for zero-inflation

posterior simulation

zero-inflation plot

alternative families and links

Gamma

complementary log-log link

beyond the exponential family

beta regression

negative binomial regression

others …