library(bbmle)
library(RTMB)
library(tmbstan)
library(igraph)
library(ggplot2); theme_set(theme_bw())
data("prussian", package = "pscl")
RTMB intro
what is RTMB?
- like TMB (Kristensen et al. 2016) but more convenient (although less mature)
- evaluation of R functions in compiled code (sort of)
- automatic differentiation
- Laplace approximation engine
what is autodiff?
- fancy chain rule
- various ways to do it
- \(C(\ell'_\theta)<4C(\ell)\) (“cheap gradient principle”)
- time-efficient, maybe memory-hungry
simple (?) example
f1 <- function(param) {
    param$x^2 + cos(param$y)
}
par1 <- list(x = 1, y = 2)
## lobstr::ast(!!body(f1))
tape1 <- MakeTape(f1, par1)
plot(graph_from_adjacency_matrix(tape1$graph()),
     layout = layout_as_tree, vertex.size = 25)
Now we construct the full TMB object (includes objective function, gradient, etc.)
ff1 <- MakeADFun(f1, par1)
names(ff1)
 [1] "par"      "fn"       "gr"       "he"       "hessian"  "method"
 [7] "retape"   "env"      "report"   "simulate"
ff1$fn() ## uses $par (== par1) as default
[1] 0.5838532
No speed advantage yet …
bench1 <- microbenchmark::microbenchmark(ff1$fn(), f1(par1))
print(bench1)
Unit: nanoseconds
     expr   min      lq     mean median    uq   max neval cld
 ff1$fn() 13465 13686.0 14305.52  13796 13976 49423   100  a
 f1(par1)   511   551.5   673.18    641   672  4689   100   b
But we can get gradients right away.
ff1$gr()
outer mgc: 2
     [,1]       [,2]
[1,]    2 -0.9092974
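The gradient matches the analytic result \((2x, -\sin y) = (2, -0.909)\). Second derivatives are available through the same object; as a quick sanity check (a sketch added here, output not shown), the Hessian of f1 should be \(\mathrm{diag}(2, -\cos y)\):

ff1$he()  ## analytic Hessian of f1 is diag(c(2, -cos(par1$y)))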
a simple GLM
Set up model matrix (can also parameterize the model ‘by hand’, i.e. beta0 + beta_y[year]
etc., but I find that extremely tedious).
X2 <- model.matrix(~ factor(year), data = prussian)
par2 <- list(beta = rep(0, ncol(X2)))
f2 <- function(pars) {
    getAll(pars) ## this is like with() or attach()
    mu <- exp(X2 %*% beta)
    -sum(dpois(prussian$y, lambda = mu, log = TRUE))
}
f2(par2)
[1] 328.2462
ff2 <- MakeADFun(f2, par2, silent = TRUE)
ff2$fn()
[1] 328.2462
ff2$gr()
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
[1,]   84    9    7    5    4   -4    8    0    3     5     9     3    -1     8
     [,15] [,16] [,17] [,18] [,19] [,20]
[1,]     3    -3     2    -1     6    10
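As a sanity check (a sketch, using only objects already defined above): for a Poisson GLM with log link the gradient of the negative log-likelihood is \(-X^\top(y - \mu)\), which reproduces the AD gradient above (at \(\beta = 0\) the first element is \(-\sum_i (y_i - 1) = 84\)).

mu0 <- drop(exp(X2 %*% par2$beta))     ## all equal to 1 at beta = 0
drop(-t(X2) %*% (prussian$y - mu0))    ## analytic score; matches ff2$gr()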
bench2 <- microbenchmark::microbenchmark(ff2$fn(), f2(par2))
print(bench2)
Unit: microseconds
     expr    min     lq     mean  median      uq     max neval cld
 ff2$fn() 14.267 14.798 16.30968 15.8050 16.5005  50.615   100  a
 f2(par2) 24.766 25.543 29.21663 26.5395 27.4110 276.609   100   b
The objective function in RTMB is already nearly twice as fast …
fit2 <- with(ff2, nlminb(par, fn, gr))
Compare with GLM fit:
g2 <- glm(y ~ factor(year), data = prussian, family = poisson)
all.equal(unname(coef(g2)), unname(fit2$par), tolerance = 1e-6)
[1] TRUE
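We can also compare standard errors: inverting the Hessian of the negative log-likelihood at the optimum gives Wald standard errors that should agree closely with glm()'s (a sketch; output not shown):

H2 <- ff2$he(fit2$par)                 ## Hessian at the MLE
all.equal(sqrt(diag(solve(H2))),
          unname(coef(summary(g2))[, "Std. Error"]),
          tolerance = 1e-4)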
GLMM
We can make this a GLMM with very little additional effort. Again, we can parameterize either ‘by hand’ or with a random effects model matrix \(Z\) (see here for a more complex but more general formulation of GLMMs in RTMB …)
## RTMB knows about sparse matrices
Z_y <- Matrix::sparse.model.matrix(~year, data = prussian)
Z_c <- Matrix::sparse.model.matrix(~corp, data = prussian)
par3 <- list(beta0 = 0, b_y = rep(0, ncol(Z_y)), b_c = rep(0, ncol(Z_c)),
             logsd_c = 0, logsd_y = 0)
f3 <- function(pars) {
    getAll(pars)
    mu <- exp(beta0 + Z_y %*% b_y + Z_c %*% b_c)
    ## need to convert to a vector here ...
    mu <- drop(as.matrix(mu))
    ## conditional (negative) log-likelihood
    nll <- -sum(dpois(prussian$y, lambda = mu, log = TRUE))
    ## log-likelihoods of the random effects
    nll <- nll - sum(dnorm(b_y, sd = exp(logsd_y), log = TRUE))
    nll <- nll - sum(dnorm(b_c, sd = exp(logsd_c), log = TRUE))
    return(nll)
}
f3(par3)
[1] 342.9492
## make AD fun *without* Laplace approximation
ff3 <- MakeADFun(f3, par3, silent = TRUE)
ff3$fn()
[1] 342.9492
## now integrate over random effects
ff3 <- MakeADFun(f3, par3, random = c("b_y", "b_c"), silent = TRUE)
ff3$fn()
[1] 325.3724
attr(,"logarithm")
[1] TRUE
fit3 <- with(ff3, nlminb(par, fn, gr))
fit3$par
      beta0     logsd_c     logsd_y
 -0.3776499  -1.4075989 -14.0820055
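(The very negative logsd_y corresponds to an essentially-zero year variance.) Standard errors for the fixed effects are available via sdreport(), and since tmbstan is loaded, the same fitted object can be handed to Stan for MCMC. A minimal sketch (not run here):

summary(sdreport(ff3), select = "fixed")   ## delta-method SEs for beta0, logsd_c, logsd_y
## Bayesian version of the same model via Stan (can be slow):
## post3 <- tmbstan(ff3, chains = 1)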
tricks and traps
- objective function must be differentiable with respect to parameters (no if(), abs(), round(), min(), max() depending on parameters)
- have to handle prediction, tests, diagnostics, etc. etc. yourself
- data handling (see here, here) (and very similar arguments from 2004 about MLE fitting machinery taking a data argument)
- have to implement exotic probability distributions yourself
- use of [<- (see here) etc.
  - specifically, if you use the c() function, or the diag<- function (which sets the diagonal of a matrix), or the [<- function (which assigns values within a matrix), you need to add e.g. ADoverload("[<-") to the beginning of your function (see the sketch after this list)
- for matrix exponentials, you should use Matrix::expm() rather than expm::expm()
- RTMB is pickier than R about matrices. You may need to use some combination of drop() and as.matrix() to convert matrices with dimension 1 in some direction (or Matrix matrices) back to vectors
- not widely used yet; there will be holes
- compare to: bbmle (no random effects, no autodiff); NIMBLE (more restricted set of back-ends); Stan (see tmbstan), JAGS, etc.
- [[-indexing may be much faster than [-indexing: see here (and later messages in that thread)
- if you use cat() or print() to print out numeric values, the results may not make sense (you'll see a printout of RTMB's internal representation of autodiff-augmented numbers …)
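To illustrate the ADoverload("[<-") point above (a minimal sketch; the function and parameter names are invented for illustration): filling a plain numeric vector with autodiff values needs the overload.

f_assign <- function(pars) {
    ADoverload("[<-")            ## make assignment into a numeric vector AD-aware
    eta <- rep(0, 3)             ## ordinary numeric vector ...
    eta[1:3] <- pars$a * (1:3)   ## ... filled with autodiff values
    sum(eta^2)
}
MakeADFun(f_assign, list(a = 1), silent = TRUE)$gr()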
if transitioning from TMB
- RTMB uses %*% (as in base R), not * (as in C++) for matrix/matrix and matrix/vector multiplication (see the one-line sketch below)
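A one-line illustration, reusing X2 and par2 from the GLM example above:

eta <- drop(X2 %*% par2$beta)   ## RTMB / base R: %*%
## the equivalent TMB C++ line would use *, e.g. vector<Type> eta = X2 * beta;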
documentation and links
- RTMB on R-universe
- ISEC 2024 workshop
- TMB-users list on Google Groups
- GLMMs in RTMB
more complexity (beta-binomial)
library(bbmle)
library(emdbook)
load(system.file("vignetteData","orob1.rda",package="bbmle"))
m1 <- mle2(m ~ dbetabinom(prob, size = n, theta),
           param = list(prob ~ dilution),
           start = list(prob = 0.5, theta = 1),
           data = orob1)
(this produces lots of lbeta NaN warnings)
bb <- function (x, prob, size, theta, log = FALSE) {
    v <- lfactorial(size) - lfactorial(x) - lfactorial(size - x) -
        lbeta(theta * (1 - prob), theta * prob) +
        lbeta(size - x + theta * (1 - prob), x + theta * prob)
    if (log) v else exp(v)
}
X <- model.matrix(~dilution, data = orob1)
tmbdata <- list(n = orob1$n, m = orob1$m, X = X)
lbeta <- function(a, b) lgamma(a) + lgamma(b) - lgamma(a + b)
bb <- function (x, prob, size, theta, shape1, shape2, log = FALSE) {
    if (missing(prob) && !missing(shape1) && !missing(shape2)) {
        prob <- shape1/(shape1 + shape2)
        theta <- shape1 + shape2
    }
    v <- lfactorial(size) - lfactorial(x) -
        lfactorial(size - x) - lbeta(theta * (1 - prob), theta * prob) +
        lbeta(size - x + theta * (1 - prob), x + theta * prob)
    if (log) v else exp(v)
}
fn <- function(param) {
    getAll(param, tmbdata)
    probvec <- plogis(X %*% beta_prob)
    thetavec <- exp(X %*% beta_theta)
    -sum(bb(m, prob = probvec,
            size = n,
            theta = thetavec, log = TRUE))
}
pars <- list(beta_prob = rep(0, 3), beta_theta = rep(0, 3))
fn(pars)
[1] 57.0385
ff <- MakeADFun(fn, pars)
ff$fn()
[1] 57.0385
autoplot(microbenchmark::microbenchmark(fn(pars), ff$fn(), times = 1000)) +
    aes(fill = I("gray"))
References
Kristensen, Kasper, Anders Nielsen, Casper W. Berg, Hans Skaug, and Bradley M. Bell. 2016. “TMB: Automatic Differentiation and Laplace Approximation.” Journal of Statistical Software 70 (5). https://doi.org/10.18637/jss.v070.i05.