RTMB intro

Author: Ben Bolker

Published: June 5, 2025

what is RTMB?

  • like TMB (Kristensen et al. 2016) but more convenient (although less mature)
  • evaluation of R functions in compiled code (sort of)
  • automatic differentiation
  • Laplace approximation engine

library(bbmle)
library(RTMB)
library(tmbstan)
library(igraph)
library(ggplot2); theme_set(theme_bw())
data("prussian", package = "pscl")

what is autodiff?

  • fancy chain rule
  • various ways to do it
  • \(C(\ell'_\theta)<4C(\ell)\) (“cheap gradient principle”)
  • time-efficient, maybe memory-hungry

simple (?) example

f1 <- function(param) {
    param$x^2 + cos(param$y)
}
par1 <- list(x=1, y = 2)
## lobstr::ast(!!body(f1))
tape1 <- MakeTape(f1, par1)
plot(graph_from_adjacency_matrix(tape1$graph()),
     layout=layout_as_tree, vertex.size = 25)
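
The tape itself can be evaluated and differentiated directly. A minimal sketch (using the vector-argument form of MakeTape; tape_sq is just an illustrative name):

tape_sq <- MakeTape(function(x) x[1]^2 + cos(x[2]), numeric(2))
tape_sq(c(1, 2))           ## same value as f1(par1)
tape_sq$jacobian(c(1, 2))  ## gradient (2*x, -sin(y)), evaluated by autodiff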

Now we construct the full TMB object (includes objective function, gradient, etc.)

ff1 <- MakeADFun(f1, par1)
names(ff1)
 [1] "par"      "fn"       "gr"       "he"       "hessian"  "method"  
 [7] "retape"   "env"      "report"   "simulate"
ff1$fn()  ## uses $par (== par1) as default
[1] 0.5838532

No speed advantage yet …

bench1 <- microbenchmark::microbenchmark(ff1$fn(), f1(par1))
print(bench1)
Unit: nanoseconds
     expr   min      lq     mean median    uq   max neval cld
 ff1$fn() 13465 13686.0 14305.52  13796 13976 49423   100  a 
 f1(par1)   511   551.5   673.18    641   672  4689   100   b

But we can get gradients right away.

ff1$gr()
outer mgc:  2 
     [,1]       [,2]
[1,]    2 -0.9092974
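
We can also evaluate at other parameter values, and get second derivatives; a quick sketch (assuming $he() behaves as its name in the list above suggests for a model with no random effects):

ff1$fn(c(x = 2, y = 0))  ## 2^2 + cos(0) = 5
ff1$gr(c(x = 2, y = 0))  ## (2*x, -sin(y)) = (4, 0)
ff1$he()                 ## 2x2 Hessian at the default parameters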

a simple GLM

Set up the model matrix (we could also parameterize the model ‘by hand’, e.g. beta0 + beta_y[year] etc., but I find that extremely tedious).

X2 <- model.matrix(~ factor(year), data = prussian)
par2 <- list(beta = rep(0, ncol(X2)))
f2 <- function(pars) {
    getAll(pars)  ## this is like with() or attach()
    mu <- exp(X2 %*% beta)
    -sum(dpois(prussian$y, lambda = mu, log = TRUE))
}
f2(par2)
[1] 328.2462
ff2 <- MakeADFun(f2, par2, silent = TRUE)
ff2$fn()
[1] 328.2462
ff2$gr()
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
[1,]   84    9    7    5    4   -4    8    0    3     5     9     3    -1     8
     [,15] [,16] [,17] [,18] [,19] [,20]
[1,]     3    -3     2    -1     6    10
bench2 <- microbenchmark::microbenchmark(ff2$fn(), f2(par2))
print(bench2)
Unit: microseconds
     expr    min     lq     mean  median      uq     max neval cld
 ff2$fn() 14.267 14.798 16.30968 15.8050 16.5005  50.615   100  a 
 f2(par2) 24.766 25.543 29.21663 26.5395 27.4110 276.609   100   b

The RTMB objective function is already about twice as fast …

fit2 <- with(ff2, nlminb(par, fn, gr))

Compare with GLM fit:

g2 <- glm(y ~ factor(year), data = prussian, family = poisson)
all.equal(unname(coef(g2)), unname(fit2$par), tolerance = 1e-6)
[1] TRUE
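
Wald standard errors are available via sdreport() (from the TMB machinery underlying RTMB); a minimal sketch (s2 is just an illustrative name):

s2 <- sdreport(ff2)
summary(s2)  ## estimates and standard errors; compare summary(g2)$coefficients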

GLMM

We can make this a GLMM with very little additional effort. Again, we can parameterize either ‘by hand’ or with a random effects model matrix \(Z\) (see here for a more complex but more general formulation of GLMMs in RTMB …)

## RTMB knows about sparse matrices
Z_y <- Matrix::sparse.model.matrix(~year, data = prussian)
Z_c <- Matrix::sparse.model.matrix(~corp, data = prussian)
par3 <- list(beta0 = 0, b_y = rep(0, ncol(Z_y)), b_c = rep(0, ncol(Z_c)),
             logsd_c = 0, logsd_y = 0)
f3 <- function(pars) {
    getAll(pars)
    mu <- exp(beta0 + Z_y %*% b_y + Z_c %*% b_c)
    ## need to convert to a vector here ...
    mu <- drop(as.matrix(mu))
    ## conditional (negative) log-likelihood
    nll <- -sum(dpois(prussian$y, lambda = mu, log = TRUE))
    ## log-likelihoods of the random effects
    nll <- nll - sum(dnorm(b_y, sd = exp(logsd_y), log = TRUE))
    nll <- nll - sum(dnorm(b_c, sd = exp(logsd_c), log = TRUE))
    return(nll)
}
f3(par3)
[1] 342.9492
## make AD fun *without* Laplace approximation
ff3 <- MakeADFun(f3, par3, silent = TRUE)
ff3$fn()
[1] 342.9492
## now integrate over random effects
ff3 <- MakeADFun(f3, par3, random = c("b_y", "b_c"), silent = TRUE)
ff3$fn()
[1] 325.3724
attr(,"logarithm")
[1] TRUE
fit3 <- with(ff3, nlminb(par, fn, gr))
fit3$par
      beta0     logsd_c     logsd_y 
 -0.3776499  -1.4075989 -14.0820055 
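
The random-effects standard deviations are estimated on the log scale; back-transforming shows that the year-level variance has collapsed to essentially zero. A sketch (s3 is just an illustrative name):

exp(fit3$par[c("logsd_c", "logsd_y")])  ## SDs on the natural scale
s3 <- sdreport(ff3)
summary(s3, "fixed")   ## fixed-effect estimates and standard errors
summary(s3, "random")  ## conditional modes of b_y and b_c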

tricks and traps

  • objective function must be differentiable with respect to parameters (no if(), abs(), round(), min(), or max() applied to quantities that depend on parameters)
  • have to handle prediction, tests, diagnostics, etc. etc. yourself
  • data handling (see here, here) (and very similar arguments from 2004 about MLE fitting machinery taking a data argument)
  • have to implement exotic probability distributions yourself
  • use of [<- (see here) etc.
    • specifically, if you use the c() function, the diag<- function (which sets the diagonal of a matrix), or the [<- function (which assigns values within a matrix), you need to add e.g. ADoverload("[<-") to the beginning of your function (see the sketch after this list)
  • for matrix exponentials, you should use Matrix::expm() rather than expm::expm()
  • RTMB is pickier than R about matrices. You may need to use some combination of drop() and as.matrix() to convert matrices with dimension 1 in some direction (or Matrix matrices) back to vectors
  • not widely used yet; there will be holes
  • compare to: bbmle (no random effects, no autodiff); NIMBLE (more restricted set of back-ends); Stan (see tmbstan); JAGS; etc.
  • [[-indexing may be much faster than [-indexing: see here (and later messages in that thread)
  • if you use cat() or print() to print out numeric values, the results may not make sense (you’ll see a printout of RTMB’s internal representation of autodiff-augmented numbers …)
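
To make the assignment-overload trap concrete, here is a minimal sketch of a hypothetical objective function that fills a vector by subset-assignment, following the ADoverload() advice above (f_assign and its contents are purely illustrative):

f_assign <- function(pars) {
    getAll(pars)
    ADoverload("[<-")                 ## enable AD-aware subset-assignment
    eta <- rep(0, length(beta))
    eta[seq_along(beta)] <- 2 * beta  ## would lose AD information otherwise
    -sum(dnorm(eta, log = TRUE))
}
## used like the earlier examples:
## MakeADFun(f_assign, list(beta = rep(0, 3)))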

if transitioning from TMB

  • RTMB uses %*% (as in base R), not * (as in C++) for matrix/matrix and matrix/vector multiplication

more complexity (beta-binomial)

library(bbmle)
library(emdbook)
load(system.file("vignetteData","orob1.rda",package="bbmle"))
m1 <- mle2(m ~ dbetabinom(prob, size = n, theta),
           parameters = list(prob ~ dilution),
           start = list(prob = 0.5, theta = 1),
           data = orob1)

(this produces lots of lbeta NaN warnings)

## beta-binomial density in the prob/theta parameterization
bb <- function (x, prob, size, theta, log = FALSE)  {
    v <- lfactorial(size) - lfactorial(x) - lfactorial(size - x) -
        lbeta(theta * (1 - prob), theta * prob) +
        lbeta(size - x + theta * (1 - prob), x + theta * prob)
    if (log) v else exp(v)
}
X <- model.matrix(~dilution, data = orob1)
tmbdata <- list(n = orob1$n, m = orob1$m, X = X)
## redefine lbeta in terms of lgamma (which RTMB's autodiff can handle)
lbeta <- function(a, b) lgamma(a) + lgamma(b) - lgamma(a+b)

## extended version that also accepts a shape1/shape2 parameterization
bb <- function (x, prob, size, theta, shape1, shape2, log = FALSE)
{
    if (missing(prob) && !missing(shape1) && !missing(shape2)) {
        prob <- shape1/(shape1 + shape2)
        theta <- shape1 + shape2
    }
    v <- lfactorial(size) - lfactorial(x) -
        lfactorial(size - x) - lbeta(theta * (1 - prob), theta * prob) +
        lbeta(size - x + theta * (1 - prob), x + theta * prob)
    if (log) v else exp(v)
}

fn <- function(param) {
    getAll(param, tmbdata)
    probvec <- plogis(X %*% beta_prob)
    thetavec <- exp(X %*% beta_theta)
    -sum(bb(m, prob = probvec, size = n, theta = thetavec, log = TRUE))
}
pars <- list(beta_prob = rep(0, 3), beta_theta = rep(0,3))
fn(pars)    
[1] 57.0385
ff <- MakeADFun(fn, pars)
ff$fn()
[1] 57.0385
autoplot(microbenchmark::microbenchmark(fn(pars), ff$fn(), times = 1000)) +
    aes(fill = I("gray"))
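
As before, the model can be fitted with nlminb (note that this version lets theta, as well as prob, vary by dilution, so it has more parameters than the mle2 fit above). A sketch (fit_bb is just an illustrative name):

fit_bb <- with(ff, nlminb(par, fn, gr))
fit_bb$par        ## coefficients on the logit (prob) and log (theta) scales
fit_bb$objective  ## negative log-likelihood at the optimum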

References

Kristensen, Kasper, Anders Nielsen, Casper W. Berg, Hans Skaug, and Bradley M. Bell. 2016. “TMB: Automatic Differentiation and Laplace Approximation.” Journal of Statistical Software 70 (5). https://doi.org/10.18637/jss.v070.i05.