RTMB intro

Author: Ben Bolker

Published: June 5, 2025

what is RTMB?

  • like TMB (Kristensen et al. 2016) but more convenient (although less mature)
  • evaluation of R functions in compiled code (sort of)
  • automatic differentiation
  • Laplace approximation engine

library(bbmle)
library(RTMB)
library(tmbstan)
library(igraph)
library(ggplot2); theme_set(theme_bw())
data("prussian", package = "pscl")

what is autodiff?

  • fancy chain rule
  • various ways to do it
  • \(C(\ell'_\theta)<4C(\ell)\) (“cheap gradient principle”)
  • time-efficient, maybe memory-hungry

simple (?) example

f1 <- function(param) {
    param$x^2 + cos(param$y)
}
par1 <- list(x=1, y = 2)
## lobstr::ast(!!body(f1))
tape1 <- MakeTape(f1, par1)
plot(graph_from_adjacency_matrix(tape1$graph()),
     layout=layout_as_tree, vertex.size = 25)
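
The tape itself can be evaluated and differentiated directly. A minimal sketch (using the vector-argument form of MakeTape; tape_sq is just an illustrative name):

tape_sq <- MakeTape(function(x) x[1]^2 + cos(x[2]), numeric(2))
tape_sq(c(1, 2))           ## same value as f1(par1)
tape_sq$jacobian(c(1, 2))  ## gradient (2*x, -sin(y)), evaluated by autodiff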

Now we construct the full TMB object (includes objective function, gradient, etc.)

ff1 <- MakeADFun(f1, par1)
names(ff1)
 [1] "par"      "fn"       "gr"       "he"       "hessian"  "method"  
 [7] "retape"   "env"      "report"   "simulate"
ff1$fn()  ## uses $par (== par1) as default
[1] 0.5838532

No speed advantage yet …

bench1 <- microbenchmark::microbenchmark(ff1$fn(), f1(par1))
print(bench1)
Unit: nanoseconds
     expr   min      lq     mean median    uq   max neval cld
 ff1$fn() 13465 13686.0 14305.52  13796 13976 49423   100  a 
 f1(par1)   511   551.5   673.18    641   672  4689   100   b

But we can get gradients right away.

ff1$gr()
outer mgc:  2 
     [,1]       [,2]
[1,]    2 -0.9092974
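
We can also evaluate at other parameter values, and get second derivatives; a quick sketch (assuming $he() behaves as its name in the list above suggests for a model with no random effects):

ff1$fn(c(x = 2, y = 0))  ## 2^2 + cos(0) = 5
ff1$gr(c(x = 2, y = 0))  ## (2*x, -sin(y)) = (4, 0)
ff1$he()                 ## 2x2 Hessian at the default parameters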

a simple GLM

Set up the model matrix (we could also parameterize the model ‘by hand’, e.g. beta0 + beta_y[year] etc., but I find that extremely tedious).

X2 <- model.matrix(~ factor(year), data = prussian)
par2 <- list(beta = rep(0, ncol(X2)))
f2 <- function(pars) {
    getAll(pars)  ## this is like with() or attach()
    mu <- exp(X2 %*% beta)
    -sum(dpois(prussian$y, lambda = mu, log = TRUE))
}
f2(par2)
[1] 328.2462
ff2 <- MakeADFun(f2, par2, silent = TRUE)
ff2$fn()
[1] 328.2462
ff2$gr()
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
[1,]   84    9    7    5    4   -4    8    0    3     5     9     3    -1     8
     [,15] [,16] [,17] [,18] [,19] [,20]
[1,]     3    -3     2    -1     6    10
bench2 <- microbenchmark::microbenchmark(ff2$fn(), f2(par2))
print(bench2)
Unit: microseconds
     expr    min     lq     mean  median      uq     max neval cld
 ff2$fn() 14.267 14.798 16.30968 15.8050 16.5005  50.615   100  a 
 f2(par2) 24.766 25.543 29.21663 26.5395 27.4110 276.609   100   b

The RTMB objective function is already about twice as fast …

fit2 <- with(ff2, nlminb(par, fn, gr))

Compare with GLM fit:

g2 <- glm(y ~ factor(year), data = prussian, family = poisson)
all.equal(unname(coef(g2)), unname(fit2$par), tolerance = 1e-6)
[1] TRUE
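
Wald standard errors are available via sdreport() (from the TMB machinery underlying RTMB); a minimal sketch (s2 is just an illustrative name):

s2 <- sdreport(ff2)
summary(s2)  ## estimates and standard errors; compare summary(g2)$coefficients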

GLMM

We can make this a GLMM with very little additional effort. Again, we can parameterize either ‘by hand’ or with a random effects model matrix \(Z\) (see here for a more complex but more general formulation of GLMMs in RTMB …)

## RTMB knows about sparse matrices
Z_y <- Matrix::sparse.model.matrix(~year, data = prussian)
Z_c <- Matrix::sparse.model.matrix(~corp, data = prussian)
par3 <- list(beta0 = 0, b_y = rep(0, ncol(Z_y)), b_c = rep(0, ncol(Z_c)),
             logsd_c = 0, logsd_y = 0)
f3 <- function(pars) {
    getAll(pars)
    mu <- exp(beta0 + Z_y %*% b_y + Z_c %*% b_c)
    ## need to convert to a vector here ...
    mu <- drop(as.matrix(mu))
    ## conditional (negative) log-likelihood
    nll <- -sum(dpois(prussian$y, lambda = mu, log = TRUE))
    ## log-likelihoods of the random effects
    nll <- nll - sum(dnorm(b_y, sd = exp(logsd_y), log = TRUE))
    nll <- nll - sum(dnorm(b_c, sd = exp(logsd_c), log = TRUE))
    return(nll)
}
f3(par3)
[1] 342.9492
## make AD fun *without* Laplace approximation
ff3 <- MakeADFun(f3, par3, silent = TRUE)
ff3$fn()
[1] 342.9492
## now integrate over random effects
ff3 <- MakeADFun(f3, par3, random = c("b_y", "b_c"), silent = TRUE)
ff3$fn()
[1] 325.3724
attr(,"logarithm")
[1] TRUE
fit3 <- with(ff3, nlminb(par, fn, gr))
fit3$par
      beta0     logsd_c     logsd_y 
 -0.3776499  -1.4075989 -14.0820055 
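
The random-effects standard deviations are estimated on the log scale; back-transforming shows that the year-level variance has collapsed to essentially zero. A sketch (s3 is just an illustrative name):

exp(fit3$par[c("logsd_c", "logsd_y")])  ## SDs on the natural scale
s3 <- sdreport(ff3)
summary(s3, "fixed")   ## fixed-effect estimates and standard errors
summary(s3, "random")  ## conditional modes of b_y and b_c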

tricks and traps

  • objective function must be differentiable with respect to parameters (no if(), abs(), round(), min(), or max() applied to quantities that depend on parameters)
  • have to handle prediction, tests, diagnostics, etc. etc. yourself
  • data handling (see here, here) (and very similar arguments from 2004 about MLE fitting machinery taking a data argument)
  • have to implement exotic probability distributions yourself
  • use of [<- (see here) etc.
    • specifically, if you use the c() function, the diag<- function (which sets the diagonal of a matrix), or the [<- function (which assigns values within a matrix), you need to add e.g. ADoverload("[<-") to the beginning of your function (see the sketch after this list)
  • for matrix exponentials, you should use Matrix::expm() rather than expm::expm()
  • RTMB is pickier than R about matrices. You may need to use some combination of drop() and as.matrix() to convert matrices with dimension 1 in some direction (or Matrix matrices) back to vectors
  • not widely used yet; there will be holes
  • compare to: bbmle (no random effects, no autodiff); NIMBLE (more restricted set of back-ends); Stan (see tmbstan); JAGS; etc.
  • [[-indexing may be much faster than [-indexing: see here (and later messages in that thread)
  • if you use cat() or print() to print out numeric values, the results may not make sense (you’ll see a printout of RTMB’s internal representation of autodiff-augmented numbers …)
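
To make the assignment-overload trap concrete, here is a minimal sketch of a hypothetical objective function that fills a vector by subset-assignment, following the ADoverload() advice above (f_assign and its contents are purely illustrative):

f_assign <- function(pars) {
    getAll(pars)
    ADoverload("[<-")                 ## enable AD-aware subset-assignment
    eta <- rep(0, length(beta))
    eta[seq_along(beta)] <- 2 * beta  ## would lose AD information otherwise
    -sum(dnorm(eta, log = TRUE))
}
## used like the earlier examples:
## MakeADFun(f_assign, list(beta = rep(0, 3)))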

if transitioning from TMB

  • RTMB uses %*% (as in base R), not * (as in C++) for matrix/matrix and matrix/vector multiplication

more complexity (beta-binomial)

library(bbmle)
library(emdbook)
load(system.file("vignetteData","orob1.rda",package="bbmle"))
m1 <- mle2(m ~ dbetabinom(prob, size = n, theta),
           parameters = list(prob ~ dilution),
           start = list(prob = 0.5, theta = 1),
           data = orob1)

(this produces lots of lbeta NaN warnings)

## beta-binomial density in the prob/theta parameterization
bb <- function (x, prob, size, theta, log = FALSE)  {
    v <- lfactorial(size) - lfactorial(x) - lfactorial(size - x) -
        lbeta(theta * (1 - prob), theta * prob) +
        lbeta(size - x + theta * (1 - prob), x + theta * prob)
    if (log) v else exp(v)
}
X <- model.matrix(~dilution, data = orob1)
tmbdata <- list(n = orob1$n, m = orob1$m, X = X)
## redefine lbeta in terms of lgamma (which RTMB's autodiff can handle)
lbeta <- function(a, b) lgamma(a) + lgamma(b) - lgamma(a+b)

## extended version that also accepts a shape1/shape2 parameterization
bb <- function (x, prob, size, theta, shape1, shape2, log = FALSE)
{
    if (missing(prob) && !missing(shape1) && !missing(shape2)) {
        prob <- shape1/(shape1 + shape2)
        theta <- shape1 + shape2
    }
    v <- lfactorial(size) - lfactorial(x) -
        lfactorial(size - x) - lbeta(theta * (1 - prob), theta * prob) +
        lbeta(size - x + theta * (1 - prob), x + theta * prob)
    if (log) v else exp(v)
}

fn <- function(param) {
    getAll(param, tmbdata)
    probvec <- plogis(X %*% beta_prob)
    thetavec <- exp(X %*% beta_theta)
    -sum(bb(m, prob = probvec, size = n, theta = thetavec, log = TRUE))
}
pars <- list(beta_prob = rep(0, 3), beta_theta = rep(0,3))
fn(pars)    
[1] 57.0385
ff <- MakeADFun(fn, pars)
ff$fn()
[1] 57.0385
autoplot(microbenchmark::microbenchmark(fn(pars), ff$fn(), times = 1000)) +
    aes(fill = I("gray"))
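
As before, the model can be fitted with nlminb (note that this version lets theta, as well as prob, vary by dilution, so it has more parameters than the mle2 fit above). A sketch (fit_bb is just an illustrative name):

fit_bb <- with(ff, nlminb(par, fn, gr))
fit_bb$par        ## coefficients on the logit (prob) and log (theta) scales
fit_bb$objective  ## negative log-likelihood at the optimum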

References

Kristensen, Kasper, Anders Nielsen, Casper W. Berg, Hans Skaug, and Bradley M. Bell. 2016. “TMB: Automatic Differentiation and Laplace Approximation.” Journal of Statistical Software 70 (5). https://doi.org/10.18637/jss.v070.i05.