Consider a situation where we have multiple items of some sort which have associated effects, but we can’t association observations with single items; instead, each observation is associated with a group of items. (Two examples which have come up are (1) authorship of papers and (2) hockey players on a team.) These are not quite identical to “multi-membership models” (I think), although similar techniques to those shown here could work for multi-membership models.

Load packages (we don’t really need anything beyond lme4; the rest are for convenience/drawing pictures).

library(lme4)
library(broom.mixed)
library(ggplot2); theme_set(theme_bw())
library(Matrix)

Construct a simulated example: first, simulate the design (structure).

nm <- 20
nobs0 <- 500
set.seed(101)
## choose items for observations
W <- matrix(rbinom(nobs0*nm, prob=0.25, size=1),nrow=nobs0,ncol=nm)
dimnames(W) <- list(NULL,LETTERS[seq(nm)])
W[1:5,]
##      A B C D E F G H I J K L M N O P Q R S T
## [1,] 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0
## [4,] 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
## [5,] 0 1 0 1 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 0
table(rowSums(W))
## 
##   1   2   3   4   5   6   7   8   9  10  11 
##  11  40  63  85 106  85  52  37  11   7   3

The first 10 observations:

image(Matrix(W),ylim=c(1,10),sub="",ylab="Observation",
      xlab="Item",
      ## draw tick labels at top
      scales=list(at=1:20,x=list(labels=colnames(W),
                                 alternating=2)))

Since we chose which items/individuals to include for each observation randomly and independently (Bernoulli with probability 0.25), we have a highly variable number of individuals present for different observations (1-11). This would be realistic for some examples (authorship), unrealistic for others (hockey) … I don’t think it really makes much difference computationally (statistically, having some observations with a single member must make estimation more powerful …) The 0/1 matrix (indicator variable for whether item \(i\) is included in observation \(j\)) is convenient, and will turn out to be the form we need for inclusion in the model. It should be fairly straightforward to convert other forms (e.g. a list of sets of items associated with each observation) to this form …

Now simulate the response variable.

b <- rnorm(nm)  ## item-level effects
beta <- c(1,2)
M <- data.frame(x=runif(nobs0))
X <- model.matrix(~x, data=M)
## n.b. get in trouble if we don't add residual error
## (theta is scaled relative to residual error)
## here, use theta=sigma=1
M$y <- c(X %*% beta) + c(W %*% b) +rnorm(nobs0,sd=1)

Fit (see ?modular):

## helpful to specify a factor with the right levels:
## actual values are unimportant since we will specify Zt/Ztlist directly
M$fake <- rep(LETTERS[seq(nm)],length.out=nobs)
lmod <- lFormula(y~x+(1|fake), data=M)
lmod$reTrms$Zt <- lmod$reTrms$Ztlist[[1]] <- Matrix(t(W))
devfun <- do.call(mkLmerDevfun, lmod)
opt <- optimizeLmer(devfun)
m1 <- mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr)

Results look OK (correct fixed effects, item and residual variance estimated):

m1
## Linear mixed model fit by REML ['lmerMod']
## REML criterion at convergence: 1506.767
## Random effects:
##  Groups   Name        Std.Dev.
##  fake     (Intercept) 0.9580  
##  Residual             0.9944  
## Number of obs: 500, groups:  fake, 20
## Fixed Effects:
## (Intercept)            x  
##      0.8534       2.2106

The conditional modes by item look good:

dd <- tidy(m1, effects="ran_vals")
dd <- transform(dd, level=reorder(level,estimate))
truth <- data.frame(level=LETTERS[seq(nm)],estimate=b)
ggplot(dd,aes(x=level,y=estimate))+
    geom_pointrange(aes(ymin=estimate-2*std.error,
              ymax=estimate+2*std.error))+coord_flip()+
    geom_point(data=truth,colour="red")

see also

sos::findFn("{multiple membership}") ## GPvam, R2MLwiN, RealVAMS, ...

function encapsulation

Suppose we have a real data frame M and a (sparse) weight matrix W (or? a list L of weight vectors?

##' @param formula mixed model formula
##' @param data data frame (possibly but not necessarily containing factors
##' @param memb_mat list of weights matrices  with which to replace Zt components
lmer_multimemb <- function(formula,data,memb_mat=list(),...) {
  ## FIXME: pass ... through appropriately
  ## FIXME: test dimensions
  mnms <- names(memb_mat)
  fb <- findbars(formula)
  gvars <- vapply(fb, function(x) deparse(x[[3]]), character(1))
  Ztlist <- list()
  for (i in seq_along(fb)) {
    fbnm <- deparse(fb[[i]])
    ## find corresponding random-effects term
    w <- which(mnms==gvars[i])
    if (length(w)>0) {
      M <- Matrix::Matrix(memb_mat[[w]])
      ## extract LHS (effect)
      form <- as.formula(substitute(~z,list(z=fb[[i]][[2]])))
      ## construct model matrix & compute Khatri-Rao product 
      X <- model.matrix(form,data=data)
      Zt <- Matrix::KhatriRao(t(M),t(X),make.dimnames=TRUE)
      ## FIXME: mess with names?
      Ztlist[[fbnm]] <- Zt
      ## if necessary, add factor to data
      if (!m %in% names(data)) {
        ## if the factor has non-trivial ordering, it should be included
        ## in the data.  Do we have to worry about ordering of Z? test!
        data[[gvars[i]]] <- factor(colnames(memb_mat[[w]]))
      }
    } ## if  (length(w)>0)
  } ## for i in seq(fb)
  ## call lFormula  (FIXME: allow glFormula)
  lmod <- lFormula(formula,data=data)
  ## substitute new Ztlist elements
  for (m in names(Ztlist)) {
    lmod$reTrms$Ztlist[[m]] <- Ztlist[[m]]
  }
  lmod$reTrms$Zt <- do.call(rbind,lmod$reTrms$Ztlist)
  ## finish fitting
  devfun <- do.call(mkLmerDevfun, lmod)
  opt <- optimizeLmer(devfun)
  m1 <- mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr)
  return(m1)
}

This seems to work …

lmer_multimemb(formula=y ~ x + (1|team),
               data=M,
               memb_mat=list(team=W))
## Linear mixed model fit by REML ['lmerMod']
## REML criterion at convergence: 1506.767
## Random effects:
##  Groups   Name        Std.Dev.
##  team     (Intercept) 0.9580  
##  Residual             0.9944  
## Number of obs: 500, groups:  team, 20
## Fixed Effects:
## (Intercept)            x  
##      0.8534       2.2106

Data formats

We want to allow users to conveniently transform whatever data formats they have to a sparse incidence matrix.

memberships in multiple columns

Maybe it’s in multiple columns (as in this example from Harold Doran):

tmp <- data.frame(id = 1:5,
                  tch1 = c(10,10,11,11,12),
                  tch2 = c(77,15,NA,10,11),
                  tch3 = c(NA, NA, 77, 12, 10),
                  y = rnorm(5), x = rnorm(5))
## list of teachers
all_tch <- with(tmp,sort(unique(na.omit(c(tch1,tch2,tch3)))))
## empty (sparse) matrix with dimnames
W <- Matrix(0,nrow=nrow(tmp),ncol=length(all_tch),
            dimnames=list(as.character(tmp$id),as.character(all_tch)))
## melt all teacher columns to long/combined format
all_prs0 <- na.omit(reshape2::melt(tmp[,1:4],id="id"))
all_prs1 <- all_prs0[,c("id","value")]
## convert to numeric indices
all_prs2 <- all_prs1
all_prs2[] <- lapply(all_prs2,function(x) as.numeric(factor(x)))
## convert to two-column matrix
all_prs2 <- as.matrix(all_prs2)
W[all_prs2] <- 1 ## weights

row_ID, col_ID, weight as three columns

As pointed out by Robyn Wimmers, reshape2::dcast is handy if your data are in a three-column format (row_label, col_label, weight). It’s marginally less efficient because it initially generates a dense matrix.

## convert from the format we have above
all_prs3 <- all_prs1
all_prs3[] <- lapply(all_prs3, factor)
all_prs3$wts <- 1
## now data frame -> matrix
W0 <- reshape2::dcast(all_prs3,id~value, value.var="wts", fill=0)
idvars <- W0$id
W <- Matrix(as.matrix(W0[,-1])) ## drop 'id' column and sparsify
rownames(W) <- as.character(idvars)

Session info:

sessionInfo()
## R Under development (unstable) (2019-03-04 r76196)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS: /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_CA.UTF8       LC_NUMERIC=C             
##  [3] LC_TIME=en_CA.UTF8        LC_COLLATE=en_CA.UTF8    
##  [5] LC_MONETARY=en_CA.UTF8    LC_MESSAGES=en_CA.UTF8   
##  [7] LC_PAPER=en_CA.UTF8       LC_NAME=C                
##  [9] LC_ADDRESS=C              LC_TELEPHONE=C           
## [11] LC_MEASUREMENT=en_CA.UTF8 LC_IDENTIFICATION=C      
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] spdep_1.0-2       sf_0.7-3          spData_0.3.0      sp_1.3-1         
## [5] ggplot2_3.1.0     broom.mixed_0.2.4 lme4_1.1-21       Matrix_1.2-16    
## 
## loaded via a namespace (and not attached):
##  [1] gtools_3.8.1      tidyselect_0.2.5  xfun_0.5         
##  [4] TMB_1.7.15        purrr_0.3.1       reshape2_1.4.3   
##  [7] splines_3.6.0     lattice_0.20-38   expm_0.999-3     
## [10] colorspace_1.4-0  generics_0.0.2    htmltools_0.3.6  
## [13] yaml_2.2.0        rlang_0.3.1       e1071_1.7-0.1    
## [16] nloptr_1.2.1      pillar_1.3.1      glue_1.3.0       
## [19] withr_2.1.2.9000  DBI_1.0.0         plyr_1.8.4       
## [22] stringr_1.4.0     munsell_0.5.0     gtable_0.2.0     
## [25] coda_0.19-2       evaluate_0.13     labeling_0.3     
## [28] knitr_1.21        class_7.3-15      broom_0.5.1      
## [31] Rcpp_1.0.0        backports_1.1.3   scales_1.0.0     
## [34] classInt_0.3-1    gdata_2.18.0      deldir_0.1-16    
## [37] digest_0.6.18     gmodels_2.18.1    stringi_1.3.1    
## [40] dplyr_0.8.0.9001  grid_3.6.0        LearnBayes_2.15.1
## [43] tools_3.6.0       magrittr_1.5      lazyeval_0.2.1   
## [46] tibble_2.0.1      crayon_1.3.4      tidyr_0.8.3      
## [49] pkgconfig_2.0.2   MASS_7.3-51.2     assertthat_0.2.0 
## [52] minqa_1.2.4       rmarkdown_1.11    R6_2.4.0         
## [55] boot_1.3-21       units_0.6-2       nlme_3.1-137     
## [58] compiler_3.6.0