Introduction to misl

Overview

misl implements Multiple Imputation by Super Learning, a flexible approach to handling missing data described in:

Carpenito T, Manjourides J. (2022) MISL: Multiple imputation by super learning. Statistical Methods in Medical Research. 31(10):1904–1915. doi: 10.1177/09622802221104238

Rather than relying on a single parametric imputation model, misl builds a stacked ensemble of machine learning algorithms for each incomplete column, producing well-calibrated imputations for continuous, binary, and categorical variables.

Installation

# Install from GitHub
remotes::install_github("JustinManjourides/misl")

# Optional backend packages for additional learners
install.packages(c("ranger", "xgboost", "earth"))

Simulated data

We simulate a small dataset with three types of incomplete variables to demonstrate misl across all supported outcome types.

library(misl)

set.seed(42)
n <- 300

# Draw the predictors first so the outcome can depend on them. Inside a
# single data.frame() call, later columns cannot reference earlier ones,
# so drawing fresh rnorm() values for sbp (as a naive one-call version
# would) produces an outcome unrelated to age and bmi.
age <- round(rnorm(n, mean = 50, sd = 12))
bmi <- round(rnorm(n, mean = 26, sd = 4), 1)

sim_data <- data.frame(
  # Continuous predictors (always observed)
  age    = age,
  bmi    = bmi,
  # Continuous outcome with missingness, genuinely dependent on age and bmi
  sbp    = round(120 + 0.4 * age + 0.3 * bmi + rnorm(n, sd = 10)),
  # Binary outcome with missingness (0 = no, 1 = yes)
  smoker = rbinom(n, 1, prob = 0.3),
  # Categorical outcome with missingness
  group  = factor(sample(c("A", "B", "C"), n, replace = TRUE,
                         prob = c(0.4, 0.35, 0.25)))
)

# Introduce missing values completely at random
sim_data[sample(n, 40), "sbp"]    <- NA
sim_data[sample(n, 30), "smoker"] <- NA
sim_data[sample(n, 30), "group"]  <- NA

# Summarise missingness per column
sapply(sim_data, function(x) sum(is.na(x)))
#>    age    bmi    sbp smoker  group 
#>      0      0     40     30     30

Basic usage

The primary function is misl(). Supply a dataset and specify the number of imputed datasets (m), the number of iterations (maxit), and the learners to use for continuous (con_method), binary (bin_method), and categorical (cat_method) variables.

Use list_learners() to see available options:

knitr::kable(list_learners())
| learner      | description                              | continuous | binomial | categorical | package | installed |
|--------------|------------------------------------------|------------|----------|-------------|---------|-----------|
| glm          | Linear / logistic regression             | TRUE       | TRUE     | FALSE       | stats   | TRUE      |
| mars         | Multivariate adaptive regression splines | TRUE       | TRUE     | FALSE       | earth   | TRUE      |
| multinom_reg | Multinomial regression                   | FALSE      | FALSE    | TRUE        | nnet    | TRUE      |
| rand_forest  | Random forest                            | TRUE       | TRUE     | TRUE        | ranger  | TRUE      |
| boost_tree   | Gradient boosted trees                   | TRUE       | TRUE     | TRUE        | xgboost | TRUE      |

Running misl() with a single learner per outcome type is the fastest option and is well suited for exploratory work:

# Single learner per outcome type: the fastest configuration, suited to
# exploratory work since no ensemble weights need to be estimated.
misl_imp <- misl(
  sim_data,
  m          = 5,               # number of imputed datasets to return
  maxit      = 5,               # iterations of the imputation algorithm
  con_method = "glm",           # learner for continuous variables (sbp)
  bin_method = "glm",           # learner for binary variables (smoker)
  cat_method = "multinom_reg",  # learner for categorical variables (group)
  seed       = 42,              # for reproducibility
  quiet      = TRUE             # suppress progress messages
)

misl() returns a list of length m, one element per imputed dataset. Each element contains the completed dataset ($datasets) and per-iteration convergence summaries ($trace).

# Number of imputed datasets (should equal m = 5)
length(misl_imp)
#> [1] 5

# Confirm no missing values remain in the first completed dataset
anyNA(misl_imp[[1]]$datasets)
#> [1] FALSE

# Compare observed vs imputed distribution for sbp: similar means suggest
# plausible imputations, since the missingness here was introduced
# completely at random
obs_mean <- mean(sim_data$sbp, na.rm = TRUE)
imp_mean <- mean(misl_imp[[1]]$datasets$sbp)

cat("Observed mean sbp (non-missing):", round(obs_mean, 2), "\n")
#> Observed mean sbp (non-missing): 147.31
cat("Imputed mean sbp  (all values): ", round(imp_mean, 2), "\n")
#> Imputed mean sbp  (all values):  147.65

# Check binary imputations are valid: the counts cover all n = 300 rows,
# so the 30 imputed smoker values are included and remain 0/1
table(misl_imp[[1]]$datasets$smoker)
#> 
#>   0   1 
#> 212  88

# Check categorical imputations stay within the observed factor levels
levels(misl_imp[[1]]$datasets$group)
#> [1] "A" "B" "C"

Multiple learners and stacking

When multiple learners are supplied, misl uses cross-validation to learn optimal ensemble weights via the stacks package. Use cv_folds to reduce the number of folds and speed up computation:

# Multiple learners per outcome type: cross-validation (here 3 folds) is
# used to learn the stacking weights for each per-column ensemble.
misl_stack <- misl(
  sim_data,
  m          = 5,
  maxit      = 5,
  con_method = c("glm", "rand_forest"),
  bin_method = c("glm", "rand_forest"),
  cat_method = c("multinom_reg", "rand_forest"),
  cv_folds   = 3,   # fewer folds -> faster ensemble weight estimation
  seed       = 42
)

Analysing the imputed datasets

After imputation, fit your analysis model to each of the m datasets and pool the results using Rubin’s rules. Here we implement pooling manually using base R:

# Fit the analysis model of interest to each completed dataset
models <- lapply(misl_imp, \(imp) lm(sbp ~ age + bmi + smoker + group,
                                     data = imp$datasets))

# Pool with Rubin's rules: average the point estimates and combine the
# within- and between-imputation variances into a total variance.
m       <- length(models)
est_mat <- sapply(models, coef)                 # terms x m estimates
var_mat <- sapply(models, \(fit) diag(vcov(fit)))

Q_bar   <- rowMeans(est_mat)                    # pooled estimate
U_bar   <- rowMeans(var_mat)                    # within-imputation variance
B       <- apply(est_mat, 1, var)               # between-imputation variance
T_total <- U_bar + (1 + 1 / m) * B              # total variance

pooled_se <- sqrt(T_total)
pooled <- data.frame(
  term      = names(Q_bar),
  estimate  = round(Q_bar, 4),
  std.error = round(pooled_se, 4),
  conf.low  = round(Q_bar - 1.96 * pooled_se, 4),  # normal-approx 95% CI
  conf.high = round(Q_bar + 1.96 * pooled_se, 4)
)
print(pooled)
#>                    term estimate std.error conf.low conf.high
#> (Intercept) (Intercept) 154.9631    5.1777 144.8148  165.1114
#> age                 age  -0.0421    0.0577  -0.1552    0.0711
#> bmi                 bmi  -0.1890    0.1699  -0.5219    0.1439
#> smoker           smoker  -0.7820    1.5082  -3.7381    2.1740
#> groupB           groupB  -0.1393    1.7862  -3.6402    3.3617
#> groupC           groupC  -0.9780    1.6066  -4.1269    2.1709

For a full implementation of Rubin’s rules including degrees of freedom and p-values, the mice package provides pool() and can be used directly with misl output:

library(mice)
pooled_mice <- summary(pool(models), conf.int = TRUE)

Convergence: trace plots

The $trace element records the mean and standard deviation of imputed values at each iteration, which can be used to assess convergence:

# Extract trace data across all m datasets
trace <- do.call(rbind, lapply(misl_imp, function(r) r$trace))

# Plot mean of imputed sbp values across iterations for each dataset
sbp_trace <- subset(trace, variable == "sbp" & statistic == "mean")

# Derive the axis ticks and legend entries from the trace data instead of
# hard-coding 1:5, so the plot stays correct if m or maxit change in the
# misl() call above.
iters <- sort(unique(sbp_trace$iteration))
ms    <- sort(unique(sbp_trace$m))

plot(
  sbp_trace$iteration,
  sbp_trace$value,
  col  = sbp_trace$m,   # one colour per imputed dataset
  pch  = 16,
  xlab = "Iteration",
  ylab = "Mean imputed sbp",
  main = "Trace plot: mean of imputed sbp values",
  xaxt = "n"            # suppress default axis; drawn explicitly below
)
axis(1, at = iters)
legend("topright", legend = paste("m =", ms),
       col = ms, pch = 16, cex = 0.8)

Stable traces that mix well across datasets indicate the algorithm has converged.

Parallelism

The m imputed datasets are generated independently and can be run in parallel using the future framework. Set a parallel plan before calling misl():

library(future)

# Use all available cores (multisession runs workers in background R sessions)
plan(multisession)

# With a parallel plan active, the m = 10 independent datasets are imputed
# concurrently rather than sequentially
misl_parallel <- misl(
  sim_data,
  m          = 10,
  maxit      = 5,
  con_method = c("glm", "rand_forest"),
  bin_method = c("glm", "rand_forest"),
  cat_method = c("multinom_reg", "rand_forest"),
  seed       = 42
)

# Always reset the plan when done
plan(sequential)

To limit the number of cores:

plan(multisession, workers = 4)

The largest speedup comes from running the m datasets simultaneously. On a machine with 10 cores running m = 10, all 10 datasets can be imputed in parallel. Check how many cores are available with:

parallel::detectCores()

Session info

sessionInfo()
#> R version 4.5.2 (2025-10-31)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Sequoia 15.7.4
#> 
#> Matrix products: default
#> BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
#> 
#> locale:
#> [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> time zone: America/New_York
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] future_1.70.0 misl_1.0.0   
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.6        xfun_0.54           bslib_0.9.0        
#>  [4] ggplot2_4.0.2       recipes_1.3.1       lattice_0.22-7     
#>  [7] vctrs_0.7.2         tools_4.5.2         generics_0.1.4     
#> [10] parallel_4.5.2      tibble_3.3.1        pkgconfig_2.0.3    
#> [13] Matrix_1.7-4        data.table_1.18.2.1 RColorBrewer_1.1-3 
#> [16] S7_0.2.1            lifecycle_1.0.5     compiler_4.5.2     
#> [19] farver_2.1.2        codetools_0.2-20    htmltools_0.5.8.1  
#> [22] class_7.3-23        sass_0.4.10         yaml_2.3.10        
#> [25] prodlim_2026.03.11  Formula_1.2-5       pillar_1.11.1      
#> [28] jquerylib_0.1.4     tidyr_1.3.2         MASS_7.3-65        
#> [31] cachem_1.1.0        gower_1.0.2         plotmo_3.7.0       
#> [34] rpart_4.1.24        earth_5.3.5         parallelly_1.46.1  
#> [37] lava_1.8.2          tidyselect_1.2.1    digest_0.6.39      
#> [40] dplyr_1.2.0         purrr_1.2.1         listenv_0.10.1     
#> [43] splines_4.5.2       fastmap_1.2.0       parsnip_1.4.1      
#> [46] grid_4.5.2          cli_3.6.5           magrittr_2.0.4     
#> [49] survival_3.8-3      future.apply_1.20.2 withr_3.0.2        
#> [52] scales_1.4.0        plotrix_3.8-14      xgboost_3.2.1.1    
#> [55] lubridate_1.9.5     timechange_0.4.0    rmarkdown_2.30     
#> [58] globals_0.19.1      nnet_7.3-20         timeDate_4052.112  
#> [61] ranger_0.18.0       workflows_1.3.0     evaluate_1.0.5     
#> [64] knitr_1.50          hardhat_1.4.2       rlang_1.1.7        
#> [67] Rcpp_1.1.1          glue_1.8.0          sparsevctrs_0.3.6  
#> [70] ipred_0.9-15        rstudioapi_0.17.1   jsonlite_2.0.0     
#> [73] R6_2.6.1