Simulated examples

library(INLA)
library(ggplot2)
library(inlamemi)
library(dplyr)

This vignette shows how to fit measurement error and imputation models using the inlamemi package for a few different simple simulated data sets. Note that although the data sets describe realistic situations, they are all completely fictitious, and created purely to illustrate how to fit models in different situations.

Simple example with missingness and two types of measurement error

Error types Likelihood Response Covariate with error Other covariate(s)
Berkson, classical, missing values Gaussian \(y\) \(x\) \(z\)

This is a simple simulation with Berkson and classical error as well as missing data, to check that the package works as expected in that scenario.

Generating the data

set.seed(2024)
n <- 1000

# Covariate without error:
z <- rnorm(n, mean = 0, sd = 1)

# Berkson error:
u_b <- rnorm(n, sd = 1)
alpha.0 <- 1; alpha.z <- 2
r <- rnorm(n, mean = alpha.0 + alpha.z*z, sd = 1)
x <- r + u_b # Turn off Berkson by commenting out "+ u_b"

# Response:
beta.0 <- 1; beta.x <- 2; beta.z <- 2
y <- beta.0 + beta.x*x + beta.z*z + rnorm(n)

# Classical error:
u_c <- rnorm(n, sd = 1)
x_obs <- r + u_c 

# Missingness:
m_pred <- -1.5 - 0.5*z # This gives a mean probability of missing of ca 0.2.
m_prob <- exp(m_pred)/(1 + exp(m_pred))

m_index <- as.logical(rbinom(n, 1, prob = m_prob)) # MAR
# m_index <- sample(1:n, 0.2*n, replace = FALSE) # MCAR
x_obs[m_index] <- NA

simple_data <- data.frame(y = y, x = x_obs, z = z)

Fitting the model

# Fit the model
simple_model <- fit_inlamemi(data = simple_data, 
                         formula_moi = y ~ x + z, 
                         formula_imp = x ~ z, 
                         family_moi = "gaussian",
                         error_type = c("berkson", "classical"),
                         prior.prec.moi = c(10, 9),       # Gamma(10, 9)
                         prior.prec.berkson = c(10, 9),   # Gamma(10, 9)
                         prior.prec.classical = c(10, 9), # Gamma(10, 9)
                         prior.prec.imp = c(10, 9),       # Gamma(10, 9)
                         prior.beta.error = c(0, 1/1000), # N(0, 10^3)
                         initial.prec.moi = 1,
                         initial.prec.berkson = 1,
                         initial.prec.classical = 1,
                         initial.prec.imp = 1)
summary(simple_model)
#> Formula for model of interest: 
#> y ~ x + z
#> 
#> Formula for imputation model: 
#> x ~ z
#> 
#> Error types: 
#> [1] "berkson"   "classical"
#> 
#> Fixed effects for model of interest: 
#>            mean        sd 0.025quant 0.5quant 0.975quant     mode
#> beta.0 1.033298 0.2179847  0.6149367 1.033448   1.439182 1.028936
#> beta.z 1.918095 0.3864149  1.2320486 1.911048   2.564599 1.916255
#> 
#> Coefficient for variable with measurement error and/or missingness: 
#>            mean        sd 0.025quant 0.5quant 0.975quant     mode
#> beta.x 1.973451 0.2021624   1.572865 1.974339    2.36885 1.978043
#> 
#> Fixed effects for imputation model: 
#>               mean         sd 0.025quant 0.5quant 0.975quant     mode
#> alpha.x.0 1.033078 0.05059793  0.9338142 1.033086   1.132297 1.033086
#> alpha.x.z 2.024712 0.05226033  1.9222594 2.024694   2.127263 2.024694
#> 
#> Model hyperparameters (apart from beta.x): 
#>                                      mean        sd 0.025quant  0.5quant 0.975quant      mode
#> Precision for model of interest 1.1282039 0.3610639  0.5673544 1.0788282   1.973095 0.9880770
#> Precision for x berkson model   1.1283327 0.3464080  0.5947152 1.0795115   1.944788 0.9885253
#> Precision for x classical model 0.9256935 0.1092025  0.7301264 0.9190625   1.159482 0.9054505
#> Precision for x imp model       0.9777885 0.1259149  0.7530336 0.9699057   1.247940 0.9545509
simple.truth <- tibble::tribble(
  ~"variable", ~"value",
  "beta.x",  beta.x, 
  "beta.z",  beta.z, 
#  "beta.0",  beta.0, 
  "alpha.x.z", alpha.z, 
 # "alpha.0", alpha.0
  )

plot(simple_model, plot_intercepts = FALSE) +
    geom_point(data = simple.truth, aes(x = value))
plot of chunk unnamed-chunk-4

plot of chunk unnamed-chunk-4

Missing data only

Error types Likelihood Response Covariate with error Other covariate(s)
Missing values Gaussian \(y\) \(x\) \(z\)

In this example, we have missingness in one covariate, but no other measurement error, so this shows how to do simple imputation of a missing covariate in R-INLA.

Generating the data

set.seed(2024)
n <- 1000

# Covariate without missingness:
z <- rnorm(n, mean = 0, sd = 1)

# Covariate that will have missingness:
alpha.0 <- 1; alpha.z <- 2
x <- rnorm(n, mean = alpha.0 + alpha.z*z, sd = 1)

# Response:
beta.0 <- 1; beta.x <- 2; beta.z <- 2
y <- beta.0 + beta.x*x + beta.z*z + rnorm(n)

# Missingness:
m_pred <- -1.5 - 0.5*z # This gives a mean probability of missing of ca 0.2.
m_prob <- exp(m_pred)/(1 + exp(m_pred))

m_index <- as.logical(rbinom(n, 1, prob = m_prob)) # MAR
# m_index <- sample(1:n, 0.2*n, replace = FALSE) # MCAR
x_obs <- x
x_obs[m_index] <- NA

missing_data <- data.frame(y = y, x = x_obs, z = z)

Model without imputation

naive_model <- inla(formula = y ~ x + z, family = "gaussian", data = missing_data)

naive_model$summary.fixed
#>                 mean         sd 0.025quant 0.5quant 0.975quant     mode          kld
#> (Intercept) 2.059952 0.07312437  1.9165320 2.059952   2.203371 2.059952 1.021533e-11
#> x           1.060038 0.04847065  0.9649723 1.060038   1.155104 1.060038 1.018360e-11
#> z           4.242952 0.09764620  4.0514375 4.242952   4.434467 4.242952 1.033984e-11
naive_model$summary.hyperpar
#>                                              mean         sd 0.025quant  0.5quant 0.975quant      mode
#> Precision for the Gaussian observations 0.3082828 0.01379367  0.2818476 0.3080781  0.3359001 0.3076698

Model with imputation

missing_model <- fit_inlamemi(formula_moi = y ~ x + z,
                            formula_imp = x ~ z,
                            family_moi = "gaussian",
                            data = missing_data, 
                            error_type = "missing", 
                            prior.prec.moi = c(2, 1),
                            prior.prec.imp = c(2, 1),
                            prior.beta.error = c(0, 1/1000),
                            initial.prec.moi = 1,
                            initial.prec.imp = 1)

summary(missing_model)
#> Formula for model of interest: 
#> y ~ x + z
#> 
#> Formula for imputation model: 
#> x ~ z
#> 
#> Error types: 
#> [1] "missing"
#> 
#> Fixed effects for model of interest: 
#>             mean         sd 0.025quant  0.5quant 0.975quant      mode
#> beta.0 0.9738457 0.04853280  0.8797317 0.9737786   1.068299 0.9737591
#> beta.z 1.9439154 0.07630237  1.7986467 1.9435537   2.090058 1.9432547
#> 
#> Coefficient for variable with measurement error and/or missingness: 
#>            mean         sd 0.025quant 0.5quant 0.975quant     mode
#> beta.x 2.024611 0.03438061   1.956372   2.0248    2.09174 2.025593
#> 
#> Fixed effects for imputation model: 
#>               mean         sd 0.025quant 0.5quant 0.975quant     mode
#> alpha.x.0 1.031093 0.03138257  0.9695517 1.031090   1.092654 1.031090
#> alpha.x.z 1.983403 0.03208449  1.9204733 1.983403   2.046329 1.983403
#> 
#> Model hyperparameters (apart from beta.x): 
#>                                     mean         sd 0.025quant 0.5quant 0.975quant     mode
#> Precision for model of interest 1.055879 0.05245398  0.9566566 1.054484   1.163136 1.051504
#> Precision for x classical model 1.110965 0.33580113  0.5440299 1.078883   1.845706 1.027808
#> Precision for x imp model       1.066205 0.04985415  0.9717543 1.064929   1.168006 1.062149
missing_truth <- tibble::tribble(
  ~"variable", ~"value",
  "beta.0", beta.0,
  "beta.x",  beta.x, 
  "beta.z",  beta.z,
  "alpha.x.0", alpha.0,
  "alpha.x.z", alpha.z
)

plot(missing_model) +
    geom_point(data = missing_truth, aes(x = value))
plot of chunk unnamed-chunk-8

plot of chunk unnamed-chunk-8

Random effect in the main model

Error types Likelihood Response Covariate with error Other covariate(s)
Classical Gaussian \(y\) \(x\) \(z\), random effect \(w\)

In this example, we simulate data that is grouped in such a way that it should be modelled with a random effect in the model of interest.

Generating the data

m <- 10 # number of groups
n <- 100 # number of observations per group
N <- m*n # total number of observations

sd_y <- 3 # sd for the noise
sd_w <- 2 # sd for random effect
sd_x <- 2 # sd for covariate without error
sd_u <- 1 # sd for measurement error

# Covariate without error
z <- rnorm(N, 0, 2)

# Covariate with error
x <- rnorm(N, 0, sd_x) # Independent of z, but can change that here
x_obs <- x + rnorm(N, 0, sd_u)

# Random effect
w_per_group <- rnorm(m, 0, sd_w)
w <- rep(w_per_group, each = n)

# Response
y <- 1 + 2*x + 2*z + w + rnorm(N, 0, sd_y)

reff_data <- data.frame(y = y, id = rep(1:m, each = n), x = x_obs, z = z)

Fitting the model

Firstly, if we ignored the measurement error, we might fit a model like this:

naive_model <- inla(y ~ x + z + f(id, model = "iid"),
                    data = reff_data,
                    family = "gaussian")
summary(naive_model)
#> 
#> Call:
#>    c("inla.core(formula = formula, family = family, contrasts = contrasts, ", " data = data, quantiles = quantiles, E = E, 
#>    offset = offset, ", " scale = scale, weights = weights, Ntrials = Ntrials, strata = strata, ", " lp.scale = lp.scale, 
#>    link.covariates = link.covariates, verbose = verbose, ", " lincomb = lincomb, selection = selection, control.compute = 
#>    control.compute, ", " control.predictor = control.predictor, control.family = control.family, ", " control.inla = 
#>    control.inla, control.fixed = control.fixed, ", " control.mode = control.mode, control.expert = control.expert, ", " 
#>    control.hazard = control.hazard, control.lincomb = control.lincomb, ", " control.update = control.update, control.lp.scale 
#>    = control.lp.scale, ", " control.pardiso = control.pardiso, only.hyperparam = only.hyperparam, ", " inla.call = inla.call, 
#>    inla.arg = inla.arg, num.threads = num.threads, ", " keep = keep, working.directory = working.directory, silent = silent, 
#>    ", " inla.mode = inla.mode, safe = FALSE, debug = debug, .parent.frame = .parent.frame)" ) 
#> Time used:
#>     Pre = 1.78, Running = 0.569, Post = 0.0353, Total = 2.39 
#> Fixed effects:
#>               mean    sd 0.025quant 0.5quant 0.975quant   mode kld
#> (Intercept) -0.316 0.561     -1.431   -0.316      0.798 -0.316   0
#> x            1.682 0.051      1.582    1.682      1.782  1.682   0
#> z            2.022 0.058      1.908    2.022      2.136  2.022   0
#> 
#> Random effects:
#>   Name     Model
#>     id IID model
#> 
#> Model hyperparameters:
#>                                          mean    sd 0.025quant 0.5quant 0.975quant  mode
#> Precision for the Gaussian observations 0.077 0.003      0.070    0.077      0.084 0.077
#> Precision for id                        0.402 0.185      0.145    0.367      0.856 0.304
#> 
#> Marginal log-Likelihood:  -2756.06 
#>  is computed 
#> Posterior summaries for the linear predictor and the fitted values are computed
#> (Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
# curve(dgamma(x, shape = 1.5, rate = 2), to = 2)

reff_model <- fit_inlamemi(formula_moi = y ~ x + z + 
                         f(id, model = "iid", hyper = list(prec = list(initial = -15, param = c(2, 2)))),
                       formula_imp = x ~ 1,
                       family_moi = "gaussian",
                       error_type = "classical",
                       data = reff_data,
                       initial.prec.moi = 1/4, 
                       initial.prec.classical = 1, 
                       initial.prec.imp = 1/4,
                       prior.prec.moi = c(1, 4),
                       prior.prec.classical = c(10, 10),
                       prior.prec.imp = c(1, 4),
                       prior.beta.error = c(0, 1/1000))

summary(reff_model)
#> Formula for model of interest: 
#> y ~ x + z + f(id, model = "iid", hyper = list(prec = list(initial = -15, 
#>     param = c(2, 2))))
#> 
#> Formula for imputation model: 
#> x ~ 1
#> 
#> Error types: 
#> [1] "classical"
#> 
#> Fixed effects for model of interest: 
#>              mean         sd 0.025quant   0.5quant 0.975quant       mode
#> beta.0 -0.3654658 0.54099872  -1.442753 -0.3655729  0.7122681 -0.3655812
#> beta.z  2.0222091 0.05799369   1.908457  2.0222124  2.1359427  2.0222124
#> 
#> Coefficient for variable with measurement error and/or missingness: 
#>            mean        sd 0.025quant 0.5quant 0.975quant     mode
#> beta.x 2.135726 0.1933915   1.762287 2.133274   2.523689 2.122805
#> 
#> Fixed effects for imputation model: 
#>                mean         sd  0.025quant  0.5quant 0.975quant      mode
#> alpha.x.0 0.1022765 0.07116924 -0.03730514 0.1022765   0.241858 0.1022765
#> 
#> Model hyperparameters (apart from beta.x): 
#>                                      mean         sd 0.025quant  0.5quant 0.975quant      mode
#> Precision for model of interest 0.1107281 0.02005615 0.07717868 0.1087293  0.1558824 0.1044915
#> Precision for x classical model 0.9855458 0.32673955 0.48467682 0.9388165  1.7557921 0.8527067
#> Precision for x imp model       0.2524987 0.02583589 0.20618260 0.2509454  0.3078048 0.2473811
#> Precision for id                0.4130612 0.16813712 0.16405049 0.3862682  0.8139083 0.3357995

reff.truth <- tibble::tribble(
  ~"variable", ~"value",
  "beta.x",  2, 
  "beta.z",  2, 
  "beta.0",  1, 
  "alpha.x.0", 0
)

plot(reff_model) +
    geom_point(data = reff.truth, aes(x = value))
plot of chunk unnamed-chunk-11

plot of chunk unnamed-chunk-11

Interaction effect with error variable

Error types Likelihood Response Covariate with error Other covariate(s)
Classical Gaussian \(y\) \(x\) categorical variable \(s\), interaction effect \(x:s\)

Interaction effects between the error prone variable and an (assumed) error-free variable can also be used. The syntax is as normal, an interaction effect between variables x and s would be specified in the formula for the model of interest as x:s.

Generating the data

set.seed(2024)
n <- 1000

# Covariate without error:
s <- c(rep(0, n/2), rep(1, n/2))

# Classical error:
x <- rnorm(n, mean = 0, sd = 2)
u_c <- rnorm(n, sd = 1)
x_obs <- x + u_c 

# Response:
beta.0 <- 1; beta.x.s <- 2
y <- beta.0 + beta.x*x + beta.x.s*x_obs*s + rnorm(n)

interact_data <- data.frame(y = y, x = x_obs, z = z, s = s)

Fitting the model

interact_model <- fit_inlamemi(formula_moi = y ~ x:s,
                             formula_imp = x ~ 1,
                             family_moi = "gaussian",
                             data = interact_data,
                             error_type = "classical",
                             prior.beta.error = c(0, 0.01),
                             prior.prec.moi = c(10, 9),
                             prior.prec.classical = c(10, 9),
                             prior.prec.imp = c(10, 8),
                             initial.prec.moi = 1,
                             initial.prec.classical = 1,
                             initial.prec.imp = 2)
summary(interact_model)
#> Formula for model of interest: 
#> y ~ x:s
#> 
#> Formula for imputation model: 
#> x ~ 1
#> 
#> Error types: 
#> [1] "classical"
#> 
#> Fixed effects for model of interest: 
#>              mean         sd 0.025quant    0.5quant 0.975quant        mode
#> beta.0  1.0081320 0.07959036  0.8520092  1.00813505  1.1642376  1.00813515
#> beta.s -0.0860376 0.13376390 -0.3484059 -0.08603286  0.1763038 -0.08603285
#> 
#> Coefficient for variable with measurement error and/or missingness: 
#>             mean         sd 0.025quant 0.5quant 0.975quant     mode
#> beta.sx 2.055685 0.06319152   1.931406 2.055642   2.180214 2.055465
#> beta.x  1.709263 0.04043440   1.629983 1.709152   1.789188 1.708691
#> 
#> Fixed effects for imputation model: 
#>                 mean         sd 0.025quant   0.5quant 0.975quant       mode
#> alpha.x.0 0.03481148 0.06850681 -0.0995513 0.03481148  0.1691743 0.03481148
#> 
#> Model hyperparameters (apart from beta.sx, beta.x): 
#>                                      mean         sd 0.025quant  0.5quant 0.975quant      mode
#> Precision for model of interest 0.4106343 0.03324269  0.3493282 0.4092127   0.480130 0.4062196
#> Precision for x classical model 4.0998178 0.44079619  3.2916129 4.0794275   5.024943 4.0451641
#> Precision for x imp model       0.2252613 0.01058527  0.2050650 0.2250392   0.246733 0.2246524
plot(interact_model)
plot of chunk unnamed-chunk-14

plot of chunk unnamed-chunk-14

Logistic regression with classical error and missing data

Error types Likelihood Response Covariate with error Other covariate(s)
Classical, missing Binomial \(y\) \(w\) \(z\)

Here we show how to fit a logistic regression model.

set.seed(1)
nn <- 1000

z <- rbinom(nn, 1, 0.5)
x <- rnorm(nn, 1-0.5*z, 1)

# Error
w <- x + rnorm(nn, 0, 1)

# Generating missing data, depending on z
eta <- -1 + z
prob_missing <- exp(eta)/(1 + exp(eta))
missing_index <- rbinom(nn, 1, prob_missing)

# Proportion missing:
sum(missing_index)/nn
#> [1] 0.393

# Replace the values in w by missing in case the index for missingness is =1:
w <- ifelse(missing_index==1, NA, w)

# Generate binomial response
eta <- x + z 
prob <- exp(eta)/(1 + exp(eta))
y <- rbinom(nn, 1, prob)

data_binom <- data.frame(x = x, z = z, y = y, w = w)

Modeling error and missing data:

model_binom <- fit_inlamemi(data = data_binom,
                       formula_moi = y ~ w + z,
                       formula_imp = w ~ z,
                       formula_mis = m ~ z,
                       family_moi = "binomial",
                       error_type = c("classical","missing"),
                       prior.prec.classical = c(10, 9),
                       prior.prec.imp = c(10, 9),
                       prior.beta.error = c(0, 1/1000),
                       initial.prec.classical = 2,
                       initial.prec.imp = 1)

summary(model_binom)
plot(model_binom)

Poisson regression with classical error and missing data

Error types Likelihood Response Covariate with error Other covariate(s)
Classical, missing Poisson \(y\) \(w\) \(z\)

Here we show how to fit a Poisson regression model, where the model of interest has a random intercept term.

set.seed(1)
nn <- 1000

z <- rbinom(nn, 1, 0.5)
x <- rnorm(nn, 1-0.5*z, 1)

# Error
w <- x + rnorm(nn, 0, 1)

# Generating missing data, depending on z
eta <- -1 + z
prob_missing <- exp(eta)/(1 + exp(eta))
missing_index <- rbinom(nn, 1, prob_missing)

# Proportion missing:
sum(missing_index)/nn
#> [1] 0.393

# Replace the values in w by missing in case the index for missingness is =1:
w <- ifelse(missing_index==1, NA, w)

# Random effect to include in the regression model:
re <- rep(rnorm(nn/20), each = 20)

# Linear predictor
eta <-  x + z + re  

# Generate Poisson response
y <- rpois(nn, exp(eta))

data_pois <- data.frame(x = x, z = z, y = y, w = w, id = rep(seq(1:50), each = 20))

Simple regression models with correct and error-prone variables to check the effect:

library(lme4)
summary(glmer(y ~ x + z + (1|id), data = data_pois, family="poisson"))$coef

summary(glmer(y ~ w + z + (1|id), data = data_pois, family="poisson"))$coef

Modeling error and missing data:

model_pois <- fit_inlamemi(data = data_pois,
                       formula_moi = y ~ w + z + f(id,model="iid"),
                       formula_imp = w ~ z,
                       formula_mis = m ~ z,
                       family_moi = "poisson",
                       error_type = c("classical","missing"),
                       prior.prec.classical = c(19, 9),
                       prior.prec.imp = c(10, 9),
                       prior.beta.error = c(0, 1/1000),
                       initial.prec.classical = 2,
                       initial.prec.imp = 1)

summary(model_pois)
plot(model_pois)