Starting from version 1.1.0, e2tree supports the
following tree ensemble backends in addition to
randomForest and ranger:
| Package | Model class | Task |
|---|---|---|
| xgboost | xgb.Booster | classification, regression |
| gbm | gbm | classification, regression |
| lightgbm | lgb.Booster | classification, regression |
| catboost | catboost.CatBoost / catboost.Model | classification, regression |
The workflow is identical regardless of the backend: train a model,
build the dissimilarity matrix with createDisMatrix(), then
call e2tree().
library(e2tree)
# requireNamespace() is the idiomatic availability check; require() attaches
# the package and returns FALSE instead of erroring, which hides failures.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost", repos = "https://cran.r-project.org")
}
library(xgboost)

data(iris)
set.seed(42)

# 75/25 train/validation split; `va` is the held-out rows (kept for later
# evaluation). Note: iris rownames are "1".."150", so the as.integer()
# round-trip recovers the original row indices.
n <- floor(0.75 * nrow(iris))
tr <- iris[sample(nrow(iris), n), ]
va <- iris[setdiff(seq_len(nrow(iris)), as.integer(rownames(tr))), ]

# XGBoost requires a numeric matrix and 0-indexed integer labels.
X_tr <- as.matrix(tr[, 1:4])
y_tr <- as.integer(tr$Species) - 1L
dm_tr <- xgb.DMatrix(data = X_tr, label = y_tr)

# Multiclass booster over the three iris species.
ensemble <- xgb.train(
  params = list(objective = "multi:softmax",
                num_class = 3,
                max_depth = 4,
                eta = 0.3),
  data = dm_tr,
  nrounds = 100,
  verbose = 0
)

# Attach the response back to the data.frame so the formula in e2tree()
# can find it; createDisMatrix() will use it to annotate the dissimilarity
# matrix (in classification, `label` is optional but recommended).
tr_xgb <- tr[, 1:4]
tr_xgb$Species <- tr$Species

D <- createDisMatrix(ensemble, data = tr_xgb, label = "Species",
                     parallel = list(active = FALSE, no_cores = 1))

setting <- list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5)
tree_xgb <- e2tree(Species ~ ., data = tr_xgb, D = D,
                   ensemble = ensemble, setting = setting)
print(tree_xgb)

For regression backends, createDisMatrix() needs the
response column to compute the dissimilarity scale. Pass the full data
frame (predictors plus response) and the name of the response column via
the label argument.
library(xgboost)
data(mtcars)
set.seed(42)

# Draw 75% of the rows for training.
n <- floor(0.75 * nrow(mtcars))
idx <- sample(nrow(mtcars), n)
tr <- mtcars[idx, ]

# mpg is the first column; every remaining column is a predictor.
X_tr <- as.matrix(tr[, -1])
y_tr <- tr$mpg
dm_tr <- xgb.DMatrix(data = X_tr, label = y_tr)

# Squared-error regression booster, 100 rounds, silent.
ensemble <- xgb.train(
  params = list(objective = "reg:squarederror", max_depth = 4, eta = 0.3),
  data = dm_tr,
  nrounds = 100,
  verbose = 0
)

# `data = tr` carries the response column too; the XGBoost adapter
# automatically trims the matrix to the features used at training time.
D <- createDisMatrix(
  ensemble,
  data = tr,
  label = "mpg",
  parallel = list(active = FALSE, no_cores = 1)
)
tree <- e2tree(
  mpg ~ .,
  data = tr,
  D = D,
  ensemble = ensemble,
  setting = list(impTotal = 0.1, maxDec = 1e-6, n = 2, level = 5)
)
print(tree)

gbm expects a 0/1 numeric response for the
bernoulli distribution, while e2tree expects a
factor response for classification. We therefore train gbm
on the integer column and pass a factor copy of the same column to
e2tree.
# requireNamespace() avoids the require()-for-loading anti-pattern
# (require() returns FALSE on failure instead of erroring).
if (!requireNamespace("gbm", quietly = TRUE)) {
  install.packages("gbm", repos = "https://cran.r-project.org")
}
library(gbm)

data(iris)
set.seed(42)

# gbm's bernoulli distribution needs a 0/1 numeric response, while e2tree
# expects a factor for classification, so keep both encodings of the same
# column side by side.
df <- iris
df$is_setosa <- as.integer(df$Species == "setosa")
df$is_setosa_fct <- factor(df$is_setosa, levels = c(0L, 1L))

n <- floor(0.75 * nrow(df))
tr <- df[sample(nrow(df), n), ]

# Train gbm on the integer column.
ensemble <- gbm(is_setosa ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
                data = tr,
                distribution = "bernoulli",
                n.trees = 200,
                interaction.depth = 4,
                verbose = FALSE)

# Hand the factor copy of the response to createDisMatrix()/e2tree(),
# restricting `data` to the four predictors plus the factor label so it
# matches the formula below.
D <- createDisMatrix(ensemble,
                     data = tr[, c("Sepal.Length", "Sepal.Width",
                                   "Petal.Length", "Petal.Width",
                                   "is_setosa_fct")],
                     label = "is_setosa_fct",
                     parallel = list(active = FALSE, no_cores = 1))
tree <- e2tree(is_setosa_fct ~ Sepal.Length + Sepal.Width +
                 Petal.Length + Petal.Width,
               data = tr, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5))
print(tree)

gbm requires
nTrain * bag.fraction > 2 * n.minobsinnode + 1, which
fails on small training sets such as 24-row mtcars with the
default settings. Lower n.minobsinnode and raise
bag.fraction to keep the example self-contained.
library(gbm)
data(mtcars)
set.seed(42)

# Train on a 75% subsample of mtcars (24 of 32 rows).
n <- floor(0.75 * nrow(mtcars))
idx <- sample(nrow(mtcars), n)
tr <- mtcars[idx, ]

# With only 24 training rows, gbm's constraint
#   nTrain * bag.fraction > 2 * n.minobsinnode + 1
# forces a small n.minobsinnode and a larger bag.fraction.
ensemble <- gbm(
  mpg ~ .,
  data = tr,
  distribution = "gaussian",
  n.trees = 200,
  interaction.depth = 4,
  n.minobsinnode = 2,
  bag.fraction = 0.8,
  verbose = FALSE
)

D <- createDisMatrix(
  ensemble,
  data = tr,
  label = "mpg",
  parallel = list(active = FALSE, no_cores = 1)
)
tree <- e2tree(
  mpg ~ .,
  data = tr,
  D = D,
  ensemble = ensemble,
  setting = list(impTotal = 0.1, maxDec = 1e-6, n = 2, level = 5)
)
print(tree)

if (!require("lightgbm")) install.packages("lightgbm",
repos="https://cran.r-project.org")
library(lightgbm)
data(iris)
set.seed(42)

# 75% training subsample.
n <- floor(0.75 * nrow(iris))
tr <- iris[sample(nrow(iris), n), ]

# LightGBM wants a numeric feature matrix and 0-based integer class labels.
feat <- as.matrix(tr[, 1:4])
lab0 <- as.integer(tr$Species) - 1L
ds <- lgb.Dataset(feat, label = lab0)

# Three-class multiclass booster, trained silently.
ensemble <- lgb.train(
  params = list(objective = "multiclass",
                num_class = 3,
                num_leaves = 15,
                verbose = -1),
  data = ds,
  nrounds = 100
)

# Re-attach the factor response so the e2tree() formula can see it.
tr_lgb <- cbind(tr[, 1:4], Species = tr$Species)

D <- createDisMatrix(ensemble, data = tr_lgb, label = "Species",
                     parallel = list(active = FALSE, no_cores = 1))
tree <- e2tree(Species ~ ., data = tr_lgb, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5))
print(tree)

library(lightgbm)
data(mtcars)
set.seed(42)

# 75% of mtcars for training.
n <- floor(0.75 * nrow(mtcars))
tr <- mtcars[sample(nrow(mtcars), n), ]

# Features are every column except mpg; mpg is the regression target.
X_tr <- as.matrix(tr[, -1])
y_tr <- tr$mpg
ds <- lgb.Dataset(X_tr, label = y_tr)

# Small num_leaves / min_data_in_leaf so trees can still split on 24 rows.
ensemble <- lgb.train(
  params = list(objective = "regression",
                num_leaves = 8,
                min_data_in_leaf = 2,
                learning_rate = 0.1,
                verbose = -1),
  data = ds,
  nrounds = 200
)

# Pass the response column to createDisMatrix() via `label`. The
# LightGBM adapter selects the columns it needs through the booster's
# stored feature names, so any extra columns in `data` are ignored.
D <- createDisMatrix(
  ensemble,
  data = tr,
  label = "mpg",
  parallel = list(active = FALSE, no_cores = 1)
)
tree <- e2tree(
  mpg ~ ., data = tr, D = D, ensemble = ensemble,
  setting = list(impTotal = 0.1, maxDec = 1e-6, n = 2, level = 5)
)
print(tree)

To support a further model class MyEnsemble, implement
three S3 methods and register them in NAMESPACE:
# In R/adapters.R (or a separate R/adapter_mymodel.R)
# The whole adapter is three S3 methods dispatched on the model's class;
# register each in NAMESPACE so e2tree can dispatch to them.
# Report which task the fitted ensemble solves.
get_ensemble_type.MyEnsemble <- function(ensemble) {
# return "classification" or "regression"
}
# One row per observation in `data`, one column per tree in the ensemble.
extract_terminal_nodes.MyEnsemble <- function(ensemble, data) {
# return data.frame of (n_obs × n_trees) terminal node IDs
}
# NOTE(review): `type` presumably follows predict()-style conventions —
# confirm against the existing randomForest/ranger adapters.
get_ensemble_predictions.MyEnsemble <- function(ensemble, data, type) {
# return numeric vector of length n_obs
}

No changes to createDisMatrix(), e2tree(),
or any other core function are required.