Starting from version 1.1.0, e2tree supports the
following tree ensemble backends in addition to
randomForest and ranger:
| Package | Model class | Task |
|---|---|---|
| xgboost | xgb.Booster | classification, regression |
| gbm | gbm | classification, regression |
| lightgbm | lgb.Booster | classification, regression |
| catboost | catboost.CatBoost / catboost.Model | classification, regression |
The workflow is identical regardless of the backend: train a model,
build the dissimilarity matrix with createDisMatrix(), then
call e2tree().
library(e2tree)
# requireNamespace() is the idiomatic availability check; require() attaches
# the package and returns FALSE instead of erroring, which hides failures.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost", repos = "https://cran.r-project.org")
}
library(xgboost)

data(iris)
set.seed(42)

# 75/25 train/validation split; `va` is the held-out rows (kept for later
# evaluation). Note: iris rownames are "1".."150", so the as.integer()
# round-trip recovers the original row indices.
n <- floor(0.75 * nrow(iris))
tr <- iris[sample(nrow(iris), n), ]
va <- iris[setdiff(seq_len(nrow(iris)), as.integer(rownames(tr))), ]

# XGBoost requires a numeric matrix and 0-indexed integer labels.
X_tr <- as.matrix(tr[, 1:4])
y_tr <- as.integer(tr$Species) - 1L
dm_tr <- xgb.DMatrix(data = X_tr, label = y_tr)

# Multiclass booster over the three iris species.
ensemble <- xgb.train(
  params = list(objective = "multi:softmax",
                num_class = 3,
                max_depth = 4,
                eta = 0.3),
  data = dm_tr,
  nrounds = 100,
  verbose = 0
)

# Attach the response back to the data.frame so the formula in e2tree()
# can find it; createDisMatrix() will use it to annotate the dissimilarity
# matrix (in classification, `label` is optional but recommended).
tr_xgb <- tr[, 1:4]
tr_xgb$Species <- tr$Species

D <- createDisMatrix(ensemble, data = tr_xgb, label = "Species",
                     parallel = list(active = FALSE, no_cores = 1))

setting <- list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5)
tree_xgb <- e2tree(Species ~ ., data = tr_xgb, D = D,
                   ensemble = ensemble, setting = setting)
print(tree_xgb)

For regression backends, createDisMatrix() needs the
response column to compute the dissimilarity scale. Pass the full data
frame (predictors plus response) and the name of the response column via
the label argument.
library(xgboost)
data(mtcars)
set.seed(42)

# Draw 75% of the rows for training.
n <- floor(0.75 * nrow(mtcars))
idx <- sample(nrow(mtcars), n)
tr <- mtcars[idx, ]

# mpg is the first column; every remaining column is a predictor.
X_tr <- as.matrix(tr[, -1])
y_tr <- tr$mpg
dm_tr <- xgb.DMatrix(data = X_tr, label = y_tr)

# Squared-error regression booster, 100 rounds, silent.
ensemble <- xgb.train(
  params = list(objective = "reg:squarederror", max_depth = 4, eta = 0.3),
  data = dm_tr,
  nrounds = 100,
  verbose = 0
)

# `data = tr` carries the response column too; the XGBoost adapter
# automatically trims the matrix to the features used at training time.
D <- createDisMatrix(
  ensemble,
  data = tr,
  label = "mpg",
  parallel = list(active = FALSE, no_cores = 1)
)
tree <- e2tree(
  mpg ~ .,
  data = tr,
  D = D,
  ensemble = ensemble,
  setting = list(impTotal = 0.1, maxDec = 1e-6, n = 2, level = 5)
)
print(tree)

gbm expects a 0/1 numeric response for the
bernoulli distribution, while e2tree expects a
factor response for classification. We therefore train gbm
on the integer column and pass a factor copy of the same column to
e2tree.
# requireNamespace() avoids the require()-for-loading anti-pattern
# (require() returns FALSE on failure instead of erroring).
if (!requireNamespace("gbm", quietly = TRUE)) {
  install.packages("gbm", repos = "https://cran.r-project.org")
}
library(gbm)

data(iris)
set.seed(42)

# gbm's bernoulli distribution needs a 0/1 numeric response, while e2tree
# expects a factor for classification, so keep both encodings of the same
# column side by side.
df <- iris
df$is_setosa <- as.integer(df$Species == "setosa")
df$is_setosa_fct <- factor(df$is_setosa, levels = c(0L, 1L))

n <- floor(0.75 * nrow(df))
tr <- df[sample(nrow(df), n), ]

# Train gbm on the integer column.
ensemble <- gbm(is_setosa ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
                data = tr,
                distribution = "bernoulli",
                n.trees = 200,
                interaction.depth = 4,
                verbose = FALSE)

# Hand the factor copy of the response to createDisMatrix()/e2tree(),
# restricting `data` to the four predictors plus the factor label so it
# matches the formula below.
D <- createDisMatrix(ensemble,
                     data = tr[, c("Sepal.Length", "Sepal.Width",
                                   "Petal.Length", "Petal.Width",
                                   "is_setosa_fct")],
                     label = "is_setosa_fct",
                     parallel = list(active = FALSE, no_cores = 1))
tree <- e2tree(is_setosa_fct ~ Sepal.Length + Sepal.Width +
                 Petal.Length + Petal.Width,
               data = tr, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5))
print(tree)

gbm requires
nTrain * bag.fraction > 2 * n.minobsinnode + 1, which
fails on small training sets such as 24-row mtcars with the
default settings. Lower n.minobsinnode and raise
bag.fraction to keep the example self-contained.
library(gbm)
data(mtcars)
set.seed(42)

# Train on a 75% subsample of mtcars (24 of 32 rows).
n <- floor(0.75 * nrow(mtcars))
idx <- sample(nrow(mtcars), n)
tr <- mtcars[idx, ]

# With only 24 training rows, gbm's constraint
#   nTrain * bag.fraction > 2 * n.minobsinnode + 1
# forces a small n.minobsinnode and a larger bag.fraction.
ensemble <- gbm(
  mpg ~ .,
  data = tr,
  distribution = "gaussian",
  n.trees = 200,
  interaction.depth = 4,
  n.minobsinnode = 2,
  bag.fraction = 0.8,
  verbose = FALSE
)

D <- createDisMatrix(
  ensemble,
  data = tr,
  label = "mpg",
  parallel = list(active = FALSE, no_cores = 1)
)
tree <- e2tree(
  mpg ~ .,
  data = tr,
  D = D,
  ensemble = ensemble,
  setting = list(impTotal = 0.1, maxDec = 1e-6, n = 2, level = 5)
)
print(tree)

if (!require("lightgbm")) install.packages("lightgbm",
repos="https://cran.r-project.org")
library(lightgbm)
data(iris)
set.seed(42)

# 75% training subsample.
n <- floor(0.75 * nrow(iris))
tr <- iris[sample(nrow(iris), n), ]

# LightGBM wants a numeric feature matrix and 0-based integer class labels.
feat <- as.matrix(tr[, 1:4])
lab0 <- as.integer(tr$Species) - 1L
ds <- lgb.Dataset(feat, label = lab0)

# Three-class multiclass booster, trained silently.
ensemble <- lgb.train(
  params = list(objective = "multiclass",
                num_class = 3,
                num_leaves = 15,
                verbose = -1),
  data = ds,
  nrounds = 100
)

# Re-attach the factor response so the e2tree() formula can see it.
tr_lgb <- cbind(tr[, 1:4], Species = tr$Species)

D <- createDisMatrix(ensemble, data = tr_lgb, label = "Species",
                     parallel = list(active = FALSE, no_cores = 1))
tree <- e2tree(Species ~ ., data = tr_lgb, D = D, ensemble = ensemble,
               setting = list(impTotal = 0.1, maxDec = 0.01, n = 2, level = 5))
print(tree)

library(lightgbm)
data(mtcars)
set.seed(42)

# 75% of mtcars for training.
n <- floor(0.75 * nrow(mtcars))
tr <- mtcars[sample(nrow(mtcars), n), ]

# Features are every column except mpg; mpg is the regression target.
X_tr <- as.matrix(tr[, -1])
y_tr <- tr$mpg
ds <- lgb.Dataset(X_tr, label = y_tr)

# Small num_leaves / min_data_in_leaf so trees can still split on 24 rows.
ensemble <- lgb.train(
  params = list(objective = "regression",
                num_leaves = 8,
                min_data_in_leaf = 2,
                learning_rate = 0.1,
                verbose = -1),
  data = ds,
  nrounds = 200
)

# Pass the response column to createDisMatrix() via `label`. The
# LightGBM adapter selects the columns it needs through the booster's
# stored feature names, so any extra columns in `data` are ignored.
D <- createDisMatrix(
  ensemble,
  data = tr,
  label = "mpg",
  parallel = list(active = FALSE, no_cores = 1)
)
tree <- e2tree(
  mpg ~ ., data = tr, D = D, ensemble = ensemble,
  setting = list(impTotal = 0.1, maxDec = 1e-6, n = 2, level = 5)
)
print(tree)

To support a further model class MyEnsemble, implement
three S3 methods and register them in NAMESPACE:
# In R/adapters.R (or a separate R/adapter_mymodel.R)
# The whole adapter is three S3 methods dispatched on the model's class;
# register each in NAMESPACE so e2tree can dispatch to them.
# Report which task the fitted ensemble solves.
get_ensemble_type.MyEnsemble <- function(ensemble) {
# return "classification" or "regression"
}
# One row per observation in `data`, one column per tree in the ensemble.
extract_terminal_nodes.MyEnsemble <- function(ensemble, data) {
# return data.frame of (n_obs × n_trees) terminal node IDs
}
# NOTE(review): `type` presumably follows predict()-style conventions —
# confirm against the existing randomForest/ranger adapters.
get_ensemble_predictions.MyEnsemble <- function(ensemble, data, type) {
# return numeric vector of length n_obs
}

No changes to createDisMatrix(), e2tree(),
or any other core function are required.