mlspatial:Spatial Machine Learning
workflow
olddir <- getwd()
setwd(tempdir())
setwd(olddir)
oldopt <- options(digits = 4)
options(oldopt)
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(mlspatial)
library(dplyr)
library(ggplot2)
library(tmap)
library(sf)
library(spdep)
library(randomForest)
library(xgboost)
library(e1071)
library(caret)
library(tidyr)
# Quick thematic map with country labels
plot(africa_shp)


qtm(africa_shp, text = "NAME") # Replace with actual field name

tm_shape(africa_shp) +
tm_polygons() +
tm_text("NAME", size = 0.5) + # Replace with correct column
tm_title ("Africa Countries")

ggplot(africa_shp) +
geom_sf(fill = "lightgray", color = "black") +
geom_sf_text(aes(label = NAME), size = 2) + # Replace as needed
ggtitle("Africa countries") +
theme_minimal()

# Join data
mapdata <- join_data(africa_shp, panc_incidence, by = "NAME")
## OR Joining/ merging my data and shapefiles
mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME")
## OR mapdata <- left_join(nat, codata, by = "DISTRICT_N")
str(mapdata)
#> Classes 'sf' and 'data.frame': 53 obs. of 26 variables:
#> $ OBJECTID : int 2 3 5 6 7 8 9 10 11 12 ...
#> $ FIPS_CNTRY: chr "UV" "CV" "GA" "GH" ...
#> $ ISO_2DIGIT: chr "BF" "CV" "GM" "GH" ...
#> $ ISO_3DIGIT: chr "BFA" "CPV" "GMB" "GHA" ...
#> $ NAME : chr "Burkina Faso" "Cabo Verde" "Gambia" "Ghana" ...
#> $ COUNTRYAFF: chr "Burkina Faso" "Cabo Verde" "Gambia" "Ghana" ...
#> $ CONTINENT : chr "Africa" "Africa" "Africa" "Africa" ...
#> $ TOTPOP : int 20107509 560899 2051363 27499924 12413867 1792338 4689021 17885245 3758571 33986655 ...
#> $ incidence : num 330.4 53.4 31.4 856.3 163.1 ...
#> $ female : num 1683 362 140 4566 375 ...
#> $ male : num 1869 211 197 4640 1378 ...
#> $ ageb : num 669.7 93.7 68.7 2047 336.7 ...
#> $ agec : num 2878 480 268 7147 1414 ...
#> $ agea : num 4.597 0.265 0.718 11.888 2.13 ...
#> $ fageb : num 250.3 40.2 23.1 782 59.1 ...
#> $ fagec : num 1429 322 116 3775 315 ...
#> $ fagea : num 3.413 0.146 0.548 8.816 1.228 ...
#> $ mageb : num 419.5 53.5 45.6 1265 277.6 ...
#> $ magec : num 1448 158 152 3372 1100 ...
#> $ magea : num 1.184 0.12 0.17 3.073 0.902 ...
#> $ yra : num 182.4 30.2 16.6 524.7 73.1 ...
#> $ yrb : num 187.2 34.1 17.1 552.6 74.9 ...
#> $ yrc : num 193.1 35 18 578.5 76.9 ...
#> $ yrd : num 198.5 35.9 18.3 602.7 78.6 ...
#> $ yre : num 204.3 36.5 18.7 621.5 79.4 ...
#> $ geometry :sfc_MULTIPOLYGON of length 53; first list element: List of 1
#> ..$ :List of 1
#> .. ..$ : num [1:317, 1:2] 102188 90385 80645 74151 70224 ...
#> ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
#> - attr(*, "sf_column")= chr "geometry"
#> - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
#> ..- attr(*, "names")= chr [1:25] "OBJECTID" "FIPS_CNTRY" "ISO_2DIGIT" "ISO_3DIGIT" ...
#Visualize Pancreatic cancer Incidence by countries
#Basic map with labels
# quantile map
p1 <- tm_shape(mapdata) +
tm_fill("incidence", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Incidence")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p2 <- tm_shape(mapdata) +
tm_fill("female", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Female")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p3 <- tm_shape(mapdata) +
tm_fill("male", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Male")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p4 <- tm_shape(mapdata) +
tm_fill("ageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Age:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p5 <- tm_shape(mapdata) +
tm_fill("agec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Age:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p6 <- tm_shape(mapdata) +
tm_fill("agea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Age:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p7 <- tm_shape(mapdata) +
tm_fill("fageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Female:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p8 <- tm_shape(mapdata) +
tm_fill("fagec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Female:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p9 <- tm_shape(mapdata) +
tm_fill("fagea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Female:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p10 <- tm_shape(mapdata) +
tm_fill("mageb", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Male:20-54yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p11 <- tm_shape(mapdata) +
tm_fill("magec", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Male:55+yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
p12 <- tm_shape(mapdata) +
tm_fill("magea", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Male:<20yrs")) + tm_borders(fill_alpha = .3) + tm_compass() +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"), frame = TRUE, component.autoscale = FALSE)
current.mode <- tmap_mode("plot")
tmap_arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12,
widths = c(.75, .75))

## Machine Learning Model building
# 1. Random Forest Regression
# Train Random Forest
set.seed(123)
rf_model <- randomForest(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, ntree = 500,
importance = TRUE)
# View model results
print(rf_model)
#>
#> Call:
#> randomForest(formula = incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata, ntree = 500, importance = TRUE)
#> Type of random forest: regression
#> Number of trees: 500
#> No. of variables tried at each split: 5
#>
#> Mean of squared residuals: 76299.33
#> % Var explained: 91.35
varImpPlot(rf_model)

#Plot Variable Importance
library(ggplot2)
importance_df <- data.frame(
Variable = rownames(importance(rf_model)),
Importance = importance(rf_model)[, "IncNodePurity"])
ggplot(importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Variable Importance (IncNodePurity)", x = "Variable", y = "Importance")

# 2. Gradient Boosting Machine (XGBoost)
# Prepare the data
x_vars <- model.matrix(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata)[,-1]
y <- mapdata$incidence
# Convert to DMatrix
dtrain <- xgb.DMatrix(data = x_vars, label = y)
# Train model
# Train model using xgb.train()
params <- list(
objective = "reg:squarederror",
max_depth = 4,
learning_rate = 0.1,
verbosity = 0
)
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 100
)
# Feature importance
xgb.importance(model = xgb_model)
#> Feature Gain Cover Frequency
#> <char> <num> <num> <num>
#> 1: female 9.907517e-01 0.604198127 0.614457831
#> 2: agec 5.419284e-03 0.084314910 0.105421687
#> 3: ageb 2.267636e-03 0.113560858 0.105421687
#> 4: male 1.553178e-03 0.114869626 0.106927711
#> 5: mageb 5.295916e-06 0.013188362 0.013554217
#> 6: fagec 1.610951e-06 0.019832880 0.016566265
#> 7: yrd 4.180083e-07 0.007449914 0.006024096
#> 8: agea 4.159760e-07 0.018222088 0.013554217
#> 9: magea 2.949594e-07 0.015201852 0.010542169
#> 10: fageb 7.749886e-08 0.003976644 0.003012048
#> 11: yrb 4.104050e-08 0.003070573 0.003012048
#> 12: fagea 3.490967e-09 0.002114165 0.001506024
# 3. Support Vector Regression (SVR)
# Train SVR model
svr_model <- svm(incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre, data = mapdata,
type = "eps-regression",
kernel = "radial")
# Summary and predictions
summary(svr_model)
#>
#> Call:
#> svm(formula = incidence ~ female + male + agea + ageb + agec + fagea +
#> fageb + fagec + magea + mageb + magec + yrb + yrc + yrd + yre,
#> data = mapdata, type = "eps-regression", kernel = "radial")
#>
#>
#> Parameters:
#> SVM-Type: eps-regression
#> SVM-Kernel: radial
#> cost: 1
#> gamma: 0.06666667
#> epsilon: 0.1
#>
#>
#> Number of Support Vectors: 7
#mapdata$pred_svr <- predict(svr_model)
# Model Evaluation (predictions):
# evaluate each model step-by-step:
# 1. Random Forest Evaluation
rf_preds <- predict(rf_model, newdata = mapdata)
rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence)
print(rf_metrics)
#> RMSE Rsquared MAE
#> 98.2197100 0.9976235 30.2558297
# 2. XGBoost Evaluation
xgb_preds <- predict(xgb_model, newdata = x_vars)
xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence)
print(xgb_metrics)
#> RMSE Rsquared MAE
#> 2.1004543 0.9999983 0.7290456
# 3. SVR Evaluation
svr_preds <- predict(svr_model, newdata = mapdata)
svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence)
print(svr_metrics)
#> RMSE Rsquared MAE
#> 445.9372342 0.9178101 127.7105534
# Compare All Models
model_eval <- data.frame(
Model = c("Random Forest", "XGBoost", "SVR"),
RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]),
MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]),
Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"]))
print(model_eval)
#> Model RMSE MAE Rsquared
#> 1 Random Forest 98.219710 30.2558297 0.9976235
#> 2 XGBoost 2.100454 0.7290456 0.9999983
#> 3 SVR 445.937234 127.7105534 0.9178101
#Choosing the Best Model
#Lowest RMSE and MAE = most accurate predictions
#Highest R² = best variance explanation
#Sort and interpret:
model_eval[order(model_eval$RMSE), ] # Best = Top row
#> Model RMSE MAE Rsquared
#> 2 XGBoost 2.100454 0.7290456 0.9999983
#> 1 Random Forest 98.219710 30.2558297 0.9976235
#> 3 SVR 445.937234 127.7105534 0.9178101
#### Models Predicted plots
# Create a data frame from your table
model_preds <- data.frame(rf_preds, xgb_preds, svr_preds)
# Add observation ID
model_preds$ID <- 1:nrow(model_preds)
# Reshape for plotting
model_long <- model_preds %>%
tidyr::pivot_longer(cols = c("rf_preds", "xgb_preds", "svr_preds"), names_to = "Model", values_to = "Predicted")
# Plot
ggplot(model_long, aes(x = ID, y = Predicted, color = Model)) +
geom_line(linewidth = 0.5) +
labs(title = "Model Predictions Over Observations",
x = "Observation", y = "Predicted Incidence") +
theme_minimal()

## plot actual vs predicted
oldpar <- par(mfrow = c(1, 3)) # 3 plots side-by-side
# Random Forest
plot(mapdata$incidence, mapdata$rf_pred,
xlab = "Observed", ylab = "Predicted",
main = "Random Forest", pch = 19, col = "steelblue")
abline(0, 1, col = "red", lwd = 2)
# XGBoost
plot(mapdata$incidence, mapdata$xgb_pred,
xlab = "Observed", ylab = "Predicted",
main = "XGBoost", pch = 19, col = "darkgreen")
abline(0, 1, col = "red", lwd = 2)
# SVR
plot(mapdata$incidence, mapdata$svr_pred,
xlab = "Observed", ylab = "Predicted",
main = "SVR", pch = 19, col = "purple")
abline(0, 1, col = "red", lwd = 2)

par(oldpar)
## Actual vs predicted plot with correlations
library(ggplot2)
library(ggpubr) # For stat_cor
mapdata$rf_pred <- predict(rf_model)
mapdata$xgb_pred <- predict(xgb_model, newdata = x_vars)
mapdata$svr_pred <- predict(svr_model, newdata = mapdata)
# Random Forest Plot
p1 <- ggplot(mapdata, aes(x = incidence, y = rf_pred)) +
geom_point(color = "steelblue", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
stat_cor(method = "pearson", aes(label = paste0("R² = ")), label.x = 0) +
labs(title = "Random Forest", x = "Observed Incidence", y = "Predicted Incidence") +
theme_minimal()
# XGBoost Plot
p2 <- ggplot(mapdata, aes(x = incidence, y = xgb_pred)) +
geom_point(color = "darkgreen", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
stat_cor(method = "pearson", aes(label = paste0("R² = ")), label.x = 0) +
labs(title = "XGBoost", x = "Observed Incidence", y = "Predicted Incidence") +
theme_minimal()
# SVR Plot
p3 <- ggplot(mapdata, aes(x = incidence, y = svr_pred)) +
geom_point(color = "purple", alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
stat_cor(method = "pearson", aes(label = paste0("R² = ")), label.x = 0) +
labs(title = "SVR", x = "Observed Incidence", y = "Predicted Incidence") +
theme_minimal()
combined_plot <- ggarrange(p1, p2, p3, ncol = 3, nrow = 1, common.legend = FALSE)
## CROSS-VALIDATION
#Step 1: Prepare common setup
# Set seed for reproducibility
set.seed(123)
library(caret)
# Define 5-fold cross-validation
cv_control <- trainControl(method = "cv", number = 3)
# 1. Random Forest
library(randomForest)
rf_cv <- train(
incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre,
data = mapdata,
method = "rf",
trControl = cv_control,
tuneLength = 3,
importance = TRUE
)
print(rf_cv)
#> Random Forest
#>
#> 53 samples
#> 16 predictors
#>
#> No pre-processing
#> Resampling: Cross-Validated (3 fold)
#> Summary of sample sizes: 36, 34, 36
#> Resampling results across tuning parameters:
#>
#> mtry RMSE Rsquared MAE
#> 2 444.8157 0.8580149 161.2283
#> 8 450.7475 0.8548550 163.6862
#> 15 454.0229 0.8542215 165.2292
#>
#> RMSE was used to select the optimal model using the smallest value.
#> The final value used for the model was mtry = 2.
# 2. XGBoost
library(xgboost)
mapdata <- st_drop_geometry(mapdata)
mapdata$incidence <- as.numeric(mapdata$incidence)
cv_control <- trainControl(
method = "cv",
number = 5
)
# XGBoost with caret
xgb_cv <- xgb.cv(
params = params,
data = dtrain,
nrounds = 100,
nfold = 5, # 5-fold CV
early_stopping_rounds = 10, # stop if no improvement in 10 rounds
metrics = list("rmse"),
verbose = 0
)
print(xgb_cv)
#> ##### xgb.cv 5-folds
#> iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#> <int> <num> <num> <num> <num>
#> 1 869.239393 127.3707020 790.2016 556.6500
#> 2 811.895853 113.6967223 753.6544 548.5258
#> 3 758.695958 101.1327812 719.3175 541.3886
#> 4 709.258868 89.5985897 687.8282 534.0328
#> 5 663.309677 79.0397616 658.7448 527.2620
#> 6 620.545640 69.4028269 632.3349 520.5086
#> 7 580.734477 60.6017161 608.0614 514.3051
#> 8 543.630439 52.5890340 585.6143 508.4693
#> 9 509.061361 45.2924036 565.3273 502.8183
#> 10 476.811169 38.6662221 545.6182 498.5917
#> 11 446.728563 32.6587860 527.4934 495.3552
#> 12 418.645545 27.2396970 510.8609 491.8151
#> 13 392.428753 22.3383309 495.3064 488.7539
#> 14 367.936479 17.9487507 480.7644 485.8850
#> 15 345.050737 14.0417407 467.7380 482.9319
#> 16 323.657820 10.5989823 455.5772 480.3232
#> 17 303.658221 7.6468438 444.2999 478.0095
#> 18 284.949110 5.3008448 433.7681 476.0269
#> 19 267.446830 3.9041583 423.8386 474.4026
#> 20 251.072552 3.8834340 414.6807 472.9013
#> 21 235.748441 4.8799317 406.5349 471.3381
#> 22 221.402044 6.2001606 398.7159 470.1287
#> 23 207.972113 7.5265587 391.4943 469.2266
#> 24 195.395832 8.7500043 384.7359 468.2956
#> 25 183.618137 9.8448664 378.4595 467.4727
#> 26 172.579433 10.8036044 372.8544 466.6354
#> 27 162.233649 11.6317973 367.5725 465.6864
#> 28 152.531762 12.3298351 362.3877 464.9543
#> 29 143.434085 12.9128515 357.7527 464.1551
#> 30 134.899409 13.3869674 353.4512 463.4164
#> 31 126.894472 13.7625645 349.4280 462.7497
#> 32 119.383493 14.0499628 345.6474 462.1675
#> 33 112.336085 14.2548206 342.1012 461.6551
#> 34 105.721359 14.3870126 338.8096 461.1720
#> 35 99.516489 14.4496212 335.7164 460.7433
#> 36 93.683744 14.4612375 332.8313 460.3522
#> 37 88.207532 14.4191607 330.1307 460.0082
#> 38 83.066339 14.3300654 327.5992 459.7049
#> 39 78.239213 14.2001212 325.2061 459.4532
#> 40 73.706952 14.0344257 322.9586 459.2320
#> 41 69.453772 13.8329782 320.8131 459.0585
#> 42 65.455340 13.6042703 318.8138 458.8997
#> 43 61.694388 13.3553540 316.8836 458.7848
#> 44 58.161545 13.0847456 315.0938 458.6745
#> 45 54.843352 12.7942086 313.4106 458.5800
#> 46 51.727013 12.4870438 311.8410 458.4893
#> 47 48.800109 12.1668617 310.3461 458.4239
#> 48 46.050852 11.8341907 308.9519 458.3600
#> 49 43.470091 11.4911414 307.6304 458.3118
#> 50 41.045067 11.1414634 306.3959 458.2640
#> 51 38.766216 10.7874114 305.2391 458.2214
#> 52 36.628218 10.4270416 304.1478 458.1830
#> 53 34.619590 10.0648898 303.1232 458.1492
#> 54 32.735149 9.7000239 302.1662 458.1140
#> 55 30.967443 9.3330799 301.2729 458.0778
#> 56 29.310898 8.9641345 300.4338 458.0435
#> 57 27.756956 8.5961935 299.6458 458.0095
#> 58 26.298972 8.2304126 298.9090 457.9752
#> 59 24.933027 7.8658598 298.2208 457.9401
#> 60 23.653849 7.5020737 297.5853 457.8992
#> 61 22.451003 7.1460083 296.9914 457.8580
#> 62 21.309321 6.8065028 296.4383 457.8143
#> 63 20.226651 6.4820813 295.9174 457.7738
#> 64 19.202495 6.1707325 295.3967 457.7511
#> 65 18.228768 5.8757562 294.9482 457.7059
#> 66 17.310864 5.5886756 294.4956 457.6777
#> 67 16.437824 5.3168546 294.0978 457.6350
#> 68 15.614714 5.0534595 293.7217 457.5942
#> 69 14.832464 4.8022642 293.3740 457.5513
#> 70 14.086469 4.5660667 293.0471 457.5093
#> 71 13.381798 4.3394510 292.7398 457.4681
#> 72 12.712208 4.1234158 292.4562 457.4248
#> 73 12.076774 3.9185987 292.1869 457.3840
#> 74 11.472889 3.7233422 291.9417 457.3399
#> 75 10.899541 3.5380380 291.7111 457.2972
#> 76 10.355206 3.3622312 291.4919 457.2569
#> 77 9.837891 3.1947687 291.2877 457.2165
#> 78 9.344513 3.0377197 291.1026 457.1739
#> 79 8.878307 2.8866616 290.9245 457.1346
#> 80 8.435075 2.7427897 290.7651 457.0920
#> 81 8.014227 2.6064334 290.6128 457.0525
#> 82 7.614482 2.4769421 290.4711 457.0131
#> 83 7.234784 2.3536897 290.3374 456.9753
#> 84 6.873823 2.2364538 290.2146 456.9371
#> 85 6.531273 2.1254459 290.0973 456.9014
#> 86 6.205946 2.0200509 289.9841 456.8685
#> 87 5.896936 1.9198917 289.8765 456.8371
#> 88 5.603172 1.8247213 289.7767 456.8061
#> 89 5.324408 1.7343520 289.6757 456.7801
#> 90 5.059483 1.6487059 289.5910 456.7492
#> 91 4.808006 1.5671828 289.5027 456.7241
#> 92 4.569061 1.4898766 289.4191 456.7002
#> 93 4.342314 1.4165729 289.3420 456.6764
#> 94 4.126835 1.3469663 289.2688 456.6538
#> 95 3.922361 1.2809817 289.1989 456.6325
#> 96 3.728132 1.2182728 289.1344 456.6112
#> 97 3.543669 1.1588310 289.0731 456.5911
#> 98 3.368373 1.1020500 289.0163 456.5712
#> 99 3.202143 1.0484843 288.9618 456.5526
#> 100 3.044323 0.9976821 288.9115 456.5340
#> iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#> Best iteration:
#> iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
#> <int> <num> <num> <num> <num>
#> 100 3.044323 0.9976821 288.9115 456.534
best_nrounds <- xgb_cv$best_iteration
cat("Best number of rounds:", best_nrounds, "\n")
#> Best number of rounds:
# Extract mean RMSE
mean_rmse <- min(xgb_cv$evaluation_log$test_rmse_mean)
cat("XGBoost CV RMSE:", mean_rmse, "\n")
#> XGBoost CV RMSE: 288.9115
# 3. Support Vector Regression (SVR)
library(e1071)
library(kernlab)
svr_cv <- train(
incidence ~ female + male + agea + ageb + agec + fagea + fageb + fagec +
magea + mageb + magec + yrb + yrc + yrd + yre,
data = mapdata,
method = "svmRadial",
trControl = cv_control,
preProcess = c("center", "scale"), # SVR often benefits from scaling
tuneLength = 3
)
print(svr_cv)
#> Support Vector Machines with Radial Basis Function Kernel
#>
#> 53 samples
#> 15 predictors
#>
#> Pre-processing: centered (15), scaled (15)
#> Resampling: Cross-Validated (5 fold)
#> Summary of sample sizes: 43, 42, 42, 43, 42
#> Resampling results across tuning parameters:
#>
#> C RMSE Rsquared MAE
#> 0.25 713.0466 0.4939345 363.8929
#> 0.50 704.6990 0.4971961 355.2001
#> 1.00 701.0369 0.5012774 350.6448
#>
#> Tuning parameter 'sigma' was held constant at a value of 17.63942
#> RMSE was used to select the optimal model using the smallest value.
#> The final values used for the model were sigma = 17.63942 and C = 1.
## Spatial maps of predicted values of each model
# 1. Random Forest Spatial Map
mapdata <- inner_join(africa_shp, panc_incidence, by = "NAME")
mapdata$pred_rf <- predict(rf_model, newdata = mapdata)
tm_shape(mapdata) +
tm_fill("pred_rf", fill.scale =tm_scale_intervals(values = "brewer.greens", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_rf")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)

# 2. XGBoost Spatial Map
# Ensure matrix used in training
mapdata$pred_xgb <- predict(xgb_model, newdata = x_vars)
tm_shape(mapdata) +
tm_fill("pred_xgb", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_xgb")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)

# 3. Support Vector Regression (SVR) Spatial Map
mapdata$pred_svr <- predict(svr_model, newdata = mapdata)
tm_shape(mapdata) +
tm_fill("pred_svr", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_svr")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)

# Compare Side by Side
tmap_arrange(
tm_shape(mapdata) +
tm_fill("pred_rf", fill.scale =tm_scale_intervals(values = "brewer.greens", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_rf")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE),
tm_shape(mapdata) +
tm_fill("pred_xgb", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_xgb")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE),
tm_shape(mapdata) +
tm_fill("pred_svr", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Inci_pred_svr")) + tm_borders(fill_alpha = .2) +
tm_compass() + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE),
nrow = 1)

# Predicted Residuals
# we've already trained all 3 models and have `mapdata$incidence` as your actual values.
### Step 1: Generate predictions and residuals for each model
# Random Forest
rf_preds <- predict(rf_model, newdata = mapdata)
rf_resid <- mapdata$incidence - rf_preds
# XGBoost
xgb_preds <- predict(xgb_model, newdata = x_vars) # x_vars = model.matrix(...)
xgb_resid <- mapdata$incidence - xgb_preds
# SVR
svr_preds <- predict(svr_model, newdata = mapdata)
svr_resid <- mapdata$incidence - svr_preds
### Step 2: Combine into a single data frame
residuals_df <- data.frame(
actual = mapdata$incidence,
rf_pred = rf_preds,
rf_resid = rf_resid,
xgb_pred = xgb_preds,
xgb_resid = xgb_resid,
svr_pred = svr_preds,
svr_resid = svr_resid
)
# export
library(writexl)
### Compare residual distributions
boxplot(residuals_df$rf_resid, residuals_df$xgb_resid, residuals_df$svr_resid,
names = c("RF", "XGB", "SVR"),
main = "Model Residuals",
ylab = "Prediction Error (Residual)")

## Spatial maps of residual values from each model
#Add residuals to mapdata
#You should already have these from the previous steps:
mapdata$rf_resid <- residuals_df$rf_resid
mapdata$xgb_resid <- residuals_df$xgb_resid
mapdata$svr_resid <- residuals_df$svr_resid
# Set tmap mode to plot (static map)
tmap_mode("plot")
# Create individual residual maps
map_rf <- tm_shape(mapdata) +
tm_fill("rf_resid", fill.scale =tm_scale_intervals(values = "brewer.greens", style = "quantile",
midpoint = 0), fill.legend = tm_legend(title = "Inci_rf_resid")) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)
map_xgb <- tm_shape(mapdata) +
tm_fill("xgb_resid", fill.scale =tm_scale_intervals(values = "brewer.purples", style = "quantile",
midpoint = 0), fill.legend = tm_legend(title = "Inci_xgb_resid")) + tm_borders(fill_alpha = .2) +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)
map_svr <- tm_shape(mapdata) +
tm_fill("svr_resid", fill.scale =tm_scale_intervals(values = "brewer.reds", style = "quantile"),
fill.legend = tm_legend(title = "Inci_svr_resid")) + tm_borders(fill_alpha = .2) +
tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"),
frame = TRUE, component.autoscale = FALSE)
#Step 3: Combine maps in a grid
# Combine maps in a grid layout
tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1)

##Barplot and Spatial maps for RMSE and MAE
#Step 1: Calculate RMSE and MAE for each model
# Random Forest
rf_metrics <- postResample(pred = rf_preds, obs = mapdata$incidence)
# XGBoost
xgb_metrics <- postResample(pred = xgb_preds, obs = mapdata$incidence)
# SVR
svr_metrics <- postResample(pred = svr_preds, obs = mapdata$incidence)
#Step 2: Combine into a summary data frame
model_eval <- data.frame(
Model = c("Random Forest", "XGBoost", "SVR"),
RMSE = c(rf_metrics["RMSE"], xgb_metrics["RMSE"], svr_metrics["RMSE"]),
MAE = c(rf_metrics["MAE"], xgb_metrics["MAE"], svr_metrics["MAE"]),
Rsquared = c(rf_metrics["Rsquared"], xgb_metrics["Rsquared"], svr_metrics["Rsquared"])
)
print(model_eval)
#> Model RMSE MAE Rsquared
#> 1 Random Forest 98.219710 30.2558297 0.9976235
#> 2 XGBoost 2.100454 0.7290456 0.9999983
#> 3 SVR 445.937234 127.7105534 0.9178101
#Visualize MAE and RMSE
oldpar <- par(mfrow = c(1, 3))
#Barplot of RMSE
barplot(model_eval$RMSE, names.arg = model_eval$Model,
col = "skyblue", las = 1, main = "Model RMSE", ylab = "RMSE")
#Barplot of MAE
barplot(model_eval$MAE, names.arg = model_eval$Model,
col = "lightgreen", las = 1, main = "Model MAE", ylab = "MAE")
#Barplot of MAE
barplot(model_eval$Rsquared, names.arg = model_eval$Model,
col = "grey", las = 1, main = "Model Rsquared", ylab = "MAE")

par(oldpar)
#Add metrics to residual maps as captions
map_rf <- tm_shape(mapdata) +
tm_fill("rf_resid", fill.scale =tm_scale_intervals(values = "brewer.greens", midpoint = 0),
title = paste0("rf_resid\nRMSE: ", round(rf_metrics["RMSE"], 2),
"\nMAE: ", round(rf_metrics["MAE"], 2))) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
map_xgb <- tm_shape(mapdata) +
tm_fill("xgb_resid", fill.scale =tm_scale_intervals(values = "-RdBu", midpoint = 0),
title = paste0("xgb_resid\nRMSE: ", round(xgb_metrics["RMSE"], 2),
"\nMAE: ", round(xgb_metrics["MAE"], 2))) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
map_svr <- tm_shape(mapdata) +
tm_fill("svr_resid", fill.scale =tm_scale_intervals(values = "-RdBu", midpoint = 0),
title = paste0("svr_resid\nRMSE: ", round(svr_metrics["RMSE"], 2),
"\nMAE: ", round(svr_metrics["MAE"], 2))) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
tmap_arrange(map_rf, map_xgb, map_svr, nrow = 1)

#Global Moran’s I, Local Moran’s I (LISA), Cluster categories (e.g., High-High, Low-Low), Maps: Moran’s I map,
#LISA clusters, High-High clusters using the predicted values from the four machine learning models
#Assumptions: Your spatial data is in mapdata (an sf object).
#Predicted values for each model are already stored in: rf_preds, xgb_preds, svr_preds, mlp_preds
#STEP 1: Create Spatial Weights (if not done yet)
neighbors <- poly2nb(mapdata) #if this gives warning, use the below codes
mapdata <- st_as_sf(mapdata) # If it's not already sf
mapdata <- st_make_valid(mapdata) # Fix any invalid geometries
neighbors <- poly2nb(mapdata, snap = 1e-15) ## Try 1e-6, 1e-5, or higher if needed. You can adjust snap upward incrementally until the warnings disappear or are reduced
listw <- nb2listw(neighbors, style = "W", zero.policy = TRUE)
#STEP 2: Define a function to compute Moran’s I and cluster categories
analyze_spatial_autocorrelation <- function(mapdata, values, listw, model_name, signif_level = 0.05) {
# Standardize predicted values
mapdata$val_st <- scale(values)[, 1]
# Compute lag
mapdata$lag_val <- lag.listw(listw, mapdata$val_st, zero.policy = TRUE)
# Global Moran's I
global_moran <- moran.test(values, listw, zero.policy = TRUE)
# Local Moran's I (LISA)
lisa <- localmoran(values, listw, zero.policy = TRUE)
lisa_df <- as.data.frame(lisa)
#rename p-value column
colnames(lisa_df)[5] <- "Pr_z"
# Add to mapdata
mapdata$Ii <- lisa_df$Ii
mapdata$Z_Ii <- lisa_df$Z.I
mapdata$Pr_z <- lisa_df$Pr_z
#mapdata$Pr_z <- lisa_df[, "Pr(z > 0)"]
# Classify clusters
mapdata <- mapdata %>%
mutate(
cluster = case_when(
val_st > 0 & lag_val > 0 & Pr_z <= signif_level ~ "High-High",
val_st < 0 & lag_val < 0 & Pr_z <= signif_level ~ "Low-Low",
val_st < 0 & lag_val > 0 & Pr_z <= signif_level ~ "Low-High",
val_st > 0 & lag_val < 0 & Pr_z <= signif_level ~ "High-Low",
TRUE ~ "Not Significant"
)
)
return(list(
map = mapdata,
global_moran = global_moran
))
}
#STEP 3: Run the function for each model
rf_result <- analyze_spatial_autocorrelation(mapdata, rf_preds, listw, "Random Forest")
xgb_result <- analyze_spatial_autocorrelation(mapdata, xgb_preds, listw, "XGBoost")
svr_result <- analyze_spatial_autocorrelation(mapdata, svr_preds, listw, "SVR")
#STEP 4: Mapping LISA Clusters and High-High Areas for Random Forest
tmap_mode("plot")
# LISA Cluster Map. fill.scale =tm_scale_intervals(values = "-RdBu")
tm_rf <- tm_shape(rf_result$map) +
tm_fill(
"cluster",
fill.scale = tm_scale(values = c(
"High-High" = "red",
"Low-Low" = "blue",
"Low-High" = "lightblue",
"High-Low" = "pink",
"Not Significant" = "gray90")),
fill.legend = tm_legend(title = "LISA Clusters - RF")) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
# Mapping LISA Clusters and High-High Areas for XGBoost
tmap_mode("plot")
# LISA Cluster Map
tm_xgb <- tm_shape(xgb_result$map) +
tm_fill("cluster",
fill.scale = tm_scale(values = c(
"High-High" = "red",
"Low-Low" = "blue",
"Low-High" = "lightblue",
"High-Low" = "pink",
"Not Significant" = "gray90")),
fill.legend = tm_legend(title = "LISA Clusters - XGBoost")) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
#Mapping LISA Clusters and High-High Areas for SVR
tmap_mode("plot")
# LISA Cluster Map
tm_svr <- tm_shape(svr_result$map) +
tm_fill("cluster",
fill.scale = tm_scale(values = c(
"High-High" = "red",
"Low-Low" = "blue",
"Low-High" = "lightblue",
"High-Low" = "pink",
"Not Significant" = "gray90")),
fill.legend = tm_legend(title = "LISA Clusters - SVR")) +
tm_borders(fill_alpha = .2) + tm_layout(legend.text.size = 0.5, legend.position = c("left", "bottom"))
tmap_arrange(tm_rf, tm_xgb, tm_svr, nrow = 1)

#You can also arrange maps side-by-side using tmap_arrange().
#View Global Moran’s I Results
#These print the test statistic and p-values for global spatial autocorrelation of predictions.
rf_result$global_moran
#>
#> Moran I test under randomisation
#>
#> data: values
#> weights: listw
#> n reduced by no-neighbour observations
#>
#> Moran I statistic standard deviate = -0.24595, p-value = 0.5971
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic Expectation Variance
#> -0.044681255 -0.022727273 0.007967958
xgb_result$global_moran
#>
#> Moran I test under randomisation
#>
#> data: values
#> weights: listw
#> n reduced by no-neighbour observations
#>
#> Moran I statistic standard deviate = -0.32513, p-value = 0.6275
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic Expectation Variance
#> -0.051250814 -0.022727273 0.007696578
svr_result$global_moran
#>
#> Moran I test under randomisation
#>
#> data: values
#> weights: listw
#> n reduced by no-neighbour observations
#>
#> Moran I statistic standard deviate = -0.28034, p-value = 0.6104
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic Expectation Variance
#> -0.05111299 -0.02272727 0.01025262