library(MASS)
# library(tikzDevice)
library(Rmixmod)
library(mvtnorm)
library(scoringTools)
set.seed(123)
<- array(0, c(8, 1))
mu0 <- array(1, c(8, 1))
mu1
<- function(n, ev = runif(n, 0, 3)) {
Posdef <- matrix(ncol = n, rnorm(n^2))
Z <- qr(Z)
decomp <- qr.Q(decomp)
Q <- qr.R(decomp)
R <- diag(R)
d <- d / abs(d)
ph <- Q %*% diag(ph)
O <- t(O) %*% diag(ev) %*% O
Z return(Z)
}
<- Posdef(n = 8)
sigma0 <- Posdef(n = 8) sigma1
<- 100000
m_test <- 10000
m <- c(4, 8, 15, 30)
nbvar <- 0.5
p0 set.seed(21)
<- rbinom(m_test, 1, p0)
y <- array(0, c(4, m_test, 51))
data_test_pred <- array(0, c(4, m_test, 51))
data_test_gen <- array(0, c(4, 3))
data_test_bayes
for (n in 2:2) {
<- array(0, c(m_test, nbvar[n]))
x == 0, ] <-
x[y mvrnorm(n = sum(y == 0), mu0[1:nbvar[n]], sigma0[1:nbvar[n], 1:nbvar[n]])
== 1, ] <-
x[y mvrnorm(n = sum(y == 1), mu1[1:nbvar[n]], sigma1[1:nbvar[n], 1:nbvar[n]])
1:(nbvar[n] + 1)] <-
data_test_pred[n, , as.matrix(cbind.data.frame(y = y, x = x))
<- data_test_pred[n, , ]
data_test_gen[n, , ] 1] <- ifelse(data_test_pred[n, , 1] == 0, 2, 1)
data_test_gen[n, ,
<-
e0 log(p0) + dmvnorm(x[y == 0, ], mu0[1:nbvar[n]], sigma0[1:nbvar[n], 1:nbvar[n]], log = TRUE) < log(1 - p0) + dmvnorm(x[y == 0, ], mu1[1:nbvar[n]], sigma1[1:nbvar[n], 1:nbvar[n]], log = TRUE)
<-
e1 log(p0) + dmvnorm(x[y == 1, ], mu0[1:nbvar[n]], sigma0[1:nbvar[n], 1:nbvar[n]], log = TRUE) > log(1 - p0) + dmvnorm(x[y == 1, ], mu1[1:nbvar[n]], sigma1[1:nbvar[n], 1:nbvar[n]], log = TRUE)
<- mean(c(e0, e1))
eb <- prop.test(sum(c(e0, e1)), m_test)$conf.int[1:2]
ebconf <- c(eb, ebconf[1], ebconf[2])
data_test_bayes[n, ] rm(x)
}
rm(y)
<- seq(0, .9, by = 0.02)
cut <- array(NA, c(4, length(cut), 5, 20))
tx_erreur <- array(NA, c(4, length(cut), 5, 20))
gini
rm(ebconf)
rm(eb)
rm(e0)
rm(e1)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1988900 106.3 3781414 202.0 2899637 154.9
## Vcells 44658928 340.8 98760743 753.5 98451004 751.2
Not run: takes roughly 10 to 30 minutes.
for (random in 1:20) {
set.seed(random)
<- rbinom(m, 1, p0)
y <- array(0, c(4, m, 51))
data_learn_pred <- array(0, c(4, m, 51))
data_learn_gen
for (n in 2:2) {
<- array(0, c(m, nbvar[n]))
x == 0, ] <-
x[y mvrnorm(n = sum(y == 0), mu0[1:nbvar[n]], sigma0[1:nbvar[n], 1:nbvar[n]])
== 1, ] <-
x[y mvrnorm(n = sum(y == 1), mu1[1:nbvar[n]], sigma1[1:nbvar[n], 1:nbvar[n]])
1:(nbvar[n] + 1)] <-
data_learn_pred[n, , as.matrix(cbind.data.frame(y = y, x = x))
<- data_learn_pred[n, , ]
data_learn_gen[n, , ] 1] <-
data_learn_gen[n, , ifelse(data_learn_gen[n, , 1] == 0, 2, 1)
rm(x)
}
rm(y)
gc()
#### Eventual loop on number of featuers (frozen to 8) ####
for (j in 2:2) {
# On convertit l'ensemble d'apprentissage en dataframe
<- data.frame(data_learn_pred[j, , 1:(nbvar[j] + 1)])
learn_pred <- data.frame(data_learn_gen[j, , 1:(nbvar[j] + 1)])
learn_gen
# On construit la formule appliquee a glm (elle depend du nombre de variables dans le modele)
<- paste("X", 2:(nbvar[j] + 1), sep = "")
PredictorVariables <-
Formula formula(paste("X1 ~ ", paste(PredictorVariables, collapse = " + ")))
rm(PredictorVariables)
gc()
# On entraine un modele glm sur toutes les donnees pour simuler les refuses
<- glm(Formula, "binomial", learn_pred)
model_pred_tot
#### Boucle sur le cut ####
for (i in 1:length(cut)) {
# On calcule les acceptes et rejetes en fonction du cut et du premier modele
<- (model_pred_tot$fitted.values > cut[i])
accepte <- (model_pred_tot$fitted.values <= cut[i])
rejete
# On controle qu'il reste des acceptes et des mauvais payeurs parmi les acceptes
if (sum(accepte) > 0 &
!(sum(learn_pred[accepte, 1]) == nrow(learn_pred[accepte, ]))) {
# On construit l'ensemble d'apprentissage partiel
<- learn_pred[accepte, ]
learn_pred_part <- learn_gen
learn_gen_part 1] <- NA
learn_gen_part[rejete,
# On entra?ne les modeles
<-
model_pred_part glm(Formula, family = binomial(link = "logit"), learn_pred_part)
<-
model_gen_part mixmodCluster(
data = learn_gen_part[, 2:(nbvar[j] + 1)],
knownLabels = learn_gen_part[, 1],
nbCluster = 2
)
# On predit leur resultat sur l'ensemble de test et on enregistre le taux d'erreur
<-
model_pred_part.pred predict(model_pred_part, data.frame(data_test_pred[j, , ]), type = "response") >= 0.5
<-
model_pred_part.erreur sum(abs(data_test_pred[j, , 1] - model_pred_part.pred)) / m_test
<-
model_pred_part.gini normalizedGini(
1],
data_test_pred[j, , predict(model_pred_part, data.frame(data_test_pred[j, , ]), type = "response")
)
<- mixmodPredict(
model_gen_res data.frame(data_test_gen[j, , 2:(nbvar[j] + 1)]),
@bestResult
model_gen_part
)<-
model_gen_part.pred @partition
model_gen_res<-
model_gen_part.erreur sum(abs(data_test_gen[j, , 1] - model_gen_part.pred)) / m_test
<-
model_gen_part.gini normalizedGini(
1],
data_test_pred[j, , @proba[, 1]
model_gen_res
)
## Augmentation ##
if (sum(rejete) > 0) {
<- augmentation(
model_pred_augmente -1],
learn_pred_part[, -1],
learn_pred[rejete, 1]
learn_pred_part[,
)
<-
model_pred_augmente.pred predict(model_pred_augmente@infered_model,
data.frame(x = data.frame(data_test_pred[j, , ])),
type = "response"
>= 0.5
) <-
model_pred_augmente.erreur sum(abs(data_test_pred[j, , 1] - model_pred_augmente.pred)) / m_test
<-
model_pred_augmente.gini normalizedGini(
1],
data_test_pred[j, , predict(model_pred_augmente@infered_model,
data.frame(x = data.frame(data_test_pred[j, , ])),
type = "response"
)
)
## Parcelling ##
<-
model_pred_parcelling parcelling(
xf = learn_pred_part[, -1],
xnf = learn_pred[rejete, -1],
yf = learn_pred_part[, 1]
)
<-
model_pred_parcelling.pred predict(model_pred_parcelling@infered_model,
data.frame(x = data.frame(data_test_pred[j, , ])),
type = "response"
>= 0.5
) <-
model_pred_parcelling.erreur sum(abs(data_test_pred[j, , 1] - model_pred_parcelling.pred)) / m_test
<-
model_pred_parcelling.gini normalizedGini(
1],
data_test_pred[j, , predict(model_pred_parcelling@infered_model,
data.frame(x = data.frame(data_test_pred[j, , ])),
type = "response"
)
)
## Reclassification ##
<-
model_pred_reclassification reclassification(
xf = learn_pred_part[, -1],
xnf = learn_pred[rejete, -1],
yf = learn_pred_part[, 1]
)
<-
model_pred_reclassification.pred predict(
@infered_model,
model_pred_reclassificationdata.frame(x = data.frame(data_test_pred[j, , ])),
type = "response"
>= 0.5
) <-
model_pred_reclassification.erreur sum(abs(data_test_pred[j, , 1] - model_pred_reclassification.pred)) / m_test
<-
model_pred_reclassification.gini normalizedGini(
1],
data_test_pred[j, , predict(model_pred_reclassification@infered_model,
data.frame(x = data.frame(data_test_pred[j, , ])),
type = "response"
)
)
}
# On renvoie le taux d'erreur des modeles
if (sum(rejete) > 0) {
<-
tx_erreur[j, i, , random] c(
model_pred_part.erreur,
model_pred_augmente.erreur,
model_pred_reclassification.erreur,
model_pred_parcelling.erreur,
model_gen_part.erreur
)<-
gini[j, i, , random] c(
model_pred_part.gini,
model_pred_augmente.gini,
model_pred_reclassification.gini,
model_pred_parcelling.gini,
model_gen_part.gini
)else {
} <-
tx_erreur[j, i, , random] c(
model_pred_part.erreur,
model_pred_part.erreur,
model_pred_part.erreur,
model_pred_part.erreur,
model_gen_part.erreur
)<-
gini[j, i, , random] c(
model_pred_part.gini,
model_pred_part.gini,
model_pred_part.gini,
model_pred_part.gini,
model_gen_part.gini
)
}gc()
}gc()
}
} }
Consequently, not run either.
# path_figure_tx_erreur <-
# file.path(
# dirname(rstudioapi::getActiveDocumentContext()$path),
# "../TEX_CODE/reintegration_simu_miss_erreur.tex"
# )
#
# path_figure_gini <-
# file.path(
# dirname(rstudioapi::getActiveDocumentContext()$path),
# "../TEX_CODE/reintegration_simu_miss_gini.tex"
# )
############ Figure error rate ##############################################
<- array(0, c(length(cut), 5))
tx_erreur_moy <- array(0, c(length(cut), 5))
gini_moy
for (methode in 1:5) {
for (random in 1:20) {
<- tx_erreur_moy[, methode] + tx_erreur[2, , methode, random]
tx_erreur_moy[, methode] <- gini_moy[, methode] + gini[2, , methode, random]
gini_moy[, methode]
}<- tx_erreur_moy[, methode] / 20
tx_erreur_moy[, methode] <- gini_moy[, methode] / 20
gini_moy[, methode]
}
# tikz(path_figure_tx_erreur,
# width = 7,
# height = 4.4,
# engine = "pdftex")
plot(
x = cut,
y = tx_erreur_moy[, 1],
xlim = c(0, .9),
ylim = c(.1, .30),
ylab = "Error rate on test set",
xlab = "Cut-off value",
pch = 15,
xaxt = "n",
yaxt = "n",
type = "o"
)par(new = TRUE)
plot(
x = cut,
y = tx_erreur_moy[, 2],
xlim = c(0, .9),
ylim = c(.1, .30),
ylab = "",
xlab = "",
pch = 17,
col = "green",
xaxt = "n",
yaxt = "n",
type = "o"
)par(new = TRUE)
plot(
x = cut,
y = tx_erreur_moy[, 3],
xlim = c(0, .9),
ylim = c(.1, .30),
ylab = "",
xlab = "",
pch = 7,
col = 2,
xaxt = "n",
yaxt = "n",
type = "o"
)par(new = TRUE)
plot(
x = cut,
y = tx_erreur_moy[, 4],
xlim = c(0, .9),
ylim = c(.1, .30),
ylab = "",
xlab = "",
pch = 8,
col = 590,
xaxt = "n",
yaxt = "n",
type = "o"
)par(new = TRUE)
plot(
x = cut,
y = tx_erreur_moy[, 5],
xlim = c(0, .9),
ylim = c(.1, .30),
ylab = "",
xlab = "",
pch = 9,
col = 376,
xaxt = "n",
yaxt = "n",
type = "o"
)
axis(1,
at = pretty(cut),
lab = pretty(cut),
las = TRUE
)axis(2,
at = pretty(seq(0.1, 0.3, 0.05), n = 10),
lab = pretty(seq(0.1, 0.3, 0.05), n = 10),
las = TRUE
)
legend(
0.35,
0.3,
pch = c(15, 17, 7, 8, 9),
col = c(1, "green", 2, 590, 376),
legend = c(
"Logistic regression",
"Augmentation",
"Reclassification",
"Parcelling",
"Gaussian mixture"
)
)# dev.off()
############ Figure gini ##############################################
# tikz(path_figure_gini,
# width = 7,
# height = 4.4,
# engine = "pdftex")
plot(
x = cut,
y = gini_moy[, 1],
xlim = c(0, .9),
ylim = c(.74, .93),
ylab = "Gini on test set",
xlab = "Cut-off value",
pch = 15,
xaxt = "n",
yaxt = "n",
type = "o"
)par(new = TRUE)
plot(
x = cut,
y = gini_moy[, 2],
xlim = c(0, .9),
ylim = c(.74, .93),
ylab = "",
xlab = "",
pch = 17,
col = "green",
xaxt = "n",
yaxt = "n",
type = "o"
)par(new = TRUE)
plot(
x = cut,
y = gini_moy[, 3],
xlim = c(0, .9),
ylim = c(.74, .93),
ylab = "",
xlab = "",
pch = 7,
col = 2,
xaxt = "n",
yaxt = "n",
type = "o"
)par(new = TRUE)
plot(
x = cut,
y = gini_moy[, 4],
xlim = c(0, .9),
ylim = c(.74, .93),
ylab = "",
xlab = "",
pch = 8,
col = 590,
xaxt = "n",
yaxt = "n",
type = "o"
)par(new = TRUE)
plot(
x = cut,
y = gini_moy[, 5],
xlim = c(0, .9),
ylim = c(.74, .93),
ylab = "",
xlab = "",
pch = 9,
col = 376,
xaxt = "n",
yaxt = "n",
type = "o"
)
axis(1,
at = pretty(cut),
lab = pretty(cut),
las = TRUE
)axis(2,
at = pretty(seq(0.75, 0.95, 0.1), n = 10),
lab = pretty(seq(0.75, 0.95, 0.1), n = 10),
las = TRUE
)
legend(
0,
0.835,
pch = c(15, 17, 7, 8, 9),
col = c(1, "green", 2, 590, 376),
legend = c(
"Logistic regression",
"Augmentation",
"Reclassification",
"Parcelling",
"Gaussian mixture"
)
)# dev.off()