The package provides functionality to conduct and visualize
component-wise boosting on decentralized data. The basis is the
DataSHIELD infrastructure for distributed
computing. This package provides the calculation of the
component-wise
boosting.
Note that DataSHIELD uses an option datashield.privacyLevel
to
indicate the minimum number of values required before an aggregated
value of these values may be shared. Instead of setting the option, we
directly retrieve the privacy level from the
DESCRIPTION
file each time a function calls for it. This option is set to 5 by
default.
At the moment, there is no CRAN version available. Install the development version from GitHub:
# Install dsCWB from the schalkdaniel/dsCWB GitHub repository (requires the remotes package).
remotes::install_github("schalkdaniel/dsCWB")
It is necessary to register the assign and aggregate methods in the OPAL
administration. These methods are registered automatically when
publishing the package on OPAL (see
DESCRIPTION
).
Note that the package needs to be installed at both locations: the server and the analyst's machine.
# DataSHIELD client interface and its OPAL backend.
library(DSI)
library(DSOpal)

# Connection details of the OPAL demo server used in this example.
surl = "https://opal-demo.obiba.org/"
username = "administrator"
password = "password"

# Register three logins on the same server; login i points to the
# table CNSIM.CNSIM<i> and is named server<i>.
builder = newDSLoginBuilder()
for (i in seq_len(3L)) {
  builder$append(
    server = sprintf("server%d", i),
    url = surl,
    user = username,
    password = password,
    table = sprintf("CNSIM.CNSIM%d", i)
  )
}

# Open the connections; with assign = TRUE the tables are assigned on the
# servers at login.
login_data = builder$build()
connections = datashield.login(logins = login_data, assign = TRUE)
library(dsCWB)

# Name of the cleaned server-side data object, the target variable, and the
# features used for modelling.
symbol = "Dclean"
target = "LAB_TSC"
feature_names = c("GENDER", "DIS_DIAB", "LAB_HDL", "LAB_TRIG")

# Remove all missings: evaluate dsNaRm("D") on every server and store the
# result there under `symbol`.
datashield.assign(connections, symbol, quote(dsNaRm("D")))

# Fit the distributed component-wise boosting model on the cleaned data.
# NOTE(review): mstop, val_fraction and patience presumably control the
# maximum number of iterations and validation-based early stopping; see the
# dsCWB documentation to confirm.
cwb = dsCWB(connections, symbol, target, feature_names, mstop = 100L,
  val_fraction = 0.2, patience = 3L, seed = 31415L)

# Visualize selected base learner:
plotBaselearnerTraces(cwb)

# Get log for further investigation and add the elapsed time in minutes
# relative to the first log entry:
l = cwb$getLog()
l$minutes = as.numeric(difftime(l$time, l$time[1], units = "mins"))

library(ggplot2)

# Plot train vs validation risk:
ggplot(l, aes(x = minutes)) +
  geom_line(aes(y = risk_train, color = "Train risk")) +
  geom_line(aes(y = risk_val, color = "Val risk")) +
  labs(color = "") + xlab("Minutes") + ylab("Risk")

# Visualize effect LAB_TRIG (no site-specific corrections):
pdata_LAB_TRIG = cwb$featureEffectData("LAB_TRIG")
ggplot(pdata_LAB_TRIG, aes(x = value, y = pred)) +
  geom_line()

# Effect of GENDER (just site-specific effects), one panel per server:
pdata_GENDER = cwb$featureEffectData("GENDER")
ggplot(pdata_GENDER, aes(x = value, y = pred, color = server)) +
  geom_boxplot() +
  facet_grid(~ server) +
  guides(color = "none")

# Close all server connections when done.
datashield.logout(connections)
To cite dsCWB
in publications, please use:
Schalk, D., Bischl, B., & Rügamer, D. (2022). Privacy-Preserving and Lossless Distributed Estimation of High-Dimensional Generalized Additive Mixed Models. arXiv preprint arXiv:2210.07723.
@article{schalk2022dcwb,
doi = {10.48550/ARXIV.2210.07723},
url = {https://arxiv.org/abs/2210.07723},
author = {Schalk, Daniel and Bischl, Bernd and Rügamer, David},
title = {Privacy-Preserving and Lossless Distributed Estimation of High-Dimensional Generalized Additive Mixed Models},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}