CimpleG, an R package to find (small) CpG signatures.
# Install directly from github:
devtools::install_github("costalab/CimpleG")
# Alternatively, downloading from our release page and installing it from a local source:
# - ie navigating through your system
install.packages(file.choose(), repos = NULL, type = "source")
# - ie given a path to a local source
install.packages("~/Downloads/CimpleG_0.0.5.XXXX.tar.gz", repos = NULL, type = "source")
# or
devtools::install_local("~/Downloads/CimpleG_0.0.5.XXXX.tar.gz")
library("CimpleG")
data(train_data)
data(train_targets)
data(test_data)
data(test_targets)
# check the train_targets table to see
# what other columns can be used as targets
# colnames(train_targets)
# mini example with just 4 target signatures
set.seed(42)
cimpleg_result <- CimpleG(
train_data = train_data,
train_targets = train_targets,
test_data = test_data,
test_targets = test_targets,
method = "CimpleG",
has_annotation = TRUE,
target_columns = c(
"neurons",
"glia",
"blood_cells",
"fibroblasts"
)
)
cimpleg_result$results
# check generated signatures
cimpleg_result$signatures
#> neurons glia blood_cells fibroblasts
#> "cg24548498" "cg14501977" "cg04785083" "cg03369247"
# Get it directly from the results object
cimpleg_result$annotation
#> # A tibble: 4 × 8
#> IlmnID CHR_hg38 Start_hg38 End_hg38 UCSC_RefGene_Name UCSC_RefGene_Group
#> <chr> <chr> <dbl> <dbl> <chr> <chr>
#> 1 cg24548498 chr2 181684680 181684682 <NA> <NA>
#> 2 cg14501977 chr12 123948446 123948448 CCDC92 5'UTR
#> 3 cg04785083 chr1 8971202 8971204 CA6 Body
#> 4 cg03369247 chr8 20174518 20174520 SLC18A1;SLC18A1;S… Body;Body;Body;Bo…
#> # ℹ 2 more variables: UCSC_CpG_Islands_Name <chr>,
#> # Relation_to_UCSC_CpG_Island <chr>
# or idependently through the "get_cpg_annotation" function
signature_annotation <- get_cpg_annotation(cimpleg_result$signatures)
# check signature annotation
signature_annotation
#> # A tibble: 4 × 8
#> IlmnID CHR_hg38 Start_hg38 End_hg38 UCSC_RefGene_Name UCSC_RefGene_Group
#> <chr> <chr> <dbl> <dbl> <chr> <chr>
#> 1 cg24548498 chr2 181684680 181684682 <NA> <NA>
#> 2 cg14501977 chr12 123948446 123948448 CCDC92 5'UTR
#> 3 cg04785083 chr1 8971202 8971204 CA6 Body
#> 4 cg03369247 chr8 20174518 20174520 SLC18A1;SLC18A1;S… Body;Body;Body;Bo…
#> # ℹ 2 more variables: UCSC_CpG_Islands_Name <chr>,
#> # Relation_to_UCSC_CpG_Island <chr>
# adjust target names to match signature names
# check generated signatures
plt <- signature_plot(
cimpleg_result,
train_data,
train_targets,
sample_id_column = "gsm",
true_label_column = "cell_type"
)
print(plt$plot)
plt <- diffmeans_sumvariance_plot(
data = train_data,
target_vector = train_targets$neurons == 1
)
print(plt)
df_dmeansvar <- compute_diffmeans_sumvar(
data = train_data,
target_vector = train_targets$neurons == 1
)
parab_param <- .7
df_dmeansvar$is_selected <- select_features(
x = df_dmeansvar$diff_means,
y = df_dmeansvar$sum_variance,
a = parab_param
)
plt <- diffmeans_sumvariance_plot(
data = df_dmeansvar,
label_var1 = "Neurons",
color_all_points = "purple",
threshold_func = function(x, a) (a * x) ^ 2,
is_feature_selected_col = "is_selected",
func_factor = parab_param
)
print(plt)
plt <- diffmeans_sumvariance_plot(
data = df_dmeansvar,
feats_to_highlight = cimpleg_result$signatures
)
print(plt)
deconv_result <- run_deconvolution(
cpg_obj = cimpleg_result,
new_data = test_data
)
plt <- deconvolution_barplot(
deconvoluted_data = deconv_result,
meta_data = test_targets,
sample_id = "gsm",
true_label = "cell_type"
)
print(plt$plot)
In this example, we’ll create two additional models made with CimpleG. One using only hypermethylated signatures, and the other using 3 CpGs per signature instead of just one.
set.seed(42)
cimpleg_hyper <- CimpleG(
train_data = train_data,
train_targets = train_targets,
test_data = test_data,
test_targets = test_targets,
method = "CimpleG",
pred_type = "hyper",
target_columns = c(
"neurons",
"glia",
"blood_cells",
"fibroblasts"
)
)
#> Training for target 'neurons' with 'CimpleG' has finished.: 0.342 sec elapsed
#> Training for target 'glia' with 'CimpleG' has finished.: 0.304 sec elapsed
#> Training for target 'blood_cells' with 'CimpleG' has finished.: 0.352 sec elapsed
#> Training for target 'fibroblasts' with 'CimpleG' has finished.: 0.317 sec elapsed
deconv_hyper <- run_deconvolution(
cpg_obj = cimpleg_hyper,
new_data = test_data
)
set.seed(42)
cimpleg_3sigs <- CimpleG(
train_data = train_data,
train_targets = train_targets,
test_data = test_data,
test_targets = test_targets,
method = "CimpleG",
n_sigs = 3,
target_columns = c(
"neurons",
"glia",
"blood_cells",
"fibroblasts"
)
)
#> Training for target 'neurons' with 'CimpleG' has finished.: 0.471 sec elapsed
#> Training for target 'glia' with 'CimpleG' has finished.: 0.39 sec elapsed
#> Training for target 'blood_cells' with 'CimpleG' has finished.: 0.459 sec elapsed
#> Training for target 'fibroblasts' with 'CimpleG' has finished.: 0.403 sec elapsed
deconv_3sigs <- run_deconvolution(
cpg_obj = cimpleg_3sigs,
new_data = test_data
)
deconv_3sigs$prop_3sigs <- deconv_3sigs$proportion
deconv_hyper$prop_hyper <- deconv_hyper$proportion
deconv_result$prop_cimpleg <- deconv_result$proportion
dummy_deconvolution_data <-
deconv_result |>
dplyr::mutate(true_vals = proportion + runif(nrow(deconv_result), min=-0.1,max=0.1)) |>
dplyr::select(cell_type,sample_id,prop_cimpleg,true_vals) |>
dplyr::left_join(deconv_hyper |> dplyr::select(-proportion), by=c("sample_id","cell_type")) |>
dplyr::left_join(deconv_3sigs |> dplyr::select(-proportion), by=c("sample_id","cell_type")) |>
dplyr::mutate_if(is.numeric, function(x){ifelse(x<0,0,x)}) |>
dplyr::mutate_if(is.numeric, function(x){ifelse(x>1,1,x)}) |>
tibble::as_tibble()
scatter_plts <- CimpleG:::deconv_pred_obs_plot(
deconv_df = dummy_deconvolution_data,
true_values_col = "true_vals",
predicted_cols = c("prop_cimpleg","prop_hyper","prop_3sigs"),
sample_id_col = "sample_id",
group_col= "cell_type"
)
scatter_panel <- scatter_plts |> patchwork::wrap_plots(ncol=1)
print(scatter_panel)
now, more interestingly, we can see in detail and rank one of the measures used to evaluate the deconvolution results
rank_plts <- CimpleG:::deconv_ranking_plot(
deconv_df = dummy_deconvolution_data,
true_values_col = "true_vals",
predicted_cols = c("prop_cimpleg","prop_hyper","prop_3sigs"),
sample_id_col = "sample_id",
group_col= "cell_type",
metrics = "rmse"
)
rank_panel <- list(rank_plts$perf_boxplt[[1]],rank_plts$nemenyi_plt[[1]]) |> patchwork::wrap_plots()
print(rank_panel)