-
Notifications
You must be signed in to change notification settings - Fork 0
/
simulated_data.R
171 lines (157 loc) · 5.85 KB
/
simulated_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
library(splatter)
library(scater)
library(scuttle)
setwd("/dss/dsshome1/02/di93zoj/yuge")
base_params <- newSplatParams()
# You can estimate these from a real dataset
# There are some tips for this if you want to try it
# base_params <- splatEstimate(counts)
# Define the condition parameters
# Can be done in the function but this is a bit neater
#
# Prob - Probability cell is assigned to this group (affects group sizes)
# DEProb - Probability that a gene is DE in this group
# DownProb - Probability a DE gene is down-regulated
# FacLoc - Mean of DE factors (log-normal)
# FacScale - Variance of DE factors (log-normal)
# Steps - Number of steps in the path
# Skew - Distribution of cells along the path
#
# Together Steps and Skew create discrete clusters rather than continuous paths (except for the control)
# Example dataframe
# conditions <- tibble::tribble(
# ~ Prob, ~ DEProb, ~ DownProb, ~ FacLoc, ~ FacScale, ~ Steps, ~ Skew,
# # Control
# 0.20, 0.05, 0.5, 0.10, 0.40, 20, 0.5,
# # Conditions
# 0.15, 0.05, 0.25, 0.20, 0.50, 2, 0.0,
# 0.08, 0.05, 0.50, 2.00, 0.02, 2, 0.0,
# 0.02, 0.05, 0.50, 0.50, 1.50, 2, 0.0,
# 0.08, 0.10, 0.25, 0.80, 0.80, 2, 0.0,
# 0.15, 0.80, 0.25, 1.20, 1.20, 2, 0.0,
# 0.01, 0.10, 0.50, 0.60, 0.90, 2, 0.0,
# 0.11, 0.15, 0.50, 1.50, 0.50, 2, 0.0,
# 0.11, 0.15, 0.25, 0.70, 2.00, 2, 0.0,
# # 0.05, 0.50, 0.50, 0.50, 0.90, 2, 0.0,
# 0.09, 0.40, 0.75, 0.90, 0.60, 2, 0.0,
# )
control <-tibble::tribble(
~ Prob, ~ DEProb, ~ DownProb, ~ FacLoc, ~ FacScale, ~ Steps, ~ Skew,
# Control
0.40, 0.05, 0.5, 0.10, 0.40, 20, 0.5,
)
n <- 32
n_loops <- 4
perturbed <- tibble::as_tibble(data.frame(
Prob = rep(.6/n, n),
DEProb = rep(c(0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05), times = n_loops),
DownProb = rep(.50, n),
FacLoc = c(rep(.2, n/n_loops), rep(.5, n/n_loops), rep(1, n/n_loops), rep(1.5, n/n_loops)),
FacScale = c(rep(.2, n/n_loops), rep(.5, n/n_loops), rep(1, n/n_loops), rep(1.5, n/n_loops)),
Steps = rep(2.0, n),
Skew = rep(0.0, n)
))
conditions = rbind(control, perturbed)
sim <- splatSimulatePaths(
params = base_params,
batchCells = 40000,
# BASIC PARAMETERS - CAN BE ESTIMATED
# Base gene means (gamma distribution)
mean.shape = 0.3,
mean.rate = 0.6,
# Library size (log-normal distribution)
lib.loc = 9,
lib.scale = 0.1,
# Dispersion (bit hard to explain...)
bcv.common = 0.1,
bcv.df = 60,
# Dropout, parameterized by rlogis(), ie. hist(rlogis(10000, location = .3, scale = 1))
dropout.type = 'experiment',
dropout.mid = .3,
dropout.shape = -1,
# PATH PARAMETERS - MUST BE SET
group.prob = conditions$Prob,
path.nSteps = conditions$Steps,
path.skew = conditions$Skew,
# RANDOM SEED
seed = 1,
# optional
de.downProb = conditions$DownProb,
de.prob = conditions$DEProb,
de.facLoc = conditions$FacLoc,
# de.facScale = conditions$FacScale
)
# Save the count matrix and then the observation labels
sce_counts <- as.matrix(counts(sim))
write.table(sce_counts, 'splatter_sim_dropout.csv', sep=",")
sce_obs <- colData(sim)
write.table(sce_obs, 'splatter_sim_obs_dropout.csv', sep=",")
write.table(conditions, 'splatter_sim_params_dropout.csv', sep=",")
##### Second simulation - sparsity ######
#########################################
# Here we keep the number of differentially expressed genes constant at __
# while we vary the library size to induce sparsity. Instead of comparing against
# a single control as before, we'll compare against a control of the same library
# size.
control <-tibble::tribble(
~ Prob, ~ DEProb, ~ DownProb, ~ FacLoc, ~ FacScale, ~ Steps, ~ Skew,
# Control
0.50, 0.05, 0.5, 0.10, 0.40, 20, 0.5,
)
perturbed <- tibble::tribble(
~ DEProb, ~ FacLoc,
0.05, .1,
0.05, .2,
0.05, .3,
0.05, .4,
0.05, .5,
)
n <- 5
perturbed$Prob = rep(.5/n, n)
perturbed$DownProb = rep(.50, n)
perturbed$FacScale = rep(.4, n)
perturbed$Steps = rep(2.0, n)
perturbed$Skew = rep(0.0, n)
conditions = rbind(control, perturbed)
for (LibLoc in c(8.1, 8.3, 8.5, 8.7, 8.9, 9.1, 9.3)){
sim <- splatSimulatePaths(
params = base_params,
batchCells = 12000,
# BASIC PARAMETERS - CAN BE ESTIMATED
# Base gene means (gamma distribution)
mean.shape = 0.3,
mean.rate = 0.6,
# Library size (log-normal distribution), setting the parameters of hist(rlnorm(n = 1000, meanlog = 10, sdlog = 0.2))
lib.loc = LibLoc,
lib.scale = 0.1,
# Dispersion (bit hard to explain...)
bcv.common = 0.1,
bcv.df = 60,
# Dropout
dropout.type = 'experiment',
dropout.mid = .3,
dropout.shape = -1,
# PATH PARAMETERS - MUST BE SET
group.prob = conditions$Prob,
path.nSteps = conditions$Steps,
path.skew = conditions$Skew,
# RANDOM SEED
seed = 1,
# optional
de.downProb = conditions$DownProb,
de.prob = conditions$DEProb,
de.facLoc = conditions$FacLoc,
# de.facScale = conditions$FacScale
)
# Save the count matrix and then the observation labels
sce_counts <- as.matrix(counts(sim))
write.table(sce_counts, paste(LibLoc, 'sparsity_sim.csv', sep='-'), sep=",")
sce_obs <- colData(sim)
write.table(sce_obs, paste(LibLoc, 'sparsity_sim_obs.csv', sep='-'), sep=",")
write.table(conditions, paste(LibLoc, 'sparsity_sim_params.csv', sep='-'), sep=",")
}
# Check the UMAP
sim <- scuttle::logNormCounts(sim)
sim <- scater::runPCA(sim)
sim <- scater::runUMAP(sim)
scater::plotUMAP(sim, colour_by = "Group")