forked from stanford-mast/INFaaS
-
Notifications
You must be signed in to change notification settings - Fork 1
/
start_infaas.sh
executable file
·425 lines (355 loc) · 14.6 KB
/
start_infaas.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
#!/bin/bash
# Bootstraps the INFaaS master: installs missing dependencies, starts the
# metadata store, builds INFaaS, launches the initial workers and finally the
# frontend servers and the VM daemon.
# -e: abort on the first failing command; -x: trace every command.
set -ex

### General variables ###
# Absolute directory that contains this script (symlinks resolved).
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
INFAAS_HOME="${SCRIPT_DIR}" # Know relative path
# The rest of the script refers to src/, scripts/ and build/ with relative
# paths, so always run from INFAAS_HOME no matter where it was invoked from.
cd "${INFAAS_HOME}"
CMAKE_VERSION='3.7'
LOG_DIR="${INFAAS_HOME}/logs/master_logs/" # Logging directory for Master
# IP of the machine that runs this script, read from the instance metadata
# endpoint.
MASTER_IP="$(curl -s http://169.254.169.254/latest/meta-data/local-ipv4)"
### Modes ###
# Different modes used to configure INFaaS like other existing systems.
# Defaults should be sufficient for most users, but more details
# about each can be found by running ./queryfe_server with no inputs.

# Master Decision Mode
#   0: INFAAS_ALL, 1: INFAAS_NOQPSLAT, 2: ROUNDROBIN,
#   3: ROUNDROBIN_STATIC, 4: GPUSHARETRIGGER,
#   5: CPUBLISTCHECK, 6: GPUSHARETRIGGER_SKIPBLIST,
#   7: ROUNDROBIN_DYNAMIC
MASTER_DECISION_MODE="6"

# Whether to use VM_DAEMON or not. ON: true, OFF: false
# VM_DAEMON is disabled for single worker autoscaling test.
VM_DAEMON_MODE="ON"

# Worker Autoscaling Mode
WORKER_AUTOSCALER="3" # 0: NONE, 1: Static, 2: Individual, 3: INFaaS

# Utilization threshold and maximum number of tries when running this script
MAX_TRY=40                # Number of tries before failing to setup
CPUGPU_UTIL_THRESH=80     # CPU/GPU utilization threshold (out of 100)
INFERENTIA_UTIL_THRESH=70 # Inferentia utilization threshold (out of 100)

###### UPDATE THESE VALUES BEFORE RUNNING ######
REGION='<REGION>'
ZONE='<ZONE>'
SECURITY_GROUP='<SECURITYGROUP>'
IAM_ROLE='<IAMROLE>'
MODELDB='<MYMODELDB>'   # Model repository bucket (do not include s3://)
CONFIGDB='<MYCONFIGDB>' # Configuration bucket (do not include s3://)
WORKER_IMAGE='ami-<INFAASAMI>'
NUM_INIT_CPU_WORKERS=1
NUM_INIT_GPU_WORKERS=0
NUM_INIT_INFERENTIA_WORKERS=0
MAX_CPU_WORKERS=1
MAX_GPU_WORKERS=0
MAX_INFERENTIA_WORKERS=0
# Used for making popular GPU variants exclusive.
# Set to 0 for no GPU to be used as exclusive.
# IMPORTANT: if NUM_INIT_GPU_WORKERS > 0, SLACK_GPU should be less than
# NUM_INIT_GPU_WORKERS (i.e. at least one GPU should be available for sharing).
SLACK_GPU=0
KEY_NAME='worker_key'
MACHINE_TYPE_GPU='p3.2xlarge'
MACHINE_TYPE_CPU='m5.2xlarge'
MACHINE_TYPE_INFERENTIA='inf1.2xlarge'
# 0: VM daemon stops machines; 1: VM daemon deletes machines;
# 2: VM daemon persists machines, but removes them from INFaaS's view
DELETE_MACHINES='2'

### Values for VMs that don't need user configuration ###
# Total number of initial workers; used for VM daemon.
# $(( )) replaces the deprecated $[ ] arithmetic form.
MIN_WORKERS=$((NUM_INIT_CPU_WORKERS + NUM_INIT_GPU_WORKERS + NUM_INIT_INFERENTIA_WORKERS))
EXECUTOR_PREFIX='infaas-worker'
EXECUTOR_PORT='50051'
STARTUP_SCRIPT='/opt/INFaaS/src/worker/start_worker.sh'
######### Beginning of setup script #########
# Greeting: banner line, an empty line, then the status line.
printf '%s\n' \
  "=============Welcome to INFaaS=============" \
  "" \
  "Executing setup script"

# The AWS CLI must already be configured; bail out early when the credentials
# file written by `aws configure` is absent.
[[ -f "${HOME}/.aws/credentials" ]] || {
  echo "AWS credentials not found! Please put them in by calling: aws configure"
  exit 1
}
# Checks if 1) cmake is installed and 2) if the right version of cmake is
# installed; runs scripts/install_cmake.sh otherwise.
#
# The probe is deliberately the `if` condition: as a plain assignment from a
# command substitution it returns non-zero when cmake is missing or has another
# version, which makes `set -e` abort the script before the install branch can
# run. -F matches the version literally (the '.' is not a regex wildcard).
if cmake --version 2> /dev/null | grep -qF -- "${CMAKE_VERSION}"; then
  echo "cmake, version ${CMAKE_VERSION} detected"
else
  echo "Installing cmake, version ${CMAKE_VERSION}"
  pushd "${INFAAS_HOME}/scripts"
  bash install_cmake.sh
  popd
fi
# Make sure a redis-server binary is on PATH; when it is not, download the
# stable source tarball, build it, install it system-wide and clean up.
redis_src_dir="redis-stable"
redis_tarball="${redis_src_dir}.tar.gz"
if ! command -v redis-server > /dev/null; then
  echo "Installing Redis"
  wget "http://download.redis.io/${redis_tarball}"
  tar xvzf "${redis_tarball}"
  pushd "${redis_src_dir}"
  make
  sudo make install
  popd
  # Neither the archive nor the extracted tree is needed after installation.
  rm -rf "${redis_tarball}" "${redis_src_dir}"
else
  echo "Redis detected"
fi
# Create the metadata server: Redis
echo "Creating metadata store"
REDIS_CONF="${INFAAS_HOME}/src/metadata-store/redis-serv.conf"

# Stop any redis-server that is already running so we start from a clean state.
# `|| true` keeps `set -e` from aborting when pidof finds nothing.
running_redis_pids="$(pidof redis-server || true)"
if [[ -n "${running_redis_pids}" ]]; then
  # Intentionally unquoted: pidof may report several PIDs separated by spaces.
  sudo kill ${running_redis_pids}
  sleep 5 # May not be able to restart instantly.
fi
if [[ -f "appendonly.aof" ]]; then
  rm appendonly.aof # remove old data.
fi
redis-server "${REDIS_CONF}" &

# Get port from redis-serv.conf for passing to modelreg_server.
# Only a line whose first field is exactly `port` counts, so comments and other
# directives that merely contain the word "port" are ignored. The same absolute
# path is used here as for starting the server above.
REDIS_PORT="$(awk '$1 == "port" { print $2; exit }' "${REDIS_CONF}")"
echo "Metadata store created"

# Wait until redis server is ready: up to MAX_TRY PING attempts, one second
# apart. The attempt counter is only checked after a failed PING, so a server
# that answers on the last allowed attempt is accepted.
cnt=0
until redis-cli -p "${REDIS_PORT}" PING | grep -q "PONG"; do
  cnt=$((cnt + 1))
  if [[ ${cnt} -ge ${MAX_TRY} ]]; then
    echo "Redis server failed to start."
    exit 1
  fi
  sleep 1 # avoid busy looping
done
######### Ensure dependencies are installed for master and metadata store #########
# gRPC (C++): the presence of grpc_cpp_plugin on PATH is taken as proof that it
# is installed; otherwise run the bundled installer from scripts/.
if ! command -v grpc_cpp_plugin > /dev/null; then
  echo "Installing gRPC"
  pushd ${INFAAS_HOME}"/scripts"
  bash install_grpc.sh
  popd
else
  echo "gRPC detected"
fi
# Check if redox is installed: both the source checkout in $HOME and the
# installed static library must be present, otherwise build it from source.
if [[ -d "${HOME}/redox" && -f /usr/local/lib64/libredox_static.a ]]; then
  echo "Redox detected"
else
  echo "Installing Redox"
  sudo apt update
  sudo apt install -y cmake build-essential libhiredis-dev libev-dev
  pushd "${HOME}"
  # An earlier, interrupted run may have left the checkout (and its build
  # directory) behind. Reuse them instead of letting `git clone` / `mkdir`
  # fail and abort the whole script under `set -e`.
  if [[ ! -d redox ]]; then
    git clone https://github.com/hmartiro/redox.git
  fi
  cd redox
  mkdir -p build
  cd build
  cmake ..
  make -j"$(nproc)"
  sudo make install
  popd
fi
# Redox-specific (https://stackoverflow.com/a/9631350)
# Prepend /usr/local/lib64 to the library search path, adding the ':'
# separator only when LD_LIBRARY_PATH already has a value.
export LD_LIBRARY_PATH="/usr/local/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Check if OpenCV is installed.
# pkg-config reports a missing package on stderr and through its exit status,
# so grepping its stdout for "was not found" never matches; test the exit
# status of `--exists` instead. Both module names are probed because OpenCV 4
# packages typically register as `opencv4` rather than `opencv`.
if pkg-config --exists opencv 2> /dev/null \
  || pkg-config --exists opencv4 2> /dev/null; then
  echo "OpenCV detected"
else
  echo "Installing OpenCV"
  # -y: never block on the confirmation prompt in an unattended run.
  sudo apt-get install -y libopencv-dev
fi
# AWS C++ SDK: treated as installed when its headers exist under
# /usr/local/include/aws; otherwise the bundled installer is run with the
# thirdparty/aws-cpp-sdk directory as its argument.
if [[ ! -d /usr/local/include/aws ]]; then
  echo "Installing AWS cpp"
  pushd ${INFAAS_HOME}/scripts
  bash install_aws_cpp_sdk.sh ${INFAAS_HOME}/thirdparty/aws-cpp-sdk
  popd
else
  echo "AWS cpp detected"
fi
######### Build INFaaS #########
### First, write out the constants file ###
# Each expression rewrites one `const std::string ... = ...` line of the
# template with the bucket/region configured for this deployment.
constants_rewrites=(
  -e "s@const std::string infaas_bucket = .*;@const std::string infaas_bucket = \"${MODELDB}\";@g"
  -e "s@const std::string infaas_config_bucket = .*@const std::string infaas_config_bucket = \"${CONFIGDB}\";@g"
  -e "s@const std::string region = .*;@const std::string region = \"${REGION}\";@g"
)
sed "${constants_rewrites[@]}" src/include/constants.h.templ > src/include/constants.h

echo "Now building INFaaS from source"
# Out-of-source build in ./build using every available core, then return to
# the directory we started from.
mkdir -p build && cd build
cmake ..
make -j$(nproc)
cd ..
# Create model database/bucket
# The bucket is created only when head-bucket reports "Not Found"; if it
# already exists (or head-bucket fails for any other reason) creation is
# skipped.
if aws s3api head-bucket --bucket "${MODELDB}" 2>&1 | grep -q "Not Found"; then
  create_bucket_args=(--bucket "${MODELDB}" --acl private)
  # us-east-1 is the default location and S3 rejects it as an explicit
  # LocationConstraint, so only pass the constraint for other regions.
  if [[ "${REGION}" != "us-east-1" ]]; then
    create_bucket_args+=(--create-bucket-configuration "LocationConstraint=${REGION}")
  fi
  aws s3api create-bucket "${create_bucket_args[@]}"
fi
#######################################
# Start the initial workers of one hardware type, wait until each one answers
# a heartbeat and register it (IP and instance id) through
# build/bin/redis_startup_helper.
# Globals (read):
#   EXECUTOR_PREFIX, EXECUTOR_PORT, REDIS_PORT, REGION, ZONE, KEY_NAME,
#   WORKER_IMAGE, STARTUP_SCRIPT, SECURITY_GROUP, MAX_TRY, IAM_ROLE,
#   MASTER_IP, WORKER_AUTOSCALER, MODELDB
# Arguments:
#   $1 - label used in log messages (e.g. "CPU")
#   $2 - tag embedded in the worker name (e.g. "cpu")
#   $3 - number of workers to start; <= 0 starts none
#   $4 - machine type handed to scripts/start_vm.sh
#   $5 - hardware code handed to redis_startup_helper
#   $6 - how many of the first workers are registered as slack (default 0)
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 on success; exits the script with status 1 when a VM cannot be created
#   or a worker does not answer within MAX_TRY heartbeat attempts.
#######################################
launch_initial_workers() {
  local label=$1 tag=$2 count=$3 machine_type=$4 hw_code=$5 slack_left=${6:-0}
  local idx name vm_output exec_ip attempts inst_id is_slack

  if ((count <= 0)); then
    echo "NO ${label} Workers!"
    return 0
  fi

  # Workers are enumerated from zero: <prefix>-<tag>-0 .. <prefix>-<tag>-(count-1)
  for ((idx = 0; idx < count; idx++)); do
    name="${EXECUTOR_PREFIX}-${tag}-${idx}"
    echo "Starting ${label} worker: ${name}"
    vm_output=$(bash scripts/start_vm.sh "${REGION}" "${ZONE}" \
      "${KEY_NAME}" "${name}" "${WORKER_IMAGE}" \
      "${machine_type}" "${STARTUP_SCRIPT}" "${SECURITY_GROUP}" \
      "${MAX_TRY}" "${IAM_ROLE}" "${MASTER_IP}" "${WORKER_AUTOSCALER}" \
      "${MODELDB}")
    # Set to last word of the output. The expansion is intentionally unquoted
    # so that multi-line output collapses into a single line for awk.
    exec_ip=$(echo ${vm_output} | awk '{print $NF}')
    if [[ "${exec_ip}" == "FAIL" ]]; then
      echo "Failure in creating worker ${name}"
      exit 1
    fi
    echo "Executor ${name} created with IP ${exec_ip}"

    # Wait until the worker is ready: at most MAX_TRY heartbeats, 5 s apart.
    # The counter is only checked after a failed heartbeat, so a worker that
    # answers on the last allowed attempt is accepted.
    attempts=0
    until ./build/bin/query_heartbeat "${exec_ip}:${EXECUTOR_PORT}" \
      | grep -q "SUCCEEDED"; do
      attempts=$((attempts + 1))
      if ((attempts >= MAX_TRY)); then
        echo "${name} query executor failed to start."
        exit 1
      fi
      sleep 5 # avoid busy looping
    done
    echo "${name} query executor is ready!"

    # Register worker in shared memory (both IP and instance-id)
    inst_id=$(aws ec2 describe-instances \
      --filters "Name=tag:Name,Values=${name}" \
      --query 'Reservations[*].Instances[*].InstanceId' --output text)

    # Make slack if necessary: the first `slack_left` workers are flagged.
    is_slack="0"
    if ((slack_left > 0)); then
      is_slack="1"
      slack_left=$((slack_left - 1))
      echo "Executor ${name} is slack"
    fi

    # ${inst_id} is left unquoted so the helper receives exactly the same
    # argument list as before (an empty id is dropped, not passed as "").
    build/bin/redis_startup_helper localhost "${REDIS_PORT}" "${name}" \
      "${exec_ip}" "${EXECUTOR_PORT}" "${hw_code}" "${is_slack}" \
      ${inst_id}
  done
}

# Create all initial CPU workers
launch_initial_workers "CPU" "cpu" "${NUM_INIT_CPU_WORKERS}" \
  "${MACHINE_TYPE_CPU}" "0" 0
# Create all initial GPU workers (the only type that may have slack workers)
launch_initial_workers "GPU" "gpu" "${NUM_INIT_GPU_WORKERS}" \
  "${MACHINE_TYPE_GPU}" "2" "${SLACK_GPU}"
# Create all initial Inferentia workers
launch_initial_workers "Inferentia" "inf" "${NUM_INIT_INFERENTIA_WORKERS}" \
  "${MACHINE_TYPE_INFERENTIA}" "1" 0
# Make LOG_DIR if it does not exist
mkdir -p "${LOG_DIR}"

### Create model register frontend ###
# Replace any instance that is still running from a previous launch.
if [[ -n "$(pidof modelreg_server)" ]]; then
  pkill -f modelreg_server
fi
MODELREG_LOG="${LOG_DIR}modelreg_server.log"
build/bin/modelreg_server localhost "${REDIS_PORT}" > "${MODELREG_LOG}" 2>&1 &
# Wait one second for server to launch
sleep 1
# Check that modelreg_server is successfully running.
# stderr is merged into the pipe so that a "No such file or directory" error
# (missing heartbeat binary) is seen by grep as well.
if build/bin/modelreg_heartbeat 2>&1 \
  | grep -q "FAILED\|No such file or directory"; then
  echo "Heartbeat failed, check Model Registration Server"
  exit 1
fi
echo "Model Registration server successfully launched"
### Create query frontend ###
# Replace any query frontend that is still running from a previous launch.
if pidof queryfe_server > /dev/null; then
  pkill -f queryfe_server
fi
QUERYFE_LOG=${LOG_DIR}"queryfe_server.log"
# Launched in the background with stdout and stderr captured in the log file.
build/bin/queryfe_server localhost ${REDIS_PORT} ${MASTER_DECISION_MODE} \
  > ${QUERYFE_LOG} 2>&1 &
# Wait one second for server to launch
sleep 1
# Check that queryfe_server is successfully running: any failure marker in the
# heartbeat output (stdout or stderr) aborts the setup.
if build/bin/queryfe_heartbeat 2>&1 \
  | grep -q "FAILED\|No such file or directory"; then
  echo "Heartbeat failed, check Query Frontend Server"
  exit 1
fi
echo "Query Frontend server successfully launched"
### Create VM scaling daemon ###
# A daemon left over from a previous launch is always stopped, even when the
# daemon is switched off for this run.
if pidof master_vm_daemon > /dev/null; then
  pkill -f master_vm_daemon
fi

if [[ "${VM_DAEMON_MODE}" != "ON" ]]; then
  echo "Not using Master VM daemon."
else
  VMDAEMON_LOG=${LOG_DIR}"vmdaemon.log"
  # Positional arguments for master_vm_daemon, in the order it expects them.
  # Expansions are left unquoted so the resulting argument list is unchanged.
  vm_daemon_args=(
    localhost ${REDIS_PORT}
    ${CPUGPU_UTIL_THRESH} ${INFERENTIA_UTIL_THRESH}
    ${ZONE} ${KEY_NAME} ${WORKER_IMAGE}
    ${MACHINE_TYPE_GPU} ${MACHINE_TYPE_CPU}
    ${MACHINE_TYPE_INFERENTIA}
    ${STARTUP_SCRIPT} ${SECURITY_GROUP}
    ${MAX_TRY} ${IAM_ROLE} ${EXECUTOR_PORT}
    ${EXECUTOR_PREFIX} ${MIN_WORKERS}
    ${MAX_CPU_WORKERS} ${MAX_GPU_WORKERS}
    ${MAX_INFERENTIA_WORKERS}
    ${MASTER_IP} ${WORKER_AUTOSCALER}
    ${DELETE_MACHINES}
  )
  # Run in the background; stdout and stderr both go to the daemon log.
  build/bin/master_vm_daemon "${vm_daemon_args[@]}" > ${VMDAEMON_LOG} 2>&1 &
  echo "Master VM daemon successfully launched"
fi

echo "INFaaS is all set up!"
exit 0