# Generate the Python gRPC stubs for the generation service from
# inference_server/model_handler/grpc_utils/proto/generation.proto, rewrite the
# generated imports to be package-relative, and remove the *.py-e backup files
# that sed -i -e can leave behind on macOS.
gen-proto:
	pip install grpcio-tools==1.50.0
	mkdir -p inference_server/model_handler/grpc_utils/pb
	python -m grpc_tools.protoc -Iinference_server/model_handler/grpc_utils/proto --python_out=inference_server/model_handler/grpc_utils/pb --grpc_python_out=inference_server/model_handler/grpc_utils/pb inference_server/model_handler/grpc_utils/proto/generation.proto
	find inference_server/model_handler/grpc_utils/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
	touch inference_server/model_handler/grpc_utils/__init__.py
	touch inference_server/model_handler/grpc_utils/pb/__init__.py
	rm -rf inference_server/model_handler/grpc_utils/pb/*.py-e
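
# A minimal sanity check for the generated stubs (a sketch added here, not part
# of the upstream Makefile): grpc_tools.protoc names its output modules after
# the proto file, so generation.proto should yield generation_pb2.py and
# generation_pb2_grpc.py inside the pb/ package created above.
check-proto:
	python -c "from inference_server.model_handler.grpc_utils.pb import generation_pb2, generation_pb2_grpc; print('gRPC stubs importable')"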

# Serve BLOOM-176B (pre-sharded microsoft/bloom-deepspeed-inference-fp16
# checkpoint) with DeepSpeed-Inference in fp16 across the 8 visible GPUs.
# Each server target below passes its configuration through environment
# variables (MODEL_NAME, MODEL_CLASS, DEPLOYMENT_FRAMEWORK, DTYPE,
# MAX_INPUT_LENGTH, MAX_BATCH_SIZE, CUDA_VISIBLE_DEVICES) and then starts a
# single gunicorn worker serving inference_server.server:app on 127.0.0.1:5000.
bloom-176b:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=microsoft/bloom-deepspeed-inference-fp16 \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=ds_inference \
	DTYPE=fp16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

# Serve BLOOMZ-176B (bigscience/bloomz) with DeepSpeed-Inference in fp16
# across the 8 visible GPUs.
bloomz-176b:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=bigscience/bloomz \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=ds_inference \
	DTYPE=fp16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

# Serve bigscience/bloom-560m with HF Accelerate in bf16 on a single GPU;
# the small model allows a larger batch size.
bloom-560m:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=bigscience/bloom-560m \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=hf_accelerate \
	DTYPE=bf16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=32 \
	CUDA_VISIBLE_DEVICES=0 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

# Serve google/flan-t5-xxl (encoder-decoder, hence AutoModelForSeq2SeqLM) with
# HF Accelerate in bf16 on a single GPU.
flan-t5-xxl:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=google/flan-t5-xxl \
	MODEL_CLASS=AutoModelForSeq2SeqLM \
	DEPLOYMENT_FRAMEWORK=hf_accelerate \
	DTYPE=bf16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

# Serve google/ul2 (encoder-decoder) with HF Accelerate in bf16 on a single GPU.
ul2:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=google/ul2 \
	MODEL_CLASS=AutoModelForSeq2SeqLM \
	DEPLOYMENT_FRAMEWORK=hf_accelerate \
	DTYPE=bf16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'

# Serve Salesforce/codegen-16B-mono with HF Accelerate in bf16 on a single GPU.
codegen-mono:
	TOKENIZERS_PARALLELISM=false \
	MODEL_NAME=Salesforce/codegen-16B-mono \
	MODEL_CLASS=AutoModelForCausalLM \
	DEPLOYMENT_FRAMEWORK=hf_accelerate \
	DTYPE=bf16 \
	MAX_INPUT_LENGTH=2048 \
	MAX_BATCH_SIZE=4 \
	CUDA_VISIBLE_DEVICES=0 \
	gunicorn -t 0 -w 1 -b 127.0.0.1:5000 inference_server.server:app --access-logfile - --access-logformat '%(h)s %(t)s "%(r)s" %(s)s %(b)s'
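
# Hedged usage sketch (not part of the original Makefile): once one of the
# server targets above is running, it listens on 127.0.0.1:5000. The /generate/
# route and JSON fields below are assumptions about the inference_server HTTP
# API, not something this Makefile defines; adjust them to the actual server.
smoke-test:
	curl -s -X POST http://127.0.0.1:5000/generate/ \
		-H 'Content-Type: application/json' \
		-d '{"text": ["DeepSpeed is a"], "max_new_tokens": 40}'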