diff --git a/docs/modelserving/v1beta1/llm/torchserve/accelerate/README.md b/docs/modelserving/v1beta1/llm/torchserve/accelerate/README.md
new file mode 100644
index 000000000..a263346dd
--- /dev/null
+++ b/docs/modelserving/v1beta1/llm/torchserve/accelerate/README.md
@@ -0,0 +1,89 @@
+# Serve Large Language Model with Huggingface Accelerate
+
+This documentation explains how KServe supports large language model serving via `TorchServe`.
+A large language model here refers to a model that cannot fit into a single GPU and therefore
+needs to be sharded into multiple partitions across multiple GPUs.
+
+Huggingface Accelerate can load sharded checkpoints, and the maximum RAM usage is the size of
+the largest shard. By setting `device_map` to `"auto"`, `Accelerate` automatically determines where
+to put each layer of the model depending on the available resources.
+
+
+## Package the model
+
+1. Download the model `bigscience/bloom-7b1` from the Huggingface Hub by running:
+```bash
+python Download_model.py --model_name bigscience/bloom-7b1
+```
+
+1. Compress the downloaded model:
+```bash
+zip -r model.zip model/models--bigscience-bloom-7b1/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/
+```
+
+1. Package the model
+Create the `setup_config.json` file with the Accelerate settings:
+* Enable `low_cpu_mem_usage` to use Accelerate.
+* The recommended `max_memory` in `setup_config.json` is the maximum shard size.
+```json
+{
+    "revision": "main",
+    "max_memory": {
+        "0": "10GB",
+        "cpu": "10GB"
+    },
+    "low_cpu_mem_usage": true,
+    "device_map": "auto",
+    "offload_folder": "offload",
+    "offload_state_dict": true,
+    "torch_dtype":"float16",
+    "max_length":"80"
+}
+```
+
+Then build the model archive with `torch-model-archiver`:
+```bash
+torch-model-archiver --model-name bloom7b1 --version 1.0 --handler custom_handler.py --extra-files model.zip,setup_config.json
+```
+
+1. Upload the model archive to your cloud storage, or use the Bloom model already uploaded to the KServe GCS bucket.
+
+## Serve the large language model with InferenceService
+
+```yaml
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: "bloom7b1"
+spec:
+  predictor:
+    pytorch:
+      runtimeVersion: 0.8.2
+      storageUri: gs://kfserving-examples/models/torchserve/llm/Huggingface_accelerate/bloom
+      resources:
+        limits:
+          cpu: "2"
+          memory: 32Gi
+          nvidia.com/gpu: "2"
+        requests:
+          cpu: "2"
+          memory: 32Gi
+          nvidia.com/gpu: "2"
+```
+
+## Run the Inference
+
+The request below assumes that your ingress can be accessed at `${INGRESS_HOST}:${INGRESS_PORT}`;
+if you do not know your ingress IP and port, follow [these instructions](../../../../../get_started/first_isvc.md#4-determine-the-ingress-ip-and-ports)
+to determine them.
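+
+For example, with a default Istio ingress gateway, the host and port can typically be looked up as
+follows (a sketch; the gateway service name and namespace depend on your installation):
+
+```bash
+INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].port}')
+```
+
+Then send the inference request: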
+
+```bash
+SERVICE_HOSTNAME=$(kubectl get inferenceservice bloom7b1 -o jsonpath='{.status.url}' | cut -d "/" -f 3)
+
+curl -v \
+  -H "Host: ${SERVICE_HOSTNAME}" \
+  -H "Content-Type: application/json" \
+  -d @./text.json \
+  http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/bloom7b1:predict
+
+{"predictions":["My dog is cute.\nNice.\n- Hey, Mom.\n- Yeah?\nWhat color's your dog?\n- It's gray.\n- Gray?\nYeah.\nIt looks gray to me.\n- Where'd you get it?\n- Well, Dad says it's kind of...\n- Gray?\n- Gray.\nYou got a gray dog?\n- It's gray.\n- Gray.\nIs your dog gray?\nAre you sure?\nNo.\nYou sure"]}
+```
diff --git a/docs/modelserving/v1beta1/llm/torchserve/accelerate/bloom.yaml b/docs/modelserving/v1beta1/llm/torchserve/accelerate/bloom.yaml
new file mode 100644
index 000000000..b01c1ae47
--- /dev/null
+++ b/docs/modelserving/v1beta1/llm/torchserve/accelerate/bloom.yaml
@@ -0,0 +1,18 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: "bloom-7b1"
+spec:
+  predictor:
+    pytorch:
+      runtimeVersion: 0.8.2
+      storageUri: gs://kfserving-examples/models/torchserve/llm/Huggingface_accelerate/bloom
+      resources:
+        limits:
+          cpu: "2"
+          memory: 32Gi
+          nvidia.com/gpu: "2"
+        requests:
+          cpu: "2"
+          memory: 32Gi
+          nvidia.com/gpu: "2"
diff --git a/docs/modelserving/v1beta1/llm/torchserve/accelerate/config.properties b/docs/modelserving/v1beta1/llm/torchserve/accelerate/config.properties
new file mode 100644
index 000000000..e23a7f906
--- /dev/null
+++ b/docs/modelserving/v1beta1/llm/torchserve/accelerate/config.properties
@@ -0,0 +1,13 @@
+inference_address=http://0.0.0.0:8085
+management_address=http://0.0.0.0:8085
+metrics_address=http://0.0.0.0:8082
+grpc_inference_port=7070
+grpc_management_port=7071
+enable_metrics_api=true
+metrics_format=prometheus
+number_of_netty_threads=4
+number_of_gpu=2
+job_queue_size=10
+enable_envvars_config=true
+model_store=/mnt/models/model-store
+model_snapshot={"name":"startup.cfg","modelCount":1,"models":{"bloom7b1":{"1.0":{"defaultVersion":true,"marName":"bloom7b1.mar","minWorkers":1,"maxWorkers":5,"batchSize":1,"maxBatchDelay":5000,"responseTimeout":120}}}}
\ No newline at end of file
diff --git a/docs/modelserving/v1beta1/llm/torchserve/accelerate/custom_handler.py b/docs/modelserving/v1beta1/llm/torchserve/accelerate/custom_handler.py
new file mode 100644
index 000000000..ba0e1ee75
--- /dev/null
+++ b/docs/modelserving/v1beta1/llm/torchserve/accelerate/custom_handler.py
@@ -0,0 +1,164 @@
+import json
+import logging
+import os
+import zipfile
+from abc import ABC
+
+import torch
+import transformers
+from transformers import BloomForCausalLM, BloomTokenizerFast
+
+from ts.torch_handler.base_handler import BaseHandler
+
+logger = logging.getLogger(__name__)
+logger.info("Transformers version %s", transformers.__version__)
+
+
+TORCH_DTYPES = {
+    "float16": torch.float16,
+    "float32": torch.float32,
+    "float64": torch.float64,
+}
+
+
+class TransformersSeqClassifierHandler(BaseHandler, ABC):
+    """
+    Transformers handler class for text generation with the Bloom model.
+    """
+
+    def __init__(self):
+        super(TransformersSeqClassifierHandler, self).__init__()
+        self.initialized = False
+
+    def initialize(self, ctx):
+        """In this initialize function, the Bloom model and tokenizer are
+        loaded using the settings from setup_config.json.
+        Args:
+            ctx (context): It is a JSON Object containing information
+            pertaining to the model artifacts parameters.
+        """
+        self.manifest = ctx.manifest
+        properties = ctx.system_properties
+        model_dir = properties.get("model_dir")
+
+        self.device = torch.device(
+            "cuda:" + str(properties.get("gpu_id"))
+            if torch.cuda.is_available() and properties.get("gpu_id") is not None
+            else "cpu"
+        )
+        # Extract the zipped model checkpoint into the model directory.
+        with zipfile.ZipFile(model_dir + "/model.zip", "r") as zip_ref:
+            zip_ref.extractall(model_dir + "/model")
+
+        # Read the Accelerate settings from setup_config.json.
+        setup_config_path = os.path.join(model_dir, "setup_config.json")
+        if os.path.isfile(setup_config_path):
+            with open(setup_config_path) as setup_config_file:
+                self.setup_config = json.load(setup_config_file)
+        else:
+            logger.warning("Missing the setup_config.json file.")
+
+        self.model = BloomForCausalLM.from_pretrained(
+            model_dir + "/model",
+            revision=self.setup_config["revision"],
+            max_memory={
+                int(key) if key.isnumeric() else key: value
+                for key, value in self.setup_config["max_memory"].items()
+            },
+            low_cpu_mem_usage=self.setup_config["low_cpu_mem_usage"],
+            device_map=self.setup_config["device_map"],
+            offload_folder=self.setup_config["offload_folder"],
+            offload_state_dict=self.setup_config["offload_state_dict"],
+            torch_dtype=TORCH_DTYPES[self.setup_config["torch_dtype"]],
+        )
+
+        self.tokenizer = BloomTokenizerFast.from_pretrained(
+            model_dir + "/model", return_tensors="pt"
+        )
+
+        self.model.eval()
+        logger.info("Transformer model from path %s loaded successfully", model_dir)
+
+        self.initialized = True
+
+    def preprocess(self, requests):
+        """Basic text preprocessing: tokenize the input text of each request.
+        Args:
+            requests (list): A list of inference requests, each carrying the input
+            text in its "data" or "body" field.
+        Returns:
+            tuple: Batched input_ids and attention_mask tensors for the tokenized input texts.
+        """
+        input_ids_batch = None
+        attention_mask_batch = None
+        for idx, data in enumerate(requests):
+            input_text = data.get("data")
+            if input_text is None:
+                input_text = data.get("body")
+            if isinstance(input_text, (bytes, bytearray)):
+                input_text = input_text.decode("utf-8")
+
+            max_length = self.setup_config["max_length"]
+            logger.info("Received text: '%s'", input_text)
+
+            inputs = self.tokenizer.encode_plus(
+                input_text,
+                max_length=int(max_length),
+                pad_to_max_length=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+
+            input_ids = inputs["input_ids"].to(self.device)
+            attention_mask = inputs["attention_mask"].to(self.device)
+            # Make a batch out of the received requests; attention masks are
+            # passed for cases where input tokens are padded.
+            if input_ids.shape is not None:
+                if input_ids_batch is None:
+                    input_ids_batch = input_ids
+                    attention_mask_batch = attention_mask
+                else:
+                    input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)
+                    attention_mask_batch = torch.cat(
+                        (attention_mask_batch, attention_mask), 0
+                    )
+        return (input_ids_batch, attention_mask_batch)
+
+    def inference(self, input_batch):
+        """Generate text for the received input batch using the sharded
+        Bloom checkpoint.
+        Args:
+            input_batch (tuple): The batched input_ids and attention_mask tensors
+            produced by the preprocess function.
+        Returns:
+            list: The generated text for each input in the batch.
+        """
+        (input_ids_batch, _) = input_batch
+        inferences = []
+        input_ids_batch = input_ids_batch.to(self.device)
+        outputs = self.model.generate(
+            input_ids_batch,
+            do_sample=True,
+            max_new_tokens=int(self.setup_config["max_length"]),
+            top_p=0.95,
+            top_k=60,
+        )
+        for i, _ in enumerate(outputs):
+            inferences.append(
+                self.tokenizer.decode(outputs[i], skip_special_tokens=True)
+            )
+
+        logger.info("Generated text: '%s'", inferences)
+
+        print("Generated text", inferences)
+        return inferences
+
+    def postprocess(self, inference_output):
+        """Post-process function that converts the generated text into a
+        TorchServe-readable format.
+        Args:
+            inference_output (list): The generated text for the input batch.
+        Returns:
+            (list): The generated text, returned unchanged.
+        """
+        return inference_output
diff --git a/docs/modelserving/v1beta1/llm/torchserve/accelerate/setup_config.json b/docs/modelserving/v1beta1/llm/torchserve/accelerate/setup_config.json
new file mode 100644
index 000000000..95ffd8404
--- /dev/null
+++ b/docs/modelserving/v1beta1/llm/torchserve/accelerate/setup_config.json
@@ -0,0 +1,13 @@
+{
+    "revision": "main",
+    "max_memory": {
+        "0": "10GB",
+        "cpu": "10GB"
+    },
+    "low_cpu_mem_usage": true,
+    "device_map": "auto",
+    "offload_folder": "offload",
+    "offload_state_dict": true,
+    "torch_dtype":"float16",
+    "max_length":"80"
+}
diff --git a/docs/modelserving/v1beta1/llm/torchserve/accelerate/text.json b/docs/modelserving/v1beta1/llm/torchserve/accelerate/text.json
new file mode 100644
index 000000000..76b42c30c
--- /dev/null
+++ b/docs/modelserving/v1beta1/llm/torchserve/accelerate/text.json
@@ -0,0 +1,5 @@
+{
+    "instances": [
+        "Today the weather is really nice and I am planning on"
+    ]
+}
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 5762330e5..dfd79d586 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -39,6 +39,9 @@ nav:
         - Torchscript: modelserving/v1beta1/triton/torchscript/README.md
         - Tensorflow: modelserving/v1beta1/triton/bert/README.md
       - AMD: modelserving/v1beta1/amd/README.md
+      - LLM Runtime:
+        - TorchServe LLM:
+          - Bloom7b1: modelserving/v1beta1/llm/torchserve/accelerate/README.md
       - How to write a custom predictor: modelserving/v1beta1/custom/custom_model/README.md
       - Multi Model Serving:
         - Overview: