From fdc9304a0579f3697067f359668f20dd63952cf2 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 6 Jun 2024 16:18:06 +0100 Subject: [PATCH 01/15] hf_eval first commit --- .../evaluation/hf_evaluate_config.yaml | 25 +++++ src/lm_buddy/buddy.py | 4 + src/lm_buddy/cli/evaluate.py | 15 ++- src/lm_buddy/configs/jobs/__init__.py | 6 +- src/lm_buddy/configs/jobs/hf_evaluate.py | 70 +++++++++++++ src/lm_buddy/jobs/asset_loader.py | 21 +++- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 98 +++++++++++++++++++ 7 files changed, 236 insertions(+), 3 deletions(-) create mode 100644 examples/configs/evaluation/hf_evaluate_config.yaml create mode 100644 src/lm_buddy/configs/jobs/hf_evaluate.py create mode 100644 src/lm_buddy/jobs/evaluation/hf_evaluate.py diff --git a/examples/configs/evaluation/hf_evaluate_config.yaml b/examples/configs/evaluation/hf_evaluate_config.yaml new file mode 100644 index 0000000..411cf24 --- /dev/null +++ b/examples/configs/evaluation/hf_evaluate_config.yaml @@ -0,0 +1,25 @@ +name: "lm-buddy-hf-evaluate" + +dataset: + path: "s3://platform-storage/datasets/dialogsum" + +# Settings specific to hf_evaluate +evaluation: + metrics: ["rouge"] + # enable/disable tqdm to track eval progress + enable_tqdm: False + # rely on HF pipeline for summarization + use_pipeline: True + +# Model to evaluate +model: + path: "hf://facebook/bart-large-cnn" + +quantization: + load_in_4bit: True + bnb_4bit_quant_type: "fp4" + +# # Tracking info for where to log the run results +# tracking: +# project: "lm-buddy-examples" +# entity: "sample" diff --git a/src/lm_buddy/buddy.py b/src/lm_buddy/buddy.py index 4e6be5d..08586b0 100644 --- a/src/lm_buddy/buddy.py +++ b/src/lm_buddy/buddy.py @@ -3,12 +3,14 @@ from lm_buddy.configs.jobs import ( EvaluationJobConfig, FinetuningJobConfig, + HuggingFaceEvalJobConfig, JobConfig, LMHarnessJobConfig, PrometheusJobConfig, RagasJobConfig, ) from lm_buddy.jobs.common import EvaluationResult, FinetuningResult, JobType +from lm_buddy.jobs.evaluation.hf_evaluate import run_hf_evaluation from lm_buddy.jobs.evaluation.lm_harness import run_lm_harness from lm_buddy.jobs.evaluation.prometheus import run_prometheus from lm_buddy.jobs.evaluation.ragas import run_ragas @@ -66,6 +68,8 @@ def evaluate(self, config: EvaluationJobConfig) -> EvaluationResult: result = run_prometheus(prometheus_config) case RagasJobConfig() as ragas_config: result = run_ragas(ragas_config) + case HuggingFaceEvalJobConfig() as hf_eval_config: + result = run_hf_evaluation(hf_eval_config) case _: raise ValueError(f"Invlid configuration for evaluation: {type(config)}") self._generate_artifact_lineage(config, result.artifacts, JobType.EVALUATION) diff --git a/src/lm_buddy/cli/evaluate.py b/src/lm_buddy/cli/evaluate.py index 73f58a6..90ca0b8 100644 --- a/src/lm_buddy/cli/evaluate.py +++ b/src/lm_buddy/cli/evaluate.py @@ -2,7 +2,12 @@ from lm_buddy import LMBuddy from lm_buddy.cli.utils import parse_config_option -from lm_buddy.configs.jobs import LMHarnessJobConfig, PrometheusJobConfig, RagasJobConfig +from lm_buddy.configs.jobs import ( + HuggingFaceEvalJobConfig, + LMHarnessJobConfig, + PrometheusJobConfig, + RagasJobConfig, +) @click.group(name="evaluate", help="Run an LM Buddy evaluation job.") @@ -32,3 +37,11 @@ def ragas_command(config: str) -> None: config = parse_config_option(RagasJobConfig, config) buddy = LMBuddy() buddy.evaluate(config) + + +@group.command("huggingface", help="Run the HuggingFace evaluation job.") +@click.option("--config", type=str) +def huggingface_command(config: str) -> 
None:
+    config = parse_config_option(HuggingFaceEvalJobConfig, config)
+    buddy = LMBuddy()
+    buddy.evaluate(config)
diff --git a/src/lm_buddy/configs/jobs/__init__.py b/src/lm_buddy/configs/jobs/__init__.py
index b05459b..9509631 100644
--- a/src/lm_buddy/configs/jobs/__init__.py
+++ b/src/lm_buddy/configs/jobs/__init__.py
@@ -1,10 +1,13 @@
 from lm_buddy.configs.jobs.common import JobConfig
 from lm_buddy.configs.jobs.finetuning import FinetuningJobConfig
+from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig
 from lm_buddy.configs.jobs.lm_harness import LMHarnessJobConfig
 from lm_buddy.configs.jobs.prometheus import PrometheusJobConfig
 from lm_buddy.configs.jobs.ragas import RagasJobConfig
 
-EvaluationJobConfig = LMHarnessJobConfig | PrometheusJobConfig | RagasJobConfig
+EvaluationJobConfig = (
+    LMHarnessJobConfig | PrometheusJobConfig | RagasJobConfig | HuggingFaceEvalJobConfig
+)
 
 __all__ = [
     "JobConfig",
@@ -12,5 +15,6 @@
     "LMHarnessJobConfig",
     "PrometheusJobConfig",
     "RagasJobConfig",
+    "HuggingFaceEvalJobConfig",
     "EvaluationJobConfig",
 ]
diff --git a/src/lm_buddy/configs/jobs/hf_evaluate.py b/src/lm_buddy/configs/jobs/hf_evaluate.py
new file mode 100644
index 0000000..2e9c438
--- /dev/null
+++ b/src/lm_buddy/configs/jobs/hf_evaluate.py
@@ -0,0 +1,70 @@
+from pydantic import Field, conlist, field_validator, model_validator
+
+from lm_buddy.configs.common import LMBuddyConfig
+from lm_buddy.configs.huggingface import (
+    AutoModelConfig,
+    AutoTokenizerConfig,
+    DatasetConfig,
+    QuantizationConfig,
+)
+from lm_buddy.configs.jobs.common import JobConfig
+from lm_buddy.configs.vllm import VLLMCompletionsConfig
+from lm_buddy.paths import AssetPath
+
+
+class HuggingFaceEvaluationConfig(LMBuddyConfig):
+    """Misc settings provided to a HuggingFace evaluation job."""
+
+    metrics: conlist(str, min_length=1)
+    use_pipeline: bool = False
+    enable_tqdm: bool = False
+
+
+class HuggingFaceEvalJobConfig(JobConfig):
+    """Configuration to run a HuggingFace evaluation job."""
+
+    dataset: DatasetConfig = Field(
+        description="Dataset of text samples to summarize and evaluate."
+ ) + evaluation: HuggingFaceEvaluationConfig + model: AutoModelConfig | VLLMCompletionsConfig + quantization: QuantizationConfig | None = None + tokenizer: AutoTokenizerConfig + + @model_validator(mode="before") + def ensure_tokenizer_config(cls, values): + """Set the tokenizer to the model path when not explicitly provided.""" + if values.get("tokenizer") is None: + values["tokenizer"] = {} + match values["model"]: + case str() as model_path: + values["tokenizer"]["path"] = model_path + case dict() as model_data: + values["tokenizer"]["path"] = model_data["path"] + case AutoModelConfig() as model_config: + values["tokenizer"]["path"] = model_config.path + # No fallback necessary, downstream validation will flag invalid model types + return values + + @field_validator("model", mode="before") + def validate_model_arg(cls, x): + """Allow for passing just a path string as the model argument.""" + if isinstance(x, str): + return AutoModelConfig(path=x) + return x + + @field_validator("tokenizer", mode="before") + def validate_tokenizer_arg(cls, x): + """Allow for passing just a path string as the tokenizer argument.""" + if isinstance(x, str): + return AutoTokenizerConfig(path=x) + return x + + def asset_paths(self) -> list[AssetPath]: + match self.model: + case AutoModelConfig() as config: + return {self.dataset.path, config.path, self.tokenizer.path} + case VLLMCompletionsConfig() as config if config.inference.engine is not None: + return {self.dataset.path, config.inference.engine, self.tokenizer.path} + case _: + return {} diff --git a/src/lm_buddy/jobs/asset_loader.py b/src/lm_buddy/jobs/asset_loader.py index ebddb80..d7663f3 100644 --- a/src/lm_buddy/jobs/asset_loader.py +++ b/src/lm_buddy/jobs/asset_loader.py @@ -8,11 +8,16 @@ from transformers import ( AutoConfig, AutoModelForCausalLM, + AutoModelForSeq2SeqLM, AutoTokenizer, PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, ) +from transformers.models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, +) from lm_buddy.configs.huggingface import ( AutoModelConfig, @@ -120,7 +125,21 @@ def load_pretrained_model( # TODO: HuggingFace has many AutoModel classes with different "language model heads" # Can we abstract this to load with any type of AutoModel class? model_path = self.resolve_asset_path(config.path) - return AutoModelForCausalLM.from_pretrained( + + # load config first to get the model type + model_config = self.load_pretrained_config(config) + # print(model_config) + + if getattr(model_config, "model_type") in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES: + automodel_class = AutoModelForSeq2SeqLM + elif getattr(model_config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: + automodel_class = AutoModelForCausalLM + else: + logger.info("Model type not supported. 
Trying AutoModelForCausalLM") + automodel_class = AutoModelForCausalLM + # print(automodel_class) + + return automodel_class.from_pretrained( pretrained_model_name_or_path=model_path, trust_remote_code=config.trust_remote_code, torch_dtype=config.torch_dtype, diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py new file mode 100644 index 0000000..f48eb7c --- /dev/null +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -0,0 +1,98 @@ +""" +lm-buddy entrypoint to run summary evaluation using huggingface eval +""" + +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import torch +from datasets import Dataset +from loguru import logger +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline + +from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig +from lm_buddy.constants import LM_BUDDY_RESULTS_PATH +from lm_buddy.jobs.asset_loader import ( + HuggingFaceDatasetLoader, + HuggingFaceModelLoader, + HuggingFaceTokenizerLoader, +) +from lm_buddy.jobs.common import EvaluationResult + + +@dataclass +class BadResponseError(Exception): + def __init__(self, message, error=None): + self.message = message + self.error = error + + +def run_eval(config: HuggingFaceEvalJobConfig) -> Path: + # Init loaders + hf_dataset_loader = HuggingFaceDatasetLoader() + hf_model_loader = HuggingFaceModelLoader() + hf_tokenizer_loader = HuggingFaceTokenizerLoader() + + # Load dataset given its URI + dataset = hf_dataset_loader.load_dataset(config.dataset) + + # Enable / disable tqdm + input_samples = dataset.select(range(10))["examples"] + dataset_iterable = tqdm(input_samples) if config.evaluation.enable_tqdm else input_samples + results = [] + + # depending on config, use the summarizer pipeline or directly call the model + # for inference + if config.evaluation.use_pipeline: + logger.info("Using summarization pipeline") + summarizer = pipeline( + "summarization", + model=hf_model_loader.resolve_asset_path(config.model.path), + device=0 if torch.cuda.is_available() else -1, + ) + + t = time.time() + # for sample_txt in dataset_iterable: + # # summarizer output is a list (1 element in this case) of dict with key = "summary_text" + # results += summarizer(sample_txt, min_length=30, do_sample=False) + + # alternative: run on the whole dataset + results = summarizer(dataset.select(range(10))["examples"], min_length=30, do_sample=False) + + logger.info(f"Summarization performed in {time.time()-t} seconds") + + results = [r["summary_text"] for r in results] + + else: + logger.info("Using direct HF model invocation") + + device = "cuda" if torch.cuda.is_available() else "cpu" + model = hf_model_loader.load_pretrained_model(config.model).to(device) + tokenizer = hf_tokenizer_loader.load_pretrained_tokenizer(config.tokenizer) + + for sample_txt in dataset_iterable: + inputs = tokenizer(sample_txt, truncation=True, padding=True, return_tensors="pt").to( + device + ) + generated_ids = model.generate(**inputs, max_new_tokens=256) + output_txt = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + results += output_txt + + print(results) + + return "/tmp/dataset" + + +def run_hf_evaluation(config: HuggingFaceEvalJobConfig) -> EvaluationResult: + # Run eval and store output in local filename + result_dataset_path = run_eval(config) + logger.info(f"Prometheus evaluation dataset stored at {result_dataset_path}") + + return EvaluationResult( + 
artifacts=[], + dataset_path=result_dataset_path, + tables={}, + ) From f606dbfa88cdd3b7c31d9c69250159d246649189 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Mon, 10 Jun 2024 11:39:43 +0200 Subject: [PATCH 02/15] Added bert_score dependency in pyproject.toml --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6eff516..9221d9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "lm-buddy" -version = "0.10.4" +version = "0.10.5" authors = [ { name = "Sean Friedowitz", email = "sean@mozilla.ai" }, { name = "Aaron Gonzales", email = "aaron@mozilla.ai" }, @@ -37,6 +37,7 @@ dependencies = [ "peft==0.7.1", "trl==0.7.10", "bitsandbytes==0.42.0", + "bert_score==0.3.13", # Evaluation frameworks "lm-eval==0.4.2", "einops==0.7.0", From 8ff31b4c09a040f3332e88b70fb570b502c4e241 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Mon, 10 Jun 2024 11:40:58 +0200 Subject: [PATCH 03/15] Updated hf-eval code with all metrics --- .../evaluation/hf_evaluate_config.yaml | 2 +- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 26 ++++--- src/lm_buddy/jobs/evaluation/metrics.py | 70 +++++++++++++++++++ 3 files changed, 88 insertions(+), 10 deletions(-) create mode 100644 src/lm_buddy/jobs/evaluation/metrics.py diff --git a/examples/configs/evaluation/hf_evaluate_config.yaml b/examples/configs/evaluation/hf_evaluate_config.yaml index 411cf24..39db968 100644 --- a/examples/configs/evaluation/hf_evaluate_config.yaml +++ b/examples/configs/evaluation/hf_evaluate_config.yaml @@ -5,7 +5,7 @@ dataset: # Settings specific to hf_evaluate evaluation: - metrics: ["rouge"] + metrics: ["rouge", "bertscore", "meteor"] # enable/disable tqdm to track eval progress enable_tqdm: False # rely on HF pipeline for summarization diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index f48eb7c..3b72a37 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -21,6 +21,7 @@ HuggingFaceTokenizerLoader, ) from lm_buddy.jobs.common import EvaluationResult +from lm_buddy.jobs.evaluation.metrics import EvaluationMetrics @dataclass @@ -42,7 +43,7 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # Enable / disable tqdm input_samples = dataset.select(range(10))["examples"] dataset_iterable = tqdm(input_samples) if config.evaluation.enable_tqdm else input_samples - results = [] + predictions = [] # depending on config, use the summarizer pipeline or directly call the model # for inference @@ -55,16 +56,17 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: ) t = time.time() - # for sample_txt in dataset_iterable: - # # summarizer output is a list (1 element in this case) of dict with key = "summary_text" - # results += summarizer(sample_txt, min_length=30, do_sample=False) + for sample_txt in dataset_iterable: + # summarizer output is a list (1 element in this case) of dict with key = "summary_text" + predictions += summarizer(sample_txt, min_length=30, do_sample=False) - # alternative: run on the whole dataset - results = summarizer(dataset.select(range(10))["examples"], min_length=30, do_sample=False) + # alternative: run on the whole dataset (does not seem to be faster) + # TODO: test on GPU and changing #workers in pipeline definition + # results = summarizer(input_samples, min_length=30, do_sample=False) logger.info(f"Summarization performed in {time.time()-t} seconds") - results 
= [r["summary_text"] for r in results] + predictions = [r["summary_text"] for r in predictions] else: logger.info("Using direct HF model invocation") @@ -79,9 +81,15 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: ) generated_ids = model.generate(**inputs, max_new_tokens=256) output_txt = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - results += output_txt + predictions += output_txt + + print(predictions) + + # Start evaluation + em = EvaluationMetrics(config.evaluation.metrics) + evaluation_results = em.run_all(predictions, input_samples) - print(results) + print(evaluation_results) return "/tmp/dataset" diff --git a/src/lm_buddy/jobs/evaluation/metrics.py b/src/lm_buddy/jobs/evaluation/metrics.py new file mode 100644 index 0000000..82ce3f7 --- /dev/null +++ b/src/lm_buddy/jobs/evaluation/metrics.py @@ -0,0 +1,70 @@ +import evaluate +import numpy as np +from loguru import logger + + +class EvaluationMetrics: + def __init__(self, metrics): + self._supported_metrics = { + "rouge": self._rouge, + "meteor": self._meteor, + "bertscore": self._bertscore, + } + + self._chosen_metrics = set(metrics).intersection(set(self._supported_metrics.keys())) + self._unsupported_metrics = set(metrics).difference(set(self._supported_metrics.keys())) + + if len(self._chosen_metrics) == 0: + logger.info("No valid metrics selected") + else: + logger.info(f"Chosen metrics: {self._chosen_metrics}") + + if len(self._unsupported_metrics) > 0: + logger.info(f"Unsupported metrics: {self._unsupported_metrics}") + + def _rouge(self, pred, ref): + ev = evaluate.load("rouge") + + # compute with use_aggregator = False to get individual scores + evals = ev.compute(predictions=pred, references=ref, use_aggregator=False) + + # calculate mean for each of the submetrics (rouge1, rouge2, rougeL, rougeLsum) + for k in ["rouge1", "rouge2", "rougeL", "rougeLsum"]: + evals[f"{k}_mean"] = np.mean(evals[k]) + + return evals + + def _meteor(self, pred, ref): + ev = evaluate.load("meteor") + + # initialize dictionary with metric name + evals = {"meteor": []} + + # run sample-wise evals (as default implementation only returns mean value) + for p, r in zip(pred, ref): + evals["meteor"].append(ev.compute(predictions=[p], references=[r])["meteor"]) + + # calculate mean + evals[f"meteor_mean"] = np.mean(evals["meteor"]) + + return evals + + def _bertscore(self, pred, ref): + ev = evaluate.load("bertscore") + + # calculate evals (the default is not to aggregate them) + evals = ev.compute(predictions=pred, references=ref, lang="en") + + # calculate mean for each of the submetrics (precision, recall, f1) + for k in ["precision", "recall", "f1"]: + evals[f"{k}_mean"] = np.mean(evals[k]) + + return evals + + def run_all(self, pred, ref): + results = {} + + for metric in self._chosen_metrics: + results[metric] = self._supported_metrics[metric](pred, ref) + + return results From 6a43bcc95cd7492dd737fd19afb4eb760f7609d3 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 11 Jun 2024 11:21:05 +0200 Subject: [PATCH 04/15] Added s3fs to pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 9221d9b..941afda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "pydantic-yaml==1.2.0", "ray[default]==2.9.3", "loguru==0.7.2", + "s3fs=2024.6.0", # HuggingFace "datasets>=2.17.1", "transformers==4.36.2", From c5a701b427e54c3621d3840e5701a97828244e0d Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 11 Jun 2024 17:09:37 
+0200 Subject: [PATCH 05/15] Fixed s3fs issue with pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 941afda..db9458f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "pydantic-yaml==1.2.0", "ray[default]==2.9.3", "loguru==0.7.2", - "s3fs=2024.6.0", + "s3fs", # HuggingFace "datasets>=2.17.1", "transformers==4.36.2", From 662ee950d5f83ac0f3be5e6ab8a9b6b7905147ac Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Wed, 12 Jun 2024 14:28:39 +0200 Subject: [PATCH 06/15] Adding support for oai:// prefix --- src/lm_buddy/jobs/asset_loader.py | 2 +- src/lm_buddy/paths.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lm_buddy/jobs/asset_loader.py b/src/lm_buddy/jobs/asset_loader.py index d7663f3..c35428e 100644 --- a/src/lm_buddy/jobs/asset_loader.py +++ b/src/lm_buddy/jobs/asset_loader.py @@ -45,7 +45,7 @@ def resolve_asset_path(self, path: AssetPath) -> str: The returned string has its `PathPrefix` stripped away.. """ raw_path = strip_path_prefix(path) - if path.startswith((PathPrefix.FILE, PathPrefix.HUGGINGFACE)): + if path.startswith((PathPrefix.FILE, PathPrefix.HUGGINGFACE, PathPrefix.OPENAI)): return raw_path elif path.startswith(PathPrefix.WANDB): artifact = get_artifact_from_api(raw_path) diff --git a/src/lm_buddy/paths.py b/src/lm_buddy/paths.py index 633bce0..79244cf 100644 --- a/src/lm_buddy/paths.py +++ b/src/lm_buddy/paths.py @@ -14,6 +14,7 @@ class PathPrefix(str, Enum): HUGGINGFACE = "hf://" WANDB = "wandb://" S3 = "s3://" + OPENAI = "oai://" def strip_path_prefix(path: str) -> str: @@ -53,6 +54,9 @@ def validate_asset_path(path: str) -> "AssetPath": # e.g. if the assumption is we always want a file or a dir, we could # use https://s3pathlib.readthedocs.io to verify (.is_file() or ._is_dir()) pass + elif path.startswith(PathPrefix.OPENAI): + # TODO: Validate the OAI path structure? + pass else: allowed_prefixes = {x.value for x in PathPrefix} raise ValueError(f"'{path}' does not begin with an allowed prefix: {allowed_prefixes}") From dbe9d8472e650329c351cba77632cef4144193f5 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Wed, 12 Jun 2024 14:29:34 +0200 Subject: [PATCH 07/15] Added optional max_retries and system_prompt params to InferenceServerConfig --- src/lm_buddy/configs/vllm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/lm_buddy/configs/vllm.py b/src/lm_buddy/configs/vllm.py index 431f011..a5bb10e 100644 --- a/src/lm_buddy/configs/vllm.py +++ b/src/lm_buddy/configs/vllm.py @@ -12,12 +12,18 @@ class InferenceServerConfig(LMBuddyConfig): Note: This configuration is intended to be generic and not bound to the interface of any specific training/evaluation framework. See `LocalChatCompletionConfig` - or `vLLMCompleptionsConfig` for intended usage alongside a third-party framework. + or `vLLMCompletionsConfig` for intended usage alongside a third-party framework. 
""" base_url: str engine: AssetPath + # optional system prompt to be used by default in chat completions + system_prompt: str | None = None + + # max number of retries when communication with server fails + max_retries: int | None = None + class VLLMCompletionsConfig(LMBuddyConfig): """Configuration for a vLLM-based completions service From 91a2e984572b2bd5e18692221f59e5d520f44461 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Wed, 12 Jun 2024 14:32:34 +0200 Subject: [PATCH 08/15] Updated hf_evaluate with model_clients support, results saving, better config --- .../evaluation/hf_evaluate_config.yaml | 58 +++++++-- src/lm_buddy/configs/jobs/hf_evaluate.py | 19 ++- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 123 ++++++++++-------- src/lm_buddy/jobs/model_clients.py | 111 ++++++++++++++++ 4 files changed, 242 insertions(+), 69 deletions(-) create mode 100644 src/lm_buddy/jobs/model_clients.py diff --git a/examples/configs/evaluation/hf_evaluate_config.yaml b/examples/configs/evaluation/hf_evaluate_config.yaml index 39db968..44449df 100644 --- a/examples/configs/evaluation/hf_evaluate_config.yaml +++ b/examples/configs/evaluation/hf_evaluate_config.yaml @@ -1,25 +1,65 @@ name: "lm-buddy-hf-evaluate" +# Input dataset path dataset: path: "s3://platform-storage/datasets/dialogsum" -# Settings specific to hf_evaluate + +# Settings specific to the hf_evaluate entrypoint evaluation: - metrics: ["rouge", "bertscore", "meteor"] + # metrics to be used for the evaluation + # (you can add "rouge", "meteor", and "bertscore" atm) + metrics: ["rouge", "meteor"] # enable/disable tqdm to track eval progress - enable_tqdm: False - # rely on HF pipeline for summarization + # (useful when running interactively, noisy on ray logs) + enable_tqdm: True + # rely on HF pipeline for summarization (ignored if using OAI API) use_pipeline: True + # perform inference / evaluation on the first max_samples only + max_samples: 10 + # output file path + # - if you provide a path complete with a filename, results will be stored in it + # - if you provide a dir, results will be stored in //eval_results.json + # - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results) + # storage_path: "s3://platform-storage/experiments/results/" + +# Model to evaluate. Choose one of the following options by uncommenting it -# Model to evaluate +# 1. Local model +# - Provide model path to load the model locally +# - Make sure you add quantization details (see below) if the model is too large +# - Optionally, add a tokenizer (the one matching the specified model name is the default) model: path: "hf://facebook/bart-large-cnn" -quantization: - load_in_4bit: True - bnb_4bit_quant_type: "fp4" +# # 2. OpenAI +# # - The base_url is fixed +# # - Choose an engine name (see https://platform.openai.com/docs/models) +# # - Customize the system prompt if needed +# model: +# inference: +# base_url: "https://api.openai.com/v1" +# engine: "oai://gpt-4-turbo" +# system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences." +# max_retries: 3 + +# # 3. OpenAI - compatible model +# # - Works with local/remote vLLM-served models and llamafiles +# # - Provide base_url and engine +# # - Customize the system prompt if needed +# model: +# inference: +# base_url: "http://localhost:8081/v1" +# engine: "hf://mistralai/mistral-7b-instruct-v0.2" +# system_prompt: "You are a helpful assistant, expert in text summarization. 
For every prompt you receive, provide a summary of its contents in at most two sentences." +# max_retries: 3 + +# Quantization (use it if you are dealing with models too large to fit in RAM) +# quantization: +# load_in_4bit: True +# bnb_4bit_quant_type: "fp4" -# # Tracking info for where to log the run results +# Tracking info for where to log the run results # tracking: # project: "lm-buddy-examples" # entity: "sample" diff --git a/src/lm_buddy/configs/jobs/hf_evaluate.py b/src/lm_buddy/configs/jobs/hf_evaluate.py index 2e9c438..e863111 100644 --- a/src/lm_buddy/configs/jobs/hf_evaluate.py +++ b/src/lm_buddy/configs/jobs/hf_evaluate.py @@ -18,6 +18,8 @@ class HuggingFaceEvaluationConfig(LMBuddyConfig): metrics: conlist(str, min_length=1) use_pipeline: bool = False enable_tqdm: bool = False + max_samples: int | None = None + storage_path: str | None = None class HuggingFaceEvalJobConfig(JobConfig): @@ -40,7 +42,13 @@ def ensure_tokenizer_config(cls, values): case str() as model_path: values["tokenizer"]["path"] = model_path case dict() as model_data: - values["tokenizer"]["path"] = model_data["path"] + # if dict we might have model.path specified + # if we don't it is VLLMCompletion and we are ok + # with anything as it will be ignored + if model_data.get("path") is None: + values["tokenizer"]["path"] = "oai://tokenizer" + else: + values["tokenizer"]["path"] = model_data.get("path") case AutoModelConfig() as model_config: values["tokenizer"]["path"] = model_config.path # No fallback necessary, downstream validation will flag invalid model types @@ -63,8 +71,13 @@ def validate_tokenizer_arg(cls, x): def asset_paths(self) -> list[AssetPath]: match self.model: case AutoModelConfig() as config: - return {self.dataset.path, config.path, self.tokenizer.path} + return { + self.dataset.path, + self.evaluation.output_path, + config.path, + self.tokenizer.path, + } case VLLMCompletionsConfig() as config if config.inference.engine is not None: - return {self.dataset.path, config.inference.engine, self.tokenizer.path} + return {self.dataset.path, self.evaluation.output_path, config.inference.engine} case _: return {} diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index 3b72a37..678c4b7 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -2,102 +2,111 @@ lm-buddy entrypoint to run summary evaluation using huggingface eval """ +import json import time -from dataclasses import dataclass from pathlib import Path -from typing import Any -import torch -from datasets import Dataset +import s3fs from loguru import logger from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig +from lm_buddy.configs.vllm import VLLMCompletionsConfig from lm_buddy.constants import LM_BUDDY_RESULTS_PATH from lm_buddy.jobs.asset_loader import ( HuggingFaceDatasetLoader, HuggingFaceModelLoader, - HuggingFaceTokenizerLoader, ) from lm_buddy.jobs.common import EvaluationResult from lm_buddy.jobs.evaluation.metrics import EvaluationMetrics +from lm_buddy.jobs.model_clients import ( + HuggingFaceModelClient, + OpenAIModelClient, + PipelineModelClient, +) -@dataclass -class BadResponseError(Exception): - def __init__(self, message, error=None): - self.message = message - self.error = error +def save_outputs(config: HuggingFaceEvalJobConfig, evaluation_results: dict) -> Path: + storage_path = 
config.evaluation.storage_path + + # generate local temp file ANYWAY + # (we don't want to lose all eval data if there is an issue wth s3) + local_path = Path(LM_BUDDY_RESULTS_PATH) / config.name / "eval_results.json" + local_path.parent.mkdir(exist_ok=True, parents=True) + with local_path.open("w") as f: + json.dump(evaluation_results, f) + + # copy to s3 and return path + if storage_path is not None and storage_path.startswith("s3://"): + s3 = s3fs.S3FileSystem() + if storage_path.endswith("/"): + storage_path = "s3://" + str(Path(storage_path[5:]) / config.name / "eval_results.json") + logger.info(f"Storing into {storage_path}...") + s3.put_file(local_path, storage_path) + return storage_path + else: + return local_path def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # Init loaders hf_dataset_loader = HuggingFaceDatasetLoader() hf_model_loader = HuggingFaceModelLoader() - hf_tokenizer_loader = HuggingFaceTokenizerLoader() # Load dataset given its URI dataset = hf_dataset_loader.load_dataset(config.dataset) + # Limit dataset length if max_samples is specified + if config.evaluation.max_samples is not None: + dataset = dataset.select(range(config.evaluation.max_samples)) + # Enable / disable tqdm - input_samples = dataset.select(range(10))["examples"] + input_samples = dataset["examples"] dataset_iterable = tqdm(input_samples) if config.evaluation.enable_tqdm else input_samples predictions = [] - # depending on config, use the summarizer pipeline or directly call the model - # for inference - if config.evaluation.use_pipeline: - logger.info("Using summarization pipeline") - summarizer = pipeline( - "summarization", - model=hf_model_loader.resolve_asset_path(config.model.path), - device=0 if torch.cuda.is_available() else -1, - ) - - t = time.time() - for sample_txt in dataset_iterable: - # summarizer output is a list (1 element in this case) of dict with key = "summary_text" - predictions += summarizer(sample_txt, min_length=30, do_sample=False) - - # alternative: run on the whole dataset (does not seem to be faster) - # TODO: test on GPU and changing #workers in pipeline definition - # results = summarizer(input_samples, min_length=30, do_sample=False) - - logger.info(f"Summarization performed in {time.time()-t} seconds") - - predictions = [r["summary_text"] for r in predictions] - + # Choose which model client to use + if type(config.model) == VLLMCompletionsConfig: + model_name = config.model.inference.base_url else: - logger.info("Using direct HF model invocation") - - device = "cuda" if torch.cuda.is_available() else "cpu" - model = hf_model_loader.load_pretrained_model(config.model).to(device) - tokenizer = hf_tokenizer_loader.load_pretrained_tokenizer(config.tokenizer) - - for sample_txt in dataset_iterable: - inputs = tokenizer(sample_txt, truncation=True, padding=True, return_tensors="pt").to( - device - ) - generated_ids = model.generate(**inputs, max_new_tokens=256) - output_txt = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - predictions += output_txt + model_name = hf_model_loader.resolve_asset_path(config.model.path) - print(predictions) - - # Start evaluation + if model_name.startswith("http"): + # run the openai client + logger.info(f"Using OAI client. Endpoint: {model_name}") + model_client = OpenAIModelClient(model_name, config.model) + else: + # depending on config, use the summarizer pipeline or directly call the model + # for inference + if config.evaluation.use_pipeline: + logger.info(f"Using summarization pipeline. 
Model: {model_name}") + model_client = PipelineModelClient(model_name, config.model) + else: + logger.info(f"Using direct HF model invocation. Model: {model_name}") + model_client = HuggingFaceModelClient(model_name, config) + + # run inference + t = time.time() + for sample_txt in dataset_iterable: + predictions.append(model_client.predict(sample_txt)) + summarization_time = time.time() - t + logger.info(f"Summarization performed in {summarization_time} seconds") + + # run evaluation + ground_truth = dataset["ground_truth"] em = EvaluationMetrics(config.evaluation.metrics) - evaluation_results = em.run_all(predictions, input_samples) - - print(evaluation_results) + t = time.time() + evaluation_results = em.run_all(predictions, ground_truth) + summarization_time = time.time() - t + logger.info(f"Summarization performed in {summarization_time} seconds") - return "/tmp/dataset" + return save_outputs(config, evaluation_results) def run_hf_evaluation(config: HuggingFaceEvalJobConfig) -> EvaluationResult: # Run eval and store output in local filename result_dataset_path = run_eval(config) - logger.info(f"Prometheus evaluation dataset stored at {result_dataset_path}") + logger.info(f"Summarization eval results stored at {result_dataset_path}") return EvaluationResult( artifacts=[], diff --git a/src/lm_buddy/jobs/model_clients.py b/src/lm_buddy/jobs/model_clients.py new file mode 100644 index 0000000..0013950 --- /dev/null +++ b/src/lm_buddy/jobs/model_clients.py @@ -0,0 +1,111 @@ +from abc import abstractmethod + +import torch +from loguru import logger +from openai import OpenAI, OpenAIError +from openai.types import Completion +from transformers import pipeline + +from lm_buddy.configs.common import LMBuddyConfig +from lm_buddy.configs.huggingface import AutoModelConfig +from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig +from lm_buddy.configs.vllm import VLLMCompletionsConfig +from lm_buddy.jobs.asset_loader import HuggingFaceModelLoader, HuggingFaceTokenizerLoader + + +class BaseModelClient: + @abstractmethod + def __init__(self, model: str, config: LMBuddyConfig): + pass + + @abstractmethod + def predict(self, prompt: str) -> str: + pass + + +class PipelineModelClient(BaseModelClient): + def __init__(self, model: str, config: AutoModelConfig): + self._summarizer = pipeline( + "summarization", + model=model, + device=0 if torch.cuda.is_available() else -1, + ) + + def predict(self, prompt): + # summarizer output is a list (1 element in this case) of dict with key = "summary_text" + # TODO: bring summarizer parameters out at some point (not needed at the moment) + pred = self._summarizer(prompt, min_length=30, do_sample=False) + return pred[0]["summary_text"] + + +class HuggingFaceModelClient(BaseModelClient): + def __init__(self, model: str, config: HuggingFaceEvalJobConfig): + self._config = config + self._device = "cuda" if torch.cuda.is_available() else "cpu" + + hf_model_loader = HuggingFaceModelLoader() + hf_tokenizer_loader = HuggingFaceTokenizerLoader() + self._model = hf_model_loader.load_pretrained_model(config.model).to(self._device) + self._tokenizer = hf_tokenizer_loader.load_pretrained_tokenizer(config.tokenizer) + + def predict(self, prompt): + inputs = self._tokenizer(prompt, truncation=True, padding=True, return_tensors="pt").to( + self._device + ) + generated_ids = self._model.generate(**inputs, max_new_tokens=256) + return self._tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + + +class OpenAIModelClient(BaseModelClient): + def 
__init__(self, model: str, config: VLLMCompletionsConfig): + self._config = config + + hf_model_loader = HuggingFaceModelLoader() + self._engine = hf_model_loader.resolve_asset_path(config.inference.engine) + self._system = config.inference.system_prompt + self._client = OpenAI(base_url=model) + + def _openai_chat_completion( + self, + config: VLLMCompletionsConfig, + client: OpenAI, + prompt: str, + system: str = "You are a helpful assisant.", + ) -> Completion: + """Connects to a remote OpenAI-API-compatible endpoint + and returns a chat completion holding the model's response. + """ + + return self._client.chat.completions.create( + model=self._engine, + messages=[{"role": "system", "content": system}, {"role": "user", "content": prompt}], + max_tokens=config.max_tokens, + frequency_penalty=config.frequency_penalty, + temperature=config.temperature, + top_p=config.top_p, + ) + + def _get_response_with_retries( + self, + config: VLLMCompletionsConfig, + prompt: str, + ) -> tuple[str, str]: + current_retry_attempt = 1 + max_retries = 1 if config.inference.max_retries is None else config.inference.max_retries + while current_retry_attempt <= max_retries: + try: + response = self._openai_chat_completion( + self._config, self._client, prompt, self._system + ) + break + except OpenAIError as e: + logger.warning(f"{e.message}: " f"Retrying ({current_retry_attempt}/{max_retries})") + current_retry_attempt += 1 + if current_retry_attempt > max_retries: + raise e + return response + + def predict(self, prompt): + response = self._get_response_with_retries(self._config, prompt) + + return response.choices[0].message.content From 9bff948754bb3d922051f6c456ca0a221a89a6a4 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 13 Jun 2024 11:50:44 +0200 Subject: [PATCH 09/15] Added timer decorator and used it for hf_evaluate entrypoint --- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 42 +++++++++++++++------ src/lm_buddy/jobs/utils.py | 23 +++++++++++ 2 files changed, 53 insertions(+), 12 deletions(-) create mode 100644 src/lm_buddy/jobs/utils.py diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index 678c4b7..85262d1 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -3,7 +3,7 @@ """ import json -import time +from collections.abc import Iterable from pathlib import Path import s3fs @@ -20,10 +20,30 @@ from lm_buddy.jobs.common import EvaluationResult from lm_buddy.jobs.evaluation.metrics import EvaluationMetrics from lm_buddy.jobs.model_clients import ( + BaseModelClient, HuggingFaceModelClient, OpenAIModelClient, PipelineModelClient, ) +from lm_buddy.jobs.utils import timer + + +@timer +def predict(dataset_iterable: Iterable, model_client: BaseModelClient) -> list: + predictions = [] + + for sample_txt in dataset_iterable: + predictions.append(model_client.predict(sample_txt)) + + return predictions + + +@timer +def evaluate(predictions: list, ground_truth: list, evaluation_metrics: list): + em = EvaluationMetrics(evaluation_metrics) + evaluation_results = em.run_all(predictions, ground_truth) + + return evaluation_results def save_outputs(config: HuggingFaceEvalJobConfig, evaluation_results: dict) -> Path: @@ -63,7 +83,6 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # Enable / disable tqdm input_samples = dataset["examples"] dataset_iterable = tqdm(input_samples) if config.evaluation.enable_tqdm else input_samples - predictions = [] # Choose which model client to use if 
type(config.model) == VLLMCompletionsConfig: @@ -86,19 +105,18 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: model_client = HuggingFaceModelClient(model_name, config) # run inference - t = time.time() - for sample_txt in dataset_iterable: - predictions.append(model_client.predict(sample_txt)) - summarization_time = time.time() - t - logger.info(f"Summarization performed in {summarization_time} seconds") + predictions, summarization_time = predict(dataset_iterable, model_client) # run evaluation ground_truth = dataset["ground_truth"] - em = EvaluationMetrics(config.evaluation.metrics) - t = time.time() - evaluation_results = em.run_all(predictions, ground_truth) - summarization_time = time.time() - t - logger.info(f"Summarization performed in {summarization_time} seconds") + print(type(ground_truth)) + evaluation_results, evaluation_time = evaluate( + predictions, ground_truth, config.evaluation.metrics + ) + + # add timing to results dict + evaluation_results["summarization_time"] = summarization_time + evaluation_results["evaluation_time"] = evaluation_time return save_outputs(config, evaluation_results) diff --git a/src/lm_buddy/jobs/utils.py b/src/lm_buddy/jobs/utils.py new file mode 100644 index 0000000..396ef25 --- /dev/null +++ b/src/lm_buddy/jobs/utils.py @@ -0,0 +1,23 @@ +import functools +import time + +from loguru import logger + + +def timer(func): + """ + Decorator which times the execution of the wrapped func. + Execution time is logged and also returned together with func's returned value + (output will be a tuple). + """ + + @functools.wraps(func) + def wrapper_timer(*args, **kwargs): + tic = time.perf_counter() + value = func(*args, **kwargs) + toc = time.perf_counter() + elapsed_time = toc - tic + logger.info(f"Elapsed time for {func.__name__}: {elapsed_time:0.4f} seconds") + return value, elapsed_time + + return wrapper_timer From 4271d4905275a1584d30114f9a29c72f90ab5d3d Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 13 Jun 2024 14:44:31 +0200 Subject: [PATCH 10/15] Refactored config file, split into 3, added model_client docstrings --- .../evaluation/hf_evaluate_config.yaml | 40 +++--------------- .../hf_evaluate_inference_server_config.yaml | 34 +++++++++++++++ .../evaluation/hf_evaluate_openai_config.yaml | 34 +++++++++++++++ src/lm_buddy/jobs/evaluation/hf_evaluate.py | 4 +- src/lm_buddy/jobs/model_clients.py | 41 ++++++++++++++++++- 5 files changed, 115 insertions(+), 38 deletions(-) create mode 100644 examples/configs/evaluation/hf_evaluate_inference_server_config.yaml create mode 100644 examples/configs/evaluation/hf_evaluate_openai_config.yaml diff --git a/examples/configs/evaluation/hf_evaluate_config.yaml b/examples/configs/evaluation/hf_evaluate_config.yaml index 44449df..2ccf5ea 100644 --- a/examples/configs/evaluation/hf_evaluate_config.yaml +++ b/examples/configs/evaluation/hf_evaluate_config.yaml @@ -4,12 +4,11 @@ name: "lm-buddy-hf-evaluate" dataset: path: "s3://platform-storage/datasets/dialogsum" - # Settings specific to the hf_evaluate entrypoint evaluation: # metrics to be used for the evaluation # (you can add "rouge", "meteor", and "bertscore" atm) - metrics: ["rouge", "meteor"] + metrics: ["rouge", "meteor", "bertscore"] # enable/disable tqdm to track eval progress # (useful when running interactively, noisy on ray logs) enable_tqdm: True @@ -23,43 +22,14 @@ evaluation: # - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results) # storage_path: "s3://platform-storage/experiments/results/" 
-# Model to evaluate. Choose one of the following options by uncommenting it - -# 1. Local model -# - Provide model path to load the model locally -# - Make sure you add quantization details (see below) if the model is too large -# - Optionally, add a tokenizer (the one matching the specified model name is the default) +# Model to evaluate (local). +# - Provide model path to load the model locally +# - Make sure you add quantization details (see below) if the model is too large +# - Optionally, add a tokenizer (the one matching the specified model name is the default) model: path: "hf://facebook/bart-large-cnn" -# # 2. OpenAI -# # - The base_url is fixed -# # - Choose an engine name (see https://platform.openai.com/docs/models) -# # - Customize the system prompt if needed -# model: -# inference: -# base_url: "https://api.openai.com/v1" -# engine: "oai://gpt-4-turbo" -# system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences." -# max_retries: 3 - -# # 3. OpenAI - compatible model -# # - Works with local/remote vLLM-served models and llamafiles -# # - Provide base_url and engine -# # - Customize the system prompt if needed -# model: -# inference: -# base_url: "http://localhost:8081/v1" -# engine: "hf://mistralai/mistral-7b-instruct-v0.2" -# system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences." -# max_retries: 3 - # Quantization (use it if you are dealing with models too large to fit in RAM) # quantization: # load_in_4bit: True # bnb_4bit_quant_type: "fp4" - -# Tracking info for where to log the run results -# tracking: -# project: "lm-buddy-examples" -# entity: "sample" diff --git a/examples/configs/evaluation/hf_evaluate_inference_server_config.yaml b/examples/configs/evaluation/hf_evaluate_inference_server_config.yaml new file mode 100644 index 0000000..d4b6a23 --- /dev/null +++ b/examples/configs/evaluation/hf_evaluate_inference_server_config.yaml @@ -0,0 +1,34 @@ +name: "lm-buddy-hf-evaluate-is" + +# Input dataset path +dataset: + path: "s3://platform-storage/datasets/dialogsum" + +# Settings specific to the hf_evaluate entrypoint +evaluation: + # metrics to be used for the evaluation + # (you can add "rouge", "meteor", and "bertscore" atm) + metrics: ["rouge", "meteor", "bertscore"] + # enable/disable tqdm to track eval progress + # (useful when running interactively, noisy on ray logs) + enable_tqdm: True + # rely on HF pipeline for summarization (ignored if using OAI API) + use_pipeline: True + # perform inference / evaluation on the first max_samples only + max_samples: 10 + # output file path + # - if you provide a path complete with a filename, results will be stored in it + # - if you provide a dir, results will be stored in //eval_results.json + # - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results) + # storage_path: "s3://platform-storage/experiments/results/" + +# Model to evaluate (OpenAI-compatible API) +# - Works with local/remote vLLM-served models and llamafiles +# - Provide base_url and engine +# - Customize the system prompt if needed +model: + inference: + base_url: "http://localhost:8081/v1" + engine: "hf://mistralai/mistral-7b-instruct-v0.2" + system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences." 
+ max_retries: 3 diff --git a/examples/configs/evaluation/hf_evaluate_openai_config.yaml b/examples/configs/evaluation/hf_evaluate_openai_config.yaml new file mode 100644 index 0000000..0d6916c --- /dev/null +++ b/examples/configs/evaluation/hf_evaluate_openai_config.yaml @@ -0,0 +1,34 @@ +name: "lm-buddy-hf-evaluate-oai" + +# Input dataset path +dataset: + path: "s3://platform-storage/datasets/dialogsum" + +# Settings specific to the hf_evaluate entrypoint +evaluation: + # metrics to be used for the evaluation + # (you can add "rouge", "meteor", and "bertscore" atm) + metrics: ["rouge", "meteor", "bertscore"] + # enable/disable tqdm to track eval progress + # (useful when running interactively, noisy on ray logs) + enable_tqdm: True + # rely on HF pipeline for summarization (ignored if using OAI API) + use_pipeline: True + # perform inference / evaluation on the first max_samples only + max_samples: 10 + # output file path + # - if you provide a path complete with a filename, results will be stored in it + # - if you provide a dir, results will be stored in //eval_results.json + # - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results) + # storage_path: "s3://platform-storage/experiments/results/" + +# Model to evaluate (OpenAI) +# - The base_url is fixed +# - Choose an engine name (see https://platform.openai.com/docs/models) +# - Customize the system prompt if needed +model: + inference: + base_url: "https://api.openai.com/v1" + engine: "oai://gpt-4-turbo" + system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences." + max_retries: 3 diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index 85262d1..1343c20 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -23,7 +23,7 @@ BaseModelClient, HuggingFaceModelClient, OpenAIModelClient, - PipelineModelClient, + SummarizationPipelineModelClient, ) from lm_buddy.jobs.utils import timer @@ -99,7 +99,7 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # for inference if config.evaluation.use_pipeline: logger.info(f"Using summarization pipeline. Model: {model_name}") - model_client = PipelineModelClient(model_name, config.model) + model_client = SummarizationPipelineModelClient(model_name, config.model) else: logger.info(f"Using direct HF model invocation. Model: {model_name}") model_client = HuggingFaceModelClient(model_name, config) diff --git a/src/lm_buddy/jobs/model_clients.py b/src/lm_buddy/jobs/model_clients.py index 0013950..d0e1473 100644 --- a/src/lm_buddy/jobs/model_clients.py +++ b/src/lm_buddy/jobs/model_clients.py @@ -14,16 +14,34 @@ class BaseModelClient: + """ + Abstract class for a model client, used to provide a uniform interface + (currentnly just a simple predict method) to models served in different + ways (e.g. HF models loaded locally, OpenAI endpoints, vLLM inference + servers, llamafile). + """ + @abstractmethod def __init__(self, model: str, config: LMBuddyConfig): + """ + Used to initialize the model / inference service. + """ pass @abstractmethod def predict(self, prompt: str) -> str: + """ + Given a prompt, return a prediction. + """ pass -class PipelineModelClient(BaseModelClient): +class SummarizationPipelineModelClient(BaseModelClient): + """ + Model client for the huggingface summarization pipeline + (model is loaded locally). 
+ """ + def __init__(self, model: str, config: AutoModelConfig): self._summarizer = pipeline( "summarization", @@ -39,6 +57,14 @@ def predict(self, prompt): class HuggingFaceModelClient(BaseModelClient): + """ + Model client for HF models (model is loaded locally, both Seq2SeqLM + and CausalLM are supported). + - Provide model path to load the model locally + - Make sure you add quantization details if the model is too large + - Optionally, add a tokenizer (the one matching the specified model name is the default) + """ + def __init__(self, model: str, config: HuggingFaceEvalJobConfig): self._config = config self._device = "cuda" if torch.cuda.is_available() else "cpu" @@ -57,6 +83,19 @@ def predict(self, prompt): class OpenAIModelClient(BaseModelClient): + """ + Model client for models served via openai-compatible API. + For OpenAI models: + - The base_url is fixed + - Choose an engine name (see https://platform.openai.com/docs/models) + - Customize the system prompt if needed + + For compatible models: + - Works with local/remote vLLM-served models and llamafiles + - Provide base_url and engine + - Customize the system prompt if needed + """ + def __init__(self, model: str, config: VLLMCompletionsConfig): self._config = config From 0bffb3a3338bcde6357a950087e8b70a6a552822 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 13 Jun 2024 14:46:49 +0200 Subject: [PATCH 11/15] removed print functions --- src/lm_buddy/jobs/asset_loader.py | 2 -- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 1 - 2 files changed, 3 deletions(-) diff --git a/src/lm_buddy/jobs/asset_loader.py b/src/lm_buddy/jobs/asset_loader.py index c35428e..c12ff79 100644 --- a/src/lm_buddy/jobs/asset_loader.py +++ b/src/lm_buddy/jobs/asset_loader.py @@ -128,7 +128,6 @@ def load_pretrained_model( # load config first to get the model type model_config = self.load_pretrained_config(config) - # print(model_config) if getattr(model_config, "model_type") in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES: automodel_class = AutoModelForSeq2SeqLM @@ -137,7 +136,6 @@ def load_pretrained_model( else: logger.info("Model type not supported. 
Trying AutoModelForCausalLM") automodel_class = AutoModelForCausalLM - # print(automodel_class) return automodel_class.from_pretrained( pretrained_model_name_or_path=model_path, diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index 1343c20..e43b1ad 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -109,7 +109,6 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # run evaluation ground_truth = dataset["ground_truth"] - print(type(ground_truth)) evaluation_results, evaluation_time = evaluate( predictions, ground_truth, config.evaluation.metrics ) From f5b6e136e7122ec7cdb0301a3b9044ebd2eeb155 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 13 Jun 2024 15:16:24 +0200 Subject: [PATCH 12/15] Mixed fixes as per PR feedback --- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 35 ++++++++++++++------- src/lm_buddy/jobs/evaluation/metrics.py | 8 +++-- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index e43b1ad..2d24e1a 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -49,24 +49,36 @@ def evaluate(predictions: list, ground_truth: list, evaluation_metrics: list): def save_outputs(config: HuggingFaceEvalJobConfig, evaluation_results: dict) -> Path: storage_path = config.evaluation.storage_path - # generate local temp file ANYWAY - # (we don't want to lose all eval data if there is an issue wth s3) - local_path = Path(LM_BUDDY_RESULTS_PATH) / config.name / "eval_results.json" - local_path.parent.mkdir(exist_ok=True, parents=True) - with local_path.open("w") as f: - json.dump(evaluation_results, f) + def save_to_disk(local_path: Path): + logger.info(f"Storing into {local_path}...") + local_path.parent.mkdir(exist_ok=True, parents=True) + with local_path.open("w") as f: + json.dump(evaluation_results, f) - # copy to s3 and return path - if storage_path is not None and storage_path.startswith("s3://"): + def save_to_s3(local_path: Path, storage_path: str): s3 = s3fs.S3FileSystem() if storage_path.endswith("/"): storage_path = "s3://" + str(Path(storage_path[5:]) / config.name / "eval_results.json") logger.info(f"Storing into {storage_path}...") s3.put_file(local_path, storage_path) - return storage_path - else: + + # generate local temp file ANYWAY + # (we don't want to lose all eval data if there is an issue wth s3) + local_path = Path(LM_BUDDY_RESULTS_PATH) / config.name / "eval_results.json" + + try: + save_to_disk(local_path) + + # copy to s3 and return path + if storage_path is not None and storage_path.startswith("s3://"): + save_to_s3(local_path, storage_path) + return storage_path + return local_path + except Exception as e: + logger.error(e) + def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # Init loaders @@ -117,7 +129,8 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: evaluation_results["summarization_time"] = summarization_time evaluation_results["evaluation_time"] = evaluation_time - return save_outputs(config, evaluation_results) + output_path = save_outputs(config, evaluation_results) + return output_path def run_hf_evaluation(config: HuggingFaceEvalJobConfig) -> EvaluationResult: diff --git a/src/lm_buddy/jobs/evaluation/metrics.py b/src/lm_buddy/jobs/evaluation/metrics.py index 82ce3f7..dd9b215 100644 --- a/src/lm_buddy/jobs/evaluation/metrics.py +++ 
b/src/lm_buddy/jobs/evaluation/metrics.py
@@ -11,8 +11,10 @@ def __init__(self, metrics):
             "bertscore": self._bertscore,
         }
 
-        self._chosen_metrics = set(metrics).intersection(set(self._supported_metrics.keys()))
-        self._unsupported_metrics = set(metrics).difference(set(self._supported_metrics.keys()))
+        # chosen metrics are the intersection between the provided and the supported ones
+        self._chosen_metrics = set(metrics) & set(self._supported_metrics.keys())
+        # unsupported metrics are the difference between the provided and the supported ones
+        self._unsupported_metrics = set(metrics) - set(self._supported_metrics.keys())
 
         if len(self._chosen_metrics) == 0:
             logger.info("No valid metrics selected")
         else:
@@ -45,7 +47,7 @@ def _meteor(self, pred, ref):
             evals["meteor"].append(ev.compute(predictions=[p], references=[r])["meteor"])
 
         # calculate mean
-        evals[f"meteor_mean"] = np.mean(evals["meteor"])
+        evals["meteor_mean"] = np.mean(evals["meteor"])
 
         return evals

From 1c17326e2095b91a3a5b1108fcde5e71c37bb71f Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Thu, 13 Jun 2024 15:19:27 +0200
Subject: [PATCH 13/15] Linter fix

---
 src/lm_buddy/jobs/model_clients.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lm_buddy/jobs/model_clients.py b/src/lm_buddy/jobs/model_clients.py
index d0e1473..0dbb60b 100644
--- a/src/lm_buddy/jobs/model_clients.py
+++ b/src/lm_buddy/jobs/model_clients.py
@@ -138,7 +138,7 @@ def _get_response_with_retries(
                 )
                 break
             except OpenAIError as e:
-                logger.warning(f"{e.message}: " f"Retrying ({current_retry_attempt}/{max_retries})")
+                logger.warning(f"{e.message}: Retrying ({current_retry_attempt}/{max_retries})")
                 current_retry_attempt += 1
                 if current_retry_attempt > max_retries:
                     raise e
         return response

From e83d94c5e1f1e01e0c105939d02066a185c2f8ef Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Thu, 13 Jun 2024 15:37:36 +0200
Subject: [PATCH 14/15] Frozen setuptools to 69.5.1, see
 https://stackoverflow.com/questions/78604018/importerror-cannot-import-name-packaging-from-pkg-resources-when-trying-to

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index db9458f..6245159 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools >= 61.0"]
+requires = ["setuptools == 69.5.1"]
 build-backend = "setuptools.build_meta"
 
 [project]

From bc94aaaa0b817c11f14ea64f7bb5af73f99767a4 Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Thu, 13 Jun 2024 15:42:22 +0200
Subject: [PATCH 15/15] Frozen setuptools to 69.5.1, again

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6245159..98bc1ad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools == 69.5.1"]
+requires = ["setuptools==69.5.1"]
 build-backend = "setuptools.build_meta"
 
 [project]