From fdc9304a0579f3697067f359668f20dd63952cf2 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 6 Jun 2024 16:18:06 +0100 Subject: [PATCH 01/15] hf_eval first commit --- .../evaluation/hf_evaluate_config.yaml | 25 +++++ src/lm_buddy/buddy.py | 4 + src/lm_buddy/cli/evaluate.py | 15 ++- src/lm_buddy/configs/jobs/__init__.py | 6 +- src/lm_buddy/configs/jobs/hf_evaluate.py | 70 +++++++++++++ src/lm_buddy/jobs/asset_loader.py | 21 +++- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 98 +++++++++++++++++++ 7 files changed, 236 insertions(+), 3 deletions(-) create mode 100644 examples/configs/evaluation/hf_evaluate_config.yaml create mode 100644 src/lm_buddy/configs/jobs/hf_evaluate.py create mode 100644 src/lm_buddy/jobs/evaluation/hf_evaluate.py diff --git a/examples/configs/evaluation/hf_evaluate_config.yaml b/examples/configs/evaluation/hf_evaluate_config.yaml new file mode 100644 index 0000000..411cf24 --- /dev/null +++ b/examples/configs/evaluation/hf_evaluate_config.yaml @@ -0,0 +1,25 @@ +name: "lm-buddy-hf-evaluate" + +dataset: + path: "s3://platform-storage/datasets/dialogsum" + +# Settings specific to hf_evaluate +evaluation: + metrics: ["rouge"] + # enable/disable tqdm to track eval progress + enable_tqdm: False + # rely on HF pipeline for summarization + use_pipeline: True + +# Model to evaluate +model: + path: "hf://facebook/bart-large-cnn" + +quantization: + load_in_4bit: True + bnb_4bit_quant_type: "fp4" + +# # Tracking info for where to log the run results +# tracking: +# project: "lm-buddy-examples" +# entity: "sample" diff --git a/src/lm_buddy/buddy.py b/src/lm_buddy/buddy.py index 4e6be5d..08586b0 100644 --- a/src/lm_buddy/buddy.py +++ b/src/lm_buddy/buddy.py @@ -3,12 +3,14 @@ from lm_buddy.configs.jobs import ( EvaluationJobConfig, FinetuningJobConfig, + HuggingFaceEvalJobConfig, JobConfig, LMHarnessJobConfig, PrometheusJobConfig, RagasJobConfig, ) from lm_buddy.jobs.common import EvaluationResult, FinetuningResult, JobType +from lm_buddy.jobs.evaluation.hf_evaluate import run_hf_evaluation from lm_buddy.jobs.evaluation.lm_harness import run_lm_harness from lm_buddy.jobs.evaluation.prometheus import run_prometheus from lm_buddy.jobs.evaluation.ragas import run_ragas @@ -66,6 +68,8 @@ def evaluate(self, config: EvaluationJobConfig) -> EvaluationResult: result = run_prometheus(prometheus_config) case RagasJobConfig() as ragas_config: result = run_ragas(ragas_config) + case HuggingFaceEvalJobConfig() as hf_eval_config: + result = run_hf_evaluation(hf_eval_config) case _: raise ValueError(f"Invlid configuration for evaluation: {type(config)}") self._generate_artifact_lineage(config, result.artifacts, JobType.EVALUATION) diff --git a/src/lm_buddy/cli/evaluate.py b/src/lm_buddy/cli/evaluate.py index 73f58a6..90ca0b8 100644 --- a/src/lm_buddy/cli/evaluate.py +++ b/src/lm_buddy/cli/evaluate.py @@ -2,7 +2,12 @@ from lm_buddy import LMBuddy from lm_buddy.cli.utils import parse_config_option -from lm_buddy.configs.jobs import LMHarnessJobConfig, PrometheusJobConfig, RagasJobConfig +from lm_buddy.configs.jobs import ( + HuggingFaceEvalJobConfig, + LMHarnessJobConfig, + PrometheusJobConfig, + RagasJobConfig, +) @click.group(name="evaluate", help="Run an LM Buddy evaluation job.") @@ -32,3 +37,11 @@ def ragas_command(config: str) -> None: config = parse_config_option(RagasJobConfig, config) buddy = LMBuddy() buddy.evaluate(config) + + +@group.command("huggingface", help="Run the HuggingFace evaluation job.") +@click.option("--config", type=str) +def huggingface_command(config: str) -> 
None:
+    config = parse_config_option(HuggingFaceEvalJobConfig, config)
+    buddy = LMBuddy()
+    buddy.evaluate(config)
diff --git a/src/lm_buddy/configs/jobs/__init__.py b/src/lm_buddy/configs/jobs/__init__.py
index b05459b..9509631 100644
--- a/src/lm_buddy/configs/jobs/__init__.py
+++ b/src/lm_buddy/configs/jobs/__init__.py
@@ -1,10 +1,13 @@
 from lm_buddy.configs.jobs.common import JobConfig
 from lm_buddy.configs.jobs.finetuning import FinetuningJobConfig
+from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig
 from lm_buddy.configs.jobs.lm_harness import LMHarnessJobConfig
 from lm_buddy.configs.jobs.prometheus import PrometheusJobConfig
 from lm_buddy.configs.jobs.ragas import RagasJobConfig
 
-EvaluationJobConfig = LMHarnessJobConfig | PrometheusJobConfig | RagasJobConfig
+EvaluationJobConfig = (
+    LMHarnessJobConfig | PrometheusJobConfig | RagasJobConfig | HuggingFaceEvalJobConfig
+)
 
 __all__ = [
     "JobConfig",
@@ -12,5 +15,6 @@
     "LMHarnessJobConfig",
     "PrometheusJobConfig",
     "RagasJobConfig",
+    "HuggingFaceEvalJobConfig",
     "EvaluationJobConfig",
 ]
diff --git a/src/lm_buddy/configs/jobs/hf_evaluate.py b/src/lm_buddy/configs/jobs/hf_evaluate.py
new file mode 100644
index 0000000..2e9c438
--- /dev/null
+++ b/src/lm_buddy/configs/jobs/hf_evaluate.py
@@ -0,0 +1,70 @@
+from pydantic import Field, conlist, field_validator, model_validator
+
+from lm_buddy.configs.common import LMBuddyConfig
+from lm_buddy.configs.huggingface import (
+    AutoModelConfig,
+    AutoTokenizerConfig,
+    DatasetConfig,
+    QuantizationConfig,
+)
+from lm_buddy.configs.jobs.common import JobConfig
+from lm_buddy.configs.vllm import VLLMCompletionsConfig
+from lm_buddy.paths import AssetPath
+
+
+class HuggingFaceEvaluationConfig(LMBuddyConfig):
+    """Misc settings provided to a HuggingFace evaluation job."""
+
+    metrics: conlist(str, min_length=1)
+    use_pipeline: bool = False
+    enable_tqdm: bool = False
+
+
+class HuggingFaceEvalJobConfig(JobConfig):
+    """Configuration to run a HuggingFace evaluation job."""
+
+    dataset: DatasetConfig = Field(
+        description="Dataset of text samples to summarize and evaluate."
+ ) + evaluation: HuggingFaceEvaluationConfig + model: AutoModelConfig | VLLMCompletionsConfig + quantization: QuantizationConfig | None = None + tokenizer: AutoTokenizerConfig + + @model_validator(mode="before") + def ensure_tokenizer_config(cls, values): + """Set the tokenizer to the model path when not explicitly provided.""" + if values.get("tokenizer") is None: + values["tokenizer"] = {} + match values["model"]: + case str() as model_path: + values["tokenizer"]["path"] = model_path + case dict() as model_data: + values["tokenizer"]["path"] = model_data["path"] + case AutoModelConfig() as model_config: + values["tokenizer"]["path"] = model_config.path + # No fallback necessary, downstream validation will flag invalid model types + return values + + @field_validator("model", mode="before") + def validate_model_arg(cls, x): + """Allow for passing just a path string as the model argument.""" + if isinstance(x, str): + return AutoModelConfig(path=x) + return x + + @field_validator("tokenizer", mode="before") + def validate_tokenizer_arg(cls, x): + """Allow for passing just a path string as the tokenizer argument.""" + if isinstance(x, str): + return AutoTokenizerConfig(path=x) + return x + + def asset_paths(self) -> list[AssetPath]: + match self.model: + case AutoModelConfig() as config: + return {self.dataset.path, config.path, self.tokenizer.path} + case VLLMCompletionsConfig() as config if config.inference.engine is not None: + return {self.dataset.path, config.inference.engine, self.tokenizer.path} + case _: + return {} diff --git a/src/lm_buddy/jobs/asset_loader.py b/src/lm_buddy/jobs/asset_loader.py index ebddb80..d7663f3 100644 --- a/src/lm_buddy/jobs/asset_loader.py +++ b/src/lm_buddy/jobs/asset_loader.py @@ -8,11 +8,16 @@ from transformers import ( AutoConfig, AutoModelForCausalLM, + AutoModelForSeq2SeqLM, AutoTokenizer, PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, ) +from transformers.models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, +) from lm_buddy.configs.huggingface import ( AutoModelConfig, @@ -120,7 +125,21 @@ def load_pretrained_model( # TODO: HuggingFace has many AutoModel classes with different "language model heads" # Can we abstract this to load with any type of AutoModel class? model_path = self.resolve_asset_path(config.path) - return AutoModelForCausalLM.from_pretrained( + + # load config first to get the model type + model_config = self.load_pretrained_config(config) + # print(model_config) + + if getattr(model_config, "model_type") in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES: + automodel_class = AutoModelForSeq2SeqLM + elif getattr(model_config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: + automodel_class = AutoModelForCausalLM + else: + logger.info("Model type not supported. 
Trying AutoModelForCausalLM") + automodel_class = AutoModelForCausalLM + # print(automodel_class) + + return automodel_class.from_pretrained( pretrained_model_name_or_path=model_path, trust_remote_code=config.trust_remote_code, torch_dtype=config.torch_dtype, diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py new file mode 100644 index 0000000..f48eb7c --- /dev/null +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -0,0 +1,98 @@ +""" +lm-buddy entrypoint to run summary evaluation using huggingface eval +""" + +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import torch +from datasets import Dataset +from loguru import logger +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline + +from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig +from lm_buddy.constants import LM_BUDDY_RESULTS_PATH +from lm_buddy.jobs.asset_loader import ( + HuggingFaceDatasetLoader, + HuggingFaceModelLoader, + HuggingFaceTokenizerLoader, +) +from lm_buddy.jobs.common import EvaluationResult + + +@dataclass +class BadResponseError(Exception): + def __init__(self, message, error=None): + self.message = message + self.error = error + + +def run_eval(config: HuggingFaceEvalJobConfig) -> Path: + # Init loaders + hf_dataset_loader = HuggingFaceDatasetLoader() + hf_model_loader = HuggingFaceModelLoader() + hf_tokenizer_loader = HuggingFaceTokenizerLoader() + + # Load dataset given its URI + dataset = hf_dataset_loader.load_dataset(config.dataset) + + # Enable / disable tqdm + input_samples = dataset.select(range(10))["examples"] + dataset_iterable = tqdm(input_samples) if config.evaluation.enable_tqdm else input_samples + results = [] + + # depending on config, use the summarizer pipeline or directly call the model + # for inference + if config.evaluation.use_pipeline: + logger.info("Using summarization pipeline") + summarizer = pipeline( + "summarization", + model=hf_model_loader.resolve_asset_path(config.model.path), + device=0 if torch.cuda.is_available() else -1, + ) + + t = time.time() + # for sample_txt in dataset_iterable: + # # summarizer output is a list (1 element in this case) of dict with key = "summary_text" + # results += summarizer(sample_txt, min_length=30, do_sample=False) + + # alternative: run on the whole dataset + results = summarizer(dataset.select(range(10))["examples"], min_length=30, do_sample=False) + + logger.info(f"Summarization performed in {time.time()-t} seconds") + + results = [r["summary_text"] for r in results] + + else: + logger.info("Using direct HF model invocation") + + device = "cuda" if torch.cuda.is_available() else "cpu" + model = hf_model_loader.load_pretrained_model(config.model).to(device) + tokenizer = hf_tokenizer_loader.load_pretrained_tokenizer(config.tokenizer) + + for sample_txt in dataset_iterable: + inputs = tokenizer(sample_txt, truncation=True, padding=True, return_tensors="pt").to( + device + ) + generated_ids = model.generate(**inputs, max_new_tokens=256) + output_txt = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + results += output_txt + + print(results) + + return "/tmp/dataset" + + +def run_hf_evaluation(config: HuggingFaceEvalJobConfig) -> EvaluationResult: + # Run eval and store output in local filename + result_dataset_path = run_eval(config) + logger.info(f"Prometheus evaluation dataset stored at {result_dataset_path}") + + return EvaluationResult( + 
artifacts=[], + dataset_path=result_dataset_path, + tables={}, + ) From f606dbfa88cdd3b7c31d9c69250159d246649189 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Mon, 10 Jun 2024 11:39:43 +0200 Subject: [PATCH 02/15] Added bert_score dependency in pyproject.toml --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6eff516..9221d9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "lm-buddy" -version = "0.10.4" +version = "0.10.5" authors = [ { name = "Sean Friedowitz", email = "sean@mozilla.ai" }, { name = "Aaron Gonzales", email = "aaron@mozilla.ai" }, @@ -37,6 +37,7 @@ dependencies = [ "peft==0.7.1", "trl==0.7.10", "bitsandbytes==0.42.0", + "bert_score==0.3.13", # Evaluation frameworks "lm-eval==0.4.2", "einops==0.7.0", From 8ff31b4c09a040f3332e88b70fb570b502c4e241 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Mon, 10 Jun 2024 11:40:58 +0200 Subject: [PATCH 03/15] Updated hf-eval code with all metrics --- .../evaluation/hf_evaluate_config.yaml | 2 +- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 26 ++++--- src/lm_buddy/jobs/evaluation/metrics.py | 70 +++++++++++++++++++ 3 files changed, 88 insertions(+), 10 deletions(-) create mode 100644 src/lm_buddy/jobs/evaluation/metrics.py diff --git a/examples/configs/evaluation/hf_evaluate_config.yaml b/examples/configs/evaluation/hf_evaluate_config.yaml index 411cf24..39db968 100644 --- a/examples/configs/evaluation/hf_evaluate_config.yaml +++ b/examples/configs/evaluation/hf_evaluate_config.yaml @@ -5,7 +5,7 @@ dataset: # Settings specific to hf_evaluate evaluation: - metrics: ["rouge"] + metrics: ["rouge", "bertscore", "meteor"] # enable/disable tqdm to track eval progress enable_tqdm: False # rely on HF pipeline for summarization diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index f48eb7c..3b72a37 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -21,6 +21,7 @@ HuggingFaceTokenizerLoader, ) from lm_buddy.jobs.common import EvaluationResult +from lm_buddy.jobs.evaluation.metrics import EvaluationMetrics @dataclass @@ -42,7 +43,7 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # Enable / disable tqdm input_samples = dataset.select(range(10))["examples"] dataset_iterable = tqdm(input_samples) if config.evaluation.enable_tqdm else input_samples - results = [] + predictions = [] # depending on config, use the summarizer pipeline or directly call the model # for inference @@ -55,16 +56,17 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: ) t = time.time() - # for sample_txt in dataset_iterable: - # # summarizer output is a list (1 element in this case) of dict with key = "summary_text" - # results += summarizer(sample_txt, min_length=30, do_sample=False) + for sample_txt in dataset_iterable: + # summarizer output is a list (1 element in this case) of dict with key = "summary_text" + predictions += summarizer(sample_txt, min_length=30, do_sample=False) - # alternative: run on the whole dataset - results = summarizer(dataset.select(range(10))["examples"], min_length=30, do_sample=False) + # alternative: run on the whole dataset (does not seem to be faster) + # TODO: test on GPU and changing #workers in pipeline definition + # results = summarizer(input_samples, min_length=30, do_sample=False) logger.info(f"Summarization performed in {time.time()-t} seconds") - results 
= [r["summary_text"] for r in results] + predictions = [r["summary_text"] for r in predictions] else: logger.info("Using direct HF model invocation") @@ -79,9 +81,15 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: ) generated_ids = model.generate(**inputs, max_new_tokens=256) output_txt = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - results += output_txt + predictions += output_txt + + print(predictions) + + # Start evaluation + em = EvaluationMetrics(config.evaluation.metrics) + evaluation_results = em.run_all(predictions, input_samples) - print(results) + print(evaluation_results) return "/tmp/dataset" diff --git a/src/lm_buddy/jobs/evaluation/metrics.py b/src/lm_buddy/jobs/evaluation/metrics.py new file mode 100644 index 0000000..82ce3f7 --- /dev/null +++ b/src/lm_buddy/jobs/evaluation/metrics.py @@ -0,0 +1,70 @@ +import evaluate +import numpy as np +from loguru import logger + + +class EvaluationMetrics: + def __init__(self, metrics): + self._supported_metrics = { + "rouge": self._rouge, + "meteor": self._meteor, + "bertscore": self._bertscore, + } + + self._chosen_metrics = set(metrics).intersection(set(self._supported_metrics.keys())) + self._unsupported_metrics = set(metrics).difference(set(self._supported_metrics.keys())) + + if len(self._chosen_metrics) == 0: + logger.info("No valid metrics selected") + else: + logger.info(f"Chosen metrics: {self._chosen_metrics}") + + if len(self._unsupported_metrics) > 0: + logger.info(f"Unsupported metrics: {self._unsupported_metrics}") + + def _rouge(self, pred, ref): + ev = evaluate.load("rouge") + + # compute with use_aggregator = False to get individual scores + evals = ev.compute(predictions=pred, references=ref, use_aggregator=False) + + # calculate mean for each of the submetrics (rouge1, rouge2, rougeL, rougeLsum) + for k in ["rouge1", "rouge2", "rougeL", "rougeLsum"]: + evals[f"{k}_mean"] = np.mean(evals[k]) + + return evals + + def _meteor(self, pred, ref): + ev = evaluate.load("meteor") + + # initialize dictionary with metric name + evals = {"meteor": []} + + # run sample-wise evals (as default implementation only returns mean value) + for p, r in zip(pred, ref): + evals["meteor"].append(ev.compute(predictions=[p], references=[r])["meteor"]) + + # calculate mean + evals[f"meteor_mean"] = np.mean(evals["meteor"]) + + return evals + + def _bertscore(self, pred, ref): + ev = evaluate.load("bertscore") + + # calculate evals (the default is not to aggregate them) + evals = ev.compute(predictions=pred, references=ref, lang="en") + + # calculate mean for each of the submetrics (precision, recall, f1) + for k in ["precision", "recall", "f1"]: + evals[f"{k}_mean"] = np.mean(evals[k]) + + return evals + + def run_all(self, pred, ref): + results = {} + + for metric in self._chosen_metrics: + results[metric] = self._supported_metrics[metric](pred, ref) + + return results From 6a43bcc95cd7492dd737fd19afb4eb760f7609d3 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 11 Jun 2024 11:21:05 +0200 Subject: [PATCH 04/15] Added s3fs to pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 9221d9b..941afda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "pydantic-yaml==1.2.0", "ray[default]==2.9.3", "loguru==0.7.2", + "s3fs=2024.6.0", # HuggingFace "datasets>=2.17.1", "transformers==4.36.2", From c5a701b427e54c3621d3840e5701a97828244e0d Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 11 Jun 2024 17:09:37 
+0200 Subject: [PATCH 05/15] Fixed s3fs issue with pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 941afda..db9458f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "pydantic-yaml==1.2.0", "ray[default]==2.9.3", "loguru==0.7.2", - "s3fs=2024.6.0", + "s3fs", # HuggingFace "datasets>=2.17.1", "transformers==4.36.2", From 662ee950d5f83ac0f3be5e6ab8a9b6b7905147ac Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Wed, 12 Jun 2024 14:28:39 +0200 Subject: [PATCH 06/15] Adding support for oai:// prefix --- src/lm_buddy/jobs/asset_loader.py | 2 +- src/lm_buddy/paths.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lm_buddy/jobs/asset_loader.py b/src/lm_buddy/jobs/asset_loader.py index d7663f3..c35428e 100644 --- a/src/lm_buddy/jobs/asset_loader.py +++ b/src/lm_buddy/jobs/asset_loader.py @@ -45,7 +45,7 @@ def resolve_asset_path(self, path: AssetPath) -> str: The returned string has its `PathPrefix` stripped away.. """ raw_path = strip_path_prefix(path) - if path.startswith((PathPrefix.FILE, PathPrefix.HUGGINGFACE)): + if path.startswith((PathPrefix.FILE, PathPrefix.HUGGINGFACE, PathPrefix.OPENAI)): return raw_path elif path.startswith(PathPrefix.WANDB): artifact = get_artifact_from_api(raw_path) diff --git a/src/lm_buddy/paths.py b/src/lm_buddy/paths.py index 633bce0..79244cf 100644 --- a/src/lm_buddy/paths.py +++ b/src/lm_buddy/paths.py @@ -14,6 +14,7 @@ class PathPrefix(str, Enum): HUGGINGFACE = "hf://" WANDB = "wandb://" S3 = "s3://" + OPENAI = "oai://" def strip_path_prefix(path: str) -> str: @@ -53,6 +54,9 @@ def validate_asset_path(path: str) -> "AssetPath": # e.g. if the assumption is we always want a file or a dir, we could # use https://s3pathlib.readthedocs.io to verify (.is_file() or ._is_dir()) pass + elif path.startswith(PathPrefix.OPENAI): + # TODO: Validate the OAI path structure? + pass else: allowed_prefixes = {x.value for x in PathPrefix} raise ValueError(f"'{path}' does not begin with an allowed prefix: {allowed_prefixes}") From dbe9d8472e650329c351cba77632cef4144193f5 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Wed, 12 Jun 2024 14:29:34 +0200 Subject: [PATCH 07/15] Added optional max_retries and system_prompt params to InferenceServerConfig --- src/lm_buddy/configs/vllm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/lm_buddy/configs/vllm.py b/src/lm_buddy/configs/vllm.py index 431f011..a5bb10e 100644 --- a/src/lm_buddy/configs/vllm.py +++ b/src/lm_buddy/configs/vllm.py @@ -12,12 +12,18 @@ class InferenceServerConfig(LMBuddyConfig): Note: This configuration is intended to be generic and not bound to the interface of any specific training/evaluation framework. See `LocalChatCompletionConfig` - or `vLLMCompleptionsConfig` for intended usage alongside a third-party framework. + or `vLLMCompletionsConfig` for intended usage alongside a third-party framework. 
""" base_url: str engine: AssetPath + # optional system prompt to be used by default in chat completions + system_prompt: str | None = None + + # max number of retries when communication with server fails + max_retries: int | None = None + class VLLMCompletionsConfig(LMBuddyConfig): """Configuration for a vLLM-based completions service From 91a2e984572b2bd5e18692221f59e5d520f44461 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Wed, 12 Jun 2024 14:32:34 +0200 Subject: [PATCH 08/15] Updated hf_evaluate with model_clients support, results saving, better config --- .../evaluation/hf_evaluate_config.yaml | 58 +++++++-- src/lm_buddy/configs/jobs/hf_evaluate.py | 19 ++- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 123 ++++++++++-------- src/lm_buddy/jobs/model_clients.py | 111 ++++++++++++++++ 4 files changed, 242 insertions(+), 69 deletions(-) create mode 100644 src/lm_buddy/jobs/model_clients.py diff --git a/examples/configs/evaluation/hf_evaluate_config.yaml b/examples/configs/evaluation/hf_evaluate_config.yaml index 39db968..44449df 100644 --- a/examples/configs/evaluation/hf_evaluate_config.yaml +++ b/examples/configs/evaluation/hf_evaluate_config.yaml @@ -1,25 +1,65 @@ name: "lm-buddy-hf-evaluate" +# Input dataset path dataset: path: "s3://platform-storage/datasets/dialogsum" -# Settings specific to hf_evaluate + +# Settings specific to the hf_evaluate entrypoint evaluation: - metrics: ["rouge", "bertscore", "meteor"] + # metrics to be used for the evaluation + # (you can add "rouge", "meteor", and "bertscore" atm) + metrics: ["rouge", "meteor"] # enable/disable tqdm to track eval progress - enable_tqdm: False - # rely on HF pipeline for summarization + # (useful when running interactively, noisy on ray logs) + enable_tqdm: True + # rely on HF pipeline for summarization (ignored if using OAI API) use_pipeline: True + # perform inference / evaluation on the first max_samples only + max_samples: 10 + # output file path + # - if you provide a path complete with a filename, results will be stored in it + # - if you provide a dir, results will be stored in //eval_results.json + # - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results) + # storage_path: "s3://platform-storage/experiments/results/" + +# Model to evaluate. Choose one of the following options by uncommenting it -# Model to evaluate +# 1. Local model +# - Provide model path to load the model locally +# - Make sure you add quantization details (see below) if the model is too large +# - Optionally, add a tokenizer (the one matching the specified model name is the default) model: path: "hf://facebook/bart-large-cnn" -quantization: - load_in_4bit: True - bnb_4bit_quant_type: "fp4" +# # 2. OpenAI +# # - The base_url is fixed +# # - Choose an engine name (see https://platform.openai.com/docs/models) +# # - Customize the system prompt if needed +# model: +# inference: +# base_url: "https://api.openai.com/v1" +# engine: "oai://gpt-4-turbo" +# system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences." +# max_retries: 3 + +# # 3. OpenAI - compatible model +# # - Works with local/remote vLLM-served models and llamafiles +# # - Provide base_url and engine +# # - Customize the system prompt if needed +# model: +# inference: +# base_url: "http://localhost:8081/v1" +# engine: "hf://mistralai/mistral-7b-instruct-v0.2" +# system_prompt: "You are a helpful assistant, expert in text summarization. 
For every prompt you receive, provide a summary of its contents in at most two sentences." +# max_retries: 3 + +# Quantization (use it if you are dealing with models too large to fit in RAM) +# quantization: +# load_in_4bit: True +# bnb_4bit_quant_type: "fp4" -# # Tracking info for where to log the run results +# Tracking info for where to log the run results # tracking: # project: "lm-buddy-examples" # entity: "sample" diff --git a/src/lm_buddy/configs/jobs/hf_evaluate.py b/src/lm_buddy/configs/jobs/hf_evaluate.py index 2e9c438..e863111 100644 --- a/src/lm_buddy/configs/jobs/hf_evaluate.py +++ b/src/lm_buddy/configs/jobs/hf_evaluate.py @@ -18,6 +18,8 @@ class HuggingFaceEvaluationConfig(LMBuddyConfig): metrics: conlist(str, min_length=1) use_pipeline: bool = False enable_tqdm: bool = False + max_samples: int | None = None + storage_path: str | None = None class HuggingFaceEvalJobConfig(JobConfig): @@ -40,7 +42,13 @@ def ensure_tokenizer_config(cls, values): case str() as model_path: values["tokenizer"]["path"] = model_path case dict() as model_data: - values["tokenizer"]["path"] = model_data["path"] + # if dict we might have model.path specified + # if we don't it is VLLMCompletion and we are ok + # with anything as it will be ignored + if model_data.get("path") is None: + values["tokenizer"]["path"] = "oai://tokenizer" + else: + values["tokenizer"]["path"] = model_data.get("path") case AutoModelConfig() as model_config: values["tokenizer"]["path"] = model_config.path # No fallback necessary, downstream validation will flag invalid model types @@ -63,8 +71,13 @@ def validate_tokenizer_arg(cls, x): def asset_paths(self) -> list[AssetPath]: match self.model: case AutoModelConfig() as config: - return {self.dataset.path, config.path, self.tokenizer.path} + return { + self.dataset.path, + self.evaluation.output_path, + config.path, + self.tokenizer.path, + } case VLLMCompletionsConfig() as config if config.inference.engine is not None: - return {self.dataset.path, config.inference.engine, self.tokenizer.path} + return {self.dataset.path, self.evaluation.output_path, config.inference.engine} case _: return {} diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index 3b72a37..678c4b7 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -2,102 +2,111 @@ lm-buddy entrypoint to run summary evaluation using huggingface eval """ +import json import time -from dataclasses import dataclass from pathlib import Path -from typing import Any -import torch -from datasets import Dataset +import s3fs from loguru import logger from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig +from lm_buddy.configs.vllm import VLLMCompletionsConfig from lm_buddy.constants import LM_BUDDY_RESULTS_PATH from lm_buddy.jobs.asset_loader import ( HuggingFaceDatasetLoader, HuggingFaceModelLoader, - HuggingFaceTokenizerLoader, ) from lm_buddy.jobs.common import EvaluationResult from lm_buddy.jobs.evaluation.metrics import EvaluationMetrics +from lm_buddy.jobs.model_clients import ( + HuggingFaceModelClient, + OpenAIModelClient, + PipelineModelClient, +) -@dataclass -class BadResponseError(Exception): - def __init__(self, message, error=None): - self.message = message - self.error = error +def save_outputs(config: HuggingFaceEvalJobConfig, evaluation_results: dict) -> Path: + storage_path = 
config.evaluation.storage_path + + # generate local temp file ANYWAY + # (we don't want to lose all eval data if there is an issue wth s3) + local_path = Path(LM_BUDDY_RESULTS_PATH) / config.name / "eval_results.json" + local_path.parent.mkdir(exist_ok=True, parents=True) + with local_path.open("w") as f: + json.dump(evaluation_results, f) + + # copy to s3 and return path + if storage_path is not None and storage_path.startswith("s3://"): + s3 = s3fs.S3FileSystem() + if storage_path.endswith("/"): + storage_path = "s3://" + str(Path(storage_path[5:]) / config.name / "eval_results.json") + logger.info(f"Storing into {storage_path}...") + s3.put_file(local_path, storage_path) + return storage_path + else: + return local_path def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # Init loaders hf_dataset_loader = HuggingFaceDatasetLoader() hf_model_loader = HuggingFaceModelLoader() - hf_tokenizer_loader = HuggingFaceTokenizerLoader() # Load dataset given its URI dataset = hf_dataset_loader.load_dataset(config.dataset) + # Limit dataset length if max_samples is specified + if config.evaluation.max_samples is not None: + dataset = dataset.select(range(config.evaluation.max_samples)) + # Enable / disable tqdm - input_samples = dataset.select(range(10))["examples"] + input_samples = dataset["examples"] dataset_iterable = tqdm(input_samples) if config.evaluation.enable_tqdm else input_samples predictions = [] - # depending on config, use the summarizer pipeline or directly call the model - # for inference - if config.evaluation.use_pipeline: - logger.info("Using summarization pipeline") - summarizer = pipeline( - "summarization", - model=hf_model_loader.resolve_asset_path(config.model.path), - device=0 if torch.cuda.is_available() else -1, - ) - - t = time.time() - for sample_txt in dataset_iterable: - # summarizer output is a list (1 element in this case) of dict with key = "summary_text" - predictions += summarizer(sample_txt, min_length=30, do_sample=False) - - # alternative: run on the whole dataset (does not seem to be faster) - # TODO: test on GPU and changing #workers in pipeline definition - # results = summarizer(input_samples, min_length=30, do_sample=False) - - logger.info(f"Summarization performed in {time.time()-t} seconds") - - predictions = [r["summary_text"] for r in predictions] - + # Choose which model client to use + if type(config.model) == VLLMCompletionsConfig: + model_name = config.model.inference.base_url else: - logger.info("Using direct HF model invocation") - - device = "cuda" if torch.cuda.is_available() else "cpu" - model = hf_model_loader.load_pretrained_model(config.model).to(device) - tokenizer = hf_tokenizer_loader.load_pretrained_tokenizer(config.tokenizer) - - for sample_txt in dataset_iterable: - inputs = tokenizer(sample_txt, truncation=True, padding=True, return_tensors="pt").to( - device - ) - generated_ids = model.generate(**inputs, max_new_tokens=256) - output_txt = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - predictions += output_txt + model_name = hf_model_loader.resolve_asset_path(config.model.path) - print(predictions) - - # Start evaluation + if model_name.startswith("http"): + # run the openai client + logger.info(f"Using OAI client. Endpoint: {model_name}") + model_client = OpenAIModelClient(model_name, config.model) + else: + # depending on config, use the summarizer pipeline or directly call the model + # for inference + if config.evaluation.use_pipeline: + logger.info(f"Using summarization pipeline. 
Model: {model_name}") + model_client = PipelineModelClient(model_name, config.model) + else: + logger.info(f"Using direct HF model invocation. Model: {model_name}") + model_client = HuggingFaceModelClient(model_name, config) + + # run inference + t = time.time() + for sample_txt in dataset_iterable: + predictions.append(model_client.predict(sample_txt)) + summarization_time = time.time() - t + logger.info(f"Summarization performed in {summarization_time} seconds") + + # run evaluation + ground_truth = dataset["ground_truth"] em = EvaluationMetrics(config.evaluation.metrics) - evaluation_results = em.run_all(predictions, input_samples) - - print(evaluation_results) + t = time.time() + evaluation_results = em.run_all(predictions, ground_truth) + summarization_time = time.time() - t + logger.info(f"Summarization performed in {summarization_time} seconds") - return "/tmp/dataset" + return save_outputs(config, evaluation_results) def run_hf_evaluation(config: HuggingFaceEvalJobConfig) -> EvaluationResult: # Run eval and store output in local filename result_dataset_path = run_eval(config) - logger.info(f"Prometheus evaluation dataset stored at {result_dataset_path}") + logger.info(f"Summarization eval results stored at {result_dataset_path}") return EvaluationResult( artifacts=[], diff --git a/src/lm_buddy/jobs/model_clients.py b/src/lm_buddy/jobs/model_clients.py new file mode 100644 index 0000000..0013950 --- /dev/null +++ b/src/lm_buddy/jobs/model_clients.py @@ -0,0 +1,111 @@ +from abc import abstractmethod + +import torch +from loguru import logger +from openai import OpenAI, OpenAIError +from openai.types import Completion +from transformers import pipeline + +from lm_buddy.configs.common import LMBuddyConfig +from lm_buddy.configs.huggingface import AutoModelConfig +from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig +from lm_buddy.configs.vllm import VLLMCompletionsConfig +from lm_buddy.jobs.asset_loader import HuggingFaceModelLoader, HuggingFaceTokenizerLoader + + +class BaseModelClient: + @abstractmethod + def __init__(self, model: str, config: LMBuddyConfig): + pass + + @abstractmethod + def predict(self, prompt: str) -> str: + pass + + +class PipelineModelClient(BaseModelClient): + def __init__(self, model: str, config: AutoModelConfig): + self._summarizer = pipeline( + "summarization", + model=model, + device=0 if torch.cuda.is_available() else -1, + ) + + def predict(self, prompt): + # summarizer output is a list (1 element in this case) of dict with key = "summary_text" + # TODO: bring summarizer parameters out at some point (not needed at the moment) + pred = self._summarizer(prompt, min_length=30, do_sample=False) + return pred[0]["summary_text"] + + +class HuggingFaceModelClient(BaseModelClient): + def __init__(self, model: str, config: HuggingFaceEvalJobConfig): + self._config = config + self._device = "cuda" if torch.cuda.is_available() else "cpu" + + hf_model_loader = HuggingFaceModelLoader() + hf_tokenizer_loader = HuggingFaceTokenizerLoader() + self._model = hf_model_loader.load_pretrained_model(config.model).to(self._device) + self._tokenizer = hf_tokenizer_loader.load_pretrained_tokenizer(config.tokenizer) + + def predict(self, prompt): + inputs = self._tokenizer(prompt, truncation=True, padding=True, return_tensors="pt").to( + self._device + ) + generated_ids = self._model.generate(**inputs, max_new_tokens=256) + return self._tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + + +class OpenAIModelClient(BaseModelClient): + def 
__init__(self, model: str, config: VLLMCompletionsConfig): + self._config = config + + hf_model_loader = HuggingFaceModelLoader() + self._engine = hf_model_loader.resolve_asset_path(config.inference.engine) + self._system = config.inference.system_prompt + self._client = OpenAI(base_url=model) + + def _openai_chat_completion( + self, + config: VLLMCompletionsConfig, + client: OpenAI, + prompt: str, + system: str = "You are a helpful assisant.", + ) -> Completion: + """Connects to a remote OpenAI-API-compatible endpoint + and returns a chat completion holding the model's response. + """ + + return self._client.chat.completions.create( + model=self._engine, + messages=[{"role": "system", "content": system}, {"role": "user", "content": prompt}], + max_tokens=config.max_tokens, + frequency_penalty=config.frequency_penalty, + temperature=config.temperature, + top_p=config.top_p, + ) + + def _get_response_with_retries( + self, + config: VLLMCompletionsConfig, + prompt: str, + ) -> tuple[str, str]: + current_retry_attempt = 1 + max_retries = 1 if config.inference.max_retries is None else config.inference.max_retries + while current_retry_attempt <= max_retries: + try: + response = self._openai_chat_completion( + self._config, self._client, prompt, self._system + ) + break + except OpenAIError as e: + logger.warning(f"{e.message}: " f"Retrying ({current_retry_attempt}/{max_retries})") + current_retry_attempt += 1 + if current_retry_attempt > max_retries: + raise e + return response + + def predict(self, prompt): + response = self._get_response_with_retries(self._config, prompt) + + return response.choices[0].message.content From 9bff948754bb3d922051f6c456ca0a221a89a6a4 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 13 Jun 2024 11:50:44 +0200 Subject: [PATCH 09/15] Added timer decorator and used it for hf_evaluate entrypoint --- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 42 +++++++++++++++------ src/lm_buddy/jobs/utils.py | 23 +++++++++++ 2 files changed, 53 insertions(+), 12 deletions(-) create mode 100644 src/lm_buddy/jobs/utils.py diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index 678c4b7..85262d1 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -3,7 +3,7 @@ """ import json -import time +from collections.abc import Iterable from pathlib import Path import s3fs @@ -20,10 +20,30 @@ from lm_buddy.jobs.common import EvaluationResult from lm_buddy.jobs.evaluation.metrics import EvaluationMetrics from lm_buddy.jobs.model_clients import ( + BaseModelClient, HuggingFaceModelClient, OpenAIModelClient, PipelineModelClient, ) +from lm_buddy.jobs.utils import timer + + +@timer +def predict(dataset_iterable: Iterable, model_client: BaseModelClient) -> list: + predictions = [] + + for sample_txt in dataset_iterable: + predictions.append(model_client.predict(sample_txt)) + + return predictions + + +@timer +def evaluate(predictions: list, ground_truth: list, evaluation_metrics: list): + em = EvaluationMetrics(evaluation_metrics) + evaluation_results = em.run_all(predictions, ground_truth) + + return evaluation_results def save_outputs(config: HuggingFaceEvalJobConfig, evaluation_results: dict) -> Path: @@ -63,7 +83,6 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # Enable / disable tqdm input_samples = dataset["examples"] dataset_iterable = tqdm(input_samples) if config.evaluation.enable_tqdm else input_samples - predictions = [] # Choose which model client to use if 
type(config.model) == VLLMCompletionsConfig: @@ -86,19 +105,18 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: model_client = HuggingFaceModelClient(model_name, config) # run inference - t = time.time() - for sample_txt in dataset_iterable: - predictions.append(model_client.predict(sample_txt)) - summarization_time = time.time() - t - logger.info(f"Summarization performed in {summarization_time} seconds") + predictions, summarization_time = predict(dataset_iterable, model_client) # run evaluation ground_truth = dataset["ground_truth"] - em = EvaluationMetrics(config.evaluation.metrics) - t = time.time() - evaluation_results = em.run_all(predictions, ground_truth) - summarization_time = time.time() - t - logger.info(f"Summarization performed in {summarization_time} seconds") + print(type(ground_truth)) + evaluation_results, evaluation_time = evaluate( + predictions, ground_truth, config.evaluation.metrics + ) + + # add timing to results dict + evaluation_results["summarization_time"] = summarization_time + evaluation_results["evaluation_time"] = evaluation_time return save_outputs(config, evaluation_results) diff --git a/src/lm_buddy/jobs/utils.py b/src/lm_buddy/jobs/utils.py new file mode 100644 index 0000000..396ef25 --- /dev/null +++ b/src/lm_buddy/jobs/utils.py @@ -0,0 +1,23 @@ +import functools +import time + +from loguru import logger + + +def timer(func): + """ + Decorator which times the execution of the wrapped func. + Execution time is logged and also returned together with func's returned value + (output will be a tuple). + """ + + @functools.wraps(func) + def wrapper_timer(*args, **kwargs): + tic = time.perf_counter() + value = func(*args, **kwargs) + toc = time.perf_counter() + elapsed_time = toc - tic + logger.info(f"Elapsed time for {func.__name__}: {elapsed_time:0.4f} seconds") + return value, elapsed_time + + return wrapper_timer From 4271d4905275a1584d30114f9a29c72f90ab5d3d Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 13 Jun 2024 14:44:31 +0200 Subject: [PATCH 10/15] Refactored config file, split into 3, added model_client docstrings --- .../evaluation/hf_evaluate_config.yaml | 40 +++--------------- .../hf_evaluate_inference_server_config.yaml | 34 +++++++++++++++ .../evaluation/hf_evaluate_openai_config.yaml | 34 +++++++++++++++ src/lm_buddy/jobs/evaluation/hf_evaluate.py | 4 +- src/lm_buddy/jobs/model_clients.py | 41 ++++++++++++++++++- 5 files changed, 115 insertions(+), 38 deletions(-) create mode 100644 examples/configs/evaluation/hf_evaluate_inference_server_config.yaml create mode 100644 examples/configs/evaluation/hf_evaluate_openai_config.yaml diff --git a/examples/configs/evaluation/hf_evaluate_config.yaml b/examples/configs/evaluation/hf_evaluate_config.yaml index 44449df..2ccf5ea 100644 --- a/examples/configs/evaluation/hf_evaluate_config.yaml +++ b/examples/configs/evaluation/hf_evaluate_config.yaml @@ -4,12 +4,11 @@ name: "lm-buddy-hf-evaluate" dataset: path: "s3://platform-storage/datasets/dialogsum" - # Settings specific to the hf_evaluate entrypoint evaluation: # metrics to be used for the evaluation # (you can add "rouge", "meteor", and "bertscore" atm) - metrics: ["rouge", "meteor"] + metrics: ["rouge", "meteor", "bertscore"] # enable/disable tqdm to track eval progress # (useful when running interactively, noisy on ray logs) enable_tqdm: True @@ -23,43 +22,14 @@ evaluation: # - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results) # storage_path: "s3://platform-storage/experiments/results/" 
-# Model to evaluate. Choose one of the following options by uncommenting it - -# 1. Local model -# - Provide model path to load the model locally -# - Make sure you add quantization details (see below) if the model is too large -# - Optionally, add a tokenizer (the one matching the specified model name is the default) +# Model to evaluate (local). +# - Provide model path to load the model locally +# - Make sure you add quantization details (see below) if the model is too large +# - Optionally, add a tokenizer (the one matching the specified model name is the default) model: path: "hf://facebook/bart-large-cnn" -# # 2. OpenAI -# # - The base_url is fixed -# # - Choose an engine name (see https://platform.openai.com/docs/models) -# # - Customize the system prompt if needed -# model: -# inference: -# base_url: "https://api.openai.com/v1" -# engine: "oai://gpt-4-turbo" -# system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences." -# max_retries: 3 - -# # 3. OpenAI - compatible model -# # - Works with local/remote vLLM-served models and llamafiles -# # - Provide base_url and engine -# # - Customize the system prompt if needed -# model: -# inference: -# base_url: "http://localhost:8081/v1" -# engine: "hf://mistralai/mistral-7b-instruct-v0.2" -# system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences." -# max_retries: 3 - # Quantization (use it if you are dealing with models too large to fit in RAM) # quantization: # load_in_4bit: True # bnb_4bit_quant_type: "fp4" - -# Tracking info for where to log the run results -# tracking: -# project: "lm-buddy-examples" -# entity: "sample" diff --git a/examples/configs/evaluation/hf_evaluate_inference_server_config.yaml b/examples/configs/evaluation/hf_evaluate_inference_server_config.yaml new file mode 100644 index 0000000..d4b6a23 --- /dev/null +++ b/examples/configs/evaluation/hf_evaluate_inference_server_config.yaml @@ -0,0 +1,34 @@ +name: "lm-buddy-hf-evaluate-is" + +# Input dataset path +dataset: + path: "s3://platform-storage/datasets/dialogsum" + +# Settings specific to the hf_evaluate entrypoint +evaluation: + # metrics to be used for the evaluation + # (you can add "rouge", "meteor", and "bertscore" atm) + metrics: ["rouge", "meteor", "bertscore"] + # enable/disable tqdm to track eval progress + # (useful when running interactively, noisy on ray logs) + enable_tqdm: True + # rely on HF pipeline for summarization (ignored if using OAI API) + use_pipeline: True + # perform inference / evaluation on the first max_samples only + max_samples: 10 + # output file path + # - if you provide a path complete with a filename, results will be stored in it + # - if you provide a dir, results will be stored in //eval_results.json + # - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results) + # storage_path: "s3://platform-storage/experiments/results/" + +# Model to evaluate (OpenAI-compatible API) +# - Works with local/remote vLLM-served models and llamafiles +# - Provide base_url and engine +# - Customize the system prompt if needed +model: + inference: + base_url: "http://localhost:8081/v1" + engine: "hf://mistralai/mistral-7b-instruct-v0.2" + system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences." 
+ max_retries: 3 diff --git a/examples/configs/evaluation/hf_evaluate_openai_config.yaml b/examples/configs/evaluation/hf_evaluate_openai_config.yaml new file mode 100644 index 0000000..0d6916c --- /dev/null +++ b/examples/configs/evaluation/hf_evaluate_openai_config.yaml @@ -0,0 +1,34 @@ +name: "lm-buddy-hf-evaluate-oai" + +# Input dataset path +dataset: + path: "s3://platform-storage/datasets/dialogsum" + +# Settings specific to the hf_evaluate entrypoint +evaluation: + # metrics to be used for the evaluation + # (you can add "rouge", "meteor", and "bertscore" atm) + metrics: ["rouge", "meteor", "bertscore"] + # enable/disable tqdm to track eval progress + # (useful when running interactively, noisy on ray logs) + enable_tqdm: True + # rely on HF pipeline for summarization (ignored if using OAI API) + use_pipeline: True + # perform inference / evaluation on the first max_samples only + max_samples: 10 + # output file path + # - if you provide a path complete with a filename, results will be stored in it + # - if you provide a dir, results will be stored in //eval_results.json + # - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results) + # storage_path: "s3://platform-storage/experiments/results/" + +# Model to evaluate (OpenAI) +# - The base_url is fixed +# - Choose an engine name (see https://platform.openai.com/docs/models) +# - Customize the system prompt if needed +model: + inference: + base_url: "https://api.openai.com/v1" + engine: "oai://gpt-4-turbo" + system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences." + max_retries: 3 diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index 85262d1..1343c20 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -23,7 +23,7 @@ BaseModelClient, HuggingFaceModelClient, OpenAIModelClient, - PipelineModelClient, + SummarizationPipelineModelClient, ) from lm_buddy.jobs.utils import timer @@ -99,7 +99,7 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # for inference if config.evaluation.use_pipeline: logger.info(f"Using summarization pipeline. Model: {model_name}") - model_client = PipelineModelClient(model_name, config.model) + model_client = SummarizationPipelineModelClient(model_name, config.model) else: logger.info(f"Using direct HF model invocation. Model: {model_name}") model_client = HuggingFaceModelClient(model_name, config) diff --git a/src/lm_buddy/jobs/model_clients.py b/src/lm_buddy/jobs/model_clients.py index 0013950..d0e1473 100644 --- a/src/lm_buddy/jobs/model_clients.py +++ b/src/lm_buddy/jobs/model_clients.py @@ -14,16 +14,34 @@ class BaseModelClient: + """ + Abstract class for a model client, used to provide a uniform interface + (currentnly just a simple predict method) to models served in different + ways (e.g. HF models loaded locally, OpenAI endpoints, vLLM inference + servers, llamafile). + """ + @abstractmethod def __init__(self, model: str, config: LMBuddyConfig): + """ + Used to initialize the model / inference service. + """ pass @abstractmethod def predict(self, prompt: str) -> str: + """ + Given a prompt, return a prediction. + """ pass -class PipelineModelClient(BaseModelClient): +class SummarizationPipelineModelClient(BaseModelClient): + """ + Model client for the huggingface summarization pipeline + (model is loaded locally). 
+ """ + def __init__(self, model: str, config: AutoModelConfig): self._summarizer = pipeline( "summarization", @@ -39,6 +57,14 @@ def predict(self, prompt): class HuggingFaceModelClient(BaseModelClient): + """ + Model client for HF models (model is loaded locally, both Seq2SeqLM + and CausalLM are supported). + - Provide model path to load the model locally + - Make sure you add quantization details if the model is too large + - Optionally, add a tokenizer (the one matching the specified model name is the default) + """ + def __init__(self, model: str, config: HuggingFaceEvalJobConfig): self._config = config self._device = "cuda" if torch.cuda.is_available() else "cpu" @@ -57,6 +83,19 @@ def predict(self, prompt): class OpenAIModelClient(BaseModelClient): + """ + Model client for models served via openai-compatible API. + For OpenAI models: + - The base_url is fixed + - Choose an engine name (see https://platform.openai.com/docs/models) + - Customize the system prompt if needed + + For compatible models: + - Works with local/remote vLLM-served models and llamafiles + - Provide base_url and engine + - Customize the system prompt if needed + """ + def __init__(self, model: str, config: VLLMCompletionsConfig): self._config = config From 0bffb3a3338bcde6357a950087e8b70a6a552822 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 13 Jun 2024 14:46:49 +0200 Subject: [PATCH 11/15] removed print functions --- src/lm_buddy/jobs/asset_loader.py | 2 -- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 1 - 2 files changed, 3 deletions(-) diff --git a/src/lm_buddy/jobs/asset_loader.py b/src/lm_buddy/jobs/asset_loader.py index c35428e..c12ff79 100644 --- a/src/lm_buddy/jobs/asset_loader.py +++ b/src/lm_buddy/jobs/asset_loader.py @@ -128,7 +128,6 @@ def load_pretrained_model( # load config first to get the model type model_config = self.load_pretrained_config(config) - # print(model_config) if getattr(model_config, "model_type") in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES: automodel_class = AutoModelForSeq2SeqLM @@ -137,7 +136,6 @@ def load_pretrained_model( else: logger.info("Model type not supported. 
Trying AutoModelForCausalLM") automodel_class = AutoModelForCausalLM - # print(automodel_class) return automodel_class.from_pretrained( pretrained_model_name_or_path=model_path, diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index 1343c20..e43b1ad 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -109,7 +109,6 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # run evaluation ground_truth = dataset["ground_truth"] - print(type(ground_truth)) evaluation_results, evaluation_time = evaluate( predictions, ground_truth, config.evaluation.metrics ) From f5b6e136e7122ec7cdb0301a3b9044ebd2eeb155 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 13 Jun 2024 15:16:24 +0200 Subject: [PATCH 12/15] Mixed fixes as per PR feedback --- src/lm_buddy/jobs/evaluation/hf_evaluate.py | 35 ++++++++++++++------- src/lm_buddy/jobs/evaluation/metrics.py | 8 +++-- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/lm_buddy/jobs/evaluation/hf_evaluate.py b/src/lm_buddy/jobs/evaluation/hf_evaluate.py index e43b1ad..2d24e1a 100644 --- a/src/lm_buddy/jobs/evaluation/hf_evaluate.py +++ b/src/lm_buddy/jobs/evaluation/hf_evaluate.py @@ -49,24 +49,36 @@ def evaluate(predictions: list, ground_truth: list, evaluation_metrics: list): def save_outputs(config: HuggingFaceEvalJobConfig, evaluation_results: dict) -> Path: storage_path = config.evaluation.storage_path - # generate local temp file ANYWAY - # (we don't want to lose all eval data if there is an issue wth s3) - local_path = Path(LM_BUDDY_RESULTS_PATH) / config.name / "eval_results.json" - local_path.parent.mkdir(exist_ok=True, parents=True) - with local_path.open("w") as f: - json.dump(evaluation_results, f) + def save_to_disk(local_path: Path): + logger.info(f"Storing into {local_path}...") + local_path.parent.mkdir(exist_ok=True, parents=True) + with local_path.open("w") as f: + json.dump(evaluation_results, f) - # copy to s3 and return path - if storage_path is not None and storage_path.startswith("s3://"): + def save_to_s3(local_path: Path, storage_path: str): s3 = s3fs.S3FileSystem() if storage_path.endswith("/"): storage_path = "s3://" + str(Path(storage_path[5:]) / config.name / "eval_results.json") logger.info(f"Storing into {storage_path}...") s3.put_file(local_path, storage_path) - return storage_path - else: + + # generate local temp file ANYWAY + # (we don't want to lose all eval data if there is an issue wth s3) + local_path = Path(LM_BUDDY_RESULTS_PATH) / config.name / "eval_results.json" + + try: + save_to_disk(local_path) + + # copy to s3 and return path + if storage_path is not None and storage_path.startswith("s3://"): + save_to_s3(local_path, storage_path) + return storage_path + return local_path + except Exception as e: + logger.error(e) + def run_eval(config: HuggingFaceEvalJobConfig) -> Path: # Init loaders @@ -117,7 +129,8 @@ def run_eval(config: HuggingFaceEvalJobConfig) -> Path: evaluation_results["summarization_time"] = summarization_time evaluation_results["evaluation_time"] = evaluation_time - return save_outputs(config, evaluation_results) + output_path = save_outputs(config, evaluation_results) + return output_path def run_hf_evaluation(config: HuggingFaceEvalJobConfig) -> EvaluationResult: diff --git a/src/lm_buddy/jobs/evaluation/metrics.py b/src/lm_buddy/jobs/evaluation/metrics.py index 82ce3f7..dd9b215 100644 --- a/src/lm_buddy/jobs/evaluation/metrics.py +++ 
b/src/lm_buddy/jobs/evaluation/metrics.py
@@ -11,8 +11,10 @@ def __init__(self, metrics):
             "bertscore": self._bertscore,
         }
 
-        self._chosen_metrics = set(metrics).intersection(set(self._supported_metrics.keys()))
-        self._unsupported_metrics = set(metrics).difference(set(self._supported_metrics.keys()))
+        # chosen metrics are the intersection between the provided and the supported ones
+        self._chosen_metrics = set(metrics) & set(self._supported_metrics.keys())
+        # unsupported metrics are the difference between the provided and the supported ones
+        self._unsupported_metrics = set(metrics) - set(self._supported_metrics.keys())
 
         if len(self._chosen_metrics) == 0:
             logger.info("No valid metrics selected")
         else:
@@ -45,7 +47,7 @@ def _meteor(self, pred, ref):
             evals["meteor"].append(ev.compute(predictions=[p], references=[r])["meteor"])
 
         # calculate mean
-        evals[f"meteor_mean"] = np.mean(evals["meteor"])
+        evals["meteor_mean"] = np.mean(evals["meteor"])
 
         return evals

From 1c17326e2095b91a3a5b1108fcde5e71c37bb71f Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Thu, 13 Jun 2024 15:19:27 +0200
Subject: [PATCH 13/15] Linter fix

---
 src/lm_buddy/jobs/model_clients.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lm_buddy/jobs/model_clients.py b/src/lm_buddy/jobs/model_clients.py
index d0e1473..0dbb60b 100644
--- a/src/lm_buddy/jobs/model_clients.py
+++ b/src/lm_buddy/jobs/model_clients.py
@@ -138,7 +138,7 @@ def _get_response_with_retries(
                 )
                 break
             except OpenAIError as e:
-                logger.warning(f"{e.message}: " f"Retrying ({current_retry_attempt}/{max_retries})")
+                logger.warning(f"{e.message}: Retrying ({current_retry_attempt}/{max_retries})")
                 current_retry_attempt += 1
                 if current_retry_attempt > max_retries:
                     raise e
         return response

From e83d94c5e1f1e01e0c105939d02066a185c2f8ef Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Thu, 13 Jun 2024 15:37:36 +0200
Subject: [PATCH 14/15] Frozen setuptools to 69.5.1, see
 https://stackoverflow.com/questions/78604018/importerror-cannot-import-name-packaging-from-pkg-resources-when-trying-to

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index db9458f..6245159 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools >= 61.0"]
+requires = ["setuptools == 69.5.1"]
 build-backend = "setuptools.build_meta"
 
 [project]

From bc94aaaa0b817c11f14ea64f7bb5af73f99767a4 Mon Sep 17 00:00:00 2001
From: Davide Eynard
Date: Thu, 13 Jun 2024 15:42:22 +0200
Subject: [PATCH 15/15] Frozen setuptools to 69.5.1, again

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6245159..98bc1ad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools == 69.5.1"]
+requires = ["setuptools==69.5.1"]
 build-backend = "setuptools.build_meta"
 
 [project]