From e4e05b73bf377fb5a20f8ddf45ad363080630e18 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 29 Feb 2024 20:10:47 +0000 Subject: [PATCH 01/18] Added v0 of prometheus lm-buddy entrypoint --- .../configs/prometheus/prometheus_config.yaml | 20 +++ src/lm_buddy/cli/run.py | 14 +- src/lm_buddy/cli/schema.py | 13 +- .../integrations/wandb/artifact_utils.py | 37 ++++- src/lm_buddy/jobs/__init__.py | 10 +- src/lm_buddy/jobs/_entrypoints/__init__.py | 3 +- src/lm_buddy/jobs/_entrypoints/prometheus.py | 128 ++++++++++++++++++ src/lm_buddy/jobs/configs/__init__.py | 3 + src/lm_buddy/jobs/configs/prometheus.py | 42 ++++++ 9 files changed, 265 insertions(+), 5 deletions(-) create mode 100644 examples/configs/prometheus/prometheus_config.yaml create mode 100644 src/lm_buddy/jobs/_entrypoints/prometheus.py create mode 100644 src/lm_buddy/jobs/configs/prometheus.py diff --git a/examples/configs/prometheus/prometheus_config.yaml b/examples/configs/prometheus/prometheus_config.yaml new file mode 100644 index 00000000..a549aaa9 --- /dev/null +++ b/examples/configs/prometheus/prometheus_config.yaml @@ -0,0 +1,20 @@ +dataset: + load_from: + name: "wandb_file_artifact_name.json" + version: "latest" + project: "lm-buddy-prometheus" + entity: "mozilla-ai" + text_field: "instruction" + +prometheus: + inference: + base_url: "http://your.vllm.server:8000/v1" + tokenizer: + load_from: "meta-llama/Llama-2-7b-chat-hf" + max_tokens: 256 + num_answers: 3 + +tracking: + name: "lm-buddy-prometheus" + project: "lm-buddy-examples" + entity: "mozilla-ai" diff --git a/src/lm_buddy/cli/run.py b/src/lm_buddy/cli/run.py index 64d8ff90..1f0432c7 100644 --- a/src/lm_buddy/cli/run.py +++ b/src/lm_buddy/cli/run.py @@ -1,7 +1,12 @@ import click import lm_buddy -from lm_buddy.jobs.configs import FinetuningJobConfig, LMHarnessJobConfig, SimpleJobConfig +from lm_buddy.jobs.configs import ( + FinetuningJobConfig, + LMHarnessJobConfig, + PrometheusJobConfig, + SimpleJobConfig, +) # TODO(RD2024-125): We should probably collapse all these commands into a single CLI command # - Need to figure out best way to polymorphically deserialize the job config classes @@ -32,3 +37,10 @@ def run_finetuning(config: str) -> None: def run_lm_harness(config: str) -> None: config = LMHarnessJobConfig.from_yaml_file(config) lm_buddy.run_job(config) + + +@group.command("prometheus", help="Run the prometheus evaluation job.") +@click.option("--config", type=str) +def run_prometheus(config: str) -> None: + config = PrometheusJobConfig.from_yaml_file(config) + lm_buddy.run_job(config) diff --git a/src/lm_buddy/cli/schema.py b/src/lm_buddy/cli/schema.py index d33ad25c..6149c2c5 100644 --- a/src/lm_buddy/cli/schema.py +++ b/src/lm_buddy/cli/schema.py @@ -2,7 +2,12 @@ import click -from lm_buddy.jobs.configs import FinetuningJobConfig, LMHarnessJobConfig, SimpleJobConfig +from lm_buddy.jobs.configs import ( + FinetuningJobConfig, + LMHarnessJobConfig, + PrometheusJobConfig, + SimpleJobConfig, +) @click.group(name="schema", help="Get a job configuration schema.") @@ -26,3 +31,9 @@ def schema_finetuning() -> None: def schema_lm_harness() -> None: schema = LMHarnessJobConfig.model_json_schema() click.secho(json.dumps(schema, indent=2)) + + +@group.command("prometheus", help="Schema for the prometheus job configuration.") +def schema_prometheus() -> None: + schema = PrometheusJobConfig.model_json_schema() + click.secho(json.dumps(schema, indent=2)) diff --git a/src/lm_buddy/integrations/wandb/artifact_utils.py b/src/lm_buddy/integrations/wandb/artifact_utils.py 
index 315a9270..966ea537 100644 --- a/src/lm_buddy/integrations/wandb/artifact_utils.py +++ b/src/lm_buddy/integrations/wandb/artifact_utils.py @@ -4,7 +4,7 @@ from urllib.parse import ParseResult, urlparse import wandb - +import os class ArtifactType(str, Enum): """Enumeration of artifact types used by the LM Buddy.""" @@ -110,3 +110,38 @@ def build_table_artifact( table = wandb.Table(data=table_data, columns=columns) artifact.add(table, name=table_name) return artifact + + +def build_file_artifact( + artifact_name: str, + artifact_type: ArtifactType, + file_path: str | Path, + *, + reference: bool = False, + entry_name: str | None = None, +) -> wandb.Artifact: + """Build an artifact containing a single file + + Args: + artifact_name (str): Name of the artifact + artifact_type (ArtifactType): Type of artifact + file_path (str | Path): The full path (including filename) of the file + + Keyword Args: + reference (bool): Only reference the file, do not copy contents. Defaults to False. + entry_name (str | None): Name for the file within the artifact. If None, defaults + to the original filename. + + Returns: + wandb.Artifact: The generated artifact. + """ + artifact = wandb.Artifact(name=artifact_name, type=artifact_type) + + if reference: + artifact.add_reference( + uri=f"{ArtifactURIScheme.FILE}://{file_path}", + name=entry_name, + ) + else: + artifact.add_file(str(file_path), name=entry_name) + return artifact diff --git a/src/lm_buddy/jobs/__init__.py b/src/lm_buddy/jobs/__init__.py index 659a03ca..41e57911 100644 --- a/src/lm_buddy/jobs/__init__.py +++ b/src/lm_buddy/jobs/__init__.py @@ -1,9 +1,15 @@ from lm_buddy.integrations.wandb import ArtifactLoader, WandbArtifactLoader -from lm_buddy.jobs._entrypoints import run_finetuning, run_lm_harness, run_simple +from lm_buddy.jobs._entrypoints import ( + run_finetuning, + run_lm_harness, + run_prometheus, + run_simple, +) from lm_buddy.jobs.configs import ( FinetuningJobConfig, LMBuddyJobConfig, LMHarnessJobConfig, + PrometheusJobConfig, SimpleJobConfig, ) @@ -26,5 +32,7 @@ def run_job( run_finetuning(finetuning_config, artifact_loader) case LMHarnessJobConfig() as lm_harness_config: run_lm_harness(lm_harness_config, artifact_loader) + case PrometheusJobConfig() as prometheus_config: + run_prometheus(prometheus_config, artifact_loader) case _: raise ValueError(f"Received invalid job configuration: {config}") diff --git a/src/lm_buddy/jobs/_entrypoints/__init__.py b/src/lm_buddy/jobs/_entrypoints/__init__.py index bef03bac..26de4304 100644 --- a/src/lm_buddy/jobs/_entrypoints/__init__.py +++ b/src/lm_buddy/jobs/_entrypoints/__init__.py @@ -1,5 +1,6 @@ from lm_buddy.jobs._entrypoints.finetuning import run_finetuning from lm_buddy.jobs._entrypoints.lm_harness import run_lm_harness +from lm_buddy.jobs._entrypoints.prometheus import run_prometheus from lm_buddy.jobs._entrypoints.simple import run_simple -__all__ = ["run_finetuning", "run_lm_harness", "run_simple"] +__all__ = ["run_finetuning", "run_lm_harness", "run_prometheus", "run_simple"] diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py new file mode 100644 index 00000000..a351a7f3 --- /dev/null +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -0,0 +1,128 @@ +from lm_buddy.jobs.configs import PrometheusJobConfig +from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader +from lm_buddy.integrations.wandb import ( + ArtifactType, + ArtifactLoader, + build_file_artifact, + wandb_init_from_config, +) +from 
fastchat.conversation import get_conv_template +from transformers import AutoTokenizer +from openai import OpenAIError, OpenAI + +from tqdm import tqdm +import os +import json +import copy + +class BadResponseException(Exception): + def __init__(self, message, error): + self.message = message + self.error = error + + +def openai_completion(config, client, prompt): + return client.completions.create( + model = "kaist-ai/prometheus-13b-v1.0", + prompt = prompt, + best_of = config.prometheus.best_of, + max_tokens = config.prometheus.max_tokens, + frequency_penalty = config.prometheus.frequency_penalty, + temperature = config.prometheus.temperature, + top_p = config.prometheus.top_p + ) + + +def parse_response(response): + try: + assert response is not None + response_text = response.choices[0].text + feedback, score = response_text.split('[RESULT]') + feedback = feedback.strip() + score = score.strip() + assert score in ["1","2","3","4","5"] + except (ValueError, AssertionError) as e: + raise BadResponseException("Server returned a bad response", e) + + return feedback, score + + +def instruction_to_prompt(instruction): + conv = get_conv_template("llama-2") + conv.set_system_message("You are a fair evaluator language model.") + conv.append_message(conv.roles[0], instruction) + conv.append_message(conv.roles[1], None) + return conv.get_prompt() + + +def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader): + + # load dataset from W&B artifact + hf_loader = HuggingFaceAssetLoader(artifact_loader) + artifact_path,_ = hf_loader.resolve_asset_path(config.dataset.load_from) + dataset_fname = os.path.join(artifact_path, config.dataset.load_from.name) + + with open(dataset_fname,'r') as f: + # eval samples are JSON-encoded, each takes one line in the dataset file + data = [json.loads(line) for line in f.readlines()] + + # get the tokenizer + tokenizer = hf_loader.load_pretrained_tokenizer(config.prometheus.tokenizer) + + # instantiate OpenAI client to speak with the vLLM endpoint + client = OpenAI( + base_url = config.prometheus.inference.base_url + ) + + # open the output file for writing and iterate on samples + output_fname = os.path.join("/tmp", config.tracking.name) + with open(output_fname,'w') as file: + for sample in tqdm(data): + # convert instructions from the dataset (`text_field` in a dict) to + # prompts that prometheus accepts + prompt = instruction_to_prompt(sample[config.dataset.text_field]) + + # skip those examples which are too long + tokenized_prompt = tokenizer(prompt, truncation=False) + if(len(tokenized_prompt['input_ids'])>3072): + continue + + # prepare output + result = copy.deepcopy(sample) + result['prometheus_output'] = [] + result['prometheus_score'] = [] + + for idx in range(config.prometheus.num_answers): + + i = 0 + while i < config.prometheus.max_retries: + try: + response = openai_completion(config, client, prompt) + feedback, score = parse_response(response) + print(feedback, score) + break + except (OpenAIError, BadResponseException) as e: + print(f"[w] {e.message}, retrying ({i+1}/{config.prometheus.max_retries})") + i += 1 + if i == config.prometheus.max_retries: + raise e + + result['prometheus_output'].append(feedback) + result['prometheus_score'].append(score) + + # dump sample results + file.write(json.dumps(result)+"\n") + + + # Register a dataset file artifact if tracking is enabled + if config.tracking: + + with wandb_init_from_config(config.tracking) as run: + file_artifact = build_file_artifact( + artifact_name = 
config.tracking.name, + artifact_type = ArtifactType.DATASET, + file_path = output_fname, + reference = False, + ) + print("[i] Logging artifact for evaluation results...") + artifact_loader.log_artifact(file_artifact) diff --git a/src/lm_buddy/jobs/configs/__init__.py b/src/lm_buddy/jobs/configs/__init__.py index 294f4855..e289b80e 100644 --- a/src/lm_buddy/jobs/configs/__init__.py +++ b/src/lm_buddy/jobs/configs/__init__.py @@ -5,6 +5,7 @@ LMHarnessJobConfig, LocalChatCompletionsConfig, ) +from lm_buddy.jobs.configs.prometheus import PrometheusCompletionsConfig, PrometheusJobConfig from lm_buddy.jobs.configs.simple import SimpleJobConfig __all__ = [ @@ -15,5 +16,7 @@ "LMHarnessEvaluatorConfig", "LMHarnessJobConfig", "LocalChatCompletionsConfig", + "PrometheusCompletionsConfig", + "PrometheusJobConfig", "SimpleJobConfig", ] diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py new file mode 100644 index 00000000..5b7f7d05 --- /dev/null +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -0,0 +1,42 @@ +from typing import Literal + +from pydantic import conlist, model_validator + +from lm_buddy.types import BaseLMBuddyConfig +from lm_buddy.jobs.configs import LMBuddyJobConfig +from lm_buddy.integrations.wandb import WandbRunConfig +from lm_buddy.integrations.vllm import InferenceServerConfig +from lm_buddy.integrations.huggingface import TextDatasetConfig, AutoTokenizerConfig + +class PrometheusCompletionsConfig(BaseLMBuddyConfig): + """Configuration for a "local-completions" prometheus model. + + The prometheus model is powered by a self-hosted inference server, specified + as an `InferenceServerConfig`. Additional arguments are also provided + to control the tokenizer type and generation parameters. + """ + + inference: InferenceServerConfig + + # vLLM-served model params + best_of: int = 1 + max_tokens: int = 512 + frequency_penalty: float = 1.03 + temperature: float = 1.0 + top_p: float = 0.9 + + # evaluation script params + tokenizer: AutoTokenizerConfig | None = None + num_answers: int = 3 + max_retries: int = 5 + + +class PrometheusJobConfig(LMBuddyJobConfig): + """Configuration to run a prometheus evaluation job.""" + + # dataset (json artifact from which we'll extract `text_field`) + dataset: TextDatasetConfig + # details for our self-hosted prometheus endpoint + prometheus: PrometheusCompletionsConfig + # wandb experiment tracking details + tracking: WandbRunConfig | None = None From 8e222f75360aac6e5a68b8114f39a1b8bda6e482 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 29 Feb 2024 20:24:20 +0000 Subject: [PATCH 02/18] Added comments to prometheus_config.yaml Signed-off-by: Davide Eynard --- examples/configs/prometheus/prometheus_config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/configs/prometheus/prometheus_config.yaml b/examples/configs/prometheus/prometheus_config.yaml index a549aaa9..5784b299 100644 --- a/examples/configs/prometheus/prometheus_config.yaml +++ b/examples/configs/prometheus/prometheus_config.yaml @@ -4,6 +4,7 @@ dataset: version: "latest" project: "lm-buddy-prometheus" entity: "mozilla-ai" + # field containing scoring instructions in the json file text_field: "instruction" prometheus: @@ -12,6 +13,7 @@ prometheus: tokenizer: load_from: "meta-llama/Llama-2-7b-chat-hf" max_tokens: 256 + # number of times the model is called per sample num_answers: 3 tracking: From 4e9030234e85349f701a6258ff474282cb01260a Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 29 Feb 2024 20:36:34 +0000 Subject: 
[PATCH 03/18] Added link to kaistai's eval code to prometheus.py Signed-off-by: Davide Eynard --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index a351a7f3..3bf311f4 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -1,3 +1,6 @@ +# lm-buddy entrypoint to run evaluations using a Prometheus inference server +# see https://github.com/kaistAI/prometheus/blob/main/evaluation/benchmark/run_absolute_scoring.py + from lm_buddy.jobs.configs import PrometheusJobConfig from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader from lm_buddy.integrations.wandb import ( From de4be8b03ff8f3f66edbea90fbe1cb0184e21732 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Fri, 1 Mar 2024 13:21:11 +0000 Subject: [PATCH 04/18] Update dataset in src/lm_buddy/jobs/configs/prometheus.py following Sean's comment Co-authored-by: Sean Friedowitz Signed-off-by: Davide Eynard --- src/lm_buddy/jobs/configs/prometheus.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index 5b7f7d05..80fea5cb 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -34,8 +34,7 @@ class PrometheusCompletionsConfig(BaseLMBuddyConfig): class PrometheusJobConfig(LMBuddyJobConfig): """Configuration to run a prometheus evaluation job.""" - # dataset (json artifact from which we'll extract `text_field`) - dataset: TextDatasetConfig + dataset: TextDatasetConfig = Field(..., description="dataset (json artifact from which we'll extract `text_field`)") # details for our self-hosted prometheus endpoint prometheus: PrometheusCompletionsConfig # wandb experiment tracking details From cba2386d5a77bff315d6127c8a32e93617bf631a Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Mon, 11 Mar 2024 15:26:27 +0000 Subject: [PATCH 05/18] Removed asserts from parse_response --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index 3bf311f4..906a64b2 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -19,7 +19,7 @@ import copy class BadResponseException(Exception): - def __init__(self, message, error): + def __init__(self, message, error=None): self.message = message self.error = error @@ -37,15 +37,19 @@ def openai_completion(config, client, prompt): def parse_response(response): + if response is None: + raise BadResponseException("Server returned an empty response") + try: - assert response is not None response_text = response.choices[0].text + # note: this can raise a ValueError if the message is malformed feedback, score = response_text.split('[RESULT]') feedback = feedback.strip() score = score.strip() - assert score in ["1","2","3","4","5"] - except (ValueError, AssertionError) as e: - raise BadResponseException("Server returned a bad response", e) + if score not in ["1","2","3","4","5"]: + raise BadResponseException("Score not in range") + except (ValueError, BadResponseException) as e: + raise BadResponseException(f"Server returned a malformed response ({e})",e) return feedback, score From 6a6348f9eb805d2262d40906d8d3dc2c6e1907da Mon Sep 17 00:00:00 2001 From: Davide 
Eynard Date: Mon, 11 Mar 2024 15:41:55 +0000 Subject: [PATCH 06/18] Removed os.path in favor of pathlib --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 8 ++++---- src/lm_buddy/jobs/configs/prometheus.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index 906a64b2..46c8c914 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -14,7 +14,7 @@ from openai import OpenAIError, OpenAI from tqdm import tqdm -import os +from pathlib import Path import json import copy @@ -67,7 +67,7 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) # load dataset from W&B artifact hf_loader = HuggingFaceAssetLoader(artifact_loader) artifact_path,_ = hf_loader.resolve_asset_path(config.dataset.load_from) - dataset_fname = os.path.join(artifact_path, config.dataset.load_from.name) + dataset_fname = Path(artifact_path) / config.dataset.load_from.name with open(dataset_fname,'r') as f: # eval samples are JSON-encoded, each takes one line in the dataset file @@ -82,9 +82,9 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) ) # open the output file for writing and iterate on samples - output_fname = os.path.join("/tmp", config.tracking.name) + output_fname = Path("/tmp") / config.tracking.name with open(output_fname,'w') as file: - for sample in tqdm(data): + for sample in tqdm(data[:1]): # convert instructions from the dataset (`text_field` in a dict) to # prompts that prometheus accepts prompt = instruction_to_prompt(sample[config.dataset.text_field]) diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index 80fea5cb..b5d72b39 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -1,6 +1,6 @@ from typing import Literal -from pydantic import conlist, model_validator +from pydantic import Field, conlist, model_validator from lm_buddy.types import BaseLMBuddyConfig from lm_buddy.jobs.configs import LMBuddyJobConfig @@ -34,7 +34,7 @@ class PrometheusCompletionsConfig(BaseLMBuddyConfig): class PrometheusJobConfig(LMBuddyJobConfig): """Configuration to run a prometheus evaluation job.""" - dataset: TextDatasetConfig = Field(..., description="dataset (json artifact from which we'll extract `text_field`)") + dataset: TextDatasetConfig = Field(description="dataset (json artifact from which we'll extract `text_field`)") # details for our self-hosted prometheus endpoint prometheus: PrometheusCompletionsConfig # wandb experiment tracking details From 849b59ac35d0651e9309e286f70fec5106787327 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 13:44:05 +0000 Subject: [PATCH 07/18] Updated config, parametrised engine, scores, tqdm, removed extra tokenizer --- src/lm_buddy/integrations/vllm.py | 20 +++++++++- src/lm_buddy/jobs/_entrypoints/prometheus.py | 33 ++++++++++------ src/lm_buddy/jobs/configs/__init__.py | 4 +- src/lm_buddy/jobs/configs/prometheus.py | 40 ++++++++------------ 4 files changed, 57 insertions(+), 40 deletions(-) diff --git a/src/lm_buddy/integrations/vllm.py b/src/lm_buddy/integrations/vllm.py index 14e8e452..8eb6f7b4 100644 --- a/src/lm_buddy/integrations/vllm.py +++ b/src/lm_buddy/integrations/vllm.py @@ -12,8 +12,26 @@ class InferenceServerConfig(BaseLMBuddyConfig): Note: This configuration is intended to be generic and not bound to the interface of any specific 
training/evaluation framework. See `LocalChatCompletionConfig` - for intended usage alongside a third-party framework. + or `vLLMCompleptionsConfig` for intended usage alongside a third-party framework. """ base_url: str engine: str | HuggingFaceAssetPath | None = None + + +class vLLMCompletionsConfig(BaseLMBuddyConfig): + """Configuration for a vLLM-based completions service + + The "local-chat-completions" model is powered by a self-hosted inference server, + specified as an `InferenceServerConfig`. Additional arguments are also provided + to control the tokenizer type and generation parameters. + """ + + inference: InferenceServerConfig + + # vLLM-specific params + best_of: int | None = None + max_tokens: int | None = None + frequency_penalty: float | None = None + temperature: float | None = None + top_p: float | None = None \ No newline at end of file diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index 46c8c914..d609cde2 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -3,6 +3,7 @@ from lm_buddy.jobs.configs import PrometheusJobConfig from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader +from lm_buddy.integrations.huggingface.tokenizer_config import AutoTokenizerConfig from lm_buddy.integrations.wandb import ( ArtifactType, ArtifactLoader, @@ -26,7 +27,7 @@ def __init__(self, message, error=None): def openai_completion(config, client, prompt): return client.completions.create( - model = "kaist-ai/prometheus-13b-v1.0", + model = config.prometheus.inference.engine, prompt = prompt, best_of = config.prometheus.best_of, max_tokens = config.prometheus.max_tokens, @@ -36,7 +37,7 @@ def openai_completion(config, client, prompt): ) -def parse_response(response): +def parse_response(config, response): if response is None: raise BadResponseException("Server returned an empty response") @@ -46,8 +47,11 @@ def parse_response(response): feedback, score = response_text.split('[RESULT]') feedback = feedback.strip() score = score.strip() - if score not in ["1","2","3","4","5"]: - raise BadResponseException("Score not in range") + if score not in [str(s) for s in range( + config.evaluation.min_score, + config.evaluation.max_score+1 + )]: + raise BadResponseException(f"Score {score} is not in range") except (ValueError, BadResponseException) as e: raise BadResponseException(f"Server returned a malformed response ({e})",e) @@ -74,17 +78,23 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) data = [json.loads(line) for line in f.readlines()] # get the tokenizer - tokenizer = hf_loader.load_pretrained_tokenizer(config.prometheus.tokenizer) + tokenizer_config = AutoTokenizerConfig( + load_from = config.prometheus.inference.engine + ) + tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) # instantiate OpenAI client to speak with the vLLM endpoint client = OpenAI( base_url = config.prometheus.inference.base_url ) + # enable / disable tqdm + dataset_iterable = tqdm(data) if config.evaluation.enable_tqdm else data + # open the output file for writing and iterate on samples output_fname = Path("/tmp") / config.tracking.name with open(output_fname,'w') as file: - for sample in tqdm(data[:1]): + for sample in dataset_iterable: # convert instructions from the dataset (`text_field` in a dict) to # prompts that prometheus accepts prompt = instruction_to_prompt(sample[config.dataset.text_field]) @@ -99,19 +109,18 @@ def 
run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) result['prometheus_output'] = [] result['prometheus_score'] = [] - for idx in range(config.prometheus.num_answers): + for idx in range(config.evaluation.num_answers): i = 0 - while i < config.prometheus.max_retries: + while i < config.evaluation.max_retries: try: response = openai_completion(config, client, prompt) - feedback, score = parse_response(response) - print(feedback, score) + feedback, score = parse_response(config, response) break except (OpenAIError, BadResponseException) as e: - print(f"[w] {e.message}, retrying ({i+1}/{config.prometheus.max_retries})") + print(f"[w] {e.message}, retrying ({i+1}/{config.evaluation.max_retries})") i += 1 - if i == config.prometheus.max_retries: + if i == config.evaluation.max_retries: raise e result['prometheus_output'].append(feedback) diff --git a/src/lm_buddy/jobs/configs/__init__.py b/src/lm_buddy/jobs/configs/__init__.py index e289b80e..e7f71236 100644 --- a/src/lm_buddy/jobs/configs/__init__.py +++ b/src/lm_buddy/jobs/configs/__init__.py @@ -5,7 +5,7 @@ LMHarnessJobConfig, LocalChatCompletionsConfig, ) -from lm_buddy.jobs.configs.prometheus import PrometheusCompletionsConfig, PrometheusJobConfig +from lm_buddy.jobs.configs.prometheus import PrometheusEvaluationTaskConfig, PrometheusJobConfig from lm_buddy.jobs.configs.simple import SimpleJobConfig __all__ = [ @@ -16,7 +16,7 @@ "LMHarnessEvaluatorConfig", "LMHarnessJobConfig", "LocalChatCompletionsConfig", - "PrometheusCompletionsConfig", + "PrometheusEvaluationTaskConfig", "PrometheusJobConfig", "SimpleJobConfig", ] diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index b5d72b39..3e7a782c 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -1,41 +1,31 @@ -from typing import Literal - from pydantic import Field, conlist, model_validator from lm_buddy.types import BaseLMBuddyConfig from lm_buddy.jobs.configs import LMBuddyJobConfig from lm_buddy.integrations.wandb import WandbRunConfig -from lm_buddy.integrations.vllm import InferenceServerConfig -from lm_buddy.integrations.huggingface import TextDatasetConfig, AutoTokenizerConfig - -class PrometheusCompletionsConfig(BaseLMBuddyConfig): - """Configuration for a "local-completions" prometheus model. - - The prometheus model is powered by a self-hosted inference server, specified - as an `InferenceServerConfig`. Additional arguments are also provided - to control the tokenizer type and generation parameters. 
- """ - - inference: InferenceServerConfig +from lm_buddy.integrations.vllm import vLLMCompletionsConfig +from lm_buddy.integrations.huggingface import TextDatasetConfig - # vLLM-served model params - best_of: int = 1 - max_tokens: int = 512 - frequency_penalty: float = 1.03 - temperature: float = 1.0 - top_p: float = 0.9 - # evaluation script params - tokenizer: AutoTokenizerConfig | None = None +class PrometheusEvaluationTaskConfig(BaseLMBuddyConfig): + """Parameters specific to Prometheus evaluation.""" num_answers: int = 3 max_retries: int = 5 + min_score: int = 0 + max_score: int = 5 + enable_tqdm: bool = False class PrometheusJobConfig(LMBuddyJobConfig): - """Configuration to run a prometheus evaluation job.""" + """Configuration to run a prometheus job.""" dataset: TextDatasetConfig = Field(description="dataset (json artifact from which we'll extract `text_field`)") - # details for our self-hosted prometheus endpoint - prometheus: PrometheusCompletionsConfig + + # vLLM endpoint configuration + prometheus: vLLMCompletionsConfig + + # evaluation task configuration + evaluation: PrometheusEvaluationTaskConfig | None = None + # wandb experiment tracking details tracking: WandbRunConfig | None = None From ae10970debd7466b45358b8313ecdfdc836af9a5 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 14:17:07 +0000 Subject: [PATCH 08/18] Added type hints/return types + comments on new functions --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 29 +++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index d609cde2..4baf9ad2 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -12,7 +12,7 @@ ) from fastchat.conversation import get_conv_template from transformers import AutoTokenizer -from openai import OpenAIError, OpenAI +from openai import OpenAIError, OpenAI, Completion from tqdm import tqdm from pathlib import Path @@ -25,7 +25,15 @@ def __init__(self, message, error=None): self.error = error -def openai_completion(config, client, prompt): +def openai_completion( + config: PrometheusJobConfig, + client: OpenAI, + prompt: str +) -> Completion: + """ Connects to a remote OpenAI-API-compatible Prometheus endpoint + and returns a Completion holding the model's response. + """ + return client.completions.create( model = config.prometheus.inference.engine, prompt = prompt, @@ -37,7 +45,15 @@ def openai_completion(config, client, prompt): ) -def parse_response(config, response): +def parse_response( + config: PrometheusJobConfig, + response: Completion +) -> tuple[str, str]: + """ Given a Prometheus eval response as returned by the OpenAI API + endpoint (i.e. in Completion format), extract feedback + and score. + """ + if response is None: raise BadResponseException("Server returned an empty response") @@ -58,7 +74,12 @@ def parse_response(config, response): return feedback, score -def instruction_to_prompt(instruction): +def instruction_to_prompt( + instruction: str +) -> str: + """ Given some text containing Prometheus instructions, transform it + into an actual llama-2 prompt. 
+ """ conv = get_conv_template("llama-2") conv.set_system_message("You are a fair evaluator language model.") conv.append_message(conv.roles[0], instruction) From 2227c53ee0cd232e1bb1a3d716fe8ec930909df6 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 14:20:56 +0000 Subject: [PATCH 09/18] Added new config example for prometheus --- .../configs/prometheus/prometheus_config.yaml | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/examples/configs/prometheus/prometheus_config.yaml b/examples/configs/prometheus/prometheus_config.yaml index 5784b299..c8a6b5e8 100644 --- a/examples/configs/prometheus/prometheus_config.yaml +++ b/examples/configs/prometheus/prometheus_config.yaml @@ -10,11 +10,24 @@ dataset: prometheus: inference: base_url: "http://your.vllm.server:8000/v1" - tokenizer: - load_from: "meta-llama/Llama-2-7b-chat-hf" - max_tokens: 256 - # number of times the model is called per sample + engine: "kaist-ai/prometheus-13b-v1.0" + best_of: 1 + max_tokens: 512 + frequency_penalty: 1.03 + temperature: 1.0 + top_p: 0.9 + +evaluation: + # number of times a model is evaluated per sample num_answers: 3 + # max number of retries if a communication error + # with the server occurs + max_retries: 5 + # min and max scores as defined in the scoring rubric + min_score: 1 + max_score: 5 + # enable/disable tqdm to track eval progress + enable_tqdm: True tracking: name: "lm-buddy-prometheus" From 8ee2da2468b9f38074a0f4eb8da6fa269c27a968 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 15:24:51 +0000 Subject: [PATCH 10/18] Fixes for ruff --- src/lm_buddy/integrations/vllm.py | 4 +- src/lm_buddy/jobs/_entrypoints/prometheus.py | 139 ++++++++----------- src/lm_buddy/jobs/configs/prometheus.py | 17 ++- 3 files changed, 73 insertions(+), 87 deletions(-) diff --git a/src/lm_buddy/integrations/vllm.py b/src/lm_buddy/integrations/vllm.py index 8eb6f7b4..f327ddad 100644 --- a/src/lm_buddy/integrations/vllm.py +++ b/src/lm_buddy/integrations/vllm.py @@ -19,7 +19,7 @@ class InferenceServerConfig(BaseLMBuddyConfig): engine: str | HuggingFaceAssetPath | None = None -class vLLMCompletionsConfig(BaseLMBuddyConfig): +class VLLMCompletionsConfig(BaseLMBuddyConfig): """Configuration for a vLLM-based completions service The "local-chat-completions" model is powered by a self-hosted inference server, @@ -34,4 +34,4 @@ class vLLMCompletionsConfig(BaseLMBuddyConfig): max_tokens: int | None = None frequency_penalty: float | None = None temperature: float | None = None - top_p: float | None = None \ No newline at end of file + top_p: float | None = None diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index 4baf9ad2..f980a242 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -1,84 +1,75 @@ # lm-buddy entrypoint to run evaluations using a Prometheus inference server # see https://github.com/kaistAI/prometheus/blob/main/evaluation/benchmark/run_absolute_scoring.py -from lm_buddy.jobs.configs import PrometheusJobConfig +import copy +import json +from pathlib import Path + +from fastchat.conversation import get_conv_template +from openai import Completion, OpenAI, OpenAIError +from tqdm import tqdm + from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader from lm_buddy.integrations.huggingface.tokenizer_config import AutoTokenizerConfig from lm_buddy.integrations.wandb import ( - ArtifactType, - ArtifactLoader, - 
build_file_artifact, + ArtifactLoader, + ArtifactType, + build_file_artifact, wandb_init_from_config, ) -from fastchat.conversation import get_conv_template -from transformers import AutoTokenizer -from openai import OpenAIError, OpenAI, Completion +from lm_buddy.jobs.configs import PrometheusJobConfig -from tqdm import tqdm -from pathlib import Path -import json -import copy -class BadResponseException(Exception): +class BadResponseError(Exception): def __init__(self, message, error=None): self.message = message self.error = error -def openai_completion( - config: PrometheusJobConfig, - client: OpenAI, - prompt: str -) -> Completion: - """ Connects to a remote OpenAI-API-compatible Prometheus endpoint - and returns a Completion holding the model's response. +def openai_completion(config: PrometheusJobConfig, client: OpenAI, prompt: str) -> Completion: + """Connects to a remote OpenAI-API-compatible Prometheus endpoint + and returns a Completion holding the model's response. """ return client.completions.create( - model = config.prometheus.inference.engine, - prompt = prompt, - best_of = config.prometheus.best_of, - max_tokens = config.prometheus.max_tokens, - frequency_penalty = config.prometheus.frequency_penalty, - temperature = config.prometheus.temperature, - top_p = config.prometheus.top_p + model=config.prometheus.inference.engine, + prompt=prompt, + best_of=config.prometheus.best_of, + max_tokens=config.prometheus.max_tokens, + frequency_penalty=config.prometheus.frequency_penalty, + temperature=config.prometheus.temperature, + top_p=config.prometheus.top_p, ) -def parse_response( - config: PrometheusJobConfig, - response: Completion -) -> tuple[str, str]: - """ Given a Prometheus eval response as returned by the OpenAI API - endpoint (i.e. in Completion format), extract feedback - and score. +def parse_response(config: PrometheusJobConfig, response: Completion) -> tuple[str, str]: + """Given a Prometheus eval response as returned by the OpenAI API + endpoint (i.e. in Completion format), extract feedback + and score. """ - + if response is None: - raise BadResponseException("Server returned an empty response") + raise BadResponseError("Server returned an empty response") try: response_text = response.choices[0].text # note: this can raise a ValueError if the message is malformed - feedback, score = response_text.split('[RESULT]') + feedback, score = response_text.split("[RESULT]") feedback = feedback.strip() score = score.strip() - if score not in [str(s) for s in range( - config.evaluation.min_score, - config.evaluation.max_score+1 - )]: - raise BadResponseException(f"Score {score} is not in range") - except (ValueError, BadResponseException) as e: - raise BadResponseException(f"Server returned a malformed response ({e})",e) + if score not in [ + str(s) for s in range(config.evaluation.min_score, config.evaluation.max_score + 1) + ]: + raise BadResponseError(f"Score {score} is not in range") + except (ValueError, BadResponseError) as e: + raise BadResponseError(f"Server returned a malformed response ({e})", e) return feedback, score -def instruction_to_prompt( - instruction: str -) -> str: - """ Given some text containing Prometheus instructions, transform it - into an actual llama-2 prompt. +def instruction_to_prompt(instruction: str) -> str: + """Given some text containing Prometheus instructions, transform it + into an actual llama-2 prompt. 
""" conv = get_conv_template("llama-2") conv.set_system_message("You are a fair evaluator language model.") @@ -88,78 +79,70 @@ def instruction_to_prompt( def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader): - # load dataset from W&B artifact hf_loader = HuggingFaceAssetLoader(artifact_loader) - artifact_path,_ = hf_loader.resolve_asset_path(config.dataset.load_from) + artifact_path, _ = hf_loader.resolve_asset_path(config.dataset.load_from) dataset_fname = Path(artifact_path) / config.dataset.load_from.name - - with open(dataset_fname,'r') as f: + + with Path(dataset_fname).open() as f: # eval samples are JSON-encoded, each takes one line in the dataset file data = [json.loads(line) for line in f.readlines()] # get the tokenizer - tokenizer_config = AutoTokenizerConfig( - load_from = config.prometheus.inference.engine - ) + tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) # instantiate OpenAI client to speak with the vLLM endpoint - client = OpenAI( - base_url = config.prometheus.inference.base_url - ) + client = OpenAI(base_url=config.prometheus.inference.base_url) # enable / disable tqdm dataset_iterable = tqdm(data) if config.evaluation.enable_tqdm else data # open the output file for writing and iterate on samples output_fname = Path("/tmp") / config.tracking.name - with open(output_fname,'w') as file: + with output_fname.open("w") as file: for sample in dataset_iterable: # convert instructions from the dataset (`text_field` in a dict) to # prompts that prometheus accepts prompt = instruction_to_prompt(sample[config.dataset.text_field]) - # skip those examples which are too long - tokenized_prompt = tokenizer(prompt, truncation=False) - if(len(tokenized_prompt['input_ids'])>3072): + # skip those examples which are too long + tokenized_prompt = tokenizer(prompt, truncation=False) + if len(tokenized_prompt["input_ids"]) > 3072: continue # prepare output result = copy.deepcopy(sample) - result['prometheus_output'] = [] - result['prometheus_score'] = [] + result["prometheus_output"] = [] + result["prometheus_score"] = [] for idx in range(config.evaluation.num_answers): - i = 0 - while i < config.evaluation.max_retries: + while i < config.evaluation.max_retries: try: response = openai_completion(config, client, prompt) feedback, score = parse_response(config, response) break - except (OpenAIError, BadResponseException) as e: + except (OpenAIError, BadResponseError) as e: print(f"[w] {e.message}, retrying ({i+1}/{config.evaluation.max_retries})") i += 1 if i == config.evaluation.max_retries: raise e - - result['prometheus_output'].append(feedback) - result['prometheus_score'].append(score) - # dump sample results - file.write(json.dumps(result)+"\n") + result["prometheus_output"].append(feedback) + result["prometheus_score"].append(score) + # dump sample results + file.write(json.dumps(result) + "\n") # Register a dataset file artifact if tracking is enabled if config.tracking: - - with wandb_init_from_config(config.tracking) as run: + with wandb_init_from_config(config.tracking): file_artifact = build_file_artifact( - artifact_name = config.tracking.name, - artifact_type = ArtifactType.DATASET, - file_path = output_fname, - reference = False, + artifact_name=config.tracking.name, + artifact_type=ArtifactType.DATASET, + file_path=output_fname, + reference=False, ) print("[i] Logging artifact for evaluation results...") artifact_loader.log_artifact(file_artifact) 
diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index 3e7a782c..7b68012c 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -1,14 +1,15 @@ -from pydantic import Field, conlist, model_validator +from pydantic import Field -from lm_buddy.types import BaseLMBuddyConfig -from lm_buddy.jobs.configs import LMBuddyJobConfig -from lm_buddy.integrations.wandb import WandbRunConfig -from lm_buddy.integrations.vllm import vLLMCompletionsConfig from lm_buddy.integrations.huggingface import TextDatasetConfig +from lm_buddy.integrations.vllm import VLLMCompletionsConfig +from lm_buddy.integrations.wandb import WandbRunConfig +from lm_buddy.jobs.configs import LMBuddyJobConfig +from lm_buddy.types import BaseLMBuddyConfig class PrometheusEvaluationTaskConfig(BaseLMBuddyConfig): """Parameters specific to Prometheus evaluation.""" + num_answers: int = 3 max_retries: int = 5 min_score: int = 0 @@ -19,10 +20,12 @@ class PrometheusEvaluationTaskConfig(BaseLMBuddyConfig): class PrometheusJobConfig(LMBuddyJobConfig): """Configuration to run a prometheus job.""" - dataset: TextDatasetConfig = Field(description="dataset (json artifact from which we'll extract `text_field`)") + dataset: TextDatasetConfig = Field( + description="dataset (json artifact from which we'll extract `text_field`)" + ) # vLLM endpoint configuration - prometheus: vLLMCompletionsConfig + prometheus: VLLMCompletionsConfig # evaluation task configuration evaluation: PrometheusEvaluationTaskConfig | None = None From 482db8baa1974c8a2be4b239b555bb9ada0b1442 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 15:26:44 +0000 Subject: [PATCH 11/18] Added fschat to libs --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 7d2f3dc4..bd6a74f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ # Evaluation frameworks "lm-eval[openai]==0.4.1", "einops==0.7.0", + "fschat==0.2.36", ] [project.optional-dependencies] From ff3516c2605c3a4caab33e8b2fa7cbdc90de61ac Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 17:52:52 +0000 Subject: [PATCH 12/18] Added early break if wandb issues + load/save HF datasets --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 65 ++++++++++++-------- src/lm_buddy/jobs/configs/prometheus.py | 1 + 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index f980a242..f7ceb3cf 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -5,18 +5,21 @@ import json from pathlib import Path +from datasets import Dataset, load_dataset from fastchat.conversation import get_conv_template from openai import Completion, OpenAI, OpenAIError from tqdm import tqdm +from transformers import PreTrainedTokenizer from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader from lm_buddy.integrations.huggingface.tokenizer_config import AutoTokenizerConfig from lm_buddy.integrations.wandb import ( ArtifactLoader, ArtifactType, - build_file_artifact, + build_directory_artifact, wandb_init_from_config, ) +from lm_buddy.jobs.common import LMBuddyJobType from lm_buddy.jobs.configs import PrometheusJobConfig @@ -78,28 +81,15 @@ def instruction_to_prompt(instruction: str) -> str: return conv.get_prompt() -def run_prometheus(config: PrometheusJobConfig, artifact_loader: 
ArtifactLoader): - # load dataset from W&B artifact - hf_loader = HuggingFaceAssetLoader(artifact_loader) - artifact_path, _ = hf_loader.resolve_asset_path(config.dataset.load_from) - dataset_fname = Path(artifact_path) / config.dataset.load_from.name - - with Path(dataset_fname).open() as f: - # eval samples are JSON-encoded, each takes one line in the dataset file - data = [json.loads(line) for line in f.readlines()] - - # get the tokenizer - tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) - tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) - - # instantiate OpenAI client to speak with the vLLM endpoint - client = OpenAI(base_url=config.prometheus.inference.base_url) - +def run_eval( + config: PrometheusJobConfig, data: Dataset, tokenizer: PreTrainedTokenizer, client: OpenAI +) -> str: # enable / disable tqdm dataset_iterable = tqdm(data) if config.evaluation.enable_tqdm else data # open the output file for writing and iterate on samples - output_fname = Path("/tmp") / config.tracking.name + tracking_name = config.tracking.name if config.tracking is not None else "output.json" + output_fname = Path(config.evaluation.tmp_folder) / tracking_name with output_fname.open("w") as file: for sample in dataset_iterable: # convert instructions from the dataset (`text_field` in a dict) to @@ -132,17 +122,44 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) result["prometheus_output"].append(feedback) result["prometheus_score"].append(score) - # dump sample results + # dump sample results incrementally file.write(json.dumps(result) + "\n") + # convert plain json dataset in HF format + output_hf_name = str(Path(config.evaluation.tmp_folder) / "hf" / tracking_name) + ds = load_dataset("json", data_files=str(output_fname), split="train") + ds.save_to_disk(output_hf_name) + + return str(output_hf_name) + + +def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader): + # load dataset from W&B artifact + hf_loader = HuggingFaceAssetLoader(artifact_loader) + data = hf_loader.load_dataset(config.dataset) + + # get the tokenizer + tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) + tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) + + # instantiate OpenAI client to speak with the vLLM endpoint + client = OpenAI(base_url=config.prometheus.inference.base_url) + # Register a dataset file artifact if tracking is enabled if config.tracking: - with wandb_init_from_config(config.tracking): - file_artifact = build_file_artifact( + with wandb_init_from_config(config.tracking, job_type=LMBuddyJobType.EVALUATION): + # run eval and store output in local filename + output_dataset_name = run_eval(config, data, tokenizer, client) + + # store HF dataset as a directory artifact + artifact = build_directory_artifact( + dir_path=output_dataset_name, artifact_name=config.tracking.name, artifact_type=ArtifactType.DATASET, - file_path=output_fname, reference=False, ) print("[i] Logging artifact for evaluation results...") - artifact_loader.log_artifact(file_artifact) + artifact_loader.log_artifact(artifact) + else: + output_dataset_name = run_eval(config, data, tokenizer, client) + print(f"[i] Evaluation results stored in {output_dataset_name}") diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index 7b68012c..cc46ae0c 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -15,6 
+15,7 @@ class PrometheusEvaluationTaskConfig(BaseLMBuddyConfig): min_score: int = 0 max_score: int = 5 enable_tqdm: bool = False + tmp_folder: str = "/tmp" class PrometheusJobConfig(LMBuddyJobConfig): From 6f15d1d01ad8de915bc40f60abdd0bbe7af887ee Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 17:58:12 +0000 Subject: [PATCH 13/18] Added openai lib to pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index bd6a74f1..a765d64a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "lm-eval[openai]==0.4.1", "einops==0.7.0", "fschat==0.2.36", + "openai==1.3.9", ] [project.optional-dependencies] From 317301fab530ea62612a32013fe37a9d75b64c7e Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 18:00:27 +0000 Subject: [PATCH 14/18] Ruff compliance --- src/lm_buddy/cli/schema.py | 2 +- src/lm_buddy/integrations/wandb/artifact_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lm_buddy/cli/schema.py b/src/lm_buddy/cli/schema.py index 6149c2c5..4577ee84 100644 --- a/src/lm_buddy/cli/schema.py +++ b/src/lm_buddy/cli/schema.py @@ -3,7 +3,7 @@ import click from lm_buddy.jobs.configs import ( - FinetuningJobConfig, + FinetuningJobConfig, LMHarnessJobConfig, PrometheusJobConfig, SimpleJobConfig, diff --git a/src/lm_buddy/integrations/wandb/artifact_utils.py b/src/lm_buddy/integrations/wandb/artifact_utils.py index 966ea537..995ba869 100644 --- a/src/lm_buddy/integrations/wandb/artifact_utils.py +++ b/src/lm_buddy/integrations/wandb/artifact_utils.py @@ -4,7 +4,7 @@ from urllib.parse import ParseResult, urlparse import wandb -import os + class ArtifactType(str, Enum): """Enumeration of artifact types used by the LM Buddy.""" From ea35bc00dbeecc6ed7e47a6d0c68153456b4d96b Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 20:38:54 +0000 Subject: [PATCH 15/18] Addressed latest comments in PR --- src/lm_buddy/integrations/vllm.py | 5 ++ src/lm_buddy/jobs/_entrypoints/prometheus.py | 90 ++++++++++++-------- src/lm_buddy/jobs/configs/prometheus.py | 7 +- 3 files changed, 66 insertions(+), 36 deletions(-) diff --git a/src/lm_buddy/integrations/vllm.py b/src/lm_buddy/integrations/vllm.py index f327ddad..af99ccf6 100644 --- a/src/lm_buddy/integrations/vllm.py +++ b/src/lm_buddy/integrations/vllm.py @@ -25,6 +25,11 @@ class VLLMCompletionsConfig(BaseLMBuddyConfig): The "local-chat-completions" model is powered by a self-hosted inference server, specified as an `InferenceServerConfig`. Additional arguments are also provided to control the tokenizer type and generation parameters. + + Note that this is just a subset of the parameters allowed by a vLLM server (see + https://github.com/vllm-project/vllm/blob/main/vllm/sampling_params.py). If we + choose to use this configuration to cover for more use cases, it will make sense + to add the other supported configuration parameters too. 
""" inference: InferenceServerConfig diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index f7ceb3cf..e6f2a30c 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -1,8 +1,11 @@ -# lm-buddy entrypoint to run evaluations using a Prometheus inference server -# see https://github.com/kaistAI/prometheus/blob/main/evaluation/benchmark/run_absolute_scoring.py +""" +lm-buddy entrypoint to run evaluations using a Prometheus inference server +see https://github.com/kaistAI/prometheus/blob/main/evaluation/benchmark/run_absolute_scoring.py +""" import copy import json +from dataclasses import dataclass from pathlib import Path from datasets import Dataset, load_dataset @@ -23,6 +26,7 @@ from lm_buddy.jobs.configs import PrometheusJobConfig +@dataclass class BadResponseError(Exception): def __init__(self, message, error=None): self.message = message @@ -60,9 +64,7 @@ def parse_response(config: PrometheusJobConfig, response: Completion) -> tuple[s feedback, score = response_text.split("[RESULT]") feedback = feedback.strip() score = score.strip() - if score not in [ - str(s) for s in range(config.evaluation.min_score, config.evaluation.max_score + 1) - ]: + if score not in [str(s) for s in config.evaluation.scores]: raise BadResponseError(f"Score {score} is not in range") except (ValueError, BadResponseError) as e: raise BadResponseError(f"Server returned a malformed response ({e})", e) @@ -70,17 +72,38 @@ def parse_response(config: PrometheusJobConfig, response: Completion) -> tuple[s return feedback, score -def instruction_to_prompt(instruction: str) -> str: - """Given some text containing Prometheus instructions, transform it - into an actual llama-2 prompt. +def instruction_to_prompt(config: PrometheusJobConfig, instruction: str) -> str: + """Given some text containing Prometheus instructions, a conversation + template (e.g. "llama-2") and a system message (e.g. "You are a + fair evaluator language model"), generate an actual prompt. 
""" - conv = get_conv_template("llama-2") - conv.set_system_message("You are a fair evaluator language model.") + conv = get_conv_template(config.evaluation.conversation_template) + conv.set_system_message(config.evaluation.conversation_system_message) conv.append_message(conv.roles[0], instruction) conv.append_message(conv.roles[1], None) return conv.get_prompt() +def get_response_with_retries( + config: PrometheusJobConfig, client: OpenAI, prompt: str, max_retries: int +) -> tuple[str, str]: + current_retry_attempt = 1 + while current_retry_attempt <= config.evaluation.max_retries: + try: + response = openai_completion(config, client, prompt) + feedback, score = parse_response(config, response) + break + except (OpenAIError, BadResponseError) as e: + print( + f"[w] {e.message}, " + f"retrying ({current_retry_attempt}/{config.evaluation.max_retries})" + ) + current_retry_attempt += 1 + if current_retry_attempt > config.evaluation.max_retries: + raise e + return (feedback, score) + + def run_eval( config: PrometheusJobConfig, data: Dataset, tokenizer: PreTrainedTokenizer, client: OpenAI ) -> str: @@ -89,12 +112,12 @@ def run_eval( # open the output file for writing and iterate on samples tracking_name = config.tracking.name if config.tracking is not None else "output.json" - output_fname = Path(config.evaluation.tmp_folder) / tracking_name + output_fname = Path(config.evaluation.output_folder) / tracking_name with output_fname.open("w") as file: for sample in dataset_iterable: # convert instructions from the dataset (`text_field` in a dict) to # prompts that prometheus accepts - prompt = instruction_to_prompt(sample[config.dataset.text_field]) + prompt = instruction_to_prompt(config, sample[config.dataset.text_field]) # skip those examples which are too long tokenized_prompt = tokenizer(prompt, truncation=False) @@ -107,18 +130,9 @@ def run_eval( result["prometheus_score"] = [] for idx in range(config.evaluation.num_answers): - i = 0 - while i < config.evaluation.max_retries: - try: - response = openai_completion(config, client, prompt) - feedback, score = parse_response(config, response) - break - except (OpenAIError, BadResponseError) as e: - print(f"[w] {e.message}, retrying ({i+1}/{config.evaluation.max_retries})") - i += 1 - if i == config.evaluation.max_retries: - raise e - + (feedback, score) = get_response_with_retries( + config, client, prompt, config.evaluation.max_retries + ) result["prometheus_output"].append(feedback) result["prometheus_score"].append(score) @@ -126,7 +140,7 @@ def run_eval( file.write(json.dumps(result) + "\n") # convert plain json dataset in HF format - output_hf_name = str(Path(config.evaluation.tmp_folder) / "hf" / tracking_name) + output_hf_name = str(Path(config.evaluation.output_folder) / "hf" / tracking_name) ds = load_dataset("json", data_files=str(output_fname), split="train") ds.save_to_disk(output_hf_name) @@ -134,20 +148,20 @@ def run_eval( def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader): - # load dataset from W&B artifact - hf_loader = HuggingFaceAssetLoader(artifact_loader) - data = hf_loader.load_dataset(config.dataset) - - # get the tokenizer - tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) - tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) - # instantiate OpenAI client to speak with the vLLM endpoint client = OpenAI(base_url=config.prometheus.inference.base_url) # Register a dataset file artifact if tracking is enabled if config.tracking: with 
wandb_init_from_config(config.tracking, job_type=LMBuddyJobType.EVALUATION): + # load dataset from W&B artifact + hf_loader = HuggingFaceAssetLoader(artifact_loader) + data = hf_loader.load_dataset(config.dataset) + + # get the tokenizer + tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) + tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) + # run eval and store output in local filename output_dataset_name = run_eval(config, data, tokenizer, client) @@ -158,8 +172,16 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) artifact_type=ArtifactType.DATASET, reference=False, ) - print("[i] Logging artifact for evaluation results...") + print("Logging artifact for evaluation results...") artifact_loader.log_artifact(artifact) else: + # load dataset from W&B artifact + hf_loader = HuggingFaceAssetLoader(artifact_loader) + data = hf_loader.load_dataset(config.dataset) + + # get the tokenizer + tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) + tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) + output_dataset_name = run_eval(config, data, tokenizer, client) print(f"[i] Evaluation results stored in {output_dataset_name}") diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index cc46ae0c..db428a97 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -12,17 +12,20 @@ class PrometheusEvaluationTaskConfig(BaseLMBuddyConfig): num_answers: int = 3 max_retries: int = 5 + scores: list = [1, 2, 3, 4, 5] min_score: int = 0 max_score: int = 5 enable_tqdm: bool = False - tmp_folder: str = "/tmp" + output_folder: str = "/tmp" + conversation_template: str = "llama-2" + conversation_system_message: str = "You are a fair evaluator language model." class PrometheusJobConfig(LMBuddyJobConfig): """Configuration to run a prometheus job.""" dataset: TextDatasetConfig = Field( - description="dataset (json artifact from which we'll extract `text_field`)" + description="Dataset of text completions to evaluate using the Prometheus judge model." ) # vLLM endpoint configuration From 81ffbe6b7c0efd02c0ef0d4fe99935b6bcd7153c Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 20:45:19 +0000 Subject: [PATCH 16/18] Bump version to 0.3.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a765d64a..b7a61d00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "lm-buddy" -version = "0.2.4" +version = "0.3.0" authors = [ { name = "Sean Friedowitz", email = "sean@mozilla.ai" }, { name = "Aaron Gonzales", email = "aaron@mozilla.ai" }, From f8e38878e7443da768344b8f0aa871a082395e49 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 20:46:39 +0000 Subject: [PATCH 17/18] Added myself to authors --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index b7a61d00..3083fae1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ authors = [ { name = "Sean Friedowitz", email = "sean@mozilla.ai" }, { name = "Aaron Gonzales", email = "aaron@mozilla.ai" }, { name = "Vicki Boykis", email = "vicki@mozilla.ai" }, + { name = "Davide Eynard", email = "davide@mozilla.ai" }, ] description = "Ray-centric library for finetuning and evaluation of (large) language models." 
readme = "README.md" From ab66e0affd0ccf91ad10e589a75301d5e97b69f1 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 20:52:47 +0000 Subject: [PATCH 18/18] Factor dataset and tokenizer into run_eval --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 37 ++++++++------------ 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index e6f2a30c..75ca71ba 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -8,11 +8,10 @@ from dataclasses import dataclass from pathlib import Path -from datasets import Dataset, load_dataset +from datasets import load_dataset from fastchat.conversation import get_conv_template from openai import Completion, OpenAI, OpenAIError from tqdm import tqdm -from transformers import PreTrainedTokenizer from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader from lm_buddy.integrations.huggingface.tokenizer_config import AutoTokenizerConfig @@ -105,8 +104,18 @@ def get_response_with_retries( def run_eval( - config: PrometheusJobConfig, data: Dataset, tokenizer: PreTrainedTokenizer, client: OpenAI + config: PrometheusJobConfig, + artifact_loader: ArtifactLoader, + client: OpenAI, ) -> str: + # load dataset from W&B artifact + hf_loader = HuggingFaceAssetLoader(artifact_loader) + data = hf_loader.load_dataset(config.dataset) + + # get the tokenizer + tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) + tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) + # enable / disable tqdm dataset_iterable = tqdm(data) if config.evaluation.enable_tqdm else data @@ -154,16 +163,8 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) # Register a dataset file artifact if tracking is enabled if config.tracking: with wandb_init_from_config(config.tracking, job_type=LMBuddyJobType.EVALUATION): - # load dataset from W&B artifact - hf_loader = HuggingFaceAssetLoader(artifact_loader) - data = hf_loader.load_dataset(config.dataset) - - # get the tokenizer - tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) - tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) - # run eval and store output in local filename - output_dataset_name = run_eval(config, data, tokenizer, client) + output_dataset_name = run_eval(config, artifact_loader, client) # store HF dataset as a directory artifact artifact = build_directory_artifact( @@ -175,13 +176,5 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) print("Logging artifact for evaluation results...") artifact_loader.log_artifact(artifact) else: - # load dataset from W&B artifact - hf_loader = HuggingFaceAssetLoader(artifact_loader) - data = hf_loader.load_dataset(config.dataset) - - # get the tokenizer - tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) - tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) - - output_dataset_name = run_eval(config, data, tokenizer, client) - print(f"[i] Evaluation results stored in {output_dataset_name}") + output_dataset_name = run_eval(config, artifact_loader, client) + print(f"Evaluation results stored in {output_dataset_name}")