This repository has been archived by the owner on Sep 24, 2024. It is now read-only.

Added v0 of prometheus lm-buddy entrypoint #75

Merged: 19 commits, Mar 12, 2024
Changes from 2 commits
22 changes: 22 additions & 0 deletions examples/configs/prometheus/prometheus_config.yaml
@@ -0,0 +1,22 @@
dataset:
  load_from:
    name: "wandb_file_artifact_name.json"
    version: "latest"
    project: "lm-buddy-prometheus"
    entity: "mozilla-ai"
  # field containing scoring instructions in the json file
  text_field: "instruction"

prometheus:
  inference:
    base_url: "http://your.vllm.server:8000/v1"
  tokenizer:
    load_from: "meta-llama/Llama-2-7b-chat-hf"
  max_tokens: 256
  # number of times the model is called per sample
  num_answers: 3

tracking:
  name: "lm-buddy-prometheus"
  project: "lm-buddy-examples"
  entity: "mozilla-ai"
14 changes: 13 additions & 1 deletion src/lm_buddy/cli/run.py
@@ -1,7 +1,12 @@
import click

import lm_buddy
from lm_buddy.jobs.configs import FinetuningJobConfig, LMHarnessJobConfig, SimpleJobConfig
from lm_buddy.jobs.configs import (
    FinetuningJobConfig,
    LMHarnessJobConfig,
    PrometheusJobConfig,
    SimpleJobConfig,
)

# TODO(RD2024-125): We should probably collapse all these commands into a single CLI command
# - Need to figure out best way to polymorphically deserialize the job config classes
@@ -32,3 +37,10 @@ def run_finetuning(config: str) -> None:
def run_lm_harness(config: str) -> None:
    config = LMHarnessJobConfig.from_yaml_file(config)
    lm_buddy.run_job(config)


@group.command("prometheus", help="Run the prometheus evaluation job.")
@click.option("--config", type=str)
def run_prometheus(config: str) -> None:
    config = PrometheusJobConfig.from_yaml_file(config)
    lm_buddy.run_job(config)
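
For anyone trying this out: a minimal sketch (not part of this PR) of exercising the new subcommand in-process via Click's test runner. The module path `lm_buddy.cli.run`, the `group` object, and the example config path are taken from this diff; actually running it assumes a reachable vLLM endpoint serving the Prometheus model.

from click.testing import CliRunner

from lm_buddy.cli.run import group

runner = CliRunner()
# Invoke the new "prometheus" command with the example config from this PR
result = runner.invoke(
    group,
    ["prometheus", "--config", "examples/configs/prometheus/prometheus_config.yaml"],
)
print(result.exit_code, result.output)
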
13 changes: 12 additions & 1 deletion src/lm_buddy/cli/schema.py
@@ -1,11 +1,16 @@
import json

import click

from lm_buddy.jobs.configs import FinetuningJobConfig, LMHarnessJobConfig, SimpleJobConfig
from lm_buddy.jobs.configs import (
    FinetuningJobConfig,
    LMHarnessJobConfig,
    PrometheusJobConfig,
    SimpleJobConfig,
)


@click.group(name="schema", help="Get a job configuration schema.")

[PR check: Ruff (I001)] src/lm_buddy/cli/schema.py:1:1: Import block is un-sorted or un-formatted
def group():
    pass

@@ -26,3 +31,9 @@
def schema_lm_harness() -> None:
    schema = LMHarnessJobConfig.model_json_schema()
    click.secho(json.dumps(schema, indent=2))


@group.command("prometheus", help="Schema for the prometheus job configuration.")
def schema_prometheus() -> None:
    schema = PrometheusJobConfig.model_json_schema()
    click.secho(json.dumps(schema, indent=2))
37 changes: 36 additions & 1 deletion src/lm_buddy/integrations/wandb/artifact_utils.py
@@ -1,10 +1,10 @@
from enum import Enum
from pathlib import Path
from typing import Any
from urllib.parse import ParseResult, urlparse

import wandb

import os

[PR check: Ruff (F401)] src/lm_buddy/integrations/wandb/artifact_utils.py:7:8: `os` imported but unused

[PR check: Ruff (I001)] src/lm_buddy/integrations/wandb/artifact_utils.py:1:1: Import block is un-sorted or un-formatted
class ArtifactType(str, Enum):
    """Enumeration of artifact types used by the LM Buddy."""
@@ -110,3 +110,38 @@
    table = wandb.Table(data=table_data, columns=columns)
    artifact.add(table, name=table_name)
    return artifact


def build_file_artifact(
    artifact_name: str,
    artifact_type: ArtifactType,
    file_path: str | Path,
    *,
    reference: bool = False,
    entry_name: str | None = None,
) -> wandb.Artifact:
"""Build an artifact containing a single file

Args:
artifact_name (str): Name of the artifact
artifact_type (ArtifactType): Type of artifact
file_path (str | Path): The full path (including filename) of the file

Keyword Args:
reference (bool): Only reference the file, do not copy contents. Defaults to False.
entry_name (str | None): Name for the file within the artifact. If None, defaults
to the original filename.

Returns:
wandb.Artifact: The generated artifact.
"""
    artifact = wandb.Artifact(name=artifact_name, type=artifact_type)

    if reference:
        artifact.add_reference(
            uri=f"{ArtifactURIScheme.FILE}://{file_path}",
            name=entry_name,
        )
    else:
        artifact.add_file(str(file_path), name=entry_name)
    return artifact
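
For context, a minimal usage sketch of the new helper (not part of this PR). `build_file_artifact` and `ArtifactType` come from this diff; the results path and artifact names are hypothetical.

from pathlib import Path

from lm_buddy.integrations.wandb import ArtifactType, build_file_artifact

# Hypothetical results file produced by an evaluation run
results_path = Path("/tmp/lm-buddy-prometheus-results.json")

# Copy the file contents into the artifact under its original filename
copied = build_file_artifact(
    artifact_name="prometheus-eval-results",
    artifact_type=ArtifactType.DATASET,
    file_path=results_path,
    reference=False,
)

# Or only reference the file in place via a file:// URI, without copying it
referenced = build_file_artifact(
    artifact_name="prometheus-eval-results-ref",
    artifact_type=ArtifactType.DATASET,
    file_path=results_path,
    reference=True,
    entry_name="results.json",
)
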
10 changes: 9 additions & 1 deletion src/lm_buddy/jobs/__init__.py
@@ -1,9 +1,15 @@
from lm_buddy.integrations.wandb import ArtifactLoader, WandbArtifactLoader
from lm_buddy.jobs._entrypoints import run_finetuning, run_lm_harness, run_simple
from lm_buddy.jobs._entrypoints import (
    run_finetuning,
    run_lm_harness,
    run_prometheus,
    run_simple,
)
from lm_buddy.jobs.configs import (
    FinetuningJobConfig,
    LMBuddyJobConfig,
    LMHarnessJobConfig,
    PrometheusJobConfig,
    SimpleJobConfig,
)

@@ -26,5 +32,7 @@ def run_job(
            run_finetuning(finetuning_config, artifact_loader)
        case LMHarnessJobConfig() as lm_harness_config:
            run_lm_harness(lm_harness_config, artifact_loader)
        case PrometheusJobConfig() as prometheus_config:
            run_prometheus(prometheus_config, artifact_loader)
        case _:
            raise ValueError(f"Received invalid job configuration: {config}")
3 changes: 2 additions & 1 deletion src/lm_buddy/jobs/_entrypoints/__init__.py
@@ -1,5 +1,6 @@
from lm_buddy.jobs._entrypoints.finetuning import run_finetuning
from lm_buddy.jobs._entrypoints.lm_harness import run_lm_harness
from lm_buddy.jobs._entrypoints.prometheus import run_prometheus
from lm_buddy.jobs._entrypoints.simple import run_simple

__all__ = ["run_finetuning", "run_lm_harness", "run_simple"]
__all__ = ["run_finetuning", "run_lm_harness", "run_prometheus", "run_simple"]
128 changes: 128 additions & 0 deletions src/lm_buddy/jobs/_entrypoints/prometheus.py
@@ -0,0 +1,128 @@
from lm_buddy.jobs.configs import PrometheusJobConfig
from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader
from lm_buddy.integrations.wandb import (
    ArtifactType,
    ArtifactLoader,
    build_file_artifact,
    wandb_init_from_config,
)
from fastchat.conversation import get_conv_template
from transformers import AutoTokenizer

[PR check: Ruff (F401)] src/lm_buddy/jobs/_entrypoints/prometheus.py:10:26: `transformers.AutoTokenizer` imported but unused
from openai import OpenAIError, OpenAI

from tqdm import tqdm
import os
import json
import copy

class BadResponseException(Exception):

[PR check: Ruff (I001)] src/lm_buddy/jobs/_entrypoints/prometheus.py:1:1: Import block is un-sorted or un-formatted
[PR check: Ruff (N818)] src/lm_buddy/jobs/_entrypoints/prometheus.py:18:7: Exception name `BadResponseException` should be named with an Error suffix
    def __init__(self, message, error):
        self.message = message
        self.error = error


def openai_completion(config, client, prompt):
    return client.completions.create(
        model = "kaist-ai/prometheus-13b-v1.0",
        prompt = prompt,
        best_of = config.prometheus.best_of,
        max_tokens = config.prometheus.max_tokens,
        frequency_penalty = config.prometheus.frequency_penalty,
        temperature = config.prometheus.temperature,
        top_p = config.prometheus.top_p
    )


def parse_response(response):
    try:
        assert response is not None
        response_text = response.choices[0].text
        feedback, score = response_text.split('[RESULT]')
        feedback = feedback.strip()
        score = score.strip()
        assert score in ["1","2","3","4","5"]
    except (ValueError, AssertionError) as e:
        raise BadResponseException("Server returned a bad response", e)

    return feedback, score


def instruction_to_prompt(instruction):
    conv = get_conv_template("llama-2")
    conv.set_system_message("You are a fair evaluator language model.")
    conv.append_message(conv.roles[0], instruction)
    conv.append_message(conv.roles[1], None)
    return conv.get_prompt()


def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader):

    # load dataset from W&B artifact
    hf_loader = HuggingFaceAssetLoader(artifact_loader)
    artifact_path,_ = hf_loader.resolve_asset_path(config.dataset.load_from)
    dataset_fname = os.path.join(artifact_path, config.dataset.load_from.name)

[PR check: Ruff (PTH118)] src/lm_buddy/jobs/_entrypoints/prometheus.py:63:21: `os.path.join()` should be replaced by `Path` with `/` operator

    with open(dataset_fname,'r') as f:

[PR check: Ruff (UP015)] src/lm_buddy/jobs/_entrypoints/prometheus.py:65:10: Unnecessary open mode parameters
[PR check: Ruff (PTH123)] src/lm_buddy/jobs/_entrypoints/prometheus.py:65:10: `open()` should be replaced by `Path.open()`
        # eval samples are JSON-encoded, each takes one line in the dataset file
        data = [json.loads(line) for line in f.readlines()]

    # get the tokenizer
    tokenizer = hf_loader.load_pretrained_tokenizer(config.prometheus.tokenizer)

    # instantiate OpenAI client to speak with the vLLM endpoint
    client = OpenAI(
        base_url = config.prometheus.inference.base_url
    )

    # open the output file for writing and iterate on samples
    output_fname = os.path.join("/tmp", config.tracking.name)

[PR check: Ruff (PTH118)] src/lm_buddy/jobs/_entrypoints/prometheus.py:78:20: `os.path.join()` should be replaced by `Path` with `/` operator
    with open(output_fname,'w') as file:
        for sample in tqdm(data):
            # convert instructions from the dataset (`text_field` in a dict) to
            # prompts that prometheus accepts
            prompt = instruction_to_prompt(sample[config.dataset.text_field])

            # skip those examples which are too long
            tokenized_prompt = tokenizer(prompt, truncation=False)
            if(len(tokenized_prompt['input_ids'])>3072):
                continue

            # prepare output
            result = copy.deepcopy(sample)
            result['prometheus_output'] = []
            result['prometheus_score'] = []

            for idx in range(config.prometheus.num_answers):

                i = 0
                while i < config.prometheus.max_retries:
                    try:
                        response = openai_completion(config, client, prompt)
                        feedback, score = parse_response(response)
                        print(feedback, score)
                        break
                    except (OpenAIError, BadResponseException) as e:
                        print(f"[w] {e.message}, retrying ({i+1}/{config.prometheus.max_retries})")
                        i += 1
                        if i == config.prometheus.max_retries:
                            raise e

                result['prometheus_output'].append(feedback)
                result['prometheus_score'].append(score)

            # dump sample results
            file.write(json.dumps(result)+"\n")


    # Register a dataset file artifact if tracking is enabled
    if config.tracking:

        with wandb_init_from_config(config.tracking) as run:
            file_artifact = build_file_artifact(
                artifact_name = config.tracking.name,
                artifact_type = ArtifactType.DATASET,
                file_path = output_fname,
                reference = False,
            )
            print("[i] Logging artifact for evaluation results...")
            artifact_loader.log_artifact(file_artifact)
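
As a quick illustration of the parsing convention used in `parse_response` above: the completion text is expected to contain free-form feedback, the literal `[RESULT]` delimiter, and then a score from 1 to 5. The response text below is invented for illustration only.

# Invented example of a Prometheus completion; real text comes back from the vLLM server.
response_text = "The answer follows the rubric but misses one criterion. [RESULT] 4"

feedback, score = response_text.split("[RESULT]")
feedback = feedback.strip()  # "The answer follows the rubric but misses one criterion."
score = score.strip()        # "4"
assert score in ["1", "2", "3", "4", "5"]
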
3 changes: 3 additions & 0 deletions src/lm_buddy/jobs/configs/__init__.py
@@ -5,6 +5,7 @@
    LMHarnessJobConfig,
    LocalChatCompletionsConfig,
)
from lm_buddy.jobs.configs.prometheus import PrometheusCompletionsConfig, PrometheusJobConfig
from lm_buddy.jobs.configs.simple import SimpleJobConfig

__all__ = [
@@ -15,5 +16,7 @@
"LMHarnessEvaluatorConfig",
"LMHarnessJobConfig",
"LocalChatCompletionsConfig",
"PrometheusCompletionsConfig",
"PrometheusJobConfig",
"SimpleJobConfig",
]
42 changes: 42 additions & 0 deletions src/lm_buddy/jobs/configs/prometheus.py
@@ -0,0 +1,42 @@
from typing import Literal

from pydantic import conlist, model_validator

from lm_buddy.types import BaseLMBuddyConfig
from lm_buddy.jobs.configs import LMBuddyJobConfig
from lm_buddy.integrations.wandb import WandbRunConfig
from lm_buddy.integrations.vllm import InferenceServerConfig
from lm_buddy.integrations.huggingface import TextDatasetConfig, AutoTokenizerConfig

class PrometheusCompletionsConfig(BaseLMBuddyConfig):
"""Configuration for a "local-completions" prometheus model.

The prometheus model is powered by a self-hosted inference server, specified
as an `InferenceServerConfig`. Additional arguments are also provided
to control the tokenizer type and generation parameters.
"""

    inference: InferenceServerConfig

    # vLLM-served model params
    best_of: int = 1
    max_tokens: int = 512
    frequency_penalty: float = 1.03
    temperature: float = 1.0
    top_p: float = 0.9

    # evaluation script params
    tokenizer: AutoTokenizerConfig | None = None
    num_answers: int = 3
    max_retries: int = 5


class PrometheusJobConfig(LMBuddyJobConfig):
"""Configuration to run a prometheus evaluation job."""

# dataset (json artifact from which we'll extract `text_field`)
dataset: TextDatasetConfig
    # details for our self-hosted prometheus endpoint
    prometheus: PrometheusCompletionsConfig
    # wandb experiment tracking details
    tracking: WandbRunConfig | None = None
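
Putting the pieces together, a minimal end-to-end sketch (not part of this PR): it assumes the example YAML above is saved at the path used in `examples/`, and that a vLLM server hosting the Prometheus model is reachable at the configured `base_url`.

import lm_buddy
from lm_buddy.jobs.configs import PrometheusJobConfig

# Load the example config shipped with this PR
config = PrometheusJobConfig.from_yaml_file("examples/configs/prometheus/prometheus_config.yaml")

# Dispatches to run_prometheus via the match statement in lm_buddy.jobs.run_job
lm_buddy.run_job(config)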