From e4e05b73bf377fb5a20f8ddf45ad363080630e18 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 29 Feb 2024 20:10:47 +0000 Subject: [PATCH 01/18] Added v0 of prometheus lm-buddy entrypoint --- .../configs/prometheus/prometheus_config.yaml | 20 +++ src/lm_buddy/cli/run.py | 14 +- src/lm_buddy/cli/schema.py | 13 +- .../integrations/wandb/artifact_utils.py | 37 ++++- src/lm_buddy/jobs/__init__.py | 10 +- src/lm_buddy/jobs/_entrypoints/__init__.py | 3 +- src/lm_buddy/jobs/_entrypoints/prometheus.py | 128 ++++++++++++++++++ src/lm_buddy/jobs/configs/__init__.py | 3 + src/lm_buddy/jobs/configs/prometheus.py | 42 ++++++ 9 files changed, 265 insertions(+), 5 deletions(-) create mode 100644 examples/configs/prometheus/prometheus_config.yaml create mode 100644 src/lm_buddy/jobs/_entrypoints/prometheus.py create mode 100644 src/lm_buddy/jobs/configs/prometheus.py diff --git a/examples/configs/prometheus/prometheus_config.yaml b/examples/configs/prometheus/prometheus_config.yaml new file mode 100644 index 00000000..a549aaa9 --- /dev/null +++ b/examples/configs/prometheus/prometheus_config.yaml @@ -0,0 +1,20 @@ +dataset: + load_from: + name: "wandb_file_artifact_name.json" + version: "latest" + project: "lm-buddy-prometheus" + entity: "mozilla-ai" + text_field: "instruction" + +prometheus: + inference: + base_url: "http://your.vllm.server:8000/v1" + tokenizer: + load_from: "meta-llama/Llama-2-7b-chat-hf" + max_tokens: 256 + num_answers: 3 + +tracking: + name: "lm-buddy-prometheus" + project: "lm-buddy-examples" + entity: "mozilla-ai" diff --git a/src/lm_buddy/cli/run.py b/src/lm_buddy/cli/run.py index 64d8ff90..1f0432c7 100644 --- a/src/lm_buddy/cli/run.py +++ b/src/lm_buddy/cli/run.py @@ -1,7 +1,12 @@ import click import lm_buddy -from lm_buddy.jobs.configs import FinetuningJobConfig, LMHarnessJobConfig, SimpleJobConfig +from lm_buddy.jobs.configs import ( + FinetuningJobConfig, + LMHarnessJobConfig, + PrometheusJobConfig, + SimpleJobConfig, +) # TODO(RD2024-125): We should probably collapse all these commands into a single CLI command # - Need to figure out best way to polymorphically deserialize the job config classes @@ -32,3 +37,10 @@ def run_finetuning(config: str) -> None: def run_lm_harness(config: str) -> None: config = LMHarnessJobConfig.from_yaml_file(config) lm_buddy.run_job(config) + + +@group.command("prometheus", help="Run the prometheus evaluation job.") +@click.option("--config", type=str) +def run_prometheus(config: str) -> None: + config = PrometheusJobConfig.from_yaml_file(config) + lm_buddy.run_job(config) diff --git a/src/lm_buddy/cli/schema.py b/src/lm_buddy/cli/schema.py index d33ad25c..6149c2c5 100644 --- a/src/lm_buddy/cli/schema.py +++ b/src/lm_buddy/cli/schema.py @@ -2,7 +2,12 @@ import click -from lm_buddy.jobs.configs import FinetuningJobConfig, LMHarnessJobConfig, SimpleJobConfig +from lm_buddy.jobs.configs import ( + FinetuningJobConfig, + LMHarnessJobConfig, + PrometheusJobConfig, + SimpleJobConfig, +) @click.group(name="schema", help="Get a job configuration schema.") @@ -26,3 +31,9 @@ def schema_finetuning() -> None: def schema_lm_harness() -> None: schema = LMHarnessJobConfig.model_json_schema() click.secho(json.dumps(schema, indent=2)) + + +@group.command("prometheus", help="Schema for the prometheus job configuration.") +def schema_prometheus() -> None: + schema = PrometheusJobConfig.model_json_schema() + click.secho(json.dumps(schema, indent=2)) diff --git a/src/lm_buddy/integrations/wandb/artifact_utils.py b/src/lm_buddy/integrations/wandb/artifact_utils.py 
index 315a9270..966ea537 100644 --- a/src/lm_buddy/integrations/wandb/artifact_utils.py +++ b/src/lm_buddy/integrations/wandb/artifact_utils.py @@ -4,7 +4,7 @@ from urllib.parse import ParseResult, urlparse import wandb - +import os class ArtifactType(str, Enum): """Enumeration of artifact types used by the LM Buddy.""" @@ -110,3 +110,38 @@ def build_table_artifact( table = wandb.Table(data=table_data, columns=columns) artifact.add(table, name=table_name) return artifact + + +def build_file_artifact( + artifact_name: str, + artifact_type: ArtifactType, + file_path: str | Path, + *, + reference: bool = False, + entry_name: str | None = None, +) -> wandb.Artifact: + """Build an artifact containing a single file + + Args: + artifact_name (str): Name of the artifact + artifact_type (ArtifactType): Type of artifact + file_path (str | Path): The full path (including filename) of the file + + Keyword Args: + reference (bool): Only reference the file, do not copy contents. Defaults to False. + entry_name (str | None): Name for the file within the artifact. If None, defaults + to the original filename. + + Returns: + wandb.Artifact: The generated artifact. + """ + artifact = wandb.Artifact(name=artifact_name, type=artifact_type) + + if reference: + artifact.add_reference( + uri=f"{ArtifactURIScheme.FILE}://{file_path}", + name=entry_name, + ) + else: + artifact.add_file(str(file_path), name=entry_name) + return artifact diff --git a/src/lm_buddy/jobs/__init__.py b/src/lm_buddy/jobs/__init__.py index 659a03ca..41e57911 100644 --- a/src/lm_buddy/jobs/__init__.py +++ b/src/lm_buddy/jobs/__init__.py @@ -1,9 +1,15 @@ from lm_buddy.integrations.wandb import ArtifactLoader, WandbArtifactLoader -from lm_buddy.jobs._entrypoints import run_finetuning, run_lm_harness, run_simple +from lm_buddy.jobs._entrypoints import ( + run_finetuning, + run_lm_harness, + run_prometheus, + run_simple, +) from lm_buddy.jobs.configs import ( FinetuningJobConfig, LMBuddyJobConfig, LMHarnessJobConfig, + PrometheusJobConfig, SimpleJobConfig, ) @@ -26,5 +32,7 @@ def run_job( run_finetuning(finetuning_config, artifact_loader) case LMHarnessJobConfig() as lm_harness_config: run_lm_harness(lm_harness_config, artifact_loader) + case PrometheusJobConfig() as prometheus_config: + run_prometheus(prometheus_config, artifact_loader) case _: raise ValueError(f"Received invalid job configuration: {config}") diff --git a/src/lm_buddy/jobs/_entrypoints/__init__.py b/src/lm_buddy/jobs/_entrypoints/__init__.py index bef03bac..26de4304 100644 --- a/src/lm_buddy/jobs/_entrypoints/__init__.py +++ b/src/lm_buddy/jobs/_entrypoints/__init__.py @@ -1,5 +1,6 @@ from lm_buddy.jobs._entrypoints.finetuning import run_finetuning from lm_buddy.jobs._entrypoints.lm_harness import run_lm_harness +from lm_buddy.jobs._entrypoints.prometheus import run_prometheus from lm_buddy.jobs._entrypoints.simple import run_simple -__all__ = ["run_finetuning", "run_lm_harness", "run_simple"] +__all__ = ["run_finetuning", "run_lm_harness", "run_prometheus", "run_simple"] diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py new file mode 100644 index 00000000..a351a7f3 --- /dev/null +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -0,0 +1,128 @@ +from lm_buddy.jobs.configs import PrometheusJobConfig +from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader +from lm_buddy.integrations.wandb import ( + ArtifactType, + ArtifactLoader, + build_file_artifact, + wandb_init_from_config, +) +from 
fastchat.conversation import get_conv_template +from transformers import AutoTokenizer +from openai import OpenAIError, OpenAI + +from tqdm import tqdm +import os +import json +import copy + +class BadResponseException(Exception): + def __init__(self, message, error): + self.message = message + self.error = error + + +def openai_completion(config, client, prompt): + return client.completions.create( + model = "kaist-ai/prometheus-13b-v1.0", + prompt = prompt, + best_of = config.prometheus.best_of, + max_tokens = config.prometheus.max_tokens, + frequency_penalty = config.prometheus.frequency_penalty, + temperature = config.prometheus.temperature, + top_p = config.prometheus.top_p + ) + + +def parse_response(response): + try: + assert response is not None + response_text = response.choices[0].text + feedback, score = response_text.split('[RESULT]') + feedback = feedback.strip() + score = score.strip() + assert score in ["1","2","3","4","5"] + except (ValueError, AssertionError) as e: + raise BadResponseException("Server returned a bad response", e) + + return feedback, score + + +def instruction_to_prompt(instruction): + conv = get_conv_template("llama-2") + conv.set_system_message("You are a fair evaluator language model.") + conv.append_message(conv.roles[0], instruction) + conv.append_message(conv.roles[1], None) + return conv.get_prompt() + + +def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader): + + # load dataset from W&B artifact + hf_loader = HuggingFaceAssetLoader(artifact_loader) + artifact_path,_ = hf_loader.resolve_asset_path(config.dataset.load_from) + dataset_fname = os.path.join(artifact_path, config.dataset.load_from.name) + + with open(dataset_fname,'r') as f: + # eval samples are JSON-encoded, each takes one line in the dataset file + data = [json.loads(line) for line in f.readlines()] + + # get the tokenizer + tokenizer = hf_loader.load_pretrained_tokenizer(config.prometheus.tokenizer) + + # instantiate OpenAI client to speak with the vLLM endpoint + client = OpenAI( + base_url = config.prometheus.inference.base_url + ) + + # open the output file for writing and iterate on samples + output_fname = os.path.join("/tmp", config.tracking.name) + with open(output_fname,'w') as file: + for sample in tqdm(data): + # convert instructions from the dataset (`text_field` in a dict) to + # prompts that prometheus accepts + prompt = instruction_to_prompt(sample[config.dataset.text_field]) + + # skip those examples which are too long + tokenized_prompt = tokenizer(prompt, truncation=False) + if(len(tokenized_prompt['input_ids'])>3072): + continue + + # prepare output + result = copy.deepcopy(sample) + result['prometheus_output'] = [] + result['prometheus_score'] = [] + + for idx in range(config.prometheus.num_answers): + + i = 0 + while i < config.prometheus.max_retries: + try: + response = openai_completion(config, client, prompt) + feedback, score = parse_response(response) + print(feedback, score) + break + except (OpenAIError, BadResponseException) as e: + print(f"[w] {e.message}, retrying ({i+1}/{config.prometheus.max_retries})") + i += 1 + if i == config.prometheus.max_retries: + raise e + + result['prometheus_output'].append(feedback) + result['prometheus_score'].append(score) + + # dump sample results + file.write(json.dumps(result)+"\n") + + + # Register a dataset file artifact if tracking is enabled + if config.tracking: + + with wandb_init_from_config(config.tracking) as run: + file_artifact = build_file_artifact( + artifact_name = 
config.tracking.name, + artifact_type = ArtifactType.DATASET, + file_path = output_fname, + reference = False, + ) + print("[i] Logging artifact for evaluation results...") + artifact_loader.log_artifact(file_artifact) diff --git a/src/lm_buddy/jobs/configs/__init__.py b/src/lm_buddy/jobs/configs/__init__.py index 294f4855..e289b80e 100644 --- a/src/lm_buddy/jobs/configs/__init__.py +++ b/src/lm_buddy/jobs/configs/__init__.py @@ -5,6 +5,7 @@ LMHarnessJobConfig, LocalChatCompletionsConfig, ) +from lm_buddy.jobs.configs.prometheus import PrometheusCompletionsConfig, PrometheusJobConfig from lm_buddy.jobs.configs.simple import SimpleJobConfig __all__ = [ @@ -15,5 +16,7 @@ "LMHarnessEvaluatorConfig", "LMHarnessJobConfig", "LocalChatCompletionsConfig", + "PrometheusCompletionsConfig", + "PrometheusJobConfig", "SimpleJobConfig", ] diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py new file mode 100644 index 00000000..5b7f7d05 --- /dev/null +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -0,0 +1,42 @@ +from typing import Literal + +from pydantic import conlist, model_validator + +from lm_buddy.types import BaseLMBuddyConfig +from lm_buddy.jobs.configs import LMBuddyJobConfig +from lm_buddy.integrations.wandb import WandbRunConfig +from lm_buddy.integrations.vllm import InferenceServerConfig +from lm_buddy.integrations.huggingface import TextDatasetConfig, AutoTokenizerConfig + +class PrometheusCompletionsConfig(BaseLMBuddyConfig): + """Configuration for a "local-completions" prometheus model. + + The prometheus model is powered by a self-hosted inference server, specified + as an `InferenceServerConfig`. Additional arguments are also provided + to control the tokenizer type and generation parameters. + """ + + inference: InferenceServerConfig + + # vLLM-served model params + best_of: int = 1 + max_tokens: int = 512 + frequency_penalty: float = 1.03 + temperature: float = 1.0 + top_p: float = 0.9 + + # evaluation script params + tokenizer: AutoTokenizerConfig | None = None + num_answers: int = 3 + max_retries: int = 5 + + +class PrometheusJobConfig(LMBuddyJobConfig): + """Configuration to run a prometheus evaluation job.""" + + # dataset (json artifact from which we'll extract `text_field`) + dataset: TextDatasetConfig + # details for our self-hosted prometheus endpoint + prometheus: PrometheusCompletionsConfig + # wandb experiment tracking details + tracking: WandbRunConfig | None = None From 8e222f75360aac6e5a68b8114f39a1b8bda6e482 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 29 Feb 2024 20:24:20 +0000 Subject: [PATCH 02/18] Added comments to prometheus_config.yaml Signed-off-by: Davide Eynard --- examples/configs/prometheus/prometheus_config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/configs/prometheus/prometheus_config.yaml b/examples/configs/prometheus/prometheus_config.yaml index a549aaa9..5784b299 100644 --- a/examples/configs/prometheus/prometheus_config.yaml +++ b/examples/configs/prometheus/prometheus_config.yaml @@ -4,6 +4,7 @@ dataset: version: "latest" project: "lm-buddy-prometheus" entity: "mozilla-ai" + # field containing scoring instructions in the json file text_field: "instruction" prometheus: @@ -12,6 +13,7 @@ prometheus: tokenizer: load_from: "meta-llama/Llama-2-7b-chat-hf" max_tokens: 256 + # number of times the model is called per sample num_answers: 3 tracking: From 4e9030234e85349f701a6258ff474282cb01260a Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Thu, 29 Feb 2024 20:36:34 +0000 Subject: 
[PATCH 03/18] Added link to kaistai's eval code to prometheus.py Signed-off-by: Davide Eynard --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index a351a7f3..3bf311f4 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -1,3 +1,6 @@ +# lm-buddy entrypoint to run evaluations using a Prometheus inference server +# see https://github.com/kaistAI/prometheus/blob/main/evaluation/benchmark/run_absolute_scoring.py + from lm_buddy.jobs.configs import PrometheusJobConfig from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader from lm_buddy.integrations.wandb import ( From de4be8b03ff8f3f66edbea90fbe1cb0184e21732 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Fri, 1 Mar 2024 13:21:11 +0000 Subject: [PATCH 04/18] Update dataset in src/lm_buddy/jobs/configs/prometheus.py following Sean's comment Co-authored-by: Sean Friedowitz Signed-off-by: Davide Eynard --- src/lm_buddy/jobs/configs/prometheus.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index 5b7f7d05..80fea5cb 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -34,8 +34,7 @@ class PrometheusCompletionsConfig(BaseLMBuddyConfig): class PrometheusJobConfig(LMBuddyJobConfig): """Configuration to run a prometheus evaluation job.""" - # dataset (json artifact from which we'll extract `text_field`) - dataset: TextDatasetConfig + dataset: TextDatasetConfig = Field(..., description="dataset (json artifact from which we'll extract `text_field`)") # details for our self-hosted prometheus endpoint prometheus: PrometheusCompletionsConfig # wandb experiment tracking details From cba2386d5a77bff315d6127c8a32e93617bf631a Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Mon, 11 Mar 2024 15:26:27 +0000 Subject: [PATCH 05/18] Removed asserts from parse_response --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index 3bf311f4..906a64b2 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -19,7 +19,7 @@ import copy class BadResponseException(Exception): - def __init__(self, message, error): + def __init__(self, message, error=None): self.message = message self.error = error @@ -37,15 +37,19 @@ def openai_completion(config, client, prompt): def parse_response(response): + if response is None: + raise BadResponseException("Server returned an empty response") + try: - assert response is not None response_text = response.choices[0].text + # note: this can raise a ValueError if the message is malformed feedback, score = response_text.split('[RESULT]') feedback = feedback.strip() score = score.strip() - assert score in ["1","2","3","4","5"] - except (ValueError, AssertionError) as e: - raise BadResponseException("Server returned a bad response", e) + if score not in ["1","2","3","4","5"]: + raise BadResponseException("Score not in range") + except (ValueError, BadResponseException) as e: + raise BadResponseException(f"Server returned a malformed response ({e})",e) return feedback, score From 6a6348f9eb805d2262d40906d8d3dc2c6e1907da Mon Sep 17 00:00:00 2001 From: Davide 
Eynard Date: Mon, 11 Mar 2024 15:41:55 +0000 Subject: [PATCH 06/18] Removed os.path in favor of pathlib --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 8 ++++---- src/lm_buddy/jobs/configs/prometheus.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index 906a64b2..46c8c914 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -14,7 +14,7 @@ from openai import OpenAIError, OpenAI from tqdm import tqdm -import os +from pathlib import Path import json import copy @@ -67,7 +67,7 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) # load dataset from W&B artifact hf_loader = HuggingFaceAssetLoader(artifact_loader) artifact_path,_ = hf_loader.resolve_asset_path(config.dataset.load_from) - dataset_fname = os.path.join(artifact_path, config.dataset.load_from.name) + dataset_fname = Path(artifact_path) / config.dataset.load_from.name with open(dataset_fname,'r') as f: # eval samples are JSON-encoded, each takes one line in the dataset file @@ -82,9 +82,9 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) ) # open the output file for writing and iterate on samples - output_fname = os.path.join("/tmp", config.tracking.name) + output_fname = Path("/tmp") / config.tracking.name with open(output_fname,'w') as file: - for sample in tqdm(data): + for sample in tqdm(data[:1]): # convert instructions from the dataset (`text_field` in a dict) to # prompts that prometheus accepts prompt = instruction_to_prompt(sample[config.dataset.text_field]) diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index 80fea5cb..b5d72b39 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -1,6 +1,6 @@ from typing import Literal -from pydantic import conlist, model_validator +from pydantic import Field, conlist, model_validator from lm_buddy.types import BaseLMBuddyConfig from lm_buddy.jobs.configs import LMBuddyJobConfig @@ -34,7 +34,7 @@ class PrometheusCompletionsConfig(BaseLMBuddyConfig): class PrometheusJobConfig(LMBuddyJobConfig): """Configuration to run a prometheus evaluation job.""" - dataset: TextDatasetConfig = Field(..., description="dataset (json artifact from which we'll extract `text_field`)") + dataset: TextDatasetConfig = Field(description="dataset (json artifact from which we'll extract `text_field`)") # details for our self-hosted prometheus endpoint prometheus: PrometheusCompletionsConfig # wandb experiment tracking details From 849b59ac35d0651e9309e286f70fec5106787327 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 13:44:05 +0000 Subject: [PATCH 07/18] Updated config, parametrised engine, scores, tqdm, removed extra tokenizer --- src/lm_buddy/integrations/vllm.py | 20 +++++++++- src/lm_buddy/jobs/_entrypoints/prometheus.py | 33 ++++++++++------ src/lm_buddy/jobs/configs/__init__.py | 4 +- src/lm_buddy/jobs/configs/prometheus.py | 40 ++++++++------------ 4 files changed, 57 insertions(+), 40 deletions(-) diff --git a/src/lm_buddy/integrations/vllm.py b/src/lm_buddy/integrations/vllm.py index 14e8e452..8eb6f7b4 100644 --- a/src/lm_buddy/integrations/vllm.py +++ b/src/lm_buddy/integrations/vllm.py @@ -12,8 +12,26 @@ class InferenceServerConfig(BaseLMBuddyConfig): Note: This configuration is intended to be generic and not bound to the interface of any specific 
training/evaluation framework. See `LocalChatCompletionConfig` - for intended usage alongside a third-party framework. + or `vLLMCompleptionsConfig` for intended usage alongside a third-party framework. """ base_url: str engine: str | HuggingFaceAssetPath | None = None + + +class vLLMCompletionsConfig(BaseLMBuddyConfig): + """Configuration for a vLLM-based completions service + + The "local-chat-completions" model is powered by a self-hosted inference server, + specified as an `InferenceServerConfig`. Additional arguments are also provided + to control the tokenizer type and generation parameters. + """ + + inference: InferenceServerConfig + + # vLLM-specific params + best_of: int | None = None + max_tokens: int | None = None + frequency_penalty: float | None = None + temperature: float | None = None + top_p: float | None = None \ No newline at end of file diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index 46c8c914..d609cde2 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -3,6 +3,7 @@ from lm_buddy.jobs.configs import PrometheusJobConfig from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader +from lm_buddy.integrations.huggingface.tokenizer_config import AutoTokenizerConfig from lm_buddy.integrations.wandb import ( ArtifactType, ArtifactLoader, @@ -26,7 +27,7 @@ def __init__(self, message, error=None): def openai_completion(config, client, prompt): return client.completions.create( - model = "kaist-ai/prometheus-13b-v1.0", + model = config.prometheus.inference.engine, prompt = prompt, best_of = config.prometheus.best_of, max_tokens = config.prometheus.max_tokens, @@ -36,7 +37,7 @@ def openai_completion(config, client, prompt): ) -def parse_response(response): +def parse_response(config, response): if response is None: raise BadResponseException("Server returned an empty response") @@ -46,8 +47,11 @@ def parse_response(response): feedback, score = response_text.split('[RESULT]') feedback = feedback.strip() score = score.strip() - if score not in ["1","2","3","4","5"]: - raise BadResponseException("Score not in range") + if score not in [str(s) for s in range( + config.evaluation.min_score, + config.evaluation.max_score+1 + )]: + raise BadResponseException(f"Score {score} is not in range") except (ValueError, BadResponseException) as e: raise BadResponseException(f"Server returned a malformed response ({e})",e) @@ -74,17 +78,23 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) data = [json.loads(line) for line in f.readlines()] # get the tokenizer - tokenizer = hf_loader.load_pretrained_tokenizer(config.prometheus.tokenizer) + tokenizer_config = AutoTokenizerConfig( + load_from = config.prometheus.inference.engine + ) + tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) # instantiate OpenAI client to speak with the vLLM endpoint client = OpenAI( base_url = config.prometheus.inference.base_url ) + # enable / disable tqdm + dataset_iterable = tqdm(data) if config.evaluation.enable_tqdm else data + # open the output file for writing and iterate on samples output_fname = Path("/tmp") / config.tracking.name with open(output_fname,'w') as file: - for sample in tqdm(data[:1]): + for sample in dataset_iterable: # convert instructions from the dataset (`text_field` in a dict) to # prompts that prometheus accepts prompt = instruction_to_prompt(sample[config.dataset.text_field]) @@ -99,19 +109,18 @@ def 
run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) result['prometheus_output'] = [] result['prometheus_score'] = [] - for idx in range(config.prometheus.num_answers): + for idx in range(config.evaluation.num_answers): i = 0 - while i < config.prometheus.max_retries: + while i < config.evaluation.max_retries: try: response = openai_completion(config, client, prompt) - feedback, score = parse_response(response) - print(feedback, score) + feedback, score = parse_response(config, response) break except (OpenAIError, BadResponseException) as e: - print(f"[w] {e.message}, retrying ({i+1}/{config.prometheus.max_retries})") + print(f"[w] {e.message}, retrying ({i+1}/{config.evaluation.max_retries})") i += 1 - if i == config.prometheus.max_retries: + if i == config.evaluation.max_retries: raise e result['prometheus_output'].append(feedback) diff --git a/src/lm_buddy/jobs/configs/__init__.py b/src/lm_buddy/jobs/configs/__init__.py index e289b80e..e7f71236 100644 --- a/src/lm_buddy/jobs/configs/__init__.py +++ b/src/lm_buddy/jobs/configs/__init__.py @@ -5,7 +5,7 @@ LMHarnessJobConfig, LocalChatCompletionsConfig, ) -from lm_buddy.jobs.configs.prometheus import PrometheusCompletionsConfig, PrometheusJobConfig +from lm_buddy.jobs.configs.prometheus import PrometheusEvaluationTaskConfig, PrometheusJobConfig from lm_buddy.jobs.configs.simple import SimpleJobConfig __all__ = [ @@ -16,7 +16,7 @@ "LMHarnessEvaluatorConfig", "LMHarnessJobConfig", "LocalChatCompletionsConfig", - "PrometheusCompletionsConfig", + "PrometheusEvaluationTaskConfig", "PrometheusJobConfig", "SimpleJobConfig", ] diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index b5d72b39..3e7a782c 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -1,41 +1,31 @@ -from typing import Literal - from pydantic import Field, conlist, model_validator from lm_buddy.types import BaseLMBuddyConfig from lm_buddy.jobs.configs import LMBuddyJobConfig from lm_buddy.integrations.wandb import WandbRunConfig -from lm_buddy.integrations.vllm import InferenceServerConfig -from lm_buddy.integrations.huggingface import TextDatasetConfig, AutoTokenizerConfig - -class PrometheusCompletionsConfig(BaseLMBuddyConfig): - """Configuration for a "local-completions" prometheus model. - - The prometheus model is powered by a self-hosted inference server, specified - as an `InferenceServerConfig`. Additional arguments are also provided - to control the tokenizer type and generation parameters. 
- """ - - inference: InferenceServerConfig +from lm_buddy.integrations.vllm import vLLMCompletionsConfig +from lm_buddy.integrations.huggingface import TextDatasetConfig - # vLLM-served model params - best_of: int = 1 - max_tokens: int = 512 - frequency_penalty: float = 1.03 - temperature: float = 1.0 - top_p: float = 0.9 - # evaluation script params - tokenizer: AutoTokenizerConfig | None = None +class PrometheusEvaluationTaskConfig(BaseLMBuddyConfig): + """Parameters specific to Prometheus evaluation.""" num_answers: int = 3 max_retries: int = 5 + min_score: int = 0 + max_score: int = 5 + enable_tqdm: bool = False class PrometheusJobConfig(LMBuddyJobConfig): - """Configuration to run a prometheus evaluation job.""" + """Configuration to run a prometheus job.""" dataset: TextDatasetConfig = Field(description="dataset (json artifact from which we'll extract `text_field`)") - # details for our self-hosted prometheus endpoint - prometheus: PrometheusCompletionsConfig + + # vLLM endpoint configuration + prometheus: vLLMCompletionsConfig + + # evaluation task configuration + evaluation: PrometheusEvaluationTaskConfig | None = None + # wandb experiment tracking details tracking: WandbRunConfig | None = None From ae10970debd7466b45358b8313ecdfdc836af9a5 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 14:17:07 +0000 Subject: [PATCH 08/18] Added type hints/return types + comments on new functions --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 29 +++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index d609cde2..4baf9ad2 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -12,7 +12,7 @@ ) from fastchat.conversation import get_conv_template from transformers import AutoTokenizer -from openai import OpenAIError, OpenAI +from openai import OpenAIError, OpenAI, Completion from tqdm import tqdm from pathlib import Path @@ -25,7 +25,15 @@ def __init__(self, message, error=None): self.error = error -def openai_completion(config, client, prompt): +def openai_completion( + config: PrometheusJobConfig, + client: OpenAI, + prompt: str +) -> Completion: + """ Connects to a remote OpenAI-API-compatible Prometheus endpoint + and returns a Completion holding the model's response. + """ + return client.completions.create( model = config.prometheus.inference.engine, prompt = prompt, @@ -37,7 +45,15 @@ def openai_completion(config, client, prompt): ) -def parse_response(config, response): +def parse_response( + config: PrometheusJobConfig, + response: Completion +) -> tuple[str, str]: + """ Given a Prometheus eval response as returned by the OpenAI API + endpoint (i.e. in Completion format), extract feedback + and score. + """ + if response is None: raise BadResponseException("Server returned an empty response") @@ -58,7 +74,12 @@ def parse_response(config, response): return feedback, score -def instruction_to_prompt(instruction): +def instruction_to_prompt( + instruction: str +) -> str: + """ Given some text containing Prometheus instructions, transform it + into an actual llama-2 prompt. 
+ """ conv = get_conv_template("llama-2") conv.set_system_message("You are a fair evaluator language model.") conv.append_message(conv.roles[0], instruction) From 2227c53ee0cd232e1bb1a3d716fe8ec930909df6 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 14:20:56 +0000 Subject: [PATCH 09/18] Added new config example for prometheus --- .../configs/prometheus/prometheus_config.yaml | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/examples/configs/prometheus/prometheus_config.yaml b/examples/configs/prometheus/prometheus_config.yaml index 5784b299..c8a6b5e8 100644 --- a/examples/configs/prometheus/prometheus_config.yaml +++ b/examples/configs/prometheus/prometheus_config.yaml @@ -10,11 +10,24 @@ dataset: prometheus: inference: base_url: "http://your.vllm.server:8000/v1" - tokenizer: - load_from: "meta-llama/Llama-2-7b-chat-hf" - max_tokens: 256 - # number of times the model is called per sample + engine: "kaist-ai/prometheus-13b-v1.0" + best_of: 1 + max_tokens: 512 + frequency_penalty: 1.03 + temperature: 1.0 + top_p: 0.9 + +evaluation: + # number of times a model is evaluated per sample num_answers: 3 + # max number of retries if a communication error + # with the server occurs + max_retries: 5 + # min and max scores as defined in the scoring rubric + min_score: 1 + max_score: 5 + # enable/disable tqdm to track eval progress + enable_tqdm: True tracking: name: "lm-buddy-prometheus" From 8ee2da2468b9f38074a0f4eb8da6fa269c27a968 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 15:24:51 +0000 Subject: [PATCH 10/18] Fixes for ruff --- src/lm_buddy/integrations/vllm.py | 4 +- src/lm_buddy/jobs/_entrypoints/prometheus.py | 139 ++++++++----------- src/lm_buddy/jobs/configs/prometheus.py | 17 ++- 3 files changed, 73 insertions(+), 87 deletions(-) diff --git a/src/lm_buddy/integrations/vllm.py b/src/lm_buddy/integrations/vllm.py index 8eb6f7b4..f327ddad 100644 --- a/src/lm_buddy/integrations/vllm.py +++ b/src/lm_buddy/integrations/vllm.py @@ -19,7 +19,7 @@ class InferenceServerConfig(BaseLMBuddyConfig): engine: str | HuggingFaceAssetPath | None = None -class vLLMCompletionsConfig(BaseLMBuddyConfig): +class VLLMCompletionsConfig(BaseLMBuddyConfig): """Configuration for a vLLM-based completions service The "local-chat-completions" model is powered by a self-hosted inference server, @@ -34,4 +34,4 @@ class vLLMCompletionsConfig(BaseLMBuddyConfig): max_tokens: int | None = None frequency_penalty: float | None = None temperature: float | None = None - top_p: float | None = None \ No newline at end of file + top_p: float | None = None diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index 4baf9ad2..f980a242 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -1,84 +1,75 @@ # lm-buddy entrypoint to run evaluations using a Prometheus inference server # see https://github.com/kaistAI/prometheus/blob/main/evaluation/benchmark/run_absolute_scoring.py -from lm_buddy.jobs.configs import PrometheusJobConfig +import copy +import json +from pathlib import Path + +from fastchat.conversation import get_conv_template +from openai import Completion, OpenAI, OpenAIError +from tqdm import tqdm + from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader from lm_buddy.integrations.huggingface.tokenizer_config import AutoTokenizerConfig from lm_buddy.integrations.wandb import ( - ArtifactType, - ArtifactLoader, - 
build_file_artifact, + ArtifactLoader, + ArtifactType, + build_file_artifact, wandb_init_from_config, ) -from fastchat.conversation import get_conv_template -from transformers import AutoTokenizer -from openai import OpenAIError, OpenAI, Completion +from lm_buddy.jobs.configs import PrometheusJobConfig -from tqdm import tqdm -from pathlib import Path -import json -import copy -class BadResponseException(Exception): +class BadResponseError(Exception): def __init__(self, message, error=None): self.message = message self.error = error -def openai_completion( - config: PrometheusJobConfig, - client: OpenAI, - prompt: str -) -> Completion: - """ Connects to a remote OpenAI-API-compatible Prometheus endpoint - and returns a Completion holding the model's response. +def openai_completion(config: PrometheusJobConfig, client: OpenAI, prompt: str) -> Completion: + """Connects to a remote OpenAI-API-compatible Prometheus endpoint + and returns a Completion holding the model's response. """ return client.completions.create( - model = config.prometheus.inference.engine, - prompt = prompt, - best_of = config.prometheus.best_of, - max_tokens = config.prometheus.max_tokens, - frequency_penalty = config.prometheus.frequency_penalty, - temperature = config.prometheus.temperature, - top_p = config.prometheus.top_p + model=config.prometheus.inference.engine, + prompt=prompt, + best_of=config.prometheus.best_of, + max_tokens=config.prometheus.max_tokens, + frequency_penalty=config.prometheus.frequency_penalty, + temperature=config.prometheus.temperature, + top_p=config.prometheus.top_p, ) -def parse_response( - config: PrometheusJobConfig, - response: Completion -) -> tuple[str, str]: - """ Given a Prometheus eval response as returned by the OpenAI API - endpoint (i.e. in Completion format), extract feedback - and score. +def parse_response(config: PrometheusJobConfig, response: Completion) -> tuple[str, str]: + """Given a Prometheus eval response as returned by the OpenAI API + endpoint (i.e. in Completion format), extract feedback + and score. """ - + if response is None: - raise BadResponseException("Server returned an empty response") + raise BadResponseError("Server returned an empty response") try: response_text = response.choices[0].text # note: this can raise a ValueError if the message is malformed - feedback, score = response_text.split('[RESULT]') + feedback, score = response_text.split("[RESULT]") feedback = feedback.strip() score = score.strip() - if score not in [str(s) for s in range( - config.evaluation.min_score, - config.evaluation.max_score+1 - )]: - raise BadResponseException(f"Score {score} is not in range") - except (ValueError, BadResponseException) as e: - raise BadResponseException(f"Server returned a malformed response ({e})",e) + if score not in [ + str(s) for s in range(config.evaluation.min_score, config.evaluation.max_score + 1) + ]: + raise BadResponseError(f"Score {score} is not in range") + except (ValueError, BadResponseError) as e: + raise BadResponseError(f"Server returned a malformed response ({e})", e) return feedback, score -def instruction_to_prompt( - instruction: str -) -> str: - """ Given some text containing Prometheus instructions, transform it - into an actual llama-2 prompt. +def instruction_to_prompt(instruction: str) -> str: + """Given some text containing Prometheus instructions, transform it + into an actual llama-2 prompt. 
""" conv = get_conv_template("llama-2") conv.set_system_message("You are a fair evaluator language model.") @@ -88,78 +79,70 @@ def instruction_to_prompt( def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader): - # load dataset from W&B artifact hf_loader = HuggingFaceAssetLoader(artifact_loader) - artifact_path,_ = hf_loader.resolve_asset_path(config.dataset.load_from) + artifact_path, _ = hf_loader.resolve_asset_path(config.dataset.load_from) dataset_fname = Path(artifact_path) / config.dataset.load_from.name - - with open(dataset_fname,'r') as f: + + with Path(dataset_fname).open() as f: # eval samples are JSON-encoded, each takes one line in the dataset file data = [json.loads(line) for line in f.readlines()] # get the tokenizer - tokenizer_config = AutoTokenizerConfig( - load_from = config.prometheus.inference.engine - ) + tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) # instantiate OpenAI client to speak with the vLLM endpoint - client = OpenAI( - base_url = config.prometheus.inference.base_url - ) + client = OpenAI(base_url=config.prometheus.inference.base_url) # enable / disable tqdm dataset_iterable = tqdm(data) if config.evaluation.enable_tqdm else data # open the output file for writing and iterate on samples output_fname = Path("/tmp") / config.tracking.name - with open(output_fname,'w') as file: + with output_fname.open("w") as file: for sample in dataset_iterable: # convert instructions from the dataset (`text_field` in a dict) to # prompts that prometheus accepts prompt = instruction_to_prompt(sample[config.dataset.text_field]) - # skip those examples which are too long - tokenized_prompt = tokenizer(prompt, truncation=False) - if(len(tokenized_prompt['input_ids'])>3072): + # skip those examples which are too long + tokenized_prompt = tokenizer(prompt, truncation=False) + if len(tokenized_prompt["input_ids"]) > 3072: continue # prepare output result = copy.deepcopy(sample) - result['prometheus_output'] = [] - result['prometheus_score'] = [] + result["prometheus_output"] = [] + result["prometheus_score"] = [] for idx in range(config.evaluation.num_answers): - i = 0 - while i < config.evaluation.max_retries: + while i < config.evaluation.max_retries: try: response = openai_completion(config, client, prompt) feedback, score = parse_response(config, response) break - except (OpenAIError, BadResponseException) as e: + except (OpenAIError, BadResponseError) as e: print(f"[w] {e.message}, retrying ({i+1}/{config.evaluation.max_retries})") i += 1 if i == config.evaluation.max_retries: raise e - - result['prometheus_output'].append(feedback) - result['prometheus_score'].append(score) - # dump sample results - file.write(json.dumps(result)+"\n") + result["prometheus_output"].append(feedback) + result["prometheus_score"].append(score) + # dump sample results + file.write(json.dumps(result) + "\n") # Register a dataset file artifact if tracking is enabled if config.tracking: - - with wandb_init_from_config(config.tracking) as run: + with wandb_init_from_config(config.tracking): file_artifact = build_file_artifact( - artifact_name = config.tracking.name, - artifact_type = ArtifactType.DATASET, - file_path = output_fname, - reference = False, + artifact_name=config.tracking.name, + artifact_type=ArtifactType.DATASET, + file_path=output_fname, + reference=False, ) print("[i] Logging artifact for evaluation results...") artifact_loader.log_artifact(file_artifact) 
diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index 3e7a782c..7b68012c 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -1,14 +1,15 @@ -from pydantic import Field, conlist, model_validator +from pydantic import Field -from lm_buddy.types import BaseLMBuddyConfig -from lm_buddy.jobs.configs import LMBuddyJobConfig -from lm_buddy.integrations.wandb import WandbRunConfig -from lm_buddy.integrations.vllm import vLLMCompletionsConfig from lm_buddy.integrations.huggingface import TextDatasetConfig +from lm_buddy.integrations.vllm import VLLMCompletionsConfig +from lm_buddy.integrations.wandb import WandbRunConfig +from lm_buddy.jobs.configs import LMBuddyJobConfig +from lm_buddy.types import BaseLMBuddyConfig class PrometheusEvaluationTaskConfig(BaseLMBuddyConfig): """Parameters specific to Prometheus evaluation.""" + num_answers: int = 3 max_retries: int = 5 min_score: int = 0 @@ -19,10 +20,12 @@ class PrometheusEvaluationTaskConfig(BaseLMBuddyConfig): class PrometheusJobConfig(LMBuddyJobConfig): """Configuration to run a prometheus job.""" - dataset: TextDatasetConfig = Field(description="dataset (json artifact from which we'll extract `text_field`)") + dataset: TextDatasetConfig = Field( + description="dataset (json artifact from which we'll extract `text_field`)" + ) # vLLM endpoint configuration - prometheus: vLLMCompletionsConfig + prometheus: VLLMCompletionsConfig # evaluation task configuration evaluation: PrometheusEvaluationTaskConfig | None = None From 482db8baa1974c8a2be4b239b555bb9ada0b1442 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 15:26:44 +0000 Subject: [PATCH 11/18] Added fschat to libs --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 7d2f3dc4..bd6a74f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ # Evaluation frameworks "lm-eval[openai]==0.4.1", "einops==0.7.0", + "fschat==0.2.36", ] [project.optional-dependencies] From ff3516c2605c3a4caab33e8b2fa7cbdc90de61ac Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 17:52:52 +0000 Subject: [PATCH 12/18] Added early break if wandb issues + load/save HF datasets --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 65 ++++++++++++-------- src/lm_buddy/jobs/configs/prometheus.py | 1 + 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index f980a242..f7ceb3cf 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -5,18 +5,21 @@ import json from pathlib import Path +from datasets import Dataset, load_dataset from fastchat.conversation import get_conv_template from openai import Completion, OpenAI, OpenAIError from tqdm import tqdm +from transformers import PreTrainedTokenizer from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader from lm_buddy.integrations.huggingface.tokenizer_config import AutoTokenizerConfig from lm_buddy.integrations.wandb import ( ArtifactLoader, ArtifactType, - build_file_artifact, + build_directory_artifact, wandb_init_from_config, ) +from lm_buddy.jobs.common import LMBuddyJobType from lm_buddy.jobs.configs import PrometheusJobConfig @@ -78,28 +81,15 @@ def instruction_to_prompt(instruction: str) -> str: return conv.get_prompt() -def run_prometheus(config: PrometheusJobConfig, artifact_loader: 
ArtifactLoader): - # load dataset from W&B artifact - hf_loader = HuggingFaceAssetLoader(artifact_loader) - artifact_path, _ = hf_loader.resolve_asset_path(config.dataset.load_from) - dataset_fname = Path(artifact_path) / config.dataset.load_from.name - - with Path(dataset_fname).open() as f: - # eval samples are JSON-encoded, each takes one line in the dataset file - data = [json.loads(line) for line in f.readlines()] - - # get the tokenizer - tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) - tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) - - # instantiate OpenAI client to speak with the vLLM endpoint - client = OpenAI(base_url=config.prometheus.inference.base_url) - +def run_eval( + config: PrometheusJobConfig, data: Dataset, tokenizer: PreTrainedTokenizer, client: OpenAI +) -> str: # enable / disable tqdm dataset_iterable = tqdm(data) if config.evaluation.enable_tqdm else data # open the output file for writing and iterate on samples - output_fname = Path("/tmp") / config.tracking.name + tracking_name = config.tracking.name if config.tracking is not None else "output.json" + output_fname = Path(config.evaluation.tmp_folder) / tracking_name with output_fname.open("w") as file: for sample in dataset_iterable: # convert instructions from the dataset (`text_field` in a dict) to @@ -132,17 +122,44 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) result["prometheus_output"].append(feedback) result["prometheus_score"].append(score) - # dump sample results + # dump sample results incrementally file.write(json.dumps(result) + "\n") + # convert plain json dataset in HF format + output_hf_name = str(Path(config.evaluation.tmp_folder) / "hf" / tracking_name) + ds = load_dataset("json", data_files=str(output_fname), split="train") + ds.save_to_disk(output_hf_name) + + return str(output_hf_name) + + +def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader): + # load dataset from W&B artifact + hf_loader = HuggingFaceAssetLoader(artifact_loader) + data = hf_loader.load_dataset(config.dataset) + + # get the tokenizer + tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) + tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) + + # instantiate OpenAI client to speak with the vLLM endpoint + client = OpenAI(base_url=config.prometheus.inference.base_url) + # Register a dataset file artifact if tracking is enabled if config.tracking: - with wandb_init_from_config(config.tracking): - file_artifact = build_file_artifact( + with wandb_init_from_config(config.tracking, job_type=LMBuddyJobType.EVALUATION): + # run eval and store output in local filename + output_dataset_name = run_eval(config, data, tokenizer, client) + + # store HF dataset as a directory artifact + artifact = build_directory_artifact( + dir_path=output_dataset_name, artifact_name=config.tracking.name, artifact_type=ArtifactType.DATASET, - file_path=output_fname, reference=False, ) print("[i] Logging artifact for evaluation results...") - artifact_loader.log_artifact(file_artifact) + artifact_loader.log_artifact(artifact) + else: + output_dataset_name = run_eval(config, data, tokenizer, client) + print(f"[i] Evaluation results stored in {output_dataset_name}") diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index 7b68012c..cc46ae0c 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -15,6 
+15,7 @@ class PrometheusEvaluationTaskConfig(BaseLMBuddyConfig): min_score: int = 0 max_score: int = 5 enable_tqdm: bool = False + tmp_folder: str = "/tmp" class PrometheusJobConfig(LMBuddyJobConfig): From 6f15d1d01ad8de915bc40f60abdd0bbe7af887ee Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 17:58:12 +0000 Subject: [PATCH 13/18] Added openai lib to pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index bd6a74f1..a765d64a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "lm-eval[openai]==0.4.1", "einops==0.7.0", "fschat==0.2.36", + "openai==1.3.9", ] [project.optional-dependencies] From 317301fab530ea62612a32013fe37a9d75b64c7e Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 18:00:27 +0000 Subject: [PATCH 14/18] Ruff compliance --- src/lm_buddy/cli/schema.py | 2 +- src/lm_buddy/integrations/wandb/artifact_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lm_buddy/cli/schema.py b/src/lm_buddy/cli/schema.py index 6149c2c5..4577ee84 100644 --- a/src/lm_buddy/cli/schema.py +++ b/src/lm_buddy/cli/schema.py @@ -3,7 +3,7 @@ import click from lm_buddy.jobs.configs import ( - FinetuningJobConfig, + FinetuningJobConfig, LMHarnessJobConfig, PrometheusJobConfig, SimpleJobConfig, diff --git a/src/lm_buddy/integrations/wandb/artifact_utils.py b/src/lm_buddy/integrations/wandb/artifact_utils.py index 966ea537..995ba869 100644 --- a/src/lm_buddy/integrations/wandb/artifact_utils.py +++ b/src/lm_buddy/integrations/wandb/artifact_utils.py @@ -4,7 +4,7 @@ from urllib.parse import ParseResult, urlparse import wandb -import os + class ArtifactType(str, Enum): """Enumeration of artifact types used by the LM Buddy.""" From ea35bc00dbeecc6ed7e47a6d0c68153456b4d96b Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 20:38:54 +0000 Subject: [PATCH 15/18] Addressed latest comments in PR --- src/lm_buddy/integrations/vllm.py | 5 ++ src/lm_buddy/jobs/_entrypoints/prometheus.py | 90 ++++++++++++-------- src/lm_buddy/jobs/configs/prometheus.py | 7 +- 3 files changed, 66 insertions(+), 36 deletions(-) diff --git a/src/lm_buddy/integrations/vllm.py b/src/lm_buddy/integrations/vllm.py index f327ddad..af99ccf6 100644 --- a/src/lm_buddy/integrations/vllm.py +++ b/src/lm_buddy/integrations/vllm.py @@ -25,6 +25,11 @@ class VLLMCompletionsConfig(BaseLMBuddyConfig): The "local-chat-completions" model is powered by a self-hosted inference server, specified as an `InferenceServerConfig`. Additional arguments are also provided to control the tokenizer type and generation parameters. + + Note that this is just a subset of the parameters allowed by a vLLM server (see + https://github.com/vllm-project/vllm/blob/main/vllm/sampling_params.py). If we + choose to use this configuration to cover for more use cases, it will make sense + to add the other supported configuration parameters too. 
""" inference: InferenceServerConfig diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index f7ceb3cf..e6f2a30c 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -1,8 +1,11 @@ -# lm-buddy entrypoint to run evaluations using a Prometheus inference server -# see https://github.com/kaistAI/prometheus/blob/main/evaluation/benchmark/run_absolute_scoring.py +""" +lm-buddy entrypoint to run evaluations using a Prometheus inference server +see https://github.com/kaistAI/prometheus/blob/main/evaluation/benchmark/run_absolute_scoring.py +""" import copy import json +from dataclasses import dataclass from pathlib import Path from datasets import Dataset, load_dataset @@ -23,6 +26,7 @@ from lm_buddy.jobs.configs import PrometheusJobConfig +@dataclass class BadResponseError(Exception): def __init__(self, message, error=None): self.message = message @@ -60,9 +64,7 @@ def parse_response(config: PrometheusJobConfig, response: Completion) -> tuple[s feedback, score = response_text.split("[RESULT]") feedback = feedback.strip() score = score.strip() - if score not in [ - str(s) for s in range(config.evaluation.min_score, config.evaluation.max_score + 1) - ]: + if score not in [str(s) for s in config.evaluation.scores]: raise BadResponseError(f"Score {score} is not in range") except (ValueError, BadResponseError) as e: raise BadResponseError(f"Server returned a malformed response ({e})", e) @@ -70,17 +72,38 @@ def parse_response(config: PrometheusJobConfig, response: Completion) -> tuple[s return feedback, score -def instruction_to_prompt(instruction: str) -> str: - """Given some text containing Prometheus instructions, transform it - into an actual llama-2 prompt. +def instruction_to_prompt(config: PrometheusJobConfig, instruction: str) -> str: + """Given some text containing Prometheus instructions, a conversation + template (e.g. "llama-2") and a system message (e.g. "You are a + fair evaluator language model"), generate an actual prompt. 
""" - conv = get_conv_template("llama-2") - conv.set_system_message("You are a fair evaluator language model.") + conv = get_conv_template(config.evaluation.conversation_template) + conv.set_system_message(config.evaluation.conversation_system_message) conv.append_message(conv.roles[0], instruction) conv.append_message(conv.roles[1], None) return conv.get_prompt() +def get_response_with_retries( + config: PrometheusJobConfig, client: OpenAI, prompt: str, max_retries: int +) -> tuple[str, str]: + current_retry_attempt = 1 + while current_retry_attempt <= config.evaluation.max_retries: + try: + response = openai_completion(config, client, prompt) + feedback, score = parse_response(config, response) + break + except (OpenAIError, BadResponseError) as e: + print( + f"[w] {e.message}, " + f"retrying ({current_retry_attempt}/{config.evaluation.max_retries})" + ) + current_retry_attempt += 1 + if current_retry_attempt > config.evaluation.max_retries: + raise e + return (feedback, score) + + def run_eval( config: PrometheusJobConfig, data: Dataset, tokenizer: PreTrainedTokenizer, client: OpenAI ) -> str: @@ -89,12 +112,12 @@ def run_eval( # open the output file for writing and iterate on samples tracking_name = config.tracking.name if config.tracking is not None else "output.json" - output_fname = Path(config.evaluation.tmp_folder) / tracking_name + output_fname = Path(config.evaluation.output_folder) / tracking_name with output_fname.open("w") as file: for sample in dataset_iterable: # convert instructions from the dataset (`text_field` in a dict) to # prompts that prometheus accepts - prompt = instruction_to_prompt(sample[config.dataset.text_field]) + prompt = instruction_to_prompt(config, sample[config.dataset.text_field]) # skip those examples which are too long tokenized_prompt = tokenizer(prompt, truncation=False) @@ -107,18 +130,9 @@ def run_eval( result["prometheus_score"] = [] for idx in range(config.evaluation.num_answers): - i = 0 - while i < config.evaluation.max_retries: - try: - response = openai_completion(config, client, prompt) - feedback, score = parse_response(config, response) - break - except (OpenAIError, BadResponseError) as e: - print(f"[w] {e.message}, retrying ({i+1}/{config.evaluation.max_retries})") - i += 1 - if i == config.evaluation.max_retries: - raise e - + (feedback, score) = get_response_with_retries( + config, client, prompt, config.evaluation.max_retries + ) result["prometheus_output"].append(feedback) result["prometheus_score"].append(score) @@ -126,7 +140,7 @@ def run_eval( file.write(json.dumps(result) + "\n") # convert plain json dataset in HF format - output_hf_name = str(Path(config.evaluation.tmp_folder) / "hf" / tracking_name) + output_hf_name = str(Path(config.evaluation.output_folder) / "hf" / tracking_name) ds = load_dataset("json", data_files=str(output_fname), split="train") ds.save_to_disk(output_hf_name) @@ -134,20 +148,20 @@ def run_eval( def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader): - # load dataset from W&B artifact - hf_loader = HuggingFaceAssetLoader(artifact_loader) - data = hf_loader.load_dataset(config.dataset) - - # get the tokenizer - tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) - tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) - # instantiate OpenAI client to speak with the vLLM endpoint client = OpenAI(base_url=config.prometheus.inference.base_url) # Register a dataset file artifact if tracking is enabled if config.tracking: with 
wandb_init_from_config(config.tracking, job_type=LMBuddyJobType.EVALUATION): + # load dataset from W&B artifact + hf_loader = HuggingFaceAssetLoader(artifact_loader) + data = hf_loader.load_dataset(config.dataset) + + # get the tokenizer + tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) + tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) + # run eval and store output in local filename output_dataset_name = run_eval(config, data, tokenizer, client) @@ -158,8 +172,16 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) artifact_type=ArtifactType.DATASET, reference=False, ) - print("[i] Logging artifact for evaluation results...") + print("Logging artifact for evaluation results...") artifact_loader.log_artifact(artifact) else: + # load dataset from W&B artifact + hf_loader = HuggingFaceAssetLoader(artifact_loader) + data = hf_loader.load_dataset(config.dataset) + + # get the tokenizer + tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) + tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) + output_dataset_name = run_eval(config, data, tokenizer, client) print(f"[i] Evaluation results stored in {output_dataset_name}") diff --git a/src/lm_buddy/jobs/configs/prometheus.py b/src/lm_buddy/jobs/configs/prometheus.py index cc46ae0c..db428a97 100644 --- a/src/lm_buddy/jobs/configs/prometheus.py +++ b/src/lm_buddy/jobs/configs/prometheus.py @@ -12,17 +12,20 @@ class PrometheusEvaluationTaskConfig(BaseLMBuddyConfig): num_answers: int = 3 max_retries: int = 5 + scores: list = [1, 2, 3, 4, 5] min_score: int = 0 max_score: int = 5 enable_tqdm: bool = False - tmp_folder: str = "/tmp" + output_folder: str = "/tmp" + conversation_template: str = "llama-2" + conversation_system_message: str = "You are a fair evaluator language model." class PrometheusJobConfig(LMBuddyJobConfig): """Configuration to run a prometheus job.""" dataset: TextDatasetConfig = Field( - description="dataset (json artifact from which we'll extract `text_field`)" + description="Dataset of text completions to evaluate using the Prometheus judge model." ) # vLLM endpoint configuration From 81ffbe6b7c0efd02c0ef0d4fe99935b6bcd7153c Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 20:45:19 +0000 Subject: [PATCH 16/18] Bump version to 0.3.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a765d64a..b7a61d00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "lm-buddy" -version = "0.2.4" +version = "0.3.0" authors = [ { name = "Sean Friedowitz", email = "sean@mozilla.ai" }, { name = "Aaron Gonzales", email = "aaron@mozilla.ai" }, From f8e38878e7443da768344b8f0aa871a082395e49 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 20:46:39 +0000 Subject: [PATCH 17/18] Added myself to authors --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index b7a61d00..3083fae1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ authors = [ { name = "Sean Friedowitz", email = "sean@mozilla.ai" }, { name = "Aaron Gonzales", email = "aaron@mozilla.ai" }, { name = "Vicki Boykis", email = "vicki@mozilla.ai" }, + { name = "Davide Eynard", email = "davide@mozilla.ai" }, ] description = "Ray-centric library for finetuning and evaluation of (large) language models." 
readme = "README.md" From ab66e0affd0ccf91ad10e589a75301d5e97b69f1 Mon Sep 17 00:00:00 2001 From: Davide Eynard Date: Tue, 12 Mar 2024 20:52:47 +0000 Subject: [PATCH 18/18] Factor dataset and tokenizer into run_eval --- src/lm_buddy/jobs/_entrypoints/prometheus.py | 37 ++++++++------------ 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/src/lm_buddy/jobs/_entrypoints/prometheus.py b/src/lm_buddy/jobs/_entrypoints/prometheus.py index e6f2a30c..75ca71ba 100644 --- a/src/lm_buddy/jobs/_entrypoints/prometheus.py +++ b/src/lm_buddy/jobs/_entrypoints/prometheus.py @@ -8,11 +8,10 @@ from dataclasses import dataclass from pathlib import Path -from datasets import Dataset, load_dataset +from datasets import load_dataset from fastchat.conversation import get_conv_template from openai import Completion, OpenAI, OpenAIError from tqdm import tqdm -from transformers import PreTrainedTokenizer from lm_buddy.integrations.huggingface import HuggingFaceAssetLoader from lm_buddy.integrations.huggingface.tokenizer_config import AutoTokenizerConfig @@ -105,8 +104,18 @@ def get_response_with_retries( def run_eval( - config: PrometheusJobConfig, data: Dataset, tokenizer: PreTrainedTokenizer, client: OpenAI + config: PrometheusJobConfig, + artifact_loader: ArtifactLoader, + client: OpenAI, ) -> str: + # load dataset from W&B artifact + hf_loader = HuggingFaceAssetLoader(artifact_loader) + data = hf_loader.load_dataset(config.dataset) + + # get the tokenizer + tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) + tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) + # enable / disable tqdm dataset_iterable = tqdm(data) if config.evaluation.enable_tqdm else data @@ -154,16 +163,8 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) # Register a dataset file artifact if tracking is enabled if config.tracking: with wandb_init_from_config(config.tracking, job_type=LMBuddyJobType.EVALUATION): - # load dataset from W&B artifact - hf_loader = HuggingFaceAssetLoader(artifact_loader) - data = hf_loader.load_dataset(config.dataset) - - # get the tokenizer - tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) - tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) - # run eval and store output in local filename - output_dataset_name = run_eval(config, data, tokenizer, client) + output_dataset_name = run_eval(config, artifact_loader, client) # store HF dataset as a directory artifact artifact = build_directory_artifact( @@ -175,13 +176,5 @@ def run_prometheus(config: PrometheusJobConfig, artifact_loader: ArtifactLoader) print("Logging artifact for evaluation results...") artifact_loader.log_artifact(artifact) else: - # load dataset from W&B artifact - hf_loader = HuggingFaceAssetLoader(artifact_loader) - data = hf_loader.load_dataset(config.dataset) - - # get the tokenizer - tokenizer_config = AutoTokenizerConfig(load_from=config.prometheus.inference.engine) - tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) - - output_dataset_name = run_eval(config, data, tokenizer, client) - print(f"[i] Evaluation results stored in {output_dataset_name}") + output_dataset_name = run_eval(config, artifact_loader, client) + print(f"Evaluation results stored in {output_dataset_name}")