From 6340fde6728d2e22a89bb8da5a05a98dbc961d84 Mon Sep 17 00:00:00 2001 From: Sean Friedowitz Date: Thu, 4 Apr 2024 14:19:13 -0700 Subject: [PATCH 1/8] update ragas and prometheus to unify storage mechanism --- pyproject.toml | 1 + src/lm_buddy/configs/jobs/prometheus.py | 2 +- src/lm_buddy/configs/jobs/ragas.py | 27 ++++++------ src/lm_buddy/jobs/common.py | 8 ++++ src/lm_buddy/jobs/evaluation/__init__.py | 0 src/lm_buddy/jobs/evaluation/lm_harness.py | 9 ++-- src/lm_buddy/jobs/evaluation/prometheus.py | 50 +++++++++++----------- src/lm_buddy/jobs/evaluation/ragas.py | 46 +++++++++----------- src/lm_buddy/jobs/finetuning.py | 3 +- 9 files changed, 77 insertions(+), 69 deletions(-) create mode 100644 src/lm_buddy/jobs/evaluation/__init__.py diff --git a/pyproject.toml b/pyproject.toml index 73d90f17..1453ee23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "pydantic==2.6.0", "pydantic-yaml==1.2.0", "ray[default]==2.9.3", + "loguru==0.7.2", # HuggingFace "datasets>=2.17.1", "transformers==4.36.2", diff --git a/src/lm_buddy/configs/jobs/prometheus.py b/src/lm_buddy/configs/jobs/prometheus.py index dd1b608a..a3d531d2 100644 --- a/src/lm_buddy/configs/jobs/prometheus.py +++ b/src/lm_buddy/configs/jobs/prometheus.py @@ -16,9 +16,9 @@ class PrometheusEvaluationConfig(LMBuddyConfig): min_score: int = 0 max_score: int = 5 enable_tqdm: bool = False - output_folder: str = "/tmp" conversation_template: str = "llama-2" conversation_system_message: str = "You are a fair evaluator language model." + storage_path: str | None = None class PrometheusJobConfig(JobConfig): diff --git a/src/lm_buddy/configs/jobs/ragas.py b/src/lm_buddy/configs/jobs/ragas.py index b0f22a1c..a5d0e7d4 100644 --- a/src/lm_buddy/configs/jobs/ragas.py +++ b/src/lm_buddy/configs/jobs/ragas.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import Literal, get_args from pydantic import Field, field_validator @@ -20,19 +20,20 @@ class RagasEvaluationConfig(LMBuddyConfig): """Parameters specifically required for RAGAs Evaluation""" metrics: list[RagasEvaluationMetric] = Field( - default=[ - "faithfulness", - "answer_relevancy", - "context_recall", - "context_precision", - ] + default_factory=lambda: list(get_args(RagasEvaluationMetric)), + description="List of metric names for Ragas evaluation.", + ) + embedding_model: AssetPath = Field( + default="hf://sentence-transformers/all-mpnet-base-v2", + description="Path to embedding model used with the evaluation judge.", + ) + storage_path: str | None = Field( + default=None, + description=( + "Path to store the evaluation outputs. " + "Defaults to the environment value for `RAY_STORAGE`." 
+ ), ) - - # language model and embedding models used as evaluation judges - embedding_model: AutoModelConfig | None = "sentence-transformers/all-mpnet-base-v2" - - # path to store the generated ratings/evaluations of each dataset sample - output_folder: str = "/tmp" @field_validator("embedding_model", mode="before") def validate_embedding_model_arg(cls, x): diff --git a/src/lm_buddy/jobs/common.py b/src/lm_buddy/jobs/common.py index 3d6a6feb..7ca1fa17 100644 --- a/src/lm_buddy/jobs/common.py +++ b/src/lm_buddy/jobs/common.py @@ -1,3 +1,4 @@ +import os from dataclasses import dataclass from enum import Enum from pathlib import Path @@ -35,3 +36,10 @@ class EvaluationResult(JobResult): tables: dict[str, pd.DataFrame] dataset_path: Path | None + + +def resolve_storage_path(storage_path: str | None) -> str: + if storage_path is not None: + return storage_path + else: + return os.getenv("RAY_STORAGE", "/tmp") diff --git a/src/lm_buddy/jobs/evaluation/__init__.py b/src/lm_buddy/jobs/evaluation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/lm_buddy/jobs/evaluation/lm_harness.py b/src/lm_buddy/jobs/evaluation/lm_harness.py index 78f4c4b6..34193c36 100644 --- a/src/lm_buddy/jobs/evaluation/lm_harness.py +++ b/src/lm_buddy/jobs/evaluation/lm_harness.py @@ -5,6 +5,7 @@ import torch from lm_eval.models.huggingface import HFLM from lm_eval.models.openai_completions import OpenaiCompletionsLM +from loguru import logger from lm_buddy.configs.huggingface import AutoModelConfig from lm_buddy.configs.jobs.lm_harness import LMHarnessJobConfig, LocalChatCompletionsConfig @@ -69,7 +70,9 @@ def load_harness_model(config: LMHarnessJobConfig) -> HFLM | OpenaiCompletionsLM def run_lm_harness(config: LMHarnessJobConfig) -> EvaluationResult: - print(f"Running lm-harness evaluation with configuration:\n {config.model_dump_json(indent=2)}") + logger.info( + f"Running lm-harness evaluation with configuration:\n {config.model_dump_json(indent=2)}" + ) llm = load_harness_model(config) eval_results = lm_eval.simple_evaluate( @@ -80,10 +83,10 @@ def run_lm_harness(config: LMHarnessJobConfig) -> EvaluationResult: limit=config.evaluation.limit, log_samples=False, ) - print(f"Obtained evaluation results: {eval_results}") + logger.info(f"Obtained evaluation results: {eval_results}") + # Create an artifact containing eval tables result_tables = get_per_task_dataframes(eval_results["results"]) - artifact_name = default_artifact_name(config.name, ArtifactType.EVALUATION) table_artifact = build_table_artifact( artifact_name=artifact_name, diff --git a/src/lm_buddy/jobs/evaluation/prometheus.py b/src/lm_buddy/jobs/evaluation/prometheus.py index 64f128a0..e182f3ec 100644 --- a/src/lm_buddy/jobs/evaluation/prometheus.py +++ b/src/lm_buddy/jobs/evaluation/prometheus.py @@ -4,19 +4,19 @@ """ import copy -import json from dataclasses import dataclass from pathlib import Path -from datasets import load_dataset +from datasets import Dataset from fastchat.conversation import get_conv_template +from loguru import logger from openai import Completion, OpenAI, OpenAIError from tqdm import tqdm from lm_buddy.configs.huggingface import AutoTokenizerConfig from lm_buddy.configs.jobs.prometheus import PrometheusJobConfig from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader -from lm_buddy.jobs.common import EvaluationResult +from lm_buddy.jobs.common import EvaluationResult, resolve_storage_path from lm_buddy.preprocessing import format_dataset_with_prompt from lm_buddy.tracking.artifact_utils import ( 
ArtifactType, @@ -50,8 +50,7 @@ def openai_completion(config: PrometheusJobConfig, client: OpenAI, prompt: str) def parse_response(config: PrometheusJobConfig, response: Completion) -> tuple[str, str]: """Given a Prometheus eval response as returned by the OpenAI API - endpoint (i.e. in Completion format), extract feedback - and score. + endpoint (i.e. in Completion format), extract feedback and score. """ if response is None: @@ -93,9 +92,9 @@ def get_response_with_retries( feedback, score = parse_response(config, response) break except (OpenAIError, BadResponseError) as e: - print( - f"[w] {e.message}, " - f"retrying ({current_retry_attempt}/{config.evaluation.max_retries})" + logger.warn( + f"{e.message}: " + f"Retrying ({current_retry_attempt}/{config.evaluation.max_retries})" ) current_retry_attempt += 1 if current_retry_attempt > config.evaluation.max_retries: @@ -107,7 +106,7 @@ def run_eval(config: PrometheusJobConfig) -> Path: # Instantiate OpenAI client to speak with the vLLM endpoint client = OpenAI(base_url=config.prometheus.inference.base_url) - # load dataset from W&B artifact + # Load dataset from W&B artifact hf_loader = HuggingFaceAssetLoader() dataset = hf_loader.load_dataset(config.dataset) if config.dataset.prompt_template is not None: @@ -115,17 +114,14 @@ def run_eval(config: PrometheusJobConfig) -> Path: dataset, config.dataset.prompt_template, config.dataset.text_field ) - # get the tokenizer + # Get the tokenizer tokenizer_config = AutoTokenizerConfig(path=config.prometheus.inference.engine) tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config) - # enable / disable tqdm + # Enable / disable tqdm dataset_iterable = tqdm(dataset) if config.evaluation.enable_tqdm else dataset - # open the output file for writing and iterate on samples - tracking_name = config.tracking.name if config.tracking is not None else "output.json" - output_fname = Path(config.evaluation.output_folder) / tracking_name - with output_fname.open("w") as file: + def data_generator(): for sample in dataset_iterable: # convert instructions from the dataset (`text_field` in a dict) to # prompts that prometheus accepts @@ -134,6 +130,7 @@ def run_eval(config: PrometheusJobConfig) -> Path: # skip those examples which are too long tokenized_prompt = tokenizer(prompt, truncation=False) if len(tokenized_prompt["input_ids"]) > 3072: + logger.warn(f"Skipping row due to prompt exceeding token limit: {prompt=}") continue # prepare output @@ -148,33 +145,34 @@ def run_eval(config: PrometheusJobConfig) -> Path: result["prometheus_output"].append(feedback) result["prometheus_score"].append(score) - # dump sample results incrementally - file.write(json.dumps(result) + "\n") + yield result - # convert plain json dataset in HF format - output_dataset_path = Path(config.evaluation.output_folder) / "hf" / tracking_name - ds = load_dataset("json", data_files=str(output_fname), split="train") - ds.save_to_disk(output_dataset_path) + result_dataset = Dataset.from_generator(data_generator) - return output_dataset_path + # Save dataset to disk + storage_path = resolve_storage_path(config.evaluation.storage_path) + result_dataset_path = Path(storage_path) / config.name / "prometheus_evaluation" + result_dataset.save_to_disk(result_dataset_path) + + return result_dataset_path def run_prometheus(config: PrometheusJobConfig) -> EvaluationResult: # Run eval and store output in local filename - output_dataset_path = run_eval(config) - print(f"Prometheus evaluation dataset stored at {output_dataset_path}") + 
result_dataset_path = run_eval(config) + logger.info(f"Prometheus evaluation dataset stored at {result_dataset_path}") # Create a directory artifact for the HF dataset artifact_name = default_artifact_name(config.name, artifact_type=ArtifactType.DATASET) dataset_artifact = build_directory_artifact( artifact_name=artifact_name, artifact_type=ArtifactType.DATASET, - dir_path=output_dataset_path, + dir_path=result_dataset_path, reference=False, ) return EvaluationResult( artifacts=[dataset_artifact], - dataset_path=output_dataset_path, + dataset_path=result_dataset_path, tables={}, ) diff --git a/src/lm_buddy/jobs/evaluation/ragas.py b/src/lm_buddy/jobs/evaluation/ragas.py index 291fe696..81430e90 100644 --- a/src/lm_buddy/jobs/evaluation/ragas.py +++ b/src/lm_buddy/jobs/evaluation/ragas.py @@ -1,14 +1,15 @@ from pathlib import Path -from datasets import load_dataset +from datasets import Dataset from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings from langchain_openai import ChatOpenAI +from loguru import logger from ragas import evaluate as ragas_evaluate from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness from lm_buddy.configs.jobs.ragas import RagasJobConfig from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader -from lm_buddy.jobs.common import EvaluationResult +from lm_buddy.jobs.common import EvaluationResult, resolve_storage_path from lm_buddy.preprocessing import format_dataset_with_prompt from lm_buddy.tracking.artifact_utils import ( ArtifactType, @@ -33,14 +34,14 @@ def run_eval(config: RagasJobConfig) -> Path: evaluation_dataset, config.dataset.prompt_template, config.dataset.text_field ) - # ragas custom model args + # Ragas custom model args ragas_args = {} - # load embedding model - embedding_model = hf_loader.resolve_asset_path(config.evaluation.embedding_model.path) + # Load embedding model + embedding_model = hf_loader.resolve_asset_path(config.evaluation.embedding_model) ragas_args["embeddings"] = HuggingFaceEmbeddings(model_name=embedding_model) - # configure ragas to point to vllm instance for generation + # Configure ragas to point to vllm instance for generation inference_engine = hf_loader.resolve_asset_path(config.judge.inference.engine) ragas_args["llm"] = ChatOpenAI( model=inference_engine, @@ -51,41 +52,36 @@ def run_eval(config: RagasJobConfig) -> Path: top_k=config.judge.top_k, ) - result = ragas_evaluate( - dataset=evaluation_dataset, - metrics=RAGAS_METRICS_MAP[config.evaluation.metrics], - **ragas_args, - ) - result_df = result.to_pandas() + ragas_metrics = [RAGAS_METRICS_MAP[metric] for metric in config.evaluation.metrics] + result = ragas_evaluate(dataset=evaluation_dataset, metrics=ragas_metrics, **ragas_args) - # open the output file for writing and iterate on samples - tracking_name = config.tracking.name if config.tracking is not None else "output.json" - output_fname = Path(config.evaluation.output_folder) / tracking_name - result_df.to_json(output_fname) + # Return a new dataset with score concatenated + result_dataset = Dataset.from_pandas(result.to_pandas()) - # convert plain json dataset in HF format - output_dataset_path = Path(config.evaluation.output_folder) / "hf" / tracking_name - ds = load_dataset("json", data_files=str(output_fname), split="train") - ds.save_to_disk(output_dataset_path) + # Save dataset to disk + storage_path = resolve_storage_path(config.evaluation.storage_path) + result_dataset_path = Path(storage_path) / config.name / "ragas_evaluation" + 
result_dataset.save_to_disk(result_dataset_path) - return output_dataset_path + return result_dataset_path def run_ragas(config: RagasJobConfig) -> EvaluationResult: - output_dataset_path = run_eval(config) - print(f"Ragas evaluation dataset stored at {output_dataset_path}") + # Run evaluation + result_dataset_path = run_eval(config) + logger.info(f"Ragas evaluation dataset stored at {result_dataset_path}") # Create a directory artifact for the HF dataset artifact_name = default_artifact_name(config.name, artifact_type=ArtifactType.DATASET) dataset_artifact = build_directory_artifact( artifact_name=artifact_name, artifact_type=ArtifactType.DATASET, - dir_path=output_dataset_path, + dir_path=result_dataset_path, reference=False, ) return EvaluationResult( artifacts=[dataset_artifact], - dataset_path=output_dataset_path, + dataset_path=result_dataset_path, tables={}, ) diff --git a/src/lm_buddy/jobs/finetuning.py b/src/lm_buddy/jobs/finetuning.py index ff11fcd2..b016ade2 100644 --- a/src/lm_buddy/jobs/finetuning.py +++ b/src/lm_buddy/jobs/finetuning.py @@ -2,6 +2,7 @@ from typing import Any import wandb +from loguru import logger from ray import train from ray.train import CheckpointConfig, RunConfig, ScalingConfig from ray.train.huggingface.transformers import RayTrainReportCallback, prepare_trainer @@ -101,7 +102,7 @@ def run_finetuning(config: FinetuningJobConfig) -> FinetuningResult: run_config=run_config, ) result = trainer.fit() - print(f"Training result: {result}") + logger.info(f"Training result: {result}") # Create a checkpoint artifact if tracking is enabled and Ray saved a checkpoint if result.checkpoint: From 8d0719574231d0c76304ea02e2b2798923985e1d Mon Sep 17 00:00:00 2001 From: Sean Friedowitz Date: Thu, 4 Apr 2024 14:58:45 -0700 Subject: [PATCH 2/8] add storage path global --- src/lm_buddy/configs/jobs/ragas.py | 5 +---- src/lm_buddy/constants.py | 9 +++++++++ src/lm_buddy/jobs/asset_loader.py | 3 ++- src/lm_buddy/jobs/common.py | 8 -------- src/lm_buddy/jobs/evaluation/prometheus.py | 11 +++++++---- src/lm_buddy/jobs/evaluation/ragas.py | 7 ++++--- 6 files changed, 23 insertions(+), 20 deletions(-) create mode 100644 src/lm_buddy/constants.py diff --git a/src/lm_buddy/configs/jobs/ragas.py b/src/lm_buddy/configs/jobs/ragas.py index a5d0e7d4..22ebab5c 100644 --- a/src/lm_buddy/configs/jobs/ragas.py +++ b/src/lm_buddy/configs/jobs/ragas.py @@ -29,10 +29,7 @@ class RagasEvaluationConfig(LMBuddyConfig): ) storage_path: str | None = Field( default=None, - description=( - "Path to store the evaluation outputs. " - "Defaults to the environment value for `RAY_STORAGE`." - ), + description="Path to store evaluation outputs. 
Defaults to the `LM_BUDDY_STORAGE` path.", ) @field_validator("embedding_model", mode="before") diff --git a/src/lm_buddy/constants.py b/src/lm_buddy/constants.py new file mode 100644 index 00000000..89a674b5 --- /dev/null +++ b/src/lm_buddy/constants.py @@ -0,0 +1,9 @@ +import os +from pathlib import Path + +STORAGE_PATH_ENVIRONMENT_VARIABLE: str = "LM_BUDDY_STORAGE" + +DEFAULT_STORAGE_PATH: str = os.getenv( + STORAGE_PATH_ENVIRONMENT_VARIABLE, + str(Path.home() / "lm_buddy_results"), +) diff --git a/src/lm_buddy/jobs/asset_loader.py b/src/lm_buddy/jobs/asset_loader.py index 7a4879fe..8a9cf111 100644 --- a/src/lm_buddy/jobs/asset_loader.py +++ b/src/lm_buddy/jobs/asset_loader.py @@ -3,6 +3,7 @@ import torch from accelerate import Accelerator from datasets import Dataset, DatasetDict, load_dataset, load_from_disk +from loguru import logger from peft import PeftConfig from transformers import ( AutoConfig, @@ -103,7 +104,7 @@ def load_pretrained_model( Accelerator().local_process_index if torch.cuda.is_available() else "cpu" ) device_map = {"": current_device} - print(f"Setting model device_map = {device_map} to enable quantization") + logger.info(f"Setting model device_map = {device_map} to enable quantization") # TODO: HuggingFace has many AutoModel classes with different "language model heads" # Can we abstract this to load with any type of AutoModel class? diff --git a/src/lm_buddy/jobs/common.py b/src/lm_buddy/jobs/common.py index 7ca1fa17..3d6a6feb 100644 --- a/src/lm_buddy/jobs/common.py +++ b/src/lm_buddy/jobs/common.py @@ -1,4 +1,3 @@ -import os from dataclasses import dataclass from enum import Enum from pathlib import Path @@ -36,10 +35,3 @@ class EvaluationResult(JobResult): tables: dict[str, pd.DataFrame] dataset_path: Path | None - - -def resolve_storage_path(storage_path: str | None) -> str: - if storage_path is not None: - return storage_path - else: - return os.getenv("RAY_STORAGE", "/tmp") diff --git a/src/lm_buddy/jobs/evaluation/prometheus.py b/src/lm_buddy/jobs/evaluation/prometheus.py index e182f3ec..047a1f6d 100644 --- a/src/lm_buddy/jobs/evaluation/prometheus.py +++ b/src/lm_buddy/jobs/evaluation/prometheus.py @@ -6,6 +6,7 @@ import copy from dataclasses import dataclass from pathlib import Path +from typing import Any from datasets import Dataset from fastchat.conversation import get_conv_template @@ -15,8 +16,9 @@ from lm_buddy.configs.huggingface import AutoTokenizerConfig from lm_buddy.configs.jobs.prometheus import PrometheusJobConfig +from lm_buddy.constants import DEFAULT_STORAGE_PATH from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader -from lm_buddy.jobs.common import EvaluationResult, resolve_storage_path +from lm_buddy.jobs.common import EvaluationResult from lm_buddy.preprocessing import format_dataset_with_prompt from lm_buddy.tracking.artifact_utils import ( ArtifactType, @@ -121,6 +123,7 @@ def run_eval(config: PrometheusJobConfig) -> Path: # Enable / disable tqdm dataset_iterable = tqdm(dataset) if config.evaluation.enable_tqdm else dataset + # Generator that iterates over samples and yields new rows with the prometheus outputs def data_generator(): for sample in dataset_iterable: # convert instructions from the dataset (`text_field` in a dict) to @@ -134,7 +137,7 @@ def data_generator(): continue # prepare output - result = copy.deepcopy(sample) + result: dict[str, Any] = copy.deepcopy(sample) result["prometheus_output"] = [] result["prometheus_score"] = [] @@ -150,8 +153,8 @@ def data_generator(): result_dataset = 
Dataset.from_generator(data_generator) # Save dataset to disk - storage_path = resolve_storage_path(config.evaluation.storage_path) - result_dataset_path = Path(storage_path) / config.name / "prometheus_evaluation" + storage_path = config.evaluation.storage_path or DEFAULT_STORAGE_PATH + result_dataset_path = Path(storage_path) / config.name / "evaluation" / "prometheus" result_dataset.save_to_disk(result_dataset_path) return result_dataset_path diff --git a/src/lm_buddy/jobs/evaluation/ragas.py b/src/lm_buddy/jobs/evaluation/ragas.py index 81430e90..85a527a8 100644 --- a/src/lm_buddy/jobs/evaluation/ragas.py +++ b/src/lm_buddy/jobs/evaluation/ragas.py @@ -8,8 +8,9 @@ from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness from lm_buddy.configs.jobs.ragas import RagasJobConfig +from lm_buddy.constants import DEFAULT_STORAGE_PATH from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader -from lm_buddy.jobs.common import EvaluationResult, resolve_storage_path +from lm_buddy.jobs.common import EvaluationResult from lm_buddy.preprocessing import format_dataset_with_prompt from lm_buddy.tracking.artifact_utils import ( ArtifactType, @@ -59,8 +60,8 @@ def run_eval(config: RagasJobConfig) -> Path: result_dataset = Dataset.from_pandas(result.to_pandas()) # Save dataset to disk - storage_path = resolve_storage_path(config.evaluation.storage_path) - result_dataset_path = Path(storage_path) / config.name / "ragas_evaluation" + storage_path = config.evaluation.storage_path or DEFAULT_STORAGE_PATH + result_dataset_path = Path(storage_path) / config.name / "evaluation" / "ragas" result_dataset.save_to_disk(result_dataset_path) return result_dataset_path From 76097547df859d4a6da80da01a6d5abef9b636e4 Mon Sep 17 00:00:00 2001 From: Sean Friedowitz Date: Thu, 4 Apr 2024 15:00:11 -0700 Subject: [PATCH 3/8] remove constant --- src/lm_buddy/constants.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/lm_buddy/constants.py b/src/lm_buddy/constants.py index 89a674b5..74a375b7 100644 --- a/src/lm_buddy/constants.py +++ b/src/lm_buddy/constants.py @@ -1,9 +1,7 @@ import os from pathlib import Path -STORAGE_PATH_ENVIRONMENT_VARIABLE: str = "LM_BUDDY_STORAGE" - DEFAULT_STORAGE_PATH: str = os.getenv( - STORAGE_PATH_ENVIRONMENT_VARIABLE, + "LM_BUDDY_STORAGE", str(Path.home() / "lm_buddy_results"), ) From a66e433ac2a9364f10599e3023e2d7d2396de0ad Mon Sep 17 00:00:00 2001 From: Sean Friedowitz Date: Thu, 4 Apr 2024 16:07:29 -0700 Subject: [PATCH 4/8] fix engine passing in prometheus --- src/lm_buddy/jobs/evaluation/prometheus.py | 26 ++++++++++++++-------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/lm_buddy/jobs/evaluation/prometheus.py b/src/lm_buddy/jobs/evaluation/prometheus.py index 047a1f6d..f37c8b41 100644 --- a/src/lm_buddy/jobs/evaluation/prometheus.py +++ b/src/lm_buddy/jobs/evaluation/prometheus.py @@ -11,7 +11,8 @@ from datasets import Dataset from fastchat.conversation import get_conv_template from loguru import logger -from openai import Completion, OpenAI, OpenAIError +from openai import OpenAI, OpenAIError +from openai.types import Completion from tqdm import tqdm from lm_buddy.configs.huggingface import AutoTokenizerConfig @@ -34,13 +35,15 @@ def __init__(self, message, error=None): self.error = error -def openai_completion(config: PrometheusJobConfig, client: OpenAI, prompt: str) -> Completion: +def openai_completion( + config: PrometheusJobConfig, client: OpenAI, engine: str, prompt: str +) -> Completion: """Connects 
to a remote OpenAI-API-compatible Prometheus endpoint and returns a Completion holding the model's response. """ return client.completions.create( - model=config.prometheus.inference.engine, + model=engine, prompt=prompt, best_of=config.prometheus.best_of, max_tokens=config.prometheus.max_tokens, @@ -85,12 +88,15 @@ def instruction_to_prompt(config: PrometheusJobConfig, instruction: str) -> str: def get_response_with_retries( - config: PrometheusJobConfig, client: OpenAI, prompt: str, max_retries: int + config: PrometheusJobConfig, + client: OpenAI, + engine: str, + prompt: str, ) -> tuple[str, str]: current_retry_attempt = 1 while current_retry_attempt <= config.evaluation.max_retries: try: - response = openai_completion(config, client, prompt) + response = openai_completion(config, client, engine, prompt) feedback, score = parse_response(config, response) break except (OpenAIError, BadResponseError) as e: @@ -108,8 +114,12 @@ def run_eval(config: PrometheusJobConfig) -> Path: # Instantiate OpenAI client to speak with the vLLM endpoint client = OpenAI(base_url=config.prometheus.inference.base_url) - # Load dataset from W&B artifact hf_loader = HuggingFaceAssetLoader() + + # Resolve the engine model + engine_path = hf_loader.resolve_asset_path(config.prometheus.inference.engine) + + # Load dataset from W&B artifact dataset = hf_loader.load_dataset(config.dataset) if config.dataset.prompt_template is not None: dataset = format_dataset_with_prompt( @@ -142,9 +152,7 @@ def data_generator(): result["prometheus_score"] = [] for _ in range(config.evaluation.num_answers): - (feedback, score) = get_response_with_retries( - config, client, prompt, config.evaluation.max_retries - ) + (feedback, score) = get_response_with_retries(config, client, engine_path, prompt) result["prometheus_output"].append(feedback) result["prometheus_score"].append(score) From a51ae3ab4b1edac683db5fc9f347e01767a1047d Mon Sep 17 00:00:00 2001 From: Sean Friedowitz Date: Thu, 4 Apr 2024 16:11:58 -0700 Subject: [PATCH 5/8] version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1453ee23..1576db48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "lm-buddy" -version = "0.9.0" +version = "0.10.0" authors = [ { name = "Sean Friedowitz", email = "sean@mozilla.ai" }, { name = "Aaron Gonzales", email = "aaron@mozilla.ai" }, From cffc80cbd69f38d8022d9a90499f722ed7bf7e80 Mon Sep 17 00:00:00 2001 From: Sean Friedowitz Date: Fri, 5 Apr 2024 11:57:59 -0700 Subject: [PATCH 6/8] move constants to storage --- src/lm_buddy/jobs/evaluation/prometheus.py | 2 +- src/lm_buddy/jobs/evaluation/ragas.py | 2 +- src/lm_buddy/{constants.py => storage.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename src/lm_buddy/{constants.py => storage.py} (100%) diff --git a/src/lm_buddy/jobs/evaluation/prometheus.py b/src/lm_buddy/jobs/evaluation/prometheus.py index f37c8b41..ed9116fe 100644 --- a/src/lm_buddy/jobs/evaluation/prometheus.py +++ b/src/lm_buddy/jobs/evaluation/prometheus.py @@ -17,10 +17,10 @@ from lm_buddy.configs.huggingface import AutoTokenizerConfig from lm_buddy.configs.jobs.prometheus import PrometheusJobConfig -from lm_buddy.constants import DEFAULT_STORAGE_PATH from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader from lm_buddy.jobs.common import EvaluationResult from lm_buddy.preprocessing import format_dataset_with_prompt +from lm_buddy.storage import 
DEFAULT_STORAGE_PATH from lm_buddy.tracking.artifact_utils import ( ArtifactType, build_directory_artifact, diff --git a/src/lm_buddy/jobs/evaluation/ragas.py b/src/lm_buddy/jobs/evaluation/ragas.py index 85a527a8..09756d7f 100644 --- a/src/lm_buddy/jobs/evaluation/ragas.py +++ b/src/lm_buddy/jobs/evaluation/ragas.py @@ -8,10 +8,10 @@ from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness from lm_buddy.configs.jobs.ragas import RagasJobConfig -from lm_buddy.constants import DEFAULT_STORAGE_PATH from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader from lm_buddy.jobs.common import EvaluationResult from lm_buddy.preprocessing import format_dataset_with_prompt +from lm_buddy.storage import DEFAULT_STORAGE_PATH from lm_buddy.tracking.artifact_utils import ( ArtifactType, build_directory_artifact, diff --git a/src/lm_buddy/constants.py b/src/lm_buddy/storage.py similarity index 100% rename from src/lm_buddy/constants.py rename to src/lm_buddy/storage.py From a40b369a92d4845463b0f94e0836df3c6a05210b Mon Sep 17 00:00:00 2001 From: Sean Friedowitz Date: Mon, 8 Apr 2024 08:38:13 -0700 Subject: [PATCH 7/8] add lm-buddy home path --- src/lm_buddy/constants.py | 7 +++++++ src/lm_buddy/jobs/evaluation/prometheus.py | 6 +++--- src/lm_buddy/jobs/evaluation/ragas.py | 6 +++--- src/lm_buddy/storage.py | 7 ------- 4 files changed, 13 insertions(+), 13 deletions(-) create mode 100644 src/lm_buddy/constants.py delete mode 100644 src/lm_buddy/storage.py diff --git a/src/lm_buddy/constants.py b/src/lm_buddy/constants.py new file mode 100644 index 00000000..6239ecbe --- /dev/null +++ b/src/lm_buddy/constants.py @@ -0,0 +1,7 @@ +import os +from pathlib import Path + +LM_BUDDY_HOME_PATH: str = os.getenv( + "LM_BUDDY_HOME", + str(Path.home() / ".lm_buddy"), +) diff --git a/src/lm_buddy/jobs/evaluation/prometheus.py b/src/lm_buddy/jobs/evaluation/prometheus.py index ed9116fe..7a0fb561 100644 --- a/src/lm_buddy/jobs/evaluation/prometheus.py +++ b/src/lm_buddy/jobs/evaluation/prometheus.py @@ -17,10 +17,10 @@ from lm_buddy.configs.huggingface import AutoTokenizerConfig from lm_buddy.configs.jobs.prometheus import PrometheusJobConfig +from lm_buddy.constants import LM_BUDDY_HOME_PATH from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader from lm_buddy.jobs.common import EvaluationResult from lm_buddy.preprocessing import format_dataset_with_prompt -from lm_buddy.storage import DEFAULT_STORAGE_PATH from lm_buddy.tracking.artifact_utils import ( ArtifactType, build_directory_artifact, @@ -161,8 +161,8 @@ def data_generator(): result_dataset = Dataset.from_generator(data_generator) # Save dataset to disk - storage_path = config.evaluation.storage_path or DEFAULT_STORAGE_PATH - result_dataset_path = Path(storage_path) / config.name / "evaluation" / "prometheus" + storage_path = config.evaluation.storage_path or LM_BUDDY_HOME_PATH + result_dataset_path = Path(storage_path) / "datasets" / config.name / "prometheus" result_dataset.save_to_disk(result_dataset_path) return result_dataset_path diff --git a/src/lm_buddy/jobs/evaluation/ragas.py b/src/lm_buddy/jobs/evaluation/ragas.py index 09756d7f..d773460b 100644 --- a/src/lm_buddy/jobs/evaluation/ragas.py +++ b/src/lm_buddy/jobs/evaluation/ragas.py @@ -8,10 +8,10 @@ from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness from lm_buddy.configs.jobs.ragas import RagasJobConfig +from lm_buddy.constants import LM_BUDDY_HOME_PATH from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader from 
lm_buddy.jobs.common import EvaluationResult from lm_buddy.preprocessing import format_dataset_with_prompt -from lm_buddy.storage import DEFAULT_STORAGE_PATH from lm_buddy.tracking.artifact_utils import ( ArtifactType, build_directory_artifact, @@ -60,8 +60,8 @@ def run_eval(config: RagasJobConfig) -> Path: result_dataset = Dataset.from_pandas(result.to_pandas()) # Save dataset to disk - storage_path = config.evaluation.storage_path or DEFAULT_STORAGE_PATH - result_dataset_path = Path(storage_path) / config.name / "evaluation" / "ragas" + storage_path = config.evaluation.storage_path or LM_BUDDY_HOME_PATH + result_dataset_path = Path(storage_path) / "datasets" / config.name / "ragas" result_dataset.save_to_disk(result_dataset_path) return result_dataset_path diff --git a/src/lm_buddy/storage.py b/src/lm_buddy/storage.py deleted file mode 100644 index 74a375b7..00000000 --- a/src/lm_buddy/storage.py +++ /dev/null @@ -1,7 +0,0 @@ -import os -from pathlib import Path - -DEFAULT_STORAGE_PATH: str = os.getenv( - "LM_BUDDY_STORAGE", - str(Path.home() / "lm_buddy_results"), -) From 36d6b15f3b197ec3f67b5b8bc90f43ab321e1d1f Mon Sep 17 00:00:00 2001 From: Sean Friedowitz Date: Mon, 8 Apr 2024 08:49:28 -0700 Subject: [PATCH 8/8] do a flatter level results path --- src/lm_buddy/constants.py | 5 +++++ src/lm_buddy/jobs/evaluation/prometheus.py | 6 +++--- src/lm_buddy/jobs/evaluation/ragas.py | 6 +++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/lm_buddy/constants.py b/src/lm_buddy/constants.py index 6239ecbe..2ae0d5ed 100644 --- a/src/lm_buddy/constants.py +++ b/src/lm_buddy/constants.py @@ -5,3 +5,8 @@ "LM_BUDDY_HOME", str(Path.home() / ".lm_buddy"), ) + +LM_BUDDY_RESULTS_PATH: str = os.getenv( + "LM_BUDDY_RESULTS", + f"{LM_BUDDY_HOME_PATH}/results", +) diff --git a/src/lm_buddy/jobs/evaluation/prometheus.py b/src/lm_buddy/jobs/evaluation/prometheus.py index 7a0fb561..f916b769 100644 --- a/src/lm_buddy/jobs/evaluation/prometheus.py +++ b/src/lm_buddy/jobs/evaluation/prometheus.py @@ -17,7 +17,7 @@ from lm_buddy.configs.huggingface import AutoTokenizerConfig from lm_buddy.configs.jobs.prometheus import PrometheusJobConfig -from lm_buddy.constants import LM_BUDDY_HOME_PATH +from lm_buddy.constants import LM_BUDDY_RESULTS_PATH from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader from lm_buddy.jobs.common import EvaluationResult from lm_buddy.preprocessing import format_dataset_with_prompt @@ -161,8 +161,8 @@ def data_generator(): result_dataset = Dataset.from_generator(data_generator) # Save dataset to disk - storage_path = config.evaluation.storage_path or LM_BUDDY_HOME_PATH - result_dataset_path = Path(storage_path) / "datasets" / config.name / "prometheus" + storage_path = config.evaluation.storage_path or LM_BUDDY_RESULTS_PATH + result_dataset_path = Path(storage_path) / config.name / "prometheus" result_dataset.save_to_disk(result_dataset_path) return result_dataset_path diff --git a/src/lm_buddy/jobs/evaluation/ragas.py b/src/lm_buddy/jobs/evaluation/ragas.py index d773460b..9da70279 100644 --- a/src/lm_buddy/jobs/evaluation/ragas.py +++ b/src/lm_buddy/jobs/evaluation/ragas.py @@ -8,7 +8,7 @@ from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness from lm_buddy.configs.jobs.ragas import RagasJobConfig -from lm_buddy.constants import LM_BUDDY_HOME_PATH +from lm_buddy.constants import LM_BUDDY_RESULTS_PATH from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader from lm_buddy.jobs.common import EvaluationResult from 
lm_buddy.preprocessing import format_dataset_with_prompt @@ -60,8 +60,8 @@ def run_eval(config: RagasJobConfig) -> Path: result_dataset = Dataset.from_pandas(result.to_pandas()) # Save dataset to disk - storage_path = config.evaluation.storage_path or LM_BUDDY_HOME_PATH - result_dataset_path = Path(storage_path) / "datasets" / config.name / "ragas" + storage_path = config.evaluation.storage_path or LM_BUDDY_RESULTS_PATH + result_dataset_path = Path(storage_path) / config.name / "ragas" result_dataset.save_to_disk(result_dataset_path) return result_dataset_path
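
A minimal sketch (not part of the patch series itself) of how the storage-path resolution introduced here behaves once the final patch is applied, assuming a hypothetical job named "my-eval", no LM_BUDDY_HOME or LM_BUDDY_RESULTS environment overrides, and storage_path left unset in the evaluation config:

    import os
    from pathlib import Path

    # Mirrors the constants defined in src/lm_buddy/constants.py after the final patch.
    LM_BUDDY_HOME_PATH = os.getenv("LM_BUDDY_HOME", str(Path.home() / ".lm_buddy"))
    LM_BUDDY_RESULTS_PATH = os.getenv("LM_BUDDY_RESULTS", f"{LM_BUDDY_HOME_PATH}/results")

    # Same fallback logic used by run_eval() in the ragas and prometheus jobs:
    # an explicit storage_path wins, otherwise results land under LM_BUDDY_RESULTS_PATH.
    storage_path = None  # stands in for config.evaluation.storage_path
    result_dataset_path = Path(storage_path or LM_BUDDY_RESULTS_PATH) / "my-eval" / "ragas"
    print(result_dataset_path)  # e.g. /home/<user>/.lm_buddy/results/my-eval/ragas

With an explicit storage_path set in the config, the same expression resolves under that directory instead.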