This repository has been archived by the owner on Sep 24, 2024. It is now read-only.

Update dataset generation and storage for ragas and prometheus #94

Merged
merged 8 commits on Apr 8, 2024
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "lm-buddy"
version = "0.9.0"
version = "0.10.0"
authors = [
{ name = "Sean Friedowitz", email = "[email protected]" },
{ name = "Aaron Gonzales", email = "[email protected]" },
@@ -29,6 +29,7 @@ dependencies = [
"pydantic==2.6.0",
"pydantic-yaml==1.2.0",
"ray[default]==2.9.3",
"loguru==0.7.2",
# HuggingFace
"datasets>=2.17.1",
"transformers==4.36.2",
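The `loguru` dependency added above backs the `print`-to-logger swap made throughout the rest of this diff. A minimal sketch of that logging pattern, with illustrative messages only:

```python
from loguru import logger

# loguru ships with a preconfigured stderr sink, so modules can log
# without any setup boilerplate.
device_map = {"": 0}  # illustrative value
logger.info(f"Setting model device_map = {device_map} to enable quantization")
logger.warning("Retrying (1/3)")  # loguru exposes `warning`; there is no `warn` alias
```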
2 changes: 1 addition & 1 deletion src/lm_buddy/configs/jobs/prometheus.py
@@ -16,9 +16,9 @@ class PrometheusEvaluationConfig(LMBuddyConfig):
min_score: int = 0
max_score: int = 5
enable_tqdm: bool = False
output_folder: str = "/tmp"
conversation_template: str = "llama-2"
conversation_system_message: str = "You are a fair evaluator language model."
storage_path: str | None = None


class PrometheusJobConfig(JobConfig):
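A rough sketch of how the new optional `storage_path` field is consumed, assuming the fallback constant `DEFAULT_STORAGE_PATH` that later hunks import from `lm_buddy.storage`; the helper and default value below are hypothetical:

```python
from pathlib import Path

# Assumed fallback value; in this PR the real constant lives in lm_buddy.storage.
DEFAULT_STORAGE_PATH = "/tmp/lm-buddy"

def resolve_results_dir(storage_path: str | None, job_name: str, suffix: str) -> Path:
    # Fall back to the shared default whenever the config leaves storage_path unset.
    base = storage_path or DEFAULT_STORAGE_PATH
    return Path(base) / job_name / "evaluation" / suffix

print(resolve_results_dir(None, "my-eval-job", "prometheus"))
# /tmp/lm-buddy/my-eval-job/evaluation/prometheus
```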
24 changes: 11 additions & 13 deletions src/lm_buddy/configs/jobs/ragas.py
@@ -1,4 +1,4 @@
from typing import Literal
from typing import Literal, get_args

from pydantic import Field, field_validator

@@ -20,19 +20,17 @@ class RagasEvaluationConfig(LMBuddyConfig):
"""Parameters specifically required for RAGAs Evaluation"""

metrics: list[RagasEvaluationMetric] = Field(
default=[
"faithfulness",
"answer_relevancy",
"context_recall",
"context_precision",
]
default_factory=lambda: list(get_args(RagasEvaluationMetric)),
description="List of metric names for Ragas evaluation.",
)
embedding_model: AssetPath = Field(
default="hf://sentence-transformers/all-mpnet-base-v2",
description="Path to embedding model used with the evaluation judge.",
)
storage_path: str | None = Field(
default=None,
description="Path to store evaluation outputs. Defaults to the `LM_BUDDY_STORAGE` path.",
)

# language model and embedding models used as evaluation judges
embedding_model: AutoModelConfig | None = "sentence-transformers/all-mpnet-base-v2"

# path to store the generated ratings/evaluations of each dataset sample
output_folder: str = "/tmp"

@field_validator("embedding_model", mode="before")
def validate_embedding_model_arg(cls, x):
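The new default keeps the metric list in sync with the `RagasEvaluationMetric` Literal via `typing.get_args`. A self-contained sketch of the same pattern, using illustrative type and model names:

```python
from typing import Literal, get_args

from pydantic import BaseModel, Field

ExampleMetric = Literal["faithfulness", "answer_relevancy", "context_recall", "context_precision"]

class ExampleConfig(BaseModel):
    # The Literal is the single source of truth for the allowed metric names,
    # so the default never drifts when a metric is added or removed.
    metrics: list[ExampleMetric] = Field(
        default_factory=lambda: list(get_args(ExampleMetric)),
        description="Defaults to every metric named in the Literal.",
    )

print(ExampleConfig().metrics)
# ['faithfulness', 'answer_relevancy', 'context_recall', 'context_precision']
```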
3 changes: 2 additions & 1 deletion src/lm_buddy/jobs/asset_loader.py
@@ -3,6 +3,7 @@
import torch
from accelerate import Accelerator
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
from loguru import logger
from peft import PeftConfig
from transformers import (
AutoConfig,
@@ -103,7 +104,7 @@ def load_pretrained_model(
Accelerator().local_process_index if torch.cuda.is_available() else "cpu"
)
device_map = {"": current_device}
print(f"Setting model device_map = {device_map} to enable quantization")
logger.info(f"Setting model device_map = {device_map} to enable quantization")

# TODO: HuggingFace has many AutoModel classes with different "language model heads"
# Can we abstract this to load with any type of AutoModel class?
Empty file.
9 changes: 6 additions & 3 deletions src/lm_buddy/jobs/evaluation/lm_harness.py
@@ -5,6 +5,7 @@
import torch
from lm_eval.models.huggingface import HFLM
from lm_eval.models.openai_completions import OpenaiCompletionsLM
from loguru import logger

from lm_buddy.configs.huggingface import AutoModelConfig
from lm_buddy.configs.jobs.lm_harness import LMHarnessJobConfig, LocalChatCompletionsConfig
@@ -69,7 +70,9 @@ def load_harness_model(config: LMHarnessJobConfig) -> HFLM | OpenaiCompletionsLM


def run_lm_harness(config: LMHarnessJobConfig) -> EvaluationResult:
print(f"Running lm-harness evaluation with configuration:\n {config.model_dump_json(indent=2)}")
logger.info(
f"Running lm-harness evaluation with configuration:\n {config.model_dump_json(indent=2)}"
)

llm = load_harness_model(config)
eval_results = lm_eval.simple_evaluate(
@@ -80,10 +83,10 @@ def run_lm_harness(config: LMHarnessJobConfig) -> EvaluationResult:
limit=config.evaluation.limit,
log_samples=False,
)
print(f"Obtained evaluation results: {eval_results}")
logger.info(f"Obtained evaluation results: {eval_results}")

# Create an artifact containing eval tables
result_tables = get_per_task_dataframes(eval_results["results"])

artifact_name = default_artifact_name(config.name, ArtifactType.EVALUATION)
table_artifact = build_table_artifact(
artifact_name=artifact_name,
77 changes: 43 additions & 34 deletions src/lm_buddy/jobs/evaluation/prometheus.py
@@ -4,20 +4,23 @@
"""

import copy
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from datasets import load_dataset
from datasets import Dataset
from fastchat.conversation import get_conv_template
from openai import Completion, OpenAI, OpenAIError
from loguru import logger
from openai import OpenAI, OpenAIError
from openai.types import Completion
from tqdm import tqdm

from lm_buddy.configs.huggingface import AutoTokenizerConfig
from lm_buddy.configs.jobs.prometheus import PrometheusJobConfig
from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader
from lm_buddy.jobs.common import EvaluationResult
from lm_buddy.preprocessing import format_dataset_with_prompt
from lm_buddy.storage import DEFAULT_STORAGE_PATH
from lm_buddy.tracking.artifact_utils import (
ArtifactType,
build_directory_artifact,
@@ -32,13 +35,15 @@ def __init__(self, message, error=None):
self.error = error


def openai_completion(config: PrometheusJobConfig, client: OpenAI, prompt: str) -> Completion:
def openai_completion(
config: PrometheusJobConfig, client: OpenAI, engine: str, prompt: str
) -> Completion:
"""Connects to a remote OpenAI-API-compatible Prometheus endpoint
and returns a Completion holding the model's response.
"""

return client.completions.create(
model=config.prometheus.inference.engine,
model=engine,
prompt=prompt,
best_of=config.prometheus.best_of,
max_tokens=config.prometheus.max_tokens,
@@ -50,8 +55,7 @@ def openai_completion(config: PrometheusJobConfig, client: OpenAI, prompt: str)

def parse_response(config: PrometheusJobConfig, response: Completion) -> tuple[str, str]:
"""Given a Prometheus eval response as returned by the OpenAI API
endpoint (i.e. in Completion format), extract feedback
and score.
endpoint (i.e. in Completion format), extract feedback and score.
"""

if response is None:
@@ -84,18 +88,21 @@ def instruction_to_prompt(config: PrometheusJobConfig, instruction: str) -> str:


def get_response_with_retries(
config: PrometheusJobConfig, client: OpenAI, prompt: str, max_retries: int
config: PrometheusJobConfig,
client: OpenAI,
engine: str,
prompt: str,
) -> tuple[str, str]:
current_retry_attempt = 1
while current_retry_attempt <= config.evaluation.max_retries:
try:
response = openai_completion(config, client, prompt)
response = openai_completion(config, client, engine, prompt)
feedback, score = parse_response(config, response)
break
except (OpenAIError, BadResponseError) as e:
print(
f"[w] {e.message}, "
f"retrying ({current_retry_attempt}/{config.evaluation.max_retries})"
logger.warning(
f"{e.message}: "
f"Retrying ({current_retry_attempt}/{config.evaluation.max_retries})"
)
current_retry_attempt += 1
if current_retry_attempt > config.evaluation.max_retries:
@@ -107,25 +114,27 @@ def run_eval(config: PrometheusJobConfig) -> Path:
# Instantiate OpenAI client to speak with the vLLM endpoint
client = OpenAI(base_url=config.prometheus.inference.base_url)

# load dataset from W&B artifact
hf_loader = HuggingFaceAssetLoader()

# Resolve the engine model
engine_path = hf_loader.resolve_asset_path(config.prometheus.inference.engine)

# Load dataset from W&B artifact
dataset = hf_loader.load_dataset(config.dataset)
if config.dataset.prompt_template is not None:
dataset = format_dataset_with_prompt(
dataset, config.dataset.prompt_template, config.dataset.text_field
)

# get the tokenizer
# Get the tokenizer
tokenizer_config = AutoTokenizerConfig(path=config.prometheus.inference.engine)
tokenizer = hf_loader.load_pretrained_tokenizer(tokenizer_config)

# enable / disable tqdm
# Enable / disable tqdm
dataset_iterable = tqdm(dataset) if config.evaluation.enable_tqdm else dataset

# open the output file for writing and iterate on samples
tracking_name = config.tracking.name if config.tracking is not None else "output.json"
output_fname = Path(config.evaluation.output_folder) / tracking_name
with output_fname.open("w") as file:
# Generator that iterates over samples and yields new rows with the prometheus outputs
def data_generator():
for sample in dataset_iterable:
# convert instructions from the dataset (`text_field` in a dict) to
# prompts that prometheus accepts
@@ -134,47 +143,47 @@ def run_eval(config: PrometheusJobConfig) -> Path:
# skip those examples which are too long
tokenized_prompt = tokenizer(prompt, truncation=False)
if len(tokenized_prompt["input_ids"]) > 3072:
logger.warning(f"Skipping row due to prompt exceeding token limit: {prompt=}")
continue

# prepare output
result = copy.deepcopy(sample)
result: dict[str, Any] = copy.deepcopy(sample)
result["prometheus_output"] = []
result["prometheus_score"] = []

for _ in range(config.evaluation.num_answers):
(feedback, score) = get_response_with_retries(
config, client, prompt, config.evaluation.max_retries
)
(feedback, score) = get_response_with_retries(config, client, engine_path, prompt)
result["prometheus_output"].append(feedback)
result["prometheus_score"].append(score)

# dump sample results incrementally
file.write(json.dumps(result) + "\n")
yield result

result_dataset = Dataset.from_generator(data_generator)

# convert plain json dataset in HF format
output_dataset_path = Path(config.evaluation.output_folder) / "hf" / tracking_name
ds = load_dataset("json", data_files=str(output_fname), split="train")
ds.save_to_disk(output_dataset_path)
# Save dataset to disk
storage_path = config.evaluation.storage_path or DEFAULT_STORAGE_PATH
result_dataset_path = Path(storage_path) / config.name / "evaluation" / "prometheus"
result_dataset.save_to_disk(result_dataset_path)

return output_dataset_path
return result_dataset_path


def run_prometheus(config: PrometheusJobConfig) -> EvaluationResult:
# Run eval and store output in local filename
output_dataset_path = run_eval(config)
print(f"Prometheus evaluation dataset stored at {output_dataset_path}")
result_dataset_path = run_eval(config)
logger.info(f"Prometheus evaluation dataset stored at {result_dataset_path}")

# Create a directory artifact for the HF dataset
artifact_name = default_artifact_name(config.name, artifact_type=ArtifactType.DATASET)
dataset_artifact = build_directory_artifact(
artifact_name=artifact_name,
artifact_type=ArtifactType.DATASET,
dir_path=output_dataset_path,
dir_path=result_dataset_path,
reference=False,
)

return EvaluationResult(
artifacts=[dataset_artifact],
dataset_path=output_dataset_path,
dataset_path=result_dataset_path,
tables={},
)
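The storage change above builds a HuggingFace dataset from a generator and saves it in HF format on disk, rather than writing JSON lines and re-loading them. A minimal, self-contained sketch of that flow with made-up rows and an illustrative path:

```python
from pathlib import Path

from datasets import Dataset, load_from_disk

def fake_results():
    # Stand-in rows; the real job yields one evaluated sample per dataset row.
    for i in range(3):
        yield {"sample_id": i, "prometheus_output": ["ok"], "prometheus_score": ["4"]}

# from_generator takes the generator function itself and materializes the rows.
result_dataset = Dataset.from_generator(fake_results)

result_path = Path("/tmp/lm-buddy-example") / "evaluation" / "prometheus"
result_dataset.save_to_disk(result_path)

print(load_from_disk(str(result_path)))
```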
45 changes: 21 additions & 24 deletions src/lm_buddy/jobs/evaluation/ragas.py
@@ -1,15 +1,17 @@
from pathlib import Path

from datasets import load_dataset
from datasets import Dataset
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from loguru import logger
from ragas import evaluate as ragas_evaluate
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness

from lm_buddy.configs.jobs.ragas import RagasJobConfig
from lm_buddy.jobs.asset_loader import HuggingFaceAssetLoader
from lm_buddy.jobs.common import EvaluationResult
from lm_buddy.preprocessing import format_dataset_with_prompt
from lm_buddy.storage import DEFAULT_STORAGE_PATH
from lm_buddy.tracking.artifact_utils import (
ArtifactType,
build_directory_artifact,
@@ -33,14 +35,14 @@ def run_eval(config: RagasJobConfig) -> Path:
evaluation_dataset, config.dataset.prompt_template, config.dataset.text_field
)

# ragas custom model args
# Ragas custom model args
ragas_args = {}

# load embedding model
embedding_model = hf_loader.resolve_asset_path(config.evaluation.embedding_model.path)
# Load embedding model
embedding_model = hf_loader.resolve_asset_path(config.evaluation.embedding_model)
ragas_args["embeddings"] = HuggingFaceEmbeddings(model_name=embedding_model)

# configure ragas to point to vllm instance for generation
# Configure ragas to point to vllm instance for generation
inference_engine = hf_loader.resolve_asset_path(config.judge.inference.engine)
ragas_args["llm"] = ChatOpenAI(
model=inference_engine,
@@ -51,41 +53,36 @@ def run_eval(config: RagasJobConfig) -> Path:
top_k=config.judge.top_k,
)

result = ragas_evaluate(
dataset=evaluation_dataset,
metrics=RAGAS_METRICS_MAP[config.evaluation.metrics],
**ragas_args,
)
result_df = result.to_pandas()
ragas_metrics = [RAGAS_METRICS_MAP[metric] for metric in config.evaluation.metrics]
result = ragas_evaluate(dataset=evaluation_dataset, metrics=ragas_metrics, **ragas_args)

# open the output file for writing and iterate on samples
tracking_name = config.tracking.name if config.tracking is not None else "output.json"
output_fname = Path(config.evaluation.output_folder) / tracking_name
result_df.to_json(output_fname)
# Return a new dataset with score concatenated
result_dataset = Dataset.from_pandas(result.to_pandas())

# convert plain json dataset in HF format
output_dataset_path = Path(config.evaluation.output_folder) / "hf" / tracking_name
ds = load_dataset("json", data_files=str(output_fname), split="train")
ds.save_to_disk(output_dataset_path)
# Save dataset to disk
storage_path = config.evaluation.storage_path or DEFAULT_STORAGE_PATH
result_dataset_path = Path(storage_path) / config.name / "evaluation" / "ragas"
result_dataset.save_to_disk(result_dataset_path)

return output_dataset_path
return result_dataset_path


def run_ragas(config: RagasJobConfig) -> EvaluationResult:
output_dataset_path = run_eval(config)
print(f"Ragas evaluation dataset stored at {output_dataset_path}")
# Run evaluation
result_dataset_path = run_eval(config)
logger.info(f"Ragas evaluation dataset stored at {result_dataset_path}")

# Create a directory artifact for the HF dataset
artifact_name = default_artifact_name(config.name, artifact_type=ArtifactType.DATASET)
dataset_artifact = build_directory_artifact(
artifact_name=artifact_name,
artifact_type=ArtifactType.DATASET,
dir_path=output_dataset_path,
dir_path=result_dataset_path,
reference=False,
)

return EvaluationResult(
artifacts=[dataset_artifact],
dataset_path=output_dataset_path,
dataset_path=result_dataset_path,
tables={},
)
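`RAGAS_METRICS_MAP` is defined outside the hunks shown here; a plausible sketch of the lookup that the new `ragas_metrics` comprehension relies on, assuming it simply maps metric names to ragas metric objects:

```python
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness

# Assumed shape of RAGAS_METRICS_MAP: plain string names in the config resolve
# to the ragas metric objects passed to ragas.evaluate.
RAGAS_METRICS_MAP = {
    "faithfulness": faithfulness,
    "answer_relevancy": answer_relevancy,
    "context_recall": context_recall,
    "context_precision": context_precision,
}

selected_names = ["faithfulness", "context_recall"]
ragas_metrics = [RAGAS_METRICS_MAP[name] for name in selected_names]
```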