This repository has been archived by the owner on Sep 24, 2024. It is now read-only.

New huggingface eval for the summarization use case with rouge, meteor, and bertscore #100

Merged
merged 15 commits on Jun 13, 2024
35 changes: 35 additions & 0 deletions examples/configs/evaluation/hf_evaluate_config.yaml
@@ -0,0 +1,35 @@
name: "lm-buddy-hf-evaluate"

# Input dataset path
dataset:
path: "s3://platform-storage/datasets/dialogsum"

# Settings specific to the hf_evaluate entrypoint
evaluation:
# metrics to be used for the evaluation
# (you can add "rouge", "meteor", and "bertscore" atm)
metrics: ["rouge", "meteor", "bertscore"]
# enable/disable tqdm to track eval progress
# (useful when running interactively, noisy on ray logs)
enable_tqdm: True
# rely on HF pipeline for summarization (ignored if using OAI API)
use_pipeline: True
# perform inference / evaluation on the first max_samples only
max_samples: 10
# output file path
# - if you provide a path complete with a filename, results will be stored in it
# - if you provide a dir, results will be stored in <dir>/<config.name>/eval_results.json
# - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results)
# storage_path: "s3://platform-storage/experiments/results/"

# Model to evaluate (local).
# - Provide model path to load the model locally
# - Make sure you add quantization details (see below) if the model is too large
# - Optionally, add a tokenizer (the one matching the specified model name is the default)
model:
path: "hf://facebook/bart-large-cnn"

# Quantization (use it if you are dealing with models too large to fit in RAM)
# quantization:
# load_in_4bit: True
# bnb_4bit_quant_type: "fp4"
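For reference, the same config can also be run programmatically rather than through the CLI; a minimal sketch, assuming the YAML above deserializes directly into the new HuggingFaceEvalJobConfig via the pinned pydantic-yaml dependency (the huggingface CLI subcommand added below does the equivalent through parse_config_option):

from pydantic_yaml import parse_yaml_file_as

from lm_buddy import LMBuddy
from lm_buddy.configs.jobs import HuggingFaceEvalJobConfig

# Load the example config above into the new job config type
config = parse_yaml_file_as(
    HuggingFaceEvalJobConfig, "examples/configs/evaluation/hf_evaluate_config.yaml"
)

# Dispatch the job through the evaluate() entrypoint extended in this PR
buddy = LMBuddy()
result = buddy.evaluate(config)
print(result.artifacts)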
@@ -0,0 +1,34 @@
name: "lm-buddy-hf-evaluate-is"

# Input dataset path
dataset:
path: "s3://platform-storage/datasets/dialogsum"

# Settings specific to the hf_evaluate entrypoint
evaluation:
# metrics to be used for the evaluation
# (you can add "rouge", "meteor", and "bertscore" atm)
metrics: ["rouge", "meteor", "bertscore"]
# enable/disable tqdm to track eval progress
# (useful when running interactively, noisy on ray logs)
enable_tqdm: True
# rely on HF pipeline for summarization (ignored if using OAI API)
use_pipeline: True
# perform inference / evaluation on the first max_samples only
max_samples: 10
# output file path
# - if you provide a path complete with a filename, results will be stored in it
# - if you provide a dir, results will be stored in <dir>/<config.name>/eval_results.json
# - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results)
# storage_path: "s3://platform-storage/experiments/results/"

# Model to evaluate (OpenAI-compatible API)
# - Works with local/remote vLLM-served models and llamafiles
# - Provide base_url and engine
# - Customize the system prompt if needed
model:
inference:
base_url: "http://localhost:8081/v1"
engine: "hf://mistralai/mistral-7b-instruct-v0.2"
system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences."
max_retries: 3
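This variant points the job at an OpenAI-compatible inference server instead of a local model. A rough sketch of the kind of request the job presumably issues against that server; the client wiring and the model id (the engine path with its hf:// prefix stripped) are assumptions, since only the config schema appears in this diff:

from openai import OpenAI

# The vLLM server exposes an OpenAI-compatible API at base_url; no real key is needed
client = OpenAI(base_url="http://localhost:8081/v1", api_key="EMPTY")

dialogue = "#Person1#: I'd like to open a savings account. #Person2#: Sure, I can help with that..."
response = client.chat.completions.create(
    model="mistralai/mistral-7b-instruct-v0.2",
    messages=[
        {
            "role": "system",
            "content": (
                "You are a helpful assistant, expert in text summarization. For every "
                "prompt you receive, provide a summary of its contents in at most two sentences."
            ),
        },
        {"role": "user", "content": dialogue},
    ],
)
print(response.choices[0].message.content)  # the candidate summary to be scored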
34 changes: 34 additions & 0 deletions examples/configs/evaluation/hf_evaluate_openai_config.yaml
@@ -0,0 +1,34 @@
name: "lm-buddy-hf-evaluate-oai"

# Input dataset path
dataset:
path: "s3://platform-storage/datasets/dialogsum"

# Settings specific to the hf_evaluate entrypoint
evaluation:
# metrics to be used for the evaluation
# (you can add "rouge", "meteor", and "bertscore" atm)
metrics: ["rouge", "meteor", "bertscore"]
# enable/disable tqdm to track eval progress
# (useful when running interactively, noisy on ray logs)
enable_tqdm: True
# rely on HF pipeline for summarization (ignored if using OAI API)
use_pipeline: True
# perform inference / evaluation on the first max_samples only
max_samples: 10
# output file path
# - if you provide a path complete with a filename, results will be stored in it
# - if you provide a dir, results will be stored in <dir>/<config.name>/eval_results.json
# - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results)
# storage_path: "s3://platform-storage/experiments/results/"

# Model to evaluate (OpenAI)
# - The base_url is fixed
# - Choose an engine name (see https://platform.openai.com/docs/models)
# - Customize the system prompt if needed
model:
inference:
base_url: "https://api.openai.com/v1"
engine: "oai://gpt-4-turbo"
system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences."
max_retries: 3
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -1,10 +1,10 @@
[build-system]
requires = ["setuptools >= 61.0"]
requires = ["setuptools==69.5.1"]
build-backend = "setuptools.build_meta"

[project]
name = "lm-buddy"
version = "0.10.4"
version = "0.10.5"
authors = [
{ name = "Sean Friedowitz", email = "[email protected]" },
{ name = "Aaron Gonzales", email = "[email protected]" },
@@ -30,13 +30,15 @@ dependencies = [
"pydantic-yaml==1.2.0",
"ray[default]==2.9.3",
"loguru==0.7.2",
"s3fs",
# HuggingFace
"datasets>=2.17.1",
"transformers==4.36.2",
"accelerate==0.26.1",
"peft==0.7.1",
"trl==0.7.10",
"bitsandbytes==0.42.0",
"bert_score==0.3.13",
# Evaluation frameworks
"lm-eval==0.4.2",
"einops==0.7.0",
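The new bert_score pin backs the bertscore metric. For orientation, the three metrics named in the example configs can be computed with the HuggingFace evaluate library roughly as follows (a sketch under the assumption that evaluate is available in the environment; the job's actual metric wiring lives in src/lm_buddy/jobs/evaluation/hf_evaluate.py, which is not shown in this excerpt):

import evaluate

predictions = ["A customer opens a savings account with the clerk's help."]
references = ["#Person1# opens a savings account with #Person2#'s assistance."]

rouge = evaluate.load("rouge").compute(predictions=predictions, references=references)
meteor = evaluate.load("meteor").compute(predictions=predictions, references=references)
bertscore = evaluate.load("bertscore").compute(
    predictions=predictions, references=references, lang="en"  # bertscore requires a language
)

print(rouge["rougeL"], meteor["meteor"], bertscore["f1"])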
4 changes: 4 additions & 0 deletions src/lm_buddy/buddy.py
@@ -3,12 +3,14 @@
from lm_buddy.configs.jobs import (
EvaluationJobConfig,
FinetuningJobConfig,
HuggingFaceEvalJobConfig,
JobConfig,
LMHarnessJobConfig,
PrometheusJobConfig,
RagasJobConfig,
)
from lm_buddy.jobs.common import EvaluationResult, FinetuningResult, JobType
from lm_buddy.jobs.evaluation.hf_evaluate import run_hf_evaluation
from lm_buddy.jobs.evaluation.lm_harness import run_lm_harness
from lm_buddy.jobs.evaluation.prometheus import run_prometheus
from lm_buddy.jobs.evaluation.ragas import run_ragas
@@ -66,6 +68,8 @@ def evaluate(self, config: EvaluationJobConfig) -> EvaluationResult:
                result = run_prometheus(prometheus_config)
            case RagasJobConfig() as ragas_config:
                result = run_ragas(ragas_config)
            case HuggingFaceEvalJobConfig() as hf_eval_config:
                result = run_hf_evaluation(hf_eval_config)
            case _:
                raise ValueError(f"Invalid configuration for evaluation: {type(config)}")
        self._generate_artifact_lineage(config, result.artifacts, JobType.EVALUATION)
15 changes: 14 additions & 1 deletion src/lm_buddy/cli/evaluate.py
@@ -2,7 +2,12 @@

from lm_buddy import LMBuddy
from lm_buddy.cli.utils import parse_config_option
from lm_buddy.configs.jobs import LMHarnessJobConfig, PrometheusJobConfig, RagasJobConfig
from lm_buddy.configs.jobs import (
HuggingFaceEvalJobConfig,
LMHarnessJobConfig,
PrometheusJobConfig,
RagasJobConfig,
)


@click.group(name="evaluate", help="Run an LM Buddy evaluation job.")
@@ -32,3 +37,11 @@ def ragas_command(config: str) -> None:
    config = parse_config_option(RagasJobConfig, config)
    buddy = LMBuddy()
    buddy.evaluate(config)


@group.command("huggingface", help="Run the HuggingFace evaluation job.")
@click.option("--config", type=str)
def huggingface_command(config: str) -> None:
    config = parse_config_option(HuggingFaceEvalJobConfig, config)
    buddy = LMBuddy()
    buddy.evaluate(config)
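A quick way to smoke-test the new subcommand without a shell is click's test runner; a sketch, assuming the module-level click group object is named group, as the decorator usage above suggests:

from click.testing import CliRunner

from lm_buddy.cli.evaluate import group

runner = CliRunner()
result = runner.invoke(
    group,
    ["huggingface", "--config", "examples/configs/evaluation/hf_evaluate_config.yaml"],
)
print(result.exit_code)  # 0 on success
print(result.output)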
6 changes: 5 additions & 1 deletion src/lm_buddy/configs/jobs/__init__.py
@@ -1,16 +1,20 @@
from lm_buddy.configs.jobs.common import JobConfig
from lm_buddy.configs.jobs.finetuning import FinetuningJobConfig
from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig
from lm_buddy.configs.jobs.lm_harness import LMHarnessJobConfig
from lm_buddy.configs.jobs.prometheus import PrometheusJobConfig
from lm_buddy.configs.jobs.ragas import RagasJobConfig

EvaluationJobConfig = LMHarnessJobConfig | PrometheusJobConfig | RagasJobConfig
EvaluationJobConfig = (
LMHarnessJobConfig | PrometheusJobConfig | RagasJobConfig | HuggingFaceEvalJobConfig
)

__all__ = [
"JobConfig",
"FinetuningJobConfig",
"LMHarnessJobConfig",
"PrometheusJobConfig",
"RagasJobConfig",
"HuggingFaceEvalJobConfig",
"EvaluationJobConfig",
]
83 changes: 83 additions & 0 deletions src/lm_buddy/configs/jobs/hf_evaluate.py
@@ -0,0 +1,83 @@
from pydantic import Field, conlist, field_validator, model_validator

from lm_buddy.configs.common import LMBuddyConfig
from lm_buddy.configs.huggingface import (
AutoModelConfig,
AutoTokenizerConfig,
DatasetConfig,
QuantizationConfig,
)
from lm_buddy.configs.jobs.common import JobConfig
from lm_buddy.configs.vllm import VLLMCompletionsConfig
from lm_buddy.paths import AssetPath


class HuggingFaceEvaluationConfig(LMBuddyConfig):
    """Misc settings provided to a HuggingFace evaluation job."""

    metrics: conlist(str, min_length=1)
    use_pipeline: bool = False
    enable_tqdm: bool = False
    max_samples: int | None = None
    storage_path: str | None = None


class HuggingFaceEvalJobConfig(JobConfig):
    """Configuration to run a HuggingFace evaluation job."""

    dataset: DatasetConfig = Field(
        description="Dataset of text samples to summarize and evaluate."
    )
    evaluation: HuggingFaceEvaluationConfig
    model: AutoModelConfig | VLLMCompletionsConfig
    quantization: QuantizationConfig | None = None
    tokenizer: AutoTokenizerConfig

    @model_validator(mode="before")
    def ensure_tokenizer_config(cls, values):
        """Set the tokenizer to the model path when not explicitly provided."""
        if values.get("tokenizer") is None:
            values["tokenizer"] = {}
            match values["model"]:
                case str() as model_path:
                    values["tokenizer"]["path"] = model_path
                case dict() as model_data:
                    # a dict may specify model.path; if it does not, the model is a
                    # VLLMCompletionsConfig and the tokenizer path is ignored downstream,
                    # so any placeholder value works
                    if model_data.get("path") is None:
                        values["tokenizer"]["path"] = "oai://tokenizer"
                    else:
                        values["tokenizer"]["path"] = model_data.get("path")
                case AutoModelConfig() as model_config:
                    values["tokenizer"]["path"] = model_config.path
                # No fallback necessary, downstream validation will flag invalid model types
        return values

    @field_validator("model", mode="before")
    def validate_model_arg(cls, x):
        """Allow for passing just a path string as the model argument."""
        if isinstance(x, str):
            return AutoModelConfig(path=x)
        return x

    @field_validator("tokenizer", mode="before")
    def validate_tokenizer_arg(cls, x):
        """Allow for passing just a path string as the tokenizer argument."""
        if isinstance(x, str):
            return AutoTokenizerConfig(path=x)
        return x

    def asset_paths(self) -> list[AssetPath]:
        match self.model:
            case AutoModelConfig() as config:
                return {
                    self.dataset.path,
                    self.evaluation.storage_path,
                    config.path,
                    self.tokenizer.path,
                }
            case VLLMCompletionsConfig() as config if config.inference.engine is not None:
                return {self.dataset.path, self.evaluation.storage_path, config.inference.engine}
            case _:
                return {}
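To illustrate the validators above: passing a bare path string as the model promotes it to an AutoModelConfig and reuses the same path for the tokenizer. A minimal sketch with illustrative values, assuming JobConfig contributes only the top-level name field seen in the example YAMLs:

from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig

config = HuggingFaceEvalJobConfig(
    name="hf-eval-example",
    dataset={"path": "s3://platform-storage/datasets/dialogsum"},
    evaluation={"metrics": ["rouge", "meteor", "bertscore"]},
    model="hf://facebook/bart-large-cnn",
)

# ensure_tokenizer_config filled the tokenizer path from the model path
assert config.tokenizer.path == "hf://facebook/bart-large-cnn"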
8 changes: 7 additions & 1 deletion src/lm_buddy/configs/vllm.py
@@ -12,12 +12,18 @@ class InferenceServerConfig(LMBuddyConfig):

    Note: This configuration is intended to be generic and not bound to the interface
    of any specific training/evaluation framework. See `LocalChatCompletionConfig`
    or `vLLMCompleptionsConfig` for intended usage alongside a third-party framework.
    or `vLLMCompletionsConfig` for intended usage alongside a third-party framework.
    """

    base_url: str
    engine: AssetPath

    # optional system prompt to be used by default in chat completions
    system_prompt: str | None = None

    # max number of retries when communication with server fails
    max_retries: int | None = None


class VLLMCompletionsConfig(LMBuddyConfig):
    """Configuration for a vLLM-based completions service
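For reference, the inference block in the two OpenAI-compatible example configs deserializes into this model; a minimal construction showing the two new optional fields:

from lm_buddy.configs.vllm import InferenceServerConfig

server = InferenceServerConfig(
    base_url="http://localhost:8081/v1",
    engine="hf://mistralai/mistral-7b-instruct-v0.2",
    system_prompt="You are a helpful assistant, expert in text summarization.",
    max_retries=3,  # retries when communication with the server fails
)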
21 changes: 19 additions & 2 deletions src/lm_buddy/jobs/asset_loader.py
@@ -8,11 +8,16 @@
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
PretrainedConfig,
PreTrainedModel,
PreTrainedTokenizer,
)
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
)

from lm_buddy.configs.huggingface import (
AutoModelConfig,
@@ -40,7 +45,7 @@ def resolve_asset_path(self, path: AssetPath) -> str:
        The returned string has its `PathPrefix` stripped away.
        """
        raw_path = strip_path_prefix(path)
        if path.startswith((PathPrefix.FILE, PathPrefix.HUGGINGFACE)):
        if path.startswith((PathPrefix.FILE, PathPrefix.HUGGINGFACE, PathPrefix.OPENAI)):
            return raw_path
        elif path.startswith(PathPrefix.WANDB):
            artifact = get_artifact_from_api(raw_path)
@@ -120,7 +125,19 @@ def load_pretrained_model(
        # TODO: HuggingFace has many AutoModel classes with different "language model heads"
        # Can we abstract this to load with any type of AutoModel class?
        model_path = self.resolve_asset_path(config.path)
        return AutoModelForCausalLM.from_pretrained(

        # load config first to get the model type
        model_config = self.load_pretrained_config(config)

        if getattr(model_config, "model_type") in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
            automodel_class = AutoModelForSeq2SeqLM
        elif getattr(model_config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
            automodel_class = AutoModelForCausalLM
        else:
            logger.info("Model type not supported. Trying AutoModelForCausalLM")
            automodel_class = AutoModelForCausalLM

        return automodel_class.from_pretrained(
            pretrained_model_name_or_path=model_path,
            trust_remote_code=config.trust_remote_code,
            torch_dtype=config.torch_dtype,
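As a sanity check of the dispatch above, the example model facebook/bart-large-cnn reports a seq2seq model type, so it matches the seq2seq mapping and is loaded with AutoModelForSeq2SeqLM (a sketch; the mapping constant is the one imported in this diff):

from transformers import AutoConfig
from transformers.models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES

cfg = AutoConfig.from_pretrained("facebook/bart-large-cnn")
print(cfg.model_type)  # "bart"
print(cfg.model_type in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES)  # True -> AutoModelForSeq2SeqLM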