This repository has been archived by the owner on Sep 24, 2024. It is now read-only.

New huggingface eval for the summarization use case with rouge, meteor, and bertscore #100

Merged
merged 15 commits on Jun 13, 2024
35 changes: 35 additions & 0 deletions examples/configs/evaluation/hf_evaluate_config.yaml
@@ -0,0 +1,35 @@
name: "lm-buddy-hf-evaluate"

# Input dataset path
dataset:
path: "s3://platform-storage/datasets/dialogsum"

# Settings specific to the hf_evaluate entrypoint
evaluation:
# metrics to be used for the evaluation
# (you can add "rouge", "meteor", and "bertscore" atm)
metrics: ["rouge", "meteor", "bertscore"]
# enable/disable tqdm to track eval progress
# (useful when running interactively, noisy on ray logs)
enable_tqdm: True
# rely on HF pipeline for summarization (ignored if using OAI API)
use_pipeline: True
# perform inference / evaluation on the first max_samples only
max_samples: 10
# output file path
# - if you provide a path complete with a filename, results will be stored in it
# - if you provide a dir, results will be stored in <dir>/<config.name>/eval_results.json
# - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results)
# storage_path: "s3://platform-storage/experiments/results/"

# Model to evaluate (local).
# - Provide model path to load the model locally
# - Make sure you add quantization details (see below) if the model is too large
# - Optionally, add a tokenizer (the one matching the specified model name is the default)
model:
path: "hf://facebook/bart-large-cnn"

# Quantization (use it if you are dealing with models too large to fit in RAM)
# quantization:
# load_in_4bit: True
# bnb_4bit_quant_type: "fp4"
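For reference, the same config can also be run programmatically rather than through the CLI; a minimal sketch, assuming the YAML above deserializes directly into the new HuggingFaceEvalJobConfig via the pinned pydantic-yaml dependency (the huggingface CLI subcommand added below does the equivalent through parse_config_option):

from pydantic_yaml import parse_yaml_file_as

from lm_buddy import LMBuddy
from lm_buddy.configs.jobs import HuggingFaceEvalJobConfig

# Load the example config above into the new job config type
config = parse_yaml_file_as(
    HuggingFaceEvalJobConfig, "examples/configs/evaluation/hf_evaluate_config.yaml"
)

# Dispatch the job through the evaluate() entrypoint extended in this PR
buddy = LMBuddy()
result = buddy.evaluate(config)
print(result.artifacts)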
@@ -0,0 +1,34 @@
name: "lm-buddy-hf-evaluate-is"

# Input dataset path
dataset:
path: "s3://platform-storage/datasets/dialogsum"

# Settings specific to the hf_evaluate entrypoint
evaluation:
# metrics to be used for the evaluation
# (you can add "rouge", "meteor", and "bertscore" atm)
metrics: ["rouge", "meteor", "bertscore"]
# enable/disable tqdm to track eval progress
# (useful when running interactively, noisy on ray logs)
enable_tqdm: True
# rely on HF pipeline for summarization (ignored if using OAI API)
use_pipeline: True
# perform inference / evaluation on the first max_samples only
max_samples: 10
# output file path
# - if you provide a path complete with a filename, results will be stored in it
# - if you provide a dir, results will be stored in <dir>/<config.name>/eval_results.json
# - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results)
# storage_path: "s3://platform-storage/experiments/results/"

# Model to evaluate (OpenAI-compatible API)
# - Works with local/remote vLLM-served models and llamafiles
# - Provide base_url and engine
# - Customize the system prompt if needed
model:
inference:
base_url: "http://localhost:8081/v1"
engine: "hf://mistralai/mistral-7b-instruct-v0.2"
system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences."
max_retries: 3
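This variant points the job at an OpenAI-compatible inference server instead of a local model. A rough sketch of the kind of request the job presumably issues against that server; the client wiring and the model id (the engine path with its hf:// prefix stripped) are assumptions, since only the config schema appears in this diff:

from openai import OpenAI

# The vLLM server exposes an OpenAI-compatible API at base_url; no real key is needed
client = OpenAI(base_url="http://localhost:8081/v1", api_key="EMPTY")

dialogue = "#Person1#: I'd like to open a savings account. #Person2#: Sure, I can help with that..."
response = client.chat.completions.create(
    model="mistralai/mistral-7b-instruct-v0.2",
    messages=[
        {
            "role": "system",
            "content": (
                "You are a helpful assistant, expert in text summarization. For every "
                "prompt you receive, provide a summary of its contents in at most two sentences."
            ),
        },
        {"role": "user", "content": dialogue},
    ],
)
print(response.choices[0].message.content)  # the candidate summary to be scored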
34 changes: 34 additions & 0 deletions examples/configs/evaluation/hf_evaluate_openai_config.yaml
@@ -0,0 +1,34 @@
name: "lm-buddy-hf-evaluate-oai"

# Input dataset path
dataset:
path: "s3://platform-storage/datasets/dialogsum"

# Settings specific to the hf_evaluate entrypoint
evaluation:
# metrics to be used for the evaluation
# (you can add "rouge", "meteor", and "bertscore" atm)
metrics: ["rouge", "meteor", "bertscore"]
# enable/disable tqdm to track eval progress
# (useful when running interactively, noisy on ray logs)
enable_tqdm: True
# rely on HF pipeline for summarization (ignored if using OAI API)
use_pipeline: True
# perform inference / evaluation on the first max_samples only
max_samples: 10
# output file path
# - if you provide a path complete with a filename, results will be stored in it
# - if you provide a dir, results will be stored in <dir>/<config.name>/eval_results.json
# - if you don't provide a storage path, results will be stored locally (see ~/.lm-buddy/results)
# storage_path: "s3://platform-storage/experiments/results/"

# Model to evaluate (OpenAI)
# - The base_url is fixed
# - Choose an engine name (see https://platform.openai.com/docs/models)
# - Customize the system prompt if needed
model:
inference:
base_url: "https://api.openai.com/v1"
engine: "oai://gpt-4-turbo"
system_prompt: "You are a helpful assistant, expert in text summarization. For every prompt you receive, provide a summary of its contents in at most two sentences."
max_retries: 3
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -1,10 +1,10 @@
[build-system]
requires = ["setuptools >= 61.0"]
requires = ["setuptools==69.5.1"]
build-backend = "setuptools.build_meta"

[project]
name = "lm-buddy"
version = "0.10.4"
version = "0.10.5"
authors = [
{ name = "Sean Friedowitz", email = "[email protected]" },
{ name = "Aaron Gonzales", email = "[email protected]" },
@@ -30,13 +30,15 @@ dependencies = [
"pydantic-yaml==1.2.0",
"ray[default]==2.9.3",
"loguru==0.7.2",
"s3fs",
# HuggingFace
"datasets>=2.17.1",
"transformers==4.36.2",
"accelerate==0.26.1",
"peft==0.7.1",
"trl==0.7.10",
"bitsandbytes==0.42.0",
"bert_score==0.3.13",
# Evaluation frameworks
"lm-eval==0.4.2",
"einops==0.7.0",
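The new bert_score pin backs the bertscore metric. For orientation, the three metrics named in the example configs can be computed with the HuggingFace evaluate library roughly as follows (a sketch under the assumption that evaluate is available in the environment; the job's actual metric wiring lives in src/lm_buddy/jobs/evaluation/hf_evaluate.py, which is not shown in this excerpt):

import evaluate

predictions = ["A customer opens a savings account with the clerk's help."]
references = ["#Person1# opens a savings account with #Person2#'s assistance."]

rouge = evaluate.load("rouge").compute(predictions=predictions, references=references)
meteor = evaluate.load("meteor").compute(predictions=predictions, references=references)
bertscore = evaluate.load("bertscore").compute(
    predictions=predictions, references=references, lang="en"  # bertscore requires a language
)

print(rouge["rougeL"], meteor["meteor"], bertscore["f1"])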
4 changes: 4 additions & 0 deletions src/lm_buddy/buddy.py
@@ -3,12 +3,14 @@
from lm_buddy.configs.jobs import (
EvaluationJobConfig,
FinetuningJobConfig,
HuggingFaceEvalJobConfig,
JobConfig,
LMHarnessJobConfig,
PrometheusJobConfig,
RagasJobConfig,
)
from lm_buddy.jobs.common import EvaluationResult, FinetuningResult, JobType
from lm_buddy.jobs.evaluation.hf_evaluate import run_hf_evaluation
from lm_buddy.jobs.evaluation.lm_harness import run_lm_harness
from lm_buddy.jobs.evaluation.prometheus import run_prometheus
from lm_buddy.jobs.evaluation.ragas import run_ragas
@@ -66,6 +68,8 @@ def evaluate(self, config: EvaluationJobConfig) -> EvaluationResult:
                result = run_prometheus(prometheus_config)
            case RagasJobConfig() as ragas_config:
                result = run_ragas(ragas_config)
            case HuggingFaceEvalJobConfig() as hf_eval_config:
                result = run_hf_evaluation(hf_eval_config)
            case _:
                raise ValueError(f"Invalid configuration for evaluation: {type(config)}")
        self._generate_artifact_lineage(config, result.artifacts, JobType.EVALUATION)
15 changes: 14 additions & 1 deletion src/lm_buddy/cli/evaluate.py
@@ -2,7 +2,12 @@

from lm_buddy import LMBuddy
from lm_buddy.cli.utils import parse_config_option
from lm_buddy.configs.jobs import LMHarnessJobConfig, PrometheusJobConfig, RagasJobConfig
from lm_buddy.configs.jobs import (
HuggingFaceEvalJobConfig,
LMHarnessJobConfig,
PrometheusJobConfig,
RagasJobConfig,
)


@click.group(name="evaluate", help="Run an LM Buddy evaluation job.")
@@ -32,3 +37,11 @@ def ragas_command(config: str) -> None:
    config = parse_config_option(RagasJobConfig, config)
    buddy = LMBuddy()
    buddy.evaluate(config)


@group.command("huggingface", help="Run the HuggingFace evaluation job.")
@click.option("--config", type=str)
def huggingface_command(config: str) -> None:
    config = parse_config_option(HuggingFaceEvalJobConfig, config)
    buddy = LMBuddy()
    buddy.evaluate(config)
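A quick way to smoke-test the new subcommand without a shell is click's test runner; a sketch, assuming the module-level click group object is named group, as the decorator usage above suggests:

from click.testing import CliRunner

from lm_buddy.cli.evaluate import group

runner = CliRunner()
result = runner.invoke(
    group,
    ["huggingface", "--config", "examples/configs/evaluation/hf_evaluate_config.yaml"],
)
print(result.exit_code)  # 0 on success
print(result.output)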
6 changes: 5 additions & 1 deletion src/lm_buddy/configs/jobs/__init__.py
@@ -1,16 +1,20 @@
from lm_buddy.configs.jobs.common import JobConfig
from lm_buddy.configs.jobs.finetuning import FinetuningJobConfig
from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig
from lm_buddy.configs.jobs.lm_harness import LMHarnessJobConfig
from lm_buddy.configs.jobs.prometheus import PrometheusJobConfig
from lm_buddy.configs.jobs.ragas import RagasJobConfig

EvaluationJobConfig = LMHarnessJobConfig | PrometheusJobConfig | RagasJobConfig
EvaluationJobConfig = (
LMHarnessJobConfig | PrometheusJobConfig | RagasJobConfig | HuggingFaceEvalJobConfig
)

__all__ = [
"JobConfig",
"FinetuningJobConfig",
"LMHarnessJobConfig",
"PrometheusJobConfig",
"RagasJobConfig",
"HuggingFaceEvalJobConfig",
"EvaluationJobConfig",
]
83 changes: 83 additions & 0 deletions src/lm_buddy/configs/jobs/hf_evaluate.py
@@ -0,0 +1,83 @@
from pydantic import Field, conlist, field_validator, model_validator

from lm_buddy.configs.common import LMBuddyConfig
from lm_buddy.configs.huggingface import (
AutoModelConfig,
AutoTokenizerConfig,
DatasetConfig,
QuantizationConfig,
)
from lm_buddy.configs.jobs.common import JobConfig
from lm_buddy.configs.vllm import VLLMCompletionsConfig
from lm_buddy.paths import AssetPath


class HuggingFaceEvaluationConfig(LMBuddyConfig):
    """Misc settings provided to a HuggingFace evaluation job."""

    metrics: conlist(str, min_length=1)
    use_pipeline: bool = False
    enable_tqdm: bool = False
    max_samples: int | None = None
    storage_path: str | None = None


class HuggingFaceEvalJobConfig(JobConfig):
    """Configuration to run a HuggingFace evaluation job."""

    dataset: DatasetConfig = Field(
        description="Dataset of text samples to summarize and evaluate."
    )
    evaluation: HuggingFaceEvaluationConfig
    model: AutoModelConfig | VLLMCompletionsConfig
    quantization: QuantizationConfig | None = None
    tokenizer: AutoTokenizerConfig

    @model_validator(mode="before")
    def ensure_tokenizer_config(cls, values):
        """Set the tokenizer to the model path when not explicitly provided."""
        if values.get("tokenizer") is None:
            values["tokenizer"] = {}
            match values["model"]:
                case str() as model_path:
                    values["tokenizer"]["path"] = model_path
                case dict() as model_data:
                    # a dict may specify model.path; if it does not, the model is a
                    # VLLMCompletionsConfig and the tokenizer path is ignored downstream,
                    # so any placeholder value works
                    if model_data.get("path") is None:
                        values["tokenizer"]["path"] = "oai://tokenizer"
                    else:
                        values["tokenizer"]["path"] = model_data.get("path")
                case AutoModelConfig() as model_config:
                    values["tokenizer"]["path"] = model_config.path
                # No fallback necessary, downstream validation will flag invalid model types
        return values

    @field_validator("model", mode="before")
    def validate_model_arg(cls, x):
        """Allow for passing just a path string as the model argument."""
        if isinstance(x, str):
            return AutoModelConfig(path=x)
        return x

    @field_validator("tokenizer", mode="before")
    def validate_tokenizer_arg(cls, x):
        """Allow for passing just a path string as the tokenizer argument."""
        if isinstance(x, str):
            return AutoTokenizerConfig(path=x)
        return x

    def asset_paths(self) -> list[AssetPath]:
        match self.model:
            case AutoModelConfig() as config:
                return {
                    self.dataset.path,
                    self.evaluation.storage_path,
                    config.path,
                    self.tokenizer.path,
                }
            case VLLMCompletionsConfig() as config if config.inference.engine is not None:
                return {self.dataset.path, self.evaluation.storage_path, config.inference.engine}
            case _:
                return {}
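To illustrate the validators above: passing a bare path string as the model promotes it to an AutoModelConfig and reuses the same path for the tokenizer. A minimal sketch with illustrative values, assuming JobConfig contributes only the top-level name field seen in the example YAMLs:

from lm_buddy.configs.jobs.hf_evaluate import HuggingFaceEvalJobConfig

config = HuggingFaceEvalJobConfig(
    name="hf-eval-example",
    dataset={"path": "s3://platform-storage/datasets/dialogsum"},
    evaluation={"metrics": ["rouge", "meteor", "bertscore"]},
    model="hf://facebook/bart-large-cnn",
)

# ensure_tokenizer_config filled the tokenizer path from the model path
assert config.tokenizer.path == "hf://facebook/bart-large-cnn"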
8 changes: 7 additions & 1 deletion src/lm_buddy/configs/vllm.py
@@ -12,12 +12,18 @@ class InferenceServerConfig(LMBuddyConfig):

    Note: This configuration is intended to be generic and not bound to the interface
    of any specific training/evaluation framework. See `LocalChatCompletionConfig`
    or `vLLMCompleptionsConfig` for intended usage alongside a third-party framework.
    or `vLLMCompletionsConfig` for intended usage alongside a third-party framework.
    """

    base_url: str
    engine: AssetPath

    # optional system prompt to be used by default in chat completions
    system_prompt: str | None = None

    # max number of retries when communication with server fails
    max_retries: int | None = None


class VLLMCompletionsConfig(LMBuddyConfig):
    """Configuration for a vLLM-based completions service
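For reference, the inference block in the two OpenAI-compatible example configs deserializes into this model; a minimal construction showing the two new optional fields:

from lm_buddy.configs.vllm import InferenceServerConfig

server = InferenceServerConfig(
    base_url="http://localhost:8081/v1",
    engine="hf://mistralai/mistral-7b-instruct-v0.2",
    system_prompt="You are a helpful assistant, expert in text summarization.",
    max_retries=3,  # retries when communication with the server fails
)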
21 changes: 19 additions & 2 deletions src/lm_buddy/jobs/asset_loader.py
@@ -8,11 +8,16 @@
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
PretrainedConfig,
PreTrainedModel,
PreTrainedTokenizer,
)
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
)

from lm_buddy.configs.huggingface import (
AutoModelConfig,
@@ -40,7 +45,7 @@ def resolve_asset_path(self, path: AssetPath) -> str:
        The returned string has its `PathPrefix` stripped away.
        """
        raw_path = strip_path_prefix(path)
        if path.startswith((PathPrefix.FILE, PathPrefix.HUGGINGFACE)):
        if path.startswith((PathPrefix.FILE, PathPrefix.HUGGINGFACE, PathPrefix.OPENAI)):
            return raw_path
        elif path.startswith(PathPrefix.WANDB):
            artifact = get_artifact_from_api(raw_path)
@@ -120,7 +125,19 @@ def load_pretrained_model(
        # TODO: HuggingFace has many AutoModel classes with different "language model heads"
        # Can we abstract this to load with any type of AutoModel class?
        model_path = self.resolve_asset_path(config.path)
        return AutoModelForCausalLM.from_pretrained(

        # load config first to get the model type
        model_config = self.load_pretrained_config(config)

        if getattr(model_config, "model_type") in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
            automodel_class = AutoModelForSeq2SeqLM
        elif getattr(model_config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
            automodel_class = AutoModelForCausalLM
        else:
            logger.info("Model type not supported. Trying AutoModelForCausalLM")
            automodel_class = AutoModelForCausalLM

        return automodel_class.from_pretrained(
            pretrained_model_name_or_path=model_path,
            trust_remote_code=config.trust_remote_code,
            torch_dtype=config.torch_dtype,
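As a sanity check of the dispatch above, the example model facebook/bart-large-cnn reports a seq2seq model type, so it matches the seq2seq mapping and is loaded with AutoModelForSeq2SeqLM (a sketch; the mapping constant is the one imported in this diff):

from transformers import AutoConfig
from transformers.models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES

cfg = AutoConfig.from_pretrained("facebook/bart-large-cnn")
print(cfg.model_type)  # "bart"
print(cfg.model_type in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES)  # True -> AutoModelForSeq2SeqLM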