Skip to content
This repository has been archived by the owner on Sep 24, 2024. It is now read-only.

Centralize artifact lineage creation on LMBuddy class #92

Merged
merged 9 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/configs/evaluation/lm_harness_hf_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "lm-buddy-lm-harness"
sfriedowitz marked this conversation as resolved.
Show resolved Hide resolved

# Model to evaluate
model:
path: "hf://distilgpt2"
Expand All @@ -15,6 +17,5 @@ quantization:

# Tracking info for where to log the run results
tracking:
name: "lm-buddy-lm-harness"
project: "lm-buddy-examples"
entity: "sample"
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "lm-buddy-lm-harness-inference"

# Model to evaluate specified as a local-chat-completions inference server
model:
inference:
Expand All @@ -16,6 +18,5 @@ evaluation:
limit: 10

tracking:
name: "lm-buddy-lm-harness-inference"
project: "lm-buddy-examples"
entity: "sample"
3 changes: 2 additions & 1 deletion examples/configs/evaluation/prometheus_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "lm-buddy-prometheus-job"

dataset:
path: "wandb://sample-entity/lm-buddy-examples/wandb-file-artifact:latest"
# field containing scoring instructions in the json file
Expand Down Expand Up @@ -26,6 +28,5 @@ evaluation:
enable_tqdm: True

tracking:
name: "lm-buddy-prometheus"
project: "lm-buddy-examples"
entity: "sample"
3 changes: 2 additions & 1 deletion examples/configs/evaluation/ragas_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "lm-buddy-ragas"

dataset:
path: "wandb://sample-entity/lm-buddy-examples/wandb-file-artifact:latest"
# field containing scoring instructions in the json file
Expand All @@ -21,6 +23,5 @@ evaluation:
embedding_model: "sentence-transformers/all-mpnet-base-v2"

tracking:
name: "lm-buddy-ragas"
project: "lm-buddy-examples"
entity: "sample"
3 changes: 2 additions & 1 deletion examples/configs/finetuning/finetuning_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "lm-buddy-finetuning"

# Base model to load for finetuning
model:
path: "hf://distilgpt2"
Expand Down Expand Up @@ -36,7 +38,6 @@ adapter:

# Tracking info for where to log the run results
tracking:
name: "lm-buddy-finetuning"
project: "lm-buddy-examples"
entity: "mozilla-ai"

Expand Down
41 changes: 1 addition & 40 deletions examples/notebooks/dataset_preprocessing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
"id": "5cea9f8f-7279-44ac-947c-1d79f6bf6ebc",
"metadata": {},
"source": [
"(3a) Log the dataset directory as an reference artifact using W&B directly"
"(3) Log the dataset directory as an reference artifact using W&B directly"
]
},
{
Expand All @@ -107,45 +107,6 @@
" artifact.add_reference(uri=f\"file://{dataset_save_path}\")\n",
" wandb.log_artifact(artifact)"
]
},
{
"cell_type": "markdown",
"id": "c5ab6772",
"metadata": {},
"source": [
"(3b) Log the dataset directory as an artifact using LM Buddy helper functions"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8b09e47d-3ced-4eef-a89f-048754edc758",
"metadata": {},
"outputs": [],
"source": [
"from lm_buddy.integrations.wandb import (\n",
" ArtifactType,\n",
" WandbRunConfig,\n",
" build_directory_artifact,\n",
" wandb_init_from_config,\n",
")\n",
"from lm_buddy.jobs.utils import LMBuddyJobType\n",
"\n",
"run_config = WandbRunConfig(\n",
" name=\"lm-buddy-preprocessing-example\",\n",
" project=\"lm-buddy-examples\",\n",
" entity=\"mozilla-ai\",\n",
")\n",
"\n",
"with wandb_init_from_config(run_config, job_type=LMBuddyJobType.PREPROCESSING):\n",
" artifact = build_directory_artifact(\n",
" dir_path=dataset_save_path,\n",
" artifact_name=\"example-dataset-artfact-reference\",\n",
" artifact_type=ArtifactType.DATASET,\n",
" reference=True,\n",
" )\n",
" wandb.log_artifact(artifact)"
]
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "lm-buddy"
version = "0.7.0"
version = "0.8.0"
authors = [
{ name = "Sean Friedowitz", email = "[email protected]" },
{ name = "Aaron Gonzales", email = "[email protected]" },
Expand Down
54 changes: 43 additions & 11 deletions src/lm_buddy/buddy.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
from lm_buddy.integrations.wandb import ArtifactLoader, WandbArtifactLoader
import wandb

from lm_buddy.integrations.wandb import WandbResumeMode
from lm_buddy.jobs._entrypoints import run_finetuning, run_lm_harness, run_prometheus, run_ragas
from lm_buddy.jobs.common import EvaluationResult, FinetuningResult
from lm_buddy.jobs.common import EvaluationResult, FinetuningResult, LMBuddyJobType
from lm_buddy.jobs.configs import (
EvaluationJobConfig,
FinetuningJobConfig,
LMBuddyJobConfig,
LMHarnessJobConfig,
PrometheusJobConfig,
RagasJobConfig,
)
from lm_buddy.paths import strip_path_prefix


class LMBuddy:
Expand All @@ -16,25 +20,53 @@ class LMBuddy:
Simple wrapper around executable functions for tasks available in the library.
"""

def __init__(self, artifact_loader: ArtifactLoader = WandbArtifactLoader()):
self._artifact_loader = artifact_loader
# TODO: Store some configuration (e.g., tracking info, name) globally on the buddy
def __init__(self):
pass

def _generate_artifact_lineage(
self,
sfriedowitz marked this conversation as resolved.
Show resolved Hide resolved
config: LMBuddyJobConfig,
results: list[wandb.Artifact],
job_type: LMBuddyJobType,
) -> None:
"""Link input artifacts and log output artifacts to a run.

A no-op if no tracking config is available.
"""
if config.tracking is not None:
with wandb.init(
name=config.name,
job_type=job_type,
resume=WandbResumeMode.ALLOW,
**config.tracking.model_dump(),
) as run:
for path in config.artifact_paths():
artifact_name = strip_path_prefix(path)
run.use_artifact(artifact_name)
for artifact in results:
artifact = run.log_artifact(artifact)
artifact.wait()

def finetune(self, config: FinetuningJobConfig) -> FinetuningResult:
"""Run a supervised finetuning task with the provided configuration."""
finetuning_result = run_finetuning(config, self._artifact_loader)
return finetuning_result
"""Run a supervised finetuning job with the provided configuration."""
result = run_finetuning(config)
self._generate_artifact_lineage(config, result.artifacts, LMBuddyJobType.FINETUNING)
return result

def evaluate(self, config: EvaluationJobConfig) -> EvaluationResult:
"""Run an evaluation task with the provided configuration.
"""Run an evaluation job with the provided configuration.

The underlying evaluation framework is determined by the configuration type.
"""
match config:
case LMHarnessJobConfig() as lm_harness_config:
return run_lm_harness(lm_harness_config, self._artifact_loader)
result = run_lm_harness(lm_harness_config)
case PrometheusJobConfig() as prometheus_config:
return run_prometheus(prometheus_config, self._artifact_loader)
result = run_prometheus(prometheus_config)
case RagasJobConfig() as ragas_config:
return run_ragas(ragas_config, self._artifact_loader)
result = run_ragas(ragas_config)
case _:
                raise ValueError(f"Invalid configuration for evaluation: {type(config)}")
self._generate_artifact_lineage(config, result.artifacts, LMBuddyJobType.EVALUATION)
return result
62 changes: 31 additions & 31 deletions src/lm_buddy/integrations/huggingface/asset_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,45 +19,19 @@
DatasetConfig,
QuantizationConfig,
)
from lm_buddy.integrations.wandb import ArtifactLoader, get_artifact_directory
from lm_buddy.integrations.wandb import get_artifact_directory, get_artifact_from_api
from lm_buddy.paths import AssetPath, PathPrefix, strip_path_prefix


def resolve_peft_and_pretrained(path: str) -> tuple[str, str | None]:
"""Helper method for determining if a path corresponds to a PEFT model.

A PEFT model contains an `adapter_config.json` in its directory.
    If this file can be loaded, we know the path is for a PEFT model.
If not, we assume the provided path corresponds to a base HF model.

Args:
path (str): Name/path to a HuggingFace directory

Returns:
Tuple of (base model path, optional PEFT path)
"""
# We don't know if the checkpoint is adapter weights or merged model weights
# Try to load as an adapter and fall back to the checkpoint containing the full model
try:
peft_config = PeftConfig.from_pretrained(path)
return peft_config.base_model_name_or_path, path
except ValueError as e:
warnings.warn(
f"Unable to load model as adapter: {e}. "
"This is expected if the checkpoint does not contain adapter weights."
)
return path, None


class HuggingFaceAssetLoader:
"""Helper class for loading HuggingFace assets from LM Buddy configurations.

This class depends on an `ArtifactLoader` in order to resolve actual paths from
artifact references.
"""

def __init__(self, artifact_loader: ArtifactLoader):
self._artifact_loader = artifact_loader
TODO: We can probably move these to standalone functions now that ArtifactLoader is gone.
What if we add other deps (e.g, S3 client in the future?)
"""

def resolve_asset_path(self, path: AssetPath) -> str:
"""Resolve an `AssetPath` to a loadable string path.
Expand All @@ -69,11 +43,37 @@ def resolve_asset_path(self, path: AssetPath) -> str:
if path.startswith((PathPrefix.FILE, PathPrefix.HUGGINGFACE)):
return raw_path
elif path.startswith(PathPrefix.WANDB):
artifact = self._artifact_loader.use_artifact(raw_path)
artifact = get_artifact_from_api(raw_path)
return str(get_artifact_directory(artifact))
else:
raise ValueError(f"Unable to resolve asset path from {path}.")

def resolve_peft_and_pretrained(self, path: AssetPath) -> tuple[str, str | None]:
"""Helper method for determining if a path corresponds to a PEFT model.

A PEFT model contains an `adapter_config.json` in its directory.
        If this file can be loaded, we know the path is for a PEFT model.
If not, we assume the provided path corresponds to a base HF model.

Args:
path (AssetPath): Path for the asset with its `PathPrefix` present

Returns:
Tuple of (base model path, optional PEFT path)
"""
# We don't know if the checkpoint is adapter weights or merged model weights
# Try to load as an adapter and fall back to the checkpoint containing the full model
resolved_path = self.resolve_asset_path(path)
try:
peft_config = PeftConfig.from_pretrained(resolved_path)
return peft_config.base_model_name_or_path, resolved_path
except ValueError as e:
warnings.warn(
f"Unable to load model as adapter: {e}. "
"This is expected if the checkpoint does not contain adapter weights."
)
return resolved_path, None

def load_pretrained_config(
self,
config: AutoModelConfig,
Expand Down
1 change: 0 additions & 1 deletion src/lm_buddy/integrations/wandb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from lm_buddy.integrations.wandb.artifact_loader import *
from lm_buddy.integrations.wandb.artifact_utils import *
from lm_buddy.integrations.wandb.run_config import *
from lm_buddy.integrations.wandb.run_utils import *
47 changes: 0 additions & 47 deletions src/lm_buddy/integrations/wandb/artifact_loader.py

This file was deleted.

16 changes: 13 additions & 3 deletions src/lm_buddy/integrations/wandb/artifact_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,19 @@ class ArtifactType(str, Enum):
EVALUATION = "evaluation"


def default_artifact_name(name: str, artifact_type: ArtifactType) -> str:
"""A default name for an artifact based on the run name and type."""
return f"{name}-{artifact_type}"
def default_artifact_name(job_name: str, artifact_type: ArtifactType) -> str:
"""A default name for an artifact based on the job name and type."""
return f"{job_name}-{artifact_type}"


def get_artifact_from_api(artifact_name: str) -> wandb.Artifact:
"""Retrieve an artifact by fully qualified name from the W&B API.

This does not handle linking the artifact to an active run.
For that, use `run.use_artifact(artifact_name)`.
"""
api = wandb.Api()
return api.artifact(artifact_name)


def get_artifact_directory(
Expand Down
Loading
Loading