Skip to content
This repository has been archived by the owner on Sep 24, 2024. It is now read-only.

Centralize artifact lineage creation on LMBuddy class #92

Merged
merged 9 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/configs/evaluation/lm_harness_hf_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "lm-buddy-lm-harness"
sfriedowitz marked this conversation as resolved.
Show resolved Hide resolved

# Model to evaluate
model:
path: "hf://distilgpt2"
Expand All @@ -15,6 +17,5 @@ quantization:

# Tracking info for where to log the run results
tracking:
name: "lm-buddy-lm-harness"
project: "lm-buddy-examples"
entity: "sample"
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "lm-buddy-lm-harness-inference"

# Model to evaluate specified as a local-chat-completions inference server
model:
inference:
Expand All @@ -16,6 +18,5 @@ evaluation:
limit: 10

tracking:
name: "lm-buddy-lm-harness-inference"
project: "lm-buddy-examples"
entity: "sample"
3 changes: 2 additions & 1 deletion examples/configs/evaluation/prometheus_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "lm-buddy-prometheus-job"

dataset:
path: "wandb://sample-entity/lm-buddy-examples/wandb-file-artifact:latest"
# field containing scoring instructions in the json file
Expand Down Expand Up @@ -26,6 +28,5 @@ evaluation:
enable_tqdm: True

tracking:
name: "lm-buddy-prometheus"
project: "lm-buddy-examples"
entity: "sample"
3 changes: 2 additions & 1 deletion examples/configs/evaluation/ragas_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "lm-buddy-ragas"

dataset:
path: "wandb://sample-entity/lm-buddy-examples/wandb-file-artifact:latest"
# field containing scoring instructions in the json file
Expand All @@ -21,6 +23,5 @@ evaluation:
embedding_model: "sentence-transformers/all-mpnet-base-v2"

tracking:
name: "lm-buddy-ragas"
project: "lm-buddy-examples"
entity: "sample"
3 changes: 2 additions & 1 deletion examples/configs/finetuning/finetuning_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
name: "lm-buddy-finetuning"

# Base model to load for finetuning
model:
path: "hf://distilgpt2"
Expand Down Expand Up @@ -36,7 +38,6 @@ adapter:

# Tracking info for where to log the run results
tracking:
name: "lm-buddy-finetuning"
project: "lm-buddy-examples"
entity: "mozilla-ai"

Expand Down
41 changes: 1 addition & 40 deletions examples/notebooks/dataset_preprocessing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@
"id": "5cea9f8f-7279-44ac-947c-1d79f6bf6ebc",
"metadata": {},
"source": [
"(3a) Log the dataset directory as a reference artifact using W&B directly"
"(3) Log the dataset directory as a reference artifact using W&B directly"
]
},
{
Expand All @@ -107,45 +107,6 @@
" artifact.add_reference(uri=f\"file://{dataset_save_path}\")\n",
" wandb.log_artifact(artifact)"
]
},
{
"cell_type": "markdown",
"id": "c5ab6772",
"metadata": {},
"source": [
"(3b) Log the dataset directory as an artifact using LM Buddy helper functions"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8b09e47d-3ced-4eef-a89f-048754edc758",
"metadata": {},
"outputs": [],
"source": [
"from lm_buddy.integrations.wandb import (\n",
" ArtifactType,\n",
" WandbRunConfig,\n",
" build_directory_artifact,\n",
" wandb_init_from_config,\n",
")\n",
"from lm_buddy.jobs.utils import LMBuddyJobType\n",
"\n",
"run_config = WandbRunConfig(\n",
" name=\"lm-buddy-preprocessing-example\",\n",
" project=\"lm-buddy-examples\",\n",
" entity=\"mozilla-ai\",\n",
")\n",
"\n",
"with wandb_init_from_config(run_config, job_type=LMBuddyJobType.PREPROCESSING):\n",
" artifact = build_directory_artifact(\n",
" dir_path=dataset_save_path,\n",
" artifact_name=\"example-dataset-artfact-reference\",\n",
" artifact_type=ArtifactType.DATASET,\n",
" reference=True,\n",
" )\n",
" wandb.log_artifact(artifact)"
]
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "lm-buddy"
version = "0.7.0"
version = "0.8.0"
authors = [
{ name = "Sean Friedowitz", email = "[email protected]" },
{ name = "Aaron Gonzales", email = "[email protected]" },
Expand Down
53 changes: 42 additions & 11 deletions src/lm_buddy/buddy.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
from lm_buddy.integrations.wandb import ArtifactLoader, WandbArtifactLoader
import wandb

from lm_buddy.integrations.wandb import WandbResumeMode
from lm_buddy.jobs._entrypoints import run_finetuning, run_lm_harness, run_prometheus, run_ragas
from lm_buddy.jobs.common import EvaluationResult, FinetuningResult
from lm_buddy.jobs.common import EvaluationResult, FinetuningResult, LMBuddyJobType
from lm_buddy.jobs.configs import (
EvaluationJobConfig,
FinetuningJobConfig,
LMBuddyJobConfig,
LMHarnessJobConfig,
PrometheusJobConfig,
RagasJobConfig,
)
from lm_buddy.paths import strip_path_prefix


class LMBuddy:
Expand All @@ -16,25 +20,52 @@ class LMBuddy:
Simple wrapper around executable functions for tasks available in the library.
"""

# TODO: Store some configuration (e.g., tracking info, name) globally on the buddy
def __init__(self):
    """Create an LMBuddy.

    Currently stateless: the former `artifact_loader` dependency was removed in
    favor of centralized lineage handling in `_generate_artifact_lineage`.
    """
    pass

def _generate_artifact_lineage(
    self,
    config: LMBuddyJobConfig,
    results: list[wandb.Artifact],
    job_type: LMBuddyJobType,
) -> None:
    """Record artifact lineage for a completed job on a W&B run.

    Input artifacts referenced by the config are linked via `run.use_artifact`,
    and the artifacts produced by the job are logged via `run.log_artifact`.
    Does nothing when the config carries no tracking information.

    Args:
        config: Job configuration providing the run name, tracking settings,
            and input artifact paths.
        results: Output artifacts produced by the job to log on the run.
        job_type: Category of job, recorded as the W&B run's job type.
    """
    if config.tracking is None:
        return
    run_kwargs = dict(
        name=config.name,
        job_type=job_type,
        resume=WandbResumeMode.ALLOW,
        **config.tracking.model_dump(),
    )
    with wandb.init(**run_kwargs) as run:
        for input_path in config.artifact_paths():
            run.use_artifact(strip_path_prefix(input_path))
        for output_artifact in results:
            run.log_artifact(output_artifact)

def finetune(self, config: FinetuningJobConfig) -> FinetuningResult:
    """Run a supervised finetuning job with the provided configuration.

    After the job completes, input/output artifact lineage is recorded on a
    W&B run (a no-op when the config has no tracking section).

    Args:
        config: Finetuning job configuration.

    Returns:
        The finetuning result, including any produced artifacts.
    """
    result = run_finetuning(config)
    self._generate_artifact_lineage(config, result.artifacts, LMBuddyJobType.FINETUNING)
    return result

def evaluate(self, config: EvaluationJobConfig) -> EvaluationResult:
"""Run an evaluation task with the provided configuration.
"""Run an evaluation job with the provided configuration.

The underlying evaluation framework is determined by the configuration type.
"""
match config:
case LMHarnessJobConfig() as lm_harness_config:
return run_lm_harness(lm_harness_config, self._artifact_loader)
result = run_lm_harness(lm_harness_config)
case PrometheusJobConfig() as prometheus_config:
return run_prometheus(prometheus_config, self._artifact_loader)
result = run_prometheus(prometheus_config)
case RagasJobConfig() as ragas_config:
return run_ragas(ragas_config, self._artifact_loader)
result = run_ragas(ragas_config)
case _:
raise ValueError(f"Invlid configuration for evaluation: {type(config)}")
self._generate_artifact_lineage(config, result.artifacts, LMBuddyJobType.EVALUATION)
return result
61 changes: 30 additions & 31 deletions src/lm_buddy/integrations/huggingface/asset_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,45 +19,19 @@
DatasetConfig,
QuantizationConfig,
)
from lm_buddy.integrations.wandb import ArtifactLoader, get_artifact_directory
from lm_buddy.integrations.wandb import get_artifact_directory, get_artifact_from_api
from lm_buddy.paths import AssetPath, PathPrefix, strip_path_prefix


def resolve_peft_and_pretrained(path: str) -> tuple[str, str | None]:
    """Helper method for determining if a path corresponds to a PEFT model.

    A PEFT model contains an `adapter_config.json` in its directory.
    If this file can be loaded, we know the path is for a PEFT model.
    If not, we assume the provided path corresponds to a base HF model.

    Args:
        path (str): Name/path to a HuggingFace directory

    Returns:
        Tuple of (base model path, optional PEFT path)
    """
    # We don't know if the checkpoint is adapter weights or merged model weights
    # Try to load as an adapter and fall back to the checkpoint containing the full model
    try:
        peft_config = PeftConfig.from_pretrained(path)
        return peft_config.base_model_name_or_path, path
    except ValueError as e:
        # Presumably PeftConfig raises ValueError when no adapter config exists
        # at `path` — the warning below treats this as the expected base-model case.
        warnings.warn(
            f"Unable to load model as adapter: {e}. "
            "This is expected if the checkpoint does not contain adapter weights."
        )
        return path, None


class HuggingFaceAssetLoader:
"""Helper class for loading HuggingFace assets from LM Buddy configurations.

This class depends on an `ArtifactLoader` in order to resolve actual paths from
artifact references.
"""

def __init__(self, artifact_loader: ArtifactLoader):
self._artifact_loader = artifact_loader
TODO: We can probably move these to standalone functions now that ArtifactLoader is gone.
What if we add other deps (e.g, S3 client in the future?)
"""

def resolve_asset_path(self, path: AssetPath) -> str:
"""Resolve an `AssetPath` to a loadable string path.
Expand All @@ -69,11 +43,36 @@ def resolve_asset_path(self, path: AssetPath) -> str:
if path.startswith((PathPrefix.FILE, PathPrefix.HUGGINGFACE)):
return raw_path
elif path.startswith(PathPrefix.WANDB):
artifact = self._artifact_loader.use_artifact(raw_path)
artifact = get_artifact_from_api(raw_path)
return str(get_artifact_directory(artifact))
else:
raise ValueError(f"Unable to resolve asset path from {path}.")

def resolve_peft_and_pretrained(self, path: str) -> tuple[str, str | None]:
    """Helper method for determining if a path corresponds to a PEFT model.

    A PEFT model contains an `adapter_config.json` in its directory.
    If this file can be loaded, we know the path is for a PEFT model.
    If not, we assume the provided path corresponds to a base HF model.

    Note: this method reads no instance state (`self` is unused).

    Args:
        path (str): Name/path to a HuggingFace directory

    Returns:
        Tuple of (base model path, optional PEFT path)
    """
    # We don't know if the checkpoint is adapter weights or merged model weights
    # Try to load as an adapter and fall back to the checkpoint containing the full model
    try:
        peft_config = PeftConfig.from_pretrained(path)
        return peft_config.base_model_name_or_path, path
    except ValueError as e:
        # Presumably PeftConfig raises ValueError when no adapter config exists
        # at `path` — the warning below treats this as the expected base-model case.
        warnings.warn(
            f"Unable to load model as adapter: {e}. "
            "This is expected if the checkpoint does not contain adapter weights."
        )
        return path, None

def load_pretrained_config(
self,
config: AutoModelConfig,
Expand Down
1 change: 0 additions & 1 deletion src/lm_buddy/integrations/wandb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from lm_buddy.integrations.wandb.artifact_loader import *
from lm_buddy.integrations.wandb.artifact_utils import *
from lm_buddy.integrations.wandb.run_config import *
from lm_buddy.integrations.wandb.run_utils import *
47 changes: 0 additions & 47 deletions src/lm_buddy/integrations/wandb/artifact_loader.py

This file was deleted.

16 changes: 13 additions & 3 deletions src/lm_buddy/integrations/wandb/artifact_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,19 @@ class ArtifactType(str, Enum):
EVALUATION = "evaluation"


def default_artifact_name(name: str, artifact_type: ArtifactType) -> str:
"""A default name for an artifact based on the run name and type."""
return f"{name}-{artifact_type}"
def default_artifact_name(job_name: str, artifact_type: ArtifactType) -> str:
"""A default name for an artifact based on the job name and type."""
return f"{job_name}-{artifact_type}"


def get_artifact_from_api(artifact_name: str) -> wandb.Artifact:
    """Fetch an artifact by fully qualified name via the W&B public API.

    This performs no run linking; to associate the artifact with an active run,
    use `run.use_artifact(artifact_name)` instead.
    """
    return wandb.Api().artifact(artifact_name)


def get_artifact_directory(
Expand Down
Loading
Loading