This repository has been archived by the owner on Sep 24, 2024. It is now read-only.

RD2024-131: Job runner interface #59

Merged 27 commits on Feb 20, 2024

Commits
c6272c9 — sketching out generic run method (Feb 14, 2024)
9feab7a — move run method (Feb 15, 2024)
bf8268d — move yaml methods on job config (Feb 16, 2024)
7952469 — change tests to use runner method (Feb 16, 2024)
89693c5 — Merge remote-tracking branch 'origin/main' into RD2024-124/job-runner… (Feb 16, 2024)
cc2db0e — backup (Feb 16, 2024)
74b6b49 — remove pass (Feb 16, 2024)
3cd12e4 — remove kwarg (Feb 16, 2024)
0588d70 — version bump (Feb 16, 2024)
cb5ab5e — Merge pull request #54 from mozilla-ai/RD2024-124/job-runner-interface (Feb 16, 2024)
db72530 — Merge remote-tracking branch 'origin/dev/RD2024-131/generic-run-inter… (Feb 16, 2024)
80f1f0d — update test folder layout (Feb 16, 2024)
06924cc — remove return (Feb 16, 2024)
89ba049 — rename file (Feb 16, 2024)
e754f49 — shield entrypoints folder (Feb 16, 2024)
046b882 — use all instead of star import (Feb 16, 2024)
99fae28 — Merge pull request #55 from mozilla-ai/RD2024-132/reorganize-jobs (Feb 16, 2024)
5de1c43 — Merge remote-tracking branch 'origin/main' into dev/RD2024-131/generi… (Feb 16, 2024)
4d74677 — working on new example notevbook (Feb 16, 2024)
4bd5cba — update example notebook (Feb 16, 2024)
bcf7ef6 — fix notebook headers (Feb 20, 2024)
061dbfa — rename noteboook (Feb 20, 2024)
d11b889 — renmae (Feb 20, 2024)
fcc1a83 — Merge pull request #57 from mozilla-ai/RD2024-133/example-notebooks (Feb 20, 2024)
48e2c3a — Merge remote-tracking branch 'origin/main' into dev/RD2024-131/generi… (Feb 20, 2024)
cc679a4 — Merge remote-tracking branch 'origin/dev/RD2024-131/generic-run-inter… (Feb 20, 2024)
e71d221 — Merge remote-tracking branch 'origin/main' into dev/RD2024-131/generi… (Feb 20, 2024)
13 changes: 5 additions & 8 deletions CONTRIBUTING.md
@@ -22,18 +22,14 @@ If you have an active Conda environment, Poetry should recognize it during installation
and install the package dependencies there.
This hasn't been explicitly tested with other virtual python environments, but will likely work.

Alternatively, you can use poetry's own environment by running
Alternatively, you can use Poetry's own environment by running
```
poetry lock
poetry env use python3.10
poetry install
```
where `python3.10` is your python interpreter.

The `pyproject.toml` file defines dependency groups for the logical job types in the package.
Individual dependency groups can be installed by running
`poetry install --with <group1>,<group2>` or `poetry install --only <group>`.

## Code style

This repository uses [Ruff](https://docs.astral.sh/ruff/) for Python formatting and linting.
@@ -53,10 +49,11 @@ local development branch of the `lm-buddy` repo.

To do so, follow the steps:

1. Export a copy of the package dependencies by running the following command, which will create a `requirements.txt` file in the `lm-buddy` repository. This will contain the dependencies for the `finetuning` and `evaluation` job groups:
1. Export a copy of the package dependencies by running the following command, which will create a `requirements.txt` file in the `lm-buddy` repository.
This will contain all non-development dependencies for the package:

```
poetry export --without-hashes --with finetuning,evaluation -o requirements.txt
poetry export --without-hashes -o requirements.txt
```

2. When submitting a job to a Ray cluster, specify in the Ray runtime environment the following:
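
The runtime-environment fields for step 2 are elided in this diff, so the following is only a hedged sketch of the general shape: a `runtime_env` mapping that uploads the local working directory and installs the exported `requirements.txt`. The `make_runtime_env` helper and its defaults are illustrative, not part of the repository.

```python
# Hypothetical sketch of step 2 (the exact fields are not shown in this diff).
# It builds a runtime_env mapping of the kind accepted by Ray's job submission API.

def make_runtime_env(working_dir: str = ".", requirements: str = "requirements.txt") -> dict:
    """Build a runtime_env mapping for a Ray job submission."""
    return {
        "working_dir": working_dir,  # local checkout uploaded to the cluster
        "pip": requirements,         # dependencies exported by `poetry export`
    }

runtime_env = make_runtime_env()
print(runtime_env)

# The submission itself requires a running Ray cluster and would look roughly like:
#   from ray.job_submission import JobSubmissionClient
#   client = JobSubmissionClient("http://127.0.0.1:8265")
#   client.submit_job(entrypoint="python -m lm_buddy run ...", runtime_env=runtime_env)
```

The dictionary form mirrors what the example notebooks in this PR pass as `runtime_env=` when calling `submit_job`.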
@@ -109,7 +106,7 @@ poetry publish --repository testpypi --dry-run --build
poetry publish --repository testpypi --build
```

### Publish to PyPi
### Publish to PyPI

When you're ready, run:

@@ -5,12 +5,33 @@
"id": "123e34e9-70f8-42ab-b790-b59ddc01b1f3",
"metadata": {},
"source": [
"# Notebook Development Workflow"
"# Development Ray submission"
]
},
{
"cell_type": "markdown",
"id": "fcd5240e",
"id": "8a4fc01e",
"metadata": {},
"source": [
"Generally, `lm-buddy` is installed as a pip requirement in the runtime environment of the Ray job.\n",
"During development, however, it can be helpful to execute a job from a local branch \n",
"that has not been published to PyPI.\n",
"\n",
"This example notebook shows how to bypass the pip requirements section of the Ray runtime environment\n",
"and instead upload a local copy of the `lm_buddy` Python module directly to Ray."
]
},
{
"cell_type": "markdown",
"id": "5518ab35",
"metadata": {},
"source": [
"## File-based submission"
]
},
{
"cell_type": "markdown",
"id": "ae7c26d9",
"metadata": {},
"source": [
"This demonstrates the basic workflow for submitting an LM Buddy job to Ray\n",
@@ -22,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "fd3e4db3-829b-495f-9864-7567bd2ac0ce",
"metadata": {},
"outputs": [],
@@ -91,17 +112,15 @@
"client.submit_job(\n",
" entrypoint=f\"python -m lm_buddy run simple --config simple_config.yaml\",\n",
" runtime_env=runtime_env,\n",
")\n"
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f7ccdfd-0b09-47e5-b670-45c614dd4bd8",
"cell_type": "markdown",
"id": "ff88c2f6",
"metadata": {},
"outputs": [],
"source": [
"# Iterative Submission with Temp Config File"
"## Iterative submission with temporary config files"
]
},
{
@@ -213,7 +232,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
"version": "3.10.13"
}
},
"nbformat": 4,
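The development notebook above bypasses the pip-requirements route by uploading the local `lm_buddy` module directly to Ray. Ray's runtime environment supports this via the `py_modules` field; the sketch below is an assumption about the general pattern, not the notebook's exact code, and the `dev_runtime_env` helper and module path are illustrative.

```python
# Sketch (not the notebook's exact code): Ray can upload a local Python
# module to the cluster via the `py_modules` runtime-environment field,
# avoiding the need for a published lm-buddy release during development.
from pathlib import Path

def dev_runtime_env(module_path: str) -> dict:
    """Runtime env that ships a local module instead of a pip package."""
    path = Path(module_path)
    if not path.exists():
        raise FileNotFoundError(f"module not found: {path}")
    return {"py_modules": [str(path)]}

# Usage against a local checkout (path is illustrative):
#   env = dev_runtime_env("src/lm_buddy")
#   client.submit_job(entrypoint="python -m lm_buddy run ...", runtime_env=env)
```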
207 changes: 207 additions & 0 deletions examples/notebooks/direct_job_execution.ipynb
@@ -0,0 +1,207 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Direct job execution"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook illustrates how to use LM Buddy as a library to run jobs directly on the host machine.\n",
"\n",
"Jobs are fully specified by a `lm_buddy.jobs.configs.LMBuddyJobConfig` \n",
"and are executed with the `lm_buddy.run_job` method.\n",
"\n",
"**Warning**: This workflow is still considered experimental.\n",
"Some jobs depend on external services (e.g., W&B, Ray cluster) and host-machine GPU resources,\n",
"and may not work without a properly configured local environment."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import lm_buddy\n",
"from lm_buddy.jobs.configs import (\n",
" FinetuningJobConfig,\n",
" FinetuningRayConfig,\n",
" LMHarnessJobConfig,\n",
" LMHarnessEvaluatorConfig,\n",
")\n",
"from lm_buddy.integrations.huggingface import (\n",
" HuggingFaceRepoConfig,\n",
" AutoModelConfig,\n",
" TextDatasetConfig,\n",
" TrainerConfig,\n",
" AdapterConfig,\n",
")\n",
"from lm_buddy.integrations.wandb import WandbRunConfig"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Finetuning"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Base model to finetune from HuggingFace\n",
"model_config = AutoModelConfig(\n",
" load_from=HuggingFaceRepoConfig(repo_id=\"distilgpt2\"),\n",
")\n",
"\n",
"# Text dataset for finetuning\n",
"dataset_config = TextDatasetConfig(\n",
" load_from=HuggingFaceRepoConfig(repo_id=\"imdb\"),\n",
" split=\"train[:100]\",\n",
" text_field=\"text\",\n",
")\n",
"\n",
"# HuggingFace trainer arguments\n",
"trainer_config = TrainerConfig(\n",
" max_seq_length=256,\n",
" per_device_train_batch_size=8,\n",
" learning_rate=1e-4,\n",
" num_train_epochs=1,\n",
" logging_strategy=\"steps\",\n",
" logging_steps=1,\n",
" save_strategy=\"epoch\",\n",
" save_steps=1,\n",
")\n",
"\n",
"# LORA adapter settings\n",
"adapter_config = AdapterConfig(\n",
" peft_type=\"LORA\",\n",
" task_type=\"CAUSAL_LM\",\n",
" r=8,\n",
" lora_alpha=16,\n",
" lora_dropout=0.2,\n",
")\n",
"\n",
"# Define tracking for finetuning run\n",
"tracking_config = WandbRunConfig(\n",
" name=\"example-finetuning\",\n",
" project=\"lm-buddy-examples\", # Update to your project name\n",
" entity=\"mozilla-ai\", # Update to your entity name\n",
")\n",
"\n",
"# Ray train settings\n",
"ray_config = FinetuningRayConfig(\n",
" use_gpu=False, # Change to True if GPUs are available on your machine\n",
" num_workers=2,\n",
")\n",
"\n",
"# Full finetuning config\n",
"finetuning_config = FinetuningJobConfig(\n",
" model=model_config,\n",
" dataset=dataset_config,\n",
" trainer=trainer_config,\n",
" adapter=adapter_config,\n",
" tracking=tracking_config,\n",
" ray=ray_config,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run the job\n",
"lm_buddy.run_job(finetuning_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Define the model to be evaluated\n",
"# In this case, loading directly a pretrained model from HuggingFace\n",
"model_config = AutoModelConfig(\n",
" load_from=HuggingFaceRepoConfig(repo_id=\"distilgpt2\"),\n",
")\n",
"\n",
"# Define evaluation tasks and settings\n",
"evaluator_config = LMHarnessEvaluatorConfig(\n",
" tasks=[\"hellaswag\"],\n",
" limit=10, # Only run 10 samples per task. Remove for a real run.\n",
" num_fewshot=5,\n",
")\n",
"\n",
"# Define tracking for eval run\n",
"tracking_config = WandbRunConfig(\n",
" name=\"example-lm-harness\",\n",
" project=\"lm-buddy-examples\", # Update to your project name\n",
" entity=\"mozilla-ai\", # Update to your entity name\n",
")\n",
"\n",
"# Full lm-harness job config\n",
"lm_harness_config = LMHarnessJobConfig(\n",
" model=model_config,\n",
" evaluator=evaluator_config,\n",
" tracking=tracking_config,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run the job\n",
"lm_buddy.run_job(lm_harness_config)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "lm-buddy",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
27 changes: 14 additions & 13 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "lm-buddy"
version = "0.1.0rc12"
version = "0.2.0"
description = "Ray-centric library for finetuning and evaluation of (large) language models."
repository = "https://github.com/mozilla-ai/lm-buddy"
readme = "README.md"
@@ -17,6 +17,7 @@ license = "Apache-2.0"
packages = [{ include = "lm_buddy", from = "src" }]

[tool.poetry.dependencies]
# Core
python = ">=3.10,<3.11"
click = "8.1.7"
torch = "2.1.2"
@@ -28,6 +29,18 @@ pydantic = "2.6.0"
pydantic-yaml = "1.2.0"
ray = { version = "2.9.1", extras = ["default"] }

# HuggingFace
datasets = "2.16.1"
transformers = "4.36.2"
accelerate = "0.26.1"
peft = "0.7.1"
trl = "0.7.10"
bitsandbytes = "0.42.0"

# Evaluation frameworks
lm-eval = { version = "0.4.1", extras = ["openai"] }
einops = "0.7.0"

[tool.poetry.dev-dependencies]
ruff = "0.2.1"
pytest = "7.4.3"
@@ -39,18 +52,6 @@ nbsphinx = "0.9.3"
myst-parser = "2.0.0"
recommonmark = "^0.7.1"

[tool.poetry.group.finetuning.dependencies]
datasets = "2.16.1"
transformers = "4.36.2"
accelerate = "0.26.1"
peft = "0.7.1"
trl = "0.7.10"
bitsandbytes = "0.42.0"

[tool.poetry.group.evaluation.dependencies]
lm-eval = { version = "0.4.1", extras = ["openai"] }
einops = "0.7.0"

[tool.poetry.scripts]
lm_buddy = "lm_buddy.cli:cli"

3 changes: 3 additions & 0 deletions src/lm_buddy/__init__.py
@@ -0,0 +1,3 @@
from lm_buddy.jobs import run_job

__all__ = ["run_job"]