Merge pull request #7 from mozilla-ai/RD2024-18/dataset-artifact-helpers

RD2024-18: Helper for creating dataset artifacts from a directory
mozilla-ai · Jan 23, 2024 · e34c304 · e34c304
2 parents 53b4a1e + f086139
commit e34c304
Show file tree

Hide file tree

Showing 12 changed files with 449 additions and 207 deletions.
diff --git a/examples/dataset_preprocessing.ipynb b/examples/dataset_preprocessing.ipynb
@@ -0,0 +1,180 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f9fd719d-abf4-4ea5-9a8a-637727bf6d48",
+   "metadata": {},
+   "source": [
+    "This notebook shows a simple example of how to use some of the `flamingo` utilities to pre-process a dataset\n",
+    "and upload it as a W&B artifact.\n",
+    "\n",
+    "Generally, this workflow will be performed in a dev environment on a cluster so that the dataset files\n",
+    "can be saved on a shared volume.\n",
+    "But this notebook can be run locally for educational purposes to illustrate the basic functions."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "03586a6c-6606-47a3-8947-9b09adfb1da4",
+   "metadata": {},
+   "source": [
+    "(1) Load and pre-process the base dataset from HuggingFace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7613c7ff-ef5a-430b-854a-b34c31547df0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "base_dataset = \"fka/awesome-chatgpt-prompts\"\n",
+    "dataset = load_dataset(base_dataset, split=\"train\")\n",
+    "\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b8b47047-ba99-4c70-9e6f-5967a89adca0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def preprocess_dataset(examples):\n",
+    "    texts = []\n",
+    "    for x in examples[\"prompt\"]:\n",
+    "        texts.append(x[::-1])  # Dummy transformation: reverse the prompt\n",
+    "    examples[\"text\"] = texts\n",
+    "    return examples\n",
+    "\n",
+    "\n",
+    "# Map some preprocessing function over the base dataset (e.g., for prompt formatting)\n",
+    "dataset = dataset.map(preprocess_dataset, batched=True, remove_columns=dataset.column_names)\n",
+    "\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8ebbe99e-1994-4b3d-86ca-7863fa529618",
+   "metadata": {},
+   "source": [
+    "(2) Save the dataset to disk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e74a92c7-49c2-4736-bc28-c843a403e650",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "# Replace this with the actual path where the data should live on shared storage\n",
+    "dataset_save_path = str(Path(\"example_dataset\").absolute())\n",
+    "\n",
+    "dataset.save_to_disk(dataset_save_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5cea9f8f-7279-44ac-947c-1d79f6bf6ebc",
+   "metadata": {},
+   "source": [
+    "(3a) Log the dataset directory as a reference artifact using W&B directly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "924f7673",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import wandb\n",
+    "\n",
+    "with wandb.init(\n",
+    "    name=\"flamingo-preprocessing-example\",\n",
+    "    project=\"example-project\",\n",
+    "    entity=\"mozilla-ai\",\n",
+    "    job_type=\"preprocessing\",\n",
+    "):\n",
+    "    artifact = wandb.Artifact(name=\"example-dataset-reference\", type=\"dataset\")\n",
+    "    artifact.add_reference(uri=f\"file://{dataset_save_path}\")\n",
+    "    wandb.log_artifact(artifact)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5ab6772",
+   "metadata": {},
+   "source": [
+    "(3b) Log the dataset directory as an artifact using flamingo helper functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "8b09e47d-3ced-4eef-a89f-048754edc758",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from flamingo.integrations.wandb import (\n",
+    "    ArtifactType,\n",
+    "    ArtifactURIScheme,\n",
+    "    WandbRunConfig,\n",
+    "    log_directory_contents,\n",
+    "    log_directory_reference,\n",
+    "    wandb_init_from_config,\n",
+    ")\n",
+    "from flamingo.jobs.utils import FlamingoJobType\n",
+    "\n",
+    "run_config = WandbRunConfig(\n",
+    "    name=\"flamingo-preprocessing-example\",\n",
+    "    project=\"example-project\",\n",
+    "    entity=\"mozilla-ai\",\n",
+    ")\n",
+    "\n",
+    "with wandb_init_from_config(run_config, job_type=FlamingoJobType.PREPROCESSING):\n",
+    "    # Log a reference to the directory contents\n",
+    "    log_directory_reference(\n",
+    "        dir_path=dataset_save_path,\n",
+    "        artifact_name=\"example-dataset-artfact-reference\",\n",
+    "        artifact_type=ArtifactType.DATASET,\n",
+    "        scheme=ArtifactURIScheme.FILE,\n",
+    "    )\n",
+    "    # Log and upload the directory contents\n",
+    "    log_directory_contents(\n",
+    "        dir_path=dataset_save_path,\n",
+    "        artifact_name=\"example-dataset-artfact-upload\",\n",
+    "        artifact_type=ArtifactType.DATASET,\n",
+    "    )"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/dev_workflow.ipynb b/examples/dev_workflow.ipynb
@@ -10,14 +10,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "id": "8c0f15ed-77dc-44ce-adb6-d1b59368f03c",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Required imports\n",
     "import os\n",
     "from pathlib import Path\n",
+    "\n",
     "from ray.job_submission import JobSubmissionClient\n",
     "\n",
     "# flamingo should be installed in your development environment\n",
@@ -26,19 +27,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "969884e5-d815-42d9-9d4e-3b8f890657e2",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Create a submission client bound to a Ray cluster\n",
     "# Note: You will likely have to update the cluster address shown below\n",
-    "client = JobSubmissionClient(f\"http://10.146.174.91:8265\")"
+    "client = JobSubmissionClient(\"http://10.146.174.91:8265\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "id": "3258bb97-d3c6-4fee-aa0c-962c1411eaa7",
    "metadata": {},
    "outputs": [],
@@ -49,21 +50,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "id": "1db3b9aa-99a4-49d9-8773-7b91ccf89c85",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "SimpleJobConfig(magic_number=42)"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Load and inspect the config file\n",
     "# Not mandatory for job submission, but helpful when debugging\n",
@@ -78,7 +68,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
    "id": "b81b36be-35ce-4398-a6d4-ac1f719f5c95",
    "metadata": {},
    "outputs": [],
@@ -88,53 +78,25 @@
     "# pip contains an export of the dependencies for the flamingo package (see CONTRIBUTING.md for how to generate)\n",
     "runtime_env = {\n",
     "    \"working_dir\": str(CONFIG_DIR),\n",
-    "    \"env_vars\": {\"WANDB_API_KEY\": os.environ[\"WANDB_API_KEY\"]}, # If running a job that uses W&B\n",
+    "    \"env_vars\": {\"WANDB_API_KEY\": os.environ[\"WANDB_API_KEY\"]},  # If running a job that uses W&B\n",
     "    \"py_modules\": [str(flamingo_module)],\n",
-    "    \"pip\": \"/path/to/flamingo/requirements.txt\"\n",
+    "    \"pip\": \"/path/to/flamingo/requirements.txt\",\n",
     "}"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "id": "4bd300f9-b863-4413-bd3a-430601656816",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-01-20 15:32:25,620\tINFO dashboard_sdk.py:385 -- Package gcs://_ray_pkg_ba0036a72fdb32af.zip already exists, skipping upload.\n",
-      "2024-01-20 15:32:25,814\tINFO dashboard_sdk.py:385 -- Package gcs://_ray_pkg_8f96eb40a239b233.zip already exists, skipping upload.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'raysubmit_tWfixDMGHavrhHPF'"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Submit the job to the Ray cluster\n",
     "# Note: flamingo is invoked by 'python -m flamingo' since the CLI is not installed in the environment\n",
     "client.submit_job(\n",
-    "    entrypoint=f\"python -m flamingo run simple --config {CONFIG_FILE}\",\n",
-    "    runtime_env=runtime_env\n",
+    "    entrypoint=f\"python -m flamingo run simple --config {CONFIG_FILE}\", runtime_env=runtime_env\n",
     ")"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3c82892d-bcdf-42e6-b95e-2393e01ab7d6",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

diff --git a/pyproject.toml b/pyproject.toml
@@ -49,6 +49,7 @@ testpaths = ["tests"]
 
 [tool.ruff]
 target-version = "py310"
+
 exclude = [
     ".bzr",
     ".direnv",
@@ -72,8 +73,16 @@ exclude = [
     "node_modules",
     "venv",
 ]
+
+extend-include = ["*.ipynb"]
+
 line-length = 100
 
+[tool.ruff.extend-per-file-ignores]
+"__init__.py" = [
+    "F401", # import unused
+    "F403", # star import used; unable to detect undefined names
+]
 
 [tool.ruff.lint]
 select = [
@@ -98,6 +107,9 @@ ignore = [
     "N805", # first param needs to be self; pydantic breaks this sometimes
 ]
 
+# Only format Jupyter notebooks, but don't lint them
+exclude = ["*.ipynb"]
+
 # Avoid trying to fix some violations
 unfixable = ["B", "SIM", "TRY", "RUF"]
 

diff --git a/src/flamingo/integrations/wandb/__init__.py b/src/flamingo/integrations/wandb/__init__.py
@@ -1,9 +1,5 @@
+# ruff: noqa: I001
 from flamingo.integrations.wandb.artifact_config import WandbArtifactConfig
-from flamingo.integrations.wandb.artifact_type import ArtifactType
 from flamingo.integrations.wandb.run_config import WandbRunConfig
-
-__all__ = [
-    "ArtifactType",
-    "WandbArtifactConfig",
-    "WandbRunConfig",
-]
+from flamingo.integrations.wandb.artifact_utils import *
+from flamingo.integrations.wandb.run_utils import *
diff --git a/src/flamingo/integrations/wandb/artifact_type.py b/src/flamingo/integrations/wandb/artifact_type.py