Skip to content
This repository has been archived by the owner on Sep 24, 2024. It is now read-only.

Commit

Permalink
Merge pull request #7 from mozilla-ai/RD2024-18/dataset-artifact-helpers
Browse files Browse the repository at this point in the history
RD2024-18: Helper for creating dataset artifacts from a directory
  • Loading branch information
Sean Friedowitz authored Jan 23, 2024
2 parents 53b4a1e + f086139 commit e34c304
Show file tree
Hide file tree
Showing 12 changed files with 449 additions and 207 deletions.
180 changes: 180 additions & 0 deletions examples/dataset_preprocessing.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f9fd719d-abf4-4ea5-9a8a-637727bf6d48",
"metadata": {},
"source": [
"This notebook shows a simple example of how to use some of the `flamingo` utilities to pre-process a dataset\n",
"and upload it as a W&B artifact.\n",
"\n",
"Generally, this workflow will be performed in a dev environment on cluster so that the dataset files\n",
"can be saved on a shared volume. \n",
"But this notebook can be run locally for educational purposes to illustrate the basic functions."
]
},
{
"cell_type": "markdown",
"id": "03586a6c-6606-47a3-8947-9b09adfb1da4",
"metadata": {},
"source": [
"(1) Load and pre-process the base dataset from HuggingFace"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7613c7ff-ef5a-430b-854a-b34c31547df0",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"base_dataset = \"fka/awesome-chatgpt-prompts\"\n",
"dataset = load_dataset(base_dataset, split=\"train\")\n",
"\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8b47047-ba99-4c70-9e6f-5967a89adca0",
"metadata": {},
"outputs": [],
"source": [
"def preprocess_dataset(examples):\n",
" texts = []\n",
" for x in examples[\"prompt\"]:\n",
" texts.append(x[::-1]) # Dummy reverse the prompt\n",
" examples[\"text\"] = texts\n",
" return examples\n",
"\n",
"\n",
"# Map some preprocessing function over the base dataset (e.g., for prompt formatting)\n",
"dataset = dataset.map(preprocess_dataset, batched=True, remove_columns=dataset.column_names)\n",
"\n",
"dataset"
]
},
{
"cell_type": "markdown",
"id": "8ebbe99e-1994-4b3d-86ca-7863fa529618",
"metadata": {},
"source": [
"(2) Save the dataset to disk"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e74a92c7-49c2-4736-bc28-c843a403e650",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"# Add an actual path here to where you want the data to live on shared storage\n",
"dataset_save_path = str(Path(\"example_dataset\").absolute())\n",
"\n",
"dataset.save_to_disk(dataset_save_path)"
]
},
{
"cell_type": "markdown",
"id": "5cea9f8f-7279-44ac-947c-1d79f6bf6ebc",
"metadata": {},
"source": [
"(3a) Log the dataset directory as an reference artifact using W&B directly"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "924f7673",
"metadata": {},
"outputs": [],
"source": [
"import wandb\n",
"\n",
"with wandb.init(\n",
" name=\"flamingo-preprocessing-example\",\n",
" project=\"example-project\",\n",
" entity=\"mozilla-ai\",\n",
" job_type=\"preprocessing\",\n",
"):\n",
" artifact = wandb.Artifact(name=\"example-dataset-reference\", type=\"dataset\")\n",
" artifact.add_reference(uri=f\"file://{dataset_save_path}\")\n",
" wandb.log_artifact(artifact)"
]
},
{
"cell_type": "markdown",
"id": "c5ab6772",
"metadata": {},
"source": [
"(3b) Log the dataset directory as an artifact using flamingo helper functions"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8b09e47d-3ced-4eef-a89f-048754edc758",
"metadata": {},
"outputs": [],
"source": [
"from flamingo.integrations.wandb import (\n",
" ArtifactType,\n",
" ArtifactURIScheme,\n",
" WandbRunConfig,\n",
" log_directory_contents,\n",
" log_directory_reference,\n",
" wandb_init_from_config,\n",
")\n",
"from flamingo.jobs.utils import FlamingoJobType\n",
"\n",
"run_config = WandbRunConfig(\n",
" name=\"flamingo-preprocessing-example\",\n",
" project=\"example-project\",\n",
" entity=\"mozilla-ai\",\n",
")\n",
"\n",
"with wandb_init_from_config(run_config, job_type=FlamingoJobType.PREPROCESSING):\n",
" # Log a reference to the directory contents\n",
" log_directory_reference(\n",
" dir_path=dataset_save_path,\n",
" artifact_name=\"example-dataset-artfact-reference\",\n",
" artifact_type=ArtifactType.DATASET,\n",
" scheme=ArtifactURIScheme.FILE,\n",
" )\n",
" # Log and upload the directory contents\n",
" log_directory_contents(\n",
" dir_path=dataset_save_path,\n",
" artifact_name=\"example-dataset-artfact-upload\",\n",
" artifact_type=ArtifactType.DATASET,\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
64 changes: 13 additions & 51 deletions examples/dev_workflow.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"id": "8c0f15ed-77dc-44ce-adb6-d1b59368f03c",
"metadata": {},
"outputs": [],
"source": [
"# Required imports\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"from ray.job_submission import JobSubmissionClient\n",
"\n",
"# flamingo should be installed in your development environment\n",
Expand All @@ -26,19 +27,19 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "969884e5-d815-42d9-9d4e-3b8f890657e2",
"metadata": {},
"outputs": [],
"source": [
"# Create a submission client bound to a Ray cluster\n",
"# Note: You will likely have to update the cluster address shown below\n",
"client = JobSubmissionClient(f\"http://10.146.174.91:8265\")"
"client = JobSubmissionClient(\"http://10.146.174.91:8265\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"id": "3258bb97-d3c6-4fee-aa0c-962c1411eaa7",
"metadata": {},
"outputs": [],
Expand All @@ -49,21 +50,10 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"id": "1db3b9aa-99a4-49d9-8773-7b91ccf89c85",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"SimpleJobConfig(magic_number=42)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Load and inspect the config file\n",
"# Not mandatory for job submission, but helpful when debugging\n",
Expand All @@ -78,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"id": "b81b36be-35ce-4398-a6d4-ac1f719f5c95",
"metadata": {},
"outputs": [],
Expand All @@ -88,53 +78,25 @@
"# pip contains an export of the dependencies for the flamingo package (see CONTRIBUTING.md for how to generate)\n",
"runtime_env = {\n",
" \"working_dir\": str(CONFIG_DIR),\n",
" \"env_vars\": {\"WANDB_API_KEY\": os.environ[\"WANDB_API_KEY\"]}, # If running a job that uses W&B\n",
" \"env_vars\": {\"WANDB_API_KEY\": os.environ[\"WANDB_API_KEY\"]}, # If running a job that uses W&B\n",
" \"py_modules\": [str(flamingo_module)],\n",
" \"pip\": \"/path/to/flamingo/requirements.txt\"\n",
" \"pip\": \"/path/to/flamingo/requirements.txt\",\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"id": "4bd300f9-b863-4413-bd3a-430601656816",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-01-20 15:32:25,620\tINFO dashboard_sdk.py:385 -- Package gcs://_ray_pkg_ba0036a72fdb32af.zip already exists, skipping upload.\n",
"2024-01-20 15:32:25,814\tINFO dashboard_sdk.py:385 -- Package gcs://_ray_pkg_8f96eb40a239b233.zip already exists, skipping upload.\n"
]
},
{
"data": {
"text/plain": [
"'raysubmit_tWfixDMGHavrhHPF'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Submit the job to the Ray cluster\n",
"# Note: flamingo is invoked by 'python -m flamingo' since the CLI is not installed in the environment\n",
"client.submit_job(\n",
" entrypoint=f\"python -m flamingo run simple --config {CONFIG_FILE}\",\n",
" runtime_env=runtime_env\n",
" entrypoint=f\"python -m flamingo run simple --config {CONFIG_FILE}\", runtime_env=runtime_env\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c82892d-bcdf-42e6-b95e-2393e01ab7d6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
12 changes: 12 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ testpaths = ["tests"]

[tool.ruff]
target-version = "py310"

exclude = [
".bzr",
".direnv",
Expand All @@ -72,8 +73,16 @@ exclude = [
"node_modules",
"venv",
]

extend-include = ["*.ipynb"]

line-length = 100

[tool.ruff.extend-per-file-ignores]
"__init__.py" = [
"F401", # import unused
"F403", # undefined import names
]

[tool.ruff.lint]
select = [
Expand All @@ -98,6 +107,9 @@ ignore = [
"N805", # first param needs to be self; pydantic breaks this sometimes
]

# Only format Jupyter notebooks, but don't lint them
exclude = ["*.ipynb"]

# Avoid trying to fix some violations
unfixable = ["B", "SIM", "TRY", "RUF"]

Expand Down
10 changes: 3 additions & 7 deletions src/flamingo/integrations/wandb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
# ruff: noqa: I001
from flamingo.integrations.wandb.artifact_config import WandbArtifactConfig
from flamingo.integrations.wandb.artifact_type import ArtifactType
from flamingo.integrations.wandb.run_config import WandbRunConfig

__all__ = [
"ArtifactType",
"WandbArtifactConfig",
"WandbRunConfig",
]
from flamingo.integrations.wandb.artifact_utils import *
from flamingo.integrations.wandb.run_utils import *
10 changes: 0 additions & 10 deletions src/flamingo/integrations/wandb/artifact_type.py

This file was deleted.

Loading

0 comments on commit e34c304

Please sign in to comment.