Merge pull request #7 from mozilla-ai/RD2024-18/dataset-artifact-helpers
RD2024-18: Helper for creating dataset artifacts from a directory
Showing 12 changed files with 449 additions and 207 deletions.
@@ -0,0 +1,180 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f9fd719d-abf4-4ea5-9a8a-637727bf6d48",
"metadata": {},
"source": [
"This notebook shows a simple example of how to use some of the `flamingo` utilities to pre-process a dataset\n", | ||
"and upload it as a W&B artifact.\n", | ||
"\n", | ||
"Generally, this workflow will be performed in a dev environment on cluster so that the dataset files\n", | ||
"can be saved on a shared volume. \n", | ||
"But this notebook can be run locally for educational purposes to illustrate the basic functions." | ||
]
},
{
"cell_type": "markdown",
"id": "03586a6c-6606-47a3-8947-9b09adfb1da4",
"metadata": {},
"source": [
"(1) Load and pre-process the base dataset from HuggingFace"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7613c7ff-ef5a-430b-854a-b34c31547df0",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"base_dataset = \"fka/awesome-chatgpt-prompts\"\n",
"dataset = load_dataset(base_dataset, split=\"train\")\n",
"\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8b47047-ba99-4c70-9e6f-5967a89adca0",
"metadata": {},
"outputs": [],
"source": [
"def preprocess_dataset(examples):\n", | ||
" texts = []\n", | ||
" for x in examples[\"prompt\"]:\n", | ||
" texts.append(x[::-1]) # Dummy reverse the prompt\n", | ||
" examples[\"text\"] = texts\n", | ||
" return examples\n", | ||
"\n", | ||
"\n", | ||
"# Map some preprocessing function over the base dataset (e.g., for prompt formatting)\n", | ||
"dataset = dataset.map(preprocess_dataset, batched=True, remove_columns=dataset.column_names)\n", | ||
"\n", | ||
"dataset" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "8ebbe99e-1994-4b3d-86ca-7863fa529618", | ||
"metadata": {}, | ||
"source": [ | ||
"(2) Save the dataset to disk" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "e74a92c7-49c2-4736-bc28-c843a403e650", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from pathlib import Path\n", | ||
"\n", | ||
"# Add an actual path here to where you want the data to live on shared storage\n", | ||
"dataset_save_path = str(Path(\"example_dataset\").absolute())\n", | ||
"\n", | ||
"dataset.save_to_disk(dataset_save_path)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "5cea9f8f-7279-44ac-947c-1d79f6bf6ebc", | ||
"metadata": {}, | ||
"source": [ | ||
"(3a) Log the dataset directory as an reference artifact using W&B directly" | ||
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "924f7673",
"metadata": {},
"outputs": [],
"source": [
"import wandb\n",
"\n",
"with wandb.init(\n",
"    name=\"flamingo-preprocessing-example\",\n",
"    project=\"example-project\",\n",
"    entity=\"mozilla-ai\",\n",
"    job_type=\"preprocessing\",\n",
"):\n",
"    artifact = wandb.Artifact(name=\"example-dataset-reference\", type=\"dataset\")\n",
"    artifact.add_reference(uri=f\"file://{dataset_save_path}\")\n",
"    wandb.log_artifact(artifact)"
]
},
{
"cell_type": "markdown",
"id": "c5ab6772",
"metadata": {},
"source": [
"(3b) Log the dataset directory as an artifact using flamingo helper functions"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8b09e47d-3ced-4eef-a89f-048754edc758",
"metadata": {},
"outputs": [],
"source": [
"from flamingo.integrations.wandb import (\n",
"    ArtifactType,\n",
"    ArtifactURIScheme,\n",
"    WandbRunConfig,\n",
"    log_directory_contents,\n",
"    log_directory_reference,\n",
"    wandb_init_from_config,\n",
")\n",
"from flamingo.jobs.utils import FlamingoJobType\n",
"\n",
"run_config = WandbRunConfig(\n",
"    name=\"flamingo-preprocessing-example\",\n",
"    project=\"example-project\",\n",
"    entity=\"mozilla-ai\",\n",
")\n",
"\n",
"with wandb_init_from_config(run_config, job_type=FlamingoJobType.PREPROCESSING):\n",
"    # Log a reference to the directory contents\n",
"    log_directory_reference(\n",
"        dir_path=dataset_save_path,\n",
"        artifact_name=\"example-dataset-artfact-reference\",\n",
"        artifact_type=ArtifactType.DATASET,\n",
"        scheme=ArtifactURIScheme.FILE,\n",
"    )\n",
"    # Log and upload the directory contents\n",
"    log_directory_contents(\n",
"        dir_path=dataset_save_path,\n",
"        artifact_name=\"example-dataset-artfact-upload\",\n",
"        artifact_type=ArtifactType.DATASET,\n",
"    )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
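
The two logging approaches in the notebook differ mainly in where the data ends up: the reference artifact only records file:// URIs pointing at the shared volume, while the upload artifact copies the directory contents into W&B storage. As a hypothetical follow-up that is not part of this commit, a downstream job could consume either artifact through the standard W&B API. In the sketch below, the artifact, project, and entity names are taken from the notebook, while the consuming run and its job type are illustrative assumptions.

# Hypothetical downstream consumer (not part of this commit).
# Artifact/project/entity names come from the notebook above; the job type is an assumption.
import wandb
from datasets import load_from_disk

with wandb.init(
    project="example-project",
    entity="mozilla-ai",
    job_type="finetuning",
) as run:
    # use_artifact() fetches the artifact metadata; download() materializes the files:
    # uploaded artifacts are pulled from W&B storage, while reference artifacts are
    # resolved from the file:// URIs recorded at logging time.
    artifact = run.use_artifact("example-dataset-reference:latest")
    dataset_dir = artifact.download()

dataset = load_from_disk(dataset_dir)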
@@ -1,9 +1,5 @@
# ruff: noqa: I001
from flamingo.integrations.wandb.artifact_config import WandbArtifactConfig
from flamingo.integrations.wandb.artifact_type import ArtifactType
from flamingo.integrations.wandb.run_config import WandbRunConfig

__all__ = [
    "ArtifactType",
    "WandbArtifactConfig",
    "WandbRunConfig",
]
from flamingo.integrations.wandb.artifact_utils import *
from flamingo.integrations.wandb.run_utils import *
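
The wildcard exports above come from the new `artifact_utils` and `run_utils` modules, whose contents are not rendered in this commit view. Based on the manual W&B calls in cell (3a) of the notebook, the directory helpers presumably wrap `wandb.Artifact` plus `add_reference`/`add_dir`. The sketch below is an inference from that usage, with simplified string-typed parameters rather than the actual flamingo signatures.

# Minimal sketch of what the new helpers presumably wrap, inferred from cell (3a)
# of the notebook above. Signatures are simplified assumptions, not flamingo source.
import wandb


def log_directory_reference(dir_path: str, artifact_name: str, artifact_type: str = "dataset", scheme: str = "file"):
    # Record a pointer to files that stay in place (e.g., on a shared volume); nothing is uploaded.
    # Assumes an active wandb run, as provided by wandb_init_from_config in the notebook.
    artifact = wandb.Artifact(name=artifact_name, type=artifact_type)
    artifact.add_reference(uri=f"{scheme}://{dir_path}")
    wandb.log_artifact(artifact)
    return artifact


def log_directory_contents(dir_path: str, artifact_name: str, artifact_type: str = "dataset"):
    # Copy the directory contents into W&B storage.
    # Assumes an active wandb run, as above.
    artifact = wandb.Artifact(name=artifact_name, type=artifact_type)
    artifact.add_dir(dir_path)
    wandb.log_artifact(artifact)
    return artifact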