diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 24c9a62..0c31e4a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -24,12 +24,6 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: actions/checkout@v4 - with: - repository: laminlabs/lndocs - ssh-key: ${{ secrets.READ_LNDOCS }} - path: lndocs - ref: main - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -37,13 +31,10 @@ jobs: with: path: ~/.cache/pre-commit key: pre-commit-${{ runner.os }}-${{ hashFiles('.pre-commit-config.yaml') }} - - run: pip install "laminci@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/laminci" - - run: sudo apt-get -y install graphviz - - uses: aws-actions/configure-aws-credentials@v2 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: eu-central-1 + - run: | + pip install "laminci[run-notebooks]@git+https://github.com/laminlabs/laminci" + uv pip install --system "lndocs@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/lndocs" + sudo apt-get -y install graphviz # - run: nox -s lint - run: nox -s build - uses: codecov/codecov-action@v2 diff --git a/docs/changelog.md b/docs/changelog.md deleted file mode 100644 index 467cc91..0000000 --- a/docs/changelog.md +++ /dev/null @@ -1,8 +0,0 @@ -# Changelog - - -Name | PR | Developer | Date | Version ---- | --- | --- | --- | --- -♻️ Curate the MNIST dataset in another notebook | [5](https://github.com/laminlabs/lamin-mlops/pull/5) | [falexwolf](https://github.com/falexwolf) | 2024-06-20 | -Refactoring text | [3](https://github.com/laminlabs/lamin-mlops/pull/3) | [Zethson](https://github.com/Zethson) | 2024-06-20 | -Example notebook for wandb integration | [2](https://github.com/laminlabs/lamin-mlops/pull/2) | [felix0097](https://github.com/felix0097) | 2024-06-20 | diff --git a/docs/index.md
b/docs/index.md index ac00041..abcde09 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,5 +8,4 @@ :hidden: guide -changelog ``` diff --git a/docs/mnist.ipynb b/docs/mnist.ipynb index 13c1351..1d1e08a 100644 --- a/docs/mnist.ipynb +++ b/docs/mnist.ipynb @@ -19,6 +19,7 @@ }, "outputs": [], "source": [ + "# !pip install -q 'lamindb[jupyter,aws]' torch torchvision lightning wandb\n", "!lamin init --storage ./lamin-mlops" ] }, @@ -71,26 +72,8 @@ "metadata": {}, "outputs": [], "source": [ - "!ls -r download_mnist/MNIST/raw" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af27c2ae", - "metadata": {}, - "outputs": [], - "source": [ - "!rm -r download_mnist/MNIST/raw/*.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0eea1916", - "metadata": {}, - "outputs": [], - "source": [ + "# no need for the zipped files\n", + "!rm -r download_mnist/MNIST/raw/*.gz\n", "!ls -r download_mnist/MNIST/raw" ] }, @@ -107,7 +90,7 @@ "source": [ "training_data_artifact = ln.Artifact(\n", " \"download_mnist/\",\n", - " description=\"MNIST-dataset\",\n", + " key=\"testdata/mnist\",\n", " type=\"dataset\",\n", ").save()\n", "training_data_artifact" diff --git a/docs/wandb.ipynb b/docs/wandb.ipynb index ed0daeb..96288ff 100644 --- a/docs/wandb.ipynb +++ b/docs/wandb.ipynb @@ -14,26 +14,9 @@ "id": "9bff135177a7ae90", "metadata": {}, "source": [ - "# Wandb\n", + "# Weights & Biases\n", "\n", - "We show how LaminDB can be integrated with Wandb to track the whole training process, associate data with models, and facilitate model querying based on hyperparameters, among other criteria." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7b4df14aefff576a", - "metadata": { - "jupyter": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "# uncomment below to install necessary dependencies for this notebook:\n", - "# !pip install 'lamindb[jupyter,aws]' -q\n", - "# !pip install wandb -qU\n", - "# !pip install torch torchvision torchaudio lightning -q" + "We show how LaminDB can be integrated with W&B to track the training process and associate datasets & parameters with models." ] }, { @@ -47,8 +30,9 @@ }, "outputs": [], "source": [ - "# you can also pass s3://my-bucket\n", - "!lamin init --storage ./lamin-mlops" + "# !pip install -q 'lamindb[jupyter,aws]' torch torchvision lightning wandb\n", + "!lamin init --storage ./lamin-mlops\n", + "!wandb login" ] }, { @@ -69,20 +53,6 @@ "ln.context.track()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "e72b9327e978259", - "metadata": { - "tags": [ - "hide-output" - ] - }, - "outputs": [], - "source": [ - "!wandb login" - ] - }, { "cell_type": "markdown", "id": "4271521b-0791-4df3-b043-3acc13b3f54c", @@ -103,10 +73,10 @@ "from torch import optim, nn, utils\n", "from torchvision.datasets import MNIST\n", "from torchvision.transforms import ToTensor\n", - "import lightning as L\n", + "import lightning\n", "\n", "\n", - "class LitAutoEncoder(L.LightningModule):\n", + "class LitAutoEncoder(lightning.LightningModule):\n", " def __init__(self, hidden_size, bottleneck_size):\n", " super().__init__()\n", " self.encoder = nn.Sequential(\n", @@ -141,7 +111,7 @@ "id": "04b3f68c-6e10-4b95-a4f5-729704690a25", "metadata": {}, "source": [ - "## Query & cache MNIST dataset from LaminDB" + "## Query & download the MNIST dataset" ] }, { @@ -149,7 +119,25 @@ "id": "7b03ee9c-eadd-479f-beca-ad84e4118a6e", "metadata": {}, "source": [ - "We curated the MNIST dataset in [another notebook](https://lamin.ai/laminlabs/lamindata/transform/mwaEQepEtFeh5zKv) and it now shows up on 
LaminHub:\n", + "We saved the MNIST dataset in the [curation notebook](/mnist) and it now shows up in the artifact registry:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "775a7cc2", + "metadata": {}, + "outputs": [], + "source": [ + "ln.Artifact.filter(type=\"dataset\").df()" + ] + }, + { + "cell_type": "markdown", + "id": "4f19d468", + "metadata": {}, + "source": [ + "You can also see it on lamin.ai if you have connected your instance.\n", "\n", "" ] @@ -159,9 +147,7 @@ "id": "ea687f6a", "metadata": {}, "source": [ - "We can either query it by `uid` from there or query it by any other metadata combination.\n", - "\n", - "Here, by description:" + "Let's get the dataset:" ] }, { @@ -175,8 +161,8 @@ }, "outputs": [], "source": [ - "training_data_artifact = ln.Artifact.get(description=\"MNIST-dataset\")\n", - "training_data_artifact" + "artifact = ln.Artifact.get(key=\"testdata/mnist\")\n", + "artifact" ] }, { @@ -184,18 +170,22 @@ "id": "524876dd", "metadata": {}, "source": [ - "Let's cache the dataset:" + "And download it to a local cache:" ] }, { "cell_type": "code", "execution_count": null, "id": "d485a6c3", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ - "cache_path = training_data_artifact.cache()\n", - "cache_path" + "path = artifact.cache()\n", + "path" ] }, { @@ -206,34 +196,18 @@ "Create a pytorch-compatible dataset:" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b11f03a", - "metadata": {}, - "outputs": [], - "source": [ - "!ls -r {cache_path.as_posix()}/MNIST/raw" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2be262c7", - "metadata": {}, - "outputs": [], - "source": [ - "cache_path.as_posix()" - ] - }, { "cell_type": "code", "execution_count": null, "id": "19c390c0", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ - "dataset = MNIST(cache_path.as_posix(), transform=ToTensor())\n", + "dataset
= MNIST(path.as_posix(), transform=ToTensor())\n", "dataset" ] }, @@ -244,34 +218,7 @@ "source": [ "## Monitor training with wandb\n", "\n", - "Train our example model and track training progress with Wandb." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f2595336-fc58-4203-b859-28fbb49bd344", - "metadata": {}, - "outputs": [], - "source": [ - "MODEL_CONFIG = {\n", - " \"hidden_size\": 32,\n", - " \"bottleneck_size\": 16,\n", - " \"batch_size\": 32\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22eb33a95df37e62", - "metadata": {}, - "outputs": [], - "source": [ - "# create PyTorch dataloader\n", - "train_loader = utils.data.DataLoader(dataset, batch_size=MODEL_CONFIG[\"batch_size\"], shuffle=True)\n", - "# init model\n", - "autoencoder = LitAutoEncoder(MODEL_CONFIG[\"hidden_size\"], MODEL_CONFIG[\"bottleneck_size\"])" + "Train our example model and track the training progress with `wandb`." ] }, { @@ -287,8 +234,21 @@ "source": [ "from lightning.pytorch.loggers import WandbLogger\n", "\n", - "# initialise the wandb logger\n", + "MODEL_CONFIG = {\n", + " \"hidden_size\": 32,\n", + " \"bottleneck_size\": 16,\n", + " \"batch_size\": 32\n", + "}\n", + "\n", + "# create the data loader\n", + "train_loader = utils.data.DataLoader(dataset, batch_size=MODEL_CONFIG[\"batch_size\"], shuffle=True)\n", + "\n", + "# init model\n", + "autoencoder = LitAutoEncoder(MODEL_CONFIG[\"hidden_size\"], MODEL_CONFIG[\"bottleneck_size\"])\n", + "\n", + "# initialize the logger\n", "wandb_logger = WandbLogger(project=\"lamin\")\n", + "\n", "# add batch size to the wandb config\n", "wandb_logger.experiment.config[\"batch_size\"] = MODEL_CONFIG[\"batch_size\"]" ] @@ -297,7 +257,11 @@ "cell_type": "code", "execution_count": null, "id": "133d1dee-0e04-4150-898e-deb4e6060f31", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ "from lightning.pytorch.callbacks import ModelCheckpoint\n", @@ 
-308,22 +272,10 @@ " filename=\"last_epoch\",\n", " save_top_k=1,\n", " monitor=\"train_loss\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f1a9295-90a4-4dcd-8054-19ac2eb44e49", - "metadata": { - "tags": [ - "hide-output" - ] - }, - "outputs": [], - "source": [ + ")\n", + "\n", "# train model\n", - "trainer = L.Trainer(\n", + "trainer = lightning.Trainer(\n", " accelerator=\"cpu\",\n", " limit_train_batches=3, \n", " max_epochs=2,\n", @@ -337,7 +289,11 @@ "cell_type": "code", "execution_count": null, "id": "4d98ca68-6a6e-4c9d-bbfa-9d31eae2eb76", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ "wandb_logger.experiment.name" @@ -347,7 +303,11 @@ "cell_type": "code", "execution_count": null, "id": "583004ac-aead-4ca5-8043-d5271af934e7", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ "wandb_logger.version" @@ -372,7 +332,7 @@ "id": "46cddbb5-50a6-4ef9-8534-f4bad6c07af6", "metadata": {}, "source": [ - "**Check out the training progress on the Wandb UI:**\n", + "**See the training progress in the `wandb` UI:**\n", "\n", "" ] @@ -382,77 +342,82 @@ "id": "62012e5c-e6bb-4b6e-a0cb-0becf9e495ea", "metadata": {}, "source": [ - "## Save model in LaminDB\n", - "\n", - "Upload the model checkpoint of the trained model to LaminDB.\n", - "\n", - "We annotate the LaminDB Artifact with the wandb experiment ID and the hyper parameters." 
+ "## Save model in LaminDB" ] }, { "cell_type": "code", "execution_count": null, "id": "2f6a85a7-f7e8-4c81-bbe1-050659016d98", - "metadata": {}, + "metadata": { + "tags": [ + "hide-output" + ] + }, "outputs": [], "source": [ - "# save checkpoint in LaminDB\n", - "ckpt_artifact = ln.Artifact(\n", + "# save checkpoint as a model in LaminDB\n", + "artifact = ln.Artifact(\n", " f\"model_checkpoints/{wandb_logger.version}\",\n", - " description=\"model-checkpoint\",\n", + " key=\"testmodels/litautoencoder\", # is automatically versioned\n", " type=\"model\",\n", - ").save()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c43efe72-8831-4caa-afca-4941d511ecae", - "metadata": {}, - "outputs": [], - "source": [ + ").save()\n", + "\n", "# create a label with the wandb experiment name\n", "experiment_label = ln.ULabel(\n", " name=wandb_logger.experiment.name, \n", " description=\"wandb experiment name\"\n", ").save()\n", - "# annotate the artifact\n", - "ckpt_artifact.ulabels.add(experiment_label)" + "\n", + "# annotate the model artifact\n", + "artifact.ulabels.add(experiment_label)\n", + "\n", + "# define the associated model hyperparameters in ln.Param\n", + "for k, v in MODEL_CONFIG.items():\n", + " ln.Param(name=k, dtype=type(v).__name__).save()\n", + "artifact.params.add_values(MODEL_CONFIG)\n", + "\n", + "# describe the artifact\n", + "artifact.describe()" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "fbb379b6-eac2-4cbc-aa56-b839e413a1c5", + "cell_type": "markdown", + "id": "c2614158-cc36-4e52-87f4-32151e93b6da", "metadata": {}, - "outputs": [], "source": [ - "# define the associated model hyperparameters in ln.Param\n", - "for k, v in MODEL_CONFIG.items():\n", - " ln.Param(name=k, dtype=type(v).__name__).save()\n", - "# annotate the artifact with them\n", - "ckpt_artifact.params.add_values(MODEL_CONFIG)" + "**See the checkpoints:**\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "51300c29", + "metadata": 
{}, + "source": [ + "If you later want to reuse the checkpoint, you can download it like so:" ] }, { "cell_type": "code", "execution_count": null, - "id": "fe39ae23-57c5-4aaa-86f5-60f1d9fc6c3c", + "id": "bdf09a0a", "metadata": {}, "outputs": [], "source": [ - "# show info about the checkpoint artifact\n", - "ckpt_artifact.describe()" + "ln.Artifact.get(key='testmodels/litautoencoder').cache()" ] }, { "cell_type": "markdown", - "id": "c2614158-cc36-4e52-87f4-32151e93b6da", + "id": "4f83bf6c", "metadata": {}, "source": [ - "**Look at saved checkpoints in LaminHub:**\n", - "\n", - "" + "Or on the CLI:\n", + "```\n", + "lamin get artifact --key 'testmodels/litautoencoder'\n", + "```" ] }, { diff --git a/lamin-project.yaml b/lamin-project.yaml deleted file mode 100644 index 79542b7..0000000 --- a/lamin-project.yaml +++ /dev/null @@ -1,5 +0,0 @@ -project_name: lamin-mlops -description: MLOps use cases -project_slug: lamin-mlops -repository_name: lamin-mlops -package_name: lamin_mlops diff --git a/lamin_mlops/__init__.py b/lamin_mlops/__init__.py deleted file mode 100644 index 1770364..0000000 --- a/lamin_mlops/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -"""MLOps use cases. - -Import the package:: - - import lamin_mlops - -This is the complete API reference: - -.. autosummary:: - :toctree: . - - example_function - ExampleClass -""" - -__version__ = "0.0.1" # denote a pre-release for 0.1.0 with 0.1rc1 - -from ._core import ExampleClass, example_function diff --git a/lamin_mlops/_core.py b/lamin_mlops/_core.py deleted file mode 100644 index f6882ed..0000000 --- a/lamin_mlops/_core.py +++ /dev/null @@ -1,26 +0,0 @@ -def example_function(column_name: str) -> str: - """Lower case your input string. - - Args: - column_name: Column name to transform to lower case. - - Returns: - The lower-cased column name.
- """ - return column_name.lower() - - -class ExampleClass: - """Awesome class.""" - - def __init__(self, value: int): - print("initializing") - - def bar(self) -> str: - """Bar function.""" - return "hello" - - @property - def foo(self) -> str: - """Foo property.""" - return "hello" diff --git a/lamin_mlops/migrations/__init__.py b/lamin_mlops/migrations/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lamin_mlops/models.py b/lamin_mlops/models.py deleted file mode 100644 index e69de29..0000000 diff --git a/noxfile.py b/noxfile.py index 4552177..77ab812 100644 --- a/noxfile.py +++ b/noxfile.py @@ -1,12 +1,7 @@ import nox -from laminci import upload_docs_artifact -from laminci.nox import build_docs, login_testuser1, run_pre_commit, run_pytest - -# we'd like to aggregate coverage information across sessions -# and for this the code needs to be located in the same -# directory in every github action runner -# this also allows to break out an installation section -nox.options.default_venv_backend = "none" +from subprocess import run +from laminci import upload_docs_artifact, run_notebooks +from laminci.nox import run_pre_commit @nox.session @@ -16,8 +11,12 @@ def lint(session: nox.Session) -> None: @nox.session() def build(session): - session.run(*"uv pip install --system -e .[dev]".split()) - login_testuser1(session) - run_pytest(session) - build_docs(session, strict=True) + # check=True: abort the session when a command exits non-zero instead of uploading broken docs + run( + "uv pip install --system 'lamindb[jupyter,aws]' torch torchvision lightning wandb", + shell=True, + check=True, + ) + run_notebooks("./docs") + run("lndocs --strict", shell=True, check=True) upload_docs_artifact(aws=True) diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 1fe1158..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,127 +0,0 @@ -[build-system] -requires = ["flit_core >=3.2,<4"] -build-backend = "flit_core.buildapi" - -[project] -name = "lamin_mlops" -authors = [{name = "Lamin Labs", email = "open-source@lamin.ai"}] -readme = "README.md" -dynamic =
["version", "description"] -dependencies = [ - "lamindb[jupyter]", -] - -[project.urls] -Home = "https://github.com/laminlabs/lamin-mlops" - -[project.optional-dependencies] -dev = [ - "pre-commit", - "nox", - "pytest>=6.0", - "pytest-cov", - "nbproject_test", - "torch", - "torchvision", - "wandb", - "lightning" -] - -[tool.pytest.ini_options] -testpaths = [ - "tests", -] - -[tool.ruff] -src = ["src"] -line-length = 88 -select = [ - "F", # Errors detected by Pyflakes - "E", # Error detected by Pycodestyle - "W", # Warning detected by Pycodestyle - "I", # isort - "D", # pydocstyle - "B", # flake8-bugbear - "TID", # flake8-tidy-imports - "C4", # flake8-comprehensions - "BLE", # flake8-blind-except - "UP", # pyupgrade - "RUF100", # Report unused noqa directives - "TCH", # Typing imports - "NPY", # Numpy specific rules - "PTH" # Use pathlib -] -ignore = [ - # Do not catch blind exception: `Exception` - "BLE001", - # Errors from function calls in argument defaults. These are fine when the result is immutable. 
- "B008", - # line too long -> we accept long comment lines; black gets rid of long code lines - "E501", - # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient - "E731", - # allow I, O, l as variable names -> I is the identity matrix - "E741", - # Missing docstring in public module - "D100", - # undocumented-public-class - "D101", - # Missing docstring in public method - "D102", - # Missing docstring in public function - "D103", - # Missing docstring in public package - "D104", - # __magic__ methods are are often self-explanatory, allow missing docstrings - "D105", - # Missing docstring in public nested class - "D106", - # Missing docstring in __init__ - "D107", - ## Disable one in each pair of mutually incompatible rules - # We don’t want a blank line before a class docstring - "D203", - # 1 blank line required after class docstring - "D204", - # first line should end with a period [Bug: doesn't work with single-line docstrings] - # We want docstrings to start immediately after the opening triple quote - "D213", - # Section underline is over-indented ("{name}") - "D215", - # First line should end with a period - "D400", - # First line should be in imperative mood; try rephrasing - "D401", - # First word of the first line should be capitalized: {} -> {} - "D403", - # First word of the docstring should not be "This" - "D404", - # Section name should end with a newline ("{name}") - "D406", - # Missing dashed underline after section ("{name}") - "D407", - # Section underline should be in the line following the section's name ("{name}") - "D408", - # Section underline should match the length of its name ("{name}") - "D409", - # No blank lines allowed between a section header and its content ("{name}") - "D412", - # Missing blank line after last section ("{name}") - "D413", - # Imports unused - "F401", - # camcelcase imported as lowercase - "N813", - # module import not at top level of file - "E402", - # open()` should be 
replaced by `Path.open() - "PTH123", -] - -[tool.ruff.pydocstyle] -convention = "google" - -[tool.ruff.per-file-ignores] -"docs/*" = ["I"] -"tests/*" = ["D"] -"*/__init__.py" = ["F401"] diff --git a/tests/test_base.py b/tests/test_base.py deleted file mode 100644 index a95800f..0000000 --- a/tests/test_base.py +++ /dev/null @@ -1,7 +0,0 @@ -from lamin_mlops import ExampleClass, example_function - - -def test_dummy(): - assert example_function("A") == "a" - ex = ExampleClass(1) - assert ex.bar() == "hello" diff --git a/tests/test_notebooks.py b/tests/test_notebooks.py deleted file mode 100644 index 8ccfd84..0000000 --- a/tests/test_notebooks.py +++ /dev/null @@ -1,9 +0,0 @@ -from pathlib import Path - -import nbproject_test as test - - -def test_notebooks(): - docs_folder = Path(__file__).parents[1] / "docs/" - for check_folder in docs_folder.glob("./**"): - test.execute_notebooks(check_folder, write=True)