From bb2b58a013fd82a4494fc3301d43c088e9242d41 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich
Date: Fri, 22 Sep 2023 15:27:33 +0100
Subject: [PATCH] [SDK] Consolidate Naming for CRUD APIs (#1907)

* Add Flake and Black Lint
* Change SDK APIs
* Update E2E tests
* Fix a few function parameters
* Fix black format
* Fix a few comments
* Fix conftest location
* Fix Job kind in tests
* Fix client creation in test
* Fix namespace arg in get_job_conditions
* Update SDK examples with the latest changes
* Rename SDK examples
* Fix black action
* Update checkout action version

Co-authored-by: Yuki Iwai

* Use Black 23.9.1 version
* Fix GitHub Action for Black
* Add unit test to create PyTorchJob from func
* Rename timeout to wait_timeout
* Validate that Job is not set with other input parameters
* Update black in developer guide
* Remove pip_index_url validation
* Use locals to verify input
* Print Job info when E2E fails
* Remove duplicated delete

---------

Co-authored-by: Yuki Iwai
---
 .flake8                                       |    2 +
 .gcloudignore                                 |    7 -
 .github/workflows/test-python.yaml            |   23 +
 .pylintrc                                     |  405 ----
 .style.yapf                                   |    4 -
 docs/development/developer_guide.md           |   14 +-
 .../sdk/create-pytorchjob-from-func.ipynb     |  742 +++++++
 .../sdk/create-pytorchjob.ipynb               |  207 +-
 examples/sdk/create-tfjob.ipynb               |  405 ++++
 hack/python-sdk/post_gen.py                   |    5 +-
 .../create-pytorchjob-from-func.ipynb         |  779 --------
 sdk/python/examples/kubeflow-tfjob-sdk.ipynb  |  714 -------
 sdk/python/kubeflow/__init__.py               |    2 +-
 sdk/python/kubeflow/training/__init__.py      |    1 +
 .../kubeflow/training/api/training_client.py  | 1715 +++++------------
 .../kubeflow/training/constants/constants.py  |   88 +-
 sdk/python/kubeflow/training/utils/utils.py   |  386 ++--
 sdk/python/setup.py                           |    8 +-
 sdk/python/test/conftest.py                   |    1 +
 sdk/python/test/e2e/constants.py              |    5 +-
 sdk/python/test/e2e/test_e2e_mpijob.py        |  126 +-
 sdk/python/test/e2e/test_e2e_mxjob.py         |  150 +-
 sdk/python/test/e2e/test_e2e_paddlejob.py     |  109 +-
 sdk/python/test/e2e/test_e2e_pytorchjob.py    |  132 +-
 sdk/python/test/e2e/test_e2e_tfjob.py         |  105 +-
 sdk/python/test/e2e/test_e2e_xgboostjob.py    |  125 +-
 sdk/python/test/e2e/utils.py                  |   82 +-
 27 files changed, 2529 insertions(+), 3813 deletions(-)
 create mode 100644 .flake8
 delete mode 100644 .gcloudignore
 create mode 100644 .github/workflows/test-python.yaml
 delete mode 100644 .pylintrc
 delete mode 100644 .style.yapf
 create mode 100644 examples/sdk/create-pytorchjob-from-func.ipynb
 rename sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb => examples/sdk/create-pytorchjob.ipynb (65%)
 create mode 100644 examples/sdk/create-tfjob.ipynb
 delete mode 100644 sdk/python/examples/create-pytorchjob-from-func.ipynb
 delete mode 100644 sdk/python/examples/kubeflow-tfjob-sdk.ipynb

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000..7da1f9608e
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 100
diff --git a/.gcloudignore b/.gcloudignore
deleted file mode 100644
index bee692559b..0000000000
--- a/.gcloudignore
+++ /dev/null
@@ -1,7 +0,0 @@
-# This file specifies files that are not uploaded to GCP
-# Uses same syntax as gitignore
-
-.git
-.gitignore
-#!include:.gitignore
-#vendor
diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml
new file mode 100644
index 0000000000..09dc39d9cb
--- /dev/null
+++ b/.github/workflows/test-python.yaml
@@ -0,0 +1,23 @@
+name: Python Test
+
+on:
+  - push
+  - pull_request
+
+jobs:
+  test:
+    name: Test
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
# TODO (andreyvelich): We need to replace this action with script to do + # linting and formatting for Training Operator SDK. + - name: Check Python code with Black + uses: psf/black@stable + with: + version: 23.9.1 + options: --check --exclude '/*kubeflow_org_v1*|__init__.py|api_client.py|configuration.py|exceptions.py|rest.py' + src: sdk/ diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 72f0baa622..0000000000 --- a/.pylintrc +++ /dev/null @@ -1,405 +0,0 @@ -[MASTER] - -# Specify a configuration file. -#rcfile= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Add files or directories to the ignorelist. They should be base names, not -# paths. -ignore=third_party,jupyterhub_spawner.py,dist_mnist.py - -# Add files or directories matching the regex patterns to the ignorelist. The -# regex matches against base names, not paths. -# Ignore files that are created automatically during SDK generation. -ignore-patterns=api_client.py,configuration.py,v1_job_condition.py,v1_job_status.py,v1_replica_spec.py,v1_replica_status.py,v1_tf_job.py,v1_tf_job_list.py,v1_tf_job_spec.py,v1_time.py,rest.py,test_v1_job_status.py,test_v1_replica_spec.py,test_v1_replica_status.py,test_v1_tf_job.py,test_v1_tf_job_list.py,test_v1_tf_job_spec.py,test_v1_job_condition.py - -# Pickle collected data for later comparisons. -persistent=no - -# List of plugins (as comma separated values of python modules names) to load, -# usually to register additional checkers. -load-plugins= - -# Use multiple processes to speed up Pylint. -jobs=4 - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code -extension-pkg-whitelist= - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED -confidence= - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -#enable= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once).You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use"--disable=all --enable=classes -# --disable=W" -# -# Kubeflow disables wrong-import-order error because we were getting different results regarding third party -# modules when running locally vs in our test infrastructure. It looks like what counts -# as third party might vary. 
-# -disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,missing-docstring,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating,relative-import,invalid-name,bad-continuation,no-member,locally-disabled,fixme,import-error,too-many-locals,wrong-import-order - - -[REPORTS] - -# Set the output format. Available formats are text, parseable, colorized, msvs -# (visual studio) and html. You can also give a reporter class, eg -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Put messages in a separate file for each module / package specified on the -# command line instead of printing them on stdout. Reports (if any) will be -# written in a file name "pylint_global.[txt|html]". This option is deprecated -# and it will be removed in Pylint 2.0. -files-output=no - -# Tells whether to display a full report or only the messages -reports=no - -# Python expression which should return a note less than 10 (10 is the highest -# note). You have access to the variables errors warning, statement which -# respectively contain the number of errors / warnings messages and the total -# number of statements analyzed. This is used by the global evaluation report -# (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details -#msg-template= - - -[BASIC] - -# Good variable names which should always be accepted, separated by a comma -good-names=i,j,k,ex,Run,_ - -# Bad variable names which should always be refused, separated by a comma -bad-names=foo,bar,baz,toto,tutu,tata - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Include a hint for the correct naming format with invalid-name -include-naming-hint=no - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. 
-property-classes=abc.abstractproperty - -# Regular expression matching correct function names -function-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming hint for function names -function-name-hint=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct variable names -variable-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming hint for variable names -variable-name-hint=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct constant names -const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Naming hint for constant names -const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ - -# Regular expression matching correct attribute names -attr-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming hint for attribute names -attr-name-hint=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct argument names -argument-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming hint for argument names -argument-name-hint=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression matching correct class attribute names -class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Naming hint for class attribute names -class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ - -# Regular expression matching correct inline iteration names -inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ - -# Naming hint for inline iteration names -inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ - -# Regular expression matching correct class names -class-rgx=[A-Z_][a-zA-Z0-9]+$ - -# Naming hint for class names -class-name-hint=[A-Z_][a-zA-Z0-9]+$ - -# Regular expression matching correct module names -module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Naming hint for module names -module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ - -# Regular expression matching correct method names -method-rgx=[a-z_][a-z0-9_]{2,30}$ - -# Naming hint for method names -method-name-hint=[a-z_][a-z0-9_]{2,30}$ - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - - -[ELIF] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - - -[TYPECHECK] - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis. It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - - -[FORMAT] - -# Maximum number of characters on a single line. -max-line-length=100 - -# Regexp for a line that is allowed to be longer than the limit. 
-ignore-long-lines=^\s*(# )??$ - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma,dict-separator - -# Maximum number of lines in a module -max-module-lines=1000 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -# Use 2 spaces consistent with TensorFlow style. -indent-string=' ' - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME,XXX,TODO - - -[VARIABLES] - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# A regular expression matching the name of dummy variables (i.e. expectedly -# not used). -dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_,_cb - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,future.builtins - - -[LOGGING] - -# Logging modules to check that the string format arguments are in logging -# function parameter format -logging-modules=logging - - -[SIMILARITIES] - -# Minimum lines number of a similarity. -min-similarity-lines=4 - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - - -[SPELLING] - -# Spelling dictionary name. Available dictionaries: none. To make it working -# install python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to indicated private dictionary in -# --spelling-private-dict-file option instead of raising a message. -spelling-store-unknown-words=no - - -[IMPORTS] - -# Deprecated modules which should not be used, separated by a comma -deprecated-modules=regsub,TERMIOS,Bastion,rexec - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled) -import-graph= - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled) -ext-import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled) -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - -# Analyse import fallback blocks. 
This can be used to support both Python 2 and
-# 3 compatible code, which means that the block might have code that exists
-# only in one or another interpreter, leading to false positives when analysed.
-analyse-fallback-blocks=no
-
-
-[DESIGN]
-
-# Maximum number of arguments for function / method
-max-args=7
-
-# Argument names that match this expression will be ignored. Default to name
-# with leading underscore
-ignored-argument-names=_.*
-
-# Maximum number of locals for function / method body
-max-locals=15
-
-# Maximum number of return / yield for function / method body
-max-returns=6
-
-# Maximum number of branch for function / method body
-max-branches=12
-
-# Maximum number of statements in function / method body
-max-statements=50
-
-# Maximum number of parents for a class (see R0901).
-max-parents=7
-
-# Maximum number of attributes for a class (see R0902).
-max-attributes=7
-
-# Minimum number of public methods for a class (see R0903).
-min-public-methods=0
-
-# Maximum number of public methods for a class (see R0904).
-max-public-methods=20
-
-# Maximum number of boolean expressions in a if statement
-max-bool-expr=5
-
-
-[CLASSES]
-
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,__new__,setUp
-
-# List of valid names for the first argument in a class method.
-valid-classmethod-first-arg=cls
-
-# List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=mcs
-
-# List of member names, which should be excluded from the protected access
-# warning.
-exclude-protected=_asdict,_fields,_replace,_source,_make
-
-
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=Exception
diff --git a/.style.yapf b/.style.yapf
deleted file mode 100644
index 9668b4b10a..0000000000
--- a/.style.yapf
+++ /dev/null
@@ -1,4 +0,0 @@
-[style]
-based_on_style = google
-indent_width: 2
-continuation_indent_width: 2
diff --git a/docs/development/developer_guide.md b/docs/development/developer_guide.md
index 8e89d866a7..406399467e 100644
--- a/docs/development/developer_guide.md
+++ b/docs/development/developer_guide.md
@@ -112,16 +112,16 @@ sdk/python/kubeflow/training/api
 
 ### Python
 
-- Use [yapf](https://github.com/google/yapf) to format Python code
-- `yapf` style is configured in `.style.yapf` file
-- To autoformat code
+- Use [`black`](https://github.com/psf/black) to format Python code
 
-  ```sh
-  yapf -i py/**/*.py
+- Run the following to install `black`:
+
+  ```
+  pip install black==23.9.1
   ```
 
-- To sort imports
+- To check your code:
 
   ```sh
-  isort path/to/module.py
+  black --check --exclude '/*kubeflow_org_v1*|__init__.py|api_client.py|configuration.py|exceptions.py|rest.py' sdk/
   ```
diff --git a/examples/sdk/create-pytorchjob-from-func.ipynb b/examples/sdk/create-pytorchjob-from-func.ipynb
new file mode 100644
index 0000000000..5ab0cf21c2
--- /dev/null
+++ b/examples/sdk/create-pytorchjob-from-func.ipynb
@@ -0,0 +1,742 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "90d43b56-97e5-45e2-8e67-4488ed31d2df",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Run PyTorchJob From Function\n",
+    "\n",
+    "In this Notebook we are going to create a [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/pytorch/).\n",
+    "\n",
+    "The PyTorchJob will run distributed training using the [DistributedDataParallel strategy](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a8bb6564-fde3-4c28-841c-012122643dd9",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Install Kubeflow Python SDKs\n",
+    "\n",
+    "You need to install PyTorch packages and Kubeflow SDKs to run this Notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d49f072e-2221-48bb-9f6d-561713d1a45c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install torch==1.12.1\n",
+    "!pip install torchvision==0.13.1\n",
+    "\n",
+    "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
+    "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9331a05-9127-4b3a-8077-31157e267827",
+   "metadata": {},
+   "source": [
+    "## Create Train Script for CNN Model\n",
+    "\n",
+    "This is a simple **Convolutional Neural Network (CNN)** model for recognizing different pictures of clothing using the [Fashion MNIST Dataset](https://github.com/zalandoresearch/fashion-mnist)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "69f21f33-5c64-452c-90c4-977fc0dadb3b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def train_pytorch_model():\n",
+    "    import logging\n",
+    "    import os\n",
+    "    from torchvision import transforms, datasets\n",
+    "    import torch\n",
+    "    from torch import nn\n",
+    "    import torch.nn.functional as F\n",
+    "    import torch.distributed as dist\n",
+    "\n",
+    "    logging.basicConfig(\n",
+    "        format=\"%(asctime)s %(levelname)-8s %(message)s\",\n",
+    "        datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n",
+    "        level=logging.DEBUG,\n",
+    "    )\n",
+    "\n",
+    "    # Create PyTorch CNN Model.\n",
+    "    class Net(nn.Module):\n",
+    "        def __init__(self):\n",
+    "            super(Net, self).__init__()\n",
+    "            self.conv1 = nn.Conv2d(1, 20, 5, 1)\n",
+    "            self.conv2 = nn.Conv2d(20, 50, 5, 1)\n",
+    "            self.fc1 = nn.Linear(4 * 4 * 50, 500)\n",
+    "            self.fc2 = nn.Linear(500, 10)\n",
+    "\n",
+    "        def forward(self, x):\n",
+    "            x = F.relu(self.conv1(x))\n",
+    "            x = F.max_pool2d(x, 2, 2)\n",
+    "            x = F.relu(self.conv2(x))\n",
+    "            x = F.max_pool2d(x, 2, 2)\n",
+    "            x = x.view(-1, 4 * 4 * 50)\n",
+    "            x = F.relu(self.fc1(x))\n",
+    "            x = self.fc2(x)\n",
+    "            return F.log_softmax(x, dim=1)\n",
+    "\n",
+    "    # Get dist parameters.\n",
+    "    # Kubeflow Training Operator automatically sets the appropriate RANK and WORLD_SIZE based on the configuration.\n",
+    "    RANK = int(os.environ[\"RANK\"])\n",
+    "    WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n",
+    "\n",
+    "    model = Net()\n",
+    "    # Attach model to DistributedDataParallel strategy.\n",
+    "    dist.init_process_group(backend=\"gloo\", rank=RANK, world_size=WORLD_SIZE)\n",
+    "    Distributor = nn.parallel.DistributedDataParallel\n",
+    "    model = Distributor(model)\n",
+    "\n",
+    "    # Split batch size for each worker.\n",
+    "    batch_size = int(128 / WORLD_SIZE)\n",
+    "\n",
+    "    # Get Fashion MNIST DataSet.\n",
+    "    train_loader = torch.utils.data.DataLoader(\n",
+    "        datasets.FashionMNIST(\n",
+    "            \"./data\",\n",
+    "            train=True,\n",
+    "            download=True,\n",
+    "            transform=transforms.Compose([transforms.ToTensor()]),\n",
+    "        ),\n",
+    "        batch_size=batch_size,\n",
+    "    )\n",
+    "\n",
+    "    # Start Training.\n",
+    "    logging.info(f\"Start training for RANK: {RANK}. 
WORLD_SIZE: {WORLD_SIZE}\")\n", + " for epoch in range(1):\n", + " model.train()\n", + " optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)\n", + "\n", + " for batch_idx, (data, target) in enumerate(train_loader):\n", + " optimizer.zero_grad()\n", + " output = model(data)\n", + " loss = F.nll_loss(output, target)\n", + " loss.backward()\n", + " optimizer.step()\n", + " if batch_idx % 10 == 0:\n", + " logging.info(\n", + " \"Train Epoch: {} [{}/{} ({:.0f}%)]\\tloss={:.4f}\".format(\n", + " epoch,\n", + " batch_idx * len(data),\n", + " len(train_loader.dataset),\n", + " 100.0 * batch_idx / len(train_loader),\n", + " loss.item(),\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "8cfe8739-1f94-476a-80e3-dd6e3237d9ed", + "metadata": { + "execution": { + "iopub.execute_input": "2022-09-01T19:32:37.813779Z", + "iopub.status.busy": "2022-09-01T19:32:37.812759Z", + "iopub.status.idle": "2022-09-01T19:32:37.827050Z", + "shell.execute_reply": "2022-09-01T19:32:37.825186Z", + "shell.execute_reply.started": "2022-09-01T19:32:37.813690Z" + } + }, + "source": [ + "## Run Training Locally in the Notebook\n", + "\n", + "We are going to download Fashion MNIST Dataset and start local training." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9e2c6fd8-d0ba-4bc6-ac90-d4cf09751ace", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/avelichk/miniconda3/envs/training-operator/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2023-09-08T22:00:27Z INFO Added key: store_based_barrier_key:1 to store for rank: 0\n", + "2023-09-08T22:00:27Z INFO Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26421880/26421880 [00:01<00:00, 22627052.40it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n", + "\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29515/29515 [00:00<00:00, 1596941.21it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n", + "\n", + "Downloading 
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4422102/4422102 [00:00<00:00, 20494516.72it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n", + "\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5148/5148 [00:00<00:00, 8510948.76it/s]\n", + "2023-09-08T22:00:30Z INFO Start training for RANK: 0. WORLD_SIZE: 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-08T22:00:30Z INFO Train Epoch: 0 [0/60000 (0%)]\tloss=2.2989\n", + "2023-09-08T22:00:30Z INFO Reducer buckets have been rebuilt in this iteration.\n", + "2023-09-08T22:00:30Z INFO Train Epoch: 0 [1280/60000 (2%)]\tloss=2.2991\n", + "2023-09-08T22:00:30Z INFO Train Epoch: 0 [2560/60000 (4%)]\tloss=2.2857\n", + "2023-09-08T22:00:31Z INFO Train Epoch: 0 [3840/60000 (6%)]\tloss=2.2795\n", + "2023-09-08T22:00:31Z INFO Train Epoch: 0 [5120/60000 (9%)]\tloss=2.2625\n", + "2023-09-08T22:00:31Z INFO Train Epoch: 0 [6400/60000 (11%)]\tloss=2.2526\n", + "2023-09-08T22:00:32Z INFO Train Epoch: 0 [7680/60000 (13%)]\tloss=2.2245\n", + "2023-09-08T22:00:32Z INFO Train Epoch: 0 [8960/60000 (15%)]\tloss=2.1893\n", + "2023-09-08T22:00:32Z INFO Train Epoch: 0 [10240/60000 (17%)]\tloss=2.1507\n", + "2023-09-08T22:00:33Z INFO Train Epoch: 0 [11520/60000 (19%)]\tloss=2.0778\n", + "2023-09-08T22:00:33Z INFO Train Epoch: 0 [12800/60000 (21%)]\tloss=1.9957\n", + "2023-09-08T22:00:34Z INFO Train Epoch: 0 [14080/60000 (23%)]\tloss=1.9257\n", + "2023-09-08T22:00:34Z INFO Train Epoch: 0 [15360/60000 (26%)]\tloss=1.7212\n", + "2023-09-08T22:00:34Z INFO Train Epoch: 0 [16640/60000 (28%)]\tloss=1.5281\n", + "2023-09-08T22:00:35Z INFO Train Epoch: 0 [17920/60000 (30%)]\tloss=1.3686\n", + "2023-09-08T22:00:35Z INFO Train Epoch: 0 [19200/60000 (32%)]\tloss=1.2350\n", + "2023-09-08T22:00:35Z INFO Train Epoch: 0 [20480/60000 (34%)]\tloss=1.1473\n", + "2023-09-08T22:00:36Z INFO Train Epoch: 0 [21760/60000 (36%)]\tloss=1.1870\n", + "2023-09-08T22:00:36Z INFO Train Epoch: 0 [23040/60000 (38%)]\tloss=1.0766\n", + "2023-09-08T22:00:36Z INFO Train Epoch: 0 [24320/60000 (41%)]\tloss=1.0574\n", + "2023-09-08T22:00:37Z INFO Train Epoch: 0 [25600/60000 (43%)]\tloss=0.9557\n", + "2023-09-08T22:00:37Z INFO Train Epoch: 0 [26880/60000 (45%)]\tloss=0.9279\n", + "2023-09-08T22:00:37Z INFO Train Epoch: 0 [28160/60000 (47%)]\tloss=0.9588\n", + "2023-09-08T22:00:38Z INFO 
Train Epoch: 0 [29440/60000 (49%)]\tloss=1.0970\n",
+      "2023-09-08T22:00:38Z INFO Train Epoch: 0 [30720/60000 (51%)]\tloss=0.9617\n",
+      "2023-09-08T22:00:38Z INFO Train Epoch: 0 [32000/60000 (53%)]\tloss=0.9025\n",
+      "2023-09-08T22:00:39Z INFO Train Epoch: 0 [33280/60000 (55%)]\tloss=0.8363\n",
+      "2023-09-08T22:00:39Z INFO Train Epoch: 0 [34560/60000 (58%)]\tloss=0.9448\n",
+      "2023-09-08T22:00:39Z INFO Train Epoch: 0 [35840/60000 (60%)]\tloss=0.7507\n",
+      "2023-09-08T22:00:40Z INFO Train Epoch: 0 [37120/60000 (62%)]\tloss=0.7347\n",
+      "2023-09-08T22:00:40Z INFO Train Epoch: 0 [38400/60000 (64%)]\tloss=0.6985\n",
+      "2023-09-08T22:00:40Z INFO Train Epoch: 0 [39680/60000 (66%)]\tloss=0.8104\n",
+      "2023-09-08T22:00:41Z INFO Train Epoch: 0 [40960/60000 (68%)]\tloss=0.8177\n",
+      "2023-09-08T22:00:41Z INFO Train Epoch: 0 [42240/60000 (70%)]\tloss=0.8442\n",
+      "2023-09-08T22:00:41Z INFO Train Epoch: 0 [43520/60000 (72%)]\tloss=0.7311\n",
+      "2023-09-08T22:00:42Z INFO Train Epoch: 0 [44800/60000 (75%)]\tloss=0.7861\n",
+      "2023-09-08T22:00:42Z INFO Train Epoch: 0 [46080/60000 (77%)]\tloss=0.7879\n",
+      "2023-09-08T22:00:42Z INFO Train Epoch: 0 [47360/60000 (79%)]\tloss=0.7863\n",
+      "2023-09-08T22:00:43Z INFO Train Epoch: 0 [48640/60000 (81%)]\tloss=0.8808\n",
+      "2023-09-08T22:00:43Z INFO Train Epoch: 0 [49920/60000 (83%)]\tloss=0.7993\n",
+      "2023-09-08T22:00:43Z INFO Train Epoch: 0 [51200/60000 (85%)]\tloss=0.7540\n",
+      "2023-09-08T22:00:44Z INFO Train Epoch: 0 [52480/60000 (87%)]\tloss=0.8387\n",
+      "2023-09-08T22:00:44Z INFO Train Epoch: 0 [53760/60000 (90%)]\tloss=0.7911\n",
+      "2023-09-08T22:00:44Z INFO Train Epoch: 0 [55040/60000 (92%)]\tloss=0.6176\n",
+      "2023-09-08T22:00:45Z INFO Train Epoch: 0 [56320/60000 (94%)]\tloss=0.6854\n",
+      "2023-09-08T22:00:45Z INFO Train Epoch: 0 [57600/60000 (96%)]\tloss=0.7593\n",
+      "2023-09-08T22:00:45Z INFO Train Epoch: 0 [58880/60000 (98%)]\tloss=0.7477\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Set dist env variables to run the above training locally on the Notebook.\n",
+    "import os\n",
+    "os.environ[\"RANK\"] = \"0\"\n",
+    "os.environ[\"WORLD_SIZE\"] = \"1\"\n",
+    "os.environ[\"MASTER_ADDR\"] = \"localhost\"\n",
+    "os.environ[\"MASTER_PORT\"] = \"1234\"\n",
+    "\n",
+    "# Train Model locally in the Notebook.\n",
+    "train_pytorch_model()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5aae47e3-be31-468e-8f38-89e1e2f1c764",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Start Distributed Training with PyTorchJob\n",
+    "\n",
+    "Before creating the PyTorchJob, you have to create a `TrainingClient()`. It uses the [Kubernetes Python client](https://github.com/kubernetes-client/python) to communicate with the Kubernetes API server. You can set the path and context for [the kubeconfig file](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/). The default location for the kubeconfig is `~/.kube/config`.\n",
+    "\n",
+    "Kubeflow Training Operator automatically sets the appropriate env variables (`MASTER_PORT`, `MASTER_ADDR`, `WORLD_SIZE`, `RANK`) for each PyTorchJob container."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "eb1acd34-ebcf-409b-8bb3-0225cee37110",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-09-08T22:01:42Z INFO PyTorchJob default/train-pytorch has been created\n"
+     ]
+    }
+   ],
+   "source": [
+    "from kubeflow.training import TrainingClient\n",
+    "from kubeflow.training import constants\n",
+    "\n",
+    "# Start PyTorchJob Training.\n",
+    "pytorchjob_name = \"train-pytorch\"\n",
+    "\n",
+    "# Since we set `job_kind = PyTorchJob`, the APIs are going to use PyTorchJob as the default Job kind.\n",
+    "training_client = TrainingClient(job_kind=constants.PYTORCHJOB_KIND)\n",
+    "\n",
+    "training_client.create_job(\n",
+    "    name=pytorchjob_name,\n",
+    "    train_func=train_pytorch_model,\n",
+    "    num_worker_replicas=3, # How many PyTorch Workers will be created.\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e44c3ad7-62c4-4b58-b52a-15fd8746b772",
+   "metadata": {},
+   "source": [
+    "### Check PyTorchJob Status\n",
+    "\n",
+    "Use `TrainingClient` APIs to get information about the created PyTorchJob."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4141f6c2-c38f-4972-b68a-35d150ef7485",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PyTorchJob Status: True\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"PyTorchJob Status: {training_client.is_job_running(name=pytorchjob_name)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "42e10587-7ac2-45bf-9c4f-d418e1585974",
+   "metadata": {},
+   "source": [
+    "### Get PyTorchJob Pod Names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "49b53308-a19b-45e8-942f-4333e727ee48",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['train-pytorch-master-0',\n",
+       " 'train-pytorch-worker-0',\n",
+       " 'train-pytorch-worker-1',\n",
+       " 'train-pytorch-worker-2']"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "training_client.get_job_pod_names(pytorchjob_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b91d332d-487c-4a95-937d-26ffb6199cda",
+   "metadata": {
+    "execution": {
+     "iopub.status.busy": "2022-09-01T20:10:25.759950Z",
+     "iopub.status.idle": "2022-09-01T20:10:25.760581Z",
+     "shell.execute_reply": "2022-09-01T20:10:25.760353Z",
+     "shell.execute_reply.started": "2022-09-01T20:10:25.760328Z"
+    },
+    "tags": []
+   },
+   "source": [
+    "### Get PyTorchJob Training Logs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "5232d542-d4bf-4c51-8b11-ad0534fb0b9d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-09-08T22:10:08Z INFO The logs of pod train-pytorch-master-0:\n",
+      " 2023-09-08T21:01:59Z INFO Added key: store_based_barrier_key:1 to store for rank: 0\n",
+      "2023-09-08T21:01:59Z INFO Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.\n",
+      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
+      "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n",
+      "100%|██████████| 26421880/26421880 [00:02<00:00, 12793779.84it/s]\n",
+      "Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
+      "\n",
+      "Downloading 
http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n", + "100%|██████████| 29515/29515 [00:00<00:00, 209261.44it/s]\n", + "Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n", + "\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n", + "100%|██████████| 4422102/4422102 [00:01<00:00, 3953124.28it/s]\n", + "Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n", + "\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n", + "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n", + "100%|██████████| 5148/5148 [00:00<00:00, 53182948.26it/s]\n", + "Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n", + "\n", + "2023-09-08T21:02:05Z INFO Start training for RANK: 0. WORLD_SIZE: 4\n", + "2023-09-08T21:02:05Z INFO Train Epoch: 0 [0/60000 (0%)]\tloss=2.3026\n", + "2023-09-08T21:02:05Z INFO Reducer buckets have been rebuilt in this iteration.\n", + "2023-09-08T21:02:07Z INFO Train Epoch: 0 [320/60000 (1%)]\tloss=2.2942\n", + "2023-09-08T21:02:10Z INFO Train Epoch: 0 [640/60000 (1%)]\tloss=2.2931\n", + "2023-09-08T21:02:12Z INFO Train Epoch: 0 [960/60000 (2%)]\tloss=2.2750\n", + "2023-09-08T21:02:14Z INFO Train Epoch: 0 [1280/60000 (2%)]\tloss=2.2734\n", + "2023-09-08T21:02:17Z INFO Train Epoch: 0 [1600/60000 (3%)]\tloss=2.2644\n", + "2023-09-08T21:02:19Z INFO Train Epoch: 0 [1920/60000 (3%)]\tloss=2.2451\n", + "2023-09-08T21:02:21Z INFO Train Epoch: 0 [2240/60000 (4%)]\tloss=2.1874\n", + "2023-09-08T21:02:23Z INFO Train Epoch: 0 [2560/60000 (4%)]\tloss=2.2048\n", + "2023-09-08T21:02:25Z INFO Train Epoch: 0 [2880/60000 (5%)]\tloss=2.1906\n", + "2023-09-08T21:02:26Z INFO Train Epoch: 0 [3200/60000 (5%)]\tloss=2.1379\n", + "2023-09-08T21:02:29Z INFO Train Epoch: 0 [3520/60000 (6%)]\tloss=2.0556\n", + "2023-09-08T21:02:31Z INFO Train Epoch: 0 [3840/60000 (6%)]\tloss=1.8509\n", + "2023-09-08T21:02:32Z INFO Train Epoch: 0 [4160/60000 (7%)]\tloss=1.6425\n", + "2023-09-08T21:02:34Z INFO Train Epoch: 0 [4480/60000 (7%)]\tloss=1.6744\n", + "2023-09-08T21:02:36Z INFO Train Epoch: 0 [4800/60000 (8%)]\tloss=1.3866\n", + "2023-09-08T21:02:38Z INFO Train Epoch: 0 [5120/60000 (9%)]\tloss=1.2680\n", + "2023-09-08T21:02:41Z INFO Train Epoch: 0 [5440/60000 (9%)]\tloss=1.2594\n", + "2023-09-08T21:02:43Z INFO Train Epoch: 0 [5760/60000 (10%)]\tloss=1.3052\n", + "2023-09-08T21:02:45Z INFO Train Epoch: 0 [6080/60000 (10%)]\tloss=1.1057\n", + "2023-09-08T21:02:47Z INFO Train Epoch: 0 [6400/60000 (11%)]\tloss=1.0416\n", + "2023-09-08T21:02:49Z INFO Train Epoch: 0 [6720/60000 (11%)]\tloss=1.2431\n", + "2023-09-08T21:02:50Z INFO Train Epoch: 0 [7040/60000 (12%)]\tloss=0.9392\n", + "2023-09-08T21:02:53Z INFO Train Epoch: 0 [7360/60000 (12%)]\tloss=0.9794\n", + "2023-09-08T21:02:55Z INFO Train Epoch: 0 [7680/60000 (13%)]\tloss=0.9787\n", + "2023-09-08T21:02:57Z INFO Train Epoch: 0 [8000/60000 (13%)]\tloss=1.2992\n", + "2023-09-08T21:02:59Z INFO Train Epoch: 0 [8320/60000 
(14%)]\tloss=1.0311\n", + "2023-09-08T21:03:01Z INFO Train Epoch: 0 [8640/60000 (14%)]\tloss=1.0544\n", + "2023-09-08T21:03:02Z INFO Train Epoch: 0 [8960/60000 (15%)]\tloss=1.1326\n", + "2023-09-08T21:03:04Z INFO Train Epoch: 0 [9280/60000 (15%)]\tloss=0.6292\n", + "2023-09-08T21:03:06Z INFO Train Epoch: 0 [9600/60000 (16%)]\tloss=1.2502\n", + "2023-09-08T21:03:08Z INFO Train Epoch: 0 [9920/60000 (17%)]\tloss=0.8754\n", + "2023-09-08T21:03:10Z INFO Train Epoch: 0 [10240/60000 (17%)]\tloss=1.0590\n", + "2023-09-08T21:03:13Z INFO Train Epoch: 0 [10560/60000 (18%)]\tloss=1.0957\n", + "2023-09-08T21:03:15Z INFO Train Epoch: 0 [10880/60000 (18%)]\tloss=0.9105\n", + "2023-09-08T21:03:17Z INFO Train Epoch: 0 [11200/60000 (19%)]\tloss=0.6360\n", + "2023-09-08T21:03:19Z INFO Train Epoch: 0 [11520/60000 (19%)]\tloss=0.9720\n", + "2023-09-08T21:03:21Z INFO Train Epoch: 0 [11840/60000 (20%)]\tloss=1.1181\n", + "2023-09-08T21:03:23Z INFO Train Epoch: 0 [12160/60000 (20%)]\tloss=0.9001\n", + "2023-09-08T21:03:25Z INFO Train Epoch: 0 [12480/60000 (21%)]\tloss=0.6984\n", + "2023-09-08T21:03:27Z INFO Train Epoch: 0 [12800/60000 (21%)]\tloss=0.7768\n", + "2023-09-08T21:03:30Z INFO Train Epoch: 0 [13120/60000 (22%)]\tloss=1.1038\n", + "2023-09-08T21:03:32Z INFO Train Epoch: 0 [13440/60000 (22%)]\tloss=0.8548\n", + "2023-09-08T21:03:34Z INFO Train Epoch: 0 [13760/60000 (23%)]\tloss=0.8793\n", + "2023-09-08T21:03:37Z INFO Train Epoch: 0 [14080/60000 (23%)]\tloss=0.8937\n", + "2023-09-08T21:03:39Z INFO Train Epoch: 0 [14400/60000 (24%)]\tloss=0.8367\n", + "2023-09-08T21:03:42Z INFO Train Epoch: 0 [14720/60000 (25%)]\tloss=0.6917\n", + "2023-09-08T21:03:45Z INFO Train Epoch: 0 [15040/60000 (25%)]\tloss=0.8002\n", + "2023-09-08T21:03:47Z INFO Train Epoch: 0 [15360/60000 (26%)]\tloss=0.9557\n", + "2023-09-08T21:03:48Z INFO Train Epoch: 0 [15680/60000 (26%)]\tloss=0.7246\n", + "2023-09-08T21:03:50Z INFO Train Epoch: 0 [16000/60000 (27%)]\tloss=1.0920\n", + "2023-09-08T21:03:52Z INFO Train Epoch: 0 [16320/60000 (27%)]\tloss=0.4943\n", + "2023-09-08T21:03:54Z INFO Train Epoch: 0 [16640/60000 (28%)]\tloss=0.9251\n", + "2023-09-08T21:03:55Z INFO Train Epoch: 0 [16960/60000 (28%)]\tloss=0.6982\n", + "2023-09-08T21:03:58Z INFO Train Epoch: 0 [17280/60000 (29%)]\tloss=0.7784\n", + "2023-09-08T21:04:00Z INFO Train Epoch: 0 [17600/60000 (29%)]\tloss=0.6317\n", + "2023-09-08T21:04:02Z INFO Train Epoch: 0 [17920/60000 (30%)]\tloss=0.6022\n", + "2023-09-08T21:04:04Z INFO Train Epoch: 0 [18240/60000 (30%)]\tloss=1.1098\n", + "2023-09-08T21:04:06Z INFO Train Epoch: 0 [18560/60000 (31%)]\tloss=1.1230\n", + "2023-09-08T21:04:08Z INFO Train Epoch: 0 [18880/60000 (31%)]\tloss=0.7113\n", + "2023-09-08T21:04:10Z INFO Train Epoch: 0 [19200/60000 (32%)]\tloss=0.5611\n", + "2023-09-08T21:04:12Z INFO Train Epoch: 0 [19520/60000 (33%)]\tloss=0.8134\n", + "2023-09-08T21:04:14Z INFO Train Epoch: 0 [19840/60000 (33%)]\tloss=0.8513\n", + "2023-09-08T21:04:16Z INFO Train Epoch: 0 [20160/60000 (34%)]\tloss=1.1050\n", + "2023-09-08T21:04:18Z INFO Train Epoch: 0 [20480/60000 (34%)]\tloss=0.5541\n", + "2023-09-08T21:04:20Z INFO Train Epoch: 0 [20800/60000 (35%)]\tloss=0.9637\n", + "2023-09-08T21:04:22Z INFO Train Epoch: 0 [21120/60000 (35%)]\tloss=0.4796\n", + "2023-09-08T21:04:24Z INFO Train Epoch: 0 [21440/60000 (36%)]\tloss=0.9878\n", + "2023-09-08T21:04:26Z INFO Train Epoch: 0 [21760/60000 (36%)]\tloss=0.6691\n", + "2023-09-08T21:04:28Z INFO Train Epoch: 0 [22080/60000 (37%)]\tloss=0.7739\n", + "2023-09-08T21:04:31Z INFO Train Epoch: 0 
[22400/60000 (37%)]\tloss=0.5405\n", + "2023-09-08T21:04:32Z INFO Train Epoch: 0 [22720/60000 (38%)]\tloss=0.6155\n", + "2023-09-08T21:04:35Z INFO Train Epoch: 0 [23040/60000 (38%)]\tloss=1.0303\n", + "2023-09-08T21:04:37Z INFO Train Epoch: 0 [23360/60000 (39%)]\tloss=0.5421\n", + "2023-09-08T21:04:39Z INFO Train Epoch: 0 [23680/60000 (39%)]\tloss=0.7717\n", + "2023-09-08T21:04:41Z INFO Train Epoch: 0 [24000/60000 (40%)]\tloss=0.8697\n", + "2023-09-08T21:04:43Z INFO Train Epoch: 0 [24320/60000 (41%)]\tloss=0.7996\n", + "2023-09-08T21:04:44Z INFO Train Epoch: 0 [24640/60000 (41%)]\tloss=0.6494\n", + "2023-09-08T21:04:46Z INFO Train Epoch: 0 [24960/60000 (42%)]\tloss=0.7669\n", + "2023-09-08T21:04:48Z INFO Train Epoch: 0 [25280/60000 (42%)]\tloss=0.4775\n", + "2023-09-08T21:04:50Z INFO Train Epoch: 0 [25600/60000 (43%)]\tloss=0.7363\n", + "2023-09-08T21:04:51Z INFO Train Epoch: 0 [25920/60000 (43%)]\tloss=0.5954\n", + "2023-09-08T21:04:53Z INFO Train Epoch: 0 [26240/60000 (44%)]\tloss=0.9329\n", + "2023-09-08T21:04:55Z INFO Train Epoch: 0 [26560/60000 (44%)]\tloss=0.7000\n", + "2023-09-08T21:04:57Z INFO Train Epoch: 0 [26880/60000 (45%)]\tloss=0.5993\n", + "2023-09-08T21:04:59Z INFO Train Epoch: 0 [27200/60000 (45%)]\tloss=0.9582\n", + "2023-09-08T21:05:01Z INFO Train Epoch: 0 [27520/60000 (46%)]\tloss=0.4871\n", + "2023-09-08T21:05:03Z INFO Train Epoch: 0 [27840/60000 (46%)]\tloss=0.6944\n", + "2023-09-08T21:05:06Z INFO Train Epoch: 0 [28160/60000 (47%)]\tloss=0.7795\n", + "2023-09-08T21:05:08Z INFO Train Epoch: 0 [28480/60000 (47%)]\tloss=0.7967\n", + "2023-09-08T21:05:10Z INFO Train Epoch: 0 [28800/60000 (48%)]\tloss=0.9489\n", + "2023-09-08T21:05:12Z INFO Train Epoch: 0 [29120/60000 (49%)]\tloss=0.6331\n", + "2023-09-08T21:05:14Z INFO Train Epoch: 0 [29440/60000 (49%)]\tloss=0.9203\n", + "2023-09-08T21:05:16Z INFO Train Epoch: 0 [29760/60000 (50%)]\tloss=0.7250\n", + "2023-09-08T21:05:18Z INFO Train Epoch: 0 [30080/60000 (50%)]\tloss=1.0080\n", + "2023-09-08T21:05:20Z INFO Train Epoch: 0 [30400/60000 (51%)]\tloss=0.6063\n", + "2023-09-08T21:05:23Z INFO Train Epoch: 0 [30720/60000 (51%)]\tloss=0.6403\n", + "2023-09-08T21:05:24Z INFO Train Epoch: 0 [31040/60000 (52%)]\tloss=0.4953\n", + "2023-09-08T21:05:26Z INFO Train Epoch: 0 [31360/60000 (52%)]\tloss=0.4997\n", + "2023-09-08T21:05:28Z INFO Train Epoch: 0 [31680/60000 (53%)]\tloss=0.7053\n", + "2023-09-08T21:05:30Z INFO Train Epoch: 0 [32000/60000 (53%)]\tloss=0.7847\n", + "2023-09-08T21:05:32Z INFO Train Epoch: 0 [32320/60000 (54%)]\tloss=0.5874\n", + "2023-09-08T21:05:34Z INFO Train Epoch: 0 [32640/60000 (54%)]\tloss=0.6826\n", + "2023-09-08T21:05:36Z INFO Train Epoch: 0 [32960/60000 (55%)]\tloss=0.5787\n", + "2023-09-08T21:05:39Z INFO Train Epoch: 0 [33280/60000 (55%)]\tloss=0.5482\n", + "2023-09-08T21:05:41Z INFO Train Epoch: 0 [33600/60000 (56%)]\tloss=0.5237\n", + "2023-09-08T21:05:42Z INFO Train Epoch: 0 [33920/60000 (57%)]\tloss=0.4103\n", + "2023-09-08T21:05:44Z INFO Train Epoch: 0 [34240/60000 (57%)]\tloss=0.4330\n", + "2023-09-08T21:05:46Z INFO Train Epoch: 0 [34560/60000 (58%)]\tloss=0.3828\n", + "2023-09-08T21:05:48Z INFO Train Epoch: 0 [34880/60000 (58%)]\tloss=0.6742\n", + "2023-09-08T21:05:49Z INFO Train Epoch: 0 [35200/60000 (59%)]\tloss=0.5098\n", + "2023-09-08T21:05:51Z INFO Train Epoch: 0 [35520/60000 (59%)]\tloss=0.5187\n", + "2023-09-08T21:05:53Z INFO Train Epoch: 0 [35840/60000 (60%)]\tloss=0.5226\n", + "2023-09-08T21:05:54Z INFO Train Epoch: 0 [36160/60000 (60%)]\tloss=0.7099\n", + "2023-09-08T21:05:56Z INFO Train 
Epoch: 0 [36480/60000 (61%)]\tloss=0.6922\n", + "2023-09-08T21:05:59Z INFO Train Epoch: 0 [36800/60000 (61%)]\tloss=0.6208\n", + "2023-09-08T21:06:01Z INFO Train Epoch: 0 [37120/60000 (62%)]\tloss=0.7056\n", + "2023-09-08T21:06:03Z INFO Train Epoch: 0 [37440/60000 (62%)]\tloss=0.5346\n", + "2023-09-08T21:06:05Z INFO Train Epoch: 0 [37760/60000 (63%)]\tloss=0.4693\n", + "2023-09-08T21:06:07Z INFO Train Epoch: 0 [38080/60000 (63%)]\tloss=0.8529\n", + "2023-09-08T21:06:10Z INFO Train Epoch: 0 [38400/60000 (64%)]\tloss=0.6755\n", + "2023-09-08T21:06:11Z INFO Train Epoch: 0 [38720/60000 (65%)]\tloss=0.5663\n", + "2023-09-08T21:06:13Z INFO Train Epoch: 0 [39040/60000 (65%)]\tloss=0.5107\n", + "2023-09-08T21:06:15Z INFO Train Epoch: 0 [39360/60000 (66%)]\tloss=0.4245\n", + "2023-09-08T21:06:17Z INFO Train Epoch: 0 [39680/60000 (66%)]\tloss=0.5797\n", + "2023-09-08T21:06:19Z INFO Train Epoch: 0 [40000/60000 (67%)]\tloss=0.5011\n", + "2023-09-08T21:06:20Z INFO Train Epoch: 0 [40320/60000 (67%)]\tloss=0.4641\n", + "2023-09-08T21:06:22Z INFO Train Epoch: 0 [40640/60000 (68%)]\tloss=0.2431\n", + "2023-09-08T21:06:24Z INFO Train Epoch: 0 [40960/60000 (68%)]\tloss=0.5040\n", + "2023-09-08T21:06:26Z INFO Train Epoch: 0 [41280/60000 (69%)]\tloss=0.6674\n", + "2023-09-08T21:06:29Z INFO Train Epoch: 0 [41600/60000 (69%)]\tloss=0.8426\n", + "2023-09-08T21:06:31Z INFO Train Epoch: 0 [41920/60000 (70%)]\tloss=0.5418\n", + "2023-09-08T21:06:33Z INFO Train Epoch: 0 [42240/60000 (70%)]\tloss=0.6396\n", + "2023-09-08T21:06:35Z INFO Train Epoch: 0 [42560/60000 (71%)]\tloss=0.4182\n", + "2023-09-08T21:06:38Z INFO Train Epoch: 0 [42880/60000 (71%)]\tloss=0.7471\n", + "2023-09-08T21:06:40Z INFO Train Epoch: 0 [43200/60000 (72%)]\tloss=0.6492\n", + "2023-09-08T21:06:42Z INFO Train Epoch: 0 [43520/60000 (73%)]\tloss=0.3955\n", + "2023-09-08T21:06:44Z INFO Train Epoch: 0 [43840/60000 (73%)]\tloss=0.5986\n", + "2023-09-08T21:06:46Z INFO Train Epoch: 0 [44160/60000 (74%)]\tloss=0.5604\n", + "2023-09-08T21:06:48Z INFO Train Epoch: 0 [44480/60000 (74%)]\tloss=0.4396\n", + "2023-09-08T21:06:50Z INFO Train Epoch: 0 [44800/60000 (75%)]\tloss=0.5718\n", + "2023-09-08T21:06:52Z INFO Train Epoch: 0 [45120/60000 (75%)]\tloss=0.5190\n", + "2023-09-08T21:06:54Z INFO Train Epoch: 0 [45440/60000 (76%)]\tloss=0.7500\n", + "2023-09-08T21:06:56Z INFO Train Epoch: 0 [45760/60000 (76%)]\tloss=0.4298\n", + "2023-09-08T21:06:58Z INFO Train Epoch: 0 [46080/60000 (77%)]\tloss=0.5909\n", + "2023-09-08T21:07:00Z INFO Train Epoch: 0 [46400/60000 (77%)]\tloss=0.4499\n", + "2023-09-08T21:07:02Z INFO Train Epoch: 0 [46720/60000 (78%)]\tloss=0.6639\n", + "2023-09-08T21:07:05Z INFO Train Epoch: 0 [47040/60000 (78%)]\tloss=0.3891\n", + "2023-09-08T21:07:08Z INFO Train Epoch: 0 [47360/60000 (79%)]\tloss=0.5912\n", + "2023-09-08T21:07:10Z INFO Train Epoch: 0 [47680/60000 (79%)]\tloss=0.4047\n", + "2023-09-08T21:07:12Z INFO Train Epoch: 0 [48000/60000 (80%)]\tloss=0.5517\n", + "2023-09-08T21:07:14Z INFO Train Epoch: 0 [48320/60000 (81%)]\tloss=0.5204\n", + "2023-09-08T21:07:17Z INFO Train Epoch: 0 [48640/60000 (81%)]\tloss=0.7532\n", + "2023-09-08T21:07:19Z INFO Train Epoch: 0 [48960/60000 (82%)]\tloss=0.6107\n", + "2023-09-08T21:07:20Z INFO Train Epoch: 0 [49280/60000 (82%)]\tloss=0.6882\n", + "2023-09-08T21:07:22Z INFO Train Epoch: 0 [49600/60000 (83%)]\tloss=0.3215\n", + "2023-09-08T21:07:24Z INFO Train Epoch: 0 [49920/60000 (83%)]\tloss=0.3356\n", + "2023-09-08T21:07:26Z INFO Train Epoch: 0 [50240/60000 (84%)]\tloss=0.4973\n", + "2023-09-08T21:07:28Z 
INFO Train Epoch: 0 [50560/60000 (84%)]\tloss=0.8383\n", + "2023-09-08T21:07:31Z INFO Train Epoch: 0 [50880/60000 (85%)]\tloss=0.4020\n", + "2023-09-08T21:07:32Z INFO Train Epoch: 0 [51200/60000 (85%)]\tloss=0.4866\n", + "2023-09-08T21:07:34Z INFO Train Epoch: 0 [51520/60000 (86%)]\tloss=0.4938\n", + "2023-09-08T21:07:36Z INFO Train Epoch: 0 [51840/60000 (86%)]\tloss=0.7432\n", + "2023-09-08T21:07:38Z INFO Train Epoch: 0 [52160/60000 (87%)]\tloss=0.4650\n", + "2023-09-08T21:07:40Z INFO Train Epoch: 0 [52480/60000 (87%)]\tloss=0.8149\n", + "2023-09-08T21:07:41Z INFO Train Epoch: 0 [52800/60000 (88%)]\tloss=0.5370\n", + "2023-09-08T21:07:43Z INFO Train Epoch: 0 [53120/60000 (89%)]\tloss=0.7261\n", + "2023-09-08T21:07:46Z INFO Train Epoch: 0 [53440/60000 (89%)]\tloss=0.6188\n", + "2023-09-08T21:07:48Z INFO Train Epoch: 0 [53760/60000 (90%)]\tloss=0.5179\n", + "2023-09-08T21:07:51Z INFO Train Epoch: 0 [54080/60000 (90%)]\tloss=0.7616\n", + "2023-09-08T21:07:53Z INFO Train Epoch: 0 [54400/60000 (91%)]\tloss=0.7180\n", + "2023-09-08T21:07:55Z INFO Train Epoch: 0 [54720/60000 (91%)]\tloss=0.4831\n", + "2023-09-08T21:07:56Z INFO Train Epoch: 0 [55040/60000 (92%)]\tloss=0.3719\n", + "2023-09-08T21:07:59Z INFO Train Epoch: 0 [55360/60000 (92%)]\tloss=0.4730\n", + "2023-09-08T21:08:01Z INFO Train Epoch: 0 [55680/60000 (93%)]\tloss=0.5402\n", + "2023-09-08T21:08:02Z INFO Train Epoch: 0 [56000/60000 (93%)]\tloss=0.7432\n", + "2023-09-08T21:08:04Z INFO Train Epoch: 0 [56320/60000 (94%)]\tloss=0.6275\n", + "2023-09-08T21:08:06Z INFO Train Epoch: 0 [56640/60000 (94%)]\tloss=0.3235\n", + "2023-09-08T21:08:07Z INFO Train Epoch: 0 [56960/60000 (95%)]\tloss=0.7855\n", + "2023-09-08T21:08:09Z INFO Train Epoch: 0 [57280/60000 (95%)]\tloss=0.5046\n", + "2023-09-08T21:08:11Z INFO Train Epoch: 0 [57600/60000 (96%)]\tloss=0.5732\n", + "2023-09-08T21:08:13Z INFO Train Epoch: 0 [57920/60000 (97%)]\tloss=0.2879\n", + "2023-09-08T21:08:15Z INFO Train Epoch: 0 [58240/60000 (97%)]\tloss=0.4233\n", + "2023-09-08T21:08:18Z INFO Train Epoch: 0 [58560/60000 (98%)]\tloss=0.5561\n", + "2023-09-08T21:08:20Z INFO Train Epoch: 0 [58880/60000 (98%)]\tloss=0.6785\n", + "2023-09-08T21:08:21Z INFO Train Epoch: 0 [59200/60000 (99%)]\tloss=0.3826\n", + "2023-09-08T21:08:23Z INFO Train Epoch: 0 [59520/60000 (99%)]\tloss=0.5397\n", + "2023-09-08T21:08:26Z INFO Train Epoch: 0 [59840/60000 (100%)]\tloss=0.5987\n", + "\n" + ] + } + ], + "source": [ + "training_client.get_job_logs(pytorchjob_name)" + ] + }, + { + "cell_type": "markdown", + "id": "17b0ca43-1936-4708-b03b-3ab9ac2bbdea", + "metadata": {}, + "source": [ + "## Delete PyTorchJob\n", + "\n", + "When PyTorchJob is finished, you can delete the resource." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "32ae88fd-5b5d-4ba1-a560-9a35c5ac17de", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-08T22:10:29Z INFO PyTorchJob default/train-pytorch has been deleted\n" + ] + } + ], + "source": [ + "training_client.delete_job(pytorchjob_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9641e9f-551d-44d5-872b-002fffaedcef", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb b/examples/sdk/create-pytorchjob.ipynb similarity index 65% rename from sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb rename to examples/sdk/create-pytorchjob.ipynb index 0c2c28e02d..d960043ad3 100644 --- a/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb +++ b/examples/sdk/create-pytorchjob.ipynb @@ -8,7 +8,7 @@ } }, "source": [ - "# Sample for Kubeflow PyTorchJob SDK" + "# Create PyTorchJob using Kubeflow Training SDK" ] }, { @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": { "pycharm": { "name": "#%%\n" @@ -60,11 +60,13 @@ "from kubernetes.client import V1PodSpec\n", "from kubernetes.client import V1Container\n", "\n", - "from kubeflow.training import V1ReplicaSpec\n", + "from kubeflow.training import KubeflowOrgV1ReplicaSpec\n", "from kubeflow.training import KubeflowOrgV1PyTorchJob\n", "from kubeflow.training import KubeflowOrgV1PyTorchJobSpec\n", - "from kubeflow.training import V1RunPolicy\n", - "from kubeflow.training import TrainingClient" + "from kubeflow.training import KubeflowOrgV1RunPolicy\n", + "from kubeflow.training import TrainingClient\n", + "\n", + "from kubeflow.training import constants" ] }, { @@ -91,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 3, "metadata": { "pycharm": { "name": "#%%\n" @@ -109,7 +111,7 @@ " args=[\"--backend\", \"gloo\"],\n", ")\n", "\n", - "replica_spec = V1ReplicaSpec(\n", + "replica_spec = KubeflowOrgV1ReplicaSpec(\n", " replicas=1,\n", " restart_policy=\"OnFailure\",\n", " template=V1PodTemplateSpec(\n", @@ -121,23 +123,17 @@ " }\n", " ),\n", " spec=V1PodSpec(\n", - " containers=[\n", - " V1Container(\n", - " name=container_name,\n", - " image=\"gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0\",\n", - " args=[\"--backend\", \"gloo\"],\n", - " )\n", - " ]\n", + " containers=[container]\n", " )\n", " )\n", ")\n", "\n", "pytorchjob = KubeflowOrgV1PyTorchJob(\n", - " api_version=\"kubeflow.org/v1\",\n", - " kind=\"PyTorchJob\",\n", + " api_version=constants.API_VERSION,\n", + " kind=constants.PYTORCHJOB_KIND,\n", " metadata=V1ObjectMeta(name=name, namespace=namespace),\n", " spec=KubeflowOrgV1PyTorchJobSpec(\n", - " run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n", + " run_policy=KubeflowOrgV1RunPolicy(clean_pod_policy=\"None\"),\n", " pytorch_replica_specs={\n", " \"Master\": replica_spec,\n", " \"Worker\": replica_spec\n", @@ -156,12 +152,12 @@ "source": [ "## Create PyTorchJob\n", "\n", - "You have to create Training Client 
to deploy you PyTorchJob in you cluster."
+    "You have to create a Training Client to deploy your PyTorchJob in your cluster."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 4,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
@@ -177,8 +173,11 @@
    }
   ],
   "source": [
-    "training_client = TrainingClient()\n",
-    "training_client.create_pytorchjob(pytorchjob, namespace=namespace)"
+    "# Namespace will be reused in every API call.\n",
+    "training_client = TrainingClient(namespace=namespace)\n",
+    "\n",
+    "# If `job_kind` is not set in `TrainingClient`, we need to set it for each API.\n",
+    "training_client.create_job(pytorchjob, job_kind=constants.PYTORCHJOB_KIND)"
   ]
  },
  {
@@ -196,7 +195,7 @@
 },
 {
  "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 5,
  "metadata": {
   "pycharm": {
    "name": "#%%\n"
@@ -209,13 +208,13 @@
      "'pytorch-dist-mnist-gloo'"
     ]
    },
-    "execution_count": 39,
+    "execution_count": 5,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
-   "training_client.get_pytorchjob(name).metadata.name"
+   "training_client.get_job(name, job_kind=constants.PYTORCHJOB_KIND).metadata.name"
  ]
 },
 {
@@ -231,7 +230,7 @@
 },
 {
  "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 7,
  "metadata": {
   "pycharm": {
    "name": "#%%\n"
@@ -241,27 +240,27 @@
 {
  "data": {
   "text/plain": [
-    "[{'last_transition_time': datetime.datetime(2023, 1, 12, 18, 30, 13, tzinfo=tzlocal()),\n",
-    " 'last_update_time': datetime.datetime(2023, 1, 12, 18, 30, 13, tzinfo=tzlocal()),\n",
+    "[{'last_transition_time': datetime.datetime(2023, 9, 8, 21, 14, 59, tzinfo=tzutc()),\n",
+    " 'last_update_time': datetime.datetime(2023, 9, 8, 21, 14, 59, tzinfo=tzutc()),\n",
    " 'message': 'PyTorchJob pytorch-dist-mnist-gloo is created.',\n",
    " 'reason': 'PyTorchJobCreated',\n",
    " 'status': 'True',\n",
    " 'type': 'Created'},\n",
-    " {'last_transition_time': datetime.datetime(2023, 1, 12, 18, 30, 18, tzinfo=tzlocal()),\n",
-    " 'last_update_time': datetime.datetime(2023, 1, 12, 18, 30, 18, tzinfo=tzlocal()),\n",
+    " {'last_transition_time': datetime.datetime(2023, 9, 8, 21, 15, 45, tzinfo=tzutc()),\n",
+    " 'last_update_time': datetime.datetime(2023, 9, 8, 21, 15, 45, tzinfo=tzutc()),\n",
    " 'message': 'PyTorchJob pytorch-dist-mnist-gloo is running.',\n",
    " 'reason': 'JobRunning',\n",
    " 'status': 'True',\n",
    " 'type': 'Running'}]"
   ]
  },
-   "execution_count": 40,
+   "execution_count": 7,
  "metadata": {},
  "output_type": "execute_result"
 }
 ],
 "source": [
-   "training_client.get_job_conditions(name=name, namespace=namespace, job_kind=\"PyTorchJob\")"
+   "training_client.get_job_conditions(name=name, job_kind=constants.PYTORCHJOB_KIND)"
 ]
 },
 {
@@ -277,27 +276,33 @@
 },
 {
  "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 8,
  "metadata": {
   "pycharm": {
    "name": "#%%\n"
   }
  },
  "outputs": [
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "NAME                          STATE                TIME\n",
+     "pytorch-dist-mnist-gloo       Running              2023-09-08 21:15:45+00:00\n",
+     "pytorch-dist-mnist-gloo       Running              2023-09-08 21:15:45+00:00\n",
+     "pytorch-dist-mnist-gloo       Succeeded            2023-09-08 21:26:44+00:00\n"
+    ]
+   },
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
-    "pytorch-dist-mnist-gloo Running 2023-01-12 18:30:18+00:00\n",
-    "pytorch-dist-mnist-gloo Running 2023-01-12 18:30:18+00:00\n",
-    "pytorch-dist-mnist-gloo Running 2023-01-12 18:30:18+00:00\n",
-    "pytorch-dist-mnist-gloo Succeeded 2023-01-12 18:36:48+00:00\n",
    "Succeeded number of replicas: 1\n"
   ]
  }
 ],
 "source": [
-   "pytorchjob = training_client.wait_for_job_conditions(name=name,
namespace=namespace, job_kind=\"PyTorchJob\")\n", + "pytorchjob = training_client.wait_for_job_conditions(name=name, job_kind=constants.PYTORCHJOB_KIND)\n", "\n", "print(f\"Succeeded number of replicas: {pytorchjob.status.replica_statuses['Master'].succeeded}\")" ] @@ -315,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 9, "metadata": { "pycharm": { "name": "#%%\n" @@ -328,13 +333,13 @@ "True" ] }, - "execution_count": 42, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "training_client.is_job_succeeded(name=name, namespace=namespace, job_kind=\"PyTorchJob\")" + "training_client.is_job_succeeded(name=name, job_kind=constants.PYTORCHJOB_KIND)" ] }, { @@ -350,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 10, "metadata": { "pycharm": { "name": "#%%\n" @@ -403,75 +408,75 @@ "Train Epoch: 1 [19840/60000 (33%)]\tloss=0.1191\n", "Train Epoch: 1 [20480/60000 (34%)]\tloss=0.1905\n", "Train Epoch: 1 [21120/60000 (35%)]\tloss=0.1408\n", - "Train Epoch: 1 [21760/60000 (36%)]\tloss=0.3150\n", - "Train Epoch: 1 [22400/60000 (37%)]\tloss=0.1506\n", - "Train Epoch: 1 [23040/60000 (38%)]\tloss=0.2899\n", - "Train Epoch: 1 [23680/60000 (39%)]\tloss=0.4676\n", - "Train Epoch: 1 [24320/60000 (41%)]\tloss=0.2157\n", - "Train Epoch: 1 [24960/60000 (42%)]\tloss=0.1520\n", - "Train Epoch: 1 [25600/60000 (43%)]\tloss=0.2244\n", - "Train Epoch: 1 [26240/60000 (44%)]\tloss=0.2632\n", + "Train Epoch: 1 [21760/60000 (36%)]\tloss=0.3147\n", + "Train Epoch: 1 [22400/60000 (37%)]\tloss=0.1505\n", + "Train Epoch: 1 [23040/60000 (38%)]\tloss=0.2898\n", + "Train Epoch: 1 [23680/60000 (39%)]\tloss=0.4685\n", + "Train Epoch: 1 [24320/60000 (41%)]\tloss=0.2158\n", + "Train Epoch: 1 [24960/60000 (42%)]\tloss=0.1521\n", + "Train Epoch: 1 [25600/60000 (43%)]\tloss=0.2248\n", + "Train Epoch: 1 [26240/60000 (44%)]\tloss=0.2623\n", "Train Epoch: 1 [26880/60000 (45%)]\tloss=0.2335\n", - "Train Epoch: 1 [27520/60000 (46%)]\tloss=0.2619\n", + "Train Epoch: 1 [27520/60000 (46%)]\tloss=0.2623\n", "Train Epoch: 1 [28160/60000 (47%)]\tloss=0.2126\n", - "Train Epoch: 1 [28800/60000 (48%)]\tloss=0.1324\n", - "Train Epoch: 1 [29440/60000 (49%)]\tloss=0.2795\n", - "Train Epoch: 1 [30080/60000 (50%)]\tloss=0.0951\n", - "Train Epoch: 1 [30720/60000 (51%)]\tloss=0.1284\n", - "Train Epoch: 1 [31360/60000 (52%)]\tloss=0.2461\n", - "Train Epoch: 1 [32000/60000 (53%)]\tloss=0.3394\n", - "Train Epoch: 1 [32640/60000 (54%)]\tloss=0.1517\n", + "Train Epoch: 1 [28800/60000 (48%)]\tloss=0.1328\n", + "Train Epoch: 1 [29440/60000 (49%)]\tloss=0.2779\n", + "Train Epoch: 1 [30080/60000 (50%)]\tloss=0.0943\n", + "Train Epoch: 1 [30720/60000 (51%)]\tloss=0.1285\n", + "Train Epoch: 1 [31360/60000 (52%)]\tloss=0.2455\n", + "Train Epoch: 1 [32000/60000 (53%)]\tloss=0.3396\n", + "Train Epoch: 1 [32640/60000 (54%)]\tloss=0.1523\n", "Train Epoch: 1 [33280/60000 (55%)]\tloss=0.0916\n", - "Train Epoch: 1 [33920/60000 (57%)]\tloss=0.1449\n", - "Train Epoch: 1 [34560/60000 (58%)]\tloss=0.1978\n", - "Train Epoch: 1 [35200/60000 (59%)]\tloss=0.2189\n", - "Train Epoch: 1 [35840/60000 (60%)]\tloss=0.0637\n", - "Train Epoch: 1 [36480/60000 (61%)]\tloss=0.1368\n", - "Train Epoch: 1 [37120/60000 (62%)]\tloss=0.1153\n", - "Train Epoch: 1 [37760/60000 (63%)]\tloss=0.2358\n", - "Train Epoch: 1 [38400/60000 (64%)]\tloss=0.0631\n", - "Train Epoch: 1 [39040/60000 (65%)]\tloss=0.1063\n", - "Train Epoch: 1 [39680/60000 (66%)]\tloss=0.1602\n", - "Train Epoch: 1 
[40320/60000 (67%)]\tloss=0.1098\n", - "Train Epoch: 1 [40960/60000 (68%)]\tloss=0.1781\n", - "Train Epoch: 1 [41600/60000 (69%)]\tloss=0.2297\n", - "Train Epoch: 1 [42240/60000 (70%)]\tloss=0.0735\n", - "Train Epoch: 1 [42880/60000 (71%)]\tloss=0.1562\n", - "Train Epoch: 1 [43520/60000 (72%)]\tloss=0.2771\n", - "Train Epoch: 1 [44160/60000 (74%)]\tloss=0.1429\n", - "Train Epoch: 1 [44800/60000 (75%)]\tloss=0.1172\n", - "Train Epoch: 1 [45440/60000 (76%)]\tloss=0.1202\n", + "Train Epoch: 1 [33920/60000 (57%)]\tloss=0.1448\n", + "Train Epoch: 1 [34560/60000 (58%)]\tloss=0.1989\n", + "Train Epoch: 1 [35200/60000 (59%)]\tloss=0.2183\n", + "Train Epoch: 1 [35840/60000 (60%)]\tloss=0.0638\n", + "Train Epoch: 1 [36480/60000 (61%)]\tloss=0.1373\n", + "Train Epoch: 1 [37120/60000 (62%)]\tloss=0.1147\n", + "Train Epoch: 1 [37760/60000 (63%)]\tloss=0.2355\n", + "Train Epoch: 1 [38400/60000 (64%)]\tloss=0.0636\n", + "Train Epoch: 1 [39040/60000 (65%)]\tloss=0.1065\n", + "Train Epoch: 1 [39680/60000 (66%)]\tloss=0.1599\n", + "Train Epoch: 1 [40320/60000 (67%)]\tloss=0.1090\n", + "Train Epoch: 1 [40960/60000 (68%)]\tloss=0.1774\n", + "Train Epoch: 1 [41600/60000 (69%)]\tloss=0.2307\n", + "Train Epoch: 1 [42240/60000 (70%)]\tloss=0.0736\n", + "Train Epoch: 1 [42880/60000 (71%)]\tloss=0.1553\n", + "Train Epoch: 1 [43520/60000 (72%)]\tloss=0.2793\n", + "Train Epoch: 1 [44160/60000 (74%)]\tloss=0.1428\n", + "Train Epoch: 1 [44800/60000 (75%)]\tloss=0.1179\n", + "Train Epoch: 1 [45440/60000 (76%)]\tloss=0.1205\n", "Train Epoch: 1 [46080/60000 (77%)]\tloss=0.0767\n", - "Train Epoch: 1 [46720/60000 (78%)]\tloss=0.1938\n", - "Train Epoch: 1 [47360/60000 (79%)]\tloss=0.0699\n", - "Train Epoch: 1 [48000/60000 (80%)]\tloss=0.2114\n", - "Train Epoch: 1 [48640/60000 (81%)]\tloss=0.1373\n", - "Train Epoch: 1 [49280/60000 (82%)]\tloss=0.0934\n", - "Train Epoch: 1 [49920/60000 (83%)]\tloss=0.1075\n", - "Train Epoch: 1 [50560/60000 (84%)]\tloss=0.1185\n", - "Train Epoch: 1 [51200/60000 (85%)]\tloss=0.1457\n", - "Train Epoch: 1 [51840/60000 (86%)]\tloss=0.0694\n", - "Train Epoch: 1 [52480/60000 (87%)]\tloss=0.0242\n", - "Train Epoch: 1 [53120/60000 (88%)]\tloss=0.2635\n", + "Train Epoch: 1 [46720/60000 (78%)]\tloss=0.1946\n", + "Train Epoch: 1 [47360/60000 (79%)]\tloss=0.0703\n", + "Train Epoch: 1 [48000/60000 (80%)]\tloss=0.2094\n", + "Train Epoch: 1 [48640/60000 (81%)]\tloss=0.1378\n", + "Train Epoch: 1 [49280/60000 (82%)]\tloss=0.0950\n", + "Train Epoch: 1 [49920/60000 (83%)]\tloss=0.1066\n", + "Train Epoch: 1 [50560/60000 (84%)]\tloss=0.1182\n", + "Train Epoch: 1 [51200/60000 (85%)]\tloss=0.1455\n", + "Train Epoch: 1 [51840/60000 (86%)]\tloss=0.0684\n", + "Train Epoch: 1 [52480/60000 (87%)]\tloss=0.0241\n", + "Train Epoch: 1 [53120/60000 (88%)]\tloss=0.2626\n", "Train Epoch: 1 [53760/60000 (90%)]\tloss=0.0922\n", - "Train Epoch: 1 [54400/60000 (91%)]\tloss=0.1287\n", - "Train Epoch: 1 [55040/60000 (92%)]\tloss=0.1908\n", - "Train Epoch: 1 [55680/60000 (93%)]\tloss=0.0350\n", - "Train Epoch: 1 [56320/60000 (94%)]\tloss=0.0359\n", - "Train Epoch: 1 [56960/60000 (95%)]\tloss=0.0762\n", - "Train Epoch: 1 [57600/60000 (96%)]\tloss=0.1173\n", - "Train Epoch: 1 [58240/60000 (97%)]\tloss=0.1948\n", - "Train Epoch: 1 [58880/60000 (98%)]\tloss=0.2035\n", - "Train Epoch: 1 [59520/60000 (99%)]\tloss=0.0639\n", + "Train Epoch: 1 [54400/60000 (91%)]\tloss=0.1301\n", + "Train Epoch: 1 [55040/60000 (92%)]\tloss=0.1921\n", + "Train Epoch: 1 [55680/60000 (93%)]\tloss=0.0346\n", + "Train Epoch: 1 [56320/60000 (94%)]\tloss=0.0358\n", + 
"Train Epoch: 1 [56960/60000 (95%)]\tloss=0.0767\n", + "Train Epoch: 1 [57600/60000 (96%)]\tloss=0.1167\n", + "Train Epoch: 1 [58240/60000 (97%)]\tloss=0.1932\n", + "Train Epoch: 1 [58880/60000 (98%)]\tloss=0.2062\n", + "Train Epoch: 1 [59520/60000 (99%)]\tloss=0.0647\n", "\n", - "accuracy=0.9665\n", + "accuracy=0.9669\n", "\n", "\n" ] } ], "source": [ - "training_client.get_job_logs(name=name, namespace=namespace, container=container_name)" + "training_client.get_job_logs(name=name, job_kind=constants.PYTORCHJOB_KIND)" ] }, { @@ -487,7 +492,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 11, "metadata": { "pycharm": { "name": "#%%\n" @@ -503,7 +508,7 @@ } ], "source": [ - "training_client.delete_pytorchjob(name)" + "training_client.delete_job(name)" ] }, { @@ -530,7 +535,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.9.12" } }, "nbformat": 4, diff --git a/examples/sdk/create-tfjob.ipynb b/examples/sdk/create-tfjob.ipynb new file mode 100644 index 0000000000..182e977ea4 --- /dev/null +++ b/examples/sdk/create-tfjob.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Create TFJob using Kubeflow Training SDK" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "This is a sample for Kubeflow Training SDK `kubeflow-training`.\n", + "\n", + "The notebook shows how to use Kubeflow TFJob SDK to create, get, wait, check and delete TFJob." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Kubeflow Training Python SDKs\n", + "\n", + "You need to install Kubeflow Training SDK to run this Notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n", + "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from kubernetes.client import V1PodTemplateSpec\n", + "from kubernetes.client import V1ObjectMeta\n", + "from kubernetes.client import V1PodSpec\n", + "from kubernetes.client import V1Container\n", + "\n", + "\n", + "from kubeflow.training import KubeflowOrgV1ReplicaSpec\n", + "from kubeflow.training import KubeflowOrgV1TFJob\n", + "from kubeflow.training import KubeflowOrgV1TFJobSpec\n", + "from kubeflow.training import KubeflowOrgV1RunPolicy\n", + "from kubeflow.training import TrainingClient\n", + "\n", + "from kubeflow.training import constants" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Define TFJob" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The demo runs Tensorflow MNIST example with 2 workers, chief, and parameter server for TFJob." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "name = \"mnist\"\n",
+    "namespace = \"kubeflow-user-example-com\"\n",
+    "container_name = \"tensorflow\"\n",
+    "\n",
+    "container = V1Container(\n",
+    "    name=container_name,\n",
+    "    image=\"gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0\",\n",
+    "    command=[\n",
+    "        \"python\",\n",
+    "        \"/var/tf_mnist/mnist_with_summaries.py\",\n",
+    "        \"--log_dir=/train/logs\", \"--learning_rate=0.01\",\n",
+    "        \"--batch_size=150\"\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "worker = KubeflowOrgV1ReplicaSpec(\n",
+    "    replicas=2,\n",
+    "    restart_policy=\"Never\",\n",
+    "    template=V1PodTemplateSpec(\n",
+    "        spec=V1PodSpec(\n",
+    "            containers=[container]\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "chief = KubeflowOrgV1ReplicaSpec(\n",
+    "    replicas=1,\n",
+    "    restart_policy=\"Never\",\n",
+    "    template=V1PodTemplateSpec(\n",
+    "        spec=V1PodSpec(\n",
+    "            containers=[container]\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "ps = KubeflowOrgV1ReplicaSpec(\n",
+    "    replicas=1,\n",
+    "    restart_policy=\"Never\",\n",
+    "    template=V1PodTemplateSpec(\n",
+    "        spec=V1PodSpec(\n",
+    "            containers=[container]\n",
+    "        )\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "tfjob = KubeflowOrgV1TFJob(\n",
+    "    api_version=constants.API_VERSION,\n",
+    "    kind=constants.TFJOB_KIND,\n",
+    "    metadata=V1ObjectMeta(name=\"mnist\",namespace=namespace),\n",
+    "    spec=KubeflowOrgV1TFJobSpec(\n",
+    "        run_policy=KubeflowOrgV1RunPolicy(clean_pod_policy=\"None\"),\n",
+    "        tf_replica_specs={\"Worker\": worker,\n",
+    "                          \"Chief\": chief,\n",
+    "                          \"PS\": ps}\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create TFJob\n",
+    "\n",
+    "You have to create a Training Client to deploy your TFJob in your cluster."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "TFJob kubeflow-user-example-com/mnist has been created\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Namespace and Job kind will be reused in every API call.\n",
+    "training_client = TrainingClient(namespace=namespace, job_kind=constants.TFJOB_KIND)\n",
+    "training_client.create_job(tfjob)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get the Created TFJob\n",
+    "\n",
+    "You can verify the created TFJob status.\n",
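+    "\n",
+    "If `job_kind` was not set when creating the `TrainingClient`, you can pass it explicitly on each call. A minimal sketch, reusing `name` and `training_client` from the cells above:\n",
+    "\n",
+    "```python\n",
+    "# Fetch the TFJob object and inspect its metadata (a sketch; `constants` is imported above).\n",
+    "tfjob = training_client.get_job(name, job_kind=constants.TFJOB_KIND)\n",
+    "print(tfjob.metadata.name)\n",
+    "```"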
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'completion_time': None,\n", + " 'conditions': [{'last_transition_time': datetime.datetime(2023, 9, 8, 21, 42, 34, tzinfo=tzutc()),\n", + " 'last_update_time': datetime.datetime(2023, 9, 8, 21, 42, 34, tzinfo=tzutc()),\n", + " 'message': 'TFJob mnist is created.',\n", + " 'reason': 'TFJobCreated',\n", + " 'status': 'True',\n", + " 'type': 'Created'},\n", + " {'last_transition_time': datetime.datetime(2023, 9, 8, 21, 42, 35, tzinfo=tzutc()),\n", + " 'last_update_time': datetime.datetime(2023, 9, 8, 21, 42, 35, tzinfo=tzutc()),\n", + " 'message': 'TFJob kubeflow-user-example-com/mnist is running.',\n", + " 'reason': 'TFJobRunning',\n", + " 'status': 'True',\n", + " 'type': 'Running'}],\n", + " 'last_reconcile_time': None,\n", + " 'replica_statuses': {'Chief': {'active': 1,\n", + " 'failed': None,\n", + " 'label_selector': None,\n", + " 'selector': None,\n", + " 'succeeded': None},\n", + " 'PS': {'active': 1,\n", + " 'failed': None,\n", + " 'label_selector': None,\n", + " 'selector': None,\n", + " 'succeeded': None},\n", + " 'Worker': {'active': 2,\n", + " 'failed': None,\n", + " 'label_selector': None,\n", + " 'selector': None,\n", + " 'succeeded': None}},\n", + " 'start_time': datetime.datetime(2023, 9, 8, 21, 42, 34, tzinfo=tzutc())}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_client.get_job(name).status" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the TFJob Conditions" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'last_transition_time': datetime.datetime(2023, 9, 8, 21, 42, 34, tzinfo=tzutc()),\n", + " 'last_update_time': datetime.datetime(2023, 9, 8, 21, 42, 34, tzinfo=tzutc()),\n", + " 'message': 'TFJob mnist is created.',\n", + " 'reason': 'TFJobCreated',\n", + " 'status': 'True',\n", + " 'type': 'Created'},\n", + " {'last_transition_time': datetime.datetime(2023, 9, 8, 21, 42, 35, tzinfo=tzutc()),\n", + " 'last_update_time': datetime.datetime(2023, 9, 8, 21, 42, 35, tzinfo=tzutc()),\n", + " 'message': 'TFJob kubeflow-user-example-com/mnist is running.',\n", + " 'reason': 'TFJobRunning',\n", + " 'status': 'True',\n", + " 'type': 'Running'}]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_client.get_job_conditions(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait Until TFJob Finishes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "training_client.wait_for_job_conditions(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Verify if TFJob is Succeeded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "training_client.is_job_succeeded(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the TFJob Training Logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + 
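"# Tip: `get_job_logs` also accepts `follow=True` to stream the logs while the Job is running.\n",
+    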
"training_client.get_job_logs(name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Delete the TFJob" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "training_client.delete_job(name)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hack/python-sdk/post_gen.py b/hack/python-sdk/post_gen.py index ab496bfb64..07b15a4468 100755 --- a/hack/python-sdk/post_gen.py +++ b/hack/python-sdk/post_gen.py @@ -44,7 +44,7 @@ def fix_test_files() -> None: test_folder_dir = os.path.join(sdk_dir, "test") test_files = os.listdir(test_folder_dir) for test_file in test_files: - print(f"Precessing file {test_file}") + print(f"Processing file {test_file}") if test_file.endswith(".py"): with fileinput.FileInput( os.path.join(test_folder_dir, test_file), inplace=True @@ -56,8 +56,9 @@ def fix_test_files() -> None: def add_imports() -> None: with open(os.path.join(sdk_dir, "kubeflow/training/__init__.py"), "a") as f: f.write("from kubeflow.training.api.training_client import TrainingClient\n") + f.write("from kubeflow.training.constants import constants\n") with open(os.path.join(sdk_dir, "kubeflow/__init__.py"), "a") as f: - f.write("__path__ = __import__('pkgutil').extend_path(__path__, __name__)") + f.write("__path__ = __import__('pkgutil').extend_path(__path__, __name__)\n") # Add Kubernetes models to proper deserialization of Training models. with open(os.path.join(sdk_dir, "kubeflow/training/models/__init__.py"), "r") as f: diff --git a/sdk/python/examples/create-pytorchjob-from-func.ipynb b/sdk/python/examples/create-pytorchjob-from-func.ipynb deleted file mode 100644 index aaafdf8132..0000000000 --- a/sdk/python/examples/create-pytorchjob-from-func.ipynb +++ /dev/null @@ -1,779 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "90d43b56-97e5-45e2-8e67-4488ed31d2df", - "metadata": { - "tags": [] - }, - "source": [ - "# Run PyTorchJob From Function\n", - "\n", - "In this Notebook we are going to create [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/pytorch/).\n", - "\n", - "The PyTorchJob will run distributive training using [DistributedDataParallel strategy](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)." - ] - }, - { - "cell_type": "markdown", - "id": "a8bb6564-fde3-4c28-841c-012122643dd9", - "metadata": { - "tags": [] - }, - "source": [ - "## Install Kubeflow Python SDKs\n", - "\n", - "You need to install PyTorch packages and Kubeflow SDKs to run this Notebook." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d49f072e-2221-48bb-9f6d-561713d1a45c", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install torch==1.12.1\n", - "!pip install torchvision==0.13.1\n", - "\n", - "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n", - "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python" - ] - }, - { - "cell_type": "markdown", - "id": "e9331a05-9127-4b3a-8077-31157e267827", - "metadata": {}, - "source": [ - "## Create Train Script for CNN Model\n", - "\n", - "This is simple **Convolutional Neural Network (CNN)** model for recognizing different picture of clothing using [Fashion MNIST Dataset](https://github.com/zalandoresearch/fashion-mnist)." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "69f21f33-5c64-452c-90c4-977fc0dadb3b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def train_pytorch_model():\n", - " import logging\n", - " import os\n", - " from torchvision import transforms, datasets\n", - " import torch\n", - " from torch import nn\n", - " import torch.nn.functional as F\n", - " import torch.distributed as dist\n", - "\n", - " logging.basicConfig(\n", - " format=\"%(asctime)s %(levelname)-8s %(message)s\",\n", - " datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n", - " level=logging.DEBUG,\n", - " )\n", - "\n", - " # Create PyTorch CNN Model.\n", - " class Net(nn.Module):\n", - " def __init__(self):\n", - " super(Net, self).__init__()\n", - " self.conv1 = nn.Conv2d(1, 20, 5, 1)\n", - " self.conv2 = nn.Conv2d(20, 50, 5, 1)\n", - " self.fc1 = nn.Linear(4 * 4 * 50, 500)\n", - " self.fc2 = nn.Linear(500, 10)\n", - "\n", - " def forward(self, x):\n", - " x = F.relu(self.conv1(x))\n", - " x = F.max_pool2d(x, 2, 2)\n", - " x = F.relu(self.conv2(x))\n", - " x = F.max_pool2d(x, 2, 2)\n", - " x = x.view(-1, 4 * 4 * 50)\n", - " x = F.relu(self.fc1(x))\n", - " x = self.fc2(x)\n", - " return F.log_softmax(x, dim=1)\n", - "\n", - " # Get dist parameters.\n", - " # Kubeflow Training Operator automatically set appropriate RANK and WORLD_SIZE based on the configuration.\n", - " RANK = int(os.environ[\"RANK\"])\n", - " WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n", - " \n", - " model = Net()\n", - " # Attach model to DistributedDataParallel strategy.\n", - " dist.init_process_group(backend=\"gloo\", rank=RANK, world_size=WORLD_SIZE)\n", - " Distributor = nn.parallel.DistributedDataParallel\n", - " model = Distributor(model)\n", - "\n", - " # Split batch size for each worker.\n", - " batch_size = int(128 / WORLD_SIZE)\n", - "\n", - " # Get Fashion MNIST DataSet.\n", - " train_loader = torch.utils.data.DataLoader(\n", - " datasets.FashionMNIST(\n", - " \"./data\",\n", - " train=True,\n", - " download=True,\n", - " transform=transforms.Compose([transforms.ToTensor()]),\n", - " ),\n", - " batch_size=batch_size,\n", - " )\n", - "\n", - " # Start Training.\n", - " logging.info(f\"Start training for RANK: {RANK}. 
WORLD_SIZE: {WORLD_SIZE}\")\n", - " for epoch in range(1):\n", - " model.train()\n", - " optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)\n", - "\n", - " for batch_idx, (data, target) in enumerate(train_loader):\n", - " optimizer.zero_grad()\n", - " output = model(data)\n", - " loss = F.nll_loss(output, target)\n", - " loss.backward()\n", - " optimizer.step()\n", - " if batch_idx % 10 == 0:\n", - " logging.info(\n", - " \"Train Epoch: {} [{}/{} ({:.0f}%)]\\tloss={:.4f}\".format(\n", - " epoch,\n", - " batch_idx * len(data),\n", - " len(train_loader.dataset),\n", - " 100.0 * batch_idx / len(train_loader),\n", - " loss.item(),\n", - " )\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "8cfe8739-1f94-476a-80e3-dd6e3237d9ed", - "metadata": { - "execution": { - "iopub.execute_input": "2022-09-01T19:32:37.813779Z", - "iopub.status.busy": "2022-09-01T19:32:37.812759Z", - "iopub.status.idle": "2022-09-01T19:32:37.827050Z", - "shell.execute_reply": "2022-09-01T19:32:37.825186Z", - "shell.execute_reply.started": "2022-09-01T19:32:37.813690Z" - } - }, - "source": [ - "## Run Training Locally in the Notebook\n", - "\n", - "We are going to download Fashion MNIST Dataset and start local training." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9e2c6fd8-d0ba-4bc6-ac90-d4cf09751ace", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-09-12T18:21:28Z INFO Added key: store_based_barrier_key:1 to store for rank: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n", - "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "851b228ae0324915882f834224abe134", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/26421880 [00:00 constants.JOB_MODELS_TYPE: + """Get the Training Job. + + Args: + name: Name for the Job. + namespace: Namespace for the Job. By default namespace is taken from + `TrainingClient` object. + job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind + is taken from `TrainingClient` object. + timeout: Kubernetes API server timeout in seconds to execute the request. + + Returns: + object: Job object. For example: KubeflowOrgV1PyTorchJob + + Raises: + TimeoutError: Timeout to get Job. + RuntimeError: Failed to get Job. 
+ """ + + namespace = namespace or self.namespace + job_kind = job_kind or self.job_kind + + if job_kind not in constants.JOB_PARAMETERS: + raise ValueError( + f"Job kind must be one of these: {constants.JOB_PARAMETERS.keys()}" + ) + + try: + thread = self.custom_api.get_namespaced_custom_object( + constants.GROUP, + constants.VERSION, + namespace, + constants.JOB_PARAMETERS[job_kind]["plural"], + name, + async_req=True, + ) + response = utils.FakeResponse(thread.get(timeout)) + job = self.api_client.deserialize( + response, constants.JOB_PARAMETERS[job_kind]["model"] + ) + + except multiprocessing.TimeoutError: + raise TimeoutError(f"Timeout to get {job_kind}: {namespace}/{name}") + except Exception: + raise RuntimeError(f"Failed to get {job_kind}: {namespace}/{name}") + + return job + + def list_jobs( + self, + namespace: Optional[str] = None, + job_kind: Optional[str] = None, + timeout: int = constants.DEFAULT_TIMEOUT, + ) -> List[constants.JOB_MODELS_TYPE]: + """List of all Training Jobs with specific kind in namespace. + + Args: + namespace: Namespace to list the Jobs. By default namespace is taken from + `TrainingClient` object. + job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind + is taken from `TrainingClient` object. + timeout: Kubernetes API server timeout in seconds to execute the request. + + Returns: + list[object]: List of Job objects. + For example: list of KubeflowOrgV1PyTorchJob objects. It returns empty list + if Jobs can't be found. + + Raises: + TimeoutError: Timeout to list Jobs + RuntimeError: Failed to list Jobs + """ + + namespace = namespace or self.namespace + job_kind = job_kind or self.job_kind + + if job_kind not in constants.JOB_PARAMETERS: + raise ValueError( + f"Job kind must be one of these: {constants.JOB_PARAMETERS.keys()}" + ) + + result = [] + try: + thread = self.custom_api.list_namespaced_custom_object( + constants.GROUP, + constants.VERSION, + namespace, + constants.JOB_PARAMETERS[job_kind]["plural"], + async_req=True, + ) + response = thread.get(timeout) + result = [ + self.api_client.deserialize( + utils.FakeResponse(item), + constants.JOB_PARAMETERS[job_kind]["model"], + ) + for item in response.get("items") + ] + except multiprocessing.TimeoutError: + raise TimeoutError(f"Timeout to list {job_kind}s in namespace: {namespace}") + except Exception: + raise RuntimeError(f"Failed to list {job_kind}s in namespace: {namespace}") + + return result + + def get_job_conditions( + self, + name: Optional[str] = None, + namespace: Optional[str] = None, + job_kind: Optional[str] = None, + job: Optional[constants.JOB_MODELS_TYPE] = None, + timeout: int = constants.DEFAULT_TIMEOUT, + ) -> List[models.V1JobCondition]: """Get the Training Job conditions. Training Job is in the condition when `status=True` for the appropriate condition `type`. For example, Training Job is Succeeded when `status=True` and `type=Succeeded`. Args: name: Name for the Job. - namespace: Namespace for the Job. - job_kind: Kind for the Training job to get conditions. - It should be one of these: `TFJob, PyTorchJob, MXJob, XGBoostJob, MPIJob, or PaddleJob`. - job: Optionally, Training Job object can be set to get the conditions. - It should be type of `KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, - KubeflowOrgV1XGBoostJob, KubeflowOrgV1MPIJob, or KubeflowOrgV1PaddleJob` - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. + namespace: Namespace for the Job. 
By default namespace is taken from
+                `TrainingClient` object.
+            job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind
+                is taken from `TrainingClient` object.
+            job: Job object can be set to get the conditions. Object must be one of
+                these types: KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, etc.
+                If this parameter is omitted, it gets Job with the given name and kind.
+            timeout: Kubernetes API server timeout in seconds to execute the request.
 
         Returns:
             list[V1JobCondition]: List of Job conditions with last transition
                 time, last update time, message, reason, type, and
-                status. It returns empty list if Training Job does not have any
+                status. It returns empty list if Job does not have any
                 conditions yet.
 
         Raises:
-            ValueError: Job kind is invalid.
-            TimeoutError: Timeout to get Training Job.
-            RuntimeError: Failed to get Training Job.
+            ValueError: Invalid input parameters.
+            TimeoutError: Timeout to get Job.
+            RuntimeError: Failed to get Job.
         """
 
-        models = tuple([d["model"] for d in list(constants.JOB_KINDS.values())])
-        if job is not None and not isinstance(job, models):
-            raise ValueError(f"Job must be one of these types: {models}")
+        namespace = namespace or self.namespace
+        job_kind = job_kind or self.job_kind
+
+        if job_kind not in constants.JOB_PARAMETERS:
+            raise ValueError(
+                f"Job kind must be one of these: {constants.JOB_PARAMETERS.keys()}"
+            )
+
+        if job is not None and not isinstance(job, constants.JOB_MODELS):
+            raise ValueError(f"Job must be one of these types: {constants.JOB_MODELS}")
 
         # If Job is not set, get the Training Job.
         if job is None:
-            if job_kind not in constants.JOB_KINDS:
+            # Job name must be set when Job object is not set.
+            if name is None:
                 raise ValueError(
-                    f"Job kind must be one of these: {list(constants.JOB_KINDS.keys())}"
+                    "Job name must be set when Job object is not set"
                 )
-            job = utils.get_job(
-                custom_api=self.custom_api,
-                api_client=self.api_client,
+
+            job = self.get_job(
                 name=name,
                 namespace=namespace,
-                job_model=constants.JOB_KINDS[job_kind]["model"],
                 job_kind=job_kind,
-                job_plural=constants.JOB_KINDS[job_kind]["plural"],
                 timeout=timeout,
             )
         if job.status and job.status.conditions and len(job.status.conditions) > 0:
@@ -128,30 +401,30 @@
 
     def is_job_created(
         self,
-        name: str,
-        namespace: str = utils.get_default_target_namespace(),
-        job_kind: str = constants.TFJOB_KIND,
-        job: object = None,
+        name: Optional[str] = None,
+        namespace: Optional[str] = None,
+        job_kind: Optional[str] = None,
+        job: Optional[constants.JOB_MODELS_TYPE] = None,
         timeout: int = constants.DEFAULT_TIMEOUT,
-    ):
+    ) -> bool:
         """Check if Training Job is Created.
 
         Args:
             name: Name for the Job.
-            namespace: Namespace for the Job.
-            job_kind: Kind for the Training job to check the status.
-            It should be one of these: `TFJob, PyTorchJob, MXJob, XGBoostJob, MPIJob, or PaddleJob`.
-            job: Optionally, Training Job object can be set to check the status.
-            It should be type of `KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob,
-            KubeflowOrgV1XGBoostJob, KubeflowOrgV1MPIJob, or KubeflowOrgV1PaddleJob`
-            timeout: Optional, Kubernetes API server timeout in seconds
-            to execute the request.
+            namespace: Namespace for the Job. By default namespace is taken from
+                `TrainingClient` object.
+            job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind
+                is taken from `TrainingClient` object.
+            job: Job object can be set to get the conditions.
Object must be one of + these types: KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, etc. + If this parameter is omitted, it gets Job with the given name and kind. + timeout: Kubernetes API server timeout in seconds to execute the request. Returns: bool: True if Job is Created, else False. Raises: - ValueError: Job kind is invalid. + ValueError: Invalid input parameters. TimeoutError: Timeout to get Job. RuntimeError: Failed to get Job. """ @@ -163,30 +436,30 @@ def is_job_created( def is_job_running( self, - name: str, - namespace: str = utils.get_default_target_namespace(), - job_kind: str = constants.TFJOB_KIND, - job: object = None, + name: Optional[str] = None, + namespace: Optional[str] = None, + job_kind: Optional[str] = None, + job: Optional[constants.JOB_MODELS_TYPE] = None, timeout: int = constants.DEFAULT_TIMEOUT, - ): + ) -> bool: """Check if Training Job is Running. Args: name: Name for the Job. - namespace: Namespace for the Job. - job_kind: Kind for the Training job to check the status. - It should be one of these: `TFJob, PyTorchJob, MXJob, XGBoostJob, MPIJob, or PaddleJob`. - job: Optionally, Training Job object can be set to check the status. - It should be type of `KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, - KubeflowOrgV1XGBoostJob, KubeflowOrgV1MPIJob, or KubeflowOrgV1PaddleJob` - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. + namespace: Namespace for the Job. By default namespace is taken from + `TrainingClient` object. + job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind + is taken from `TrainingClient` object. + job: Job object can be set to get the conditions. Object must be one of + these types: KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, etc. + If this parameter is omitted, it gets Job with the given name and kind. + timeout: Kubernetes API server timeout in seconds to execute the request. Returns: bool: True if Job is Running, else False. Raises: - ValueError: Job kind is invalid. + ValueError: Invalid input parameters. TimeoutError: Timeout to get Job. RuntimeError: Failed to get Job. """ @@ -198,30 +471,30 @@ def is_job_running( def is_job_restarting( self, - name: str, - namespace: str = utils.get_default_target_namespace(), - job_kind: str = constants.TFJOB_KIND, - job: object = None, + name: Optional[str] = None, + namespace: Optional[str] = None, + job_kind: Optional[str] = None, + job: Optional[constants.JOB_MODELS_TYPE] = None, timeout: int = constants.DEFAULT_TIMEOUT, - ): + ) -> bool: """Check if Training Job is Restarting. Args: name: Name for the Job. - namespace: Namespace for the Job. - job_kind: Kind for the Training job to check the status. - It should be one of these: `TFJob, PyTorchJob, MXJob, XGBoostJob, MPIJob, or PaddleJob`. - job: Optionally, Training Job object can be set to check the status. - It should be type of `KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, - KubeflowOrgV1XGBoostJob, KubeflowOrgV1MPIJob, or KubeflowOrgV1PaddleJob` - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. + namespace: Namespace for the Job. By default namespace is taken from + `TrainingClient` object. + job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind + is taken from `TrainingClient` object. + job: Job object can be set to get the conditions. Object must be one of + these types: KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, etc. 
+ If this parameter is omitted, it gets Job with the given name and kind. + timeout: Kubernetes API server timeout in seconds to execute the request. Returns: bool: True if Job is Restarting, else False. Raises: - ValueError: Job kind is invalid. + ValueError: Invalid input parameters. TimeoutError: Timeout to get Job. RuntimeError: Failed to get Job. """ @@ -233,30 +506,30 @@ def is_job_restarting( def is_job_succeeded( self, - name: str, - namespace: str = utils.get_default_target_namespace(), - job_kind: str = constants.TFJOB_KIND, - job: object = None, + name: Optional[str] = None, + namespace: Optional[str] = None, + job_kind: Optional[str] = None, + job: Optional[constants.JOB_MODELS_TYPE] = None, timeout: int = constants.DEFAULT_TIMEOUT, - ): + ) -> bool: """Check if Training Job is Succeeded. Args: name: Name for the Job. - namespace: Namespace for the Job. - job_kind: Kind for the Training job to check the status. - It should be one of these: `TFJob, PyTorchJob, MXJob, XGBoostJob, MPIJob, or PaddleJob`. - job: Optionally, Training Job object can be set to check the status. - It should be type of `KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, - KubeflowOrgV1XGBoostJob, KubeflowOrgV1MPIJob, or KubeflowOrgV1PaddleJob` - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. + namespace: Namespace for the Job. By default namespace is taken from + `TrainingClient` object. + job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind + is taken from `TrainingClient` object. + job: Job object can be set to get the conditions. Object must be one of + these types: KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, etc. + If this parameter is omitted, it gets Job with the given name and kind. + timeout: Kubernetes API server timeout in seconds to execute the request. Returns: bool: True if Job is Succeeded, else False. Raises: - ValueError: Job kind is invalid. + ValueError: Invalid input parameters. TimeoutError: Timeout to get Job. RuntimeError: Failed to get Job. """ @@ -268,30 +541,30 @@ def is_job_succeeded( def is_job_failed( self, - name: str, - namespace: str = utils.get_default_target_namespace(), - job_kind: str = constants.TFJOB_KIND, - job: object = None, + name: Optional[str] = None, + namespace: Optional[str] = None, + job_kind: Optional[str] = None, + job: Optional[constants.JOB_MODELS_TYPE] = None, timeout: int = constants.DEFAULT_TIMEOUT, - ): + ) -> bool: """Check if Training Job is Failed. Args: name: Name for the Job. - namespace: Namespace for the Job. - job_kind: Kind for the Training job to check the status. - It should be one of these: `TFJob, PyTorchJob, MXJob, XGBoostJob, MPIJob, or PaddleJob`. - job: Optionally, Training Job object can be set to check the status. - It should be type of `KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, - KubeflowOrgV1XGBoostJob, KubeflowOrgV1MPIJob, or KubeflowOrgV1PaddleJob` - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. + namespace: Namespace for the Job. By default namespace is taken from + `TrainingClient` object. + job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind + is taken from `TrainingClient` object. + job: Job object can be set to get the conditions. Object must be one of + these types: KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, KubeflowOrgV1MXJob, etc. + If this parameter is omitted, it gets Job with the given name and kind. 
+ timeout: Kubernetes API server timeout in seconds to execute the request. Returns: bool: True if Job is Failed, else False. Raises: - ValueError: Job kind is invalid. + ValueError: Invalid input parameters. TimeoutError: Timeout to get Job. RuntimeError: Failed to get Job. """ @@ -304,69 +577,69 @@ def is_job_failed( def wait_for_job_conditions( self, name: str, - namespace: str = utils.get_default_target_namespace(), - job_kind: str = constants.TFJOB_KIND, + namespace: Optional[str] = None, + job_kind: Optional[str] = None, expected_conditions: Set = {constants.JOB_CONDITION_SUCCEEDED}, - timeout: int = 600, + wait_timeout: int = 600, polling_interval: int = 15, - callback: Callable = None, - apiserver_timeout: int = constants.DEFAULT_TIMEOUT, - ): + callback: Optional[Callable] = None, + timeout: int = constants.DEFAULT_TIMEOUT, + ) -> constants.JOB_MODELS_TYPE: """Wait until Training Job reaches any of the specified conditions. By default it waits for the Succeeded condition. Args: name: Name for the Job. - namespace: Namespace for the Job. - job_kind: Kind for the Training job to wait for conditions. - It should be one of these: `TFJob, PyTorchJob, MXJob, XGBoostJob, MPIJob, or PaddleJob`. + namespace: Namespace for the Job. By default namespace is taken from + `TrainingClient` object. + job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind + is taken from `TrainingClient` object. expected_conditions: Set of expected conditions. It must be subset of this: `{"Created", "Running", "Restarting", "Succeeded", "Failed"}` - timeout: How many seconds to wait until Job reaches one of + wait_timeout: How many seconds to wait until Job reaches one of the expected conditions. polling_interval: The polling interval in seconds to get Job status. - callback: Optional callback function that is invoked after Job + callback: Callback function that is invoked after Job status is polled. This function takes a single argument which is current Job object. - apiserver_timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. + timeout: Kubernetes API server timeout in seconds to execute the request. Returns: - object: Training Job object of type `KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob, - KubeflowOrgV1MXJob, KubeflowOrgV1XGBoostJob, KubeflowOrgV1MPIJob, or - KubeflowOrgV1PaddleJob` which is reached required condition. + object: Job object. For example: KubeflowOrgV1PyTorchJob Raises: - ValueError: Expected conditions are invalid or Job kind is invalid + ValueError: Invalid input parameters. TimeoutError: Timeout to get Job. - RuntimeError: Failed to get Job. + RuntimeError: Failed to get Job or Job reaches unexpected Failed condition. """ + namespace = namespace or self.namespace + job_kind = job_kind or self.job_kind + if not expected_conditions.issubset(constants.JOB_CONDITIONS): raise ValueError( - f"Expected conditions: {expected_conditions} must be subset of {constants.JOB_CONDITIONS}" + f"Expected conditions: {expected_conditions} must be subset of \ + {constants.JOB_CONDITIONS}" ) - for _ in range(round(timeout / polling_interval)): - + for _ in range(round(wait_timeout / polling_interval)): # We should get Job only once per cycle and check the statuses. 
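+            # Note: `wait_timeout` bounds the overall wait across polling cycles,
+            # while `timeout` applies to each individual Kubernetes API request.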
-            job = utils.get_job(
-                custom_api=self.custom_api,
-                api_client=self.api_client,
+            job = self.get_job(
                 name=name,
                 namespace=namespace,
-                job_model=constants.JOB_KINDS[job_kind]["model"],
                 job_kind=job_kind,
-                job_plural=constants.JOB_KINDS[job_kind]["plural"],
-                timeout=apiserver_timeout,
-            )
-            conditions = self.get_job_conditions(
-                name, namespace, job_kind, job, timeout
+                timeout=timeout,
             )
+
+            # Get Job conditions.
+            conditions = self.get_job_conditions(job=job, timeout=timeout)
             if len(conditions) > 0:
                 status_logger(
-                    name, conditions[-1].type, conditions[-1].last_transition_time,
+                    name,
+                    conditions[-1].type,
+                    conditions[-1].last_transition_time,
                 )
-            # Execute callback function.
+
+            # Execute callback function if it is set.
             if callback:
                 callback(job)
 
@@ -388,27 +661,29 @@
             time.sleep(polling_interval)
 
         raise TimeoutError(
-            f"Timeout waiting for {job_kind}: {namespace}/{name} to reach expected conditions: {expected_conditions}"
+            f"Timeout waiting for {job_kind}: {namespace}/{name} to reach expected conditions: \
+                {expected_conditions}"
         )
 
     def get_job_pod_names(
         self,
         name: str,
-        namespace: str = utils.get_default_target_namespace(),
+        namespace: Optional[str] = None,
         is_master: bool = False,
-        replica_type: str = None,
-        replica_index: int = None,
+        replica_type: Optional[str] = None,
+        replica_index: Optional[int] = None,
         timeout: int = constants.DEFAULT_TIMEOUT,
-    ):
+    ) -> List[str]:
         """Get pod names for the Training Job.
 
         Args:
             name: Name for the Job.
-            namespace: Namespace for the Job.
+            namespace: Namespace for the Job. By default namespace is taken from
+                `TrainingClient` object.
             is_master: Whether to get pods only with the label
                 `training.kubeflow.org/job-role: master`.
-            replica_type: Optional, type of the Job replica.
-                For TFJob one of `chief`, `ps`, or `worker`.
+            replica_type: Type of the Job replica.
+                For TFJob one of `Chief`, `PS`, or `Worker`.
 
                 For PyTorchJob one of `master` or `worker`.
 
@@ -420,9 +695,8 @@
 
                 For PaddleJob one of `master` or `worker`.
 
-            replica_index: Optional, index for the Job replica.
-            timeout: Optional, Kubernetes API server timeout in seconds
-            to execute the request.
+            replica_index: Index for the Job replica.
+            timeout: Kubernetes API server timeout in seconds to execute the request.
 
         Returns:
             list[str]: List of the Job pod names.
@@ -433,6 +707,8 @@
             RuntimeError: Failed to get Job pods.
         """
 
+        namespace = namespace or self.namespace
+
         if (
             replica_type is not None
             and replica_type not in constants.TFJOB_REPLICA_TYPES
@@ -471,7 +747,9 @@
         pods = []
         try:
             thread = self.core_api.list_namespaced_pod(
-                namespace, label_selector=label_selector, async_req=True,
+                namespace,
+                label_selector=label_selector,
+                async_req=True,
             )
             response = thread.get(timeout)
         except multiprocessing.TimeoutError:
@@ -486,11 +764,11 @@
     def get_job_logs(
         self,
         name: str,
-        namespace: str = utils.get_default_target_namespace(),
+        namespace: Optional[str] = None,
+        job_kind: Optional[str] = None,
         is_master: bool = True,
-        replica_type: str = None,
-        replica_index: int = None,
-        container: str = constants.TFJOB_CONTAINER,
+        replica_type: Optional[str] = None,
+        replica_index: Optional[int] = None,
         follow: bool = False,
         timeout: int = constants.DEFAULT_TIMEOUT,
     ):
@@ -499,7 +777,10 @@
 
         Args:
             name: Name for the Job.
-            namespace: Namespace for the Job.
By default namespace is taken from + `TrainingClient` object. + job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind + is taken from `TrainingClient` object. is_master: Whether to get logs for the pod with the label `training.kubeflow.org/job-role: master`. replica_type: Optional, type of the Job replica. @@ -526,6 +807,9 @@ def get_job_logs( RuntimeError: Failed to get Job pods. """ + namespace = namespace or self.namespace + job_kind = job_kind or self.job_kind + pods = self.get_job_pod_names( name=name, namespace=namespace, @@ -543,7 +827,7 @@ def get_job_logs( self.core_api.read_namespaced_pod_log, name=pod, namespace=namespace, - container=container, + container=constants.JOB_PARAMETERS[job_kind]["container"], ) ) finished = [False for _ in log_streams] @@ -572,1089 +856,98 @@ def get_job_logs( for pod in pods: try: pod_logs = self.core_api.read_namespaced_pod_log( - pod, namespace, container=container + pod, + namespace, + container=constants.JOB_PARAMETERS[job_kind]["container"], ) logging.info("The logs of pod %s:\n %s", pod, pod_logs) except Exception: - raise RuntimeError( - f"Failed to read logs for pod {namespace}/{pod}" - ) - - # ------------------------------------------------------------------------ # - # TFJob Training Client APIs. - # ------------------------------------------------------------------------ # - def create_tfjob( - self, - tfjob: models.KubeflowOrgV1TFJob, - namespace=utils.get_default_target_namespace(), - ): - """Create the TFJob. - - Args: - tfjob: TFJob object of type KubeflowOrgV1TFJob. - namespace: Namespace for the TFJob. - - Raises: - TimeoutError: Timeout to create TFJob. - RuntimeError: Failed to create TFJob. - """ + raise RuntimeError(f"Failed to read logs for pod {namespace}/{pod}") - utils.create_job( - custom_api=self.custom_api, - job=tfjob, - namespace=namespace, - job_kind=constants.TFJOB_KIND, - job_plural=constants.TFJOB_PLURAL, - ) - - def create_tfjob_from_func( + def update_job( self, + job: constants.JOB_MODELS_TYPE, name: str, - func: Callable, - parameters: Dict[str, Any] = None, - base_image: str = constants.TFJOB_BASE_IMAGE, - namespace: str = utils.get_default_target_namespace(), - num_chief_replicas: int = None, - num_ps_replicas: int = None, - num_worker_replicas: int = None, - packages_to_install: List[str] = None, - pip_index_url: str = "https://pypi.org/simple", + namespace: Optional[str] = None, + job_kind: Optional[str] = None, ): - """Create TFJob from the function. + """Update the Training Job by using patch Kubernetes API. Args: - name: Name for the TFJob. - func: Function that TFJob uses to train the model. This function - must be Callable. Optionally, this function might have one dict - argument to define input parameters for the function. - parameters: Dict of input parameters that training function might receive. - base_image: Image to use when executing the training function. - namespace: Namespace for the TFJob. - num_chief_replicas: Number of Chief replicas for the TFJob. Number - of Chief replicas can't be more than 1. - num_ps_replicas: Number of Parameter Server replicas for the TFJob. - num_worker_replicas: Number of Worker replicas for the TFJob. - packages_to_install: List of Python packages to install in addition - to the base image packages. These packages are installed before - executing the objective function. - pip_index_url: The PyPI url from which to install Python packages. - + job: Job object. For example, object with type + KubeflowOrgV1TFJob or KubeflowOrgV1PyTorchJob. 
+ name: Name for the Job. + namespace: Namespace for the Job. By default namespace is taken from + `TrainingClient` object. + job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind + is taken from `TrainingClient` object. Raises: - ValueError: TFJob replicas are missing or training function is invalid. - TimeoutError: Timeout to create TFJob. - RuntimeError: Failed to create TFJob. + TimeoutError: Timeout to update Job + RuntimeError: Failed to update Job """ - # Check if at least one replica is set. - # TODO (andreyvelich): Remove this check once we have CEL validation. - # Ref: https://github.com/kubeflow/training-operator/issues/1708 - if ( - num_chief_replicas is None - and num_ps_replicas is None - and num_worker_replicas is None - ): - raise ValueError("At least one replica for TFJob must be set") + namespace = namespace or self.namespace + job_kind = job_kind or self.job_kind - # Check if function is callable. - if not callable(func): + if job_kind not in constants.JOB_PARAMETERS: raise ValueError( - f"Training function must be callable, got function type: {type(func)}" - ) - - # Get TFJob Pod template spec. - pod_template_spec = utils.get_pod_template_spec( - func=func, - parameters=parameters, - base_image=base_image, - container_name=constants.TFJOB_CONTAINER, - packages_to_install=packages_to_install, - pip_index_url=pip_index_url, - ) - - # Create TFJob template. - tfjob = models.KubeflowOrgV1TFJob( - api_version=f"{constants.KUBEFLOW_GROUP}/{constants.OPERATOR_VERSION}", - kind=constants.TFJOB_KIND, - metadata=client.V1ObjectMeta(name=name, namespace=namespace), - spec=models.KubeflowOrgV1TFJobSpec( - run_policy=models.V1RunPolicy(clean_pod_policy=None), - tf_replica_specs={}, - ), - ) - - # Add Chief, PS, and Worker replicas to the TFJob. - if num_chief_replicas is not None: - tfjob.spec.tf_replica_specs[ - constants.REPLICA_TYPE_CHIEF - ] = models.V1ReplicaSpec( - replicas=num_chief_replicas, template=pod_template_spec, + f"Job kind must be one of these: {constants.JOB_PARAMETERS.keys()}" ) - if num_ps_replicas is not None: - tfjob.spec.tf_replica_specs[ - constants.REPLICA_TYPE_PS - ] = models.V1ReplicaSpec( - replicas=num_ps_replicas, template=pod_template_spec, - ) - - if num_worker_replicas is not None: - tfjob.spec.tf_replica_specs[ - constants.REPLICA_TYPE_WORKER - ] = models.V1ReplicaSpec( - replicas=num_worker_replicas, template=pod_template_spec, + try: + self.custom_api.patch_namespaced_custom_object( + constants.GROUP, + constants.VERSION, + namespace, + constants.JOB_PARAMETERS[job_kind]["plural"], + name, + job, ) + except multiprocessing.TimeoutError: + raise TimeoutError(f"Timeout to update {job_kind}: {namespace}/{name}") + except Exception: + raise RuntimeError(f"Failed to update {job_kind}: {namespace}/{name}") - # Create TFJob. - self.create_tfjob(tfjob=tfjob, namespace=namespace) - - def get_tfjob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """Get the TFJob. - - Args: - name: Name for the TFJob. - namespace: Namespace for the TFJob. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - KubeflowOrgV1TFJob: TFJob object. - - Raises: - TimeoutError: Timeout to get TFJob. - RuntimeError: Failed to get TFJob. 
- """ - - return utils.get_job( - custom_api=self.custom_api, - api_client=self.api_client, - name=name, - namespace=namespace, - job_model=models.KubeflowOrgV1TFJob, - job_kind=constants.TFJOB_KIND, - job_plural=constants.TFJOB_PLURAL, - timeout=timeout, - ) - - def list_tfjobs( - self, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """List of all TFJobs in namespace. - - Args: - namespace: Namespace to list the TFJobs. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - list[KubeflowOrgV1TFJob]: List of TFJobs objects. It returns - empty list if TFJobs cannot be found. - - Raises: - TimeoutError: Timeout to list TFJobs. - RuntimeError: Failed to list TFJobs. - """ - - return utils.list_jobs( - custom_api=self.custom_api, - api_client=self.api_client, - namespace=namespace, - job_model=models.KubeflowOrgV1TFJob, - job_kind=constants.TFJOB_KIND, - job_plural=constants.TFJOB_PLURAL, - timeout=timeout, - ) + logging.info(f"{job_kind} {namespace}/{name} has been updated") - def delete_tfjob( + def delete_job( self, name: str, - namespace: str = utils.get_default_target_namespace(), - delete_options: client.V1DeleteOptions = None, + namespace: Optional[str] = None, + job_kind: Optional[str] = None, + delete_options: Optional[client.V1DeleteOptions] = None, ): - """Delete the TFJob + """Delete the Training Job Args: - name: Name for the TFJob. - namespace: Namespace for the TFJob. + name: Name for the Job. + namespace: Namespace for the Job. By default namespace is taken from + `TrainingClient` object. + job_kind: Kind for the Job (e.g. `TFJob` or `PyTorchJob`). By default Job kind + is taken from `TrainingClient` object. delete_options: Optional, V1DeleteOptions to set while deleting - the TFJob. For example, grace period seconds. - - Raises: - TimeoutError: Timeout to delete TFJob. - RuntimeError: Failed to delete TFJob. - """ - - utils.delete_job( - custom_api=self.custom_api, - name=name, - namespace=namespace, - job_kind=constants.TFJOB_KIND, - job_plural=constants.TFJOB_PLURAL, - delete_options=delete_options, - ) - - def patch_tfjob( - self, - tfjob: models.KubeflowOrgV1TFJob, - name: str, - namespace: str = utils.get_default_target_namespace(), - ): - """Patch the TFJob. - - Args: - tfjob: TFJob object of type KubeflowOrgV1TFJob to patch. - name: Name for the TFJob. - namespace: Namespace for the TFJob. + the Job. For example, grace period seconds. Raises: - TimeoutError: Timeout to patch TFJob. - RuntimeError: Failed to patch TFJob. + TimeoutError: Timeout to delete Job. + RuntimeError: Failed to delete Job. """ - return utils.patch_job( - custom_api=self.custom_api, - job=tfjob, - name=name, - namespace=namespace, - job_kind=constants.TFJOB_KIND, - job_plural=constants.TFJOB_PLURAL, - ) - - # ------------------------------------------------------------------------ # - # PyTorchJob Training Client APIs. - # ------------------------------------------------------------------------ # - def create_pytorchjob( - self, - pytorchjob: models.KubeflowOrgV1PyTorchJob, - namespace=utils.get_default_target_namespace(), - ): - """Create the PyTorchJob. + namespace = namespace or self.namespace + job_kind = job_kind or self.job_kind - Args: - pytorchjob: PyTorchJob object of type KubeflowOrgV1PyTorchJob. - namespace: Namespace for the PyTorchJob. 
+ try: + self.custom_api.delete_namespaced_custom_object( + constants.GROUP, + constants.VERSION, + namespace, + constants.JOB_PARAMETERS[job_kind]["plural"], + name=name, + body=delete_options, + ) + except multiprocessing.TimeoutError: + raise TimeoutError(f"Timeout to delete {job_kind}: {namespace}/{name}") + except Exception: + raise RuntimeError(f"Failed to delete {job_kind}: {namespace}/{name}") - Raises: - TimeoutError: Timeout to create PyTorchJob. - RuntimeError: Failed to create PyTorchJob. - """ - - utils.create_job( - custom_api=self.custom_api, - job=pytorchjob, - namespace=namespace, - job_kind=constants.PYTORCHJOB_KIND, - job_plural=constants.PYTORCHJOB_PLURAL, - ) - - def create_pytorchjob_from_func( - self, - name: str, - func: Callable, - parameters: Dict[str, Any] = None, - base_image: str = constants.PYTORCHJOB_BASE_IMAGE, - namespace: str = utils.get_default_target_namespace(), - num_worker_replicas: int = None, - packages_to_install: List[str] = None, - pip_index_url: str = "https://pypi.org/simple", - ): - """Create PyTorchJob from the function. - - Args: - name: Name for the PyTorchJob. - func: Function that PyTorchJob uses to train the model. This function - must be Callable. Optionally, this function might have one dict - argument to define input parameters for the function. - parameters: Dict of input parameters that training function might receive. - base_image: Image to use when executing the training function. - namespace: Namespace for the PyTorchJob. - num_worker_replicas: Number of Worker replicas for the PyTorchJob. - If number of Worker replicas is 1, PyTorchJob uses only - Master replica. - packages_to_install: List of Python packages to install in addition - to the base image packages. These packages are installed before - executing the objective function. - pip_index_url: The PyPI url from which to install Python packages. - """ - - # Check if at least one worker replica is set. - # TODO (andreyvelich): Remove this check once we have CEL validation. - # Ref: https://github.com/kubeflow/training-operator/issues/1708 - if num_worker_replicas is None: - raise ValueError("At least one Worker replica for PyTorchJob must be set") - - # Check if function is callable. - if not callable(func): - raise ValueError( - f"Training function must be callable, got function type: {type(func)}" - ) - - # Get PyTorchJob Pod template spec. - pod_template_spec = utils.get_pod_template_spec( - func=func, - parameters=parameters, - base_image=base_image, - container_name=constants.PYTORCHJOB_CONTAINER, - packages_to_install=packages_to_install, - pip_index_url=pip_index_url, - ) - - # Create PyTorchJob template. - pytorchjob = models.KubeflowOrgV1PyTorchJob( - api_version=f"{constants.KUBEFLOW_GROUP}/{constants.OPERATOR_VERSION}", - kind=constants.PYTORCHJOB_KIND, - metadata=client.V1ObjectMeta(name=name, namespace=namespace), - spec=models.KubeflowOrgV1PyTorchJobSpec( - run_policy=models.V1RunPolicy(clean_pod_policy=None), - pytorch_replica_specs={}, - ), - ) - - # Add Master and Worker replicas to the PyTorchJob. - pytorchjob.spec.pytorch_replica_specs[ - constants.REPLICA_TYPE_MASTER - ] = models.V1ReplicaSpec(replicas=1, template=pod_template_spec,) - - # If number of Worker replicas is 1, PyTorchJob uses only Master replica. 
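Since delete_options is forwarded as the request body of the delete call above, graceful deletion works as it did with the removed per-kind methods; a brief sketch (the 30-second grace period is arbitrary):

    from kubernetes import client as k8s_client

    training_client.delete_job(
        "train-job",
        delete_options=k8s_client.V1DeleteOptions(grace_period_seconds=30),
    )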
- if num_worker_replicas != 1: - pytorchjob.spec.pytorch_replica_specs[ - constants.REPLICA_TYPE_WORKER - ] = models.V1ReplicaSpec( - replicas=num_worker_replicas, template=pod_template_spec, - ) - - # Create PyTorchJob - self.create_pytorchjob(pytorchjob=pytorchjob, namespace=namespace) - - def get_pytorchjob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """Get the PyTorchJob. - - Args: - name: Name for the PyTorchJob. - namespace: Namespace for the PyTorchJob. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - KubeflowOrgV1PyTorchJob: PyTorchJob object. - - Raises: - TimeoutError: Timeout to get PyTorchJob. - RuntimeError: Failed to get PyTorchJob. - """ - - return utils.get_job( - custom_api=self.custom_api, - api_client=self.api_client, - name=name, - namespace=namespace, - job_model=models.KubeflowOrgV1PyTorchJob, - job_kind=constants.PYTORCHJOB_KIND, - job_plural=constants.PYTORCHJOB_PLURAL, - timeout=timeout, - ) - - def list_pytorchjobs( - self, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """List of all PyTorchJob in namespace. - - Args: - namespace: Namespace to list the PyTorchJob. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - list[KubeflowOrgV1PyTorchJob]: List of PyTorchJob objects. It returns - empty list if PyTorchJobs cannot be found. - - Raises: - TimeoutError: Timeout to list PyTorchJobs. - RuntimeError: Failed to list PyTorchJobs. - """ - - return utils.list_jobs( - custom_api=self.custom_api, - api_client=self.api_client, - namespace=namespace, - job_model=models.KubeflowOrgV1PyTorchJob, - job_kind=constants.PYTORCHJOB_KIND, - job_plural=constants.PYTORCHJOB_PLURAL, - timeout=timeout, - ) - - def delete_pytorchjob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - delete_options: client.V1DeleteOptions = None, - ): - """Delete the PyTorchJob - - Args: - name: Name for the PyTorchJob. - namespace: Namespace for the PyTorchJob. - delete_options: Optional, V1DeleteOptions to set while deleting - the PyTorchJob. For example, grace period seconds. - - Raises: - TimeoutError: Timeout to delete PyTorchJob. - RuntimeError: Failed to delete PyTorchJob. - """ - - utils.delete_job( - custom_api=self.custom_api, - name=name, - namespace=namespace, - job_kind=constants.PYTORCHJOB_KIND, - job_plural=constants.PYTORCHJOB_PLURAL, - delete_options=delete_options, - ) - - def patch_pytorchjob( - self, - pytorchjob: models.KubeflowOrgV1PyTorchJob, - name: str, - namespace: str = utils.get_default_target_namespace(), - ): - """Patch the PyTorchJob. - - Args: - pytorchjob: PyTorchJob object of type KubeflowOrgV1PyTorchJob. - name: Name for the PyTorchJob. - namespace: Namespace for the PyTorchJob. - - Raises: - TimeoutError: Timeout to patch PyTorchJob. - RuntimeError: Failed to patch PyTorchJob. - """ - - return utils.patch_job( - custom_api=self.custom_api, - job=pytorchjob, - name=name, - namespace=namespace, - job_kind=constants.PYTORCHJOB_KIND, - job_plural=constants.PYTORCHJOB_PLURAL, - ) - - # ------------------------------------------------------------------------ # - # MXJob Training Client APIs. 
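The deletions that follow repeat this same pattern for each remaining kind, so a one-screen mapping may help reviewers (the get_job and list_jobs signatures are inferred from the e2e test updates later in this patch):

    # Removed per-kind method            ->  consolidated replacement
    # create_pytorchjob(job, ns)         ->  create_job(job=job, namespace=ns)
    # get_pytorchjob(name, ns)           ->  get_job(name, namespace=ns)
    # list_pytorchjobs(ns)               ->  list_jobs(namespace=ns)
    # patch_pytorchjob(job, name, ns)    ->  update_job(job, name, namespace=ns)
    # delete_pytorchjob(name, ns)        ->  delete_job(name, namespace=ns)
    # job_kind defaults to the client's kind and can be overridden per call.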
- # ------------------------------------------------------------------------ # - def create_mxjob( - self, - mxjob: models.KubeflowOrgV1MXJob, - namespace=utils.get_default_target_namespace(), - ): - """Create the MXJob. - - Args: - mxjob: MXJob object of type KubeflowOrgV1MXJob. - namespace: Namespace for the MXJob. - - Raises: - TimeoutError: Timeout to create MXJob. - RuntimeError: Failed to create MXJob. - """ - - utils.create_job( - custom_api=self.custom_api, - job=mxjob, - namespace=namespace, - job_kind=constants.MXJOB_KIND, - job_plural=constants.MXJOB_PLURAL, - ) - - def create_mxjob_from_func(self): - """Create MXJob from the function. - TODO (andreyvelich): Implement this function. - """ - logging.warning("This API has not been implemented yet.") - - def get_mxjob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """Get the MXJob. - - Args: - name: Name for the MXJob. - namespace: Namespace for the MXJob. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - KubeflowOrgV1MXJob: MXJob object. - - Raises: - TimeoutError: Timeout to get MXJob. - RuntimeError: Failed to get MXJob. - """ - - return utils.get_job( - custom_api=self.custom_api, - api_client=self.api_client, - name=name, - namespace=namespace, - job_model=models.KubeflowOrgV1MXJob, - job_kind=constants.MXJOB_KIND, - job_plural=constants.MXJOB_PLURAL, - timeout=timeout, - ) - - def list_mxjobs( - self, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """List of all MXJobs in namespace. - - Args: - namespace: Namespace to list the MXJobs. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - list[KubeflowOrgV1MXJob]: List of MXJobs objects. It returns - empty list if MXJobs cannot be found. - - Raises: - TimeoutError: Timeout to list MXJobs. - RuntimeError: Failed to list MXJobs. - """ - - return utils.list_jobs( - custom_api=self.custom_api, - api_client=self.api_client, - namespace=namespace, - job_model=models.KubeflowOrgV1MXJob, - job_kind=constants.MXJOB_KIND, - job_plural=constants.MXJOB_PLURAL, - timeout=timeout, - ) - - def delete_mxjob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - delete_options: client.V1DeleteOptions = None, - ): - """Delete the MXJob - - Args: - name: Name for the MXJob. - namespace: Namespace for the MXJob. - delete_options: Optional, V1DeleteOptions to set while deleting - the MXJob. For example, grace period seconds. - - Raises: - TimeoutError: Timeout to delete MXJob. - RuntimeError: Failed to delete MXJob. - """ - - utils.delete_job( - custom_api=self.custom_api, - name=name, - namespace=namespace, - job_kind=constants.MXJOB_KIND, - job_plural=constants.MXJOB_PLURAL, - delete_options=delete_options, - ) - - def patch_mxjob( - self, - mxjob: models.KubeflowOrgV1MXJob, - name: str, - namespace: str = utils.get_default_target_namespace(), - ): - """Patch the MXJob. - - Args: - mxjob: MXJob object of type KubeflowOrgV1MXJob. - name: Name for the MXJob. - namespace: Namespace for the MXJob. - - Raises: - TimeoutError: Timeout to patch MXJob. - RuntimeError: Failed to patch MXJob. 
- """ - - return utils.patch_job( - custom_api=self.custom_api, - job=mxjob, - name=name, - namespace=namespace, - job_kind=constants.MXJOB_KIND, - job_plural=constants.MXJOB_PLURAL, - ) - - # ------------------------------------------------------------------------ # - # XGBoostJob Training Client APIs. - # ------------------------------------------------------------------------ # - def create_xgboostjob( - self, - xgboostjob: models.KubeflowOrgV1XGBoostJob, - namespace=utils.get_default_target_namespace(), - ): - """Create the XGBoostJob. - - Args: - xgboostjob: XGBoostJob object of type KubeflowOrgV1XGBoostJob. - namespace: Namespace for the XGBoostJob. - - Raises: - TimeoutError: Timeout to create XGBoostJob. - RuntimeError: Failed to create XGBoostJob. - """ - - utils.create_job( - custom_api=self.custom_api, - job=xgboostjob, - namespace=namespace, - job_kind=constants.XGBOOSTJOB_KIND, - job_plural=constants.XGBOOSTJOB_PLURAL, - ) - - def create_xgboostjob_from_func(self): - """Create XGBoost from the function. - TODO (andreyvelich): Implement this function. - """ - logging.warning("This API has not been implemented yet.") - - def get_xgboostjob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """Get the XGBoostJob. - - Args: - name: Name for the XGBoostJob. - namespace: Namespace for the XGBoostJob. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - KubeflowOrgV1XGBoostJob: XGBoostJob object. - - Raises: - TimeoutError: Timeout to get XGBoostJob. - RuntimeError: Failed to get XGBoostJob. - """ - - return utils.get_job( - custom_api=self.custom_api, - api_client=self.api_client, - name=name, - namespace=namespace, - job_model=models.KubeflowOrgV1XGBoostJob, - job_kind=constants.XGBOOSTJOB_KIND, - job_plural=constants.XGBOOSTJOB_PLURAL, - timeout=timeout, - ) - - def list_xgboostjobs( - self, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """List of all XGBoostJobs in namespace. - - Args: - namespace: Namespace to list the XGBoostJobs. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - list[KubeflowOrgV1XGBoostJob]: List of XGBoostJobs objects. It returns - empty list if XGBoostJobs cannot be found. - - Raises: - TimeoutError: Timeout to list XGBoostJobs. - RuntimeError: Failed to list XGBoostJobs. - """ - - return utils.list_jobs( - custom_api=self.custom_api, - api_client=self.api_client, - namespace=namespace, - job_model=models.KubeflowOrgV1XGBoostJob, - job_kind=constants.XGBOOSTJOB_KIND, - job_plural=constants.XGBOOSTJOB_PLURAL, - timeout=timeout, - ) - - def delete_xgboostjob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - delete_options: client.V1DeleteOptions = None, - ): - """Delete the XGBoostJob - - Args: - name: Name for the XGBoostJob. - namespace: Namespace for the XGBoostJob. - delete_options: Optional, V1DeleteOptions to set while deleting - the XGBoostJob. For example, grace period seconds. - - Raises: - TimeoutError: Timeout to delete XGBoostJob. - RuntimeError: Failed to delete XGBoostJob. 
- """ - - utils.delete_job( - custom_api=self.custom_api, - name=name, - namespace=namespace, - job_kind=constants.XGBOOSTJOB_KIND, - job_plural=constants.XGBOOSTJOB_PLURAL, - delete_options=delete_options, - ) - - def patch_xgboostjob( - self, - xgboostjob: models.KubeflowOrgV1XGBoostJob, - name: str, - namespace: str = utils.get_default_target_namespace(), - ): - """Patch the XGBoostJob. - - Args: - xgboostjob: XGBoostJob object of type KubeflowOrgV1XGBoostJob. - name: Name for the XGBoostJob. - namespace: Namespace for the XGBoostJob. - - Raises: - TimeoutError: Timeout to patch XGBoostJob. - RuntimeError: Failed to patch XGBoostJob. - """ - - return utils.patch_job( - custom_api=self.custom_api, - job=xgboostjob, - name=name, - namespace=namespace, - job_kind=constants.XGBOOSTJOB_KIND, - job_plural=constants.XGBOOSTJOB_PLURAL, - ) - - # ------------------------------------------------------------------------ # - # MPIJob Training Client APIs. - # ------------------------------------------------------------------------ # - def create_mpijob( - self, - mpijob: models.KubeflowOrgV1MPIJob, - namespace=utils.get_default_target_namespace(), - ): - """Create the MPIJob. - - Args: - mpijob: MPIJob object of type KubeflowOrgV1MPIJob. - namespace: Namespace for the MPIJob. - - Raises: - TimeoutError: Timeout to create MPIJob. - RuntimeError: Failed to create MPIJob. - """ - - utils.create_job( - custom_api=self.custom_api, - job=mpijob, - namespace=namespace, - job_kind=constants.MPIJOB_KIND, - job_plural=constants.MPIJOB_PLURAL, - ) - - def create_mpijob_from_func(self): - """Create MPIJob from the function. - TODO (andreyvelich): Implement this function. - """ - logging.warning("This API has not been implemented yet.") - - def get_mpijob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """Get the MPIJob. - - Args: - name: Name for the MPIJob. - namespace: Namespace for the MPIJob. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - KubeflowOrgV1MPIJob: MPIJob object. - - Raises: - TimeoutError: Timeout to get MPIJob. - RuntimeError: Failed to get MPIJob. - """ - - return utils.get_job( - custom_api=self.custom_api, - api_client=self.api_client, - name=name, - namespace=namespace, - job_model=models.KubeflowOrgV1MPIJob, - job_kind=constants.MPIJOB_KIND, - job_plural=constants.MPIJOB_PLURAL, - timeout=timeout, - ) - - def list_mpijobs( - self, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """List of all MPIJobs in namespace. - - Args: - namespace: Namespace to list the MPIJobs. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - list[KubeflowOrgV1MPIJob]: List of MPIJobs objects. It returns - empty list if MPIJobs cannot be found. - - Raises: - TimeoutError: Timeout to list MPIJobs. - RuntimeError: Failed to list MPIJobs. - """ - - return utils.list_jobs( - custom_api=self.custom_api, - api_client=self.api_client, - namespace=namespace, - job_model=models.KubeflowOrgV1MPIJob, - job_kind=constants.MPIJOB_KIND, - job_plural=constants.MPIJOB_PLURAL, - timeout=timeout, - ) - - def delete_mpijob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - delete_options: client.V1DeleteOptions = None, - ): - """Delete the MPIJob - - Args: - name: Name for the MPIJob. - namespace: Namespace for the MPIJob. 
- delete_options: Optional, V1DeleteOptions to set while deleting - the MPIJob. For example, grace period seconds. - - Raises: - TimeoutError: Timeout to delete MPIJob. - RuntimeError: Failed to delete MPIJob. - """ - - utils.delete_job( - custom_api=self.custom_api, - name=name, - namespace=namespace, - job_kind=constants.MPIJOB_KIND, - job_plural=constants.MPIJOB_PLURAL, - delete_options=delete_options, - ) - - def patch_mpijob( - self, - mpijob: models.KubeflowOrgV1MPIJob, - name: str, - namespace: str = utils.get_default_target_namespace(), - ): - """Patch the MPIJob. - - Args: - mpijob: MPIJob object of type KubeflowOrgV1MPIJob. - name: Name for the MPIJob. - namespace: Namespace for the MPIJob. - - Raises: - TimeoutError: Timeout to patch MPIJob. - RuntimeError: Failed to patch MPIJob. - """ - - return utils.patch_job( - custom_api=self.custom_api, - job=mpijob, - name=name, - namespace=namespace, - job_kind=constants.MPIJOB_KIND, - job_plural=constants.MPIJOB_PLURAL, - ) - - # ------------------------------------------------------------------------ # - # PaddleJob Training Client APIs. - # ------------------------------------------------------------------------ # - def create_paddlejob( - self, - paddlejob: models.KubeflowOrgV1PaddleJob, - namespace=utils.get_default_target_namespace(), - ): - """Create the PaddleJob. - - Args: - paddlejob: PaddleJob object of type KubeflowOrgV1PaddleJob. - namespace: Namespace for the PaddleJob. - - Raises: - TimeoutError: Timeout to create PaddleJob. - RuntimeError: Failed to create PaddleJob. - """ - - utils.create_job( - custom_api=self.custom_api, - job=paddlejob, - namespace=namespace, - job_kind=constants.PADDLEJOB_KIND, - job_plural=constants.PADDLEJOB_PLURAL, - ) - - def create_paddlejob_from_func(self): - """Create PaddleJob from the function. - TODO (andreyvelich): Implement this function. - """ - logging.warning("This API has not been implemented yet.") - - def get_paddlejob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """Get the PaddleJob. - - Args: - name: Name for the PaddleJob. - namespace: Namespace for the PaddleJob. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - KubeflowOrgV1PaddleJob: PaddleJob object. - - Raises: - TimeoutError: Timeout to get PaddleJob. - RuntimeError: Failed to get PaddleJob. - """ - - return utils.get_job( - custom_api=self.custom_api, - api_client=self.api_client, - name=name, - namespace=namespace, - job_model=models.KubeflowOrgV1PaddleJob, - job_kind=constants.PADDLEJOB_KIND, - job_plural=constants.PADDLEJOB_PLURAL, - timeout=timeout, - ) - - def list_paddlejobs( - self, - namespace: str = utils.get_default_target_namespace(), - timeout: int = constants.DEFAULT_TIMEOUT, - ): - """List of all PaddleJobs in namespace. - - Args: - namespace: Namespace to list the PaddleJobs. - timeout: Optional, Kubernetes API server timeout in seconds - to execute the request. - - Returns: - list[KubeflowOrgV1PaddleJob]: List of PaddleJobs objects. It returns - empty list if PaddleJobs cannot be found. - - Raises: - TimeoutError: Timeout to list PaddleJobs. - RuntimeError: Failed to list PaddleJobs. 
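Listing works the same way under the consolidated API; a short sketch that assumes list_jobs accepts the same optional job_kind override as the other methods:

    for job in training_client.list_jobs(
        namespace="default", job_kind=constants.PADDLEJOB_KIND
    ):
        print(job.metadata.name)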
- """ - - return utils.list_jobs( - custom_api=self.custom_api, - api_client=self.api_client, - namespace=namespace, - job_model=models.KubeflowOrgV1PaddleJob, - job_kind=constants.PADDLEJOB_KIND, - job_plural=constants.PADDLEJOB_PLURAL, - timeout=timeout, - ) - - def delete_paddlejob( - self, - name: str, - namespace: str = utils.get_default_target_namespace(), - delete_options: client.V1DeleteOptions = None, - ): - """Delete the PaddleJob - - Args: - name: Name for the PaddleJob. - namespace: Namespace for the PaddleJob. - delete_options: Optional, V1DeleteOptions to set while deleting - the PaddleJob. For example, grace period seconds. - - Raises: - TimeoutError: Timeout to delete PaddleJob. - RuntimeError: Failed to delete PaddleJob. - """ - - utils.delete_job( - custom_api=self.custom_api, - name=name, - namespace=namespace, - job_kind=constants.PADDLEJOB_KIND, - job_plural=constants.PADDLEJOB_PLURAL, - delete_options=delete_options, - ) - - def patch_paddlejob( - self, - paddlejob: models.KubeflowOrgV1PaddleJob, - name: str, - namespace: str = utils.get_default_target_namespace(), - ): - """Patch the PaddleJob. - - Args: - paddlejob: PaddleJob object of type KubeflowOrgV1PaddleJob. - name: Name for the PaddleJob. - namespace: Namespace for the PaddleJob. - - Raises: - TimeoutError: Timeout to patch PaddleJob. - RuntimeError: Failed to patch PaddleJob. - """ - - return utils.patch_job( - custom_api=self.custom_api, - job=paddlejob, - name=name, - namespace=namespace, - job_kind=constants.PADDLEJOB_KIND, - job_plural=constants.PADDLEJOB_PLURAL, - ) + logging.info(f"{job_kind} {namespace}/{name} has been deleted") diff --git a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index 2105fb684a..09547dfdd3 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -13,15 +13,22 @@ # limitations under the License. from kubeflow.training import models +from typing import Union # How long to wait in seconds for requests to the Kubernetes API Server. DEFAULT_TIMEOUT = 120 -# Common constants. -KUBEFLOW_GROUP = "kubeflow.org" -OPERATOR_VERSION = "v1" +# The default PIP index URL to download Python packages. +DEFAULT_PIP_INDEX_URL = "https://pypi.org/simple" + +# Annotation to disable Istio sidecar. ISTIO_SIDECAR_INJECTION = "sidecar.istio.io/inject" +# Common constants. +GROUP = "kubeflow.org" +VERSION = "v1" +API_VERSION = f"{GROUP}/{VERSION}" + # Training Job conditions. JOB_CONDITION_CREATED = "Created" JOB_CONDITION_RUNNING = "Running" @@ -50,12 +57,19 @@ REPLICA_TYPE_PS = "PS" REPLICA_TYPE_MASTER = "Master" REPLICA_TYPE_WORKER = "Worker" +REPLICA_TYPE_SCHEDULER = "Scheduler" +REPLICA_TYPE_SERVER = "Server" +REPLICA_TYPE_LAUNCHER = "Launcher" # TFJob constants. 
TFJOB_KIND = "TFJob" TFJOB_PLURAL = "tfjobs" TFJOB_CONTAINER = "tensorflow" -TFJOB_REPLICA_TYPES = {"ps", "chief", "worker"} +TFJOB_REPLICA_TYPES = ( + REPLICA_TYPE_PS.lower(), + REPLICA_TYPE_CHIEF.lower(), + REPLICA_TYPE_WORKER.lower(), +) TFJOB_BASE_IMAGE = "docker.io/tensorflow/tensorflow:2.9.1" TFJOB_BASE_IMAGE_GPU = "docker.io/tensorflow/tensorflow:2.9.1-gpu" @@ -64,49 +78,89 @@ PYTORCHJOB_KIND = "PyTorchJob" PYTORCHJOB_PLURAL = "pytorchjobs" PYTORCHJOB_CONTAINER = "pytorch" -PYTORCHJOB_REPLICA_TYPES = {"master", "worker"} +PYTORCHJOB_REPLICA_TYPES = (REPLICA_TYPE_MASTER.lower(), REPLICA_TYPE_WORKER.lower()) PYTORCHJOB_BASE_IMAGE = "docker.io/pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime" # MXJob constants MXJOB_KIND = "MXJob" MXJOB_PLURAL = "mxjobs" -MXJOB_REPLICA_TYPES = {"scheduler", "server", "worker"} +MXJOB_CONTAINER = "mxnet" +MXJOB_REPLICA_TYPES = ( + REPLICA_TYPE_SCHEDULER.lower(), + REPLICA_TYPE_SERVER.lower(), + REPLICA_TYPE_WORKER.lower(), +) # XGBoostJob constants XGBOOSTJOB_KIND = "XGBoostJob" XGBOOSTJOB_PLURAL = "xgboostjobs" -XGBOOSTJOB_REPLICA_TYPES = {"master", "worker"} +XGBOOSTJOB_CONTAINER = "xgboost" +XGBOOSTJOB_REPLICA_TYPES = (REPLICA_TYPE_MASTER.lower(), REPLICA_TYPE_WORKER.lower()) # MPIJob constants MPIJOB_KIND = "MPIJob" MPIJOB_PLURAL = "mpijobs" -MPIJOB_REPLICA_TYPES = {"launcher", "worker"} +MPIJOB_CONTAINER = "mpi" +MPIJOB_REPLICA_TYPES = (REPLICA_TYPE_LAUNCHER.lower(), REPLICA_TYPE_WORKER.lower()) # PaddleJob constants PADDLEJOB_KIND = "PaddleJob" PADDLEJOB_PLURAL = "paddlejobs" -PADDLEJOB_REPLICA_TYPES = {"master", "worker"} +PADDLEJOB_CONTAINER = "paddle" +PADDLEJOB_REPLICA_TYPES = (REPLICA_TYPE_MASTER.lower(), REPLICA_TYPE_WORKER.lower()) PADDLEJOB_BASE_IMAGE = ( "docker.io/paddlepaddle/paddle:2.4.0rc0-gpu-cuda11.2-cudnn8.1-trt8.0" ) -# Dictionary to get plural and model for each Job kind. -JOB_KINDS = { - TFJOB_KIND: {"plural": TFJOB_PLURAL, "model": models.KubeflowOrgV1TFJob}, + +# Dictionary to get plural, model, and container for each Job kind. +JOB_PARAMETERS = { + TFJOB_KIND: { + "model": models.KubeflowOrgV1TFJob, + "plural": TFJOB_PLURAL, + "container": TFJOB_CONTAINER, + "base_image": TFJOB_BASE_IMAGE, + }, PYTORCHJOB_KIND: { - "plural": PYTORCHJOB_PLURAL, "model": models.KubeflowOrgV1PyTorchJob, + "plural": PYTORCHJOB_PLURAL, + "container": PYTORCHJOB_CONTAINER, + "base_image": PYTORCHJOB_BASE_IMAGE, + }, + MXJOB_KIND: { + "model": models.KubeflowOrgV1MXJob, + "plural": MXJOB_PLURAL, + "container": MXJOB_CONTAINER, }, - MXJOB_KIND: {"plural": MXJOB_PLURAL, "model": models.KubeflowOrgV1MXJob}, XGBOOSTJOB_KIND: { - "plural": XGBOOSTJOB_PLURAL, "model": models.KubeflowOrgV1XGBoostJob, + "plural": XGBOOSTJOB_PLURAL, + "container": XGBOOSTJOB_CONTAINER, + }, + MPIJOB_KIND: { + "model": models.KubeflowOrgV1MPIJob, + "plural": MPIJOB_PLURAL, + "container": MPIJOB_CONTAINER, }, - MPIJOB_KIND: {"plural": MPIJOB_PLURAL, "model": models.KubeflowOrgV1MPIJob}, PADDLEJOB_KIND: { - "plural": PADDLEJOB_PLURAL, "model": models.KubeflowOrgV1PaddleJob, + "plural": PADDLEJOB_PLURAL, + "container": PADDLEJOB_CONTAINER, + "base_image": PADDLEJOB_BASE_IMAGE, }, } + +# Tuple of all Job models. +JOB_MODELS = tuple([d["model"] for d in list(JOB_PARAMETERS.values())]) + +# Union type of all Job models. 
+JOB_MODELS_TYPE = Union[ + models.KubeflowOrgV1TFJob, + models.KubeflowOrgV1PyTorchJob, + models.KubeflowOrgV1MXJob, + models.KubeflowOrgV1XGBoostJob, + models.KubeflowOrgV1MPIJob, + models.KubeflowOrgV1PaddleJob, +] diff --git a/sdk/python/kubeflow/training/utils/utils.py b/sdk/python/kubeflow/training/utils/utils.py index 84ceb172f3..1659b14cd3 100644 --- a/sdk/python/kubeflow/training/utils/utils.py +++ b/sdk/python/kubeflow/training/utils/utils.py @@ -16,16 +16,13 @@ import logging import textwrap import inspect -from typing import Callable, List, Dict, Any +from typing import Optional, Callable, List, Dict, Any import json import threading import queue -import multiprocessing - -from kubernetes import client from kubeflow.training.constants import constants -from kubeflow.training.api_client import ApiClient +from kubeflow.training import models logging.basicConfig(format="%(message)s") @@ -68,156 +65,6 @@ def get_default_target_namespace(): return f.readline() -def create_job( - custom_api: client.CustomObjectsApi, - job: object, - namespace: str, - job_kind: str, - job_plural: str, -): - """Create the Training Job.""" - - try: - custom_api.create_namespaced_custom_object( - constants.KUBEFLOW_GROUP, - constants.OPERATOR_VERSION, - namespace, - job_plural, - job, - ) - except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to create {job_kind}: {namespace}/{job.metadata.name}" - ) - except Exception: - raise RuntimeError( - f"Failed to create {job_kind}: {namespace}/{job.metadata.name}" - ) - - logging.info(f"{job_kind} {namespace}/{job.metadata.name} has been created") - - -def get_job( - custom_api: client.CustomObjectsApi, - api_client: ApiClient, - name: str, - namespace: str, - job_model: object, - job_kind: str, - job_plural: str, - timeout: int, -): - """Get the Training Job.""" - - try: - thread = custom_api.get_namespaced_custom_object( - constants.KUBEFLOW_GROUP, - constants.OPERATOR_VERSION, - namespace, - job_plural, - name, - async_req=True, - ) - response = FakeResponse(thread.get(timeout)) - job = api_client.deserialize(response, job_model) - return job - - except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to get {job_kind}: {namespace}/{name}") - except Exception: - raise RuntimeError(f"Failed to get {job_kind}: {namespace}/{name}") - - -def list_jobs( - custom_api: client.CustomObjectsApi, - api_client: ApiClient, - namespace: str, - job_model: object, - job_kind: str, - job_plural: str, - timeout: int, -): - """List the Training Jobs.""" - - result = [] - try: - thread = custom_api.list_namespaced_custom_object( - constants.KUBEFLOW_GROUP, - constants.OPERATOR_VERSION, - namespace, - job_plural, - async_req=True, - ) - response = thread.get(timeout) - result = [ - api_client.deserialize(FakeResponse(item), job_model) - for item in response.get("items") - ] - except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout to list {job_kind}s in namespace: {namespace}") - except Exception: - raise RuntimeError(f"Failed to list {job_kind}s in namespace: {namespace}") - return result - - -def delete_job( - custom_api: client.CustomObjectsApi, - name: str, - namespace: str, - job_kind: str, - job_plural: str, - delete_options: client.V1DeleteOptions, -): - """Delete the Training Job.""" - - try: - custom_api.delete_namespaced_custom_object( - constants.KUBEFLOW_GROUP, - constants.OPERATOR_VERSION, - namespace, - job_plural, - name=name, - body=delete_options, - ) - except multiprocessing.TimeoutError: - raise TimeoutError(f"Timeout 
to delete {job_kind}: {namespace}/{name}") - except Exception: - raise RuntimeError(f"Failed to delete {job_kind}: {namespace}/{name}") - - logging.info(f"{job_kind} {namespace}/{name} has been deleted") - - -def patch_job( - custom_api: client.CustomObjectsApi, - job: object, - name: str, - namespace: str, - job_kind: str, - job_plural: str, -): - """Patch the Training Job.""" - - try: - custom_api.patch_namespaced_custom_object( - constants.KUBEFLOW_GROUP, - constants.OPERATOR_VERSION, - namespace, - job_plural, - name, - job, - ) - except multiprocessing.TimeoutError: - raise TimeoutError( - f"Timeout to patch {job_kind}: {namespace}/{job.metadata.name}" - ) - except Exception: - raise RuntimeError( - f"Failed to patch {job_kind}: {namespace}/{job.metadata.name}" - ) - - logging.info(f"{job_kind} {namespace}/{job.metadata.name} has been patched") - - def wrap_log_stream(q, stream): while True: try: @@ -237,8 +84,9 @@ def get_log_queue_pool(streams): return pool -def has_condition(conditions: object, condition_type: str): - """Verify if the condition list has the required condition. +def has_condition(conditions: List[models.V1JobCondition], condition_type: str) -> bool: + """ + Verify if the condition list has the required condition. Condition should be valid object with `type` and `status`. """ @@ -248,7 +96,12 @@ def has_condition(conditions: object, condition_type: str): return False -def get_script_for_python_packages(packages_to_install, pip_index_url): +def get_script_for_python_packages( + packages_to_install: List[str], pip_index_url: str +) -> str: + """ + Get init script to install Python packages from the given pip index URL. + """ packages_str = " ".join([str(package) for package in packages_to_install]) script_for_python_packages = textwrap.dedent( @@ -266,73 +119,184 @@ def get_script_for_python_packages(packages_to_install, pip_index_url): def get_pod_template_spec( - func: Callable, - parameters: Dict[str, Any], - base_image: str, - container_name: str, - packages_to_install: List[str], - pip_index_url: str, + job_kind: str, + base_image: Optional[str] = None, + train_func: Optional[Callable] = None, + parameters: Optional[Dict[str, Any]] = None, + packages_to_install: Optional[List[str]] = None, + pip_index_url: str = constants.DEFAULT_PIP_INDEX_URL, ): """ - Get Pod template spec from the given function and input parameters. + Get Pod template spec for the given function and base image. """ - # Check if function is callable. - if not callable(func): - raise ValueError( - f"Training function must be callable, got function type: {type(func)}" + # Assign the default base image. + # TODO (andreyvelich): Add base image for other Job kinds. + if base_image is None: + base_image = constants.JOB_PARAMETERS[job_kind]["base_image"] + + # Create Pod template spec. + pod_template_spec = models.V1PodTemplateSpec( + metadata=models.V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=models.V1PodSpec( + containers=[ + models.V1Container( + name=constants.JOB_PARAMETERS[job_kind]["container"], + image=base_image, + ) + ] + ), + ) + + # If Training function is set, convert function to container execution script. + if train_func is not None: + # Check if function is callable. + if not callable(train_func): + raise ValueError( + f"Training function must be callable, got function type: {type(train_func)}" + ) + + # Extract function implementation. + func_code = inspect.getsource(train_func) + + # Function might be defined in some indented scope (e.g. 
in another function). + # We need to dedent the function code. + func_code = textwrap.dedent(func_code) + + # Wrap function code to execute it from the file. For example: + # def train(parameters): + # print('Start Training...') + # train({'lr': 0.01}) + if parameters is None: + func_code = f"{func_code}\n{train_func.__name__}()\n" + else: + func_code = f"{func_code}\n{train_func.__name__}({parameters})\n" + + # Prepare execute script template. + exec_script = textwrap.dedent( + """ + program_path=$(mktemp -d) + read -r -d '' SCRIPT << EOM\n + {func_code} + EOM + printf "%s" \"$SCRIPT\" > \"$program_path/ephemeral_script.py\" + python3 -u \"$program_path/ephemeral_script.py\"""" ) - # Extract function implementation. - func_code = inspect.getsource(func) + # Add function code to the execute script. + exec_script = exec_script.format(func_code=func_code) - # Function might be defined in some indented scope (e.g. in another function). - # We need to dedent the function code. - func_code = textwrap.dedent(func_code) + # Install Python packages if that is required. + if packages_to_install is not None: + exec_script = ( + get_script_for_python_packages(packages_to_install, pip_index_url) + + exec_script + ) - # Wrap function code to execute it from the file. For example: - # def train(parameters): - # print('Start Training...') - # train({'lr': 0.01}) - if parameters is None: - func_code = f"{func_code}\n{func.__name__}()\n" - else: - func_code = f"{func_code}\n{func.__name__}({parameters})\n" + # Add execution script to container arguments. + pod_template_spec.spec.containers[0].command = ["bash", "-c"] + pod_template_spec.spec.containers[0].args = [exec_script] - # Prepare execute script template. - exec_script = textwrap.dedent( - """ - program_path=$(mktemp -d) - read -r -d '' SCRIPT << EOM\n - {func_code} - EOM - printf "%s" "$SCRIPT" > $program_path/ephemeral_script.py - python3 -u $program_path/ephemeral_script.py""" + return pod_template_spec + + +def get_tfjob_template( + name: str, + namespace: str, + pod_template_spec: models.V1PodTemplateSpec, + num_worker_replicas: Optional[int] = None, + num_chief_replicas: Optional[int] = None, + num_ps_replicas: Optional[int] = None, +): + # Check if at least one replica is set. + # TODO (andreyvelich): Remove this check once we have CEL validation. + # Ref: https://github.com/kubeflow/training-operator/issues/1708 + if ( + num_worker_replicas is None + and num_chief_replicas is None + and num_ps_replicas is None + ): + raise ValueError("At least one replica for TFJob must be set") + + # Create TFJob template. + tfjob = models.KubeflowOrgV1TFJob( + api_version=constants.API_VERSION, + kind=constants.TFJOB_KIND, + metadata=models.V1ObjectMeta(name=name, namespace=namespace), + spec=models.KubeflowOrgV1TFJobSpec( + run_policy=models.KubeflowOrgV1RunPolicy(clean_pod_policy=None), + tf_replica_specs={}, + ), ) - # Add function code to the execute script. - exec_script = exec_script.format(func_code=func_code) + # Add Chief, PS, and Worker replicas to the TFJob. + if num_chief_replicas is not None: + tfjob.spec.tf_replica_specs[ + constants.REPLICA_TYPE_CHIEF + ] = models.KubeflowOrgV1ReplicaSpec( + replicas=num_chief_replicas, + template=pod_template_spec, + ) - # Install Python packages if that is required. 
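Putting the new helpers together, a sketch of a from-func TFJob (module paths, names, and the training function are illustrative; get_pod_template_spec is defined above and get_tfjob_template continues just below):

    from kubeflow.training.constants import constants
    from kubeflow.training.utils import utils

    def train(parameters):
        print(f"Training with lr={parameters['lr']}")

    # Wraps train() into the ephemeral-script container shown above.
    pod_template = utils.get_pod_template_spec(
        job_kind=constants.TFJOB_KIND,     # resolves container name and base image
        train_func=train,
        parameters={"lr": 0.01},
        packages_to_install=["numpy"],     # pip-installed before the script runs
    )
    tfjob = utils.get_tfjob_template(
        name="tf-from-func",
        namespace="default",
        pod_template_spec=pod_template,
        num_worker_replicas=2,
    )
    training_client.create_job(job=tfjob, namespace="default")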
- if packages_to_install is not None: - exec_script = ( - get_script_for_python_packages(packages_to_install, pip_index_url) - + exec_script + if num_ps_replicas is not None: + tfjob.spec.tf_replica_specs[ + constants.REPLICA_TYPE_PS + ] = models.KubeflowOrgV1ReplicaSpec( + replicas=num_ps_replicas, + template=pod_template_spec, ) - # Create Pod template spec. - pod_template_spec = client.V1PodTemplateSpec( - metadata=client.V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=client.V1PodSpec( - containers=[ - client.V1Container( - name=container_name, - image=base_image, - command=["bash", "-c"], - args=[exec_script], - ) - ] + if num_worker_replicas is not None: + tfjob.spec.tf_replica_specs[ + constants.REPLICA_TYPE_WORKER + ] = models.KubeflowOrgV1ReplicaSpec( + replicas=num_worker_replicas, + template=pod_template_spec, + ) + + return tfjob + + +def get_pytorchjob_template( + name: str, + namespace: str, + pod_template_spec: models.V1PodTemplateSpec, + num_worker_replicas: Optional[int] = None, +): + # Check if at least one replica is set. + # TODO (andreyvelich): Remove this check once we have CEL validation. + # Ref: https://github.com/kubeflow/training-operator/issues/1708 + if num_worker_replicas is None: + raise ValueError("At least one Worker replica for PyTorchJob must be set") + + # Create PyTorchJob template. + pytorchjob = models.KubeflowOrgV1PyTorchJob( + api_version=constants.API_VERSION, + kind=constants.PYTORCHJOB_KIND, + metadata=models.V1ObjectMeta(name=name, namespace=namespace), + spec=models.KubeflowOrgV1PyTorchJobSpec( + run_policy=models.KubeflowOrgV1RunPolicy(clean_pod_policy=None), + pytorch_replica_specs={}, ), ) - return pod_template_spec + # Add Master and Worker replicas to the PyTorchJob. + pytorchjob.spec.pytorch_replica_specs[ + constants.REPLICA_TYPE_MASTER + ] = models.KubeflowOrgV1ReplicaSpec( + replicas=1, + template=pod_template_spec, + ) + + # If number of Worker replicas is 1, PyTorchJob uses only Master replica. 
+    if num_worker_replicas != 1:
+        pytorchjob.spec.pytorch_replica_specs[
+            constants.REPLICA_TYPE_WORKER
+        ] = models.KubeflowOrgV1ReplicaSpec(
+            replicas=num_worker_replicas,
+            template=pod_template_spec,
+        )
+
+    return pytorchjob
diff --git a/sdk/python/setup.py b/sdk/python/setup.py
index c837d447db..fc93bac67d 100644
--- a/sdk/python/setup.py
+++ b/sdk/python/setup.py
@@ -14,7 +14,13 @@
 
 import setuptools
 
-TESTS_REQUIRES = ["pytest", "pytest-tornasync", "mypy"]
+TESTS_REQUIRES = [
+    "pytest",
+    "pytest-tornasync",
+    "mypy",
+    "black==23.9.1",
+    "flake8==4.0.1",
+]
 
 REQUIRES = [
     "certifi>=14.05.14",
diff --git a/sdk/python/test/conftest.py b/sdk/python/test/conftest.py
index 1371cb5374..756906f27f 100644
--- a/sdk/python/test/conftest.py
+++ b/sdk/python/test/conftest.py
@@ -1,5 +1,6 @@
 import pytest
 
+
 def pytest_addoption(parser):
     parser.addoption("--namespace", action="store", default="default")
 
diff --git a/sdk/python/test/e2e/constants.py b/sdk/python/test/e2e/constants.py
index 0eb28d72f8..04be27836f 100644
--- a/sdk/python/test/e2e/constants.py
+++ b/sdk/python/test/e2e/constants.py
@@ -17,7 +17,10 @@
 TEST_GANG_SCHEDULER_NAME_VOLCANO = "volcano"
 TEST_GANG_SCHEDULER_NAME_NONE = "none"
 
-GANG_SCHEDULERS = {TEST_GANG_SCHEDULER_NAME_SCHEDULER_PLUGINS, TEST_GANG_SCHEDULER_NAME_VOLCANO}
+GANG_SCHEDULERS = {
+    TEST_GANG_SCHEDULER_NAME_SCHEDULER_PLUGINS,
+    TEST_GANG_SCHEDULER_NAME_VOLCANO,
+}
 NONE_GANG_SCHEDULERS = {TEST_GANG_SCHEDULER_NAME_NONE, ""}
 
 DEFAULT_SCHEDULER_PLUGINS_NAME = "scheduler-plugins-scheduler"
diff --git a/sdk/python/test/e2e/test_e2e_mpijob.py b/sdk/python/test/e2e/test_e2e_mpijob.py
index aa34fde75e..abc2e78d50 100644
--- a/sdk/python/test/e2e/test_e2e_mpijob.py
+++ b/sdk/python/test/e2e/test_e2e_mpijob.py
@@ -15,7 +15,7 @@
 import os
 import logging
 import pytest
-from typing import Tuple
+from typing import Tuple, Optional
 
 from kubernetes.client import V1PodTemplateSpec
 from kubernetes.client import V1ObjectMeta
@@ -31,21 +31,22 @@
 from kubeflow.training import KubeflowOrgV1SchedulingPolicy
 from kubeflow.training.constants import constants
 
-from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name
+import test.e2e.utils as utils
 from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
 from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS
 
 logging.basicConfig(format="%(message)s")
 logging.getLogger().setLevel(logging.INFO)
 
-TRAINING_CLIENT = TrainingClient()
+TRAINING_CLIENT = TrainingClient(job_kind=constants.MPIJOB_KIND)
 JOB_NAME = "mpijob-mxnet-ci-test"
 CONTAINER_NAME = "mpi"
-GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY)
+GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY, "")
 
 
 @pytest.mark.skipif(
-    GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling",
+    GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS,
+    reason="For gang-scheduling",
 )
 def test_sdk_e2e_with_gang_scheduling(job_namespace):
     launcher_container, worker_container = generate_containers()
@@ -54,11 +55,13 @@
         replicas=1,
         restart_policy="Never",
         template=V1PodTemplateSpec(
-            metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
+            metadata=V1ObjectMeta(
+                annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}
+            ),
             spec=V1PodSpec(
                 containers=[launcher_container],
-                scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
-            )
+                scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
+            ),
         ),
     )
 
@@ 
-66,45 +69,52 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): replicas=1, restart_policy="Never", template=V1PodTemplateSpec( - metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), spec=V1PodSpec( containers=[worker_container], - scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), - ) + scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), + ), ), ) - mpijob = generate_mpijob(launcher, worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace) - patched_mpijob = generate_mpijob(launcher, worker, KubeflowOrgV1SchedulingPolicy(min_available=2), job_namespace) + mpijob = generate_mpijob( + job_namespace, launcher, worker, KubeflowOrgV1SchedulingPolicy(min_available=10) + ) + patched_mpijob = generate_mpijob( + job_namespace, launcher, worker, KubeflowOrgV1SchedulingPolicy(min_available=2) + ) - TRAINING_CLIENT.create_mpijob(mpijob, job_namespace) - logging.info(f"List of created {constants.MPIJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_mpijobs(job_namespace)) + TRAINING_CLIENT.create_job(job=mpijob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_unschedulable_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.MPIJOB_KIND, - ) + try: + utils.verify_unschedulable_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"MPIJob E2E fails. Exception: {e}") - TRAINING_CLIENT.patch_mpijob(patched_mpijob, JOB_NAME, job_namespace) - logging.info(f"List of patched {constants.MPIJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_mpijobs(job_namespace)) + TRAINING_CLIENT.update_job(patched_mpijob, JOB_NAME, job_namespace) + logging.info(f"List of updated {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.MPIJOB_KIND, - CONTAINER_NAME, - ) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"MPIJob E2E fails. 
Exception: {e}") - TRAINING_CLIENT.delete_mpijob(JOB_NAME, job_namespace) + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) @pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", ) def test_sdk_e2e(job_namespace): launcher_container, worker_container = generate_containers() @@ -112,43 +122,51 @@ def test_sdk_e2e(job_namespace): launcher = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="Never", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[launcher_container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[launcher_container]), + ), ) worker = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="Never", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[worker_container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[worker_container]), + ), ) - mpijob = generate_mpijob(launcher, worker, job_namespace=job_namespace) + mpijob = generate_mpijob(job_namespace, launcher, worker) - TRAINING_CLIENT.create_mpijob(mpijob, job_namespace) - logging.info(f"List of created {constants.MPIJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_mpijobs(job_namespace)) + TRAINING_CLIENT.create_job(job=mpijob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.MPIJOB_KIND, - CONTAINER_NAME, - ) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"MPIJob E2E fails. 
Exception: {e}") - TRAINING_CLIENT.delete_mpijob(JOB_NAME, job_namespace) + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) def generate_mpijob( + job_namespace: str, launcher: KubeflowOrgV1ReplicaSpec, worker: KubeflowOrgV1ReplicaSpec, - scheduling_policy: KubeflowOrgV1SchedulingPolicy = None, - job_namespace: str = "default", + scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None, ) -> KubeflowOrgV1MPIJob: return KubeflowOrgV1MPIJob( - api_version="kubeflow.org/v1", - kind="MPIJob", + api_version=constants.API_VERSION, + kind=constants.MPIJOB_KIND, metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace), spec=KubeflowOrgV1MPIJobSpec( slots_per_worker=1, diff --git a/sdk/python/test/e2e/test_e2e_mxjob.py b/sdk/python/test/e2e/test_e2e_mxjob.py index 7d0b36370e..d0db8de467 100644 --- a/sdk/python/test/e2e/test_e2e_mxjob.py +++ b/sdk/python/test/e2e/test_e2e_mxjob.py @@ -15,7 +15,7 @@ import os import logging import pytest -from typing import Tuple +from typing import Tuple, Optional from kubernetes.client import V1PodTemplateSpec from kubernetes.client import V1ObjectMeta @@ -32,21 +32,22 @@ from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training.constants import constants -from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name +import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS logging.basicConfig(format="%(message)s") logging.getLogger().setLevel(logging.INFO) -TRAINING_CLIENT = TrainingClient() +TRAINING_CLIENT = TrainingClient(job_kind=constants.MXJOB_KIND) JOB_NAME = "mxjob-mnist-ci-test" CONTAINER_NAME = "mxnet" -GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY) +GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY, "") @pytest.mark.skipif( - GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling", + GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, + reason="For gang-scheduling", ) def test_sdk_e2e_with_gang_scheduling(job_namespace): worker_container, server_container, scheduler_container = generate_containers() @@ -55,11 +56,13 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): replicas=1, restart_policy="Never", template=V1PodTemplateSpec( - metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), spec=V1PodSpec( containers=[worker_container], - scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), - ) + scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), + ), ), ) @@ -67,11 +70,13 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): replicas=1, restart_policy="Never", template=V1PodTemplateSpec( - metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), spec=V1PodSpec( containers=[server_container], - scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), - ) + scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), + ), ), ) @@ -79,45 +84,60 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): replicas=1, restart_policy="Never", template=V1PodTemplateSpec( - metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), + metadata=V1ObjectMeta( + 
annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), spec=V1PodSpec( containers=[scheduler_container], - scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), - ) + scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), + ), ), ) - unschedulable_mxjob = generate_mxjob(scheduler, server, worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace) - schedulable_mxjob = generate_mxjob(scheduler, server, worker, KubeflowOrgV1SchedulingPolicy(min_available=3), job_namespace) - - TRAINING_CLIENT.create_mxjob(unschedulable_mxjob, job_namespace) - logging.info(f"List of created {constants.MXJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_mxjobs(job_namespace)) - - verify_unschedulable_job_e2e( - TRAINING_CLIENT, - JOB_NAME, + unschedulable_mxjob = generate_mxjob( job_namespace, - constants.MXJOB_KIND, + scheduler, + server, + worker, + KubeflowOrgV1SchedulingPolicy(min_available=10), ) - - TRAINING_CLIENT.patch_mxjob(schedulable_mxjob, JOB_NAME, job_namespace) - logging.info(f"List of patched {constants.MXJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_mxjobs(job_namespace)) - - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, + schedulable_mxjob = generate_mxjob( job_namespace, - constants.MXJOB_KIND, - CONTAINER_NAME, + scheduler, + server, + worker, + KubeflowOrgV1SchedulingPolicy(min_available=3), ) - TRAINING_CLIENT.delete_mxjob(JOB_NAME, job_namespace) + TRAINING_CLIENT.create_job(job=unschedulable_mxjob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_unschedulable_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"MXJob E2E fails. Exception: {e}") + + TRAINING_CLIENT.update_job(schedulable_mxjob, JOB_NAME, job_namespace) + logging.info(f"List of updated {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) + + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"MXJob E2E fails. 
Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) @pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", ) def test_sdk_e2e(job_namespace): worker_container, server_container, scheduler_container = generate_containers() @@ -125,51 +145,63 @@ def test_sdk_e2e(job_namespace): worker = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="Never", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[worker_container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[worker_container]), + ), ) server = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="Never", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[server_container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[server_container]), + ), ) scheduler = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="Never", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[scheduler_container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[scheduler_container]), + ), ) - mxjob = generate_mxjob(scheduler, server, worker, job_namespace=job_namespace) + mxjob = generate_mxjob(job_namespace, scheduler, server, worker) - TRAINING_CLIENT.create_mxjob(mxjob, job_namespace) - logging.info(f"List of created {constants.MXJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_mxjobs(job_namespace)) + TRAINING_CLIENT.create_job(job=mxjob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.MXJOB_KIND, - CONTAINER_NAME, - ) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"MXJob E2E fails. 
Exception: {e}") - TRAINING_CLIENT.delete_mxjob(JOB_NAME, job_namespace) + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) def generate_mxjob( + job_namespace: str, scheduler: KubeflowOrgV1ReplicaSpec, server: KubeflowOrgV1ReplicaSpec, worker: KubeflowOrgV1ReplicaSpec, - scheduling_policy: KubeflowOrgV1SchedulingPolicy = None, - job_namespace: str = "default", + scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None, ) -> KubeflowOrgV1MXJob: return KubeflowOrgV1MXJob( - api_version="kubeflow.org/v1", - kind="MXJob", + api_version=constants.API_VERSION, + kind=constants.MXJOB_KIND, metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace), spec=KubeflowOrgV1MXJobSpec( job_mode="MXTrain", diff --git a/sdk/python/test/e2e/test_e2e_paddlejob.py b/sdk/python/test/e2e/test_e2e_paddlejob.py index 8f138492fc..7b41d729e9 100644 --- a/sdk/python/test/e2e/test_e2e_paddlejob.py +++ b/sdk/python/test/e2e/test_e2e_paddlejob.py @@ -15,6 +15,7 @@ import os import logging import pytest +from typing import Optional from kubernetes.client import V1PodTemplateSpec from kubernetes.client import V1ObjectMeta @@ -30,21 +31,22 @@ from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training.constants import constants -from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name +import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS logging.basicConfig(format="%(message)s") logging.getLogger().setLevel(logging.INFO) -TRAINING_CLIENT = TrainingClient() +TRAINING_CLIENT = TrainingClient(job_kind=constants.PADDLEJOB_KIND) JOB_NAME = "paddlejob-cpu-ci-test" CONTAINER_NAME = "paddle" -GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY) +GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY, "") @pytest.mark.skipif( - GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling", + GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, + reason="For gang-scheduling", ) def test_sdk_e2e_with_gang_scheduling(job_namespace): container = generate_container() @@ -53,45 +55,52 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): replicas=2, restart_policy="OnFailure", template=V1PodTemplateSpec( - metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), spec=V1PodSpec( - scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), + scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), containers=[container], - ) + ), ), ) - unschedulable_paddlejob = generate_paddlejob(worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace) - schedulable_paddlejob = generate_paddlejob(worker, KubeflowOrgV1SchedulingPolicy(min_available=2), job_namespace) + unschedulable_paddlejob = generate_paddlejob( + job_namespace, worker, KubeflowOrgV1SchedulingPolicy(min_available=10) + ) + schedulable_paddlejob = generate_paddlejob( + job_namespace, worker, KubeflowOrgV1SchedulingPolicy(min_available=2) + ) - TRAINING_CLIENT.create_paddlejob(unschedulable_paddlejob, job_namespace) - logging.info(f"List of created {constants.PADDLEJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_paddlejobs(job_namespace)) + TRAINING_CLIENT.create_job(job=unschedulable_paddlejob, namespace=job_namespace) + logging.info(f"List of created 
{TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_unschedulable_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.PADDLEJOB_KIND, - ) + try: + utils.verify_unschedulable_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PaddleJob E2E fails. Exception: {e}") - TRAINING_CLIENT.patch_paddlejob(schedulable_paddlejob, JOB_NAME, job_namespace) - logging.info(f"List of patched {constants.PADDLEJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_paddlejobs(job_namespace)) + TRAINING_CLIENT.update_job(schedulable_paddlejob, JOB_NAME, job_namespace) + logging.info(f"List of updated {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.PADDLEJOB_KIND, - CONTAINER_NAME, - ) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PaddleJob E2E fails. Exception: {e}") - TRAINING_CLIENT.delete_paddlejob(JOB_NAME, job_namespace) + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) @pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", ) def test_sdk_e2e(job_namespace): container = generate_container() @@ -99,35 +108,39 @@ def test_sdk_e2e(job_namespace): worker = KubeflowOrgV1ReplicaSpec( replicas=2, restart_policy="OnFailure", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[container]), + ), ) - paddlejob = generate_paddlejob(worker, job_namespace=job_namespace) + paddlejob = generate_paddlejob(job_namespace, worker) - TRAINING_CLIENT.create_paddlejob(paddlejob, job_namespace) - logging.info(f"List of created {constants.PADDLEJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_paddlejobs(job_namespace)) + TRAINING_CLIENT.create_job(job=paddlejob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.PADDLEJOB_KIND, - CONTAINER_NAME, - ) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PaddleJob E2E fails. 
Exception: {e}") - TRAINING_CLIENT.delete_paddlejob(JOB_NAME, job_namespace) + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) def generate_paddlejob( + job_namespace: str, worker: KubeflowOrgV1ReplicaSpec, - scheduling_policy: KubeflowOrgV1SchedulingPolicy = None, - job_namespace: str = "default", + scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None, ) -> KubeflowOrgV1PaddleJob: return KubeflowOrgV1PaddleJob( - api_version="kubeflow.org/v1", - kind="PaddleJob", + api_version=constants.API_VERSION, + kind=constants.PADDLEJOB_KIND, metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace), spec=KubeflowOrgV1PaddleJobSpec( run_policy=KubeflowOrgV1RunPolicy( diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index 22a9b11b83..d7cc976f09 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -15,6 +15,7 @@ import os import logging import pytest +from typing import Optional from kubernetes.client import V1PodTemplateSpec from kubernetes.client import V1ObjectMeta @@ -28,23 +29,24 @@ from kubeflow.training import KubeflowOrgV1PyTorchJobSpec from kubeflow.training import KubeflowOrgV1RunPolicy from kubeflow.training import KubeflowOrgV1SchedulingPolicy -from kubeflow.training.constants import constants +from kubeflow.training import constants -from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name +import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS logging.basicConfig(format="%(message)s") logging.getLogger().setLevel(logging.INFO) -TRAINING_CLIENT = TrainingClient() +TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND) JOB_NAME = "pytorchjob-mnist-ci-test" CONTAINER_NAME = "pytorch" -GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY) +GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY, "") @pytest.mark.skipif( - GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling", + GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, + reason="For gang-scheduling", ) def test_sdk_e2e_with_gang_scheduling(job_namespace): container = generate_container() @@ -53,11 +55,13 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): replicas=1, restart_policy="OnFailure", template=V1PodTemplateSpec( - metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), spec=V1PodSpec( - scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), + scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), containers=[container], - ) + ), ), ) @@ -65,46 +69,55 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): replicas=1, restart_policy="OnFailure", template=V1PodTemplateSpec( - metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), spec=V1PodSpec( - scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), + scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), containers=[container], - ) + ), ), ) - unschedulable_pytorchjob = generate_pytorchjob(master, worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace) - schedulable_pytorchjob = 
generate_pytorchjob(master, worker, KubeflowOrgV1SchedulingPolicy(min_available=2), job_namespace) - - TRAINING_CLIENT.create_pytorchjob(unschedulable_pytorchjob, job_namespace) - logging.info(f"List of created {constants.PYTORCHJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_pytorchjobs(job_namespace)) - - verify_unschedulable_job_e2e( - TRAINING_CLIENT, - JOB_NAME, + unschedulable_pytorchjob = generate_pytorchjob( job_namespace, - constants.PYTORCHJOB_KIND, + master, + worker, + KubeflowOrgV1SchedulingPolicy(min_available=10), + ) + schedulable_pytorchjob = generate_pytorchjob( + job_namespace, master, worker, KubeflowOrgV1SchedulingPolicy(min_available=2) ) - TRAINING_CLIENT.patch_pytorchjob(schedulable_pytorchjob, JOB_NAME, job_namespace) - logging.info(f"List of patched {constants.PYTORCHJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_pytorchjobs(job_namespace)) + TRAINING_CLIENT.create_job(job=unschedulable_pytorchjob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.PYTORCHJOB_KIND, - CONTAINER_NAME, - timeout=900, - ) + try: + utils.verify_unschedulable_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob E2E fails. Exception: {e}") + + TRAINING_CLIENT.update_job(schedulable_pytorchjob, JOB_NAME, job_namespace) + logging.info(f"List of updated {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob E2E fails. 
Exception: {e}") + + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) @pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", ) def test_sdk_e2e(job_namespace): container = generate_container() @@ -112,44 +125,51 @@ def test_sdk_e2e(job_namespace): master = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="OnFailure", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[container]), + ), ) worker = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="OnFailure", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[container]), + ), ) - pytorchjob = generate_pytorchjob(master, worker, job_namespace=job_namespace) + pytorchjob = generate_pytorchjob(job_namespace, master, worker) - TRAINING_CLIENT.create_pytorchjob(pytorchjob, job_namespace) - logging.info(f"List of created {constants.PYTORCHJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_pytorchjobs(job_namespace)) + TRAINING_CLIENT.create_job(job=pytorchjob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.PYTORCHJOB_KIND, - CONTAINER_NAME, - timeout=900, - ) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob E2E fails. 
Exception: {e}") - TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace) + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) def generate_pytorchjob( + job_namespace: str, master: KubeflowOrgV1ReplicaSpec, worker: KubeflowOrgV1ReplicaSpec, - scheduling_policy: KubeflowOrgV1SchedulingPolicy = None, - job_namespace: str = "default", + scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None, ) -> KubeflowOrgV1PyTorchJob: return KubeflowOrgV1PyTorchJob( - api_version="kubeflow.org/v1", - kind="PyTorchJob", + api_version=constants.API_VERSION, + kind=constants.PYTORCHJOB_KIND, metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace), spec=KubeflowOrgV1PyTorchJobSpec( run_policy=KubeflowOrgV1RunPolicy( diff --git a/sdk/python/test/e2e/test_e2e_tfjob.py b/sdk/python/test/e2e/test_e2e_tfjob.py index 6eaa086a59..9f14cf1877 100644 --- a/sdk/python/test/e2e/test_e2e_tfjob.py +++ b/sdk/python/test/e2e/test_e2e_tfjob.py @@ -15,6 +15,7 @@ import os import logging import pytest +from typing import Optional from kubernetes.client import V1PodTemplateSpec from kubernetes.client import V1ObjectMeta @@ -30,21 +31,22 @@ from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training.constants import constants -from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name +import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS logging.basicConfig(format="%(message)s") logging.getLogger().setLevel(logging.INFO) -TRAINING_CLIENT = TrainingClient() +TRAINING_CLIENT = TrainingClient(job_kind=constants.TFJOB_KIND) JOB_NAME = "tfjob-mnist-ci-test" CONTAINER_NAME = "tensorflow" -GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY) +GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY, "") @pytest.mark.skipif( - GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling", + GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, + reason="For gang-scheduling", ) def test_sdk_e2e_with_gang_scheduling(job_namespace): container = generate_container() @@ -53,45 +55,52 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): replicas=1, restart_policy="Never", template=V1PodTemplateSpec( - metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), spec=V1PodSpec( containers=[container], - scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), - ) + scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), + ), ), ) - unschedulable_tfjob = generate_tfjob(worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace) - schedulable_tfjob = generate_tfjob(worker, KubeflowOrgV1SchedulingPolicy(min_available=1), job_namespace) + unschedulable_tfjob = generate_tfjob( + job_namespace, worker, KubeflowOrgV1SchedulingPolicy(min_available=10) + ) + schedulable_tfjob = generate_tfjob( + job_namespace, worker, KubeflowOrgV1SchedulingPolicy(min_available=1) + ) - TRAINING_CLIENT.create_tfjob(unschedulable_tfjob, job_namespace) - logging.info(f"List of created {constants.TFJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_tfjobs(job_namespace)) + TRAINING_CLIENT.create_job(job=unschedulable_tfjob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + 
logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_unschedulable_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.TFJOB_KIND, - ) + try: + utils.verify_unschedulable_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"TFJob E2E fails. Exception: {e}") - TRAINING_CLIENT.patch_tfjob(schedulable_tfjob, JOB_NAME, job_namespace) - logging.info(f"List of patched {constants.TFJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_tfjobs(job_namespace)) + TRAINING_CLIENT.update_job(schedulable_tfjob, JOB_NAME, job_namespace) + logging.info(f"List of updated {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.TFJOB_KIND, - CONTAINER_NAME, - ) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"TFJob E2E fails. Exception: {e}") - TRAINING_CLIENT.delete_tfjob(JOB_NAME, job_namespace) + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) @pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", ) def test_sdk_e2e(job_namespace): container = generate_container() @@ -99,31 +108,39 @@ def test_sdk_e2e(job_namespace): worker = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="Never", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[container]), + ), ) - tfjob = generate_tfjob(worker, job_namespace=job_namespace) + tfjob = generate_tfjob(job_namespace, worker) - TRAINING_CLIENT.create_tfjob(tfjob, job_namespace) - logging.info(f"List of created {constants.TFJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_tfjobs(job_namespace)) + TRAINING_CLIENT.create_job(job=tfjob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, JOB_NAME, job_namespace, constants.TFJOB_KIND, CONTAINER_NAME, - ) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"TFJob E2E fails. 
Exception: {e}") - TRAINING_CLIENT.delete_tfjob(JOB_NAME, job_namespace) + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) def generate_tfjob( + job_namespace: str, worker: KubeflowOrgV1ReplicaSpec, - scheduling_policy: KubeflowOrgV1SchedulingPolicy = None, - job_namespace: str = "default", + scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None, ) -> KubeflowOrgV1TFJob: return KubeflowOrgV1TFJob( - api_version="kubeflow.org/v1", - kind="TFJob", + api_version=constants.API_VERSION, + kind=constants.TFJOB_KIND, metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace), spec=KubeflowOrgV1TFJobSpec( run_policy=KubeflowOrgV1RunPolicy( diff --git a/sdk/python/test/e2e/test_e2e_xgboostjob.py b/sdk/python/test/e2e/test_e2e_xgboostjob.py index 0f0542e909..1c334e2b78 100644 --- a/sdk/python/test/e2e/test_e2e_xgboostjob.py +++ b/sdk/python/test/e2e/test_e2e_xgboostjob.py @@ -15,6 +15,7 @@ import os import logging import pytest +from typing import Optional from kubernetes.client import V1PodTemplateSpec from kubernetes.client import V1ObjectMeta @@ -30,21 +31,22 @@ from kubeflow.training import KubeflowOrgV1SchedulingPolicy from kubeflow.training.constants import constants -from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name +import test.e2e.utils as utils from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS logging.basicConfig(format="%(message)s") logging.getLogger().setLevel(logging.INFO) -TRAINING_CLIENT = TrainingClient() +TRAINING_CLIENT = TrainingClient(job_kind=constants.XGBOOSTJOB_KIND) JOB_NAME = "xgboostjob-iris-ci-test" CONTAINER_NAME = "xgboost" -GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY) +GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY, "") @pytest.mark.skipif( - GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling", + GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, + reason="For gang-scheduling", ) def test_sdk_e2e_with_gang_scheduling(job_namespace): container = generate_container() @@ -53,11 +55,13 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): replicas=1, restart_policy="OnFailure", template=V1PodTemplateSpec( - metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), spec=V1PodSpec( containers=[container], - scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), - ) + scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), + ), ), ) @@ -65,45 +69,52 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace): replicas=1, restart_policy="OnFailure", template=V1PodTemplateSpec( - metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), spec=V1PodSpec( containers=[container], - scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), - ) + scheduler_name=utils.get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME), + ), ), ) - unschedulable_xgboostjob = generate_xgboostjob(master, worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace) - schedulable_xgboostjob = generate_xgboostjob(master, worker, KubeflowOrgV1SchedulingPolicy(min_available=2), job_namespace) + unschedulable_xgboostjob = generate_xgboostjob( + job_namespace, master, worker, 
KubeflowOrgV1SchedulingPolicy(min_available=10) + ) + schedulable_xgboostjob = generate_xgboostjob( + job_namespace, master, worker, KubeflowOrgV1SchedulingPolicy(min_available=2) + ) - TRAINING_CLIENT.create_xgboostjob(unschedulable_xgboostjob, job_namespace) - logging.info(f"List of created {constants.XGBOOSTJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_xgboostjobs(job_namespace)) + TRAINING_CLIENT.create_job(job=unschedulable_xgboostjob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_unschedulable_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.XGBOOSTJOB_KIND, - ) + try: + utils.verify_unschedulable_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"XGBoostJob E2E fails. Exception: {e}") - TRAINING_CLIENT.patch_xgboostjob(schedulable_xgboostjob, JOB_NAME, job_namespace) - logging.info(f"List of patched {constants.XGBOOSTJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_xgboostjobs(job_namespace)) + TRAINING_CLIENT.update_job(schedulable_xgboostjob, JOB_NAME, job_namespace) + logging.info(f"List of updated {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.XGBOOSTJOB_KIND, - CONTAINER_NAME, - ) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"XGBoostJob E2E fails. 
Exception: {e}") - TRAINING_CLIENT.delete_xgboostjob(JOB_NAME, job_namespace) + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) @pytest.mark.skipif( - GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", ) def test_sdk_e2e(job_namespace): container = generate_container() @@ -111,43 +122,51 @@ def test_sdk_e2e(job_namespace): master = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="OnFailure", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[container]), + ), ) worker = KubeflowOrgV1ReplicaSpec( replicas=1, restart_policy="OnFailure", - template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}), - spec=V1PodSpec(containers=[container])), + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[container]), + ), ) - xgboostjob = generate_xgboostjob(master, worker, job_namespace=job_namespace) + xgboostjob = generate_xgboostjob(job_namespace, master, worker) - TRAINING_CLIENT.create_xgboostjob(xgboostjob, job_namespace) - logging.info(f"List of created {constants.XGBOOSTJOB_KIND}s") - logging.info(TRAINING_CLIENT.list_xgboostjobs(job_namespace)) + TRAINING_CLIENT.create_job(job=xgboostjob, namespace=job_namespace) + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + logging.info(TRAINING_CLIENT.list_jobs(job_namespace)) - verify_job_e2e( - TRAINING_CLIENT, - JOB_NAME, - job_namespace, - constants.XGBOOSTJOB_KIND, - CONTAINER_NAME, - ) + try: + utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900) + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"XGBoostJob E2E fails. 
Exception: {e}") - TRAINING_CLIENT.delete_xgboostjob(JOB_NAME, job_namespace) + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) def generate_xgboostjob( + job_namespace: str, master: KubeflowOrgV1ReplicaSpec, worker: KubeflowOrgV1ReplicaSpec, - scheduling_policy: KubeflowOrgV1SchedulingPolicy = None, - job_namespace: str = "default", + scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None, ) -> KubeflowOrgV1XGBoostJob: return KubeflowOrgV1XGBoostJob( - api_version="kubeflow.org/v1", - kind="XGBoostJob", + api_version=constants.API_VERSION, + kind=constants.XGBOOSTJOB_KIND, metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace), spec=KubeflowOrgV1XGBoostJobSpec( run_policy=KubeflowOrgV1RunPolicy( diff --git a/sdk/python/test/e2e/utils.py b/sdk/python/test/e2e/utils.py index 32994be79c..a52393bbfc 100644 --- a/sdk/python/test/e2e/utils.py +++ b/sdk/python/test/e2e/utils.py @@ -11,65 +11,57 @@ logging.getLogger().setLevel(logging.INFO) -def verify_unschedulable_job_e2e( - client: TrainingClient, name: str, namespace: str, job_kind: str -): +def verify_unschedulable_job_e2e(client: TrainingClient, name: str, namespace: str): """Verify unschedulable Training Job e2e test.""" - logging.info(f"\n\n\n{job_kind} is creating") - client.wait_for_job_conditions(name, namespace, job_kind, {constants.JOB_CONDITION_CREATED}) + logging.info(f"\n\n\n{client.job_kind} is creating") + job = client.wait_for_job_conditions( + name, namespace, expected_conditions={constants.JOB_CONDITION_CREATED} + ) logging.info("Checking 3 times that pods are not scheduled") for num in range(3): logging.info(f"Number of attempts: {int(num)+1}/3") - # Job should have a Created condition. - if not client.is_job_created(name, namespace, job_kind): - raise Exception(f"{job_kind} should be in Created condition") - # Job shouldn't have a Running condition. - if client.is_job_running(name, namespace, job_kind): - raise Exception(f"{job_kind} shouldn't be in Running condition") + # Job should have correct conditions + if not client.is_job_created(job=job) or client.is_job_running(job=job): + raise Exception( + f"{client.job_kind} should be in Created condition. " + f"{client.job_kind} should not be in Running condition." + ) logging.info("Sleeping 5 seconds...") time.sleep(5) def verify_job_e2e( - client: TrainingClient, name: str, namespace: str, job_kind: str, container: str, timeout: int = 600 + client: TrainingClient, + name: str, + namespace: str, + wait_timeout: int = 600, ): """Verify Training Job e2e test.""" # Wait until Job is Succeeded. - logging.info(f"\n\n\n{job_kind} is running") - client.wait_for_job_conditions(name, namespace, job_kind, timeout=timeout) + logging.info(f"\n\n\n{client.job_kind} is running") + job = client.wait_for_job_conditions(name, namespace, wait_timeout=wait_timeout) # Job should have Created, Running, and Succeeded conditions. - conditions = client.get_job_conditions(name, namespace, job_kind) + conditions = client.get_job_conditions(job=job) if len(conditions) != 3: - raise Exception(f"{job_kind} conditions are invalid: {conditions}") + raise Exception(f"{client.job_kind} conditions are invalid: {conditions}") # Job should have correct conditions. 
- if not client.is_job_created(name, namespace, job_kind): - raise Exception(f"{job_kind} should be in Created condition") - - if client.is_job_running(name, namespace, job_kind): - raise Exception(f"{job_kind} should not be in Running condition") - - if client.is_job_restarting(name, namespace, job_kind): - raise Exception(f"{job_kind} should not be in Restarting condition") - - if not client.is_job_succeeded(name, namespace, job_kind): - raise Exception(f"{job_kind} should be in Succeeded condition") - - if client.is_job_failed(name, namespace, job_kind): - raise Exception(f"{job_kind} should not be in Failed condition") - - # Print Job pod names. - logging.info(f"\n\n\n{job_kind} pod names") - logging.info(client.get_job_pod_names(name, namespace)) - - # Print Job logs. - logging.info(f"\n\n\n{job_kind} logs") - client.get_job_logs(name, namespace, container=container) + if ( + not client.is_job_created(job=job) + or not client.is_job_succeeded(job=job) + or client.is_job_running(job=job) + or client.is_job_restarting(job=job) + or client.is_job_failed(job=job) + ): + raise Exception( + f"{client.job_kind} should be in Succeeded and Created conditions. " + f"{client.job_kind} should not be in Running, Restarting, or Failed conditions." + ) def get_pod_spec_scheduler_name(gang_scheduler_name: str) -> str: @@ -79,3 +71,17 @@ def get_pod_spec_scheduler_name(gang_scheduler_name: str) -> str: return TEST_GANG_SCHEDULER_NAME_VOLCANO return "" + + +def print_job_results(client: TrainingClient, name: str, namespace: str): + # Print Job. + logging.info(f"\n\n\n{client.job_kind} info") + logging.info(client.get_job(name, namespace)) + + # Print Job pod names. + logging.info(f"\n\n\n{client.job_kind} pod names") + logging.info(client.get_job_pod_names(name, namespace)) + + # Print Job logs. + logging.info(f"\n\n\n{client.job_kind} logs") + client.get_job_logs(name, namespace)
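
Taken together, these hunks migrate every per-kind call (create_pytorchjob, list_tfjobs, patch_paddlejob, delete_xgboostjob, and so on) to the consolidated Job APIs on TrainingClient, with the Job kind bound once at construction. A minimal sketch of the new calling convention as the tests above exercise it; the `pytorchjob` object, job name, namespace, and timeout are illustrative placeholders, not values mandated by the SDK:

    from kubeflow.training import TrainingClient
    from kubeflow.training.constants import constants

    # Bind the Job kind once instead of selecting a per-kind method per call.
    client = TrainingClient(job_kind=constants.PYTORCHJOB_KIND)

    # `pytorchjob` is a KubeflowOrgV1PyTorchJob built elsewhere,
    # e.g. by a generate_pytorchjob() helper like the ones in these tests.
    client.create_job(job=pytorchjob, namespace="default")  # was create_pytorchjob
    print(client.list_jobs("default"))                      # was list_pytorchjobs
    client.update_job(pytorchjob, "pytorchjob-mnist-ci-test", "default")  # was patch_pytorchjob
    job = client.wait_for_job_conditions(
        "pytorchjob-mnist-ci-test", "default", wait_timeout=900  # renamed from timeout
    )
    client.delete_job("pytorchjob-mnist-ci-test", "default")  # was delete_pytorchjob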
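
The status helpers follow the same consolidation: wait_for_job_conditions now returns the Job object it observed, and is_job_created, is_job_running, is_job_succeeded, is_job_restarting, is_job_failed, and get_job_conditions all accept it via job=, so condition checks need no extra reads of the resource. A sketch under the same placeholder assumptions as above:

    # The returned Job carries the conditions that were just verified.
    job = client.wait_for_job_conditions(
        "pytorchjob-mnist-ci-test",
        "default",
        expected_conditions={constants.JOB_CONDITION_CREATED},
    )

    # Pass the fetched object directly; no further API round-trips are needed.
    if not client.is_job_created(job=job) or client.is_job_running(job=job):
        raise Exception(f"{client.job_kind} should be Created and not yet Running")

    print(client.get_job_conditions(job=job))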
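
Two smaller conventions ride along with the rename: the generate_* helpers now take job_namespace as the first positional argument and type the trailing scheduling_policy as Optional[KubeflowOrgV1SchedulingPolicy] = None, matching how the tests build schedulable and unschedulable variants of the same Job; and GANG_SCHEDULER_NAME falls back to "" via os.getenv(..., ""), presumably so the skipif membership checks and get_pod_spec_scheduler_name always receive a string rather than None when the environment variable is unset.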