[pull] master from ray-project:master #2322

Merged 5 commits on Aug 23, 2023. Changes from all commits are shown below.

.buildkite/pipeline.ml.yml (12 additions & 11 deletions)

@@ -356,17 +356,18 @@
--test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
python/ray/train/...

- - label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples"
-   conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
-   instance_size: medium
-   commands:
-     - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
-     - TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
-     - ./ci/env/env_info.sh
-     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only
-       --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage
-       --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
-       python/ray/train/...
+ # TODO(krfricke): Add new test for this suite
+ # - label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples"
+ #   conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
+ #   instance_size: medium
+ #   commands:
+ #     - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+ #     - TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
+ #     - ./ci/env/env_info.sh
+ #     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only
+ #       --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage
+ #       --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
+ #       python/ray/train/...


- label: ":octopus: :floppy_disk: New persistence mode: Tune tests and examples (small)"

dashboard/client/src/pages/node/NodeRow.tsx (6 additions & 17 deletions)

@@ -15,6 +15,10 @@ import useSWR from "swr";
import { CodeDialogButtonWithPreview } from "../../common/CodeDialogButton";
import { API_REFRESH_INTERVAL_MS } from "../../common/constants";
import { NodeLink } from "../../common/links";
+ import {
+   CpuProfilingLink,
+   CpuStackTraceLink,
+ } from "../../common/ProfilingLink";
import rowStyles from "../../common/RowStyles";
import PercentageBar from "../../components/PercentageBar";
import { StatusChip } from "../../components/StatusChip";

@@ -254,23 +258,8 @@ export const WorkerRow = ({ node, worker }: WorkerRowProps) => {
Logs
</Link>
<br />
-   <a
-     href={`/worker/traceback?pid=${pid}&ip=${ip}&native=0`}
-     target="_blank"
-     title="Sample the current Python stack trace for this worker."
-     rel="noreferrer"
-   >
-     Stack&nbsp;Trace
-   </a>
-   <br />
-   <a
-     href={`/worker/cpu_profile?pid=${pid}&ip=${ip}&duration=5&native=0`}
-     target="_blank"
-     title="Profile the Python worker for 5 seconds (default) and display a CPU flame graph."
-     rel="noreferrer"
-   >
-     CPU&nbsp;Flame&nbsp;Graph
-   </a>
+   <CpuStackTraceLink pid={pid} ip={ip} type="" />
+   <CpuProfilingLink pid={pid} ip={ip} type="" />
<br />
</TableCell>
<TableCell>

doc/source/_toc.yml (0 additions & 2 deletions)

@@ -93,8 +93,6 @@ parts:
title: "PyTorch Lightning Advanced Example"
- file: train/examples/lightning/lightning_exp_tracking
title: "PyTorch Lightning with Experiment Tracking Tools"
- - file: train/examples/transformers/transformers_example
-   title: "HF Transformers Example"
- file: train/examples/tf/tensorflow_mnist_example
title: "TensorFlow MNIST Example"
- file: train/examples/horovod/horovod_example

doc/source/ray-overview/examples.rst (14 additions & 21 deletions)

@@ -43,7 +43,7 @@ Ray Examples
Training 175B Parameter Language Models at 1000 GPU scale with Alpa and Ray

.. grid-item-card:: :bdg-primary:`Blog`
- :class-item: gallery-item gen-ai
+ :class-item: gallery-item gen-ai cv training
:link: https://www.anyscale.com/blog/faster-stable-diffusion-fine-tuning-with-ray-air

Faster stable diffusion fine-tuning with Ray AIR

@@ -61,7 +61,7 @@
How OpenAI Uses Ray to Train Tools like ChatGPT

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item llm gen-ai huggingface
+ :class-item: gallery-item llm gen-ai huggingface training nlp
:link: /ray-air/examples/gptj_deepspeed_fine_tuning
:link-type: doc

Expand All @@ -74,7 +74,7 @@ Ray Examples
Aviary toolkit serving live traffic for LLMs

.. grid-item-card:: :bdg-success:`Tutorial`
- :class-item: gallery-item pytorch
+ :class-item: gallery-item pytorch training
:link: /ray-air/examples/convert_existing_pytorch_code_to_ray_air
:link-type: doc


@@ -102,7 +102,7 @@
Perform batch tuning on NYC Taxi Dataset with Ray AIR

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item
+ :class-item: gallery-item llm nlp gen-ai
:link: /ray-air/examples/gptj_batch_prediction
:link-type: doc


@@ -214,14 +214,14 @@
Java tutorial for Ray Serve

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item serving
+ :class-item: gallery-item serving cv
:link: /serve/tutorials/stable-diffusion
:link-type: doc

Serving a Stable Diffusion Model

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item serving
+ :class-item: gallery-item serving nlp
:link: /serve/tutorials/text-classification
:link-type: doc


@@ -456,7 +456,7 @@
Simple Distributed Hyperparameter Optimization

.. grid-item-card:: :bdg-primary:`Blog`
- :class-item: gallery-item tuning
+ :class-item: gallery-item tuning nlp huggingface
:link: https://www.anyscale.com/blog/hyperparameter-search-hugging-face-transformers-ray-tune

Hyperparameter Search with 🤗 Transformers

@@ -518,7 +518,7 @@
A Guide To Tuning Horovod Parameters With Tune

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item tuning huggingface tune serve
+ :class-item: gallery-item tuning huggingface tune serve nlp
:link: tune-huggingface-example
:link-type: ref


@@ -544,28 +544,21 @@
Getting Started with Ray Train

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item training huggingface
+ :class-item: gallery-item training huggingface nlp
:link: /ray-air/examples/huggingface_text_classification
:link-type: doc

Fine-tune a 🤗 Transformers model

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item pytorch training train
+ :class-item: gallery-item pytorch training train cv
:link: torch_fashion_mnist_ex
:link-type: ref

PyTorch Fashion MNIST Training Example

- .. grid-item-card:: :bdg-secondary:`Code example`
-    :class-item: gallery-item pytorch training train
-    :link: train_transformers_example
-    :link-type: ref
-
-    Transformers with PyTorch Training Example
-
.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item tensorflow training train
+ :class-item: gallery-item tensorflow training train cv
:link: tensorflow_mnist_example
:link-type: ref


@@ -579,21 +572,21 @@
End-to-end Horovod Training Example

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item pytorch training train
+ :class-item: gallery-item pytorch training cv
:link: lightning_mnist_example
:link-type: ref

End-to-end PyTorch Lightning Training Example

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item data-processing train
+ :class-item: gallery-item data-processing training nlp
:link: lightning_advanced_example
:link-type: ref

Use LightningTrainer with Ray Data and Batch Predictor

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item tensorflow
+ :class-item: gallery-item tensorflow tuning
:link: tune_train_tf_example
:link-type: ref


doc/source/train/examples.rst (0 additions & 8 deletions)

@@ -25,14 +25,6 @@ Distributed Training Examples using Ray Train

PyTorch Fashion MNIST Training Example

- .. grid-item-card::
-    :img-top: /images/hugging.png
-    :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img
-
-    .. button-ref:: train_transformers_example
-
-       Transformers with PyTorch Training Example
-
.. grid-item-card::
:img-top: /images/tf_logo.png
:class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img

This file was deleted.

doc/source/train/user-guides/data-loading-preprocessing.rst (19 additions & 10 deletions)

@@ -209,10 +209,13 @@ Your preprocessed datasets can be passed into a Ray Train Trainer (e.g. :class:`

The datasets passed into the Trainer's ``datasets`` can be accessed inside of the ``train_loop_per_worker`` run on each distributed training worker by calling :meth:`ray.train.get_dataset_shard`.

- The default splitting behavior is as follows:
-
- - The ``"train"`` dataset is split (i.e. sharded) across the training workers. :meth:`~ray.train.get_dataset_shard` will return ``1/n`` of the dataset, where ``n`` is the number of training workers.
- - All other dataset are not split. :meth:`~ray.train.get_dataset_shard` will return the full dataset.
+ All datasets are split (i.e. sharded) across the training workers by default. :meth:`~ray.train.get_dataset_shard` will return ``1/n`` of the dataset, where ``n`` is the number of training workers.
+
+ .. note::
+
+     Because the evaluation dataset is also split, you have to aggregate the evaluation results across workers.
+     Consider using `TorchMetrics <https://torchmetrics.readthedocs.io/en/latest/>`_ (:ref:`example <deepspeed_example>`) or similar utilities from other frameworks.

This behavior can be overridden by passing in the ``dataset_config`` argument. For more information on configuring splitting logic, see :ref:`Splitting datasets <train-datasets-split>`.
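
For illustration only (this sketch is not part of the diff): the cross-worker aggregation the new note recommends, using TorchMetrics inside a Ray Train worker loop. The "val" dataset, its "label" column, and the dummy predictions are assumptions for the example.

# Illustration only, not part of this PR. Aggregate sharded evaluation
# results with TorchMetrics inside a Ray Train worker loop. The "val"
# dataset, its "label" column, and the dummy predictions are hypothetical.
import torch
import torchmetrics
from ray import train

def train_loop_per_worker():
    accuracy = torchmetrics.Accuracy(task="binary")
    # Each worker now receives 1/n of "val", not the full dataset.
    val_ds = train.get_dataset_shard("val")
    for batch in val_ds.iter_torch_batches(batch_size=128):
        labels = batch["label"]
        preds = torch.zeros_like(labels)  # stand-in for real model outputs
        accuracy.update(preds, labels)
    # compute() all-reduces metric state across the training workers, so
    # the reported value covers the full validation set, not one shard.
    train.report({"val_acc": accuracy.compute().item()})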

@@ -298,11 +301,11 @@

Splitting datasets
------------------
- By default, Ray Train splits the ``"train"`` dataset across workers using :meth:`Dataset.streaming_split <ray.data.Dataset.streaming_split>`. Each worker sees a disjoint subset of the data, instead of iterating over the entire dataset. Unless randomly shuffled, the same splits are used for each iteration of the dataset.
+ By default, Ray Train splits all datasets across workers using :meth:`Dataset.streaming_split <ray.data.Dataset.streaming_split>`. Each worker sees a disjoint subset of the data, instead of iterating over the entire dataset. Unless randomly shuffled, the same splits are used for each iteration of the dataset.

- For all other datasets, Ray Train passes the entire dataset to each worker.
+ If you want to customize which datasets are split, pass in a :class:`DataConfig <ray.train.DataConfig>` to the Trainer constructor.

- To customize this, pass in a :class:`DataConfig <ray.train.DataConfig>` to the Trainer constructor. For example, to split both the training and validation datasets, do the following:
+ For example, to split only the training dataset, do the following:

.. testcode::

@@ -317,18 +320,24 @@
train_ds, val_ds = ds.train_test_split(0.3)

def train_loop_per_worker():
-   # Get an iterator to the dataset we passed in below.
-   it = train.get_dataset_shard("train")
+   # Get the sharded training dataset.
+   train_ds = train.get_dataset_shard("train")
    for _ in range(2):
-       for batch in it.iter_batches(batch_size=128):
+       for batch in train_ds.iter_batches(batch_size=128):
            print("Do some training on batch", batch)

+   # Get the unsharded, full validation dataset.
+   val_ds = train.get_dataset_shard("val")
+   for _ in range(2):
+       for batch in val_ds.iter_batches(batch_size=128):
+           print("Do some evaluation on batch", batch)

my_trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_ds, "val": val_ds},
    dataset_config=ray.train.DataConfig(
-       datasets_to_split=["train", "val"],
+       datasets_to_split=["train"],
    ),
)
my_trainer.fit()
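
As the updated tests below exercise, `datasets_to_split` accepts either the literal string "all" or an explicit (possibly empty) list of dataset names; anything else raises a TypeError. A minimal sketch of the two forms, assuming the same setup as the example above:

# Sketch (not part of this diff) of the two accepted forms of
# `datasets_to_split`; any other type raises a TypeError.
split_all = ray.train.DataConfig(datasets_to_split="all")        # shard every dataset (the default)
split_train = ray.train.DataConfig(datasets_to_split=["train"])  # shard only the "train" dataset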

python/ray/air/tests/test_new_dataset_config.py (55 additions & 2 deletions)

@@ -59,9 +59,9 @@ def test_basic(ray_start_4_cpus):
test = TestBasic(1, True, {"train": 10, "test": -1}, datasets={"train": ds})
test.fit()

-   # Two workers, train split.
+   # Two workers, train and test split.
    test = TestBasic(
-       2, True, {"train": 5, "test": 10}, datasets={"train": ds, "test": ds}
+       2, True, {"train": 5, "test": 5}, datasets={"train": ds, "test": ds}
    )
)
test.fit()


@@ -78,6 +78,59 @@ def test_basic(ray_start_4_cpus):
test.fit()


+ def test_split(ray_start_4_cpus):
+     ds = ray.data.range(10)
+
+     # Split all datasets by default.
+     test = TestBasic(
+         2,
+         True,
+         {"train": 5, "test": 5, "val": 5},
+         datasets={"train": ds, "test": ds, "val": ds},
+     )
+     test.fit()
+
+     # Test the flag "all".
+     test = TestBasic(
+         2,
+         True,
+         {"train": 5, "test": 5},
+         datasets={"train": ds, "test": ds},
+         dataset_config=DataConfig(datasets_to_split="all"),
+     )
+     test.fit()
+
+     # Test split train only.
+     test = TestBasic(
+         2,
+         True,
+         {"train": 5, "test": 10},
+         datasets={"train": ds, "test": ds},
+         dataset_config=DataConfig(datasets_to_split=["train"]),
+     )
+     test.fit()
+
+     # Test invalid arguments.
+     for datasets_to_split in ["train", ("train",), {}]:
+         with pytest.raises(TypeError, match="`datasets_to_split` should be.*"):
+             test = TestBasic(
+                 2,
+                 True,
+                 {"train": 5, "test": 10},
+                 datasets={"train": ds, "test": ds},
+                 dataset_config=DataConfig(datasets_to_split=datasets_to_split),
+             )
+
+     # Test empty `datasets_to_split` list.
+     test = TestBasic(
+         2,
+         True,
+         {"train": 10, "test": 10},
+         datasets={"train": ds, "test": ds},
+         dataset_config=DataConfig(datasets_to_split=[]),
+     )
+     test.fit()


@pytest.mark.skip(
reason="Incomplete implementation of _validate_dag causes other errors, so we "
"remove DAG validation for now; see https://github.com/ray-project/ray/pull/37829"