[pull] master from ray-project:master #2322

Merged 5 commits on Aug 23, 2023. Changes from all commits are shown below.

.buildkite/pipeline.ml.yml (12 additions & 11 deletions)

@@ -356,17 +356,18 @@
--test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
python/ray/train/...

- - label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples"
-   conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
-   instance_size: medium
-   commands:
-     - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
-     - TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
-     - ./ci/env/env_info.sh
-     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only
-       --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage
-       --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
-       python/ray/train/...
+ # TODO(krfricke): Add new test for this suite
+ # - label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples"
+ #   conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
+ #   instance_size: medium
+ #   commands:
+ #     - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+ #     - TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
+ #     - ./ci/env/env_info.sh
+ #     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only
+ #       --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage
+ #       --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1
+ #       python/ray/train/...


- label: ":octopus: :floppy_disk: New persistence mode: Tune tests and examples (small)"

dashboard/client/src/pages/node/NodeRow.tsx (6 additions & 17 deletions)

@@ -15,6 +15,10 @@ import useSWR from "swr";
import { CodeDialogButtonWithPreview } from "../../common/CodeDialogButton";
import { API_REFRESH_INTERVAL_MS } from "../../common/constants";
import { NodeLink } from "../../common/links";
+ import {
+   CpuProfilingLink,
+   CpuStackTraceLink,
+ } from "../../common/ProfilingLink";
import rowStyles from "../../common/RowStyles";
import PercentageBar from "../../components/PercentageBar";
import { StatusChip } from "../../components/StatusChip";

@@ -254,23 +258,8 @@ export const WorkerRow = ({ node, worker }: WorkerRowProps) => {
Logs
</Link>
<br />
-   <a
-     href={`/worker/traceback?pid=${pid}&ip=${ip}&native=0`}
-     target="_blank"
-     title="Sample the current Python stack trace for this worker."
-     rel="noreferrer"
-   >
-     Stack&nbsp;Trace
-   </a>
-   <br />
-   <a
-     href={`/worker/cpu_profile?pid=${pid}&ip=${ip}&duration=5&native=0`}
-     target="_blank"
-     title="Profile the Python worker for 5 seconds (default) and display a CPU flame graph."
-     rel="noreferrer"
-   >
-     CPU&nbsp;Flame&nbsp;Graph
-   </a>
+   <CpuStackTraceLink pid={pid} ip={ip} type="" />
+   <CpuProfilingLink pid={pid} ip={ip} type="" />
<br />
</TableCell>
<TableCell>

doc/source/_toc.yml (0 additions & 2 deletions)

@@ -93,8 +93,6 @@ parts:
title: "PyTorch Lightning Advanced Example"
- file: train/examples/lightning/lightning_exp_tracking
title: "PyTorch Lightning with Experiment Tracking Tools"
- - file: train/examples/transformers/transformers_example
-   title: "HF Transformers Example"
- file: train/examples/tf/tensorflow_mnist_example
title: "TensorFlow MNIST Example"
- file: train/examples/horovod/horovod_example

doc/source/ray-overview/examples.rst (14 additions & 21 deletions)

@@ -43,7 +43,7 @@ Ray Examples
Training 175B Parameter Language Models at 1000 GPU scale with Alpa and Ray

.. grid-item-card:: :bdg-primary:`Blog`
- :class-item: gallery-item gen-ai
+ :class-item: gallery-item gen-ai cv training
:link: https://www.anyscale.com/blog/faster-stable-diffusion-fine-tuning-with-ray-air

Faster stable diffusion fine-tuning with Ray AIR

@@ -61,7 +61,7 @@
How OpenAI Uses Ray to Train Tools like ChatGPT

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item llm gen-ai huggingface
+ :class-item: gallery-item llm gen-ai huggingface training nlp
:link: /ray-air/examples/gptj_deepspeed_fine_tuning
:link-type: doc

Expand All @@ -74,7 +74,7 @@ Ray Examples
Aviary toolkit serving live traffic for LLMs

.. grid-item-card:: :bdg-success:`Tutorial`
- :class-item: gallery-item pytorch
+ :class-item: gallery-item pytorch training
:link: /ray-air/examples/convert_existing_pytorch_code_to_ray_air
:link-type: doc


@@ -102,7 +102,7 @@
Perform batch tuning on NYC Taxi Dataset with Ray AIR

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item
+ :class-item: gallery-item llm nlp gen-ai
:link: /ray-air/examples/gptj_batch_prediction
:link-type: doc


@@ -214,14 +214,14 @@
Java tutorial for Ray Serve

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item serving
+ :class-item: gallery-item serving cv
:link: /serve/tutorials/stable-diffusion
:link-type: doc

Serving a Stable Diffusion Model

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item serving
+ :class-item: gallery-item serving nlp
:link: /serve/tutorials/text-classification
:link-type: doc


@@ -456,7 +456,7 @@
Simple Distributed Hyperparameter Optimization

.. grid-item-card:: :bdg-primary:`Blog`
- :class-item: gallery-item tuning
+ :class-item: gallery-item tuning nlp huggingface
:link: https://www.anyscale.com/blog/hyperparameter-search-hugging-face-transformers-ray-tune

Hyperparameter Search with 🤗 Transformers

@@ -518,7 +518,7 @@
A Guide To Tuning Horovod Parameters With Tune

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item tuning huggingface tune serve
+ :class-item: gallery-item tuning huggingface tune serve nlp
:link: tune-huggingface-example
:link-type: ref


@@ -544,28 +544,21 @@
Getting Started with Ray Train

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item training huggingface
+ :class-item: gallery-item training huggingface nlp
:link: /ray-air/examples/huggingface_text_classification
:link-type: doc

Fine-tune a 🤗 Transformers model

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item pytorch training train
+ :class-item: gallery-item pytorch training train cv
:link: torch_fashion_mnist_ex
:link-type: ref

PyTorch Fashion MNIST Training Example

- .. grid-item-card:: :bdg-secondary:`Code example`
-    :class-item: gallery-item pytorch training train
-    :link: train_transformers_example
-    :link-type: ref
-
-    Transformers with PyTorch Training Example
-
.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item tensorflow training train
+ :class-item: gallery-item tensorflow training train cv
:link: tensorflow_mnist_example
:link-type: ref


@@ -579,21 +572,21 @@
End-to-end Horovod Training Example

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item pytorch training train
+ :class-item: gallery-item pytorch training cv
:link: lightning_mnist_example
:link-type: ref

End-to-end PyTorch Lightning Training Example

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item data-processing train
+ :class-item: gallery-item data-processing training nlp
:link: lightning_advanced_example
:link-type: ref

Use LightningTrainer with Ray Data and Batch Predictor

.. grid-item-card:: :bdg-secondary:`Code example`
- :class-item: gallery-item tensorflow
+ :class-item: gallery-item tensorflow tuning
:link: tune_train_tf_example
:link-type: ref


doc/source/train/examples.rst (0 additions & 8 deletions)

@@ -25,14 +25,6 @@ Distributed Training Examples using Ray Train

PyTorch Fashion MNIST Training Example

- .. grid-item-card::
-    :img-top: /images/hugging.png
-    :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img
-
-    .. button-ref:: train_transformers_example
-
-       Transformers with PyTorch Training Example
-
.. grid-item-card::
:img-top: /images/tf_logo.png
:class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img

This file was deleted.

doc/source/train/user-guides/data-loading-preprocessing.rst (19 additions & 10 deletions)

@@ -209,10 +209,13 @@ Your preprocessed datasets can be passed into a Ray Train Trainer (e.g. :class:`

The datasets passed into the Trainer's ``datasets`` can be accessed inside of the ``train_loop_per_worker`` run on each distributed training worker by calling :meth:`ray.train.get_dataset_shard`.

- The default splitting behavior is as follows:
-
- - The ``"train"`` dataset is split (i.e. sharded) across the training workers. :meth:`~ray.train.get_dataset_shard` will return ``1/n`` of the dataset, where ``n`` is the number of training workers.
- - All other dataset are not split. :meth:`~ray.train.get_dataset_shard` will return the full dataset.
+ All datasets are split (i.e. sharded) across the training workers by default. :meth:`~ray.train.get_dataset_shard` will return ``1/n`` of the dataset, where ``n`` is the number of training workers.
+
+ .. note::
+
+     Because the evaluation dataset is also split, you have to aggregate the evaluation results across workers.
+     Consider using `TorchMetrics <https://torchmetrics.readthedocs.io/en/latest/>`_ (:ref:`example <deepspeed_example>`) or similar utilities from other frameworks.

This behavior can be overridden by passing in the ``dataset_config`` argument. For more information on configuring splitting logic, see :ref:`Splitting datasets <train-datasets-split>`.
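
For illustration only (this sketch is not part of the diff): the cross-worker aggregation the new note recommends, using TorchMetrics inside a Ray Train worker loop. The "val" dataset, its "label" column, and the dummy predictions are assumptions for the example.

# Illustration only, not part of this PR. Aggregate sharded evaluation
# results with TorchMetrics inside a Ray Train worker loop. The "val"
# dataset, its "label" column, and the dummy predictions are hypothetical.
import torch
import torchmetrics
from ray import train

def train_loop_per_worker():
    accuracy = torchmetrics.Accuracy(task="binary")
    # Each worker now receives 1/n of "val", not the full dataset.
    val_ds = train.get_dataset_shard("val")
    for batch in val_ds.iter_torch_batches(batch_size=128):
        labels = batch["label"]
        preds = torch.zeros_like(labels)  # stand-in for real model outputs
        accuracy.update(preds, labels)
    # compute() all-reduces metric state across the training workers, so
    # the reported value covers the full validation set, not one shard.
    train.report({"val_acc": accuracy.compute().item()})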

@@ -298,11 +301,11 @@

Splitting datasets
------------------
- By default, Ray Train splits the ``"train"`` dataset across workers using :meth:`Dataset.streaming_split <ray.data.Dataset.streaming_split>`. Each worker sees a disjoint subset of the data, instead of iterating over the entire dataset. Unless randomly shuffled, the same splits are used for each iteration of the dataset.
+ By default, Ray Train splits all datasets across workers using :meth:`Dataset.streaming_split <ray.data.Dataset.streaming_split>`. Each worker sees a disjoint subset of the data, instead of iterating over the entire dataset. Unless randomly shuffled, the same splits are used for each iteration of the dataset.

- For all other datasets, Ray Train passes the entire dataset to each worker.
+ If you want to customize which datasets are split, pass in a :class:`DataConfig <ray.train.DataConfig>` to the Trainer constructor.

- To customize this, pass in a :class:`DataConfig <ray.train.DataConfig>` to the Trainer constructor. For example, to split both the training and validation datasets, do the following:
+ For example, to split only the training dataset, do the following:

.. testcode::

@@ -317,18 +320,24 @@
train_ds, val_ds = ds.train_test_split(0.3)

def train_loop_per_worker():
-   # Get an iterator to the dataset we passed in below.
-   it = train.get_dataset_shard("train")
+   # Get the sharded training dataset.
+   train_ds = train.get_dataset_shard("train")
    for _ in range(2):
-       for batch in it.iter_batches(batch_size=128):
+       for batch in train_ds.iter_batches(batch_size=128):
            print("Do some training on batch", batch)

+   # Get the unsharded, full validation dataset.
+   val_ds = train.get_dataset_shard("val")
+   for _ in range(2):
+       for batch in val_ds.iter_batches(batch_size=128):
+           print("Do some evaluation on batch", batch)

my_trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_ds, "val": val_ds},
    dataset_config=ray.train.DataConfig(
-       datasets_to_split=["train", "val"],
+       datasets_to_split=["train"],
    ),
)
my_trainer.fit()
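
As the updated tests below exercise, `datasets_to_split` accepts either the literal string "all" or an explicit (possibly empty) list of dataset names; anything else raises a TypeError. A minimal sketch of the two forms, assuming the same setup as the example above:

# Sketch (not part of this diff) of the two accepted forms of
# `datasets_to_split`; any other type raises a TypeError.
split_all = ray.train.DataConfig(datasets_to_split="all")        # shard every dataset (the default)
split_train = ray.train.DataConfig(datasets_to_split=["train"])  # shard only the "train" dataset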

python/ray/air/tests/test_new_dataset_config.py (55 additions & 2 deletions)

@@ -59,9 +59,9 @@ def test_basic(ray_start_4_cpus):
test = TestBasic(1, True, {"train": 10, "test": -1}, datasets={"train": ds})
test.fit()

-   # Two workers, train split.
+   # Two workers, train and test split.
    test = TestBasic(
-       2, True, {"train": 5, "test": 10}, datasets={"train": ds, "test": ds}
+       2, True, {"train": 5, "test": 5}, datasets={"train": ds, "test": ds}
    )
)
test.fit()


@@ -78,6 +78,59 @@ def test_basic(ray_start_4_cpus):
test.fit()


+ def test_split(ray_start_4_cpus):
+     ds = ray.data.range(10)
+
+     # Split all datasets by default.
+     test = TestBasic(
+         2,
+         True,
+         {"train": 5, "test": 5, "val": 5},
+         datasets={"train": ds, "test": ds, "val": ds},
+     )
+     test.fit()
+
+     # Test the flag "all".
+     test = TestBasic(
+         2,
+         True,
+         {"train": 5, "test": 5},
+         datasets={"train": ds, "test": ds},
+         dataset_config=DataConfig(datasets_to_split="all"),
+     )
+     test.fit()
+
+     # Test split train only.
+     test = TestBasic(
+         2,
+         True,
+         {"train": 5, "test": 10},
+         datasets={"train": ds, "test": ds},
+         dataset_config=DataConfig(datasets_to_split=["train"]),
+     )
+     test.fit()
+
+     # Test invalid arguments.
+     for datasets_to_split in ["train", ("train",), {}]:
+         with pytest.raises(TypeError, match="`datasets_to_split` should be.*"):
+             test = TestBasic(
+                 2,
+                 True,
+                 {"train": 5, "test": 10},
+                 datasets={"train": ds, "test": ds},
+                 dataset_config=DataConfig(datasets_to_split=datasets_to_split),
+             )
+
+     # Test empty `datasets_to_split` list.
+     test = TestBasic(
+         2,
+         True,
+         {"train": 10, "test": 10},
+         datasets={"train": ds, "test": ds},
+         dataset_config=DataConfig(datasets_to_split=[]),
+     )
+     test.fit()


@pytest.mark.skip(
reason="Incomplete implementation of _validate_dag causes other errors, so we "
"remove DAG validation for now; see https://github.com/ray-project/ray/pull/37829"