diff --git a/.buildkite/forge.rayci.yml b/.buildkite/forge.rayci.yml new file mode 100644 index 000000000000..cf77b58562db --- /dev/null +++ b/.buildkite/forge.rayci.yml @@ -0,0 +1,4 @@ +group: forge +steps: + - name: forge + wanda: ci/docker/forge.wanda.yaml diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml index 2a4e4b6232da..7de9411b3423 100644 --- a/.buildkite/pipeline.ml.yml +++ b/.buildkite/pipeline.ml.yml @@ -53,6 +53,7 @@ - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest python/ray/train/... + - label: ":brain: RLlib: Benchmarks (Torch 2.x)" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_RLLIB_AFFECTED"] instance_size: medium @@ -314,6 +315,7 @@ --test_env=AIR_VERBOSITY=1 python/ray/tune/... + - label: ":octopus: :brain: Tune tests and examples {using RLlib}" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_RLLIB_AFFECTED"] instance_size: large @@ -335,6 +337,71 @@ - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only python/ray/tests/horovod/... + +##### STORAGE REFACTOR + +- label: ":steam_locomotive: :floppy_disk: New persistence mode: Train tests and examples" + conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] + instance_size: large + parallelism: 4 + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + # Todo (krfricke): Move mosaicml to train-test-requirements.txt + - pip install "mosaicml==0.12.1" + - TRAIN_TESTING=1 DATA_PROCESSING_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh + - ./ci/env/env_info.sh + - ./ci/run/run_bazel_test_with_sharding.sh + --config=ci $(./ci/run/bazel_export_options) + --test_tag_filters=-gpu_only,-gpu,-minimal,-tune,-needs_credentials,-doctest,-no_new_storage + --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1 + python/ray/train/... + +- label: ":steam_locomotive: :octopus: :floppy_disk: New persistence mode: Train + Tune tests and examples" + conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] + instance_size: medium + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh + - ./ci/env/env_info.sh + - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only + --test_tag_filters=tune,-gpu_only,-ray_air,-gpu,-doctest,-no_new_storage + --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1 + python/ray/train/... + + +- label: ":octopus: :floppy_disk: New persistence mode: Tune tests and examples (small)" + conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"] + instance_size: small + parallelism: 3 + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - TUNE_TESTING=1 ./ci/env/install-dependencies.sh + - ./ci/env/env_info.sh + - ./ci/run/run_bazel_test_with_sharding.sh + --config=ci $(./ci/run/bazel_export_options) --build_tests_only + --test_tag_filters=-medium_instance,-soft_imports,-gpu_only,-rllib,-multinode,-no_new_storage + --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1 + python/ray/tune/... 
+ +- label: ":octopus: :floppy_disk: New persistence mode: Tune tests and examples (medium)" + conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"] + instance_size: medium + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - TUNE_TESTING=1 DATA_PROCESSING_TESTING=1 ./ci/env/install-dependencies.sh + - ./ci/env/env_info.sh + - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only + --test_tag_filters=medium_instance,-soft_imports,-gpu_only,-rllib,-multinode,-no_new_storage + --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1 + python/ray/tune/... + + +###### END STORAGE REFACTOR + + + + + # TODO(amogkam): Re-enable Ludwig tests after Ludwig supports Ray 2.0 #- label: ":octopus: Ludwig tests and examples. Python 3.7" # conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"] diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 805aa619c196..8b05d61522af 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -54,11 +54,11 @@ #!/python/ray/rllib/ @ray-project/ray-core # Java worker. -/java/dependencies.bzl @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @SongGuyang @jovany-wang -/java/pom.xml @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @SongGuyang @jovany-wang -/java/pom_template.xml @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @SongGuyang @jovany-wang -/java/*/pom_template.xml @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @SongGuyang @jovany-wang -/java/api/ @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @SongGuyang @jovany-wang +/java/dependencies.bzl @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @SongGuyang +/java/pom.xml @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @SongGuyang +/java/pom_template.xml @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @SongGuyang +/java/*/pom_template.xml @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @SongGuyang +/java/api/ @kfstorm @raulchen @ericl @iycheng @WangTaoTheTonic @SongGuyang # C++ worker /cpp/include/ray @SongGuyang @raulchen @kfstorm @ray-project/ray-core diff --git a/ci/forge/Dockerfile.forge b/ci/docker/forge.Dockerfile similarity index 100% rename from ci/forge/Dockerfile.forge rename to ci/docker/forge.Dockerfile diff --git a/ci/docker/forge.wanda.yaml b/ci/docker/forge.wanda.yaml new file mode 100644 index 000000000000..0e22eec79cdf --- /dev/null +++ b/ci/docker/forge.wanda.yaml @@ -0,0 +1,4 @@ +name: "forge" +froms: + - ubuntu:20.04 +dockerfile: ci/docker/forge.Dockerfile diff --git a/cpp/include/ray/api/msgpack_adaptor.h b/cpp/include/ray/api/msgpack_adaptor.h new file mode 100644 index 000000000000..7b9378256803 --- /dev/null +++ b/cpp/include/ray/api/msgpack_adaptor.h @@ -0,0 +1,88 @@ +// Copyright 2023 The Ray Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <any>
+#include <msgpack.hpp>
+
+// TODO(Larry Lian) Adapt on windows
+#ifndef _WIN32
+
+namespace msgpack {
+namespace adaptor {
+
+template <>
+struct pack<std::any> {
+  template <typename Stream>
+  msgpack::packer<Stream> &operator()(msgpack::packer<Stream> &o,
+                                      const std::any &v) const {
+    const auto &any_type = v.type();
+    if (any_type == typeid(msgpack::type::nil_t)) {
+      o.pack(std::any_cast<msgpack::type::nil_t>(v));
+    } else if (any_type == typeid(bool)) {
+      o.pack(std::any_cast<bool>(v));
+    } else if (any_type == typeid(uint64_t)) {
+      o.pack(std::any_cast<uint64_t>(v));
+    } else if (any_type == typeid(int64_t)) {
+      o.pack(std::any_cast<int64_t>(v));
+    } else if (any_type == typeid(double)) {
+      o.pack(std::any_cast<double>(v));
+    } else if (any_type == typeid(std::string)) {
+      o.pack(std::any_cast<std::string>(v));
+    } else if (any_type == typeid(std::vector<char>)) {
+      o.pack(std::any_cast<std::vector<char>>(v));
+    } else {
+      throw msgpack::type_error();
+    }
+    return o;
+  }
+};
+
+template <>
+struct convert<std::any> {
+  msgpack::object const &operator()(msgpack::object const &o, std::any &v) const {
+    switch (o.type) {
+    case type::NIL:
+      v = o.as<msgpack::type::nil_t>();
+      break;
+    case type::BOOLEAN:
+      v = o.as<bool>();
+      break;
+    case type::POSITIVE_INTEGER:
+      v = o.as<uint64_t>();
+      break;
+    case type::NEGATIVE_INTEGER:
+      v = o.as<int64_t>();
+      break;
+    case type::FLOAT32:
+    case type::FLOAT64:
+      v = o.as<double>();
+      break;
+    case type::STR:
+      v = o.as<std::string>();
+      break;
+    case type::BIN:
+      v = o.as<std::vector<char>>();
+      break;
+    default:
+      throw msgpack::type_error();
+    }
+    return o;
+  }
+};
+
+} // namespace adaptor
+} // namespace msgpack
+#endif
diff --git a/cpp/include/ray/api/serializer.h b/cpp/include/ray/api/serializer.h
index 40adf5884c30..7c91c7952a3d 100644
--- a/cpp/include/ray/api/serializer.h
+++ b/cpp/include/ray/api/serializer.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include
 #include
 #include
 
diff --git a/cpp/src/ray/test/cluster/counter.cc b/cpp/src/ray/test/cluster/counter.cc
index 85b31eb24c8e..eb77917189d9 100644
--- a/cpp/src/ray/test/cluster/counter.cc
+++ b/cpp/src/ray/test/cluster/counter.cc
@@ -127,9 +127,11 @@ RAY_REMOTE(RAY_FUNC(Counter::FactoryCreate),
            &Counter::GetCount,
            &Counter::CreateNestedChildActor,
            &Counter::GetBytes,
-           &Counter::echoBytes,
-           &Counter::echoString,
-           &Counter::GetIntByObjectRef);
+           &Counter::EchoBytes,
+           &Counter::EchoString,
+           &Counter::GetIntByObjectRef,
+           &Counter::EchoStrings,
+           &Counter::EchoAnyArray);
 
 RAY_REMOTE(ActorConcurrentCall::FactoryCreate, &ActorConcurrentCall::CountDown);
 
diff --git a/cpp/src/ray/test/cluster/counter.h b/cpp/src/ray/test/cluster/counter.h
index f282c8eb384d..1e7c50afe5c9 100644
--- a/cpp/src/ray/test/cluster/counter.h
+++ b/cpp/src/ray/test/cluster/counter.h
@@ -16,6 +16,7 @@
 
 #include
 
+#include <any>
 #include
 #include
 
@@ -57,9 +58,52 @@ class Counter {
     return bytes;
   }
 
-  std::vector<uint8_t> echoBytes(const std::vector<uint8_t> &bytes) { return bytes; }
+  std::vector<uint8_t> EchoBytes(const std::vector<uint8_t> &bytes) { return bytes; }
 
-  std::string echoString(const std::string &str) { return str; }
+  std::string EchoString(const std::string &str) { return str; }
+
+  std::vector<std::string> EchoStrings(const std::vector<std::string> &strings) {
+    return strings;
+  }
+
+  std::vector<std::any> EchoAnyArray(const std::vector<std::any> &anys) {
+    assert(anys.size() == 11);
+    // check bool
+    assert(anys[0].type() == typeid(bool));
+    assert(std::any_cast<bool>(anys[0]) == true);
+    // equal to java Byte.MAX_VALUE
+    assert(anys[1].type() == typeid(uint64_t));
+    assert(std::any_cast<uint64_t>(anys[1]) == 127);
+    // equal to java Short.MAX_VALUE
+    assert(anys[2].type() == typeid(uint64_t));
+    assert(std::any_cast<uint64_t>(anys[2]) == 32767);
+    // equal to java Integer.MAX_VALUE
+    assert(anys[3].type() == typeid(uint64_t));
+    assert(std::any_cast<uint64_t>(anys[3]) == 2147483647);
+    // equal to java Long.MAX_VALUE
+    assert(anys[4].type() == typeid(uint64_t));
+    assert(std::any_cast<uint64_t>(anys[4]) == 9223372036854775807);
+    // equal to java Long.MIN_VALUE
+    assert(anys[5].type() == typeid(int64_t));
+    assert(std::any_cast<int64_t>(anys[5]) == -9223372036854775808);
+    // equal to java BigInteger.valueOf(Long.MAX_VALUE)
+    assert(anys[6].type() == typeid(uint64_t));
+    assert(std::any_cast<uint64_t>(anys[6]) == 9223372036854775807);
+    // equal to java string "Hello World!"
+    assert(anys[7].type() == typeid(std::string));
+    assert(std::any_cast<std::string>(anys[7]) == "Hello World!");
+    // equal to java float 1.234f
+    assert(anys[8].type() == typeid(double));
+    assert(std::any_cast<double>(anys[8]) == 1.234);
+    // equal to java double 1.234
+    assert(anys[9].type() == typeid(double));
+    assert(std::any_cast<double>(anys[9]) == 1.234);
+    // equal to java byte[]
+    std::vector<char> bytes = {'b', 'i', 'n', 'a', 'r', 'y'};
+    assert(anys[10].type() == typeid(std::vector<char>));
+    assert(std::any_cast<std::vector<char>>(anys[10]) == bytes);
+    return anys;
+  }
 
   int GetIntVal(ray::ObjectRef<ray::ObjectRef<int>> obj) {
     auto val = *obj.Get();
 
diff --git a/cpp/src/ray/test/serialization_test.cc b/cpp/src/ray/test/serialization_test.cc
index deeb42d87b38..b157ee51ab40 100644
--- a/cpp/src/ray/test/serialization_test.cc
+++ b/cpp/src/ray/test/serialization_test.cc
@@ -15,9 +15,38 @@
 #include
 #include
 
+bool IsAnyEqual(const std::any &lhs, const std::any &rhs) {
+  if (lhs.type() != rhs.type()) {
+    return false;
+  }
+  if (lhs.type() == typeid(uint64_t)) {
+    return std::any_cast<uint64_t>(lhs) == std::any_cast<uint64_t>(rhs);
+  } else if (lhs.type() == typeid(int64_t)) {
+    return std::any_cast<int64_t>(lhs) == std::any_cast<int64_t>(rhs);
+  } else if (lhs.type() == typeid(bool)) {
+    return std::any_cast<bool>(lhs) == std::any_cast<bool>(rhs);
+  } else if (lhs.type() == typeid(float)) {
+    return std::any_cast<float>(lhs) == std::any_cast<float>(rhs);
+  } else if (lhs.type() == typeid(double)) {
+    return std::any_cast<double>(lhs) == std::any_cast<double>(rhs);
+  } else if (lhs.type() == typeid(std::string)) {
+    return std::any_cast<std::string>(lhs) == std::any_cast<std::string>(rhs);
+  } else if (lhs.type() == typeid(std::vector<char>)) {
+    return std::any_cast<std::vector<char>>(lhs) == std::any_cast<std::vector<char>>(rhs);
+  } else {
+    return false;
+  }
+}
+
 TEST(SerializationTest, TypeHybridTest) {
   uint32_t in_arg1 = 123456789, out_arg1;
   std::string in_arg2 = "123567ABC", out_arg2;
+  double in_arg3 = 1.1;
+  float in_arg4 = 1.2;
+  uint64_t in_arg5 = 100;
+  int64_t in_arg6 = -100;
+  bool in_arg7 = false;
+  std::vector<char> in_arg8 = {'a', 'b'};
 
   // 1 arg
   // marshall
@@ -40,6 +69,40 @@ TEST(SerializationTest, TypeHybridTest) {
 
   EXPECT_EQ(in_arg1, out_arg1);
   EXPECT_EQ(in_arg2, out_arg2);
 
+
+// TODO(Larry Lian) Adapt on windows
+#ifndef _WIN32
+  // Test std::any
+  msgpack::sbuffer buffer3 = ray::internal::Serializer::Serialize(std::make_tuple(
+      in_arg1, in_arg2, in_arg3, in_arg4, in_arg5, in_arg6, in_arg7, in_arg8));
+
+  std::vector<std::any> result =
+      ray::internal::Serializer::Deserialize<std::vector<std::any>>(buffer3.data(),
+                                                                    buffer3.size());
+  EXPECT_TRUE(result[0].type() == typeid(uint64_t));
+  EXPECT_EQ(std::any_cast<uint64_t>(result[0]), in_arg1);
+  EXPECT_TRUE(result[1].type() == typeid(std::string));
+  EXPECT_EQ(std::any_cast<std::string>(result[1]), in_arg2);
+  EXPECT_TRUE(result[2].type() == typeid(double));
+  EXPECT_EQ(std::any_cast<double>(result[2]), in_arg3);
+  EXPECT_TRUE(result[3].type() == typeid(double));
+  EXPECT_EQ(std::any_cast<double>(result[3]), in_arg4);
+  EXPECT_TRUE(result[4].type() == typeid(uint64_t));
+  EXPECT_EQ(std::any_cast<uint64_t>(result[4]), in_arg5);
+  EXPECT_TRUE(result[5].type() == typeid(int64_t));
+  EXPECT_EQ(std::any_cast<int64_t>(result[5]), in_arg6);
+  EXPECT_TRUE(result[6].type() == typeid(bool));
+  EXPECT_EQ(std::any_cast<bool>(result[6]), in_arg7);
+  EXPECT_TRUE(result[7].type() == typeid(std::vector<char>));
+  EXPECT_EQ(std::any_cast<std::vector<char>>(result[7]), in_arg8);
+
+  msgpack::sbuffer buffer4 = ray::internal::Serializer::Serialize(result);
+
+  std::vector<std::any> result_2 =
+      ray::internal::Serializer::Deserialize<std::vector<std::any>>(buffer4.data(),
+                                                                    buffer4.size());
+  EXPECT_TRUE(std::equal(result.begin(), result.end(), result_2.begin(), IsAnyEqual));
+#endif
 }
 
 TEST(SerializationTest, BoundaryValueTest) {
diff --git a/doc/source/_static/js/top-navigation.js b/doc/source/_static/js/top-navigation.js
index 0b974a018e16..d09beac95b36 100644
--- a/doc/source/_static/js/top-navigation.js
+++ b/doc/source/_static/js/top-navigation.js
@@ -22,7 +22,6 @@
 is_examples = window.location.href.endsWith("ray-overview/examples.html")
 is_get_started = window.location.href.endsWith("ray-overview/getting-started.html")
 is_use_cases = window.location.href.endsWith("ray-overview/use-cases.html")
 is_libraries = window.location.href.includes("/ray-core/") ||
-               window.location.href.includes("/ray-air/") ||
               window.location.href.includes("/data/") ||
               window.location.href.includes("/train/") ||
               window.location.href.includes("/tune/") ||
@@ -89,7 +88,6 @@
 librariesMenu.setAttribute("class", "menu")
 librariesMenu.innerHTML = "Libraries" + downCaret + ""
 librariesList = document.createElement("ul")
 librariesList.innerHTML += "
  • Ray CoreScale general Python applications
  • " -librariesList.innerHTML += "
  • Ray AIRScale AI applications
  • " librariesList.innerHTML += "
  • Ray DataScale data ingest and preprocessing
  • " librariesList.innerHTML += "
  • Ray TrainScale machine learning training
  • " librariesList.innerHTML += "
  • Ray TuneScale hyperparameter tuning
  • " diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index 0c6ede765161..5b6f0118cea2 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -62,20 +62,21 @@ parts: - file: train/key-concepts title: Key Concepts - file: train/getting-started-pytorch - title: PyTorch + title: PyTorch Guide - file: train/getting-started-pytorch-lightning - title: PyTorch Lightning + title: PyTorch Lightning Guide - file: train/getting-started-transformers - title: Hugging Face Transformers + title: Hugging Face Transformers Guide - file: train/more-frameworks sections: - file: train/huggingface-accelerate - title: Hugging Face Accelerate + title: Hugging Face Accelerate Guide - file: train/distributed-tensorflow-keras - title: TensorFlow & Keras + title: TensorFlow and Keras Guide - file: train/distributed-xgboost-lightgbm - title: XGBoost & LightGBM + title: XGBoost and LightGBM Guide - file: train/horovod + title: Horovod Guide - file: train/user-guides title: User Guides - file: train/examples diff --git a/doc/source/images/map-of-ray.png b/doc/source/images/map-of-ray.png index b8563f367335..56943d0a2b8d 100644 Binary files a/doc/source/images/map-of-ray.png and b/doc/source/images/map-of-ray.png differ diff --git a/doc/source/index.md b/doc/source/index.md index 07c46b779ac3..63615f3d9df8 100644 --- a/doc/source/index.md +++ b/doc/source/index.md @@ -333,7 +333,7 @@ ppo_algo.evaluate()
-    Ray AI Runtime Libraries
+    Ray Libraries
    Scale the entire ML pipeline from data ingest to model serving with high-level Python APIs that integrate with popular ecosystem frameworks.

    Learn more about Ray Libraries> diff --git a/doc/source/ray-air/examples/dreambooth_finetuning.rst b/doc/source/ray-air/examples/dreambooth_finetuning.rst index 7795b1caf0bf..8079d60af107 100644 --- a/doc/source/ray-air/examples/dreambooth_finetuning.rst +++ b/doc/source/ray-air/examples/dreambooth_finetuning.rst @@ -16,8 +16,8 @@ See the HuggingFace tutorial for useful explanations and suggestions on hyperpar **Compute requirements:** -* Because of the large model sizes, you'll need a machine with at least 2 A10G GPUs. -* Each training worker uses 2 GPUs, so you'll need ``N * 2`` total GPUs in your Ray cluster, if training with ``N`` distributed training workers. The example can leverage data-parallel training with more GPUs to speed up training time. +* Because of the large model sizes, you'll need a machine with at least 1 A10G GPU. +* Each training worker uses 1 GPU. You can use multiple GPUs/workers to leverage data-parallel training to speed up training time. This example fine-tunes both the ``text_encoder`` and ``unet`` models used in the Stable Diffusion process, with respect to a prior preserving loss. @@ -123,9 +123,9 @@ Configuring the scale In the TorchTrainer, we can easily configure our scale. The above example uses the ``num_workers`` argument to specify the number -of workers. This defaults to 2 workers with 2 GPUs each - so 4 GPUs in total. +of workers. This defaults to 2 workers with 1 GPU each - so 2 GPUs in total. -To run the example on 8 GPUs, just set the number of workers to 4 using ``--num-workers=4``! +To run the example on 4 GPUs, just set the number of workers to 4 using ``--num-workers=4``! Or you can change the scaling config directly: .. code-block:: diff @@ -134,9 +134,6 @@ Or you can change the scaling config directly: use_gpu=True, - num_workers=args.num_workers, + num_workers=4, - resources_per_worker={ - "GPU": 2, - }, ) If you're running multi-node training, you should make sure that all nodes have access to a shared @@ -146,45 +143,34 @@ storage (e.g. via NFS or EFS). In the example script below, you can adjust this Training throughput ~~~~~~~~~~~~~~~~~~~ -We ran training using 1, 2, 4, and 8 workers (and 2, 4, 8, and 16 GPUs, respectively) to compare throughput. +We ran training using 1, 2, and 4 workers/GPUs to compare throughput. Setup: -* 2 x g5.12xlarge nodes with 4 A10G GPUs each +* 1 GCE g2-standard-48-nvidia-l4-4 instance with 4 GPUs * Model as configured below * Data from this example * 200 regularization images -* Training for 4 epochs (800 steps) -* Use a mounted External File System to share data between nodes +* Training for 4 epochs (local batch size = 2) * 3 runs per configuration -Because network storage can be slow, we excluded the time it takes to save the final model from the training time. - We expect that the training time should benefit from scale and decreases when running with more workers and GPUs. - .. image:: /templates/05_dreambooth_finetuning/dreambooth/images/dreambooth_training.png :alt: DreamBooth training times .. list-table:: :header-rows: 1 - * - Number of workers - - Number of GPUs - - Training time + * - Number of workers/GPUs + - Training time (seconds) * - 1 - - 2 - - 458.16 (3.82) + - 802.14 * - 2 - - 4 - - 364.61 (1.65) + - 487.82 * - 4 - - 8 - - 252.37 (3.18) - * - 8 - - 16 - - 160.97 (1.36) + - 313.25 While the training time decreases linearly with the amount of workers/GPUs, we observe some penalty. @@ -223,11 +209,23 @@ Prepare some directories and environment variables. .. 
literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh :language: bash - :start-after: Step 0 cont - :end-at: export UNIQUE_TOKEN + :start-after: __preparation_start__ + :end-before: __preparation_end__ + + +Step 1: Download the pre-trained model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Download and cache a pre-trained Stable-Diffusion model locally. + +.. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh + :language: bash + :start-after: __cache_model_start__ + :end-before: __cache_model_end__ +You can access the downloaded model checkpoint at the ``$ORIG_MODEL_PATH``. -Step 1: Supply images of your subject +Step 2: Supply images of your subject ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Use one of the sample datasets (dog, lego car), or provide your own directory @@ -237,26 +235,14 @@ Then, we copy these images to ``$IMAGES_OWN_DIR``. .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh :language: bash - :start-after: Step 1 - :end-at: cp -rf $INSTANCE_DIR/* + :start-after: __supply_own_images_start__ + :end-before: __supply_own_images_end__ The ``$CLASS_NAME`` should be the general category of your subject. The images produced by the prompt ``photo of a unqtkn `` should be diverse images that are different enough from the subject in order for generated images to clearly show the effect of fine-tuning. -Step 2: Download the pre-trained model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Download and cache a pre-trained Stable-Diffusion model locally. - -.. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh - :language: bash - :start-after: Step 2 - :end-at: python cache_model.py - -You can access the downloaded model checkpoint at the ``$ORIG_MODEL_PATH``. - Step 3: Create the regularization images ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -267,7 +253,7 @@ rather than just optimize for producing good images of the subject. .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh :language: bash - :start-at: Step 3: START + :start-after: Step 3: START :end-before: Step 3: END We use Ray Data to do batch inference with 4 workers, so more images can be generated in parallel. diff --git a/doc/source/ray-overview/examples.rst b/doc/source/ray-overview/examples.rst index f2705e9283ed..5b1614b963c9 100644 --- a/doc/source/ray-overview/examples.rst +++ b/doc/source/ray-overview/examples.rst @@ -61,7 +61,7 @@ Ray Examples How OpenAI Uses Ray to Train Tools like ChatGPT .. grid-item-card:: :bdg-secondary:`Code example` - :class-item: gallery-item llm gen-ai + :class-item: gallery-item llm gen-ai huggingface :link: /ray-air/examples/gptj_deepspeed_fine_tuning :link-type: doc diff --git a/doc/source/ray-overview/index.md b/doc/source/ray-overview/index.md index 3d0bae60daa4..93d70c41e581 100644 --- a/doc/source/ray-overview/index.md +++ b/doc/source/ray-overview/index.md @@ -44,9 +44,9 @@ These are some common ML workloads that individuals, organizations, and companie Ray's unified compute framework consists of three layers: -1. **Ray AI Runtime (AIR) Libraries**--An open-source, Python, domain-specific set of libraries that equip ML engineers, data scientists, and researchers with a scalable and unified toolkit for ML applications. +1. **Ray AI Libraries**--An open-source, Python, domain-specific set of libraries that equip ML engineers, data scientists, and researchers with a scalable and unified toolkit for ML applications. 2. 
**Ray Core**--An open-source, Python, general purpose, distributed computing library that enables ML engineers and Python developers to scale Python applications and accelerate machine learning workloads. -3. **Ray cluster**--A set of worker nodes connected to a common Ray head node. Ray clusters can be fixed-size, or they can autoscale up and down according to the resources requested by applications running on the cluster. +3. **Ray Clusters**--A set of worker nodes connected to a common Ray head node. Ray clusters can be fixed-size, or they can autoscale up and down according to the resources requested by applications running on the cluster. ```{eval-rst} .. grid:: 1 2 3 3 @@ -117,7 +117,7 @@ Each of [Ray's](../ray-air/getting-started) five native libraries distributes a Ray's libraries are for both data scientists and ML engineers alike. For data scientists, these libraries can be used to scale individual workloads, and also end-to-end ML applications. For ML Engineers, these libraries provides scalable platform abstractions that can be used to easily onboard and integrate tooling from the broader ML ecosystem. -For custom applications, the [Ray Core](../ray-core/walkthrough) library enables Python developers to easily build scalable, distributed systems that can run on a laptop, cluster, cloud, or Kubernetes. It's the foundation that Ray AI Runtime libraries and third-party integrations (Ray ecosystem) are built on. +For custom applications, the [Ray Core](../ray-core/walkthrough) library enables Python developers to easily build scalable, distributed systems that can run on a laptop, cluster, cloud, or Kubernetes. It's the foundation that Ray AI libraries and third-party integrations (Ray ecosystem) are built on. Ray runs on any machine, cluster, cloud provider, and Kubernetes, and features a growing [ecosystem of community integrations](ray-libraries). diff --git a/doc/source/serve/doc_code/production_fruit_example.py b/doc/source/serve/doc_code/production_fruit_example.py index 45eb14baf398..a6125b22246b 100644 --- a/doc/source/serve/doc_code/production_fruit_example.py +++ b/doc/source/serve/doc_code/production_fruit_example.py @@ -128,10 +128,10 @@ def check_fruit_deployment_graph_updates(): # Test behavior from this documentation example serve.start(detached=True) -app = build(deployment_graph) +app = build(deployment_graph, "default") for deployment in app.deployments.values(): deployment.set_options(ray_actor_options={"num_cpus": 0.1}) -serve.run(app) +serve.run(app, name="default") check_fruit_deployment_graph() print("Example ran successfully from the file.") serve.shutdown() diff --git a/doc/source/templates/05_dreambooth_finetuning/README.md b/doc/source/templates/05_dreambooth_finetuning/README.md index 70134594601f..093fdfcc8c68 100644 --- a/doc/source/templates/05_dreambooth_finetuning/README.md +++ b/doc/source/templates/05_dreambooth_finetuning/README.md @@ -4,7 +4,7 @@ | ---------------------- | ----------- | | Summary | This example shows how to do [DreamBooth fine-tuning](https://dreambooth.github.io/) of a Stable Diffusion model using Ray Train for data-parallel training with many workers and Ray Data for data ingestion. Use one of the provided datasets, or supply your own photos. By the end of this example, you'll be able to generate images of your subject in a variety of situations, just by feeding in a text prompt! | | Time to Run | ~10-15 minutes to generate a regularization dataset and fine-tune the model on photos of your subject. 
|
-| Minimum Compute Requirements | At least 2 GPUs, where each GPU has >= 24GB GRAM. The default is 1 node with 4 GPUS: A10G GPU (AWS) or L4 GPU (GCE). |
+| Minimum Compute Requirements | At least 1 GPU with >= 24GB GRAM. The default is 1 node with 4 GPUs: A10G GPU (AWS) or L4 GPU (GCE). |
 | Cluster Environment | This template uses a docker image built on top of the latest Anyscale-provided Ray image using Python 3.9: [`anyscale/ray:latest-py39-cu118`](https://docs.anyscale.com/reference/base-images/overview). See the appendix below for more details. |
 
 ![Dreambooth fine-tuning sample results](https://raw.githubusercontent.com/ray-project/ray/workspace_templates_2.6.1/doc/source/templates/05_dreambooth_finetuning/dreambooth/images/dreambooth_example.png)
 
@@ -31,11 +31,27 @@ Here are a few modifications to the `dreambooth_run.sh` script that you may want
 2. The `$DATA_PREFIX` that the pre-trained model is downloaded to. This directory is also where the training dataset and the fine-tuned model checkpoint are written at the end of training.
    - If you add more worker nodes to the cluster, you should set `$DATA_PREFIX` to a shared NFS filesystem such as `/mnt/cluster_storage`. See [this page of the docs](https://docs.anyscale.com/develop/workspaces/storage#storage-shared-across-nodes) for all the options.
    - Note that each run of the script will overwrite the fine-tuned model checkpoint from the previous run, so consider changing the `$DATA_PREFIX` environment variable on each run if you don't want to lose the models/data of previous runs.
-3. The `$NUM_WORKERS` variable sets the number of data-parallel workers used during fine-tuning. The default is 2 workers (2 workers, each using 2 GPUs), and you should increase this number if you add more GPU worker nodes to the cluster.
+3. The `$NUM_WORKERS` variable sets the number of data-parallel workers used during fine-tuning. The default is 2 workers (each using 1 GPU), and you should increase this number if you add more GPU worker nodes to the cluster.
 4. Setting `--num_epochs` and `--max_train_steps` determines the number of fine-tuning steps to take.
    - Depending on the batch size and number of data-parallel workers, one epoch will run for a certain number of steps. The run will terminate when one of these values (epoch vs. total number of steps) is reached.
 5. `generate.py` is used to generate stable diffusion images after loading the model from a checkpoint. You should modify the prompt at the end to be something more interesting, rather than just a photo of your subject.
 6. If you want to launch another fine-tuning run, you may want to run *only* the `python train.py ...` command. Running the bash script will start from the beginning (generating another regularization dataset).
+7. Use the following command for LoRA fine-tuning. Note the much higher learning rate than in full fine-tuning.
+```bash
+python train.py \
+  --model_dir=$ORIG_MODEL_PATH \
+  --output_dir=$TUNED_MODEL_DIR \
+  --instance_images_dir=$IMAGES_OWN_DIR \
+  --instance_prompt="photo of $UNIQUE_TOKEN $CLASS_NAME" \
+  --class_images_dir=$IMAGES_REG_DIR \
+  --class_prompt="photo of a $CLASS_NAME" \
+  --train_batch_size=2 \
+  --lr=1e-4 \
+ --num_epochs=10 \ + --max_train_steps=400 \ + --num_workers $NUM_WORKERS + --use_lora +``` ## Interact with the fine-tuned model @@ -53,6 +69,17 @@ python generate.py \ --num_samples_per_prompt=5 ``` +To generate images using LoRA fine-tuned model: + +```bash +python generate.py \ + --model_dir=$ORIG_MODEL_PATH \ + --lora_weights_dir=$TUNED_MODEL_DIR \ + --output_dir=$IMAGES_NEW_DIR \ + --prompts="photo of a $UNIQUE_TOKEN $CLASS_NAME" \ + --num_samples_per_prompt=5 +``` + ### Generate images interactively in a notebook See the `playground.ipynb` notebook for a more interactive way to generate images with the fine-tuned model. diff --git a/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py b/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py index e938a0d1dbbf..704abc7723e5 100644 --- a/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py +++ b/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py @@ -110,7 +110,7 @@ def _tokenize(prompt): return train_dataset.random_shuffle() -def collate(batch, device, dtype): +def collate(batch, dtype): """Build Torch training batch. B = batch size @@ -135,7 +135,7 @@ def collate(batch, device, dtype): """ images = torch.cat([batch["instance_image"], batch["class_image"]], dim=0) - images = images.to(memory_format=torch.contiguous_format).float() + images = images.to(memory_format=torch.contiguous_format).to(dtype) batch_size = len(batch["instance_prompt_ids"]) @@ -144,6 +144,6 @@ def collate(batch, device, dtype): ).reshape(batch_size * 2, -1) return { - "images": images.to(device, dtype=dtype), - "prompt_ids": prompt_ids.to(device), # token ids should stay int. + "images": images, + "prompt_ids": prompt_ids, # token ids should stay int. } diff --git a/doc/source/templates/05_dreambooth_finetuning/dreambooth/flags.py b/doc/source/templates/05_dreambooth_finetuning/dreambooth/flags.py index 6fcd81f5fb4d..8a7e765ac12a 100644 --- a/doc/source/templates/05_dreambooth_finetuning/dreambooth/flags.py +++ b/doc/source/templates/05_dreambooth_finetuning/dreambooth/flags.py @@ -17,7 +17,10 @@ def train_arguments(): type=str, default=None, required=True, - help="Directory where trained models are saved.", + help="Directory where trained models or LoRA weights are saved.", + ) + parser.add_argument( + "--use_lora", default=False, action="store_true", help="Use LoRA." ) parser.add_argument( "--instance_images_dir", @@ -146,5 +149,10 @@ def run_model_flags(): "Enable using Ray Data to use multiple GPU workers to perform inference." 
), ) + parser.add_argument( + "--lora_weights_dir", + default=None, + help=("The directory where `pytorch_lora_weights.bin` is stored."), + ) return parser diff --git a/doc/source/templates/05_dreambooth_finetuning/dreambooth/generate.py b/doc/source/templates/05_dreambooth_finetuning/dreambooth/generate.py index 8328b7c622fc..0da8839b6aca 100644 --- a/doc/source/templates/05_dreambooth_finetuning/dreambooth/generate.py +++ b/doc/source/templates/05_dreambooth_finetuning/dreambooth/generate.py @@ -1,22 +1,19 @@ import hashlib from os import path -from diffusers import DiffusionPipeline import time import torch import ray from flags import run_model_flags +from generate_utils import get_pipeline def run(args): class StableDiffusionCallable: - def __init__(self, model_dir, output_dir): + def __init__(self, model_dir, output_dir, lora_weights_dir=None): print(f"Loading model from {model_dir}") - - self.pipeline = DiffusionPipeline.from_pretrained( - model_dir, torch_dtype=torch.float16 - ) + self.pipeline = get_pipeline(model_dir, lora_weights_dir) self.pipeline.set_progress_bar_config(disable=True) if torch.cuda.is_available(): self.pipeline.to("cuda") @@ -65,7 +62,7 @@ def __call__(self, batch): else: # Generate images one by one stable_diffusion_predictor = StableDiffusionCallable( - args.model_dir, args.output_dir + args.model_dir, args.output_dir, args.lora_weights_dir ) for prompt in prompts: for i in range(args.num_samples_per_prompt): diff --git a/doc/source/templates/05_dreambooth_finetuning/dreambooth/generate_utils.py b/doc/source/templates/05_dreambooth_finetuning/dreambooth/generate_utils.py new file mode 100644 index 000000000000..3b08babc4660 --- /dev/null +++ b/doc/source/templates/05_dreambooth_finetuning/dreambooth/generate_utils.py @@ -0,0 +1,26 @@ +from diffusers import DiffusionPipeline +from diffusers.loaders import LoraLoaderMixin +import torch + + +def load_lora_weights(unet, text_encoder, input_dir): + lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) + LoraLoaderMixin.load_lora_into_unet( + lora_state_dict, network_alphas=network_alphas, unet=unet + ) + LoraLoaderMixin.load_lora_into_text_encoder( + lora_state_dict, network_alphas=network_alphas, text_encoder=text_encoder + ) + return unet, text_encoder + + +def get_pipeline(model_dir, lora_weights_dir=None): + pipeline = DiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float16) + if lora_weights_dir: + unet = pipeline.unet + text_encoder = pipeline.text_encoder + print(f"Loading LoRA weights from {lora_weights_dir}") + unet, text_encoder = load_lora_weights(unet, text_encoder, lora_weights_dir) + pipeline.unet = unet + pipeline.text_encoder = text_encoder + return pipeline diff --git a/doc/source/templates/05_dreambooth_finetuning/dreambooth/images/dreambooth_training.png b/doc/source/templates/05_dreambooth_finetuning/dreambooth/images/dreambooth_training.png index 8a759bcd51da..34f971e4397a 100644 Binary files a/doc/source/templates/05_dreambooth_finetuning/dreambooth/images/dreambooth_training.png and b/doc/source/templates/05_dreambooth_finetuning/dreambooth/images/dreambooth_training.png differ diff --git a/doc/source/templates/05_dreambooth_finetuning/dreambooth/requirements.txt b/doc/source/templates/05_dreambooth_finetuning/dreambooth/requirements.txt index 440fa0ce5d24..76ddb9799407 100644 --- a/doc/source/templates/05_dreambooth_finetuning/dreambooth/requirements.txt +++ b/doc/source/templates/05_dreambooth_finetuning/dreambooth/requirements.txt @@ -1,6 +1,6 @@ 
accelerate==0.20.3 bitsandbytes==0.39.1 -diffusers==0.17.1 +diffusers==0.19.3 flax==0.6.11 ipywidgets huggingface_hub==0.16.2 diff --git a/doc/source/templates/05_dreambooth_finetuning/dreambooth/train.py b/doc/source/templates/05_dreambooth_finetuning/dreambooth/train.py index 3dc8384bca58..c56366588024 100644 --- a/doc/source/templates/05_dreambooth_finetuning/dreambooth/train.py +++ b/doc/source/templates/05_dreambooth_finetuning/dreambooth/train.py @@ -1,24 +1,42 @@ -import itertools +from typing import Dict +import itertools from diffusers import ( AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel, ) + +# LoRA related imports begin ## +from diffusers.loaders import ( + LoraLoaderMixin, + text_encoder_lora_state_dict, +) +from diffusers.models.attention_processor import ( + AttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + LoRAAttnAddedKVProcessor, + LoRAAttnProcessor, + LoRAAttnProcessor2_0, + SlicedAttnAddedKVProcessor, +) + +# LoRA related imports end ## from diffusers.utils.import_utils import is_xformers_available +from ray.air import ScalingConfig from ray import train -from ray.train import ScalingConfig from ray.train.torch import TorchTrainer import torch import torch.nn.functional as F -from torch.nn.parallel import DistributedDataParallel from torch.nn.utils import clip_grad_norm_ from transformers import CLIPTextModel from dataset import collate, get_train_dataset from flags import train_arguments +LORA_RANK = 4 + def prior_preserving_loss(model_pred, target, weight): # Chunk the noise and model_pred into two parts and compute @@ -48,7 +66,61 @@ def get_target(scheduler, noise, latents, timesteps): raise ValueError(f"Unknown prediction type {pred_type}") -def load_models(config, cuda): +def add_lora_layers(unet, text_encoder): + """Add LoRA layers for unet and text encoder. + + `unet` and `text_encoder` will be modified in place. + + Returns: + The LoRA parameters for unet and text encoder correspondingly. + """ + unet_lora_attn_procs = {} + unet_lora_parameters = [] + for name, attn_processor in unet.attn_processors.items(): + cross_attention_dim = ( + None + if name.endswith("attn1.processor") + else unet.config.cross_attention_dim + ) + if name.startswith("mid_block"): + hidden_size = unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = unet.config.block_out_channels[block_id] + + if isinstance( + attn_processor, + (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0), + ): + lora_attn_processor_class = LoRAAttnAddedKVProcessor + else: + lora_attn_processor_class = ( + LoRAAttnProcessor2_0 + if hasattr(F, "scaled_dot_product_attention") + else LoRAAttnProcessor + ) + + module = lora_attn_processor_class( + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + rank=LORA_RANK, + ) + unet_lora_attn_procs[name] = module + unet_lora_parameters.extend(module.parameters()) + + unet.set_attn_processor(unet_lora_attn_procs) + + text_lora_parameters = LoraLoaderMixin._modify_text_encoder( + text_encoder, dtype=torch.float32, rank=LORA_RANK + ) + + return unet_lora_parameters, text_lora_parameters + + +def load_models(config): """Load pre-trained Stable Diffusion models.""" # Load all models in bfloat16 to save GRAM. 
# For models that are only used for inferencing, @@ -60,8 +132,6 @@ def load_models(config, cuda): subfolder="text_encoder", torch_dtype=dtype, ) - text_encoder.to(cuda[1]) - text_encoder.train() noise_scheduler = DDPMScheduler.from_pretrained( config["model_dir"], @@ -77,7 +147,6 @@ def load_models(config, cuda): ) # We are not training VAE part of the model. vae.requires_grad_(False) - vae.to(cuda[1]) # Convert unet to bf16 to save GRAM. unet = UNet2DConditionModel.from_pretrained( @@ -85,39 +154,56 @@ def load_models(config, cuda): subfolder="unet", torch_dtype=dtype, ) + if is_xformers_available(): unet.enable_xformers_memory_efficient_attention() - # UNET is the largest component, occupying first GPU by itself. - unet.to(cuda[0]) - unet.train() - torch.cuda.empty_cache() + if not config["use_lora"]: + unet_trainable_parameters = unet.parameters() + text_trainable_parameters = text_encoder.parameters() + else: + text_encoder.requires_grad_(False) + unet.requires_grad_(False) + unet_trainable_parameters, text_trainable_parameters = add_lora_layers( + unet, text_encoder + ) - return text_encoder, noise_scheduler, vae, unet + text_encoder.train() + unet.train() + torch.cuda.empty_cache() -def get_cuda_devices(): - devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] - local_rank = train.get_context().get_local_rank() - assert len(devices) >= 2, "Require at least 2 GPU devices to work." - return devices[(local_rank * 2) : ((local_rank * 2) + 2)] + return ( + text_encoder, + noise_scheduler, + vae, + unet, + unet_trainable_parameters, + text_trainable_parameters, + ) def train_fn(config): - cuda = get_cuda_devices() # Load pre-trained models. - text_encoder, noise_scheduler, vae, unet = load_models(config, cuda) - - # Wrap in DDP - text_encoder = DistributedDataParallel( - text_encoder, device_ids=[cuda[1]], output_device=cuda[1] - ) - unet = DistributedDataParallel(unet, device_ids=[cuda[0]], output_device=cuda[0]) + ( + text_encoder, + noise_scheduler, + vae, + unet, + unet_trainable_parameters, + text_trainable_parameters, + ) = load_models(config) + + text_encoder = train.torch.prepare_model(text_encoder) + unet = train.torch.prepare_model(unet) + # manually move to device as `prepare_model` can't be used on + # non-training models. + vae = vae.to(train.torch.get_device()) # Use the regular AdamW optimizer to work with bfloat16 weights. optimizer = torch.optim.AdamW( - itertools.chain(text_encoder.parameters(), unet.parameters()), + itertools.chain(unet_trainable_parameters, text_trainable_parameters), lr=config["lr"], ) @@ -129,18 +215,18 @@ def train_fn(config): print(f"Running {num_train_epochs} epochs.") global_step = 0 - for epoch in range(num_train_epochs): + for _ in range(num_train_epochs): if global_step >= config["max_train_steps"]: print(f"Stopping training after reaching {global_step} steps...") break - for step, batch in enumerate( + for _, batch in enumerate( train_dataset.iter_torch_batches( - batch_size=config["train_batch_size"], device=cuda[1] + batch_size=config["train_batch_size"], + device=train.torch.get_device(), ) ): - # Load batch on GPU 2 because VAE and text encoder are there. - batch = collate(batch, cuda[1], torch.bfloat16) + batch = collate(batch, torch.bfloat16) optimizer.zero_grad() @@ -166,15 +252,14 @@ def train_fn(config): # Get the text embedding for conditioning encoder_hidden_states = text_encoder(batch["prompt_ids"])[0] - # Predict the noise residual. We need to move all data bits to GPU 1. + # Predict the noise residual. 
model_pred = unet( - noisy_latents.to(cuda[0]), - timesteps.to(cuda[0]), - encoder_hidden_states.to(cuda[0]), + noisy_latents.to(train.torch.get_device()), + timesteps.to(train.torch.get_device()), + encoder_hidden_states.to(train.torch.get_device()), ).sample - target = get_target(noise_scheduler, noise, latents, timesteps).to(cuda[0]) + target = get_target(noise_scheduler, noise, latents, timesteps) - # Now, move model prediction to GPU 2 for loss calculation. loss = prior_preserving_loss( model_pred, target, config["prior_loss_weight"] ) @@ -182,7 +267,7 @@ def train_fn(config): # Gradient clipping before optimizer stepping. clip_grad_norm_( - itertools.chain(text_encoder.parameters(), unet.parameters()), + itertools.chain(unet_trainable_parameters, text_trainable_parameters), config["max_grad_norm"], ) @@ -201,12 +286,45 @@ def train_fn(config): # Create pipeline using the trained modules and save it. if train.get_context().get_world_rank() == 0: - pipeline = DiffusionPipeline.from_pretrained( - config["model_dir"], - text_encoder=text_encoder.module, - unet=unet.module, - ) - pipeline.save_pretrained(config["output_dir"]) + if not config["use_lora"]: + pipeline = DiffusionPipeline.from_pretrained( + config["model_dir"], + text_encoder=text_encoder.module, + unet=unet.module, + ) + pipeline.save_pretrained(config["output_dir"]) + else: + save_lora_weights(unet.module, text_encoder.module, config["output_dir"]) + + +def unet_attn_processors_state_dict(unet) -> Dict[str, torch.tensor]: + """ + Returns: + a state dict containing just the attention processor parameters. + """ + attn_processors = unet.attn_processors + + attn_processors_state_dict = {} + + for attn_processor_key, attn_processor in attn_processors.items(): + for parameter_key, parameter in attn_processor.state_dict().items(): + param_name = f"{attn_processor_key}.{parameter_key}" + attn_processors_state_dict[param_name] = parameter + return attn_processors_state_dict + + +def save_lora_weights(unet, text_encoder, output_dir): + unet_lora_layers_to_save = None + text_encoder_lora_layers_to_save = None + + unet_lora_layers_to_save = unet_attn_processors_state_dict(unet) + text_encoder_lora_layers_to_save = text_encoder_lora_state_dict(text_encoder) + + LoraLoaderMixin.save_lora_weights( + output_dir, + unet_lora_layers=unet_lora_layers_to_save, + text_encoder_lora_layers=text_encoder_lora_layers_to_save, + ) if __name__ == "__main__": @@ -224,7 +342,6 @@ def train_fn(config): scaling_config=ScalingConfig( use_gpu=True, num_workers=args.num_workers, - resources_per_worker={"GPU": 2}, ), datasets={ "train": train_dataset, diff --git a/doc/source/templates/05_dreambooth_finetuning/dreambooth_run.sh b/doc/source/templates/05_dreambooth_finetuning/dreambooth_run.sh index 104c340b41ed..f5485bc82a45 100755 --- a/doc/source/templates/05_dreambooth_finetuning/dreambooth_run.sh +++ b/doc/source/templates/05_dreambooth_finetuning/dreambooth_run.sh @@ -7,6 +7,7 @@ set -xe pushd dreambooth || true # Step 0 cont +# __preparation_start__ # TODO: If running on multiple nodes, change this path to a shared directory (ex: NFS) export DATA_PREFIX="/tmp" export ORIG_MODEL_NAME="CompVis/stable-diffusion-v1-4" @@ -21,49 +22,101 @@ export IMAGES_NEW_DIR="$DATA_PREFIX/images-new" export NUM_WORKERS=2 mkdir -p $ORIG_MODEL_DIR $TUNED_MODEL_DIR $IMAGES_REG_DIR $IMAGES_OWN_DIR $IMAGES_NEW_DIR +# __preparation_end__ # Unique token to identify our subject (e.g., a random dog vs. 
our unqtkn dog) export UNIQUE_TOKEN="unqtkn" -# Step 1 -# Only uncomment one of the following: - -# Option 1: Use the dog dataset --------- -export CLASS_NAME="dog" -python download_example_dataset.py ./images/dog -export INSTANCE_DIR=./images/dog -# --------------------------------------- - -# Option 2: Use the lego car dataset ---- -# export CLASS_NAME="car" -# export INSTANCE_DIR=./images/lego-car -# --------------------------------------- - -# Option 3: Use your own images --------- -# export CLASS_NAME="" -# export INSTANCE_DIR="/path/to/images/of/subject" -# --------------------------------------- - -# Copy own images into IMAGES_OWN_DIR -cp -rf $INSTANCE_DIR/* "$IMAGES_OWN_DIR/" +skip_image_setup=false +use_lora=false +# parse args +for arg in "$@"; do + case $arg in + --skip_image_setup) + echo "Option --skip_image_setup is set" + skip_image_setup=true + ;; + --lora) + echo "Option --lora is set" + use_lora=true + ;; + *) + echo "Invalid option: $arg" + ;; + esac +done -# Step 2 +# Step 1 +# __cache_model_start__ python cache_model.py --model_dir=$ORIG_MODEL_DIR --model_name=$ORIG_MODEL_NAME --revision=$ORIG_MODEL_HASH - -# Clear reg dir -rm -rf "$IMAGES_REG_DIR"/*.jpg - -# Step 3: START -python generate.py \ - --model_dir=$ORIG_MODEL_PATH \ - --output_dir=$IMAGES_REG_DIR \ - --prompts="photo of a $CLASS_NAME" \ - --num_samples_per_prompt=200 \ - --use_ray_data -# Step 3: END - -# Step 4: START -python train.py \ +# __cache_model_end__ + +download_image() { + # Step 2 + # __supply_own_images_start__ + # Only uncomment one of the following: + + # Option 1: Use the dog dataset --------- + export CLASS_NAME="dog" + python download_example_dataset.py ./images/dog + export INSTANCE_DIR=./images/dog + # --------------------------------------- + + # Option 2: Use the lego car dataset ---- + # export CLASS_NAME="car" + # export INSTANCE_DIR=./images/lego-car + # --------------------------------------- + + # Option 3: Use your own images --------- + # export CLASS_NAME="" + # export INSTANCE_DIR="/path/to/images/of/subject" + # --------------------------------------- + + # Copy own images into IMAGES_OWN_DIR + cp -rf $INSTANCE_DIR/* "$IMAGES_OWN_DIR/" + # __supply_own_images_end__ + + # Clear reg dir + rm -rf "$IMAGES_REG_DIR"/*.jpg + + # Step 3: START + python generate.py \ + --model_dir=$ORIG_MODEL_PATH \ + --output_dir=$IMAGES_REG_DIR \ + --prompts="photo of a $CLASS_NAME" \ + --num_samples_per_prompt=200 \ + --use_ray_data + # Step 3: END +} + +# Skip step 2 and 3 if skip_image_setup=true + +if $skip_image_setup; then + echo "Skipping image downloading..." +else + download_image +fi + +if [ "$use_lora" = false ]; then + echo "Start full-finetuning..." + # Step 4: START + python train.py \ + --model_dir=$ORIG_MODEL_PATH \ + --output_dir=$TUNED_MODEL_DIR \ + --instance_images_dir=$IMAGES_OWN_DIR \ + --instance_prompt="photo of $UNIQUE_TOKEN $CLASS_NAME" \ + --class_images_dir=$IMAGES_REG_DIR \ + --class_prompt="photo of a $CLASS_NAME" \ + --train_batch_size=2 \ + --lr=5e-6 \ + --num_epochs=4 \ + --max_train_steps=200 \ + --num_workers $NUM_WORKERS + # Step 4: END +else + echo "Start LoRA finetuning..." 
+ python train.py \ + --use_lora \ --model_dir=$ORIG_MODEL_PATH \ --output_dir=$TUNED_MODEL_DIR \ --instance_images_dir=$IMAGES_OWN_DIR \ @@ -71,23 +124,31 @@ python train.py \ --class_images_dir=$IMAGES_REG_DIR \ --class_prompt="photo of a $CLASS_NAME" \ --train_batch_size=2 \ - --lr=5e-6 \ - --num_epochs=10 \ - --max_train_steps=400 \ + --lr=1e-4 \ + --num_epochs=4 \ + --max_train_steps=200 \ --num_workers $NUM_WORKERS -# Step 4: END +fi # Clear new dir rm -rf "$IMAGES_NEW_DIR"/*.jpg -# TODO: Change the prompt to something more interesting! -# Step 5: START -python generate.py \ - --model_dir=$TUNED_MODEL_DIR \ +if [ "$use_lora" = false ]; then + # Step 5: START + python generate.py \ + --model_dir=$TUNED_MODEL_DIR \ + --output_dir=$IMAGES_NEW_DIR \ + --prompts="photo of a $UNIQUE_TOKEN $CLASS_NAME in a bucket" \ + --num_samples_per_prompt=5 + # Step 5: END +else + python generate.py \ + --model_dir=$ORIG_MODEL_PATH \ + --lora_weights_dir=$TUNED_MODEL_DIR \ --output_dir=$IMAGES_NEW_DIR \ - --prompts="photo of a $UNIQUE_TOKEN $CLASS_NAME" \ + --prompts="photo of a $UNIQUE_TOKEN $CLASS_NAME in a bucket" \ --num_samples_per_prompt=5 -# Step 5: END +fi # Save artifact mkdir -p /tmp/artifacts diff --git a/doc/source/templates/05_dreambooth_finetuning/playground.ipynb b/doc/source/templates/05_dreambooth_finetuning/playground.ipynb index ad744b357fa0..fd9673c3a9a1 100644 --- a/doc/source/templates/05_dreambooth_finetuning/playground.ipynb +++ b/doc/source/templates/05_dreambooth_finetuning/playground.ipynb @@ -19,7 +19,10 @@ "source": [ "# TODO: Change this to the path of your fine-tuned model checkpoint!\n", "# This is the $TUNED_MODEL_DIR variable defined in the run script.\n", - "TUNED_MODEL_PATH = \"/tmp/model-tuned\"\n" + "TUNED_MODEL_PATH = \"/tmp/model-tuned\"\n", + "# TODO: Set the following variables if you fine-tuned with LoRA.\n", + "ORIG_MODEL_PATH = \"/tmp/model-orig/models--CompVis--stable-diffusion-v1-4/snapshots/b95be7d6f134c3a9e62ee616f310733567f069ce/\"\n", + "LORA_WEIGHTS_DIR = \"/tmp/model-tuned\"" ] }, { @@ -38,13 +41,26 @@ }, "outputs": [], "source": [ + "from os import environ\n", + "\n", "import torch\n", "from diffusers import DiffusionPipeline\n", "\n", - "pipeline = DiffusionPipeline.from_pretrained(\n", - " TUNED_MODEL_PATH, torch_dtype=torch.float16\n", - ")\n", - "pipeline.to(\"cuda\")\n" + "from dreambooth.generate_utils import load_lora_weights, get_pipeline\n", + "\n", + "pipeline = None\n", + "\n", + "def on_full_ft():\n", + " global pipeline\n", + " pipeline = get_pipeline(TUNED_MODEL_PATH)\n", + " pipeline.to(\"cuda\")\n", + " \n", + "def on_lora_ft():\n", + " assert ORIG_MODEL_PATH\n", + " assert LORA_WEIGHTS_DIR\n", + " global pipeline\n", + " pipeline = get_pipeline(ORIG_MODEL_PATH, LORA_WEIGHTS_DIR)\n", + " pipeline.to(\"cuda\")\n" ] }, { @@ -91,6 +107,26 @@ "# you can give the prompt \"photo of a unqtkn dog on the beach\".\n", "\n", "# IPython GUI Layouts\n", + "\n", + "output = widgets.Output()\n", + "toggle_buttons = widgets.ToggleButtons(\n", + " options=[\"Full fine-tuning\",\"LoRA fine-tuning\"],\n", + " disabled=False,\n", + " button_style='', # 'success', 'info', 'warning', 'danger' or ''\n", + " value=None,\n", + " # layout=widgets.Layout(width='100px')\n", + ")\n", + "\n", + "def toggle_callback(change):\n", + " with output:\n", + " clear_output()\n", + " if change[\"new\"] == \"Full fine-tuning\":\n", + " on_full_ft()\n", + " else:\n", + " on_lora_ft()\n", + " \n", + "toggle_buttons.observe(toggle_callback, names=\"value\")\n", + " \n", 
"input_text = widgets.Text(\n", " value=\"photo of a unqtkn dog on the beach\",\n", " placeholder=\"\",\n", @@ -100,7 +136,6 @@ ")\n", "\n", "button = widgets.Button(description=\"Generate!\")\n", - "output = widgets.Output()\n", "\n", "# Define button click event\n", "def on_button_clicked(b):\n", @@ -120,15 +155,21 @@ "button.on_click(on_button_clicked)\n", "\n", "# Display the widgets\n", - "display(widgets.HBox([input_text, button]), output)\n" + "display(toggle_buttons, widgets.HBox([input_text, button]), output)\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], - "source": [] + "source": [ + "# release memory properly\n", + "del pipeline \n", + "torch.cuda.empty_cache()" + ] } ], "metadata": { @@ -147,7 +188,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.15" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb b/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb index 7abf5af46b00..f0b2136c19d4 100644 --- a/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb +++ b/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb @@ -64,7 +64,7 @@ "### Local Storage\n", "To demonstrate offline inference, we need to download and consolidate the model checkpoint onto the head node. This action requires around 200GB disk storage. Therefore, we mounted the NVMe SSD provided by g5 instances at `/dev/nvme1n1` to `/mnt/local_storage`, and we will save the checkpoints in this folder.\n", "\n", - "For more details, please refer to[Amazon EBS and NVMe on Linux instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/nvme-ebs-volumes.html).\n" + "For more details, see [Amazon EBS and NVMe on Linux instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/nvme-ebs-volumes.html).\n" ] }, { diff --git a/doc/source/train/user-guides/monitoring-logging.rst b/doc/source/train/user-guides/monitoring-logging.rst index b57d3513ea69..31d7e532543b 100644 --- a/doc/source/train/user-guides/monitoring-logging.rst +++ b/doc/source/train/user-guides/monitoring-logging.rst @@ -1,7 +1,7 @@ .. 
_train-monitoring-and-logging: -Monitoring and Logging -====================== +Monitoring and Logging Metrics +============================== Ray Train provides an API for reporting intermediate results and checkpoints from the training function (run on distributed workers) up to the diff --git a/java/dependencies.bzl b/java/dependencies.bzl index 18acad420ed9..5c7a99f36cbb 100644 --- a/java/dependencies.bzl +++ b/java/dependencies.bzl @@ -4,10 +4,10 @@ load("@rules_jvm_external//:specs.bzl", "maven") def gen_java_deps(): maven_install( artifacts = [ - "com.fasterxml.jackson.core:jackson-databind:2.13.5", + "com.fasterxml.jackson.core:jackson-databind:2.15.2", "com.github.java-json-tools:json-schema-validator:2.2.14", "com.google.code.gson:gson:2.9.1", - "com.google.guava:guava:30.0-jre", + "com.google.guava:guava:32.0.1-jre", "com.google.protobuf:protobuf-java:3.19.6", "com.google.protobuf:protobuf-java-util:3.19.6", "com.puppycrawl.tools:checkstyle:8.15", diff --git a/java/test/src/main/java/io/ray/test/CrossLanguageInvocationTest.java b/java/test/src/main/java/io/ray/test/CrossLanguageInvocationTest.java index 39721b81ff5e..89c730790faf 100644 --- a/java/test/src/main/java/io/ray/test/CrossLanguageInvocationTest.java +++ b/java/test/src/main/java/io/ray/test/CrossLanguageInvocationTest.java @@ -241,14 +241,14 @@ public void testCallingCppActor() { Assert.assertEquals(b.get(), "C++ Worker".getBytes()); ObjectRef b2 = - actor.task(CppActorMethod.of("echoBytes", byte[].class), "C++ Worker".getBytes()).remote(); + actor.task(CppActorMethod.of("EchoBytes", byte[].class), "C++ Worker".getBytes()).remote(); Assert.assertEquals(b2.get(), "C++ Worker".getBytes()); ObjectRef b3 = - actor.task(CppActorMethod.of("echoBytes", byte[].class), new byte[0]).remote(); + actor.task(CppActorMethod.of("EchoBytes", byte[].class), new byte[0]).remote(); Assert.assertEquals(b3.get(), new byte[0]); - ObjectRef b4 = actor.task(CppActorMethod.of("echoBytes", byte[].class), null).remote(); + ObjectRef b4 = actor.task(CppActorMethod.of("EchoBytes", byte[].class), null).remote(); Assert.assertThrows(CrossLanguageException.class, () -> b4.get()); // Test get cpp actor by actor name. 
@@ -268,6 +268,45 @@ public void testCallingCppActor() { CppActorHandle actor3 = optional3.get(); ObjectRef res4 = actor3.task(CppActorMethod.of("Plus1", Integer.class)).remote(); Assert.assertEquals(res4.get(), Integer.valueOf(1)); + Object[] strings = new Object[] {"str1", "str2"}; + ObjectRef ref_strings = + actor.task(CppActorMethod.of("EchoStrings", Object[].class), strings).remote(); + Assert.assertEquals(ref_strings.get(), strings); + + if (System.getProperty("os.name").toUpperCase().indexOf("Windows") == -1) { + // Testing the supported parameter types for c++ worker std::any type + Object[] inputs = + new Object[] { + true, // Boolean + Byte.MAX_VALUE, // Byte + Short.MAX_VALUE, // Short + Integer.MAX_VALUE, // Integer + Long.MAX_VALUE, // Long + Long.MIN_VALUE, + BigInteger.valueOf(Long.MAX_VALUE), // BigInteger + "Hello World!", // String + 1.234f, // Float + 1.234, // Double + "binary".getBytes() // byte[] + }; + ObjectRef ref_objs = + actor.task(CppActorMethod.of("EchoAnyArray", Object[].class), inputs).remote(); + Object[] objs_result = ref_objs.get(); + Assert.assertEquals(objs_result.length, inputs.length); + Assert.assertEquals((Boolean) objs_result[0], inputs[0]); + Assert.assertEquals((Byte) objs_result[1], inputs[1]); + Assert.assertEquals((Short) objs_result[2], inputs[2]); + Assert.assertEquals((Integer) objs_result[3], inputs[3]); + Assert.assertEquals((Long) objs_result[4], inputs[4]); + Assert.assertEquals((Long) objs_result[5], inputs[5]); + // BigInteger change to Long + Assert.assertEquals((Long) objs_result[6], Long.MAX_VALUE); + Assert.assertEquals((String) objs_result[7], inputs[7]); + // Float change to double + Assert.assertTrue(Math.abs((Double) objs_result[8] - 1.234) < 0.000001); + Assert.assertTrue(Math.abs((Double) objs_result[9] - (Double) inputs[9]) < 0.000001); + Assert.assertEquals((byte[]) objs_result[10], (byte[]) inputs[10]); + } } @Test diff --git a/python/ray/data/BUILD b/python/ray/data/BUILD index f750667250d1..aeab7ff21823 100644 --- a/python/ray/data/BUILD +++ b/python/ray/data/BUILD @@ -219,7 +219,7 @@ py_test( py_test( name = "test_numpy", - size = "small", + size = "medium", srcs = ["tests/test_numpy.py"], tags = ["team:data", "exclusive"], deps = ["//:ray_lib", ":conftest"], diff --git a/python/ray/data/_internal/arrow_block.py b/python/ray/data/_internal/arrow_block.py index 2078aea57575..dbdc3738d9ed 100644 --- a/python/ray/data/_internal/arrow_block.py +++ b/python/ray/data/_internal/arrow_block.py @@ -1,3 +1,4 @@ +import bisect import collections import heapq import random @@ -24,7 +25,7 @@ is_valid_udf_return, ) from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder -from ray.data._internal.util import _truncated_repr, find_partitions +from ray.data._internal.util import _truncated_repr from ray.data.block import ( Block, BlockAccessor, @@ -426,7 +427,41 @@ def sort_and_partition( if len(boundaries) == 0: return [table] - return find_partitions(table, boundaries, sort_key) + partitions = [] + # For each boundary value, count the number of items that are less + # than it. Since the block is sorted, these counts partition the items + # such that boundaries[i] <= x < boundaries[i + 1] for each x in + # partition[i]. If `descending` is true, `boundaries` would also be + # in descending order and we only need to count the number of items + # *greater than* the boundary value instead. 
+ + def find_partition_index(records, boundary, sort_key): + if sort_key.get_descending(): + return len(records) - bisect.bisect_left(records[::-1], boundary) + else: + return bisect.bisect_left(records, boundary) + + def searchsorted(table, boundaries, sort_key): + records = [ + tuple(d.values()) + for d in transform_pyarrow.to_pylist( + table.select(sort_key.get_columns()) + ) + ] + return [ + find_partition_index(records, boundary, sort_key) + for boundary in boundaries + ] + + bounds = searchsorted(table, boundaries, sort_key) + + partitions = [] + last_idx = 0 + for idx in bounds: + partitions.append(table.slice(last_idx, idx - last_idx)) + last_idx = idx + partitions.append(table.slice(last_idx)) + return partitions def combine(self, key: str, aggs: Tuple["AggregateFn"]) -> Block: """Combine rows with the same key into an accumulator. diff --git a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py index 40164d6add85..badae9aed3fe 100644 --- a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py +++ b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py @@ -286,3 +286,17 @@ def combine_chunks(table: "pyarrow.Table") -> "pyarrow.Table": arr = col.combine_chunks() new_cols.append(arr) return pyarrow.Table.from_arrays(new_cols, schema=table.schema) + + +def to_pylist(table: "pyarrow.Table") -> "pyarrow.Table": + """Convert the Table to a list of rows / dictionaries. + + Required for compatibility with Arrow 6. + """ + pydict = table.to_pydict() + names = table.schema.names + pylist = [ + {column: pydict[column][row] for column in names} + for row in range(table.num_rows) + ] + return pylist diff --git a/python/ray/data/_internal/pandas_block.py b/python/ray/data/_internal/pandas_block.py index fb10f9c22577..4c3b204cc307 100644 --- a/python/ray/data/_internal/pandas_block.py +++ b/python/ray/data/_internal/pandas_block.py @@ -1,3 +1,4 @@ +import bisect import collections import heapq from typing import ( @@ -17,7 +18,6 @@ from ray.air.constants import TENSOR_COLUMN_NAME from ray.data._internal.table_block import TableBlockAccessor, TableBlockBuilder -from ray.data._internal.util import find_partitions from ray.data.block import ( Block, BlockAccessor, @@ -357,7 +357,38 @@ def sort_and_partition( if len(boundaries) == 0: return [table] - return find_partitions(table, boundaries, sort_key) + partitions = [] + # For each boundary value, count the number of items that are less + # than it. Since the block is sorted, these counts partition the items + # such that boundaries[i] <= x < boundaries[i + 1] for each x in + # partition[i]. If `descending` is true, `boundaries` would also be + # in descending order and we only need to count the number of items + # *greater than* the boundary value instead. 
+ + def find_partition_index(records, boundary, sort_key): + if sort_key.get_descending(): + return len(records) - bisect.bisect_left(records[::-1], boundary) + else: + return bisect.bisect_left(records, boundary) + + def searchsorted(table, boundaries, sort_key): + records = list( + table[sort_key.get_columns()].itertuples(index=False, name=None) + ) + + return [ + find_partition_index(records, boundary, sort_key) + for boundary in boundaries + ] + + bounds = searchsorted(table, boundaries, sort_key) + + last_idx = 0 + for idx in bounds: + partitions.append(table[last_idx:idx]) + last_idx = idx + partitions.append(table[last_idx:]) + return partitions def combine(self, key: str, aggs: Tuple["AggregateFn"]) -> "pandas.DataFrame": """Combine rows with the same key into an accumulator. diff --git a/python/ray/data/_internal/util.py b/python/ray/data/_internal/util.py index 5e06192d362b..4aaddd437b4b 100644 --- a/python/ray/data/_internal/util.py +++ b/python/ray/data/_internal/util.py @@ -19,7 +19,6 @@ import pyarrow from ray.data._internal.compute import ComputeStrategy - from ray.data._internal.sort import SortKey from ray.data.block import Block, BlockMetadata, UserDefinedFunction from ray.data.datasource import Reader from ray.util.placement_group import PlacementGroup @@ -516,69 +515,6 @@ def unify_block_metadata_schema( return None -def find_partition_index( - table: Union["pyarrow.Table", "pandas.DataFrame"], - desired: List[Any], - sort_key: "SortKey", -) -> int: - columns = sort_key.get_columns() - descending = sort_key.get_descending() - - left, right = 0, len(table) - for i in range(len(desired)): - if left == right: - return right - col_name = columns[i] - col_vals = table[col_name].to_numpy()[left:right] - desired_val = desired[i] - - prevleft = left - if descending is True: - left = prevleft + ( - len(col_vals) - - np.searchsorted( - col_vals, - desired_val, - side="right", - sorter=np.arange(len(col_vals) - 1, -1, -1), - ) - ) - right = prevleft + ( - len(col_vals) - - np.searchsorted( - col_vals, - desired_val, - side="left", - sorter=np.arange(len(col_vals) - 1, -1, -1), - ) - ) - else: - left = prevleft + np.searchsorted(col_vals, desired_val, side="left") - right = prevleft + np.searchsorted(col_vals, desired_val, side="right") - return right - - -def find_partitions(table, boundaries, sort_key): - partitions = [] - - # For each boundary value, count the number of items that are less - # than it. Since the block is sorted, these counts partition the items - # such that boundaries[i] <= x < boundaries[i + 1] for each x in - # partition[i]. If `descending` is true, `boundaries` would also be - # in descending order and we only need to count the number of items - # *greater than* the boundary value instead. - bounds = [ - find_partition_index(table, boundary, sort_key) for boundary in boundaries - ] - - last_idx = 0 - for idx in bounds: - partitions.append(table[last_idx:idx]) - last_idx = idx - partitions.append(table[last_idx:]) - return partitions - - def get_attribute_from_class_name(class_name: str) -> Any: """Get Python attribute from the provided class name. 
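Note: the two `sort_and_partition` changes above inline the boundary-partitioning logic that previously lived in `ray.data._internal.util.find_partitions`, now based on `bisect` over materialized row tuples instead of per-column `np.searchsorted`. A minimal self-contained sketch of the approach follows; the plain-list records and boundaries here are illustrative, not the accessors' exact internals.

```python
import bisect
from typing import Any, List, Tuple

Record = Tuple[Any, ...]


def find_partition_index(records: List[Record], boundary: Record, descending: bool = False) -> int:
    """Index at which `boundary` splits the sorted `records`."""
    if descending:
        # Records are sorted in descending order; bisect over the reversed
        # (ascending) view and map the index back, mirroring the diff above.
        return len(records) - bisect.bisect_left(records[::-1], boundary)
    return bisect.bisect_left(records, boundary)


def find_partitions(records: List[Record], boundaries: List[Record], descending: bool = False) -> List[List[Record]]:
    bounds = [find_partition_index(records, b, descending) for b in boundaries]
    partitions, last_idx = [], 0
    for idx in bounds:
        partitions.append(records[last_idx:idx])
        last_idx = idx
    partitions.append(records[last_idx:])
    return partitions


# Example: split a block sorted on one column at boundaries 3 and 7, so that
# partition[i] holds boundaries[i] <= x < boundaries[i + 1].
assert find_partitions([(1,), (3,), (5,), (7,), (9,)], [(3,), (7,)]) == [
    [(1,)],
    [(3,), (5,)],
    [(7,), (9,)],
]
```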
diff --git a/python/ray/includes/ray_config.pxd b/python/ray/includes/ray_config.pxd index 0a177b11eda6..a8845272e166 100644 --- a/python/ray/includes/ray_config.pxd +++ b/python/ray/includes/ray_config.pxd @@ -63,8 +63,6 @@ cdef extern from "ray/common/ray_config.h" nogil: c_bool start_python_importer_thread() const - c_bool use_ray_syncer() const - c_string REDIS_CA_CERT() const c_string REDIS_CA_PATH() const diff --git a/python/ray/includes/ray_config.pxi b/python/ray/includes/ray_config.pxi index d0d1ff4e09cb..0b1bf40ca8df 100644 --- a/python/ray/includes/ray_config.pxi +++ b/python/ray/includes/ray_config.pxi @@ -101,10 +101,6 @@ cdef class Config: def start_python_importer_thread(): return RayConfig.instance().start_python_importer_thread() - @staticmethod - def use_ray_syncer(): - return RayConfig.instance().use_ray_syncer() - @staticmethod def REDIS_CA_CERT(): return RayConfig.instance().REDIS_CA_CERT() diff --git a/python/ray/serve/BUILD b/python/ray/serve/BUILD index d9c6bb041c76..0ba8dacedf44 100644 --- a/python/ray/serve/BUILD +++ b/python/ray/serve/BUILD @@ -723,7 +723,7 @@ py_test( py_test( name = "test_proxy", - size = "small", + size = "medium", srcs = serve_tests_srcs, tags = ["exclusive", "team:serve"], deps = [":serve_lib"], diff --git a/python/ray/serve/_private/api.py b/python/ray/serve/_private/api.py index db04df80e7d6..f4a0e950873e 100644 --- a/python/ray/serve/_private/api.py +++ b/python/ray/serve/_private/api.py @@ -40,7 +40,7 @@ ) -def get_deployment(name: str): +def get_deployment(name: str, app_name: str = ""): """Dynamically fetch a handle to a Deployment object. Args: @@ -54,7 +54,7 @@ def get_deployment(name: str): ( deployment_info, route_prefix, - ) = get_global_client().get_deployment_info(name) + ) = get_global_client().get_deployment_info(name, app_name) except KeyError: raise KeyError( f"Deployment {name} was not found. Did you call Deployment.deploy()?" 
diff --git a/python/ray/serve/_private/application_state.py b/python/ray/serve/_private/application_state.py index a3262863fe27..a0c46fed50ea 100644 --- a/python/ray/serve/_private/application_state.py +++ b/python/ray/serve/_private/application_state.py @@ -15,11 +15,13 @@ from ray.serve.exceptions import RayServeException from ray.serve._private.common import ( + DeploymentID, DeploymentStatus, DeploymentStatusInfo, ApplicationStatusInfo, ApplicationStatus, EndpointInfo, + EndpointTag, DeploymentInfo, ) from ray.serve._private.constants import ( @@ -257,8 +259,9 @@ def _set_target_state_deleting(self): self._set_target_state(dict(), None, None, True) def _delete_deployment(self, name): - self._endpoint_state.delete_endpoint(name) - self._deployment_state_manager.delete_deployment(name) + id = EndpointTag(name, self._name) + self._endpoint_state.delete_endpoint(id) + self._deployment_state_manager.delete_deployment(str(id)) def delete(self): """Delete the application""" @@ -286,20 +289,20 @@ def apply_deployment_info( f'Invalid route prefix "{route_prefix}", it must start with "/"' ) - self._deployment_state_manager.deploy(deployment_name, deployment_info) + deployment_id = DeploymentID(deployment_name, self._name) + self._deployment_state_manager.deploy(str(deployment_id), deployment_info) if deployment_info.route_prefix is not None: config = deployment_info.deployment_config self._endpoint_state.update_endpoint( - deployment_name, + deployment_id, EndpointInfo( route=deployment_info.route_prefix, - app_name=self._name, app_is_cross_language=config.is_cross_language, ), ) else: - self._endpoint_state.delete_endpoint(deployment_name) + self._endpoint_state.delete_endpoint(deployment_id) def deploy(self, deployment_infos: Dict[str, DeploymentInfo]): """Deploy application from list of deployment infos. @@ -388,7 +391,9 @@ def deploy_config( ) def _get_live_deployments(self) -> List[str]: - return self._deployment_state_manager.get_deployments_in_application(self._name) + deps = self._deployment_state_manager.get_deployments_in_application(self._name) + prefix = self._name + DEPLOYMENT_NAME_PREFIX_SEPARATOR + return [deployment[len(prefix) :] for deployment in deps] def _determine_app_status(self) -> Tuple[ApplicationStatus, str]: """Check deployment statuses and target state, and determine the @@ -617,13 +622,16 @@ def get_checkpoint_data(self) -> ApplicationTargetState: def get_deployment(self, name: str) -> DeploymentInfo: """Get deployment info for deployment by name.""" - return self._deployment_state_manager.get_deployment(name) + deployment_id = DeploymentID(name, self._name) + return self._deployment_state_manager.get_deployment(str(deployment_id)) def get_deployments_statuses(self) -> List[DeploymentStatusInfo]: """Return all deployment status information""" - return self._deployment_state_manager.get_deployment_statuses( - self.target_deployments - ) + deployments = [ + str(DeploymentID(deployment, self._name)) + for deployment in self.target_deployments + ] + return self._deployment_state_manager.get_deployment_statuses(deployments) def get_application_status_info(self) -> ApplicationStatusInfo: """Return the application status information""" @@ -644,9 +652,13 @@ def list_deployment_details(self) -> Dict[str, DeploymentDetails]: deployments, or when the application is deleting and some deployments have been deleted. 
""" + deployments = [ + str(DeploymentID(deployment, self._name)) + for deployment in self.target_deployments + ] details = { name: self._deployment_state_manager.get_deployment_details(name) - for name in self.target_deployments + for name in deployments } return {k: v for k, v in details.items() if v is not None} @@ -911,10 +923,7 @@ def build_serve_application( # Check that all deployments specified in config are valid for deployment_name in config_deployments: - unique_deployment_name = ( - (name + DEPLOYMENT_NAME_PREFIX_SEPARATOR) if len(name) else "" - ) + deployment_name - if unique_deployment_name not in app.deployments: + if deployment_name not in app.deployments: raise KeyError( f'There is no deployment named "{deployment_name}" in the ' f'application "{name}".' @@ -967,10 +976,7 @@ def override_deployment_info( # Override options for each deployment listed in the config. for options in deployment_override_options: deployment_name = options["name"] - unique_deployment_name = ( - (app_name + DEPLOYMENT_NAME_PREFIX_SEPARATOR) if len(app_name) else "" - ) + deployment_name - info = deployment_infos[unique_deployment_name] + info = deployment_infos[deployment_name] if ( info.deployment_config.autoscaling_config is not None @@ -1030,7 +1036,7 @@ def override_deployment_info( original_options.update(options) override_options["deployment_config"] = DeploymentConfig(**original_options) - deployment_infos[unique_deployment_name] = info.update(**override_options) + deployment_infos[deployment_name] = info.update(**override_options) # Overwrite ingress route prefix app_route_prefix = config_dict.get("route_prefix", DEFAULT.VALUE) diff --git a/python/ray/serve/_private/client.py b/python/ray/serve/_private/client.py index c2a8bbe12513..f61b0fecdc8f 100644 --- a/python/ray/serve/_private/client.py +++ b/python/ray/serve/_private/client.py @@ -8,6 +8,7 @@ import ray from ray.actor import ActorHandle from ray.serve._private.common import ( + DeploymentID, DeploymentInfo, DeploymentStatus, StatusOverview, @@ -190,7 +191,9 @@ def _wait_for_deployment_deleted(self, name: str, timeout_s: int = 60): else: raise TimeoutError(f"Deployment {name} wasn't deleted after {timeout_s}s.") - def _wait_for_deployment_created(self, name: str, timeout_s: int = -1): + def _wait_for_deployment_created( + self, deployment_name: str, app_name: str, timeout_s: int = -1 + ): """Waits for the named deployment to be created. A deployment being created simply means that its been registered @@ -201,17 +204,25 @@ def _wait_for_deployment_created(self, name: str, timeout_s: int = -1): Raises TimeoutError if this doesn't happen before timeout_s. """ + deployment_id = DeploymentID(deployment_name, app_name) start = time.time() while time.time() - start < timeout_s or timeout_s < 0: - status_bytes = ray.get(self._controller.get_deployment_status.remote(name)) + status_bytes = ray.get( + self._controller.get_deployment_status.remote(str(deployment_id)) + ) if status_bytes is not None: break + logger.debug( + f"Waiting for deployment '{deployment_id.name}' in application " + f"'{deployment_id.app}' to be created." + ) time.sleep(CLIENT_CHECK_CREATION_POLLING_INTERVAL_S) else: raise TimeoutError( - f"Deployment {name} did not become HEALTHY after {timeout_s}s." + f"Deployment '{deployment_id}' in application '{deployment_id.app}' " + f"did not become HEALTHY after {timeout_s}s." 
) def _wait_for_application_running(self, name: str, timeout_s: int = -1): @@ -410,9 +421,11 @@ def delete_deployments(self, names: Iterable[str], blocking: bool = True) -> Non self._wait_for_deployment_deleted(name) @_ensure_connected - def get_deployment_info(self, name: str) -> Tuple[DeploymentInfo, str]: + def get_deployment_info( + self, name: str, app_name: str + ) -> Tuple[DeploymentInfo, str]: deployment_route = DeploymentRoute.FromString( - ray.get(self._controller.get_deployment_info.remote(name)) + ray.get(self._controller.get_deployment_info.remote(name, app_name)) ) return ( DeploymentInfo.from_proto(deployment_route.deployment_info), @@ -462,6 +475,7 @@ def get_serve_details(self) -> Dict: def get_handle( self, deployment_name: str, + app_name: Optional[str] = "default", missing_ok: Optional[bool] = False, sync: bool = True, _is_for_http_requests: bool = False, @@ -481,24 +495,30 @@ def get_handle( Returns: RayServeHandle """ - cache_key = (deployment_name, missing_ok, sync) + cache_key = (deployment_name, app_name, missing_ok, sync) if cache_key in self.handle_cache: cached_handle = self.handle_cache[cache_key] if cached_handle._is_same_loop: return cached_handle all_deployments = ray.get(self._controller.list_deployment_names.remote()) - if not missing_ok and deployment_name not in all_deployments: - raise KeyError(f"Deployment '{deployment_name}' does not exist.") + deployment_id = DeploymentID(deployment_name, app_name) + if not missing_ok and str(deployment_id) not in all_deployments: + raise KeyError( + f"Deployment '{deployment_name}' in application '{app_name}' does not " + "exist." + ) if sync: handle = RayServeSyncHandle( deployment_name, + app_name, _is_for_http_requests=_is_for_http_requests, ) else: handle = RayServeHandle( deployment_name, + app_name, _is_for_http_requests=_is_for_http_requests, ) diff --git a/python/ray/serve/_private/common.py b/python/ray/serve/_private/common.py index 94fdcb926cd8..e334675cef69 100644 --- a/python/ray/serve/_private/common.py +++ b/python/ray/serve/_private/common.py @@ -1,7 +1,7 @@ from enum import Enum from dataclasses import dataclass, field, asdict import json -from typing import Any, List, Dict, Optional +from typing import Any, List, Dict, Optional, NamedTuple import ray from ray.actor import ActorHandle @@ -16,8 +16,21 @@ StatusOverview as StatusOverviewProto, ) from ray.serve._private.autoscaling_policy import BasicAutoscalingPolicy +from ray.serve._private.constants import DEPLOYMENT_NAME_PREFIX_SEPARATOR -EndpointTag = str + +class DeploymentID(NamedTuple): + name: str + app: str + + def __str__(self): + if self.app: + return f"{self.app}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}{self.name}" + else: + return self.name + + +EndpointTag = DeploymentID ReplicaTag = str NodeId = str Duration = float @@ -27,7 +40,6 @@ @dataclass class EndpointInfo: route: str - app_name: str app_is_cross_language: bool = False diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py index c4556b31ebb8..c2cdfc5152ad 100644 --- a/python/ray/serve/_private/constants.py +++ b/python/ray/serve/_private/constants.py @@ -248,3 +248,8 @@ RAY_SERVE_ENABLE_MEMORY_PROFILING = ( os.environ.get("RAY_SERVE_ENABLE_MEMORY_PROFILING", "0") == "1" ) + +# Enable cProfile in all Serve actors. 
+RAY_SERVE_ENABLE_CPU_PROFILING = ( + os.environ.get("RAY_SERVE_ENABLE_CPU_PROFILING", "0") == "1" +) diff --git a/python/ray/serve/_private/deploy_utils.py b/python/ray/serve/_private/deploy_utils.py index b4d61c8d1549..29f946ee3240 100644 --- a/python/ray/serve/_private/deploy_utils.py +++ b/python/ray/serve/_private/deploy_utils.py @@ -7,7 +7,7 @@ from ray.serve.config import ReplicaConfig, DeploymentConfig from ray.serve.schema import ServeApplicationSchema from ray.serve._private.constants import SERVE_LOGGER_NAME -from ray.serve._private.common import DeploymentInfo +from ray.serve._private.common import DeploymentInfo, DeploymentID import ray import ray.util.serialization_addons @@ -111,7 +111,7 @@ def deploy_args_to_deployment_info( ).hex() return DeploymentInfo( - actor_name=deployment_name, + actor_name=str(DeploymentID(deployment_name, app_name)), version=version, deployment_config=deployment_config, replica_config=replica_config, diff --git a/python/ray/serve/_private/deployment_function_node.py b/python/ray/serve/_private/deployment_function_node.py index 09d38c35549e..379a312a61d9 100644 --- a/python/ray/serve/_private/deployment_function_node.py +++ b/python/ray/serve/_private/deployment_function_node.py @@ -15,6 +15,7 @@ def __init__( self, func_body: Union[Callable, str], deployment_name, + app_name, func_args, func_kwargs, func_options, @@ -22,6 +23,7 @@ def __init__( ): self._body = func_body self._deployment_name = deployment_name + self._app_name = app_name super().__init__( func_args, func_kwargs, @@ -63,7 +65,7 @@ def __init__( _internal=True, ) - self._deployment_handle = RayServeHandle(self._deployment.name) + self._deployment_handle = RayServeHandle(self._deployment.name, self._app_name) def _copy_impl( self, @@ -75,6 +77,7 @@ def _copy_impl( return DeploymentFunctionNode( self._body, self._deployment_name, + self._app_name, new_args, new_kwargs, new_options, diff --git a/python/ray/serve/_private/deployment_graph_build.py b/python/ray/serve/_private/deployment_graph_build.py index d2aaf3fac82e..ea2c98e33abd 100644 --- a/python/ray/serve/_private/deployment_graph_build.py +++ b/python/ray/serve/_private/deployment_graph_build.py @@ -6,7 +6,7 @@ from ray.serve.deployment import Deployment, schema_to_deployment from ray.serve.deployment_graph import RayServeDAGHandle -from ray.serve._private.constants import DEPLOYMENT_NAME_PREFIX_SEPARATOR +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME from ray.serve._private.deployment_method_node import DeploymentMethodNode from ray.serve._private.deployment_node import DeploymentNode from ray.serve._private.deployment_function_node import DeploymentFunctionNode @@ -33,7 +33,9 @@ from ray.experimental.gradio_utils import type_to_string -def build(ray_dag_root_node: DAGNode, name: str = None) -> List[Deployment]: +def build( + ray_dag_root_node: DAGNode, name: str = SERVE_DEFAULT_APP_NAME +) -> List[Deployment]: """Do all the DAG transformation, extraction and generation needed to produce a runnable and deployable serve pipeline application from a valid DAG authored with Ray DAG API. @@ -151,7 +153,7 @@ def get_and_validate_ingress_deployment( def transform_ray_dag_to_serve_dag( - dag_node: DAGNode, node_name_generator: _DAGNodeNameGenerator, name: str = None + dag_node: DAGNode, node_name_generator: _DAGNodeNameGenerator, app_name: str ): """ Transform a Ray DAG to a Serve DAG. 
Map ClassNode to DeploymentNode with @@ -175,7 +177,7 @@ def replace_with_handle(node): if isinstance(node, DeploymentNode) or isinstance( node, DeploymentFunctionNode ): - return RayServeHandle(node._deployment.name) + return RayServeHandle(node._deployment.name, app_name) elif isinstance(node, DeploymentExecutorNode): return node._deployment_handle @@ -216,9 +218,6 @@ def replace_with_handle(node): ): deployment_name = deployment_shell.name - if name: - deployment_name = name + DEPLOYMENT_NAME_PREFIX_SEPARATOR + deployment_name - # Set the route prefix, prefer the one user supplied, # otherwise set it to /deployment_name if ( @@ -241,6 +240,7 @@ def replace_with_handle(node): return DeploymentNode( deployment, + app_name, dag_node.get_args(), dag_node.get_kwargs(), dag_node.get_options(), @@ -262,6 +262,7 @@ def replace_with_handle(node): return DeploymentMethodNode( parent_deployment_node._deployment, dag_node._method_name, + app_name, dag_node.get_args(), dag_node.get_kwargs(), dag_node.get_options(), @@ -290,13 +291,10 @@ def replace_with_handle(node): ): deployment_name = schema.name - # Update the deployment name if the application name provided. - if name: - deployment_name = name + DEPLOYMENT_NAME_PREFIX_SEPARATOR + deployment_name - return DeploymentFunctionNode( dag_node._body, deployment_name, + app_name, dag_node.get_args(), dag_node.get_kwargs(), dag_node.get_options(), diff --git a/python/ray/serve/_private/deployment_method_node.py b/python/ray/serve/_private/deployment_method_node.py index 02230163465e..bb820ede4b24 100644 --- a/python/ray/serve/_private/deployment_method_node.py +++ b/python/ray/serve/_private/deployment_method_node.py @@ -13,6 +13,7 @@ def __init__( self, deployment: Deployment, deployment_method_name: str, + app_name: str, method_args: Tuple[Any], method_kwargs: Dict[str, Any], method_options: Dict[str, Any], @@ -20,6 +21,7 @@ def __init__( ): self._deployment = deployment self._deployment_method_name: str = deployment_method_name + self._app_name = app_name self._deployment_handle = other_args_to_resolve[PARENT_CLASS_NODE_KEY] super().__init__( method_args, @@ -38,6 +40,7 @@ def _copy_impl( return DeploymentMethodNode( self._deployment, self._deployment_method_name, + self._app_name, new_args, new_kwargs, new_options, diff --git a/python/ray/serve/_private/deployment_node.py b/python/ray/serve/_private/deployment_node.py index 3e6b49fbfeec..bedf5d2ea5ae 100644 --- a/python/ray/serve/_private/deployment_node.py +++ b/python/ray/serve/_private/deployment_node.py @@ -17,6 +17,7 @@ def __init__( # For serve structured deployment, deployment body can be import path # to the class or function instead. 
deployment: Deployment, + app_name: str, deployment_init_args: Tuple[Any], deployment_init_kwargs: Dict[str, Any], ray_actor_options: Dict[str, Any], @@ -30,7 +31,8 @@ def __init__( other_args_to_resolve=other_args_to_resolve, ) self._deployment = deployment - self._deployment_handle = RayServeHandle(self._deployment.name) + self._app_name = app_name + self._deployment_handle = RayServeHandle(self._deployment.name, app_name) def _copy_impl( self, @@ -41,6 +43,7 @@ def _copy_impl( ): return DeploymentNode( self._deployment, + self._app_name, new_args, new_kwargs, new_options, @@ -53,6 +56,7 @@ def __getattr__(self, method_name: str): call_node = DeploymentMethodNode( self._deployment, method_name, + self._app_name, (), {}, {}, diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py index a3415d5489b8..82228b04b90b 100644 --- a/python/ray/serve/_private/deployment_state.py +++ b/python/ray/serve/_private/deployment_state.py @@ -23,6 +23,7 @@ from ray.serve._private.autoscaling_metrics import InMemoryMetricsStore from ray.serve._private.common import ( + DeploymentID, DeploymentInfo, DeploymentStatus, DeploymentStatusInfo, @@ -45,6 +46,7 @@ REPLICA_HEALTH_CHECK_UNHEALTHY_THRESHOLD, SERVE_LOGGER_NAME, SERVE_NAMESPACE, + DEPLOYMENT_NAME_PREFIX_SEPARATOR, ) from ray.serve.generated.serve_pb2 import DeploymentLanguage from ray.serve._private.long_poll import LongPollHost, LongPollNamespace @@ -1301,8 +1303,21 @@ def notify_running_replicas_changed(self) -> None: ): return + prefix = ( + self.app_name + DEPLOYMENT_NAME_PREFIX_SEPARATOR if self.app_name else "" + ) + deployment_id = DeploymentID(self._name[len(prefix) :], self.app_name) + self._long_poll_host.notify_changed( - (LongPollNamespace.RUNNING_REPLICAS, self._name), + (LongPollNamespace.RUNNING_REPLICAS, deployment_id), + running_replica_infos, + ) + # NOTE(zcin): notify changed for Java routers. Since Java only + # supports 1.x API, there is no concept of applications in Java, + # so the key should remain a string describing the deployment + # name. If there are no Java routers, this is a no-op. 
+ self._long_poll_host.notify_changed( + (LongPollNamespace.RUNNING_REPLICAS, str(deployment_id)), running_replica_infos, ) self._last_notified_running_replica_infos = running_replica_infos @@ -2446,22 +2461,29 @@ def get_running_replica_infos( for name, deployment_state in self._deployment_states.items() } - def get_deployment_configs( + def get_deployment_infos( self, filter_tag: Optional[str] = None, include_deleted: Optional[bool] = False - ) -> Dict[str, DeploymentConfig]: - configs: Dict[str, DeploymentConfig] = {} + ) -> Dict[DeploymentID, DeploymentInfo]: + infos: Dict[str, DeploymentInfo] = {} for deployment_name, deployment_state in self._deployment_states.items(): if filter_tag is None or deployment_name == filter_tag: - configs[ - deployment_name - ] = deployment_state.target_info.deployment_config + app_name = deployment_state.target_info.app_name + prefix = app_name + DEPLOYMENT_NAME_PREFIX_SEPARATOR if app_name else "" + deployment_id = DeploymentID(deployment_name[len(prefix) :], app_name) + infos[deployment_id] = deployment_state.target_info if include_deleted: for name, info in self._deleted_deployment_metadata.items(): if filter_tag is None or name == filter_tag: - configs[name] = info.deployment_config + prefix = ( + info.app_name + DEPLOYMENT_NAME_PREFIX_SEPARATOR + if info.app_name + else "" + ) + deployment_id = DeploymentID(name[len(prefix) :], info.app_name) + infos[deployment_id] = info - return configs + return infos def get_deployment( self, deployment_name: str, include_deleted: Optional[bool] = False diff --git a/python/ray/serve/_private/http_proxy.py b/python/ray/serve/_private/http_proxy.py index ec7cee677321..93a0dd078b49 100644 --- a/python/ray/serve/_private/http_proxy.py +++ b/python/ray/serve/_private/http_proxy.py @@ -56,6 +56,7 @@ from ray.serve._private.logging_utils import ( access_log_msg, configure_component_logger, + configure_component_cpu_profiler, configure_component_memory_profiler, get_component_logger_file_path, ) @@ -116,9 +117,9 @@ def __init__(self, get_handle: Callable): # Routes sorted in order of decreasing length. self.sorted_routes: List[str] = list() # Endpoints associated with the routes. - self.route_info: Dict[str, Tuple[EndpointTag, ApplicationName]] = dict() + self.route_info: Dict[str, EndpointTag] = dict() # Contains a ServeHandle for each endpoint. - self.handles: Dict[str, RayServeHandle] = dict() + self.handles: Dict[EndpointTag, RayServeHandle] = dict() # Map of application name to is_cross_language. self.app_to_is_cross_language: Dict[ApplicationName, bool] = dict() @@ -136,12 +137,14 @@ def update_routes(self, endpoints: Dict[EndpointTag, EndpointInfo]) -> None: app_to_is_cross_language = {} for endpoint, info in endpoints.items(): routes.append(info.route) - route_info[info.route] = (endpoint, info.app_name) - app_to_is_cross_language[info.app_name] = info.app_is_cross_language + route_info[info.route] = endpoint + app_to_is_cross_language[endpoint.app] = info.app_is_cross_language if endpoint in self.handles: existing_handles.remove(endpoint) else: - self.handles[endpoint] = self._get_handle(endpoint).options( + self.handles[endpoint] = self._get_handle( + endpoint.name, endpoint.app + ).options( # Streaming codepath isn't supported for Java. 
stream=( RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING @@ -193,12 +196,12 @@ def match_route( matched = True if matched: - endpoint, app_name = self.route_info[route] + endpoint = self.route_info[route] return ( route, self.handles[endpoint], - app_name, - self.app_to_is_cross_language[app_name], + endpoint.app, + self.app_to_is_cross_language[endpoint.app], ) return None @@ -251,9 +254,10 @@ def __init__( extra={"log_to_stderr": False}, ) - def get_handle(name): + def get_handle(deployment_name, app_name): return serve.context.get_global_client().get_handle( - name, + deployment_name, + app_name, sync=False, missing_ok=True, _is_for_http_requests=True, @@ -346,7 +350,7 @@ def _is_draining(self) -> bool: return self._draining_start_time is not None def _update_routes(self, endpoints: Dict[EndpointTag, EndpointInfo]) -> None: - self.route_info: Dict[str, Tuple[EndpointTag, List[str]]] = dict() + self.route_info: Dict[str, EndpointTag] = dict() for endpoint, info in endpoints.items(): route = info.route self.route_info[route] = endpoint @@ -563,7 +567,7 @@ async def proxy_request(self, scope, receive, send): ) self.deployment_request_error_counter.inc( tags={ - "deployment": handle.deployment_name, + "deployment": str(handle.deployment_id), "error_code": status_code, "method": method, "route": route_path, @@ -690,9 +694,9 @@ async def timeout_response(self, scope, receive, send, request_id): await response.send(scope, receive, send) async def routes_response(self, scope, receive, send): - return await starlette.responses.JSONResponse(self.route_info)( - scope, receive, send - ) + return await starlette.responses.JSONResponse( + {route: str(endpoint) for route, endpoint in self.route_info.items()} + )(scope, receive, send) async def health_response(self, scope, receive, send): return await starlette.responses.PlainTextResponse("success")( @@ -1074,6 +1078,9 @@ def __init__( configure_component_memory_profiler( component_name="http_proxy", component_id=node_ip_address ) + self.cpu_profiler, self.cpu_profiler_log = configure_component_cpu_profiler( + component_name="http_proxy", component_id=node_ip_address + ) if http_middlewares is None: http_middlewares = [Middleware(RequestIdMiddleware)] @@ -1217,6 +1224,27 @@ async def receive_asgi_messages(self, request_id: str) -> bytes: """ return pickle.dumps(await self.app.receive_asgi_messages(request_id)) + def _save_cpu_profile_data(self) -> str: + """Saves CPU profiling data, if CPU profiling is enabled. + + Logs a warning if CPU profiling is disabled. + """ + + if self.cpu_profiler is not None: + import marshal + + self.cpu_profiler.snapshot_stats() + with open(self.cpu_profiler_log, "wb") as f: + marshal.dump(self.cpu_profiler.stats, f) + logger.info(f'Saved CPU profile data to file "{self.cpu_profiler_log}"') + return self.cpu_profiler_log + else: + logger.error( + "Attempted to save CPU profile data, but failed because no " + "CPU profiler was running! Enable CPU profiling by enabling " + "the RAY_SERVE_ENABLE_CPU_PROFILING env var." + ) + async def _uvicorn_keep_alive(self) -> Optional[int]: """Get the keep alive timeout used for the running uvicorn server. 
diff --git a/python/ray/serve/_private/logging_utils.py b/python/ray/serve/_private/logging_utils.py index 08acea3a6fc6..0be603bffda1 100644 --- a/python/ray/serve/_private/logging_utils.py +++ b/python/ray/serve/_private/logging_utils.py @@ -2,7 +2,12 @@ import copy import logging import os -from typing import Optional +from typing import Optional, Tuple + +try: + import cProfile +except ImportError: + pass import ray from ray.serve._private.common import ServeComponentType @@ -22,6 +27,7 @@ SERVE_LOG_LEVEL_NAME, SERVE_LOG_REPLICA, RAY_SERVE_ENABLE_MEMORY_PROFILING, + RAY_SERVE_ENABLE_CPU_PROFILING, ) @@ -246,10 +252,23 @@ def configure_component_memory_profiler( component_name=component_name, component_id=component_id, component_type=component_type, - suffix="_memray.bin", + suffix="_memray_0.bin", ) memray_file_path = os.path.join(logs_dir, memray_file_name) + # If the actor restarted, memray requires a new file to start + # tracking memory. + restart_counter = 1 + while os.path.exists(memray_file_path): + memray_file_name = get_component_log_file_name( + component_name=component_name, + component_id=component_id, + component_type=component_type, + suffix=f"_memray_{restart_counter}.bin", + ) + memray_file_path = os.path.join(logs_dir, memray_file_name) + restart_counter += 1 + # Memray usually tracks the memory usage of only a block of code # within a context manager. We explicitly call __enter__ here # instead of using a context manager to track memory usage across @@ -270,6 +289,61 @@ def configure_component_memory_profiler( ) +def configure_component_cpu_profiler( + component_name: str, + component_id: str, + component_type: Optional[ServeComponentType] = None, +) -> Tuple[Optional[cProfile.Profile], Optional[str]]: + """Configures the CPU profiler for this component. + + Does nothing if RAY_SERVE_ENABLE_CPU_PROFILING is disabled. + + Returns: + 2-tuple containing profiler object and log file name for profile stats. + """ + + if RAY_SERVE_ENABLE_CPU_PROFILING: + logger = logging.getLogger(SERVE_LOGGER_NAME) + + try: + import cProfile + except ImportError: + logger.warning( + "RAY_SERVE_ENABLE_CPU_PROFILING is enabled, but cProfile " + "is not installed. No CPU profiling is happening." + ) + return None, None + try: + # Need marshal to dump data. Check if marshal is installed before + # starting the profiler. + import marshal # noqa: F401 + except ImportError: + logger.warning( + "RAY_SERVE_ENABLE_CPU_PROFILING is enabled, but marshal " + "is not installed. No CPU profiling is happening." + ) + return None, None + + logs_dir = get_serve_logs_dir() + cpu_profiler_file_name = get_component_log_file_name( + component_name=component_name, + component_id=component_id, + component_type=component_type, + suffix="_cprofile.prof", + ) + cpu_profiler_file_path = os.path.join(logs_dir, cpu_profiler_file_name) + + profile = cProfile.Profile() + profile.enable() + logger.info( + "RAY_SERVE_ENABLE_CPU_PROFILING is enabled. Started cProfile " + "on this actor." 
+ ) + return profile, cpu_profiler_file_path + else: + return None, None + + def get_serve_logs_dir() -> str: """Get the directory that stores Serve log files.""" diff --git a/python/ray/serve/_private/long_poll.py b/python/ray/serve/_private/long_poll.py index 0c4f49e56068..4f0fc06e1c4c 100644 --- a/python/ray/serve/_private/long_poll.py +++ b/python/ray/serve/_private/long_poll.py @@ -6,7 +6,7 @@ import logging import os import random -from typing import Any, Tuple, Callable, DefaultDict, Dict, Set, Union +from typing import Any, Callable, DefaultDict, Dict, Optional, Set, Tuple, Union from ray._private.utils import get_or_create_event_loop from ray.serve._private.common import ReplicaName @@ -197,7 +197,12 @@ class LongPollHost: the object is updated. """ - def __init__(self): + def __init__( + self, + listen_for_change_request_timeout_s: Tuple[ + int, int + ] = LISTEN_FOR_CHANGE_REQUEST_TIMEOUT_S, + ): # Map object_key -> int self.snapshot_ids: DefaultDict[KeyType, int] = defaultdict( lambda: random.randint(0, 1_000_000) @@ -209,6 +214,15 @@ def __init__(self): set ) + self._listen_for_change_request_timeout_s = listen_for_change_request_timeout_s + + def _get_num_notifier_events(self, key: Optional[KeyType] = None): + """Used for testing.""" + if key is not None: + return len(self.notifier_events[key]) + else: + return sum(len(events) for events in self.notifier_events.values()) + async def listen_for_change( self, keys_to_snapshot_ids: Dict[KeyType, int], @@ -233,24 +247,35 @@ async def listen_for_change( return client_outdated_keys # Otherwise, register asyncio events to be waited. + async_task_to_events = {} async_task_to_watched_keys = {} for key in watched_keys: - # Create a new asyncio event for this key + # Create a new asyncio event for this key. event = asyncio.Event() - task = get_or_create_event_loop().create_task(event.wait()) - async_task_to_watched_keys[task] = key # Make sure future caller of notify_changed will unblock this # asyncio Event. self.notifier_events[key].add(event) + task = get_or_create_event_loop().create_task(event.wait()) + async_task_to_events[task] = event + async_task_to_watched_keys[task] = key + done, not_done = await asyncio.wait( async_task_to_watched_keys.keys(), return_when=asyncio.FIRST_COMPLETED, - timeout=random.uniform(*LISTEN_FOR_CHANGE_REQUEST_TIMEOUT_S), + timeout=random.uniform(*self._listen_for_change_request_timeout_s), ) - [task.cancel() for task in not_done] + for task in not_done: + task.cancel() + try: + event = async_task_to_events[task] + self.notifier_events[async_task_to_watched_keys[task]].remove(event) + except KeyError: + # Because we use `FIRST_COMPLETED` above, a task in `not_done` may + # actually have had its event removed in `notify_changed`. 
+ pass if len(done) == 0: return LongPollState.TIME_OUT @@ -314,8 +339,11 @@ def _object_snapshot_to_proto_bytes( ) -> bytes: if key == LongPollNamespace.ROUTE_TABLE: # object_snapshot is Dict[EndpointTag, EndpointInfo] + # NOTE(zcin): the endpoint dictionary broadcasted to Java + # HTTP proxies should use string as key because Java does + # not yet support 2.x or applications xlang_endpoints = { - endpoint_tag: EndpointInfoProto(route=endpoint_info.route) + str(endpoint_tag): EndpointInfoProto(route=endpoint_info.route) for endpoint_tag, endpoint_info in object_snapshot.items() } return EndpointSet(endpoints=xlang_endpoints).SerializeToString() diff --git a/python/ray/serve/_private/replica.py b/python/ray/serve/_private/replica.py index 04c27bebce0f..4261bee29b24 100644 --- a/python/ray/serve/_private/replica.py +++ b/python/ray/serve/_private/replica.py @@ -53,6 +53,7 @@ from ray.serve._private.logging_utils import ( access_log_msg, configure_component_logger, + configure_component_cpu_profiler, configure_component_memory_profiler, get_component_logger_file_path, ) @@ -107,6 +108,11 @@ async def __init__( component_name=deployment_name, component_id=replica_tag, ) + self.cpu_profiler, self.cpu_profiler_log = configure_component_cpu_profiler( + component_type=ServeComponentType.DEPLOYMENT, + component_name=deployment_name, + component_id=replica_tag, + ) self._event_loop = get_or_create_event_loop() @@ -435,6 +441,27 @@ async def _get_metadata( ) -> Tuple[DeploymentConfig, DeploymentVersion]: return self.replica.version.deployment_config, self.replica.version + def _save_cpu_profile_data(self) -> str: + """Saves CPU profiling data, if CPU profiling is enabled. + + Logs a warning if CPU profiling is disabled. + """ + + if self.cpu_profiler is not None: + import marshal + + self.cpu_profiler.snapshot_stats() + with open(self.cpu_profiler_log, "wb") as f: + marshal.dump(self.cpu_profiler.stats, f) + logger.info(f'Saved CPU profile data to file "{self.cpu_profiler_log}"') + return self.cpu_profiler_log + else: + logger.error( + "Attempted to save CPU profile data, but failed because no " + "CPU profiler was running! Enable CPU profiling by enabling " + "the RAY_SERVE_ENABLE_CPU_PROFILING env var." + ) + async def prepare_for_shutdown(self): if self.replica is not None: return await self.replica.prepare_for_shutdown() diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 9ddb0d036d31..ac9fb16b2c84 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -31,7 +31,7 @@ from ray.util import metrics from ray._private.utils import make_asyncio_event_version_compat, load_class -from ray.serve._private.common import RunningReplicaInfo, DeploymentInfo +from ray.serve._private.common import RunningReplicaInfo, DeploymentInfo, DeploymentID from ray.serve._private.constants import ( SERVE_LOGGER_NAME, HANDLE_METRIC_PUSH_INTERVAL_S, @@ -254,6 +254,10 @@ async def assign_replica( def update_running_replicas(self, running_replicas: List[RunningReplicaInfo]): pass + @property + def curr_replicas(self) -> Dict[str, ReplicaWrapper]: + pass + @dataclass class PendingRequest: @@ -295,10 +299,10 @@ class PowerOfTwoChoicesReplicaScheduler(ReplicaScheduler): def __init__( self, event_loop: asyncio.AbstractEventLoop, - deployment_name: str, + deployment_id: DeploymentID, ): self._loop = event_loop - self._deployment_name = deployment_name + self._deployment_id = deployment_id # Current replicas available to be scheduled. 
# Updated via `update_replicas`. @@ -375,8 +379,8 @@ def update_replicas(self, replicas: List[ReplicaWrapper]): if self._replica_id_set != new_replica_id_set: logger.info( - "Got updated replicas for deployment " - f"{self._deployment_name}: {new_replica_id_set}.", + f"Got updated replicas for deployment '{self._deployment_id.name}' " + f"in application '{self._deployment_id.app}': {new_replica_id_set}.", extra={"log_to_stderr": False}, ) @@ -460,14 +464,16 @@ async def choose_two_replicas_with_backoff( while len(self._replicas) == 0: logger.info( "Tried to assign replica for deployment " - f"{self._deployment_name} but none are available. " - "Waiting for new replicas to be added.", + f"'{self._deployment_id.name}' in application " + f"'{self._deployment_id.app}' but none are available. Waiting for " + "new replicas to be added.", extra={"log_to_stderr": False}, ) self._replicas_updated_event.clear() await self._replicas_updated_event.wait() logger.info( - "Got replicas for deployment {self._deployment_name}, waking up.", + f"Got replicas for deployment '{self._deployment_id.name}' in " + f"application '{self._deployment_id.app}', waking up.", extra={"log_to_stderr": False}, ) @@ -750,6 +756,12 @@ def __init__(self, event_loop: asyncio.AbstractEventLoop): str, List[RunningReplicaInfo] ] = defaultdict(list) + @property + def curr_replicas(self) -> Dict[str, ReplicaWrapper]: + return { + r.replica_tag: ActorReplicaWrapper(r) for r in self.in_flight_queries.keys() + } + def _reset_replica_iterator(self): """Reset the iterator used to load balance replicas. @@ -961,7 +973,7 @@ class Router: def __init__( self, controller_handle: ActorHandle, - deployment_name: str, + deployment_id: DeploymentID, event_loop: asyncio.BaseEventLoop = None, _use_new_routing: bool = False, _router_cls: Optional[str] = None, @@ -975,11 +987,11 @@ def __init__( if _router_cls: self._replica_scheduler = load_class(_router_cls)( - event_loop=event_loop, deployment_name=deployment_name + event_loop=event_loop, deployment_id=deployment_id ) elif _use_new_routing: self._replica_scheduler = PowerOfTwoChoicesReplicaScheduler( - event_loop, deployment_name + event_loop, deployment_id ) else: self._replica_scheduler = RoundRobinReplicaScheduler(event_loop) @@ -994,7 +1006,8 @@ def __init__( description="The number of requests processed by the router.", tag_keys=("deployment", "route", "application"), ) - self.num_router_requests.set_default_tags({"deployment": deployment_name}) + # TODO(zcin): use deployment name and application name instead of deployment id + self.num_router_requests.set_default_tags({"deployment": str(deployment_id)}) self.num_queued_queries = 0 self.num_queued_queries_gauge = metrics.Gauge( @@ -1005,38 +1018,47 @@ def __init__( ), tag_keys=("deployment", "application"), ) - self.num_queued_queries_gauge.set_default_tags({"deployment": deployment_name}) + # TODO(zcin): use deployment name and application name instead of deployment id + self.num_queued_queries_gauge.set_default_tags( + {"deployment": str(deployment_id)} + ) self.long_poll_client = LongPollClient( controller_handle, { ( LongPollNamespace.RUNNING_REPLICAS, - deployment_name, + deployment_id, ): self._replica_scheduler.update_running_replicas, }, call_in_event_loop=event_loop, ) # Start the metrics pusher if autoscaling is enabled. 
- self.deployment_name = deployment_name + self.deployment_id = deployment_id deployment_route = DeploymentRoute.FromString( - ray.get(controller_handle.get_deployment_info.remote(self.deployment_name)) + ray.get(controller_handle.get_deployment_info.remote(*deployment_id)) ) deployment_info = DeploymentInfo.from_proto(deployment_route.deployment_info) self.metrics_pusher = None if deployment_info.deployment_config.autoscaling_config: + self.autoscaling_enabled = True + self.push_metrics_to_controller = ( + controller_handle.record_handle_metrics.remote + ) self.metrics_pusher = MetricsPusher() self.metrics_pusher.register_task( self._collect_handle_queue_metrics, HANDLE_METRIC_PUSH_INTERVAL_S, - controller_handle.record_handle_metrics.remote, + self.push_metrics_to_controller, ) self.metrics_pusher.start() + else: + self.autoscaling_enabled = False def _collect_handle_queue_metrics(self) -> Dict[str, int]: - return {self.deployment_name: self.num_queued_queries} + return {str(self.deployment_id): self.num_queued_queries} async def assign_request( self, @@ -1057,6 +1079,16 @@ async def assign_request( }, ) + # Optimization: if there are currently zero replicas for a deployment, + # push handle metric to controller to allow for fast cold start time. + # Only do it for the first query to arrive on the router. + if ( + self.autoscaling_enabled + and len(self._replica_scheduler.curr_replicas) == 0 + and self.num_queued_queries == 1 + ): + self.push_metrics_to_controller({self.deployment_name: 1}, time.time()) + try: query = Query( args=list(request_args), diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index e5994ec5d2ae..f7e6275e24bc 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -403,7 +403,8 @@ def get_deployment(name: str) -> Deployment: """Dynamically fetch a handle to a Deployment object. This can be used to update and redeploy a deployment without access to - the original definition. + the original definition. This should only be used to fetch deployments + that were deployed using 1.x API. Example: >>> from ray import serve @@ -530,8 +531,9 @@ def run( # The deployment state is not guaranteed to be created after # deploy_application returns; the application state manager will # need another reconcile iteration to create it. 
- client._wait_for_deployment_created(ingress.name) - return ingress._get_handle() + client._wait_for_deployment_created(ingress.name, name) + handle = client.get_handle(ingress.name, name, missing_ok=True) + return handle @PublicAPI(stability="alpha") @@ -823,7 +825,7 @@ def f(val: int) -> int: # and default to sync outside a deployment sync = internal_replica_context is None - return client.get_handle(ingress, sync=sync) + return client.get_handle(ingress, name, sync=sync) @DeveloperAPI @@ -868,4 +870,4 @@ def get_deployment_handle( # and default to sync outside a deployment sync = internal_replica_context is None - return client.get_handle(f"{app_name}_{deployment_name}", sync=sync) + return client.get_handle(deployment_name, app_name, sync=sync) diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py index 7bf72dcc5fec..938792c935b3 100644 --- a/python/ray/serve/controller.py +++ b/python/ray/serve/controller.py @@ -14,6 +14,7 @@ from ray._private.resource_spec import HEAD_NODE_RESOURCE_NAME from ray._raylet import GcsClient from ray.serve._private.common import ( + DeploymentID, DeploymentInfo, EndpointInfo, EndpointTag, @@ -42,6 +43,7 @@ from ray.serve._private.logging_utils import ( configure_component_logger, configure_component_memory_profiler, + configure_component_cpu_profiler, get_component_logger_file_path, ) from ray.serve._private.long_poll import LongPollHost @@ -121,6 +123,9 @@ async def __init__( configure_component_memory_profiler( component_name="controller", component_id=str(os.getpid()) ) + self.cpu_profiler, self.cpu_profiler_log = configure_component_cpu_profiler( + component_name="controller", component_id=str(os.getpid()) + ) if RAY_SERVE_CONTROLLER_CALLBACK_IMPORT_PATH: logger.info( "Calling user-provided callback from import path " @@ -279,7 +284,7 @@ def get_all_endpoints_java(self) -> bytes: endpoints = self.get_all_endpoints() data = { - endpoint_tag: EndpointInfoProto(route=endppint_dict["route"]) + str(endpoint_tag): EndpointInfoProto(route=endppint_dict["route"]) for endpoint_tag, endppint_dict in endpoints.items() } return EndpointSet(endpoints=data).SerializeToString() @@ -638,22 +643,16 @@ def deploy( deployer_job_id: Union[str, bytes], docs_path: Optional[str] = None, is_driver_deployment: Optional[bool] = False, - app_name: str = None, # TODO(edoakes): this is a hack because the deployment_language doesn't seem # to get set properly from Java. is_deployed_from_python: bool = False, ) -> bool: - """Deploys a deployment.""" + """Deploys a deployment. This should only be used for 1.x deployments.""" if route_prefix is not None: assert route_prefix.startswith("/") if docs_path is not None: assert docs_path.startswith("/") - # app_name is None for V1 API, reset it to empty string to avoid - # breaking metrics. 
- if app_name is None: - app_name = "" - deployment_info = deploy_args_to_deployment_info( deployment_name=name, deployment_config_proto_bytes=deployment_config_proto_bytes, @@ -662,7 +661,7 @@ def deploy( route_prefix=route_prefix, docs_path=docs_path, is_driver_deployment=is_driver_deployment, - app_name=app_name, + app_name="", ) # TODO(architkulkarni): When a deployment is redeployed, even if @@ -673,12 +672,11 @@ def deploy( if route_prefix is not None: endpoint_info = EndpointInfo( route=route_prefix, - app_name=app_name, app_is_cross_language=not is_deployed_from_python, ) - self.endpoint_state.update_endpoint(name, endpoint_info) + self.endpoint_state.update_endpoint(EndpointTag(name, ""), endpoint_info) else: - self.endpoint_state.delete_endpoint(name) + self.endpoint_state.delete_endpoint(EndpointTag(name, "")) return updating @@ -789,14 +787,18 @@ def deploy_apps( self.delete_apps(existing_applications.difference(new_applications)) def delete_deployment(self, name: str): - self.endpoint_state.delete_endpoint(name) + """Should only be used for 1.x deployments.""" + + self.endpoint_state.delete_endpoint(EndpointTag(name, "")) return self.deployment_state_manager.delete_deployment(name) def delete_deployments(self, names: Iterable[str]) -> None: + """Should only be used for 1.x deployments.""" + for name in names: self.delete_deployment(name) - def get_deployment_info(self, name: str) -> bytes: + def get_deployment_info(self, name: str, app_name: str = "") -> bytes: """Get the current information about a deployment. Args: @@ -808,11 +810,14 @@ def get_deployment_info(self, name: str) -> bytes: Raises: KeyError if the deployment doesn't exist. """ - deployment_info = self.deployment_state_manager.get_deployment(name) + id = DeploymentID(name, app_name) + deployment_info = self.deployment_state_manager.get_deployment(str(id)) if deployment_info is None: - raise KeyError(f"Deployment {name} does not exist.") + raise KeyError( + f"Deployment '{name}' does not exist in application '{app_name}'." + ) - route = self.endpoint_state.get_endpoint_route(name) + route = self.endpoint_state.get_endpoint_route(id) from ray.serve.generated.serve_pb2 import DeploymentRoute @@ -837,15 +842,10 @@ def list_deployments_internal( KeyError if the deployment doesn't exist. """ return { - name: ( - self.deployment_state_manager.get_deployment( - name, include_deleted=include_deleted - ), - self.endpoint_state.get_endpoint_route(name), - ) - for name in self.deployment_state_manager.get_deployment_configs( + str(id): (info, self.endpoint_state.get_endpoint_route(id)) + for id, info in self.deployment_state_manager.get_deployment_infos( include_deleted=include_deleted - ) + ).items() } def list_deployments(self, include_deleted: Optional[bool] = False) -> bytes: @@ -1030,6 +1030,27 @@ async def graceful_shutdown(self, wait: bool = True): await self._shutdown.wait() + def _save_cpu_profile_data(self) -> str: + """Saves CPU profiling data, if CPU profiling is enabled. + + Logs a warning if CPU profiling is disabled. + """ + + if self.cpu_profiler is not None: + import marshal + + self.cpu_profiler.snapshot_stats() + with open(self.cpu_profiler_log, "wb") as f: + marshal.dump(self.cpu_profiler.stats, f) + logger.info(f'Saved CPU profile data to file "{self.cpu_profiler_log}"') + return self.cpu_profiler_log + else: + logger.error( + "Attempted to save CPU profile data, but failed because no " + "CPU profiler was running! Enable CPU profiling by enabling " + "the RAY_SERVE_ENABLE_CPU_PROFILING env var." 
+ ) + @ray.remote(num_cpus=0) class ServeControllerAvatar: diff --git a/python/ray/serve/deployment.py b/python/ray/serve/deployment.py index 9e4b6b4772e7..18bd01c04e0a 100644 --- a/python/ray/serve/deployment.py +++ b/python/ray/serve/deployment.py @@ -406,6 +406,7 @@ def _get_handle( return get_global_client().get_handle( self._name, + app_name="", missing_ok=True, sync=sync, ) diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 2da9b01735ad..1ddf745432bc 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -129,13 +129,14 @@ async def __call__(self, name: str) -> str: def __init__( self, - deployment_name: EndpointTag, + deployment_name: str, + app_name: str, *, handle_options: Optional[_HandleOptions] = None, _router: Optional[Router] = None, _is_for_http_requests: bool = False, ): - self.deployment_name = deployment_name + self.deployment_id = EndpointTag(deployment_name, app_name) self.handle_options = handle_options or _HandleOptions() self._is_for_http_requests = _is_for_http_requests @@ -147,9 +148,14 @@ def __init__( ), tag_keys=("handle", "deployment", "route", "application"), ) - handle_tag = f"{self.deployment_name}#{get_random_letters()}" + if app_name: + handle_tag = f"{app_name}#{deployment_name}#{get_random_letters()}" + else: + handle_tag = f"{deployment_name}#{get_random_letters()}" + + # TODO(zcin): Separate deployment_id into deployment and application tags self.request_counter.set_default_tags( - {"handle": handle_tag, "deployment": self.deployment_name} + {"handle": handle_tag, "deployment": str(self.deployment_id)} ) self._router: Optional[Router] = _router @@ -158,7 +164,7 @@ def _get_or_create_router(self) -> Router: if self._router is None: self._router = Router( serve.context.get_global_client()._controller, - self.deployment_name, + self.deployment_id, event_loop=get_or_create_event_loop(), _use_new_routing=RAY_SERVE_ENABLE_NEW_ROUTING, _router_cls=self.handle_options._router_cls, @@ -166,6 +172,14 @@ def _get_or_create_router(self) -> Router: return self._router + @property + def deployment_name(self) -> str: + return self.deployment_id.name + + @property + def app_name(self) -> str: + return self.deployment_id.app + @property def _is_same_loop(self) -> bool: """Whether the caller's asyncio loop is the same loop for handle. 
@@ -194,6 +208,7 @@ def _options( return self.__class__( self.deployment_name, + self.app_name, handle_options=new_handle_options, _router=None if _router_cls != DEFAULT.VALUE else self._router, _is_for_http_requests=self._is_for_http_requests, @@ -226,11 +241,11 @@ def options( _router_cls=_router_cls, ) - def _remote(self, deployment_name, handle_options, args, kwargs) -> Coroutine: + def _remote(self, deployment_id, handle_options, args, kwargs) -> Coroutine: _request_context = ray.serve.context._serve_request_context.get() request_metadata = RequestMetadata( _request_context.request_id, - deployment_name, + deployment_id.name, call_method=handle_options.method_name, is_http_request=self._is_for_http_requests, route=_request_context.route, @@ -265,12 +280,10 @@ async def remote(self, *args, **kwargs) -> asyncio.Task: result = await obj_ref """ - return await self._remote( - self.deployment_name, self.handle_options, args, kwargs - ) + return await self._remote(self.deployment_id, self.handle_options, args, kwargs) def __repr__(self): - return f"{self.__class__.__name__}" f"(deployment='{self.deployment_name}')" + return f"{self.__class__.__name__}" f"(deployment='{self.deployment_id}')" @classmethod def _deserialize(cls, kwargs): @@ -280,6 +293,7 @@ def _deserialize(cls, kwargs): def __reduce__(self): serialized_data = { "deployment_name": self.deployment_name, + "app_name": self.app_name, "handle_options": self.handle_options, "_is_for_http_requests": self._is_for_http_requests, } @@ -329,7 +343,7 @@ def _get_or_create_router(self) -> Router: if self._router is None: self._router = Router( serve.context.get_global_client()._controller, - self.deployment_name, + self.deployment_id, event_loop=_create_or_get_async_loop_in_thread(), _use_new_routing=RAY_SERVE_ENABLE_NEW_ROUTING, _router_cls=self.handle_options._router_cls, @@ -376,7 +390,7 @@ def remote(self, *args, **kwargs) -> ray.ObjectRef: result = ray.get(obj_ref) """ - coro = self._remote(self.deployment_name, self.handle_options, args, kwargs) + coro = self._remote(self.deployment_id, self.handle_options, args, kwargs) future: concurrent.futures.Future = asyncio.run_coroutine_threadsafe( coro, self._get_or_create_router()._event_loop ) @@ -385,6 +399,7 @@ def remote(self, *args, **kwargs) -> ray.ObjectRef: def __reduce__(self): serialized_data = { "deployment_name": self.deployment_name, + "app_name": self.app_name, "handle_options": self.handle_options, "_is_for_http_requests": self._is_for_http_requests, } diff --git a/python/ray/serve/scripts.py b/python/ray/serve/scripts.py index c78130f5b08e..3eb9194989e2 100644 --- a/python/ray/serve/scripts.py +++ b/python/ray/serve/scripts.py @@ -704,7 +704,7 @@ def build_app_config(import_path: str, _kubernetes_format: bool, name: str = Non f"Expected '{import_path}' to be an Application but got {type(app)}." 
) - app = build_app(app) + app = build_app(app, name) schema = ServeApplicationSchema( import_path=import_path, runtime_env={}, @@ -740,7 +740,9 @@ def build_app_config(import_path: str, _kubernetes_format: bool, name: str = Non ) config_str += yaml.dump( - build_app_config(import_paths[0], kubernetes_format), + build_app_config( + import_paths[0], kubernetes_format, SERVE_DEFAULT_APP_NAME + ), Dumper=ServeApplicationSchemaDumper, default_flow_style=False, sort_keys=False, diff --git a/python/ray/serve/tests/conftest.py b/python/ray/serve/tests/conftest.py index 69f53fcacf20..a0ecfa125d7c 100644 --- a/python/ray/serve/tests/conftest.py +++ b/python/ray/serve/tests/conftest.py @@ -78,9 +78,9 @@ def _shared_serve_instance(): @pytest.fixture def serve_instance(_shared_serve_instance): yield _shared_serve_instance - # Clear all applications to avoid naming & route_prefix collisions. + # Clear all state for 2.x applications and deployments. _shared_serve_instance.delete_all_apps() - # Clear all state between tests to avoid naming collisions. + # Clear all state for 1.x deployments. _shared_serve_instance.delete_deployments(serve.list_deployments().keys()) # Clear the ServeHandle cache between tests to avoid them piling up. _shared_serve_instance.shutdown_cached_handles() diff --git a/python/ray/serve/tests/test_air_integrations.py b/python/ray/serve/tests/test_air_integrations.py index fde75d957433..7d48b9b4a849 100644 --- a/python/ray/serve/tests/test_air_integrations.py +++ b/python/ray/serve/tests/test_air_integrations.py @@ -266,7 +266,7 @@ async def __call__(self, data): checkpoint=Checkpoint.from_uri(uri), ) dag = m1.__call__.bind(dag_input) - deployments = build(Ingress.bind(dag)) + deployments = build(Ingress.bind(dag), "") for d in deployments: d.deploy() @@ -303,7 +303,7 @@ async def __call__(self, data): checkpoint=Checkpoint.from_uri(uri), ) dag = m1.__call__.bind(dag_input) - deployments = build(Ingress.bind(dag)) + deployments = build(Ingress.bind(dag), "") for d in deployments: d.deploy() diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py index dceb090def71..80744a7c961f 100644 --- a/python/ray/serve/tests/test_api.py +++ b/python/ray/serve/tests/test_api.py @@ -462,7 +462,7 @@ def f(): f.set_options(max_concurrent_queries=-4) -def test_deploy_application(serve_instance): +def test_deploy_application_basic(serve_instance): """Test deploy multiple applications""" @serve.deployment diff --git a/python/ray/serve/tests/test_application_state.py b/python/ray/serve/tests/test_application_state.py index 62c1791e92c4..a9aeab65eaf6 100644 --- a/python/ray/serve/tests/test_application_state.py +++ b/python/ray/serve/tests/test_application_state.py @@ -12,6 +12,7 @@ override_deployment_info, ) from ray.serve._private.common import ( + DeploymentID, ApplicationStatus, DeploymentConfig, DeploymentStatus, @@ -258,6 +259,8 @@ def test_deploy_and_delete_app(mocked_application_state): app_state, deployment_state_manager = mocked_application_state # DEPLOY application with deployments {d1, d2} + d1_id = DeploymentID("d1", "test_app") + d2_id = DeploymentID("d2", "test_app") app_state.deploy( { "d1": deployment_info("d1", "/hi", "/documentation"), @@ -273,8 +276,8 @@ def test_deploy_and_delete_app(mocked_application_state): app_state.update() # After one update, deployments {d1, d2} should be created - assert deployment_state_manager.get_deployment("d1") - assert deployment_state_manager.get_deployment("d2") + assert 
deployment_state_manager.get_deployment(str(d1_id)) + assert deployment_state_manager.get_deployment(str(d2_id)) assert app_state.status == ApplicationStatus.DEPLOYING # Until both deployments are healthy, app should be deploying @@ -282,12 +285,12 @@ def test_deploy_and_delete_app(mocked_application_state): assert app_state.status == ApplicationStatus.DEPLOYING # Mark deployment d1 healthy (app should be deploying) - deployment_state_manager.set_deployment_healthy("d1") + deployment_state_manager.set_deployment_healthy(str(d1_id)) app_state.update() assert app_state.status == ApplicationStatus.DEPLOYING # Mark deployment d2 healthy (app should be running) - deployment_state_manager.set_deployment_healthy("d2") + deployment_state_manager.set_deployment_healthy(str(d2_id)) app_state.update() assert app_state.status == ApplicationStatus.RUNNING @@ -300,13 +303,13 @@ def test_deploy_and_delete_app(mocked_application_state): assert app_state.status == ApplicationStatus.DELETING app_state.update() - deployment_state_manager.set_deployment_deleted("d1") + deployment_state_manager.set_deployment_deleted(str(d1_id)) ready_to_be_deleted = app_state.update() assert not ready_to_be_deleted assert app_state.status == ApplicationStatus.DELETING # Once both deployments are deleted, the app should be ready to delete - deployment_state_manager.set_deployment_deleted("d2") + deployment_state_manager.set_deployment_deleted(str(d2_id)) ready_to_be_deleted = app_state.update() assert ready_to_be_deleted @@ -314,6 +317,8 @@ def test_deploy_and_delete_app(mocked_application_state): def test_app_deploy_failed_and_redeploy(mocked_application_state): """Test DEPLOYING -> DEPLOY_FAILED -> (redeploy) -> DEPLOYING -> RUNNING""" app_state, deployment_state_manager = mocked_application_state + d1_id = DeploymentID("d1", "test_app") + d2_id = DeploymentID("d2", "test_app") app_state.deploy({"d1": deployment_info("d1")}) assert app_state.status == ApplicationStatus.DEPLOYING @@ -322,7 +327,7 @@ def test_app_deploy_failed_and_redeploy(mocked_application_state): assert app_state.status == ApplicationStatus.DEPLOYING # Mark deployment unhealthy -> app should be DEPLOY_FAILED - deployment_state_manager.set_deployment_unhealthy("d1") + deployment_state_manager.set_deployment_unhealthy(str(d1_id)) app_state.update() assert app_state.status == ApplicationStatus.DEPLOY_FAILED @@ -339,12 +344,12 @@ def test_app_deploy_failed_and_redeploy(mocked_application_state): # After one update, deployments {d1, d2} should be created app_state.update() - assert deployment_state_manager.get_deployment("d1") - assert deployment_state_manager.get_deployment("d2") + assert deployment_state_manager.get_deployment(str(d1_id)) + assert deployment_state_manager.get_deployment(str(d2_id)) assert app_state.status == ApplicationStatus.DEPLOYING - deployment_state_manager.set_deployment_healthy("d1") - deployment_state_manager.set_deployment_healthy("d2") + deployment_state_manager.set_deployment_healthy(str(d1_id)) + deployment_state_manager.set_deployment_healthy(str(d2_id)) app_state.update() assert app_state.status == ApplicationStatus.RUNNING @@ -364,6 +369,7 @@ def test_app_deploy_failed_and_recover(mocked_application_state): the application status should update to running. 
""" app_state, deployment_state_manager = mocked_application_state + deployment_id = DeploymentID("d1", "test_app") app_state.deploy({"d1": deployment_info("d1")}) assert app_state.status == ApplicationStatus.DEPLOYING @@ -372,14 +378,14 @@ def test_app_deploy_failed_and_recover(mocked_application_state): assert app_state.status == ApplicationStatus.DEPLOYING # Mark deployment unhealthy -> app should be DEPLOY_FAILED - deployment_state_manager.set_deployment_unhealthy("d1") + deployment_state_manager.set_deployment_unhealthy(str(deployment_id)) app_state.update() assert app_state.status == ApplicationStatus.DEPLOY_FAILED app_state.update() assert app_state.status == ApplicationStatus.DEPLOY_FAILED # Deployment recovers to healthy -> app should be RUNNING - deployment_state_manager.set_deployment_healthy("d1") + deployment_state_manager.set_deployment_healthy(str(deployment_id)) app_state.update() assert app_state.status == ApplicationStatus.RUNNING app_state.update() @@ -393,19 +399,20 @@ def test_app_unhealthy(mocked_application_state): updated to unhealthy. """ app_state, deployment_state_manager = mocked_application_state + id_a, id_b = DeploymentID("a", "test_app"), DeploymentID("b", "test_app") app_state.deploy({"a": deployment_info("a"), "b": deployment_info("b")}) assert app_state.status == ApplicationStatus.DEPLOYING app_state.update() assert app_state.status == ApplicationStatus.DEPLOYING # Once both deployments become healthy, app should be running - deployment_state_manager.set_deployment_healthy("a") - deployment_state_manager.set_deployment_healthy("b") + deployment_state_manager.set_deployment_healthy(str(id_a)) + deployment_state_manager.set_deployment_healthy(str(id_b)) app_state.update() assert app_state.status == ApplicationStatus.RUNNING # If a deployment becomes unhealthy, application should become unhealthy - deployment_state_manager.set_deployment_unhealthy("a") + deployment_state_manager.set_deployment_unhealthy(str(id_a)) app_state.update() assert app_state.status == ApplicationStatus.UNHEALTHY # Rerunning update shouldn't make a difference @@ -413,7 +420,7 @@ def test_app_unhealthy(mocked_application_state): assert app_state.status == ApplicationStatus.UNHEALTHY # If the deployment recovers, the application should also recover - deployment_state_manager.set_deployment_healthy("a") + deployment_state_manager.set_deployment_healthy(str(id_a)) app_state.update() assert app_state.status == ApplicationStatus.RUNNING @@ -426,6 +433,7 @@ def test_deploy_through_config_succeed(check_obj_ref_ready_nowait): Deploy obj ref finishes successfully, so status should transition to running. 
""" kv_store = MockKVStore() + deployment_id = DeploymentID("a", "test_app") deployment_state_manager = MockDeploymentStateManager(kv_store) app_state_manager = ApplicationStateManager( deployment_state_manager, MockEndpointState(), kv_store @@ -454,7 +462,7 @@ def test_deploy_through_config_succeed(check_obj_ref_ready_nowait): assert app_state.docs_path == "/docs" # Set healthy - deployment_state_manager.set_deployment_healthy("a") + deployment_state_manager.set_deployment_healthy(str(deployment_id)) app_state.update() assert app_state.status == ApplicationStatus.RUNNING @@ -498,6 +506,9 @@ def test_deploy_through_config_fail(check_obj_ref_ready_nowait): def test_redeploy_same_app(mocked_application_state): """Test redeploying same application with updated deployments.""" app_state, deployment_state_manager = mocked_application_state + a_id = DeploymentID("a", "test_app") + b_id = DeploymentID("b", "test_app") + c_id = DeploymentID("c", "test_app") app_state.deploy({"a": deployment_info("a"), "b": deployment_info("b")}) assert app_state.status == ApplicationStatus.DEPLOYING @@ -507,10 +518,10 @@ def test_redeploy_same_app(mocked_application_state): assert set(app_state.target_deployments) == {"a", "b"} # Transition to running - deployment_state_manager.set_deployment_healthy("a") + deployment_state_manager.set_deployment_healthy(str(a_id)) app_state.update() assert app_state.status == ApplicationStatus.DEPLOYING - deployment_state_manager.set_deployment_healthy("b") + deployment_state_manager.set_deployment_healthy(str(b_id)) app_state.update() assert app_state.status == ApplicationStatus.RUNNING @@ -522,15 +533,15 @@ def test_redeploy_same_app(mocked_application_state): # Remove deployment `a` app_state.update() - deployment_state_manager.set_deployment_deleted("a") + deployment_state_manager.set_deployment_deleted(str(a_id)) app_state.update() assert app_state.status == ApplicationStatus.DEPLOYING # Move to running - deployment_state_manager.set_deployment_healthy("c") + deployment_state_manager.set_deployment_healthy(str(c_id)) app_state.update() assert app_state.status == ApplicationStatus.DEPLOYING - deployment_state_manager.set_deployment_healthy("b") + deployment_state_manager.set_deployment_healthy(str(b_id)) app_state.update() assert app_state.status == ApplicationStatus.RUNNING @@ -550,6 +561,7 @@ def test_deploy_with_renamed_app(mocked_application_state_manager): conflict with an old app running on the cluster. 
""" app_state_manager, deployment_state_manager, _ = mocked_application_state_manager + a_id, b_id = DeploymentID("a", "app1"), DeploymentID("b", "app2") # deploy app1 app_state_manager.apply_deployment_args("app1", [deployment_params("a", "/url1")]) @@ -562,7 +574,7 @@ def test_deploy_with_renamed_app(mocked_application_state_manager): assert set(app_state.target_deployments) == {"a"} # Once its single deployment is healthy, app1 should be running - deployment_state_manager.set_deployment_healthy("a") + deployment_state_manager.set_deployment_healthy(str(a_id)) app_state_manager.update() assert app_state_manager.get_app_status("app1") == ApplicationStatus.RUNNING @@ -577,13 +589,13 @@ def test_deploy_with_renamed_app(mocked_application_state_manager): app_state_manager.update() # app2 deploys before app1 finishes deleting - deployment_state_manager.set_deployment_healthy("b") + deployment_state_manager.set_deployment_healthy(str(b_id)) app_state_manager.update() assert app_state_manager.get_app_status("app2") == ApplicationStatus.RUNNING assert app_state_manager.get_app_status("app1") == ApplicationStatus.DELETING # app1 finally finishes deleting - deployment_state_manager.set_deployment_deleted("a") + deployment_state_manager.set_deployment_deleted(str(a_id)) app_state_manager.update() assert app_state_manager.get_app_status("app1") == ApplicationStatus.NOT_STARTED assert app_state_manager.get_app_status("app2") == ApplicationStatus.RUNNING @@ -596,6 +608,7 @@ def test_application_state_recovery(mocked_application_state_manager): deployment_state_manager, kv_store, ) = mocked_application_state_manager + deployment_id = DeploymentID("d1", "test_app") app_name = "test_app" # DEPLOY application with deployments {d1, d2} @@ -606,15 +619,15 @@ def test_application_state_recovery(mocked_application_state_manager): # Once deployment is healthy, app should be running app_state_manager.update() - assert deployment_state_manager.get_deployment("d1") - deployment_state_manager.set_deployment_healthy("d1") + assert deployment_state_manager.get_deployment(str(deployment_id)) + deployment_state_manager.set_deployment_healthy(str(deployment_id)) app_state_manager.update() assert app_state.status == ApplicationStatus.RUNNING # Simulate controller crashed!! 
Create new deployment state manager, # which should recover target state for deployment "d1" from kv store new_deployment_state_manager = MockDeploymentStateManager(kv_store) - version1 = new_deployment_state_manager.deployment_infos["d1"].version + version1 = new_deployment_state_manager.deployment_infos[str(deployment_id)].version # Create new application state manager, and it should call _recover_from_checkpoint new_app_state_manager = ApplicationStateManager( @@ -624,7 +637,7 @@ def test_application_state_recovery(mocked_application_state_manager): assert app_state.status == ApplicationStatus.DEPLOYING assert app_state._target_state.deployment_infos["d1"].version == version1 - new_deployment_state_manager.set_deployment_healthy("d1") + new_deployment_state_manager.set_deployment_healthy(str(deployment_id)) new_app_state_manager.update() assert app_state.status == ApplicationStatus.RUNNING @@ -644,6 +657,7 @@ def test_recover_during_update(mocked_application_state_manager): deployment_state_manager, kv_store, ) = mocked_application_state_manager + deployment_id = DeploymentID("d1", "test_app") app_name = "test_app" # DEPLOY application with deployment "d1" @@ -654,8 +668,8 @@ def test_recover_during_update(mocked_application_state_manager): # Once deployment is healthy, app should be running app_state_manager.update() - assert deployment_state_manager.get_deployment("d1") - deployment_state_manager.set_deployment_healthy("d1") + assert deployment_state_manager.get_deployment(str(deployment_id)) + deployment_state_manager.set_deployment_healthy(str(deployment_id)) app_state_manager.update() assert app_state.status == ApplicationStatus.RUNNING @@ -669,7 +683,9 @@ def test_recover_during_update(mocked_application_state_manager): # Create new deployment state manager. 
It should recover the old # version of the deployment from the kv store new_deployment_state_manager = MockDeploymentStateManager(kv_store) - dr_version = new_deployment_state_manager.deployment_infos["d1"].version + dr_version = new_deployment_state_manager.deployment_infos[ + str(deployment_id) + ].version # Create new application state manager, and it should call _recover_from_checkpoint new_app_state_manager = ApplicationStateManager( @@ -681,10 +697,13 @@ def test_recover_during_update(mocked_application_state_manager): assert ar_version != dr_version new_app_state_manager.update() - assert new_deployment_state_manager.deployment_infos["d1"].version == ar_version + assert ( + new_deployment_state_manager.deployment_infos[str(deployment_id)].version + == ar_version + ) assert app_state.status == ApplicationStatus.DEPLOYING - new_deployment_state_manager.set_deployment_healthy("d1") + new_deployment_state_manager.set_deployment_healthy(str(deployment_id)) new_app_state_manager.update() assert app_state.status == ApplicationStatus.RUNNING @@ -705,6 +724,7 @@ def test_is_ready_for_shutdown(mocked_application_state_manager): ) = mocked_application_state_manager app_name = "test_app" deployment_name = "d1" + deployment_id = DeploymentID(deployment_name, app_name) # DEPLOY application with deployment "d1" params = deployment_params(deployment_name) @@ -714,8 +734,8 @@ def test_is_ready_for_shutdown(mocked_application_state_manager): # Once deployment is healthy, app should be running app_state_manager.update() - assert deployment_state_manager.get_deployment(deployment_name) - deployment_state_manager.set_deployment_healthy(deployment_name) + assert deployment_state_manager.get_deployment(str(deployment_id)) + deployment_state_manager.set_deployment_healthy(str(deployment_id)) app_state_manager.update() assert app_state.status == ApplicationStatus.RUNNING @@ -728,8 +748,8 @@ def test_is_ready_for_shutdown(mocked_application_state_manager): # When shutting down applications after deployments are deleted, application state # `is_deleted()` should return True and `is_ready_for_shutdown()` should return True - deployment_state_manager.delete_deployment(deployment_name) - deployment_state_manager.set_deployment_deleted(deployment_name) + deployment_state_manager.delete_deployment(str(deployment_id)) + deployment_state_manager.set_deployment_deleted(str(deployment_id)) app_state_manager.update() assert app_state.is_deleted() assert app_state_manager.is_ready_for_shutdown() @@ -766,8 +786,8 @@ def test_override_deployment_config(self, info): ], ) - updated_infos = override_deployment_info("default", {"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/" assert updated_info.version == "123" @@ -794,8 +814,8 @@ def test_override_autoscaling_config(self, info): ], ) - updated_infos = override_deployment_info("default", {"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/" assert updated_info.version == "123" @@ -810,8 +830,8 @@ def test_override_route_prefix_1(self, info): deployments=[DeploymentSchema(name="A", route_prefix="/alice")], ) - updated_infos = 
override_deployment_info("default", {"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/alice" assert updated_info.version == "123" @@ -828,8 +848,8 @@ def test_override_route_prefix_2(self, info): ], ) - updated_infos = override_deployment_info("default", {"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/bob" assert updated_info.version == "123" @@ -842,8 +862,8 @@ def test_override_route_prefix_3(self, info): deployments=[DeploymentSchema(name="A", route_prefix="/alice")], ) - updated_infos = override_deployment_info("default", {"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/bob" assert updated_info.version == "123" @@ -860,8 +880,8 @@ def test_override_is_driver_deployment(self, info): ], ) - updated_infos = override_deployment_info("default", {"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/" assert updated_info.version == "123" @@ -880,8 +900,8 @@ def test_override_ray_actor_options_1(self, info): ], ) - updated_infos = override_deployment_info("default", {"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/" assert updated_info.version == "123" @@ -903,8 +923,8 @@ def test_override_ray_actor_options_2(self, info): ], ) - updated_infos = override_deployment_info("default", {"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/" assert updated_info.version == "123" @@ -929,8 +949,8 @@ def test_override_ray_actor_options_3(self, info): ], ) - updated_infos = override_deployment_info("default", {"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/" assert updated_info.version == "123" @@ -966,8 +986,8 @@ def test_override_ray_actor_options_4(self): ], ) - updated_infos = override_deployment_info("default", {"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/" assert updated_info.version == "123" @@ -1007,8 +1027,8 @@ def test_override_ray_actor_options_5(self): ], ) - updated_infos = override_deployment_info("default", 
{"default_A": info}, config) - updated_info = updated_infos["default_A"] + updated_infos = override_deployment_info("default", {"A": info}, config) + updated_info = updated_infos["A"] assert updated_info.app_name == "default" assert updated_info.route_prefix == "/" assert updated_info.version == "123" diff --git a/python/ray/serve/tests/test_autoscaling_policy.py b/python/ray/serve/tests/test_autoscaling_policy.py index 9a009c61daa0..9913e8881ee0 100644 --- a/python/ray/serve/tests/test_autoscaling_policy.py +++ b/python/ray/serve/tests/test_autoscaling_policy.py @@ -330,6 +330,39 @@ def __call__(self): assert len(get_running_replicas(controller, "A")) == 2 +def test_cold_start_time(serve_instance): + """Send 100 requests and check that we autoscale up, and then back down.""" + + @serve.deployment( + autoscaling_config={ + "min_replicas": 0, + "max_replicas": 1, + "look_back_period_s": 0.2, + }, + ) + class A: + def __call__(self): + return "hello" + + handle = serve.run(A.bind()) + + def check_running(): + assert serve.status().applications["default"].status == "RUNNING" + return True + + wait_for_condition(check_running) + + start = time.time() + result = ray.get(handle.remote()) + cold_start_time = time.time() - start + assert cold_start_time < 2 + print( + "Time taken for deployment at 0 replicas to serve first request:", + cold_start_time, + ) + assert result == "hello" + + def test_smoothing_factor_scale_up_from_0_replicas(): """Test that the smoothing factor is respected when scaling up from 0 replicas.""" @@ -961,7 +994,7 @@ def test_e2e_raise_min_replicas(serve_instance): assert check_autoscale_num_replicas(controller, "A") == 0 - handle = serve.get_deployment("default_A").get_handle() + handle = serve.get_deployment_handle("A", "default") [handle.remote() for _ in range(1)] print("Issued one request.") diff --git a/python/ray/serve/tests/test_built_application.py b/python/ray/serve/tests/test_built_application.py index 26f78d21bc2d..0d22c2662278 100644 --- a/python/ray/serve/tests/test_built_application.py +++ b/python/ray/serve/tests/test_built_application.py @@ -4,6 +4,7 @@ import ray from ray import serve from ray.serve.built_application import BuiltApplication +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME from ray._private.test_utils import wait_for_condition @@ -75,12 +76,10 @@ def deploy_and_check_responses( ) def check_all_deployed(): - try: - for deployment, response in zip(deployments, responses): - if ray.get(deployment.get_handle().remote()) != response: - return False - except Exception: - return False + for i in range(len(deployments)): + handle = serve.get_deployment_handle(deployments[i].name, f"app{i}") + if ray.get(handle.remote()) != responses[i]: + return False return True @@ -122,16 +121,16 @@ def test_mutual_handles(self, serve_instance): @serve.deployment(route_prefix=None) class MutualHandles: async def __init__(self, handle_name): - self.handle = serve.get_deployment(handle_name).get_handle() + self.handle = serve.get_deployment_handle(handle_name) async def __call__(self, echo: str): - return await self.handle.request_echo.remote(echo) + return await (await self.handle.request_echo.remote(echo)) async def request_echo(self, echo: str): return echo names = [] - for i in range(10): + for i in range(1, 10): names.append("a" * i) deployments = [] @@ -148,7 +147,10 @@ async def request_echo(self, echo: str): serve.run(BuiltApplication(deployments, deployments[-1].name), _blocking=True) for deployment in deployments: - assert 
(ray.get(deployment.get_handle().remote("hello"))) == "hello" + handle = serve.get_deployment_handle( + deployment.name, SERVE_DEFAULT_APP_NAME + ) + assert ray.get(handle.remote("hello")) == "hello" def test_decorated_deployments(self, serve_instance): """ @@ -256,8 +258,10 @@ def test_import_path_deployment_decorated(self, serve_instance): self.deploy_and_check_responses(deployments, responses) # Check that non-default decorated values were overwritten - assert serve.get_deployment("decorated_func").max_concurrent_queries != 17 - assert serve.get_deployment("decorated_clss").max_concurrent_queries != 17 + info, _ = serve_instance.get_deployment_info("decorated_func", "app0") + assert info.deployment_config.max_concurrent_queries != 17 + info, _ = serve_instance.get_deployment_info("decorated_clss", "app1") + assert info.deployment_config.max_concurrent_queries != 17 # Decorated function with non-default max_concurrent queries diff --git a/python/ray/serve/tests/test_controller.py b/python/ray/serve/tests/test_controller.py index 89f7836b2b59..d0b986d2c37f 100644 --- a/python/ray/serve/tests/test_controller.py +++ b/python/ray/serve/tests/test_controller.py @@ -7,18 +7,11 @@ from ray import serve from ray.serve._private.common import DeploymentInfo from ray.serve.generated.serve_pb2 import DeploymentRoute -from ray.serve._private.constants import ( - SERVE_DEFAULT_APP_NAME, - DEPLOYMENT_NAME_PREFIX_SEPARATOR, -) +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME from ray.serve.schema import ServeDeploySchema from ray.serve._private.common import ApplicationStatus -def get_deployment_name(name: str): - return f"{SERVE_DEFAULT_APP_NAME}{DEPLOYMENT_NAME_PREFIX_SEPARATOR}{name}" - - def test_redeploy_start_time(serve_instance): """Check that redeploying a deployment doesn't reset its start time.""" @@ -29,9 +22,8 @@ def test(_): return "1" serve.run(test.bind()) - deployment_name = get_deployment_name("test") deployment_route = DeploymentRoute.FromString( - ray.get(controller.get_deployment_info.remote(deployment_name)) + ray.get(controller.get_deployment_info.remote("test", SERVE_DEFAULT_APP_NAME)) ) deployment_info_1 = DeploymentInfo.from_proto(deployment_route.deployment_info) start_time_ms_1 = deployment_info_1.start_time_ms @@ -44,7 +36,7 @@ def test(_): serve.run(test.bind()) deployment_route = DeploymentRoute.FromString( - ray.get(controller.get_deployment_info.remote(deployment_name)) + ray.get(controller.get_deployment_info.remote("test", SERVE_DEFAULT_APP_NAME)) ) deployment_info_2 = DeploymentInfo.from_proto(deployment_route.deployment_info) start_time_ms_2 = deployment_info_2.start_time_ms diff --git a/python/ray/serve/tests/test_deploy_app.py b/python/ray/serve/tests/test_deploy_app.py index f79af74a53fd..c207b5598cac 100644 --- a/python/ray/serve/tests/test_deploy_app.py +++ b/python/ray/serve/tests/test_deploy_app.py @@ -614,9 +614,7 @@ def check_dead(): ] ) for actor in actors: - assert ( - "app1" not in actor["class_name"] and "app2" not in actor["class_name"] - ) + assert "app1" not in actor["name"] and "app2" not in actor["name"] return True # Deployments from app1 and app2 should be deleted @@ -699,25 +697,28 @@ def test_deploy_multi_app_deployments_removed(client: ServeControllerClient): # Deploy with pizza graph first client.deploy_apps(test_config) - def check_pizza(): + def check_app(deployments): # Check that the live deployments and actors are what we expect: exactly the # set of deployments in the pizza graph - actor_class_names = { - 
actor["class_name"] - for actor in list_actors(filters=[("state", "=", "ALIVE")]) + actor_names = { + actor["name"] for actor in list_actors(filters=[("state", "=", "ALIVE")]) } + expected_actor_name_prefixes = { + "SERVE_CONTROLLER_ACTOR:SERVE_PROXY_ACTOR", + "SERVE_CONTROLLER_ACTOR", + }.union({f"SERVE_REPLICA::app1_{deployment}" for deployment in deployments}) + for prefix in expected_actor_name_prefixes: + assert any(name.startswith(prefix) for name in actor_names) + deployment_replicas = set( ray.get(client._controller._all_running_replicas.remote()).keys() ) assert { - f"app1_{deployment}" for deployment in pizza_deployments + f"app1_{deployment}" for deployment in deployments } == deployment_replicas - assert {"HTTPProxyActor", "ServeController"}.union( - {f"ServeReplica:app1_{deployment}" for deployment in pizza_deployments} - ) == actor_class_names return True - wait_for_condition(check_pizza) + wait_for_condition(check_app, deployments=pizza_deployments) wait_for_condition( lambda: requests.post("http://localhost:8000/app1", json=["ADD", 2]).json() == "4 pizzas please!" @@ -727,25 +728,7 @@ def check_pizza(): test_config.applications[0].import_path = world_import_path client.deploy_apps(test_config) - def check_world(): - # Check that the live deployments and actors are what we expect: exactly the - # set of deployments in the world graph - actor_class_names = { - actor["class_name"] - for actor in list_actors(filters=[("state", "=", "ALIVE")]) - } - deployment_replicas = set( - ray.get(client._controller._all_running_replicas.remote()).keys() - ) - assert { - f"app1_{deployment}" for deployment in world_deployments - } == deployment_replicas - assert {"HTTPProxyActor", "ServeController"}.union( - {f"ServeReplica:app1_{deployment}" for deployment in world_deployments} - ) == actor_class_names - return True - - wait_for_condition(check_world) + wait_for_condition(check_app, deployments=world_deployments) wait_for_condition( lambda: requests.get("http://localhost:8000/app1").text == "wonderful world" ) diff --git a/python/ray/serve/tests/test_deployment_graph_build.py b/python/ray/serve/tests/test_deployment_graph_build.py index 5652e72c2e49..2a57fdad85f4 100644 --- a/python/ray/serve/tests/test_deployment_graph_build.py +++ b/python/ray/serve/tests/test_deployment_graph_build.py @@ -70,7 +70,7 @@ def test_simple_single_class(serve_instance): with _DAGNodeNameGenerator() as node_name_generator: serve_root_dag = ray_dag.apply_recursive( - lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator) + lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator, "") ) deployments = extract_deployments_from_serve_dag(serve_root_dag) assert len(deployments) == 1 @@ -92,7 +92,7 @@ async def test_func_class_with_class_method_dag(serve_instance): with _DAGNodeNameGenerator() as node_name_generator: serve_root_dag = ray_dag.apply_recursive( - lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator) + lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator, "") ) deployments = extract_deployments_from_serve_dag(serve_root_dag) serve_executor_root_dag = serve_root_dag.apply_recursive( @@ -116,7 +116,7 @@ def test_multi_instantiation_class_deployment_in_init_args(serve_instance): with _DAGNodeNameGenerator() as node_name_generator: serve_root_dag = ray_dag.apply_recursive( - lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator) + lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator, "") ) print(f"Serve DAG: 
\n{serve_root_dag}") deployments = extract_deployments_from_serve_dag(serve_root_dag) @@ -138,7 +138,7 @@ def test_shared_deployment_handle(serve_instance): with _DAGNodeNameGenerator() as node_name_generator: serve_root_dag = ray_dag.apply_recursive( - lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator) + lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator, "") ) print(f"Serve DAG: \n{serve_root_dag}") deployments = extract_deployments_from_serve_dag(serve_root_dag) @@ -161,7 +161,7 @@ def test_multi_instantiation_class_nested_deployment_arg(serve_instance): with _DAGNodeNameGenerator() as node_name_generator: serve_root_dag = ray_dag.apply_recursive( - lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator) + lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator, "") ) print(f"Serve DAG: \n{serve_root_dag}") deployments = extract_deployments_from_serve_dag(serve_root_dag) @@ -189,7 +189,7 @@ def test_get_pipeline_input_node(): ray_dag = combine.bind(1, 2) with _DAGNodeNameGenerator() as node_name_generator: serve_dag = ray_dag.apply_recursive( - lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator) + lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator, "") ) with pytest.raises( AssertionError, match="There should be one and only one InputNode" @@ -207,7 +207,9 @@ def test_get_pipeline_input_node(): ): with _DAGNodeNameGenerator() as node_name_generator: serve_dag = ray_dag.apply_recursive( - lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator) + lambda node: transform_ray_dag_to_serve_dag( + node, node_name_generator, "" + ) ) get_pipeline_input_node(serve_dag) @@ -216,7 +218,7 @@ def test_unique_name_reset_upon_build(serve_instance): ray_dag, _ = get_multi_instantiation_class_deployment_in_init_args_dag() with _DAGNodeNameGenerator() as node_name_generator: serve_root_dag = ray_dag.apply_recursive( - lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator) + lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator, "") ) deployments = extract_deployments_from_serve_dag(serve_root_dag) assert deployments[0].name == "Model" @@ -224,7 +226,7 @@ def test_unique_name_reset_upon_build(serve_instance): with _DAGNodeNameGenerator() as node_name_generator: serve_root_dag = ray_dag.apply_recursive( - lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator) + lambda node: transform_ray_dag_to_serve_dag(node, node_name_generator, "") ) deployments = extract_deployments_from_serve_dag(serve_root_dag) # Assert we don't keep increasing suffix id between build() calls diff --git a/python/ray/serve/tests/test_http_prefix_matching.py b/python/ray/serve/tests/test_http_prefix_matching.py index d42a53248adf..9acdee4c91ec 100644 --- a/python/ray/serve/tests/test_http_prefix_matching.py +++ b/python/ray/serve/tests/test_http_prefix_matching.py @@ -1,6 +1,6 @@ import pytest -from ray.serve._private.common import EndpointInfo +from ray.serve._private.common import EndpointInfo, EndpointTag from ray.serve._private.http_proxy import LongestPrefixRouter @@ -24,13 +24,17 @@ def mock_get_handle(name, *args, **kwargs): def test_no_match(mock_longest_prefix_router): router = mock_longest_prefix_router - router.update_routes({"endpoint": EndpointInfo(route="/hello", app_name="")}) + router.update_routes( + {EndpointTag("endpoint", "default"): EndpointInfo(route="/hello")} + ) assert router.match_route("/nonexistent") is None def 
test_default_route(mock_longest_prefix_router): router = mock_longest_prefix_router - router.update_routes({"endpoint": EndpointInfo(route="/endpoint", app_name="")}) + router.update_routes( + {EndpointTag("endpoint", "default"): EndpointInfo(route="/endpoint")} + ) assert router.match_route("/nonexistent") is None @@ -39,7 +43,7 @@ def test_default_route(mock_longest_prefix_router): [ route == "/endpoint", handle == "endpoint", - app_name == "", + app_name == "default", not app_is_cross_language, ] ) @@ -49,7 +53,7 @@ def test_trailing_slash(mock_longest_prefix_router): router = mock_longest_prefix_router router.update_routes( { - "endpoint": EndpointInfo(route="/test", app_name=""), + EndpointTag("endpoint", "default"): EndpointInfo(route="/test"), } ) @@ -58,7 +62,7 @@ def test_trailing_slash(mock_longest_prefix_router): router.update_routes( { - "endpoint": EndpointInfo(route="/test/", app_name=""), + EndpointTag("endpoint", "default"): EndpointInfo(route="/test/"), } ) @@ -69,9 +73,9 @@ def test_prefix_match(mock_longest_prefix_router): router = mock_longest_prefix_router router.update_routes( { - "endpoint1": EndpointInfo(route="/test/test2", app_name=""), - "endpoint2": EndpointInfo(route="/test", app_name=""), - "endpoint3": EndpointInfo(route="/", app_name=""), + EndpointTag("endpoint1", "default"): EndpointInfo(route="/test/test2"), + EndpointTag("endpoint2", "default"): EndpointInfo(route="/test"), + EndpointTag("endpoint3", "default"): EndpointInfo(route="/"), } ) @@ -97,7 +101,9 @@ def test_prefix_match(mock_longest_prefix_router): def test_update_routes(mock_longest_prefix_router): router = mock_longest_prefix_router - router.update_routes({"endpoint": EndpointInfo(route="/endpoint", app_name="app1")}) + router.update_routes( + {EndpointTag("endpoint", "app1"): EndpointInfo(route="/endpoint")} + ) route, handle, app_name, app_is_cross_language = router.match_route("/endpoint") assert all( @@ -111,9 +117,8 @@ def test_update_routes(mock_longest_prefix_router): router.update_routes( { - "endpoint2": EndpointInfo( + EndpointTag("endpoint2", "app2"): EndpointInfo( route="/endpoint2", - app_name="app2", app_is_cross_language=True, ) } diff --git a/python/ray/serve/tests/test_long_poll.py b/python/ray/serve/tests/test_long_poll.py index c5286808c666..76816b28cc5e 100644 --- a/python/ray/serve/tests/test_long_poll.py +++ b/python/ray/serve/tests/test_long_poll.py @@ -12,6 +12,7 @@ from ray.serve._private.long_poll import ( LongPollClient, LongPollHost, + LongPollState, UpdatedObject, LongPollNamespace, ) @@ -23,6 +24,32 @@ ) +def test_notifier_events_cleared_without_update(serve_instance): + """Verify that notifier events are not leaked. + + Previously, events were leaked if there were timeouts and no updates on the key. + """ + host = ray.remote(LongPollHost).remote( + listen_for_change_request_timeout_s=(0.1, 0.1) + ) + ray.get(host.notify_changed.remote("key_1", 999)) + + # Get an initial object snapshot for the key. + object_ref = host.listen_for_change.remote({"key_1": -1}) + result: Dict[str, UpdatedObject] = ray.get(object_ref) + assert set(result.keys()) == {"key_1"} + assert {v.object_snapshot for v in result.values()} == {999} + new_snapshot_ids = {k: v.snapshot_id for k, v in result.items()} + + # Listen for changes -- this should time out without an update. + object_ref = host.listen_for_change.remote(new_snapshot_ids) + assert ray.get(object_ref) == LongPollState.TIME_OUT + + # Verify that the `asyncio.Event` used for the `listen_for_change` task + # is removed. 
+ assert ray.get(host._get_num_notifier_events.remote()) == 0 + + def test_host_standalone(serve_instance): host = ray.remote(LongPollHost).remote() @@ -182,8 +209,8 @@ def test_listen_for_change_java(serve_instance): assert poll_result_1.updated_objects["key_1"].object_snapshot.decode() == "999" request_2 = {"keys_to_snapshot_ids": {"ROUTE_TABLE": -1}} endpoints: Dict[EndpointTag, EndpointInfo] = dict() - endpoints["deployment_name"] = EndpointInfo(route="/test/xlang/poll", app_name="") - endpoints["deployment_name1"] = EndpointInfo(route="/test/xlang/poll1", app_name="") + endpoints["deployment_name"] = EndpointInfo(route="/test/xlang/poll") + endpoints["deployment_name1"] = EndpointInfo(route="/test/xlang/poll1") ray.get(host.notify_changed.remote(LongPollNamespace.ROUTE_TABLE, endpoints)) object_ref_2 = host.listen_for_change_java.remote( LongPollRequest(**request_2).SerializeToString() diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py index ce323dd60967..4cabaf94a809 100644 --- a/python/ray/serve/tests/test_metrics.py +++ b/python/ray/serve/tests/test_metrics.py @@ -551,12 +551,13 @@ def check(): metrics_route, metrics_app_name = self._generate_metrics_summary( get_metric_dictionaries(metric_name) ) - assert metrics_route["app1_f"] == {"/app1"} - assert metrics_route["app2_g"] == {"/app2"} - assert metrics_route["app3_h"] == {"/app3"} - assert metrics_app_name["app1_f"] == "app1" - assert metrics_app_name["app2_g"] == "app2" - assert metrics_app_name["app3_h"] == "app3" + msg = f"Incorrect metrics for {metric_name}" + assert metrics_route["app1_f"] == {"/app1"}, msg + assert metrics_route["app2_g"] == {"/app2"}, msg + assert metrics_route["app3_h"] == {"/app3"}, msg + assert metrics_app_name["app1_f"] == "app1", msg + assert metrics_app_name["app2_g"] == "app2", msg + assert metrics_app_name["app3_h"] == "app3", msg def test_request_context_pass_for_handle_passing(self, serve_start_shutdown): """Test handle passing contexts between replicas""" diff --git a/python/ray/serve/tests/test_regression.py b/python/ray/serve/tests/test_regression.py index 4f5b002e5e26..f015ee5c3433 100644 --- a/python/ray/serve/tests/test_regression.py +++ b/python/ray/serve/tests/test_regression.py @@ -169,7 +169,7 @@ def f(): assert len(handle_cache) == initial_num_cached + 1 def sender_where_handle_goes_out_of_scope(): - f = get_global_client().get_handle("app_f", missing_ok=True, sync=True) + f = get_global_client().get_handle("f", "app", missing_ok=True, sync=True) assert f is handle assert ray.get(f.remote()) == "hi" diff --git a/python/ray/serve/tests/test_replica_scheduler.py b/python/ray/serve/tests/test_replica_scheduler.py index 559e61b2bc81..74a07ce8fb3b 100644 --- a/python/ray/serve/tests/test_replica_scheduler.py +++ b/python/ray/serve/tests/test_replica_scheduler.py @@ -9,6 +9,7 @@ import ray from ray._private.utils import get_or_create_event_loop +from ray.serve._private.common import DeploymentID from ray.serve._private.router import ( PowerOfTwoChoicesReplicaScheduler, Query, @@ -79,7 +80,7 @@ def send_query( def pow_2_scheduler() -> PowerOfTwoChoicesReplicaScheduler: s = PowerOfTwoChoicesReplicaScheduler( get_or_create_event_loop(), - "TEST_DEPLOYMENT", + DeploymentID("TEST_DEPLOYMENT", "TEST_APP"), ) # Update the RAY_SERVE_MULTIPLEXED_MODEL_ID_MATCHING_TIMEOUT_S diff --git a/python/ray/serve/tests/test_standalone_2.py b/python/ray/serve/tests/test_standalone_2.py index 45d732ea8b2a..4e3d06caeb7c 100644 --- 
a/python/ray/serve/tests/test_standalone_2.py +++ b/python/ray/serve/tests/test_standalone_2.py @@ -221,7 +221,7 @@ def run_graph(): # Import and build the graph graph = import_attr("test_config_files.pizza.serve_dag") - app = build(graph) + app = build(graph, SERVE_DEFAULT_APP_NAME) # Override options for each deployment for name in app.deployments: diff --git a/python/ray/tests/conftest.py b/python/ray/tests/conftest.py index 1172b97b39bb..3773b0981380 100644 --- a/python/ray/tests/conftest.py +++ b/python/ray/tests/conftest.py @@ -1237,16 +1237,6 @@ def set_runtime_env_plugin_schemas(request): del os.environ["RAY_RUNTIME_ENV_PLUGIN_SCHEMAS"] -@pytest.fixture(params=[True, False]) -def enable_syncer_test(request, monkeypatch): - with_syncer = request.param - monkeypatch.setenv("RAY_use_ray_syncer", "true" if with_syncer else "false") - ray._raylet.Config.initialize("") - yield - monkeypatch.delenv("RAY_use_ray_syncer") - ray._raylet.Config.initialize("") - - @pytest.fixture(scope="function") def temp_file(request): with tempfile.NamedTemporaryFile("r+b") as fp: diff --git a/python/ray/tests/test_actor_resources.py b/python/ray/tests/test_actor_resources.py index 6aa6f01f4fec..e501fb658b45 100644 --- a/python/ray/tests/test_actor_resources.py +++ b/python/ray/tests/test_actor_resources.py @@ -320,7 +320,7 @@ def get_location_and_ids(self): assert ready_ids == [] -def test_actors_and_tasks_with_gpus(enable_syncer_test, ray_start_cluster): +def test_actors_and_tasks_with_gpus(ray_start_cluster): cluster = ray_start_cluster num_nodes = 3 num_gpus_per_raylet = 2 diff --git a/python/ray/tests/test_failure_3.py b/python/ray/tests/test_failure_3.py index 4702ec56ccb6..1e2b3a56da1a 100644 --- a/python/ray/tests/test_failure_3.py +++ b/python/ray/tests/test_failure_3.py @@ -451,6 +451,118 @@ def task(): ) +@pytest.mark.skipif(sys.platform != "linux", reason="Only works on linux.") +def test_worker_cleans_up_child_procs_on_raylet_death(ray_start_cluster, tmp_path): + """ + CoreWorker kills its child processes if the raylet dies. + This test creates 20 leaked processes; 10 from a single actor task, and + 10 from distinct non-actor tasks. + + Once the raylet dies, the test verifies all leaked processes are cleaned up. 
+ """ + + output_file_path = tmp_path / "leaked_pids.json" + ray_start_cluster.add_node() + driver_script = f""" +import ray +import json +import multiprocessing +import shutil +import time +import os +import setproctitle + +def change_name_and_sleep(label: str, index: int) -> None: + proctitle = "child_proc_name_prefix_" + label + "_" + str(index) + setproctitle.setproctitle(proctitle) + time.sleep(1000) + +def create_child_proc(label, index): + proc = multiprocessing.Process( + target=change_name_and_sleep, + args=(label, index,), + daemon=True, + ) + proc.start() + return proc.pid + +@ray.remote +class LeakerActor: + def create_leaked_child_process(self, num_to_leak): + print("creating leaked process", os.getpid()) + + pids = [] + for index in range(num_to_leak): + pid = create_child_proc("actor", index) + pids.append(pid) + + return pids + +@ray.remote +def leaker_task(index): + print("Creating leaked process", os.getpid()) + return create_child_proc("task", index) + +num_to_leak_per_type = 10 +print('starting actors') +actor = LeakerActor.remote() +actor_leaked_pids = ray.get(actor.create_leaked_child_process.remote( + num_to_leak=num_to_leak_per_type, +)) + +task_leaked_pids = ray.get([ + leaker_task.remote(index) for index in range(num_to_leak_per_type) +]) +leaked_pids = actor_leaked_pids + task_leaked_pids + +final_file = "{output_file_path}" +tmp_file = final_file + ".tmp" +with open(tmp_file, "w") as f: + json.dump(leaked_pids, f) +shutil.move(tmp_file, final_file) + +while True: + print(os.getpid()) + time.sleep(1) + """ + + print("Running string as driver") + driver_proc = run_string_as_driver_nonblocking(driver_script) + + # Wait for the json file containing the child PIDS + # to be present. + print("Waiting for child pids json") + wait_for_condition( + condition_predictor=lambda: Path(output_file_path).exists(), + timeout=30, + ) + + # Load the PIDs of the child processes. + with open(output_file_path, "r") as f: + pids = json.load(f) + + # Validate all children of the worker processes are in a sleeping state. 
+ processes = [psutil.Process(pid) for pid in pids] + assert all([proc.status() == psutil.STATUS_SLEEPING for proc in processes]) + + # Obtain psutil handle for raylet process + raylet_proc = [p for p in psutil.process_iter() if p.name() == "raylet"] + assert len(raylet_proc) == 1 + raylet_proc = raylet_proc[0] + + # Kill the raylet process and reap the zombie + raylet_proc.kill() + raylet_proc.wait() + + print("Waiting for child procs to die") + wait_for_condition( + condition_predictor=lambda: all([not proc.is_running() for proc in processes]), + timeout=30, + ) + + driver_proc.kill() + + if __name__ == "__main__": import pytest diff --git a/python/ray/tests/test_metrics_agent.py b/python/ray/tests/test_metrics_agent.py index 06166b5c3941..d672e7e312b4 100644 --- a/python/ray/tests/test_metrics_agent.py +++ b/python/ray/tests/test_metrics_agent.py @@ -83,9 +83,6 @@ "ray_gcs_actors_count", ] -if not ray._raylet.Config.use_ray_syncer(): - _METRICS.append("ray_outbound_heartbeat_size_kb_sum") - # This list of metrics should be kept in sync with # ray/python/ray/autoscaler/_private/prom_metrics.py _AUTOSCALER_METRICS = [ diff --git a/python/ray/tests/test_output.py b/python/ray/tests/test_output.py index 367b61c5997a..849ffe2a4cfe 100644 --- a/python/ray/tests/test_output.py +++ b/python/ray/tests/test_output.py @@ -185,9 +185,7 @@ def foo(): ], indirect=True, ) -def test_autoscaler_warn_deadlock( - enable_syncer_test, ray_start_cluster_head_with_env_vars -): +def test_autoscaler_warn_deadlock(ray_start_cluster_head_with_env_vars): script = """ import ray import time diff --git a/python/ray/tests/test_placement_group_3.py b/python/ray/tests/test_placement_group_3.py index 5e580d3e8979..41876ac9f48d 100644 --- a/python/ray/tests/test_placement_group_3.py +++ b/python/ray/tests/test_placement_group_3.py @@ -458,18 +458,6 @@ def f(): assert len(gpu_ids_res) == 2 -@pytest.mark.parametrize( - "ray_start_cluster", - [ - generate_system_config_map( - use_ray_syncer=True, - ), - generate_system_config_map( - use_ray_syncer=False, - ), - ], - indirect=True, -) @pytest.mark.repeat(3) def test_actor_scheduling_not_block_with_placement_group(ray_start_cluster): """Tests the scheduling of lots of actors will not be blocked diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index d4e301e97334..3dda10edc686 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -104,7 +104,7 @@ py_test( size = "medium", main = "examples/pytorch/tune_cifar_torch_pbt_example.py", srcs = ["examples/pytorch/tune_cifar_torch_pbt_example.py"], - tags = ["team:ml", "exclusive", "pytorch", "tune"], + tags = ["team:ml", "exclusive", "pytorch", "tune", "no_new_storage"], deps = [":train_lib"], args = ["--smoke-test"] ) @@ -114,7 +114,7 @@ py_test( size = "small", main = "examples/pytorch/tune_torch_regression_example.py", srcs = ["examples/pytorch/tune_torch_regression_example.py"], - tags = ["team:ml", "exclusive", "tune"], + tags = ["team:ml", "exclusive", "tune", "no_new_storage"], deps = [":train_lib"], args = ["--smoke-test"] ) @@ -135,7 +135,7 @@ py_test( name = "horovod_cifar_pbt_example", size = "small", srcs = ["examples/horovod/horovod_cifar_pbt_example.py"], - tags = ["team:ml", "exlusive"], + tags = ["team:ml", "exlusive", "no_new_storage"], deps = [":train_lib"], args = ["--smoke-test"] ) @@ -144,7 +144,7 @@ py_test( name = "horovod_pytorch_example", size = "small", srcs = ["examples/horovod/horovod_pytorch_example.py"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", 
"no_new_storage"], deps = [":train_lib"], args = ["--num-epochs=1"] ) @@ -163,7 +163,7 @@ py_test ( size = "medium", srcs = ["examples/huggingface/huggingface_basic_language_modeling_example.py"], args = ["--smoke-test", "--num-epochs 3"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], deps = [":train_lib"] ) @@ -172,7 +172,7 @@ py_test( size = "medium", main = "examples/tf/tensorflow_regression_example.py", srcs = ["examples/tf/tensorflow_regression_example.py"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], deps = [":train_lib"], args = ["--smoke-test"] ) @@ -215,7 +215,7 @@ py_test( size = "medium", main = "examples/pytorch/torch_regression_example.py", srcs = ["examples/pytorch/torch_regression_example.py"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], deps = [":train_lib"], args = ["--smoke-test"] ) @@ -236,7 +236,7 @@ py_test( size = "medium", main = "examples/tf/tune_tensorflow_mnist_example.py", srcs = ["examples/tf/tune_tensorflow_mnist_example.py"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], deps = [":train_lib"], args = ["--smoke-test"] ) @@ -258,7 +258,7 @@ py_test( name = "test_backend", size = "large", srcs = ["tests/test_backend.py"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], deps = [":train_lib", ":conftest"] ) @@ -266,7 +266,7 @@ py_test( name = "test_base_trainer", size = "medium", srcs = ["tests/test_base_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib", ":conftest"] ) @@ -298,7 +298,7 @@ py_test( name = "test_data_parallel_trainer", size = "medium", srcs = ["tests/test_data_parallel_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -306,7 +306,7 @@ py_test( name = "test_data_parallel_trainer_checkpointing", size = "medium", srcs = ["tests/test_data_parallel_trainer_checkpointing.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -314,7 +314,7 @@ py_test( name = "test_examples", size = "large", srcs = ["tests/test_examples.py"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], deps = [":train_lib", ":conftest"] ) @@ -378,7 +378,7 @@ py_test( name = "test_horovod_trainer", size = "large", srcs = ["tests/test_horovod_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -402,7 +402,7 @@ py_test( name = "test_lightgbm_trainer", size = "medium", srcs = ["tests/test_lightgbm_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -490,7 +490,7 @@ py_test( name = "test_session", size = "small", srcs = ["tests/test_session.py"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], deps = [":train_lib", ":conftest"] ) @@ -506,7 +506,7 @@ py_test( name = "test_sklearn_trainer", size = "medium", srcs = ["tests/test_sklearn_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -514,7 +514,7 @@ py_test( name = 
"test_tensorflow_checkpoint", size = "small", srcs = ["tests/test_tensorflow_checkpoint.py"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], deps = [":train_lib"] ) @@ -530,7 +530,7 @@ py_test( name = "test_tensorflow_trainer", size = "medium", srcs = ["tests/test_tensorflow_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -562,7 +562,7 @@ py_test( name = "test_torch_trainer", size = "large", srcs = ["tests/test_torch_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -578,7 +578,7 @@ py_test( name = "test_training_iterator", size = "large", srcs = ["tests/test_training_iterator.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -602,7 +602,7 @@ py_test( name = "test_transformers_trainer_steps", size = "enormous", # TODO: Reduce this. srcs = ["tests/test_transformers_trainer_steps.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -610,7 +610,7 @@ py_test( name = "test_transformers_trainer", size = "large", srcs = ["tests/test_transformers_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -618,7 +618,7 @@ py_test( name = "test_tune", size = "large", srcs = ["tests/test_tune.py"], - tags = ["team:ml", "exclusive", "tune"], + tags = ["team:ml", "exclusive", "tune", "no_new_storage"], deps = [":train_lib", ":conftest"] ) @@ -634,7 +634,7 @@ py_test( name = "test_e2e_wandb_integration", size = "small", srcs = ["tests/test_e2e_wandb_integration.py"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], deps = [":train_lib"] ) @@ -658,7 +658,7 @@ py_test( name = "test_xgboost_trainer", size = "medium", srcs = ["tests/test_xgboost_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air"], + tags = ["team:ml", "exclusive", "ray_air", "no_new_storage"], deps = [":train_lib"] ) @@ -670,6 +670,7 @@ py_test( "exclusive", "ray_air", "team:ml", + "no_new_storage", ], deps = [":train_lib", ":conftest"], ) diff --git a/python/ray/train/__init__.py b/python/ray/train/__init__.py index 5fc80812a8e3..84ccf0b28a08 100644 --- a/python/ray/train/__init__.py +++ b/python/ray/train/__init__.py @@ -1,3 +1,15 @@ +# Try import ray[train] core requirements (defined in setup.py) +try: + import pandas # noqa: F401 + import requests # noqa: F401 + import pyarrow # noqa: F401 +except ImportError as exc: + raise ImportError( + "Can't import ray.train as some dependencies are missing. " + 'Run `pip install "ray[train]"` to fix.' 
+ ) from exc + + from ray._private.usage import usage_lib from ray.train._internal.data_config import DataConfig from ray.train._internal.session import get_checkpoint, get_dataset_shard, report diff --git a/python/ray/train/_internal/session.py b/python/ray/train/_internal/session.py index 49e4aa7ebace..9d3fe2a04665 100644 --- a/python/ray/train/_internal/session.py +++ b/python/ray/train/_internal/session.py @@ -171,8 +171,7 @@ def noop(x): if _use_storage_context(): assert storage - logger.debug(f"StorageContext on TRAIN WORKER {world_rank}:\n{storage}") - storage._check_validation_file() + logger.debug(f"StorageContext on SESSION {world_rank}:\n{storage}") # Change the working directory to the local trial directory. # -> All workers on the same node share a working directory. diff --git a/python/ray/train/_internal/storage.py b/python/ray/train/_internal/storage.py index c80f6cf645ed..5c2a8d1725dd 100644 --- a/python/ray/train/_internal/storage.py +++ b/python/ray/train/_internal/storage.py @@ -169,7 +169,7 @@ def _download_from_fs_path( else: _pyarrow_fs_copy_files(fs_path, local_path, source_filesystem=fs) except Exception as e: - # Clean up the directory if downloading was unsuccessful. + # Clean up the directory if downloading was unsuccessful if not exists_before: shutil.rmtree(local_path, ignore_errors=True) raise e @@ -530,9 +530,9 @@ def persist_current_checkpoint(self, checkpoint: "Checkpoint") -> "Checkpoint": "Current" is defined by the `current_checkpoint_index` attribute of the storage context. - This method copies the checkpoint files to the storage location, - drops a marker at the storage path to indicate that the checkpoint - is completely uploaded, then deletes the original checkpoint directory. + This method copies the checkpoint files to the storage location. + It's up to the user to delete the original checkpoint files if desired. + For example, the original directory is typically a local temp directory. Args: @@ -553,6 +553,13 @@ def persist_current_checkpoint(self, checkpoint: "Checkpoint") -> "Checkpoint": dest_fs=self.storage_filesystem, ) ) + + # Raise an error if the storage path is not accessible when + # attempting to upload a checkpoint from a remote worker. + # Ex: If storage_path is a local path, then a validation marker + # will only exist on the head node but not the worker nodes. + self._check_validation_file() + self.storage_filesystem.create_dir(self.checkpoint_fs_path) _pyarrow_fs_copy_files( source=checkpoint.path, @@ -561,17 +568,12 @@ def persist_current_checkpoint(self, checkpoint: "Checkpoint") -> "Checkpoint": destination_filesystem=self.storage_filesystem, ) - # Delete local checkpoint files. - # TODO(justinvyu): What if checkpoint.path == self.checkpoint_fs_path? - # TODO(justinvyu): What if users don't want to delete the local checkpoint? 
- checkpoint.filesystem.delete_dir(checkpoint.path) - - uploaded_checkpoint = Checkpoint( + persisted_checkpoint = Checkpoint( filesystem=self.storage_filesystem, path=self.checkpoint_fs_path, ) - logger.debug(f"Checkpoint successfully created at: {uploaded_checkpoint}") - return uploaded_checkpoint + logger.debug(f"Checkpoint successfully created at: {persisted_checkpoint}") + return persisted_checkpoint @property def experiment_fs_path(self) -> str: diff --git a/python/ray/train/data_parallel_trainer.py b/python/ray/train/data_parallel_trainer.py index 741f3e50c226..6acf65cd3987 100644 --- a/python/ray/train/data_parallel_trainer.py +++ b/python/ray/train/data_parallel_trainer.py @@ -13,6 +13,7 @@ from ray.air.constants import MODEL_KEY, PREPROCESSOR_KEY, LAZY_CHECKPOINT_MARKER_FILE from ray.air._internal.checkpoint_manager import _TrackedCheckpoint from ray.train import BackendConfig, TrainingIterator +from ray.train._checkpoint import Checkpoint as NewCheckpoint from ray.train._internal import session from ray.train._internal.session import _TrainingResult, get_session from ray.train._internal.backend_executor import BackendExecutor, TrialInfo @@ -428,15 +429,53 @@ def _report(self, training_iterator: TrainingIterator) -> None: # TODO(ml-team): add ability to report results from multiple workers. first_worker_result = results[0] if _use_storage_context(): - assert isinstance(first_worker_result, _TrainingResult) + assert all(isinstance(result, _TrainingResult) for result in results) + + tune_session = get_session() + + # Check if any workers reported a checkpoint. + # If so, report a checkpoint pointing to the persisted location + # to Tune for book-keeping. + # NOTE: This removes the restriction for any individual worker + # (ex: global rank 0 worker) from needing to report a checkpoint. + # All workers reported a checkpoint to the same fs path, so there's + # no need to report multiple checkpoints to Tune. + worker_checkpoints = [ + result.checkpoint + for result in results + if result.checkpoint is not None + ] + at_least_one_reported_checkpoint = len(worker_checkpoints) > 0 + # Make sure that all workers uploaded to the same location. + assert all( + checkpoint.path == tune_session.storage.checkpoint_fs_path + for checkpoint in worker_checkpoints + ) + + checkpoint = ( + NewCheckpoint( + filesystem=tune_session.storage.storage_filesystem, + # NOTE: The checkpoint index has not been incremented yet + # at this point, which is why `checkpoint_fs_path` points + # to the most recent checkpoint. + path=tune_session.storage.checkpoint_fs_path, + ) + if at_least_one_reported_checkpoint + else None + ) + tracked_training_result = _TrainingResult( + checkpoint=checkpoint, + metrics=first_worker_result.metrics, + ) + logger.debug( "Report (metrics, checkpoint) to the Tune session:\n" - f" metrics={first_worker_result.metrics}\n" - f" checkpoint={first_worker_result.checkpoint}" + f" metrics={tracked_training_result.metrics}\n" + f" checkpoint={tracked_training_result.checkpoint}" ) + # Report the metrics and checkpoint to Tune. 
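From the training-function side, the pattern these changes enable looks as follows: each worker writes its checkpoint files into a local temporary directory, reports them, and lets the context manager clean the directory up, since Train now copies the files to the storage path without deleting the originals; a rank may also skip the checkpoint entirely, because the trainer reports a single checkpoint to Tune as long as at least one worker checkpointed. A minimal sketch, assuming the `Checkpoint` import path used elsewhere in this patch and a hypothetical `no_checkpoint_ranks` config key mirroring the test:

import os
import pickle
import tempfile

from ray import train
from ray.train._checkpoint import Checkpoint  # import path used in this patch


def train_fn(config):
    for i in range(config.get("num_iterations", 5)):
        metrics = {"iter": i}

        # Ranks listed in the (hypothetical) `no_checkpoint_ranks` key only
        # report metrics; they are not required to report a checkpoint.
        if train.get_context().get_world_rank() in config.get("no_checkpoint_ranks", []):
            train.report(metrics)
            continue

        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, "checkpoint.pkl"), "wb") as f:
                pickle.dump({"iter": i}, f)
            # The checkpoint files are copied to the storage path; the local
            # directory is left untouched and is cleaned up by this context manager.
            train.report(metrics, checkpoint=Checkpoint.from_directory(tmpdir))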
- tune_session = get_session() - tune_session._report_training_result(first_worker_result) + tune_session._report_training_result(tracked_training_result) else: tune.report(**first_worker_result) diff --git a/python/ray/train/examples/pytorch/torch_linear_example.py b/python/ray/train/examples/pytorch/torch_linear_example.py index 2f92edb010e1..8db431205165 100644 --- a/python/ray/train/examples/pytorch/torch_linear_example.py +++ b/python/ray/train/examples/pytorch/torch_linear_example.py @@ -1,11 +1,15 @@ import argparse +import os +import tempfile import numpy as np import torch import torch.nn as nn import ray.train as train from ray.train import RunConfig, ScalingConfig +from ray.train._checkpoint import Checkpoint from ray.train.torch import TorchTrainer, LegacyTorchCheckpoint +from ray.train._internal.storage import _use_storage_context class LinearDataset(torch.utils.data.Dataset): @@ -79,9 +83,16 @@ def train_func(config): state_dict, loss = validate_epoch(validation_loader, model, loss_fn) result = dict(loss=loss) results.append(result) - train.report( - result, checkpoint=LegacyTorchCheckpoint.from_state_dict(state_dict) - ) + + if _use_storage_context(): + with tempfile.TemporaryDirectory() as tmpdir: + torch.save(state_dict, os.path.join(tmpdir, "model.pt")) + train.report(result, checkpoint=Checkpoint.from_directory(tmpdir)) + else: + # TODO(justinvyu): Temporary for CI to pass during the API transition. + train.report( + result, checkpoint=LegacyTorchCheckpoint.from_state_dict(state_dict) + ) return results diff --git a/python/ray/train/lightgbm/lightgbm_trainer.py b/python/ray/train/lightgbm/lightgbm_trainer.py index fa5a2b1278f1..db6a98498f2e 100644 --- a/python/ray/train/lightgbm/lightgbm_trainer.py +++ b/python/ray/train/lightgbm/lightgbm_trainer.py @@ -63,7 +63,7 @@ class LightGBMTrainer(GBDTTrainer): ... Args: - datasets: Datasets to use for training and validation. Must include a + datasets: The Ray Datasets to use for training and validation. Must include a "train" key denoting the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if @@ -77,7 +77,7 @@ class LightGBMTrainer(GBDTTrainer): dmatrix_params: Dict of ``dataset name:dict of kwargs`` passed to respective :class:`xgboost_ray.RayDMatrix` initializations, which in turn are passed to ``lightgbm.Dataset`` objects created on each worker. For example, this - can be used to add sample weights with the ``weights`` parameter. + can be used to add sample weights with the ``weight`` parameter. num_boost_round: Target number of boosting iterations (trees in the model). 
Note that unlike in ``lightgbm.train``, this is the target number of trees, meaning that if you set ``num_boost_round=10`` and pass a model diff --git a/python/ray/train/tests/test_new_persistence.py b/python/ray/train/tests/test_new_persistence.py index 658a3c51a2e6..f6d00d6ad587 100644 --- a/python/ray/train/tests/test_new_persistence.py +++ b/python/ray/train/tests/test_new_persistence.py @@ -3,9 +3,10 @@ from pathlib import Path import pickle import pytest +import re import tempfile import time -from typing import Optional, Tuple +from typing import List, Optional, Tuple import pyarrow.fs @@ -26,7 +27,7 @@ _SCORE_KEY = "score" NUM_ITERATIONS = 6 # == num_checkpoints == num_artifacts NUM_TRIALS = 2 -NUM_WORKERS = 2 +NUM_WORKERS = 3 @contextmanager @@ -137,6 +138,18 @@ def _get_checkpoint_index(checkpoint_dir_name: str) -> int: return int(checkpoint_dir_name.split("_")[-1]) +def _create_checkpoint_shard_filename(rank_str: str) -> str: + return f"checkpoint_shard-rank={rank_str}.pkl" + + +def _get_checkpoint_shard_rank(checkpoint_shard_filename: str) -> int: + """Get the checkpoint shard rank from the filename.""" + pattern = _create_checkpoint_shard_filename(r"(\d+)") + match = re.search(pattern, checkpoint_shard_filename) + assert match + return int(match.group(1)) + + def train_fn(config): in_trainer = config.get("in_trainer", False) if in_trainer: @@ -165,26 +178,33 @@ def train_fn(config): for i in range(start, config.get("num_iterations", 5)): time.sleep(0.25) - temp_dir = tempfile.mkdtemp() - with open(os.path.join(temp_dir, "checkpoint.pkl"), "wb") as f: - pickle.dump({"iter": i}, f) + metrics = {"iter": i, _SCORE_KEY: i} - artifact_file_name = f"artifact-iter={i}.txt" - if in_trainer: - rank = train.get_context().get_world_rank() - artifact_file_name = f"artifact-rank={rank}-iter={i}.txt" + if in_trainer and train.get_context().get_world_rank() in config.get( + "no_checkpoint_ranks", [] + ): + train.report(metrics) + else: + with tempfile.TemporaryDirectory() as temp_dir: + with open(os.path.join(temp_dir, "checkpoint.pkl"), "wb") as f: + pickle.dump({"iter": i}, f) - checkpoint_file_name = f"checkpoint_shard-rank={rank}.pkl" - with open(os.path.join(temp_dir, checkpoint_file_name), "wb") as f: - pickle.dump({"iter": i}, f) + # artifact_file_name = f"artifact-iter={i}.txt" + if in_trainer: + rank = train.get_context().get_world_rank() + # artifact_file_name = f"artifact-rank={rank}-iter={i}.txt" - with open(artifact_file_name, "w") as f: - f.write(f"{i}") + checkpoint_file_name = _create_checkpoint_shard_filename(str(rank)) + with open(os.path.join(temp_dir, checkpoint_file_name), "wb") as f: + pickle.dump({"iter": i}, f) + + # with open(artifact_file_name, "w") as f: + # f.write(f"{i}") + + train.report(metrics, checkpoint=NewCheckpoint.from_directory(temp_dir)) + # `train.report` should not have deleted this! + assert os.path.exists(temp_dir) - train.report( - {"iter": i, _SCORE_KEY: i}, - checkpoint=NewCheckpoint.from_directory(temp_dir), - ) if i in config.get("fail_iters", []): raise RuntimeError(f"Failing on iter={i}!!") @@ -274,6 +294,7 @@ def _assert_storage_contents( checkpoint_config: train.CheckpointConfig, trainable_name: str, test_trainer: bool, + no_checkpoint_ranks: List[int] = None, ): # Second, inspect the contents of the storage path storage_path_ls = list(local_inspect_dir.glob("*")) @@ -322,10 +343,13 @@ def _assert_storage_contents( ) if test_trainer: # 1 checkpoint shard per worker. 
- assert ( - len(list(checkpoint_dir.glob("checkpoint_shard-*.pkl"))) - == NUM_WORKERS - ) + # Unless the worker did not report a checkpoint (no_checkpoint_ranks). + assert { + _get_checkpoint_shard_rank(checkpoint_shard.name) + for checkpoint_shard in checkpoint_dir.glob( + "checkpoint_shard-*.pkl" + ) + } == {i for i in range(NUM_WORKERS) if i not in no_checkpoint_ranks} # NOTE: These next 2 are technically synced by the driver. assert len(list(trial_dir.glob(EXPR_RESULT_FILE))) == 1 @@ -493,6 +517,8 @@ def test_trainer( monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(LOCAL_CACHE_DIR)) exp_name = "trainer_new_persistence" + no_checkpoint_ranks = [0] + with _resolve_storage_type(storage_path_type, tmp_path) as ( storage_path, storage_filesystem, @@ -503,8 +529,12 @@ def test_trainer( "in_trainer": True, "num_iterations": NUM_ITERATIONS, "fail_iters": [2, 4], + # TODO(justinvyu): This should be separated into its own test once + # CI has been fully migrated. + # Test that global rank 0 is not required to checkpoint. + "no_checkpoint_ranks": no_checkpoint_ranks, }, - scaling_config=train.ScalingConfig(num_workers=2), + scaling_config=train.ScalingConfig(num_workers=NUM_WORKERS), run_config=train.RunConfig( storage_path=storage_path, storage_filesystem=storage_filesystem, @@ -554,6 +584,7 @@ def test_trainer( checkpoint_config, trainable_name="DataParallelTrainer", test_trainer=True, + no_checkpoint_ranks=no_checkpoint_ranks, ) diff --git a/python/ray/train/xgboost/xgboost_trainer.py b/python/ray/train/xgboost/xgboost_trainer.py index 95cb63c9b43f..d93ad7216fe9 100644 --- a/python/ray/train/xgboost/xgboost_trainer.py +++ b/python/ray/train/xgboost/xgboost_trainer.py @@ -58,7 +58,7 @@ class XGBoostTrainer(GBDTTrainer): ... Args: - datasets: Datasets to use for training and validation. Must include a + datasets: The Ray Datasets to use for training and validation. Must include a "train" key denoting the training dataset. If a ``preprocessor`` is provided and has not already been fit, it will be fit on the training dataset. All datasets will be transformed by the ``preprocessor`` if @@ -72,7 +72,7 @@ class XGBoostTrainer(GBDTTrainer): dmatrix_params: Dict of ``dataset name:dict of kwargs`` passed to respective :class:`xgboost_ray.RayDMatrix` initializations, which in turn are passed to ``xgboost.DMatrix`` objects created on each worker. For example, this can - be used to add sample weights with the ``weights`` parameter. + be used to add sample weights with the ``weight`` parameter. num_boost_round: Target number of boosting iterations (trees in the model). 
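To make the corrected ``weight`` spelling concrete, here is a hedged sketch of passing per-sample weights through ``dmatrix_params``. The toy dataset, the ``"w"`` column, the params dict, and the assumption that ``weight`` may reference a dataset column are illustrative; only ``dmatrix_params`` and ``num_boost_round`` come from the docstring above.

import ray
from ray.train import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

# Hypothetical toy dataset with a "w" column holding per-sample weights.
train_ds = ray.data.from_items(
    [{"x": float(i), "w": 1.0 + (i % 3), "y": i % 2} for i in range(32)]
)

trainer = XGBoostTrainer(
    scaling_config=ScalingConfig(num_workers=2),
    label_column="y",
    params={"objective": "binary:logistic"},
    datasets={"train": train_ds},
    # Forwarded to xgboost_ray.RayDMatrix: `weight` (singular) is the
    # sample-weight parameter the docstring refers to; here it is assumed
    # to name a column of the training dataset.
    dmatrix_params={"train": {"weight": "w"}},
    num_boost_round=5,
)
result = trainer.fit()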
Note that unlike in ``xgboost.train``, this is the target number of trees, meaning that if you set ``num_boost_round=10`` and pass a model diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD index ee7028e6ec7d..83261ed95f18 100644 --- a/python/ray/tune/BUILD +++ b/python/ray/tune/BUILD @@ -24,7 +24,7 @@ doctest( "utils/trainable.py", ] ), - tags = ["team:ml"] + tags = ["team:ml", "no_new_storage"] ) py_library( @@ -50,7 +50,7 @@ py_test( name = "test_actor_reuse", size = "large", srcs = ["tests/test_actor_reuse.py"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], deps = [":tune_lib", ":conftest"], ) @@ -83,7 +83,7 @@ py_test( size = "large", srcs = ["tests/test_client.py"], deps = [":tune_lib"], - tags = ["team:ml", "client", "exclusive"] + tags = ["team:ml", "client", "exclusive", "no_new_storage"] ) py_test( @@ -100,7 +100,7 @@ py_test( srcs = ["tests/test_cluster_searcher.py"], data = ["tests/_test_cluster_interrupt_searcher.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -108,7 +108,7 @@ py_test( size = "medium", srcs = ["tests/test_commands.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -116,7 +116,7 @@ py_test( size = "medium", srcs = ["tests/test_convergence.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -124,7 +124,7 @@ py_test( size = "small", srcs = ["tests/test_dependency.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -132,7 +132,7 @@ py_test( size = "small", srcs = ["tests/test_experiment.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -140,7 +140,7 @@ py_test( size = "medium", srcs = ["tests/test_experiment_analysis.py"], deps = [":tune_lib", ":conftest"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -148,7 +148,7 @@ py_test( size = "medium", srcs = ["tests/test_experiment_analysis_mem.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -156,7 +156,7 @@ py_test( size = "medium", srcs = ["tests/test_function_api.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -172,7 +172,7 @@ py_test( size = "small", srcs = ["tests/test_integration_pytorch_lightning.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -191,7 +191,7 @@ py_test( "tests/_test_multi_tenancy_run.py" ], deps = [":tune_lib"], - tags = ["team:ml"], + tags = ["team:ml", "no_new_storage"], ) py_test( @@ -207,7 +207,7 @@ py_test( size = "medium", srcs = ["tests/test_progress_reporter.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -223,7 +223,7 @@ py_test( size = "medium", srcs = ["tests/test_run_experiment.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -231,7 +231,7 @@ py_test( size = "medium", srcs = ["tests/test_remote.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", 
"no_new_storage"], ) py_test( @@ -239,7 +239,7 @@ py_test( size = "medium", srcs = ["tests/test_result_grid.py"], deps = [":tune_lib", ":conftest"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -247,7 +247,7 @@ py_test( size = "medium", srcs = ["tests/test_warnings.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -255,7 +255,7 @@ py_test( size = "large", srcs = ["tests/test_sample.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "medium_instance"], + tags = ["team:ml", "exclusive", "medium_instance", "no_new_storage"], ) py_test( @@ -271,7 +271,7 @@ py_test( size = "large", srcs = ["tests/test_searchers.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "medium_instance"], + tags = ["team:ml", "exclusive", "medium_instance", "no_new_storage"], ) py_test( @@ -311,7 +311,7 @@ py_test( size = "medium", srcs = ["tests/test_syncer.py"], deps = [":tune_lib", ":conftest"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -319,7 +319,7 @@ py_test( size = "medium", srcs = ["tests/test_syncer_callback.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -327,7 +327,7 @@ py_test( size = "medium", srcs = ["tests/test_trainable.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -335,7 +335,7 @@ py_test( size = "small", srcs = ["tests/test_trainable_util.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -343,7 +343,7 @@ py_test( size = "medium", srcs = ["tests/test_trial_relative_logdir.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -359,7 +359,7 @@ py_test( size = "large", srcs = ["tests/test_trial_scheduler.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "medium_instance"], + tags = ["team:ml", "exclusive", "medium_instance", "no_new_storage"], ) py_test( @@ -367,7 +367,7 @@ py_test( size = "large", srcs = ["tests/test_trial_scheduler_pbt.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "medium_instance"], + tags = ["team:ml", "exclusive", "medium_instance", "no_new_storage"], ) py_test( @@ -375,7 +375,7 @@ py_test( size = "small", srcs = ["tests/test_trial_scheduler_resource_changing.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -383,7 +383,7 @@ py_test( size = "large", srcs = ["tests/test_tune_restore_warm_start.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -399,7 +399,7 @@ py_test( size = "small", srcs = ["tests/test_tune_save_restore.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -407,7 +407,7 @@ py_test( size = "medium", srcs = ["tests/test_tune_server.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -415,7 +415,7 @@ py_test( size = "large", srcs = ["tests/test_tuner.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "medium_instance"], + tags = ["team:ml", "exclusive", "medium_instance", "no_new_storage"], ) 
py_test( @@ -423,7 +423,7 @@ py_test( size = "large", srcs = ["tests/test_tuner_restore.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"], + tags = ["team:ml", "exclusive", "no_new_storage"], ) py_test( @@ -445,7 +445,7 @@ py_test( size = "small", srcs = ["tests/example.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example", "no_main"], + tags = ["team:ml", "exclusive", "example", "no_main", "no_new_storage"], ) # Todo: Ensure MPLBACKEND=Agg @@ -469,7 +469,7 @@ py_test( size = "small", srcs = ["tests/execution/test_actor_caching.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"] + tags = ["team:ml", "exclusive", "no_new_storage"] ) py_test( @@ -477,7 +477,7 @@ py_test( size = "large", srcs = ["tests/execution/test_controller_callback_integration.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"] + tags = ["team:ml", "exclusive", "no_new_storage"] ) py_test( @@ -485,7 +485,7 @@ py_test( size = "large", srcs = ["tests/execution/test_controller_checkpointing_integration.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"] + tags = ["team:ml", "exclusive", "no_new_storage"] ) py_test( @@ -493,7 +493,7 @@ py_test( size = "large", srcs = ["tests/execution/test_controller_control_integration.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"] + tags = ["team:ml", "exclusive", "no_new_storage"] ) py_test( @@ -501,7 +501,7 @@ py_test( size = "large", srcs = ["tests/execution/test_controller_errors_integration.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"] + tags = ["team:ml", "exclusive", "no_new_storage"] ) py_test( @@ -509,7 +509,7 @@ py_test( size = "large", srcs = ["tests/execution/test_controller_resources_integration.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"] + tags = ["team:ml", "exclusive", "no_new_storage"] ) py_test( @@ -517,7 +517,7 @@ py_test( size = "large", srcs = ["tests/execution/test_controller_search_alg_integration.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive"] + tags = ["team:ml", "exclusive", "no_new_storage"] ) # -------------------------------------------------------------------- @@ -565,7 +565,7 @@ py_test( size = "medium", srcs = ["examples/bohb_example.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example", "exclude_new_execution"] + tags = ["team:ml", "exclusive", "example", "no_new_storage"] ) py_test( @@ -582,7 +582,7 @@ py_test( size = "medium", srcs = ["examples/cifar10_pytorch.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example", "pytorch"], + tags = ["team:ml", "exclusive", "example", "pytorch", "no_new_storage"], args = ["--smoke-test"] ) @@ -591,7 +591,7 @@ py_test( size = "small", srcs = ["examples/custom_func_checkpointing.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--smoke-test"] ) @@ -618,7 +618,7 @@ py_test( size = "small", srcs = ["examples/hyperband_function_example.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--smoke-test"] ) @@ -636,7 +636,7 @@ py_test( size = "small", srcs = ["examples/lightgbm_example.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"] + tags = ["team:ml", "exclusive", "example", "no_new_storage"] ) py_test( @@ -645,7 +645,7 @@ py_test( main = "examples/lightgbm_example.py", srcs = ["examples/lightgbm_example.py"], deps = [":tune_lib"], - tags = ["team:ml", 
"exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--use-cv"] ) @@ -689,7 +689,7 @@ py_test( size = "medium", srcs = ["examples/mnist_pytorch_lightning.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example", "pytorch"], + tags = ["team:ml", "exclusive", "example", "pytorch", "no_new_storage"], args = ["--smoke-test"] ) @@ -698,7 +698,7 @@ py_test( size = "medium", srcs = ["examples/mnist_ptl_mini.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example", "pytorch"], + tags = ["team:ml", "exclusive", "example", "pytorch", "no_new_storage"], args = ["--smoke-test"] ) @@ -761,7 +761,7 @@ py_test( size = "small", srcs = ["examples/pbt_convnet_example.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--smoke-test"] ) @@ -770,7 +770,7 @@ py_test( size = "small", srcs = ["examples/pbt_convnet_function_example.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--smoke-test"] ) @@ -779,7 +779,7 @@ py_test( size = "medium", srcs = ["examples/pbt_dcgan_mnist/pbt_dcgan_mnist_func.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--smoke-test"] ) @@ -788,7 +788,7 @@ py_test( size = "medium", srcs = ["examples/pbt_dcgan_mnist/pbt_dcgan_mnist_trainable.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--smoke-test"] ) @@ -797,7 +797,7 @@ py_test( size = "small", srcs = ["examples/pbt_example.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--smoke-test"] ) @@ -834,7 +834,7 @@ py_test( size = "small", srcs = ["examples/pbt_transformers/pbt_transformers.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--smoke-test"] ) @@ -903,7 +903,7 @@ py_test( size = "medium", srcs = ["examples/tune_mnist_keras.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--smoke-test"] ) @@ -912,7 +912,7 @@ py_test( size = "small", srcs = ["examples/xgboost_example.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"] + tags = ["team:ml", "exclusive", "example", "no_new_storage"] ) py_test( @@ -921,7 +921,7 @@ py_test( main = "examples/xgboost_example.py", srcs = ["examples/xgboost_example.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--use-cv"] ) @@ -930,7 +930,7 @@ py_test( size = "large", srcs = ["examples/xgboost_dynamic_resources_example.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "example"], + tags = ["team:ml", "exclusive", "example", "no_new_storage"], args = ["--test"] ) diff --git a/python/ray/tune/__init__.py b/python/ray/tune/__init__.py index a635ad679dcb..8ec4cf6b98f7 100644 --- a/python/ray/tune/__init__.py +++ b/python/ray/tune/__init__.py @@ -1,3 +1,15 @@ +# Try import ray[tune] core requirements (defined in setup.py) +try: + import pandas # noqa: F401 + import requests # noqa: F401 + import pyarrow # noqa: F401 +except ImportError as 
exc: + raise ImportError( + "Can't import ray.tune as some dependencies are missing. " + 'Run `pip install "ray[tune]"` to fix.' + ) from exc + + from ray.tune.error import TuneError from ray.tune.tune import run_experiments, run from ray.tune.syncer import SyncConfig diff --git a/python/ray/tune/callback.py b/python/ray/tune/callback.py index 6ef0b9821f03..719c87366a84 100644 --- a/python/ray/tune/callback.py +++ b/python/ray/tune/callback.py @@ -8,7 +8,7 @@ from ray.tune.utils.util import _atomic_save, _load_newest_checkpoint if TYPE_CHECKING: - from ray.air._internal.checkpoint_manager import _TrackedCheckpoint + from ray.train._checkpoint import Checkpoint from ray.tune.experiment import Trial from ray.tune.stopper import Stopper @@ -282,7 +282,7 @@ def on_checkpoint( iteration: int, trials: List["Trial"], trial: "Trial", - checkpoint: "_TrackedCheckpoint", + checkpoint: "Checkpoint", **info, ): """Called after a trial saved a checkpoint with Tune. diff --git a/python/ray/tune/execution/class_cache.py b/python/ray/tune/execution/class_cache.py index 8f81f11a8e38..f53a0dfb4f67 100644 --- a/python/ray/tune/execution/class_cache.py +++ b/python/ray/tune/execution/class_cache.py @@ -5,6 +5,7 @@ DISABLE_LAZY_CHECKPOINTING_ENV, COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV, ) +from ray.train.constants import RAY_AIR_NEW_PERSISTENCE_MODE DEFAULT_ENV_VARS = { # https://github.com/ray-project/ray/issues/28197 @@ -13,6 +14,7 @@ ENV_VARS_TO_PROPAGATE = { DISABLE_LAZY_CHECKPOINTING_ENV, COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV, + RAY_AIR_NEW_PERSISTENCE_MODE, "TUNE_CHECKPOINT_CLOUD_RETRY_NUM", "TUNE_CHECKPOINT_CLOUD_RETRY_WAIT_TIME_S", "AWS_ACCESS_KEY_ID", diff --git a/python/ray/tune/execution/tune_controller.py b/python/ray/tune/execution/tune_controller.py index 46abcfce5946..f307991e0913 100644 --- a/python/ray/tune/execution/tune_controller.py +++ b/python/ray/tune/execution/tune_controller.py @@ -1980,9 +1980,24 @@ def _process_trial_save( from ray.train._internal.checkpoint_manager import _TrainingResult try: - if _use_storage_context(): - assert isinstance(checkpoint_value, _TrainingResult), checkpoint_value - # TODO(justinvyu): Update callbacks to take in a _TrainingResult + if _use_storage_context() and isinstance(checkpoint_value, _TrainingResult): + try: + self._callbacks.on_checkpoint( + iteration=self._iteration, + trials=self._trials, + trial=trial, + checkpoint=checkpoint_value.checkpoint, + ) + except Exception: + logger.warning( + "Error encountered during processing of callbacks. " + "Ray Train/Tune recently changed the checkpoint interface " + "that is passed to callbacks. If you implemented your own " + "callback with an `on_checkpoint` handler, please review " + "the checkpoint interface and adjust your code accordingly." + ) + raise + trial.on_checkpoint(checkpoint_value) self._checkpoint_manager.on_trial_checkpoint(trial) diff --git a/python/ray/tune/experiment/trial.py b/python/ray/tune/experiment/trial.py index e2238c73c726..255fb7cc8d76 100644 --- a/python/ray/tune/experiment/trial.py +++ b/python/ray/tune/experiment/trial.py @@ -1074,7 +1074,7 @@ def clear_checkpoint(self): self.temporary_state.restoring_from = None self.run_metadata.invalidate_cache() - def on_checkpoint(self, checkpoint: _TrackedCheckpoint): + def on_checkpoint(self, checkpoint: Union[_TrackedCheckpoint, _TrainingResult]): """Hook for handling checkpoints taken by the Trainable. 
Args: diff --git a/python/ray/tune/experimental/output.py b/python/ray/tune/experimental/output.py index e912c9560c2c..37badd05211b 100644 --- a/python/ray/tune/experimental/output.py +++ b/python/ray/tune/experimental/output.py @@ -26,6 +26,7 @@ import time from ray.air._internal.usage import AirEntrypoint +from ray.train._checkpoint import Checkpoint from ray.tune.search.sample import Domain from ray.tune.utils.log import Verbosity @@ -44,7 +45,7 @@ Line, DataRow, ) -from ray.air._internal.checkpoint_manager import _TrackedCheckpoint +from ray.air._internal.checkpoint_manager import _TrackedCheckpoint, CheckpointStorage from ray.air.constants import TRAINING_ITERATION from ray.tune.callback import Callback from ray.tune.result import ( @@ -766,21 +767,29 @@ def on_checkpoint( iteration: int, trials: List[Trial], trial: Trial, - checkpoint: "_TrackedCheckpoint", + checkpoint: Union["_TrackedCheckpoint", "Checkpoint"], **info, ): if self._verbosity < self._intermediate_result_verbosity: return - # don't think this is supposed to happen but just to be save. + # don't think this is supposed to happen but just to be safe. saved_iter = "?" if trial.last_result and TRAINING_ITERATION in trial.last_result: saved_iter = trial.last_result[TRAINING_ITERATION] self._start_block(f"trial_{trial}_result_{saved_iter}") + + if isinstance(checkpoint, Checkpoint): + loc = f"({checkpoint.filesystem.type_name}){checkpoint.path}" + elif checkpoint.storage_mode == CheckpointStorage.MEMORY: + loc = "(memory)" + else: + loc = checkpoint.dir_or_data + print( f"{self._addressing_tmpl.format(trial)} " f"saved a checkpoint for iteration {saved_iter} " - f"at: {checkpoint.dir_or_data}" + f"at: {loc}" ) def on_trial_start(self, iteration: int, trials: List[Trial], trial: Trial, **info): diff --git a/python/ray/tune/syncer.py b/python/ray/tune/syncer.py index 0ee82c460068..de5eb1d136fd 100644 --- a/python/ray/tune/syncer.py +++ b/python/ray/tune/syncer.py @@ -20,6 +20,7 @@ import time from dataclasses import dataclass + try: import fsspec except Exception: @@ -55,6 +56,7 @@ from ray.widgets import Template if TYPE_CHECKING: + from ray.train._checkpoint import Checkpoint from ray.tune.experiment import Trial logger = logging.getLogger(__name__) @@ -930,9 +932,17 @@ def on_checkpoint( iteration: int, trials: List["Trial"], trial: "Trial", - checkpoint: _TrackedCheckpoint, + checkpoint: Union["_TrackedCheckpoint", "Checkpoint"], **info, ): + if not hasattr(checkpoint, "storage_mode"): + # Syncer should be disabled for new storage path + raise RuntimeError( + "Internal error: Got new Train Checkpoint object in Syncer: " + f"{checkpoint}. 
Please raise an error on " + f"https://github.com/ray-project/ray/issues" + ) + if not self._enabled or trial.uses_cloud_checkpointing: return diff --git a/python/ray/tune/trainable/trainable.py b/python/ray/tune/trainable/trainable.py index e1fad721a06d..07e17a865f8a 100644 --- a/python/ray/tune/trainable/trainable.py +++ b/python/ray/tune/trainable/trainable.py @@ -181,7 +181,6 @@ def __init__( assert storage assert storage.trial_fs_path logger.debug(f"StorageContext on the TRAINABLE:\n{storage}") - storage._check_validation_file() self.setup(copy.deepcopy(self.config)) setup_time = time.time() - self._start_time @@ -468,6 +467,12 @@ def get_state(self): def _create_checkpoint_dir( self, checkpoint_dir: Optional[str] = None ) -> Optional[str]: + if _use_storage_context(): + # NOTE: There's no need to supply the checkpoint directory inside + # the local trial dir, since it'll get persisted to the right location. + checkpoint_dir = tempfile.mkdtemp() + return checkpoint_dir + # Create checkpoint_xxxxx directory and drop checkpoint marker checkpoint_dir = TrainableUtil.make_checkpoint_dir( checkpoint_dir or self.logdir, index=self.iteration, override=True diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index c5a1bd2a4f1a..5c0e1520ab12 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -124,7 +124,7 @@ def _check_default_resources_override( ) -> bool: trainable_cls = _get_trainable(run_identifier) if not trainable_cls: - # Default to True + # If no trainable, assume override return True return hasattr(trainable_cls, "default_resource_request") and ( diff --git a/python/ray/tune/utils/callback.py b/python/ray/tune/utils/callback.py index 1dd05f89d6a4..97d8c8504f52 100644 --- a/python/ray/tune/utils/callback.py +++ b/python/ray/tune/utils/callback.py @@ -2,6 +2,7 @@ import os from typing import Collection, List, Optional, Type, Union, TYPE_CHECKING +from ray.train._internal.storage import _use_storage_context from ray.tune.callback import Callback, CallbackList from ray.tune.syncer import SyncConfig @@ -164,6 +165,7 @@ def _create_default_callbacks( if ( not has_syncer_callback and os.environ.get("TUNE_DISABLE_AUTO_CALLBACK_SYNCER", "0") != "1" + and not _use_storage_context() ): syncer_callback = SyncerCallback( enabled=bool(sync_config.syncer), sync_period=sync_config.sync_period diff --git a/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py b/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py index 0e1fa9b8867a..5f2c8f69881c 100644 --- a/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py +++ b/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py @@ -73,7 +73,8 @@ def _handle(self, logs: dict, when: str = None): logs["local_time_taken"] = time.monotonic() - local_start_time super()._handle(logs, when) - callbacks = [CustomReportCallback(checkpoint_on="test_end")] + # NOTE: We shouldn't checkpoint to be identical to the vanilla TF run. 
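For users who implemented their own `on_checkpoint` handler, the hook now receives the new `Checkpoint` object instead of a `_TrackedCheckpoint`, so code that read `checkpoint.dir_or_data` needs updating. A hedged sketch of such a handler follows; the class name and logging behavior are made up for illustration, while `checkpoint.path` and `checkpoint.filesystem` are the attributes the reporter change above relies on.

import logging
from typing import List

from ray.train._checkpoint import Checkpoint  # import path used in this patch
from ray.tune import Callback
from ray.tune.experiment import Trial

logger = logging.getLogger(__name__)


class CheckpointLoggerCallback(Callback):
    """Hypothetical user callback updated for the new checkpoint interface."""

    def on_checkpoint(
        self,
        iteration: int,
        trials: List[Trial],
        trial: Trial,
        checkpoint: Checkpoint,
        **info,
    ):
        # The new Checkpoint exposes a filesystem and path instead of dir_or_data.
        logger.info(
            "Trial %s saved a checkpoint at (%s)%s",
            trial,
            checkpoint.filesystem.type_name,
            checkpoint.path,
        )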
+ callbacks = [CustomReportCallback(checkpoint_on=[])] else: callbacks = [] diff --git a/release/cluster_tests/workloads/tune_scale_up_down.py b/release/cluster_tests/workloads/tune_scale_up_down.py index 7909ce8aa0c7..e6868f3522f3 100644 --- a/release/cluster_tests/workloads/tune_scale_up_down.py +++ b/release/cluster_tests/workloads/tune_scale_up_down.py @@ -27,20 +27,20 @@ import ray -from ray import tune +from ray import train, tune -def train(config): +def train_fn(config): this_node_ip = ray.util.get_node_ip_address() if config["head_node_ip"] == this_node_ip: # On the head node, run for 30 minutes for i in range(30): - tune.report(metric=i) + train.report({"metric": i}) time.sleep(60) else: # On worker nodes, run for 3 minutes for i in range(3): - tune.report(metric=i) + train.report({"metric": i}) time.sleep(60) @@ -65,7 +65,7 @@ def main(): node_counter = NodeCountCallback() tune.run( - train, + train_fn, num_samples=3, config={"head_node_ip": head_node_ip}, callbacks=[node_counter], diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 60389750fa32..7bb41df0cfac 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -92,7 +92,7 @@ run: timeout: 3600 - script: python workloads/tune_scale_up_down.py + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python workloads/tune_scale_up_down.py wait_for_nodes: num_nodes: 0 @@ -122,7 +122,7 @@ byod: type: gpu cluster_compute: compute_gpu_1_cpu_16_aws.yaml - + run: timeout: 500 script: python workloads/gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw @@ -149,7 +149,7 @@ byod: type: gpu cluster_compute: compute_gpu_1_cpu_16_aws.yaml - + run: timeout: 500 script: python workloads/gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet @@ -248,7 +248,7 @@ wait_for_nodes: num_nodes: 20 - + alert: default @@ -336,7 +336,7 @@ run: timeout: 3600 - script: python workloads/data_benchmark.py --dataset-size-gb=200 --num-workers=20 + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python workloads/data_benchmark.py --dataset-size-gb=200 --num-workers=20 wait_for_nodes: num_nodes: 20 @@ -591,7 +591,7 @@ run: timeout: 5400 - script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8 + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 8 wait_for_nodes: num_nodes: 4 @@ -621,7 +621,7 @@ run: timeout: 5400 - script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2 + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 4 --cpus-per-worker 2 variations: - __suffix__: aws @@ -650,7 +650,7 @@ run: timeout: 5400 - script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2 + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 20 --num-workers 16 --cpus-per-worker 2 wait_for_nodes: num_nodes: 4 @@ -682,7 +682,7 @@ run: timeout: 5400 - script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 1024 --use-gpu 
wait_for_nodes: num_nodes: 4 @@ -694,7 +694,7 @@ cluster_compute: compute_gpu_2x2_aws.yaml run: - script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu wait_for_nodes: num_nodes: 2 @@ -852,11 +852,29 @@ run: timeout: 1800 - script: pip install -Ur dreambooth/requirements.txt && bash dreambooth_run.sh + script: pip install -Ur dreambooth/requirements.txt && RAY_AIR_NEW_PERSISTENCE_MODE=1 bash dreambooth_run.sh artifact_path: /tmp/artifacts/example_out.jpg # variations: A10G not available on GCE, yet. +- name: air_example_dreambooth_finetuning_lora + group: AIR examples + working_dir: air_examples/dreambooth + + stable: false + + frequency: weekly + team: ml + + cluster: + byod: + type: gpu + cluster_compute: dreambooth_compute_aws.yaml + + run: + timeout: 1800 + script: pip install -Ur dreambooth/requirements.txt && bash dreambooth_run.sh --lora + artifact_path: /tmp/artifacts/example_out.jpg - name: air_example_gptj_deepspeed_fine_tuning group: AIR examples @@ -969,6 +987,7 @@ python: "3.9" frequency: nightly-3x team: ml + cluster: byod: type: gpu @@ -976,7 +995,7 @@ run: timeout: 600 - script: jupyter nbconvert --to script --output _test start.ipynb && ipython _test.py + script: jupyter nbconvert --to script --output _test start.ipynb && RAY_AIR_NEW_PERSISTENCE_MODE=1 ipython _test.py variations: - __suffix__: aws @@ -1605,7 +1624,7 @@ run: timeout: 1200 - script: python horovod/horovod_user_test.py + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python horovod/horovod_user_test.py wait_for_nodes: num_nodes: 4 @@ -1634,7 +1653,7 @@ run: timeout: 1200 - script: python horovod/horovod_user_test.py + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python horovod/horovod_user_test.py wait_for_nodes: num_nodes: 4 @@ -1664,7 +1683,7 @@ run: timeout: 36000 - script: python train/train_tensorflow_mnist_test.py + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python train/train_tensorflow_mnist_test.py wait_for_nodes: num_nodes: 3 @@ -1694,7 +1713,7 @@ run: timeout: 36000 - script: python train/train_torch_linear_test.py + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python train/train_torch_linear_test.py wait_for_nodes: num_nodes: 3 @@ -1784,7 +1803,7 @@ run: timeout: 2000 - script: python tune_rllib/run_connect_tests.py + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python tune_rllib/run_connect_tests.py wait_for_nodes: num_nodes: 9 @@ -2552,7 +2571,7 @@ run: timeout: 86400 - script: python workloads/many_ppo.py + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python workloads/many_ppo.py long_running: true wait_for_nodes: num_nodes: 1 @@ -2836,7 +2855,7 @@ team: serve cluster: - byod: + byod: pip: # TODO: https://github.com/Farama-Foundation/AutoROM/issues/48 - https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl @@ -3480,7 +3499,7 @@ run: timeout: 3000 - script: python train_horovod_multi_node_test.py + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 python train_horovod_multi_node_test.py wait_for_nodes: num_nodes: 2 @@ -3528,7 +3547,7 @@ cluster_compute: gpu_2x4_t4_gce.yaml run: timeout: 3600 - script: bash run_train_opt_2_7b.sh --storage gcs + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 bash run_train_opt_2_7b.sh --storage gcs wait_for_nodes: num_nodes: 2 @@ -3550,7 +3569,7 @@ run: timeout: 3600 - script: bash 
run_inference_opt_30b.sh --storage aws + script: RAY_AIR_NEW_PERSISTENCE_MODE=1 bash run_inference_opt_30b.sh --storage aws wait_for_nodes: num_nodes: 1 diff --git a/release/serve_tests/workloads/serve_handle_long_chain.py b/release/serve_tests/workloads/serve_handle_long_chain.py index 5e90839db47c..ce39f2e05f4f 100644 --- a/release/serve_tests/workloads/serve_handle_long_chain.py +++ b/release/serve_tests/workloads/serve_handle_long_chain.py @@ -70,7 +70,9 @@ def construct_long_chain_graph_with_pure_handle( Node.options(name=str(id)).deploy( id, prev_handle, init_delay_secs, compute_delay_secs, sync_handle ) - prev_handle = get_global_client().get_handle(str(id), sync=sync_handle) + prev_handle = get_global_client().get_handle( + str(id), app_name="", sync=sync_handle + ) return prev_handle diff --git a/release/serve_tests/workloads/serve_handle_wide_ensemble.py b/release/serve_tests/workloads/serve_handle_wide_ensemble.py index 0ad06e4e0302..8aa21a360893 100644 --- a/release/serve_tests/workloads/serve_handle_wide_ensemble.py +++ b/release/serve_tests/workloads/serve_handle_wide_ensemble.py @@ -85,11 +85,11 @@ def construct_wide_fanout_graph_with_pure_handle( nodes = [] for id in range(fanout_degree): Node.options(name=str(id)).deploy(id, init_delay_secs=init_delay_secs) - nodes.append(get_global_client().get_handle(str(id), sync=sync_handle)) + nodes.append(get_global_client().get_handle(str(id), "", sync=sync_handle)) CombineNode.options(name="combine").deploy( nodes, compute_delay_secs, sync_handle=sync_handle ) - return get_global_client().get_handle("combine", sync=sync_handle) + return get_global_client().get_handle("combine", "", sync=sync_handle) async def sanity_check_graph_deployment_with_async_handle(handle, expected_result): diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index 02dc02693c4c..56a8f11ab453 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -456,8 +456,6 @@ RAY_CONFIG(uint64_t, gcs_grpc_max_request_queued_max_bytes, 1024UL * 1024 * 1024 /// The duration between two checks for grpc status. RAY_CONFIG(int32_t, gcs_client_check_connection_status_interval_milliseconds, 1000) -/// Feature flag to use the ray syncer for resource synchronization -RAY_CONFIG(bool, use_ray_syncer, true) /// Due to the protocol drawback, raylet needs to refresh the message if /// no message is received for a while. /// Refer to https://tinyurl.com/n6kvsp87 for more details diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index c9ac1752063f..6971e19124ab 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -947,6 +947,10 @@ void CoreWorker::ExitIfParentRayletDies() { if (should_shutdown) { RAY_LOG(WARNING) << "Shutting down the core worker because the local raylet failed. " << "Check out the raylet.out log file. Raylet pid: " << raylet_pid; + + // Kill child procs so that child processes of the workers do not outlive the workers. 
+ KillChildProcs(); + QuickExit(); } } diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc index 25aa70c74d35..80782e43712a 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc +++ b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.cc @@ -25,14 +25,12 @@ GcsPlacementGroupScheduler::GcsPlacementGroupScheduler( std::shared_ptr gcs_table_storage, const gcs::GcsNodeManager &gcs_node_manager, ClusterResourceScheduler &cluster_resource_scheduler, - std::shared_ptr raylet_client_pool, - gcs_syncer::RaySyncer *ray_syncer) + std::shared_ptr raylet_client_pool) : return_timer_(io_context), gcs_table_storage_(std::move(gcs_table_storage)), gcs_node_manager_(gcs_node_manager), cluster_resource_scheduler_(cluster_resource_scheduler), - raylet_client_pool_(raylet_client_pool), - ray_syncer_(ray_syncer) {} + raylet_client_pool_(raylet_client_pool) {} void GcsPlacementGroupScheduler::ScheduleUnplacedBundles( std::shared_ptr placement_group, @@ -243,19 +241,10 @@ void GcsPlacementGroupScheduler::CancelResourceReserve( return_client->CancelResourceReserve( *bundle_spec, - [this, bundle_spec, node_id](const Status &status, - const rpc::CancelResourceReserveReply &reply) { + [bundle_spec, node_id](const Status &status, + const rpc::CancelResourceReserveReply &reply) { RAY_LOG(DEBUG) << "Finished cancelling the resource reserved for bundle: " << bundle_spec->DebugString() << " at node " << node_id; - if (ray_syncer_ != nullptr) { - auto &resources = bundle_spec->GetFormattedResources(); - rpc::NodeResourceChange node_resource_change; - for (const auto &iter : resources) { - node_resource_change.add_deleted_resources(iter.first); - } - node_resource_change.set_node_id(node_id.Binary()); - ray_syncer_->Update(std::move(node_resource_change)); - } }); } @@ -306,16 +295,6 @@ void GcsPlacementGroupScheduler::CommitAllBundles( for (const auto &bundle : bundles_per_node) { lease_status_tracker->MarkCommitRequestReturned(node_id, bundle, status); (*commited_bundle_locations)[bundle->BundleId()] = {node_id, bundle}; - - if (ray_syncer_ != nullptr) { - auto &resources = bundle->GetFormattedResources(); - // Push the message to syncer so that it can be broadcasted to all other nodes - rpc::NodeResourceChange node_resource_change; - node_resource_change.set_node_id(node_id.Binary()); - node_resource_change.mutable_updated_resources()->insert(resources.begin(), - resources.end()); - ray_syncer_->Update(std::move(node_resource_change)); - } } // Commit the bundle resources on the remote node to the cluster resources. 
CommitBundleResources(commited_bundle_locations); diff --git a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h index 2950848e3744..c90939d62077 100644 --- a/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h +++ b/src/ray/gcs/gcs_server/gcs_placement_group_scheduler.h @@ -21,7 +21,6 @@ #include "ray/common/scheduling/scheduling_ids.h" #include "ray/gcs/gcs_server/gcs_node_manager.h" #include "ray/gcs/gcs_server/gcs_table_storage.h" -#include "ray/gcs/gcs_server/ray_syncer.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" #include "ray/raylet/scheduling/policy/scheduling_context.h" #include "ray/raylet_client/raylet_client.h" @@ -287,8 +286,7 @@ class GcsPlacementGroupScheduler : public GcsPlacementGroupSchedulerInterface { std::shared_ptr gcs_table_storage, const GcsNodeManager &gcs_node_manager, ClusterResourceScheduler &cluster_resource_scheduler, - std::shared_ptr raylet_client_pool, - gcs_syncer::RaySyncer *ray_syncer); + std::shared_ptr raylet_client_pool); virtual ~GcsPlacementGroupScheduler() = default; @@ -493,10 +491,6 @@ class GcsPlacementGroupScheduler : public GcsPlacementGroupSchedulerInterface { /// The nodes which are releasing unused bundles. absl::flat_hash_set nodes_of_releasing_unused_bundles_; - /// The syncer of resource. This is used to report placement group updates. - /// TODO (iycheng): Remove this one from pg once we finish the refactor - gcs_syncer::RaySyncer *ray_syncer_; - /// The resources changed listeners. std::vector> resources_changed_listeners_; diff --git a/src/ray/gcs/gcs_server/gcs_server.cc b/src/ray/gcs/gcs_server/gcs_server.cc index 4581759302d9..a2939338d243 100644 --- a/src/ray/gcs/gcs_server/gcs_server.cc +++ b/src/ray/gcs/gcs_server/gcs_server.cc @@ -275,13 +275,9 @@ void GcsServer::DoStart(const GcsInitData &gcs_init_data) { void GcsServer::Stop() { if (!is_stopped_) { RAY_LOG(INFO) << "Stopping GCS server."; - if (RayConfig::instance().use_ray_syncer()) { - ray_syncer_io_context_.stop(); - ray_syncer_thread_->join(); - ray_syncer_.reset(); - } else { - gcs_ray_syncer_->Stop(); - } + ray_syncer_io_context_.stop(); + ray_syncer_thread_->join(); + ray_syncer_.reset(); gcs_task_manager_->Stop(); @@ -497,8 +493,7 @@ void GcsServer::InitGcsPlacementGroupManager(const GcsInitData &gcs_init_data) { gcs_table_storage_, *gcs_node_manager_, *cluster_resource_scheduler_, - raylet_client_pool_, - gcs_ray_syncer_.get()); + raylet_client_pool_); gcs_placement_group_manager_ = std::make_shared( main_service_, @@ -535,30 +530,18 @@ GcsServer::StorageType GcsServer::GetStorageType() const { } void GcsServer::InitRaySyncer(const GcsInitData &gcs_init_data) { - if (RayConfig::instance().use_ray_syncer()) { - ray_syncer_ = - std::make_unique(ray_syncer_io_context_, kGCSNodeID.Binary()); - ray_syncer_->Register( - syncer::MessageType::RESOURCE_VIEW, nullptr, gcs_resource_manager_.get()); - ray_syncer_->Register( - syncer::MessageType::COMMANDS, nullptr, gcs_resource_manager_.get()); - ray_syncer_thread_ = std::make_unique([this]() { - boost::asio::io_service::work work(ray_syncer_io_context_); - ray_syncer_io_context_.run(); - }); - ray_syncer_service_ = std::make_unique(*ray_syncer_); - rpc_server_.RegisterService(*ray_syncer_service_); - } else { - /* - The current synchronization flow is: - raylet -> syncer::poller --> syncer::update -> gcs_resource_manager - gcs_placement_scheduler --/ - */ - gcs_ray_syncer_ = std::make_unique( - main_service_, raylet_client_pool_, 
*gcs_resource_manager_); - gcs_ray_syncer_->Initialize(gcs_init_data); - gcs_ray_syncer_->Start(); - } + ray_syncer_ = + std::make_unique(ray_syncer_io_context_, kGCSNodeID.Binary()); + ray_syncer_->Register( + syncer::MessageType::RESOURCE_VIEW, nullptr, gcs_resource_manager_.get()); + ray_syncer_->Register( + syncer::MessageType::COMMANDS, nullptr, gcs_resource_manager_.get()); + ray_syncer_thread_ = std::make_unique([this]() { + boost::asio::io_service::work work(ray_syncer_io_context_); + ray_syncer_io_context_.run(); + }); + ray_syncer_service_ = std::make_unique(*ray_syncer_); + rpc_server_.RegisterService(*ray_syncer_service_); } void GcsServer::InitFunctionManager() { @@ -736,10 +719,6 @@ void GcsServer::InstallEventListeners() { gcs_healthcheck_manager_->AddNode(node_id, channel); } cluster_task_manager_->ScheduleAndDispatchTasks(); - - if (!RayConfig::instance().use_ray_syncer()) { - gcs_ray_syncer_->AddNode(*node); - } }); gcs_node_manager_->AddNodeRemovedListener( [this](std::shared_ptr node) { @@ -753,10 +732,6 @@ void GcsServer::InstallEventListeners() { raylet_client_pool_->Disconnect(node_id); gcs_healthcheck_manager_->RemoveNode(node_id); pubsub_handler_->RemoveSubscriberFrom(node_id.Binary()); - - if (!RayConfig::instance().use_ray_syncer()) { - gcs_ray_syncer_->RemoveNode(*node); - } }); // Install worker event listener. @@ -843,9 +818,6 @@ std::string GcsServer::GetDebugState() const { << gcs_publisher_->DebugString() << "\n\n" << runtime_env_manager_->DebugString() << "\n\n" << gcs_task_manager_->DebugString() << "\n\n"; - if (gcs_ray_syncer_) { - stream << gcs_ray_syncer_->DebugString(); - } return stream.str(); } @@ -889,20 +861,14 @@ void GcsServer::TryGlobalGC() { rpc::ResourcesData resources_data; resources_data.set_should_global_gc(true); - if (RayConfig::instance().use_ray_syncer()) { - auto msg = std::make_shared(); - msg->set_version(absl::GetCurrentTimeNanos()); - msg->set_node_id(kGCSNodeID.Binary()); - msg->set_message_type(syncer::MessageType::COMMANDS); - std::string serialized_msg; - RAY_CHECK(resources_data.SerializeToString(&serialized_msg)); - msg->set_sync_message(std::move(serialized_msg)); - ray_syncer_->BroadcastRaySyncMessage(std::move(msg)); - } else { - resources_data.set_node_id(kGCSNodeID.Binary()); - gcs_ray_syncer_->Update(resources_data); - } - + auto msg = std::make_shared(); + msg->set_version(absl::GetCurrentTimeNanos()); + msg->set_node_id(kGCSNodeID.Binary()); + msg->set_message_type(syncer::MessageType::COMMANDS); + std::string serialized_msg; + RAY_CHECK(resources_data.SerializeToString(&serialized_msg)); + msg->set_sync_message(std::move(serialized_msg)); + ray_syncer_->BroadcastRaySyncMessage(std::move(msg)); global_gc_throttler_->RunNow(); } } diff --git a/src/ray/gcs/gcs_server/gcs_server.h b/src/ray/gcs/gcs_server/gcs_server.h index c5092257e72e..cf119fc9116e 100644 --- a/src/ray/gcs/gcs_server/gcs_server.h +++ b/src/ray/gcs/gcs_server/gcs_server.h @@ -28,7 +28,6 @@ #include "ray/gcs/gcs_server/gcs_task_manager.h" #include "ray/gcs/gcs_server/grpc_based_resource_broadcaster.h" #include "ray/gcs/gcs_server/pubsub_handler.h" -#include "ray/gcs/gcs_server/ray_syncer.h" #include "ray/gcs/gcs_server/runtime_env_handler.h" #include "ray/gcs/pubsub/gcs_pub_sub.h" #include "ray/gcs/redis_client.h" @@ -249,11 +248,6 @@ class GcsServer { /// Monitor service for monitor server std::unique_ptr monitor_grpc_service_; - /// Synchronization service for ray. 
- /// TODO(iycheng): Deprecate this gcs_ray_syncer_ one once we roll out - /// to ray_syncer_. - std::unique_ptr gcs_ray_syncer_; - /// Ray Syncer realted fields. std::unique_ptr ray_syncer_; std::unique_ptr ray_syncer_service_; diff --git a/src/ray/gcs/gcs_server/ray_syncer.h b/src/ray/gcs/gcs_server/ray_syncer.h deleted file mode 100644 index 431679ba6d26..000000000000 --- a/src/ray/gcs/gcs_server/ray_syncer.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright 2022 The Ray Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include -#include - -#include "ray/common/asio/periodical_runner.h" -#include "ray/gcs/gcs_server/gcs_resource_manager.h" -#include "ray/gcs/gcs_server/gcs_resource_report_poller.h" -#include "ray/gcs/gcs_server/grpc_based_resource_broadcaster.h" - -namespace ray { -class GcsPlacementGroupSchedulerTest; -namespace gcs_syncer { - -// RaySyncer is a service to sync components in the cluster. -// It's supposed to be used to synchronize resource usage and scheduling information -// in ray cluster. The gold of this component is to make sure each node in the cluster -// eventually consistent on the information of each other. -// Right now, RaySyncer has two sub-components: -// - poller: periodically get the message from the nodes in the cluster. -// - broadcaster: periodically broadcast the information to the nodes. -// This component is still in developing. The goal of this node is to make raylet -// and GCS be able to use the same code and synchronize the information. -class RaySyncer { - public: - RaySyncer(instrumented_io_context &main_thread, - std::shared_ptr raylet_client_pool, - ::ray::gcs::GcsResourceManager &gcs_resource_manager) - : ticker_(main_thread), gcs_resource_manager_(gcs_resource_manager) { - poller_ = std::make_unique<::ray::gcs::GcsResourceReportPoller>( - raylet_client_pool, [this, &main_thread](const rpc::ResourcesData &data) { - main_thread.post([this, data]() mutable { Update(std::move(data)); }, - "ResourceUpdate"); - }); - - broadcaster_ = - std::make_unique<::ray::gcs::GrpcBasedResourceBroadcaster>(raylet_client_pool); - } - - void Start() { - poller_->Start(); - broadcast_thread_ = std::make_unique([this]() { - SetThreadName("resource_bcast"); - boost::asio::io_service::work work(broadcast_service_); - broadcast_service_.run(); - }); - - // Perodically broadcast the messages received to other nodes. - ticker_.RunFnPeriodically( - [this] { - auto beg = resources_buffer_.begin(); - auto ptr = beg; - static auto max_batch = RayConfig::instance().resource_broadcast_batch_size(); - // Prepare the to-be-sent messages. - for (size_t cnt = resources_buffer_proto_.batch().size(); - cnt < max_batch && cnt < resources_buffer_.size(); - ++ptr, ++cnt) { - resources_buffer_proto_.add_batch()->mutable_data()->Swap(&ptr->second); - } - resources_buffer_.erase(beg, ptr); - - // Broadcast the messages to other nodes. 
- broadcast_service_.dispatch( - [this, resources = std::move(resources_buffer_proto_)]() mutable { - broadcaster_->SendBroadcast(std::move(resources)); - }, - "SendBroadcast"); - resources_buffer_proto_.Clear(); - }, - RayConfig::instance().raylet_report_resources_period_milliseconds(), - "RaySyncer.deadline_timer.report_resource_report"); - } - - void Stop() { - poller_->Stop(); - if (broadcast_thread_ != nullptr) { - broadcast_service_.stop(); - if (broadcast_thread_->joinable()) { - broadcast_thread_->join(); - } - } - } - - /// This method should be used by: - /// 1) GCS to report resource updates; - /// 2) poller to report what have received. - /// Right now it only put the received message into the buffer - /// and broadcaster will use this to send the messages to other - /// nodes. - /// - /// \\\param update: The messages should be sent. Right now we only support - /// rpc::NodeResourceChange and rpc::ResourcesData. - /// - /// TODO(iycheng): This API should be deleted in later PRs in favor of a more general - /// solution. - template - void Update(T update) { - static_assert(std::is_same_v || - std::is_same_v, - "unknown type"); - if constexpr (std::is_same_v) { - resources_buffer_proto_.add_batch()->mutable_change()->Swap(&update); - } else if constexpr (std::is_same_v) { - gcs_resource_manager_.UpdateFromResourceReport(update); - if (update.should_global_gc() || update.resources_total_size() > 0 || - update.resources_available_changed() || update.resource_load_changed()) { - update.clear_resource_load(); - update.clear_resource_load_by_shape(); - update.clear_resources_normal_task(); - auto &orig = resources_buffer_[update.node_id()]; - orig.Swap(&update); - } - } - } - - void Initialize(const ::ray::gcs::GcsInitData &gcs_init_data) { - poller_->Initialize(gcs_init_data); - broadcaster_->Initialize(gcs_init_data); - } - - /// Handle a node registration. - /// This will call the sub-components add function. - /// - /// \param node The specified node to add. - void AddNode(const rpc::GcsNodeInfo &node_info) { - broadcaster_->HandleNodeAdded(node_info); - poller_->HandleNodeAdded(node_info); - } - - /// Handle a node removal. - /// This will call the sub-components removal function - /// - /// \param node The specified node to remove. - void RemoveNode(const rpc::GcsNodeInfo &node_info) { - broadcaster_->HandleNodeRemoved(node_info); - poller_->HandleNodeRemoved(node_info); - resources_buffer_.erase(node_info.node_id()); - } - - std::string DebugString() { return broadcaster_->DebugString(); } - - private: - // Right now the threading is messy here due to legacy reason. - // TODO (iycheng): Clean these up in follow-up PRs. - - // ticker is running from main thread. - PeriodicalRunner ticker_; - // The receiver of this syncer. - // TODO (iycheng): Generalize this module in the future PR. - ::ray::gcs::GcsResourceManager &gcs_resource_manager_; - // All operations in broadcaster is supposed to be put in broadcast thread - std::unique_ptr broadcast_thread_; - instrumented_io_context broadcast_service_; - std::unique_ptr<::ray::gcs::GrpcBasedResourceBroadcaster> broadcaster_; - // Poller is running in its own thread. - std::unique_ptr<::ray::gcs::GcsResourceReportPoller> poller_; - - // Accessing data related fields should be from main thread. - // These fields originally were put in GcsResourceManager. - // resources_buffer_ is the place where global view is stored. - // resources_buffer_proto_ is the actual message to be broadcasted. 
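The Update<T> template in the RaySyncer being deleted above funnels two unrelated message types through a single entry point and branches at compile time with if constexpr. Below is a minimal, self-contained sketch of that dispatch pattern, using hypothetical stand-in structs rather than Ray's rpc::NodeResourceChange and rpc::ResourcesData protos:

```cpp
#include <iostream>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>

// Hypothetical stand-ins for the two protobuf message types.
struct NodeResourceChange {
  std::string node_id;
  std::vector<std::string> deleted;
};
struct ResourcesData {
  std::string node_id;
  bool should_global_gc = false;
};

class Buffer {
 public:
  // One entry point for both message types; the branch is resolved per
  // instantiation, so each caller only compiles the path for its own type.
  template <typename T>
  void Update(T update) {
    static_assert(std::is_same_v<T, NodeResourceChange> ||
                      std::is_same_v<T, ResourcesData>,
                  "unknown type");
    if constexpr (std::is_same_v<T, NodeResourceChange>) {
      changes_.push_back(std::move(update));  // queued for the next broadcast
    } else {
      latest_by_node_[update.node_id] = std::move(update);  // newest per node
    }
  }

  size_t pending() const { return changes_.size() + latest_by_node_.size(); }

 private:
  std::vector<NodeResourceChange> changes_;
  std::unordered_map<std::string, ResourcesData> latest_by_node_;
};

int main() {
  Buffer buf;
  buf.Update(NodeResourceChange{"node-1", {"CPU_group_abc"}});
  buf.Update(ResourcesData{"node-1", /*should_global_gc=*/true});
  std::cout << "pending updates: " << buf.pending() << "\n";
}
```

This is why the removed syncer could route both placement group deltas and full resource reports through a single method without runtime type checks.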
- // Right now there are two type of messages: rpc::NodeResourceChange and - // rpc::ResourcesData. Ideally we should unify these two, but as the first - // step, we keep them separate as before. - // rpc::NodeResourceChange is about placement group, it'll be put to - // resources_buffer_proto_ so that it'll be sent in next sending cycle. - // rpc::ResourcesData is stored in resources_buffer_ and when we do broadcasting, - // it'll be copied to resources_buffer_proto_ and sent to other nodes. - // resources_buffer_proto_ will be cleared after each broadcasting. - absl::flat_hash_map resources_buffer_; - rpc::ResourceUsageBroadcastData resources_buffer_proto_; - friend class ray::GcsPlacementGroupSchedulerTest; -}; - -} // namespace gcs_syncer -} // namespace ray diff --git a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc b/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc index d967da65583d..ebd8d756eeeb 100644 --- a/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc +++ b/src/ray/gcs/gcs_server/test/gcs_placement_group_scheduler_test.cc @@ -18,7 +18,6 @@ // clang-format off #include "gtest/gtest.h" #include "ray/common/asio/instrumented_io_context.h" -#include "ray/gcs/gcs_server/ray_syncer.h" #include "ray/gcs/gcs_server/test/gcs_server_test_util.h" #include "ray/gcs/test/gcs_test_util.h" #include "ray/raylet/scheduling/cluster_resource_scheduler.h" @@ -59,8 +58,6 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { io_service_, cluster_resource_scheduler_->GetClusterResourceManager(), local_node_id); - ray_syncer_ = std::make_shared( - io_service_, nullptr, *gcs_resource_manager_); store_client_ = std::make_shared(io_service_); raylet_client_pool_ = std::make_shared( [this](const rpc::Address &addr) { return raylet_clients_[addr.port()]; }); @@ -71,8 +68,7 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { gcs_table_storage_, *gcs_node_manager_, *cluster_resource_scheduler_, - raylet_client_pool_, - ray_syncer_.get()); + raylet_client_pool_); counter_.reset(new CounterMap()); } @@ -159,53 +155,6 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { CheckEqWithPlacementGroupFront(placement_group, GcsPlacementGroupStatus::FAILURE); } - void CheckResourceUpdateMatch( - const std::vector> &placement_groups, - bool create) { - auto resource_buffer = ray_syncer_->resources_buffer_proto_; - if (create) { - absl::flat_hash_map> updates, - pg; - for (auto placement_group : placement_groups) { - for (auto bundle : placement_group->GetBundles()) { - const auto &resources = bundle->GetFormattedResources(); - pg[bundle->NodeId().Binary()].insert(resources.begin(), resources.end()); - } - } - for (auto batch : resource_buffer.batch()) { - updates[batch.change().node_id()].insert( - batch.change().updated_resources().begin(), - batch.change().updated_resources().end()); - } - ASSERT_EQ(updates, pg); - } else { - absl::flat_hash_map> updates, pg; - for (auto placement_group : placement_groups) { - for (auto bundle : placement_group->GetBundles()) { - const auto &resources = bundle->GetFormattedResources(); - for (auto [key, _] : resources) { - pg[bundle->NodeId().Binary()].insert(key); - } - } - } - for (auto batch : resource_buffer.batch()) { - updates[batch.change().node_id()].insert( - batch.change().deleted_resources().begin(), - batch.change().deleted_resources().end()); - } - ASSERT_EQ(updates, pg); - } - } - - void WaitUntilSyncMessage(int n) { - for (int i = 0; i < 20; ++i) { - if 
(ray_syncer_->resources_buffer_proto_.batch().size() == n) { - break; - } - std::this_thread::sleep_for(std::chrono::microseconds(1)); - } - } - void SchedulePlacementGroupSuccessTest(rpc::PlacementStrategy strategy) { auto node = Mocker::GenNodeInfo(); AddNode(node); @@ -237,11 +186,6 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { WaitPlacementGroupPendingDone(0, GcsPlacementGroupStatus::FAILURE); WaitPlacementGroupPendingDone(1, GcsPlacementGroupStatus::SUCCESS); CheckEqWithPlacementGroupFront(placement_group, GcsPlacementGroupStatus::SUCCESS); - WaitUntilSyncMessage(2); - { - absl::MutexLock lock(&placement_group_requests_mutex_); - CheckResourceUpdateMatch(success_placement_groups_, true); - } } void ReschedulingWhenNodeAddTest(rpc::PlacementStrategy strategy) { @@ -354,7 +298,6 @@ class GcsPlacementGroupSchedulerTest : public ::testing::Test { std::shared_ptr gcs_publisher_; std::shared_ptr gcs_table_storage_; std::shared_ptr raylet_client_pool_; - std::shared_ptr ray_syncer_; std::shared_ptr> counter_; }; @@ -593,11 +536,6 @@ TEST_F(GcsPlacementGroupSchedulerTest, DestroyPlacementGroup) { scheduler_->DestroyPlacementGroupBundleResourcesIfExists(placement_group_id); ASSERT_TRUE(raylet_clients_[0]->GrantCancelResourceReserve()); ASSERT_TRUE(raylet_clients_[0]->GrantCancelResourceReserve()); - WaitUntilSyncMessage(4); - { - absl::MutexLock lock(&placement_group_requests_mutex_); - CheckResourceUpdateMatch(success_placement_groups_, false); - } // Subsequent destroy request should not do anything. scheduler_->DestroyPlacementGroupBundleResourcesIfExists(placement_group_id); ASSERT_FALSE(raylet_clients_[0]->GrantCancelResourceReserve()); diff --git a/src/ray/object_manager/object_buffer_pool.h b/src/ray/object_manager/object_buffer_pool.h index e514b2d72516..321a9ba1b0e7 100644 --- a/src/ray/object_manager/object_buffer_pool.h +++ b/src/ray/object_manager/object_buffer_pool.h @@ -46,7 +46,7 @@ class ObjectBufferPool { data(data), buffer_length(buffer_length), buffer_ref(buffer_ref){}; - /// A pointer to the start position of this object chunk. + /// The index of this object chunk within the object, starting with 0. uint64_t chunk_index; /// A pointer to the start position of this object chunk. uint8_t *data; diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index d2e000a9b145..04d08acb76a5 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -399,9 +399,7 @@ NodeManager::NodeManager(instrumented_io_context &io_service, RAY_CHECK_OK(store_client_.Connect(config.store_socket_name.c_str())); // Run the node manger rpc server. node_manager_server_.RegisterService(node_manager_service_, false /* token_auth */); - if (RayConfig::instance().use_ray_syncer()) { - node_manager_server_.RegisterService(ray_syncer_service_); - } + node_manager_server_.RegisterService(ray_syncer_service_); node_manager_server_.Run(); // GCS will check the health of the service named with the node id. // Fail to setup this will lead to the health check failure. 
@@ -441,37 +439,35 @@ ray::Status NodeManager::RegisterGcs() { auto on_node_change_subscribe_done = [this](Status status) { RAY_CHECK_OK(status); - if (RayConfig::instance().use_ray_syncer()) { - // Register resource manager and scheduler - ray_syncer_.Register( - /* message_type */ syncer::MessageType::RESOURCE_VIEW, - /* reporter */ &cluster_resource_scheduler_->GetLocalResourceManager(), - /* receiver */ this, - /* pull_from_reporter_interval_ms */ - RayConfig::instance().raylet_report_resources_period_milliseconds()); - - // Register a commands channel. - // It's only used for GC right now. - ray_syncer_.Register( - /* message_type */ syncer::MessageType::COMMANDS, - /* reporter */ this, - /* receiver */ this, - /* pull_from_reporter_interval_ms */ 0); - - auto gcs_channel = gcs_client_->GetGcsRpcClient().GetChannel(); - ray_syncer_.Connect(kGCSNodeID.Binary(), gcs_channel); - periodical_runner_.RunFnPeriodically( - [this] { - auto triggered_by_global_gc = TryLocalGC(); - // If plasma store is under high pressure, we should try to schedule a global - // gc. - if (triggered_by_global_gc) { - ray_syncer_.OnDemandBroadcasting(syncer::MessageType::COMMANDS); - } - }, - RayConfig::instance().raylet_check_gc_period_milliseconds(), - "NodeManager.CheckGC"); - } + // Register resource manager and scheduler + ray_syncer_.Register( + /* message_type */ syncer::MessageType::RESOURCE_VIEW, + /* reporter */ &cluster_resource_scheduler_->GetLocalResourceManager(), + /* receiver */ this, + /* pull_from_reporter_interval_ms */ + RayConfig::instance().raylet_report_resources_period_milliseconds()); + + // Register a commands channel. + // It's only used for GC right now. + ray_syncer_.Register( + /* message_type */ syncer::MessageType::COMMANDS, + /* reporter */ this, + /* receiver */ this, + /* pull_from_reporter_interval_ms */ 0); + + auto gcs_channel = gcs_client_->GetGcsRpcClient().GetChannel(); + ray_syncer_.Connect(kGCSNodeID.Binary(), gcs_channel); + periodical_runner_.RunFnPeriodically( + [this] { + auto triggered_by_global_gc = TryLocalGC(); + // If plasma store is under high pressure, we should try to schedule a global + // gc. + if (triggered_by_global_gc) { + ray_syncer_.OnDemandBroadcasting(syncer::MessageType::COMMANDS); + } + }, + RayConfig::instance().raylet_check_gc_period_milliseconds(), + "NodeManager.CheckGC"); }; // Register a callback to monitor new nodes and a callback to monitor removed nodes. RAY_RETURN_NOT_OK(gcs_client_->Nodes().AsyncSubscribeToNodeChange( @@ -1022,12 +1018,10 @@ void NodeManager::NodeAdded(const GcsNodeInfo &node_info) { cluster_task_manager_->ScheduleAndDispatchTasks(); } // Update the resource view if a new message has been sent. 
- if (RayConfig::instance().use_ray_syncer()) { - if (auto sync_msg = ray_syncer_.GetSyncMessage(node_id.Binary(), - syncer::MessageType::RESOURCE_VIEW)) { - if (sync_msg) { - ConsumeSyncMessage(sync_msg); - } + if (auto sync_msg = ray_syncer_.GetSyncMessage(node_id.Binary(), + syncer::MessageType::RESOURCE_VIEW)) { + if (sync_msg) { + ConsumeSyncMessage(sync_msg); } } } @@ -1762,75 +1756,6 @@ void NodeManager::ProcessPushErrorRequestMessage(const uint8_t *message_data) { RAY_CHECK_OK(gcs_client_->Errors().AsyncReportJobError(error_data_ptr, nullptr)); } -void NodeManager::HandleUpdateResourceUsage(rpc::UpdateResourceUsageRequest request, - rpc::UpdateResourceUsageReply *reply, - rpc::SendReplyCallback send_reply_callback) { - if (RayConfig::instance().use_ray_syncer()) { - RAY_LOG(WARNING) - << "There is a GCS outside of this cluster sending message to this raylet."; - send_reply_callback(Status::OK(), nullptr, nullptr); - return; - } - - rpc::ResourceUsageBroadcastData resource_usage_batch; - resource_usage_batch.ParseFromString(request.serialized_resource_usage_batch()); - // When next_resource_seq_no_ == 0 it means it just started. - // TODO: Fetch a snapshot from gcs for lightweight resource broadcasting - if (next_resource_seq_no_ != 0 && - resource_usage_batch.seq_no() != next_resource_seq_no_) { - // TODO (Alex): Ideally we would be really robust, and potentially eagerly - // pull a full resource "snapshot" from gcs to make sure our state doesn't - // diverge from GCS. - RAY_LOG(WARNING) - << "Raylet may have missed a resource broadcast. This either means that GCS has " - "restarted, the network is heavily congested and is dropping, reordering, or " - "duplicating packets. Expected seq#: " - << next_resource_seq_no_ << ", but got: " << resource_usage_batch.seq_no() << "."; - if (resource_usage_batch.seq_no() < next_resource_seq_no_) { - RAY_LOG(WARNING) << "Discard the the resource update since local version is newer"; - send_reply_callback(Status::OK(), nullptr, nullptr); - return; - } - } - next_resource_seq_no_ = resource_usage_batch.seq_no() + 1; - - bool updated = false; - for (const auto &resource_change_or_data : resource_usage_batch.batch()) { - if (resource_change_or_data.has_data()) { - const auto &resource_usage = resource_change_or_data.data(); - auto node_id = NodeID::FromBinary(resource_usage.node_id()); - // Skip messages from self. 
- if (node_id != self_node_id_) { - if (UpdateResourceUsage(node_id, resource_usage)) { - updated = true; - } - } - } else if (resource_change_or_data.has_change()) { - const auto &resource_notification = resource_change_or_data.change(); - auto node_id = NodeID::FromBinary(resource_notification.node_id()); - if (resource_notification.updated_resources_size() != 0) { - auto resources = ResourceMapToResourceRequest( - MapFromProtobuf(resource_notification.updated_resources()), false); - if (ResourceCreateUpdated(node_id, resources)) { - updated = true; - } - } - - if (resource_notification.deleted_resources_size() != 0) { - if (ResourceDeleted( - node_id, VectorFromProtobuf(resource_notification.deleted_resources()))) { - updated = true; - } - } - } - } - - if (updated) { - cluster_task_manager_->ScheduleAndDispatchTasks(); - } - send_reply_callback(Status::OK(), nullptr, nullptr); -} - void NodeManager::HandleRequestResourceReport( rpc::RequestResourceReportRequest request, rpc::RequestResourceReportReply *reply, diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index 4e01848d44db..930b28be0bb0 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -485,10 +485,6 @@ class NodeManager : public rpc::NodeManagerServiceHandler, void ProcessSubscribePlasmaReady(const std::shared_ptr &client, const uint8_t *message_data); - void HandleUpdateResourceUsage(rpc::UpdateResourceUsageRequest request, - rpc::UpdateResourceUsageReply *reply, - rpc::SendReplyCallback send_reply_callback) override; - /// Handle a `RequestResourceReport` request. void HandleRequestResourceReport(rpc::RequestResourceReportRequest request, rpc::RequestResourceReportReply *reply, diff --git a/src/ray/raylet/scheduling/cluster_resource_manager.cc b/src/ray/raylet/scheduling/cluster_resource_manager.cc index b71fa347e188..645754451bf2 100644 --- a/src/ray/raylet/scheduling/cluster_resource_manager.cc +++ b/src/ray/raylet/scheduling/cluster_resource_manager.cc @@ -24,20 +24,18 @@ namespace ray { ClusterResourceManager::ClusterResourceManager(instrumented_io_context &io_service) : timer_(io_service) { - if (RayConfig::instance().use_ray_syncer()) { - timer_.RunFnPeriodically( - [this]() { - auto syncer_delay = absl::Milliseconds( - RayConfig::instance().ray_syncer_message_refresh_interval_ms()); - for (auto &[node_id, resource] : received_node_resources_) { - auto modified_ts = GetNodeResourceModifiedTs(node_id); - if (modified_ts && *modified_ts + syncer_delay < absl::Now()) { - AddOrUpdateNode(node_id, resource); - } + timer_.RunFnPeriodically( + [this]() { + auto syncer_delay = absl::Milliseconds( + RayConfig::instance().ray_syncer_message_refresh_interval_ms()); + for (auto &[node_id, resource] : received_node_resources_) { + auto modified_ts = GetNodeResourceModifiedTs(node_id); + if (modified_ts && *modified_ts + syncer_delay < absl::Now()) { + AddOrUpdateNode(node_id, resource); } - }, - RayConfig::instance().ray_syncer_message_refresh_interval_ms()); - } + } + }, + RayConfig::instance().ray_syncer_message_refresh_interval_ms()); } std::optional ClusterResourceManager::GetNodeResourceModifiedTs( diff --git a/src/ray/rpc/node_manager/node_manager_server.h b/src/ray/rpc/node_manager/node_manager_server.h index d027d1b85fab..4ed18f113335 100644 --- a/src/ray/rpc/node_manager/node_manager_server.h +++ b/src/ray/rpc/node_manager/node_manager_server.h @@ -25,7 +25,6 @@ namespace rpc { /// NOTE: See src/ray/core_worker/core_worker.h on how to add a new grpc handler. 
#define RAY_NODE_MANAGER_RPC_HANDLERS \ - RPC_SERVICE_HANDLER(NodeManagerService, UpdateResourceUsage, -1) \ RPC_SERVICE_HANDLER(NodeManagerService, RequestResourceReport, -1) \ RPC_SERVICE_HANDLER(NodeManagerService, GetResourceLoad, -1) \ RPC_SERVICE_HANDLER(NodeManagerService, NotifyGCSRestart, -1) \ @@ -64,10 +63,6 @@ class NodeManagerServiceHandler { /// \param[out] reply The reply message. /// \param[in] send_reply_callback The callback to be called when the request is done. - virtual void HandleUpdateResourceUsage(rpc::UpdateResourceUsageRequest request, - rpc::UpdateResourceUsageReply *reply, - rpc::SendReplyCallback send_reply_callback) = 0; - virtual void HandleRequestResourceReport( rpc::RequestResourceReportRequest request, rpc::RequestResourceReportReply *reply, diff --git a/src/ray/util/process.cc b/src/ray/util/process.cc index a3e8d6bb6055..34be370198d4 100644 --- a/src/ray/util/process.cc +++ b/src/ray/util/process.cc @@ -629,6 +629,8 @@ bool IsProcessAlive(pid_t pid) { } return false; #else + // Note that if the process is a zombie (dead but not yet reaped), it will + // still be considered alive by this check. if (kill(pid, 0) == -1 && errno == ESRCH) { return false; }
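The comment added to IsProcessAlive points out that a kill(pid, 0) probe still reports zombies (exited but not yet reaped processes) as alive. A standalone POSIX sketch of that probe follows; treating EPERM as "alive but not signalable" is the common convention assumed here, not necessarily Ray's exact behavior:

```cpp
#include <cerrno>
#include <csignal>
#include <iostream>
#include <sys/types.h>
#include <unistd.h>

// Signal 0 performs error checking only: nothing is delivered, but errno
// tells us whether the target pid currently exists.
bool ProbeProcessAlive(pid_t pid) {
  if (kill(pid, 0) == -1) {
    if (errno == ESRCH) {
      return false;  // no such process
    }
    // EPERM: the process exists but we lack permission to signal it.
    return true;
  }
  // Succeeded; note a zombie (exited, not yet reaped) still counts as alive.
  return true;
}

int main() {
  std::cout << "self alive: " << ProbeProcessAlive(getpid()) << "\n";
  std::cout << "pid 1 alive: " << ProbeProcessAlive(1) << "\n";
}
```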