diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml index f24b10a5f529..a22ffa3081ae 100644 --- a/.buildkite/pipeline.build.yml +++ b/.buildkite/pipeline.build.yml @@ -220,6 +220,9 @@ # Todo (krfricke): Move mosaicml to train-test-requirements.txt - pip install "mosaicml==0.12.1" - DOC_TESTING=1 ./ci/env/install-dependencies.sh + # TODO(scottjlee): Move datasets to train/data-test-requirements.txt + # (see https://github.com/ray-project/ray/pull/38432/) + - pip install "datasets==2.14.0" - ./ci/env/install-horovod.sh - ./ci/env/env_info.sh - bazel test --config=ci $(./scripts/bazel_export_options) @@ -383,6 +386,57 @@ - ray job submit --address http://localhost:8265 --runtime-env python/ray/tests/chaos/runtime_env.yaml --working-dir python/ray/tests/chaos -- python potato_passer.py --num-actors=3 --pass-times=1000 --sleep-secs=0.01 +- label: ":kubernetes: :mending_heart: :ray-serve: serve chaos network delay test" + conditions: ["RAY_CI_LINUX_WHEELS_AFFECTED"] + instance_size: medium + commands: + - | + cleanup() { + if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi + kind delete cluster + } + trap cleanup EXIT + - ./ci/env/install-minimal.sh 3.8 + - PYTHON=3.8 ./ci/env/install-dependencies.sh + # Specifying above somehow messes up the Ray install. + # Uninstall and re-install Ray so that we can use Ray Client. + # (Remove thirdparty_files to sidestep an issue with psutil.) + - pip uninstall -y ray && rm -rf /ray/python/ray/thirdparty_files + - pip install -e /ray/python + - echo "--- Setting up local kind cluster." + - ./ci/k8s/prep-k8s-environment.sh + - ./ci/k8s/prep-helm.sh + - echo "--- Building py38-cpu Ray image for the test." + - LINUX_WHEELS=1 ./ci/ci.sh build + - pip install -q docker + - python ci/build/build-docker-images.py --py-versions py38 --device-types cpu --build-type LOCAL --build-base + # Tag the image built in the last step. We want to be sure to distinguish the image from the real Ray nightly. + - docker tag rayproject/ray:nightly-py38-cpu ray-ci:kuberay-test + # Load the image into the kind node + - kind load docker-image ray-ci:kuberay-test + # Helm install KubeRay + - echo "--- Installing KubeRay operator and cluster." + - helm repo add kuberay https://ray-project.github.io/kuberay-helm/ + - helm install kuberay-operator kuberay/kuberay-operator + - kubectl wait pod -l app.kubernetes.io/name=kuberay-operator --for=condition=Ready=True --timeout=5m + # We are on an m4i.xlarge with 4 CPUs, so we can't have too many nodes. + - helm install raycluster kuberay/ray-cluster --set image.repository=ray-ci --set image.tag=kuberay-test --set worker.replicas=2 --set worker.resources.limits.cpu=500m --set worker.resources.requests.cpu=500m --set head.resources.limits.cpu=500m --set head.resources.requests.cpu=500m --set head.containerEnv[0].name=RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING --set head.containerEnv[0].value=\"1\" + - kubectl wait pod -l ray.io/cluster=raycluster-kuberay --for=condition=Ready=True --timeout=5m + - kubectl port-forward --address 0.0.0.0 service/raycluster-kuberay-head-svc 8265:8265 & + # Helm install chaos-mesh + - echo "--- Installing chaos-mesh operator and CR."
+ - helm repo add chaos-mesh https://charts.chaos-mesh.org + - kubectl create ns chaos-mesh + - helm install chaos-mesh chaos-mesh/chaos-mesh -n=chaos-mesh --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/run/containerd/containerd.sock --version 2.6.1 + - kubectl wait pod --namespace chaos-mesh -l app.kubernetes.io/instance=chaos-mesh --for=condition=Ready=True + - echo "--- Running the script without faults" + - ray job submit --address http://localhost:8265 --runtime-env python/ray/tests/chaos/streaming_llm.yaml --working-dir python/ray/tests/chaos -- python streaming_llm.py --num_queries_per_task=100 --num_tasks=2 --num_words_per_query=100 + # Now add the delay, rerun the job + - kubectl apply -f python/ray/tests/chaos/chaos_network_delay.yaml + - echo "--- Running the script with a network delay fault" + - ray job submit --address http://localhost:8265 --runtime-env python/ray/tests/chaos/streaming_llm.yaml --working-dir python/ray/tests/chaos -- python streaming_llm.py --num_queries_per_task=100 --num_tasks=2 --num_words_per_query=100 + + - label: ":book: Documentation" commands: - export LINT=1 @@ -445,4 +499,6 @@ - TRAIN_MINIMAL_INSTALL=1 ./ci/env/install-minimal.sh - ./ci/env/env_info.sh - python ./ci/env/check_minimal_install.py - - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=minimal python/ray/train/... + - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=minimal + --test_env=RAY_AIR_NEW_PERSISTENCE_MODE=1 + python/ray/train/... diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 8395236268a0..d788555d5454 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -67,11 +67,13 @@ - DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh - pip install -Ur ./python/requirements/ml/dl-gpu-requirements.txt - ./ci/env/install-horovod.sh - - ./ci/env/env_info.sh # Test examples with newer version of `transformers` # TODO(amogkam): Remove when https://github.com/ray-project/ray/issues/36011 # is resolved. - - pip install transformers==4.30.2 + # TODO(scottjlee): Move datasets to train/data-test-requirements.txt + # (see https://github.com/ray-project/ray/pull/38432/) + - pip install transformers==4.30.2 datasets==2.14.0 + - ./ci/env/env_info.sh - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=doctest,-cpu python/ray/... doc/...
diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml index 3eed205e90ce..b52728f07b44 100644 --- a/.buildkite/pipeline.ml.yml +++ b/.buildkite/pipeline.ml.yml @@ -32,7 +32,7 @@ - label: ":steam_locomotive: Train tests and examples" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] instance_size: large - parallelism: 4 + parallelism: 3 commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT # Todo (krfricke): Move mosaicml to train-test-requirements.txt @@ -343,7 +343,7 @@ - label: ":steam_locomotive: :floppy_disk: New persistence mode: Train tests and examples" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] instance_size: large - parallelism: 4 + parallelism: 3 commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT # Todo (krfricke): Move mosaicml to train-test-requirements.txt @@ -454,6 +454,9 @@ commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - DATA_PROCESSING_TESTING=1 ARROW_VERSION=12.* ./ci/env/install-dependencies.sh + # TODO(scottjlee): Move datasets to train/data-test-requirements.txt + # (see https://github.com/ray-project/ray/pull/38432/) + - pip install "datasets==2.14.0" - ./ci/env/env_info.sh - ./ci/run/run_bazel_test_with_sharding.sh --config=ci $(./ci/run/bazel_export_options) --action_env=RAY_DATA_USE_STREAMING_EXECUTOR=1 --build_tests_only --test_tag_filters=-data_integration,-doctest python/ray/data/... - ./ci/run/run_bazel_test_with_sharding.sh --config=ci $(./ci/run/bazel_export_options) --action_env=RAY_DATA_USE_STREAMING_EXECUTOR=1 --build_tests_only --test_tag_filters=ray_data,-doctest python/ray/air/... @@ -465,6 +468,9 @@ commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - DATA_PROCESSING_TESTING=1 ARROW_VERSION=nightly ./ci/env/install-dependencies.sh + # TODO(scottjlee): Move datasets to train/data-test-requirements.txt + # (see https://github.com/ray-project/ray/pull/38432/) + - pip install "datasets==2.14.0" - ./ci/env/env_info.sh - ./ci/run/run_bazel_test_with_sharding.sh --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-data_integration,-doctest python/ray/data/... - ./ci/run/run_bazel_test_with_sharding.sh --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_data,-doctest python/ray/air/... @@ -476,6 +482,9 @@ commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - DATA_PROCESSING_TESTING=1 ARROW_VERSION=12.* ./ci/env/install-dependencies.sh + # TODO(scottjlee): Move datasets to train/data-test-requirements.txt + # (see https://github.com/ray-project/ray/pull/38432/) + - pip install "datasets==2.14.0" - ./ci/env/env_info.sh - ./ci/run/run_bazel_test_with_sharding.sh --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-data_integration,-doctest python/ray/data/... - ./ci/run/run_bazel_test_with_sharding.sh --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_data,-doctest python/ray/air/... 
@@ -510,7 +519,9 @@ - DOC_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh # TODO (shrekris-anyscale): Remove transformers after core transformer # requirement is upgraded - - pip install "transformers==4.30.2" + # TODO(scottjlee): Move datasets to train/data-test-requirements.txt + # (see https://github.com/ray-project/ray/pull/38432/) + - pip install "transformers==4.30.2" "datasets==2.14.0" - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-timeseries_libs,-external,-ray_air,-gpu,-post_wheel_build,-doctest,-datasets_train,-highly_parallel doc/... diff --git a/LICENSE b/LICENSE index 7a30b06a8457..429d535fb33a 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright {yyyy} {name of copyright owner} + Copyright 2023 Ray Authors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -447,4 +447,4 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the License. \ No newline at end of file +limitations under the License. diff --git a/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py b/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py index 14ab570448da..b9792a2703c6 100644 --- a/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py +++ b/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py @@ -139,8 +139,8 @@ unit="replicas", targets=[ Target( - expr="sum(ray_serve_deployment_replica_healthy{{{global_filters}}}) by (deployment)", - legend="{{deployment}}", + expr="sum(ray_serve_deployment_replica_healthy{{{global_filters}}}) by (application, deployment)", + legend="{{application, deployment}}", ), ], grid_pos=GridPos(0, 2, 8, 8), @@ -152,8 +152,8 @@ unit="qps", targets=[ Target( - expr='sum(rate(ray_serve_deployment_request_counter{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (deployment)', - legend="{{deployment}}", + expr='sum(rate(ray_serve_deployment_request_counter{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment)', + legend="{{application, deployment}}", ), ], grid_pos=GridPos(8, 2, 8, 8), @@ -165,8 +165,8 @@ unit="qps", targets=[ Target( - expr='sum(rate(ray_serve_deployment_error_counter{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (deployment)', - legend="{{deployment}}", + expr='sum(rate(ray_serve_deployment_error_counter{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment)', + legend="{{application, deployment}}", ), ], grid_pos=GridPos(16, 2, 8, 8), @@ -178,8 +178,8 @@ unit="ms", targets=[ Target( - expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (deployment, le))', - legend="{{deployment}}", + expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, le))', + legend="{{application, deployment}}", ), Target( expr='histogram_quantile(0.5, 
sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))', @@ -197,8 +197,8 @@ unit="ms", targets=[ Target( - expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (deployment, le))', - legend="{{deployment}}", + expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, le))', + legend="{{application, deployment}}", ), Target( expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))', @@ -216,8 +216,8 @@ unit="ms", targets=[ Target( - expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (deployment, le))', - legend="{{deployment}}", + expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, le))', + legend="{{application, deployment}}", ), Target( expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))', @@ -235,8 +235,8 @@ unit="requests", targets=[ Target( - expr="sum(ray_serve_deployment_queued_queries{{{global_filters}}}) by (deployment)", - legend="{{deployment}}", + expr="sum(ray_serve_deployment_queued_queries{{{global_filters}}}) by (application, deployment)", + legend="{{application, deployment}}", ), ], fill=0, diff --git a/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py b/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py index b1c26dd5cdf9..be42f9c03132 100644 --- a/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py +++ b/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py @@ -15,8 +15,8 @@ unit="replicas", targets=[ Target( - expr="sum(ray_serve_deployment_replica_healthy{{{global_filters}}}) by (deployment)", - legend="{{deployment}}", + expr="sum(ray_serve_deployment_replica_healthy{{{global_filters}}}) by (application, deployment)", + legend="{{application, deployment}}", ), ], grid_pos=GridPos(0, 0, 8, 8), @@ -28,7 +28,7 @@ unit="qps", targets=[ Target( - expr='sum(rate(ray_serve_deployment_request_counter{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (deployment, replica)', + expr='sum(rate(ray_serve_deployment_request_counter{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica)', legend="{{replica}}", ), ], @@ -41,7 +41,7 @@ unit="qps", targets=[ Target( - expr='sum(rate(ray_serve_deployment_error_counter{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (deployment, replica)', + expr='sum(rate(ray_serve_deployment_error_counter{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica)', legend="{{replica}}", ), ], @@ -54,7 +54,7 @@ unit="ms", targets=[ Target( - expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (deployment, replica, le))', + expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by 
(application, deployment, replica, le))', legend="{{replica}}", ), Target( @@ -73,7 +73,7 @@ unit="ms", targets=[ Target( - expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (deployment, replica, le))', + expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))', legend="{{replica}}", ), Target( @@ -92,7 +92,7 @@ unit="ms", targets=[ Target( - expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (deployment, replica, le))', + expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))', legend="{{replica}}", ), Target( @@ -111,8 +111,8 @@ unit="requests", targets=[ Target( - expr="sum(ray_serve_deployment_queued_queries{{{global_filters}}}) by (deployment)", - legend="{{deployment}}", + expr="sum(ray_serve_deployment_queued_queries{{{global_filters}}}) by (application, deployment)", + legend="{{application, deployment}}", ), ], fill=0, @@ -126,7 +126,7 @@ unit="requests", targets=[ Target( - expr="sum(ray_serve_replica_pending_queries{{{global_filters}}}) by (deployment, replica)", + expr="sum(ray_serve_replica_pending_queries{{{global_filters}}}) by (application, deployment, replica)", legend="{{replica}}", ), ], @@ -141,7 +141,7 @@ unit="requests", targets=[ Target( - expr="sum(ray_serve_replica_processing_queries{{{global_filters}}}) by (deployment, replica)", + expr="sum(ray_serve_replica_processing_queries{{{global_filters}}}) by (application, deployment, replica)", legend="{{replica}}", ), ], @@ -156,7 +156,7 @@ unit="models", targets=[ Target( - expr="sum(ray_serve_num_multiplexed_models{{{global_filters}}}) by (deployment, replica)", + expr="sum(ray_serve_num_multiplexed_models{{{global_filters}}}) by (application, deployment, replica)", legend="{{replica}}", ), ], @@ -171,7 +171,7 @@ unit="times", targets=[ Target( - expr="sum(ray_serve_multiplexed_models_load_counter{{{global_filters}}}) by (deployment, replica)", + expr="sum(ray_serve_multiplexed_models_load_counter{{{global_filters}}}) by (application, deployment, replica)", legend="{{replica}}", ), ], @@ -186,7 +186,7 @@ unit="times", targets=[ Target( - expr="sum(ray_serve_multiplexed_models_unload_counter{{{global_filters}}}) by (deployment, replica)", + expr="sum(ray_serve_multiplexed_models_unload_counter{{{global_filters}}}) by (application, deployment, replica)", legend="{{replica}}", ), ], @@ -201,7 +201,7 @@ unit="ms", targets=[ Target( - expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{{{global_filters}}}[5m])) by (deployment, replica, le))", + expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{{{global_filters}}}[5m])) by (application, deployment, replica, le))", legend="{{replica}}", ), ], @@ -216,7 +216,7 @@ unit="ms", targets=[ Target( - expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{{{global_filters}}}[5m])) by (deployment, replica, le))", + expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{{{global_filters}}}[5m])) by (application, deployment, replica, le))", legend="{{replica}}", ), ], @@ 
-264,6 +264,10 @@ name="SERVE_DEPLOYMENT", default_uid="rayServeDeploymentDashboard", panels=SERVE_DEPLOYMENT_GRAFANA_PANELS, - standard_global_filters=['deployment=~"$Deployment"', 'replica=~"$Replica"'], + standard_global_filters=[ + 'application=~"$Application"', + 'deployment=~"$Deployment"', + 'replica=~"$Replica"', + ], base_json_file_name="serve_deployment_grafana_dashboard_base.json", ) diff --git a/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json b/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json index 76be2c1c5cdd..40cfff967058 100644 --- a/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json +++ b/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json @@ -52,7 +52,42 @@ ] }, "datasource": "${datasource}", - "definition": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, deployment)", + "definition": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Application", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",{global_filters}}}, deployment)", "description": null, "error": null, "hide": 0, @@ -62,7 +97,7 @@ "name": "Deployment", "options": [], "query": { - "query": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, deployment)", + "query": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",{global_filters}}}, deployment)", "refId": "Prometheus-Instance-Variable-Query" }, "refresh": 2, @@ -87,7 +122,7 @@ ] }, "datasource": "${datasource}", - "definition": "label_values(ray_serve_deployment_replica_healthy{{deployment=~\"$Deployment\",{global_filters}}}, replica)", + "definition": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",deployment=~\"$Deployment\",{global_filters}}}, replica)", "description": null, "error": null, "hide": 0, @@ -97,7 +132,7 @@ "name": "Replica", "options": [], "query": { - "query": "label_values(ray_serve_deployment_replica_healthy{{deployment=~\"$Deployment\",{global_filters}}}, replica)", + "query": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",deployment=~\"$Deployment\",{global_filters}}}, replica)", "refId": "Prometheus-Instance-Variable-Query" }, "refresh": 2, diff --git a/doc/source/serve/api/index.md b/doc/source/serve/api/index.md index 01cd14d760dc..4cc119226195 100644 --- a/doc/source/serve/api/index.md +++ b/doc/source/serve/api/index.md @@ -53,6 +53,8 @@ This is fixed by added custom filename mappings in `source/conf.py` (look for "a serve.get_replica_context serve.get_multiplexed_model_id + serve.get_app_handle + serve.get_deployment_handle ``` ### Running Applications @@ -66,6 +68,7 @@ This is fixed by added custom filename mappings in `source/conf.py` (look for "a serve.delete serve.start 
serve.shutdown + serve.status ``` (serve-cli)= diff --git a/doc/source/serve/scaling-and-resource-allocation.md b/doc/source/serve/scaling-and-resource-allocation.md index 7fbbb8061f05..c9b9783cb06d 100644 --- a/doc/source/serve/scaling-and-resource-allocation.md +++ b/doc/source/serve/scaling-and-resource-allocation.md @@ -80,7 +80,9 @@ Ray Serve Autoscaling allows the `min_replicas` to be 0 when starting your deplo `downscale_delay_s` and `upscale_delay_s` control the frequency of doing autoscaling work. For example, if your application takes a long time to do initialization work, you can increase `downscale_delay_s` to make the downscaling happen slowly. ::: -**smoothing_factor[default_value=1.0]**: The multiplicative factor to speed up or slow down each autoscaling step. For example, when the application has high traffic volume in short period of time, you can increase `smoothing_factor` to scale up the resource quickly. You can think of this as a "gain" factor to amplify the response of the autoscaling algorithm. +**upscale_smoothing_factor[default_value=1.0]**: The multiplicative factor to speed up or slow down each upscaling decision. For example, when the application has high traffic volume in a short period of time, you can increase `upscale_smoothing_factor` to scale up the resource quickly. You can think of this as a "gain" factor to amplify the response of the autoscaling algorithm. + +**downscale_smoothing_factor[default_value=1.0]**: The multiplicative factor to speed up or slow down each downscaling decision. For example, if you want your application to be less sensitive to drops in traffic and scale down more conservatively, you can decrease `downscale_smoothing_factor` to slow down the pace of downscaling. **metrics_interval_s[default_value=10]**: This controls how often each replica sends metrics to the autoscaler. (Normally you don't need to change this config.) diff --git a/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb b/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb index f0b2136c19d4..1c347ee3d39c 100644 --- a/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb +++ b/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb @@ -7,13 +7,13 @@ "source": [ "(vicuna_lightning_deepspeed_finetuning)=\n", "\n", - "# Fine-tune `vicuna-13b` with Ray LightningTrainer and DeepSpeed\n", + "# Fine-tune `vicuna-13b` with Lightning and DeepSpeed\n", "\n", - "In this example, we will demonstrate how to perform full fine-tuning for a [`vicuna-13b-v1.3`](https://huggingface.co/lmsys/vicuna-13b-v1.3) model using LightningTrainer with the DeepSpeed ZeRO-3 strategy.\n", + "In this example, we will demonstrate how to perform full fine-tuning for a [`vicuna-13b-v1.3`](https://huggingface.co/lmsys/vicuna-13b-v1.3) model using Ray Train PyTorch Lightning integrations with the DeepSpeed ZeRO-3 strategy.\n", "\n", "- [DeepSpeed]() is an open-source deep learning optimization library for PyTorch. It's designed to reduce computing power and memory usage, and to train large distributed models by leveraging state-of-the-art innovations like ZeRO, 3D-Parallelism, DeepSpeed-MoE, and ZeRO-Infinity. 
\n", "- PyTorch Lightning offers a [DeepSpeed integration](https://lightning.ai/docs/pytorch/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), which provides a simple interface to configure the knobs for DeepSpeed and automatically trigger your training process with the DeepSpeed Engine.\n", - "- {class}`Ray LightningTrainer ` allows you to easily scale your PyTorch Lightning job across multiple nodes in a Ray cluster, without worrying about the underlying cluster management, autoscaling, and distributed process group settings.\n", + "- {class}`Ray TorchTrainer ` allows you to easily scale your PyTorch Lightning job across multiple nodes in a Ray cluster, without worrying about the underlying cluster management, autoscaling, and distributed process group settings.\n", "\n", "Our demo aims to illustrate how these three tools can be combined effectively to finetune the Vicuna-13B model, leveraging the strengths of each to create an efficient and high-performance deep learning solution.\n" ] @@ -24,11 +24,11 @@ "metadata": {}, "source": [ "```{note}\n", - "This is an advanced example of Large Language Model fine-tuning with Ray Train. If you're a beginner or new to the concepts of Ray Train and LightningTrainer, it would be beneficial to first explore the introductory documentation below to build a foundational understanding. \n", + "This is an advanced example of Large Language Model fine-tuning with Ray Train. If you're a beginner or new to the concepts of Ray Train and our Lightning integrations, it would be beneficial to first explore the introductory documentation below to build a foundational understanding. \n", "- [Ray Train Key Concepts](train-key-concepts) \n", "- [Ray Data Key Concepts](data_key_concepts)\n", - "- {ref}`[Basic] Image Classification with LightningTrainer `\n", - "- {ref}`[Intermediate] Using LightningTrainer with Ray Data `\n", + "- {ref}`[Basic] Image Classification with PyTorch Lightning and Ray Train `\n", + "- {ref}`[Intermediate] Fine-tuning Lightning Modules with with Ray Data `\n", "```\n" ] }, @@ -81,6 +81,21 @@ "```" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# TODO(@justinvyu): Remove it\n", + "import os\n", + "os.environ[\"RAY_AIR_NEW_PERSISTENCE_MODE\"] = \"1\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -102,7 +117,8 @@ " \"accelerate==0.20.3\",\n", " \"transformers==4.30.2\",\n", " \"pytorch_lightning==2.0.3\",\n", - " ]\n", + " ],\n", + " \"env_vars\": {\"RAY_AIR_NEW_PERSISTENCE_MODE\": \"1\"} # TODO(@justinvyu): Remove it\n", " }\n", ")" ] @@ -219,12 +235,26 @@ "processed_ds = ray_ds.map_batches(fill_prompt, batch_format=\"pandas\").map_batches(tokenize, batch_format=\"pandas\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# To accelerate release tests\n", + "processed_ds = processed_ds.limit(16 * 8 * 16) # each worker has 16 batches" + ] + }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Define your model\n", + "## Define a Lightning Module\n", "\n", "Here we load the pre-trained model weights from HuggingFace Model Hub, and wrap them into `pl.LightningModule`. We adopted the efficient model initialization techniques introduced in [Lightning-transformers](https://github.com/Lightning-Universe/lightning-transformers) to avoid unnecessary full weights loading." 
] @@ -306,7 +336,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Training Configurations\n", + "## DeepSpeed Configurations\n", "\n", "Before training, let's calculate the memory usage of finetuning a `vicuna-13b` model. Assume we are using FP16 mixed-precision training, and the optimizer is Adam with FP32 states.\n", "\n", @@ -324,7 +354,6 @@ "metadata": {}, "outputs": [], "source": [ - "from ray.train.lightning import LightningTrainer, LightningConfigBuilder\n", "from transformers import AutoConfig\n", "\n", "config = AutoConfig.from_pretrained(MODEL_NAME)\n", @@ -342,55 +371,7 @@ " \"stage3_prefetch_bucket_size\": 0.9 * HIDDEN_SIZE * HIDDEN_SIZE,\n", " \"stage3_param_persistence_threshold\": 10 * HIDDEN_SIZE,\n", " },\n", - "}\n", - "\n", - "lightning_config = (\n", - " LightningConfigBuilder()\n", - " .module(cls=Vicuna13BModel)\n", - " .trainer(\n", - " max_epochs=1,\n", - " accelerator=\"gpu\",\n", - " precision=\"bf16-mixed\",\n", - " accumulate_grad_batches=2,\n", - " )\n", - " .strategy(name=\"deepspeed\", config=deepspeed_configs)\n", - " .checkpointing(save_top_k=0, save_weights_only=True, save_last=True)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "remove-cell" - ] - }, - "outputs": [], - "source": [ - "from pytorch_lightning.callbacks import TQDMProgressBar\n", - "\n", - "# Create a customized progress bar for LightningTrainer\n", - "class VicunaProgressBar(TQDMProgressBar):\n", - " def __init__(self, num_iters_per_epoch, *args, **kwargs):\n", - " super().__init__(*args, **kwargs)\n", - " self.num_iters_per_epoch = num_iters_per_epoch\n", - "\n", - " def on_train_epoch_start(self, trainer, *_):\n", - " super().on_train_epoch_start(trainer, *_)\n", - " self.train_progress_bar.reset(self.num_iters_per_epoch)\n", - "\n", - "\n", - "total_batches = processed_ds.count()\n", - "num_iters_per_epoch = total_batches // (NUM_WORKERS * BATCH_SIZE_PER_WORKER)\n", - "progress_bar = VicunaProgressBar(num_iters_per_epoch)\n", - "\n", - "\n", - "lightning_config.trainer(\n", - " callbacks=[progress_bar],\n", - " # Take a subset to accelerate release tests\n", - " limit_train_batches=20,\n", - ")" + "}" ] }, { @@ -398,7 +379,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, combine all the configurations with {class}`LightningConfigBuilder ` and instantiate a LightningTrainer. " + "## Define your training function\n", + "\n", + "Finally, define the training function that will be launched on multiple workers. The training function is generally the same as the pure PyTorch Lightning training code, with additional Ray Train utilities:\n", + "\n", + "- {class}`~ray.train.lightning.RayDeepSpeedStrategy`: Same argument list as Lightning DeepSpeedStrategy, but integrated with Ray Train.\n", + "- {class}`~ray.train.lightning.RayLightningEnvironment`: The Lightning environment for the Ray cluster.\n", + "- {class}`~ray.train.lightning.RayTrainReportCallback`: At the end of each epoch, it reports the checkpoint from each worker to Ray Train (distributed checkpointing).\n", + "- {meth}`~ray.train.lightning.prepare_trainer`: Validates your Lightning Trainer configuration.\n", + "\n", + "For Ray Data ingestion, we fetch the preprocessed and sharded dataset with {meth}`~ray.train.get_dataset_shard`, and create a dataloader with {meth}`~ray.data.Dataset.iter_torch_batches`.
It returns a custom iterator that replaces the Torch DataLoader.\n" ] }, { @@ -407,19 +397,58 @@ "metadata": {}, "outputs": [], "source": [ + "import ray.train\n", "from ray.train import CheckpointConfig, RunConfig, ScalingConfig\n", + "from ray.train.torch import TorchTrainer\n", + "from ray.train.lightning import (\n", + " prepare_trainer,\n", + " RayDeepSpeedStrategy, \n", + " RayLightningEnvironment, \n", + " RayTrainReportCallback\n", + ")\n", "\n", - "trainer = LightningTrainer(\n", - " lightning_config=lightning_config.build(),\n", + "\n", + "def train_func(config):\n", + " \"\"\"Training function for each worker.\"\"\"\n", + "\n", + " # Unpack the `train_loop_config`\n", + " max_epochs = config[\"max_epochs\"]\n", + " batch_size = config[\"batch_size\"]\n", + " accumulate_grad_batches = config[\"accumulate_grad_batches\"]\n", + "\n", + " model = Vicuna13BModel()\n", + " \n", + " # Prepare Ray Data Ingestion\n", + " train_ds = ray.train.get_dataset_shard(\"train\")\n", + " train_dataloader = train_ds.iter_torch_batches(batch_size=batch_size)\n", + " \n", + " pl_trainer = pl.Trainer(\n", + " devices=\"auto\",\n", + " accelerator=\"auto\",\n", + " strategy=RayDeepSpeedStrategy(config=deepspeed_configs),\n", + " plugins=[RayLightningEnvironment()],\n", + " callbacks=[RayTrainReportCallback()],\n", + " enable_checkpointing=False, # RayTrainReportCallback will save the checkpoints\n", + " max_epochs=max_epochs,\n", + " precision=\"bf16-mixed\",\n", + " accumulate_grad_batches=accumulate_grad_batches,\n", + " )\n", + " pl_trainer = prepare_trainer(pl_trainer)\n", + "\n", + " pl_trainer.fit(model, train_dataloaders=train_dataloader)\n", + " \n", + "\n", + "trainer = TorchTrainer(\n", + " train_loop_per_worker=train_func,\n", + " train_loop_config={\n", + " \"max_epochs\": 1,\n", + " \"batch_size\": BATCH_SIZE_PER_WORKER,\n", + " \"accumulate_grad_batches\": 2\n", + " },\n", " run_config=RunConfig(\n", " name=\"vicuna-13b-finetune\",\n", " storage_path=\"s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/air-release-tests\",\n", - " checkpoint_config=CheckpointConfig(\n", - " num_to_keep=1,\n", - " # Enable distributed checkpointing\n", - " _checkpoint_keep_all_ranks=True,\n", - " _checkpoint_upload_from_workers=True,\n", - " ),\n", + " checkpoint_config=CheckpointConfig(num_to_keep=1),\n", " ),\n", " scaling_config=ScalingConfig(\n", " num_workers=NUM_WORKERS,\n", @@ -427,22 +456,9 @@ " resources_per_worker={\"CPU\": 15, \"GPU\": 1},\n", " ),\n", " datasets={\"train\": processed_ds},\n", - " datasets_iter_config={\"batch_size\": BATCH_SIZE_PER_WORKER},\n", ")" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```{tip}\n", - "\n", - "Here, we highly recommend saving checkpoints with cloud storage and enabling distributed checkpointing by setting `_checkpoint_keep_all_ranks` and `_checkpoint_upload_from_workers` to True when training huge models. Otherwise, all checkpoint shards will be synced to the head node, which may introduce enormous syncing overhead and even cause out-of-memory.\n", - "\n", - "```" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -450,7 +466,7 @@ "source": [ "## Model Fine-tuning\n", "\n", - "Once everything is configured in LightningTrainer, training becomes easy. Simply call `trainer.fit()`, and your workload will be scaled to the Ray cluster, initiating ZeRO-3 parallel training." + "Once everything is configured in TorchTrainer, training becomes easy. 
Simply call `trainer.fit()`, and your workload will be scaled to the Ray cluster, initiating ZeRO-3 parallel training." ] }, { @@ -1022,7 +1038,7 @@ "- Training takes: 36:06 = 2166s\n", "- Training + initialization + checkpointing takes 2473s\n", "\n", - "Therefore, the model initialization and checkpoint syncing takes 307s. It will be amortized when you have larger datasets and spend more time on training." + "Model initialization and checkpoint synchronization took 307 seconds. It will be amortized as you have larger datasets and take more time to train." ] }, { @@ -1091,7 +1107,7 @@ "source": [ "import os\n", "\n", - "os.system(f\"awsv2 s3 sync {result.checkpoint.uri} /mnt/local_storage/checkpoint\")" + "os.system(f\"awsv2 s3 sync s3://{result.checkpoint.path} /mnt/local_storage\")" ] }, { @@ -1136,8 +1152,8 @@ " torch.save(vicuna_state_dict, os.path.join(zero_ckpt_dir, \"full_model.pt\"))\n", "\n", "\n", - "full_model_ckpt_path = \"/mnt/local_storage/checkpoint/model/full_model.pt\"\n", - "extract_fp32_ckpt_from_zero(\"/mnt/local_storage/checkpoint/model\")" + "full_model_ckpt_path = \"/mnt/local_storage/checkpoint.ckpt/full_model.pt\"\n", + "extract_fp32_ckpt_from_zero(\"/mnt/local_storage/checkpoint.ckpt\")" ] }, { diff --git a/python/ray/_private/serialization.py b/python/ray/_private/serialization.py index 35925c8b455f..94969ec4440e 100644 --- a/python/ray/_private/serialization.py +++ b/python/ray/_private/serialization.py @@ -4,6 +4,8 @@ import traceback from typing import Any +import google.protobuf.message + import ray._private.utils import ray.cloudpickle as pickle from ray._private import ray_constants @@ -296,11 +298,22 @@ def _deserialize_object(self, data, metadata, object_ref): elif error_type == ErrorType.Value("LOCAL_RAYLET_DIED"): return LocalRayletDiedError() elif error_type == ErrorType.Value("TASK_CANCELLED"): - error_message = "" - if data: - error_info = self._deserialize_error_info(data, metadata_fields) - error_message = error_info.error_message - return TaskCancelledError(error_message=error_message) + # Task cancellations are serialized in two ways, so check both + # deserialization paths. + # TODO(swang): We should only have one serialization path. + try: + # Deserialization from C++ (the CoreWorker task submitter). + # The error info will be stored as a RayErrorInfo. + error_message = "" + if data: + error_info = self._deserialize_error_info(data, metadata_fields) + error_message = error_info.error_message + return TaskCancelledError(error_message=error_message) + except google.protobuf.message.DecodeError: + # Deserialization from Python. The TaskCancelledError is + # serialized and returned directly. + obj = self._deserialize_msgpack_data(data, metadata_fields) + return RayError.from_bytes(obj) elif error_type == ErrorType.Value("OBJECT_LOST"): return ObjectLostError( object_ref.hex(), object_ref.owner_address(), object_ref.call_site() @@ -423,8 +436,17 @@ def _serialize_to_msgpack(self, value): contained_object_refs = [] if isinstance(value, RayTaskError): - metadata = str(ErrorType.Value("TASK_EXECUTION_EXCEPTION")).encode("ascii") - value = value.to_bytes() + if issubclass(value.cause.__class__, TaskCancelledError): + # Handle task cancellation errors separately because we never + # want to warn about tasks that were intentionally cancelled by + # the user. 
+ metadata = str(ErrorType.Value("TASK_CANCELLED")).encode("ascii") + value = value.to_bytes() + else: + metadata = str(ErrorType.Value("TASK_EXECUTION_EXCEPTION")).encode( + "ascii" + ) + value = value.to_bytes() elif isinstance(value, ray.actor.ActorHandle): # TODO(fyresone): ActorHandle should be serialized via the # custom type feature of cross-language. diff --git a/python/ray/air/result.py b/python/ray/air/result.py index 35e42bd5c304..93532b16a581 100644 --- a/python/ray/air/result.py +++ b/python/ray/air/result.py @@ -1,6 +1,7 @@ import os import json import pandas as pd +import pyarrow import warnings from dataclasses import dataclass from pathlib import Path @@ -46,7 +47,6 @@ class Result: saved checkpoints is determined by the ``checkpoint_config`` argument of ``run_config`` (by default, all checkpoints will be saved). - """ metrics: Optional[Dict[str, Any]] @@ -56,7 +56,8 @@ class Result: best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]] = None _local_path: Optional[str] = None _remote_path: Optional[str] = None - _items_to_repr = ["error", "metrics", "path", "checkpoint"] + _storage_filesystem: Optional[pyarrow.fs.FileSystem] = None + _items_to_repr = ["error", "metrics", "path", "filesystem", "checkpoint"] # Deprecate: raise in 2.5, remove in 2.6 log_dir: Optional[Path] = None @@ -86,13 +87,23 @@ def path(self) -> str: """Path pointing to the result directory on persistent storage. This can point to a remote storage location (e.g. S3) or to a local - location (path on the head node). + location (path on the head node). The path is accessible via the result's + associated `filesystem`. - For instance, if your remote storage path is ``s3://bucket/location``, - this will point to ``s3://bucket/location/experiment_name/trial_name``. + For instance, for a result stored in S3 at ``s3://bucket/location``, + ``path`` will have the value ``bucket/location``. """ return self._remote_path or self._local_path + @property + def filesystem(self) -> pyarrow.fs.FileSystem: + """Return the filesystem that can be used to access the result path. + + Returns: + pyarrow.fs.FileSystem implementation. 
+ """ + return self._storage_filesystem or pyarrow.fs.LocalFileSystem() + def _repr(self, indent: int = 0) -> str: """Construct the representation with specified number of space indent.""" from ray.tune.result import AUTO_RESULT_KEYS @@ -104,6 +115,8 @@ def _repr(self, indent: int = 0) -> str: else: shown_attributes.pop("error") + shown_attributes["filesystem"] = shown_attributes["filesystem"].type_name + if self.metrics: exclude = set(AUTO_RESULT_KEYS) exclude.update(BLACKLISTED_KEYS) @@ -218,6 +231,7 @@ def from_path(cls, path: Union[str, os.PathLike]) -> "Result": checkpoint=latest_checkpoint, _local_path=local_path, _remote_path=None, + _storage_filesystem=pyarrow.fs.LocalFileSystem(), metrics_dataframe=metrics_df, best_checkpoints=best_checkpoints, error=error, diff --git a/python/ray/data/_internal/logical/optimizers.py b/python/ray/data/_internal/logical/optimizers.py index 9821ee406c15..72dfd7035976 100644 --- a/python/ray/data/_internal/logical/optimizers.py +++ b/python/ray/data/_internal/logical/optimizers.py @@ -6,19 +6,30 @@ PhysicalPlan, Rule, ) -from ray.data._internal.logical.rules._default_optimizer_rules import ( - get_logical_optimizer_rules, - get_physical_optimizer_rules, +from ray.data._internal.logical.rules._user_provided_optimizer_rules import ( + USER_PROVIDED_LOGICAL_RULES, + USER_PROVIDED_PHYSICAL_RULES, ) +from ray.data._internal.logical.rules.operator_fusion import OperatorFusionRule +from ray.data._internal.logical.rules.randomize_blocks import ReorderRandomizeBlocksRule from ray.data._internal.planner.planner import Planner +DEFAULT_LOGICAL_RULES = [ + ReorderRandomizeBlocksRule, +] + +DEFAULT_PHYSICAL_RULES = [ + OperatorFusionRule, +] + class LogicalOptimizer(Optimizer): """The optimizer for logical operators.""" @property def rules(self) -> List[Rule]: - return [rule_cls() for rule_cls in get_logical_optimizer_rules()] + rules = DEFAULT_LOGICAL_RULES + USER_PROVIDED_LOGICAL_RULES + return [rule_cls() for rule_cls in rules] class PhysicalOptimizer(Optimizer): @@ -26,7 +37,8 @@ class PhysicalOptimizer(Optimizer): @property def rules(self) -> List["Rule"]: - return [rule_cls() for rule_cls in get_physical_optimizer_rules()] + rules = DEFAULT_PHYSICAL_RULES + USER_PROVIDED_PHYSICAL_RULES + return [rule_cls() for rule_cls in rules] def get_execution_plan(logical_plan: LogicalPlan) -> PhysicalPlan: diff --git a/python/ray/data/_internal/logical/rules/_default_optimizer_rules.py b/python/ray/data/_internal/logical/rules/_default_optimizer_rules.py deleted file mode 100644 index 410a1c13a4ee..000000000000 --- a/python/ray/data/_internal/logical/rules/_default_optimizer_rules.py +++ /dev/null @@ -1,17 +0,0 @@ -from ray.data._internal.logical.rules.operator_fusion import OperatorFusionRule -from ray.data._internal.logical.rules.randomize_blocks import ReorderRandomizeBlocksRule -from ray.data._internal.logical.rules.zero_copy_map_fusion import ( - EliminateBuildOutputBlocks, -) - - -def get_logical_optimizer_rules(): - rules = [ReorderRandomizeBlocksRule] - return rules - - -def get_physical_optimizer_rules(): - # Subclasses of ZeroCopyMapFusionRule (e.g., EliminateBuildOutputBlocks) should - # be run after OperatorFusionRule. 
- rules = [OperatorFusionRule, EliminateBuildOutputBlocks] - return rules diff --git a/python/ray/data/_internal/logical/rules/_user_provided_optimizer_rules.py b/python/ray/data/_internal/logical/rules/_user_provided_optimizer_rules.py new file mode 100644 index 000000000000..0034b13c669c --- /dev/null +++ b/python/ray/data/_internal/logical/rules/_user_provided_optimizer_rules.py @@ -0,0 +1,7 @@ +# Users can provide extra logical optimization rules here +# to be used in `LogicalOptimizer`. +USER_PROVIDED_LOGICAL_RULES = [] + +# Users can provide extra physical optimization rules here +# to be used in `PhysicalOptimizer`. +USER_PROVIDED_PHYSICAL_RULES = [] diff --git a/python/ray/data/datasource/__init__.py b/python/ray/data/datasource/__init__.py index 85a09de0da7e..2f8f704d52e2 100644 --- a/python/ray/data/datasource/__init__.py +++ b/python/ray/data/datasource/__init__.py @@ -41,6 +41,9 @@ from ray.data.datasource.tfrecords_datasource import TFRecordDatasource from ray.data.datasource.webdataset_datasource import WebDatasetDatasource +# Note: HuggingFaceDatasource should NOT be imported here, because +# we want to only import the Hugging Face datasets library when we use +# ray.data.from_huggingface() or HuggingFaceDatasource() directly. __all__ = [ "BaseFileMetadataProvider", "BinaryDatasource", diff --git a/python/ray/data/datasource/file_based_datasource.py b/python/ray/data/datasource/file_based_datasource.py index 9cb46187972d..28ad9b43e91a 100644 --- a/python/ray/data/datasource/file_based_datasource.py +++ b/python/ray/data/datasource/file_based_datasource.py @@ -59,6 +59,15 @@ # 16 file size fetches from S3 takes ~1.5 seconds with Arrow's S3FileSystem. PATHS_PER_FILE_SIZE_FETCH_TASK = 16 +# The errors to retry for opening file. +OPEN_FILE_RETRY_ON_ERRORS = ["AWS Error SLOW_DOWN"] + +# The max retry backoff in seconds for opening file. +OPEN_FILE_RETRY_MAX_BACKOFF_SECONDS = 32 + +# The max number of attempts for opening file. +OPEN_FILE_MAX_ATTEMPTS = 10 + @DeveloperAPI class BlockWritePathProvider: @@ -348,7 +357,10 @@ def write( ) write_path = os.path.join(path, filename) logger.get_logger().debug(f"Writing {write_path} file.") - with fs.open_output_stream(write_path, **open_stream_args) as f: + with _open_file_with_retry( + write_path, + lambda: fs.open_output_stream(write_path, **open_stream_args), + ) as f: _write_row_to_file( f, row, @@ -366,7 +378,10 @@ def write( file_format=file_format, ) logger.get_logger().debug(f"Writing {write_path} file.") - with fs.open_output_stream(write_path, **open_stream_args) as f: + with _open_file_with_retry( + write_path, + lambda: fs.open_output_stream(write_path, **open_stream_args), + ) as f: _write_block_to_file( f, block, @@ -567,7 +582,10 @@ def read_files( parse = PathPartitionParser(partitioning) partitions = parse(read_path) - with open_input_source(fs, read_path, **open_stream_args) as f: + with _open_file_with_retry( + read_path, + lambda: open_input_source(fs, read_path, **open_stream_args), + ) as f: for data in read_stream(f, read_path, **reader_args): if partitions: data = _add_partitions(data, partitions) @@ -909,3 +927,44 @@ def _fetch_metadata_parallel( fetch_tasks.append(remote_fetch_func.remote(uri_chunk)) results = metadata_fetch_bar.fetch_until_complete(fetch_tasks) yield from itertools.chain.from_iterable(results) + + +def _open_file_with_retry( + file_path: str, + open_file: Callable[[], "pyarrow.NativeFile"], +) -> "pyarrow.NativeFile": + """Open file with an exponential backoff retry strategy. 
+ + This is to avoid transient task failure with remote storage (such as S3), + when the remote storage throttles the requests. + """ + import random + import time + + if OPEN_FILE_MAX_ATTEMPTS < 1: + raise ValueError( + "OPEN_FILE_MAX_ATTEMPTS cannot be negative or 0. Got: " + f"{OPEN_FILE_MAX_ATTEMPTS}" + ) + + for i in range(OPEN_FILE_MAX_ATTEMPTS): + try: + return open_file() + except Exception as e: + error_message = str(e) + is_retryable = any( + [error in error_message for error in OPEN_FILE_RETRY_ON_ERRORS] + ) + if is_retryable and i + 1 < OPEN_FILE_MAX_ATTEMPTS: + # Retry with binary exponential backoff with random jitter. + backoff = min( + (2 ** (i + 1)) * random.random(), + OPEN_FILE_RETRY_MAX_BACKOFF_SECONDS, + ) + logger.get_logger().debug( + f"Retrying attempt {i+1} to open file {file_path} after " + f"{backoff} seconds." + ) + time.sleep(backoff) + else: + raise e from None diff --git a/python/ray/data/datasource/file_meta_provider.py b/python/ray/data/datasource/file_meta_provider.py index b90049b2e5b9..400e9b31c0a6 100644 --- a/python/ray/data/datasource/file_meta_provider.py +++ b/python/ray/data/datasource/file_meta_provider.py @@ -220,7 +220,7 @@ def _get_block_metadata( paths: List[str], schema: Optional[Union[type, "pyarrow.lib.Schema"]], *, - pieces: List["pyarrow.dataset.ParquetFileFragment"], + num_pieces: int, prefetched_metadata: Optional[List[Any]], ) -> BlockMetadata: """Resolves and returns block metadata for files of a single dataset block. @@ -229,7 +229,8 @@ paths: The file paths for a single dataset block. schema: The user-provided or inferred schema for the given file paths, if any. - pieces: The Parquet file fragments derived from the input file paths. + num_pieces: The number of Parquet file fragments derived from the input + file paths. prefetched_metadata: Metadata previously returned from `prefetch_file_metadata()` for each file fragment, where `prefetched_metadata[i]` contains the metadata for `pieces[i]`. @@ -277,10 +278,10 @@ paths: List[str], schema: Optional[Union[type, "pyarrow.lib.Schema"]], *, - pieces: List["pyarrow.dataset.ParquetFileFragment"], + num_pieces: int, prefetched_metadata: Optional[List["pyarrow.parquet.FileMetaData"]], ) -> BlockMetadata: - if prefetched_metadata is not None and len(prefetched_metadata) == len(pieces): + if prefetched_metadata is not None and len(prefetched_metadata) == num_pieces: # Piece metadata was available, construct a normal # BlockMetadata. block_metadata = BlockMetadata( diff --git a/python/ray/data/datasource/huggingface_datasource.py b/python/ray/data/datasource/huggingface_datasource.py new file mode 100644 index 000000000000..3aed1135a3c8 --- /dev/null +++ b/python/ray/data/datasource/huggingface_datasource.py @@ -0,0 +1,115 @@ +import sys +from typing import TYPE_CHECKING, Iterable, List, Optional, Union + +from ray.data._internal.dataset_logger import DatasetLogger +from ray.data._internal.util import _check_pyarrow_version +from ray.data.block import Block, BlockMetadata +from ray.data.datasource import Datasource, Reader, ReadTask +from ray.util.annotations import DeveloperAPI + +logger = DatasetLogger(__name__) + +if TYPE_CHECKING: + import datasets + + +TRANSFORMERS_IMPORT_ERROR: Optional[ImportError] = None + +try: + # Due to HF Dataset's dynamic module system, we need to dynamically import the + # datasets_modules module on every actor when training.
+ # We accomplish this by simply running the following bit of code directly + # in the module you are currently viewing. This ensures that when we + # unpickle the Dataset, it runs before pickle tries to + # import datasets_modules and prevents an exception from being thrown. + # Same logic is present inside Ray's TransformersTrainer and HF Transformers Ray + # integration: https://github.com/huggingface/transformers/blob/\ + # 7d5fde991d598370d961be8cb7add6541e2b59ce/src/transformers/integrations.py#L271 + # Also see https://github.com/ray-project/ray/issues/28084 + from transformers.utils import is_datasets_available + + if "datasets_modules" not in sys.modules and is_datasets_available(): + import importlib + import os + + import datasets.load + + dynamic_modules_path = os.path.join( + datasets.load.init_dynamic_modules(), "__init__.py" + ) + # load dynamic_modules from path + spec = importlib.util.spec_from_file_location( + "datasets_modules", dynamic_modules_path + ) + datasets_modules = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = datasets_modules + spec.loader.exec_module(datasets_modules) +except ImportError as e: + TRANSFORMERS_IMPORT_ERROR = e + + +@DeveloperAPI +class HuggingFaceDatasource(Datasource): + """Hugging Face Dataset datasource, for reading from a + `Hugging Face Datasets Dataset `_. + This Datasource implements a streamed read using a + single read task, most beneficial for a + `Hugging Face Datasets IterableDataset `_ + or datasets which are too large to fit in-memory. + For an in-memory Hugging Face Dataset (`datasets.Dataset`), use :meth:`~ray.data.from_huggingface` + directly for faster performance. + """ # noqa: E501 + + def create_reader( + self, + dataset: Union["datasets.Dataset", "datasets.IterableDataset"], + ) -> "_HuggingFaceDatasourceReader": + if TRANSFORMERS_IMPORT_ERROR is not None: + raise TRANSFORMERS_IMPORT_ERROR + return _HuggingFaceDatasourceReader(dataset) + + +class _HuggingFaceDatasourceReader(Reader): + def __init__( + self, + dataset: Union["datasets.Dataset", "datasets.IterableDataset"], + batch_size: int = 4096, + ): + self._dataset = dataset + self._batch_size = batch_size + + def estimate_inmemory_data_size(self) -> Optional[int]: + return self._dataset.dataset_size + + def get_read_tasks( + self, + parallelism: int, + ) -> List[ReadTask]: + # Note: `parallelism` arg is currently not used by HuggingFaceDatasource. + # We always generate a single ReadTask to perform the read. + _check_pyarrow_version() + import pyarrow + + def _read_dataset(dataset: "datasets.IterableDataset") -> Iterable[Block]: + for batch in dataset.with_format("arrow").iter(batch_size=self._batch_size): + block = pyarrow.Table.from_pydict(batch) + yield block + + # TODO(scottjlee): IterableDataset doesn't provide APIs + # for getting number of rows, byte size, etc., so the + # BlockMetadata is currently empty. Properly retrieve + # or calculate these so that progress bars have meaning. 
+ meta = BlockMetadata( + num_rows=None, + size_bytes=None, + schema=None, + input_files=None, + exec_stats=None, + ) + read_tasks: List[ReadTask] = [ + ReadTask( + lambda hfds=self._dataset: _read_dataset(hfds), + meta, + ) + ] + return read_tasks diff --git a/python/ray/data/datasource/parquet_datasource.py b/python/ray/data/datasource/parquet_datasource.py index a95e2534f9bd..79e3be4beda9 100644 --- a/python/ray/data/datasource/parquet_datasource.py +++ b/python/ray/data/datasource/parquet_datasource.py @@ -262,7 +262,12 @@ def __init__( ) except OSError as e: _handle_read_os_error(e, paths) - self._pq_ds = pq_ds + + # NOTE: Store the custom serialized `ParquetFileFragment` to avoid unexpected + # network calls when `_ParquetDatasourceReader` is serialized. See + # `_SerializedPiece()` implementation for more details. + self._pq_pieces = [_SerializedPiece(p) for p in pq_ds.pieces] + self._pq_paths = [p.path for p in pq_ds.pieces] self._meta_provider = meta_provider self._inferred_schema = inferred_schema self._block_udf = _block_udf @@ -285,18 +290,18 @@ def get_read_tasks(self, parallelism: int) -> List[ReadTask]: # which simplifies partitioning logic. We still use # FileBasedDatasource's write side (do_write), however. read_tasks = [] - for pieces, metadata in zip( - np.array_split(self._pq_ds.pieces, parallelism), + for pieces, paths, metadata in zip( + np.array_split(self._pq_pieces, parallelism), + np.array_split(self._pq_paths, parallelism), np.array_split(self._metadata, parallelism), ): if len(pieces) <= 0: continue - serialized_pieces = [_SerializedPiece(p) for p in pieces] - input_files = [p.path for p in pieces] + meta = self._meta_provider( - input_files, + paths, self._inferred_schema, - pieces=pieces, + num_pieces=len(pieces), prefetched_metadata=metadata, ) # If there is a filter operation, reset the calculated row count, @@ -333,7 +338,7 @@ def get_read_tasks(self, parallelism: int) -> List[ReadTask]: ) read_tasks.append( ReadTask( - lambda p=serialized_pieces: _read_pieces( + lambda p=pieces: _read_pieces( block_udf, reader_args, default_read_batch_size, @@ -359,7 +364,7 @@ def _estimate_files_encoding_ratio(self) -> float: # Launch tasks to sample multiple files remotely in parallel. # Evenly distributed to sample N rows in i-th row group in i-th file. # TODO(ekl/cheng) take into account column pruning. - num_files = len(self._pq_ds.pieces) + num_files = len(self._pq_pieces) num_samples = int(num_files * PARQUET_ENCODING_RATIO_ESTIMATE_SAMPLING_RATIO) min_num_samples = min( PARQUET_ENCODING_RATIO_ESTIMATE_MIN_NUM_SAMPLES, num_files @@ -372,7 +377,7 @@ def _estimate_files_encoding_ratio(self) -> float: # Evenly distributed to choose which file to sample, to avoid biased prediction # if data is skewed. file_samples = [ - self._pq_ds.pieces[idx] + self._pq_pieces[idx] for idx in np.linspace(0, num_files - 1, num_samples).astype(int).tolist() ] @@ -383,13 +388,12 @@ def _estimate_files_encoding_ratio(self) -> float: # Sample the first rows batch in i-th file. # Use SPREAD scheduling strategy to avoid packing many sampling tasks on # same machine to cause OOM issue, as sampling can be memory-intensive. 
- serialized_sample = _SerializedPiece(sample) futures.append( sample_piece.options(scheduling_strategy=scheduling).remote( self._reader_args, self._columns, self._schema, - serialized_sample, + sample, ) ) sample_bar = ProgressBar("Parquet Files Sample", len(futures)) diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index 1eaa3d59516d..ae039e3d2ce8 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -2131,13 +2131,13 @@ def from_spark( @PublicAPI -def from_huggingface(dataset: "datasets.Dataset") -> MaterializedDataset: - """Create a :class:`~ray.data.Dataset` from a - `Hugging Face Datasets Dataset `_. - - This function isn't parallelized, and is intended to be used - with Hugging Face Datasets that are loaded into memory (as opposed - to memory-mapped). +def from_huggingface( + dataset: Union["datasets.Dataset", "datasets.IterableDataset"], +) -> Union[MaterializedDataset, Dataset]: + """Create a :class:`~ray.data.MaterializedDataset` from a + `Hugging Face Datasets Dataset `_ + or a :class:`~ray.data.Dataset` from a `Hugging Face Datasets IterableDataset `_. + For an `IterableDataset`, we use a streaming implementation to read data. Example: @@ -2154,6 +2154,10 @@ def from_huggingface(dataset: "datasets.Dataset") -> MaterializedDataset: ray_ds = ray.data.from_huggingface(hf_dataset["train"]) print(ray_ds) + hf_dataset_stream = datasets.load_dataset("tweet_eval", "emotion", streaming=True) + ray_ds_stream = ray.data.from_huggingface(hf_dataset_stream["train"]) + print(ray_ds_stream) + .. testoutput:: :options: +MOCK @@ -2162,11 +2166,16 @@ def from_huggingface(dataset: "datasets.Dataset") -> MaterializedDataset: num_rows=3257, schema={text: string, label: int64} ) + Dataset( + num_blocks=..., + num_rows=3257, + schema={text: string, label: int64} + ) Args: - dataset: A `Hugging Face Datasets Dataset`_. - ``IterableDataset`` and + dataset: A `Hugging Face Datasets Dataset`_ or `Hugging Face Datasets IterableDataset`_. `DatasetDict `_ + and `IterableDatasetDict `_ are not supported. Returns: @@ -2174,22 +2183,33 @@ def from_huggingface(dataset: "datasets.Dataset") -> MaterializedDataset: """ # noqa: E501 import datasets + if isinstance(dataset, datasets.IterableDataset): + # HuggingFaceDatasource should not be imported at top level, because + # we only want the Hugging Face datasets package to be imported + # if Hugging Face Datasets are used. + from ray.data.datasource.huggingface_datasource import HuggingFaceDatasource + + # For an IterableDataset, we can use a streaming implementation to read data. + return read_datasource( + HuggingFaceDatasource(), + dataset=dataset, + ) if isinstance(dataset, datasets.Dataset): # To get the resulting Arrow table from a Hugging Face Dataset after - # applying transformations (e.g. train_test_split(), shard(), select()), + # applying transformations (e.g., train_test_split(), shard(), select()), # we create a copy of the Arrow table, which applies the indices # mapping from the transformations. hf_ds_arrow = dataset.with_format("arrow") ray_ds = from_arrow(hf_ds_arrow[:]) return ray_ds - elif isinstance(dataset, datasets.DatasetDict): + elif isinstance(dataset, (datasets.DatasetDict, datasets.IterableDatasetDict)): available_keys = list(dataset.keys()) raise DeprecationWarning( - "You provided a Hugging Face DatasetDict which contains multiple " - "datasets, but `from_huggingface` now only accepts a single Hugging Face " - "Dataset. 
To convert just a single Hugging Face Dataset to a " - "Ray Dataset, specify a split. For example, " - "`ray.data.from_huggingface(my_dataset_dictionary" + "You provided a Hugging Face DatasetDict or IterableDatasetDict, " + "which contains multiple datasets, but `from_huggingface` now " + "only accepts a single Hugging Face Dataset. To convert just " + "a single Hugging Face Dataset to a Ray Dataset, specify a split. " + "For example, `ray.data.from_huggingface(my_dataset_dictionary" f"['{available_keys[0]}'])`. " f"Available splits are {available_keys}." ) diff --git a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index 0d5de4acbf6e..8faa975243ba 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -325,6 +325,7 @@ def test_lazy_block_list(shutdown_only, target_max_block_size): assert block_metadata.schema is not None +@pytest.mark.skip("Needs zero-copy optimization for read->map_batches.") def test_read_large_data(ray_start_cluster): # Test 20G input with single task num_blocks_per_task = 20 diff --git a/python/ray/data/tests/test_execution_optimizer.py b/python/ray/data/tests/test_execution_optimizer.py index 10fd6bb99174..78ca90330410 100644 --- a/python/ray/data/tests/test_execution_optimizer.py +++ b/python/ray/data/tests/test_execution_optimizer.py @@ -1484,6 +1484,7 @@ def check_transform_fns(op, expected_types): assert isinstance(transform_fn, expected_types[i]), transform_fn +@pytest.mark.skip("Needs zero-copy optimization for read->map_batches.") def test_zero_copy_fusion_eliminate_build_output_blocks( ray_start_regular_shared, enable_optimizer ): diff --git a/python/ray/data/tests/test_file_based_datasource.py b/python/ray/data/tests/test_file_based_datasource.py index 0f79e2f3dace..850034f89c9d 100644 --- a/python/ray/data/tests/test_file_based_datasource.py +++ b/python/ray/data/tests/test_file_based_datasource.py @@ -6,6 +6,10 @@ import ray from ray.data.block import BlockAccessor from ray.data.datasource import FileBasedDatasource +from ray.data.datasource.file_based_datasource import ( + OPEN_FILE_MAX_ATTEMPTS, + _open_file_with_retry, +) class MockFileBasedDatasource(FileBasedDatasource): @@ -37,6 +41,38 @@ def test_write_creates_dir(tmp_path, ray_start_regular_shared): assert os.path.isdir(path) +def test_open_file_with_retry(ray_start_regular_shared): + class FlakyFileOpener: + def __init__(self, max_attempts: int): + self.retry_attempts = 0 + self.max_attempts = max_attempts + + def open(self): + self.retry_attempts += 1 + if self.retry_attempts < self.max_attempts: + raise OSError( + "When creating key x in bucket y: AWS Error SLOW_DOWN during " + "PutObject operation: Please reduce your request rate." + ) + return "dummy" + + original_max_attempts = OPEN_FILE_MAX_ATTEMPTS + try: + # Test openning file successfully after retries. + opener = FlakyFileOpener(3) + assert _open_file_with_retry("dummy", lambda: opener.open()) == "dummy" + + # Test exhausting retries and failed eventually. 
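(For readers unfamiliar with the helper under test: `_open_file_with_retry`'s implementation is not part of this diff, but the behaviour the test exercises is a capped retry loop over transient `OSError`s. A hedged sketch of that general pattern, with hypothetical names and a backoff policy chosen only for illustration:

```python
import time

def open_with_retry(description, open_fn, max_attempts=10, base_backoff_s=0.1):
    """Call open_fn(), retrying transient OSErrors up to max_attempts times."""
    for attempt in range(1, max_attempts + 1):
        try:
            return open_fn()
        except OSError:
            if attempt == max_attempts:
                # Out of attempts: surface the last error to the caller.
                raise
            # Back off briefly before the next attempt.
            time.sleep(base_backoff_s * attempt)
```

In the test, the flaky opener succeeds on its third call, so the first assertion passes; the block that follows caps the attempt budget below the opener's failure count to confirm the error is eventually re-raised.)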
+ ray.data.datasource.file_based_datasource.OPEN_FILE_MAX_ATTEMPTS = 3 + opener = FlakyFileOpener(4) + with pytest.raises(OSError): + _open_file_with_retry("dummy", lambda: opener.open()) + finally: + ray.data.datasource.file_based_datasource.OPEN_FILE_MAX_ATTEMPTS = ( + original_max_attempts + ) + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_huggingface.py b/python/ray/data/tests/test_huggingface.py index e1d726828767..311f75e78d08 100644 --- a/python/ray/data/tests/test_huggingface.py +++ b/python/ray/data/tests/test_huggingface.py @@ -41,6 +41,18 @@ def test_huggingface(ray_start_regular_shared): assert ray_dataset_split_test.count() == hf_dataset_split["test"].num_rows +@pytest.mark.skipif( + datasets.Version(datasets.__version__) < datasets.Version("2.8.0"), + reason="IterableDataset.iter() added in 2.8.0", +) +def test_from_huggingface_streaming(ray_start_regular_shared): + hfds = datasets.load_dataset("tweet_eval", "emotion", streaming=True, split="train") + + assert isinstance(hfds, datasets.IterableDataset) + ds = ray.data.from_huggingface(hfds) + assert ds.count() == 3257 + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_metadata_provider.py b/python/ray/data/tests/test_metadata_provider.py index d706921716bb..54ddca394b1c 100644 --- a/python/ray/data/tests/test_metadata_provider.py +++ b/python/ray/data/tests/test_metadata_provider.py @@ -70,7 +70,7 @@ def test_file_metadata_providers_not_implemented(): meta_provider.expand_paths(["/foo/bar.csv"], None) meta_provider = ParquetMetadataProvider() with pytest.raises(NotImplementedError): - meta_provider(["/foo/bar.csv"], None, pieces=[], prefetched_metadata=None) + meta_provider(["/foo/bar.csv"], None, num_pieces=0, prefetched_metadata=None) assert meta_provider.prefetch_file_metadata(["test"]) is None @@ -112,7 +112,7 @@ def test_default_parquet_metadata_provider(fs, data_path): meta = meta_provider( [p.path for p in pq_ds.pieces], pq_ds.schema, - pieces=pq_ds.pieces, + num_pieces=len(pq_ds.pieces), prefetched_metadata=file_metas, ) expected_meta_size_bytes = _get_parquet_file_meta_size_bytes(file_metas) diff --git a/python/ray/serve/BUILD b/python/ray/serve/BUILD index 4c0d6d66c6b0..c62ef75f7cc2 100644 --- a/python/ray/serve/BUILD +++ b/python/ray/serve/BUILD @@ -177,6 +177,14 @@ py_test( deps = [":serve_lib"], ) +py_test( + name = "test_telemetry_2", + size = "large", + srcs = serve_tests_srcs, + tags = ["exclusive", "team:serve"], + deps = [":serve_lib"], +) + py_test( name = "test_batching", size = "small", diff --git a/python/ray/serve/_private/autoscaling_policy.py b/python/ray/serve/_private/autoscaling_policy.py index 8207ea61c731..80abde22e4e5 100644 --- a/python/ray/serve/_private/autoscaling_policy.py +++ b/python/ray/serve/_private/autoscaling_policy.py @@ -42,8 +42,19 @@ def calculate_desired_num_replicas( / autoscaling_config.target_num_ongoing_requests_per_replica ) + # If error ratio >= 1, then the number of ongoing requests per + # replica exceeds the target and we will make an upscale decision, + # so we apply the upscale smoothing factor. Otherwise, the number of + # ongoing requests per replica is lower than the target and we will + # make a downscale decision, so we apply the downscale smoothing + # factor. 
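To make the branch described above concrete, the surrounding function computes `error_ratio` as the average number of ongoing requests per replica divided by the target, scales its distance from 1 by the chosen smoothing factor, and rounds up. With the numbers used in the new tests further down (10 replicas, target of 1): 4.0 ongoing requests per replica and an upscale factor of 0.5 gives ceil(10 * (1 + (4 - 1) * 0.5)) = 25 replicas. A minimal sketch of that arithmetic (simplified: no min/max replica clamping):

```python
import math

def desired_replicas(current, ongoing_per_replica, target, upscale=1.0, downscale=1.0):
    # Mirrors the core of calculate_desired_num_replicas() above.
    error_ratio = ongoing_per_replica / target
    smoothing = upscale if error_ratio >= 1 else downscale
    smoothed_error_ratio = 1 + (error_ratio - 1) * smoothing
    return math.ceil(current * smoothed_error_ratio)

assert desired_replicas(10, 4.0, 1, upscale=0.5) == 25    # upscale branch
assert desired_replicas(10, 0.25, 1, downscale=0.5) == 7  # downscale branch: ceil(6.25)
```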
+ if error_ratio >= 1: + smoothing_factor = autoscaling_config.get_upscale_smoothing_factor() + else: + smoothing_factor = autoscaling_config.get_downscale_smoothing_factor() + # Multiply the distance to 1 by the smoothing ("gain") factor (default=1). - smoothed_error_ratio = 1 + ((error_ratio - 1) * autoscaling_config.smoothing_factor) + smoothed_error_ratio = 1 + ((error_ratio - 1) * smoothing_factor) desired_num_replicas = math.ceil(current_num_replicas * smoothed_error_ratio) # If error_ratio = 0, meaning there is no more traffic, and desired @@ -148,7 +159,7 @@ def get_decision_num_replicas( # When 0 replicas and queries are queued, scale up the replicas if current_handle_queued_queries > 0: return max( - math.ceil(1 * self.config.smoothing_factor), + math.ceil(1 * self.config.get_upscale_smoothing_factor()), curr_target_num_replicas, ) return curr_target_num_replicas diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py index b2d10d4c0902..a382fd37c7dc 100644 --- a/python/ray/serve/_private/deployment_state.py +++ b/python/ray/serve/_private/deployment_state.py @@ -1888,7 +1888,7 @@ def _stop_replica(self, replica, graceful_stop=True): self.health_check_gauge.set( 0, tags={ - "deployment": str(self._id), + "deployment": self.deployment_name, "replica": replica.replica_tag, "application": self.app_name, }, @@ -1907,7 +1907,7 @@ def _check_and_update_replicas(self): self.health_check_gauge.set( 1, tags={ - "deployment": str(self._id), + "deployment": self.deployment_name, "replica": replica.replica_tag, "application": self.app_name, }, @@ -1922,7 +1922,7 @@ def _check_and_update_replicas(self): self.health_check_gauge.set( 0, tags={ - "deployment": str(self._id), + "deployment": self.deployment_name, "replica": replica.replica_tag, "application": self.app_name, }, diff --git a/python/ray/serve/_private/http_proxy.py b/python/ray/serve/_private/http_proxy.py index 6e9e692d5370..a14435998fd1 100644 --- a/python/ray/serve/_private/http_proxy.py +++ b/python/ray/serve/_private/http_proxy.py @@ -518,7 +518,7 @@ async def proxy_request(self, proxy_request: ProxyRequest) -> ProxyResponse: ) self.deployment_request_error_counter.inc( tags={ - "deployment": str(handle.deployment_id), + "deployment": handle.deployment_id.name, "error_code": proxy_response.status_code, "method": method, "route": route_path, diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index 1a173dd1ccbd..10b250a8cc60 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -1063,7 +1063,7 @@ def __init__( ) # TODO(zcin): use deployment name and application name instead of deployment id self.num_router_requests.set_default_tags( - {"deployment": str(deployment_id), "application": deployment_id.app} + {"deployment": deployment_id.name, "application": deployment_id.app} ) self.num_queued_queries = 0 @@ -1077,7 +1077,7 @@ def __init__( ) # TODO(zcin): use deployment name and application name instead of deployment id self.num_queued_queries_gauge.set_default_tags( - {"deployment": str(deployment_id), "application": deployment_id.app} + {"deployment": deployment_id.name, "application": deployment_id.app} ) self.long_poll_client = LongPollClient( diff --git a/python/ray/serve/_private/usage.py b/python/ray/serve/_private/usage.py index 405064eef880..c7e41db1d932 100644 --- a/python/ray/serve/_private/usage.py +++ b/python/ray/serve/_private/usage.py @@ -29,6 +29,9 @@ class ServeUsageTag(Enum): 
MULTIPLEXED_API_USED = TagKey.SERVE_MULTIPLEXED_API_USED HTTP_PROXY_USED = TagKey.SERVE_HTTP_PROXY_USED GRPC_PROXY_USED = TagKey.SERVE_GRPC_PROXY_USED + SERVE_STATUS_API_USED = TagKey.SERVE_STATUS_API_USED + SERVE_GET_APP_HANDLE_API_USED = TagKey.SERVE_GET_APP_HANDLE_API_USED + SERVE_GET_DEPLOYMENT_HANDLE_API_USED = TagKey.SERVE_GET_DEPLOYMENT_HANDLE_API_USED def record(self, value: str): """Record telemetry value.""" diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index 6ecca03724dd..61fe216152ed 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -804,6 +804,7 @@ class MyDeployment: # Serve has not started yet return ServeStatus() + ServeUsageTag.SERVE_STATUS_API_USED.record("1") details = ServeInstanceDetails(**client.get_serve_details()) return details._get_status() @@ -840,6 +841,7 @@ def f(val: int) -> int: if ingress is None: raise RayServeException(f"Application '{name}' does not exist.") + ServeUsageTag.SERVE_GET_APP_HANDLE_API_USED.record("1") # Default to async within a deployment and sync outside a deployment. sync = get_internal_replica_context() is None return client.get_handle(ingress, name, sync=sync).options( @@ -854,6 +856,8 @@ def get_deployment_handle( ) -> DeploymentHandle: """Get a handle to the named deployment. + This is a developer API and is for advanced Ray users and library developers. + Args: deployment_name: Name of deployment to get a handle to. app_name: Application in which deployment resides. If calling @@ -879,6 +883,7 @@ def get_deployment_handle( else: app_name = internal_replica_context.app_name + ServeUsageTag.SERVE_GET_DEPLOYMENT_HANDLE_API_USED.record("1") # Default to async within a deployment and sync outside a deployment. sync = internal_replica_context is None return client.get_handle(deployment_name, app_name, sync=sync).options( diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py index b2df5533a8cc..4a4bbc21090f 100644 --- a/python/ray/serve/config.py +++ b/python/ray/serve/config.py @@ -70,6 +70,8 @@ class AutoscalingConfig(BaseModel): # Multiplicative "gain" factor to limit scaling decisions smoothing_factor: PositiveFloat = 1.0 + upscale_smoothing_factor: Optional[PositiveFloat] = None + downscale_smoothing_factor: Optional[PositiveFloat] = None # How frequently to make autoscaling decisions # loop_period_s: float = CONTROL_LOOP_PERIOD_S @@ -102,6 +104,12 @@ def replicas_settings_valid(cls, max_replicas, values): return max_replicas + def get_upscale_smoothing_factor(self) -> PositiveFloat: + return self.upscale_smoothing_factor or self.smoothing_factor + + def get_downscale_smoothing_factor(self) -> PositiveFloat: + return self.downscale_smoothing_factor or self.smoothing_factor + # TODO(architkulkarni): implement below # The num_ongoing_requests_per_replica error ratio (desired / current) # threshold for overriding `upscale_delay_s` @@ -201,7 +209,6 @@ class DeploymentConfig(BaseModel): class Config: validate_assignment = True - extra = "forbid" arbitrary_types_allowed = True # Dynamic default for max_concurrent_queries @@ -273,6 +280,10 @@ def from_proto(cls, proto: DeploymentConfigProto): else: data["user_config"] = None if "autoscaling_config" in data: + if not data["autoscaling_config"].get("upscale_smoothing_factor"): + data["autoscaling_config"]["upscale_smoothing_factor"] = None + if not data["autoscaling_config"].get("downscale_smoothing_factor"): + data["autoscaling_config"]["downscale_smoothing_factor"] = None data["autoscaling_config"] = 
AutoscalingConfig(**data["autoscaling_config"]) if "version" in data: if data["version"] == "": @@ -712,7 +723,6 @@ def fixed_number_replicas_should_exist(cls, v, values): class Config: validate_assignment = True - extra = "forbid" arbitrary_types_allowed = True diff --git a/python/ray/serve/handle.py b/python/ray/serve/handle.py index 0ecfa5e7ffb5..5ee11a21bf71 100644 --- a/python/ray/serve/handle.py +++ b/python/ray/serve/handle.py @@ -111,7 +111,7 @@ def __init__( self.request_counter.set_default_tags( { "handle": handle_tag, - "deployment": str(self.deployment_id), + "deployment": self.deployment_id.name, "application": self.deployment_id.app, } ) diff --git a/python/ray/serve/metrics.py b/python/ray/serve/metrics.py index 479bea4fa6eb..fa3cbac1e37e 100644 --- a/python/ray/serve/metrics.py +++ b/python/ray/serve/metrics.py @@ -1,7 +1,6 @@ from ray.util import metrics from typing import Tuple, Optional, Dict, List, Union from ray.serve import context -from ray.serve._private.common import DeploymentID import ray DEPLOYMENT_TAG = "deployment" @@ -49,9 +48,7 @@ def _add_serve_metric_default_tags(default_tags: Dict[str, str]): raise ValueError(f"'{APPLICATION_TAG}' tag is reserved for Ray Serve metrics") replica_context = context.get_internal_replica_context() # TODO(zcin): use replica_context.deployment for deployment tag - default_tags[DEPLOYMENT_TAG] = str( - DeploymentID(replica_context.deployment, replica_context.app_name) - ) + default_tags[DEPLOYMENT_TAG] = replica_context.deployment default_tags[REPLICA_TAG] = replica_context.replica_tag if replica_context.app_name: default_tags[APPLICATION_TAG] = replica_context.app_name diff --git a/python/ray/serve/schema.py b/python/ray/serve/schema.py index 62a399166b08..3716669ec334 100644 --- a/python/ray/serve/schema.py +++ b/python/ray/serve/schema.py @@ -58,7 +58,7 @@ def _route_prefix_format(cls, v): @PublicAPI(stability="beta") -class RayActorOptionsSchema(BaseModel, extra=Extra.forbid): +class RayActorOptionsSchema(BaseModel): """Options with which to start a replica actor.""" runtime_env: dict = Field( @@ -138,9 +138,7 @@ def runtime_env_contains_remote_uris(cls, v): @PublicAPI(stability="beta") -class DeploymentSchema( - BaseModel, extra=Extra.forbid, allow_population_by_field_name=True -): +class DeploymentSchema(BaseModel, allow_population_by_field_name=True): """ Specifies options for one deployment within a Serve application. For each deployment this can optionally be included in `ServeApplicationSchema` to override deployment @@ -315,7 +313,7 @@ def _deployment_info_to_schema(name: str, info: DeploymentInfo) -> DeploymentSch @PublicAPI(stability="beta") -class ServeApplicationSchema(BaseModel, extra=Extra.forbid): +class ServeApplicationSchema(BaseModel): """ Describes one Serve application, and currently can also be used as a standalone config to deploy a single application to a Ray cluster. @@ -532,7 +530,12 @@ class gRPCOptionsSchema(BaseModel): @PublicAPI(stability="alpha") class HTTPOptionsSchema(BaseModel): - """Options to start the HTTP Proxy with.""" + """Options to start the HTTP Proxy with. 
+ + NOTE: This config allows extra parameters to make it forward-compatible (ie + older versions of Serve are able to accept configs from a newer versions, + simply ignoring new parameters) + """ host: str = Field( default="0.0.0.0", @@ -571,13 +574,17 @@ class HTTPOptionsSchema(BaseModel): @PublicAPI(stability="alpha") -class ServeDeploySchema(BaseModel, extra=Extra.forbid): +class ServeDeploySchema(BaseModel): """ Multi-application config for deploying a list of Serve applications to the Ray cluster. This is the request JSON schema for the v2 REST API `PUT "/api/serve/applications/"`. + + NOTE: This config allows extra parameters to make it forward-compatible (ie + older versions of Serve are able to accept configs from a newer versions, + simply ignoring new parameters) """ proxy_location: DeploymentMode = Field( diff --git a/python/ray/serve/tests/conftest.py b/python/ray/serve/tests/conftest.py index a0ecfa125d7c..98ae6cc8c39a 100644 --- a/python/ray/serve/tests/conftest.py +++ b/python/ray/serve/tests/conftest.py @@ -10,7 +10,9 @@ from ray import serve from ray._private.test_utils import wait_for_condition +from ray._private.usage import usage_lib from ray.tests.conftest import pytest_runtest_makereport, propagate_logs # noqa +from ray.serve.tests.utils import check_ray_stopped, TELEMETRY_ROUTE_PREFIX # https://tools.ietf.org/html/rfc6335#section-6 MIN_DYNAMIC_PORT = 49152 @@ -145,3 +147,29 @@ def ray_instance(request): os.environ.clear() os.environ.update(original_env_vars) + + +@pytest.fixture +def manage_ray_with_telemetry(monkeypatch): + with monkeypatch.context() as m: + m.setenv("RAY_USAGE_STATS_ENABLED", "1") + m.setenv( + "RAY_USAGE_STATS_REPORT_URL", + f"http://127.0.0.1:8000{TELEMETRY_ROUTE_PREFIX}", + ) + m.setenv("RAY_USAGE_STATS_REPORT_INTERVAL_S", "1") + subprocess.check_output(["ray", "stop", "--force"]) + wait_for_condition(check_ray_stopped, timeout=5) + yield + + # Call Python API shutdown() methods to clear global variable state + serve.shutdown() + ray.shutdown() + + # Reset global state (any keys that may have been set and cached while the + # workload was running). 
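(Stepping back to the Serve schema changes above: dropping `extra = "forbid"` means these pydantic models fall back to the default `Extra.ignore` behaviour, so unknown keys written by a newer Ray version are silently dropped instead of failing validation, which is what makes the configs forward-compatible. A minimal illustration with a hypothetical schema, assuming pydantic v1 as used by these models:

```python
from pydantic import BaseModel, Extra

class TolerantOptions(BaseModel):
    # Default pydantic v1 behaviour: unknown fields are ignored.
    host: str = "0.0.0.0"

class StrictOptions(BaseModel, extra=Extra.forbid):
    host: str = "0.0.0.0"

print(TolerantOptions(host="127.0.0.1", new_version_config_key="ignored"))
try:
    StrictOptions(host="127.0.0.1", new_version_config_key="rejected")
except Exception as exc:  # pydantic raises a ValidationError here
    print(type(exc).__name__)
```

The `test_config_schemas_forward_compatible` test added later in this patch exercises exactly this behaviour on the real schemas.)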
+ usage_lib.reset_global_state() + + # Shut down Ray cluster with CLI + subprocess.check_output(["ray", "stop", "--force"]) + wait_for_condition(check_ray_stopped, timeout=5) diff --git a/python/ray/serve/tests/test_autoscaling_policy.py b/python/ray/serve/tests/test_autoscaling_policy.py index 429efdb92aba..62eff72b57a5 100644 --- a/python/ray/serve/tests/test_autoscaling_policy.py +++ b/python/ray/serve/tests/test_autoscaling_policy.py @@ -113,6 +113,52 @@ def test_smoothing_factor(self): ) assert 5 <= desired_num_replicas <= 8 # 10 + 0.5 * (2.5 - 10) = 6.25 + def test_upscale_smoothing_factor(self): + config = AutoscalingConfig( + min_replicas=0, + max_replicas=100, + target_num_ongoing_requests_per_replica=1, + upscale_smoothing_factor=0.5, + ) + num_replicas = 10 + + # Should use upscale smoothing factor of 0.5 + num_ongoing_requests = [4.0] * num_replicas + desired_num_replicas = calculate_desired_num_replicas( + autoscaling_config=config, current_num_ongoing_requests=num_ongoing_requests + ) + assert 24 <= desired_num_replicas <= 26 # 10 + 0.5 * (40 - 10) = 25 + + # Should use downscale smoothing factor of 1 (default) + num_ongoing_requests = [0.25] * num_replicas + desired_num_replicas = calculate_desired_num_replicas( + autoscaling_config=config, current_num_ongoing_requests=num_ongoing_requests + ) + assert 1 <= desired_num_replicas <= 4 # 10 + (2.5 - 10) = 2.5 + + def test_downscale_smoothing_factor(self): + config = AutoscalingConfig( + min_replicas=0, + max_replicas=100, + target_num_ongoing_requests_per_replica=1, + downscale_smoothing_factor=0.5, + ) + num_replicas = 10 + + # Should use upscale smoothing factor of 1 (default) + num_ongoing_requests = [4.0] * num_replicas + desired_num_replicas = calculate_desired_num_replicas( + autoscaling_config=config, current_num_ongoing_requests=num_ongoing_requests + ) + assert 39 <= desired_num_replicas <= 41 # 10 + (40 - 10) = 40 + + # Should use downscale smoothing factor of 0.5 + num_ongoing_requests = [0.25] * num_replicas + desired_num_replicas = calculate_desired_num_replicas( + autoscaling_config=config, current_num_ongoing_requests=num_ongoing_requests + ) + assert 5 <= desired_num_replicas <= 8 # 10 + 0.5 * (2.5 - 10) = 6.25 + def get_deployment_status(controller, name) -> DeploymentStatus: ref = ray.get(controller.get_deployment_status.remote(name, SERVE_DEFAULT_APP_NAME)) @@ -241,22 +287,31 @@ def __call__(self): @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") @pytest.mark.parametrize("smoothing_factor", [1, 0.2]) -def test_e2e_scale_up_down_with_0_replica(serve_instance, smoothing_factor): +@pytest.mark.parametrize("use_upscale_downscale_config", [True, False]) +def test_e2e_scale_up_down_with_0_replica( + serve_instance, smoothing_factor, use_upscale_downscale_config +): """Send 100 requests and check that we autoscale up, and then back down.""" controller = serve_instance._controller signal = SignalActor.remote() + autoscaling_config = { + "metrics_interval_s": 0.1, + "min_replicas": 0, + "max_replicas": 2, + "look_back_period_s": 0.2, + "downscale_delay_s": 0, + "upscale_delay_s": 0, + } + if use_upscale_downscale_config: + autoscaling_config["upscale_smoothing_factor"] = smoothing_factor + autoscaling_config["downscale_smoothing_factor"] = smoothing_factor + else: + autoscaling_config["smoothing_factor"] = smoothing_factor + @serve.deployment( - autoscaling_config={ - "metrics_interval_s": 0.1, - "min_replicas": 0, - "max_replicas": 2, - "look_back_period_s": 0.2, - "downscale_delay_s": 0, - 
"upscale_delay_s": 0, - "smoothing_factor": smoothing_factor, - }, + autoscaling_config=autoscaling_config, # We will send over a lot of queries. This will make sure replicas are # killed quickly during cleanup. graceful_shutdown_timeout_s=1, diff --git a/python/ray/serve/tests/test_cli.py b/python/ray/serve/tests/test_cli.py index 1fec45e48cc1..99ac90665398 100644 --- a/python/ray/serve/tests/test_cli.py +++ b/python/ray/serve/tests/test_cli.py @@ -310,7 +310,7 @@ def test_deploy_duplicate_routes(ray_start_stop): @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") -def test_deploy_bad_config1(ray_start_stop): +def test_deploy_bad_v2_config(ray_start_stop): """Deploy a bad config with field applications, should try to parse as v2 config.""" config_file = os.path.join( @@ -321,12 +321,16 @@ def test_deploy_bad_config1(ray_start_stop): subprocess.check_output( ["serve", "deploy", config_file], stderr=subprocess.STDOUT ) - assert "ValidationError" in e.value.output.decode("utf-8") - assert "ServeDeploySchema" in e.value.output.decode("utf-8") + + output = e.value.output.decode("utf-8") + + assert "ValidationError" in output, output + assert "ServeDeploySchema" in output, output + assert "Please ensure each application's route_prefix is unique" in output @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") -def test_deploy_bad_config2(ray_start_stop): +def test_deploy_bad_v1_config(ray_start_stop): """ Deploy a bad config without field applications, should try to parse as v1 config. """ @@ -339,8 +343,12 @@ def test_deploy_bad_config2(ray_start_stop): subprocess.check_output( ["serve", "deploy", config_file], stderr=subprocess.STDOUT ) - assert "ValidationError" in e.value.output.decode("utf-8") - assert "ServeApplicationSchema" in e.value.output.decode("utf-8") + + output = e.value.output.decode("utf-8") + + assert "none is not an allowed value" in output + assert "ValidationError" in output, output + assert "ServeApplicationSchema" in output, output @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") diff --git a/python/ray/serve/tests/test_cli_2.py b/python/ray/serve/tests/test_cli_2.py index ded9e6bae8ff..f190ce86ef37 100644 --- a/python/ray/serve/tests/test_cli_2.py +++ b/python/ray/serve/tests/test_cli_2.py @@ -924,7 +924,9 @@ def test_serving_request_through_grpc_proxy(ray_start_stop): channel = grpc.insecure_channel("localhost:9000") # Ensures ListApplications method succeeding. - ping_grpc_list_applications(channel, app_names) + wait_for_condition( + ping_grpc_list_applications, channel=channel, app_names=app_names + ) # Ensures Healthz method succeeding. ping_grpc_healthz(channel) @@ -964,7 +966,9 @@ def test_grpc_proxy_model_composition(ray_start_stop): channel = grpc.insecure_channel("localhost:9000") # Ensures ListApplications method succeeding. - ping_grpc_list_applications(channel, app_names) + wait_for_condition( + ping_grpc_list_applications, channel=channel, app_names=app_names + ) # Ensures Healthz method succeeding. 
ping_grpc_healthz(channel) diff --git a/python/ray/serve/tests/test_config.py b/python/ray/serve/tests/test_config.py index f595ff271cb8..77a2dd3750b3 100644 --- a/python/ray/serve/tests/test_config.py +++ b/python/ray/serve/tests/test_config.py @@ -14,6 +14,12 @@ from ray.serve._private.utils import DEFAULT from ray.serve.generated.serve_pb2_grpc import add_UserDefinedServiceServicer_to_server from ray.serve._private.constants import DEFAULT_GRPC_PORT +from ray.serve.schema import ( + ServeDeploySchema, + HTTPOptionsSchema, + ServeApplicationSchema, + DeploymentSchema, +) def test_autoscaling_config_validation(): @@ -59,9 +65,8 @@ def test_autoscaling_config_validation(): class TestDeploymentConfig: def test_deployment_config_validation(self): - # Test unknown key. - with pytest.raises(ValidationError): - DeploymentConfig(unknown_key=-1) + # Test config ignoring unknown keys (required for forward-compatibility) + DeploymentConfig(new_version_key=-1) # Test num_replicas validation. DeploymentConfig(num_replicas=1) @@ -343,9 +348,36 @@ def f(): assert config.init_kwargs == dict() +def test_config_schemas_forward_compatible(): + # Test configs ignoring unknown keys (required for forward-compatibility) + ServeDeploySchema( + http_options=HTTPOptionsSchema( + new_version_config_key="this config is from newer version of Ray" + ), + applications=[ + ServeApplicationSchema( + import_path="module.app", + deployments=[ + DeploymentSchema( + name="deployment", + new_version_config_key="this config is from newer version" + " of Ray", + ) + ], + new_version_config_key="this config is from newer version of Ray", + ), + ], + new_version_config_key="this config is from newer version of Ray", + ) + + def test_http_options(): HTTPOptions() HTTPOptions(host="8.8.8.8", middlewares=[object()]) + + # Test configs ignoring unknown keys (required for forward-compatibility) + HTTPOptions(new_version_config_key="this config is from newer version of Ray") + assert HTTPOptions(host=None).location == "NoServer" assert HTTPOptions(location=None).location == "NoServer" assert HTTPOptions(location=DeploymentMode.EveryNode).location == "EveryNode" diff --git a/python/ray/serve/tests/test_config_files/bad_multi_config.yaml b/python/ray/serve/tests/test_config_files/bad_multi_config.yaml index b7b9361f8c5a..4bd5ed931251 100644 --- a/python/ray/serve/tests/test_config_files/bad_multi_config.yaml +++ b/python/ray/serve/tests/test_config_files/bad_multi_config.yaml @@ -12,6 +12,7 @@ applications: increment: 1 - name: "app2" + # Route prefixes should be unique across all apps! 
route_prefix: "/app1" import_path: ray.serve.tests.test_config_files.pizza.serve_dag deployments: diff --git a/python/ray/serve/tests/test_config_files/bad_single_config.yaml b/python/ray/serve/tests/test_config_files/bad_single_config.yaml index 2d5fe2ee1797..4340f1ad51a5 100644 --- a/python/ray/serve/tests/test_config_files/bad_single_config.yaml +++ b/python/ray/serve/tests/test_config_files/bad_single_config.yaml @@ -1,7 +1,8 @@ -import_path: ray.serve.tests.test_config_files.test_dag.conditional_dag.serve_dag +# Import path is a required field +import_path: deployments: - name: Multiplier user_config: - factor: 1 + factor: 1 diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py index 8c43c0d75d9b..07e5fb1db2b5 100644 --- a/python/ray/serve/tests/test_metrics.py +++ b/python/ray/serve/tests/test_metrics.py @@ -13,7 +13,6 @@ import ray.util.state as state_api from fastapi import FastAPI from ray.serve.metrics import Counter, Histogram, Gauge -from ray.serve._private.common import DeploymentID from ray.serve._private.constants import DEFAULT_LATENCY_BUCKET_MS from ray.serve.drivers import DAGDriver from ray.serve.http_adapters import json_request @@ -107,7 +106,7 @@ async def __call__(self): "serve_replica_processing_queries", timeout=5 ) assert len(processing_requests) == 1 - assert processing_requests[0]["deployment"] == "app1_A" + assert processing_requests[0]["deployment"] == "A" assert processing_requests[0]["application"] == "app1" print("serve_replica_processing_queries exists.") @@ -115,7 +114,7 @@ async def __call__(self): "serve_replica_pending_queries", timeout=5 ) assert len(pending_requests) == 1 - assert pending_requests[0]["deployment"] == "app1_A" + assert pending_requests[0]["deployment"] == "A" assert pending_requests[0]["application"] == "app1" print("serve_replica_pending_queries exists.") @@ -216,8 +215,8 @@ def verify_error_count(do_assert=False): elif "serve_num_deployment_http_error_requests" in metrics: # deployment A should have error count 2 if do_assert: - assert 'deployment="app_A"' in metrics and "2.0" in metrics - if 'deployment="app_A"' not in metrics or "2.0" not in metrics: + assert 'deployment="A"' in metrics and "2.0" in metrics + if 'deployment="A"' not in metrics or "2.0" not in metrics: return False return True @@ -268,7 +267,7 @@ def f(*args): "serve_num_deployment_http_error_requests" ) assert len(num_deployment_errors) == 1 - assert num_deployment_errors[0]["deployment"] == "app_f" + assert num_deployment_errors[0]["deployment"] == "f" assert num_deployment_errors[0]["error_code"] == "500" assert num_deployment_errors[0]["method"] == "GET" assert num_deployment_errors[0]["application"] == "app" @@ -372,14 +371,14 @@ def verify_metrics(metric, expected_output): num_requests = get_metric_dictionaries("serve_deployment_request_counter") assert len(num_requests) == 2 - expected_output = {"route": "/f", "deployment": "app1_f", "application": "app1"} + expected_output = {"route": "/f", "deployment": "f", "application": "app1"} verify_metrics(num_requests[0], expected_output) start_metrics = get_metric_dictionaries("serve_deployment_replica_starts") assert len(start_metrics) == 2 - expected_output = {"deployment": "app1_f", "application": "app1"} + expected_output = {"deployment": "f", "application": "app1"} verify_metrics(start_metrics[0], expected_output) - expected_output = {"deployment": "app2_g", "application": "app2"} + expected_output = {"deployment": "g", "application": "app2"} 
verify_metrics(start_metrics[1], expected_output) # Latency metrics @@ -397,8 +396,8 @@ def verify_metrics(metric, expected_output): latency_metrics = get_metric_dictionaries(metric_name) print(f"checking metric {metric_name}, {latency_metrics}") assert len(latency_metrics) == 2 - expected_output1 = {"deployment": "app1_f", "application": "app1"} - expected_output2 = {"deployment": "app2_g", "application": "app2"} + expected_output1 = {"deployment": "f", "application": "app1"} + expected_output2 = {"deployment": "g", "application": "app2"} verify_metrics(latency_metrics[0], expected_output1) verify_metrics(latency_metrics[1], expected_output2) @@ -406,8 +405,8 @@ def verify_metrics(metric, expected_output): lambda: len(get_metric_dictionaries("serve_replica_processing_queries")) == 2 ) processing_queries = get_metric_dictionaries("serve_replica_processing_queries") - expected_output1 = {"deployment": "app1_f", "application": "app1"} - expected_output2 = {"deployment": "app2_g", "application": "app2"} + expected_output1 = {"deployment": "f", "application": "app1"} + expected_output2 = {"deployment": "g", "application": "app2"} verify_metrics(processing_queries[0], expected_output1) verify_metrics(processing_queries[1], expected_output2) @@ -423,15 +422,15 @@ def h(): ) err_requests = get_metric_dictionaries("serve_deployment_error_counter") assert len(err_requests) == 1 - expected_output = {"route": "/h", "deployment": "app3_h", "application": "app3"} + expected_output = {"route": "/h", "deployment": "h", "application": "app3"} verify_metrics(err_requests[0], expected_output) health_metrics = get_metric_dictionaries("serve_deployment_replica_healthy") assert len(health_metrics) == 3 expected_outputs = [ - {"deployment": "app1_f", "application": "app1"}, - {"deployment": "app2_g", "application": "app2"}, - {"deployment": "app3_h", "application": "app3"}, + {"deployment": "f", "application": "app1"}, + {"deployment": "g", "application": "app2"}, + {"deployment": "h", "application": "app3"}, ] for i in range(len(health_metrics)): verify_metrics(health_metrics[i], expected_outputs[i]) @@ -523,14 +522,12 @@ def check(): # Check replica qps & latency wait_for_route_and_name( - "serve_deployment_request_counter", "app1_f", "app1", "/app1" + "serve_deployment_request_counter", "f", "app1", "/app1" ) wait_for_route_and_name( - "serve_deployment_request_counter", "app2_g", "app2", "/app2" - ) - wait_for_route_and_name( - "serve_deployment_error_counter", "app3_h", "app3", "/app3" + "serve_deployment_request_counter", "g", "app2", "/app2" ) + wait_for_route_and_name("serve_deployment_error_counter", "h", "app3", "/app3") # Check http proxy qps & latency for metric_name in [ @@ -553,12 +550,12 @@ def check(): get_metric_dictionaries(metric_name) ) msg = f"Incorrect metrics for {metric_name}" - assert metrics_route["app1_f"] == {"/app1"}, msg - assert metrics_route["app2_g"] == {"/app2"}, msg - assert metrics_route["app3_h"] == {"/app3"}, msg - assert metrics_app_name["app1_f"] == "app1", msg - assert metrics_app_name["app2_g"] == "app2", msg - assert metrics_app_name["app3_h"] == "app3", msg + assert metrics_route["f"] == {"/app1"}, msg + assert metrics_route["g"] == {"/app2"}, msg + assert metrics_route["h"] == {"/app3"}, msg + assert metrics_app_name["f"] == "app1", msg + assert metrics_app_name["g"] == "app2", msg + assert metrics_app_name["h"] == "app3", msg def test_request_context_pass_for_handle_passing(self, serve_start_shutdown): """Test handle passing contexts between replicas""" @@ -611,12 
+608,12 @@ async def app2(self): ) = self._generate_metrics_summary( get_metric_dictionaries("serve_deployment_request_counter") ) - assert requests_metrics_route["app_G"] == {"/api", "/api2"} - assert requests_metrics_route["app_g1"] == {"/api"} - assert requests_metrics_route["app_g2"] == {"/api2"} - assert requests_metrics_app_name["app_G"] == "app" - assert requests_metrics_app_name["app_g1"] == "app" - assert requests_metrics_app_name["app_g2"] == "app" + assert requests_metrics_route["G"] == {"/api", "/api2"} + assert requests_metrics_route["g1"] == {"/api"} + assert requests_metrics_route["g2"] == {"/api2"} + assert requests_metrics_app_name["G"] == "app" + assert requests_metrics_app_name["g1"] == "app" + assert requests_metrics_app_name["g2"] == "app" def test_customer_metrics_with_context(self, serve_start_shutdown): @serve.deployment @@ -661,12 +658,7 @@ def __call__(self): return [ # NOTE(zcin): this is to match the current implementation in # Serve's _add_serve_metric_default_tags(). - str( - DeploymentID( - ray.serve.context._INTERNAL_REPLICA_CONTEXT.deployment, - ray.serve.context._INTERNAL_REPLICA_CONTEXT.app_name, - ) - ), + ray.serve.context._INTERNAL_REPLICA_CONTEXT.deployment, ray.serve.context._INTERNAL_REPLICA_CONTEXT.replica_tag, ] diff --git a/python/ray/serve/tests/test_schema.py b/python/ray/serve/tests/test_schema.py index 6a4e6d3c48cb..45fd9792f983 100644 --- a/python/ray/serve/tests/test_schema.py +++ b/python/ray/serve/tests/test_schema.py @@ -187,10 +187,9 @@ def test_extra_fields_invalid_ray_actor_options(self): # Schema should be createable with valid fields RayActorOptionsSchema.parse_obj(ray_actor_options_schema) - # Schema should raise error when a nonspecified field is included - ray_actor_options_schema["fake_field"] = None - with pytest.raises(ValidationError): - RayActorOptionsSchema.parse_obj(ray_actor_options_schema) + # Schema should NOT raise error when extra field is included + ray_actor_options_schema["extra_field"] = None + RayActorOptionsSchema.parse_obj(ray_actor_options_schema) def test_dict_defaults_ray_actor_options(self): # Dictionary fields should have empty dictionaries as defaults, not None @@ -339,10 +338,9 @@ def test_extra_fields_invalid_deployment_schema(self): # Schema should be createable with valid fields DeploymentSchema.parse_obj(deployment_schema) - # Schema should raise error when a nonspecified field is included - deployment_schema["fake_field"] = None - with pytest.raises(ValidationError): - DeploymentSchema.parse_obj(deployment_schema) + # Schema should NOT raise error when extra field is included + deployment_schema["extra_field"] = None + DeploymentSchema.parse_obj(deployment_schema) @pytest.mark.parametrize( "option", @@ -430,10 +428,9 @@ def test_extra_fields_invalid_serve_application_schema(self): # Schema should be createable with valid fields ServeApplicationSchema.parse_obj(serve_application_schema) - # Schema should raise error when a nonspecified field is included - serve_application_schema["fake_field"] = None - with pytest.raises(ValidationError): - ServeApplicationSchema.parse_obj(serve_application_schema) + # Schema should NOT raise error when extra field is included + serve_application_schema["extra_field"] = None + ServeApplicationSchema.parse_obj(serve_application_schema) @pytest.mark.parametrize("env", get_valid_runtime_envs()) def test_serve_application_valid_runtime_env(self, env): diff --git a/python/ray/serve/tests/test_telemetry.py b/python/ray/serve/tests/test_telemetry.py index 
d5a96c824841..f3dc70f7fc11 100644 --- a/python/ray/serve/tests/test_telemetry.py +++ b/python/ray/serve/tests/test_telemetry.py @@ -5,12 +5,10 @@ import subprocess from typing import Dict from fastapi import FastAPI -from starlette.requests import Request import ray from ray.dag.input_node import InputNode from ray._private.test_utils import wait_for_condition -from ray._private.usage import usage_lib from ray import serve from ray.serve.context import get_global_client @@ -21,105 +19,18 @@ from ray.serve._private.constants import ( SERVE_DEFAULT_APP_NAME, SERVE_MULTIPLEXED_MODEL_ID, - SERVE_NAMESPACE, ) from ray._private.usage.usage_lib import get_extra_usage_tags_to_report from ray.serve._private.usage import ServeUsageTag +from ray.serve.tests.utils import ( + check_ray_started, + start_telemetry_app, + TelemetryStorage, + TELEMETRY_ROUTE_PREFIX, +) -TELEMETRY_ROUTE_PREFIX = "/telemetry" -STORAGE_ACTOR_NAME = "storage" - - -def check_ray_stopped(): - try: - requests.get("http://localhost:52365/api/ray/version") - return False - except Exception: - return True - - -def check_ray_started(): - return requests.get("http://localhost:52365/api/ray/version").status_code == 200 - - -@pytest.fixture -def manage_ray(monkeypatch): - with monkeypatch.context() as m: - m.setenv("RAY_USAGE_STATS_ENABLED", "1") - m.setenv( - "RAY_USAGE_STATS_REPORT_URL", - f"http://127.0.0.1:8000{TELEMETRY_ROUTE_PREFIX}", - ) - m.setenv("RAY_USAGE_STATS_REPORT_INTERVAL_S", "1") - subprocess.check_output(["ray", "stop", "--force"]) - wait_for_condition(check_ray_stopped, timeout=5) - yield - - # Call Python API shutdown() methods to clear global variable state - serve.shutdown() - ray.shutdown() - - # Reset global state (any keys that may have been set and cached while the - # workload was running). - usage_lib.reset_global_state() - - # Shut down Ray cluster with CLI - subprocess.check_output(["ray", "stop", "--force"]) - wait_for_condition(check_ray_stopped, timeout=5) - - -@ray.remote(name=STORAGE_ACTOR_NAME, namespace=SERVE_NAMESPACE, num_cpus=0) -class TelemetryStorage: - def __init__(self): - self.reports_received = 0 - self.current_report = dict() - - def store_report(self, report: Dict) -> None: - self.reports_received += 1 - self.current_report = report - - def get_report(self) -> Dict: - return self.current_report - - def get_reports_received(self) -> int: - return self.reports_received - - -@serve.deployment(ray_actor_options={"num_cpus": 0}) -class TelemetryReceiver: - def __init__(self): - self.storage = ray.get_actor(name=STORAGE_ACTOR_NAME, namespace=SERVE_NAMESPACE) - - async def __call__(self, request: Request) -> bool: - report = await request.json() - ray.get(self.storage.store_report.remote(report)) - return True - - -receiver_app = TelemetryReceiver.bind() - - -def start_telemetry_app(): - """Start a telemetry Serve app. - - Ray should be initialized before calling this method. - - NOTE: If you're running the TelemetryReceiver Serve app to check telemetry, - remember that the receiver itself is counted in the telemetry. E.g. if you - deploy a Serve app other than the receiver, the number of apps in the - cluster is 2- not 1– since the receiver is also running. - - Returns a handle to a TelemetryStorage actor. You can use this actor - to access the latest telemetry reports. 
- """ - - storage = TelemetryStorage.remote() - serve.run(receiver_app, name="telemetry", route_prefix=TELEMETRY_ROUTE_PREFIX) - return storage - - -def test_fastapi_detected(manage_ray): +def test_fastapi_detected(manage_ray_with_telemetry): """ Check that FastAPI is detected by telemetry. """ @@ -170,7 +81,7 @@ async def app2(self): assert ServeUsageTag.REST_API_VERSION.get_value_from_report(report) is None -def test_grpc_detected(manage_ray): +def test_grpc_detected(manage_ray_with_telemetry): """ Check that gRPCIngress is detected by telemetry. """ @@ -216,7 +127,7 @@ def greeter(inputs: Dict[str, bytes]): @pytest.mark.parametrize("use_adapter", [True, False]) -def test_graph_detected(manage_ray, use_adapter): +def test_graph_detected(manage_ray_with_telemetry, use_adapter): """ Check that DAGDriver and HTTP adapters are detected by telemetry. """ @@ -278,7 +189,7 @@ class Stub: @pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.") @pytest.mark.parametrize("version", ["v1", "v2"]) -def test_rest_api(manage_ray, tmp_dir, version): +def test_rest_api(manage_ray_with_telemetry, tmp_dir, version): """ Check that telemetry works with REST API. """ @@ -289,13 +200,13 @@ def test_rest_api(manage_ray, tmp_dir, version): storage = TelemetryStorage.remote() if version == "v1": - config = {"import_path": "ray.serve.tests.test_telemetry.receiver_app"} + config = {"import_path": "ray.serve.tests.utils.receiver_app"} elif version == "v2": config = { "applications": [ { "name": "receiver_app", - "import_path": "ray.serve.tests.test_telemetry.receiver_app", + "import_path": "ray.serve.tests.utils.receiver_app", "route_prefix": TELEMETRY_ROUTE_PREFIX, }, { @@ -361,7 +272,7 @@ def test_rest_api(manage_ray, tmp_dir, version): "applications": [ { "name": "receiver_app", - "import_path": "ray.serve.tests.test_telemetry.receiver_app", + "import_path": "ray.serve.tests.utils.receiver_app", "route_prefix": TELEMETRY_ROUTE_PREFIX, }, ] @@ -406,7 +317,9 @@ def reconfigure(self, *args): ("autoscaling_config", {"max_replicas": 5}), ], ) -def test_lightweight_config_options(manage_ray, lightweight_option, value): +def test_lightweight_config_options( + manage_ray_with_telemetry, lightweight_option, value +): """ Check that lightweight config options are detected by telemetry. 
""" @@ -425,7 +338,7 @@ def test_lightweight_config_options(manage_ray, lightweight_option, value): "applications": [ { "name": "receiver_app", - "import_path": "ray.serve.tests.test_telemetry.receiver_app", + "import_path": "ray.serve.tests.utils.receiver_app", "route_prefix": TELEMETRY_ROUTE_PREFIX, }, { @@ -492,7 +405,9 @@ def test_lightweight_config_options(manage_ray, lightweight_option, value): @pytest.mark.parametrize("use_new_handle_api", [False, True]) @pytest.mark.parametrize("call_in_deployment", [False, True]) -def test_handle_apis_detected(manage_ray, use_new_handle_api, call_in_deployment): +def test_handle_apis_detected( + manage_ray_with_telemetry, use_new_handle_api, call_in_deployment +): """Check that the various handles are detected correctly by telemetry.""" subprocess.check_output(["ray", "start", "--head"]) @@ -570,7 +485,7 @@ def check_telemetry(): @pytest.mark.parametrize("mode", ["http", "outside_deployment", "inside_deployment"]) -def test_deployment_handle_to_obj_ref_detected(manage_ray, mode): +def test_deployment_handle_to_obj_ref_detected(manage_ray_with_telemetry, mode): """Check that the handle to_object_ref API is detected correctly by telemetry.""" subprocess.check_output(["ray", "start", "--head"]) @@ -642,7 +557,7 @@ def check_telemetry(tag_should_be_set: bool): wait_for_condition(check_telemetry, tag_should_be_set=True) -def test_multiplexed_detect(manage_ray): +def test_multiplexed_detect(manage_ray_with_telemetry): """Check that multiplexed api is detected by telemetry.""" subprocess.check_output(["ray", "start", "--head"]) diff --git a/python/ray/serve/tests/test_telemetry_2.py b/python/ray/serve/tests/test_telemetry_2.py new file mode 100644 index 000000000000..ccc34ec21e8f --- /dev/null +++ b/python/ray/serve/tests/test_telemetry_2.py @@ -0,0 +1,137 @@ +import sys +import pytest +import subprocess +import time + +import ray +from ray._private.test_utils import wait_for_condition + +from ray import serve +from ray.serve.tests.utils import ( + check_ray_started, + start_telemetry_app, + check_telemetry_recorded, + check_telemetry_not_recorded, +) + + +@pytest.mark.parametrize("location", ["driver", "deployment", None]) +def test_status_api_detected(manage_ray_with_telemetry, location): + """Check that serve.status is detected correctly by telemetry.""" + + subprocess.check_output(["ray", "start", "--head"]) + wait_for_condition(check_ray_started, timeout=5) + + storage_handle = start_telemetry_app() + wait_for_condition( + lambda: ray.get(storage_handle.get_reports_received.remote()) > 0, timeout=5 + ) + # Check telemetry is not recorded before test starts + check_telemetry_not_recorded(storage_handle, "serve_status_api_used") + + @serve.deployment + class Model: + async def __call__(self): + return serve.status() + + if location: + if location == "deployment": + handle = serve.run(Model.bind(), route_prefix="/model") + handle.remote() + elif location == "driver": + serve.status() + + wait_for_condition( + check_telemetry_recorded, + storage_handle=storage_handle, + key="serve_status_api_used", + expected_value="1", + ) + else: + for _ in range(3): + check_telemetry_not_recorded(storage_handle, "serve_status_api_used") + time.sleep(1) + + +@pytest.mark.parametrize("location", ["driver", "deployment", None]) +def test_get_app_handle_api_detected(manage_ray_with_telemetry, location): + """Check that serve.get_app_handle is detected correctly by telemetry.""" + + subprocess.check_output(["ray", "start", "--head"]) + 
wait_for_condition(check_ray_started, timeout=5) + + storage_handle = start_telemetry_app() + wait_for_condition( + lambda: ray.get(storage_handle.get_reports_received.remote()) > 0, timeout=5 + ) + # Check telemetry is not recorded before test starts + check_telemetry_not_recorded(storage_handle, "serve_get_app_handle_api_used") + + @serve.deployment + class Model: + async def __call__(self): + serve.get_app_handle("telemetry") + + if location: + if location == "deployment": + handle = serve.run(Model.bind(), route_prefix="/model") + handle.remote() + elif location == "driver": + serve.get_app_handle("telemetry") + + wait_for_condition( + check_telemetry_recorded, + storage_handle=storage_handle, + key="serve_get_app_handle_api_used", + expected_value="1", + ) + else: + for _ in range(3): + check_telemetry_not_recorded( + storage_handle, "serve_get_app_handle_api_used" + ) + time.sleep(1) + + +@pytest.mark.parametrize("location", ["driver", "deployment", None]) +def test_get_deployment_handle_api_detected(manage_ray_with_telemetry, location): + """Check that serve.get_deployment_handle is detected correctly by telemetry.""" + + subprocess.check_output(["ray", "start", "--head"]) + wait_for_condition(check_ray_started, timeout=5) + + storage_handle = start_telemetry_app() + wait_for_condition( + lambda: ray.get(storage_handle.get_reports_received.remote()) > 0, timeout=5 + ) + # Check telemetry is not recorded before test starts + check_telemetry_not_recorded(storage_handle, "serve_get_deployment_handle_api_used") + + @serve.deployment + class Model: + async def __call__(self): + serve.get_deployment_handle("TelemetryReceiver", "telemetry") + + if location: + if location == "deployment": + handle = serve.run(Model.bind(), route_prefix="/model") + handle.remote() + elif location == "driver": + serve.get_deployment_handle("TelemetryReceiver", "telemetry") + + wait_for_condition( + check_telemetry_recorded, + storage_handle=storage_handle, + key="serve_get_deployment_handle_api_used", + expected_value="1", + ) + else: + for _ in range(3): + check_telemetry_not_recorded( + storage_handle, "serve_get_deployment_handle_api_used" + ) + time.sleep(1) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", "-s", __file__])) diff --git a/python/ray/serve/tests/utils.py b/python/ray/serve/tests/utils.py index 71bdf56d381e..824bdd4d47ad 100644 --- a/python/ray/serve/tests/utils.py +++ b/python/ray/serve/tests/utils.py @@ -1,5 +1,18 @@ import time from typing import Any +import requests +from typing import Dict +from starlette.requests import Request + +import ray + +from ray import serve +from ray.serve._private.usage import ServeUsageTag +from ray.serve._private.constants import SERVE_NAMESPACE + + +TELEMETRY_ROUTE_PREFIX = "/telemetry" +STORAGE_ACTOR_NAME = "storage" class MockTimer: @@ -42,3 +55,81 @@ def delete(self, key: str) -> bool: return True return False + + +def check_ray_stopped(): + try: + requests.get("http://localhost:52365/api/ray/version") + return False + except Exception: + return True + + +def check_ray_started(): + return requests.get("http://localhost:52365/api/ray/version").status_code == 200 + + +def check_telemetry_recorded(storage_handle, key, expected_value): + report = ray.get(storage_handle.get_report.remote()) + assert report["extra_usage_tags"][key] == expected_value + return True + + +def check_telemetry_not_recorded(storage_handle, key): + report = ray.get(storage_handle.get_report.remote()) + assert ( + 
ServeUsageTag.DEPLOYMENT_HANDLE_TO_OBJECT_REF_API_USED.get_value_from_report( + report + ) + is None + ) + + +@ray.remote(name=STORAGE_ACTOR_NAME, namespace=SERVE_NAMESPACE, num_cpus=0) +class TelemetryStorage: + def __init__(self): + self.reports_received = 0 + self.current_report = dict() + + def store_report(self, report: Dict) -> None: + self.reports_received += 1 + self.current_report = report + + def get_report(self) -> Dict: + return self.current_report + + def get_reports_received(self) -> int: + return self.reports_received + + +@serve.deployment(ray_actor_options={"num_cpus": 0}) +class TelemetryReceiver: + def __init__(self): + self.storage = ray.get_actor(name=STORAGE_ACTOR_NAME, namespace=SERVE_NAMESPACE) + + async def __call__(self, request: Request) -> bool: + report = await request.json() + ray.get(self.storage.store_report.remote(report)) + return True + + +receiver_app = TelemetryReceiver.bind() + + +def start_telemetry_app(): + """Start a telemetry Serve app. + + Ray should be initialized before calling this method. + + NOTE: If you're running the TelemetryReceiver Serve app to check telemetry, + remember that the receiver itself is counted in the telemetry. E.g. if you + deploy a Serve app other than the receiver, the number of apps in the + cluster is 2- not 1– since the receiver is also running. + + Returns a handle to a TelemetryStorage actor. You can use this actor + to access the latest telemetry reports. + """ + + storage = TelemetryStorage.remote() + serve.run(receiver_app, name="telemetry", route_prefix=TELEMETRY_ROUTE_PREFIX) + return storage diff --git a/python/ray/tests/BUILD b/python/ray/tests/BUILD index 41940daebb23..33cea03e9dce 100644 --- a/python/ray/tests/BUILD +++ b/python/ray/tests/BUILD @@ -177,6 +177,8 @@ py_test_module_list( "test_draining.py", "test_streaming_generator.py", "test_streaming_generator_2.py", + "test_streaming_generator_3.py", + "test_scheduling_performance.py", ], size = "medium", tags = ["exclusive", "medium_size_python_tests_k_to_z", "team:core"], @@ -209,7 +211,6 @@ py_test_module_list( "test_runtime_env_agent.py", "test_redis_tls.py", "test_raylet_output.py", - "test_scheduling_performance.py", "test_top_level_api.py", "test_unhandled_error.py", "test_utils.py", diff --git a/python/ray/tests/chaos/streaming_llm.py b/python/ray/tests/chaos/streaming_llm.py new file mode 100644 index 000000000000..80034af6bf7e --- /dev/null +++ b/python/ray/tests/chaos/streaming_llm.py @@ -0,0 +1,106 @@ +import asyncio +import logging +import requests +import argparse + +from fastapi import FastAPI +from starlette.responses import StreamingResponse + +import ray +from ray import serve + +logger = logging.getLogger("ray.serve") + +fastapi_app = FastAPI() + + +# Input: a prompt of words +# Output: each word reversed and produced N times. 
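To make the transformation concrete before reading the deployment code: with `dup_times=2` (the value bound in `main()` below), a prompt like `"Lorem ipsum"` streams back as `"meroL meroL muspi muspi "`, including the trailing space the client later asserts on. A tiny pure-Python sketch of the same transformation:

```python
def reverse_and_dup(prompt: str, dup_times: int = 2) -> str:
    # Pure-Python equivalent of what the streamed response concatenates to.
    out = []
    for word in prompt.split():
        out.extend([word[::-1]] * dup_times)
    # Each streamed word is followed by a space, hence the trailing space.
    return " ".join(out) + " "

assert reverse_and_dup("Lorem ipsum") == "meroL meroL muspi muspi "
```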
+@serve.deployment( + num_replicas=6, ray_actor_options={"num_cpus": 0.01, "memory": 10 * 1024 * 1024} +) +class ReverseAndDupEachWord: + def __init__(self, dup_times: int): + self.dup_times = dup_times + + async def __call__(self, prompt: str): + for word in prompt.split(): + rev = word[::-1] + for _ in range(self.dup_times): + await asyncio.sleep(0.001) + yield rev + + +@serve.deployment( + num_replicas=6, ray_actor_options={"num_cpus": 0.01, "memory": 10 * 1024 * 1024} +) +@serve.ingress(fastapi_app) +class Textbot: + def __init__(self, llm): + self.llm = llm.options(stream=True) + + @fastapi_app.post("/") + async def handle_request(self, prompt: str) -> StreamingResponse: + logger.info(f'Got prompt: "{prompt}"') + remote_async_gen = await self.llm.remote(prompt) + return StreamingResponse( + self.local_async_gen(remote_async_gen), media_type="text/plain" + ) + + async def local_async_gen(self, iterable): + async for i in iterable: + yield await i + # Ideally we want to do " ".join(words), but for the sake of simplicity we + # also have an extra trailing space. + yield " " + + +@ray.remote(num_cpus=0.1, memory=10 * 1024 * 1024) +def make_http_query(num_words, num_queries): + for _ in range(num_queries): + words = "Lorem ipsum dolor sit amet".split() + prompt_words = [words[i % len(words)] for i in range(num_words)] + prompt = " ".join(prompt_words) + expected_words = [word[::-1] for word in prompt_words for _ in range(2)] + + response = requests.post(f"http://localhost:8000/?prompt={prompt}", stream=True) + response.raise_for_status() + content = response.content.decode() + assert content == " ".join(expected_words) + " ", content + + +def main(): + parser = argparse.ArgumentParser(description="Generates HTTP workloads with Ray.") + + parser.add_argument("--num_tasks", type=int, required=True, help="Number of tasks.") + parser.add_argument( + "--num_queries_per_task", + type=int, + required=True, + help="Number of queries per task.", + ) + parser.add_argument( + "--num_words_per_query", + type=int, + required=True, + help="Number of words per query", + ) + + args = parser.parse_args() + + # Run the serve, run the client, then showdown serve. 
+ llm = ReverseAndDupEachWord.bind(2) + app = Textbot.bind(llm) + + serve.run(app) + + objs = [ + make_http_query.remote(args.num_words_per_query, args.num_queries_per_task) + for _ in range(args.num_tasks) + ] + ray.get(objs) + + serve.shutdown() + + +main() diff --git a/python/ray/tests/chaos/streaming_llm.yaml b/python/ray/tests/chaos/streaming_llm.yaml new file mode 100644 index 000000000000..7f09f07f4c49 --- /dev/null +++ b/python/ray/tests/chaos/streaming_llm.yaml @@ -0,0 +1,2 @@ +env_vars: + RAY_DEDUP_LOGS: "0" diff --git a/python/ray/tests/test_streaming_generator.py b/python/ray/tests/test_streaming_generator.py index b597de6d542e..2d0275c850bf 100644 --- a/python/ray/tests/test_streaming_generator.py +++ b/python/ray/tests/test_streaming_generator.py @@ -6,8 +6,6 @@ import threading import gc -from collections import Counter - from unittest.mock import patch, Mock import ray @@ -17,17 +15,6 @@ from ray.cloudpickle import dumps from ray.exceptions import WorkerCrashedError -RECONSTRUCTION_CONFIG = { - "health_check_failure_threshold": 10, - "health_check_period_ms": 100, - "health_check_timeout_ms": 100, - "health_check_initial_delay_ms": 0, - "max_direct_call_object_size": 100, - "task_retry_delay_ms": 100, - "object_timeout_milliseconds": 200, - "fetch_warn_timeout_milliseconds": 1000, -} - class MockedWorker: def __init__(self, mocked_core_worker): @@ -587,227 +574,6 @@ async def async_f(self): ray.get(next(g)) -def test_threaded_actor_generator(shutdown_only): - ray.init() - - @ray.remote(max_concurrency=10) - class Actor: - def f(self): - for i in range(30): - time.sleep(0.1) - yield np.ones(1024 * 1024) * i - - @ray.remote(max_concurrency=20) - class AsyncActor: - async def f(self): - for i in range(30): - await asyncio.sleep(0.1) - yield np.ones(1024 * 1024) * i - - async def main(): - a = Actor.remote() - asy = AsyncActor.remote() - - async def run(): - i = 0 - async for ref in a.f.options(num_returns="streaming").remote(): - val = ray.get(ref) - print(val) - print(ref) - assert np.array_equal(val, np.ones(1024 * 1024) * i) - i += 1 - del ref - - async def run2(): - i = 0 - async for ref in asy.f.options(num_returns="streaming").remote(): - val = await ref - print(ref) - print(val) - assert np.array_equal(val, np.ones(1024 * 1024) * i), ref - i += 1 - del ref - - coroutines = [run() for _ in range(10)] - coroutines = [run2() for _ in range(20)] - - await asyncio.gather(*coroutines) - - asyncio.run(main()) - - -def test_generator_dist_gather(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node(num_cpus=0, object_store_memory=1 * 1024 * 1024 * 1024) - ray.init() - cluster.add_node(num_cpus=1) - cluster.add_node(num_cpus=1) - cluster.add_node(num_cpus=1) - cluster.add_node(num_cpus=1) - - @ray.remote(num_cpus=1) - class Actor: - def __init__(self, child=None): - self.child = child - - def get_data(self): - for _ in range(10): - time.sleep(0.1) - yield np.ones(5 * 1024 * 1024) - - async def all_gather(): - actor = Actor.remote() - async for ref in actor.get_data.options(num_returns="streaming").remote(): - val = await ref - assert np.array_equal(np.ones(5 * 1024 * 1024), val) - del ref - - async def main(): - await asyncio.gather(all_gather(), all_gather(), all_gather(), all_gather()) - - asyncio.run(main()) - summary = ray._private.internal_api.memory_summary(stats_only=True) - print(summary) - - -def test_generator_wait(shutdown_only): - """ - Make sure the generator works with ray.wait. 
- """ - ray.init(num_cpus=8) - - @ray.remote - def f(sleep_time): - for i in range(2): - time.sleep(sleep_time) - yield i - - @ray.remote - def g(sleep_time): - time.sleep(sleep_time) - return 10 - - gen = f.options(num_returns="streaming").remote(1) - - """ - Test basic cases. - """ - for expected_rval in [0, 1]: - s = time.time() - r, ur = ray.wait([gen], num_returns=1) - print(time.time() - s) - assert len(r) == 1 - assert ray.get(next(r[0])) == expected_rval - assert len(ur) == 0 - - # Should raise a stop iteration. - for _ in range(3): - s = time.time() - r, ur = ray.wait([gen], num_returns=1) - print(time.time() - s) - assert len(r) == 1 - with pytest.raises(StopIteration): - assert next(r[0]) == 0 - assert len(ur) == 0 - - gen = f.options(num_returns="streaming").remote(0) - # Wait until the generator task finishes - ray.get(gen._generator_ref) - for i in range(2): - r, ur = ray.wait([gen], timeout=0) - assert len(r) == 1 - assert len(ur) == 0 - assert ray.get(next(r[0])) == i - - """ - Test the case ref is mixed with regular object ref. - """ - gen = f.options(num_returns="streaming").remote(0) - ref = g.remote(3) - ready, unready = [], [gen, ref] - result_set = set() - while unready: - ready, unready = ray.wait(unready) - print(ready, unready) - assert len(ready) == 1 - for r in ready: - if isinstance(r, StreamingObjectRefGenerator): - try: - ref = next(r) - print(ref) - print(ray.get(ref)) - result_set.add(ray.get(ref)) - except StopIteration: - pass - else: - unready.append(r) - else: - result_set.add(ray.get(r)) - - assert result_set == {0, 1, 10} - - """ - Test timeout. - """ - gen = f.options(num_returns="streaming").remote(3) - ref = g.remote(1) - ready, unready = ray.wait([gen, ref], timeout=2) - assert len(ready) == 1 - assert len(unready) == 1 - - """ - Test num_returns - """ - gen = f.options(num_returns="streaming").remote(1) - ref = g.remote(1) - ready, unready = ray.wait([ref, gen], num_returns=2) - assert len(ready) == 2 - assert len(unready) == 0 - - -def test_generator_wait_e2e(shutdown_only): - ray.init(num_cpus=8) - - @ray.remote - def f(sleep_time): - for i in range(2): - time.sleep(sleep_time) - yield i - - @ray.remote - def g(sleep_time): - time.sleep(sleep_time) - return 10 - - gen = [f.options(num_returns="streaming").remote(1) for _ in range(4)] - ref = [g.remote(2) for _ in range(4)] - ready, unready = [], [*gen, *ref] - result = [] - start = time.time() - while unready: - ready, unready = ray.wait(unready, num_returns=len(unready), timeout=0.1) - for r in ready: - if isinstance(r, StreamingObjectRefGenerator): - try: - ref = next(r) - result.append(ray.get(ref)) - except StopIteration: - pass - else: - unready.append(r) - else: - result.append(ray.get(r)) - elapsed = time.time() - start - assert elapsed < 4 - assert 2 < elapsed - - assert len(result) == 12 - result = Counter(result) - assert result[0] == 4 - assert result[1] == 4 - assert result[10] == 4 - - if __name__ == "__main__": import os diff --git a/python/ray/tests/test_streaming_generator_3.py b/python/ray/tests/test_streaming_generator_3.py new file mode 100644 index 000000000000..7190a22edbb7 --- /dev/null +++ b/python/ray/tests/test_streaming_generator_3.py @@ -0,0 +1,240 @@ +import asyncio +import pytest +import numpy as np +import sys +import time + +from collections import Counter + +import ray +from ray._raylet import StreamingObjectRefGenerator + + +def test_threaded_actor_generator(shutdown_only): + ray.init() + + @ray.remote(max_concurrency=10) + class Actor: + def f(self): + for i in 
range(30): + time.sleep(0.1) + yield np.ones(1024 * 1024) * i + + @ray.remote(max_concurrency=20) + class AsyncActor: + async def f(self): + for i in range(30): + await asyncio.sleep(0.1) + yield np.ones(1024 * 1024) * i + + async def main(): + a = Actor.remote() + asy = AsyncActor.remote() + + async def run(): + i = 0 + async for ref in a.f.options(num_returns="streaming").remote(): + val = ray.get(ref) + print(val) + print(ref) + assert np.array_equal(val, np.ones(1024 * 1024) * i) + i += 1 + del ref + + async def run2(): + i = 0 + async for ref in asy.f.options(num_returns="streaming").remote(): + val = await ref + print(ref) + print(val) + assert np.array_equal(val, np.ones(1024 * 1024) * i), ref + i += 1 + del ref + + coroutines = [run() for _ in range(10)] + coroutines = [run2() for _ in range(20)] + + await asyncio.gather(*coroutines) + + asyncio.run(main()) + + +def test_generator_dist_gather(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node(num_cpus=0, object_store_memory=1 * 1024 * 1024 * 1024) + ray.init() + cluster.add_node(num_cpus=1) + cluster.add_node(num_cpus=1) + cluster.add_node(num_cpus=1) + cluster.add_node(num_cpus=1) + + @ray.remote(num_cpus=1) + class Actor: + def __init__(self, child=None): + self.child = child + + def get_data(self): + for _ in range(10): + time.sleep(0.1) + yield np.ones(5 * 1024 * 1024) + + async def all_gather(): + actor = Actor.remote() + async for ref in actor.get_data.options(num_returns="streaming").remote(): + val = await ref + assert np.array_equal(np.ones(5 * 1024 * 1024), val) + del ref + + async def main(): + await asyncio.gather(all_gather(), all_gather(), all_gather(), all_gather()) + + asyncio.run(main()) + summary = ray._private.internal_api.memory_summary(stats_only=True) + print(summary) + + +def test_generator_wait(shutdown_only): + """ + Make sure the generator works with ray.wait. + """ + ray.init(num_cpus=8) + + @ray.remote + def f(sleep_time): + for i in range(2): + time.sleep(sleep_time) + yield i + + @ray.remote + def g(sleep_time): + time.sleep(sleep_time) + return 10 + + gen = f.options(num_returns="streaming").remote(1) + + """ + Test basic cases. + """ + for expected_rval in [0, 1]: + s = time.time() + r, ur = ray.wait([gen], num_returns=1) + print(time.time() - s) + assert len(r) == 1 + assert ray.get(next(r[0])) == expected_rval + assert len(ur) == 0 + + # Should raise a stop iteration. + for _ in range(3): + s = time.time() + r, ur = ray.wait([gen], num_returns=1) + print(time.time() - s) + assert len(r) == 1 + with pytest.raises(StopIteration): + assert next(r[0]) == 0 + assert len(ur) == 0 + + gen = f.options(num_returns="streaming").remote(0) + # Wait until the generator task finishes + ray.get(gen._generator_ref) + for i in range(2): + r, ur = ray.wait([gen], timeout=0) + assert len(r) == 1 + assert len(ur) == 0 + assert ray.get(next(r[0])) == i + + """ + Test the case ref is mixed with regular object ref. + """ + gen = f.options(num_returns="streaming").remote(0) + ref = g.remote(3) + ready, unready = [], [gen, ref] + result_set = set() + while unready: + ready, unready = ray.wait(unready) + print(ready, unready) + assert len(ready) == 1 + for r in ready: + if isinstance(r, StreamingObjectRefGenerator): + try: + ref = next(r) + print(ref) + print(ray.get(ref)) + result_set.add(ray.get(ref)) + except StopIteration: + pass + else: + unready.append(r) + else: + result_set.add(ray.get(r)) + + assert result_set == {0, 1, 10} + + """ + Test timeout. 
+ """ + gen = f.options(num_returns="streaming").remote(3) + ref = g.remote(1) + ready, unready = ray.wait([gen, ref], timeout=2) + assert len(ready) == 1 + assert len(unready) == 1 + + """ + Test num_returns + """ + gen = f.options(num_returns="streaming").remote(1) + ref = g.remote(1) + ready, unready = ray.wait([ref, gen], num_returns=2) + assert len(ready) == 2 + assert len(unready) == 0 + + +def test_generator_wait_e2e(shutdown_only): + ray.init(num_cpus=8) + + @ray.remote + def f(sleep_time): + for i in range(2): + time.sleep(sleep_time) + yield i + + @ray.remote + def g(sleep_time): + time.sleep(sleep_time) + return 10 + + gen = [f.options(num_returns="streaming").remote(1) for _ in range(4)] + ref = [g.remote(2) for _ in range(4)] + ready, unready = [], [*gen, *ref] + result = [] + start = time.time() + while unready: + ready, unready = ray.wait(unready, num_returns=len(unready), timeout=0.1) + for r in ready: + if isinstance(r, StreamingObjectRefGenerator): + try: + ref = next(r) + result.append(ray.get(ref)) + except StopIteration: + pass + else: + unready.append(r) + else: + result.append(ray.get(r)) + elapsed = time.time() - start + assert elapsed < 4 + assert 2 < elapsed + + assert len(result) == 12 + result = Counter(result) + assert result[0] == 4 + assert result[1] == 4 + assert result[10] == 4 + + +if __name__ == "__main__": + import os + + if os.environ.get("PARALLEL_CI"): + sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) + else: + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/train/_checkpoint.py b/python/ray/train/_checkpoint.py index ac42a94c2b2d..0b60b4ea7799 100644 --- a/python/ray/train/_checkpoint.py +++ b/python/ray/train/_checkpoint.py @@ -30,9 +30,9 @@ class Checkpoint: """A reference to data persisted as a directory in local or remote storage. - Access checkpoint contents locally using `checkpoint.to_directory()`. + Access checkpoint contents locally using ``checkpoint.to_directory()``. 
- Example creating a checkpoint using `Checkpoint.from_directory`: + Example creating a checkpoint using ``Checkpoint.from_directory``: >>> from ray.train._checkpoint import Checkpoint >>> checkpoint = Checkpoint.from_directory("/tmp/example_checkpoint_dir") diff --git a/python/ray/train/_internal/storage.py b/python/ray/train/_internal/storage.py index c5b3037d77c7..9f52df7acec0 100644 --- a/python/ray/train/_internal/storage.py +++ b/python/ray/train/_internal/storage.py @@ -26,7 +26,7 @@ from ray.air._internal.filelock import TempFileLock -from ray.air._internal.uri_utils import URI, is_uri +from ray.air._internal.uri_utils import is_uri from ray.tune.syncer import Syncer, SyncConfig, _BackgroundSyncer from ray.tune.result import _get_defaults_results_dir @@ -63,7 +63,7 @@ def fsid(self): return "_excluding_local" def _should_exclude(self, name: str) -> bool: - """Return True if `name` matches any of the `self._exclude` patterns.""" + """Return True if `name` matches any of the ``self._exclude`` patterns.""" alt = None if os.path.isdir(name): # If this is a directory, also test it with trailing slash @@ -402,10 +402,6 @@ class StorageContext: >>> storage.current_checkpoint_index = 1 >>> storage.checkpoint_fs_path 'bucket/path/exp_name/trial_dir/checkpoint_000001' - >>> storage.storage_prefix - URI - >>> str(storage.storage_prefix / storage.experiment_fs_path) - 'mock://netloc/bucket/path/exp_name?param=1' Example with storage_path=None: @@ -429,10 +425,6 @@ class StorageContext: True >>> storage.storage_filesystem # Auto-resolved # doctest: +ELLIPSIS >> storage.storage_prefix - URI<.> - >>> str(storage.storage_prefix / storage.experiment_fs_path) - '/tmp/ray_results/exp_name' Internal Usage Examples: - To copy files to the trial directory on the storage filesystem: @@ -471,18 +463,6 @@ def __init__( self.storage_path, storage_filesystem ) - # The storage prefix is part of the URI that is stripped away - # from the user-provided `storage_path` by pyarrow's `from_uri`. - # Ex: `storage_path="s3://bucket/path?param=1` - # -> `storage_prefix=URI` - # See the doctests for more examples. - # This is used to construct URI's of the same format as `storage_path`. - # However, we don't track these URI's internally, because pyarrow only - # needs to interact with the prefix-stripped fs_path. - self.storage_prefix: URI = URI(self.storage_path).rstrip_subpath( - Path(self.storage_fs_path) - ) - # Syncing is always needed if a custom `storage_filesystem` is provided. # Otherwise, syncing is only needed if storage_local_path # and storage_fs_path point to different locations. 
diff --git a/python/ray/train/lightning/_lightning_utils.py b/python/ray/train/lightning/_lightning_utils.py index 5ad917f469ef..23065c291ff1 100644 --- a/python/ray/train/lightning/_lightning_utils.py +++ b/python/ray/train/lightning/_lightning_utils.py @@ -9,7 +9,6 @@ import shutil import torch import tempfile -from tempfile import TemporaryDirectory from ray.train import Checkpoint from ray.train._checkpoint import Checkpoint as NewCheckpoint from ray.train._internal.storage import _use_storage_context @@ -197,26 +196,40 @@ def prepare_trainer(trainer: pl.Trainer) -> pl.Trainer: class RayTrainReportCallback(Callback): """A simple callback that reports checkpoints to Ray on train epoch end.""" + def __init__(self) -> None: + super().__init__() + self.trial_name = train.get_context().get_trial_name() + self.local_rank = train.get_context().get_local_rank() + self.tmpdir_prefix = os.path.join(tempfile.gettempdir(), self.trial_name) + if os.path.isdir(self.tmpdir_prefix) and self.local_rank == 0: + shutil.rmtree(self.tmpdir_prefix) + def on_train_epoch_end(self, trainer, pl_module) -> None: - with TemporaryDirectory() as tmpdir: - # Fetch metrics - metrics = trainer.callback_metrics - metrics = {k: v.item() for k, v in metrics.items()} + # Creates a checkpoint dir with fixed name + tmpdir = os.path.join(self.tmpdir_prefix, str(trainer.current_epoch)) + os.makedirs(tmpdir, exist_ok=True) - # (Optional) Add customized metrics - metrics["epoch"] = trainer.current_epoch - metrics["step"] = trainer.global_step + # Fetch metrics + metrics = trainer.callback_metrics + metrics = {k: v.item() for k, v in metrics.items()} - # Save checkpoint to local - ckpt_path = os.path.join(tmpdir, "checkpoint.ckpt") - trainer.save_checkpoint(ckpt_path, weights_only=False) + # (Optional) Add customized metrics + metrics["epoch"] = trainer.current_epoch + metrics["step"] = trainer.global_step - # Report to train session - if _use_storage_context(): - checkpoint = NewCheckpoint.from_directory(tmpdir) - else: - checkpoint = Checkpoint.from_directory(tmpdir) - train.report(metrics=metrics, checkpoint=checkpoint) + # Save checkpoint to local + ckpt_path = os.path.join(tmpdir, "checkpoint.ckpt") + trainer.save_checkpoint(ckpt_path, weights_only=False) + + # Report to train session + if _use_storage_context(): + checkpoint = NewCheckpoint.from_directory(tmpdir) + else: + checkpoint = Checkpoint.from_directory(tmpdir) + train.report(metrics=metrics, checkpoint=checkpoint) + + if self.local_rank == 0: + shutil.rmtree(tmpdir) class RayIterableDataset(IterableDataset): diff --git a/python/ray/train/tests/test_minimal.py b/python/ray/train/tests/test_minimal.py index 2cf848ac6c61..9a5fe8c2e230 100644 --- a/python/ray/train/tests/test_minimal.py +++ b/python/ray/train/tests/test_minimal.py @@ -2,10 +2,11 @@ import ray from ray import train -from ray.train import Checkpoint, ScalingConfig +from ray.train import ScalingConfig from ray.train._internal.worker_group import WorkerGroup from ray.train.backend import Backend, BackendConfig from ray.train.data_parallel_trainer import DataParallelTrainer +from ray.train.tests.util import create_dict_checkpoint, load_dict_checkpoint @pytest.fixture @@ -39,26 +40,23 @@ def test_run(ray_start_4_cpus): def train_func(): checkpoint = train.get_checkpoint() - train.report(metrics=checkpoint.to_dict(), checkpoint=checkpoint) - return checkpoint.to_dict()[key] - - checkpoint = Checkpoint.from_dict( - { - # this would be set during checkpoint saving - "_current_checkpoint_id": 1, - key: value, - 
} - ) - - trainer = DataParallelTrainer( - train_func, - backend_config=config, - resume_from_checkpoint=checkpoint, - scaling_config=ScalingConfig(num_workers=num_workers), - ) - results = trainer.fit() - - assert results.checkpoint.to_dict()[key] == checkpoint.to_dict()[key] + checkpoint_dict = load_dict_checkpoint(checkpoint) + train.report(metrics=checkpoint_dict, checkpoint=checkpoint) + return checkpoint_dict[key] + + with create_dict_checkpoint({key: value}) as checkpoint: + + trainer = DataParallelTrainer( + train_func, + backend_config=config, + resume_from_checkpoint=checkpoint, + scaling_config=ScalingConfig(num_workers=num_workers), + ) + results = trainer.fit() + + assert load_dict_checkpoint(results.checkpoint) == load_dict_checkpoint( + checkpoint + ) def test_failure(): diff --git a/python/ray/train/tests/test_new_persistence.py b/python/ray/train/tests/test_new_persistence.py index e1d7a39a5b8e..ba329170c32e 100644 --- a/python/ray/train/tests/test_new_persistence.py +++ b/python/ray/train/tests/test_new_persistence.py @@ -117,23 +117,6 @@ def _get_local_inspect_dir( return local_inspect_dir, storage_fs_path -def _convert_path_to_fs_path( - path: str, storage_filesystem: Optional[pyarrow.fs.FileSystem] -) -> str: - """Converts a path to a (prefix-stripped) filesystem path. - - Ex: "s3://bucket/path/to/file" -> "bucket/path/to/file" - Ex: "/mnt/nfs/path/to/file" -> "/mnt/nfs/bucket/path/to/file" - """ - if not storage_filesystem: - _, fs_path = pyarrow.fs.FileSystem.from_uri(path) - return fs_path - - # Otherwise, we're using a custom filesystem, - # and the provided path is already the fs path. - return path - - def _get_checkpoint_index(checkpoint_dir_name: str) -> int: """Gets the checkpoint index from the checkpoint directory name.""" return int(checkpoint_dir_name.split("_")[-1]) @@ -454,10 +437,12 @@ def test_tuner( # First, check that the ResultGrid returns the correct paths. print(result_grid) experiment_fs_path = result_grid.experiment_path + assert isinstance(result_grid.filesystem, pyarrow.fs.FileSystem), result_grid assert experiment_fs_path == os.path.join(storage_fs_path, exp_name) assert len(result_grid) == NUM_TRIALS for result in result_grid: - trial_fs_path = _convert_path_to_fs_path(result.path, storage_filesystem) + trial_fs_path = result.path + assert isinstance(result.filesystem, pyarrow.fs.FileSystem), result assert trial_fs_path.startswith(experiment_fs_path) for checkpoint, _ in result.best_checkpoints: assert checkpoint.path.startswith(trial_fs_path) @@ -577,7 +562,7 @@ def test_trainer( # First, inspect that the result object returns the correct paths. 
print(result) - trial_fs_path = _convert_path_to_fs_path(result.path, storage_filesystem) + trial_fs_path = result.path assert trial_fs_path.startswith(storage_fs_path) for checkpoint, _ in result.best_checkpoints: assert checkpoint.path.startswith(trial_fs_path) diff --git a/python/ray/train/tests/util.py b/python/ray/train/tests/util.py index 966d8dc516dd..99079596e4d9 100644 --- a/python/ray/train/tests/util.py +++ b/python/ray/train/tests/util.py @@ -5,6 +5,7 @@ import ray.cloudpickle as ray_pickle from ray.train._checkpoint import Checkpoint +from ray.train._internal.storage import StorageContext @contextlib.contextmanager @@ -22,3 +23,11 @@ def load_dict_checkpoint(checkpoint: Checkpoint) -> Dict[str, Any]: with checkpoint.as_directory() as checkpoint_dir: with open(os.path.join(checkpoint_dir, "data.pkl"), "rb") as f: return ray_pickle.load(f) + + +def mock_storage_context() -> StorageContext: + return StorageContext( + storage_path=tempfile.mkdtemp(), + experiment_dir_name="exp_name", + trial_dir_name="trial_name", + ) diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD index f015e9731b18..ce313a0bd53e 100644 --- a/python/ray/tune/BUILD +++ b/python/ray/tune/BUILD @@ -477,7 +477,7 @@ py_test( size = "large", srcs = ["tests/execution/test_controller_checkpointing_integration.py"], deps = [":tune_lib"], - tags = ["team:ml", "exclusive", "no_new_storage"] + tags = ["team:ml", "exclusive", "new_storage"] ) py_test( diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index d9dd05fc15b0..f499e7c19267 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -27,7 +27,6 @@ ) from ray.train._internal.storage import ( _use_storage_context, - _is_directory, _list_at_fs_path, _exists_at_fs_path, get_fs_and_path, @@ -88,10 +87,11 @@ class NewExperimentAnalysis: def __init__( self, experiment_checkpoint_path: Union[str, os.PathLike], + *, + storage_filesystem: Optional[pyarrow.fs.FileSystem] = None, trials: Optional[List[Trial]] = None, default_metric: Optional[str] = None, default_mode: Optional[str] = None, - storage_filesystem: Optional[pyarrow.fs.FileSystem] = None, ): self.default_metric = default_metric if default_mode and default_mode not in ["min", "max"]: @@ -101,41 +101,31 @@ def __init__( # If only a mode was passed, use anonymous metric self.default_metric = DEFAULT_METRIC - ( - self._fs, - self._experiment_fs_path, - self._experiment_json_fs_path, - ) = self._get_experiment_fs_and_path( - experiment_checkpoint_path, storage_filesystem - ) - - self.trials = trials or self._load_trials() - self._trial_dataframes = self._fetch_trial_dataframes() - self._configs = self.get_all_configs() - - def _get_experiment_fs_and_path( - self, - experiment_path: Union[str, os.PathLike], - storage_filesystem: Optional[pyarrow.fs.FileSystem], - ) -> Tuple[pyarrow.fs.FileSystem, str, str]: - """Returns the filesystem and paths to the experiment directory - + the experiment checkpoint file.""" + # Resolve the filesystem if not specified. 
if storage_filesystem: - fs, experiment_fs_path = storage_filesystem, str(experiment_path) + self._fs = storage_filesystem else: - fs, experiment_fs_path = get_fs_and_path(experiment_path) + self._fs, experiment_checkpoint_path = get_fs_and_path( + experiment_checkpoint_path + ) - if not _is_directory(fs, experiment_fs_path): - experiment_json_fs_path = experiment_fs_path - experiment_fs_path = os.path.dirname(experiment_fs_path) + # Find the json state file. + experiment_checkpoint_path = str(experiment_checkpoint_path) + if experiment_checkpoint_path.endswith(".json"): + self._experiment_fs_path = os.path.dirname(experiment_checkpoint_path) + self._experiment_json_fs_path = experiment_checkpoint_path else: - experiment_json_fs_path = os.path.join( - experiment_fs_path, + self._experiment_fs_path = experiment_checkpoint_path + self._experiment_json_fs_path = os.path.join( + experiment_checkpoint_path, NewExperimentAnalysis._find_newest_experiment_checkpoint( - fs, experiment_fs_path + self._fs, experiment_checkpoint_path ), ) - return fs, experiment_fs_path, experiment_json_fs_path + + self.trials = trials or self._load_trials() + self._trial_dataframes = self._fetch_trial_dataframes() + self._configs = self.get_all_configs() def _load_trials(self) -> List[Trial]: with self._fs.open_input_stream(self._experiment_json_fs_path) as f: diff --git a/python/ray/tune/execution/tune_controller.py b/python/ray/tune/execution/tune_controller.py index f5d377d64531..28f0795d8bbe 100644 --- a/python/ray/tune/execution/tune_controller.py +++ b/python/ray/tune/execution/tune_controller.py @@ -421,7 +421,7 @@ def experiment_state_path(self) -> str: @property def experiment_path(self) -> str: if _use_storage_context(): - return str(self._storage.storage_prefix / self._storage.experiment_fs_path) + return self._storage.experiment_fs_path return self._legacy_remote_experiment_path or self._legacy_local_experiment_path @@ -2051,8 +2051,7 @@ def _checkpoint_trial_if_needed(self, trial, force=False): # RESTORE def _schedule_trial_restore(self, trial: Trial) -> bool: if _use_storage_context(): - cpm = trial.run_metadata.checkpoint_manager - checkpoint_result = cpm.latest_checkpoint_result + checkpoint_result = trial.latest_checkpoint_result if not checkpoint_result: logger.debug(f"Not restoring trial {trial}: No checkpoint found.") diff --git a/python/ray/tune/experiment/experiment.py b/python/ray/tune/experiment/experiment.py index 716ef8c48466..ef5afdfe5294 100644 --- a/python/ray/tune/experiment/experiment.py +++ b/python/ray/tune/experiment/experiment.py @@ -508,7 +508,7 @@ def local_dir(self): @property def remote_path(self) -> Optional[str]: if _use_storage_context(): - return str(self.storage.storage_prefix / self.storage.experiment_fs_path) + return self.storage.experiment_fs_path if not self._legacy_remote_storage_path: return None diff --git a/python/ray/tune/experiment/trial.py b/python/ray/tune/experiment/trial.py index 0dd98e9b50ab..7aef47649063 100644 --- a/python/ray/tune/experiment/trial.py +++ b/python/ray/tune/experiment/trial.py @@ -549,6 +549,12 @@ def __init__( # Restoration fields self.restore_path = restore_path + self._restore_checkpoint_result: Optional[_TrainingResult] = None + if restore_path: + # tune.run(restore) passes in a path without metrics. 
+ self._restore_checkpoint_result = _TrainingResult( + checkpoint=Checkpoint.from_directory(restore_path), metrics={} + ) if trial_name_creator: self.custom_trial_name = trial_name_creator(self) @@ -693,7 +699,7 @@ def experiment_dir_name(self, name: str): @property def remote_experiment_path(self) -> str: if _use_storage_context(): - return str(self.storage.storage_prefix / self.storage.experiment_fs_path) + return self.storage.experiment_fs_path return str(self._legacy_remote_experiment_path) @@ -803,7 +809,7 @@ def remote_path(self) -> Optional[str]: @property def path(self) -> Optional[str]: if _use_storage_context(): - return str(self.storage.storage_prefix / self.storage.trial_fs_path) + return self.storage.trial_fs_path return self.remote_path or self.local_path @@ -832,15 +838,22 @@ def checkpoint_freq(self): config = self.run_metadata.checkpoint_manager.checkpoint_config return config.checkpoint_frequency + @property + def latest_checkpoint_result(self) -> Optional[_TrainingResult]: + # NOTE: Fallback to the checkpoint passed in from `tune.run(restore)` + # if the trial hasn't saved any checkpoints itself yet. + return ( + self.run_metadata.checkpoint_manager.latest_checkpoint_result + or self._restore_checkpoint_result + ) + @property def checkpoint(self) -> Optional[Checkpoint]: """Returns the most recent checkpoint if one has been saved.""" if _use_storage_context(): - checkpoint_manager = self.run_metadata.checkpoint_manager - latest_checkpoint_result = checkpoint_manager.latest_checkpoint_result return ( - latest_checkpoint_result.checkpoint - if latest_checkpoint_result + self.latest_checkpoint_result.checkpoint + if self.latest_checkpoint_result else None ) @@ -1070,9 +1083,19 @@ def should_checkpoint(self): ) def has_checkpoint(self): + if _use_storage_context(): + return self.checkpoint is not None + return self.checkpoint.dir_or_data is not None def clear_checkpoint(self): + if _use_storage_context(): + if self.latest_checkpoint_result: + self.latest_checkpoint_result.checkpoint = None + self.temporary_state.restoring_from = None + self.run_metadata.invalidate_cache() + return + self.checkpoint.dir_or_data = None self.temporary_state.restoring_from = None self.run_metadata.invalidate_cache() diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index f5d98619d0e4..654af2df6f90 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,6 +1,7 @@ from functools import partial import os import pandas as pd +import pyarrow from typing import Optional, Union from ray.air.result import Result @@ -8,6 +9,7 @@ from ray.cloudpickle import cloudpickle from ray.exceptions import RayTaskError from ray.tune.analysis import ExperimentAnalysis +from ray.tune.analysis.experiment_analysis import NewExperimentAnalysis from ray.tune.error import TuneError from ray.tune.experiment import Trial from ray.tune.trainable.util import TrainableUtil @@ -91,6 +93,15 @@ def experiment_path(self) -> str: location (path on the head node).""" return self._experiment_analysis.experiment_path + @property + def filesystem(self) -> pyarrow.fs.FileSystem: + """Return the filesystem that can be used to access the experiment path. + + Returns: + pyarrow.fs.FileSystem implementation. 
+ """ + return self._experiment_analysis._fs + def get_best_result( self, metric: Optional[str] = None, @@ -307,6 +318,9 @@ def _trial_to_result(self, trial: Trial) -> Result: error=self._populate_exception(trial), _local_path=trial.local_path, _remote_path=trial.remote_path, + _storage_filesystem=self._experiment_analysis._fs + if isinstance(self._experiment_analysis, NewExperimentAnalysis) + else None, metrics_dataframe=metrics_df, best_checkpoints=best_checkpoints, ) diff --git a/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py b/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py index 72665f7cddd4..93d080c3df8d 100644 --- a/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py +++ b/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py @@ -1,22 +1,27 @@ import json import os -import shutil import pytest import sys import ray from ray.train import CheckpointConfig -from ray.air._internal.checkpoint_manager import _TrackedCheckpoint, CheckpointStorage from ray.air.execution import FixedResourceManager, PlacementGroupResourceManager from ray.air.constants import TRAINING_ITERATION +from ray.train._checkpoint import Checkpoint +from ray.train._internal.session import _TrainingResult +from ray.train._internal.storage import StorageContext from ray.tune import PlacementGroupFactory from ray.tune.execution.tune_controller import TuneController from ray.tune.experiment import Trial from ray.tune.result import DONE from ray.tune.schedulers import FIFOScheduler from ray.tune.search import BasicVariantGenerator -from ray.tune.trainable import TrainableUtil + +from ray.train.tests.util import mock_storage_context + + +STORAGE = mock_storage_context() @pytest.fixture(scope="function") @@ -61,13 +66,13 @@ def test_checkpoint_save_restore( Legacy test: test_trial_runner_2.py::TrialRunnerTest::testRestoreMetricsAfterCheckpointing # noqa """ runner = TuneController( - resource_manager_factory=lambda: resource_manager_cls(), - experiment_path=str(tmpdir), + resource_manager_factory=lambda: resource_manager_cls(), storage=STORAGE ) kwargs = { "stopping_criterion": {"training_iteration": 1}, "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]), "checkpoint_config": CheckpointConfig(checkpoint_frequency=1), + "storage": STORAGE, } runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() @@ -83,13 +88,14 @@ def test_checkpoint_save_restore( while trials[0].status != Trial.TERMINATED: runner.step() - assert trials[0].checkpoint.metrics[TRAINING_ITERATION] == 1 + assert trials[0].latest_checkpoint_result.metrics[TRAINING_ITERATION] == 1 assert trials[0].last_result[TRAINING_ITERATION] == 1 assert trials[0].last_result["iterations_since_restore"] == 1 # Prepare new trial - kwargs["restore_path"] = trials[0].checkpoint.dir_or_data - runner.add_trial(Trial("__fake", **kwargs)) + kwargs["restore_path"] = trials[0].checkpoint.path + new_trial = Trial("__fake", **kwargs) + runner.add_trial(new_trial) trials = runner.get_trials() assert trials[1].status == Trial.PENDING @@ -107,10 +113,9 @@ def test_checkpoint_save_restore( while trials[1].status != Trial.TERMINATED: runner.step() - assert trials[1].checkpoint.metrics[TRAINING_ITERATION] == 2 - assert trials[1].last_result[TRAINING_ITERATION] == 2 + assert trials[0].latest_checkpoint_result.metrics[TRAINING_ITERATION] == 1 + assert trials[1].last_result[TRAINING_ITERATION] == 1 assert 
trials[1].last_result["iterations_since_restore"] == 1 - assert trials[1].last_result["time_since_restore"] > 0 @pytest.mark.parametrize( @@ -124,12 +129,13 @@ def test_checkpoint_at_end(ray_start_4_cpus_2_gpus_extra, resource_manager_cls, """ runner = TuneController( resource_manager_factory=lambda: resource_manager_cls(), - experiment_path=str(tmpdir), + storage=STORAGE, ) kwargs = { "stopping_criterion": {"training_iteration": 2}, "checkpoint_config": CheckpointConfig(checkpoint_at_end=True), "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]), + "storage": STORAGE, } runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() @@ -151,14 +157,17 @@ def test_pause_resume_trial( Legacy test: test_trial_runner_2.py::TrialRunnerTest::testPauseThenResume """ + # TODO(krfricke): Unskip once pause trial changes are in. + pytest.skip("Skipping for now.") runner = TuneController( resource_manager_factory=lambda: resource_manager_cls(), - experiment_path=str(tmpdir), + storage=STORAGE, ) kwargs = { "stopping_criterion": {"training_iteration": 2}, "placement_group_factory": PlacementGroupFactory([{"CPU": 1, "GPU": 1}]), "checkpoint_config": CheckpointConfig(checkpoint_frequency=1), + "storage": STORAGE, } runner.add_trial(Trial("__fake", **kwargs)) trials = runner.get_trials() @@ -188,7 +197,7 @@ def test_pause_resume_trial( while trials[0].status != Trial.TERMINATED: runner.step() - assert trials[0].checkpoint.metrics[TRAINING_ITERATION] == 2 + assert trials[0].checkpoint assert trials[0].last_result[TRAINING_ITERATION] == 2 assert trials[0].last_result["iterations_since_restore"] == 1 assert trials[0].last_result["time_since_restore"] > 0 @@ -198,7 +207,7 @@ def test_pause_resume_trial( "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] ) def test_checkpoint_num_to_keep( - ray_start_4_cpus_2_gpus_extra, resource_manager_cls, tmpdir + ray_start_4_cpus_2_gpus_extra, resource_manager_cls, tmp_path ): """Test that only num_to_keep checkpoints are kept. 
@@ -207,38 +216,25 @@ def test_checkpoint_num_to_keep( Legacy test: test_trial_runner_2.py::TrialRunnerTest::testPauseResumeCheckpointCount """ trial = Trial( - "__fake", - experiment_path=str(tmpdir), - checkpoint_config=CheckpointConfig(num_to_keep=2), + "__fake", checkpoint_config=CheckpointConfig(num_to_keep=2), storage=STORAGE ) trial.init_local_path() - trial.run_metadata.checkpoint_manager.set_delete_fn( - lambda cp: shutil.rmtree(cp.dir_or_data) - ) def write_checkpoint(trial: Trial, index: int): - checkpoint_dir = TrainableUtil.make_checkpoint_dir( - trial.local_path, index=index - ) + checkpoint_dir = tmp_path / StorageContext._make_checkpoint_dir_name(index) + checkpoint_dir.mkdir(parents=True, exist_ok=True) result = {"training_iteration": index} with open(os.path.join(checkpoint_dir, "cp.json"), "w") as f: json.dump(result, f) - tune_cp = _TrackedCheckpoint( - dir_or_data=checkpoint_dir, - storage_mode=CheckpointStorage.PERSISTENT, - metrics=result, - ) - trial.temporary_state.saving_to = tune_cp - - return checkpoint_dir + checkpoint = Checkpoint.from_directory(checkpoint_dir) + return _TrainingResult(checkpoint=checkpoint, metrics=result) def get_checkpoint_dirs(trial: Trial): - return [d for d in os.listdir(trial.local_path) if d.startswith("checkpoint_")] + return [d for d in os.listdir(tmp_path) if d.startswith("checkpoint_")] runner = TuneController( - resource_manager_factory=lambda: resource_manager_cls(), - experiment_path=str(tmpdir), + resource_manager_factory=lambda: resource_manager_cls(), storage=STORAGE ) runner.add_trial(trial) @@ -270,15 +266,11 @@ def get_checkpoint_dirs(trial: Trial): # Re-instantiate trial runner and resume runner.checkpoint(force=True) runner = TuneController( - resource_manager_factory=lambda: resource_manager_cls(), - experiment_path=str(tmpdir), + resource_manager_factory=lambda: resource_manager_cls(), storage=STORAGE ) runner.resume() trial = runner.get_trials()[0] - trial.run_metadata.checkpoint_manager.set_delete_fn( - lambda cp: shutil.rmtree(cp.dir_or_data) - ) # Write fourth checkpoint result = write_checkpoint(trial, 4) diff --git a/python/ray/tune/tests/test_result_grid.py b/python/ray/tune/tests/test_result_grid.py index f7b0a3a1cbcd..e9754908228d 100644 --- a/python/ray/tune/tests/test_result_grid.py +++ b/python/ray/tune/tests/test_result_grid.py @@ -140,12 +140,14 @@ class MockExperimentAnalysis: Result( metrics={'loss': 1.0}, path='log_1', + filesystem='local', checkpoint=Checkpoint(filesystem=local, path=/tmp/ckpt1) ), Result( error='RuntimeError', metrics={'loss': 2.0}, path='log_2', + filesystem='local', checkpoint=Checkpoint(filesystem=local, path=/tmp/ckpt2) ) ]>""" diff --git a/python/ray/tune/trainable/trainable.py b/python/ray/tune/trainable/trainable.py index 52ee0ea4d90a..730a5c5d1d91 100644 --- a/python/ray/tune/trainable/trainable.py +++ b/python/ray/tune/trainable/trainable.py @@ -177,8 +177,7 @@ def __init__( self._storage = storage - if _use_storage_context(): - assert storage + if _use_storage_context() and storage: assert storage.trial_fs_path logger.debug(f"StorageContext on the TRAINABLE:\n{storage}") @@ -525,17 +524,29 @@ def save( ) local_checkpoint = NewCheckpoint.from_directory(checkpoint_dir) - persisted_checkpoint = self._storage.persist_current_checkpoint( - local_checkpoint - ) - # The checkpoint index needs to be incremented. - # NOTE: This is no longer using "iteration" as the folder indexing - # to be consistent with fn trainables. 
- self._storage.current_checkpoint_index += 1 - checkpoint_result = _TrainingResult( - checkpoint=persisted_checkpoint, metrics=self._last_result.copy() - ) + if self._storage: + persisted_checkpoint = self._storage.persist_current_checkpoint( + local_checkpoint + ) + # The checkpoint index needs to be incremented. + # NOTE: This is no longer using "iteration" as the folder indexing + # to be consistent with fn trainables. + self._storage.current_checkpoint_index += 1 + + checkpoint_result = _TrainingResult( + checkpoint=persisted_checkpoint, + metrics=self._last_result.copy(), + ) + else: + # `storage=None` only happens when initializing the + # Trainable manually, outside of Tune/Train. + # In this case, no storage is set, so the default behavior + # is to just not upload anything and report a local checkpoint. + # This is fine for the main use case of local debugging. + checkpoint_result = _TrainingResult( + checkpoint=local_checkpoint, metrics=self._last_result.copy() + ) else: checkpoint_result: _TrainingResult = checkpoint_dict_or_path @@ -897,8 +908,8 @@ def restore( assert isinstance(checkpoint_result, _TrainingResult) checkpoint_metrics = checkpoint_result.metrics - self._iteration = checkpoint_metrics[TRAINING_ITERATION] - self._time_total = checkpoint_metrics[TIME_TOTAL_S] + self._iteration = checkpoint_metrics.get(TRAINING_ITERATION, 0) + self._time_total = checkpoint_metrics.get(TIME_TOTAL_S, 0) self._time_since_restore = 0.0 self._iterations_since_restore = 0 diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index c086a6101b60..3b36c02a8272 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -1170,7 +1170,7 @@ class and registered trainables. default_metric=metric, default_mode=mode, trials=all_trials, - storage_filesystem=storage_filesystem, + storage_filesystem=experiments[0].storage.storage_filesystem, ) else: return ExperimentAnalysis( diff --git a/python/requirements/compat/requirements_py37_compat.txt b/python/requirements/compat/requirements_py37_compat.txt index e43abee2e045..7119bc620cc0 100644 --- a/python/requirements/compat/requirements_py37_compat.txt +++ b/python/requirements/compat/requirements_py37_compat.txt @@ -16,7 +16,7 @@ pytorch-lightning==1.6.5 # Upstream libraries lightgbm_ray==0.1.8 -xgboost_ray==0.1.15 +xgboost_ray==0.1.17 # Datasets pyarrow==6.0.1 diff --git a/python/requirements/ml/core-requirements.txt b/python/requirements/ml/core-requirements.txt index c3f22cc47b62..987a84bf0a5e 100644 --- a/python/requirements/ml/core-requirements.txt +++ b/python/requirements/ml/core-requirements.txt @@ -7,10 +7,10 @@ wandb==0.13.4 # ML training frameworks xgboost==1.6.2; python_version <= '3.7' xgboost==1.7.6; python_version > '3.7' -xgboost_ray==0.1.15 +xgboost_ray==0.1.18 lightgbm==3.3.5 -lightgbm_ray==0.1.8 +lightgbm_ray==0.1.9 # Huggingface transformers==4.19.1 diff --git a/release/ray_release/byod/requirements_ml_byod_3.9.in b/release/ray_release/byod/requirements_ml_byod_3.9.in index 83913060b834..d8c07cf383e0 100644 --- a/release/ray_release/byod/requirements_ml_byod_3.9.in +++ b/release/ray_release/byod/requirements_ml_byod_3.9.in @@ -3,6 +3,7 @@ accelerate bitsandbytes dataset datasets +decord deepspeed diffusers evaluate @@ -24,3 +25,4 @@ torchtext torchvision transformers>=4.31.0 wandb +whisper diff --git a/release/ray_release/byod/requirements_ml_byod_3.9.txt b/release/ray_release/byod/requirements_ml_byod_3.9.txt index 2d8ac7798e0b..3fd6da598eca 100644 --- a/release/ray_release/byod/requirements_ml_byod_3.9.txt +++ 
b/release/ray_release/byod/requirements_ml_byod_3.9.txt @@ -391,6 +391,13 @@ decorator==5.1.1 \ --hash=sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330 \ --hash=sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186 # via ipython +decord==0.6.0 \ + --hash=sha256:02665d7c4f1193a330205a791bc128f7e108eb6ae5b67144437a02f700943bad \ + --hash=sha256:51997f20be8958e23b7c4061ba45d0efcd86bffd5fe81c695d0befee0d442976 \ + --hash=sha256:85ef90d2f872384657d7774cc486c237c5b12df62d4ac5cb5c8d6001fa611323 \ + --hash=sha256:9c20674964fb1490c677bd911d2023d2a09fec7a58a4bb0b7ddf1ccc269f107a \ + --hash=sha256:a0eb1258beade34dceb29d97856a7764d179db1b5182899b61874f3418a1abc8 + # via -r release/ray_release/byod/requirements_ml_byod_3.9.in deepspeed==0.9.4 \ --hash=sha256:155575707df5f4d83d3e180ffb8b6e38ada119053010f17bd6d384aa7b50adfc # via -r release/ray_release/byod/requirements_ml_byod_3.9.in @@ -1188,6 +1195,7 @@ numpy==1.24.3 \ # accelerate # contourpy # datasets + # decord # deepspeed # diffusers # evaluate @@ -2135,6 +2143,7 @@ six==1.16.0 \ # retrying # rouge-score # triad + # whisper smmap==5.0.0 \ --hash=sha256:2aba19d6a040e78d8b09de5c57e96207b09ed71d8e55ce0959eeee6c8e190d94 \ --hash=sha256:c840e62059cd3be204b0c9c9f74be2c09d5648eddd4580d9314c3ecde0b30936 @@ -2579,6 +2588,9 @@ wheel==0.40.0 \ # nvidia-curand-cu11 # nvidia-cusparse-cu11 # nvidia-nvtx-cu11 +whisper==1.1.10 \ + --hash=sha256:435b4fb843c4c752719bdf0511a652d5be710e9bb35ad9ebe3b133268ee31c44 + # via -r release/ray_release/byod/requirements_ml_byod_3.9.in widgetsnbextension==4.0.7 \ --hash=sha256:be3228a73bbab189a16be2d4a3cd89ecbd4e31948bfdc64edac17dcdee3cd99c \ --hash=sha256:ea67c17a7cd4ae358f8f46c3b304c40698bc0423732e3f273321ee141232c8be diff --git a/src/ray/protobuf/serve.proto b/src/ray/protobuf/serve.proto index de8d51c784d6..46c3b5549e25 100644 --- a/src/ray/protobuf/serve.proto +++ b/src/ray/protobuf/serve.proto @@ -54,6 +54,12 @@ message AutoscalingConfig { // Initial number of replicas deployment should start with. Must be non-negative. optional uint32 initial_replicas = 9; + + // The multiplicative "gain" factor to limit upscale. + optional double upscale_smoothing_factor = 10; + + // The multiplicative "gain" factor to limit downscale. + optional double downscale_smoothing_factor = 11; } // Configuration options for a deployment, to be set by the user. diff --git a/src/ray/protobuf/usage.proto b/src/ray/protobuf/usage.proto index cd13858d8b9f..f4a65dfec062 100644 --- a/src/ray/protobuf/usage.proto +++ b/src/ray/protobuf/usage.proto @@ -81,6 +81,12 @@ enum TagKey { SERVE_HTTP_PROXY_USED = 23; // Whether or not an gRPC proxy was used ("1" if used). SERVE_GRPC_PROXY_USED = 24; + // Whether the serve.status API was used ("1" if used) + SERVE_STATUS_API_USED = 25; + // Whether the serve.get_app_handle API was used ("1" if used) + SERVE_GET_APP_HANDLE_API_USED = 26; + // Whether the serve.get_deployment_handle API was used ("1" if used) + SERVE_GET_DEPLOYMENT_HANDLE_API_USED = 27; // Ray Core State API // NOTE(rickyxx): Currently only setting "1" for tracking existence of