diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 5e02a5910b115..9c5adca7e2863 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -55,16 +55,28 @@ import org.apache.spark.util.SparkClassUtils * only `Column` but also other types such as a native string. The other variants currently exist * for historical reasons. * - * @groupname udf_funcs UDF functions + * @groupname udf_funcs UDF, UDAF and UDT * @groupname agg_funcs Aggregate functions - * @groupname datetime_funcs Date time functions - * @groupname sort_funcs Sorting functions - * @groupname normal_funcs Non-aggregate functions - * @groupname math_funcs Math functions + * @groupname datetime_funcs Date and Timestamp functions + * @groupname sort_funcs Sort functions + * @groupname normal_funcs Normal functions + * @groupname math_funcs Mathematical functions + * @groupname bitwise_funcs Bitwise functions + * @groupname predicate_funcs Predicate functions + * @groupname conditional_funcs Conditional functions + * @groupname hash_funcs Hash functions * @groupname misc_funcs Misc functions * @groupname window_funcs Window functions + * @groupname generator_funcs Generator functions * @groupname string_funcs String functions * @groupname collection_funcs Collection functions + * @groupname array_funcs Array functions + * @groupname map_funcs Map functions + * @groupname struct_funcs Struct functions + * @groupname csv_funcs CSV functions + * @groupname json_funcs JSON functions + * @groupname xml_funcs XML functions + * @groupname url_funcs URL functions * @groupname partition_transforms Partition transform functions * @groupname Ungrouped Support functions for DataFrames * @@ -101,6 +113,7 @@ object functions { * Scala Symbol, it is converted into a [[Column]] also. Otherwise, a new [[Column]] is created * to represent the literal value. * + * @group normal_funcs * @since 3.4.0 */ def lit(literal: Any): Column = { @@ -145,7 +158,7 @@ object functions { /** * Creates a struct with the given field names and values. * - * @group normal_funcs + * @group struct_funcs * @since 3.5.0 */ def named_struct(cols: Column*): Column = Column.fn("named_struct", cols: _*) @@ -1610,7 +1623,7 @@ object functions { /** * Creates a new array column. The input columns must all have the same data type. * - * @group normal_funcs + * @group array_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -1619,7 +1632,7 @@ object functions { /** * Creates a new array column. The input columns must all have the same data type. * - * @group normal_funcs + * @group array_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -1632,7 +1645,7 @@ object functions { * value1, key2, value2, ...). The key columns must all have the same data type, and can't be * null. The value columns must all have the same data type. * - * @group normal_funcs + * @group map_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -1642,7 +1655,7 @@ object functions { * Creates a new map column. The array in the first column is used for keys. The array in the * second column is used for values. All elements in the array for key should not be null. 
* - * @group normal_funcs + * @group map_funcs * @since 3.4.0 */ def map_from_arrays(keys: Column, values: Column): Column = @@ -1698,7 +1711,7 @@ object functions { * For example, `coalesce(a, b, c)` will return a if a is not null, or b if a is null and b is * not null, or c if both a and b are null but c is not null. * - * @group normal_funcs + * @group conditional_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -1707,7 +1720,7 @@ object functions { /** * Creates a string column for the file name of the current Spark task. * - * @group normal_funcs + * @group misc_funcs * @since 3.4.0 */ def input_file_name(): Column = Column.fn("input_file_name") @@ -1715,7 +1728,7 @@ object functions { /** * Return true iff the column is NaN. * - * @group normal_funcs + * @group predicate_funcs * @since 3.4.0 */ def isnan(e: Column): Column = e.isNaN @@ -1723,7 +1736,7 @@ object functions { /** * Return true iff the column is null. * - * @group normal_funcs + * @group predicate_funcs * @since 3.4.0 */ def isnull(e: Column): Column = e.isNull @@ -1743,7 +1756,7 @@ object functions { * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. * }}} * - * @group normal_funcs + * @group misc_funcs * @since 3.4.0 */ @deprecated("Use monotonically_increasing_id()", "2.0.0") @@ -1764,7 +1777,7 @@ object functions { * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. * }}} * - * @group normal_funcs + * @group misc_funcs * @since 3.4.0 */ def monotonically_increasing_id(): Column = Column.fn("monotonically_increasing_id") @@ -1774,7 +1787,7 @@ object functions { * * Both inputs should be floating point columns (DoubleType or FloatType). * - * @group normal_funcs + * @group conditional_funcs * @since 3.4.0 */ def nanvl(col1: Column, col2: Column): Column = Column.fn("nanvl", col1, col2) @@ -1790,7 +1803,7 @@ object functions { * df.select( negate(df.col("amount")) ); * }}} * - * @group normal_funcs + * @group math_funcs * @since 3.4.0 */ def negate(e: Column): Column = -e @@ -1805,7 +1818,7 @@ object functions { * df.filter( not(df.col("isActive")) ); * }}} * - * @group normal_funcs + * @group predicate_funcs * @since 3.4.0 */ def not(e: Column): Column = !e @@ -1817,7 +1830,7 @@ object functions { * @note * The function is non-deterministic in general case. * - * @group normal_funcs + * @group math_funcs * @since 3.4.0 */ def rand(seed: Long): Column = Column.fn("rand", lit(seed)) @@ -1829,7 +1842,7 @@ object functions { * @note * The function is non-deterministic in general case. * - * @group normal_funcs + * @group math_funcs * @since 3.4.0 */ def rand(): Column = Column.fn("rand", lit(SparkClassUtils.random.nextLong)) @@ -1841,7 +1854,7 @@ object functions { * @note * The function is non-deterministic in general case. * - * @group normal_funcs + * @group math_funcs * @since 3.4.0 */ def randn(seed: Long): Column = Column.fn("randn", lit(seed)) @@ -1853,7 +1866,7 @@ object functions { * @note * The function is non-deterministic in general case. * - * @group normal_funcs + * @group math_funcs * @since 3.4.0 */ def randn(): Column = Column.fn("randn", lit(SparkClassUtils.random.nextLong)) @@ -1864,7 +1877,7 @@ object functions { * @note * This is non-deterministic because it depends on data partitioning and task scheduling. 
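Editor's note: the @group retags above only change where these functions land in the generated Scaladoc; runtime behavior is untouched. A minimal illustrative sketch (not part of the patch; assumes an existing Spark Connect session `spark`) exercising a few of the regrouped functions:
{{{
  import org.apache.spark.sql.functions._

  val df = spark.range(5).toDF("id")
  df.select(
      coalesce(col("id"), lit(0L)),      // now documented under conditional_funcs
      isnull(col("id")),                 // predicate_funcs
      rand(42),                          // math_funcs
      monotonically_increasing_id())     // misc_funcs
    .show()
}}}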
* - * @group normal_funcs + * @group misc_funcs * @since 3.4.0 */ def spark_partition_id(): Column = Column.fn("spark_partition_id") @@ -1943,7 +1956,7 @@ object functions { * StructField's name, otherwise, the newly generated StructField's name would be auto generated * as `col` with a suffix `index + 1`, i.e. col1, col2, col3, ... * - * @group normal_funcs + * @group struct_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -1952,7 +1965,7 @@ object functions { /** * Creates a new struct column that composes multiple input columns. * - * @group normal_funcs + * @group struct_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -1978,7 +1991,7 @@ object functions { * .otherwise(2)) * }}} * - * @group normal_funcs + * @group conditional_funcs * @since 3.4.0 */ def when(condition: Column, value: Any): Column = Column { builder => @@ -1991,7 +2004,7 @@ object functions { /** * Computes bitwise NOT (~) of a number. * - * @group normal_funcs + * @group bitwise_funcs * @since 3.4.0 */ @deprecated("Use bitwise_not", "3.2.0") @@ -2000,7 +2013,7 @@ object functions { /** * Computes bitwise NOT (~) of a number. * - * @group normal_funcs + * @group bitwise_funcs * @since 3.4.0 */ def bitwise_not(e: Column): Column = Column.fn("~", e) @@ -2512,7 +2525,7 @@ object functions { * Returns the greatest value of the list of values, skipping null values. This function takes * at least 2 parameters. It will return null iff all parameters are null. * - * @group normal_funcs + * @group math_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -2522,7 +2535,7 @@ object functions { * Returns the greatest value of the list of column names, skipping null values. This function * takes at least 2 parameters. It will return null iff all parameters are null. * - * @group normal_funcs + * @group math_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -2615,7 +2628,7 @@ object functions { * Returns the least value of the list of values, skipping null values. This function takes at * least 2 parameters. It will return null iff all parameters are null. * - * @group normal_funcs + * @group math_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -2625,7 +2638,7 @@ object functions { * Returns the least value of the list of column names, skipping null values. This function * takes at least 2 parameters. It will return null iff all parameters are null. * - * @group normal_funcs + * @group math_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -2909,7 +2922,7 @@ object functions { * Shift the given value numBits left. If the given value is a long value, this function will * return a long value else it will return an integer value. * - * @group math_funcs + * @group bitwise_funcs * @since 3.4.0 */ @deprecated("Use shiftleft", "3.2.0") @@ -2919,7 +2932,7 @@ object functions { * Shift the given value numBits left. If the given value is a long value, this function will * return a long value else it will return an integer value. * - * @group math_funcs + * @group bitwise_funcs * @since 3.4.0 */ def shiftleft(e: Column, numBits: Int): Column = Column.fn("shiftleft", e, lit(numBits)) @@ -2928,7 +2941,7 @@ object functions { * (Signed) shift the given value numBits right. If the given value is a long value, it will * return a long value else it will return an integer value. * - * @group math_funcs + * @group bitwise_funcs * @since 3.4.0 */ @deprecated("Use shiftright", "3.2.0") @@ -2938,7 +2951,7 @@ object functions { * (Signed) shift the given value numBits right. 
If the given value is a long value, it will * return a long value else it will return an integer value. * - * @group math_funcs + * @group bitwise_funcs * @since 3.4.0 */ def shiftright(e: Column, numBits: Int): Column = Column.fn("shiftright", e, lit(numBits)) @@ -2947,7 +2960,7 @@ object functions { * Unsigned shift the given value numBits right. If the given value is a long value, it will * return a long value else it will return an integer value. * - * @group math_funcs + * @group bitwise_funcs * @since 3.4.0 */ @deprecated("Use shiftrightunsigned", "3.2.0") @@ -2957,7 +2970,7 @@ object functions { * Unsigned shift the given value numBits right. If the given value is a long value, it will * return a long value else it will return an integer value. * - * @group math_funcs + * @group bitwise_funcs * @since 3.4.0 */ def shiftrightunsigned(e: Column, numBits: Int): Column = @@ -3220,7 +3233,7 @@ object functions { * Calculates the MD5 digest of a binary column and returns the value as a 32 character hex * string. * - * @group misc_funcs + * @group hash_funcs * @since 3.4.0 */ def md5(e: Column): Column = Column.fn("md5", e) @@ -3229,7 +3242,7 @@ object functions { * Calculates the SHA-1 digest of a binary column and returns the value as a 40 character hex * string. * - * @group misc_funcs + * @group hash_funcs * @since 3.4.0 */ def sha1(e: Column): Column = Column.fn("sha1", e) @@ -3243,7 +3256,7 @@ object functions { * @param numBits * one of 224, 256, 384, or 512. * - * @group misc_funcs + * @group hash_funcs * @since 3.4.0 */ def sha2(e: Column, numBits: Int): Column = { @@ -3257,7 +3270,7 @@ object functions { * Calculates the cyclic redundancy check value (CRC32) of a binary column and returns the value * as a bigint. * - * @group misc_funcs + * @group hash_funcs * @since 3.4.0 */ def crc32(e: Column): Column = Column.fn("crc32", e) @@ -3265,7 +3278,7 @@ object functions { /** * Calculates the hash code of given columns, and returns the result as an int column. * - * @group misc_funcs + * @group hash_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -3275,7 +3288,7 @@ object functions { * Calculates the hash code of given columns using the 64-bit variant of the xxHash algorithm, * and returns the result as a long column. The hash computation uses an initial seed of 42. * - * @group misc_funcs + * @group hash_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -3632,7 +3645,7 @@ object functions { /** * Returns a sha1 hash value as a hex string of the `col`. * - * @group misc_funcs + * @group hash_funcs * @since 3.5.0 */ def sha(col: Column): Column = Column.fn("sha", col) @@ -3708,7 +3721,7 @@ object functions { * Returns a random value with independent and identically distributed (i.i.d.) uniformly * distributed values in [0, 1). * - * @group misc_funcs + * @group math_funcs * @since 3.5.0 */ def random(seed: Column): Column = Column.fn("random", seed) @@ -3717,7 +3730,7 @@ object functions { * Returns a random value with independent and identically distributed (i.i.d.) uniformly * distributed values in [0, 1). * - * @group misc_funcs + * @group math_funcs * @since 3.5.0 */ def random(): Column = Column.fn("random", lit(SparkClassUtils.random.nextLong)) @@ -3744,7 +3757,7 @@ object functions { * Returns a bitmap with the positions of the bits set from all the values from the input * column. The input column will most likely be bitmap_bit_position(). 
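Editor's note: moving the digest and shift helpers into the new hash_funcs and bitwise_funcs groups is likewise documentation-only. An illustrative sketch (assumes a DataFrame `df` with a string column "payload"):
{{{
  df.select(
      md5(col("payload")),          // hash_funcs
      sha2(col("payload"), 256),    // hash_funcs
      crc32(col("payload")),        // hash_funcs
      shiftleft(lit(1), 3))         // bitwise_funcs
}}}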
* - * @group misc_funcs + * @group agg_funcs * @since 3.5.0 */ def bitmap_construct_agg(col: Column): Column = @@ -3762,7 +3775,7 @@ object functions { * Returns a bitmap that is the bitwise OR of all of the bitmaps from the input column. The * input column should be bitmaps created from bitmap_construct_agg(). * - * @group misc_funcs + * @group agg_funcs * @since 3.5.0 */ def bitmap_or_agg(col: Column): Column = Column.fn("bitmap_or_agg", col) @@ -3993,7 +4006,7 @@ object functions { /** * Returns true if `str` matches `regexp`, or false otherwise. * - * @group string_funcs + * @group predicate_funcs * @since 3.5.0 */ def rlike(str: Column, regexp: Column): Column = Column.fn("rlike", str, regexp) @@ -4001,7 +4014,7 @@ object functions { /** * Returns true if `str` matches `regexp`, or false otherwise. * - * @group string_funcs + * @group predicate_funcs * @since 3.5.0 */ def regexp(str: Column, regexp: Column): Column = Column.fn("regexp", str, regexp) @@ -4009,7 +4022,7 @@ object functions { /** * Returns true if `str` matches `regexp`, or false otherwise. * - * @group string_funcs + * @group predicate_funcs * @since 3.5.0 */ def regexp_like(str: Column, regexp: Column): Column = Column.fn("regexp_like", str, regexp) @@ -4477,7 +4490,7 @@ object functions { /** * Extracts a part from a URL. * - * @group string_funcs + * @group url_funcs * @since 3.5.0 */ def parse_url(url: Column, partToExtract: Column, key: Column): Column = @@ -4486,7 +4499,7 @@ object functions { /** * Extracts a part from a URL. * - * @group string_funcs + * @group url_funcs * @since 3.5.0 */ def parse_url(url: Column, partToExtract: Column): Column = @@ -4505,7 +4518,7 @@ object functions { * Decodes a `str` in 'application/x-www-form-urlencoded' format using a specific encoding * scheme. * - * @group string_funcs + * @group url_funcs * @since 3.5.0 */ def url_decode(str: Column): Column = Column.fn("url_decode", str) @@ -4514,7 +4527,7 @@ object functions { * Translates a string into 'application/x-www-form-urlencoded' format using a specific encoding * scheme. * - * @group string_funcs + * @group url_funcs * @since 3.5.0 */ def url_encode(str: Column): Column = Column.fn("url_encode", str) @@ -4677,7 +4690,7 @@ object functions { * Returns true if str matches `pattern` with `escapeChar`, null if any arguments are null, * false otherwise. * - * @group string_funcs + * @group predicate_funcs * @since 3.5.0 */ def like(str: Column, pattern: Column, escapeChar: Column): Column = @@ -4687,7 +4700,7 @@ object functions { * Returns true if str matches `pattern` with `escapeChar`('\'), null if any arguments are null, * false otherwise. * - * @group string_funcs + * @group predicate_funcs * @since 3.5.0 */ def like(str: Column, pattern: Column): Column = Column.fn("like", str, pattern) @@ -4696,7 +4709,7 @@ object functions { * Returns true if str matches `pattern` with `escapeChar` case-insensitively, null if any * arguments are null, false otherwise. * - * @group string_funcs + * @group predicate_funcs * @since 3.5.0 */ def ilike(str: Column, pattern: Column, escapeChar: Column): Column = @@ -4706,7 +4719,7 @@ object functions { * Returns true if str matches `pattern` with `escapeChar`('\') case-insensitively, null if any * arguments are null, false otherwise. 
* - * @group string_funcs + * @group predicate_funcs * @since 3.5.0 */ def ilike(str: Column, pattern: Column): Column = Column.fn("ilike", str, pattern) @@ -5945,7 +5958,7 @@ object functions { /** * Returns null if the array is null, true if the array contains `value`, and false otherwise. - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_contains(column: Column, value: Any): Column = @@ -5955,7 +5968,7 @@ object functions { * Returns an ARRAY containing all elements from the source ARRAY as well as the new element. * The new element/column is located at end of the ARRAY. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_append(column: Column, element: Any): Column = @@ -5965,7 +5978,7 @@ object functions { * Returns `true` if `a1` and `a2` have at least one non-null element in common. If not and both * the arrays are non-empty and any of them contains a `null`, it returns `null`. It returns * `false` otherwise. - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def arrays_overlap(a1: Column, a2: Column): Column = Column.fn("arrays_overlap", a1, a2) @@ -5981,7 +5994,7 @@ object functions { * @param length * the length of the slice * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def slice(x: Column, start: Int, length: Int): Column = @@ -5998,7 +6011,7 @@ object functions { * @param length * the length of the slice * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def slice(x: Column, start: Column, length: Column): Column = @@ -6007,7 +6020,7 @@ object functions { /** * Concatenates the elements of `column` using the `delimiter`. Null values are replaced with * `nullReplacement`. - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_join(column: Column, delimiter: String, nullReplacement: String): Column = @@ -6015,7 +6028,7 @@ object functions { /** * Concatenates the elements of `column` using the `delimiter`. - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_join(column: Column, delimiter: String): Column = @@ -6039,7 +6052,7 @@ object functions { * The position is not zero based, but 1 based index. Returns 0 if value could not be found in * array. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_position(column: Column, value: Any): Column = @@ -6062,7 +6075,7 @@ object functions { * (map, key) - Returns value for given key. The function always returns NULL if the key is not * contained in the map. * - * @group map_funcs + * @group collection_funcs * @since 3.5.0 */ def try_element_at(column: Column, value: Column): Column = @@ -6072,7 +6085,7 @@ object functions { * Returns element of array at given (0-based) index. If the index points outside of the array * boundaries, then this function returns NULL. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def get(column: Column, index: Column): Column = Column.fn("get", column, index) @@ -6102,7 +6115,7 @@ object functions { /** * Remove all elements that equal to element from the given array. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_remove(column: Column, element: Any): Column = @@ -6111,7 +6124,7 @@ object functions { /** * Remove all null elements from the given array. 
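Editor's note: the pattern-matching helpers retagged as predicate_funcs above keep their semantics. A short usage sketch (assumes a DataFrame `people` with a string column "name"):
{{{
  people
    .where(ilike(col("name"), lit("a%")))          // case-insensitive LIKE
    .where(rlike(col("name"), lit("^[A-Za-z]+$"))) // regular-expression match
}}}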
* - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_compact(column: Column): Column = Column.fn("array_compact", column) @@ -6120,7 +6133,7 @@ object functions { * Returns an array containing value as well as all elements from array. The new element is * positioned at the beginning of the array. * - * @group collection_funcs + * @group array_funcs * @since 3.5.0 */ def array_prepend(column: Column, element: Any): Column = @@ -6128,7 +6141,7 @@ object functions { /** * Removes duplicate values from the array. - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_distinct(e: Column): Column = Column.fn("array_distinct", e) @@ -6137,7 +6150,7 @@ object functions { * Returns an array of the elements in the intersection of the given two arrays, without * duplicates. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_intersect(col1: Column, col2: Column): Column = @@ -6146,7 +6159,7 @@ object functions { /** * Adds an item into a given array at a specified position * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_insert(arr: Column, pos: Column, value: Column): Column = @@ -6155,7 +6168,7 @@ object functions { /** * Returns an array of the elements in the union of the given two arrays, without duplicates. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_union(col1: Column, col2: Column): Column = @@ -6165,7 +6178,7 @@ object functions { * Returns an array of the elements in the first array but not in the second array, without * duplicates. The order of elements in the result is not determined * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_except(col1: Column, col2: Column): Column = @@ -6174,7 +6187,7 @@ object functions { /** * Returns a string array of values within the nodes of xml that match the XPath expression. * - * @group "xml_funcs" + * @group xml_funcs * @since 3.5.0 */ def xpath(xml: Column, path: Column): Column = @@ -6183,7 +6196,7 @@ object functions { /** * Returns true if the XPath expression evaluates to true, or if a matching node is found. * - * @group "xml_funcs" + * @group xml_funcs * @since 3.5.0 */ def xpath_boolean(xml: Column, path: Column): Column = @@ -6193,7 +6206,7 @@ object functions { * Returns a double value, the value zero if no match is found, or NaN if a match is found but * the value is non-numeric. * - * @group "xml_funcs" + * @group xml_funcs * @since 3.5.0 */ def xpath_double(xml: Column, path: Column): Column = @@ -6203,7 +6216,7 @@ object functions { * Returns a double value, the value zero if no match is found, or NaN if a match is found but * the value is non-numeric. * - * @group "xml_funcs" + * @group xml_funcs * @since 3.5.0 */ def xpath_number(xml: Column, path: Column): Column = @@ -6213,7 +6226,7 @@ object functions { * Returns a float value, the value zero if no match is found, or NaN if a match is found but * the value is non-numeric. * - * @group "xml_funcs" + * @group xml_funcs * @since 3.5.0 */ def xpath_float(xml: Column, path: Column): Column = @@ -6223,7 +6236,7 @@ object functions { * Returns an integer value, or the value zero if no match is found, or a match is found but the * value is non-numeric. 
* - * @group "xml_funcs" + * @group xml_funcs * @since 3.5.0 */ def xpath_int(xml: Column, path: Column): Column = @@ -6233,7 +6246,7 @@ object functions { * Returns a long integer value, or the value zero if no match is found, or a match is found but * the value is non-numeric. * - * @group "xml_funcs" + * @group xml_funcs * @since 3.5.0 */ def xpath_long(xml: Column, path: Column): Column = @@ -6243,7 +6256,7 @@ object functions { * Returns a short integer value, or the value zero if no match is found, or a match is found * but the value is non-numeric. * - * @group "xml_funcs" + * @group xml_funcs * @since 3.5.0 */ def xpath_short(xml: Column, path: Column): Column = @@ -6252,7 +6265,7 @@ object functions { /** * Returns the text contents of the first xml node that matches the XPath expression. * - * @group "xml_funcs" + * @group xml_funcs * @since 3.5.0 */ def xpath_string(xml: Column, path: Column): Column = @@ -6603,7 +6616,7 @@ object functions { * name `col` for elements in the array and `key` and `value` for elements in the map unless * specified otherwise. * - * @group collection_funcs + * @group generator_funcs * @since 3.4.0 */ def explode(e: Column): Column = Column.fn("explode", e) @@ -6613,7 +6626,7 @@ object functions { * name `col` for elements in the array and `key` and `value` for elements in the map unless * specified otherwise. Unlike explode, if the array/map is null or empty then null is produced. * - * @group collection_funcs + * @group generator_funcs * @since 3.4.0 */ def explode_outer(e: Column): Column = Column.fn("explode_outer", e) @@ -6623,7 +6636,7 @@ object functions { * default column name `pos` for position, and `col` for elements in the array and `key` and * `value` for elements in the map unless specified otherwise. * - * @group collection_funcs + * @group generator_funcs * @since 3.4.0 */ def posexplode(e: Column): Column = Column.fn("posexplode", e) @@ -6634,7 +6647,7 @@ object functions { * `value` for elements in the map unless specified otherwise. Unlike posexplode, if the * array/map is null or empty then the row (null, null) is produced. * - * @group collection_funcs + * @group generator_funcs * @since 3.4.0 */ def posexplode_outer(e: Column): Column = Column.fn("posexplode_outer", e) @@ -6642,7 +6655,7 @@ object functions { /** * Creates a new row for each element in the given array of structs. * - * @group collection_funcs + * @group generator_funcs * @since 3.4.0 */ def inline(e: Column): Column = Column.fn("inline", e) @@ -6651,7 +6664,7 @@ object functions { * Creates a new row for each element in the given array of structs. Unlike inline, if the array * is null or empty then null is produced for each nested column. * - * @group collection_funcs + * @group generator_funcs * @since 3.4.0 */ def inline_outer(e: Column): Column = Column.fn("inline_outer", e) @@ -6660,7 +6673,7 @@ object functions { * Extracts json object from a json string based on json path specified, and returns json string * of the extracted json object. It will return null if the input json string is invalid. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ def get_json_object(e: Column, path: String): Column = @@ -6669,7 +6682,7 @@ object functions { /** * Creates a new row for a json column according to the given field names. 
* - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -6693,7 +6706,7 @@ object functions { * "https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option"> Data * Source Option in the version you use. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -6716,7 +6729,7 @@ object functions { * "https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option"> Data * Source Option in the version you use. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -6739,7 +6752,7 @@ object functions { * "https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option"> Data * Source Option in the version you use. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -6762,7 +6775,7 @@ object functions { * "https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option"> Data * Source Option in the version you use. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -6779,7 +6792,7 @@ object functions { * @param schema * the schema to use when parsing the json string * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ def from_json(e: Column, schema: StructType): Column = @@ -6795,7 +6808,7 @@ object functions { * @param schema * the schema to use when parsing the json string * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ def from_json(e: Column, schema: DataType): Column = @@ -6817,7 +6830,7 @@ object functions { * "https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option"> Data * Source Option in the version you use. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -6841,7 +6854,7 @@ object functions { * "https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option"> Data * Source Option in the version you use. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -6861,7 +6874,7 @@ object functions { * @param schema * the schema to use when parsing the json string * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ def from_json(e: Column, schema: Column): Column = { @@ -6884,7 +6897,7 @@ object functions { * "https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option"> Data * Source Option in the version you use. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -6924,7 +6937,7 @@ object functions { * @param json * a JSON string. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ def schema_of_json(json: String): Column = schema_of_json(lit(json)) @@ -6935,7 +6948,7 @@ object functions { * @param json * a foldable string column containing a JSON string. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ def schema_of_json(json: Column): Column = Column.fn("schema_of_json", json) @@ -6954,7 +6967,7 @@ object functions { * @return * a column with string literal containing schema in DDL format. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -6976,7 +6989,7 @@ object functions { * Source Option in the version you use. 
Additionally the function supports the `pretty` * option which enables pretty JSON generation. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -6998,7 +7011,7 @@ object functions { * Source Option in the version you use. Additionally the function supports the `pretty` * option which enables pretty JSON generation. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -7012,7 +7025,7 @@ object functions { * @param e * a column containing a struct, an array or a map. * - * @group collection_funcs + * @group json_funcs * @since 3.4.0 */ def to_json(e: Column): Column = @@ -7035,7 +7048,7 @@ object functions { * ordering of the array elements. Null elements will be placed at the beginning of the returned * array. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def sort_array(e: Column): Column = sort_array(e, asc = true) @@ -7046,7 +7059,7 @@ object functions { * double/float type. Null elements will be placed at the beginning of the returned array in * ascending order or at the end of the returned array in descending order. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def sort_array(e: Column, asc: Boolean): Column = Column.fn("sort_array", e, lit(asc)) @@ -7055,7 +7068,7 @@ object functions { * Returns the minimum value in the array. NaN is greater than any non-NaN elements for * double/float type. NULL elements are skipped. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_min(e: Column): Column = Column.fn("array_min", e) @@ -7064,7 +7077,7 @@ object functions { * Returns the maximum value in the array. NaN is greater than any non-NaN elements for * double/float type. NULL elements are skipped. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_max(e: Column): Column = Column.fn("array_max", e) @@ -7075,7 +7088,7 @@ object functions { * @note * The function is non-deterministic. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def shuffle(e: Column): Column = Column.fn("shuffle", e, lit(SparkClassUtils.random.nextLong)) @@ -7090,7 +7103,7 @@ object functions { /** * Creates a single array from an array of arrays. If a structure of nested arrays is deeper * than two levels, only one level of nesting is removed. - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def flatten(e: Column): Column = Column.fn("flatten", e) @@ -7098,7 +7111,7 @@ object functions { /** * Generate a sequence of integers from start to stop, incrementing by step. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def sequence(start: Column, stop: Column, step: Column): Column = @@ -7108,7 +7121,7 @@ object functions { * Generate a sequence of integers from start to stop, incrementing by 1 if start is less than * or equal to stop, otherwise -1. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def sequence(start: Column, stop: Column): Column = @@ -7118,7 +7131,7 @@ object functions { * Creates an array containing the left argument repeated the number of times given by the right * argument. * - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_repeat(left: Column, right: Column): Column = Column.fn("array_repeat", left, right) @@ -7127,14 +7140,14 @@ object functions { * Creates an array containing the left argument repeated the number of times given by the right * argument. 
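Editor's note: the from_json/schema_of_json/to_json overloads retagged as json_funcs above are used exactly as before. A minimal sketch (assumes a DataFrame `df` with a string column "js" holding JSON objects such as {"a": 1}):
{{{
  import org.apache.spark.sql.types._

  val schema = new StructType().add("a", IntegerType)
  df.select(from_json(col("js"), schema).as("parsed"))
    .select(to_json(col("parsed")))
}}}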
* - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ def array_repeat(e: Column, count: Int): Column = array_repeat(e, lit(count)) /** * Returns true if the map contains the key. - * @group collection_funcs + * @group map_funcs * @since 3.4.0 */ def map_contains_key(column: Column, key: Any): Column = @@ -7142,28 +7155,28 @@ object functions { /** * Returns an unordered array containing the keys of the map. - * @group collection_funcs + * @group map_funcs * @since 3.4.0 */ def map_keys(e: Column): Column = Column.fn("map_keys", e) /** * Returns an unordered array containing the values of the map. - * @group collection_funcs + * @group map_funcs * @since 3.4.0 */ def map_values(e: Column): Column = Column.fn("map_values", e) /** * Returns an unordered array of all entries in the given map. - * @group collection_funcs + * @group map_funcs * @since 3.4.0 */ def map_entries(e: Column): Column = Column.fn("map_entries", e) /** * Returns a map created from the given array of entries. - * @group collection_funcs + * @group map_funcs * @since 3.4.0 */ def map_from_entries(e: Column): Column = Column.fn("map_from_entries", e) @@ -7171,7 +7184,7 @@ object functions { /** * Returns a merged array of structs in which the N-th struct contains all N-th values of input * arrays. - * @group collection_funcs + * @group array_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -7179,7 +7192,7 @@ object functions { /** * Returns the union of all the given maps. - * @group collection_funcs + * @group map_funcs * @since 3.4.0 */ @scala.annotation.varargs @@ -7200,7 +7213,7 @@ object functions { * "https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option"> Data * Source Option in the version you use. * - * @group collection_funcs + * @group csv_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -7222,7 +7235,7 @@ object functions { * "https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option"> Data * Source Option in the version you use. * - * @group collection_funcs + * @group csv_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -7238,7 +7251,7 @@ object functions { * @param csv * a CSV string. * - * @group collection_funcs + * @group csv_funcs * @since 3.4.0 */ def schema_of_csv(csv: String): Column = schema_of_csv(lit(csv)) @@ -7249,7 +7262,7 @@ object functions { * @param csv * a foldable string column containing a CSV string. * - * @group collection_funcs + * @group csv_funcs * @since 3.4.0 */ def schema_of_csv(csv: Column): Column = schema_of_csv(csv, Collections.emptyMap()) @@ -7268,7 +7281,7 @@ object functions { * @return * a column with string literal containing schema in DDL format. * - * @group collection_funcs + * @group csv_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -7288,7 +7301,7 @@ object functions { * "https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option"> Data * Source Option in the version you use. * - * @group collection_funcs + * @group csv_funcs * @since 3.4.0 */ // scalastyle:on line.size.limit @@ -7302,7 +7315,7 @@ object functions { * @param e * a column containing a struct. * - * @group collection_funcs + * @group csv_funcs * @since 3.4.0 */ def to_csv(e: Column): Column = to_csv(e, Collections.emptyMap()) @@ -7321,12 +7334,12 @@ object functions { * See Data * Source Option in the version you use. 
- * @group collection_funcs + * @group xml_funcs * @since 4.0.0 */ // scalastyle:on line.size.limit def from_xml(e: Column, schema: StructType, options: java.util.Map[String, String]): Column = - from_xml(e, lit(schema.json), options.asScala.toIterator) + from_xml(e, lit(schema.json), options.asScala.iterator) // scalastyle:off line.size.limit @@ -7343,7 +7356,7 @@ object functions { * See Data * Source Option in the version you use. - * @group collection_funcs + * @group xml_funcs * @since 4.0.0 */ // scalastyle:on line.size.limit @@ -7366,7 +7379,7 @@ object functions { * a string column containing XML data. * @param schema * the schema to use when parsing the XML string - * @group collection_funcs + * @group xml_funcs * @since 4.0.0 */ // scalastyle:on line.size.limit @@ -7388,7 +7401,7 @@ object functions { * See Data * Source Option in the version you use. - * @group collection_funcs + * @group xml_funcs * * @since 4.0.0 */ @@ -7404,7 +7417,7 @@ object functions { * a string column containing XML data. * @param schema * the schema to use when parsing the XML string - * @group collection_funcs + * @group xml_funcs * * @since 4.0.0 */ @@ -7430,7 +7443,7 @@ object functions { * * @param xml * a foldable string column containing a XML string. - * @group collection_funcs + * @group xml_funcs * @since 4.0.0 */ def schema_of_xml(xml: Column): Column = Column.fn("schema_of_xml", xml) @@ -7449,7 +7462,7 @@ object functions { * Source Option in the version you use. * @return * a column with string literal containing schema in DDL format. - * @group collection_funcs + * @group xml_funcs * @since 4.0.0 */ // scalastyle:on line.size.limit @@ -7460,7 +7473,7 @@ object functions { /** * Returns the total number of elements in the array. The function returns null for null input. * - * @group collection_funcs + * @group array_funcs * @since 3.5.0 */ def array_size(e: Column): Column = Column.fn("array_size", e) @@ -7481,7 +7494,7 @@ object functions { * Returns the number of elements in the outermost JSON array. `NULL` is returned in case of any * other valid JSON string, `NULL` or an invalid JSON. * - * @group collection_funcs + * @group json_funcs * @since 3.5.0 */ def json_array_length(e: Column): Column = Column.fn("json_array_length", e) @@ -7491,7 +7504,7 @@ object functions { * given, all the keys of the outermost object will be returned as an array. If it is any other * valid JSON string, an invalid JSON string or an empty string, the function returns null. * - * @group collection_funcs + * @group json_funcs * @since 3.5.0 */ def json_object_keys(e: Column): Column = Column.fn("json_object_keys", e) @@ -7836,7 +7849,7 @@ object functions { /** * Returns `col2` if `col1` is null, or `col1` otherwise. * - * @group predicates_funcs + * @group conditional_funcs * @since 3.5.0 */ def ifnull(col1: Column, col2: Column): Column = Column.fn("ifnull", col1, col2) @@ -7844,7 +7857,7 @@ object functions { /** * Returns true if `col` is not null, or false otherwise. * - * @group predicates_funcs + * @group predicate_funcs * @since 3.5.0 */ def isnotnull(col: Column): Column = Column.fn("isnotnull", col) @@ -7853,7 +7866,7 @@ object functions { * Returns same result as the EQUAL(=) operator for non-null operands, but returns true if both * are null, false if one of the them is null. 
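Editor's note: the ifnull/nullif/nvl/nvl2 family below was previously tagged with `predicates_funcs`, which is not among the declared @groupname entries; retagging them as conditional_funcs (and isnotnull/equal_null as predicate_funcs) keeps the generated docs consistent. Usage is unchanged, e.g. (a sketch assuming columns "a" and "b"):
{{{
  df.select(
      ifnull(col("a"), col("b")),                // conditional_funcs
      nvl2(col("a"), lit("set"), lit("unset")),  // conditional_funcs
      isnotnull(col("a")))                       // predicate_funcs
}}}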
* - * @group predicates_funcs + * @group predicate_funcs * @since 3.5.0 */ def equal_null(col1: Column, col2: Column): Column = Column.fn("equal_null", col1, col2) @@ -7861,7 +7874,7 @@ object functions { /** * Returns null if `col1` equals to `col2`, or `col1` otherwise. * - * @group predicates_funcs + * @group conditional_funcs * @since 3.5.0 */ def nullif(col1: Column, col2: Column): Column = Column.fn("nullif", col1, col2) @@ -7869,7 +7882,7 @@ object functions { /** * Returns `col2` if `col1` is null, or `col1` otherwise. * - * @group predicates_funcs + * @group conditional_funcs * @since 3.5.0 */ def nvl(col1: Column, col2: Column): Column = Column.fn("nvl", col1, col2) @@ -7877,7 +7890,7 @@ object functions { /** * Returns `col2` if `col1` is not null, or `col3` otherwise. * - * @group predicates_funcs + * @group conditional_funcs * @since 3.5.0 */ def nvl2(col1: Column, col2: Column, col3: Column): Column = Column.fn("nvl2", col1, col2, col3) @@ -8350,6 +8363,7 @@ object functions { * function name that follows the SQL identifier syntax (can be quoted, can be qualified) * @param cols * the expression parameters of function + * @group normal_funcs * @since 3.5.0 */ @scala.annotation.varargs diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala index 6d825f22b35fa..6e0a04cf4eb4d 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala @@ -647,7 +647,7 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM test("Dataset result collection") { def checkResult(rows: IterableOnce[java.lang.Long], expectedValues: Long*): Unit = { - rows.toIterator.zipAll(expectedValues.iterator, null, null).foreach { + rows.iterator.zipAll(expectedValues.iterator, null, null).foreach { case (actual, expected) => assert(actual === expected) } } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala index 08496c36b28a2..ba5ecc7a045ac 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala @@ -63,15 +63,15 @@ private[connect] class ExecuteGrpcResponseSender[T <: Message]( /** * Interrupt this sender and make it exit. */ - def interrupt(): Unit = executionObserver.synchronized { + def interrupt(): Unit = { interrupted = true - executionObserver.notifyAll() + wakeUp() } // For testing - private[connect] def setDeadline(deadlineMs: Long) = executionObserver.synchronized { + private[connect] def setDeadline(deadlineMs: Long) = { deadlineTimeMillis = deadlineMs - executionObserver.notifyAll() + wakeUp() } def run(lastConsumedStreamIndex: Long): Unit = { @@ -152,9 +152,6 @@ private[connect] class ExecuteGrpcResponseSender[T <: Message]( s"lastConsumedStreamIndex=$lastConsumedStreamIndex") val startTime = System.nanoTime() - // register to be notified about available responses. 
- executionObserver.attachConsumer(this) - var nextIndex = lastConsumedStreamIndex + 1 var finished = false @@ -191,7 +188,7 @@ private[connect] class ExecuteGrpcResponseSender[T <: Message]( sentResponsesSize > maximumResponseSize || deadlineTimeMillis < System.currentTimeMillis() logTrace(s"Trying to get next response with index=$nextIndex.") - executionObserver.synchronized { + executionObserver.responseLock.synchronized { logTrace(s"Acquired executionObserver lock.") val sleepStart = System.nanoTime() var sleepEnd = 0L @@ -208,7 +205,7 @@ private[connect] class ExecuteGrpcResponseSender[T <: Message]( if (response.isEmpty) { val timeout = Math.max(1, deadlineTimeMillis - System.currentTimeMillis()) logTrace(s"Wait for response to become available with timeout=$timeout ms.") - executionObserver.wait(timeout) + executionObserver.responseLock.wait(timeout) logTrace(s"Reacquired executionObserver lock after waiting.") sleepEnd = System.nanoTime() } @@ -339,4 +336,15 @@ private[connect] class ExecuteGrpcResponseSender[T <: Message]( } } } + + private def wakeUp(): Unit = { + // Can be sleeping on either of these two locks, wake them up. + // (Neither of these locks is ever taken for extended period of time, so this won't block) + executionObserver.responseLock.synchronized { + executionObserver.responseLock.notifyAll() + } + grpcCallObserverReadySignal.synchronized { + grpcCallObserverReadySignal.notifyAll() + } + } } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteResponseObserver.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteResponseObserver.scala index 859ec7e6b1983..e99e3a94f73a6 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteResponseObserver.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteResponseObserver.scala @@ -33,8 +33,7 @@ import org.apache.spark.sql.connect.service.ExecuteHolder /** * This StreamObserver is running on the execution thread. Execution pushes responses to it, it * caches them. ExecuteResponseGRPCSender is the consumer of the responses ExecuteResponseObserver - * "produces". It waits on the monitor of ExecuteResponseObserver. New produced responses notify - * the monitor. + * "produces". It waits on the responseLock. New produced responses notify the responseLock. * @see * getResponse. * @@ -85,10 +84,12 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: private[connect] var highestConsumedIndex: Long = 0 /** - * Consumer that waits for available responses. There can be only one at a time, @see - * attachConsumer. + * Lock used for synchronization between responseObserver and grpcResponseSenders. * + * grpcResponseSenders wait on it for a new response to be available. * grpcResponseSenders also + * notify it to wake up when interrupted * responseObserver notifies it when new responses are + * available. */ - private var responseSender: Option[ExecuteGrpcResponseSender[T]] = None + private[connect] val responseLock = new Object() // Statistics about cached responses. 
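Editor's note: the responseLock introduced here replaces synchronization on the observer itself with a dedicated monitor shared by the producer (ExecuteResponseObserver) and the consumers (ExecuteGrpcResponseSender, via wakeUp()). A simplified, self-contained sketch of that wait/notify handshake (generic names, not the actual Spark Connect classes):
{{{
  class ResponseBuffer[T] {
    private val responseLock = new Object()
    private var next: Option[T] = None

    // Producer side: publish a response and wake up any waiting consumer.
    def produce(r: T): Unit = responseLock.synchronized {
      next = Some(r)
      responseLock.notifyAll()
    }

    // Consumer side: wait (with timeout) until a response is available.
    def consume(timeoutMs: Long): Option[T] = responseLock.synchronized {
      if (next.isEmpty) responseLock.wait(timeoutMs)
      val r = next
      next = None
      r
    }
  }
}}}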
private val cachedSizeUntilHighestConsumed = CachedSize() @@ -106,7 +107,7 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: 0 } - def onNext(r: T): Unit = synchronized { + def onNext(r: T): Unit = responseLock.synchronized { if (finalProducedIndex.nonEmpty) { throw new IllegalStateException("Stream onNext can't be called after stream completed") } @@ -125,10 +126,10 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: logDebug( s"Execution opId=${executeHolder.operationId} produced response " + s"responseId=${responseId} idx=$lastProducedIndex") - notifyAll() + responseLock.notifyAll() } - def onError(t: Throwable): Unit = synchronized { + def onError(t: Throwable): Unit = responseLock.synchronized { if (finalProducedIndex.nonEmpty) { throw new IllegalStateException("Stream onError can't be called after stream completed") } @@ -137,10 +138,10 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: logDebug( s"Execution opId=${executeHolder.operationId} produced error. " + s"Last stream index is $lastProducedIndex.") - notifyAll() + responseLock.notifyAll() } - def onCompleted(): Unit = synchronized { + def onCompleted(): Unit = responseLock.synchronized { if (finalProducedIndex.nonEmpty) { throw new IllegalStateException("Stream onCompleted can't be called after stream completed") } @@ -148,14 +149,7 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: logDebug( s"Execution opId=${executeHolder.operationId} completed stream. " + s"Last stream index is $lastProducedIndex.") - notifyAll() - } - - /** Attach a new consumer (ExecuteResponseGRPCSender). */ - def attachConsumer(newSender: ExecuteGrpcResponseSender[T]): Unit = synchronized { - // interrupt the current sender before attaching new one - responseSender.foreach(_.interrupt()) - responseSender = Some(newSender) + responseLock.notifyAll() } /** @@ -163,7 +157,7 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: * this response observer assumes that the response is consumed, and the response and previous * response can be uncached, keeping retryBufferSize of responses for the case of retries. */ - def consumeResponse(index: Long): Option[CachedStreamResponse[T]] = synchronized { + def consumeResponse(index: Long): Option[CachedStreamResponse[T]] = responseLock.synchronized { // we index stream responses from 1, getting a lower index would be invalid. assert(index >= 1) // it would be invalid if consumer would skip a response @@ -198,17 +192,17 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: } /** Get the stream error if there is one, otherwise None. */ - def getError(): Option[Throwable] = synchronized { + def getError(): Option[Throwable] = responseLock.synchronized { error } /** If the stream is finished, the index of the last response, otherwise None. */ - def getLastResponseIndex(): Option[Long] = synchronized { + def getLastResponseIndex(): Option[Long] = responseLock.synchronized { finalProducedIndex } /** Get the index in the stream for given response id. 
*/ - def getResponseIndexById(responseId: String): Long = synchronized { + def getResponseIndexById(responseId: String): Long = responseLock.synchronized { responseIdToIndex.getOrElse( responseId, throw new SparkSQLException( @@ -217,7 +211,7 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: } /** Remove cached responses up to and including response with given id. */ - def removeResponsesUntilId(responseId: String): Unit = synchronized { + def removeResponsesUntilId(responseId: String): Unit = responseLock.synchronized { val index = getResponseIndexById(responseId) removeResponsesUntilIndex(index) logDebug( @@ -229,7 +223,7 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: } /** Remove all cached responses */ - def removeAll(): Unit = synchronized { + def removeAll(): Unit = responseLock.synchronized { removeResponsesUntilIndex(lastProducedIndex) logInfo( s"Release all for opId=${executeHolder.operationId}. Execution stats: " + @@ -242,7 +236,7 @@ private[connect] class ExecuteResponseObserver[T <: Message](val executeHolder: } /** Returns if the stream is finished. */ - def completed(): Boolean = synchronized { + def completed(): Boolean = responseLock.synchronized { finalProducedIndex.isDefined } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala index 96ed593e72ff2..ea2bbe0093fcd 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala @@ -195,11 +195,8 @@ private[connect] class ExecuteThreadRunner(executeHolder: ExecuteHolder) extends val responseObserver = executeHolder.responseObserver val command = request.getPlan.getCommand - val planner = new SparkConnectPlanner(executeHolder.sessionHolder) - planner.process( - command = command, - responseObserver = responseObserver, - executeHolder = executeHolder) + val planner = new SparkConnectPlanner(executeHolder) + planner.process(command = command, responseObserver = responseObserver) } private def requestString(request: Message) = { diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala index 546e4446d195b..ddad7da447557 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala @@ -56,7 +56,7 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder) throw new IllegalStateException( s"Illegal operation type ${request.getPlan.getOpTypeCase} to be handled here.") } - val planner = new SparkConnectPlanner(sessionHolder) + val planner = new SparkConnectPlanner(executeHolder) val tracker = executeHolder.eventsManager.createQueryPlanningTracker val dataframe = Dataset.ofRows( diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index eead5cb38ad85..fa964c02a253e 100644 --- 
a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -86,7 +86,18 @@ final case class InvalidCommandInput( private val cause: Throwable = null) extends Exception(message, cause) -class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { +class SparkConnectPlanner( + val sessionHolder: SessionHolder, + val executeHolderOpt: Option[ExecuteHolder] = None) + extends Logging { + + def this(executeHolder: ExecuteHolder) = { + this(executeHolder.sessionHolder, Some(executeHolder)) + } + + if (!executeHolderOpt.forall { e => e.sessionHolder == sessionHolder }) { + throw new IllegalArgumentException("executeHolder does not belong to sessionHolder") + } private[connect] def session: SparkSession = sessionHolder.session @@ -94,6 +105,10 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { private[connect] def sessionId: String = sessionHolder.sessionId + lazy val executeHolder = executeHolderOpt.getOrElse { + throw new IllegalArgumentException("executeHolder is not set") + } + private lazy val pythonExec = sys.env.getOrElse("PYSPARK_PYTHON", sys.env.getOrElse("PYSPARK_DRIVER_PYTHON", "python3")) @@ -2461,48 +2476,39 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { def process( command: proto.Command, - responseObserver: StreamObserver[ExecutePlanResponse], - executeHolder: ExecuteHolder): Unit = { + responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { command.getCommandTypeCase match { case proto.Command.CommandTypeCase.REGISTER_FUNCTION => - handleRegisterUserDefinedFunction(command.getRegisterFunction, executeHolder) + handleRegisterUserDefinedFunction(command.getRegisterFunction) case proto.Command.CommandTypeCase.REGISTER_TABLE_FUNCTION => - handleRegisterUserDefinedTableFunction(command.getRegisterTableFunction, executeHolder) + handleRegisterUserDefinedTableFunction(command.getRegisterTableFunction) case proto.Command.CommandTypeCase.WRITE_OPERATION => - handleWriteOperation(command.getWriteOperation, executeHolder) + handleWriteOperation(command.getWriteOperation) case proto.Command.CommandTypeCase.CREATE_DATAFRAME_VIEW => - handleCreateViewCommand(command.getCreateDataframeView, executeHolder) + handleCreateViewCommand(command.getCreateDataframeView) case proto.Command.CommandTypeCase.WRITE_OPERATION_V2 => - handleWriteOperationV2(command.getWriteOperationV2, executeHolder) + handleWriteOperationV2(command.getWriteOperationV2) case proto.Command.CommandTypeCase.EXTENSION => - handleCommandPlugin(command.getExtension, executeHolder) + handleCommandPlugin(command.getExtension) case proto.Command.CommandTypeCase.SQL_COMMAND => - handleSqlCommand(command.getSqlCommand, responseObserver, executeHolder) + handleSqlCommand(command.getSqlCommand, responseObserver) case proto.Command.CommandTypeCase.WRITE_STREAM_OPERATION_START => - handleWriteStreamOperationStart( - command.getWriteStreamOperationStart, - responseObserver, - executeHolder) + handleWriteStreamOperationStart(command.getWriteStreamOperationStart, responseObserver) case proto.Command.CommandTypeCase.STREAMING_QUERY_COMMAND => - handleStreamingQueryCommand( - command.getStreamingQueryCommand, - responseObserver, - executeHolder) + handleStreamingQueryCommand(command.getStreamingQueryCommand, responseObserver) case proto.Command.CommandTypeCase.STREAMING_QUERY_MANAGER_COMMAND => 
handleStreamingQueryManagerCommand( command.getStreamingQueryManagerCommand, - responseObserver, - executeHolder) + responseObserver) case proto.Command.CommandTypeCase.GET_RESOURCES_COMMAND => - handleGetResourcesCommand(responseObserver, executeHolder) + handleGetResourcesCommand(responseObserver) case _ => throw new UnsupportedOperationException(s"$command not supported.") } } def handleSqlCommand( getSqlCommand: SqlCommand, - responseObserver: StreamObserver[ExecutePlanResponse], - executeHolder: ExecuteHolder): Unit = { + responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { // Eagerly execute commands of the provided SQL string. val args = getSqlCommand.getArgsMap val namedArguments = getSqlCommand.getNamedArgumentsMap @@ -2600,8 +2606,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { } private def handleRegisterUserDefinedFunction( - fun: proto.CommonInlineUserDefinedFunction, - executeHolder: ExecuteHolder): Unit = { + fun: proto.CommonInlineUserDefinedFunction): Unit = { fun.getFunctionCase match { case proto.CommonInlineUserDefinedFunction.FunctionCase.PYTHON_UDF => handleRegisterPythonUDF(fun) @@ -2617,8 +2622,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { } private def handleRegisterUserDefinedTableFunction( - fun: proto.CommonInlineUserDefinedTableFunction, - executeHolder: ExecuteHolder): Unit = { + fun: proto.CommonInlineUserDefinedTableFunction): Unit = { fun.getFunctionCase match { case proto.CommonInlineUserDefinedTableFunction.FunctionCase.PYTHON_UDTF => val function = createPythonUserDefinedTableFunction(fun) @@ -2685,7 +2689,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { session.udf.register(fun.getFunctionName, udf) } - private def handleCommandPlugin(extension: ProtoAny, executeHolder: ExecuteHolder): Unit = { + private def handleCommandPlugin(extension: ProtoAny): Unit = { SparkConnectPluginRegistry.commandRegistry // Lazily traverse the collection. .view @@ -2698,9 +2702,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { executeHolder.eventsManager.postFinished() } - private def handleCreateViewCommand( - createView: proto.CreateDataFrameViewCommand, - executeHolder: ExecuteHolder): Unit = { + private def handleCreateViewCommand(createView: proto.CreateDataFrameViewCommand): Unit = { val viewType = if (createView.getIsGlobal) GlobalTempView else LocalTempView val tableIdentifier = @@ -2736,9 +2738,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { * * @param writeOperation */ - private def handleWriteOperation( - writeOperation: proto.WriteOperation, - executeHolder: ExecuteHolder): Unit = { + private def handleWriteOperation(writeOperation: proto.WriteOperation): Unit = { // Transform the input plan into the logical plan. val plan = transformRelation(writeOperation.getInput) // And create a Dataset from the plan. @@ -2810,9 +2810,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { * * @param writeOperation */ - def handleWriteOperationV2( - writeOperation: proto.WriteOperationV2, - executeHolder: ExecuteHolder): Unit = { + def handleWriteOperationV2(writeOperation: proto.WriteOperationV2): Unit = { // Transform the input plan into the logical plan. val plan = transformRelation(writeOperation.getInput) // And create a Dataset from the plan. 
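The hunks above move the `ExecuteHolder` out of every handler's parameter list and into the planner's constructor: command handlers reach it through a lazy accessor, while plan-only callers construct the planner from a `SessionHolder` and never touch it. The following is a minimal, self-contained sketch of that shape using stand-in types (`SessionHolder`, `ExecuteHolder` and `Planner` here are simplified placeholders, not the Spark Connect classes):

{{{
// Stand-in types, for illustration only.
case class SessionHolder(sessionId: String)
case class ExecuteHolder(sessionHolder: SessionHolder, operationId: String)

class Planner(
    val sessionHolder: SessionHolder,
    val executeHolderOpt: Option[ExecuteHolder] = None) {

  // Auxiliary constructor, mirroring `new SparkConnectPlanner(executeHolder)`.
  def this(executeHolder: ExecuteHolder) =
    this(executeHolder.sessionHolder, Some(executeHolder))

  // Fail fast if the execution does not belong to this session.
  require(
    executeHolderOpt.forall(_.sessionHolder == sessionHolder),
    "executeHolder does not belong to sessionHolder")

  // Command handlers use this accessor; plan-only callers never evaluate it.
  lazy val executeHolder: ExecuteHolder = executeHolderOpt.getOrElse(
    throw new IllegalArgumentException("executeHolder is not set"))
}

object PlannerSketch extends App {
  val session = SessionHolder("session-1")
  val execution = ExecuteHolder(session, "operation-1")
  val commandPlanner = new Planner(execution) // may read commandPlanner.executeHolder
  val planOnlyPlanner = new Planner(session)  // evaluating executeHolder here would throw
}
}}}

One consequence worth noting: because the accessor is lazy, constructing a planner without an `ExecuteHolder` stays legal, and the failure only surfaces if a command handler actually needs the execution state.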
@@ -2873,8 +2871,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { def handleWriteStreamOperationStart( writeOp: WriteStreamOperationStart, - responseObserver: StreamObserver[ExecutePlanResponse], - executeHolder: ExecuteHolder): Unit = { + responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { val plan = transformRelation(writeOp.getInput) val tracker = executeHolder.eventsManager.createQueryPlanningTracker val dataset = Dataset.ofRows(session, plan, tracker) @@ -2999,8 +2996,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { def handleStreamingQueryCommand( command: StreamingQueryCommand, - responseObserver: StreamObserver[ExecutePlanResponse], - executeHolder: ExecuteHolder): Unit = { + responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { val id = command.getQueryId.getId val runId = command.getQueryId.getRunId @@ -3177,8 +3173,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { def handleStreamingQueryManagerCommand( command: StreamingQueryManagerCommand, - responseObserver: StreamObserver[ExecutePlanResponse], - executeHolder: ExecuteHolder): Unit = { + responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { val respBuilder = StreamingQueryManagerCommandResult.newBuilder() command.getCommandCase match { @@ -3257,8 +3252,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging { } def handleGetResourcesCommand( - responseObserver: StreamObserver[proto.ExecutePlanResponse], - executeHolder: ExecuteHolder): Unit = { + responseObserver: StreamObserver[proto.ExecutePlanResponse]): Unit = { executeHolder.eventsManager.postFinished() responseObserver.onNext( proto.ExecutePlanResponse diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala index 0593edc2f6fda..eed8cc01f7c66 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/ExecuteHolder.scala @@ -164,6 +164,10 @@ private[connect] class ExecuteHolder( private def addGrpcResponseSender( sender: ExecuteGrpcResponseSender[proto.ExecutePlanResponse]) = synchronized { if (closedTime.isEmpty) { + // Interrupt all other senders - there can be only one active sender. + // Interrupted senders will remove themselves with removeGrpcResponseSender when they exit. + grpcResponseSenders.foreach(_.interrupt()) + // And add this one. 
grpcResponseSenders += sender lastAttachedRpcTime = None } else { diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala index eb84dfc4e3df8..dfada825df47d 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala @@ -58,8 +58,8 @@ trait SparkConnectPlanTest extends SharedSparkSession { def transform(cmd: proto.Command): Unit = { val executeHolder = buildExecutePlanHolder(cmd) - new SparkConnectPlanner(executeHolder.sessionHolder) - .process(cmd, new MockObserver(), executeHolder) + new SparkConnectPlanner(executeHolder) + .process(cmd, new MockObserver()) } def readRel: proto.Relation = @@ -148,7 +148,7 @@ class SparkConnectPlannerSuite extends SparkFunSuite with SparkConnectPlanTest { test("Simple Limit") { assertThrows[IndexOutOfBoundsException] { - new SparkConnectPlanner(None.orNull) + new SparkConnectPlanner(SessionHolder.forTesting(None.orNull)) .transformRelation( proto.Relation.newBuilder .setLimit(proto.Limit.newBuilder.setLimit(10)) @@ -159,10 +159,11 @@ class SparkConnectPlannerSuite extends SparkFunSuite with SparkConnectPlanTest { test("InvalidInputs") { // No Relation Set intercept[IndexOutOfBoundsException]( - new SparkConnectPlanner(None.orNull).transformRelation(proto.Relation.newBuilder().build())) + new SparkConnectPlanner(SessionHolder.forTesting(None.orNull)) + .transformRelation(proto.Relation.newBuilder().build())) intercept[InvalidPlanInput]( - new SparkConnectPlanner(None.orNull) + new SparkConnectPlanner(SessionHolder.forTesting(None.orNull)) .transformRelation( proto.Relation.newBuilder.setUnknown(proto.Unknown.newBuilder().build()).build())) } diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala index fdb9032379419..ea9ae3ed9d9c1 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala @@ -196,8 +196,8 @@ class SparkConnectPluginRegistrySuite extends SharedSparkSession with SparkConne .build() val executeHolder = buildExecutePlanHolder(plan) - new SparkConnectPlanner(executeHolder.sessionHolder) - .process(plan, new MockObserver(), executeHolder) + new SparkConnectPlanner(executeHolder) + .process(plan, new MockObserver()) assert(spark.sparkContext.getLocalProperty("testingProperty").equals("Martin")) } } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index cc4370ad02e06..5c1887be5b8b3 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -153,7 +153,11 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") {
             <li><strong>Drivers:</strong>
               {state.activeDrivers.length} Running
               ({state.activeDrivers.count(_.state == DriverState.SUBMITTED)} Waiting),
-              {state.completedDrivers.length} Completed</li>
+              {state.completedDrivers.length} Completed
+              ({state.completedDrivers.count(_.state == DriverState.KILLED)} Killed,
+              {state.completedDrivers.count(_.state == DriverState.FAILED)} Failed,
+              {state.completedDrivers.count(_.state == DriverState.ERROR)} Error)
+            </li>
             <li><strong>Status:</strong> {state.status}</li>
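The MasterPage hunk above derives each new figure with a separate `count` over `completedDrivers`, which is simple and cheap for the handful of terminal states involved. If more states were ever reported, the same breakdown could be computed in a single pass; the snippet below shows that alternative with stand-in types (`DriverState` and `DriverInfo` here are simplified placeholders, not the Spark classes):

{{{
// Stand-ins, for illustration only.
object DriverState extends Enumeration {
  val SUBMITTED, RUNNING, FINISHED, KILLED, FAILED, ERROR = Value
}
case class DriverInfo(id: String, state: DriverState.Value)

object DriverCountsSketch extends App {
  val completedDrivers = Seq(
    DriverInfo("driver-1", DriverState.FINISHED),
    DriverInfo("driver-2", DriverState.KILLED),
    DriverInfo("driver-3", DriverState.FAILED))

  // One traversal instead of one count(...) per state.
  val countsByState: Map[DriverState.Value, Int] =
    completedDrivers.groupBy(_.state).view.mapValues(_.size).toMap

  val killed = countsByState.getOrElse(DriverState.KILLED, 0)
  val failed = countsByState.getOrElse(DriverState.FAILED, 0)
  val errored = countsByState.getOrElse(DriverState.ERROR, 0)
  println(s"Killed=$killed, Failed=$failed, Error=$errored")
}
}}}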
diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala index 95d2bdc39e133..856e639fcd9ae 100644 --- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala @@ -18,6 +18,7 @@ package org.apache.spark.serializer import java.io._ +import java.lang.reflect.{InvocationHandler, Method, Proxy} import java.nio.ByteBuffer import scala.reflect.ClassTag @@ -79,7 +80,7 @@ private[spark] class JavaDeserializationStream(in: InputStream, loader: ClassLoa // scalastyle:off classforname val resolved = ifaces.map(iface => Class.forName(iface, false, loader)) // scalastyle:on classforname - java.lang.reflect.Proxy.getProxyClass(loader, resolved: _*) + Proxy.newProxyInstance(loader, resolved, DummyInvocationHandler).getClass } } @@ -88,6 +89,12 @@ private[spark] class JavaDeserializationStream(in: InputStream, loader: ClassLoa def close(): Unit = { objIn.close() } } +private[spark] object DummyInvocationHandler extends InvocationHandler { + override def invoke(proxy: Any, method: Method, args: Array[AnyRef]): AnyRef = { + throw new UnsupportedOperationException("Not implemented") + } +} + private object JavaDeserializationStream { val primitiveMappings = Map[String, Class[_]]( diff --git a/core/src/main/scala/org/apache/spark/util/collection/Utils.scala b/core/src/main/scala/org/apache/spark/util/collection/Utils.scala index 436899448d633..1695f06c35be9 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/Utils.scala @@ -56,7 +56,7 @@ private[spark] object Utils extends SparkCollectionUtils { override def compare(l: T, r: T): Int = ord.compare(l, r) } GuavaIterators.mergeSorted( - inputs.map(_.toIterator.asJava).asJava, ordering).asScala + inputs.map(_.iterator.asJava).asJava, ordering).asScala } /** diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala index 44dc9a5f97dab..842a26148b4e6 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala @@ -161,7 +161,7 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { def resultHandler(x: Int, y: Unit): Unit = {} val futureAction: SimpleFutureAction[Unit] = sc.submitJob[Int, Unit, Unit](rdd, OutputCommitFunctions(tempDir.getAbsolutePath).commitSuccessfully, - 0 until rdd.partitions.size, resultHandler, () => ()) + 0 until rdd.partitions.size, resultHandler, ()) // It's an error if the job completes successfully even though no committer was authorized, // so throw an exception if the job was allowed to complete.
intercept[TimeoutException] { diff --git a/core/src/test/scala/org/apache/spark/util/CompletionIteratorSuite.scala b/core/src/test/scala/org/apache/spark/util/CompletionIteratorSuite.scala index 29421f7aa9e36..297e4fd53ab4f 100644 --- a/core/src/test/scala/org/apache/spark/util/CompletionIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/CompletionIteratorSuite.scala @@ -57,13 +57,13 @@ class CompletionIteratorSuite extends SparkFunSuite { sub = null iter.toArray - for (_ <- 1 to 100 if !ref.isEnqueued) { + for (_ <- 1 to 100 if !ref.refersTo(null)) { System.gc() - if (!ref.isEnqueued) { + if (!ref.refersTo(null)) { Thread.sleep(10) } } - assert(ref.isEnqueued) + assert(ref.refersTo(null)) assert(refQueue.poll() === ref) } } diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 774422a9cd9d1..9fb823abaa3ab 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -2385,6 +2385,11 @@ Here are the configs regarding to RocksDB instance of the state store provider: Total memory to be occupied by blocks in high priority pool as a fraction of memory allocated across all RocksDB instances on a single node using maxMemoryUsageMB. 0.1 + + spark.sql.streaming.stateStore.rocksdb.allowFAllocate + Allow the rocksdb runtime to use fallocate to pre-allocate disk space for logs, etc... Disable for apps that have many smaller state stores to trade off disk space for write performance. + true + ##### RocksDB State Store Memory Management diff --git a/pom.xml b/pom.xml index 3e43bc047079d..e3a19257c8c12 100644 --- a/pom.xml +++ b/pom.xml @@ -2970,9 +2970,6 @@ -Wconf:cat=scaladoc:wv -Wconf:cat=lint-multiarg-infix:wv -Wconf:cat=other-nullary-override:wv - -Wconf:cat=other-match-analysis&site=org.apache.spark.sql.catalyst.catalog.SessionCatalog.lookupFunction.catalogFunction:wv - -Wconf:cat=other-pure-statement&site=org.apache.spark.streaming.util.FileBasedWriteAheadLog.readAll.readFile:wv - -Wconf:cat=other-pure-statement&site=org.apache.spark.scheduler.OutputCommitCoordinatorSuite:wv
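Circling back to the `spark.sql.streaming.stateStore.rocksdb.allowFAllocate` entry added to the streaming guide above: the flag defaults to `true` and is only consulted when the RocksDB state store provider is enabled. Below is a minimal sketch of a session that opts out of fallocate, assuming the RocksDB provider class name as documented for Spark's RocksDB state store (shown purely as an illustration, not as a recommended default):

{{{
import org.apache.spark.sql.SparkSession

object RocksDBFAllocateSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("rocksdb-fallocate-sketch")
      .master("local[2]")
      // Use the RocksDB-backed state store for streaming state.
      .config(
        "spark.sql.streaming.stateStore.providerClass",
        "org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider")
      // Trade some write performance for less pre-allocated disk space.
      .config("spark.sql.streaming.stateStore.rocksdb.allowFAllocate", "false")
      .getOrCreate()

    // ... define and start the streaming query here ...
    spark.stop()
  }
}
}}}

Per the new table entry, disabling fallocate trades write performance for reduced disk usage, which matters mostly when a job maintains many small state stores.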