[Spark][1.0] Fix a data loss bug in MergeIntoCommand #2128

Open · wants to merge 4 commits into base: branch-1.0
37 changes: 20 additions & 17 deletions core/src/main/scala/io/delta/tables/DeltaMergeBuilder.scala
@@ -20,6 +20,7 @@ import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.sql.delta.{DeltaErrors, PreprocessTableMerge}
import org.apache.spark.sql.delta.DeltaTableUtils.withActiveSession
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.spark.sql.delta.util.AnalysisHelper

@@ -203,24 +204,26 @@ class DeltaMergeBuilder private(
*/
def execute(): Unit = improveUnsupportedOpError {
val sparkSession = targetTable.toDF.sparkSession
// Note: We are explicitly resolving DeltaMergeInto plan rather than going to through the
// Analyzer using `Dataset.ofRows()` because the Analyzer incorrectly resolves all
// references in the DeltaMergeInto using both source and target child plans, even before
// DeltaAnalysis rule kicks in. This is because the Analyzer understands only MergeIntoTable,
// and handles that separately by skipping resolution (for Delta) and letting the
// DeltaAnalysis rule do the resolving correctly. This can be solved by generating
// MergeIntoTable instead, which blocked by the different issue with MergeIntoTable as explained
// in the function `mergePlan` and https://issues.apache.org/jira/browse/SPARK-34962.
val resolvedMergeInto =
DeltaMergeInto.resolveReferences(mergePlan, sparkSession.sessionState.conf)(
tryResolveReferences(sparkSession) _)
if (!resolvedMergeInto.resolved) {
throw DeltaErrors.analysisException("Failed to resolve\n", plan = Some(resolvedMergeInto))
withActiveSession(sparkSession) {
// Analyzer using `Dataset.ofRows()` because the Analyzer incorrectly resolves all
// references in the DeltaMergeInto using both source and target child plans, even before
// DeltaAnalysis rule kicks in. This is because the Analyzer understands only MergeIntoTable,
// and handles that separately by skipping resolution (for Delta) and letting the
// DeltaAnalysis rule do the resolving correctly. This can be solved by generating
// MergeIntoTable instead, which blocked by the different issue with MergeIntoTable as
// explained in the function `mergePlan` and
// https://issues.apache.org/jira/browse/SPARK-34962.
val resolvedMergeInto =
DeltaMergeInto.resolveReferences(mergePlan, sparkSession.sessionState.conf)(
tryResolveReferences(sparkSession) _)
if (!resolvedMergeInto.resolved) {
throw DeltaErrors.analysisException("Failed to resolve\n", plan = Some(resolvedMergeInto))
}
// Preprocess the actions and verify
val mergeIntoCommand = PreprocessTableMerge(sparkSession.sessionState.conf)(resolvedMergeInto)
sparkSession.sessionState.analyzer.checkAnalysis(mergeIntoCommand)
mergeIntoCommand.run(sparkSession)
}
// Preprocess the actions and verify
val mergeIntoCommand = PreprocessTableMerge(sparkSession.sessionState.conf)(resolvedMergeInto)
sparkSession.sessionState.analyzer.checkAnalysis(mergeIntoCommand)
mergeIntoCommand.run(sparkSession)
}
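A minimal usage sketch of the situation this wrapping addresses, assuming a table bound to a second session `spark2` and illustrative paths; it mirrors the test added later in this PR and uses only the public DeltaTable merge API:

```scala
// Sketch only: the DeltaTable is bound to spark2, not to the default/active session.
// Before this change, execute() could resolve and run parts of the merge under whatever
// session happened to be active on the thread, ignoring spark2's SQL conf.
import io.delta.tables.DeltaTable

val spark2 = spark.newSession()
spark2.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

val target = DeltaTable.forPath(spark2, "/tmp/delta/target")                  // assumed path
val source = spark2.read.format("delta").load("/tmp/delta/source").alias("s") // assumed path

target.alias("t")
  .merge(source, "t.id = s.id")
  .whenMatched.updateExpr(Map("t.x" -> "t.x + 1"))
  .whenNotMatched.insertAll()
  .execute() // with withActiveSession(sparkSession), this now runs entirely under spark2
```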

/**
3 changes: 2 additions & 1 deletion core/src/main/scala/io/delta/tables/DeltaTable.scala
@@ -19,6 +19,7 @@ package io.delta.tables
import scala.collection.JavaConverters._

import org.apache.spark.sql.delta._
import org.apache.spark.sql.delta.DeltaTableUtils.withActiveSession
import org.apache.spark.sql.delta.actions.Protocol
import org.apache.spark.sql.delta.catalog.DeltaTableV2
import org.apache.spark.sql.delta.sources.DeltaSQLConf
@@ -806,7 +807,7 @@ object DeltaTable {
* @since 1.0.0
*/
@Evolving
def createOrReplace(spark: SparkSession): DeltaTableBuilder = {
def createOrReplace(spark: SparkSession): DeltaTableBuilder = withActiveSession(spark) {
new DeltaTableBuilder(spark, ReplaceTableOptions(orCreate = true))
}

5 changes: 3 additions & 2 deletions core/src/main/scala/io/delta/tables/DeltaTableBuilder.scala
@@ -19,6 +19,7 @@ package io.delta.tables
import scala.collection.mutable

import org.apache.spark.sql.delta.{DeltaErrors, DeltaTableUtils}
import org.apache.spark.sql.delta.DeltaTableUtils.withActiveSession
import io.delta.tables.execution._

import org.apache.spark.annotation._
@@ -295,7 +296,7 @@ class DeltaTableBuilder private[tables](
* @since 1.0.0
*/
@Evolving
def execute(): DeltaTable = {
def execute(): DeltaTable = withActiveSession(spark) {
if (identifier == null && location.isEmpty) {
throw DeltaErrors.analysisException("Table name or location has to be specified")
}
@@ -357,7 +358,7 @@ class DeltaTableBuilder private[tables](

// Return DeltaTable Object.
if (DeltaTableUtils.isValidPath(tableId)) {
DeltaTable.forPath(location.get)
DeltaTable.forPath(location.get)
} else {
DeltaTable.forName(this.identifier)
}
@@ -16,6 +16,7 @@

package io.delta.tables.execution

import org.apache.spark.sql.delta.DeltaTableUtils.withActiveSession
import org.apache.spark.sql.delta.commands.ConvertToDeltaCommand
import io.delta.tables.DeltaTable

@@ -28,7 +29,7 @@ trait DeltaConvertBase {
spark: SparkSession,
tableIdentifier: TableIdentifier,
partitionSchema: Option[StructType],
deltaPath: Option[String]): DeltaTable = {
deltaPath: Option[String]): DeltaTable = withActiveSession(spark) {
val cvt = ConvertToDeltaCommand(tableIdentifier, partitionSchema, deltaPath)
cvt.run(spark)
if (cvt.isCatalogTable(spark.sessionState.analyzer, tableIdentifier)) {
@@ -19,6 +19,7 @@ package io.delta.tables.execution
import scala.collection.Map

import org.apache.spark.sql.delta.{DeltaErrors, DeltaHistoryManager, DeltaLog, PreprocessTableUpdate}
import org.apache.spark.sql.delta.DeltaTableUtils.withActiveSession
import org.apache.spark.sql.delta.commands.{DeleteCommand, DeltaGenerateCommand, VacuumCommand}
import org.apache.spark.sql.delta.util.AnalysisHelper
import io.delta.tables.DeltaTable
@@ -35,42 +36,48 @@ import org.apache.spark.sql.catalyst.plans.logical._
trait DeltaTableOperations extends AnalysisHelper { self: DeltaTable =>

protected def executeDelete(condition: Option[Expression]): Unit = improveUnsupportedOpError {
val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition)
toDataset(sparkSession, delete)
withActiveSession(sparkSession) {
val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition)
toDataset(sparkSession, delete)
}
}

protected def executeHistory(
deltaLog: DeltaLog,
limit: Option[Int] = None,
tableId: Option[TableIdentifier] = None): DataFrame = {
tableId: Option[TableIdentifier] = None): DataFrame = withActiveSession(sparkSession) {
val history = deltaLog.history
val spark = self.toDF.sparkSession
spark.createDataFrame(history.getHistory(limit))
sparkSession.createDataFrame(history.getHistory(limit))
}

protected def executeGenerate(tblIdentifier: String, mode: String): Unit = {
val tableId: TableIdentifier = sparkSession
.sessionState
.sqlParser
.parseTableIdentifier(tblIdentifier)
val generate = DeltaGenerateCommand(mode, tableId)
toDataset(sparkSession, generate)
withActiveSession(sparkSession) {
val tableId: TableIdentifier = sparkSession
.sessionState
.sqlParser
.parseTableIdentifier(tblIdentifier)
val generate = DeltaGenerateCommand(mode, tableId)
toDataset(sparkSession, generate)
}
}

protected def executeUpdate(
set: Map[String, Column],
condition: Option[Column]): Unit = improveUnsupportedOpError {
val assignments = set.map { case (targetColName, column) =>
Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr)
}.toSeq
val update = UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr))
toDataset(sparkSession, update)
withActiveSession(sparkSession) {
val assignments = set.map { case (targetColName, column) =>
Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr)
}.toSeq
val update =
UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr))
toDataset(sparkSession, update)
}
}

protected def executeVacuum(
deltaLog: DeltaLog,
retentionHours: Option[Double],
tableId: Option[TableIdentifier] = None): DataFrame = {
tableId: Option[TableIdentifier] = None): DataFrame = withActiveSession(sparkSession) {
VacuumCommand.gc(sparkSession, deltaLog, false, retentionHours)
sparkSession.emptyDataFrame
}
@@ -356,4 +356,7 @@ object DeltaTableUtils extends PredicateHelper
def parseColToTransform(col: String): IdentityTransform = {
IdentityTransform(FieldReference(Seq(col)))
}

// Workaround for withActive not being visible in io/delta.
def withActiveSession[T](spark: SparkSession)(body: => T): T = spark.withActive(body)
}
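`withActive` itself is a Spark-side helper that is not visible from the `io.delta` packages, which is why the shim above exists. A rough, hand-written approximation of what it is assumed to do (illustrative only, not the actual Spark implementation):

```scala
import org.apache.spark.sql.SparkSession

// Illustrative approximation: pin `spark` as the thread-local active session while
// `body` runs, then restore whatever session (if any) was active before.
def withActiveSessionSketch[T](spark: SparkSession)(body: => T): T = {
  val previous = SparkSession.getActiveSession // remember the prior active session
  SparkSession.setActiveSession(spark)         // code inside `body` now resolves against `spark`
  try {
    body
  } finally {
    previous match {
      case Some(s) => SparkSession.setActiveSession(s)
      case None    => SparkSession.clearActiveSession()
    }
  }
}
```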
@@ -2919,4 +2919,33 @@ abstract class MergeIntoSuiteBase
customConditionErrorRegex =
Option("Aggregate functions are not supported in the .* condition of MERGE operation.*")
)

test("Merge should use the same SparkSession consistently") {
withTempDir { dir =>
withSQLConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> "false") {
val r = dir.getCanonicalPath
val sourcePath = s"$r/source"
val targetPath = s"$r/target"
val numSourceRecords = 20
spark.range(numSourceRecords)
.withColumn("x", $"id")
.withColumn("y", $"id")
.write.mode("overwrite").format("delta").save(sourcePath)
spark.range(1)
.withColumn("x", $"id")
.write.mode("overwrite").format("delta").save(targetPath)
val spark2 = spark.newSession
spark2.conf.set(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key, "true")
val target = io.delta.tables.DeltaTable.forPath(spark2, targetPath)
val source = spark.read.format("delta").load(sourcePath).alias("s")
val merge = target.alias("t")
.merge(source, "t.id = s.id")
.whenMatched.updateExpr(Map("t.x" -> "t.x + 1"))
.whenNotMatched.insertAll()
.execute()
// The target table should have the same number of rows as the source after the merge
assert(spark.read.format("delta").load(targetPath).count() == numSourceRecords)
}
}
}
}
30 changes: 30 additions & 0 deletions python/delta/tests/test_deltatable.py
@@ -16,6 +16,7 @@

import unittest
import os
from multiprocessing.pool import ThreadPool

from pyspark.sql import Row
from pyspark.sql.column import _to_seq
@@ -279,6 +280,35 @@ def reset_table():
dt.merge(source, "key = k").whenNotMatchedInsert(
values="k = 'a'", condition={"value": 1})

def test_merge_with_inconsistent_sessions(self) -> None:
source_path = os.path.join(self.tempFile, "source")
target_path = os.path.join(self.tempFile, "target")
spark = self.spark

def f(spark):
spark.range(20) \
.withColumn("x", col("id")) \
.withColumn("y", col("id")) \
.write.mode("overwrite").format("delta").save(source_path)
spark.range(1) \
.withColumn("x", col("id")) \
.write.mode("overwrite").format("delta").save(target_path)
target = DeltaTable.forPath(spark, target_path)
source = spark.read.format("delta").load(source_path).alias("s")
target.alias("t") \
.merge(source, "t.id = s.id") \
.whenMatchedUpdate(set={"t.x": "t.x + 1"}) \
.whenNotMatchedInsertAll() \
.execute()
assert(spark.read.format("delta").load(target_path).count() == 20)

pool = ThreadPool(3)
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")
try:
pool.starmap(f, [(spark,)])
finally:
spark.conf.unset("spark.databricks.delta.schema.autoMerge.enabled")

def test_history(self):
self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
self.__overwriteDeltaTable([('a', 3), ('b', 2), ('c', 1)])