[Spark] Support external DSV2 catalog in Vacuum command #2039
@@ -0,0 +1,48 @@
/*
 * Copyright (2023) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.analysis.{ResolvedTable, UnresolvedTable}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.{CatalogHelper, MultipartIdentifierHelper}
import org.apache.spark.sql.delta.catalog.DeltaTableV2

/**
 * Replaces [[UnresolvedTable]]s if the plan is for a direct query on files.
 */
case class ResolveDeltaPathTable(sparkSession: SparkSession) extends Rule[LogicalPlan] {

  private def maybeSQLFile(u: UnresolvedTable): Boolean = {
    sparkSession.sessionState.conf.runSQLonFile && u.multipartIdentifier.size == 2
  }

  override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators {
    case u: UnresolvedTable if maybeSQLFile(u) =>
      val tableId = u.multipartIdentifier.asTableIdentifier
      if (DeltaTableUtils.isValidPath(tableId)) {
        val deltaTableV2 = DeltaTableV2(sparkSession, new Path(tableId.table))
        val sessionCatalog =
          sparkSession.sessionState.catalogManager.v2SessionCatalog.asTableCatalog
        ResolvedTable.create(sessionCatalog, u.multipartIdentifier.asIdentifier, deltaTableV2)
      } else {
        u
      }
  }
}
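
For context, a minimal sketch of the rule in action. The path, the direct UnresolvedTable construction, and the three-argument signature (the Spark 3.4-era form) are illustrative assumptions, not code from this PR; `spark` is an in-scope SparkSession such as the one a test suite provides.

import org.apache.spark.sql.catalyst.analysis.{ResolvedTable, UnresolvedTable}

// With spark.sql.runSQLOnFiles enabled, VACUUM delta.`/tmp/events` parses into
// an UnresolvedTable whose two-part identifier is Seq("delta", "/tmp/events").
// The rule replaces it with a ResolvedTable wrapping a path-based DeltaTableV2.
val unresolved = UnresolvedTable(Seq("delta", "/tmp/events"), "VACUUM", None)
val resolved = ResolveDeltaPathTable(spark).apply(unresolved)
assert(resolved.isInstanceOf[ResolvedTable])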
@@ -0,0 +1,127 @@
/*
 * Copyright (2023) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.sql.delta

import io.delta.tables.execution.VacuumTableCommand

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{QueryTest, SparkSession}
import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog, TableChange}
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.delta.catalog.DeltaTableV2
import org.apache.spark.sql.delta.commands.VacuumCommand.getDeltaTable
import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.Utils

class CustomCatalogSuite extends QueryTest with SharedSparkSession
  with DeltaSQLCommandTest {

  override def sparkConf: SparkConf =
    super.sparkConf.set("spark.sql.catalog.dummy", classOf[DummyCatalog].getName)

  private def verifyVacuumPath(query: String, expected: Path): Unit = {
    val plan = sql(query).queryExecution.analyzed
    assert(plan.isInstanceOf[VacuumTableCommand])
    val path = getDeltaTable(plan.asInstanceOf[VacuumTableCommand].child, "VACUUM").path
    assert(path == expected)
  }

  test("Vacuum a table from DummyCatalog") {
    val tableName = "vacuum_table"
    withTable(tableName) {
      sql("SET CATALOG dummy")
      val dummyCatalog =
        spark.sessionState.catalogManager.catalog("dummy").asInstanceOf[DummyCatalog]
      sql(s"CREATE TABLE $tableName (id bigint) USING delta")
      val tablePath = dummyCatalog.getTablePath(tableName)
      verifyVacuumPath(s"VACUUM $tableName", tablePath)
      verifyVacuumPath(s"VACUUM delta.`$tablePath`", tablePath)

      sql("SET CATALOG spark_catalog")
      verifyVacuumPath(s"VACUUM dummy.default.$tableName", tablePath)
    }
  }
}

class DummyCatalog extends TableCatalog {
  private val spark: SparkSession = SparkSession.active
  private val tempDir: Path = new Path(Utils.createTempDir().getAbsolutePath)
  // scalastyle:off deltahadoopconfiguration
  private val fs: FileSystem = tempDir.getFileSystem(spark.sessionState.newHadoopConf())
  // scalastyle:on deltahadoopconfiguration

  override def name: String = "dummy"

  def getTablePath(tableName: String): Path = {
    new Path(tempDir, tableName)
  }

  override def defaultNamespace(): Array[String] = Array("default")

  override def listTables(namespace: Array[String]): Array[Identifier] = {
    val status = fs.listStatus(tempDir)
    status.filter(_.isDirectory).map { dir =>
      Identifier.of(namespace, dir.getPath.getName)
    }
  }

  override def tableExists(ident: Identifier): Boolean = {
    val tablePath = getTablePath(ident.name())
    fs.exists(tablePath)
  }

  override def loadTable(ident: Identifier): Table = {
    val tablePath = getTablePath(ident.name())
    DeltaTableV2(spark, tablePath)
  }

  override def createTable(
      ident: Identifier,
      schema: StructType,
      partitions: Array[Transform],
      properties: java.util.Map[String, String]): Table = {
    val tablePath = getTablePath(ident.name())
    // Create an empty Delta table at the tablePath.
    spark.range(0).write.format("delta").save(tablePath.toString)
    DeltaTableV2(spark, tablePath)
  }

  override def alterTable(ident: Identifier, changes: TableChange*): Table = {
    throw new UnsupportedOperationException("Alter table operation is not supported.")
  }

  override def dropTable(ident: Identifier): Boolean = {
    val tablePath = getTablePath(ident.name())
    try {
      fs.delete(tablePath, true)
      true
    } catch {
      case _: Exception => false
    }
  }

  override def renameTable(oldIdent: Identifier, newIdent: Identifier): Unit = {
    throw new UnsupportedOperationException("Rename table operation is not supported.")
  }

  override def initialize(name: String, options: CaseInsensitiveStringMap): Unit = {
    // Create tempDir if it does not exist yet.
    if (!fs.exists(tempDir)) {
      fs.mkdirs(tempDir)
    }
  }
}
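
For a sense of what the change enables end to end, a usage sketch. The session wiring below is an illustrative assumption for a standalone application (the suite above gets its session from SharedSparkSession); the statements mirror the test.

import org.apache.spark.sql.SparkSession

// Register the DSV2 catalog under the name "dummy" and vacuum a table that
// lives only in that catalog. Assumes DummyCatalog and the Delta extensions
// are on the classpath.
val spark = SparkSession.builder()
  .master("local[*]")
  .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
  .config("spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog")
  .config("spark.sql.catalog.dummy", classOf[DummyCatalog].getName)
  .getOrCreate()

spark.sql("CREATE TABLE dummy.default.vacuum_table (id BIGINT) USING delta")
// Previously VACUUM resolved identifiers only against the session catalog;
// with this change the three-part name resolves through the external catalog.
spark.sql("VACUUM dummy.default.vacuum_table RETAIN 168 HOURS")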
@@ -498,8 +498,7 @@ class DeltaVacuumSuite
       val e = intercept[AnalysisException] {
         vacuumSQLTest(tablePath, viewName)
       }
-      assert(e.getMessage.contains("not found") ||
-        e.getMessage.contains("TABLE_OR_VIEW_NOT_FOUND"))
+      assert(e.getMessage.contains("v is a temp view. 'VACUUM' expects a table."))
     }
   }
 }

Review comment: The error message here is improved.

Review comment: Shouldn't we be checking for an error class, rather than specific strings?

Author reply: The error class from Spark is a temporary one and it won't be displayed. We can check it once it is assigned a dedicated name.
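
Once Spark assigns that class a stable name, the assertion could move off message strings entirely. A hypothetical sketch using the checkError helper from Spark 3.4+ test utilities; the error class name and parameters below are invented placeholders, not the real values:

// Hypothetical, pending a stable error class from Spark:
val e = intercept[AnalysisException] {
  vacuumSQLTest(tablePath, viewName)
}
checkError(
  exception = e,
  errorClass = "EXPECT_TABLE_NOT_TEMP_VIEW", // placeholder, not the real class
  parameters = Map("viewName" -> "v", "operation" -> "VACUUM"))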
@@ -788,7 +787,7 @@ class DeltaVacuumSuite
         sql(s"vacuum '$path/v2=a' retain 0 hours")
       }
       assert(ex.getMessage.contains(
-        s"Please provide the base path ($path) when Vacuuming Delta tables."))
+        s"`$path/v2=a` is not a Delta table. VACUUM is only supported for Delta tables."))
     }
   }
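
The scenario behind the new assertion (paths below are illustrative): the table root holds the _delta_log, so a partition directory under it is not itself a Delta table, and vacuuming it now fails with the clearer "not a Delta table" message instead of the old "provide the base path" hint.

// /tmp/tbl is a Delta table partitioned by v2; /tmp/tbl/v2=a has no
// _delta_log of its own, so VACUUM rejects it.
spark.sql("VACUUM '/tmp/tbl/v2=a' RETAIN 0 HOURS") // throws AnalysisException
spark.sql("VACUUM '/tmp/tbl' RETAIN 168 HOURS")    // vacuum the table root instead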
@@ -993,9 +992,7 @@ class DeltaVacuumSuite
       val e = intercept[AnalysisException] {
         sql(s"vacuum $table")
       }
-      Seq("VACUUM", "only supported for Delta tables").foreach { msg =>
-        assert(e.getMessage.contains(msg))
-      }
+      assert(e.getMessage.contains("is not a Delta table."))
     }
   }
   withTempPath { tempDir =>
Review comment: This seems to copy ideas from ResolveSQLOnFile in Spark? Is there a reason we can't leverage that here, and let DeltaDataSource.getTable produce the DeltaTableV2 we need?

Review comment: Ugh, UnresolvedTable != UnresolvedRelation, and it looks like the data source code uses UnresolvedRelation while UnresolvedPathBasedDeltaTable uses UnresolvedTable.

Author reply:
- The command goes through UnresolvedDeltaPathOrIdentifier, which produces UnresolvedTable for table identifiers (including the file-path form delta.`path`).
- With UnresolvedRelation as the child of VacuumTableCommand, the resolved relation from Apache Spark will be a Parquet data source relation. There is some issue with my debugger and I haven't figured out the reason.
- In ResolveRelations, both UnresolvedTable and UnresolvedRelation are processed. UnresolvedTable always results in ResolvedTable, while UnresolvedRelation results in SubqueryAlias with various nodes. I think using UnresolvedTable is simpler here. Any reason why we should use UnresolvedRelation?

Review comment: Yeah, UnresolvedRelation only makes sense if it allows us to reuse existing machinery in some way. But resolving to a Parquet data source relation is... awkward. Tho I've noticed that the file index for Delta is a Parquet source because that's the physical file format Delta reads. Is there no trace of Delta in the resulting scan node, tho?
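
To make the contrast concrete, a rough sketch (simplified, not code from this PR; assumes the four-field ResolvedTable of recent Spark versions) of why a ResolvedTable child is convenient for VACUUM:

import org.apache.spark.sql.catalyst.analysis.ResolvedTable
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.delta.catalog.DeltaTableV2

// Once the analyzer turns UnresolvedTable into ResolvedTable, the command can
// pattern-match the DeltaTableV2 out directly, regardless of which catalog
// (session, path-based, or external DSV2) produced it. UnresolvedRelation
// would instead resolve to a SubqueryAlias over a relation node that the
// command would have to unwrap.
def deltaTableFrom(child: LogicalPlan): DeltaTableV2 = child match {
  case ResolvedTable(_, _, d: DeltaTableV2, _) => d
  case other => throw new IllegalArgumentException(
    s"${other.nodeName} is not a Delta table. VACUUM is only supported for Delta tables.")
}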