Skip to content

Commit

Permalink
Set the temporaryGcsBucket to default to fs.gs.system.bucket (#1320)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidrabinowitz authored Dec 3, 2024
1 parent a4c20e3 commit b842be6
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 4 deletions.
3 changes: 2 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
## Next
* Issue #1290: Stopped using metadata for optimized count path
* Issue #1317: Improving OpenLineage 1.24.0+ compatibility
* PR #1311 : Improve read session expired error message
* PR #1311: Improve read session expired error message
* PR #1320: Set the `temporaryGcsBucket` to default to `fs.gs.system.bucket` if it exists, eliminating the need to set it on Dataproc clusters.

## 0.41.0 - 2024-09-05

Expand Down
5 changes: 3 additions & 2 deletions README-template.md
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,8 @@ word-break:break-word
<td>The GCS bucket that temporarily holds the data before it is loaded to
BigQuery. Required unless set in the Spark configuration
(<code>spark.conf.set(...)</code>).
<br/><i>Not supported by the `DIRECT` write method.</i>
<br/>Starting from version 0.42.0, defaults to the `fs.gs.system.bucket` setting if it exists (for example, on Google Cloud Dataproc clusters).
<br/><i>Supported only by the `INDIRECT` write method.</i>
</td>
<td>Write</td>
</tr>
Expand All @@ -583,7 +584,7 @@ word-break:break-word
<td>The GCS bucket that holds the data before it is loaded to
BigQuery. If informed, the data won't be deleted after write data
into BigQuery.
<br/><i>Not supported by the `DIRECT` write method.</i>
<br/><i>Supported only by the `INDIRECT` write method.</i>
</td>
<td>Write</td>
</tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ public static WriteMethod from(@Nullable String writeMethod) {
// Option keys controlling the default precision/scale applied to BigQuery BIGNUMERIC columns.
public static final String BIG_NUMERIC_DEFAULT_PRECISION = "bigNumericDefaultPrecision";
public static final String BIG_NUMERIC_DEFAULT_SCALE = "bigNumericDefaultScale";

// Hadoop configuration key holding the cluster's system bucket (set on Dataproc clusters);
// used as the fallback for temporaryGcsBucket when the user does not set one explicitly.
private static final String DATAPROC_SYSTEM_BUCKET_CONFIGURATION = "fs.gs.system.bucket";

TableId tableId;
// as the config needs to be Serializable, internally it uses
// com.google.common.base.Optional<String> but externally it uses the regular java.util.Optional
Expand Down Expand Up @@ -398,7 +400,10 @@ public static SparkBigQueryConfig from(
.orNull();
config.defaultParallelism = defaultParallelism;
config.temporaryGcsBucket =
stripPrefix(getAnyOption(globalOptions, options, "temporaryGcsBucket"));
stripPrefix(getAnyOption(globalOptions, options, "temporaryGcsBucket"))
.or(
com.google.common.base.Optional.fromNullable(
hadoopConfiguration.get(DATAPROC_SYSTEM_BUCKET_CONFIGURATION)));
config.persistentGcsBucket =
stripPrefix(getAnyOption(globalOptions, options, "persistentGcsBucket"));
config.persistentGcsPath = getOption(options, "persistentGcsPath");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1182,4 +1182,22 @@ public void testEnableListInferenceWithDefaultIntermediateFormat() {
assertThat(config.getIntermediateFormat())
.isEqualTo(SparkBigQueryConfig.IntermediateFormat.PARQUET_LIST_INFERENCE_ENABLED);
}

@Test
public void testSystemBucketAsDefaultTemporaryGcsBucket() {
  // Simulate a Dataproc-like environment by publishing the system bucket
  // through the Hadoop configuration only, without setting temporaryGcsBucket.
  Configuration hadoopConf = new Configuration();
  hadoopConf.set("fs.gs.system.bucket", "foo");
  SparkBigQueryConfig parsedConfig =
      SparkBigQueryConfig.from(
          asDataSourceOptionsMap(defaultOptions),
          emptyMap, // allConf
          hadoopConf,
          emptyMap, // customDefaults
          1,
          new SQLConf(),
          sparkVersion,
          /* schema */ Optional.empty(),
          /* tableIsMandatory */ true);
  // With no explicit option, the config should fall back to the system bucket value.
  assertThat(parsedConfig.getTemporaryGcsBucket()).hasValue("foo");
}
}

0 comments on commit b842be6

Please sign in to comment.