Skip to content

Commit

Permalink
Set the temporaryGcsBucket to default to fs.gs.system.bucket (#1320)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidrabinowitz authored Dec 3, 2024
1 parent a4c20e3 commit b842be6
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 4 deletions.
3 changes: 2 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
## Next
* Issue #1290: Stopped using metadata for optimized count path
* Issue #1317: Improving OpenLineage 1.24.0+ compatibility
* PR #1311 : Improve read session expired error message
* PR #1311: Improve read session expired error message
* PR #1320: Set the `temporaryGcsBucket` to default to `fs.gs.system.bucket` if it exists, eliminating the need to set it on Dataproc clusters.

## 0.41.0 - 2024-09-05

Expand Down
5 changes: 3 additions & 2 deletions README-template.md
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,8 @@ word-break:break-word
<td>The GCS bucket that temporarily holds the data before it is loaded to
BigQuery. Required unless set in the Spark configuration
(<code>spark.conf.set(...)</code>).
<br/><i>Not supported by the `DIRECT` write method.</i>
<br/>Starting from version 0.42.0, defaults to the `fs.gs.system.bucket` setting if it exists (for example, on Google Cloud Dataproc clusters).
<br/><i>Supported only by the `INDIRECT` write method.</i>
</td>
<td>Write</td>
</tr>
Expand All @@ -583,7 +584,7 @@ word-break:break-word
<td>The GCS bucket that holds the data before it is loaded to
BigQuery. If informed, the data won't be deleted after write data
into BigQuery.
<br/><i>Not supported by the `DIRECT` write method.</i>
<br/><i>Supported only by the `INDIRECT` write method.</i>
</td>
<td>Write</td>
</tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ public static WriteMethod from(@Nullable String writeMethod) {
// Option keys controlling the default precision/scale applied to BigQuery BIGNUMERIC columns.
public static final String BIG_NUMERIC_DEFAULT_PRECISION = "bigNumericDefaultPrecision";
public static final String BIG_NUMERIC_DEFAULT_SCALE = "bigNumericDefaultScale";

// Hadoop configuration key holding the cluster's system bucket (set on Dataproc clusters);
// used as the fallback for temporaryGcsBucket when the user does not set one explicitly.
private static final String DATAPROC_SYSTEM_BUCKET_CONFIGURATION = "fs.gs.system.bucket";

TableId tableId;
// as the config needs to be Serializable, internally it uses
// com.google.common.base.Optional<String> but externally it uses the regular java.util.Optional
Expand Down Expand Up @@ -398,7 +400,10 @@ public static SparkBigQueryConfig from(
.orNull();
config.defaultParallelism = defaultParallelism;
config.temporaryGcsBucket =
stripPrefix(getAnyOption(globalOptions, options, "temporaryGcsBucket"));
stripPrefix(getAnyOption(globalOptions, options, "temporaryGcsBucket"))
.or(
com.google.common.base.Optional.fromNullable(
hadoopConfiguration.get(DATAPROC_SYSTEM_BUCKET_CONFIGURATION)));
config.persistentGcsBucket =
stripPrefix(getAnyOption(globalOptions, options, "persistentGcsBucket"));
config.persistentGcsPath = getOption(options, "persistentGcsPath");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1182,4 +1182,22 @@ public void testEnableListInferenceWithDefaultIntermediateFormat() {
assertThat(config.getIntermediateFormat())
.isEqualTo(SparkBigQueryConfig.IntermediateFormat.PARQUET_LIST_INFERENCE_ENABLED);
}

@Test
public void testSystemBucketAsDefaultTemporaryGcsBucket() {
  // Simulate a Dataproc-like environment by publishing the system bucket
  // through the Hadoop configuration only, without setting temporaryGcsBucket.
  Configuration hadoopConf = new Configuration();
  hadoopConf.set("fs.gs.system.bucket", "foo");
  SparkBigQueryConfig parsedConfig =
      SparkBigQueryConfig.from(
          asDataSourceOptionsMap(defaultOptions),
          emptyMap, // allConf
          hadoopConf,
          emptyMap, // customDefaults
          1,
          new SQLConf(),
          sparkVersion,
          /* schema */ Optional.empty(),
          /* tableIsMandatory */ true);
  // With no explicit option, the config should fall back to the system bucket value.
  assertThat(parsedConfig.getTemporaryGcsBucket()).hasValue("foo");
}
}

0 comments on commit b842be6

Please sign in to comment.