Update build_and_test.yml #1

Workflow file for this run

.github/workflows/build_and_test.yml at 3c61b27

	- name: Setup .NET Core SDK
Check failure on line 1 in .github/workflows/build_and_test.yml View workflow run for this annotation GitHub Actions / .github/workflows/build_and_test.yml Invalid workflow file `You have an error in your yaml syntax on line 1`
	uses: actions/[email protected]
	with:
	# Optional SDK version(s) to use. If not provided, will install global.json version when available. Examples: 2.2.104, 3.1, 3.1.x, 3.x, 6.0.2xx
	dotnet-version: # optional
	# Optional quality of the build. The possible values are: daily, signed, validated, preview, ga.
	dotnet-quality: # optional
	# Optional global.json location, if your global.json isn't located in the root of the repo.
	global-json-file: # optional
	# Optional package source for which to set up authentication. Will consult any existing NuGet.config in the root of the repo and provide a temporary NuGet.config using the NUGET_AUTH_TOKEN environment variable as a ClearTextPassword
	source-url: # optional
	# Optional OWNER for using packages from GitHub Package Registry organizations/users other than the current repository's owner. Only used if a GPR URL is also provided in source-url
	owner: # optional
	# Optional NuGet.config location, if your NuGet.config isn't located in the root of the repo.
	config-file: # optional
	# Optional input to enable caching of the NuGet global-packages folder
	cache: # optional
	# Used to specify the path to a dependency file: packages.lock.json. Supports wildcards or a list of file names for caching multiple dependencies.
	cache-dependency-path: # optional

	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#

	name: Build and test

	on:
	workflow_call:
	inputs:
	java:
	required: false
	type: string
	default: 17
	branch:
	description: Branch to run the build against
	required: false
	type: string
	# Change 'master' to 'branch-4.0' in branch-4.0 branch after cutting it.
	default: master
	hadoop:
	description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it.
	required: false
	type: string
	default: hadoop3
	envs:
	description: Additional environment variables to set when running the tests. Should be in JSON format.
	required: false
	type: string
	default: '{}'
	jobs:
	description: >-
	Jobs to run, and should be in JSON format. The values should be matched with the job's key defined
	in this file, e.g., build. See precondition job below.
	required: false
	type: string
	default: ''
	jobs:
	precondition:
	name: Check changes
	runs-on: ubuntu-latest
	env:
	GITHUB_PREV_SHA: ${{ github.event.before }}
	outputs:
	required: ${{ steps.set-outputs.outputs.required }}
	image_url: ${{ steps.infra-image-outputs.outputs.image_url }}
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	- name: Check all modules
	id: set-outputs
	run: \|
	if [ -z "${{ inputs.jobs }}" ]; then
	pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
	pyspark=`./dev/is-changed.py -m $pyspark_modules`
	if [[ "${{ github.repository }}" != 'apache/spark' ]]; then
	pandas=$pyspark
	yarn=`./dev/is-changed.py -m yarn`
	kubernetes=`./dev/is-changed.py -m kubernetes`
	sparkr=`./dev/is-changed.py -m sparkr`
	tpcds=`./dev/is-changed.py -m sql`
	docker=`./dev/is-changed.py -m docker-integration-tests`
	buf=true
	ui=true
	docs=true
	else
	pandas=false
	yarn=false
	kubernetes=false
	sparkr=false
	tpcds=false
	docker=false
	buf=false
	ui=false
	docs=false
	fi
	build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,network-common,network-shuffle,repl,launcher,examples,sketch,variant,api,catalyst,hive-thriftserver,mllib-local,mllib,graphx,streaming,sql-kafka-0-10,streaming-kafka-0-10,streaming-kinesis-asl,kubernetes,hadoop-cloud,spark-ganglia-lgpl,protobuf,yarn,connect,sql,hive"`
	precondition="
	{
	\"build\": \"$build\",
	\"pyspark\": \"$pyspark\",
	\"pyspark-pandas\": \"$pandas\",
	\"sparkr\": \"$sparkr\",
	\"tpcds-1g\": \"$tpcds\",
	\"docker-integration-tests\": \"$docker\",
	\"lint\" : \"true\",
	\"docs\" : \"$docs\",
	\"yarn\" : \"$yarn\",
	\"k8s-integration-tests\" : \"$kubernetes\",
	\"buf\" : \"$buf\",
	\"ui\" : \"$ui\",
	}"
	echo $precondition # For debugging
	# Remove `\n` to avoid "Invalid format" error
	precondition="${precondition//$'\n'/}}"
	echo "required=$precondition" >> $GITHUB_OUTPUT
	else
	# This is usually set by scheduled jobs.
	precondition='${{ inputs.jobs }}'
	echo $precondition # For debugging
	precondition="${precondition//$'\n'/}"
	echo "required=$precondition" >> $GITHUB_OUTPUT
	fi
	- name: Generate infra image URL
	id: infra-image-outputs
	run: \|
	# Convert to lowercase to meet Docker repo name requirement
	REPO_OWNER=$(echo "${{ github.repository_owner }}" \| tr '[:upper:]' '[:lower:]')
	IMG_NAME="apache-spark-ci-image:${{ inputs.branch }}-${{ github.run_id }}"
	IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
	echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT

	# Build: build Spark and run the tests for specified modules.
	build:
	name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}"
	needs: precondition
	if: fromJson(needs.precondition.outputs.required).build == 'true'
	runs-on: ubuntu-latest
	timeout-minutes: 180
	strategy:
	fail-fast: false
	matrix:
	java:
	- ${{ inputs.java }}
	hadoop:
	- ${{ inputs.hadoop }}
	hive:
	- hive2.3
	# Note that the modules below are from sparktestsupport/modules.py.
	modules:
	- >-
	core, unsafe, kvstore, avro, utils,
	network-common, network-shuffle, repl, launcher,
	examples, sketch, variant
	- >-
	api, catalyst, hive-thriftserver
	- >-
	mllib-local, mllib, graphx
	- >-
	streaming, sql-kafka-0-10, streaming-kafka-0-10, streaming-kinesis-asl,
	kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf, connect
	- yarn
	# Here, we split Hive and SQL tests into some of slow ones and the rest of them.
	included-tags: [""]
	excluded-tags: [""]
	comment: [""]
	include:
	# Hive tests
	- modules: hive
	java: ${{ inputs.java }}
	hadoop: ${{ inputs.hadoop }}
	hive: hive2.3
	included-tags: org.apache.spark.tags.SlowHiveTest
	comment: "- slow tests"
	- modules: hive
	java: ${{ inputs.java }}
	hadoop: ${{ inputs.hadoop }}
	hive: hive2.3
	excluded-tags: org.apache.spark.tags.SlowHiveTest
	comment: "- other tests"
	# SQL tests
	- modules: sql
	java: ${{ inputs.java }}
	hadoop: ${{ inputs.hadoop }}
	hive: hive2.3
	included-tags: org.apache.spark.tags.ExtendedSQLTest
	comment: "- extended tests"
	- modules: sql
	java: ${{ inputs.java }}
	hadoop: ${{ inputs.hadoop }}
	hive: hive2.3
	included-tags: org.apache.spark.tags.SlowSQLTest
	comment: "- slow tests"
	- modules: sql
	java: ${{ inputs.java }}
	hadoop: ${{ inputs.hadoop }}
	hive: hive2.3
	excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest
	comment: "- other tests"
	exclude:
	# Always run if yarn == 'true', even infra-image is skip (such as non-master job)
	# In practice, the build will run in individual PR, but not against the individual commit
	# in Apache Spark repository.
	- modules: ${{ fromJson(needs.precondition.outputs.required).yarn != 'true' && 'yarn' }}
	env:
	MODULES_TO_TEST: ${{ matrix.modules }}
	EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
	INCLUDED_TAGS: ${{ matrix.included-tags }}
	HADOOP_PROFILE: ${{ matrix.hadoop }}
	HIVE_PROFILE: ${{ matrix.hive }}
	# GitHub Actions' default miniconda to use in pip packaging test.
	CONDA_PREFIX: /usr/share/miniconda
	GITHUB_PREV_SHA: ${{ github.event.before }}
	SPARK_LOCAL_IP: localhost
	NOLINT_ON_COMPILE: true
	SKIP_UNIDOC: true
	SKIP_MIMA: true
	SKIP_PACKAGING: true
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	# In order to fetch changed files
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
	- name: Cache SBT and Maven
	uses: actions/cache@v4
	with:
	path: \|
	build/apache-maven-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v4
	with:
	path: ~/.cache/coursier
	key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
	- name: Free up disk space
	run: \|
	if [ -f ./dev/free_disk_space ]; then
	./dev/free_disk_space
	fi
	- name: Install Java ${{ matrix.java }}
	uses: actions/setup-java@v4
	with:
	distribution: zulu
	java-version: ${{ matrix.java }}
	- name: Install Python 3.11
	uses: actions/setup-python@v5
	# We should install one Python that is higher than 3+ for SQL and Yarn because:
	# - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
	# - Yarn has a Python specific test too, for example, YarnClusterSuite.
	if: contains(matrix.modules, 'yarn') \|\| (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) \|\| contains(matrix.modules, 'connect')
	with:
	python-version: '3.11'
	architecture: x64
	- name: Install Python packages (Python 3.11)
	if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) \|\| contains(matrix.modules, 'connect')
	run: \|
	python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3'
	python3.11 -m pip list
	# Run the tests.
	- name: Run tests
	env: ${{ fromJSON(inputs.envs) }}
	shell: 'script -q -e -c "bash {0}"'
	run: \|
	# Fix for TTY related issues when launching the Ammonite REPL in tests.
	export TERM=vt100
	# Hive "other tests" test needs larger metaspace size based on experiment.
	if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi
	# SPARK-46283: should delete the following env replacement after SPARK 3.x EOL
	if [[ "$MODULES_TO_TEST" == "streaming-kinesis-asl" ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then
	MODULES_TO_TEST=${MODULES_TO_TEST//streaming-kinesis-asl, /}
	fi
	export SERIAL_SBT_TESTS=1
	./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
	- name: Upload test results to report
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
	path: "*/target/test-reports/.xml"
	- name: Upload unit tests log files
	if: ${{ !success() }}
	uses: actions/upload-artifact@v4
	with:
	name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
	path: "*/target/.log"

	infra-image:
	name: "Base image build"
	needs: precondition
	if: >-
	fromJson(needs.precondition.outputs.required).pyspark == 'true' \|\|
	fromJson(needs.precondition.outputs.required).lint == 'true' \|\|
	fromJson(needs.precondition.outputs.required).sparkr == 'true'
	runs-on: ubuntu-latest
	permissions:
	packages: write
	steps:
	- name: Login to GitHub Container Registry
	uses: docker/login-action@v3
	with:
	registry: ghcr.io
	username: ${{ github.actor }}
	password: ${{ secrets.GITHUB_TOKEN }}
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	# In order to fetch changed files
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	- name: Set up QEMU
	uses: docker/setup-qemu-action@v3
	- name: Set up Docker Buildx
	uses: docker/setup-buildx-action@v3
	- name: Build and push
	id: docker_build
	uses: docker/build-push-action@v6
	with:
	context: ./dev/infra/
	push: true
	tags: \|
	${{ needs.precondition.outputs.image_url }}
	# Use the infra image cache to speed up
	cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }}

	pyspark:
	needs: [precondition, infra-image]
	# always run if pyspark == 'true', even infra-image is skip (such as non-master job)
	if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark == 'true'
	name: "Build modules: ${{ matrix.modules }}"
	runs-on: ubuntu-latest
	timeout-minutes: 180
	container:
	image: ${{ needs.precondition.outputs.image_url }}
	strategy:
	fail-fast: false
	matrix:
	java:
	- ${{ inputs.java }}
	modules:
	- >-
	pyspark-sql, pyspark-resource, pyspark-testing
	- >-
	pyspark-core, pyspark-errors, pyspark-streaming
	- >-
	pyspark-mllib, pyspark-ml, pyspark-ml-connect
	- >-
	pyspark-connect
	- >-
	pyspark-pandas
	- >-
	pyspark-pandas-slow
	- >-
	pyspark-pandas-connect-part0
	- >-
	pyspark-pandas-connect-part1
	- >-
	pyspark-pandas-connect-part2
	- >-
	pyspark-pandas-connect-part3
	exclude:
	# Always run if pyspark-pandas == 'true', even infra-image is skip (such as non-master job)
	# In practice, the build will run in individual PR, but not against the individual commit
	# in Apache Spark repository.
	- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }}
	- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }}
	- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part0' }}
	- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }}
	- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }}
	- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }}
	env:
	MODULES_TO_TEST: ${{ matrix.modules }}
	PYTHON_TO_TEST: 'python3.11'
	HADOOP_PROFILE: ${{ inputs.hadoop }}
	HIVE_PROFILE: hive2.3
	# GitHub Actions' default miniconda to use in pip packaging test.
	CONDA_PREFIX: /usr/share/miniconda
	GITHUB_PREV_SHA: ${{ github.event.before }}
	SPARK_LOCAL_IP: localhost
	SKIP_UNIDOC: true
	SKIP_MIMA: true
	SKIP_PACKAGING: true
	METASPACE_SIZE: 1g
	BRANCH: ${{ inputs.branch }}
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	# In order to fetch changed files
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Add GITHUB_WORKSPACE to git trust safe.directory
	run: \|
	git config --global --add safe.directory ${GITHUB_WORKSPACE}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
	- name: Cache SBT and Maven
	uses: actions/cache@v4
	with:
	path: \|
	build/apache-maven-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v4
	with:
	path: ~/.cache/coursier
	key: pyspark-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	pyspark-coursier-
	- name: Free up disk space
	shell: 'script -q -e -c "bash {0}"'
	run: ./dev/free_disk_space_container
	- name: Install Java ${{ matrix.java }}
	uses: actions/setup-java@v4
	with:
	distribution: zulu
	java-version: ${{ matrix.java }}
	- name: List Python packages (${{ env.PYTHON_TO_TEST }})
	env: ${{ fromJSON(inputs.envs) }}
	shell: 'script -q -e -c "bash {0}"'
	run: \|
	for py in $(echo $PYTHON_TO_TEST \| tr "," "\n")
	do
	echo $py
	$py -m pip list
	done
	# Run the tests.
	- name: Run tests
	env: ${{ fromJSON(inputs.envs) }}
	shell: 'script -q -e -c "bash {0}"'
	run: \|
	if [[ "$MODULES_TO_TEST" == "pyspark-errors" ]]; then
	export SKIP_PACKAGING=false
	echo "Python Packaging Tests Enabled!"
	fi
	if [ ! -z "$PYTHON_TO_TEST" ]; then
	./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
	else
	# For branch-3.5 and below, it uses the default Python versions.
	./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST"
	fi
	- name: Upload coverage to Codecov
	if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true'
	uses: codecov/codecov-action@v4
	with:
	files: ./python/coverage.xml
	flags: unittests
	name: PySpark
	- name: Upload test results to report
	env: ${{ fromJSON(inputs.envs) }}
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
	path: "*/target/test-reports/.xml"
	- name: Upload unit tests log files
	env: ${{ fromJSON(inputs.envs) }}
	if: ${{ !success() }}
	uses: actions/upload-artifact@v4
	with:
	name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
	path: "**/target/unit-tests.log"

	sparkr:
	needs: [precondition, infra-image]
	# always run if sparkr == 'true', even infra-image is skip (such as non-master job)
	if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true'
	name: "Build modules: sparkr"
	runs-on: ubuntu-latest
	timeout-minutes: 180
	container:
	image: ${{ needs.precondition.outputs.image_url }}
	env:
	HADOOP_PROFILE: ${{ inputs.hadoop }}
	HIVE_PROFILE: hive2.3
	GITHUB_PREV_SHA: ${{ github.event.before }}
	SPARK_LOCAL_IP: localhost
	SKIP_UNIDOC: true
	SKIP_MIMA: true
	SKIP_PACKAGING: true
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	# In order to fetch changed files
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Add GITHUB_WORKSPACE to git trust safe.directory
	run: \|
	git config --global --add safe.directory ${GITHUB_WORKSPACE}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
	- name: Cache SBT and Maven
	uses: actions/cache@v4
	with:
	path: \|
	build/apache-maven-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v4
	with:
	path: ~/.cache/coursier
	key: sparkr-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	sparkr-coursier-
	- name: Free up disk space
	run: ./dev/free_disk_space_container
	- name: Install Java ${{ inputs.java }}
	uses: actions/setup-java@v4
	with:
	distribution: zulu
	java-version: ${{ inputs.java }}
	- name: Run tests
	env: ${{ fromJSON(inputs.envs) }}
	run: \|
	# The followings are also used by `r-lib/actions/setup-r` to avoid
	# R issues at docker environment
	export TZ=UTC
	export _R_CHECK_SYSTEM_CLOCK_=FALSE
	./dev/run-tests --parallelism 1 --modules sparkr
	- name: Upload test results to report
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: test-results-sparkr--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
	path: "*/target/test-reports/.xml"

	buf:
	needs: [precondition]
	if: (!cancelled()) && fromJson(needs.precondition.outputs.required).buf == 'true'
	name: Protobuf breaking change detection and Python CodeGen check
	runs-on: ubuntu-latest
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	- name: Install Buf
	uses: bufbuild/buf-setup-action@v1
	with:
	github_token: ${{ secrets.GITHUB_TOKEN }}
	- name: Protocol Buffers Linter
	uses: bufbuild/buf-lint-action@v1
	with:
	input: core/src/main/protobuf
	# Change 'branch-3.5' to 'branch-4.0' in master branch after cutting branch-4.0 branch.
	- name: Breaking change detection against branch-3.5
	uses: bufbuild/buf-breaking-action@v1
	with:
	input: sql/connect/common/src/main
	against: 'https://github.com/apache/spark.git#branch=branch-3.5,subdir=connector/connect/common/src/main'
	- name: Install Python 3.11
	uses: actions/setup-python@v5
	with:
	python-version: '3.11'
	- name: Install dependencies for Python CodeGen check
	run: \|
	python3.11 -m pip install 'black==23.9.1' 'protobuf==5.28.3' 'mypy==1.8.0' 'mypy-protobuf==3.3.0'
	python3.11 -m pip list
	- name: Python CodeGen check
	run: ./dev/connect-check-protos.py

	# Static analysis
	lint:
	needs: [precondition, infra-image]
	# always run if lint == 'true', even infra-image is skip (such as non-master job)
	if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true'
	name: Linters, licenses, and dependencies
	runs-on: ubuntu-latest
	timeout-minutes: 180
	env:
	LC_ALL: C.UTF-8
	LANG: C.UTF-8
	NOLINT_ON_COMPILE: false
	PYSPARK_DRIVER_PYTHON: python3.9
	PYSPARK_PYTHON: python3.9
	GITHUB_PREV_SHA: ${{ github.event.before }}
	container:
	image: ${{ needs.precondition.outputs.image_url }}
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Add GITHUB_WORKSPACE to git trust safe.directory
	run: \|
	git config --global --add safe.directory ${GITHUB_WORKSPACE}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
	- name: Cache SBT and Maven
	uses: actions/cache@v4
	with:
	path: \|
	build/apache-maven-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v4
	with:
	path: ~/.cache/coursier
	key: docs-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	docs-coursier-
	- name: Cache Maven local repository
	uses: actions/cache@v4
	with:
	path: ~/.m2/repository
	key: docs-maven-${{ hashFiles('**/pom.xml') }}
	restore-keys: \|
	docs-maven-
	- name: Free up disk space
	run: ./dev/free_disk_space_container
	- name: Install Java ${{ inputs.java }}
	uses: actions/setup-java@v4
	with:
	distribution: zulu
	java-version: ${{ inputs.java }}
	- name: License test
	run: ./dev/check-license
	- name: Dependencies test
	run: ./dev/test-dependencies.sh
	- name: MIMA test
	run: ./dev/mima
	- name: Scala linter
	run: ./dev/lint-scala
	- name: Scala structured logging check
	run: \|
	if [ -f ./dev/structured_logging_style.py ]; then
	python3.9 ./dev/structured_logging_style.py
	fi
	- name: Java linter
	run: ./dev/lint-java
	- name: Spark connect jvm client mima check
	run: ./dev/connect-jvm-client-mima-check
	- name: Install Python linter dependencies for branch-3.5
	if: inputs.branch == 'branch-3.5'
	run: \|
	# SPARK-45212: Copy from https://github.com/apache/spark/blob/555c8def51e5951c7bf5165a332795e9e330ec9d/.github/workflows/build_and_test.yml#L631-L638
	# Should delete this section after SPARK 3.5 EOL.
	python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
	python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
	- name: Install Python dependencies for python linter and documentation generation
	if: inputs.branch != 'branch-3.5'
	run: \|
	# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
	# See 'ipython_genutils' in SPARK-38517
	# See 'docutils<0.18.0' in SPARK-39421
	python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
	ipython ipython_genutils sphinx_plotly_directive numpy pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
	'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
	'pandas-stubs==1.2.0.53' 'grpcio==1.67.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
	'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
	python3.9 -m pip list
	- name: Python linter
	run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
	# Should delete this section after SPARK 3.5 EOL.
	- name: Install dependencies for Python code generation check for branch-3.5
	if: inputs.branch == 'branch-3.5'
	run: \|
	# See more in "Installation" https://docs.buf.build/installation#tarball
	curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz
	mkdir -p $HOME/buf
	tar -xvzf buf-Linux-x86_64.tar.gz -C $HOME/buf --strip-components 1
	rm buf-Linux-x86_64.tar.gz
	python3.9 -m pip install 'protobuf==4.25.1' 'mypy-protobuf==3.3.0'
	# Should delete this section after SPARK 3.5 EOL.
	- name: Python code generation check for branch-3.5
	if: inputs.branch == 'branch-3.5'
	run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi
	# Should delete this section after SPARK 3.5 EOL.
	- name: Install JavaScript linter dependencies for branch-3.5
	if: inputs.branch == 'branch-3.5'
	run: \|
	apt update
	apt-get install -y nodejs npm
	- name: JS linter
	run: ./dev/lint-js
	# Should delete this section after SPARK 3.5 EOL.
	- name: Install R linter dependencies for branch-3.5
	if: inputs.branch == 'branch-3.5'
	run: \|
	apt update
	apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \
	libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \
	libtiff5-dev libjpeg-dev
	Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
	Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')"
	- name: Install R linter dependencies and SparkR
	run: ./R/install-dev.sh
	- name: R linter
	run: ./dev/lint-r

	# Documentation build
	docs:
	needs: [precondition, infra-image]
	# always run if lint == 'true', even infra-image is skip (such as non-master job)
	if: (!cancelled()) && fromJson(needs.precondition.outputs.required).docs == 'true'
	name: Documentation generation
	runs-on: ubuntu-latest
	timeout-minutes: 180
	env:
	LC_ALL: C.UTF-8
	LANG: C.UTF-8
	NOLINT_ON_COMPILE: false
	PYSPARK_DRIVER_PYTHON: python3.9
	PYSPARK_PYTHON: python3.9
	GITHUB_PREV_SHA: ${{ github.event.before }}
	container:
	image: ${{ needs.precondition.outputs.image_url }}
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Add GITHUB_WORKSPACE to git trust safe.directory
	run: \|
	git config --global --add safe.directory ${GITHUB_WORKSPACE}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	# Cache local repositories. Note that GitHub Actions cache has a 10G limit.
	- name: Cache SBT and Maven
	uses: actions/cache@v4
	with:
	path: \|
	build/apache-maven-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v4
	with:
	path: ~/.cache/coursier
	key: docs-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	docs-coursier-
	- name: Cache Maven local repository
	uses: actions/cache@v4
	with:
	path: ~/.m2/repository
	key: docs-maven-${{ hashFiles('**/pom.xml') }}
	restore-keys: \|
	docs-maven-
	- name: Free up disk space
	run: ./dev/free_disk_space_container
	- name: Install Java ${{ inputs.java }}
	uses: actions/setup-java@v4
	with:
	distribution: zulu
	java-version: ${{ inputs.java }}
	- name: Install Python dependencies for python linter and documentation generation
	if: inputs.branch != 'branch-3.5'
	run: \|
	# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
	# See 'ipython_genutils' in SPARK-38517
	# See 'docutils<0.18.0' in SPARK-39421
	python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
	ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
	'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
	'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
	'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
	python3.9 -m pip list
	- name: Install dependencies for documentation generation for branch-3.5
	if: inputs.branch == 'branch-3.5'
	run: \|
	# pandoc is required to generate PySpark APIs as well in nbsphinx.
	apt-get update -y
	apt-get install -y libcurl4-openssl-dev pandoc
	apt-get install -y ruby ruby-dev
	Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')"
	Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')"
	Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
	# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
	python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
	python3.9 -m pip install ipython_genutils # See SPARK-38517
	python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
	python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
	- name: Install dependencies for documentation generation
	run: \|
	# Keep the version of Bundler here in sync with the following locations:
	# - dev/create-release/spark-rm/Dockerfile
	# - docs/README.md
	gem install bundler -v 2.4.22
	cd docs
	bundle install --retry=100
	- name: Run documentation build
	run: \|
	# We need this link to make sure `python3` points to `python3.9` which contains the prerequisite packages.
	ln -s "$(which python3.9)" "/usr/local/bin/python3"
	# Build docs first with SKIP_API to ensure they are buildable without requiring any
	# language docs to be built beforehand.
	cd docs; SKIP_ERRORDOC=1 SKIP_API=1 bundle exec jekyll build; cd ..
	if [ -f "./dev/is-changed.py" ]; then
	# Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs
	pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
	if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi
	if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi
	fi
	# Print the values of environment variables `SKIP_ERRORDOC`, `SKIP_SCALADOC`, `SKIP_PYTHONDOC`, `SKIP_RDOC` and `SKIP_SQLDOC`
	echo "SKIP_ERRORDOC: $SKIP_ERRORDOC"
	echo "SKIP_SCALADOC: $SKIP_SCALADOC"
	echo "SKIP_PYTHONDOC: $SKIP_PYTHONDOC"
	echo "SKIP_RDOC: $SKIP_RDOC"
	echo "SKIP_SQLDOC: $SKIP_SQLDOC"
	cd docs
	bundle exec jekyll build
	- name: Tar documentation
	if: github.repository != 'apache/spark'
	run: tar cjf site.tar.bz2 docs/_site
	- name: Upload documentation
	if: github.repository != 'apache/spark'
	uses: actions/upload-artifact@v4
	with:
	name: site
	path: site.tar.bz2
	retention-days: 1

	# Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well
	tpcds-1g:
	needs: precondition
	if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true'
	name: Run TPC-DS queries with SF=1
	# Pin to 'Ubuntu 20.04' due to 'databricks/tpcds-kit' compilation
	runs-on: ubuntu-20.04
	timeout-minutes: 180
	env:
	SPARK_LOCAL_IP: localhost
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	- name: Cache SBT and Maven
	uses: actions/cache@v4
	with:
	path: \|
	build/apache-maven-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v4
	with:
	path: ~/.cache/coursier
	key: tpcds-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	tpcds-coursier-
	- name: Install Java ${{ inputs.java }}
	uses: actions/setup-java@v4
	with:
	distribution: zulu
	java-version: ${{ inputs.java }}
	- name: Cache TPC-DS generated data
	id: cache-tpcds-sf-1
	uses: actions/cache@v4
	with:
	path: ./tpcds-sf-1
	key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
	- name: Checkout tpcds-kit repository
	if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
	uses: actions/checkout@v4
	with:
	repository: databricks/tpcds-kit
	ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
	path: ./tpcds-kit
	- name: Build tpcds-kit
	if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
	run: cd tpcds-kit/tools && make OS=LINUX
	- name: Generate TPC-DS (SF=1) table data
	if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
	run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"
	- name: Run TPC-DS queries (Sort merge join)
	run: \|
	SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
	env:
	SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }}
	SPARK_TPCDS_JOIN_CONF: \|
	spark.sql.autoBroadcastJoinThreshold=-1
	spark.sql.join.preferSortMergeJoin=true
	- name: Run TPC-DS queries (Broadcast hash join)
	run: \|
	SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
	env:
	SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }}
	SPARK_TPCDS_JOIN_CONF: \|
	spark.sql.autoBroadcastJoinThreshold=10485760
	- name: Run TPC-DS queries (Shuffled hash join)
	run: \|
	SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
	env:
	SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }}
	SPARK_TPCDS_JOIN_CONF: \|
	spark.sql.autoBroadcastJoinThreshold=-1
	spark.sql.join.forceApplyShuffledHashJoin=true
	- name: Run TPC-DS queries on collated data
	run: \|
	SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSCollationQueryTestSuite"
	- name: Upload test results to report
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: test-results-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
	path: "*/target/test-reports/.xml"
	- name: Upload unit tests log files
	if: ${{ !success() }}
	uses: actions/upload-artifact@v4
	with:
	name: unit-tests-log-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
	path: "**/target/unit-tests.log"

	docker-integration-tests:
	needs: precondition
	if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true'
	name: Run Docker integration tests
	runs-on: ubuntu-latest
	timeout-minutes: 180
	env:
	HADOOP_PROFILE: ${{ inputs.hadoop }}
	HIVE_PROFILE: hive2.3
	GITHUB_PREV_SHA: ${{ github.event.before }}
	SPARK_LOCAL_IP: localhost
	SKIP_UNIDOC: true
	SKIP_MIMA: true
	SKIP_PACKAGING: true
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	- name: Cache SBT and Maven
	uses: actions/cache@v4
	with:
	path: \|
	build/apache-maven-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v4
	with:
	path: ~/.cache/coursier
	key: docker-integration-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	docker-integration-coursier-
	- name: Install Java ${{ inputs.java }}
	uses: actions/setup-java@v4
	with:
	distribution: zulu
	java-version: ${{ inputs.java }}
	- name: Run tests
	env: ${{ fromJSON(inputs.envs) }}
	run: \|
	./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
	- name: Upload test results to report
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: test-results-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
	path: "*/target/test-reports/.xml"
	- name: Upload unit tests log files
	if: ${{ !success() }}
	uses: actions/upload-artifact@v4
	with:
	name: unit-tests-log-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
	path: "**/target/unit-tests.log"

	k8s-integration-tests:
	needs: precondition
	if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true'
	name: Run Spark on Kubernetes Integration test
	runs-on: ubuntu-latest
	timeout-minutes: 180
	steps:
	- name: Checkout Spark repository
	uses: actions/checkout@v4
	with:
	fetch-depth: 0
	repository: apache/spark
	ref: ${{ inputs.branch }}
	- name: Sync the current branch with the latest in Apache Spark
	if: github.repository != 'apache/spark'
	run: \|
	echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
	git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
	git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty
	- name: Cache SBT and Maven
	uses: actions/cache@v4
	with:
	path: \|
	build/apache-maven-*
	build/*.jar
	~/.sbt
	key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
	restore-keys: \|
	build-
	- name: Cache Coursier local repository
	uses: actions/cache@v4
	with:
	path: ~/.cache/coursier
	key: k8s-integration-coursier-${{ hashFiles('/pom.xml', '/plugins.sbt') }}
	restore-keys: \|
	k8s-integration-coursier-
	- name: Install Java ${{ inputs.java }}
	uses: actions/setup-java@v4
	with:
	distribution: zulu
	java-version: ${{ inputs.java }}
	- name: Install R
	run: \|
	sudo apt update
	sudo apt-get install r-base
	- name: Start Minikube
	uses: medyagh/[email protected]
	with:
	# Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic
	cpus: 2
	memory: 6144m
	- name: Print K8S pods and nodes info
	run: \|
	kubectl get pods -A
	kubectl describe node
	- name: Run Spark on K8S integration test
	run: \|
	# Prepare PV test
	PVC_TMP_DIR=$(mktemp -d)
	export PVC_TESTS_HOST_PATH=$PVC_TMP_DIR
	export PVC_TESTS_VM_PATH=$PVC_TMP_DIR
	minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 &
	kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts \|\| true
	if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
	kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml \|\| true
	else
	kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.9.0/installer/volcano-development.yaml \|\| true
	fi
	eval $(minikube docker-env)
	build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test"
	- name: Upload Spark on K8S integration tests log files
	if: ${{ !success() }}
	uses: actions/upload-artifact@v4
	with:
	name: spark-on-kubernetes-it-log
	path: "**/target/integration-tests.log"

	ui:
	needs: [precondition]
	if: fromJson(needs.precondition.outputs.required).ui == 'true'
	name: Run Spark UI tests
	runs-on: ubuntu-latest
	timeout-minutes: 180
	steps:
	- uses: actions/checkout@v4
	- name: Use Node.js
	uses: actions/setup-node@v4
	with:
	node-version: 20
	cache: 'npm'
	cache-dependency-path: ui-test/package-lock.json
	- run: \|
	cd ui-test
	npm install --save-dev
	node --experimental-vm-modules node_modules/.bin/jest

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Update build_and_test.yml #1

Workflow file

Update build_and_test.yml #1

Jobs

Run details

Workflow file for this run

GitHub Actions / .github/workflows/build_and_test.yml