name: Build and test | ||
on: | ||
workflow_call: | ||
inputs: | ||
java: | ||
required: false | ||
type: string | ||
default: 17 | ||
branch: | ||
description: Branch to run the build against | ||
required: false | ||
type: string | ||
# Change 'master' to 'branch-4.0' in branch-4.0 branch after cutting it. | ||
default: master | ||
hadoop: | ||
description: Hadoop version to run with. The HADOOP_PROFILE environment variable should accept this value.
required: false | ||
type: string | ||
default: hadoop3 | ||
envs: | ||
description: Additional environment variables to set when running the tests. Should be in JSON format. | ||
required: false | ||
type: string | ||
default: '{}' | ||
jobs: | ||
description: >-
Jobs to run, in JSON format. The keys should match the job keys defined in this file,
e.g., build. See the precondition job below and the caller example after this inputs block.
required: false | ||
type: string | ||
default: '' | ||
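# For reference, a minimal sketch of how a caller workflow could invoke this reusable
# workflow (the caller job name and input values below are illustrative assumptions):
#
#   jobs:
#     run-build:
#       uses: ./.github/workflows/build_and_test.yml
#       with:
#         java: 17
#         branch: master
#         envs: '{"SPARK_ANSI_SQL_MODE": "true"}'
#         jobs: '{"build": "true", "docs": "true"}'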
jobs: | ||
precondition: | ||
name: Check changes | ||
runs-on: ubuntu-latest | ||
env: | ||
GITHUB_PREV_SHA: ${{ github.event.before }} | ||
outputs: | ||
required: ${{ steps.set-outputs.outputs.required }} | ||
image_url: ${{ steps.infra-image-outputs.outputs.image_url }} | ||
steps: | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
- name: Check all modules | ||
id: set-outputs | ||
run: | | ||
if [ -z "${{ inputs.jobs }}" ]; then | ||
pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` | ||
pyspark=`./dev/is-changed.py -m $pyspark_modules` | ||
if [[ "${{ github.repository }}" != 'apache/spark' ]]; then | ||
pandas=$pyspark | ||
yarn=`./dev/is-changed.py -m yarn` | ||
kubernetes=`./dev/is-changed.py -m kubernetes` | ||
sparkr=`./dev/is-changed.py -m sparkr` | ||
tpcds=`./dev/is-changed.py -m sql` | ||
docker=`./dev/is-changed.py -m docker-integration-tests` | ||
buf=true | ||
ui=true | ||
docs=true | ||
else | ||
pandas=false | ||
yarn=false | ||
kubernetes=false | ||
sparkr=false | ||
tpcds=false | ||
docker=false | ||
buf=false | ||
ui=false | ||
docs=false | ||
fi | ||
build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,network-common,network-shuffle,repl,launcher,examples,sketch,variant,api,catalyst,hive-thriftserver,mllib-local,mllib,graphx,streaming,sql-kafka-0-10,streaming-kafka-0-10,streaming-kinesis-asl,kubernetes,hadoop-cloud,spark-ganglia-lgpl,protobuf,yarn,connect,sql,hive"` | ||
precondition=" | ||
{ | ||
\"build\": \"$build\", | ||
\"pyspark\": \"$pyspark\", | ||
\"pyspark-pandas\": \"$pandas\", | ||
\"sparkr\": \"$sparkr\", | ||
\"tpcds-1g\": \"$tpcds\", | ||
\"docker-integration-tests\": \"$docker\", | ||
\"lint\" : \"true\", | ||
\"docs\" : \"$docs\", | ||
\"yarn\" : \"$yarn\", | ||
\"k8s-integration-tests\" : \"$kubernetes\", | ||
\"buf\" : \"$buf\", | ||
\"ui\" : \"$ui\", | ||
}" | ||
echo $precondition # For debugging | ||
# Remove `\n` to avoid "Invalid format" error | ||
precondition="${precondition//$'\n'/}}" | ||
echo "required=$precondition" >> $GITHUB_OUTPUT | ||
else | ||
# This is usually set by scheduled jobs. | ||
precondition='${{ inputs.jobs }}' | ||
echo $precondition # For debugging | ||
precondition="${precondition//$'\n'/}" | ||
echo "required=$precondition" >> $GITHUB_OUTPUT | ||
fi | ||
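# Illustrative shape of the "required" output (values here are made up): a single-line
# JSON object mapping job keys to "true"/"false" strings, e.g.
#   {"build": "true", "pyspark": "false", "tpcds-1g": "true", ...}
# Downstream jobs gate on it with expressions such as
#   if: fromJson(needs.precondition.outputs.required).build == 'true'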
- name: Generate infra image URL | ||
id: infra-image-outputs | ||
run: | | ||
# Convert to lowercase to meet Docker repo name requirement | ||
REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') | ||
IMG_NAME="apache-spark-ci-image:${{ inputs.branch }}-${{ github.run_id }}" | ||
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" | ||
echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT | ||
# Build: build Spark and run the tests for specified modules. | ||
build: | ||
name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}" | ||
needs: precondition | ||
if: fromJson(needs.precondition.outputs.required).build == 'true' | ||
runs-on: ubuntu-latest | ||
timeout-minutes: 180 | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
java: | ||
- ${{ inputs.java }} | ||
hadoop: | ||
- ${{ inputs.hadoop }} | ||
hive: | ||
- hive2.3 | ||
# Note that the modules below are from sparktestsupport/modules.py. | ||
modules: | ||
- >- | ||
core, unsafe, kvstore, avro, utils, | ||
network-common, network-shuffle, repl, launcher, | ||
examples, sketch, variant | ||
- >- | ||
api, catalyst, hive-thriftserver | ||
- >- | ||
mllib-local, mllib, graphx | ||
- >- | ||
streaming, sql-kafka-0-10, streaming-kafka-0-10, streaming-kinesis-asl, | ||
kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf, connect | ||
- yarn | ||
# Here, we split the Hive and SQL tests into slow ones and the rest.
included-tags: [""] | ||
excluded-tags: [""] | ||
comment: [""] | ||
include: | ||
# Hive tests | ||
- modules: hive | ||
java: ${{ inputs.java }} | ||
hadoop: ${{ inputs.hadoop }} | ||
hive: hive2.3 | ||
included-tags: org.apache.spark.tags.SlowHiveTest | ||
comment: "- slow tests" | ||
- modules: hive | ||
java: ${{ inputs.java }} | ||
hadoop: ${{ inputs.hadoop }} | ||
hive: hive2.3 | ||
excluded-tags: org.apache.spark.tags.SlowHiveTest | ||
comment: "- other tests" | ||
# SQL tests | ||
- modules: sql | ||
java: ${{ inputs.java }} | ||
hadoop: ${{ inputs.hadoop }} | ||
hive: hive2.3 | ||
included-tags: org.apache.spark.tags.ExtendedSQLTest | ||
comment: "- extended tests" | ||
- modules: sql | ||
java: ${{ inputs.java }} | ||
hadoop: ${{ inputs.hadoop }} | ||
hive: hive2.3 | ||
included-tags: org.apache.spark.tags.SlowSQLTest | ||
comment: "- slow tests" | ||
- modules: sql | ||
java: ${{ inputs.java }} | ||
hadoop: ${{ inputs.hadoop }} | ||
hive: hive2.3 | ||
excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest | ||
comment: "- other tests" | ||
exclude: | ||
# Always run if yarn == 'true', even if infra-image is skipped (such as in non-master jobs);
# see the note after this exclude list for how the expression below works.
# In practice, the build will run in individual PRs, but not against individual commits
# in the Apache Spark repository.
- modules: ${{ fromJson(needs.precondition.outputs.required).yarn != 'true' && 'yarn' }} | ||
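# Note on the exclude expression above: `A && 'yarn'` evaluates to the string 'yarn'
# only when A is true, i.e. when the yarn job is not required, so the yarn matrix entry
# is dropped; otherwise it evaluates to false and `modules: false` matches nothing.
#   yarn not required  ->  - modules: yarn   (entry excluded)
#   yarn required      ->  - modules: false  (no entry matches, nothing excluded)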
env: | ||
MODULES_TO_TEST: ${{ matrix.modules }} | ||
EXCLUDED_TAGS: ${{ matrix.excluded-tags }} | ||
INCLUDED_TAGS: ${{ matrix.included-tags }} | ||
HADOOP_PROFILE: ${{ matrix.hadoop }} | ||
HIVE_PROFILE: ${{ matrix.hive }} | ||
# GitHub Actions' default miniconda to use in pip packaging test. | ||
CONDA_PREFIX: /usr/share/miniconda | ||
GITHUB_PREV_SHA: ${{ github.event.before }} | ||
SPARK_LOCAL_IP: localhost | ||
NOLINT_ON_COMPILE: true | ||
SKIP_UNIDOC: true | ||
SKIP_MIMA: true | ||
SKIP_PACKAGING: true | ||
steps: | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
# In order to fetch changed files | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
# Cache local repositories. Note that GitHub Actions cache has a 10G limit. | ||
- name: Cache SBT and Maven | ||
uses: actions/cache@v4 | ||
with: | ||
path: | | ||
build/apache-maven-* | ||
build/*.jar | ||
~/.sbt | ||
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} | ||
restore-keys: | | ||
build- | ||
- name: Cache Coursier local repository | ||
uses: actions/cache@v4 | ||
with: | ||
path: ~/.cache/coursier | ||
key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} | ||
restore-keys: | | ||
${{ matrix.java }}-${{ matrix.hadoop }}-coursier- | ||
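# Note on the two caches above: the primary key embeds a hash of the build files, so any
# change produces a fresh entry, while restore-keys lets a cache miss fall back to the
# most recent entry with the same prefix. Hypothetical key values for illustration:
#   key:          17-hadoop3-coursier-a1b2c3d4   (exact match preferred)
#   restore-keys: 17-hadoop3-coursier-           (prefix fallback on a miss)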
- name: Free up disk space | ||
run: | | ||
if [ -f ./dev/free_disk_space ]; then | ||
./dev/free_disk_space | ||
fi | ||
- name: Install Java ${{ matrix.java }} | ||
uses: actions/setup-java@v4 | ||
with: | ||
distribution: zulu | ||
java-version: ${{ matrix.java }} | ||
- name: Install Python 3.11 | ||
uses: actions/setup-python@v5 | ||
# We should install a recent Python 3 for the SQL and YARN modules because:
# - The SQL component also has Python-related tests, for example, IntegratedUDFTestUtils.
# - YARN also has a Python-specific test, for example, YarnClusterSuite.
if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') | ||
with: | ||
python-version: '3.11' | ||
architecture: x64 | ||
- name: Install Python packages (Python 3.11) | ||
if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect') | ||
run: | | ||
python3.11 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.67.0' 'grpcio-status==1.67.0' 'protobuf==5.28.3' | ||
python3.11 -m pip list | ||
# Run the tests. | ||
- name: Run tests | ||
env: ${{ fromJSON(inputs.envs) }} | ||
shell: 'script -q -e -c "bash {0}"' | ||
run: | | ||
# Fix for TTY related issues when launching the Ammonite REPL in tests. | ||
export TERM=vt100 | ||
# Hive "other tests" test needs larger metaspace size based on experiment. | ||
if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi | ||
# SPARK-46283: should delete the following env replacement after SPARK 3.x EOL | ||
if [[ "$MODULES_TO_TEST" == *"streaming-kinesis-asl"* ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then | ||
MODULES_TO_TEST=${MODULES_TO_TEST//streaming-kinesis-asl, /} | ||
fi | ||
export SERIAL_SBT_TESTS=1 | ||
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" | ||
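# For the "sql" / "- slow tests" matrix entry, for example, the command above expands to
# roughly (illustrative):
#   ./dev/run-tests --parallelism 1 --modules "sql" \
#     --included-tags "org.apache.spark.tags.SlowSQLTest" --excluded-tags ""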
- name: Upload test results to report | ||
if: always() | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} | ||
path: "**/target/test-reports/*.xml" | ||
- name: Upload unit tests log files | ||
if: ${{ !success() }} | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} | ||
path: "**/target/*.log" | ||
infra-image: | ||
name: "Base image build" | ||
needs: precondition | ||
if: >- | ||
fromJson(needs.precondition.outputs.required).pyspark == 'true' || | ||
fromJson(needs.precondition.outputs.required).lint == 'true' || | ||
fromJson(needs.precondition.outputs.required).sparkr == 'true' | ||
runs-on: ubuntu-latest | ||
permissions: | ||
packages: write | ||
steps: | ||
- name: Login to GitHub Container Registry | ||
uses: docker/login-action@v3 | ||
with: | ||
registry: ghcr.io | ||
username: ${{ github.actor }} | ||
password: ${{ secrets.GITHUB_TOKEN }} | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
# In order to fetch changed files | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
- name: Set up QEMU | ||
uses: docker/setup-qemu-action@v3 | ||
- name: Set up Docker Buildx | ||
uses: docker/setup-buildx-action@v3 | ||
- name: Build and push | ||
id: docker_build | ||
uses: docker/build-push-action@v6 | ||
with: | ||
context: ./dev/infra/ | ||
push: true | ||
tags: | | ||
${{ needs.precondition.outputs.image_url }} | ||
# Use the infra image cache to speed up the build
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }} | ||
pyspark: | ||
needs: [precondition, infra-image] | ||
# Always run if pyspark == 'true', even if infra-image is skipped (such as in non-master jobs).
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).pyspark == 'true' | ||
name: "Build modules: ${{ matrix.modules }}" | ||
runs-on: ubuntu-latest | ||
timeout-minutes: 180 | ||
container: | ||
image: ${{ needs.precondition.outputs.image_url }} | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
java: | ||
- ${{ inputs.java }} | ||
modules: | ||
- >- | ||
pyspark-sql, pyspark-resource, pyspark-testing | ||
- >- | ||
pyspark-core, pyspark-errors, pyspark-streaming | ||
- >- | ||
pyspark-mllib, pyspark-ml, pyspark-ml-connect | ||
- >- | ||
pyspark-connect | ||
- >- | ||
pyspark-pandas | ||
- >- | ||
pyspark-pandas-slow | ||
- >- | ||
pyspark-pandas-connect-part0 | ||
- >- | ||
pyspark-pandas-connect-part1 | ||
- >- | ||
pyspark-pandas-connect-part2 | ||
- >- | ||
pyspark-pandas-connect-part3 | ||
exclude: | ||
# Always run if pyspark-pandas == 'true', even if infra-image is skipped (such as in non-master jobs).
# In practice, the build will run in individual PRs, but not against individual commits
# in the Apache Spark repository.
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas' }} | ||
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-slow' }} | ||
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part0' }} | ||
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }} | ||
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }} | ||
- modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }} | ||
env: | ||
MODULES_TO_TEST: ${{ matrix.modules }} | ||
PYTHON_TO_TEST: 'python3.11' | ||
HADOOP_PROFILE: ${{ inputs.hadoop }} | ||
HIVE_PROFILE: hive2.3 | ||
# GitHub Actions' default miniconda to use in pip packaging test. | ||
CONDA_PREFIX: /usr/share/miniconda | ||
GITHUB_PREV_SHA: ${{ github.event.before }} | ||
SPARK_LOCAL_IP: localhost | ||
SKIP_UNIDOC: true | ||
SKIP_MIMA: true | ||
SKIP_PACKAGING: true | ||
METASPACE_SIZE: 1g | ||
BRANCH: ${{ inputs.branch }} | ||
steps: | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
# In order to fetch changed files | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Add GITHUB_WORKSPACE to git trust safe.directory | ||
run: | | ||
git config --global --add safe.directory ${GITHUB_WORKSPACE} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
# Cache local repositories. Note that GitHub Actions cache has a 10G limit. | ||
- name: Cache SBT and Maven | ||
uses: actions/cache@v4 | ||
with: | ||
path: | | ||
build/apache-maven-* | ||
build/*.jar | ||
~/.sbt | ||
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} | ||
restore-keys: | | ||
build- | ||
- name: Cache Coursier local repository | ||
uses: actions/cache@v4 | ||
with: | ||
path: ~/.cache/coursier | ||
key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} | ||
restore-keys: | | ||
pyspark-coursier- | ||
- name: Free up disk space | ||
shell: 'script -q -e -c "bash {0}"' | ||
run: ./dev/free_disk_space_container | ||
- name: Install Java ${{ matrix.java }} | ||
uses: actions/setup-java@v4 | ||
with: | ||
distribution: zulu | ||
java-version: ${{ matrix.java }} | ||
- name: List Python packages (${{ env.PYTHON_TO_TEST }}) | ||
env: ${{ fromJSON(inputs.envs) }} | ||
shell: 'script -q -e -c "bash {0}"' | ||
run: | | ||
for py in $(echo $PYTHON_TO_TEST | tr "," "\n") | ||
do | ||
echo $py | ||
$py -m pip list | ||
done | ||
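# The loop above exists because PYTHON_TO_TEST may hold a comma-separated list of
# interpreters; with the default in this file it is a single one. A hypothetical
# multi-interpreter value would be handled the same way, e.g.:
#   PYTHON_TO_TEST='python3.9,python3.11'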
# Run the tests. | ||
- name: Run tests | ||
env: ${{ fromJSON(inputs.envs) }} | ||
shell: 'script -q -e -c "bash {0}"' | ||
run: | | ||
if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then | ||
export SKIP_PACKAGING=false | ||
echo "Python Packaging Tests Enabled!" | ||
fi | ||
if [ ! -z "$PYTHON_TO_TEST" ]; then | ||
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST" | ||
else | ||
# For branch-3.5 and below, it uses the default Python versions. | ||
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" | ||
fi | ||
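# The Codecov upload below only runs when the caller opts in through the 'envs' input,
# for example (illustrative caller value):
#   envs: '{"PYSPARK_CODECOV": "true"}'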
- name: Upload coverage to Codecov | ||
if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true' | ||
uses: codecov/codecov-action@v4 | ||
with: | ||
files: ./python/coverage.xml | ||
flags: unittests | ||
name: PySpark | ||
- name: Upload test results to report | ||
env: ${{ fromJSON(inputs.envs) }} | ||
if: always() | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} | ||
path: "**/target/test-reports/*.xml" | ||
- name: Upload unit tests log files | ||
env: ${{ fromJSON(inputs.envs) }} | ||
if: ${{ !success() }} | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }} | ||
path: "**/target/unit-tests.log" | ||
sparkr: | ||
needs: [precondition, infra-image] | ||
# Always run if sparkr == 'true', even if infra-image is skipped (such as in non-master jobs).
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true' | ||
name: "Build modules: sparkr" | ||
runs-on: ubuntu-latest | ||
timeout-minutes: 180 | ||
container: | ||
image: ${{ needs.precondition.outputs.image_url }} | ||
env: | ||
HADOOP_PROFILE: ${{ inputs.hadoop }} | ||
HIVE_PROFILE: hive2.3 | ||
GITHUB_PREV_SHA: ${{ github.event.before }} | ||
SPARK_LOCAL_IP: localhost | ||
SKIP_UNIDOC: true | ||
SKIP_MIMA: true | ||
SKIP_PACKAGING: true | ||
steps: | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
# In order to fetch changed files | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Add GITHUB_WORKSPACE to git trust safe.directory | ||
run: | | ||
git config --global --add safe.directory ${GITHUB_WORKSPACE} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
# Cache local repositories. Note that GitHub Actions cache has a 10G limit. | ||
- name: Cache SBT and Maven | ||
uses: actions/cache@v4 | ||
with: | ||
path: | | ||
build/apache-maven-* | ||
build/*.jar | ||
~/.sbt | ||
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} | ||
restore-keys: | | ||
build- | ||
- name: Cache Coursier local repository | ||
uses: actions/cache@v4 | ||
with: | ||
path: ~/.cache/coursier | ||
key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} | ||
restore-keys: | | ||
sparkr-coursier- | ||
- name: Free up disk space | ||
run: ./dev/free_disk_space_container | ||
- name: Install Java ${{ inputs.java }} | ||
uses: actions/setup-java@v4 | ||
with: | ||
distribution: zulu | ||
java-version: ${{ inputs.java }} | ||
- name: Run tests | ||
env: ${{ fromJSON(inputs.envs) }} | ||
run: | | ||
# The following are also used by `r-lib/actions/setup-r` to avoid
# R issues in the Docker environment
export TZ=UTC | ||
export _R_CHECK_SYSTEM_CLOCK_=FALSE | ||
./dev/run-tests --parallelism 1 --modules sparkr | ||
- name: Upload test results to report | ||
if: always() | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: test-results-sparkr--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 | ||
path: "**/target/test-reports/*.xml" | ||
buf: | ||
needs: [precondition] | ||
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).buf == 'true' | ||
name: Protobuf breaking change detection and Python CodeGen check | ||
runs-on: ubuntu-latest | ||
steps: | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
- name: Install Buf | ||
uses: bufbuild/buf-setup-action@v1 | ||
with: | ||
github_token: ${{ secrets.GITHUB_TOKEN }} | ||
- name: Protocol Buffers Linter | ||
uses: bufbuild/buf-lint-action@v1 | ||
with: | ||
input: core/src/main/protobuf | ||
# Change 'branch-3.5' to 'branch-4.0' in master branch after cutting branch-4.0 branch. | ||
- name: Breaking change detection against branch-3.5 | ||
uses: bufbuild/buf-breaking-action@v1 | ||
with: | ||
input: sql/connect/common/src/main | ||
against: 'https://github.com/apache/spark.git#branch=branch-3.5,subdir=connector/connect/common/src/main' | ||
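# A rough local equivalent of the breaking-change check above, assuming the buf CLI is
# installed (sketch, not part of this workflow):
#   buf breaking sql/connect/common/src/main \
#     --against 'https://github.com/apache/spark.git#branch=branch-3.5,subdir=connector/connect/common/src/main'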
- name: Install Python 3.11 | ||
uses: actions/setup-python@v5 | ||
with: | ||
python-version: '3.11' | ||
- name: Install dependencies for Python CodeGen check | ||
run: | | ||
python3.11 -m pip install 'black==23.9.1' 'protobuf==5.28.3' 'mypy==1.8.0' 'mypy-protobuf==3.3.0' | ||
python3.11 -m pip list | ||
- name: Python CodeGen check | ||
run: ./dev/connect-check-protos.py | ||
# Static analysis | ||
lint: | ||
needs: [precondition, infra-image] | ||
# Always run if lint == 'true', even if infra-image is skipped (such as in non-master jobs).
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true' | ||
name: Linters, licenses, and dependencies | ||
runs-on: ubuntu-latest | ||
timeout-minutes: 180 | ||
env: | ||
LC_ALL: C.UTF-8 | ||
LANG: C.UTF-8 | ||
NOLINT_ON_COMPILE: false | ||
PYSPARK_DRIVER_PYTHON: python3.9 | ||
PYSPARK_PYTHON: python3.9 | ||
GITHUB_PREV_SHA: ${{ github.event.before }} | ||
container: | ||
image: ${{ needs.precondition.outputs.image_url }} | ||
steps: | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Add GITHUB_WORKSPACE to git trust safe.directory | ||
run: | | ||
git config --global --add safe.directory ${GITHUB_WORKSPACE} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
# Cache local repositories. Note that GitHub Actions cache has a 10G limit. | ||
- name: Cache SBT and Maven | ||
uses: actions/cache@v4 | ||
with: | ||
path: | | ||
build/apache-maven-* | ||
build/*.jar | ||
~/.sbt | ||
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} | ||
restore-keys: | | ||
build- | ||
- name: Cache Coursier local repository | ||
uses: actions/cache@v4 | ||
with: | ||
path: ~/.cache/coursier | ||
key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} | ||
restore-keys: | | ||
docs-coursier- | ||
- name: Cache Maven local repository | ||
uses: actions/cache@v4 | ||
with: | ||
path: ~/.m2/repository | ||
key: docs-maven-${{ hashFiles('**/pom.xml') }} | ||
restore-keys: | | ||
docs-maven- | ||
- name: Free up disk space | ||
run: ./dev/free_disk_space_container | ||
- name: Install Java ${{ inputs.java }} | ||
uses: actions/setup-java@v4 | ||
with: | ||
distribution: zulu | ||
java-version: ${{ inputs.java }} | ||
- name: License test | ||
run: ./dev/check-license | ||
- name: Dependencies test | ||
run: ./dev/test-dependencies.sh | ||
- name: MIMA test | ||
run: ./dev/mima | ||
- name: Scala linter | ||
run: ./dev/lint-scala | ||
- name: Scala structured logging check | ||
run: | | ||
if [ -f ./dev/structured_logging_style.py ]; then | ||
python3.9 ./dev/structured_logging_style.py | ||
fi | ||
- name: Java linter | ||
run: ./dev/lint-java | ||
- name: Spark connect jvm client mima check | ||
run: ./dev/connect-jvm-client-mima-check | ||
- name: Install Python linter dependencies for branch-3.5 | ||
if: inputs.branch == 'branch-3.5' | ||
run: | | ||
# SPARK-45212: Copy from https://github.com/apache/spark/blob/555c8def51e5951c7bf5165a332795e9e330ec9d/.github/workflows/build_and_test.yml#L631-L638 | ||
# Should delete this section after SPARK 3.5 EOL. | ||
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0' | ||
python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' | ||
- name: Install Python dependencies for python linter and documentation generation | ||
if: inputs.branch != 'branch-3.5' | ||
run: | | ||
# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 | ||
# See 'ipython_genutils' in SPARK-38517 | ||
# See 'docutils<0.18.0' in SPARK-39421 | ||
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ | ||
ipython ipython_genutils sphinx_plotly_directive numpy pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ | ||
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ | ||
'pandas-stubs==1.2.0.53' 'grpcio==1.67.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ | ||
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' | ||
python3.9 -m pip list | ||
- name: Python linter | ||
run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python | ||
# Should delete this section after SPARK 3.5 EOL. | ||
- name: Install dependencies for Python code generation check for branch-3.5 | ||
if: inputs.branch == 'branch-3.5' | ||
run: | | ||
# See more in "Installation" https://docs.buf.build/installation#tarball | ||
curl -LO https://github.com/bufbuild/buf/releases/download/v1.28.1/buf-Linux-x86_64.tar.gz | ||
mkdir -p $HOME/buf | ||
tar -xvzf buf-Linux-x86_64.tar.gz -C $HOME/buf --strip-components 1 | ||
rm buf-Linux-x86_64.tar.gz | ||
python3.9 -m pip install 'protobuf==4.25.1' 'mypy-protobuf==3.3.0' | ||
# Should delete this section after SPARK 3.5 EOL. | ||
- name: Python code generation check for branch-3.5 | ||
if: inputs.branch == 'branch-3.5' | ||
run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi | ||
# Should delete this section after SPARK 3.5 EOL. | ||
- name: Install JavaScript linter dependencies for branch-3.5 | ||
if: inputs.branch == 'branch-3.5' | ||
run: | | ||
apt update | ||
apt-get install -y nodejs npm | ||
- name: JS linter | ||
run: ./dev/lint-js | ||
# Should delete this section after SPARK 3.5 EOL. | ||
- name: Install R linter dependencies for branch-3.5 | ||
if: inputs.branch == 'branch-3.5' | ||
run: | | ||
apt update | ||
apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \ | ||
libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \ | ||
libtiff5-dev libjpeg-dev | ||
Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" | ||
Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" | ||
- name: Install R linter dependencies and SparkR | ||
run: ./R/install-dev.sh | ||
- name: R linter | ||
run: ./dev/lint-r | ||
# Documentation build | ||
docs: | ||
needs: [precondition, infra-image] | ||
# Always run if docs == 'true', even if infra-image is skipped (such as in non-master jobs).
if: (!cancelled()) && fromJson(needs.precondition.outputs.required).docs == 'true' | ||
name: Documentation generation | ||
runs-on: ubuntu-latest | ||
timeout-minutes: 180 | ||
env: | ||
LC_ALL: C.UTF-8 | ||
LANG: C.UTF-8 | ||
NOLINT_ON_COMPILE: false | ||
PYSPARK_DRIVER_PYTHON: python3.9 | ||
PYSPARK_PYTHON: python3.9 | ||
GITHUB_PREV_SHA: ${{ github.event.before }} | ||
container: | ||
image: ${{ needs.precondition.outputs.image_url }} | ||
steps: | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Add GITHUB_WORKSPACE to git trust safe.directory | ||
run: | | ||
git config --global --add safe.directory ${GITHUB_WORKSPACE} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
# Cache local repositories. Note that GitHub Actions cache has a 10G limit. | ||
- name: Cache SBT and Maven | ||
uses: actions/cache@v4 | ||
with: | ||
path: | | ||
build/apache-maven-* | ||
build/*.jar | ||
~/.sbt | ||
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} | ||
restore-keys: | | ||
build- | ||
- name: Cache Coursier local repository | ||
uses: actions/cache@v4 | ||
with: | ||
path: ~/.cache/coursier | ||
key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} | ||
restore-keys: | | ||
docs-coursier- | ||
- name: Cache Maven local repository | ||
uses: actions/cache@v4 | ||
with: | ||
path: ~/.m2/repository | ||
key: docs-maven-${{ hashFiles('**/pom.xml') }} | ||
restore-keys: | | ||
docs-maven- | ||
- name: Free up disk space | ||
run: ./dev/free_disk_space_container | ||
- name: Install Java ${{ inputs.java }} | ||
uses: actions/setup-java@v4 | ||
with: | ||
distribution: zulu | ||
java-version: ${{ inputs.java }} | ||
- name: Install Python dependencies for python linter and documentation generation | ||
if: inputs.branch != 'branch-3.5' | ||
run: | | ||
# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 | ||
# See 'ipython_genutils' in SPARK-38517 | ||
# See 'docutils<0.18.0' in SPARK-39421 | ||
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ | ||
ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ | ||
'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ | ||
'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ | ||
'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' | ||
python3.9 -m pip list | ||
- name: Install dependencies for documentation generation for branch-3.5 | ||
if: inputs.branch == 'branch-3.5' | ||
run: | | ||
# pandoc is also required by nbsphinx to generate the PySpark API docs.
apt-get update -y | ||
apt-get install -y libcurl4-openssl-dev pandoc | ||
apt-get install -y ruby ruby-dev | ||
Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')" | ||
Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')" | ||
Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" | ||
# Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 | ||
python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' | ||
python3.9 -m pip install ipython_genutils # See SPARK-38517 | ||
python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' | ||
python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421 | ||
- name: Install dependencies for documentation generation | ||
run: | | ||
# Keep the version of Bundler here in sync with the following locations: | ||
# - dev/create-release/spark-rm/Dockerfile | ||
# - docs/README.md | ||
gem install bundler -v 2.4.22 | ||
cd docs | ||
bundle install --retry=100 | ||
- name: Run documentation build | ||
run: | | ||
# We need this link to make sure `python3` points to `python3.9` which contains the prerequisite packages. | ||
ln -s "$(which python3.9)" "/usr/local/bin/python3" | ||
# Build docs first with SKIP_API to ensure they are buildable without requiring any | ||
# language docs to be built beforehand. | ||
cd docs; SKIP_ERRORDOC=1 SKIP_API=1 bundle exec jekyll build; cd .. | ||
if [ -f "./dev/is-changed.py" ]; then | ||
# Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs | ||
pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` | ||
if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi | ||
if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi | ||
fi | ||
# Print the values of environment variables `SKIP_ERRORDOC`, `SKIP_SCALADOC`, `SKIP_PYTHONDOC`, `SKIP_RDOC` and `SKIP_SQLDOC` | ||
echo "SKIP_ERRORDOC: $SKIP_ERRORDOC" | ||
echo "SKIP_SCALADOC: $SKIP_SCALADOC" | ||
echo "SKIP_PYTHONDOC: $SKIP_PYTHONDOC" | ||
echo "SKIP_RDOC: $SKIP_RDOC" | ||
echo "SKIP_SQLDOC: $SKIP_SQLDOC" | ||
cd docs | ||
bundle exec jekyll build | ||
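# A minimal local sketch of the same selective build, assuming the docs toolchain from
# docs/README.md is installed; the SKIP_* variables work the same way outside CI:
#   cd docs && SKIP_PYTHONDOC=1 SKIP_RDOC=1 bundle exec jekyll build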
- name: Tar documentation | ||
if: github.repository != 'apache/spark' | ||
run: tar cjf site.tar.bz2 docs/_site | ||
- name: Upload documentation | ||
if: github.repository != 'apache/spark' | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: site | ||
path: site.tar.bz2 | ||
retention-days: 1 | ||
# Any TPC-DS related updates to this job need to be applied to the tpcds-1g-gen job of benchmark.yml as well
tpcds-1g: | ||
needs: precondition | ||
if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true' | ||
name: Run TPC-DS queries with SF=1 | ||
# Pin to 'Ubuntu 20.04' due to 'databricks/tpcds-kit' compilation | ||
runs-on: ubuntu-20.04 | ||
timeout-minutes: 180 | ||
env: | ||
SPARK_LOCAL_IP: localhost | ||
steps: | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
- name: Cache SBT and Maven | ||
uses: actions/cache@v4 | ||
with: | ||
path: | | ||
build/apache-maven-* | ||
build/*.jar | ||
~/.sbt | ||
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} | ||
restore-keys: | | ||
build- | ||
- name: Cache Coursier local repository | ||
uses: actions/cache@v4 | ||
with: | ||
path: ~/.cache/coursier | ||
key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} | ||
restore-keys: | | ||
tpcds-coursier- | ||
- name: Install Java ${{ inputs.java }} | ||
uses: actions/setup-java@v4 | ||
with: | ||
distribution: zulu | ||
java-version: ${{ inputs.java }} | ||
- name: Cache TPC-DS generated data | ||
id: cache-tpcds-sf-1 | ||
uses: actions/cache@v4 | ||
with: | ||
path: ./tpcds-sf-1 | ||
key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }} | ||
- name: Checkout tpcds-kit repository | ||
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' | ||
uses: actions/checkout@v4 | ||
with: | ||
repository: databricks/tpcds-kit | ||
ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069 | ||
path: ./tpcds-kit | ||
- name: Build tpcds-kit | ||
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' | ||
run: cd tpcds-kit/tools && make OS=LINUX | ||
- name: Generate TPC-DS (SF=1) table data | ||
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' | ||
run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite" | ||
- name: Run TPC-DS queries (Sort merge join) | ||
run: | | ||
SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" | ||
env: | ||
SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} | ||
SPARK_TPCDS_JOIN_CONF: | | ||
spark.sql.autoBroadcastJoinThreshold=-1 | ||
spark.sql.join.preferSortMergeJoin=true | ||
- name: Run TPC-DS queries (Broadcast hash join) | ||
run: | | ||
SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" | ||
env: | ||
SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} | ||
SPARK_TPCDS_JOIN_CONF: | | ||
spark.sql.autoBroadcastJoinThreshold=10485760 | ||
- name: Run TPC-DS queries (Shuffled hash join) | ||
run: | | ||
SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" | ||
env: | ||
SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} | ||
SPARK_TPCDS_JOIN_CONF: | | ||
spark.sql.autoBroadcastJoinThreshold=-1 | ||
spark.sql.join.forceApplyShuffledHashJoin=true | ||
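# Note on the three TPC-DS runs above: they execute the same TPCDSQueryTestSuite and
# differ only in SPARK_TPCDS_JOIN_CONF, a block of newline-separated key=value pairs
# that the suite is expected to apply as Spark SQL confs to force sort-merge,
# broadcast-hash, and shuffled-hash joins respectively.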
- name: Run TPC-DS queries on collated data | ||
run: | | ||
SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSCollationQueryTestSuite" | ||
- name: Upload test results to report | ||
if: always() | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: test-results-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 | ||
path: "**/target/test-reports/*.xml" | ||
- name: Upload unit tests log files | ||
if: ${{ !success() }} | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: unit-tests-log-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 | ||
path: "**/target/unit-tests.log" | ||
docker-integration-tests: | ||
needs: precondition | ||
if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true' | ||
name: Run Docker integration tests | ||
runs-on: ubuntu-latest | ||
timeout-minutes: 180 | ||
env: | ||
HADOOP_PROFILE: ${{ inputs.hadoop }} | ||
HIVE_PROFILE: hive2.3 | ||
GITHUB_PREV_SHA: ${{ github.event.before }} | ||
SPARK_LOCAL_IP: localhost | ||
SKIP_UNIDOC: true | ||
SKIP_MIMA: true | ||
SKIP_PACKAGING: true | ||
steps: | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
- name: Cache SBT and Maven | ||
uses: actions/cache@v4 | ||
with: | ||
path: | | ||
build/apache-maven-* | ||
build/*.jar | ||
~/.sbt | ||
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} | ||
restore-keys: | | ||
build- | ||
- name: Cache Coursier local repository | ||
uses: actions/cache@v4 | ||
with: | ||
path: ~/.cache/coursier | ||
key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} | ||
restore-keys: | | ||
docker-integration-coursier- | ||
- name: Install Java ${{ inputs.java }} | ||
uses: actions/setup-java@v4 | ||
with: | ||
distribution: zulu | ||
java-version: ${{ inputs.java }} | ||
- name: Run tests | ||
env: ${{ fromJSON(inputs.envs) }} | ||
run: | | ||
./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest | ||
- name: Upload test results to report | ||
if: always() | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: test-results-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 | ||
path: "**/target/test-reports/*.xml" | ||
- name: Upload unit tests log files | ||
if: ${{ !success() }} | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: unit-tests-log-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3 | ||
path: "**/target/unit-tests.log" | ||
k8s-integration-tests: | ||
needs: precondition | ||
if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true' | ||
name: Run Spark on Kubernetes Integration test | ||
runs-on: ubuntu-latest | ||
timeout-minutes: 180 | ||
steps: | ||
- name: Checkout Spark repository | ||
uses: actions/checkout@v4 | ||
with: | ||
fetch-depth: 0 | ||
repository: apache/spark | ||
ref: ${{ inputs.branch }} | ||
- name: Sync the current branch with the latest in Apache Spark | ||
if: github.repository != 'apache/spark' | ||
run: | | ||
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV | ||
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD | ||
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit" --allow-empty | ||
- name: Cache SBT and Maven | ||
uses: actions/cache@v4 | ||
with: | ||
path: | | ||
build/apache-maven-* | ||
build/*.jar | ||
~/.sbt | ||
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} | ||
restore-keys: | | ||
build- | ||
- name: Cache Coursier local repository | ||
uses: actions/cache@v4 | ||
with: | ||
path: ~/.cache/coursier | ||
key: k8s-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} | ||
restore-keys: | | ||
k8s-integration-coursier- | ||
- name: Install Java ${{ inputs.java }} | ||
uses: actions/setup-java@v4 | ||
with: | ||
distribution: zulu | ||
java-version: ${{ inputs.java }} | ||
- name: Install R | ||
run: | | ||
sudo apt update | ||
sudo apt-get install r-base | ||
- name: Start Minikube | ||
uses: medyagh/[email protected] | ||
with: | ||
# GitHub Actions runners are limited to 2 CPUs and 6947MB of memory; limit to 2 CPUs / 6GB here for more stable resource statistics
cpus: 2 | ||
memory: 6144m | ||
- name: Print K8S pods and nodes info | ||
run: | | ||
kubectl get pods -A | ||
kubectl describe node | ||
- name: Run Spark on K8S integration test | ||
run: | | ||
# Prepare PV test | ||
PVC_TMP_DIR=$(mktemp -d) | ||
export PVC_TESTS_HOST_PATH=$PVC_TMP_DIR | ||
export PVC_TESTS_VM_PATH=$PVC_TMP_DIR | ||
minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 & | ||
kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true | ||
if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then | ||
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true | ||
else | ||
kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.9.0/installer/volcano-development.yaml || true | ||
fi | ||
eval $(minikube docker-env) | ||
build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test" | ||
- name: Upload Spark on K8S integration tests log files | ||
if: ${{ !success() }} | ||
uses: actions/upload-artifact@v4 | ||
with: | ||
name: spark-on-kubernetes-it-log | ||
path: "**/target/integration-tests.log" | ||
ui: | ||
needs: [precondition] | ||
if: fromJson(needs.precondition.outputs.required).ui == 'true' | ||
name: Run Spark UI tests | ||
runs-on: ubuntu-latest | ||
timeout-minutes: 180 | ||
steps: | ||
- uses: actions/checkout@v4 | ||
- name: Use Node.js | ||
uses: actions/setup-node@v4 | ||
with: | ||
node-version: 20 | ||
cache: 'npm' | ||
cache-dependency-path: ui-test/package-lock.json | ||
- run: | | ||
cd ui-test | ||
npm install --save-dev | ||
node --experimental-vm-modules node_modules/.bin/jest |