Skip to content

Consistency CI

Consistency CI #1

Workflow file for this run

# SPDX-FileCopyrightText: 2023 LakeSoul Contributors
#
# SPDX-License-Identifier: Apache-2.0
on:
push:
paths:
- "rust/**"
branches:
- 'main'
pull_request:
paths:
- "rust/**"
branches:
- 'main'
- 'release/**'
workflow_dispatch:
name: Consistency CI
env:
RUSTFLAGS: "-Awarnings"
jobs:
verify-hash-consistency:
name: verify benchmark results (amd64)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
with:
rust-version: stable
- name: Generate benchmark data and expected query results
run: |
mkdir -p lakesoul/test_files/tpch/data
git clone https://github.com/databricks/tpch-dbgen.git
cd tpch-dbgen
make
./dbgen -f -s 0.1
mv *.tbl ../lakesoul/test_files/tpch/data
- name: Verify that benchmark queries return expected results
run: |
export TPCH_DATA=`realpath lakesoul/test_files/tpch/data`
# use release build for plan verificaton because debug build causes stack overflow
# sudo cargo test verify_hash_consistency --package lakesoul-datafusion --features=ci -- --test-threads=1
- uses: actions/checkout@v4
- name: Set up JDK 8
uses: actions/setup-java@v4
with:
java-version: '8'
distribution: 'temurin'
cache: maven
- name: Set up Python 3.9
uses: actions/setup-python@v4
with:
python-version: '3.9'
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
pip install pymysql cryptography jproperties --no-cache-dir
wget https://dlcdn.apache.org/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz -O $HOME/hadoop-3.3.5.tar.gz && tar xf $HOME/hadoop-3.3.5.tar.gz -C $HOME
echo "HADOOP_HOME=$HOME/hadoop-3.3.5" >> $GITHUB_ENV
wget https://repo1.maven.org/maven2/org/apache/flink/flink-s3-fs-hadoop/1.17.1/flink-s3-fs-hadoop-1.17.1.jar -O $HOME/flink-s3-fs-hadoop-1.17.1.jar
wget https://repo1.maven.org/maven2/org/apache/parquet/parquet-hadoop-bundle/1.12.3/parquet-hadoop-bundle-1.12.3.jar -O $HOME/parquet-hadoop-bundle-1.12.3.jar
wget https://repo1.maven.org/maven2/org/apache/flink/flink-parquet/1.17.1/flink-parquet-1.17.1.jar -O $HOME/flink-parquet-1.17.1.jar
- name: Install Protoc
uses: arduino/setup-protoc@v2
with:
version: "23.x"
repo-token: ${{ secrets.GITHUB_TOKEN }}
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: nightly-2023-05-20
default: true
- uses: Swatinem/rust-cache@v2
with:
workspaces: "./rust -> target"
- name: Pull images
run: |
docker pull -q bitnami/spark:3.3.1
- uses: actions-rs/cargo@v1
with:
use-cross: true
command: build
args: '--manifest-path rust/Cargo.toml --target x86_64-unknown-linux-gnu --release --all-features'
- name: Build with Maven
run: |
mkdir -p rust/target/release
cp rust/target/x86_64-unknown-linux-gnu/release/liblakesoul_io_c.so rust/target/release
cp rust/target/x86_64-unknown-linux-gnu/release/liblakesoul_metadata_c.so rust/target/release
MAVEN_OPTS="-Xmx4000m" mvn -q -B package -f pom.xml -Pcross-build -DskipTests
- name: Get jar names
run: |
echo "FLINK_JAR_NAME=$(python script/get_jar_name.py lakesoul-flink)" >> $GITHUB_ENV
echo "FLINK_TEST_JAR_NAME=$(python script/get_jar_name.py lakesoul-flink | sed -e 's/.jar/-tests.jar/g')" >> $GITHUB_ENV
echo "SPARK_JAR_NAME=$(python script/get_jar_name.py lakesoul-spark)" >> $GITHUB_ENV
echo "SPARK_TEST_JAR_NAME=$(python script/get_jar_name.py lakesoul-spark | sed -e 's/.jar/-tests.jar/g')" >> $GITHUB_ENV
- name: Copy built jar to work-dir
run: |
cp ./lakesoul-flink/target/$FLINK_JAR_NAME ./script/benchmark/work-dir
cp ./lakesoul-flink/target/$FLINK_TEST_JAR_NAME ./script/benchmark/work-dir
cp ./lakesoul-spark/target/$SPARK_JAR_NAME ./script/benchmark/work-dir
cp ./lakesoul-spark/target/$SPARK_TEST_JAR_NAME ./script/benchmark/work-dir
- name: Hash-Consistency Verification task
run: |
cd ./script/benchmark
docker run --cpus 2 -m 5000m --rm -t -v ${PWD}/work-dir:/opt/spark/work-dir --env lakesoul_home=/opt/spark/work-dir/lakesoul.properties bitnami/spark:3.3.1 spark-submit --driver-memory 4G --executor-memory 4G --conf spark.driver.memoryOverhead=1500m --conf spark.executor.memoryOverhead=1500m --jars /opt/spark/work-dir/$SPARK_JAR_NAME --class org.apache.spark.sql.lakesoul.benchmark.Benchmark --master local[4] /opt/spark/work-dir/$SPARK_TEST_JAR_NAME