# (Web page navigation residue: "Skip to content")
# Dataflow Engine Chaos #6522

# Workflow triggers and run-concurrency policy for the dataflow engine chaos job.
name: Dataflow Engine Chaos

on:
  schedule:
    # Minute 0 of every hour from 17:00 to 23:00 UTC, i.e. 01:00 ~ 07:00 UTC+8.
    - cron: "0 17-23 * * *"
  workflow_dispatch:
    inputs:
      pr:
        description: "Which PR do you want to trigger (use PR number, such as 6127)"
        required: true
        default: ""

# Runs sharing the same ref and workflow form one group; a newer run cancels
# the one still in progress.
# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
concurrency:
  group: "${{ github.ref }}-${{ github.workflow }}"
  cancel-in-progress: true
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "base"; it is instantiated once
  # per entry of the chaos-obj matrix below.
  base:
    # The type of runner that the job will run on.
    # The ubuntu-20.04 hosted image has been retired by GitHub, so jobs that
    # request it never get a runner; use the next LTS image instead.
    # NOTE(review): confirm kind and chaos-mesh behave the same on this image.
    runs-on: ubuntu-22.04
    timeout-minutes: 50
    strategy:
      # Keep the remaining matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        # Each value is the base name of a manifest that later steps read from
        # engine/chaos/manifests/<value>.yaml.
        chaos-obj:
          - pod-failure-dataflow
          - pod-kill-dataflow
          - network-partition-dataflow
          - network-emulation-dataflow
          - time-shift-dataflow
    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
# Default checkout of the ref the run was started on.
- uses: actions/checkout@v2
# When a PR number was supplied through workflow_dispatch, check out that
# pull request's head ref on top of the default checkout.
- name: check out code by workerflow dispatch PR
  if: github.event.inputs.pr != ''
  uses: actions/checkout@v2
  with:
    ref: "refs/pull/${{ github.event.inputs.pr }}/head"
# Install the Go toolchain used by the build steps.
- uses: actions/setup-go@v3
  with:
    go-version: '1.23'
# Reuse the Go module download cache between runs; the key changes whenever
# go.sum changes.
# actions/cache v1/v2 have been retired by GitHub and fail the job, so the
# currently supported major version is used.
- name: Cache go modules
  uses: actions/cache@v4
  with:
    path: ~/go/pkg/mod
    key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }}
# Create the kind cluster named dataflow-engine-cluster from the repository's
# kind-cluster.yaml config.
# NOTE(review): the action reference was garbled by e-mail obfuscation (it read
# "helm/[email protected]", i.e. "helm/kind-action@<tag>"). The exact tag cannot be
# recovered from this file, so the floating v1 major tag is used here; restore
# the original pinned version if it is known.
- name: Create k8s Kind Cluster
  uses: helm/kind-action@v1
  with:
    cluster_name: dataflow-engine-cluster
    config: ${{ github.workspace }}/engine/chaos/manifests/kind-cluster.yaml
# Diagnostic dump: kubeconfig, cluster endpoints, nodes, system pods, storage
# classes, and the kubectl / helm versions.
- name: Print cluster information
  run: |
    kubectl config view
    kubectl cluster-info
    kubectl get nodes
    kubectl get pods --namespace kube-system
    kubectl get storageclass
    kubectl version
    helm version
# Run the tiflow and tiflow-chaos-case make targets, then copy the chaos case
# configs into bin/engine-conf so they are part of the docker build context.
- name: Build dataflow engine binary
  run: |
    make tiflow tiflow-chaos-case
    chaos_dir=${GITHUB_WORKSPACE}/engine/chaos
    cp -r $chaos_dir/cases/conf/ ${GITHUB_WORKSPACE}/bin/engine-conf
# Build the dataflow:chaos image with bin/ as the build context.
- name: Build Dataflow engine docker image
  run: |
    docker build \
      -f ${GITHUB_WORKSPACE}/engine/chaos/manifests/Dockerfile \
      -t dataflow:chaos \
      ${GITHUB_WORKSPACE}/bin
    docker image list
# Make the locally built image available inside the kind cluster.
- name: Load docker image to kind cluster
  run: |
    kind load docker-image dataflow:chaos --name dataflow-engine-cluster
# Set up upstream instances
- name: Set up sources
  run: |
    sources_manifest=${GITHUB_WORKSPACE}/dm/chaos/manifests/sources.yaml
    for verb in apply get describe; do
      kubectl $verb -f $sources_manifest
    done
# kubectl wait --all not working, so every pod is waited for by name.
- name: Wait for sources ready
  run: |
    # Best-effort wait: a timeout is tolerated here so the diagnostics below
    # are printed either way.
    for pod in mysql57-0 mysql8-0 mariadb-0; do
      kubectl wait --for=condition=Ready pod/$pod --timeout=300s || true
    done
    sleep 10
    echo show pvc
    kubectl get pvc -l app=sources -o wide
    echo show pv
    kubectl get pv -o wide
    for res in svc sts po; do
      echo show $res
      kubectl get $res -l app=sources -o wide
    done
    for res in po pvc; do
      echo describe $res
      kubectl describe $res -l app=sources
    done
    # Final check with a zero timeout: the step fails right away if any
    # source pod is still not Ready.
    for pod in mysql57-0 mysql8-0 mariadb-0; do
      kubectl wait --for=condition=Ready pod/$pod --timeout=0s
    done
# Set up downstream TiDB instance (deploy a TiDB with mockTiKV, not a TidbCluster managed by TiDB-operator)
- name: Set up TiDB
  run: |
    tidb_manifest=${GITHUB_WORKSPACE}/dm/chaos/manifests/tidb.yaml
    for verb in apply get describe; do
      kubectl $verb -f $tidb_manifest
    done
- name: Wait for TiDB ready
  run: |
    # A timeout is tolerated here so the diagnostics below are always printed.
    kubectl wait --for=condition=Ready pod/tidb-0 --timeout=10m || true
    echo show pvc
    kubectl get pvc -l app=tidb -o wide
    echo show pv
    kubectl get pv -o wide
    for res in svc sts po; do
      echo show $res
      kubectl get $res -l app=tidb -o wide
    done
    for res in po pvc; do
      echo describe $res
      kubectl describe $res -l app=tidb
    done
    # Zero-timeout check: fail the step now if tidb-0 is still not Ready.
    kubectl wait --for=condition=Ready pod/tidb-0 --timeout=0s
# Set up minio and create a bucket for tests
- name: Set up minio
  run: |
    minio_manifest=${GITHUB_WORKSPACE}/deployments/engine/helm/minio/minio.yaml
    for verb in apply get describe; do
      kubectl $verb -f $minio_manifest
    done
- name: Wait for minio ready
  run: |
    # A timeout is tolerated here so the diagnostics below are always printed.
    kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=10m || true
    echo show pvc
    kubectl get pvc -l app=minio -o wide
    echo show pv
    kubectl get pv -o wide
    for res in svc sts po; do
      echo show $res
      kubectl get $res -l app=minio -o wide
    done
    for res in po pvc; do
      echo describe $res
      kubectl describe $res -l app=minio
    done
    # Zero-timeout check: fail the step now if chaos-minio-0 is still not Ready.
    kubectl wait --for=condition=Ready pod/chaos-minio-0 --timeout=0s
# Apply the bucket-creation job and block (up to 2 minutes) until it completes.
- name: Set up minio-create-bucket job
  run: |
    bucket_job=${GITHUB_WORKSPACE}/deployments/engine/helm/minio/minio-create-bucket.yaml
    for verb in apply get describe; do
      kubectl $verb -f $bucket_job
    done
    kubectl wait --for=condition=Complete job/chaos-minio-create-bucket-job --timeout=2m
# Set up metastore and basic services
# Installs the tiflow helm chart as release "chaos" with its bundled values.
- name: Set up metastore and basic services
  run: |
    helm install -f $GITHUB_WORKSPACE/deployments/engine/helm/tiflow/values.yaml chaos $GITHUB_WORKSPACE/deployments/engine/helm/tiflow
    helm list
    sleep 5
    kubectl get pods
# Waits (best effort) for both metastore pods, then prints the same set of
# diagnostics for each of them.
- name: Wait for metastore ready
  run: |
    kubectl wait --for=condition=Ready pod/chaos-metastore-mysql-0 --timeout=60s || true
    kubectl wait --for=condition=Ready pod/chaos-metastore-etcd-0 --timeout=60s || true
    # The original mysql section ran its two "describe" commands with the
    # label app=chaos-metastore-framework, which disagrees with every "get"
    # in that section (app=chaos-metastore-mysql); the same label is now
    # used throughout so the describes target the pods that were listed.
    for app in chaos-metastore-etcd chaos-metastore-mysql; do
      echo show pvc
      kubectl get pvc -l app=$app -o wide
      echo show pv
      kubectl get pv -o wide
      echo show svc
      kubectl get svc -l app=$app -o wide
      echo show sts
      kubectl get sts -l app=$app -o wide
      echo show po
      kubectl get po -l app=$app -o wide
      echo describe po
      kubectl describe po -l app=$app
      echo describe pvc
      kubectl describe pvc -l app=$app
    done
# Best-effort wait for the server-master pods, followed by a diagnostic dump:
# resources, descriptions, and current + previous logs of replicas 0..2.
- name: Wait for server-master ready
  run: |
    kubectl wait --for=condition=Ready pod -l app=chaos-server-master --all --timeout=20s || true
    echo "<<<<< show pvc >>>>>"
    kubectl get pvc -l app=chaos-server-master -o wide
    echo "<<<<< show pv >>>>>"
    kubectl get pv -o wide
    for res in svc sts po; do
      echo "<<<<< show $res >>>>>"
      kubectl get $res -l app=chaos-server-master -o wide
    done
    for res in po pvc; do
      echo "<<<<< describe $res >>>>>"
      kubectl describe $res -l app=chaos-server-master
    done
    # Log collection never fails the step: a pod may be missing or may have
    # no previous container instance.
    for i in 0 1 2; do
      echo "<<<<< show current log for chaos-server-master-$i >>>>>"
      kubectl logs chaos-server-master-$i || true
      echo "<<<<< show previous log for chaos-server-master-$i >>>>>"
      kubectl logs chaos-server-master-$i -p || true
    done
    kubectl logs chaos-server-master-0 -c wait-mysql || true
# Best-effort wait for the executor pods, followed by a diagnostic dump:
# resources, descriptions, and current + previous logs of replicas 0..2.
- name: Wait for executor ready
  run: |
    kubectl wait --for=condition=Ready pod -l app=chaos-executor --all --timeout=15s || true
    echo "<<<<< show pvc >>>>>"
    kubectl get pvc -l app=chaos-executor -o wide
    echo "<<<<< show pv >>>>>"
    kubectl get pv -o wide
    echo "<<<<< show svc >>>>>"
    kubectl get svc -l app=chaos-executor -o wide
    echo "<<<<< show sts >>>>>"
    kubectl get sts -l app=chaos-executor -o wide
    echo "<<<<< show po >>>>>"
    kubectl get po -l app=chaos-executor -o wide
    echo "<<<<< describe po >>>>>"
    kubectl describe po -l app=chaos-executor
    echo "<<<<< describe pvc >>>>>"
    kubectl describe pvc -l app=chaos-executor
    # Generating the banner from the pod name keeps banner and command in
    # sync; the original banner for the previous log of replica 1 said
    # "worker-master-1" although the command dumped chaos-executor-1.
    # Log collection never fails the step.
    for i in 0 1 2; do
      echo "<<<<< show current log for chaos-executor-$i >>>>>"
      kubectl logs chaos-executor-$i || true
      echo "<<<<< show previous log for chaos-executor-$i >>>>>"
      kubectl logs chaos-executor-$i -p || true
    done
    kubectl logs chaos-executor-0 -c wait-server-master || true
# Deploy the chaos test cases and show what was created.
- name: Set up chaos test cases
  run: |
    cases_manifest=${GITHUB_WORKSPACE}/engine/chaos/manifests/cases.yaml
    for verb in apply get describe; do
      kubectl $verb -f $cases_manifest
    done
    kubectl get pods
# FIXME: remove this after fix https://github.com/pingcap/tiflow/issues/7304
- name: Wait DM enter sync stage
  run: |
    # Poll the case job's log roughly once per second (301 attempts at most)
    # until the "full mode ... completed" line shows up. Running out of
    # attempts does not fail the step.
    attempt=0
    while [ $attempt -le 300 ]; do
      echo "wait dm enter sync stage"
      if kubectl logs job.batch/chaos-test-case | grep "full mode of the task has completed"; then
        break
      fi
      sleep 1
      attempt=$((attempt + 1))
    done
# Export the base64-encoded (single line) chaos manifest of this matrix entry
# as CFG_BASE64 for the following steps.
- name: Encode chaos-mesh action
  run: |
    chaos_manifest=${GITHUB_WORKSPACE}/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml
    echo CFG_BASE64=$(base64 -w 0 $chaos_manifest) >> $GITHUB_ENV
# Hand the encoded manifest to the chaos-mesh action.
- name: Run chaos mesh action
  uses: chaos-mesh/chaos-mesh-action@master
  env:
    CFG_BASE64: ${{ env.CFG_BASE64 }}
# check whether complete with 1m * 20 times.
- name: Wait for chaos test case complete
  run: |
    ${GITHUB_WORKSPACE}/engine/chaos/scripts/check-case.sh
# Remove the chaos objects of this matrix entry; runs even when earlier steps
# failed.
- name: Pause all chaos
  if: ${{ always() }}
  run: |
    kubectl delete -f $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml
# On failure, fetch a full goroutine dump from every server-master and
# executor pod and store it under /log inside the pod.
- name: Dump goroutines
  if: ${{ failure() }}
  run: |
    # Add a delay if test fails, to check whether the cluster can recover after chaos is removed
    sleep 60
    # The pprof URLs are quoted because they contain "?", which the shell
    # would otherwise treat as a pathname-expansion pattern.
    # Both pipelines are best effort and never fail the step.
    kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "server-master"|xargs -I{} kubectl exec -i -c server-master {} -- wget "http://127.0.0.1:10240/debug/pprof/goroutine?debug=2" -O /log/{}_goroutines.log || true
    kubectl get pods --no-headers -o custom-columns=":metadata.name"|grep -E "executor"|xargs -I{} kubectl exec -i -c executor {} -- wget "http://127.0.0.1:10241/debug/pprof/goroutine?debug=2" -O /log/{}_goroutines.log || true
# Gather /log from every server-master and executor pod plus the kind
# cluster logs into ./logs, then hand ownership to the runner user.
- name: Copy logs to hack permission
  if: always()
  run: |
    mkdir ./logs
    # Copying pod logs is best effort; the remaining commands still run.
    kubectl get pods --no-headers -o custom-columns=":metadata.name" \
      | grep -E "server-master|executor" \
      | xargs -I{} kubectl cp {}:/log ./logs || true
    kind export logs ./logs/kind --name dataflow-engine-cluster
    sudo chown -R runner ./logs
# Upload logs as artifact seems not stable, so we set `continue-on-error: true` here.
- name: Upload logs
  if: always()
  continue-on-error: true
  uses: actions/upload-artifact@v4
  with:
    name: chaos-base-logs.${{ matrix.chaos-obj }}
    path: ./logs
# Send feishu notification if failed.
# A failure of the notification itself does not fail the job.
- name: Feishu notification
  if: failure()
  continue-on-error: true
  uses: foxundermoon/feishu-action@v2
  with:
    url: ${{ secrets.ENGINE_FEISHU_NOTIFY_URL }}
    msg_type: text
    content: |
      text: |
        dataflow engine chaos job failed, see https://github.com/pingcap/tiflow/actions/runs/${{ github.run_id }}