Soak Testing #717
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pre-requisites:
# - An AWS account with the correct permissions
# - A `gh-pages` branch in this repository
# - The log group named by LOG_GROUP_NAME (defined below) must already exist in the AWS account
# Long-running ("soak") performance test workflow.
name: Soak Testing
on:
  # Manual trigger: the caller chooses the commit to test and the test length.
  workflow_dispatch:
    inputs:
      target_commit_sha:
        description: 'The commit SHA on this repo to use for the Soak Tests.'
        required: true
      test_duration_minutes:
        description: 'The duration of the Soak Tests in minutes.'
        required: true
        # Dispatch inputs reach the workflow as strings, so the default is
        # written as a string. Keep this value in sync with
        # DEFAULT_TEST_DURATION_MINUTES in the workflow-level `env`.
        default: '300'
  # Scheduled trigger: once a day at 15:00 (GitHub evaluates cron in UTC).
  schedule:
    - cron: '0 15 * * *'
# Workflow-wide settings shared by every step of every job.
env:
  # Region handed to the AWS credentials step.
  AWS_DEFAULT_REGION: us-east-1
  # NOTE: `APP_PROCESS_EXECUTABLE_NAME` depends on the repo this workflow
  # lives in; it has to be adjusted when the workflow is copied elsewhere.
  APP_PROCESS_EXECUTABLE_NAME: ruby
  # Port that appears in the Sample App's process command line.
  LISTEN_ADDRESS_PORT: 8080
  # Test length used when no `test_duration_minutes` input is supplied.
  DEFAULT_TEST_DURATION_MINUTES: 300
  # Passed to the reporting scripts as `--metrics-period`.
  HOSTMETRICS_INTERVAL_SECS: 600
  # Limits passed to the reporting scripts.
  CPU_LOAD_THRESHOLD: 45
  TOTAL_MEMORY_THRESHOLD: 2147483648  # 2 GiB
  # Upper bound on stored benchmark entries / chart items.
  MAX_BENCHMARKS_TO_KEEP: 100
  # TODO: We might be able to adapt the "Soak Tests" to be "Overhead Tests".
  # That would mean monitoring the Sample App's performance while the Load
  # Generator drives a high TPS over a shorter testing period. For example:
  # https://github.com/aws-observability/aws-otel-collector/blob/main/docs/performance_model.md
  # THROUGHPUT_PER_SECOND: TBD?
jobs:
  # Single job: runs the soak test for each matrix combination and publishes
  # the results (snapshots, benchmark graphs, and issues on failure).
  test_apps_and_publish_results:
    name: Soak Performance Test - (${{ matrix.app-platform }}, ${{ matrix.instrumentation-type }})
    runs-on: ubuntu-latest
    permissions:
      # The job pushes snapshot commits to the `gh-pages` branch.
      contents: write
      # NOTE(review): presumably needed so the AWS credentials step can assume
      # the role via OIDC - confirm against the role's trust policy.
      id-token: write
      # The job opens/updates issues when a scheduled run fails.
      issues: write
    strategy:
      # Let the remaining matrix combinations finish even if one fails.
      fail-fast: false
      matrix:
        app-platform: [ruby-on-rails]
        instrumentation-type: [manual]
    env:
      # NOTE: the layout behind `APP_PATH` is repo dependent.
      APP_PATH: sample-apps/${{ matrix.instrumentation-type }}-instrumentation/${{ matrix.app-platform }}
      # Namespace handed to the reporting scripts as `--logs-namespace`.
      LOGS_NAMESPACE: ${{ github.repository }}/soak-tests-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}
    steps:
# MARK: - GitHub Workflow Event Type Specific Values
# These steps resolve TARGET_SHA and TEST_DURATION_MINUTES and export them via
# GITHUB_ENV for all later steps.
#
# User-supplied `workflow_dispatch` inputs are handed to the shell through the
# step's `env` mapping rather than being expanded inline with `${{ }}`. Inline
# expansion pastes the raw input text into the script before bash parses it,
# so a crafted input could run arbitrary commands; an environment variable is
# only ever treated as data.
- name: Use INPUT as commit SHA
  if: ${{ github.event_name == 'workflow_dispatch' }}
  env:
    DISPATCH_TARGET_COMMIT_SHA: ${{ github.event.inputs.target_commit_sha }}
  run: |
    echo "TARGET_SHA=${DISPATCH_TARGET_COMMIT_SHA}" | tee --append "$GITHUB_ENV";
# Any non-manual trigger (e.g. the schedule) tests the commit that triggered
# the workflow.
- name: Use LATEST as commit SHA
  if: ${{ github.event_name != 'workflow_dispatch' }}
  run: |
    echo "TARGET_SHA=${{ github.sha }}" | tee --append "$GITHUB_ENV";
# Falls back to DEFAULT_TEST_DURATION_MINUTES when no input was supplied
# (the input is empty on scheduled runs).
- name: Configure Performance Test Duration
  env:
    REQUESTED_TEST_DURATION_MINUTES: ${{ github.event.inputs.test_duration_minutes || env.DEFAULT_TEST_DURATION_MINUTES }}
  run: |
    echo "TEST_DURATION_MINUTES=${REQUESTED_TEST_DURATION_MINUTES}" | tee --append "$GITHUB_ENV";
- name: Clone This Repo @ ${{ env.TARGET_SHA }}
  uses: actions/checkout@v3
  with:
    ref: ${{ env.TARGET_SHA }}
# MARK: - Instrumentation-Type Specific Values
# NOTE: The value of `APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE` is repo
# dependent.
- name: Configure Manual Instrumentation-Type Specific Values
  if: ${{ matrix.instrumentation-type == 'manual' }}
  # TODO: (enowell) Derive the puma version from `bundle info puma` instead of
  # hard-coding it. Depend-a-bot may bump the version in the Sample App
  # without updating it here.
  #
  # See: https://github.com/aws-observability/aws-otel-ruby/pull/17
  run: |
    # The value is written with the `NAME<<DELIMITER` form of GITHUB_ENV:
    # opening line, value line, closing delimiter. Only the value line goes
    # through `tee`, so only it is echoed to the step log.
    echo 'APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE<<EOF' >> $GITHUB_ENV
    echo 'puma 5.6.2 (tcp://0.0.0.0:${{ env.LISTEN_ADDRESS_PORT }}) [app]' | tee --append $GITHUB_ENV;
    echo 'EOF' >> $GITHUB_ENV
# MARK: - Uniquely identify this Sample App environment
# Exports MATRIX_COMMIT_COMBO = <app-platform>-<instrumentation-type>-<sha>.
- name: Create unique combination using matrix + commit parameters
  run: |
    echo "MATRIX_COMMIT_COMBO=${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}-${{ env.TARGET_SHA }}" | tee --append $GITHUB_ENV;
# MARK: - Run Performance Tests
- name: Configure AWS Credentials
  uses: aws-actions/configure-aws-credentials@v1
  with:
    role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
    # 6 hours, which is longer than the default 300-minute test duration.
    role-duration-seconds: 21600
    aws-region: ${{ env.AWS_DEFAULT_REGION }}
# Records the runner's CPU count as NUM_OF_CPUS for the later steps.
- name: Configure Performance Test environment variables
  run: |
    echo "NUM_OF_CPUS=$(nproc --all)" | tee --append $GITHUB_ENV;
# Brings up the whole docker-compose stack and then reports the exit code of
# the alarms-poller container as this step's result. `continue-on-error`
# keeps the job going so the reporting steps can still run; the real verdict
# is read later from `steps.check-failure-during-performance-tests.outcome`.
- name: Run All Docker Containers - Sample App + OTel Collector + Load Generator + Alarm Poller
  id: check-failure-during-performance-tests
  continue-on-error: true
  working-directory: .github/docker-performance-tests
  env:
    INSTANCE_ID: ${{ github.run_id }}-${{ github.run_number }}
    LOG_GROUP_NAME: otel-sdk-performance-tests
    # Also uses (already present in the environment):
    #   AWS_ACCESS_KEY_ID
    #   AWS_SECRET_ACCESS_KEY
    #   AWS_SESSION_TOKEN
    #   APP_PATH
    #   TARGET_SHA
    #   LISTEN_ADDRESS_PORT
    #   LOGS_NAMESPACE
    #   APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE
    #   APP_PROCESS_EXECUTABLE_NAME
    #   HOSTMETRICS_INTERVAL_SECS
    #   TEST_DURATION_MINUTES
    #   NUM_OF_CPUS
    #   CPU_LOAD_THRESHOLD
    #   TOTAL_MEMORY_THRESHOLD
    #   AWS_DEFAULT_REGION
    #   MATRIX_COMMIT_COMBO
    #   GITHUB_RUN_ID
  run: |-
    docker-compose up --build;
    # Find the alarms-poller container (running or exited) ...
    ALARMS_POLLER_CONTAINER=$(
      docker ps --quiet --all --filter "name=docker-performance-tests_alarms-poller"
    );
    # ... and read the exit code it finished with.
    RUN_TESTS_EXIT_CODE=$(
      docker inspect $ALARMS_POLLER_CONTAINER --format="{{.State.ExitCode}}"
    );
    # Publish the code for the following steps, then adopt it as this step's
    # own exit status.
    echo "RUN_TESTS_EXIT_CODE=$RUN_TESTS_EXIT_CODE" | tee --append $GITHUB_ENV;
    exit $RUN_TESTS_EXIT_CODE;
# Aborts the job when the previous step never recorded an exit code (empty
# value) or recorded exit code 1; there is nothing to report in that case.
- name: Fail early if Soak Tests failed to start
  if: ${{ env.RUN_TESTS_EXIT_CODE == '' || env.RUN_TESTS_EXIT_CODE == 1 }}
  run: exit 1;
# MARK: - Report on Performance Test Results
# boto3 is installed ahead of the Python reporting scripts run below.
- name: Install script dependencies
  run: pip install boto3
# Runs the metric-widget snapshot script, then commits whatever it placed
# under `soak-tests/snapshots/commits` to the `gh-pages` branch.
- name: Get a snapshot of metrics and commit them to the repository
  run: |
    # 1. Produce the snapshot images for this run.
    python3 .github/scripts/performance-tests/produce_metric_widget_images.py \
      --logs-namespace ${{ env.LOGS_NAMESPACE }} \
      --metrics-period ${{ env.HOSTMETRICS_INTERVAL_SECS }} \
      --num-of-cpus ${{ env.NUM_OF_CPUS }} \
      --app-process-command-line-dimension-value "${{ env.APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE }}" \
      --target-sha ${{ env.TARGET_SHA }} \
      --github-run-id ${GITHUB_RUN_ID} \
      --test-duration-minutes ${{ env.TEST_DURATION_MINUTES }} \
      --cpu-load-threshold ${{ env.CPU_LOAD_THRESHOLD }} \
      --total-memory-threshold ${{ env.TOTAL_MEMORY_THRESHOLD }} \
      --app-platform ${{ matrix.app-platform }} \
      --instrumentation-type ${{ matrix.instrumentation-type }} \
      --max-benchmarks-to-keep ${{ env.MAX_BENCHMARKS_TO_KEEP }} \
      --github-repository ${GITHUB_REPOSITORY}
    # 2. Surface a link to the snapshots as a warning annotation on the run.
    echo "::warning::Checkout Snapshots at this link: https://github.com/${GITHUB_REPOSITORY}/blob/gh-pages/soak-tests/snapshots/commits/${{ env.TARGET_SHA }}/runs/${GITHUB_RUN_ID}/${{ matrix.app-platform }}";
    # 3. Commit as the GitHub Actions bot and push to `gh-pages`.
    git config user.email "41898282+github-actions[bot]@users.noreply.github.com";
    git config user.name "GitHub Actions";
    git fetch;
    git checkout gh-pages;
    git add soak-tests/snapshots/commits;
    git commit -m "Soak Test Snapshots from ${{ env.TARGET_SHA }} - ${GITHUB_RUN_ID}";
    git push;
    # 4. Leave the working tree on `main` for the following steps.
    git checkout main;
# The command is one logical line; the folded scalar (`>-`) joins the lines
# below with single spaces before the shell sees them.
# NOTE(review): presumably this script writes the `output.json` consumed by
# the benchmark step - confirm in the script.
- name: Prepare Performance Test results as JSON output
  run: >-
    python3 .github/scripts/performance-tests/get-metric-data/produce_performance_test_results.py
    --logs-namespace ${{ env.LOGS_NAMESPACE }}
    --metrics-period ${{ env.HOSTMETRICS_INTERVAL_SECS }}
    --num-of-cpus ${{ env.NUM_OF_CPUS }}
    --app-process-command-line-dimension-value "${{ env.APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE }}"
    --target-sha ${{ env.TARGET_SHA }}
    --github-run-id ${GITHUB_RUN_ID}
    --test-duration-minutes ${{ env.TEST_DURATION_MINUTES }}
# Probe step: it SUCCEEDS when `data.js` on `gh-pages` already holds an entry
# for TARGET_SHA and FAILS otherwise. The failure is tolerated
# (`continue-on-error`); a later step reads this step's `outcome`.
- name: Do we already have Performance Test graph results for this commit?
  id: check-already-have-performance-results
  continue-on-error: true
  run: |
    git checkout gh-pages;
    # Strip the JavaScript assignment prefix so the remainder parses as JSON,
    # then ask jq whether any entry of this benchmark matches the commit.
    # Any jq error (e.g. the benchmark key is absent) is treated as "false".
    HAS_RESULTS_ALREADY=$(
      sed 's/window.BENCHMARK_DATA = //' soak-tests/per-commit-overall-results/data.js |
      jq "
        .entries |
        .\"Soak Test Results - sample-app-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}\" |
        any(.commit.id == \"${{ env.TARGET_SHA }}\")
      " || echo false
    );
    git checkout main;
    # The exit status of this test is the step's result.
    [[ $HAS_RESULTS_ALREADY == true ]]
# Feeds `output.json` to the benchmark action. With `fail-on-alert`, crossing
# the alert threshold fails this step; `continue-on-error` defers the verdict
# to the steps that inspect `steps.check-failure-after-performance-tests.outcome`.
- name: Graph and Report Performance Test Averages result
  id: check-failure-after-performance-tests
  uses: benchmark-action/github-action-benchmark@v1
  continue-on-error: true
  with:
    name: Soak Test Results - sample-app-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}
    tool: customSmallerIsBetter
    output-file-path: output.json
    github-token: ${{ secrets.GITHUB_TOKEN }}
    gh-pages-branch: gh-pages
    benchmark-data-dir-path: soak-tests/per-commit-overall-results
    max-items-in-chart: ${{ env.MAX_BENCHMARKS_TO_KEEP }}
    alert-threshold: 175%
    # Does not work as expected, see:
    # https://github.com/open-telemetry/opentelemetry-python/pull/1478
    # comment-always: true
    fail-on-alert: true
    # Push new data only for scheduled runs on `main` whose commit has no
    # stored results yet (i.e. the probe step above "failed").
    auto-push: >-
      ${{ github.event_name == 'schedule' &&
      steps.check-already-have-performance-results.outcome == 'failure' &&
      github.ref == 'refs/heads/main' }}
# Issue reporting applies to scheduled runs only. Each step opens an issue
# from its template, or updates the existing one (`update_existing`).
# APP_PLATFORM and INSTRUMENTATION_TYPE are exported to the action's
# environment alongside the token.
- name: Publish Issue if failed DURING Performance Tests
  if: >-
    ${{ github.event_name == 'schedule' &&
    steps.check-failure-during-performance-tests.outcome == 'failure' }}
  uses: JasonEtco/create-an-issue@v2
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    APP_PLATFORM: ${{ matrix.app-platform }}
    INSTRUMENTATION_TYPE: ${{ matrix.instrumentation-type }}
  with:
    filename: .github/auto-issue-templates/failure-during-soak_tests.md
    update_existing: true
- name: Publish Issue if failed AFTER Performance Tests
  if: >-
    ${{ github.event_name == 'schedule' &&
    steps.check-failure-after-performance-tests.outcome == 'failure' }}
  uses: JasonEtco/create-an-issue@v2
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    APP_PLATFORM: ${{ matrix.app-platform }}
    INSTRUMENTATION_TYPE: ${{ matrix.instrumentation-type }}
  with:
    filename: .github/auto-issue-templates/failure-after-soak_tests.md
    update_existing: true
# Final gate: both test steps above ran with `continue-on-error`, so the job
# is turned red here if either of them actually failed.
- name: Check for Performance Degradation either DURING or AFTER Performance Tests
  if: >-
    ${{ steps.check-failure-during-performance-tests.outcome == 'failure' ||
    steps.check-failure-after-performance-tests.outcome == 'failure' }}
  run: |
    echo 'Performance Tests failed, see the logs above for details';
    exit 1;