Soak Testing #717

Workflow file for this run

.github/workflows/soak-testing.yml at 06afbdf

	# Pre-requisites:
	# - AWS Account with correct permissions
	# - `gh-pages` branch
	# - The AWS Account must already contain the log group named by `LOG_GROUP_NAME` (defined below)

	name: Soak Testing
	# Triggers: a manual dispatch against a caller-chosen commit, and a daily
	# scheduled run.
	on:
	  workflow_dispatch:
	    inputs:
	      target_commit_sha:
	        description: 'The commit SHA on this repo to use for the Soak Tests.'
	        required: true
	      test_duration_minutes:
	        description: 'The duration of the Soak Tests in minutes.'
	        required: true
	        # Quoted: dispatch inputs are delivered to the workflow as strings.
	        default: '300'
	  schedule:
	    # Minute 0 of hour 15, every day (GitHub evaluates cron schedules in UTC).
	    - cron: '0 15 * * *'
	# Workflow-wide settings shared by every job and step below. Values are
	# quoted because environment variables are always strings; quoting keeps
	# the YAML loader from re-typing number-looking values first.
	env:
	  # NOTE: The configuration of `APP_PROCESS_EXECUTABLE_NAME` is repo dependent
	  APP_PROCESS_EXECUTABLE_NAME: ruby
	  AWS_DEFAULT_REGION: us-east-1
	  # Used when no `test_duration_minutes` input is supplied (e.g. scheduled runs).
	  DEFAULT_TEST_DURATION_MINUTES: '300'
	  HOSTMETRICS_INTERVAL_SECS: '600'
	  CPU_LOAD_THRESHOLD: '45'
	  TOTAL_MEMORY_THRESHOLD: '2147483648'  # 2 GiB
	  MAX_BENCHMARKS_TO_KEEP: '100'
	  LISTEN_ADDRESS_PORT: '8080'
	  # TODO: We might be able to adapt the "Soak Tests" to be "Overhead Tests".
	  # This means monitoring the Sample App's performance using high levels of TPS
	  # for the Load Generator over a shorter period of testing time. For example:
	  # https://github.com/aws-observability/aws-otel-collector/blob/main/docs/performance_model.md
	  # THROUGHPUT_PER_SECOND: TBD?

	jobs:
	  # Single matrix job: resolves the commit and duration to test, runs the
	  # docker-compose performance stack, commits metric snapshots to `gh-pages`,
	  # graphs the averaged results, and files an issue when a scheduled run fails.
	  test_apps_and_publish_results:
	    name: Soak Performance Test - (${{ matrix.app-platform }}, ${{ matrix.instrumentation-type }})
	    runs-on: ubuntu-latest
	    permissions:
	      # Needed for the `git push` to `gh-pages` performed below.
	      contents: write
	      # NOTE(review): presumably required for OIDC role assumption by
	      # `configure-aws-credentials` - confirm.
	      id-token: write
	      # Needed by the `create-an-issue` steps below.
	      issues: write
	    strategy:
	      fail-fast: false
	      matrix:
	        app-platform: [ ruby-on-rails ]
	        instrumentation-type: [ manual ]
	    env:
	      # NOTE: The configuration of `APP_PATH` is repo dependent
	      APP_PATH: sample-apps/${{ matrix.instrumentation-type }}-instrumentation/${{ matrix.app-platform }}
	      LOGS_NAMESPACE: ${{ github.repository }}/soak-tests-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}
	    steps:
	      # MARK: - GitHub Workflow Event Type Specific Values

	      # User-supplied dispatch inputs are passed through `env:` rather than
	      # being interpolated into the script text, so their content can never be
	      # parsed as shell syntax.
	      - name: Use INPUT as commit SHA
	        if: ${{ github.event_name == 'workflow_dispatch' }}
	        env:
	          INPUT_TARGET_COMMIT_SHA: ${{ github.event.inputs.target_commit_sha }}
	        run: |
	          echo "TARGET_SHA=${INPUT_TARGET_COMMIT_SHA}" | tee --append $GITHUB_ENV;
	      - name: Use LATEST as commit SHA
	        if: ${{ github.event_name != 'workflow_dispatch' }}
	        run: |
	          echo "TARGET_SHA=${{ github.sha }}" | tee --append $GITHUB_ENV;
	      - name: Configure Performance Test Duration
	        env:
	          # Falls back to the workflow default when no input was given.
	          RESOLVED_TEST_DURATION_MINUTES: ${{ github.event.inputs.test_duration_minutes || env.DEFAULT_TEST_DURATION_MINUTES }}
	        run: |
	          echo "TEST_DURATION_MINUTES=${RESOLVED_TEST_DURATION_MINUTES}" | tee --append $GITHUB_ENV;
	      - name: Clone This Repo @ ${{ env.TARGET_SHA }}
	        uses: actions/checkout@v3
	        with:
	          ref: ${{ env.TARGET_SHA }}

	      # MARK: - Instrumentation-Type Specific Values

	      # NOTE: The configuration of `APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE` is
	      # repo dependent
	      - name: Configure Manual Instrumentation-Type Specific Values
	        if: ${{ matrix.instrumentation-type == 'manual' }}
	        # TODO: (enowell) Update this line to parse the version from
	        # `bundle info puma`. This is because Dependabot might change this
	        # version in the Sample App but won't update it here.
	        #
	        # See: https://github.com/aws-observability/aws-otel-ruby/pull/17
	        #
	        # The `NAME<<EOF ... EOF` form is the multi-line syntax for GITHUB_ENV.
	        run: |
	          echo 'APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE<<EOF' >> $GITHUB_ENV
	          echo 'puma 5.6.2 (tcp://0.0.0.0:${{ env.LISTEN_ADDRESS_PORT }}) [app]' | tee --append $GITHUB_ENV;
	          echo 'EOF' >> $GITHUB_ENV

	      # MARK: - Uniquely identify this Sample App environment

	      - name: Create unique combination using matrix + commit parameters
	        run: |
	          echo "MATRIX_COMMIT_COMBO=${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}-${TARGET_SHA}" | tee --append $GITHUB_ENV;

	      # MARK: - Run Performance Tests

	      - name: Configure AWS Credentials
	        uses: aws-actions/configure-aws-credentials@v1
	        with:
	          role-to-assume: ${{ secrets.AWS_ASSUME_ROLE_ARN }}
	          role-duration-seconds: 21600 # 6 Hours
	          aws-region: ${{ env.AWS_DEFAULT_REGION }}
	      - name: Configure Performance Test environment variables
	        run: |
	          echo "NUM_OF_CPUS=$(nproc --all)" | tee --append $GITHUB_ENV;
	      # `continue-on-error` lets the reporting steps below still run; the
	      # outcome of this step is re-checked at the end of the job.
	      #
	      # NOTE(review): this relies on the Compose v1 `docker-compose` binary and
	      # on its `<project>_<service>` container naming (see the `name=` filter) -
	      # confirm the runner image still provides both.
	      - name: Run All Docker Containers - Sample App + OTel Collector + Load Generator + Alarm Poller
	        id: check-failure-during-performance-tests
	        continue-on-error: true
	        working-directory: .github/docker-performance-tests
	        env:
	          INSTANCE_ID: ${{ github.run_id }}-${{ github.run_number }}
	          LOG_GROUP_NAME: otel-sdk-performance-tests
	          # Also uses:
	          # AWS_ACCESS_KEY_ID
	          # AWS_SECRET_ACCESS_KEY
	          # AWS_SESSION_TOKEN
	          # APP_PATH
	          # TARGET_SHA
	          # LISTEN_ADDRESS_PORT
	          # LOGS_NAMESPACE
	          # APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE
	          # APP_PROCESS_EXECUTABLE_NAME
	          # HOSTMETRICS_INTERVAL_SECS
	          # TEST_DURATION_MINUTES
	          # NUM_OF_CPUS
	          # CPU_LOAD_THRESHOLD
	          # TOTAL_MEMORY_THRESHOLD
	          # AWS_DEFAULT_REGION
	          # MATRIX_COMMIT_COMBO
	          # GITHUB_RUN_ID
	        # The step's result is the exit code of the alarms-poller container.
	        run: |-
	          docker-compose up --build;
	          RUN_TESTS_EXIT_CODE=$(
	            docker inspect $(
	              docker ps --quiet --all --filter "name=docker-performance-tests_alarms-poller"
	            ) --format="{{.State.ExitCode}}"
	          );
	          echo "RUN_TESTS_EXIT_CODE=$RUN_TESTS_EXIT_CODE" | tee --append $GITHUB_ENV;
	          exit $RUN_TESTS_EXIT_CODE;
	      # An empty exit code means the previous step never recorded one.
	      - name: Fail early if Soak Tests failed to start
	        if: ${{ env.RUN_TESTS_EXIT_CODE == '' || env.RUN_TESTS_EXIT_CODE == 1 }}
	        run: exit 1;

	      # MARK: - Report on Performance Test Results

	      - name: Install script dependencies
	        run: pip install boto3
	      # NOTE(review): the script ends on `main`, not on TARGET_SHA, so later
	      # steps run the scripts from `main` - confirm this is intended.
	      - name: Get a snapshot of metrics and commit them to the repository
	        run: |
	          python3 .github/scripts/performance-tests/produce_metric_widget_images.py \
	            --logs-namespace ${{ env.LOGS_NAMESPACE }} \
	            --metrics-period ${{ env.HOSTMETRICS_INTERVAL_SECS }} \
	            --num-of-cpus ${{ env.NUM_OF_CPUS }} \
	            --app-process-command-line-dimension-value "${{ env.APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE }}" \
	            --target-sha "${TARGET_SHA}" \
	            --github-run-id ${GITHUB_RUN_ID} \
	            --test-duration-minutes "${TEST_DURATION_MINUTES}" \
	            --cpu-load-threshold ${{ env.CPU_LOAD_THRESHOLD }} \
	            --total-memory-threshold ${{ env.TOTAL_MEMORY_THRESHOLD }} \
	            --app-platform ${{ matrix.app-platform }} \
	            --instrumentation-type ${{ matrix.instrumentation-type }} \
	            --max-benchmarks-to-keep ${{ env.MAX_BENCHMARKS_TO_KEEP }} \
	            --github-repository ${GITHUB_REPOSITORY}
	          echo "::warning::Checkout Snapshots at this link: https://github.com/${GITHUB_REPOSITORY}/blob/gh-pages/soak-tests/snapshots/commits/${TARGET_SHA}/runs/${GITHUB_RUN_ID}/${{ matrix.app-platform }}";
	          git config user.email "41898282+github-actions[bot]@users.noreply.github.com";
	          git config user.name "GitHub Actions";
	          git fetch;
	          git checkout gh-pages;
	          git add soak-tests/snapshots/commits;
	          git commit -m "Soak Test Snapshots from ${TARGET_SHA} - ${GITHUB_RUN_ID}";
	          git push;
	          git checkout main;
	      # Folded scalar: the lines below are joined with spaces into one command.
	      - name: Prepare Performance Test results as JSON output
	        run: >-
	          python3 .github/scripts/performance-tests/get-metric-data/produce_performance_test_results.py
	          --logs-namespace ${{ env.LOGS_NAMESPACE }}
	          --metrics-period ${{ env.HOSTMETRICS_INTERVAL_SECS }}
	          --num-of-cpus ${{ env.NUM_OF_CPUS }}
	          --app-process-command-line-dimension-value "${{ env.APP_PROCESS_COMMAND_LINE_DIMENSION_VALUE }}"
	          --target-sha "${TARGET_SHA}"
	          --github-run-id ${GITHUB_RUN_ID}
	          --test-duration-minutes "${TEST_DURATION_MINUTES}"
	      # Succeeds only when `data.js` on `gh-pages` already has an entry for
	      # this commit; a 'failure' outcome therefore means "no results yet" and
	      # is what enables `auto-push` in the next step.
	      - name: Do we already have Performance Test graph results for this commit?
	        continue-on-error: true
	        id: check-already-have-performance-results
	        run: |
	          git checkout gh-pages;
	          HAS_RESULTS_ALREADY=$(
	            sed 's/window.BENCHMARK_DATA = //' soak-tests/per-commit-overall-results/data.js |
	            jq "
	              .entries |
	              .\"Soak Test Results - sample-app-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}\" |
	              any(.commit.id == \"${TARGET_SHA}\")
	            " || echo false
	          );
	          git checkout main;
	          [[ $HAS_RESULTS_ALREADY == true ]]
	      - name: Graph and Report Performance Test Averages result
	        uses: benchmark-action/github-action-benchmark@v1
	        continue-on-error: true
	        id: check-failure-after-performance-tests
	        with:
	          name: Soak Test Results - sample-app-${{ matrix.app-platform }}-${{ matrix.instrumentation-type }}
	          tool: customSmallerIsBetter
	          output-file-path: output.json
	          github-token: ${{ secrets.GITHUB_TOKEN }}
	          max-items-in-chart: ${{ env.MAX_BENCHMARKS_TO_KEEP }}
	          alert-threshold: 175%
	          # Does not work as expected, see:
	          # https://github.com/open-telemetry/opentelemetry-python/pull/1478
	          # comment-always: true
	          fail-on-alert: true
	          # Only scheduled runs on `main` without existing results publish.
	          auto-push: ${{ github.event_name == 'schedule' &&
	            steps.check-already-have-performance-results.outcome == 'failure' &&
	            github.ref == 'refs/heads/main' }}
	          gh-pages-branch: gh-pages
	          benchmark-data-dir-path: soak-tests/per-commit-overall-results
	      - name: Publish Issue if failed DURING Performance Tests
	        uses: JasonEtco/create-an-issue@v2
	        if: ${{ github.event_name == 'schedule' &&
	          steps.check-failure-during-performance-tests.outcome == 'failure' }}
	        env:
	          APP_PLATFORM: ${{ matrix.app-platform }}
	          INSTRUMENTATION_TYPE: ${{ matrix.instrumentation-type }}
	          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        with:
	          filename: .github/auto-issue-templates/failure-during-soak_tests.md
	          update_existing: true
	      - name: Publish Issue if failed AFTER Performance Tests
	        uses: JasonEtco/create-an-issue@v2
	        if: ${{ github.event_name == 'schedule' &&
	          steps.check-failure-after-performance-tests.outcome == 'failure' }}
	        env:
	          APP_PLATFORM: ${{ matrix.app-platform }}
	          INSTRUMENTATION_TYPE: ${{ matrix.instrumentation-type }}
	          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        with:
	          filename: .github/auto-issue-templates/failure-after-soak_tests.md
	          update_existing: true
	      # Both test steps use `continue-on-error`, so the job is failed here
	      # explicitly once reporting has finished.
	      - name: Check for Performance Degradation either DURING or AFTER Performance Tests
	        if: ${{ steps.check-failure-during-performance-tests.outcome == 'failure' ||
	          steps.check-failure-after-performance-tests.outcome == 'failure' }}
	        run: >-
	          echo 'Performance Tests failed, see the logs above for details';
	          exit 1;

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Soak Testing #717

Workflow file

Soak Testing #717

Jobs

Run details

Workflow file for this run