From 7b6309f1b4f9ccdcc9e0a4b84e3f5def51fd21bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Mac=C3=ADk?= Date: Fri, 22 Mar 2024 08:58:39 +0100 Subject: [PATCH] fix(load-test): fix max concurrency collect results script, add tekton result watcher metrics (#1074) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix(load-test): fix max concurrency collect results script Signed-off-by: Pavel MacĂ­k --- .../max-concurrency/cluster_read_config.yaml | 16 +- .../max-concurrency/collect-results.sh | 162 ++++++++++++------ tests/load-tests/run-max-concurrency.sh | 18 +- 3 files changed, 126 insertions(+), 70 deletions(-) diff --git a/tests/load-tests/ci-scripts/max-concurrency/cluster_read_config.yaml b/tests/load-tests/ci-scripts/max-concurrency/cluster_read_config.yaml index 03baea2b3..2afa8af15 100644 --- a/tests/load-tests/ci-scripts/max-concurrency/cluster_read_config.yaml +++ b/tests/load-tests/ci-scripts/max-concurrency/cluster_read_config.yaml @@ -23,26 +23,26 @@ monitoring_step: {{ step }} {%- endmacro %} -{% macro monitor_pod_container(namespace, pod, container, step=15, pod_suffix_regex='-[0-9a-f]+-.*') -%} +{% macro monitor_pod_container(namespace, pod, container, step=20, pod_suffix_regex='-[0-9a-f]+-.*') -%} # Gather monitoring data about the pod's container - name: measurements.{{ pod }}.container[{{ container }}].memory monitoring_query: sum(container_memory_usage_bytes{namespace='{{ namespace }}', pod=~'{{ pod }}{{ pod_suffix_regex }}', container='{{container}}'}) monitoring_step: {{ step }} {%- endmacro %} -{% macro pv_stats(namespace, pvc) -%} +{% macro pv_stats(namespace, pvc, step=20) -%} # Collect data for PV stats - name: measurements.cluster.pv_stats.{{namespace}}/{{pvc}}.capacity_bytes monitoring_query: kubelet_volume_stats_capacity_bytes{namespace="{{ namespace }}", persistentvolumeclaim="{{ pvc }}"} - monitoring_step: 15 + monitoring_step: {{ step }} - name: measurements.cluster.pv_stats.{{namespace}}/{{pvc}}.used_bytes monitoring_query: kubelet_volume_stats_used_bytes{namespace="{{ namespace }}", persistentvolumeclaim="{{ pvc }}"} - monitoring_step: 15 + monitoring_step: {{ step }} - name: measurements.cluster.pv_stats.{{namespace}}/{{pvc}}.available_bytes monitoring_query: kubelet_volume_stats_available_bytes{namespace="{{ namespace }}", persistentvolumeclaim="{{ pvc }}"} - monitoring_step: 15 + monitoring_step: {{ step }} {%- endmacro %} -{{ monitor_pod('tekton-results', 'tekton-results-watcher', 15, '-.*') }} -{{ monitor_pod_container('tekton-results', 'tekton-results-watcher', 'watcher', 15, '-.*') }} -{{ pv_stats('tekton-results', 'data-postgres-postgresql-0') }} +{{ monitor_pod('tekton-results', 'tekton-results-watcher', 20, '-.*') }} +{{ monitor_pod_container('tekton-results', 'tekton-results-watcher', 'watcher', 20, '-.*') }} +{{ pv_stats('tekton-results', 'data-postgres-postgresql-0', 20) }} diff --git a/tests/load-tests/ci-scripts/max-concurrency/collect-results.sh b/tests/load-tests/ci-scripts/max-concurrency/collect-results.sh index 23387b825..ef3528570 100755 --- a/tests/load-tests/ci-scripts/max-concurrency/collect-results.sh +++ b/tests/load-tests/ci-scripts/max-concurrency/collect-results.sh @@ -11,11 +11,17 @@ pushd "${2:-.}" output_dir="${OUTPUT_DIR:-./tests/load-tests}" +csv_delim=";" +csv_delim_quoted="\"$csv_delim\"" +dt_format='"%Y-%m-%dT%H:%M:%SZ"' + collect_artifacts() { echo "Collecting load test artifacts.." - find "$output_dir" -type f -name 'load-tests.max-concurrency.*.log' -exec cp -vf {} "${ARTIFACT_DIR}" \; + mkdir -p "${ARTIFACT_DIR}/logs" + find "$output_dir" -type f -name 'load-tests.max-concurrency.*.log' -exec cp -vf {} "${ARTIFACT_DIR}/logs" \; find "$output_dir" -type f -name 'load-tests.max-concurrency.json' -exec cp -vf {} "${ARTIFACT_DIR}" \; - find "$output_dir" -type f -name '*.pprof' -exec cp -vf {} "${ARTIFACT_DIR}" \; + mkdir -p "${ARTIFACT_DIR}/pprof" + find "$output_dir" -type f -name '*.pprof' -exec cp -vf {} "${ARTIFACT_DIR}/pprof" \; } collect_monitoring_data() { @@ -81,21 +87,29 @@ collect_tekton_profiling_data() { for pprof_profile in $(find "$output_dir" -name "*.pprof"); do if [ -s "$pprof_profile" ]; then file=$(basename "$pprof_profile") - go tool pprof -text "$pprof_profile" >"$ARTIFACT_DIR/$file.txt" || true - go tool pprof -svg -output="$ARTIFACT_DIR/$file.svg" "$pprof_profile" || true + go tool pprof -text "$pprof_profile" >"$ARTIFACT_DIR/pprof/$file.txt" || true + go tool pprof -svg -output="$ARTIFACT_DIR/pprof/$file.svg" "$pprof_profile" || true fi done fi } +get_tekton_results_watcher_pod_count() { + find "$output_dir" -type f -name 'tekton-results-watcher.tekton-results-watcher-*.goroutine-dump-0.0001-*.pprof' | wc -l +} + collect_scalability_data() { echo "Collecting scalability data..." - csv_delim=";" - csv_delim_quoted="\"$csv_delim\"" - dt_format='"%Y-%m-%dT%H:%M:%SZ"' + + tekton_results_watcher_pod_count=$(get_tekton_results_watcher_pod_count) + tekton_results_watcher_pod_headers="" + for i in $(seq -w 1 "$tekton_results_watcher_pod_count"); do + tekton_results_watcher_pod_headers="${tekton_results_watcher_pod_headers}${csv_delim}ParkedGoRoutinesPod$i" + done max_concurrency_csv=$ARTIFACT_DIR/max-concurrency.csv - echo "Threads\ + echo "Iteration\ +${csv_delim}Threads\ ${csv_delim}WorkloadKPI\ ${csv_delim}Errors\ ${csv_delim}UserAvgTime\ @@ -108,10 +122,10 @@ ${csv_delim}ComponentsAvgTime\ ${csv_delim}ComponentsMaxTime\ ${csv_delim}PipelineRunAvgTime\ ${csv_delim}PipelineRunMaxTime\ -${csv_delim}integrationTestsRunPipelineSucceededTimeAvg\ -${csv_delim}integrationTestsRunPipelineSucceededTimeMax\ -${csv_delim}deploymentSucceededTimeAvg\ -${csv_delim}deploymentSucceededTimeMax\ +${csv_delim}IntegrationTestsRunPipelineSucceededTimeAvg\ +${csv_delim}IntegrationTestsRunPipelineSucceededTimeMax\ +${csv_delim}DeploymentSucceededTimeAvg\ +${csv_delim}DeploymentSucceededTimeMax\ ${csv_delim}ClusterCPUUsageAvg\ ${csv_delim}ClusterDiskUsageAvg\ ${csv_delim}ClusterMemoryUsageAvg\ @@ -119,6 +133,10 @@ ${csv_delim}ClusterPodCountAvg\ ${csv_delim}ClusterNodesWorkerCountAvg\ ${csv_delim}ClusterRunningPodsOnWorkersCountAvg\ ${csv_delim}ClusterPVCInUseAvg\ +${csv_delim}TektonResultsWatcherMemoryMin\ +${csv_delim}TektonResultsWatcherMemoryMax\ +${csv_delim}TektonResultsWatcherMemoryRange\ +${tekton_results_watcher_pod_headers}\ ${csv_delim}SchedulerPendingPodsCountAvg\ ${csv_delim}TokenPoolRatePrimaryAvg\ ${csv_delim}TokenPoolRateSecondaryAvg\ @@ -135,50 +153,81 @@ ${csv_delim}NodeDiskIoTimeSecondsTotalAvg" \ >"$max_concurrency_csv" mc_files=$(find "$output_dir" -type f -name 'load-tests.max-concurrency.*.json') if [ -n "$mc_files" ]; then - cat $mc_files | - jq -rc "(.threads | tostring) \ - + $csv_delim_quoted + (.workloadKPI | tostring) \ - + $csv_delim_quoted + (.errorsTotal | tostring) \ - + $csv_delim_quoted + (.createUserTimeAvg | tostring) \ - + $csv_delim_quoted + (.createUserTimeMax | tostring) \ - + $csv_delim_quoted + (.createApplicationsTimeAvg | tostring) \ - + $csv_delim_quoted + (.createApplicationsTimeMax | tostring) \ - + $csv_delim_quoted + (.createCDQsTimeAvg | tostring) \ - + $csv_delim_quoted + (.createCDQsTimeMax | tostring) \ - + $csv_delim_quoted + (.createComponentsTimeAvg | tostring) \ - + $csv_delim_quoted + (.createComponentsTimeMax | tostring) \ - + $csv_delim_quoted + (.runPipelineSucceededTimeAvg | tostring) \ - + $csv_delim_quoted + (.runPipelineSucceededTimeMax | tostring) \ - + $csv_delim_quoted + (.integrationTestsRunPipelineSucceededTimeAvg | tostring) \ - + $csv_delim_quoted + (.integrationTestsRunPipelineSucceededTimeMax | tostring) \ - + $csv_delim_quoted + (.deploymentSucceededTimeAvg | tostring) \ - + $csv_delim_quoted + (.deploymentSucceededTimeMax | tostring) \ - + $csv_delim_quoted + (.measurements.cluster_cpu_usage_seconds_total_rate.mean | tostring) \ - + $csv_delim_quoted + (.measurements.cluster_disk_throughput_total.mean | tostring) \ - + $csv_delim_quoted + (.measurements.cluster_memory_usage_rss_total.mean | tostring) \ - + $csv_delim_quoted + (.measurements.cluster_pods_count.mean | tostring) \ - + $csv_delim_quoted + (.measurements.cluster_nodes_worker_count.mean | tostring) \ - + $csv_delim_quoted + (.measurements.cluster_running_pods_on_workers_count.mean | tostring) \ - + $csv_delim_quoted + (.measurements.storage_count_attachable_volumes_in_use.mean | tostring) \ - + $csv_delim_quoted + (.measurements.scheduler_pending_pods_count.mean | tostring) \ - + $csv_delim_quoted + (.measurements.token_pool_rate_primary.mean | tostring) \ - + $csv_delim_quoted + (.measurements.token_pool_rate_secondary.mean | tostring) \ - + $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_pipelineruns_count.mean | tostring) \ - + $csv_delim_quoted + (.measurements.tekton_tekton_pipelines_controller_workqueue_depth.mean | tostring) \ - + $csv_delim_quoted + (.measurements.pipelinerun_duration_scheduled_seconds.mean | tostring) \ - + $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_taskruns_throttled_by_node.mean | tostring) \ - + $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_taskruns_throttled_by_quota.mean | tostring) \ - + $csv_delim_quoted + (.measurements.etcd_request_duration_seconds_average.mean | tostring) \ - + $csv_delim_quoted + (.measurements.cluster_network_bytes_total.mean | tostring) \ - + $csv_delim_quoted + (.measurements.cluster_network_receive_bytes_total.mean | tostring) \ - + $csv_delim_quoted + (.measurements.cluster_network_transmit_bytes_total.mean | tostring) \ - + $csv_delim_quoted + (.measurements.node_disk_io_time_seconds_total.mean | tostring)" \ - >>"$max_concurrency_csv" + for i in $mc_files; do + iteration_index=$(echo "$i" | sed -e 's,'"$output_dir"'/load-tests.max-concurrency.\([0-9-]\+\).*,\1,g') + + parked_go_routines=$(get_parked_go_routines "$iteration_index") + parked_go_routines_columns="" + if [ -n "$parked_go_routines" ]; then + for g in $parked_go_routines; do + parked_go_routines_columns="$parked_go_routines_columns + $csv_delim_quoted + \"$g\"" + done + else + for _ in $(seq 1 "$(get_tekton_results_watcher_pod_count)"); do + parked_go_routines_columns="$parked_go_routines_columns + $csv_delim_quoted" + done + fi + jq -rc "(.metadata.\"max-concurrency\".iteration | tostring) \ + + $csv_delim_quoted + (.threads | tostring) \ + + $csv_delim_quoted + (.workloadKPI | tostring) \ + + $csv_delim_quoted + (.errorsTotal | tostring) \ + + $csv_delim_quoted + (.createUserTimeAvg | tostring) \ + + $csv_delim_quoted + (.createUserTimeMax | tostring) \ + + $csv_delim_quoted + (.createApplicationsTimeAvg | tostring) \ + + $csv_delim_quoted + (.createApplicationsTimeMax | tostring) \ + + $csv_delim_quoted + (.createCDQsTimeAvg | tostring) \ + + $csv_delim_quoted + (.createCDQsTimeMax | tostring) \ + + $csv_delim_quoted + (.createComponentsTimeAvg | tostring) \ + + $csv_delim_quoted + (.createComponentsTimeMax | tostring) \ + + $csv_delim_quoted + (.runPipelineSucceededTimeAvg | tostring) \ + + $csv_delim_quoted + (.runPipelineSucceededTimeMax | tostring) \ + + $csv_delim_quoted + (.integrationTestsRunPipelineSucceededTimeAvg | tostring) \ + + $csv_delim_quoted + (.integrationTestsRunPipelineSucceededTimeMax | tostring) \ + + $csv_delim_quoted + (.deploymentSucceededTimeAvg | tostring) \ + + $csv_delim_quoted + (.deploymentSucceededTimeMax | tostring) \ + + $csv_delim_quoted + (.measurements.cluster_cpu_usage_seconds_total_rate.mean | tostring) \ + + $csv_delim_quoted + (.measurements.cluster_disk_throughput_total.mean | tostring) \ + + $csv_delim_quoted + (.measurements.cluster_memory_usage_rss_total.mean | tostring) \ + + $csv_delim_quoted + (.measurements.cluster_pods_count.mean | tostring) \ + + $csv_delim_quoted + (.measurements.cluster_nodes_worker_count.mean | tostring) \ + + $csv_delim_quoted + (.measurements.cluster_running_pods_on_workers_count.mean | tostring) \ + + $csv_delim_quoted + (.measurements.storage_count_attachable_volumes_in_use.mean | tostring) \ + + $csv_delim_quoted + (.measurements.\"tekton-results-watcher\".\"container[watcher]\".memory.min | tostring) \ + + $csv_delim_quoted + (.measurements.\"tekton-results-watcher\".\"container[watcher]\".memory.max | tostring) \ + + $csv_delim_quoted + (.measurements.\"tekton-results-watcher\".\"container[watcher]\".memory.range | tostring) \ + ${parked_go_routines_columns} \ + + $csv_delim_quoted + (.measurements.scheduler_pending_pods_count.mean | tostring) \ + + $csv_delim_quoted + (.measurements.token_pool_rate_primary.mean | tostring) \ + + $csv_delim_quoted + (.measurements.token_pool_rate_secondary.mean | tostring) \ + + $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_pipelineruns_count.mean | tostring) \ + + $csv_delim_quoted + (.measurements.tekton_tekton_pipelines_controller_workqueue_depth.mean | tostring) \ + + $csv_delim_quoted + (.measurements.pipelinerun_duration_scheduled_seconds.mean | tostring) \ + + $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_taskruns_throttled_by_node.mean | tostring) \ + + $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_taskruns_throttled_by_quota.mean | tostring) \ + + $csv_delim_quoted + (.measurements.etcd_request_duration_seconds_average.mean | tostring) \ + + $csv_delim_quoted + (.measurements.cluster_network_bytes_total.mean | tostring) \ + + $csv_delim_quoted + (.measurements.cluster_network_receive_bytes_total.mean | tostring) \ + + $csv_delim_quoted + (.measurements.cluster_network_transmit_bytes_total.mean | tostring) \ + + $csv_delim_quoted + (.measurements.node_disk_io_time_seconds_total.mean | tostring)" \ + "$i" >>"$max_concurrency_csv" + done else echo "WARNING: No file matching '$output_dir/load-tests.max-concurrency.*.json' found!" fi } +get_parked_go_routines() { + goroutines_pprof=$(find "$output_dir" -name "tekton-results-watcher.tekton-results-watcher-*.goroutine-dump-0.$1.pprof") + count=0 + for i in $goroutines_pprof; do + if [ $count -gt 0 ]; then + echo -n " " + fi + echo -n "$(go tool pprof -text "$i" 2>/dev/null | grep 'runtime.gopark$' | sed -e 's,[ ]*\([0-9]\+\) .*,\1,g')" + count=$((count + 1)) + done +} + collect_timestamp_csvs() { echo "Collecting PipelineRun timestamps..." pipelinerun_timestamps=$ARTIFACT_DIR/pipelineruns.tekton.dev_timestamps.csv @@ -201,10 +250,9 @@ collect_timestamp_csvs() { } echo "Collecting max concurrency results..." -collect_artifacts -collect_scalability_data -collect_tekton_profiling_data -collect_timestamp_csvs -collect_monitoring_data - +collect_artifacts || true +collect_timestamp_csvs || true +collect_monitoring_data || true +collect_scalability_data || true +collect_tekton_profiling_data || true popd diff --git a/tests/load-tests/run-max-concurrency.sh b/tests/load-tests/run-max-concurrency.sh index b4b40d336..4a58375d6 100755 --- a/tests/load-tests/run-max-concurrency.sh +++ b/tests/load-tests/run-max-concurrency.sh @@ -6,6 +6,10 @@ output_dir="${OUTPUT_DIR:-.}" export TEKTON_PERF_PROFILE_CPU_PERIOD=${TEKTON_PERF_PROFILE_CPU_PERIOD:-${THRESHOLD:-300}} USER_PREFIX=${USER_PREFIX:-testuser} +OPENSHIFT_API="${OPENSHIFT_API:-$(yq '.clusters[0].cluster.server' "$KUBECONFIG")}" +OPENSHIFT_USERNAME="${OPENSHIFT_USERNAME:-kubeadmin}" +OPENSHIFT_PASSWORD="${OPENSHIFT_PASSWORD:-$(cat "$KUBEADMIN_PASSWORD_FILE")}" + load_test() { local threads iteration index iteration_index threads=${1:-1} @@ -45,6 +49,8 @@ load_test() { echo $! >"$output_dir/$file.pid" done fi + rm -rvf "$output_dir/load-test.json" + rm -rvf "$output_dir/load-test.log" go run loadtest.go \ --component-repo "${COMPONENT_REPO:-https://github.com/nodeshift-starters/devfile-sample.git}" \ --username "$USER_PREFIX-$index" \ @@ -61,7 +67,7 @@ load_test() { --pipeline-skip-initial-checks="${PIPELINE_SKIP_INITIAL_CHECKS:-true}" if [ "${TEKTON_PERF_ENABLE_CPU_PROFILING:-}" == "true" ] || [ "${TEKTON_PERF_ENABLE_MEMORY_PROFILING:-}" == "true" ]; then echo "Waiting for the Tekton profiling to finish up to ${TEKTON_PERF_PROFILE_CPU_PERIOD}s" - for pid_file in $(find $output_dir -name 'tekton*.pid'); do + for pid_file in $(find "$output_dir" -name 'tekton*.pid'); do wait "$(cat "$pid_file")" rm -rvf "$pid_file" done @@ -88,9 +94,9 @@ remove_finalizers() { res=$1 while [ "$(oc get "$res" -A -o json | jq -rc '.items[] | select(.metadata.namespace | startswith("'"$USER_PREFIX"'"))' | wc -l)" != "0" ]; do echo "## Removing finalizers for all $res" - while read line; do + while read -r line; do echo "$line '{\"metadata\":{\"finalizers\":[]}}' --type=merge;" - done <<<$(oc get "$res" -A -o json | jq -rc '.items[] | select(.metadata.namespace | startswith("'"$USER_PREFIX"'")) | "oc patch '"$res"' " + .metadata.name + " -n " + .metadata.namespace + " -p "') | bash -s + done <<<"$(oc get "$res" -A -o json | jq -rc '.items[] | select(.metadata.namespace | startswith("'"$USER_PREFIX"'")) | "oc patch '"$res"' " + .metadata.name + " -n " + .metadata.namespace + " -p "')" | bash -s done } @@ -134,16 +140,18 @@ max_concurrency() { IFS="," read -r -a maxConcurrencySteps <<<"$(echo "${MAX_CONCURRENCY_STEPS:-1\ 5\ 10\ 25\ 50\ 100\ 150\ 200}" | sed 's/ /,/g')" maxThreads=${MAX_THREADS:-10} threshold=${THRESHOLD:-300} - echo '{"startTimestamp":"'$(date +%FT%T%:z)'", "maxThreads": '"$maxThreads"', "maxConcurrencySteps": "'"${maxConcurrencySteps[*]}"'", "threshold": '"$threshold"', "maxConcurrencyReached": 0, "computedConcurrency": 0, "workloadKPI": 0, "endTimestamp": "", "errorsTotal": -1}' | jq >"$output" + echo '{"startTimestamp":"'"$(date +%FT%T%:z)"'", "maxThreads": '"$maxThreads"', "maxConcurrencySteps": "'"${maxConcurrencySteps[*]}"'", "threshold": '"$threshold"', "maxConcurrencyReached": 0, "computedConcurrency": 0, "workloadKPI": 0, "endTimestamp": "", "errorsTotal": -1}' | jq >"$output" iteration=0 for t in "${maxConcurrencySteps[@]}"; do iteration="$((iteration + 1))" if (("$t" > "$maxThreads")); then break fi + oc login "$OPENSHIFT_API" -u "$OPENSHIFT_USERNAME" -p "$OPENSHIFT_PASSWORD" clean_namespaces load_test "$t" "$iteration" iteration_index="$(printf "%04d" "$iteration")-$(printf "%04d" "$t")" + jq ".metadata.\"max-concurrency\".iteration = \"$(printf "%04d" "$iteration")\"" "$output_dir/load-tests.json" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output_dir/load-tests.json" cp -vf "$output_dir/load-tests.json" "$output_dir/load-tests.max-concurrency.$iteration_index.json" cp -vf "$output_dir/load-tests.log" "$output_dir/load-tests.max-concurrency.$iteration_index.log" workloadKPI=$(jq '.workloadKPI' "$output_dir/load-tests.json") @@ -158,7 +166,7 @@ max_concurrency() { jq ".maxConcurrencyReached = $t" "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output" jq ".workloadKPI = $workloadKPI" "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output" jq ".computedConcurrency = $t" "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output" - jq '.endTimestamp = "'$(date +%FT%T%:z)'"' "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output" + jq '.endTimestamp = "'"$(date +%FT%T%:z)"'"' "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output" errorsTotal=$(jq '.errorsTotal' "$output_dir/load-tests.json") jq ".errorsTotal = $errorsTotal" "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output" fi