Skip to content

Commit

Permalink
fix(load-test): fix max concurrency collect results script, add tekto…
Browse files Browse the repository at this point in the history
…n result watcher metrics (#1074)

fix(load-test): fix max concurrency collect results script

Signed-off-by: Pavel Macík <[email protected]>
  • Loading branch information
pmacik authored Mar 22, 2024
1 parent 13ba743 commit 7b6309f
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,26 @@
monitoring_step: {{ step }}
{%- endmacro %}

{% macro monitor_pod_container(namespace, pod, container, step=15, pod_suffix_regex='-[0-9a-f]+-.*') -%}
{% macro monitor_pod_container(namespace, pod, container, step=20, pod_suffix_regex='-[0-9a-f]+-.*') -%}
# Gather monitoring data about the pod's container
- name: measurements.{{ pod }}.container[{{ container }}].memory
monitoring_query: sum(container_memory_usage_bytes{namespace='{{ namespace }}', pod=~'{{ pod }}{{ pod_suffix_regex }}', container='{{container}}'})
monitoring_step: {{ step }}
{%- endmacro %}

{% macro pv_stats(namespace, pvc) -%}
{% macro pv_stats(namespace, pvc, step=20) -%}
# Collect data for PV stats
- name: measurements.cluster.pv_stats.{{namespace}}/{{pvc}}.capacity_bytes
monitoring_query: kubelet_volume_stats_capacity_bytes{namespace="{{ namespace }}", persistentvolumeclaim="{{ pvc }}"}
monitoring_step: 15
monitoring_step: {{ step }}
- name: measurements.cluster.pv_stats.{{namespace}}/{{pvc}}.used_bytes
monitoring_query: kubelet_volume_stats_used_bytes{namespace="{{ namespace }}", persistentvolumeclaim="{{ pvc }}"}
monitoring_step: 15
monitoring_step: {{ step }}
- name: measurements.cluster.pv_stats.{{namespace}}/{{pvc}}.available_bytes
monitoring_query: kubelet_volume_stats_available_bytes{namespace="{{ namespace }}", persistentvolumeclaim="{{ pvc }}"}
monitoring_step: 15
monitoring_step: {{ step }}
{%- endmacro %}

{{ monitor_pod('tekton-results', 'tekton-results-watcher', 15, '-.*') }}
{{ monitor_pod_container('tekton-results', 'tekton-results-watcher', 'watcher', 15, '-.*') }}
{{ pv_stats('tekton-results', 'data-postgres-postgresql-0') }}
{{ monitor_pod('tekton-results', 'tekton-results-watcher', 20, '-.*') }}
{{ monitor_pod_container('tekton-results', 'tekton-results-watcher', 'watcher', 20, '-.*') }}
{{ pv_stats('tekton-results', 'data-postgres-postgresql-0', 20) }}
162 changes: 105 additions & 57 deletions tests/load-tests/ci-scripts/max-concurrency/collect-results.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,17 @@ pushd "${2:-.}"

output_dir="${OUTPUT_DIR:-./tests/load-tests}"

csv_delim=";"
csv_delim_quoted="\"$csv_delim\""
dt_format='"%Y-%m-%dT%H:%M:%SZ"'

collect_artifacts() {
echo "Collecting load test artifacts.."
find "$output_dir" -type f -name 'load-tests.max-concurrency.*.log' -exec cp -vf {} "${ARTIFACT_DIR}" \;
mkdir -p "${ARTIFACT_DIR}/logs"
find "$output_dir" -type f -name 'load-tests.max-concurrency.*.log' -exec cp -vf {} "${ARTIFACT_DIR}/logs" \;
find "$output_dir" -type f -name 'load-tests.max-concurrency.json' -exec cp -vf {} "${ARTIFACT_DIR}" \;
find "$output_dir" -type f -name '*.pprof' -exec cp -vf {} "${ARTIFACT_DIR}" \;
mkdir -p "${ARTIFACT_DIR}/pprof"
find "$output_dir" -type f -name '*.pprof' -exec cp -vf {} "${ARTIFACT_DIR}/pprof" \;
}

collect_monitoring_data() {
Expand Down Expand Up @@ -81,21 +87,29 @@ collect_tekton_profiling_data() {
for pprof_profile in $(find "$output_dir" -name "*.pprof"); do
if [ -s "$pprof_profile" ]; then
file=$(basename "$pprof_profile")
go tool pprof -text "$pprof_profile" >"$ARTIFACT_DIR/$file.txt" || true
go tool pprof -svg -output="$ARTIFACT_DIR/$file.svg" "$pprof_profile" || true
go tool pprof -text "$pprof_profile" >"$ARTIFACT_DIR/pprof/$file.txt" || true
go tool pprof -svg -output="$ARTIFACT_DIR/pprof/$file.svg" "$pprof_profile" || true
fi
done
fi
}

get_tekton_results_watcher_pod_count() {
find "$output_dir" -type f -name 'tekton-results-watcher.tekton-results-watcher-*.goroutine-dump-0.0001-*.pprof' | wc -l
}

collect_scalability_data() {
echo "Collecting scalability data..."
csv_delim=";"
csv_delim_quoted="\"$csv_delim\""
dt_format='"%Y-%m-%dT%H:%M:%SZ"'

tekton_results_watcher_pod_count=$(get_tekton_results_watcher_pod_count)
tekton_results_watcher_pod_headers=""
for i in $(seq -w 1 "$tekton_results_watcher_pod_count"); do
tekton_results_watcher_pod_headers="${tekton_results_watcher_pod_headers}${csv_delim}ParkedGoRoutinesPod$i"
done

max_concurrency_csv=$ARTIFACT_DIR/max-concurrency.csv
echo "Threads\
echo "Iteration\
${csv_delim}Threads\
${csv_delim}WorkloadKPI\
${csv_delim}Errors\
${csv_delim}UserAvgTime\
Expand All @@ -108,17 +122,21 @@ ${csv_delim}ComponentsAvgTime\
${csv_delim}ComponentsMaxTime\
${csv_delim}PipelineRunAvgTime\
${csv_delim}PipelineRunMaxTime\
${csv_delim}integrationTestsRunPipelineSucceededTimeAvg\
${csv_delim}integrationTestsRunPipelineSucceededTimeMax\
${csv_delim}deploymentSucceededTimeAvg\
${csv_delim}deploymentSucceededTimeMax\
${csv_delim}IntegrationTestsRunPipelineSucceededTimeAvg\
${csv_delim}IntegrationTestsRunPipelineSucceededTimeMax\
${csv_delim}DeploymentSucceededTimeAvg\
${csv_delim}DeploymentSucceededTimeMax\
${csv_delim}ClusterCPUUsageAvg\
${csv_delim}ClusterDiskUsageAvg\
${csv_delim}ClusterMemoryUsageAvg\
${csv_delim}ClusterPodCountAvg\
${csv_delim}ClusterNodesWorkerCountAvg\
${csv_delim}ClusterRunningPodsOnWorkersCountAvg\
${csv_delim}ClusterPVCInUseAvg\
${csv_delim}TektonResultsWatcherMemoryMin\
${csv_delim}TektonResultsWatcherMemoryMax\
${csv_delim}TektonResultsWatcherMemoryRange\
${tekton_results_watcher_pod_headers}\
${csv_delim}SchedulerPendingPodsCountAvg\
${csv_delim}TokenPoolRatePrimaryAvg\
${csv_delim}TokenPoolRateSecondaryAvg\
Expand All @@ -135,50 +153,81 @@ ${csv_delim}NodeDiskIoTimeSecondsTotalAvg" \
>"$max_concurrency_csv"
mc_files=$(find "$output_dir" -type f -name 'load-tests.max-concurrency.*.json')
if [ -n "$mc_files" ]; then
cat $mc_files |
jq -rc "(.threads | tostring) \
+ $csv_delim_quoted + (.workloadKPI | tostring) \
+ $csv_delim_quoted + (.errorsTotal | tostring) \
+ $csv_delim_quoted + (.createUserTimeAvg | tostring) \
+ $csv_delim_quoted + (.createUserTimeMax | tostring) \
+ $csv_delim_quoted + (.createApplicationsTimeAvg | tostring) \
+ $csv_delim_quoted + (.createApplicationsTimeMax | tostring) \
+ $csv_delim_quoted + (.createCDQsTimeAvg | tostring) \
+ $csv_delim_quoted + (.createCDQsTimeMax | tostring) \
+ $csv_delim_quoted + (.createComponentsTimeAvg | tostring) \
+ $csv_delim_quoted + (.createComponentsTimeMax | tostring) \
+ $csv_delim_quoted + (.runPipelineSucceededTimeAvg | tostring) \
+ $csv_delim_quoted + (.runPipelineSucceededTimeMax | tostring) \
+ $csv_delim_quoted + (.integrationTestsRunPipelineSucceededTimeAvg | tostring) \
+ $csv_delim_quoted + (.integrationTestsRunPipelineSucceededTimeMax | tostring) \
+ $csv_delim_quoted + (.deploymentSucceededTimeAvg | tostring) \
+ $csv_delim_quoted + (.deploymentSucceededTimeMax | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_cpu_usage_seconds_total_rate.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_disk_throughput_total.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_memory_usage_rss_total.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_pods_count.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_nodes_worker_count.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_running_pods_on_workers_count.mean | tostring) \
+ $csv_delim_quoted + (.measurements.storage_count_attachable_volumes_in_use.mean | tostring) \
+ $csv_delim_quoted + (.measurements.scheduler_pending_pods_count.mean | tostring) \
+ $csv_delim_quoted + (.measurements.token_pool_rate_primary.mean | tostring) \
+ $csv_delim_quoted + (.measurements.token_pool_rate_secondary.mean | tostring) \
+ $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_pipelineruns_count.mean | tostring) \
+ $csv_delim_quoted + (.measurements.tekton_tekton_pipelines_controller_workqueue_depth.mean | tostring) \
+ $csv_delim_quoted + (.measurements.pipelinerun_duration_scheduled_seconds.mean | tostring) \
+ $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_taskruns_throttled_by_node.mean | tostring) \
+ $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_taskruns_throttled_by_quota.mean | tostring) \
+ $csv_delim_quoted + (.measurements.etcd_request_duration_seconds_average.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_network_bytes_total.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_network_receive_bytes_total.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_network_transmit_bytes_total.mean | tostring) \
+ $csv_delim_quoted + (.measurements.node_disk_io_time_seconds_total.mean | tostring)" \
>>"$max_concurrency_csv"
for i in $mc_files; do
iteration_index=$(echo "$i" | sed -e 's,'"$output_dir"'/load-tests.max-concurrency.\([0-9-]\+\).*,\1,g')

parked_go_routines=$(get_parked_go_routines "$iteration_index")
parked_go_routines_columns=""
if [ -n "$parked_go_routines" ]; then
for g in $parked_go_routines; do
parked_go_routines_columns="$parked_go_routines_columns + $csv_delim_quoted + \"$g\""
done
else
for _ in $(seq 1 "$(get_tekton_results_watcher_pod_count)"); do
parked_go_routines_columns="$parked_go_routines_columns + $csv_delim_quoted"
done
fi
jq -rc "(.metadata.\"max-concurrency\".iteration | tostring) \
+ $csv_delim_quoted + (.threads | tostring) \
+ $csv_delim_quoted + (.workloadKPI | tostring) \
+ $csv_delim_quoted + (.errorsTotal | tostring) \
+ $csv_delim_quoted + (.createUserTimeAvg | tostring) \
+ $csv_delim_quoted + (.createUserTimeMax | tostring) \
+ $csv_delim_quoted + (.createApplicationsTimeAvg | tostring) \
+ $csv_delim_quoted + (.createApplicationsTimeMax | tostring) \
+ $csv_delim_quoted + (.createCDQsTimeAvg | tostring) \
+ $csv_delim_quoted + (.createCDQsTimeMax | tostring) \
+ $csv_delim_quoted + (.createComponentsTimeAvg | tostring) \
+ $csv_delim_quoted + (.createComponentsTimeMax | tostring) \
+ $csv_delim_quoted + (.runPipelineSucceededTimeAvg | tostring) \
+ $csv_delim_quoted + (.runPipelineSucceededTimeMax | tostring) \
+ $csv_delim_quoted + (.integrationTestsRunPipelineSucceededTimeAvg | tostring) \
+ $csv_delim_quoted + (.integrationTestsRunPipelineSucceededTimeMax | tostring) \
+ $csv_delim_quoted + (.deploymentSucceededTimeAvg | tostring) \
+ $csv_delim_quoted + (.deploymentSucceededTimeMax | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_cpu_usage_seconds_total_rate.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_disk_throughput_total.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_memory_usage_rss_total.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_pods_count.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_nodes_worker_count.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_running_pods_on_workers_count.mean | tostring) \
+ $csv_delim_quoted + (.measurements.storage_count_attachable_volumes_in_use.mean | tostring) \
+ $csv_delim_quoted + (.measurements.\"tekton-results-watcher\".\"container[watcher]\".memory.min | tostring) \
+ $csv_delim_quoted + (.measurements.\"tekton-results-watcher\".\"container[watcher]\".memory.max | tostring) \
+ $csv_delim_quoted + (.measurements.\"tekton-results-watcher\".\"container[watcher]\".memory.range | tostring) \
${parked_go_routines_columns} \
+ $csv_delim_quoted + (.measurements.scheduler_pending_pods_count.mean | tostring) \
+ $csv_delim_quoted + (.measurements.token_pool_rate_primary.mean | tostring) \
+ $csv_delim_quoted + (.measurements.token_pool_rate_secondary.mean | tostring) \
+ $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_pipelineruns_count.mean | tostring) \
+ $csv_delim_quoted + (.measurements.tekton_tekton_pipelines_controller_workqueue_depth.mean | tostring) \
+ $csv_delim_quoted + (.measurements.pipelinerun_duration_scheduled_seconds.mean | tostring) \
+ $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_taskruns_throttled_by_node.mean | tostring) \
+ $csv_delim_quoted + (.measurements.tekton_pipelines_controller_running_taskruns_throttled_by_quota.mean | tostring) \
+ $csv_delim_quoted + (.measurements.etcd_request_duration_seconds_average.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_network_bytes_total.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_network_receive_bytes_total.mean | tostring) \
+ $csv_delim_quoted + (.measurements.cluster_network_transmit_bytes_total.mean | tostring) \
+ $csv_delim_quoted + (.measurements.node_disk_io_time_seconds_total.mean | tostring)" \
"$i" >>"$max_concurrency_csv"
done
else
echo "WARNING: No file matching '$output_dir/load-tests.max-concurrency.*.json' found!"
fi
}

get_parked_go_routines() {
goroutines_pprof=$(find "$output_dir" -name "tekton-results-watcher.tekton-results-watcher-*.goroutine-dump-0.$1.pprof")
count=0
for i in $goroutines_pprof; do
if [ $count -gt 0 ]; then
echo -n " "
fi
echo -n "$(go tool pprof -text "$i" 2>/dev/null | grep 'runtime.gopark$' | sed -e 's,[ ]*\([0-9]\+\) .*,\1,g')"
count=$((count + 1))
done
}

collect_timestamp_csvs() {
echo "Collecting PipelineRun timestamps..."
pipelinerun_timestamps=$ARTIFACT_DIR/pipelineruns.tekton.dev_timestamps.csv
Expand All @@ -201,10 +250,9 @@ collect_timestamp_csvs() {
}

echo "Collecting max concurrency results..."
collect_artifacts
collect_scalability_data
collect_tekton_profiling_data
collect_timestamp_csvs
collect_monitoring_data

collect_artifacts || true
collect_timestamp_csvs || true
collect_monitoring_data || true
collect_scalability_data || true
collect_tekton_profiling_data || true
popd
18 changes: 13 additions & 5 deletions tests/load-tests/run-max-concurrency.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ output_dir="${OUTPUT_DIR:-.}"
export TEKTON_PERF_PROFILE_CPU_PERIOD=${TEKTON_PERF_PROFILE_CPU_PERIOD:-${THRESHOLD:-300}}
USER_PREFIX=${USER_PREFIX:-testuser}

OPENSHIFT_API="${OPENSHIFT_API:-$(yq '.clusters[0].cluster.server' "$KUBECONFIG")}"
OPENSHIFT_USERNAME="${OPENSHIFT_USERNAME:-kubeadmin}"
OPENSHIFT_PASSWORD="${OPENSHIFT_PASSWORD:-$(cat "$KUBEADMIN_PASSWORD_FILE")}"

load_test() {
local threads iteration index iteration_index
threads=${1:-1}
Expand Down Expand Up @@ -45,6 +49,8 @@ load_test() {
echo $! >"$output_dir/$file.pid"
done
fi
rm -rvf "$output_dir/load-test.json"
rm -rvf "$output_dir/load-test.log"
go run loadtest.go \
--component-repo "${COMPONENT_REPO:-https://github.com/nodeshift-starters/devfile-sample.git}" \
--username "$USER_PREFIX-$index" \
Expand All @@ -61,7 +67,7 @@ load_test() {
--pipeline-skip-initial-checks="${PIPELINE_SKIP_INITIAL_CHECKS:-true}"
if [ "${TEKTON_PERF_ENABLE_CPU_PROFILING:-}" == "true" ] || [ "${TEKTON_PERF_ENABLE_MEMORY_PROFILING:-}" == "true" ]; then
echo "Waiting for the Tekton profiling to finish up to ${TEKTON_PERF_PROFILE_CPU_PERIOD}s"
for pid_file in $(find $output_dir -name 'tekton*.pid'); do
for pid_file in $(find "$output_dir" -name 'tekton*.pid'); do
wait "$(cat "$pid_file")"
rm -rvf "$pid_file"
done
Expand All @@ -88,9 +94,9 @@ remove_finalizers() {
res=$1
while [ "$(oc get "$res" -A -o json | jq -rc '.items[] | select(.metadata.namespace | startswith("'"$USER_PREFIX"'"))' | wc -l)" != "0" ]; do
echo "## Removing finalizers for all $res"
while read line; do
while read -r line; do
echo "$line '{\"metadata\":{\"finalizers\":[]}}' --type=merge;"
done <<<$(oc get "$res" -A -o json | jq -rc '.items[] | select(.metadata.namespace | startswith("'"$USER_PREFIX"'")) | "oc patch '"$res"' " + .metadata.name + " -n " + .metadata.namespace + " -p "') | bash -s
done <<<"$(oc get "$res" -A -o json | jq -rc '.items[] | select(.metadata.namespace | startswith("'"$USER_PREFIX"'")) | "oc patch '"$res"' " + .metadata.name + " -n " + .metadata.namespace + " -p "')" | bash -s
done
}

Expand Down Expand Up @@ -134,16 +140,18 @@ max_concurrency() {
IFS="," read -r -a maxConcurrencySteps <<<"$(echo "${MAX_CONCURRENCY_STEPS:-1\ 5\ 10\ 25\ 50\ 100\ 150\ 200}" | sed 's/ /,/g')"
maxThreads=${MAX_THREADS:-10}
threshold=${THRESHOLD:-300}
echo '{"startTimestamp":"'$(date +%FT%T%:z)'", "maxThreads": '"$maxThreads"', "maxConcurrencySteps": "'"${maxConcurrencySteps[*]}"'", "threshold": '"$threshold"', "maxConcurrencyReached": 0, "computedConcurrency": 0, "workloadKPI": 0, "endTimestamp": "", "errorsTotal": -1}' | jq >"$output"
echo '{"startTimestamp":"'"$(date +%FT%T%:z)"'", "maxThreads": '"$maxThreads"', "maxConcurrencySteps": "'"${maxConcurrencySteps[*]}"'", "threshold": '"$threshold"', "maxConcurrencyReached": 0, "computedConcurrency": 0, "workloadKPI": 0, "endTimestamp": "", "errorsTotal": -1}' | jq >"$output"
iteration=0
for t in "${maxConcurrencySteps[@]}"; do
iteration="$((iteration + 1))"
if (("$t" > "$maxThreads")); then
break
fi
oc login "$OPENSHIFT_API" -u "$OPENSHIFT_USERNAME" -p "$OPENSHIFT_PASSWORD"
clean_namespaces
load_test "$t" "$iteration"
iteration_index="$(printf "%04d" "$iteration")-$(printf "%04d" "$t")"
jq ".metadata.\"max-concurrency\".iteration = \"$(printf "%04d" "$iteration")\"" "$output_dir/load-tests.json" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output_dir/load-tests.json"
cp -vf "$output_dir/load-tests.json" "$output_dir/load-tests.max-concurrency.$iteration_index.json"
cp -vf "$output_dir/load-tests.log" "$output_dir/load-tests.max-concurrency.$iteration_index.log"
workloadKPI=$(jq '.workloadKPI' "$output_dir/load-tests.json")
Expand All @@ -158,7 +166,7 @@ max_concurrency() {
jq ".maxConcurrencyReached = $t" "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output"
jq ".workloadKPI = $workloadKPI" "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output"
jq ".computedConcurrency = $t" "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output"
jq '.endTimestamp = "'$(date +%FT%T%:z)'"' "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output"
jq '.endTimestamp = "'"$(date +%FT%T%:z)"'"' "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output"
errorsTotal=$(jq '.errorsTotal' "$output_dir/load-tests.json")
jq ".errorsTotal = $errorsTotal" "$output" >"$output_dir/$$.json" && mv -f "$output_dir/$$.json" "$output"
fi
Expand Down

0 comments on commit 7b6309f

Please sign in to comment.