From 3f07a6b38b3e53c7b562c66ee2a6d2beb344ac1a Mon Sep 17 00:00:00 2001 From: Sergio Cazzolato Date: Thu, 22 Feb 2024 17:57:16 -0300 Subject: [PATCH] send halt-timeout as a tag to allow per backend garbage collection --- spread/testflinger.go | 36 ++++++++++++++++++--- tests/tesflinger/fake_testflinger.py | 48 ++++++++++++++++++++++++++-- tests/tesflinger/task.yaml | 4 +++ 3 files changed, 81 insertions(+), 7 deletions(-) diff --git a/spread/testflinger.go b/spread/testflinger.go index d226ed89..b074c281 100644 --- a/spread/testflinger.go +++ b/spread/testflinger.go @@ -11,6 +11,7 @@ import ( "net/url" "os" "regexp" + "strings" "time" "golang.org/x/net/context" @@ -65,6 +66,11 @@ type TestFlingerJobResponse struct { JobId string `json:"job_id"` } +type TestFlingerJobInfoResponse struct { + JobId string `json:"job_id"` + Tags []string `json:"tags"` +} + type TestFlingerDeviceInfo struct { DeviceIP string `json:"device_ip"` } @@ -142,14 +148,33 @@ func (p *TestFlingerProvider) GarbageCollect() error { // Iterate over all the running instances for _, s := range jobs { jobTimeout := haltTimeout - if jobTimeout == 0 { - continue - } if s.d.JobState == CANCELLED || s.d.JobState == COMPLETE || s.d.JobState == COMPLETED { continue } printf("Checking %s...", s.d.JobId) + var result TestFlingerJobInfoResponse + err := p.do("GET", "/job/"+s.d.JobId, nil, &result) + if err != nil { + return fmt.Errorf("cannot get instance info: %v", err) + } + + // A halt-timeout=DURATION tag on the job overrides the default timeout; the last valid tag wins + for _, tag := range result.Tags { + if strings.HasPrefix(tag, "halt-timeout=") { + value := strings.TrimPrefix(tag, "halt-timeout=") + d, err := time.ParseDuration(strings.TrimSpace(value)) + if err != nil { + printf("WARNING: Ignoring bad TestFlinger job %s halt-timeout tag: %q", s.d.JobId, value) + } else { + jobTimeout = d + } + } + } + if jobTimeout == 0 { + continue + } + runningTime := now.Sub(s.d.CreatedAt) if runningTime > jobTimeout { printf("Job %s exceeds 
halt-timeout. Shutting it down...", s.d.JobId) @@ -222,7 +247,10 @@ func (p *TestFlingerProvider) requestDevice(ctx context.Context, system *System) AllocateData: TestFlingerAllocateData{ Allocate: true, }, - Tags: []string{"spread"}, + // Tags used are: + // 1. spread, which is used to find the active spread jobs + // 2. halt-timeout=DURATION, which is used to determine when a running job has to be cancelled + Tags: []string{"spread", "halt-timeout=" + p.backend.HaltTimeout.Duration.String()}, } var jobRes TestFlingerJobResponse diff --git a/tests/tesflinger/fake_testflinger.py b/tests/tesflinger/fake_testflinger.py index 0085db02..3de2e846 100644 --- a/tests/tesflinger/fake_testflinger.py +++ b/tests/tesflinger/fake_testflinger.py @@ -112,19 +112,61 @@ def mock_action_delete(): return jsonify({}) +@app.route('/v1/job/00000000-0000-0000-0000-000000000004', methods=['GET']) +def mock_job_tags_4(): + return jsonify({ + "allocate_data": {}, + "allocation_timeout": 0, + "firmware_update_data": {}, + "global_timeout": 0, + "job_id": "00000000-0000-0000-0000-000000000004", + "job_queue": "queue", + "name": "name", + "output_timeout": 0, + "parent_job_id": "job", + "provision_data": {}, + "reserve_data": {}, + "tags": ["spread", "halt-timeout=4h"], + "test_data": {} + }) + @app.route('/v1/job/00000000-0000-0000-0000-000000000005/action', methods=['POST']) -def mock_action_error_1(): +def mock_action_error_5(): abort(500) +@app.route('/v1/job/00000000-0000-0000-0000-000000000005', methods=['GET']) +def mock_job_tags_5(): + return jsonify({ + "allocate_data": {}, + "allocation_timeout": 0, + "firmware_update_data": {}, + "global_timeout": 0, + "job_id": "00000000-0000-0000-0000-000000000005", + "job_queue": "queue", + "name": "name", + "output_timeout": 0, + "parent_job_id": "job", + "provision_data": {}, + "reserve_data": {}, + "tags": ["spread", "halt-timeout=4h"], + "test_data": {} + }) @app.route('/v1/job/00000000-0000-0000-0000-000000000006/action', methods=['POST']) 
-def mock_action_error_2(): +def mock_action_error_6(): abort(500) +@app.route('/v1/job/00000000-0000-0000-0000-000000000006', methods=['GET']) +def mock_job_error_6(): + abort(500) @app.route('/v1/job/00000000-0000-0000-0000-000000000007/action', methods=['POST']) -def mock_action_error_3(): +def mock_action_error_7(): + abort(500) + +@app.route('/v1/job/00000000-0000-0000-0000-000000000007', methods=['GET']) +def mock_job_error_7(): abort(500) diff --git a/tests/tesflinger/task.yaml b/tests/tesflinger/task.yaml index 886ccd86..68f9e17a 100644 --- a/tests/tesflinger/task.yaml +++ b/tests/tesflinger/task.yaml @@ -75,6 +75,10 @@ execute: | stop_mock # Garbage collection scenario + # Job 4: must be deleted (it is active and halt-timeout exceeded) + # Job 5: must not be deleted (it is active but halt-timeout not exceeded) + # Job 6: must not be deleted (it is not active) + # Job 7: must not be deleted (it is not active) start_mock 00000000-0000-0000-0000-000000000004 spread -gc &> /tmp/task.out cat /tmp/task.out | grep 'Checking 00000000-0000-0000-0000-000000000004'