Skip to content

Commit

Permalink
send halt-timeout as a tag to allow per backend garbage collection
Browse files Browse the repository at this point in the history
  • Loading branch information
sergiocazzolato committed Feb 22, 2024
1 parent 6dfe06a commit 3f07a6b
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 7 deletions.
36 changes: 32 additions & 4 deletions spread/testflinger.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"net/url"
"os"
"regexp"
"strings"
"time"

"golang.org/x/net/context"
Expand Down Expand Up @@ -65,6 +66,11 @@ type TestFlingerJobResponse struct {
JobId string `json:"job_id"`
}

// TestFlingerJobInfoResponse receives the result of the "GET /job/<id>"
// request issued while garbage collecting jobs.
type TestFlingerJobInfoResponse struct {
	// JobId maps to the "job_id" key of the response.
	JobId string `json:"job_id"`

	// Tags maps to the "tags" key of the response. Garbage collection looks
	// here for a "halt-timeout=DURATION" entry to override the default
	// timeout of a job.
	Tags []string `json:"tags"`
}

// TestFlingerDeviceInfo holds the address of a device. Its only field,
// DeviceIP, maps to the "device_ip" JSON key.
type TestFlingerDeviceInfo struct {
	DeviceIP string `json:"device_ip"`
}
Expand Down Expand Up @@ -142,14 +148,33 @@ func (p *TestFlingerProvider) GarbageCollect() error {
// Iterate over all the running instances
for _, s := range jobs {
jobTimeout := haltTimeout
if jobTimeout == 0 {
continue
}
if s.d.JobState == CANCELLED || s.d.JobState == COMPLETE || s.d.JobState == COMPLETED {
continue
}
printf("Checking %s...", s.d.JobId)

var result TestFlingerJobInfoResponse
err := p.do("GET", "/job/" + s.d.JobId , nil, &result)
if err != nil {
return fmt.Errorf("cannot get instance info: %v", err)
}

// Use specific job timeout if a tag is set with halt-timeout=TIMEOUT
for _, tag := range result.Tags {
if strings.HasPrefix(tag, "halt-timeout=") {
value := strings.SplitAfter(tag, "=")[1]
d, err := time.ParseDuration(strings.TrimSpace(value))
if err != nil {
printf("WARNING: Ignoring bad TestFlinger job %s halt-timeout tag: %q", s.d.JobId, value)
} else {
jobTimeout = d
}
}
}
if jobTimeout == 0 {
continue
}

runningTime := now.Sub(s.d.CreatedAt)
if runningTime > jobTimeout {
printf("Job %s exceeds halt-timeout. Shutting it down...", s.d.JobId)
Expand Down Expand Up @@ -222,7 +247,10 @@ func (p *TestFlingerProvider) requestDevice(ctx context.Context, system *System)
AllocateData: TestFlingerAllocateData{
Allocate: true,
},
Tags: []string{"spread"},
// Tags attached to the job:
// 1. "spread", used to find the active spread jobs
// 2. "halt-timeout=DURATION", used to decide when a running job has to be cancelled
Tags: []string{"spread", "halt-timeout=" + p.backend.HaltTimeout.Duration.String()},
}

var jobRes TestFlingerJobResponse
Expand Down
48 changes: 45 additions & 3 deletions tests/tesflinger/fake_testflinger.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,19 +112,61 @@ def mock_action_delete():

return jsonify({})

@app.route('/v1/job/00000000-0000-0000-0000-000000000004', methods=['GET'])
def mock_job_tags_4():
    """Serve the canned job description for job ...0004.

    The payload is static; its "tags" entry carries "spread" and
    "halt-timeout=4h".
    """
    job_info = {
        "allocate_data": {},
        "allocation_timeout": 0,
        "firmware_update_data": {},
        "global_timeout": 0,
        "job_id": "00000000-0000-0000-0000-000000000004",
        "job_queue": "queue",
        "name": "name",
        "output_timeout": 0,
        "parent_job_id": "job",
        "provision_data": {},
        "reserve_data": {},
        "tags": ["spread", "halt-timeout=4h"],
        "test_data": {},
    }
    return jsonify(job_info)


@app.route('/v1/job/00000000-0000-0000-0000-000000000005/action', methods=['POST'])
def mock_action_error_1():
def mock_action_error_5():
abort(500)

@app.route('/v1/job/00000000-0000-0000-0000-000000000005', methods=['GET'])
def mock_job_tags_5():
    """Serve the canned job description for job ...0005.

    The payload is static; its "tags" entry carries "spread" and
    "halt-timeout=4h".
    """
    job_info = {
        "allocate_data": {},
        "allocation_timeout": 0,
        "firmware_update_data": {},
        "global_timeout": 0,
        "job_id": "00000000-0000-0000-0000-000000000005",
        "job_queue": "queue",
        "name": "name",
        "output_timeout": 0,
        "parent_job_id": "job",
        "provision_data": {},
        "reserve_data": {},
        "tags": ["spread", "halt-timeout=4h"],
        "test_data": {},
    }
    return jsonify(job_info)

@app.route('/v1/job/00000000-0000-0000-0000-000000000006/action', methods=['POST'])
def mock_action_error_2():
def mock_action_error_6():
abort(500)

@app.route('/v1/job/00000000-0000-0000-0000-000000000006', methods=['GET'])
def mock_job_error_6():
    """Answer the info request for job ...0006 by aborting with status 500."""
    abort(500)

@app.route('/v1/job/00000000-0000-0000-0000-000000000007/action', methods=['POST'])
def mock_action_error_3():
def mock_action_error_7():
abort(500)

@app.route('/v1/job/00000000-0000-0000-0000-000000000007', methods=['GET'])
def mock_job_error_7():
    """Answer the info request for job ...0007 by aborting with status 500."""
    abort(500)


Expand Down
4 changes: 4 additions & 0 deletions tests/tesflinger/task.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ execute: |
stop_mock
# Garbage collection scenario
# Job 4: must be deleted (it is active and its halt-timeout is exceeded)
# Job 5: must not be deleted (it is active but its halt-timeout is not exceeded)
# Job 6: must not be deleted (it is not active)
# Job 7: must not be deleted (it is not active)
start_mock 00000000-0000-0000-0000-000000000004
spread -gc &> /tmp/task.out
cat /tmp/task.out | grep 'Checking 00000000-0000-0000-0000-000000000004'
Expand Down

0 comments on commit 3f07a6b

Please sign in to comment.