fix: delete upgrade meta key from nodes after upgrades
Remove the meta key to prevent auto-rollbacks, as Omni manages upgrades & downgrades.

Closes #509.

Signed-off-by: Utku Ozdemir <[email protected]>
utkuozdemir committed Aug 26, 2024
1 parent 3f5c0f8 commit 00ae084
Showing 4 changed files with 108 additions and 6 deletions.
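The core of the change is an idempotent deletion of the upgrade META key (key 6) through the Talos API: a NotFound response is treated as "already deleted", so repeated reconciliations are harmless. The sketch below illustrates that pattern in isolation, assuming a simplified metaClient interface and a fakeClient stand-in rather than the exact types Omni uses; the real logic lives in deleteUpgradeMetaKey in the controller diff further down.

package main

import (
	"context"
	"fmt"
	"log"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// upgradeMetaKey mirrors client/pkg/meta.Upgrade: key 6 in the Talos META partition.
const upgradeMetaKey uint8 = 6

// metaClient is an illustrative stand-in for the per-machine Talos client used by the controller.
type metaClient interface {
	MetaDelete(ctx context.Context, key uint8) error
}

// deleteUpgradeMetaKey removes the upgrade META key so the node will not auto-roll back;
// a NotFound error means the key is already gone and is treated as success.
func deleteUpgradeMetaKey(ctx context.Context, c metaClient) error {
	if err := c.MetaDelete(ctx, upgradeMetaKey); err != nil {
		if status.Code(err) == codes.NotFound {
			return nil // already deleted, nothing to do
		}

		return fmt.Errorf("failed to delete upgrade META key: %w", err)
	}

	return nil
}

// fakeClient simulates a node: the first delete succeeds, later ones return NotFound.
type fakeClient struct {
	deleted map[uint8]bool
}

func (f *fakeClient) MetaDelete(_ context.Context, key uint8) error {
	if f.deleted[key] {
		return status.Error(codes.NotFound, "meta key not found")
	}

	f.deleted[key] = true

	return nil
}

func main() {
	c := &fakeClient{deleted: map[uint8]bool{}}

	// Reconciling twice is harmless: the second call hits the NotFound branch.
	for i := 0; i < 2; i++ {
		if err := deleteUpgradeMetaKey(context.Background(), c); err != nil {
			log.Fatal(err)
		}
	}

	fmt.Println("upgrade META key deleted idempotently")
}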
3 changes: 3 additions & 0 deletions client/pkg/meta/meta.go
@@ -5,6 +5,9 @@
// Package meta keeps Talos meta partition utils.
package meta

// Upgrade is github.com/siderolabs/talos/internal/pkg/meta.Upgrade.
const Upgrade = 6

// StateEncryptionConfig is github.com/siderolabs/talos/internal/pkg/meta.StateEncryptionConfig.
const StateEncryptionConfig = 9

@@ -138,9 +138,9 @@ func NewClusterMachineConfigStatusController() *ClusterMachineConfigStatusController {

// don't run the upgrade check if the running version and expected versions match
if versionMismatch && installImage.TalosVersion != "" {
- inSync, err := handler.syncInstallImageAndSchematic(ctx, configStatus, machineStatus, machineConfig, statusSnapshot, installImage)
- if err != nil {
- return err
+ inSync, syncErr := handler.syncInstallImageAndSchematic(ctx, configStatus, machineStatus, machineConfig, statusSnapshot, installImage)
+ if syncErr != nil {
+ return syncErr
}

if !inSync {
@@ -152,6 +152,13 @@ func NewClusterMachineConfigStatusController() *ClusterMachineConfigStatusController {
}
}

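// delete the upgrade META key once the machine reports BOOTING or RUNNING: Omni manages
// upgrades and downgrades itself, and removing the key prevents Talos from auto-rolling back.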
stage := statusSnapshot.TypedSpec().Value.GetMachineStatus().GetStage()
if stage == machineapi.MachineStatusEvent_BOOTING || stage == machineapi.MachineStatusEvent_RUNNING {
if err = handler.deleteUpgradeMetaKey(ctx, machineStatus, machineConfig); err != nil {
return err
}
}

shaSum := sha256.Sum256(machineConfig.TypedSpec().Value.Data)
shaSumString := hex.EncodeToString(shaSum[:])

@@ -744,6 +751,29 @@ func (h *clusterMachineConfigStatusControllerHandler) getClient(
return result, nil
}

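// deleteUpgradeMetaKey removes the upgrade META key from the node after an upgrade,
// so that Talos does not attempt an automatic rollback: Omni manages upgrades and downgrades.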
func (h *clusterMachineConfigStatusControllerHandler) deleteUpgradeMetaKey(ctx context.Context, machineStatus *omni.MachineStatus, machineConfig *omni.ClusterMachineConfig) error {
client, err := h.getClient(ctx, false, machineStatus, machineConfig)
if err != nil {
return fmt.Errorf("failed to get client for machine %q: %w", machineConfig.Metadata().ID(), err)
}

defer logClose(client, h.logger, fmt.Sprintf("machine %q", machineConfig.Metadata().ID()))

if err = client.MetaDelete(ctx, meta.Upgrade); err != nil {
if status.Code(err) == codes.NotFound {
h.logger.Debug("upgrade meta key not found", zap.String("machine", machineConfig.Metadata().ID()))

return nil
}

return err
}

h.logger.Info("deleted upgrade meta key", zap.String("machine", machineConfig.Metadata().ID()))

return nil
}

var insecureTLSConfig = &tls.Config{
InsecureSkipVerify: true,
}
@@ -21,10 +21,12 @@ import (
"github.com/siderolabs/talos/pkg/machinery/api/machine"
"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"

"github.com/siderolabs/omni/client/api/omni/specs"
"github.com/siderolabs/omni/client/pkg/constants"
"github.com/siderolabs/omni/client/pkg/meta"
"github.com/siderolabs/omni/client/pkg/omni/resources"
"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
"github.com/siderolabs/omni/client/pkg/omni/resources/siderolink"
@@ -429,11 +431,16 @@ func (suite *ClusterMachineConfigStatusSuite) TestUpgrades() {
suite.Require().NoError(err)
}

const (
expectedTalosVersion = "1.6.1"
expectedSchematicID = "cccc"
)

for _, m := range machines {
talosVersion := omni.NewClusterMachineTalosVersion(resources.DefaultNamespace, m.Metadata().ID())
_, err := safe.StateUpdateWithConflicts(suite.ctx, suite.state, talosVersion.Metadata(), func(res *omni.ClusterMachineTalosVersion) error {
- res.TypedSpec().Value.TalosVersion = "1.6.1"
- res.TypedSpec().Value.SchematicId = "cccc"
+ res.TypedSpec().Value.TalosVersion = expectedTalosVersion
+ res.TypedSpec().Value.SchematicId = expectedSchematicID

return nil
})
@@ -446,7 +453,7 @@ func (suite *ClusterMachineConfigStatusSuite) TestUpgrades() {
return retry.ExpectedErrorf("no upgrade requests received")
}

- expectedImage := "factory.talos.dev/installer/cccc:v1.6.1"
+ expectedImage := fmt.Sprintf("factory.talos.dev/installer/%s:v%s", expectedSchematicID, expectedTalosVersion)
for i, r := range requests {
if r.Image != expectedImage {
return fmt.Errorf("%d request image is invalid: expected %q got %q", i, expectedImage, r.Image)
@@ -455,6 +462,45 @@

return nil
}))

// simulate a finished upgrade by setting related resources to have expected values.
// this satisfies (versionMismatch==false), and makes the controller proceed with the post-upgrade steps.
for _, m := range machines {
cmcs := omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, m.Metadata().ID())

cmcs.TypedSpec().Value.TalosVersion = expectedTalosVersion
cmcs.TypedSpec().Value.SchematicId = expectedSchematicID

suite.Require().NoError(suite.state.Create(suite.ctx, cmcs))

_, err := safe.StateUpdateWithConflicts(suite.ctx, suite.state,
omni.NewMachineStatus(resources.DefaultNamespace, m.Metadata().ID()).Metadata(),
func(res *omni.MachineStatus) error {
res.TypedSpec().Value.TalosVersion = expectedTalosVersion
res.TypedSpec().Value.Schematic.Id = expectedSchematicID

return nil
})
suite.Require().NoError(err)
}

require.EventuallyWithT(suite.T(), func(collect *assert.CollectT) {
metaDeletes := suite.machineService.getMetaDeleteKeyToCount()

if !assert.Len(collect, metaDeletes, 1) {
return
}

count, ok := metaDeletes[meta.Upgrade]

if !assert.True(collect, ok) {
return
}

if assert.Greater(collect, count, 0) {
suite.T().Logf("upgrade meta key deletes: %v", count)
}
}, time.Second*5, 50*time.Millisecond)
}

func (suite *ClusterMachineConfigStatusSuite) TestSchematicChanges() {
23 changes: 23 additions & 0 deletions internal/backend/runtime/omni/controllers/omni/omni_test.go
@@ -9,6 +9,7 @@ import (
"context"
"errors"
"fmt"
"maps"
"net"
"os"
"path/filepath"
@@ -71,6 +72,8 @@ type machineService struct {
serviceList *machine.ServiceListResponse
etcdLeaveClusterHandler func(context.Context, *machine.EtcdLeaveClusterRequest) (*machine.EtcdLeaveClusterResponse, error)

metaDeleteKeyToCount map[uint32]int

address string
state state.State
}
@@ -89,6 +92,13 @@ func (ms *machineService) clearUpgradeRequests() {
ms.upgradeRequests = nil
}

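// getMetaDeleteKeyToCount returns a copy of the recorded META delete counts per key,
// cloned under the lock so tests can inspect it without racing the gRPC handlers.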
func (ms *machineService) getMetaDeleteKeyToCount() map[uint32]int {
ms.lock.Lock()
defer ms.lock.Unlock()

return maps.Clone(ms.metaDeleteKeyToCount)
}

func (ms *machineService) ApplyConfiguration(_ context.Context, req *machine.ApplyConfigurationRequest) (*machine.ApplyConfigurationResponse, error) {
ms.lock.Lock()
defer ms.lock.Unlock()
@@ -275,6 +285,19 @@ func (ms *machineService) ServiceList(context.Context, *emptypb.Empty) (*machine.ServiceListResponse, error) {
return ms.serviceList, nil
}

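// MetaDelete counts META key deletions so tests can assert that the controller
// removed the upgrade META key after a successful upgrade.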
func (ms *machineService) MetaDelete(_ context.Context, req *machine.MetaDeleteRequest) (*machine.MetaDeleteResponse, error) {
ms.lock.Lock()
defer ms.lock.Unlock()

if ms.metaDeleteKeyToCount == nil {
ms.metaDeleteKeyToCount = map[uint32]int{}
}

ms.metaDeleteKeyToCount[req.Key]++

return &machine.MetaDeleteResponse{}, nil
}

type OmniSuite struct { //nolint:govet
suite.Suite

