diff --git a/client/pkg/meta/meta.go b/client/pkg/meta/meta.go index e47235e3..0e03ac31 100644 --- a/client/pkg/meta/meta.go +++ b/client/pkg/meta/meta.go @@ -5,6 +5,9 @@ // Package meta keeps Talos meta partition utils. package meta +// Upgrade is github.com/siderolabs/talos/internal/pkg/meta.Upgrade. +const Upgrade = 6 + // StateEncryptionConfig is github.com/siderolabs/talos/internal/pkg/meta.StateEncryptionConfig. const StateEncryptionConfig = 9 diff --git a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status.go b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status.go index c5cd8b67..d57765e7 100644 --- a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status.go +++ b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status.go @@ -138,9 +138,9 @@ func NewClusterMachineConfigStatusController() *ClusterMachineConfigStatusContro // don't run the upgrade check if the running version and expected versions match if versionMismatch && installImage.TalosVersion != "" { - inSync, err := handler.syncInstallImageAndSchematic(ctx, configStatus, machineStatus, machineConfig, statusSnapshot, installImage) - if err != nil { - return err + inSync, syncErr := handler.syncInstallImageAndSchematic(ctx, configStatus, machineStatus, machineConfig, statusSnapshot, installImage) + if syncErr != nil { + return syncErr } if !inSync { @@ -152,6 +152,13 @@ func NewClusterMachineConfigStatusController() *ClusterMachineConfigStatusContro } } + stage := statusSnapshot.TypedSpec().Value.GetMachineStatus().GetStage() + if stage == machineapi.MachineStatusEvent_BOOTING || stage == machineapi.MachineStatusEvent_RUNNING { + if err = handler.deleteUpgradeMetaKey(ctx, machineStatus, machineConfig); err != nil { + return err + } + } + shaSum := sha256.Sum256(machineConfig.TypedSpec().Value.Data) shaSumString := hex.EncodeToString(shaSum[:]) @@ -744,6 +751,29 @@ func (h *clusterMachineConfigStatusControllerHandler) getClient( return result, nil } +func (h *clusterMachineConfigStatusControllerHandler) deleteUpgradeMetaKey(ctx context.Context, machineStatus *omni.MachineStatus, machineConfig *omni.ClusterMachineConfig) error { + client, err := h.getClient(ctx, false, machineStatus, machineConfig) + if err != nil { + return fmt.Errorf("failed to get client for machine %q: %w", machineConfig.Metadata().ID(), err) + } + + defer logClose(client, h.logger, fmt.Sprintf("machine %q", machineConfig.Metadata().ID())) + + if err = client.MetaDelete(ctx, meta.Upgrade); err != nil { + if status.Code(err) == codes.NotFound { + h.logger.Debug("upgrade meta key not found", zap.String("machine", machineConfig.Metadata().ID())) + + return nil + } + + return err + } + + h.logger.Info("deleted upgrade meta key", zap.String("machine", machineConfig.Metadata().ID())) + + return nil +} + var insecureTLSConfig = &tls.Config{ InsecureSkipVerify: true, } diff --git a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status_test.go b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status_test.go index 007405bb..e8485719 100644 --- a/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status_test.go +++ b/internal/backend/runtime/omni/controllers/omni/cluster_machine_config_status_test.go @@ -21,10 +21,12 @@ import ( "github.com/siderolabs/talos/pkg/machinery/api/machine" "github.com/siderolabs/talos/pkg/machinery/resources/runtime" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" "github.com/siderolabs/omni/client/api/omni/specs" "github.com/siderolabs/omni/client/pkg/constants" + "github.com/siderolabs/omni/client/pkg/meta" "github.com/siderolabs/omni/client/pkg/omni/resources" "github.com/siderolabs/omni/client/pkg/omni/resources/omni" "github.com/siderolabs/omni/client/pkg/omni/resources/siderolink" @@ -429,11 +431,16 @@ func (suite *ClusterMachineConfigStatusSuite) TestUpgrades() { suite.Require().NoError(err) } + const ( + expectedTalosVersion = "1.6.1" + expectedSchematicID = "cccc" + ) + for _, m := range machines { talosVersion := omni.NewClusterMachineTalosVersion(resources.DefaultNamespace, m.Metadata().ID()) _, err := safe.StateUpdateWithConflicts(suite.ctx, suite.state, talosVersion.Metadata(), func(res *omni.ClusterMachineTalosVersion) error { - res.TypedSpec().Value.TalosVersion = "1.6.1" - res.TypedSpec().Value.SchematicId = "cccc" + res.TypedSpec().Value.TalosVersion = expectedTalosVersion + res.TypedSpec().Value.SchematicId = expectedSchematicID return nil }) @@ -446,7 +453,7 @@ func (suite *ClusterMachineConfigStatusSuite) TestUpgrades() { return retry.ExpectedErrorf("no upgrade requests received") } - expectedImage := "factory.talos.dev/installer/cccc:v1.6.1" + expectedImage := fmt.Sprintf("factory.talos.dev/installer/%s:v%s", expectedSchematicID, expectedTalosVersion) for i, r := range requests { if r.Image != expectedImage { return fmt.Errorf("%d request image is invalid: expected %q got %q", i, expectedImage, r.Image) @@ -455,6 +462,45 @@ func (suite *ClusterMachineConfigStatusSuite) TestUpgrades() { return nil })) + + // simulate a finished upgrade by setting related resources to have expected values. + // this satisfies (versionMismatch==false), and makes the controller proceed with the post-upgrade steps. + for _, m := range machines { + cmcs := omni.NewClusterMachineConfigStatus(resources.DefaultNamespace, m.Metadata().ID()) + + cmcs.TypedSpec().Value.TalosVersion = expectedTalosVersion + cmcs.TypedSpec().Value.SchematicId = expectedSchematicID + + suite.Require().NoError(suite.state.Create(suite.ctx, cmcs)) + + _, err := safe.StateUpdateWithConflicts(suite.ctx, suite.state, + omni.NewMachineStatus(resources.DefaultNamespace, m.Metadata().ID()).Metadata(), + func(res *omni.MachineStatus) error { + res.TypedSpec().Value.TalosVersion = expectedTalosVersion + res.TypedSpec().Value.Schematic.Id = expectedSchematicID + + return nil + }) + suite.Require().NoError(err) + } + + require.EventuallyWithT(suite.T(), func(collect *assert.CollectT) { + metaDeletes := suite.machineService.getMetaDeleteKeyToCount() + + if !assert.Len(collect, metaDeletes, 1) { + return + } + + count, ok := metaDeletes[meta.Upgrade] + + if !assert.True(collect, ok) { + return + } + + if assert.Greater(collect, count, 0) { + suite.T().Logf("upgrade meta key deletes: %v", count) + } + }, time.Second*5, 50*time.Millisecond) } func (suite *ClusterMachineConfigStatusSuite) TestSchematicChanges() { diff --git a/internal/backend/runtime/omni/controllers/omni/omni_test.go b/internal/backend/runtime/omni/controllers/omni/omni_test.go index e869e36f..c03c06f1 100644 --- a/internal/backend/runtime/omni/controllers/omni/omni_test.go +++ b/internal/backend/runtime/omni/controllers/omni/omni_test.go @@ -9,6 +9,7 @@ import ( "context" "errors" "fmt" + "maps" "net" "os" "path/filepath" @@ -71,6 +72,8 @@ type machineService struct { serviceList *machine.ServiceListResponse etcdLeaveClusterHandler func(context.Context, *machine.EtcdLeaveClusterRequest) (*machine.EtcdLeaveClusterResponse, error) + metaDeleteKeyToCount map[uint32]int + address string state state.State } @@ -89,6 +92,13 @@ func (ms *machineService) clearUpgradeRequests() { ms.upgradeRequests = nil } +func (ms *machineService) getMetaDeleteKeyToCount() map[uint32]int { + ms.lock.Lock() + defer ms.lock.Unlock() + + return maps.Clone(ms.metaDeleteKeyToCount) +} + func (ms *machineService) ApplyConfiguration(_ context.Context, req *machine.ApplyConfigurationRequest) (*machine.ApplyConfigurationResponse, error) { ms.lock.Lock() defer ms.lock.Unlock() @@ -275,6 +285,19 @@ func (ms *machineService) ServiceList(context.Context, *emptypb.Empty) (*machine return ms.serviceList, nil } +func (ms *machineService) MetaDelete(_ context.Context, req *machine.MetaDeleteRequest) (*machine.MetaDeleteResponse, error) { + ms.lock.Lock() + defer ms.lock.Unlock() + + if ms.metaDeleteKeyToCount == nil { + ms.metaDeleteKeyToCount = map[uint32]int{} + } + + ms.metaDeleteKeyToCount[req.Key]++ + + return &machine.MetaDeleteResponse{}, nil +} + type OmniSuite struct { //nolint:govet suite.Suite