diff --git a/e2e/fixtures/factory.go b/e2e/fixtures/factory.go index 265f169d3..a1904ab10 100644 --- a/e2e/fixtures/factory.go +++ b/e2e/fixtures/factory.go @@ -165,10 +165,19 @@ func (factory *Factory) CreateFdbClusterFromSpec( config *ClusterConfig, options ...ClusterOption, ) *FdbCluster { + startTime := time.Now() config.SetDefaults(factory) log.Printf("create cluster: %s", ToJSON(spec)) - return factory.startFDBFromClusterSpec(spec, config, options...) + cluster := factory.startFDBFromClusterSpec(spec, config, options...) + log.Println( + "FoundationDB cluster created (at version", + cluster.cluster.Spec.Version, + ") in minutes", + time.Since(startTime).Minutes(), + ) + + return cluster } // CreateFdbHaCluster creates a HA FDB Cluster based on the cluster config and cluster options @@ -176,6 +185,7 @@ func (factory *Factory) CreateFdbHaCluster( config *ClusterConfig, options ...ClusterOption, ) *HaFdbCluster { + startTime := time.Now() config.SetDefaults(factory) cluster, err := factory.ensureHAFdbClusterExists( @@ -183,6 +193,13 @@ func (factory *Factory) CreateFdbHaCluster( options, ) + log.Println( + "FoundationDB HA cluster created (at version", + cluster.GetPrimary().cluster.Spec.Version, + ") in minutes", + time.Since(startTime).Minutes(), + ) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) return cluster diff --git a/e2e/test_operator_ha_flaky_upgrades/operator_ha_flaky_upgrade_test.go b/e2e/test_operator_ha_flaky_upgrades/operator_ha_flaky_upgrade_test.go new file mode 100644 index 000000000..3f263b030 --- /dev/null +++ b/e2e/test_operator_ha_flaky_upgrades/operator_ha_flaky_upgrade_test.go @@ -0,0 +1,158 @@ +/* + * operator_ha_flaky_upgrades_test.go + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2023 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package operatorhaflakyupgrades + +/* +This test suite includes tests to validate the behaviour of the operator during upgrades on a HA FoundationDB cluster. +The executed tests include a base test without any chaos/faults. +Each test will create a new HA FoundationDB cluster which will be upgraded. +*/ + +import ( + "log" + "time" + + fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2" + "github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures" + chaosmesh "github.com/chaos-mesh/chaos-mesh/api/v1alpha1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func init() { + testOptions = fixtures.InitFlags() +} + +var ( + factory *fixtures.Factory + fdbCluster *fixtures.HaFdbCluster + testOptions *fixtures.FactoryOptions +) + +var _ = AfterSuite(func() { + if CurrentSpecReport().Failed() { + log.Printf("failed due to %s", CurrentSpecReport().FailureMessage()) + } +}) + +func clusterSetupWithHealthCheckOption(beforeVersion string, enableOperatorPodChaos bool, enableHealthCheck bool) { + // We set the before version here to overwrite the before version from the specific flag + // the specific flag will be removed in the future. 
+ factory.SetBeforeVersion(beforeVersion) + + fdbCluster = factory.CreateFdbHaCluster( + fixtures.DefaultClusterConfigWithHaMode(fixtures.HaFourZoneSingleSat, false), + factory.GetClusterOptions(fixtures.UseVersionBeforeUpgrade)..., + ) + if enableHealthCheck { + Expect( + fdbCluster.GetPrimary().InvariantClusterStatusAvailableWithThreshold(15 * time.Second), + ).ShouldNot(HaveOccurred()) + } + + if enableOperatorPodChaos && factory.ChaosTestsEnabled() { + for _, curCluster := range fdbCluster.GetAllClusters() { + factory.ScheduleInjectPodKill( + fixtures.GetOperatorSelector(curCluster.Namespace()), + "*/5 * * * *", + chaosmesh.OneMode, + ) + } + } +} + +func clusterSetup(beforeVersion string, enableOperatorPodChaos bool) { + clusterSetupWithHealthCheckOption(beforeVersion, enableOperatorPodChaos, true) +} + +// Checks if cluster is running at the expectedVersion. This is done by checking the status of the FoundationDBCluster status. +// Before that we checked the cluster status json by checking the reported version of all processes. This approach only worked for +// version compatible upgrades, since incompatible processes won't be part of the cluster anyway. To simplify the check +// we verify the reported running version from the operator. 
+func checkVersion(cluster *fixtures.HaFdbCluster, expectedVersion string) { + Eventually(func() bool { + for _, singleCluster := range cluster.GetAllClusters() { + if singleCluster.GetCluster().Status.RunningVersion != expectedVersion { + return false + } + } + + return true + }).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(BeTrue()) +} + +var _ = Describe("Operator HA Upgrades", Label("e2e", "nightly"), func() { + BeforeEach(func() { + factory = fixtures.CreateFactory(testOptions) + }) + + AfterEach(func() { + if CurrentSpecReport().Failed() { + fdbCluster.DumpState() + } + factory.Shutdown() + }) + + // https://github.com/FoundationDB/fdb-kubernetes-operator/issues/172, debug why this test is flaky and how + // to make it stable. + DescribeTable( + "when no remote processes are restarted", + func(beforeVersion string, targetVersion string) { + clusterSetup(beforeVersion, false) + + // Select remote processes and use the buggify option to skip those + // processes during the restart command. + remoteProcessGroups := fdbCluster.GetRemote().GetCluster().Status.ProcessGroups + ignoreDuringRestart := make( + []fdbv1beta2.ProcessGroupID, + 0, + len(remoteProcessGroups), + ) + + for _, processGroup := range remoteProcessGroups { + ignoreDuringRestart = append( + ignoreDuringRestart, + processGroup.ProcessGroupID, + ) + } + + log.Println( + "Selected Process Groups:", + ignoreDuringRestart, + "to be skipped during the restart", + ) + + // We have to set this to all clusters as any operator could be doing the cluster wide restart. + for _, cluster := range fdbCluster.GetAllClusters() { + cluster.SetIgnoreDuringRestart(ignoreDuringRestart) + } + + // The cluster should still be able to upgrade. 
+ Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred()) + // Verify that the upgrade proceeds + checkVersion(fdbCluster, targetVersion) + + // TODO add validation here processes are updated new version + }, + EntryDescription("Upgrade from %[1]s to %[2]s"), + fixtures.GenerateUpgradeTableEntries(testOptions), + ) +}) diff --git a/e2e/test_operator_ha_flaky_upgrades/suite_test.go b/e2e/test_operator_ha_flaky_upgrades/suite_test.go new file mode 100644 index 000000000..bdf489d71 --- /dev/null +++ b/e2e/test_operator_ha_flaky_upgrades/suite_test.go @@ -0,0 +1,34 @@ +/* + * suite_test.go + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2023 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package operatorhaflakyupgrades + +import ( + "testing" + "time" + + "github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures" + . 
"github.com/onsi/gomega" +) + +func TestOperatorHaUpgrade(t *testing.T) { + SetDefaultEventuallyTimeout(3 * time.Minute) + fixtures.RunGinkgoTests(t, "FDB Operator HA Upgrade Test Suite") +} diff --git a/e2e/test_operator_ha_upgrades/operator_ha_upgrade_test.go b/e2e/test_operator_ha_upgrades/operator_ha_upgrade_test.go index 6a300ee3e..e879dc1ee 100644 --- a/e2e/test_operator_ha_upgrades/operator_ha_upgrade_test.go +++ b/e2e/test_operator_ha_upgrades/operator_ha_upgrade_test.go @@ -62,7 +62,6 @@ func clusterSetupWithHealthCheckOption(beforeVersion string, enableOperatorPodCh // We set the before version here to overwrite the before version from the specific flag // the specific flag will be removed in the future. factory.SetBeforeVersion(beforeVersion) - startTime := time.Now() fdbCluster = factory.CreateFdbHaCluster( fixtures.DefaultClusterConfigWithHaMode(fixtures.HaFourZoneSingleSat, false), factory.GetClusterOptions(fixtures.UseVersionBeforeUpgrade)..., @@ -73,13 +72,6 @@ func clusterSetupWithHealthCheckOption(beforeVersion string, enableOperatorPodCh ).ShouldNot(HaveOccurred()) } - log.Println( - "FoundationDB HA cluster created (at version", - beforeVersion, - ") in minutes", - time.Since(startTime).Minutes(), - ) - if enableOperatorPodChaos && factory.ChaosTestsEnabled() { for _, curCluster := range fdbCluster.GetAllClusters() { factory.ScheduleInjectPodKill( @@ -524,47 +516,4 @@ var _ = Describe("Operator HA Upgrades", Label("e2e", "pr"), func() { EntryDescription("Upgrade from %[1]s to %[2]s"), fixtures.GenerateUpgradeTableEntries(testOptions), ) - - DescribeTable( - "when no remote processes are restarted", - func(beforeVersion string, targetVersion string) { - clusterSetup(beforeVersion, false) - - // Select remote processes and use the buggify option to skip those - // processes during the restart command. 
- remoteProcessGroups := fdbCluster.GetRemote().GetCluster().Status.ProcessGroups - ignoreDuringRestart := make( - []fdbv1beta2.ProcessGroupID, - 0, - len(remoteProcessGroups), - ) - - for _, processGroup := range remoteProcessGroups { - ignoreDuringRestart = append( - ignoreDuringRestart, - processGroup.ProcessGroupID, - ) - } - - log.Println( - "Selected Process Groups:", - ignoreDuringRestart, - "to be skipped during the restart", - ) - - // We have to set this to all clusters as any operator could be doing the cluster wide restart. - for _, cluster := range fdbCluster.GetAllClusters() { - cluster.SetIgnoreDuringRestart(ignoreDuringRestart) - } - - // The cluster should still be able to upgrade. - Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred()) - // Verify that the upgrade proceeds - checkVersion(fdbCluster, targetVersion) - - // TODO add validation here processes are updated new version - }, - EntryDescription("Upgrade from %[1]s to %[2]s"), - fixtures.GenerateUpgradeTableEntries(testOptions), - ) }) diff --git a/e2e/test_operator_upgrades/operator_upgrades_test.go b/e2e/test_operator_upgrades/operator_upgrades_test.go index c07e5b658..dda041ad2 100644 --- a/e2e/test_operator_upgrades/operator_upgrades_test.go +++ b/e2e/test_operator_upgrades/operator_upgrades_test.go @@ -28,7 +28,6 @@ Since FoundationDB is version incompatible for major and minor versions and the */ import ( - "fmt" "log" "strings" "time" @@ -37,7 +36,6 @@ import ( fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2" "github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures" - chaosmesh "github.com/chaos-mesh/chaos-mesh/api/v1alpha1" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -98,126 +96,6 @@ var _ = Describe("Operator Upgrades", Label("e2e", "pr"), func() { // Ginkgo lacks the support for AfterEach and BeforeEach in tables, so we have to put everything inside the testing function // this setup allows to dynamically generate the table entries that will be executed e.g. to test different upgrades // for different versions without hard coding or having multiple flags. - DescribeTable( - "upgrading a cluster with a partitioned pod", - func(beforeVersion string, targetVersion string) { - if !factory.ChaosTestsEnabled() { - Skip("chaos mesh is disabled") - } - - clusterSetup(beforeVersion, true) - Expect(fdbCluster.SetAutoReplacements(false, 20*time.Minute)).ToNot(HaveOccurred()) - // Ensure the operator is not skipping the process because it's missing for to long - Expect( - fdbCluster.SetIgnoreMissingProcessesSeconds(5 * time.Minute), - ).NotTo(HaveOccurred()) - - // 1. Introduce network partition b/w Pod and cluster. - // Inject chaos only to storage Pods to reduce the risk of a long recovery because a transaction - // system Pod was partitioned. The test still has the same checks that will be performed. - // Once we have a better availability check for upgrades we can target all pods again. - partitionedPod := fixtures.ChooseRandomPod(fdbCluster.GetStoragePods()) - log.Println("Injecting network partition to pod: ", partitionedPod.Name) - exp := factory.InjectPartitionBetween( - fixtures.PodSelector(partitionedPod), - chaosmesh.PodSelectorSpec{ - GenericSelectorSpec: chaosmesh.GenericSelectorSpec{ - Namespaces: []string{partitionedPod.Namespace}, - LabelSelectors: fdbCluster.GetCachedCluster().GetMatchLabels(), - }, - }, - ) - - Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred()) - - if !fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) { - // 2. Until we remove the partition, cluster should not have - // upgraded. Keep checking for 2m. 
- // - // This is only true for version incompatible upgrades. - Consistently(func() bool { - return fdbCluster.GetCluster().Status.RunningVersion == beforeVersion - }).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(BeTrue()) - } - - // 3. Delete the partition, and the upgrade should proceed. - log.Println("deleting chaos experiment and cluster should upgrade") - factory.DeleteChaosMeshExperimentSafe(exp) - fdbCluster.VerifyVersion(targetVersion) - }, - - EntryDescription("Upgrade from %[1]s to %[2]s with partitioned Pod"), - fixtures.GenerateUpgradeTableEntries(testOptions), - ) - - DescribeTable( - "with a partitioned pod which eventually gets replaced", - func(beforeVersion string, targetVersion string) { - if !factory.ChaosTestsEnabled() { - Skip("chaos mesh is disabled") - } - - clusterSetup(beforeVersion, true) - Expect(fdbCluster.SetAutoReplacements(false, 5*time.Minute)).ToNot(HaveOccurred()) - // Ensure the operator is not skipping the process because it's missing for to long - Expect( - fdbCluster.SetIgnoreMissingProcessesSeconds(5 * time.Minute), - ).NotTo(HaveOccurred()) - - // 1. Introduce network partition b/w Pod and cluster. - partitionedPod := fixtures.ChooseRandomPod(fdbCluster.GetStoragePods()) - log.Println("Injecting network partition to pod: ", partitionedPod.Name) - _ = factory.InjectPartitionBetween( - fixtures.PodSelector(partitionedPod), - chaosmesh.PodSelectorSpec{ - GenericSelectorSpec: chaosmesh.GenericSelectorSpec{ - Namespaces: []string{partitionedPod.Namespace}, - LabelSelectors: fdbCluster.GetCachedCluster().GetMatchLabels(), - }, - }, - ) - - Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred()) - - if !fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) { - // 2. Until we remove the partition, cluster should not have upgraded. Keep checking for 2m. - // - // This is only true for version incompatible upgrades. 
- Consistently(func() bool { - return fdbCluster.GetCluster().Status.RunningVersion == beforeVersion - }).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(BeTrue()) - } else { - // Simulate the 2 minute wait otherwise the auto replacement might not succeed in time. - time.Sleep(2 * time.Minute) - } - - // Enable the auto replacement feature again, but don't wait for reconciliation, otherwise we wait - // until the whole cluster is upgraded. - Expect( - fdbCluster.SetAutoReplacementsWithWait(true, 3*time.Minute, false), - ).ToNot(HaveOccurred()) - - // In the case of a version compatible upgrade the operator will proceed with recreating the storage Pods - // to make sure they use the new version. At the same time all transaction processes are marked for removal - // in order to bring up a new set of transaction processes with the new version, this prevents the operator - // from doing the actual replacement of the partitioned Pod. If th partitioned Pod gets recreated by the operator - // the injected partition from chaos-mesh is lost and therefore the process is reporting to the cluster again. - if !fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) { - log.Println("waiting for pod removal:", partitionedPod.Name) - Expect(fdbCluster.WaitForPodRemoval(partitionedPod)).ShouldNot(HaveOccurred()) - log.Println("pod removed:", partitionedPod.Name) - } - - // 3. Upgrade should proceed without removing the partition, as Pod will be replaced. 
- fdbCluster.VerifyVersion(targetVersion) - }, - - EntryDescription( - "Upgrade from %[1]s to %[2]s", - ), - fixtures.GenerateUpgradeTableEntries(testOptions), - ) - DescribeTable( "upgrading a cluster with a random Pod deleted during rolling bounce phase", func(beforeVersion string, targetVersion string) { @@ -501,52 +379,6 @@ var _ = Describe("Operator Upgrades", Label("e2e", "pr"), func() { fixtures.GenerateUpgradeTableEntries(testOptions), ) - DescribeTable( - "upgrading a cluster with link that drops some packets", - func(beforeVersion string, targetVersion string) { - if !factory.ChaosTestsEnabled() { - Skip("chaos mesh is disabled") - } - - clusterSetup(beforeVersion, true) - - // 1. Introduce packet loss b/w pods. - log.Println("Injecting packet loss b/w pod") - allPods := fdbCluster.GetAllPods() - - pickedPods := fixtures.RandomPickPod(allPods.Items, len(allPods.Items)/5) - Expect(pickedPods).ToNot(BeEmpty()) - for _, pod := range pickedPods { - log.Println("Picked Pod", pod.Name) - } - - factory.InjectNetworkLoss("20", fixtures.PodsSelector(pickedPods), - chaosmesh.PodSelectorSpec{ - GenericSelectorSpec: chaosmesh.GenericSelectorSpec{ - Namespaces: []string{fdbCluster.Namespace()}, - LabelSelectors: fdbCluster.GetCachedCluster().GetMatchLabels(), - }, - }, chaosmesh.Both) - - // Also inject network loss b/w operator pods and cluster pods. - operatorPods := factory.GetOperatorPods(fdbCluster.Namespace()) - factory.InjectNetworkLoss( - "20", - fixtures.PodsSelector(allPods.Items), - fixtures.PodsSelector(operatorPods.Items), - chaosmesh.Both) - - // 2. Start cluster upgrade. - log.Println("Starting cluster upgrade.") - Expect(fdbCluster.UpgradeCluster(targetVersion, true)).NotTo(HaveOccurred()) - - // 3. Upgrade should finish. 
- fdbCluster.VerifyVersion(targetVersion) - }, - EntryDescription("Upgrade from %[1]s to %[2]s with network link that drops some packets"), - fixtures.GenerateUpgradeTableEntries(testOptions), - ) - DescribeTable( "upgrading a cluster and no coordinator is restarted", func(beforeVersion string, targetVersion string) { @@ -597,226 +429,6 @@ var _ = Describe("Operator Upgrades", Label("e2e", "pr"), func() { fixtures.GenerateUpgradeTableEntries(testOptions), ) - DescribeTable( - "upgrading a cluster and one process has the fdbmonitor.conf file not ready", - func(beforeVersion string, targetVersion string) { - if fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) { - Skip("this test only affects version incompatible upgrades") - } - - clusterSetup(beforeVersion, true) - - // Update the cluster version. - Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred()) - // Skip the reonciliation here to have time to stage everything. - Expect(fdbCluster.SetSkipReconciliation(true)).NotTo(HaveOccurred()) - - // Select one Pod, this Pod will mount the fdbmonitor config file as read-only. - // This should block the upgrade. - faultyPod := fixtures.RandomPickOnePod(fdbCluster.GetPods().Items) - - // We have to update the sidecar before the operator is doing it. If we don't do this here the operator - // will update the sidecar and then the I/O chaos will be gone. So we prepare the faulty Pod to already - // be using the new sidecar image and then we inject IO chaos. 
- sidecarImage := fdbCluster.GetSidecarImageForVersion(targetVersion) - fdbCluster.UpdateContainerImage(&faultyPod, fdbv1beta2.SidecarContainerName, sidecarImage) - - Eventually(func() bool { - pod := fdbCluster.GetPod(faultyPod.Name) - - for _, status := range pod.Status.ContainerStatuses { - if status.Name != fdbv1beta2.SidecarContainerName { - continue - } - - log.Println("expected", sidecarImage, "got", status.Image) - return status.Image == sidecarImage - } - - return false - }).WithTimeout(10 * time.Minute).WithPolling(5 * time.Second).MustPassRepeatedly(5).Should(BeTrue()) - - log.Println("Inject IO chaos to", faultyPod.Name) - // Ensure that the fdbmonitor config file is not writeable for the sidecar. - exp := factory.InjectDiskFailureWithPath( - fixtures.PodSelector(&faultyPod), - "/var/output-files", - "/var/output-files/fdbmonitor.conf", - []chaosmesh.IoMethod{ - chaosmesh.Write, - chaosmesh.Read, - chaosmesh.Open, - chaosmesh.Flush, - chaosmesh.Fsync, - }, - []string{ - fdbv1beta2.SidecarContainerName, - }) - - // Make sure the sidecar is not able to write the fdbmonitor config. - Eventually(func() error { - stdout, stderr, err := fdbCluster.ExecuteCmdOnPod( - faultyPod, - fdbv1beta2.SidecarContainerName, - "cat /var/output-files/fdbmonitor.conf && echo '\n' >> /var/output-files/fdbmonitor.conf", - false) - - log.Println(stdout, stderr) - - return err - }).WithTimeout(5 * time.Minute).WithPolling(5 * time.Second).Should(HaveOccurred()) - - // Now we have the faulty Pod prepared with I/O chaos injected, so we can continue with the upgrade. - Expect(fdbCluster.SetSkipReconciliation(false)).NotTo(HaveOccurred()) - - // The cluster will be stuck in this state until the I/O chaos is resolved. 
- expectedConditions := map[fdbv1beta2.ProcessGroupConditionType]bool{ - fdbv1beta2.IncorrectConfigMap: true, - fdbv1beta2.IncorrectCommandLine: true, - } - faultyProcessGroupID := fixtures.GetProcessGroupID(faultyPod) - - // The upgrade will be stuck until the I/O chaos is removed and the sidecar is able to provide the latest - // fdbmonitor conf file. - Eventually(func() bool { - cluster := fdbCluster.GetCluster() - for _, processGroup := range cluster.Status.ProcessGroups { - if processGroup.ProcessGroupID != faultyProcessGroupID { - continue - } - - for _, condition := range processGroup.ProcessGroupConditions { - log.Println(processGroup.ProcessGroupID, string(condition.ProcessGroupConditionType)) - } - - if !processGroup.MatchesConditions(expectedConditions) { - return false - } - } - - return true - }).WithTimeout(5 * time.Minute).WithPolling(2 * time.Second).MustPassRepeatedly(30).Should(BeTrue()) - - // Make sure the cluster was not upgraded yet. - cluster := fdbCluster.GetCluster() - Expect(cluster.Spec.Version).NotTo(Equal(cluster.Status.RunningVersion)) - - // Remove the IO chaos, the cluster should proceed. - factory.DeleteChaosMeshExperimentSafe(exp) - - // Ensure the upgrade proceeds and is able to finish. - fdbCluster.VerifyVersion(targetVersion) - }, - EntryDescription("Upgrade from %[1]s to %[2]s and one process has the fdbmonitor.conf file not ready"), - fixtures.GenerateUpgradeTableEntries(testOptions), - ) - - DescribeTable( - "upgrading a cluster and one process is missing the new binary", - func(beforeVersion string, targetVersion string) { - if fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) { - Skip("this test only affects version incompatible upgrades") - } - - clusterSetup(beforeVersion, true) - - // Update the cluster version. - Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred()) - // Skip the reonciliation here to have time to stage everything. 
- Expect(fdbCluster.SetSkipReconciliation(true)).NotTo(HaveOccurred()) - - // Select one Pod, this Pod will miss the new fdbserver binary. - // This should block the upgrade. - faultyPod := fixtures.RandomPickOnePod(fdbCluster.GetPods().Items) - - // We have to update the sidecar before the operator is doing it. If we don't do this here the operator - // will update the sidecar and then the sidecar will copy the binaries at start-up. So we prepare the faulty Pod to already - // be using the new sidecar image and then we delete he new fdbserver binary. - sidecarImage := fdbCluster.GetSidecarImageForVersion(targetVersion) - fdbCluster.UpdateContainerImage(&faultyPod, fdbv1beta2.SidecarContainerName, sidecarImage) - - Eventually(func() bool { - pod := fdbCluster.GetPod(faultyPod.Name) - - for _, status := range pod.Status.ContainerStatuses { - if status.Name != fdbv1beta2.SidecarContainerName { - continue - } - - log.Println("expected", sidecarImage, "got", status.Image) - return status.Image == sidecarImage - } - - return false - }).WithTimeout(10 * time.Minute).WithPolling(5 * time.Second).MustPassRepeatedly(5).Should(BeTrue()) - - // Ensure that the new fdbserver binary is deleted by the sidecar. - fdbserverBinary := fmt.Sprintf("/var/output-files/bin/%s/fdbserver", targetVersion) - log.Println("Delete", fdbserverBinary, "from", faultyPod.Name) - - // Make sure the sidecar is missing the fdbserver binary. - Eventually(func() error { - _, _, err := fdbCluster.ExecuteCmdOnPod( - faultyPod, - fdbv1beta2.SidecarContainerName, - fmt.Sprintf("rm -f %s", fdbserverBinary), - false) - - return err - }).WithTimeout(5 * time.Minute).WithPolling(5 * time.Second).ShouldNot(HaveOccurred()) - - // Now we have the faulty Pod prepared, so we can continue with the upgrade. - Expect(fdbCluster.SetSkipReconciliation(false)).NotTo(HaveOccurred()) - - // The cluster will be stuck in this state until the Pod is restarted and the new binary is present. 
- expectedConditions := map[fdbv1beta2.ProcessGroupConditionType]bool{ - fdbv1beta2.IncorrectConfigMap: true, - fdbv1beta2.IncorrectCommandLine: true, - } - faultyProcessGroupID := fixtures.GetProcessGroupID(faultyPod) - - // The upgrade will be stuck until the new fdbserver binary is copied to the shared directory again. - Eventually(func() bool { - cluster := fdbCluster.GetCluster() - for _, processGroup := range cluster.Status.ProcessGroups { - if processGroup.ProcessGroupID != faultyProcessGroupID { - continue - } - - for _, condition := range processGroup.ProcessGroupConditions { - log.Println(processGroup.ProcessGroupID, string(condition.ProcessGroupConditionType)) - } - - if !processGroup.MatchesConditions(expectedConditions) { - return false - } - } - - return true - }).WithTimeout(5 * time.Minute).WithPolling(2 * time.Second).MustPassRepeatedly(30).Should(BeTrue()) - - // Make sure the cluster was not upgraded yet. - cluster := fdbCluster.GetCluster() - Expect(cluster.Spec.Version).NotTo(Equal(cluster.Status.RunningVersion)) - - // Ensure the binary is present in the shared folder. - Eventually(func() error { - _, _, err := fdbCluster.ExecuteCmdOnPod( - faultyPod, - fdbv1beta2.SidecarContainerName, - fmt.Sprintf("cp -f /usr/bin/fdbserver %s", fdbserverBinary), - false) - - return err - }).WithTimeout(5 * time.Minute).WithPolling(5 * time.Second).ShouldNot(HaveOccurred()) - - // Ensure the upgrade proceeds and is able to finish. 
- fdbCluster.VerifyVersion(targetVersion) - }, - EntryDescription("Upgrade from %[1]s to %[2]s and one process is missing the new binary"), - fixtures.GenerateUpgradeTableEntries(testOptions), - ) - DescribeTable( "one process is marked for removal", func(beforeVersion string, targetVersion string) { diff --git a/e2e/test_operator_upgrades_variations/operator_upgrades__variations_test.go b/e2e/test_operator_upgrades_variations/operator_upgrades_variations_test.go similarity index 99% rename from e2e/test_operator_upgrades_variations/operator_upgrades__variations_test.go rename to e2e/test_operator_upgrades_variations/operator_upgrades_variations_test.go index 345dd76db..3b17d73e8 100644 --- a/e2e/test_operator_upgrades_variations/operator_upgrades__variations_test.go +++ b/e2e/test_operator_upgrades_variations/operator_upgrades_variations_test.go @@ -1,5 +1,5 @@ /* - * operator_upgrades_test.go + * operator_upgrades_variations_test.go * * This source file is part of the FoundationDB open source project * diff --git a/e2e/test_operator_upgrades_with_chaos/operator_upgrades_with_chaos_test.go b/e2e/test_operator_upgrades_with_chaos/operator_upgrades_with_chaos_test.go new file mode 100644 index 000000000..5df0e88a2 --- /dev/null +++ b/e2e/test_operator_upgrades_with_chaos/operator_upgrades_with_chaos_test.go @@ -0,0 +1,463 @@ +/* + * operator_upgrades_with_chaos_test.go + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2023 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package operatorupgradeschaosmesh + +/* +This test suite includes tests to validate the behaviour of the operator during upgrades on a FoundationDB cluster. +The executed tests will verify that the upgrades can proceed under different failure scenarios injected with chaos-mesh. +Each test will create a new FoundationDB cluster which will be upgraded. +Since FoundationDB is version incompatible for major and minor versions and the upgrade process for FoundationDB on Kubernetes requires multiple steps (see the documentation in the docs folder) we test different scenarios where only some processes are restarted. +*/ + +import ( + "fmt" + "log" + "time" + + fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2" + "github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures" + chaosmesh "github.com/chaos-mesh/chaos-mesh/api/v1alpha1" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var ( + factory *fixtures.Factory + fdbCluster *fixtures.FdbCluster + testOptions *fixtures.FactoryOptions +) + +func init() { + testOptions = fixtures.InitFlags() +} + +var _ = AfterSuite(func() { + if CurrentSpecReport().Failed() { + log.Printf("failed due to %s", CurrentSpecReport().FailureMessage()) + } +}) + +func clusterSetup(beforeVersion string) { + factory.SetBeforeVersion(beforeVersion) + fdbCluster = factory.CreateFdbCluster( + &fixtures.ClusterConfig{ + DebugSymbols: false, + }, + factory.GetClusterOptions(fixtures.UseVersionBeforeUpgrade)..., + ) + + Expect( + fdbCluster.InvariantClusterStatusAvailableWithThreshold(15 * time.Second), + ).ShouldNot(HaveOccurred()) +} + +var _ = Describe("Operator Upgrades with chaos-mesh", Label("e2e", "pr"), func() { + BeforeEach(func() { + factory = fixtures.CreateFactory(testOptions) + if !factory.ChaosTestsEnabled() { + Skip("chaos mesh is disabled") + } + }) + + AfterEach(func() { + if CurrentSpecReport().Failed() { + factory.DumpState(fdbCluster) + } + factory.Shutdown() + }) + + // Ginkgo lacks the support for AfterEach and BeforeEach in tables, so we have to put everything inside the testing function + // this setup allows to dynamically generate the table entries that will be executed e.g. to test different upgrades + // for different versions without hard coding or having multiple flags. + DescribeTable( + "upgrading a cluster with a partitioned pod", + func(beforeVersion string, targetVersion string) { + clusterSetup(beforeVersion) + Expect(fdbCluster.SetAutoReplacements(false, 20*time.Minute)).ToNot(HaveOccurred()) + // Ensure the operator is not skipping the process because it's missing for to long + Expect( + fdbCluster.SetIgnoreMissingProcessesSeconds(5 * time.Minute), + ).NotTo(HaveOccurred()) + + // 1. Introduce network partition b/w Pod and cluster. 
+ // Inject chaos only to storage Pods to reduce the risk of a long recovery because a transaction + // system Pod was partitioned. The test still has the same checks that will be performed. + // Once we have a better availability check for upgrades we can target all pods again. + partitionedPod := fixtures.ChooseRandomPod(fdbCluster.GetStoragePods()) + log.Println("Injecting network partition to pod: ", partitionedPod.Name) + exp := factory.InjectPartitionBetween( + fixtures.PodSelector(partitionedPod), + chaosmesh.PodSelectorSpec{ + GenericSelectorSpec: chaosmesh.GenericSelectorSpec{ + Namespaces: []string{partitionedPod.Namespace}, + LabelSelectors: fdbCluster.GetCachedCluster().GetMatchLabels(), + }, + }, + ) + + Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred()) + + if !fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) { + // 2. Until we remove the partition, cluster should not have + // upgraded. Keep checking for 2m. + // + // This is only true for version incompatible upgrades. + Consistently(func() bool { + return fdbCluster.GetCluster().Status.RunningVersion == beforeVersion + }).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(BeTrue()) + } + + // 3. Delete the partition, and the upgrade should proceed. 
+ log.Println("deleting chaos experiment and cluster should upgrade") + factory.DeleteChaosMeshExperimentSafe(exp) + fdbCluster.VerifyVersion(targetVersion) + }, + + EntryDescription("Upgrade from %[1]s to %[2]s with partitioned Pod"), + fixtures.GenerateUpgradeTableEntries(testOptions), + ) + + DescribeTable( + "with a partitioned pod which eventually gets replaced", + func(beforeVersion string, targetVersion string) { + clusterSetup(beforeVersion) + Expect(fdbCluster.SetAutoReplacements(false, 5*time.Minute)).ToNot(HaveOccurred()) + // Ensure the operator is not skipping the process because it's missing for too long + Expect( + fdbCluster.SetIgnoreMissingProcessesSeconds(5 * time.Minute), + ).NotTo(HaveOccurred()) + + // 1. Introduce network partition b/w Pod and cluster. + partitionedPod := fixtures.ChooseRandomPod(fdbCluster.GetStoragePods()) + log.Println("Injecting network partition to pod: ", partitionedPod.Name) + _ = factory.InjectPartitionBetween( + fixtures.PodSelector(partitionedPod), + chaosmesh.PodSelectorSpec{ + GenericSelectorSpec: chaosmesh.GenericSelectorSpec{ + Namespaces: []string{partitionedPod.Namespace}, + LabelSelectors: fdbCluster.GetCachedCluster().GetMatchLabels(), + }, + }, + ) + + Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred()) + + if !fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) { + // 2. Until we remove the partition, cluster should not have upgraded. Keep checking for 2m. + // + // This is only true for version incompatible upgrades. + Consistently(func() bool { + return fdbCluster.GetCluster().Status.RunningVersion == beforeVersion + }).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(BeTrue()) + } else { + // Simulate the 2 minute wait otherwise the auto replacement might not succeed in time. 
+ time.Sleep(2 * time.Minute) + } + + // Enable the auto replacement feature again, but don't wait for reconciliation, otherwise we wait + // until the whole cluster is upgraded. + Expect( + fdbCluster.SetAutoReplacementsWithWait(true, 3*time.Minute, false), + ).ToNot(HaveOccurred()) + + // In the case of a version compatible upgrade the operator will proceed with recreating the storage Pods + // to make sure they use the new version. At the same time all transaction processes are marked for removal + // in order to bring up a new set of transaction processes with the new version, this prevents the operator + // from doing the actual replacement of the partitioned Pod. If the partitioned Pod gets recreated by the operator + // the injected partition from chaos-mesh is lost and therefore the process is reporting to the cluster again. + if !fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) { + log.Println("waiting for pod removal:", partitionedPod.Name) + Expect(fdbCluster.WaitForPodRemoval(partitionedPod)).ShouldNot(HaveOccurred()) + log.Println("pod removed:", partitionedPod.Name) + } + + // 3. Upgrade should proceed without removing the partition, as Pod will be replaced. + fdbCluster.VerifyVersion(targetVersion) + }, + + EntryDescription( + "Upgrade from %[1]s to %[2]s", + ), + fixtures.GenerateUpgradeTableEntries(testOptions), + ) + + DescribeTable( + "upgrading a cluster with link that drops some packets", + func(beforeVersion string, targetVersion string) { + clusterSetup(beforeVersion) + + // 1. Introduce packet loss b/w pods. 
+ log.Println("Injecting packet loss b/w pod") + allPods := fdbCluster.GetAllPods() + + pickedPods := fixtures.RandomPickPod(allPods.Items, len(allPods.Items)/5) + Expect(pickedPods).ToNot(BeEmpty()) + for _, pod := range pickedPods { + log.Println("Picked Pod", pod.Name) + } + + factory.InjectNetworkLoss("20", fixtures.PodsSelector(pickedPods), + chaosmesh.PodSelectorSpec{ + GenericSelectorSpec: chaosmesh.GenericSelectorSpec{ + Namespaces: []string{fdbCluster.Namespace()}, + LabelSelectors: fdbCluster.GetCachedCluster().GetMatchLabels(), + }, + }, chaosmesh.Both) + + // Also inject network loss b/w operator pods and cluster pods. + operatorPods := factory.GetOperatorPods(fdbCluster.Namespace()) + factory.InjectNetworkLoss( + "20", + fixtures.PodsSelector(allPods.Items), + fixtures.PodsSelector(operatorPods.Items), + chaosmesh.Both) + + // 2. Start cluster upgrade. + log.Println("Starting cluster upgrade.") + Expect(fdbCluster.UpgradeCluster(targetVersion, true)).NotTo(HaveOccurred()) + + // 3. Upgrade should finish. + fdbCluster.VerifyVersion(targetVersion) + }, + EntryDescription("Upgrade from %[1]s to %[2]s with network link that drops some packets"), + fixtures.GenerateUpgradeTableEntries(testOptions), + ) + + DescribeTable( + "upgrading a cluster and one process has the fdbmonitor.conf file not ready", + func(beforeVersion string, targetVersion string) { + if fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) { + Skip("this test only affects version incompatible upgrades") + } + + clusterSetup(beforeVersion) + + // Update the cluster version. + Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred()) + // Skip the reconciliation here to have time to stage everything. + Expect(fdbCluster.SetSkipReconciliation(true)).NotTo(HaveOccurred()) + + // Select one Pod, this Pod will mount the fdbmonitor config file as read-only. + // This should block the upgrade. 
+ faultyPod := fixtures.RandomPickOnePod(fdbCluster.GetPods().Items) + + // We have to update the sidecar before the operator is doing it. If we don't do this here the operator + // will update the sidecar and then the I/O chaos will be gone. So we prepare the faulty Pod to already + // be using the new sidecar image and then we inject IO chaos. + sidecarImage := fdbCluster.GetSidecarImageForVersion(targetVersion) + fdbCluster.UpdateContainerImage(&faultyPod, fdbv1beta2.SidecarContainerName, sidecarImage) + + Eventually(func() bool { + pod := fdbCluster.GetPod(faultyPod.Name) + + for _, status := range pod.Status.ContainerStatuses { + if status.Name != fdbv1beta2.SidecarContainerName { + continue + } + + log.Println("expected", sidecarImage, "got", status.Image) + return status.Image == sidecarImage + } + + return false + }).WithTimeout(10 * time.Minute).WithPolling(5 * time.Second).MustPassRepeatedly(5).Should(BeTrue()) + + log.Println("Inject IO chaos to", faultyPod.Name) + // Ensure that the fdbmonitor config file is not writeable for the sidecar. + exp := factory.InjectDiskFailureWithPath( + fixtures.PodSelector(&faultyPod), + "/var/output-files", + "/var/output-files/fdbmonitor.conf", + []chaosmesh.IoMethod{ + chaosmesh.Write, + chaosmesh.Read, + chaosmesh.Open, + chaosmesh.Flush, + chaosmesh.Fsync, + }, + []string{ + fdbv1beta2.SidecarContainerName, + }) + + // Make sure the sidecar is not able to write the fdbmonitor config. + Eventually(func() error { + stdout, stderr, err := fdbCluster.ExecuteCmdOnPod( + faultyPod, + fdbv1beta2.SidecarContainerName, + "cat /var/output-files/fdbmonitor.conf && echo '\n' >> /var/output-files/fdbmonitor.conf", + false) + + log.Println(stdout, stderr) + + return err + }).WithTimeout(5 * time.Minute).WithPolling(5 * time.Second).Should(HaveOccurred()) + + // Now we have the faulty Pod prepared with I/O chaos injected, so we can continue with the upgrade. 
+ Expect(fdbCluster.SetSkipReconciliation(false)).NotTo(HaveOccurred()) + + // The cluster will be stuck in this state until the I/O chaos is resolved. + expectedConditions := map[fdbv1beta2.ProcessGroupConditionType]bool{ + fdbv1beta2.IncorrectConfigMap: true, + fdbv1beta2.IncorrectCommandLine: true, + } + faultyProcessGroupID := fixtures.GetProcessGroupID(faultyPod) + + // The upgrade will be stuck until the I/O chaos is removed and the sidecar is able to provide the latest + // fdbmonitor conf file. + Eventually(func() bool { + cluster := fdbCluster.GetCluster() + for _, processGroup := range cluster.Status.ProcessGroups { + if processGroup.ProcessGroupID != faultyProcessGroupID { + continue + } + + for _, condition := range processGroup.ProcessGroupConditions { + log.Println(processGroup.ProcessGroupID, string(condition.ProcessGroupConditionType)) + } + + if !processGroup.MatchesConditions(expectedConditions) { + return false + } + } + + return true + }).WithTimeout(5 * time.Minute).WithPolling(2 * time.Second).MustPassRepeatedly(30).Should(BeTrue()) + + // Make sure the cluster was not upgraded yet. + cluster := fdbCluster.GetCluster() + Expect(cluster.Spec.Version).NotTo(Equal(cluster.Status.RunningVersion)) + + // Remove the IO chaos, the cluster should proceed. + factory.DeleteChaosMeshExperimentSafe(exp) + + // Ensure the upgrade proceeds and is able to finish. + fdbCluster.VerifyVersion(targetVersion) + }, + EntryDescription("Upgrade from %[1]s to %[2]s and one process has the fdbmonitor.conf file not ready"), + fixtures.GenerateUpgradeTableEntries(testOptions), + ) + + DescribeTable( + "upgrading a cluster and one process is missing the new binary", + func(beforeVersion string, targetVersion string) { + if fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) { + Skip("this test only affects version incompatible upgrades") + } + + clusterSetup(beforeVersion) + + // Update the cluster version. 
+ Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred()) + // Skip the reconciliation here to have time to stage everything. + Expect(fdbCluster.SetSkipReconciliation(true)).NotTo(HaveOccurred()) + + // Select one Pod, this Pod will miss the new fdbserver binary. + // This should block the upgrade. + faultyPod := fixtures.RandomPickOnePod(fdbCluster.GetPods().Items) + + // We have to update the sidecar before the operator is doing it. If we don't do this here the operator + // will update the sidecar and then the sidecar will copy the binaries at start-up. So we prepare the faulty Pod to already + // be using the new sidecar image and then we delete the new fdbserver binary. + sidecarImage := fdbCluster.GetSidecarImageForVersion(targetVersion) + fdbCluster.UpdateContainerImage(&faultyPod, fdbv1beta2.SidecarContainerName, sidecarImage) + + Eventually(func() bool { + pod := fdbCluster.GetPod(faultyPod.Name) + + for _, status := range pod.Status.ContainerStatuses { + if status.Name != fdbv1beta2.SidecarContainerName { + continue + } + + log.Println("expected", sidecarImage, "got", status.Image) + return status.Image == sidecarImage + } + + return false + }).WithTimeout(10 * time.Minute).WithPolling(5 * time.Second).MustPassRepeatedly(5).Should(BeTrue()) + + // Ensure that the new fdbserver binary is deleted by the sidecar. + fdbserverBinary := fmt.Sprintf("/var/output-files/bin/%s/fdbserver", targetVersion) + log.Println("Delete", fdbserverBinary, "from", faultyPod.Name) + + // Make sure the sidecar is missing the fdbserver binary. + Eventually(func() error { + _, _, err := fdbCluster.ExecuteCmdOnPod( + faultyPod, + fdbv1beta2.SidecarContainerName, + fmt.Sprintf("rm -f %s", fdbserverBinary), + false) + + return err + }).WithTimeout(5 * time.Minute).WithPolling(5 * time.Second).ShouldNot(HaveOccurred()) + + // Now we have the faulty Pod prepared, so we can continue with the upgrade. 
+ Expect(fdbCluster.SetSkipReconciliation(false)).NotTo(HaveOccurred()) + + // The cluster will be stuck in this state until the Pod is restarted and the new binary is present. + expectedConditions := map[fdbv1beta2.ProcessGroupConditionType]bool{ + fdbv1beta2.IncorrectConfigMap: true, + fdbv1beta2.IncorrectCommandLine: true, + } + faultyProcessGroupID := fixtures.GetProcessGroupID(faultyPod) + + // The upgrade will be stuck until the new fdbserver binary is copied to the shared directory again. + Eventually(func() bool { + cluster := fdbCluster.GetCluster() + for _, processGroup := range cluster.Status.ProcessGroups { + if processGroup.ProcessGroupID != faultyProcessGroupID { + continue + } + + for _, condition := range processGroup.ProcessGroupConditions { + log.Println(processGroup.ProcessGroupID, string(condition.ProcessGroupConditionType)) + } + + if !processGroup.MatchesConditions(expectedConditions) { + return false + } + } + + return true + }).WithTimeout(5 * time.Minute).WithPolling(2 * time.Second).MustPassRepeatedly(30).Should(BeTrue()) + + // Make sure the cluster was not upgraded yet. + cluster := fdbCluster.GetCluster() + Expect(cluster.Spec.Version).NotTo(Equal(cluster.Status.RunningVersion)) + + // Ensure the binary is present in the shared folder. + Eventually(func() error { + _, _, err := fdbCluster.ExecuteCmdOnPod( + faultyPod, + fdbv1beta2.SidecarContainerName, + fmt.Sprintf("cp -f /usr/bin/fdbserver %s", fdbserverBinary), + false) + + return err + }).WithTimeout(5 * time.Minute).WithPolling(5 * time.Second).ShouldNot(HaveOccurred()) + + // Ensure the upgrade proceeds and is able to finish. 
+ fdbCluster.VerifyVersion(targetVersion) + }, + EntryDescription("Upgrade from %[1]s to %[2]s and one process is missing the new binary"), + fixtures.GenerateUpgradeTableEntries(testOptions), + ) +}) diff --git a/e2e/test_operator_upgrades_with_chaos/suite_test.go b/e2e/test_operator_upgrades_with_chaos/suite_test.go new file mode 100644 index 000000000..37e017b93 --- /dev/null +++ b/e2e/test_operator_upgrades_with_chaos/suite_test.go @@ -0,0 +1,34 @@ +/* + * suite_test.go + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2023 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package operatorupgradeschaosmesh + +import ( + "testing" + "time" + + "github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures" + . "github.com/onsi/gomega" +) + +func TestOperatorUpgrade(t *testing.T) { + SetDefaultEventuallyTimeout(3 * time.Minute) + fixtures.RunGinkgoTests(t, "FDB Operator Upgrade Test Suite with chaos-mesh") +}