From 256194958e188026acfd7bc2ee07a11bddfcfcd2 Mon Sep 17 00:00:00 2001 From: Koorous Vargha Date: Wed, 20 Nov 2024 12:29:48 -0800 Subject: [PATCH] [test] Make StoreIngestionTask::testStartServerAndShutdownWithPartitionAssignmentVerification & testCheckBeforeJoinCluster less flaky (#1330) Make StoreIngestionTask::testStartServerAndShutdownWithPartitionAssignmentVerification less flaky by checking that the number of partitions is 0 instead of storageEngines is 0. This is because after the node comes online, Helix tries to re-assign partitions to it immediately, which re-creates the storageEngine. Also made StoreIngestionTask::testCheckBeforeJoinCluster less flaky by turning on maintenance mode on the Helix cluster after removing the node from the cluster. --- .../venice/server/VeniceServerTest.java | 46 ++++++++++++------- .../venice/controller/HelixAdminClient.java | 10 ++++ .../venice/controller/ZkHelixAdminClient.java | 11 +++++ 3 files changed, 51 insertions(+), 16 deletions(-) diff --git a/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/server/VeniceServerTest.java b/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/server/VeniceServerTest.java index cd0800b742c..21d05b8618d 100644 --- a/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/server/VeniceServerTest.java +++ b/internal/venice-test-common/src/integrationTest/java/com/linkedin/venice/server/VeniceServerTest.java @@ -13,7 +13,6 @@ import com.linkedin.r2.message.rest.RestRequest; import com.linkedin.r2.message.rest.RestRequestBuilder; import com.linkedin.r2.message.rest.RestResponse; -import com.linkedin.venice.controllerapi.ControllerClient; import com.linkedin.venice.controllerapi.MultiSchemaResponse; import com.linkedin.venice.controllerapi.StoreResponse; import com.linkedin.venice.controllerapi.UpdateStoreQueryParams; @@ -139,19 +138,36 @@ public void testCheckBeforeJoinCluster() { // Stop server, remove it from the cluster then restart. We expect that all local storage would be deleted. Once // the server join again. cluster.stopVeniceServer(server.getPort()); - try (ControllerClient client = ControllerClient - .constructClusterControllerClient(cluster.getClusterName(), cluster.getAllControllersURLs())) { - client.removeNodeFromCluster(Utils.getHelixNodeIdentifier(Utils.getHostName(), server.getPort())); - } + + TestUtils.waitForNonDeterministicAssertion(10, TimeUnit.SECONDS, () -> Assert.assertFalse(server.isRunning())); + + cluster.useControllerClient(controllerClient -> { + controllerClient.removeNodeFromCluster(Utils.getHelixNodeIdentifier(Utils.getHostName(), server.getPort())); + + TestUtils.waitForNonDeterministicAssertion( + 10, + TimeUnit.SECONDS, + () -> Assert.assertEquals(controllerClient.listStorageNodes().getNodes().length, 0)); + }); + + cluster.getLeaderVeniceController() + .getVeniceHelixAdmin() + .getHelixAdminClient() + .manuallyEnableMaintenanceMode(cluster.getClusterName(), true, "Prevent nodes from re-joining", null); cluster.restartVeniceServer(server.getPort()); - Assert.assertTrue( - server.getVeniceServer() - .getStorageService() - .getStorageEngineRepository() - .getAllLocalStorageEngines() - .isEmpty(), - "After removing the node from cluster, local storage should be cleaned up once the server join the cluster again."); + TestUtils.waitForNonDeterministicAssertion(10, TimeUnit.SECONDS, () -> Assert.assertTrue(server.isRunning())); + + TestUtils.waitForNonDeterministicAssertion( + 30, + TimeUnit.SECONDS, + () -> Assert.assertTrue( + server.getVeniceServer() + .getStorageService() + .getStorageEngineRepository() + .getAllLocalStorageEngines() + .isEmpty(), + "After removing the node from cluster, local storage should be cleaned up once the server join the cluster again.")); } } @@ -207,13 +223,11 @@ public void testStartServerAndShutdownWithPartitionAssignmentVerification() { cluster.addVeniceServer(featureProperties, new Properties()); cluster.restartVeniceServer(server.getPort()); - StorageEngineRepository storageEngineRepository = - server.getVeniceServer().getStorageService().getStorageEngineRepository(); TestUtils.waitForNonDeterministicAssertion( - 3, + 30, TimeUnit.SECONDS, - () -> Assert.assertEquals(storageEngineRepository.getAllLocalStorageEngines().size(), 0)); + () -> Assert.assertEquals(storageService.getStorageEngine(storeName).getPartitionIds().size(), 0)); } } diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/HelixAdminClient.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/HelixAdminClient.java index 2281a709029..3ee0bf381f5 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/HelixAdminClient.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/HelixAdminClient.java @@ -133,4 +133,14 @@ void createVeniceStorageClusterResources( * Release resources. */ void close(); + + /** + * Manually enable maintenance mode. To be called by the REST client that accepts KV mappings as + * the payload. + */ + void manuallyEnableMaintenanceMode( + String clusterName, + boolean enabled, + String reason, + Map customFields); } diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/ZkHelixAdminClient.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/ZkHelixAdminClient.java index 4111a063695..de01ad1ba63 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/ZkHelixAdminClient.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/ZkHelixAdminClient.java @@ -333,4 +333,15 @@ public void resetPartition( public void close() { helixAdmin.close(); } + + /** + * @see HelixAdminClient#manuallyEnableMaintenanceMode(String, boolean, String, Map) + */ + public void manuallyEnableMaintenanceMode( + String clusterName, + boolean enabled, + String reason, + Map customFields) { + helixAdmin.manuallyEnableMaintenanceMode(clusterName, enabled, reason, customFields); + } }