Skip to content

Commit

Permalink
[test] Make StoreIngestionTask::testStartServerAndShutdownWithPartiti…
Browse files Browse the repository at this point in the history
…onAssignmentVerification & testCheckBeforeJoinCluster less flaky (#1330)

Make StoreIngestionTask::testStartServerAndShutdownWithPartitionAssignmentVerification less flaky by checking that the number of partitions is 0 instead of storageEngines is 0. This is because after the node comes online, Helix tries to re-assign partitions to it immediately, which re-creates the storageEngine.

Also made StoreIngestionTask::testCheckBeforeJoinCluster less flaky by turning on maintenance mode on the Helix cluster after removing the node from the cluster.
  • Loading branch information
kvargha authored Nov 20, 2024
1 parent e23c375 commit 2561949
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import com.linkedin.r2.message.rest.RestRequest;
import com.linkedin.r2.message.rest.RestRequestBuilder;
import com.linkedin.r2.message.rest.RestResponse;
import com.linkedin.venice.controllerapi.ControllerClient;
import com.linkedin.venice.controllerapi.MultiSchemaResponse;
import com.linkedin.venice.controllerapi.StoreResponse;
import com.linkedin.venice.controllerapi.UpdateStoreQueryParams;
Expand Down Expand Up @@ -139,19 +138,36 @@ public void testCheckBeforeJoinCluster() {
// Stop server, remove it from the cluster then restart. We expect that all local storage would be deleted. Once
// the server join again.
cluster.stopVeniceServer(server.getPort());
try (ControllerClient client = ControllerClient
.constructClusterControllerClient(cluster.getClusterName(), cluster.getAllControllersURLs())) {
client.removeNodeFromCluster(Utils.getHelixNodeIdentifier(Utils.getHostName(), server.getPort()));
}

TestUtils.waitForNonDeterministicAssertion(10, TimeUnit.SECONDS, () -> Assert.assertFalse(server.isRunning()));

cluster.useControllerClient(controllerClient -> {
controllerClient.removeNodeFromCluster(Utils.getHelixNodeIdentifier(Utils.getHostName(), server.getPort()));

TestUtils.waitForNonDeterministicAssertion(
10,
TimeUnit.SECONDS,
() -> Assert.assertEquals(controllerClient.listStorageNodes().getNodes().length, 0));
});

cluster.getLeaderVeniceController()
.getVeniceHelixAdmin()
.getHelixAdminClient()
.manuallyEnableMaintenanceMode(cluster.getClusterName(), true, "Prevent nodes from re-joining", null);

cluster.restartVeniceServer(server.getPort());
Assert.assertTrue(
server.getVeniceServer()
.getStorageService()
.getStorageEngineRepository()
.getAllLocalStorageEngines()
.isEmpty(),
"After removing the node from cluster, local storage should be cleaned up once the server join the cluster again.");
TestUtils.waitForNonDeterministicAssertion(10, TimeUnit.SECONDS, () -> Assert.assertTrue(server.isRunning()));

TestUtils.waitForNonDeterministicAssertion(
30,
TimeUnit.SECONDS,
() -> Assert.assertTrue(
server.getVeniceServer()
.getStorageService()
.getStorageEngineRepository()
.getAllLocalStorageEngines()
.isEmpty(),
"After removing the node from cluster, local storage should be cleaned up once the server join the cluster again."));
}
}

Expand Down Expand Up @@ -207,13 +223,11 @@ public void testStartServerAndShutdownWithPartitionAssignmentVerification() {
cluster.addVeniceServer(featureProperties, new Properties());

cluster.restartVeniceServer(server.getPort());
StorageEngineRepository storageEngineRepository =
server.getVeniceServer().getStorageService().getStorageEngineRepository();

TestUtils.waitForNonDeterministicAssertion(
3,
30,
TimeUnit.SECONDS,
() -> Assert.assertEquals(storageEngineRepository.getAllLocalStorageEngines().size(), 0));
() -> Assert.assertEquals(storageService.getStorageEngine(storeName).getPartitionIds().size(), 0));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,14 @@ void createVeniceStorageClusterResources(
* Release resources.
*/
void close();

/**
* Manually enable maintenance mode. To be called by the REST client that accepts KV mappings as
* the payload.
*/
void manuallyEnableMaintenanceMode(
String clusterName,
boolean enabled,
String reason,
Map<String, String> customFields);
}
Original file line number Diff line number Diff line change
Expand Up @@ -333,4 +333,15 @@ public void resetPartition(
public void close() {
helixAdmin.close();
}

/**
* @see HelixAdminClient#manuallyEnableMaintenanceMode(String, boolean, String, Map<String, String>)
*/
public void manuallyEnableMaintenanceMode(
String clusterName,
boolean enabled,
String reason,
Map<String, String> customFields) {
helixAdmin.manuallyEnableMaintenanceMode(clusterName, enabled, reason, customFields);
}
}

0 comments on commit 2561949

Please sign in to comment.