From 9ed9c601a61aadd83ca258c3074b3aaf46adcada Mon Sep 17 00:00:00 2001 From: Andrey Yegorov Date: Fri, 31 May 2024 11:52:25 -0700 Subject: [PATCH] trying to reduce or fix test flakiness --- .../replication/BookieAutoRecoveryTest.java | 88 +++++++++++++++---- 1 file changed, 73 insertions(+), 15 deletions(-) diff --git a/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java b/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java index ccb262ed268..b316a96f9d0 100644 --- a/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java +++ b/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java @@ -140,6 +140,46 @@ public void tearDown() throws Exception { } } + @Test + public void testAuditorRunsOnChangeInWritableBookies() throws Exception { + List<Integer> listOfReplicaIndex = new ArrayList<Integer>(); + List<LedgerHandle> listOfLedgerHandle = createLedgersAndAddEntries(1, 5); + closeLedgers(listOfLedgerHandle); + LedgerHandle lhandle = listOfLedgerHandle.get(0); + int ledgerReplicaIndex = 0; + BookieId replicaToKillAddr = lhandle.getLedgerMetadata().getAllEnsembles().get(0L).get(0); + + CountDownLatch latch = new CountDownLatch(listOfLedgerHandle.size()); + for (LedgerHandle lh : listOfLedgerHandle) { + ledgerReplicaIndex = getReplicaIndexInLedger(lh, replicaToKillAddr); + listOfReplicaIndex.add(ledgerReplicaIndex); + assertNull("UrLedger already exists!", + watchUrLedgerNode(getUrLedgerZNode(lh), latch)); + } + + LOG.info("Killing Bookie :" + replicaToKillAddr); + killBookie(replicaToKillAddr); + + // don't force auditor run here, wait for its watcher to do the work + + // waiting to publish urLedger znode by Auditor + assertTrue("Auditor did not publish the urLedger znode in time", latch.await(100, TimeUnit.SECONDS)); + // the urLedger znodes must exist now; watch them again for the next change + latch = new CountDownLatch(listOfLedgerHandle.size()); + for (LedgerHandle lh : listOfLedgerHandle) { + ledgerReplicaIndex = getReplicaIndexInLedger(lh, replicaToKillAddr); +
listOfReplicaIndex.add(ledgerReplicaIndex); + assertNotNull("UrLedger doesn't exist!", + watchUrLedgerNode(getUrLedgerZNode(lh), latch)); + } + + startNewBookie(); + // don't force auditor run here, wait for its watcher to do the work + + // waiting for the watched urLedger znodes to change once the new bookie is available + assertTrue("urLedger znodes unchanged after new bookie start", latch.await(100, TimeUnit.SECONDS)); + } + /** * Test verifies publish urLedger by Auditor and replication worker is * picking up the entries and finishing the rereplication of open ledger. @@ -160,9 +200,10 @@ public void testOpenLedgers() throws Exception { LOG.info("Killing Bookie :" + replicaToKillAddr); killBookie(replicaToKillAddr); + forceAuditorRun(); // waiting to publish urLedger znode by Auditor - latch.await(); + latch.await(10, TimeUnit.SECONDS); latch = new CountDownLatch(1); LOG.info("Watching on urLedgerPath:" + urLedgerZNode + " to know the status of rereplication process"); @@ -174,12 +215,13 @@ public void testOpenLedgers() throws Exception { startNewBookie(); int newBookieIndex = lastBookieIndex(); BookieServer newBookieServer = serverByIndex(newBookieIndex); + forceAuditorRun(); if (LOG.isDebugEnabled()) { LOG.debug("Waiting to finish the replication of failed bookie : " + replicaToKillAddr); } - latch.await(); + latch.await(10, TimeUnit.SECONDS); // grace period to update the urledger metadata in zookeeper LOG.info("Waiting to update the urledger metadata in zookeeper"); @@ -211,9 +253,10 @@ public void testClosedLedgers() throws Exception { LOG.info("Killing Bookie :" + replicaToKillAddr); killBookie(replicaToKillAddr); + forceAuditorRun(); // waiting to publish urLedger znode by Auditor - latch.await(); + latch.await(10, TimeUnit.SECONDS); // Again watching the urLedger znode to know the replication status latch = new CountDownLatch(listOfLedgerHandle.size()); @@ -230,6 +273,7 @@ public void testClosedLedgers() throws Exception { startNewBookie(); int newBookieIndex = lastBookieIndex(); BookieServer newBookieServer = serverByIndex(newBookieIndex); +
forceAuditorRun(); if (LOG.isDebugEnabled()) { LOG.debug("Waiting to finish the replication of failed bookie : " @@ -237,7 +281,7 @@ public void testClosedLedgers() throws Exception { } // waiting to finish replication - latch.await(); + latch.await(10, TimeUnit.SECONDS); // grace period to update the urledger metadata in zookeeper LOG.info("Waiting to update the urledger metadata in zookeeper"); @@ -280,9 +324,10 @@ public void testStopWhileReplicationInProgress() throws Exception { LOG.info("Killing Bookie :" + replicaToKillAddr); killBookie(replicaToKillAddr); + forceAuditorRun(); // waiting to publish urLedger znode by Auditor - latch.await(); + latch.await(10, TimeUnit.SECONDS); // Again watching the urLedger znode to know the replication status latch = new CountDownLatch(listOfLedgerHandle.size()); @@ -299,6 +344,7 @@ public void testStopWhileReplicationInProgress() throws Exception { startNewBookie(); int newBookieIndex = lastBookieIndex(); BookieServer newBookieServer = serverByIndex(newBookieIndex); + forceAuditorRun(); if (LOG.isDebugEnabled()) { LOG.debug("Waiting to finish the replication of failed bookie : " @@ -315,9 +361,10 @@ public void testStopWhileReplicationInProgress() throws Exception { } startReplicationService(); + forceAuditorRun(); LOG.info("Waiting to finish rereplication processes"); - latch.await(); + latch.await(10, TimeUnit.SECONDS); // grace period to update the urledger metadata in zookeeper LOG.info("Waiting to update the urledger metadata in zookeeper"); @@ -350,8 +397,10 @@ public void testNoSuchLedgerExists() throws Exception { .getLedgerMetadata().getAllEnsembles() .get(0L).get(0); killBookie(replicaToKillAddr); + forceAuditorRun(); + // waiting to publish urLedger znode by Auditor - latch.await(); + latch.await(10, TimeUnit.SECONDS); latch = new CountDownLatch(listOfLedgerHandle.size()); for (LedgerHandle lh : listOfLedgerHandle) { @@ -364,9 +413,10 @@ public void testNoSuchLedgerExists() throws Exception { 
bkc.deleteLedger(lh.getId()); } startNewBookie(); + forceAuditorRun(); // waiting to delete published urledgers, since it doesn't exists - latch.await(); + latch.await(10, TimeUnit.SECONDS); for (LedgerHandle lh : listOfLedgerHandle) { assertNull("UrLedger still exists after rereplication", @@ -391,7 +441,7 @@ public void testEmptyLedgerLosesQuorumEventually() throws Exception { killBookie(replicaToKill); startNewBookie(); - getAuditor(10, TimeUnit.SECONDS).submitAuditTask().get(); // ensure auditor runs + forceAuditorRun(); assertTrue("Should be marked as underreplicated", latch.await(5, TimeUnit.SECONDS)); latch = new CountDownLatch(1); @@ -405,14 +455,14 @@ public void testEmptyLedgerLosesQuorumEventually() throws Exception { lh.getLedgerMetadata().getAllEnsembles().get(0L)); killBookie(replicaToKill); - getAuditor(10, TimeUnit.SECONDS).submitAuditTask().get(); // ensure auditor runs + forceAuditorRun(); assertTrue("Should be marked as underreplicated", latch.await(5, TimeUnit.SECONDS)); latch = new CountDownLatch(1); s = watchUrLedgerNode(urZNode, latch); // should be marked as replicated startNewBookie(); - getAuditor(10, TimeUnit.SECONDS).submitAuditTask().get(); // ensure auditor runs + forceAuditorRun(); if (s != null) { assertTrue("Should be marked as replicated", latch.await(20, TimeUnit.SECONDS)); @@ -465,9 +515,10 @@ public void testLedgerMetadataContainsIpAddressAsBookieID() LOG.info("Killing Bookie :" + replicaToKillAddr); killBookie(replicaToKillAddr); + forceAuditorRun(); // waiting to publish urLedger znode by Auditor - latch.await(); + latch.await(10, TimeUnit.SECONDS); latch = new CountDownLatch(1); LOG.info("Watching on urLedgerPath:" + urLedgerZNode + " to know the status of rereplication process"); @@ -482,12 +533,13 @@ public void testLedgerMetadataContainsIpAddressAsBookieID() int newBookieIndex = lastBookieIndex(); BookieServer newBookieServer = serverByIndex(newBookieIndex); + forceAuditorRun(); if (LOG.isDebugEnabled()) { 
LOG.debug("Waiting to finish the replication of failed bookie : " + replicaToKillAddr); } - latch.await(); + latch.await(10, TimeUnit.SECONDS); // grace period to update the urledger metadata in zookeeper LOG.info("Waiting to update the urledger metadata in zookeeper"); @@ -541,9 +593,10 @@ public void testLedgerMetadataContainsHostNameAsBookieID() LOG.info("Killing Bookie :" + replicaToKillAddr); killBookie(replicaToKillAddr); + forceAuditorRun(); // waiting to publish urLedger znode by Auditor - latch.await(); + latch.await(10, TimeUnit.SECONDS); latch = new CountDownLatch(1); LOG.info("Watching on urLedgerPath:" + urLedgerZNode + " to know the status of rereplication process"); @@ -560,12 +613,13 @@ public void testLedgerMetadataContainsHostNameAsBookieID() int newBookieIndex = lastBookieIndex(); BookieServer newBookieServer = serverByIndex(newBookieIndex); + forceAuditorRun(); if (LOG.isDebugEnabled()) { LOG.debug("Waiting to finish the replication of failed bookie : " + replicaToKillAddr); } - latch.await(); + latch.await(10, TimeUnit.SECONDS); // grace period to update the urledger metadata in zookeeper LOG.info("Waiting to update the urledger metadata in zookeeper"); @@ -646,4 +700,8 @@ public void process(WatchedEvent event) { } }); } + + private void forceAuditorRun() throws Exception { + getAuditor(10, TimeUnit.SECONDS).submitAuditTask().get(); // ensure auditor runs + } }