Skip to content

Commit

Permalink
trying to reduce or fix test flakiness
Browse files Browse the repository at this point in the history
  • Loading branch information
dlg99 committed May 31, 2024
1 parent d7b2df4 commit 9ed9c60
Showing 1 changed file with 73 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,46 @@ public void tearDown() throws Exception {
}
}

/**
 * Verifies that the under-replicated ledger znode changes after a bookie is
 * killed and again after a new bookie is started, without the test forcing
 * an audit run itself.
 *
 * <p>Flow: create and close one ledger, register a watcher on its urLedger
 * znode (which must not exist yet), kill the first bookie of the first
 * ensemble and wait for the watcher to fire. Then re-register the watcher
 * (the znode must exist now), start a new bookie and wait for the watcher
 * to fire again.
 *
 * @throws Exception on any failure while driving the cluster or ZooKeeper
 */
@Test
public void testAuditorRunsOnChangeInWritableBookies() throws Exception {
    List<Integer> listOfReplicaIndex = new ArrayList<>();
    List<LedgerHandle> listOfLedgerHandle = createLedgersAndAddEntries(1, 5);
    closeLedgers(listOfLedgerHandle);
    LedgerHandle lhandle = listOfLedgerHandle.get(0);
    BookieId replicaToKillAddr = lhandle.getLedgerMetadata().getAllEnsembles().get(0L).get(0);

    CountDownLatch latch = new CountDownLatch(listOfLedgerHandle.size());
    for (LedgerHandle lh : listOfLedgerHandle) {
        int ledgerReplicaIndex = getReplicaIndexInLedger(lh, replicaToKillAddr);
        listOfReplicaIndex.add(ledgerReplicaIndex);
        assertNull("UrLedger already exists!",
                watchUrLedgerNode(getUrLedgerZNode(lh), latch));
    }

    LOG.info("Killing Bookie :" + replicaToKillAddr);
    killBookie(replicaToKillAddr);

    // don't force auditor run here, wait for its watcher to do the work

    // waiting to publish urLedger znode by Auditor; fail loudly on timeout
    // instead of silently continuing with a latch that never fired
    assertTrue("Timed out waiting for the urLedger znode to be published",
            latch.await(100, TimeUnit.SECONDS));

    latch = new CountDownLatch(listOfLedgerHandle.size());
    for (LedgerHandle lh : listOfLedgerHandle) {
        int ledgerReplicaIndex = getReplicaIndexInLedger(lh, replicaToKillAddr);
        listOfReplicaIndex.add(ledgerReplicaIndex);
        // the znode must exist at this point; the message states that expectation
        assertNotNull("UrLedger should have been published by the Auditor",
                watchUrLedgerNode(getUrLedgerZNode(lh), latch));
    }

    startNewBookie();
    // don't force auditor run here, wait for its watcher to do the work

    // waiting for the next urLedger znode event after the new bookie joined
    // (presumably the znode is removed once re-replication completes);
    // without this assertion the second phase of the test could never fail
    assertTrue("Timed out waiting for the urLedger znode to change after starting a new bookie",
            latch.await(100, TimeUnit.SECONDS));
}

/**
* Test verifies publish urLedger by Auditor and replication worker is
* picking up the entries and finishing the rereplication of open ledger.
Expand All @@ -160,9 +200,10 @@ public void testOpenLedgers() throws Exception {

LOG.info("Killing Bookie :" + replicaToKillAddr);
killBookie(replicaToKillAddr);
forceAuditorRun();

// waiting to publish urLedger znode by Auditor
latch.await();
latch.await(10, TimeUnit.SECONDS);
latch = new CountDownLatch(1);
LOG.info("Watching on urLedgerPath:" + urLedgerZNode
+ " to know the status of rereplication process");
Expand All @@ -174,12 +215,13 @@ public void testOpenLedgers() throws Exception {
startNewBookie();
int newBookieIndex = lastBookieIndex();
BookieServer newBookieServer = serverByIndex(newBookieIndex);
forceAuditorRun();

if (LOG.isDebugEnabled()) {
LOG.debug("Waiting to finish the replication of failed bookie : "
+ replicaToKillAddr);
}
latch.await();
latch.await(10, TimeUnit.SECONDS);

// grace period to update the urledger metadata in zookeeper
LOG.info("Waiting to update the urledger metadata in zookeeper");
Expand Down Expand Up @@ -211,9 +253,10 @@ public void testClosedLedgers() throws Exception {

LOG.info("Killing Bookie :" + replicaToKillAddr);
killBookie(replicaToKillAddr);
forceAuditorRun();

// waiting to publish urLedger znode by Auditor
latch.await();
latch.await(10, TimeUnit.SECONDS);

// Again watching the urLedger znode to know the replication status
latch = new CountDownLatch(listOfLedgerHandle.size());
Expand All @@ -230,14 +273,15 @@ public void testClosedLedgers() throws Exception {
startNewBookie();
int newBookieIndex = lastBookieIndex();
BookieServer newBookieServer = serverByIndex(newBookieIndex);
forceAuditorRun();

if (LOG.isDebugEnabled()) {
LOG.debug("Waiting to finish the replication of failed bookie : "
+ replicaToKillAddr);
}

// waiting to finish replication
latch.await();
latch.await(10, TimeUnit.SECONDS);

// grace period to update the urledger metadata in zookeeper
LOG.info("Waiting to update the urledger metadata in zookeeper");
Expand Down Expand Up @@ -280,9 +324,10 @@ public void testStopWhileReplicationInProgress() throws Exception {

LOG.info("Killing Bookie :" + replicaToKillAddr);
killBookie(replicaToKillAddr);
forceAuditorRun();

// waiting to publish urLedger znode by Auditor
latch.await();
latch.await(10, TimeUnit.SECONDS);

// Again watching the urLedger znode to know the replication status
latch = new CountDownLatch(listOfLedgerHandle.size());
Expand All @@ -299,6 +344,7 @@ public void testStopWhileReplicationInProgress() throws Exception {
startNewBookie();
int newBookieIndex = lastBookieIndex();
BookieServer newBookieServer = serverByIndex(newBookieIndex);
forceAuditorRun();

if (LOG.isDebugEnabled()) {
LOG.debug("Waiting to finish the replication of failed bookie : "
Expand All @@ -315,9 +361,10 @@ public void testStopWhileReplicationInProgress() throws Exception {
}

startReplicationService();
forceAuditorRun();

LOG.info("Waiting to finish rereplication processes");
latch.await();
latch.await(10, TimeUnit.SECONDS);

// grace period to update the urledger metadata in zookeeper
LOG.info("Waiting to update the urledger metadata in zookeeper");
Expand Down Expand Up @@ -350,8 +397,10 @@ public void testNoSuchLedgerExists() throws Exception {
.getLedgerMetadata().getAllEnsembles()
.get(0L).get(0);
killBookie(replicaToKillAddr);
forceAuditorRun();

// waiting to publish urLedger znode by Auditor
latch.await();
latch.await(10, TimeUnit.SECONDS);

latch = new CountDownLatch(listOfLedgerHandle.size());
for (LedgerHandle lh : listOfLedgerHandle) {
Expand All @@ -364,9 +413,10 @@ public void testNoSuchLedgerExists() throws Exception {
bkc.deleteLedger(lh.getId());
}
startNewBookie();
forceAuditorRun();

// waiting to delete published urledgers, since it doesn't exists
latch.await();
latch.await(10, TimeUnit.SECONDS);

for (LedgerHandle lh : listOfLedgerHandle) {
assertNull("UrLedger still exists after rereplication",
Expand All @@ -391,7 +441,7 @@ public void testEmptyLedgerLosesQuorumEventually() throws Exception {
killBookie(replicaToKill);
startNewBookie();

getAuditor(10, TimeUnit.SECONDS).submitAuditTask().get(); // ensure auditor runs
forceAuditorRun();

assertTrue("Should be marked as underreplicated", latch.await(5, TimeUnit.SECONDS));
latch = new CountDownLatch(1);
Expand All @@ -405,14 +455,14 @@ public void testEmptyLedgerLosesQuorumEventually() throws Exception {
lh.getLedgerMetadata().getAllEnsembles().get(0L));
killBookie(replicaToKill);

getAuditor(10, TimeUnit.SECONDS).submitAuditTask().get(); // ensure auditor runs
forceAuditorRun();

assertTrue("Should be marked as underreplicated", latch.await(5, TimeUnit.SECONDS));
latch = new CountDownLatch(1);
s = watchUrLedgerNode(urZNode, latch); // should be marked as replicated

startNewBookie();
getAuditor(10, TimeUnit.SECONDS).submitAuditTask().get(); // ensure auditor runs
forceAuditorRun();

if (s != null) {
assertTrue("Should be marked as replicated", latch.await(20, TimeUnit.SECONDS));
Expand Down Expand Up @@ -465,9 +515,10 @@ public void testLedgerMetadataContainsIpAddressAsBookieID()

LOG.info("Killing Bookie :" + replicaToKillAddr);
killBookie(replicaToKillAddr);
forceAuditorRun();

// waiting to publish urLedger znode by Auditor
latch.await();
latch.await(10, TimeUnit.SECONDS);
latch = new CountDownLatch(1);
LOG.info("Watching on urLedgerPath:" + urLedgerZNode
+ " to know the status of rereplication process");
Expand All @@ -482,12 +533,13 @@ public void testLedgerMetadataContainsIpAddressAsBookieID()

int newBookieIndex = lastBookieIndex();
BookieServer newBookieServer = serverByIndex(newBookieIndex);
forceAuditorRun();

if (LOG.isDebugEnabled()) {
LOG.debug("Waiting to finish the replication of failed bookie : "
+ replicaToKillAddr);
}
latch.await();
latch.await(10, TimeUnit.SECONDS);

// grace period to update the urledger metadata in zookeeper
LOG.info("Waiting to update the urledger metadata in zookeeper");
Expand Down Expand Up @@ -541,9 +593,10 @@ public void testLedgerMetadataContainsHostNameAsBookieID()

LOG.info("Killing Bookie :" + replicaToKillAddr);
killBookie(replicaToKillAddr);
forceAuditorRun();

// waiting to publish urLedger znode by Auditor
latch.await();
latch.await(10, TimeUnit.SECONDS);
latch = new CountDownLatch(1);
LOG.info("Watching on urLedgerPath:" + urLedgerZNode
+ " to know the status of rereplication process");
Expand All @@ -560,12 +613,13 @@ public void testLedgerMetadataContainsHostNameAsBookieID()

int newBookieIndex = lastBookieIndex();
BookieServer newBookieServer = serverByIndex(newBookieIndex);
forceAuditorRun();

if (LOG.isDebugEnabled()) {
LOG.debug("Waiting to finish the replication of failed bookie : "
+ replicaToKillAddr);
}
latch.await();
latch.await(10, TimeUnit.SECONDS);

// grace period to update the urledger metadata in zookeeper
LOG.info("Waiting to update the urledger metadata in zookeeper");
Expand Down Expand Up @@ -646,4 +700,8 @@ public void process(WatchedEvent event) {
}
});
}

/**
 * Triggers an audit run and waits for it to finish.
 *
 * <p>Looks up the auditor via {@code getAuditor(10, TimeUnit.SECONDS)},
 * submits an audit task to it and calls {@code get()} on the result, so
 * the caller only continues once that call returns. Note that this
 * {@code get()} carries no timeout of its own.
 *
 * @throws Exception if the auditor lookup, the submission or the wait fails
 */
private void forceAuditorRun() throws Exception {
// presumably submitAuditTask() returns a Future; get() blocks until it completes
getAuditor(10, TimeUnit.SECONDS).submitAuditTask().get();
}
}

0 comments on commit 9ed9c60

Please sign in to comment.