Skip to content

Commit

Permalink
Add raft_voter_contact()
Browse files Browse the repository at this point in the history
This returns the number of voting nodes that are recently in contact with the leader, to allow determining if the cluster is currently in a degraded / at risk state.
  • Loading branch information
Roger Light committed May 30, 2024
1 parent e104569 commit 28c318c
Show file tree
Hide file tree
Showing 6 changed files with 233 additions and 1 deletion.
3 changes: 2 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ test_integration_core_SOURCES = \
test/integration/test_strerror.c \
test/integration/test_submit.c \
test/integration/test_tick.c \
test/integration/test_transfer.c
test/integration/test_transfer.c \
test/integration/test_voter_contacts.c
test_integration_core_CFLAGS = $(AM_CFLAGS) -Wno-conversion
test_integration_core_LDFLAGS = -no-install
test_integration_core_LDADD = libtest.la libraft.la
Expand Down
22 changes: 22 additions & 0 deletions include/raft.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -981,6 +981,9 @@ struct raft
unsigned max_catch_up_rounds;
unsigned max_catch_up_round_duration;

/* Current number of voting nodes we are in contact with */
int voter_contacts;

/* Fields added after the v0.x ABI freeze, packed in the unused space. */
union {
RAFT__RESERVED;
Expand Down Expand Up @@ -1603,6 +1606,25 @@ RAFT_API int raft_transfer(struct raft *r,
raft_id id,
raft_transfer_cb cb);

/**
* Return the number of voting servers that the leader has recently been in
* contact with. This can be used to help determine whether the cluster may be
* in a degraded/at risk state.
*
* In general this should only be called on the leader. This will return valid
* values >= 1, because a leader is always in contact with itself.
*
* The one exception to this is if your server is changing role away from being
* the leader, then calling this in the state callback will produce results
* that are up to date. Calling on a server that has never been a leader will
* return -1.
*
* Note that the value returned is only up to date as far as the most recent
* messages from the followers, and so should not be relied upon for absolute
* correctness.
*/
RAFT_API int raft_voter_contacts(struct raft *r);

/**
* Return the index of the last entry that was applied to the local FSM.
*/
Expand Down
6 changes: 6 additions & 0 deletions src/convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ static void convertSetState(struct raft *r, unsigned short new_state)
{
r->state = new_state;
r->update->flags |= RAFT_UPDATE_STATE;
if (r->state == RAFT_LEADER) {
r->voter_contacts = 1;
/* Note that we don't reset voter_contacts for follower nodes so the
* node that was previously a leader knows how many contacts it just
* had, assuming it is still online. */
}
}

/* Clear follower state. */
Expand Down
5 changes: 5 additions & 0 deletions src/raft.c
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ int raft_init(struct raft *r,
r->install_snapshot_timeout = DEFAULT_INSTALL_SNAPSHOT_TIMEOUT;
r->commit_index = 0;
r->last_stored = 0;
r->voter_contacts = -1;
r->state = RAFT_FOLLOWER;
r->follower_state.current_leader.id = 0;
r->follower_state.current_leader.address = NULL;
Expand Down Expand Up @@ -854,4 +855,8 @@ const char *raft_role_name(int role)
return name;
}

int raft_voter_contacts(struct raft *r)
{
return r->voter_contacts;
}
#undef infof
1 change: 1 addition & 0 deletions src/timeout.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ static bool checkContactQuorum(struct raft *r)
if ((server->role == RAFT_VOTER && is_recent) || server->id == r->id) {
contacts++;
}
r->voter_contacts = (int)contacts;

if (!is_recent) {
switch (progressState(r, i)) {
Expand Down
197 changes: 197 additions & 0 deletions test/integration/test_voter_contacts.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#include "../lib/cluster.h"
#include "../lib/runner.h"

#define N_SERVERS 3

/******************************************************************************
*
* Fixture with a test raft cluster.
*
*****************************************************************************/

struct fixture
{
FIXTURE_CLUSTER;
};

/******************************************************************************
*
* Helper macros
*
*****************************************************************************/

#define STEP_N(N) raft_fixture_step_n(&f->cluster, N)

/******************************************************************************
*
* Set up a cluster with a three servers.
*
*****************************************************************************/

static void *setUp(const MunitParameter params[], MUNIT_UNUSED void *user_data)
{
struct fixture *f = munit_malloc(sizeof *f);
SETUP_CLUSTER();
return f;
}

static void tearDown(void *data)
{
struct fixture *f = data;
TEAR_DOWN_CLUSTER();
free(f);
}

/******************************************************************************
*
* raft_voter_contacts
*
*****************************************************************************/

SUITE(raft_voter_contacts)

TEST(raft_voter_contacts, upToDate, setUp, tearDown, 0, NULL)
{
struct fixture *f = data;

for (unsigned id = 1; id <= N_SERVERS; id++) {
CLUSTER_SET_TERM(id, 1 /* term */);
CLUSTER_ADD_ENTRY(id, RAFT_CHANGE, N_SERVERS /* servers */,
N_SERVERS /* voters */);
CLUSTER_START(id);
}

/* No communication yet, so all counts should be 0 */
for (unsigned id = 1; id <= N_SERVERS; id++) {
int count = raft_voter_contacts(CLUSTER_RAFT(id));
munit_assert_int(count, ==, -1);
}

CLUSTER_TRACE(
"[ 0] 1 > term 1, 1 entry (1^1)\n"
"[ 0] 2 > term 1, 1 entry (1^1)\n"
"[ 0] 3 > term 1, 1 entry (1^1)\n"
"[ 100] 1 > timeout as follower\n"
" convert to candidate, start election for term 2\n"
"[ 110] 2 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 110] 3 > recv request vote from server 1\n"
" remote term is higher (2 vs 1) -> bump term\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 120] 1 > recv request vote result from server 2\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" probe server 2 sending a heartbeat (no entries)\n"
" probe server 3 sending a heartbeat (no entries)\n"
"[ 120] 1 > recv request vote result from server 3\n"
" local server is leader -> ignore\n"
"[ 130] 2 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 130] 3 > recv append entries from server 1\n"
" no new entries to persist\n"
"[ 140] 1 > recv append entries result from server 2\n"
"[ 140] 1 > recv append entries result from server 3\n"
"[ 170] 1 > timeout as leader\n"
" pipeline server 2 sending a heartbeat (no entries)\n"
" pipeline server 3 sending a heartbeat (no entries)\n");

/* Cluster election has succeeded and heartbeats have been sent, so the
* leader should know about all of the followers. */
for (unsigned id = 1; id <= N_SERVERS; id++) {
int count = raft_voter_contacts(CLUSTER_RAFT(id));
if (id == 1 /* leader */) {
munit_assert_int(count, ==, N_SERVERS);
} else {
munit_assert_int(count, ==, -1);
}
}

/* Stop the cluster leader, so a new leader is elected and the number of
* voters should be decreased */
CLUSTER_STOP(1);

/* Run until a leader is elected */
CLUSTER_TRACE(
"[ 260] 2 > timeout as follower\n"
" convert to candidate, start election for term 3\n"
"[ 270] 3 > recv request vote from server 2\n"
" local server has a leader (server 1) -> reject\n"
"[ 280] 2 > recv request vote result from server 3\n"
" remote term is lower (2 vs 3) -> ignore\n"
"[ 290] 3 > timeout as follower\n"
" convert to candidate, start election for term 3\n"
"[ 300] 2 > recv request vote from server 3\n"
" already voted for server 2 -> don't grant vote\n"
"[ 310] 3 > recv request vote result from server 2\n"
" vote not granted\n"
"[ 390] 2 > timeout as candidate\n"
" stay candidate, start election for term 4\n"
"[ 400] 3 > recv request vote from server 2\n"
" remote term is higher (4 vs 3) -> bump term, step down\n"
" remote log is equal (1^1) -> grant vote\n"
"[ 410] 2 > recv request vote result from server 3\n"
" quorum reached with 2 votes out of 3 -> convert to leader\n"
" probe server 1 sending a heartbeat (no entries)\n"
" probe server 3 sending a heartbeat (no entries)\n");

/* Leader hasn't had a timeout yet, so it hasn't calculated the voter count
* yet. This could potentially be improved, but it's a very small period of
* time where the incorrect value will be available. */
for (unsigned id = 1; id < N_SERVERS; id++) {
int count = raft_voter_contacts(CLUSTER_RAFT(id));
if (id == 2 /* leader */) {
munit_assert_int(count, ==, 1);
} else {
munit_assert_int(count, ==, -1);
}
}

/* Continue until leader timeout */
CLUSTER_TRACE(
"[ 420] 3 > recv append entries from server 2\n"
" no new entries to persist\n"
"[ 430] 2 > recv append entries result from server 3\n"
"[ 460] 2 > timeout as leader\n"
" probe server 1 sending a heartbeat (no entries)\n"
" pipeline server 3 sending a heartbeat (no entries)\n");

for (unsigned id = 1; id < N_SERVERS; id++) {
int count = raft_voter_contacts(CLUSTER_RAFT(id));
if (id == 2 /* leader */) {
munit_assert_int(count, ==, N_SERVERS-1);
} else {
munit_assert_int(count, ==, -1);
}
}

/* Move on by 10ms to make the node 1 restart distinct in time from the
* previous trace */
CLUSTER_ELAPSE(10);

/* Revive the old leader, so the count should go back up */
CLUSTER_START(1);

CLUSTER_TRACE(
"[ 470] 1 > term 2, voted for 1, 1 entry (1^1)\n"
"[ 470] 1 > recv append entries from server 2\n"
" remote term is higher (4 vs 2) -> bump term\n"
" no new entries to persist\n"
"[ 470] 3 > recv append entries from server 2\n"
" no new entries to persist\n"
"[ 480] 2 > recv append entries result from server 1\n"
"[ 480] 2 > recv append entries result from server 3\n"
"[ 510] 2 > timeout as leader\n"
" pipeline server 1 sending a heartbeat (no entries)\n"
" pipeline server 3 sending a heartbeat (no entries)\n");

for (unsigned id = 1; id < N_SERVERS; id++) {
int count = raft_voter_contacts(CLUSTER_RAFT(id));
if (id == 2 /* leader */) {
munit_assert_int(count, ==, N_SERVERS);
} else {
munit_assert_int(count, ==, -1);
}
}

return MUNIT_OK;
}

0 comments on commit 28c318c

Please sign in to comment.