From 43fab28bf074908655bcfd16186606eba079a774 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 27 Mar 2024 10:55:12 +0000 Subject: [PATCH 1/3] Use selective receive to allow for vnode to be blocked Block and unblock a vnode when triggering a repair. Otherwise their is a potential race condition: - aae_controller is triggered into repair mode, queueing all updates to be re-applied after rebuild complete - update is applied to leveled backend by vnode - snapshot is taken for rebuild (including update) - message is cast to aae_controller with update ... which will be queued and applied twice - update is queue --- src/riak_kv_vnode.erl | 46 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/src/riak_kv_vnode.erl b/src/riak_kv_vnode.erl index 491421475..3fd781cca 100644 --- a/src/riak_kv_vnode.erl +++ b/src/riak_kv_vnode.erl @@ -239,6 +239,8 @@ -define(REAPER_BATCH_SIZE, 1024). -define(ERASER_BATCH_SIZE, 1024). +-define(INIT_REBUILD_BLOCKTIME, 1000). + %% Erlang's if Bool -> thing; true -> thang end. syntax hurts my %% brain. It scans as if true -> thing; true -> thang end. So, here is %% a macro, ?ELSE to use in if statements. You're welcome. @@ -438,21 +440,30 @@ queue_tictactreerebuild(AAECntrl, Partition, OnlyIfBroken, State) -> fun() -> ?LOG_INFO("Starting tree rebuild for partition=~w", [Partition]), SW = os:timestamp(), - case when_loading_complete(AAECntrl, - Preflists, - fun preflistfun/2, - OnlyIfBroken) of + BlockRequest = self(), + BlockTimeMS = ?INIT_REBUILD_BLOCKTIME, + {blocked, VnodePid} = + riak_core_vnode_master:sync_command( + {Partition, node()}, + {block_vnode, BlockRequest, BlockTimeMS}, + riak_kv_vnode_master, + infinity), + case when_loading_complete( + AAECntrl, Preflists, fun preflistfun/2, OnlyIfBroken) of {ok, StoreFold, FinishFun} -> + VnodePid ! {release_vnode, BlockRequest}, Output = StoreFold(), FinishFun(Output), Duration = timer:now_diff(os:timestamp(), SW) div (1000 * 1000), - ?LOG_INFO("Tree rebuild complete for partition=~w" ++ - " in duration=~w seconds", - [Partition, Duration]); + ?LOG_INFO( + "Tree rebuild complete for partition=~w" + " in duration=~w seconds", + [Partition, Duration]); skipped -> - ?LOG_INFO("Tree rebuild skipped for partition=~w", - [Partition]) + VnodePid ! {release_vnode, BlockRequest}, + ?LOG_INFO( + "Tree rebuild skipped for partition=~w", [Partition]) end, ok end, @@ -1484,6 +1495,23 @@ handle_command({reset_hashtree_tokens, MinToken, MaxToken}, _Sender, State) -> end, {reply, ok, State}; +handle_command({block_vnode, BlockRequest, BlockTimeMS}, Sender, State) -> + riak_core_vnode:reply(Sender, {blocked, self()}), + receive + {release_vnode, BlockRequest} -> + ?LOG_INFO( + "Vnode block released for ~w request ~w", + [Sender, BlockRequest]), + {noreply, State} + after + BlockTimeMS -> + ?LOG_WARNING( + "Vnode block request timed out after ~w for ~w request ~w", + [BlockTimeMS, Sender, BlockRequest] + ), + {noreply, State} + end; + handle_command(Req, Sender, State) -> handle_request(riak_kv_requests:request_type(Req), Req, Sender, State). From eeba0adadd166b29157b286d730d682a18575f2b Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Wed, 27 Mar 2024 12:15:26 +0000 Subject: [PATCH 2/3] Handle message outside of selective receive If the selective receive times out - then need to handle the release message --- src/riak_kv_vnode.erl | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/riak_kv_vnode.erl b/src/riak_kv_vnode.erl index 3fd781cca..8d44668aa 100644 --- a/src/riak_kv_vnode.erl +++ b/src/riak_kv_vnode.erl @@ -1497,11 +1497,14 @@ handle_command({reset_hashtree_tokens, MinToken, MaxToken}, _Sender, State) -> handle_command({block_vnode, BlockRequest, BlockTimeMS}, Sender, State) -> riak_core_vnode:reply(Sender, {blocked, self()}), + SW = os:timestamp(), receive {release_vnode, BlockRequest} -> + LockedTime = timer:now_diff(os:timestamp(), SW), ?LOG_INFO( - "Vnode block released for ~w request ~w", - [Sender, BlockRequest]), + "Vnode block released for ~w request ~w after" + "block_time=~w microseconds", + [Sender, BlockRequest, LockedTime]), {noreply, State} after BlockTimeMS -> @@ -2665,6 +2668,12 @@ handle_info({aae_pong, QueueTime}, State) -> ok end, {ok, State}; +handle_info({release_vnode, BlockRequest}, State) -> + ?LOG_WARNING( + "Vnode block release request ~w received outside of block", + [BlockRequest] + ), + {ok, State}; handle_info({Ref, ok}, State) -> ?LOG_INFO("Ignoring ok returned after timeout for Ref ~p", [Ref]), {ok, State}. From 71a7c5e34d85c934f6485014569d87272f5e7b67 Mon Sep 17 00:00:00 2001 From: Martin Sumner Date: Thu, 4 Apr 2024 12:39:00 +0100 Subject: [PATCH 3/3] Add configurable block time --- priv/riak_kv.schema | 18 ++++++++++++++++-- src/riak_kv_vnode.erl | 7 ++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/priv/riak_kv.schema b/priv/riak_kv.schema index ebb1d57e9..940638a12 100644 --- a/priv/riak_kv.schema +++ b/priv/riak_kv.schema @@ -147,7 +147,7 @@ {commented, disabled} ]}. -%% @doc Frequency to prompt exchange per vnode +%% @doc Frequency to prompt exchange per vnode (milliseconds) %% The number of milliseconds which the vnode must wait between self-pokes to %% maybe prompt the next exchange. Default is 8 minutes - check all partitions %% when n=3 once every hour (in each direction). A cycle of exchanges will @@ -161,7 +161,7 @@ hidden ]}. -%% @doc Frequency to prompt rebuild check per vnode +%% @doc Frequency to prompt rebuild check per vnode (milliseconds) %% The number of milliseconds which the vnode must wait between self-pokes to %% maybe prompt the next rebuild. Default is 60 minutes. %% When a node is being re-introduced to a cluster following a long delay, then @@ -173,6 +173,20 @@ hidden ]}. +%% @doc Block vnode for tree rebuild (milliseconds) +%% When rebuilding a vnode's tree cache, the vnode can be blocked while the +%% snapshot is taken - to eliminate race conditions that might otherwise cause +%% a segment to be miscalculated. By default the vnode block is always +%% released within 1000ms i.e. should a process crash or there be a delay in +%% the expected release, the block will still be released after this delay, and +%% the risk of miscalculation will be accepted in preference to having a longer +%% block +{mapping, "tictacaae_rebuild_blocktime", "riak_kv.tictacaae_rebuild_blocktime", [ + {datatype, integer}, + {default, 1000}, + hidden +]}. + %% @doc Max number of leaf IDs per exchange %% To control the length of time for each exchange, only a subset of the %% conflicting leaves will be compared on each exchange. If there are issues diff --git a/src/riak_kv_vnode.erl b/src/riak_kv_vnode.erl index 8d44668aa..09e04f27f 100644 --- a/src/riak_kv_vnode.erl +++ b/src/riak_kv_vnode.erl @@ -441,7 +441,12 @@ queue_tictactreerebuild(AAECntrl, Partition, OnlyIfBroken, State) -> ?LOG_INFO("Starting tree rebuild for partition=~w", [Partition]), SW = os:timestamp(), BlockRequest = self(), - BlockTimeMS = ?INIT_REBUILD_BLOCKTIME, + BlockTimeMS = + application:get_env( + riak_kv, + tictacaae_rebuild_blocktime, + ?INIT_REBUILD_BLOCKTIME + ), {blocked, VnodePid} = riak_core_vnode_master:sync_command( {Partition, node()},