From fee99bc918e21189742ebcbdb6fb51ef8ad39a3c Mon Sep 17 00:00:00 2001 From: Hao Geng Date: Wed, 31 May 2023 10:15:04 -0700 Subject: [PATCH 1/6] Remove StopReplica request during controlled shutdown --- .../main/scala/kafka/controller/KafkaController.scala | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/core/src/main/scala/kafka/controller/KafkaController.scala b/core/src/main/scala/kafka/controller/KafkaController.scala index 6e9b1c19ecfa6..bea51d2d213db 100644 --- a/core/src/main/scala/kafka/controller/KafkaController.scala +++ b/core/src/main/scala/kafka/controller/KafkaController.scala @@ -1759,16 +1759,7 @@ class KafkaController(val config: KafkaConfig, controllerContext.partitionLeadershipInfo(partition).get.leaderAndIsr.leader == id } partitionStateMachine.handleStateChanges(partitionsLedByBroker.toSeq, OnlinePartition, Some(ControlledShutdownPartitionLeaderElectionStrategy)) - try { - brokerRequestBatch.newBatch() - partitionsFollowedByBroker.foreach { partition => - brokerRequestBatch.addStopReplicaRequestForBrokers(Seq(id), partition, deletePartition = false) - } - brokerRequestBatch.sendRequestsToBrokers(epoch) - } catch { - case e: IllegalStateException => - handleIllegalState(e) - } + // If the broker is a follower, updates the isr in ZK and notifies the current leader replicaStateMachine.handleStateChanges(partitionsFollowedByBroker.map(partition => PartitionAndReplica(partition, id)).toSeq, OfflineReplica) From cf23e7d866a4a7d416c4867c51a2acb6b84e6667 Mon Sep 17 00:00:00 2001 From: Hao Geng Date: Thu, 1 Jun 2023 17:48:01 -0700 Subject: [PATCH 2/6] Add BlockFollowerFromAddingBack to LeaderAndIsrPartitionState --- .../main/resources/common/message/LeaderAndIsrRequest.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clients/src/main/resources/common/message/LeaderAndIsrRequest.json b/clients/src/main/resources/common/message/LeaderAndIsrRequest.json index 8febea0fab937..fa62e456bf9bb 100644 --- a/clients/src/main/resources/common/message/LeaderAndIsrRequest.json +++ b/clients/src/main/resources/common/message/LeaderAndIsrRequest.json @@ -89,7 +89,9 @@ { "name": "RemovingReplicas", "type": "[]int32", "versions": "4+", "ignorable": true, "entityType": "brokerId", "about": "The replica IDs that we are removing this partition from, or null if no replicas are being removed." }, { "name": "IsNew", "type": "bool", "versions": "1+", "default": "false", "ignorable": true, - "about": "Whether the replica should have existed on the broker or not." } + "about": "Whether the replica should have existed on the broker or not." }, + { "name": "BlockFollowerFromAddingBack", "type": "bool", "tag": 0, "taggedVersions": "6+", "versions": "6+", "default": "false", + "about": "Whether the leader should block follower being adding back in a short period."} ]} ] } From 09dcb694af01074c9a336dede86c185a6c492a9b Mon Sep 17 00:00:00 2001 From: Hao Geng Date: Thu, 1 Jun 2023 17:55:27 -0700 Subject: [PATCH 3/6] Honor the blockFollowerFromAddingBack flag in LAIR --- .../main/scala/kafka/cluster/Partition.scala | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/kafka/cluster/Partition.scala b/core/src/main/scala/kafka/cluster/Partition.scala index 864410dccae21..76d5a64dabaac 100755 --- a/core/src/main/scala/kafka/cluster/Partition.scala +++ b/core/src/main/scala/kafka/cluster/Partition.scala @@ -42,7 +42,7 @@ import org.apache.kafka.common.requests.OffsetsForLeaderEpochResponse.{UNDEFINED import org.apache.kafka.common.utils.Time import org.apache.kafka.common.{IsolationLevel, TopicPartition, Uuid} -import scala.collection.{Map, Seq} +import scala.collection.{Map, Seq, mutable} import scala.jdk.CollectionConverters._ trait IsrChangeListener { @@ -275,6 +275,9 @@ class Partition(val topicPartition: TopicPartition, newGauge("LastStableOffsetLag", () => log.map(_.lastStableOffsetLag).getOrElse(0), tags) ISR_STATES_TO_CREATE_METRICS.foreach(c => newGauge(s"${c.getSimpleName}", () => if (isrStateClass.equals(c)) 1 else 0, tags)) + private val expandIsrLocks: mutable.Map[Int, Long] = mutable.Map.empty[Int, Long] + private val expandIsrLockTime: Long = 60000 // Block replica to be added back to ISR in 60 seconds + def isrStateClass: Class[_ <: IsrState] = isrState.getClass def isUnderReplicated: Boolean = isLeader && (assignmentState.replicationFactor - isrState.isr.size) > 0 @@ -561,7 +564,8 @@ class Partition(val topicPartition: TopicPartition, assignment = partitionState.replicas.asScala.map(_.toInt), isr = isr, addingReplicas = addingReplicas, - removingReplicas = removingReplicas + removingReplicas = removingReplicas, + blockFollowerFromAddingBack = partitionState.blockFollowerFromAddingBack() ) try { createLogIfNotExists(partitionState.isNew, isFutureReplica = false, highWatermarkCheckpoints, topicId) @@ -745,7 +749,8 @@ class Partition(val topicPartition: TopicPartition, def updateAssignmentAndIsr(assignment: Seq[Int], isr: Set[Int], addingReplicas: Seq[Int], - removingReplicas: Seq[Int]): Unit = { + removingReplicas: Seq[Int], + blockFollowerFromAddingBack: Boolean = false): Unit = { val newRemoteReplicas = assignment.filter(_ != localBrokerId) val removedReplicas = remoteReplicasMap.keys.filter(!newRemoteReplicas.contains(_)) @@ -758,9 +763,16 @@ class Partition(val topicPartition: TopicPartition, assignmentState = OngoingReassignmentState(addingReplicas, removingReplicas, assignment) else assignmentState = SimpleAssignmentState(assignment) + + blockFollowersFromAddingBackToIsr(isr) isrState = CommittedIsr(isr) } + def blockFollowersFromAddingBackToIsr(isr: Set[Int]): Unit = { + val followersKickedOutOfIsr = isrState.isr.diff(isr) + followersKickedOutOfIsr.foreach(replica => expandIsrLocks(replica) = System.currentTimeMillis()) + } + /** * Check and maybe expand the ISR of the partition. * A replica will be added to ISR if its LEO >= current hw of the partition and it is caught up to @@ -1379,6 +1391,14 @@ class Partition(val topicPartition: TopicPartition, } private[cluster] def expandIsr(newInSyncReplica: Int): Unit = { + if (expandIsrLocks.contains(newInSyncReplica)) { + if (expandIsrLocks(newInSyncReplica) + expandIsrLockTime < System.currentTimeMillis()) { + trace(s"Avoiding adding $newInSyncReplica to isr as it is locked since $expandIsrLocks(newInSyncReplica)") + } else { + expandIsrLocks.remove(newInSyncReplica) + } + } + // This is called from maybeExpandIsr which holds the ISR write lock if (!isrState.isInflight) { // When expanding the ISR, we can safely assume the new replica will make it into the ISR since this puts us in From 7d61b4aa9499295e4d3d157e799bad0a7b6d2c5e Mon Sep 17 00:00:00 2001 From: Hao Geng Date: Thu, 1 Jun 2023 21:38:05 -0700 Subject: [PATCH 4/6] Set the blockFollowerFromAddingBack flag to true during controlled shutdown --- .../kafka/controller/ControllerChannelManager.scala | 11 +++++++---- .../scala/kafka/controller/ControllerContext.scala | 2 ++ .../main/scala/kafka/controller/KafkaController.scala | 7 +++++-- .../scala/kafka/controller/ReplicaStateMachine.scala | 10 +++++++--- .../kafka/controller/ReplicaStateMachineTest.scala | 9 ++++++--- 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/kafka/controller/ControllerChannelManager.scala b/core/src/main/scala/kafka/controller/ControllerChannelManager.scala index 6923d618e98c6..dc8cf457754b8 100755 --- a/core/src/main/scala/kafka/controller/ControllerChannelManager.scala +++ b/core/src/main/scala/kafka/controller/ControllerChannelManager.scala @@ -553,8 +553,9 @@ abstract class AbstractControllerBrokerRequestBatch(config: KafkaConfig, topicPartition: TopicPartition, leaderIsrAndControllerEpoch: LeaderIsrAndControllerEpoch, replicaAssignment: ReplicaAssignment, - isNew: Boolean): Unit = { - addLeaderAndIsrRequestForBrokers(brokerIds, topicPartition, leaderIsrAndControllerEpoch, replicaAssignment, isNew, ControllerContextSnapshot(controllerContext)) + isNew: Boolean, + blockFollowerFromAddingBack: Boolean = false): Unit = { + addLeaderAndIsrRequestForBrokers(brokerIds, topicPartition, leaderIsrAndControllerEpoch, replicaAssignment, isNew, ControllerContextSnapshot(controllerContext), blockFollowerFromAddingBack) } def addLeaderAndIsrRequestForBrokers(brokerIds: Seq[Int], @@ -562,7 +563,8 @@ abstract class AbstractControllerBrokerRequestBatch(config: KafkaConfig, leaderIsrAndControllerEpoch: LeaderIsrAndControllerEpoch, replicaAssignment: ReplicaAssignment, isNew: Boolean, - controllerContextSnapshot: ControllerContextSnapshot): Unit = { + controllerContextSnapshot: ControllerContextSnapshot, + blockFollowerFromAddingBack: Boolean): Unit = { brokerIds.filter(_ >= 0).foreach { brokerId => val result = leaderAndIsrRequestMap.getOrElseUpdate(brokerId, mutable.Map.empty) @@ -579,7 +581,8 @@ abstract class AbstractControllerBrokerRequestBatch(config: KafkaConfig, .setReplicas(replicaAssignment.replicas.map(Integer.valueOf).asJava) .setAddingReplicas(replicaAssignment.addingReplicas.map(Integer.valueOf).asJava) .setRemovingReplicas(replicaAssignment.removingReplicas.map(Integer.valueOf).asJava) - .setIsNew(isNew || alreadyNew)) + .setIsNew(isNew || alreadyNew) + .setBlockFollowerFromAddingBack(blockFollowerFromAddingBack)) } addUpdateMetadataRequestForBrokers(controllerContextSnapshot.liveOrShuttingDownBrokerIds.toSeq, Set(topicPartition), diff --git a/core/src/main/scala/kafka/controller/ControllerContext.scala b/core/src/main/scala/kafka/controller/ControllerContext.scala index 73790869be8c3..4ebe65b081e4e 100644 --- a/core/src/main/scala/kafka/controller/ControllerContext.scala +++ b/core/src/main/scala/kafka/controller/ControllerContext.scala @@ -104,6 +104,8 @@ class ControllerContext { val topicsToBeDeleted = mutable.Set.empty[String] + val replicasBeingShutdown = mutable.Set.empty[PartitionAndReplica] + /** The following topicsWithDeletionStarted variable is used to properly update the offlinePartitionCount metric. * When a topic is going through deletion, we don't want to keep track of its partition state * changes in the offlinePartitionCount metric. This goal means if some partitions of a topic are already diff --git a/core/src/main/scala/kafka/controller/KafkaController.scala b/core/src/main/scala/kafka/controller/KafkaController.scala index bea51d2d213db..ac88a63ace42d 100644 --- a/core/src/main/scala/kafka/controller/KafkaController.scala +++ b/core/src/main/scala/kafka/controller/KafkaController.scala @@ -1760,9 +1760,12 @@ class KafkaController(val config: KafkaConfig, } partitionStateMachine.handleStateChanges(partitionsLedByBroker.toSeq, OnlinePartition, Some(ControlledShutdownPartitionLeaderElectionStrategy)) + // If the broker is a follower, updates the isr in ZK and notifies the current leader - replicaStateMachine.handleStateChanges(partitionsFollowedByBroker.map(partition => - PartitionAndReplica(partition, id)).toSeq, OfflineReplica) + val followerReplicas = partitionsFollowedByBroker.map(partition => PartitionAndReplica(partition, id)).toSeq + controllerContext.replicasBeingShutdown ++= followerReplicas + replicaStateMachine.handleStateChanges(followerReplicas, OfflineReplica) + controllerContext.replicasBeingShutdown --= followerReplicas trace(s"All leaders = ${controllerContext.partitionsLeadershipInfo.mkString(",")}") if (shouldSkipShutdownSafetyCheck) { // When skipping shutdown safety check, we allow the broker to shutdown even though it may be the leader for some partitions. diff --git a/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala b/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala index 8909e75de36e6..6f8bfc9982f34 100644 --- a/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala +++ b/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala @@ -260,7 +260,8 @@ class ZkReplicaStateMachine(config: KafkaConfig, leaderIsrAndControllerEpoch, controllerContext.partitionFullReplicaAssignment(partition), isNew = false, - controllerContextSnapshot) + controllerContextSnapshot, + false) case None => } timingsOpt.foreach { timings => @@ -279,7 +280,8 @@ class ZkReplicaStateMachine(config: KafkaConfig, } } case OfflineReplica => - validReplicas.foreach { replica => + // Should not send StopReplicaRequest to brokers being controlled shutdown + validReplicas.filter {replica => !controllerContext.replicasBeingShutdown.contains(replica)}.foreach { replica => controllerBrokerRequestBatch.addStopReplicaRequestForBrokers(Seq(replicaId), replica.topicPartition, deletePartition = false) } val (replicasWithLeadershipInfo, replicasWithoutLeadershipInfo) = validReplicas.partition { replica => @@ -294,7 +296,9 @@ class ZkReplicaStateMachine(config: KafkaConfig, controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(recipients, partition, leaderIsrAndControllerEpoch, - controllerContext.partitionFullReplicaAssignment(partition), isNew = false) + controllerContext.partitionFullReplicaAssignment(partition), + isNew = false, + controllerContext.replicasBeingShutdown.contains(PartitionAndReplica(partition, replicaId))) } val replica = PartitionAndReplica(partition, replicaId) val currentState = controllerContext.replicaState(replica) diff --git a/core/src/test/scala/unit/kafka/controller/ReplicaStateMachineTest.scala b/core/src/test/scala/unit/kafka/controller/ReplicaStateMachineTest.scala index cf3cd6948c74e..57e19b45e7274 100644 --- a/core/src/test/scala/unit/kafka/controller/ReplicaStateMachineTest.scala +++ b/core/src/test/scala/unit/kafka/controller/ReplicaStateMachineTest.scala @@ -195,7 +195,8 @@ class ReplicaStateMachineTest { controllerContext.putPartitionLeadershipInfo(partition, leaderIsrAndControllerEpoch) EasyMock.expect(mockControllerBrokerRequestBatch.newBatch()) EasyMock.expect(mockControllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(Seq(brokerId), - partition, leaderIsrAndControllerEpoch, replicaAssignment(Seq(brokerId)), isNew = false, controllerContextSnapshot)) + partition, leaderIsrAndControllerEpoch, replicaAssignment(Seq(brokerId)), isNew = false, controllerContextSnapshot, + blockFollowerFromAddingBack = false)) EasyMock.expect(mockControllerBrokerRequestBatch.sendRequestsToBrokers(controllerEpoch)) EasyMock.replay(mockZkClient, mockControllerBrokerRequestBatch) replicaStateMachine.handleStateChanges(replicas, OnlineReplica) @@ -264,7 +265,8 @@ class ReplicaStateMachineTest { controllerContext.putPartitionLeadershipInfo(partition, leaderIsrAndControllerEpoch) EasyMock.expect(mockControllerBrokerRequestBatch.newBatch()) EasyMock.expect(mockControllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(Seq(brokerId), - partition, leaderIsrAndControllerEpoch, replicaAssignment(Seq(brokerId)), isNew = false, controllerContextSnapshot)) + partition, leaderIsrAndControllerEpoch, replicaAssignment(Seq(brokerId)), isNew = false, controllerContextSnapshot, + blockFollowerFromAddingBack = false)) EasyMock.expect(mockControllerBrokerRequestBatch.sendRequestsToBrokers(controllerEpoch)) EasyMock.replay(mockZkClient, mockControllerBrokerRequestBatch) replicaStateMachine.handleStateChanges(replicas, OnlineReplica) @@ -378,7 +380,8 @@ class ReplicaStateMachineTest { controllerContext.putPartitionLeadershipInfo(partition, leaderIsrAndControllerEpoch) EasyMock.expect(mockControllerBrokerRequestBatch.newBatch()) EasyMock.expect(mockControllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(Seq(brokerId), - partition, leaderIsrAndControllerEpoch, replicaAssignment(Seq(brokerId)), isNew = false, controllerContextSnapshot)) + partition, leaderIsrAndControllerEpoch, replicaAssignment(Seq(brokerId)), isNew = false, controllerContextSnapshot, + blockFollowerFromAddingBack = false)) EasyMock.expect(mockControllerBrokerRequestBatch.sendRequestsToBrokers(controllerEpoch)) EasyMock.replay(mockZkClient, mockControllerBrokerRequestBatch) replicaStateMachine.handleStateChanges(replicas, OnlineReplica) From 6753a376275e6d996d25662d66e812bb75db4a32 Mon Sep 17 00:00:00 2001 From: Hao Geng Date: Thu, 1 Jun 2023 21:40:03 -0700 Subject: [PATCH 5/6] Wait for some period after fire the ControlledShutdown request to the controller --- core/src/main/scala/kafka/server/KafkaServer.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/src/main/scala/kafka/server/KafkaServer.scala b/core/src/main/scala/kafka/server/KafkaServer.scala index a6ce3adf306d2..9c3a8ee544fcc 100755 --- a/core/src/main/scala/kafka/server/KafkaServer.scala +++ b/core/src/main/scala/kafka/server/KafkaServer.scala @@ -787,6 +787,9 @@ class KafkaServer( CoreUtils.swallow(controlledShutdown(), this) _brokerState = BrokerState.SHUTTING_DOWN + // This delay is to wait LeaderAndIsrRequest to remove the followers on this broker from their ISRs. + Thread.sleep(60000) + if (healthCheckScheduler != null) healthCheckScheduler.shutdown() From 720360ce255144ca863ce3c3ab7626b2aad6c851 Mon Sep 17 00:00:00 2001 From: Hao Geng Date: Wed, 2 Aug 2023 09:46:59 -0700 Subject: [PATCH 6/6] POC code for reduce produce latency --- .../common/utils/LiCombinedControlTransformer.java | 6 ++++-- .../common/message/LiCombinedControlRequest.json | 4 +++- core/src/main/scala/kafka/cluster/Partition.scala | 14 +++++++++++--- .../controller/ControllerChannelManager.scala | 13 ++++++++++++- .../scala/kafka/controller/KafkaController.scala | 6 +++++- .../kafka/controller/ReplicaStateMachine.scala | 6 ++++-- .../main/scala/kafka/server/DelayedProduce.scala | 11 ++++++++++- core/src/main/scala/kafka/server/KafkaApis.scala | 6 ++++++ core/src/main/scala/kafka/server/KafkaServer.scala | 11 +++++++---- 9 files changed, 62 insertions(+), 15 deletions(-) diff --git a/clients/src/main/java/org/apache/kafka/common/utils/LiCombinedControlTransformer.java b/clients/src/main/java/org/apache/kafka/common/utils/LiCombinedControlTransformer.java index be54b53380775..382265206ff9f 100644 --- a/clients/src/main/java/org/apache/kafka/common/utils/LiCombinedControlTransformer.java +++ b/clients/src/main/java/org/apache/kafka/common/utils/LiCombinedControlTransformer.java @@ -56,7 +56,8 @@ public static LiCombinedControlRequestData.LeaderAndIsrPartitionState transformL .setReplicas(partitionState.replicas()) .setAddingReplicas(partitionState.addingReplicas()) .setRemovingReplicas(partitionState.removingReplicas()) - .setIsNew(partitionState.isNew()); + .setIsNew(partitionState.isNew()) + .setBlockFollowerFromAddingBack(partitionState.blockFollowerFromAddingBack()); } public static LeaderAndIsrRequestData.LeaderAndIsrPartitionState restoreLeaderAndIsrPartition( @@ -71,7 +72,8 @@ public static LeaderAndIsrRequestData.LeaderAndIsrPartitionState restoreLeaderAn .setReplicas(partitionState.replicas()) .setAddingReplicas(partitionState.addingReplicas()) .setRemovingReplicas(partitionState.removingReplicas()) - .setIsNew(partitionState.isNew()); + .setIsNew(partitionState.isNew()) + .setBlockFollowerFromAddingBack(partitionState.blockFollowerFromAddingBack()); } public static LiCombinedControlRequestData.UpdateMetadataPartitionState transformUpdateMetadataPartition( diff --git a/clients/src/main/resources/common/message/LiCombinedControlRequest.json b/clients/src/main/resources/common/message/LiCombinedControlRequest.json index 015f8a93163be..079846cc7fa99 100644 --- a/clients/src/main/resources/common/message/LiCombinedControlRequest.json +++ b/clients/src/main/resources/common/message/LiCombinedControlRequest.json @@ -127,7 +127,9 @@ { "name": "RemovingReplicas", "type": "[]int32", "versions": "0+", "ignorable": true, "about": "The replica IDs that we are removing this partition from, or null if no replicas are being removed." }, { "name": "IsNew", "type": "bool", "versions": "0+", "default": "false", "ignorable": true, - "about": "Whether the replica should have existed on the broker or not." } + "about": "Whether the replica should have existed on the broker or not." }, + { "name": "BlockFollowerFromAddingBack", "type": "bool", "tag": 0, "taggedVersions": "1+", "versions": "1+", "default": "false", + "about": "Whether the leader should block follower being adding back in a short period."} ]}, { "name": "UpdateMetadataPartitionState", "versions": "0+", "fields": [ { "name": "TopicName", "type": "string", "versions": "0", "entityType": "topicName", "ignorable": true, diff --git a/core/src/main/scala/kafka/cluster/Partition.scala b/core/src/main/scala/kafka/cluster/Partition.scala index 76d5a64dabaac..ac6212c08963f 100755 --- a/core/src/main/scala/kafka/cluster/Partition.scala +++ b/core/src/main/scala/kafka/cluster/Partition.scala @@ -552,6 +552,8 @@ class Partition(val topicPartition: TopicPartition, highWatermarkCheckpoints: OffsetCheckpoints, topicId: Option[Uuid]): Boolean = { val (leaderHWIncremented, isNewLeader) = inWriteLock(leaderIsrUpdateLock) { + info(s"Making leader to partition $partitionState") + // record the epoch of the controller that made the leadership decision. This is useful while updating the isr // to maintain the decision maker controller's epoch in the zookeeper path controllerEpoch = partitionState.controllerEpoch @@ -764,7 +766,10 @@ class Partition(val topicPartition: TopicPartition, else assignmentState = SimpleAssignmentState(assignment) - blockFollowersFromAddingBackToIsr(isr) + if (blockFollowerFromAddingBack) { + info(s"hgeng: block follower from adding to isr: $isr") + blockFollowersFromAddingBackToIsr(isr) + } isrState = CommittedIsr(isr) } @@ -1392,9 +1397,12 @@ class Partition(val topicPartition: TopicPartition, private[cluster] def expandIsr(newInSyncReplica: Int): Unit = { if (expandIsrLocks.contains(newInSyncReplica)) { - if (expandIsrLocks(newInSyncReplica) + expandIsrLockTime < System.currentTimeMillis()) { - trace(s"Avoiding adding $newInSyncReplica to isr as it is locked since $expandIsrLocks(newInSyncReplica)") + if (System.currentTimeMillis() < expandIsrLocks(newInSyncReplica) + expandIsrLockTime) { + info(s"hgeng: Avoid adding $newInSyncReplica to isr as it is locked since ${expandIsrLocks(newInSyncReplica)}") + return } else { + info(s"hgeng: Remove $newInSyncReplica from expandIsrLock map. Current time ${System.currentTimeMillis()}" + + s", lock time ${expandIsrLocks(newInSyncReplica)}") expandIsrLocks.remove(newInSyncReplica) } } diff --git a/core/src/main/scala/kafka/controller/ControllerChannelManager.scala b/core/src/main/scala/kafka/controller/ControllerChannelManager.scala index dc8cf457754b8..d0c388f2b5bb7 100755 --- a/core/src/main/scala/kafka/controller/ControllerChannelManager.scala +++ b/core/src/main/scala/kafka/controller/ControllerChannelManager.scala @@ -664,6 +664,8 @@ abstract class AbstractControllerBrokerRequestBatch(config: KafkaConfig, else if (config.interBrokerProtocolVersion >= KAFKA_1_0_IV0) 1 else 0 + var blockFollowerAddingBack = false + val maxBrokerEpoch = controllerContext.maxBrokerEpoch leaderAndIsrRequestMap.forKeyValue { (broker, leaderAndIsrPartitionStates) => if (controllerContext.liveOrShuttingDownBrokerIds.contains(broker)) { @@ -677,8 +679,14 @@ abstract class AbstractControllerBrokerRequestBatch(config: KafkaConfig, } else { "become-follower" } + if (state.blockFollowerFromAddingBack()) { + blockFollowerAddingBack = true + info("hgeng:Blocking follower from adding back is true") + } + if (stateChangeLog.isTraceEnabled) - stateChangeLog.trace(s"Sending $typeOfRequest LeaderAndIsr request $state to broker $broker for partition $topicPartition") + stateChangeLog.trace(s"hgeng: Sending $typeOfRequest LeaderAndIsr request $state to broker $broker for partition $topicPartition" + + s"blockFollowerAddingBack $blockFollowerAddingBack") } stateChangeLog.info(s"Sending LeaderAndIsr request to broker $broker with $numBecomeLeaders become-leader " + s"and ${leaderAndIsrPartitionStates.size - numBecomeLeaders} become-follower partitions") @@ -701,6 +709,9 @@ abstract class AbstractControllerBrokerRequestBatch(config: KafkaConfig, controllerEpoch, brokerEpoch, maxBrokerEpoch, leaderAndIsrPartitionStates.values.toBuffer.asJava, topicIds.asJava, leaders.asJava) } + if (blockFollowerAddingBack) { + info(s"hgeng: LeaderAndIsrRequestBuilder is $leaderAndIsrRequestBuilder") + } sendRequest(broker, leaderAndIsrRequestBuilder, (r: AbstractResponse) => { val leaderAndIsrResponse = r.asInstanceOf[LeaderAndIsrResponse] diff --git a/core/src/main/scala/kafka/controller/KafkaController.scala b/core/src/main/scala/kafka/controller/KafkaController.scala index ac88a63ace42d..4cd1c369aafa6 100644 --- a/core/src/main/scala/kafka/controller/KafkaController.scala +++ b/core/src/main/scala/kafka/controller/KafkaController.scala @@ -1745,7 +1745,7 @@ class KafkaController(val config: KafkaConfig, zkClient.recordBrokerShutdown(id, brokerEpoch, controllerContext.epochZkVersion) controllerContext.shuttingDownBrokerIds += (id -> brokerEpoch) - info(s"Shutting down broker $id") + info(s"hgeng:Shutting down broker $id") debug(s"All shutting down brokers: ${controllerContext.shuttingDownBrokerIds.mkString(",")}") debug(s"Live brokers: ${controllerContext.liveBrokerIds.mkString(",")}") @@ -1764,8 +1764,12 @@ class KafkaController(val config: KafkaConfig, // If the broker is a follower, updates the isr in ZK and notifies the current leader val followerReplicas = partitionsFollowedByBroker.map(partition => PartitionAndReplica(partition, id)).toSeq controllerContext.replicasBeingShutdown ++= followerReplicas + info(s"hgeng: replicas being shutdown: ${controllerContext.replicasBeingShutdown}") + info("hgeng: Handling state changes") replicaStateMachine.handleStateChanges(followerReplicas, OfflineReplica) + info("hgeng: finished handling changes") controllerContext.replicasBeingShutdown --= followerReplicas + info(s"hgeng: replicas being shutdown: ${controllerContext.replicasBeingShutdown}") trace(s"All leaders = ${controllerContext.partitionsLeadershipInfo.mkString(",")}") if (shouldSkipShutdownSafetyCheck) { // When skipping shutdown safety check, we allow the broker to shutdown even though it may be the leader for some partitions. diff --git a/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala b/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala index 6f8bfc9982f34..b246d395d5ba0 100644 --- a/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala +++ b/core/src/main/scala/kafka/controller/ReplicaStateMachine.scala @@ -289,10 +289,12 @@ class ZkReplicaStateMachine(config: KafkaConfig, } val updatedLeaderIsrAndControllerEpochs = removeReplicasFromIsr(replicaId, replicasWithLeadershipInfo.map(_.topicPartition)) updatedLeaderIsrAndControllerEpochs.forKeyValue { (partition, leaderIsrAndControllerEpoch) => - stateLogger.info(s"Partition $partition state changed to $leaderIsrAndControllerEpoch after removing replica $replicaId from the ISR as part of transition to $OfflineReplica") + stateLogger.info(s"hgeng: Partition $partition state changed to $leaderIsrAndControllerEpoch after removing replica $replicaId from the ISR as part of transition to $OfflineReplica") if (!controllerContext.isTopicQueuedUpForDeletion(partition.topic)) { - val recipients = controllerContext.partitionReplicaAssignment(partition).filterNot(_ == replicaId) + val recipients = controllerContext.partitionReplicaAssignment(partition) // possible PERF TODO? could add controllerContextSnapshot as 6th arg here, too: + stateLogger.info(s"hgeng: partition: $partition" + + s"replicaId: $replicaId, contains? ${controllerContext.replicasBeingShutdown.contains(PartitionAndReplica(partition, replicaId))}") controllerBrokerRequestBatch.addLeaderAndIsrRequestForBrokers(recipients, partition, leaderIsrAndControllerEpoch, diff --git a/core/src/main/scala/kafka/server/DelayedProduce.scala b/core/src/main/scala/kafka/server/DelayedProduce.scala index ced8d5f53a358..a35bfa356c918 100644 --- a/core/src/main/scala/kafka/server/DelayedProduce.scala +++ b/core/src/main/scala/kafka/server/DelayedProduce.scala @@ -54,7 +54,8 @@ class DelayedProduce(delayMs: Long, produceMetadata: ProduceMetadata, replicaManager: ReplicaManager, responseCallback: Map[TopicPartition, PartitionResponse] => Unit, - lockOpt: Option[Lock] = None) + lockOpt: Option[Lock] = None, + creationTime: Long = System.currentTimeMillis()) extends DelayedOperation(delayMs, lockOpt) { import DelayedOperation._ @@ -126,6 +127,14 @@ class DelayedProduce(delayMs: Long, * Upon completion, return the current response status along with the error code per partition */ override def onComplete(): Unit = { + val timeToComplete = System.currentTimeMillis() - creationTime + if (timeToComplete > 1000) { + info(s"Delayed request takes ${timeToComplete} to complete," + + s" partitions are ${produceMetadata.produceStatus.keysIterator.next()}") + info(s"Delayed request takes ${timeToComplete} to complete," + + s" partitions are ${produceMetadata.produceStatus.keySet}") + } + val responseStatus = produceMetadata.produceStatus.map { case (k, status) => k -> status.responseStatus } responseCallback(responseStatus) } diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index 2afd58cc88036..4a09218db62b8 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -289,6 +289,12 @@ class KafkaApis(val requestChannel: RequestChannel, } private def doHandleLeaderAndIsrRequest(correlationId: Int, leaderAndIsrRequest: LeaderAndIsrRequest): LeaderAndIsrResponse = { + + leaderAndIsrRequest.partitionStates().forEach( state => + info(s"hgeng: Handling LAIR request partition state ${state}") + ) + + replicaManager.becomeLeaderOrFollower(correlationId, leaderAndIsrRequest, RequestHandlerHelper.onLeadershipChange(groupCoordinator, txnCoordinator, _, _)) } diff --git a/core/src/main/scala/kafka/server/KafkaServer.scala b/core/src/main/scala/kafka/server/KafkaServer.scala index 9c3a8ee544fcc..688e5102685f3 100755 --- a/core/src/main/scala/kafka/server/KafkaServer.scala +++ b/core/src/main/scala/kafka/server/KafkaServer.scala @@ -221,7 +221,7 @@ class KafkaServer( */ override def startup(): Unit = { try { - info("starting") + info("hgeng:1:25: starting") if (isShuttingDown.get) throw new IllegalStateException("Kafka server is still shutting down, cannot re-start!") @@ -547,7 +547,8 @@ class KafkaServer( } private def initZkClient(time: Time): Unit = { - info(s"Connecting to zookeeper on ${config.zkConnect}") + info("hgeng: init zk client") + info(s"hgeng fat: Connecting to zookeeper on ${config.zkConnect}") val secureAclsEnabled = config.zkEnableSecureAcls val isZkSecurityEnabled = JaasUtils.isZkSaslEnabled() || KafkaConfig.zkTlsClientAuthEnabled(zkClientConfig) @@ -758,7 +759,7 @@ class KafkaServer( // We request the controller to do a controlled shutdown. On failure, we backoff for a configured period // of time and try again for a configured number of retries. If all the attempt fails, we simply force // the shutdown. - info("Starting controlled shutdown") + info("hgeng:Starting controlled shutdown") _brokerState = BrokerState.PENDING_CONTROLLED_SHUTDOWN @@ -775,7 +776,7 @@ class KafkaServer( */ override def shutdown(): Unit = { try { - info("shutting down") + info("hgeng: shutting down") if (isStartingUp.get) throw new IllegalStateException("Kafka server is still starting up, cannot shut down!") @@ -784,11 +785,13 @@ class KafkaServer( // last in the `if` block. If the order is reversed, we could shutdown twice or leave `isShuttingDown` set to // `true` at the end of this method. if (shutdownLatch.getCount > 0 && isShuttingDown.compareAndSet(false, true)) { + info("hgeng: sending controlled shutdown") CoreUtils.swallow(controlledShutdown(), this) _brokerState = BrokerState.SHUTTING_DOWN // This delay is to wait LeaderAndIsrRequest to remove the followers on this broker from their ISRs. Thread.sleep(60000) + info("hgeng: received controlled shutdown success, and slept for 60 seconds") if (healthCheckScheduler != null) healthCheckScheduler.shutdown()