diff --git a/Cargo.lock b/Cargo.lock index 9a61e27a12c5e9..823bcf8b0ca265 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -601,7 +601,7 @@ dependencies = [ "lazy_static", "lazycell", "peeking_take_while", - "prettyplease 0.2.4", + "prettyplease 0.2.16", "proc-macro2", "quote", "regex", @@ -610,6 +610,29 @@ dependencies = [ "syn 2.0.50", ] +[[package]] +name = "bindgen" +version = "0.69.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c69fae65a523209d34240b60abe0c42d33d1045d445c0839d8a4894a736e2d" +dependencies = [ + "bitflags 2.4.2", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "prettyplease 0.2.16", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.50", + "which", +] + [[package]] name = "bit-set" version = "0.5.2" @@ -1231,7 +1254,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c278839b831783b70278b14df4d45e1beb1aad306c07bb796637de9a0e323e8e" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1327,6 +1350,16 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "cpufeatures" version = "0.2.7" @@ -1397,10 +1430,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "176dc175b78f56c0f321911d9c8eb2b77a78a4860b9c19db83835fea1a46649b" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.19", ] [[package]] @@ -1411,7 +1443,7 @@ checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1420,7 +1452,7 @@ version = "0.9.5" source = "git+https://github.com/solana-labs/crossbeam?rev=fd279d707025f0e60951e429bf778b4813d1b6bf#fd279d707025f0e60951e429bf778b4813d1b6bf" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils", + "crossbeam-utils 0.8.18", "lazy_static", "memoffset 0.6.4", "scopeguard", @@ -1435,6 +1467,11 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" + [[package]] name = "crunchy" version = "0.2.2" @@ -1501,6 +1538,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "cty" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" + [[package]] name = "curve25519-dalek" version = "3.2.1" @@ -1583,6 +1626,15 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f32d04922c60427da6f9fef14d042d9edddef64cb9d4ce0d64d0685fbeb1fd3" +dependencies = [ + "powerfmt", +] + [[package]] name = "derivation-path" version = "0.2.0" @@ -2575,6 +2627,43 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "iai-callgrind" +version = "0.10.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e99bf26f496b13ac6273014f40afda46a233fbfb0289ce50fb4daaad2f2ffc80" +dependencies = [ + "bincode", + "bindgen 0.69.2", + "cc", + "cfg-if 1.0.0", + "cty", + "iai-callgrind-macros", + "iai-callgrind-runner", + "regex", +] + +[[package]] +name = "iai-callgrind-macros" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2a4bb39225592c0a28cfca6f70af52ebd8da23f533c2cdd0a3329c1fa252d56" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.50", +] + +[[package]] +name = "iai-callgrind-runner" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c23a951b9eccaa1e38556d27473d1462a9c247a27961812edcaac156af861282" +dependencies = [ + "serde", +] + [[package]] name = "iana-time-zone" version = "0.1.46" @@ -2942,7 +3031,7 @@ version = "0.11.0+8.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3386f101bcb4bd252d8e9d2fb41ec3b0862a15a62b478c355b2982efa469e3e" dependencies = [ - "bindgen", + "bindgen 0.65.1", "bzip2-sys", "cc", "glob", @@ -3450,15 +3539,6 @@ dependencies = [ "syn 2.0.50", ] -[[package]] -name = "num_threads" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97ba99ba6393e2c3734791401b66902d981cb03bf190af674ca69949b6d5fb15" -dependencies = [ - "libc", -] - [[package]] name = "number_prefix" version = "0.4.0" @@ -3884,6 +3964,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc59d1bcc64fc5d021d67521f818db868368028108d37f0e98d74e33f68297b5" +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.15" @@ -3938,9 +4024,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.4" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", "syn 2.0.50", @@ -4003,6 +4089,32 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "flate2", + "hex", + "lazy_static", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "hex", +] + [[package]] name = "proptest" version = "1.4.0" @@ -4317,7 +4429,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -4834,8 +4946,13 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" dependencies = [ + "base64 0.13.1", + "chrono", + "hex", "serde", + "serde_json", "serde_with_macros", + "time", 
] [[package]] @@ -7232,6 +7349,7 @@ dependencies = [ "itertools", "log", "percentage", + "rand 0.8.5", "rustc_version 0.4.0", "solana-bpf-loader-program", "solana-frozen-abi", @@ -7462,7 +7580,13 @@ dependencies = [ name = "solana-unified-scheduler-logic" version = "1.19.0" dependencies = [ + "assert_matches", + "iai-callgrind", + "qualifier_attr", "solana-sdk", + "solana-unified-scheduler-logic", + "static_assertions", + "triomphe", ] [[package]] @@ -7470,16 +7594,30 @@ name = "solana-unified-scheduler-pool" version = "1.19.0" dependencies = [ "assert_matches", + "bincode", + "cpu-time", + "criterion", "crossbeam-channel", + "dashmap", "derivative", "log", + "procfs", + "qualifier_attr", + "rand 0.8.5", + "rustix", + "serde_json", "solana-ledger", "solana-logger", + "solana-measure", + "solana-metrics", + "solana-nohash-hasher", "solana-program-runtime", "solana-runtime", "solana-sdk", "solana-unified-scheduler-logic", + "solana-unified-scheduler-pool", "solana-vote", + "tikv-jemallocator", ] [[package]] @@ -7975,6 +8113,12 @@ dependencies = [ "spl-program-error", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -8329,21 +8473,32 @@ dependencies = [ [[package]] name = "time" -version = "0.3.9" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +checksum = "c4a34ab300f2dee6e562c10a046fc05e358b29f9bf92277f30c3c8d82275f6f5" dependencies = [ + "deranged", "itoa", - "libc", - "num_threads", + "powerfmt", + "serde", + "time-core", "time-macros", ] +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + [[package]] name = "time-macros" -version = "0.2.4" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20" +dependencies = [ + "time-core", +] [[package]] name = "tiny-bip39" @@ -8713,6 +8868,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0de5f738ceab88e2491a94ddc33c3feeadfa95fedc60363ef110845df12f3878" +[[package]] +name = "triomphe" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859eb650cfee7434994602c3a68b25d77ad9e68c8a6cd491616ef86661382eb3" +dependencies = [ + "serde", + "stable_deref_trait", +] + [[package]] name = "try-lock" version = "0.2.3" diff --git a/Cargo.toml b/Cargo.toml index 0ec4b780fe13e4..eadfa6d02d16f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -179,6 +179,7 @@ console_error_panic_hook = "0.1.7" console_log = "0.2.2" const_format = "0.2.32" core_affinity = "0.5.10" +cpu-time = "1.0.0" criterion = "0.5.1" criterion-stats = "0.3.0" crossbeam-channel = "0.5.11" @@ -268,6 +269,7 @@ predicates = "2.1" pretty-hex = "0.3.0" prio-graph = "0.2.1" proc-macro2 = "1.0.78" +procfs = "0.16.0" proptest = "1.4" prost = "0.11.9" prost-build = "0.11.9" @@ -288,6 +290,7 @@ reqwest = { version = "0.11.23", default-features = false } rolling-file = "0.2.0" rpassword = "7.3" rustc_version = "0.4" +rustix = "0.38.21" rustls = { version = 
"0.21.10", default-features = false, features = ["quic"] } rustversion = "1.0.14" scopeguard = "1.2.0" @@ -439,6 +442,8 @@ zstd = "0.11.2" # for details, see https://github.com/solana-labs/crossbeam/commit/fd279d707025f0e60951e429bf778b4813d1b6bf crossbeam-epoch = { git = "https://github.com/solana-labs/crossbeam", rev = "fd279d707025f0e60951e429bf778b4813d1b6bf" } +crossbeam-channel = { git = "https://github.com/ryoqun/crossbeam", rev = "438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" } + # We include the following crates as our dependencies above from crates.io: # # * spl-associated-token-account diff --git a/ci/test-bench.sh b/ci/test-bench.sh index aacc82cffbb0a6..1444405bcccf5c 100755 --- a/ci/test-bench.sh +++ b/ci/test-bench.sh @@ -56,6 +56,10 @@ _ $cargoNightly bench --manifest-path gossip/Cargo.toml ${V:+--verbose} \ _ $cargoNightly bench --manifest-path poh/Cargo.toml ${V:+--verbose} \ -- -Z unstable-options --format=json | tee -a "$BENCH_FILE" +# Run scheduler-pool benches +_ $cargoNightly bench --manifest-path scheduler-pool/Cargo.toml ${V:+--verbose} \ + -- -Z unstable-options --format=json | tee -a "$BENCH_FILE" + # Run core benches _ $cargoNightly bench --manifest-path core/Cargo.toml ${V:+--verbose} \ -- -Z unstable-options --format=json | tee -a "$BENCH_FILE" diff --git a/core/src/drop_bank_service.rs b/core/src/drop_bank_service.rs index 0321643d6aab68..f65ae566a08411 100644 --- a/core/src/drop_bank_service.rs +++ b/core/src/drop_bank_service.rs @@ -1,11 +1,8 @@ use { crossbeam_channel::Receiver, solana_measure::measure::Measure, - solana_runtime::bank::Bank, - std::{ - sync::Arc, - thread::{self, Builder, JoinHandle}, - }, + solana_runtime::installed_scheduler_pool::BankWithScheduler, + std::thread::{self, Builder, JoinHandle}, }; pub struct DropBankService { @@ -13,7 +10,7 @@ pub struct DropBankService { } impl DropBankService { - pub fn new(bank_receiver: Receiver>>) -> Self { + pub fn new(bank_receiver: Receiver>) -> Self { let thread_hdl = Builder::new() .name("solDropBankSrvc".to_string()) .spawn(move || { diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index a80a04d47c1573..c8d744b30c202a 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -536,7 +536,7 @@ impl ReplayStage { cluster_slots_update_sender: ClusterSlotsUpdateSender, cost_update_sender: Sender, voting_sender: Sender, - drop_bank_sender: Sender>>, + drop_bank_sender: Sender>, block_metadata_notifier: Option, log_messages_bytes_limit: Option, prioritization_fee_cache: Arc, @@ -1618,7 +1618,7 @@ impl ReplayStage { // Grab the Slot and BankId's of the banks we need to purge, then clear the banks // from BankForks - let (slots_to_purge, removed_banks): (Vec<(Slot, BankId)>, Vec>) = { + let (slots_to_purge, removed_banks): (Vec<(Slot, BankId)>, Vec) = { let mut w_bank_forks = bank_forks.write().unwrap(); slot_descendants .iter() @@ -2275,7 +2275,7 @@ impl ReplayStage { replay_timing: &mut ReplayTiming, voting_sender: &Sender, epoch_slots_frozen_slots: &mut EpochSlotsFrozenSlots, - drop_bank_sender: &Sender>>, + drop_bank_sender: &Sender>, wait_to_vote_slot: Option, ) { if bank.is_empty() { @@ -4093,7 +4093,7 @@ impl ReplayStage { has_new_vote_been_rooted: &mut bool, voted_signatures: &mut Vec, epoch_slots_frozen_slots: &mut EpochSlotsFrozenSlots, - drop_bank_sender: &Sender>>, + drop_bank_sender: &Sender>, ) { bank_forks.read().unwrap().prune_program_cache(new_root); let removed_banks = bank_forks.write().unwrap().set_root( diff --git a/core/src/validator.rs 
b/core/src/validator.rs index 97ef0a01ef87ad..cb8456af27b37d 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -142,8 +142,8 @@ const WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT: u64 = 80; #[derive(Clone, EnumString, EnumVariantNames, Default, IntoStaticStr, Display)] #[strum(serialize_all = "kebab-case")] pub enum BlockVerificationMethod { - #[default] BlockstoreProcessor, + #[default] UnifiedScheduler, } @@ -1434,8 +1434,10 @@ impl Validator { // Used for notifying many nodes in parallel to exit pub fn exit(&mut self) { + info!("exit1"); self.validator_exit.write().unwrap().exit(); + info!("exit2"); // drop all signals in blockstore self.blockstore.drop_signal(); } @@ -1471,24 +1473,29 @@ impl Validator { } pub fn join(self) { - drop(self.bank_forks); + info!("join1"); drop(self.cluster_info); + info!("join2"); self.poh_service.join().expect("poh_service"); drop(self.poh_recorder); + info!("join3"); if let Some(json_rpc_service) = self.json_rpc_service { json_rpc_service.join().expect("rpc_service"); } + info!("join4"); if let Some(pubsub_service) = self.pubsub_service { pubsub_service.join().expect("pubsub_service"); } + info!("join5"); self.rpc_completed_slots_service .join() .expect("rpc_completed_slots_service"); + info!("join6"); if let Some(optimistically_confirmed_bank_tracker) = self.optimistically_confirmed_bank_tracker { @@ -1497,96 +1504,126 @@ impl Validator { .expect("optimistically_confirmed_bank_tracker"); } + info!("join7"); if let Some(transaction_status_service) = self.transaction_status_service { transaction_status_service .join() .expect("transaction_status_service"); } + info!("join8"); if let Some(rewards_recorder_service) = self.rewards_recorder_service { rewards_recorder_service .join() .expect("rewards_recorder_service"); } + info!("join9"); if let Some(cache_block_meta_service) = self.cache_block_meta_service { cache_block_meta_service .join() .expect("cache_block_meta_service"); } + info!("join10"); if let Some(system_monitor_service) = self.system_monitor_service { system_monitor_service .join() .expect("system_monitor_service"); } + info!("join11"); if let Some(sample_performance_service) = self.sample_performance_service { sample_performance_service .join() .expect("sample_performance_service"); } + info!("join12"); if let Some(entry_notifier_service) = self.entry_notifier_service { entry_notifier_service .join() .expect("entry_notifier_service"); } + info!("join13"); if let Some(s) = self.snapshot_packager_service { s.join().expect("snapshot_packager_service"); } + info!("join14"); self.gossip_service.join().expect("gossip_service"); if let Some(repair_quic_endpoint) = &self.repair_quic_endpoint { repair::quic_endpoint::close_quic_endpoint(repair_quic_endpoint); } + info!("join15"); self.serve_repair_service .join() .expect("serve_repair_service"); + info!("join15"); if let Some(repair_quic_endpoint_join_handle) = self.repair_quic_endpoint_join_handle { self.repair_quic_endpoint_runtime .map(|runtime| runtime.block_on(repair_quic_endpoint_join_handle)) .transpose() .unwrap(); }; + info!("join16"); self.stats_reporter_service .join() .expect("stats_reporter_service"); + info!("join17"); self.blockstore_metric_report_service .join() .expect("ledger_metric_report_service"); + info!("join18"); self.accounts_background_service .join() .expect("accounts_background_service"); + info!("join19"); self.accounts_hash_verifier .join() .expect("accounts_hash_verifier"); + info!("join20"); if let Some(turbine_quic_endpoint) = &self.turbine_quic_endpoint { 
solana_turbine::quic_endpoint::close_quic_endpoint(turbine_quic_endpoint); } + info!("join21"); self.tpu.join().expect("tpu"); + info!("join22"); self.tvu.join().expect("tvu"); + info!("join23"); if let Some(turbine_quic_endpoint_join_handle) = self.turbine_quic_endpoint_join_handle { self.turbine_quic_endpoint_runtime .map(|runtime| runtime.block_on(turbine_quic_endpoint_join_handle)) .transpose() .unwrap(); } + info!("join24"); self.completed_data_sets_service .join() .expect("completed_data_sets_service"); + info!("join25"); if let Some(ip_echo_server) = self.ip_echo_server { ip_echo_server.shutdown_background(); } + info!("join26"); if let Some(geyser_plugin_service) = self.geyser_plugin_service { geyser_plugin_service.join().expect("geyser_plugin_service"); } + info!("join27"); self.poh_timing_report_service .join() .expect("poh_timing_report_service"); + info!("join28"); + self.bank_forks.write().unwrap().prepare_to_drop(); + let sc = Arc::strong_count(&self.bank_forks); + if let Some(bank_forks) = Arc::into_inner(self.bank_forks) { + drop::(bank_forks.into_inner().unwrap()); + } else { + warn!("seems bankforks are leaking...{}:", sc); + } } } diff --git a/ledger-tool/src/ledger_utils.rs b/ledger-tool/src/ledger_utils.rs index 116b21527ae4d8..fffa823afcd529 100644 --- a/ledger-tool/src/ledger_utils.rs +++ b/ledger-tool/src/ledger_utils.rs @@ -304,6 +304,8 @@ pub fn load_and_process_ledger( } } BlockVerificationMethod::UnifiedScheduler => { + let unified_scheduler_handler_threads = + value_t!(arg_matches, "unified_scheduler_handler_threads", usize).ok(); let no_transaction_status_sender = None; let no_replay_vote_sender = None; let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); diff --git a/ledger-tool/src/main.rs b/ledger-tool/src/main.rs index 9b299cfadcbcf2..18ef7460bcdc88 100644 --- a/ledger-tool/src/main.rs +++ b/ledger-tool/src/main.rs @@ -1654,6 +1654,8 @@ fn main() { } exit_signal.store(true, Ordering::Relaxed); system_monitor_service.join().unwrap(); + bank_forks.write().unwrap().prepare_to_drop(); + drop::(Arc::into_inner(bank_forks).unwrap().into_inner().unwrap()); } ("graph", Some(arg_matches)) => { let output_file = value_t_or_exit!(arg_matches, "graph_filename", String); diff --git a/ledger/src/blockstore_processor.rs b/ledger/src/blockstore_processor.rs index 63edb23e01cc18..d07e6f21fe79da 100644 --- a/ledger/src/blockstore_processor.rs +++ b/ledger/src/blockstore_processor.rs @@ -338,8 +338,7 @@ fn process_batches( // scheduling always succeeds here without being blocked on actual transaction executions. // The transaction execution errors will be collected via the blocking fn called // BankWithScheduler::wait_for_completed_scheduler(), if any. 
- schedule_batches_for_execution(bank, batches); - Ok(()) + schedule_batches_for_execution(bank, batches) } else { debug!( "process_batches()/rebatch_and_execute_batches({} batches)", @@ -360,7 +359,7 @@ fn process_batches( fn schedule_batches_for_execution( bank: &BankWithScheduler, batches: &[TransactionBatchWithIndexes], -) { +) -> Result<()> { for TransactionBatchWithIndexes { batch, transaction_indexes, @@ -371,8 +370,9 @@ fn schedule_batches_for_execution( .sanitized_transactions() .iter() .zip(transaction_indexes.iter()), - ); + )?; } + Ok(()) } fn rebatch_transactions<'a>( @@ -440,9 +440,7 @@ fn rebatch_and_execute_batches( { let mut cost_tracker = bank.write_cost_tracker().unwrap(); for tx_cost in &tx_costs { - cost_tracker - .try_add(tx_cost) - .map_err(TransactionError::from)?; + cost_tracker.try_add(tx_cost)?; } } @@ -1960,6 +1958,7 @@ pub mod tests { instruction::{Instruction, InstructionError}, native_token::LAMPORTS_PER_SOL, pubkey::Pubkey, + scheduling::SchedulingMode, signature::{Keypair, Signer}, system_instruction::SystemError, system_transaction, @@ -4544,7 +4543,7 @@ pub mod tests { .. } = create_genesis_config_with_leader(500, &dummy_leader_pubkey, 100); let bank = Arc::new(Bank::new_for_tests(&genesis_config)); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let txs = create_test_transactions(&mint_keypair, &genesis_config.hash()); @@ -4559,7 +4558,7 @@ pub mod tests { mocked_scheduler .expect_schedule_execution() .times(txs.len()) - .returning(|_| ()); + .returning(|_| Ok(())); mocked_scheduler .expect_wait_for_termination() .with(mockall::predicate::eq(true)) diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs index 6f7de16df296b1..9a9cc9c4b093aa 100644 --- a/local-cluster/tests/local_cluster.rs +++ b/local-cluster/tests/local_cluster.rs @@ -4,7 +4,7 @@ use { crossbeam_channel::{unbounded, Receiver}, gag::BufferRedirect, log::*, - rand::seq::IteratorRandom, + rand::seq::SliceRandom, serial_test::serial, solana_accounts_db::{ hardened_unpack::open_genesis_config, utils::create_accounts_run_and_snapshot_dirs, @@ -5499,12 +5499,14 @@ fn test_randomly_mixed_block_verification_methods_between_bootstrap_and_not() { ); // Randomly switch to use unified scheduler - config - .validator_configs - .iter_mut() - .choose(&mut rand::thread_rng()) - .unwrap() - .block_verification_method = BlockVerificationMethod::UnifiedScheduler; + let mut methods = [ + BlockVerificationMethod::UnifiedScheduler, + BlockVerificationMethod::BlockstoreProcessor, + ]; + methods.shuffle(&mut rand::thread_rng()); + for (validator_config, method) in config.validator_configs.iter_mut().zip(methods) { + validator_config.block_verification_method = method; + } let local = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified); cluster_tests::spend_and_verify_all_nodes( diff --git a/metrics/src/datapoint.rs b/metrics/src/datapoint.rs index e2740ce3aecc47..8a13a112da0636 100644 --- a/metrics/src/datapoint.rs +++ b/metrics/src/datapoint.rs @@ -60,6 +60,15 @@ impl DataPoint { } } + pub fn at(timestamp: SystemTime, name: &'static str) -> Self { + DataPoint { + name, + timestamp, + tags: vec![], + fields: vec![], + } + } + pub fn add_tag(&mut self, name: &'static str, value: &str) -> &mut Self { self.tags.push((name, value.to_string())); self @@ -160,6 +169,56 @@ macro_rules! create_datapoint { }; } +#[macro_export] +macro_rules! 
create_datapoint_at { + (@field $point:ident $name:expr, $string:expr, String) => { + $point.add_field_str($name, &$string); + }; + (@field $point:ident $name:expr, $value:expr, i64) => { + $point.add_field_i64($name, $value as i64); + }; + (@field $point:ident $name:expr, $value:expr, f64) => { + $point.add_field_f64($name, $value as f64); + }; + (@field $point:ident $name:expr, $value:expr, bool) => { + $point.add_field_bool($name, $value as bool); + }; + (@tag $point:ident $tag_name:expr, $tag_value:expr) => { + $point.add_tag($tag_name, &$tag_value); + }; + + (@fields $point:ident) => {}; + + // process tags + (@fields $point:ident $tag_name:expr => $tag_value:expr, $($rest:tt)*) => { + $crate::create_datapoint!(@tag $point $tag_name, $tag_value); + $crate::create_datapoint!(@fields $point $($rest)*); + }; + (@fields $point:ident $tag_name:expr => $tag_value:expr) => { + $crate::create_datapoint!(@tag $point $tag_name, $tag_value); + }; + + // process fields + (@fields $point:ident ($name:expr, $value:expr, $type:ident) , $($rest:tt)*) => { + $crate::create_datapoint!(@field $point $name, $value, $type); + $crate::create_datapoint!(@fields $point $($rest)*); + }; + (@fields $point:ident ($name:expr, $value:expr, $type:ident)) => { + $crate::create_datapoint!(@field $point $name, $value, $type); + }; + + (@point $name:expr, $at:expr, $($fields:tt)+) => { + { + let mut point = $crate::datapoint::DataPoint::at($at, &$name); + $crate::create_datapoint!(@fields point $($fields)+); + point + } + }; + (@point $name:expr, $at:expr) => { + $crate::datapoint::DataPoint::at($at, &$name) + }; +} + #[macro_export] macro_rules! datapoint { ($level:expr, $name:expr, $($fields:tt)+) => { @@ -168,6 +227,21 @@ macro_rules! datapoint { } }; } + +#[macro_export] +macro_rules! datapoint_at { + ($level:expr, $at:expr, $name:expr) => { + if log::log_enabled!($level) { + $crate::submit($crate::create_datapoint_at!(@point $name, $at), $level); + } + }; + ($level:expr, $at:expr, $name:expr, $($fields:tt)+) => { + if log::log_enabled!($level) { + $crate::submit($crate::create_datapoint_at!(@point $name, $at, $($fields)+), $level); + } + }; +} + #[macro_export] macro_rules! datapoint_error { ($name:expr, $($fields:tt)+) => { @@ -189,6 +263,16 @@ macro_rules! datapoint_info { }; } +#[macro_export] +macro_rules! datapoint_info_at { + ($at:expr, $name:expr) => { + $crate::datapoint_at!(log::Level::Info, $at, $name); + }; + ($at:expr, $name:expr, $($fields:tt)+) => { + $crate::datapoint_at!(log::Level::Info, $at, $name, $($fields)+); + }; +} + #[macro_export] macro_rules! 
datapoint_debug { ($name:expr, $($fields:tt)+) => { diff --git a/metrics/src/metrics.rs b/metrics/src/metrics.rs index b989ada6861fd1..ec847059bccd07 100644 --- a/metrics/src/metrics.rs +++ b/metrics/src/metrics.rs @@ -181,7 +181,7 @@ impl Default for MetricsAgent { Self::new( Arc::new(InfluxDbMetricsWriter::new()), - Duration::from_secs(10), + Duration::from_secs(1), max_points_per_sec, ) } diff --git a/program-runtime/src/loaded_programs.rs b/program-runtime/src/loaded_programs.rs index 6da84b0d1f0692..2845db21118584 100644 --- a/program-runtime/src/loaded_programs.rs +++ b/program-runtime/src/loaded_programs.rs @@ -673,6 +673,10 @@ impl LoadedProgramsForTxBatch { self.replenish(*key, entry.clone()); }) } + + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } } pub enum LoadedProgramMatchCriteria { @@ -700,6 +704,10 @@ impl LoadedPrograms { self.fork_graph = Some(fork_graph); } + pub fn unset_fork_graph(&mut self) { + self.fork_graph = None; + } + /// Returns the current environments depending on the given epoch pub fn get_environments_for_epoch(&self, epoch: Epoch) -> &ProgramRuntimeEnvironments { if epoch != self.latest_root_epoch { diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index 1b8d422d42ba7c..264494740abb94 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -1032,7 +1032,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c278839b831783b70278b14df4d45e1beb1aad306c07bb796637de9a0e323e8e" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1108,6 +1108,16 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "cpufeatures" version = "0.2.7" @@ -1129,10 +1139,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "176dc175b78f56c0f321911d9c8eb2b77a78a4860b9c19db83835fea1a46649b" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.19", ] [[package]] @@ -1143,7 +1152,7 @@ checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1153,7 +1162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils", + "crossbeam-utils 0.8.18", "lazy_static", "memoffset 0.6.4", "scopeguard", @@ -1168,6 +1177,11 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" + [[package]] name = "crunchy" version = "0.2.2" @@ -1286,6 +1300,15 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eb30d70a07a3b04884d2677f06bec33509dc67ca60d92949e5535352d3191dc" +dependencies = [ + "powerfmt", +] + [[package]] name = 
"derivation-path" version = "0.2.0" @@ -2031,6 +2054,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "histogram" version = "0.6.9" @@ -3092,15 +3121,6 @@ dependencies = [ "syn 2.0.50", ] -[[package]] -name = "num_threads" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba1801fb138d8e85e11d0fc70baf4fe1cdfffda7c6cd34a854905df588e5ed0" -dependencies = [ - "libc", -] - [[package]] name = "number_prefix" version = "0.4.0" @@ -3485,6 +3505,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc59d1bcc64fc5d021d67521f818db868368028108d37f0e98d74e33f68297b5" +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.8" @@ -3605,6 +3631,32 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "flate2", + "hex", + "lazy_static", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "hex", +] + [[package]] name = "prost" version = "0.11.9" @@ -3842,7 +3894,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -4295,8 +4347,13 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" dependencies = [ + "base64 0.13.1", + "chrono", + "hex", "serde", + "serde_json", "serde_with_macros", + "time", ] [[package]] @@ -6307,6 +6364,7 @@ dependencies = [ "itertools", "log", "percentage", + "rand 0.8.5", "rustc_version", "solana-bpf-loader-program", "solana-frozen-abi", @@ -6471,7 +6529,10 @@ dependencies = [ name = "solana-unified-scheduler-logic" version = "1.19.0" dependencies = [ + "assert_matches", + "qualifier_attr", "solana-sdk", + "static_assertions", ] [[package]] @@ -6479,10 +6540,18 @@ name = "solana-unified-scheduler-pool" version = "1.19.0" dependencies = [ "assert_matches", + "cpu-time", "crossbeam-channel", + "dashmap", "derivative", "log", + "procfs", + "qualifier_attr", + "rustix", + "serde_json", "solana-ledger", + "solana-measure", + "solana-metrics", "solana-program-runtime", "solana-runtime", "solana-sdk", @@ -7235,21 +7304,32 @@ dependencies = [ [[package]] name = "time" -version = "0.3.9" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e" 
dependencies = [ + "deranged", "itoa", - "libc", - "num_threads", + "powerfmt", + "serde", + "time-core", "time-macros", ] +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + [[package]] name = "time-macros" -version = "0.2.4" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f" +dependencies = [ + "time-core", +] [[package]] name = "tiny-bip39" diff --git a/programs/sbf/Cargo.toml b/programs/sbf/Cargo.toml index 8a99a0f005471a..c4ae2f6c4bec08 100644 --- a/programs/sbf/Cargo.toml +++ b/programs/sbf/Cargo.toml @@ -169,6 +169,8 @@ members = [ targets = ["x86_64-unknown-linux-gnu"] [patch.crates-io] +crossbeam-channel = { git = "https://github.com/ryoqun/crossbeam", rev = "438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" } + # We include the following crates as our dependencies from crates.io: # # * spl-associated-token-account diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index 7e051019c99871..d5ca601c8c78ce 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -4931,7 +4931,7 @@ impl Bank { programs_modified_by_tx, } = execution_result { - if details.status.is_ok() { + if details.status.is_ok() && !programs_modified_by_tx.is_empty() { let mut cache = self.loaded_programs_cache.write().unwrap(); cache.merge(programs_modified_by_tx); } diff --git a/runtime/src/bank_forks.rs b/runtime/src/bank_forks.rs index 668062c8d31cce..c9185967eef2e0 100644 --- a/runtime/src/bank_forks.rs +++ b/runtime/src/bank_forks.rs @@ -5,7 +5,8 @@ use { accounts_background_service::{AbsRequestSender, SnapshotRequest, SnapshotRequestKind}, bank::{epoch_accounts_hash_utils, Bank, SquashTiming}, installed_scheduler_pool::{ - BankWithScheduler, InstalledSchedulerPoolArc, SchedulingContext, + BankWithScheduler, DefaultScheduleExecutionArg, InstalledSchedulerPoolArc, + SchedulingContext, }, snapshot_config::SnapshotConfig, }, @@ -15,6 +16,7 @@ use { solana_sdk::{ clock::{Epoch, Slot}, hash::Hash, + scheduling::SchedulingMode, timing, }, std::{ @@ -73,7 +75,13 @@ pub struct BankForks { last_accounts_hash_slot: Slot, in_vote_only_mode: Arc, highest_slot_at_startup: Slot, - scheduler_pool: Option, + scheduler_pool: Option>, +} + +impl Drop for BankForks { + fn drop(&mut self) { + info!("BankForks::drop(): successfully dropped"); + } } impl Index for BankForks { @@ -212,7 +220,10 @@ impl BankForks { self[self.root()].clone() } - pub fn install_scheduler_pool(&mut self, pool: InstalledSchedulerPoolArc) { + pub fn install_scheduler_pool( + &mut self, + pool: InstalledSchedulerPoolArc, + ) { info!("Installed new scheduler_pool into bank_forks: {:?}", pool); assert!( self.scheduler_pool.replace(pool).is_none(), @@ -220,6 +231,26 @@ impl BankForks { ); } + pub fn uninstall_scheduler_pool(&mut self) { + // hint scheduler pool to cut circular references of Arc + if let Some(sp) = self.scheduler_pool.take() { + sp.uninstalled_from_bank_forks(); + } + } + + pub fn prepare_to_drop(&mut self) { + let root_bank = self.root_bank(); + // drop all non root BankWithScheduler, which causes all schedulers wind down. + self.banks.clear(); + self.uninstall_scheduler_pool(); + // this cuts circular references of BankForks... 
+ root_bank + .loaded_programs_cache + .write() + .unwrap() + .unset_fork_graph(); + } + pub fn insert(&mut self, mut bank: Bank) -> BankWithScheduler { if self.root.load(Ordering::Relaxed) < self.highest_slot_at_startup { bank.check_program_modification_slot(); @@ -227,7 +258,7 @@ impl BankForks { let bank = Arc::new(bank); let bank = if let Some(scheduler_pool) = &self.scheduler_pool { - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let scheduler = scheduler_pool.take_scheduler(context); BankWithScheduler::new(bank, Some(scheduler)) } else { @@ -248,7 +279,7 @@ impl BankForks { self.insert(bank) } - pub fn remove(&mut self, slot: Slot) -> Option> { + pub fn remove(&mut self, slot: Slot) -> Option { let bank = self.banks.remove(&slot)?; for parent in bank.proper_ancestors() { let Entry::Occupied(mut entry) = self.descendants.entry(parent) else { @@ -265,7 +296,7 @@ impl BankForks { if entry.get().is_empty() { entry.remove_entry(); } - Some(bank.clone_without_scheduler()) + Some(bank) } pub fn highest_slot(&self) -> Slot { @@ -285,7 +316,7 @@ impl BankForks { root: Slot, accounts_background_request_sender: &AbsRequestSender, highest_super_majority_root: Option, - ) -> (Vec>, SetRootMetrics) { + ) -> (Vec, SetRootMetrics) { let old_epoch = self.root_bank().epoch(); // To support `RootBankCache` (via `ReadOnlyAtomicSlot`) accessing `root` *without* locking // BankForks first *and* from a different thread, this store *must* be at least Release to @@ -464,7 +495,7 @@ impl BankForks { root: Slot, accounts_background_request_sender: &AbsRequestSender, highest_super_majority_root: Option, - ) -> Vec> { + ) -> Vec { let program_cache_prune_start = Instant::now(); let set_root_start = Instant::now(); let (removed_banks, set_root_metrics) = self.do_set_root_return_metrics( @@ -625,7 +656,7 @@ impl BankForks { &mut self, root: Slot, highest_super_majority_root: Option, - ) -> (Vec>, u64, u64) { + ) -> (Vec, u64, u64) { // Clippy doesn't like separating the two collects below, // but we want to collect timing separately, and the 2nd requires // a unique borrow to self which is already borrowed by self.banks diff --git a/runtime/src/installed_scheduler_pool.rs b/runtime/src/installed_scheduler_pool.rs index d39a18d567232a..ae7c410397d063 100644 --- a/runtime/src/installed_scheduler_pool.rs +++ b/runtime/src/installed_scheduler_pool.rs @@ -25,11 +25,13 @@ use { log::*, solana_program_runtime::timings::ExecuteTimings, solana_sdk::{ + clock::Slot, hash::Hash, - slot_history::Slot, + scheduling::{SchedulingMode, WithSchedulingMode}, transaction::{Result, SanitizedTransaction}, }, std::{ + borrow::Borrow, fmt::Debug, ops::Deref, sync::{Arc, RwLock}, @@ -38,8 +40,9 @@ use { #[cfg(feature = "dev-context-only-utils")] use {mockall::automock, qualifier_attr::qualifiers}; -pub trait InstalledSchedulerPool: Send + Sync + Debug { - fn take_scheduler(&self, context: SchedulingContext) -> InstalledSchedulerBox; +pub trait InstalledSchedulerPool: Send + Sync + Debug { + fn take_scheduler(&self, context: SchedulingContext) -> Box>; + fn uninstalled_from_bank_forks(self: Arc); } #[cfg_attr(doc, aquamarine::aquamarine)] @@ -97,15 +100,15 @@ pub trait InstalledSchedulerPool: Send + Sync + Debug { feature = "dev-context-only-utils", allow(unused_attributes, clippy::needless_lifetimes) )] -pub trait InstalledScheduler: Send + Sync + Debug + 'static { +pub trait InstalledScheduler: Send + Sync + Debug + 'static { fn id(&self) -> 
SchedulerId; fn context(&self) -> &SchedulingContext; // Calling this is illegal as soon as wait_for_termination is called. fn schedule_execution<'a>( &'a self, - transaction_with_index: &'a (&'a SanitizedTransaction, usize), - ); + transaction_with_index: SEA::TransactionWithIndex<'a>, + ) -> Result<()>; /// Wait for a scheduler to terminate after processing. /// @@ -135,13 +138,47 @@ pub trait UninstalledScheduler: Send + Sync + Debug + 'static { fn return_to_pool(self: Box); } -pub type InstalledSchedulerBox = Box; +pub type InstalledSchedulerBox = Box>; pub type UninstalledSchedulerBox = Box; -pub type InstalledSchedulerPoolArc = Arc; +pub type InstalledSchedulerPoolArc = Arc>; pub type SchedulerId = u64; +pub trait WithTransactionAndIndex: Send + Sync + Debug { + fn with_transaction_and_index( + &self, + callback: impl FnOnce(&SanitizedTransaction, usize) -> R, + ) -> R; +} + +impl< + T: Send + Sync + Debug + Borrow, + U: Send + Sync + Debug + Borrow, + Z: Send + Sync + Debug + Deref, + > WithTransactionAndIndex for Z +{ + fn with_transaction_and_index( + &self, + callback: impl FnOnce(&SanitizedTransaction, usize) -> R, + ) -> R { + callback(self.0.borrow(), *self.1.borrow()) + } +} + +pub trait ScheduleExecutionArg: Send + Sync + Debug + 'static { + // GAT is used to make schedule_execution parametric even supporting references + // under the object-safety req. of InstalledScheduler trait... + type TransactionWithIndex<'tx>: WithTransactionAndIndex; +} + +#[derive(Debug, Default, Clone)] +pub struct DefaultScheduleExecutionArg; + +impl ScheduleExecutionArg for DefaultScheduleExecutionArg { + type TransactionWithIndex<'tx> = &'tx (&'tx SanitizedTransaction, usize); +} + /// A small context to propagate a bank and its scheduling mode to the scheduler subsystem. /// /// Note that this isn't called `SchedulerContext` because the contexts aren't associated with @@ -153,13 +190,19 @@ pub type SchedulerId = u64; /// `SchedulingContext`s. #[derive(Clone, Debug)] pub struct SchedulingContext { - // mode: SchedulingMode, // this will be added later. + mode: SchedulingMode, bank: Arc, } +impl WithSchedulingMode for SchedulingContext { + fn mode(&self) -> SchedulingMode { + self.mode + } +} + impl SchedulingContext { - pub fn new(bank: Arc) -> Self { - Self { bank } + pub fn new(mode: SchedulingMode, bank: Arc) -> Self { + Self { mode, bank } } pub fn bank(&self) -> &Arc { @@ -246,9 +289,14 @@ impl BankWithScheduler { pub(crate) fn new(bank: Arc, scheduler: Option) -> Self { if let Some(bank_in_context) = scheduler .as_ref() - .map(|scheduler| scheduler.context().bank()) + .map(|scheduler| scheduler.context().bank().clone()) { - assert!(Arc::ptr_eq(&bank, bank_in_context)); + assert!( + Arc::ptr_eq(&bank, &bank_in_context), + "different bank!? {} {}", + bank.slot(), + bank_in_context.slot() + ); } Self { @@ -290,7 +338,7 @@ impl BankWithScheduler { pub fn schedule_transaction_executions<'a>( &self, transactions_with_indexes: impl ExactSizeIterator, - ) { + ) -> Result<()> { trace!( "schedule_transaction_executions(): {} txs", transactions_with_indexes.len() @@ -300,8 +348,10 @@ impl BankWithScheduler { let scheduler = scheduler_guard.as_ref().unwrap(); for (sanitized_transaction, &index) in transactions_with_indexes { - scheduler.schedule_execution(&(sanitized_transaction, index)); + scheduler.schedule_execution(&(sanitized_transaction, index))?; } + + Ok(()) } // take needless &mut only to communicate its semantic mutability to humans... 
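Note on the hunk above: the `ScheduleExecutionArg` trait uses a generic associated type so that `InstalledScheduler` stays object-safe (it is still handled as `Box<dyn ...>` / `Arc<dyn ...>` through the type aliases) while `schedule_execution` can accept a borrowed `(&SanitizedTransaction, usize)` pair via the `WithTransactionAndIndex` blanket impl, and its new `Result<()>` return value is what lets `schedule_transaction_executions` propagate scheduling failures with `?`. Keeping the type parameter on the trait rather than on the method is what preserves object safety, since a generic type parameter on the method would rule out `dyn InstalledScheduler`. The self-contained Rust sketch below only models that pattern and is not part of the patch: `Tx`, `ScheduleResult`, and `LoggingScheduler` are stand-ins invented for illustration (the real code uses `SanitizedTransaction` and `solana_sdk::transaction::Result`), and the traits are trimmed to the single method discussed here.

// --- illustrative sketch, not part of the patch ---
use std::{borrow::Borrow, fmt::Debug, ops::Deref};

/// Stand-in for solana_sdk::transaction::SanitizedTransaction in this sketch.
#[derive(Debug)]
struct Tx(&'static str);

/// Stand-in for solana_sdk::transaction::Result<()>.
type ScheduleResult = Result<(), String>;

/// Mirrors WithTransactionAndIndex: uniform access to the transaction and its
/// index regardless of how the pair is packaged.
trait WithTransactionAndIndex: Send + Sync + Debug {
    fn with_transaction_and_index<R>(&self, callback: impl FnOnce(&Tx, usize) -> R) -> R;
}

/// Blanket impl for anything that derefs to a (Borrow<Tx>, Borrow<usize>) pair,
/// e.g. `&(&Tx, usize)`, the shape used by DefaultScheduleExecutionArg.
impl<
        T: Send + Sync + Debug + Borrow<Tx>,
        U: Send + Sync + Debug + Borrow<usize>,
        Z: Send + Sync + Debug + Deref<Target = (T, U)>,
    > WithTransactionAndIndex for Z
{
    fn with_transaction_and_index<R>(&self, callback: impl FnOnce(&Tx, usize) -> R) -> R {
        callback(self.0.borrow(), *self.1.borrow())
    }
}

/// Mirrors ScheduleExecutionArg: the generic associated type lets the argument
/// carry borrowed data while the scheduler trait itself stays object-safe.
trait ScheduleExecutionArg: Send + Sync + Debug + 'static {
    type TransactionWithIndex<'tx>: WithTransactionAndIndex;
}

#[derive(Debug)]
struct DefaultScheduleExecutionArg;

impl ScheduleExecutionArg for DefaultScheduleExecutionArg {
    type TransactionWithIndex<'tx> = &'tx (&'tx Tx, usize);
}

/// Trimmed-down InstalledScheduler: schedule_execution returns a Result so
/// callers can propagate scheduling failures instead of silently dropping them.
trait InstalledScheduler<SEA: ScheduleExecutionArg>: Send + Sync + Debug {
    fn schedule_execution<'a>(
        &'a self,
        transaction_with_index: SEA::TransactionWithIndex<'a>,
    ) -> ScheduleResult;
}

/// Hypothetical scheduler used only to exercise the plumbing.
#[derive(Debug)]
struct LoggingScheduler;

impl InstalledScheduler<DefaultScheduleExecutionArg> for LoggingScheduler {
    // The GAT resolves to `&(&Tx, usize)` for DefaultScheduleExecutionArg.
    fn schedule_execution<'a>(
        &'a self,
        transaction_with_index: &'a (&'a Tx, usize),
    ) -> ScheduleResult {
        transaction_with_index.with_transaction_and_index(|tx, index| {
            println!("scheduling {tx:?} at index {index}");
            Ok(())
        })
    }
}

fn main() -> ScheduleResult {
    // The scheduler is held behind `Box<dyn ...>`, as BankWithScheduler does.
    let scheduler: Box<dyn InstalledScheduler<DefaultScheduleExecutionArg>> =
        Box::new(LoggingScheduler);
    let tx = Tx("transfer");
    // Pass a borrowed pair and bubble errors up with `?`, mirroring the new
    // schedule_transaction_executions() -> Result<()> in the hunk above.
    scheduler.schedule_execution(&(&tx, 0))?;
    Ok(())
}
// --- end of sketch ---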
@@ -359,22 +409,23 @@ impl BankWithSchedulerInner { ); let mut scheduler = scheduler.write().unwrap(); - let result_with_timings = + let (was_noop, result_with_timings) = if let Some(scheduler) = scheduler.as_mut().filter(|_| reason.is_paused()) { scheduler.pause_for_recent_blockhash(); - None + (false, None) } else if let Some(scheduler) = scheduler.take() { let (result_with_timings, uninstalled_scheduler) = scheduler.wait_for_termination(reason.is_dropped()); uninstalled_scheduler.return_to_pool(); - Some(result_with_timings) + (false, Some(result_with_timings)) } else { - None + (true, None) }; debug!( - "wait_for_scheduler_termination(slot: {}, reason: {:?}): finished with: {:?}...", + "wait_for_scheduler_termination(slot: {}, reason: {:?}): was_noop: {:?} finished with: {:?}...", bank.slot(), reason, + was_noop, result_with_timings.as_ref().map(|(result, _)| result), ); @@ -435,7 +486,7 @@ mod tests { fn setup_mocked_scheduler_with_extra( bank: Arc, is_dropped_flags: impl Iterator, - f: Option, + f: Option)>, ) -> InstalledSchedulerBox { let mut mock = MockInstalledScheduler::new(); let seq = Arc::new(Mutex::new(Sequence::new())); @@ -443,7 +494,10 @@ mod tests { mock.expect_context() .times(1) .in_sequence(&mut seq.lock().unwrap()) - .return_const(SchedulingContext::new(bank)); + .return_const(SchedulingContext::new( + SchedulingMode::BlockVerification, + bank, + )); for wait_reason in is_dropped_flags { let seq_cloned = seq.clone(); @@ -479,7 +533,7 @@ mod tests { setup_mocked_scheduler_with_extra( bank, is_dropped_flags, - None:: ()>, + None::) -> ()>, ) } @@ -535,12 +589,14 @@ mod tests { Some(setup_mocked_scheduler_with_extra( bank, [false].into_iter(), - Some(|mocked: &mut MockInstalledScheduler| { - mocked - .expect_pause_for_recent_blockhash() - .times(1) - .returning(|| ()); - }), + Some( + |mocked: &mut MockInstalledScheduler| { + mocked + .expect_pause_for_recent_blockhash() + .times(1) + .returning(|| ()); + }, + ), )), ); goto_end_of_slot_with_scheduler(&bank); @@ -566,15 +622,20 @@ mod tests { let mocked_scheduler = setup_mocked_scheduler_with_extra( bank.clone(), [true].into_iter(), - Some(|mocked: &mut MockInstalledScheduler| { - mocked - .expect_schedule_execution() - .times(1) - .returning(|(_, _)| ()); - }), + Some( + |mocked: &mut MockInstalledScheduler| { + mocked + .expect_schedule_execution() + .times(1) + .returning(|(_, _)| Ok(())); + }, + ), ); let bank = BankWithScheduler::new(bank, Some(mocked_scheduler)); - bank.schedule_transaction_executions([(&tx0, &0)].into_iter()); + assert_matches!( + bank.schedule_transaction_executions([(&tx0, &0)].into_iter()), + Ok(()) + ); } } diff --git a/sdk/Cargo.toml b/sdk/Cargo.toml index 57bf0738fa41eb..bc7c40a68080bb 100644 --- a/sdk/Cargo.toml +++ b/sdk/Cargo.toml @@ -72,7 +72,7 @@ serde = { workspace = true } serde_bytes = { workspace = true } serde_derive = { workspace = true } serde_json = { workspace = true, optional = true } -serde_with = { workspace = true, features = ["macros"] } +serde_with = { workspace = true, features = ["macros", "alloc"] } sha2 = { workspace = true } sha3 = { workspace = true, optional = true } siphasher = { workspace = true } diff --git a/sdk/src/lib.rs b/sdk/src/lib.rs index 7c6b643884e449..52eb4fd0e94841 100644 --- a/sdk/src/lib.rs +++ b/sdk/src/lib.rs @@ -98,6 +98,7 @@ pub mod rent_debits; pub mod reward_info; pub mod reward_type; pub mod rpc_port; +pub mod scheduling; pub mod secp256k1_instruction; pub mod shred_version; pub mod signature; diff --git a/sdk/src/scheduling.rs 
b/sdk/src/scheduling.rs new file mode 100644 index 00000000000000..aa39f7a8b08e8d --- /dev/null +++ b/sdk/src/scheduling.rs @@ -0,0 +1,11 @@ +//! Primitive types relevant to transaction scheduling +#![cfg(feature = "full")] + +#[derive(Debug, Clone, Copy)] +pub enum SchedulingMode { + BlockVerification, +} + +pub trait WithSchedulingMode { + fn mode(&self) -> SchedulingMode; +} diff --git a/sdk/src/transaction/sanitized.rs b/sdk/src/transaction/sanitized.rs index 4189f1b64b86e2..a735e0c8170dbd 100644 --- a/sdk/src/transaction/sanitized.rs +++ b/sdk/src/transaction/sanitized.rs @@ -36,11 +36,17 @@ pub struct SanitizedTransaction { } /// Set of accounts that must be locked for safe transaction processing -#[derive(Debug, Clone, Default, Eq, PartialEq)] +use serde_with::serde_as; +use serde_with::DisplayFromStr; + +#[serde_as] +#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize)] pub struct TransactionAccountLocks<'a> { /// List of readonly account key locks + #[serde_as(as = "Vec")] pub readonly: Vec<&'a Pubkey>, /// List of writable account key locks + #[serde_as(as = "Vec")] pub writable: Vec<&'a Pubkey>, } diff --git a/svm/Cargo.toml b/svm/Cargo.toml index ac672613c9c4fc..ffb3dc69e0bf57 100644 --- a/svm/Cargo.toml +++ b/svm/Cargo.toml @@ -13,6 +13,7 @@ edition = { workspace = true } itertools = { workspace = true } log = { workspace = true } percentage = { workspace = true } +rand = { workspace = true } solana-bpf-loader-program = { workspace = true } solana-frozen-abi = { workspace = true } solana-frozen-abi-macro = { workspace = true } diff --git a/svm/src/transaction_processor.rs b/svm/src/transaction_processor.rs index b58d178df4b963..0394ceed278742 100644 --- a/svm/src/transaction_processor.rs +++ b/svm/src/transaction_processor.rs @@ -288,14 +288,17 @@ impl TransactionBatchProcessor { execution_time.stop(); - const SHRINK_LOADED_PROGRAMS_TO_PERCENTAGE: u8 = 90; - self.loaded_programs_cache - .write() - .unwrap() - .evict_using_2s_random_selection( - Percentage::from(SHRINK_LOADED_PROGRAMS_TO_PERCENTAGE), - self.slot, - ); + use rand::Rng; + if rand::thread_rng().gen_range(0..1000) == 0 { + const SHRINK_LOADED_PROGRAMS_TO_PERCENTAGE: u8 = 90; + self.loaded_programs_cache + .write() + .unwrap() + .evict_using_2s_random_selection( + Percentage::from(SHRINK_LOADED_PROGRAMS_TO_PERCENTAGE), + self.slot, + ); + } debug!( "load: {}us execute: {}us txs_len={}", diff --git a/unified-scheduler-logic/Cargo.toml b/unified-scheduler-logic/Cargo.toml index b2e80c79c7a08f..e1dd176a2bd510 100644 --- a/unified-scheduler-logic/Cargo.toml +++ b/unified-scheduler-logic/Cargo.toml @@ -10,4 +10,26 @@ license = { workspace = true } edition = { workspace = true } [dependencies] +assert_matches = { workspace = true } +qualifier_attr = { workspace = true } solana-sdk = { workspace = true } +static_assertions = { workspace = true } +#[[bench]] +#name = "bench-with-iai-callgrind" +#harness = false + +[dev-dependencies] +# See order-crates-for-publishing.py for using this unusual `path = "."` +solana-unified-scheduler-logic = { path = ".", features = ["dev-context-only-utils"] } +triomphe = { version = "0.1.11" } + +[target."cfg(target_os = \"linux\")".dev-dependencies] +iai-callgrind = { version = "0.10.2", features = [ + "client_requests" +] } + +[target."cfg(not(target_os = \"linux\"))".dev-dependencies] +iai-callgrind = { version = "0.10.2" } + +[features] +dev-context-only-utils = [] diff --git a/unified-scheduler-logic/benches/bench-with-iai-callgrind.rs 
b/unified-scheduler-logic/benches/bench-with-iai-callgrind.rs new file mode 100644 index 00000000000000..b30d290de87855 --- /dev/null +++ b/unified-scheduler-logic/benches/bench-with-iai-callgrind.rs @@ -0,0 +1,668 @@ +#![cfg(feature = "dummy")] +#![allow(clippy::arithmetic_side_effects)] + +#[global_allocator] +static GLOBAL: B = B; + +struct A(T); + +unsafe impl std::marker::Sync for A {} + +static LOCAL_ALLOCATOR: A> = A(std::cell::UnsafeCell::new(BL::new())); + +struct BL { + cursor: *mut u8, + limit: *mut u8, + bytes: [u8; Self::BLOCK_SIZE], +} + +impl BL { + const BLOCK_SIZE: usize = 100_000_000; + + const fn new() -> Self { + Self { + cursor: usize::max_value() as _, + limit: usize::max_value() as _, + bytes: [0; Self::BLOCK_SIZE], + } + } + + #[inline(always)] + pub fn alloc2(&mut self, bytes: usize) -> *mut u8 { + loop { + self.cursor = unsafe { (((self.cursor.sub(bytes)) as usize) & !15) as _ }; + if self.cursor >= self.limit { + return self.cursor; + } else if self.limit == usize::max_value() as _ { + self.limit = self.bytes.as_mut_ptr(); + self.cursor = unsafe { self.limit.add(Self::BLOCK_SIZE) }; + continue; + } else { + panic!("out of memory form BL"); + } + } + } +} + +use std::{ + alloc::{GlobalAlloc, Layout}, + hint::black_box, +}; + +struct B; + +unsafe impl GlobalAlloc for B { + #[inline(always)] + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + (*LOCAL_ALLOCATOR.0.get()).alloc2(layout.size()) + } + + #[inline(always)] + unsafe fn dealloc(&self, _ptr: *mut u8, _layout: Layout) {} +} + +use { + assert_matches::assert_matches, + iai_callgrind::{ + client_requests::callgrind::toggle_collect, library_benchmark, library_benchmark_group, + main, + }, + solana_sdk::{ + instruction::{AccountMeta, Instruction}, + message::Message, + pubkey::Pubkey, + signature::Signer, + signer::keypair::Keypair, + transaction::{SanitizedTransaction, Transaction}, + }, + solana_unified_scheduler_logic::{Page, SchedulingStateMachine}, +}; + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_schedule_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for i in 0..account_count { + if i % 2 == 0 { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } else { + accounts.push(AccountMeta::new_readonly(Keypair::new().pubkey(), true)); + } + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + toggle_collect(); + let task = scheduler.schedule_task(task); + toggle_collect(); + task.unwrap(); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_drop_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = 
Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + + toggle_collect(); + drop(task); + toggle_collect(); +} + +#[library_benchmark] +#[bench::one(1)] +fn bench_insert_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + + let mut b = std::collections::BTreeMap::new(); + toggle_collect(); + b.insert(task.index, task.clone()); + b.insert(task.index + 1, task.clone()); + b.remove(&task.index); + b.remove(&(task.index + 1)); + //b.insert(task.index + 4, task); + toggle_collect(); + drop(b); +} + +#[library_benchmark] +#[bench::arc_new(1)] +#[bench::arc_new_and_clone(2)] +#[bench::rc_new(3)] +#[bench::rc_new_and_clone(4)] +fn bench_arc(account_count: usize) { + toggle_collect(); + + { + let b; + match account_count { + 1 => { + toggle_collect(); + b = black_box(std::sync::Arc::new(black_box(3_u32))); + } + 2 => { + b = black_box(std::sync::Arc::new(black_box(3_u32))); + toggle_collect(); + std::mem::forget(black_box(b.clone())); + } + _ => { + let b; + match account_count { + 3 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + } + 4 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + black_box(b.clone()); + } + _ => panic!(), + } + toggle_collect(); + drop(b); + return; + } + } + toggle_collect(); + drop(b); + } +} + +#[library_benchmark] +#[bench::arc_new(1)] +#[bench::arc_new_and_clone(2)] +#[bench::rc_new(3)] +#[bench::rc_new_and_clone(4)] +fn bench_triomphe_arc(account_count: usize) { + toggle_collect(); + + { + let b; + match account_count { + 1 => { + toggle_collect(); + b = black_box(triomphe::Arc::new(black_box(3_u32))); + } + 2 => { + b = black_box(triomphe::Arc::new(black_box(3_u32))); + toggle_collect(); + std::mem::forget(black_box(b.clone())); + } + _ => { + let b; + match account_count { + 3 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + } + 4 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + black_box(b.clone()); + } + _ => panic!(), + } + toggle_collect(); + drop(b); + return; + } + } + toggle_collect(); + drop(b); + } +} + +#[library_benchmark] +#[bench::one(1)] +fn bench_heaviest_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; 
+ let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + + let mut b = std::collections::BTreeMap::new(); + b.insert(task.index, task.clone()); + b.insert(task.index + 1, task.clone()); + b.insert(task.index + 2, task.clone()); + let mut c = std::collections::BTreeMap::new(); + c.insert(task.index + 3, task.clone()); + c.insert(task.index + 4, task.clone()); + c.insert(task.index + 5, task.clone()); + + toggle_collect(); + let d = b.first_key_value(); + let e = c.first_key_value(); + let f = std::cmp::min_by(d, e, |x, y| x.map(|x| x.0).cmp(&y.map(|y| y.0))).map(|x| x.1); + assert_matches!(f.map(|f| f.task_index()), Some(0)); + toggle_collect(); + dbg!(f); + + drop(b); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_schedule_task_conflicting(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + let task2 = task.clone(); + toggle_collect(); + assert_matches!(scheduler.schedule_task(task2), None); + toggle_collect(); + drop(task); +} + +#[library_benchmark] +#[bench::min(3, 0)] +#[bench::one(3, 1)] +#[bench::two(2, 2)] +#[bench::three(3, 3)] +#[bench::normal(3, 32)] +#[bench::large(3, 64)] +#[bench::large2(3, 128)] +#[bench::large3(3, 256)] +#[bench::large4(3, 1024)] +#[bench::large5(3, 2048)] +fn bench_schedule_task_conflicting_hot(account_count: usize, task_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + + let mut pages: std::collections::HashMap = + std::collections::HashMap::new(); + let task = SchedulingStateMachine::create_task(tx0.clone(), 0, &mut |address| { + pages.entry(address).or_default().clone() + }); + scheduler.schedule_task(task).unwrap(); + for i in 1..=task_count { + let task = 
SchedulingStateMachine::create_task(tx0.clone(), i, &mut |address| { + pages.entry(address).or_default().clone() + }); + assert_matches!(scheduler.schedule_task(task), None); + } + + let task = SchedulingStateMachine::create_task(tx0.clone(), task_count + 1, &mut |address| { + pages.entry(address).or_default().clone() + }); + let task2 = task.clone(); + + toggle_collect(); + assert_matches!(scheduler.schedule_task(task2), None); + toggle_collect(); + + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_deschedule_task_conflicting(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + assert_matches!(scheduler.schedule_task(task.clone()), None); + + toggle_collect(); + scheduler.deschedule_task(&task); + toggle_collect(); + + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_schedule_unblocked_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let mut pages: std::collections::HashMap = + std::collections::HashMap::new(); + let task = SchedulingStateMachine::create_task(tx0.clone(), 0, &mut |address| { + pages.entry(address).or_default().clone() + }); + let task2 = SchedulingStateMachine::create_task(tx0, 1, &mut |address| { + pages.entry(address).or_default().clone() + }); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + assert_matches!(scheduler.schedule_task(task2), None); + scheduler.deschedule_task(&task); + toggle_collect(); + let retried_task = scheduler.schedule_unblocked_task(); + toggle_collect(); + let retried_task = retried_task.unwrap(); + assert_eq!(task.transaction(), retried_task.transaction()); + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::small(16)] +#[bench::normal(32)] +#[bench::large(64)] +//#[bench::max(128)] +fn bench_end_to_end_worst(account_count: usize) { + 
toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let mut pages: std::collections::HashMap = + std::collections::HashMap::new(); + let task = SchedulingStateMachine::create_task(tx0.clone(), 0, &mut |address| { + pages.entry(address).or_default().clone() + }); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + + let task = scheduler.schedule_task(task).unwrap(); + for i in 1..account_count { + let mut accounts = vec![memo_ix.accounts[i].clone()]; + //let mut accounts = vec![AccountMeta::new(Keypair::new().pubkey(), true)]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let ixs = vec![memo_ix]; + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task2 = SchedulingStateMachine::create_task(tx0, i, &mut |address| { + pages.entry(address).or_default().clone() + }); + toggle_collect(); + let scheduled_task = scheduler.schedule_task(task2.clone()); + toggle_collect(); + drop(scheduled_task); + } + + toggle_collect(); + scheduler.deschedule_task(&task); + if let Some(_cc) = account_count.checked_sub(1) { + //assert_eq!(scheduler.unblocked_task_count(), cc); + //let mut c = 0; + while let Some(retried_task) = scheduler.schedule_unblocked_task() { + //c += 1; + //scheduler.deschedule_task(&retried_task); + toggle_collect(); + drop::(retried_task); + toggle_collect(); + } + //assert_eq!(c, cc); + } + toggle_collect(); + + //assert_eq!(task2.task_index(), retried_task.task_index()); + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_deschedule_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for i in 0..account_count { + if i % 2 == 0 { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } else { + accounts.push(AccountMeta::new_readonly(Keypair::new().pubkey(), true)); + } + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + toggle_collect(); + 
scheduler.deschedule_task(&task); + toggle_collect(); + drop(task); +} + +library_benchmark_group!( + name = bench_scheduling_state_machine; + benchmarks = bench_end_to_end_worst, bench_arc, bench_triomphe_arc, bench_drop_task, bench_insert_task, bench_heaviest_task, bench_schedule_task, bench_schedule_task_conflicting, bench_schedule_task_conflicting_hot, bench_deschedule_task, bench_deschedule_task_conflicting, bench_schedule_unblocked_task + //benchmarks = bench_arc, bench_triomphe_arc + //benchmarks = bench_end_to_end_worst +); + +main!(library_benchmark_groups = bench_scheduling_state_machine); diff --git a/unified-scheduler-logic/src/lib.rs b/unified-scheduler-logic/src/lib.rs index 997c6c1745a7c9..6bffc9aaec5ac2 100644 --- a/unified-scheduler-logic/src/lib.rs +++ b/unified-scheduler-logic/src/lib.rs @@ -1,15 +1,315 @@ -use solana_sdk::transaction::SanitizedTransaction; +#![allow(rustdoc::private_intra_doc_links)] +//! The task (transaction) scheduling code for the unified scheduler +//! +//! ### High-level API and design +//! +//! The most important type is [`SchedulingStateMachine`]. It takes new tasks (= transactions) and +//! may return them back if runnable via +//! [`::schedule_task()`](SchedulingStateMachine::schedule_task) while maintaining the account +//! readonly/writable lock rules. Those returned runnable tasks are guaranteed to be safe to +//! execute in parallel. Lastly, `SchedulingStateMachine` should be notified about the completion +//! of the execution via [`::deschedule_task()`](SchedulingStateMachine::deschedule_task), so that +//! conflicting tasks can be returned from +//! [`::schedule_unblocked_task()`](SchedulingStateMachine::schedule_unblocked_task) as +//! newly-unblocked runnable ones. +//! +//! The design principle of this crate (`solana-unified-scheduler-logic`) is simplicity for the +//! sake of separation of concerns. It is interacted with only through a few of its public APIs by +//! `solana-unified-scheduler-pool`. This crate doesn't know about banks, slots, solana-runtime, +//! threads, or crossbeam-channel at all. Because of this, it's deterministic, easy to unit-test, +//! and its perf footprint is well understood. It really focuses on its single job: sorting +//! transactions in executable order. +//! +//! ### Algorithm +//! +//! The algorithm can be said to be based on per-address FIFO queues, which are updated both every +//! time a new task arrives (= called _scheduling_) and every time a runnable (= _post-scheduling_) +//! task is finished (= called _descheduling_). +//! +//! For the _non-conflicting scheduling_ case, the story is very simple; it just remembers that all +//! of the accessed addresses are write-locked or read-locked with the number of active (= +//! _currently-scheduled-and-not-descheduled-yet_) tasks. Correspondingly, descheduling does the +//! opposite book-keeping process, regardless of whether the finished task was conflicting or not. +//! +//! For the _conflicting scheduling_ case, each of the **non-conflicting addresses** is handled +//! like the non-conflicting case above. As for the **conflicting addresses**, the task is recorded in +//! the respective FIFO queues attached to the (conflicting) addresses. Importantly, the number of +//! conflicting addresses of the conflicting task is also remembered. +//! +//! The last missing piece is that the scheduler actually tries to reschedule previously blocked +//! tasks while descheduling, in addition to the above-mentioned book-keeping process. Namely, +//! 
when a given address becomes ready for fresh locking as a result of descheduling a task (i.e. the write +//! lock is released or the read lock count reaches zero), it pops out the first element of the +//! FIFO blocked-task queue of the address. Then, it immediately marks the address as relocked. It +//! also decrements the number of conflicting addresses of the popped-out task. As the final step, +//! if that number reaches zero, it means the task has fully finished locking all of its +//! addresses and is directly routed to be runnable. +//! +//! Put differently, this algorithm tries to gradually lock all of the addresses of tasks at different +//! timings while not deviating the execution order from the original task ingestion order. This +//! implies there are no locking retries in general, which is the primary source of non-linear perf +//! degradation. +//! +//! As a ballpark number from a synthesized micro benchmark on a usual CPU for `mainnet-beta` +//! validators, it takes roughly 100ns to schedule and deschedule a transaction with 10 accounts, +//! and 1us for a transaction with 100 accounts. Note that this excludes crossbeam communication +//! overhead entirely. That said, it's not unrealistic to say the whole unified scheduler can +//! attain 100k-1m tps overall, assuming those transaction executions aren't bottlenecked. +//! +//! ### Runtime performance characteristics and data structure arrangement +//! +//! Its algorithm is very fast for high throughput and real-time for low latency. The whole +//! unified-scheduler architecture is designed from the ground up to support the fastest execution of +//! this scheduling code. To that end, the unified scheduler pre-loads address-specific locking state +//! data structures (called [`Page`]) for all of a transaction's accounts, in order to offload the +//! job from the scheduler thread to other threads. This preloading is done inside +//! [`create_task()`](SchedulingStateMachine::create_task). In this way, the task scheduling +//! computational complexity is basically reduced to several word-sized loads and stores in the +//! scheduler thread (i.e. constant; no allocations nor syscalls), while being proportional to the +//! number of addresses in a given transaction. Note that this statement holds true regardless +//! of conflicts. This is because the preloading also pre-allocates some scratch-pad area +//! ([`blocked_tasks`](PageInner::blocked_tasks)) to stash blocked ones. So, a conflict only incurs +//! some additional fixed number of mem stores, within the error margin of the constant complexity. And +//! the additional memory allocation for the scratchpad could be said to be amortized, should such an unusual +//! event occur. +//! +//! [`Arc`] is used to implement this preloading mechanism, because `Page`s are shared across tasks +//! accessing the same account, and among threads due to the preloading. Also, interior mutability +//! is needed. However, `SchedulingStateMachine` doesn't use conventional locks like RwLock. +//! Leveraging the fact that it's the only state-mutating exclusive thread, it instead uses +//! `UnsafeCell`, which is sugar-coated by a tailored wrapper called [`TokenCell`]. `TokenCell` +//! imposes an overly restrictive aliasing rule via the Rust type system to maintain memory +//! safety. By localizing any synchronization to the message passing, the scheduling code itself +//! attains the maximally possible single-threaded execution without stalling cpu pipelines at all, only +//! 
constrained by mem access latency, while efficiently utilizing the L1-L3 cpu cache filled with +//! `Page`s. +//! +//! ### Buffer bloat insignificance +//! +//! The scheduler code itself doesn't care about the buffer bloat problem, which can occur in the +//! unified scheduler, where a run of heavily linearized and blocked tasks could be severely hampered +//! by a very large number of interleaved runnable tasks alongside. The reason is, again, the +//! separation of concerns. This is acceptable because the scheduling code itself isn't susceptible +//! to the buffer bloat problem by itself, as explained by the description and validated by the +//! benchmark mentioned above. Thus, this should be solved elsewhere, specifically at the scheduler +//! pool. +#[cfg(feature = "dev-context-only-utils")] +use qualifier_attr::field_qualifiers; +use { + crate::utils::{ShortCounter, Token, TokenCell}, + solana_sdk::{pubkey::Pubkey, transaction::SanitizedTransaction}, + static_assertions::const_assert_eq, + std::{collections::VecDeque, mem, sync::Arc}, +}; -pub struct Task { +/// Internal utilities. Namely this contains [`ShortCounter`] and [`TokenCell`]. +mod utils { + #[cfg(feature = "dev-context-only-utils")] + use qualifier_attr::qualifiers; + use std::{ + any::{self, TypeId}, + cell::{RefCell, UnsafeCell}, + collections::BTreeSet, + marker::PhantomData, + thread, + }; + + /// A really tiny counter to hide `.checked_{add,sub}` all over the place. + /// + /// It's the caller's responsibility to ensure this (backed by [`u32`]) never overflows. + #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] + #[derive(Debug, Clone, Copy)] + pub(super) struct ShortCounter(u32); + + impl ShortCounter { + pub(super) fn zero() -> Self { + Self(0) + } + + pub(super) fn one() -> Self { + Self(1) + } + + pub(super) fn is_one(&self) -> bool { + self.0 == 1 + } + + pub(super) fn is_zero(&self) -> bool { + self.0 == 0 + } + + pub(super) fn current(&self) -> u32 { + self.0 + } + + #[must_use] + pub(super) fn increment(self) -> Self { + Self(self.0.checked_add(1).unwrap()) + } + + #[must_use] + pub(super) fn decrement(self) -> Self { + Self(self.0.checked_sub(1).unwrap()) + } + + pub(super) fn increment_self(&mut self) -> &mut Self { + *self = self.increment(); + self + } + + pub(super) fn decrement_self(&mut self) -> &mut Self { + *self = self.decrement(); + self + } + + pub(super) fn reset_to_zero(&mut self) -> &mut Self { + self.0 = 0; + self + } + } + + /// A conditionally [`Send`]-able and [`Sync`]-able cell leveraging the scheduler's one-by-one data + /// access pattern with zero runtime synchronization cost. + /// + /// To comply with Rust's aliasing rules, these cells require a carefully-created [`Token`] to + /// be passed around to access the inner values. The token is a special-purpose phantom object + /// to get rid of the inherent `unsafe`-ness of [`UnsafeCell`], which is internally used for + /// the interior mutability. + /// + /// The final objective of [`Token`] is to ensure there's only one mutable reference to the + /// [`TokenCell`] at most _at any given moment_. To that end, it's `unsafe` to create it, + /// shifting the responsibility of binding the only singleton instance to a particular thread + /// and not creating more than one, onto the API consumers. And its constructor is non-`const`, + /// and the type is `!Clone` (and `!Copy` as well), `!Default`, `!Send` and `!Sync` to make it + /// relatively hard to cross thread boundaries accidentally. 
+ /// + /// In other words, the token semantically _owns_ all of its associated instances of + /// [`TokenCell`]s. And `&mut Token` is needed to access one of them, as if it were one of + /// [`Token`]'s `*_mut()` getters. Thus, the Rust aliasing rule for `UnsafeCell` can + /// transitively be proven to be satisfied simply based on the usual borrow checking of the + /// `&mut` reference of [`Token`] itself via [`::borrow_mut()`](TokenCell::borrow_mut). + /// + /// By extension, it's allowed to create _multiple_ tokens in a _single_ process as long as no + /// instance of [`TokenCell`] is shared by multiple instances of [`Token`]. + /// + /// Note that this is overly restrictive in that it's forbidden, yet, technically possible + /// to _have multiple mutable references to the inner values at the same time, if and only + /// if the respective cells aren't aliased to each other (i.e. different instances)_. This + /// artificial restriction is acceptable for its intended use by the unified scheduler's code + /// because its algorithm only needs to access each instance of [`TokenCell`]-ed data once at a + /// time. Finally, this restriction is traded off for restoration of the Rust aliasing rule at zero + /// runtime cost. Without this token mechanism, there's no way to realize this. + #[derive(Debug, Default)] + pub(super) struct TokenCell<V>(UnsafeCell<V>); + + impl<V> TokenCell<V> { + /// Creates a new `TokenCell` with the `value` typed as `V`. + /// + /// Note that this isn't parametric over its accompanying `Token`'s lifetime to avoid + /// complex handling of non-`'static` heaped data in general. Instead, it's manually + /// required to ensure this instance is accessed only via its associated Token for the + /// entire lifetime. + // non-const to forbid unprotected sharing via static variables among threads. + pub(super) fn new(value: V) -> Self { + Self(UnsafeCell::new(value)) + } + + /// Returns a mutable reference with its lifetime bound to the mutable reference of the + /// given token. + /// + /// In this way, any additional reborrow can never happen at the same time across all + /// instances of [`TokenCell`] conceptually owned by the instance of [`Token`] (a + /// particular thread), unless the previous borrow is released. After the release, the used + /// singleton token should be free to be reused for reborrows. + pub(super) fn borrow_mut<'t>(&self, _token: &'t mut Token<V>) -> &'t mut V { + unsafe { &mut *self.0.get() } + } + } + + // Safety: Access to TokenCell is assumed to be only from a single thread by proper use of + // Token once after TokenCell is sent to the thread from other threads; so, implementing both + // Send and Sync can be thought of as safe. + // + // In other words, TokenCell is technically still !Send and !Sync. But there should be no legal + // use happening which requires !Send or !Sync to avoid undefined behavior. + unsafe impl<V> Send for TokenCell<V> {} + unsafe impl<V> Sync for TokenCell<V> {} + + /// An auxiliary zero-sized type to enforce the aliasing rule for [`TokenCell`] via the Rust type system + /// + /// Token semantically owns a collection of `TokenCell` objects and governs the _unique_ + /// existence of mutable access over them by requiring the token itself to be mutably borrowed + /// to get a mutable reference to the internal value of `TokenCell`. 
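+    ///
+    /// A minimal sketch of the intended pattern (the `u32` payload and the variable names here
+    /// are illustrative assumptions, not taken from the scheduler itself):
+    ///
+    /// ```ignore
+    /// let cell = TokenCell::new(0_u32);
+    /// // Safety: this must be the only `Token<u32>` ever created on this thread.
+    /// let mut token = unsafe { Token::<u32>::assume_exclusive_mutating_thread() };
+    /// // The mutable borrow of the cell is tied to the mutable borrow of the token, so the
+    /// // ordinary borrow checker enforces exclusivity at zero runtime cost.
+    /// *cell.borrow_mut(&mut token) += 1;
+    /// assert_eq!(*cell.borrow_mut(&mut token), 1);
+    /// ```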
+ #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] + // *mut is used to make this type !Send and !Sync + pub(super) struct Token<V>(PhantomData<*mut V>); + + impl<V: 'static> Token<V> { + // Returns the token to acquire a mutable reference to the inner value of [TokenCell]. + // + // Safety: + // This method should be called exactly once for each thread at most. + #[must_use] + pub(super) unsafe fn assume_exclusive_mutating_thread() -> Self { + thread_local! { + static TOKENS: RefCell<BTreeSet<TypeId>> = const { RefCell::new(BTreeSet::new()) }; + } + assert!( + TOKENS.with_borrow_mut(|tokens| tokens.insert(TypeId::of::<Self>())), + "{:?} is wrongly initialized twice on {:?}", + any::type_name::<Self>(), + thread::current() + ); + + Self(PhantomData) + } + } + + #[cfg(test)] + mod tests { + use super::Token; + + #[test] + #[should_panic( + expected = "\"solana_unified_scheduler_logic::utils::Token\" is wrongly \ + initialized twice on Thread" + )] + fn test_second_creation_of_tokens_in_a_thread() { + unsafe { + let _ = Token::::assume_exclusive_mutating_thread(); + let _ = Token::::assume_exclusive_mutating_thread(); + } + } + } +} + +/// [`Result`] for locking a [page](Page) with a particular [usage](RequestedUsage). +type LockResult = Result<PageUsage, ()>; +const_assert_eq!(mem::size_of::<LockResult>(), 8); + +/// Something to be scheduled; usually a wrapper of [`SanitizedTransaction`]. +pub type Task = Arc<TaskInner>; +const_assert_eq!(mem::size_of::<Task>(), 8); + +/// [`Token`] for [`Page`]. +type PageToken = Token<PageInner>; +const_assert_eq!(mem::size_of::<PageToken>(), 0); + +/// [`Token`] for [task](Task)'s [internal mutable data](`TaskInner::blocked_page_count`). +type BlockedPageCountToken = Token<ShortCounter>; +const_assert_eq!(mem::size_of::<BlockedPageCountToken>(), 0); + +/// Internal scheduling data about a particular task. +#[cfg_attr(feature = "dev-context-only-utils", field_qualifiers(index(pub)))] +#[derive(Debug)] +pub struct TaskInner { transaction: SanitizedTransaction, index: usize, + lock_attempts: Vec<LockAttempt>, + blocked_page_count: TokenCell<ShortCounter>, } -impl Task { - pub fn create_task(transaction: SanitizedTransaction, index: usize) -> Self { - Task { transaction, index } - } - +impl TaskInner { pub fn task_index(&self) -> usize { self.index } @@ -17,4 +317,1025 @@ impl Task { pub fn transaction(&self) -> &SanitizedTransaction { &self.transaction } + + fn lock_attempts(&self) -> &Vec<LockAttempt> { + &self.lock_attempts + } + + fn blocked_page_count_mut<'t>( + &self, + token: &'t mut BlockedPageCountToken, + ) -> &'t mut ShortCounter { + self.blocked_page_count.borrow_mut(token) + } + + fn set_blocked_page_count(&self, token: &mut BlockedPageCountToken, count: ShortCounter) { + *self.blocked_page_count_mut(token) = count; + } + + #[must_use] + fn try_unblock(self: &Task, token: &mut BlockedPageCountToken) -> Option<Task> { + self.blocked_page_count_mut(token) + .decrement_self() + .is_zero() + .then(|| self.clone()) + } +} + +/// [`Task`]'s per-address attempt to use a [page](Page) with a [certain kind of +/// request](RequestedUsage). +#[derive(Debug)] +struct LockAttempt { + page: Page, + requested_usage: RequestedUsage, +} +const_assert_eq!(mem::size_of::<LockAttempt>(), 16); + +impl LockAttempt { + fn new(page: Page, requested_usage: RequestedUsage) -> Self { + Self { + page, + requested_usage, + } + } + + fn page_mut<'t>(&self, page_token: &'t mut PageToken) -> &'t mut PageInner { + self.page.0.borrow_mut(page_token) + } +} + +/// Status about how the [`Page`] is used currently. Unlike [`RequestedUsage`], it has an additional +/// variant of [`Unused`](`PageUsage::Unused`). 
+#[derive(Copy, Clone, Debug, Default)] +enum PageUsage { + #[default] + Unused, + Readonly(ShortCounter), + Writable, +} +const_assert_eq!(mem::size_of::<PageUsage>(), 8); + +impl PageUsage { + fn from_requested_usage(requested_usage: RequestedUsage) -> Self { + match requested_usage { + RequestedUsage::Readonly => PageUsage::Readonly(ShortCounter::one()), + RequestedUsage::Writable => PageUsage::Writable, + } + } +} + +/// Status about how a task is requesting to use a particular [`Page`]. Unlike [`PageUsage`], +/// it has only two unit variants. +#[derive(Clone, Copy, Debug)] +enum RequestedUsage { + Readonly, + Writable, +} + +/// Internal scheduling data about a particular address. +/// +/// Specifically, it holds the current [`PageUsage`] (or no usage with [`PageUsage::Unused`]) and +/// which [`Task`]s are blocked to be executed after the current task is notified to be finished +/// via [`::deschedule_task`](`SchedulingStateMachine::deschedule_task`). +#[derive(Debug)] +struct PageInner { + usage: PageUsage, + blocked_tasks: VecDeque<(Task, RequestedUsage)>, +} + +impl Default for PageInner { + fn default() -> Self { + Self { + usage: PageUsage::default(), + blocked_tasks: VecDeque::with_capacity(1024), + } + } +} + +impl PageInner { + fn push_blocked_task(&mut self, task: Task, requested_usage: RequestedUsage) { + self.blocked_tasks.push_back((task, requested_usage)); + } + + fn has_no_blocked_task(&self) -> bool { + self.blocked_tasks.is_empty() + } + + #[must_use] + fn pop_unblocked_next_task(&mut self) -> Option<(Task, RequestedUsage)> { + self.blocked_tasks.pop_front() + } + + #[must_use] + fn blocked_next_task(&self) -> Option<(&Task, RequestedUsage)> { + self.blocked_tasks + .front() + .map(|(task, requested_usage)| (task, *requested_usage)) + } + + #[must_use] + fn pop_blocked_next_readonly_task(&mut self) -> Option<(Task, RequestedUsage)> { + if matches!( + self.blocked_next_task(), + Some((_, RequestedUsage::Readonly)) + ) { + self.pop_unblocked_next_task() + } else { + None + } + } +} + +const_assert_eq!(mem::size_of::<TokenCell<PageInner>>(), 40); + +/// Scheduler's internal data for each address ([`Pubkey`](`solana_sdk::pubkey::Pubkey`)). A very +/// opaque wrapper type; it has no methods other than [`::clone()`](Clone::clone) and +/// [`::default()`](Default::default). +#[derive(Debug, Clone, Default)] +pub struct Page(Arc<TokenCell<PageInner>>); +const_assert_eq!(mem::size_of::<Page>(), 8); + +/// A high-level `struct`, managing the overall scheduling of [tasks](Task), to be used by +/// `solana-unified-scheduler-pool`. 
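+///
+/// A rough usage sketch, mirroring the unit tests below rather than the real wiring in
+/// `solana-unified-scheduler-pool`; the trivial `Page::default()` loader and the placeholder
+/// `tx: SanitizedTransaction` are assumptions for illustration only:
+///
+/// ```ignore
+/// let mut state_machine = unsafe {
+///     // Safety: called exactly once on this thread.
+///     SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling()
+/// };
+/// let task = SchedulingStateMachine::create_task(tx, 0, &mut |_pubkey| Page::default());
+/// if let Some(task) = state_machine.schedule_task(task) {
+///     // ... execute the runnable task, then notify completion:
+///     state_machine.deschedule_task(&task);
+/// }
+/// // Descheduling may have unblocked previously conflicting tasks:
+/// while let Some(unblocked) = state_machine.schedule_unblocked_task() {
+///     // ... execute it, then:
+///     state_machine.deschedule_task(&unblocked);
+/// }
+/// ```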
+#[cfg_attr(feature = "dev-context-only-utils", field_qualifiers(count_token(pub)))] +pub struct SchedulingStateMachine { + last_task_index: Option<usize>, + unblocked_task_queue: VecDeque<Task>, + active_task_count: ShortCounter, + handled_task_count: ShortCounter, + unblocked_task_count: ShortCounter, + total_task_count: ShortCounter, + count_token: BlockedPageCountToken, + page_token: PageToken, +} +const_assert_eq!(mem::size_of::<SchedulingStateMachine>(), 64); + +impl SchedulingStateMachine { + pub fn has_no_active_task(&self) -> bool { + self.active_task_count.is_zero() + } + + pub fn unblocked_task_queue_count(&self) -> usize { + self.unblocked_task_queue.len() + } + + pub fn active_task_count(&self) -> u32 { + self.active_task_count.current() + } + + pub fn handled_task_count(&self) -> u32 { + self.handled_task_count.current() + } + + pub fn unblocked_task_count(&self) -> u32 { + self.unblocked_task_count.current() + } + + pub fn total_task_count(&self) -> u32 { + self.total_task_count.current() + } + + #[must_use] + pub fn schedule_task(&mut self, task: Task) -> Option<Task> { + let new_task_index = task.task_index(); + if let Some(old_task_index) = self.last_task_index.replace(new_task_index) { + assert!( + new_task_index > old_task_index, + "bad new task index: {new_task_index} > {old_task_index}" + ); + } + self.total_task_count.increment_self(); + self.active_task_count.increment_self(); + self.attempt_lock_for_task(task) + } + + pub fn has_unblocked_task(&self) -> bool { + !self.unblocked_task_queue.is_empty() + } + + #[must_use] + pub fn schedule_unblocked_task(&mut self) -> Option<Task> { + self.unblocked_task_queue.pop_front().map(|task| { + self.unblocked_task_count.increment_self(); + task + }) + } + + pub fn deschedule_task(&mut self, task: &Task) { + let blocked_task_index = task.task_index(); + let largest_task_index = self + .last_task_index + .expect("task should have been scheduled"); + assert!( + blocked_task_index <= largest_task_index, + "bad unblocked task index: {blocked_task_index} <= {largest_task_index}" + ); + self.active_task_count.decrement_self(); + self.handled_task_count.increment_self(); + self.unlock_for_task(task); + } + + #[must_use] + fn attempt_lock_pages(&mut self, task: &Task) -> ShortCounter { + let mut blocked_page_count = ShortCounter::zero(); + + for attempt in task.lock_attempts() { + let page = attempt.page_mut(&mut self.page_token); + let lock_status = if page.has_no_blocked_task() { + Self::attempt_lock_page(page, attempt.requested_usage) + } else { + LockResult::Err(()) + }; + match lock_status { + LockResult::Ok(PageUsage::Unused) => unreachable!(), + LockResult::Ok(new_usage) => { + page.usage = new_usage; + } + LockResult::Err(()) => { + blocked_page_count.increment_self(); + page.push_blocked_task(task.clone(), attempt.requested_usage); + } + } + } + + blocked_page_count + } + + fn attempt_lock_page(page: &PageInner, requested_usage: RequestedUsage) -> LockResult { + match page.usage { + PageUsage::Unused => LockResult::Ok(PageUsage::from_requested_usage(requested_usage)), + PageUsage::Readonly(count) => match requested_usage { + RequestedUsage::Readonly => LockResult::Ok(PageUsage::Readonly(count.increment())), + RequestedUsage::Writable => LockResult::Err(()), + }, + PageUsage::Writable => LockResult::Err(()), + } + } + + #[must_use] + fn unlock_page(page: &mut PageInner, attempt: &LockAttempt) -> Option<(Task, RequestedUsage)> { + let mut is_unused_now = false; + match &mut page.usage { + PageUsage::Readonly(ref mut count) => match attempt.requested_usage { + 
RequestedUsage::Readonly => { + if count.is_one() { + is_unused_now = true; + } else { + count.decrement_self(); + } + } + RequestedUsage::Writable => unreachable!(), + }, + PageUsage::Writable => match attempt.requested_usage { + RequestedUsage::Writable => { + is_unused_now = true; + } + RequestedUsage::Readonly => unreachable!(), + }, + PageUsage::Unused => unreachable!(), + } + + if is_unused_now { + page.usage = PageUsage::Unused; + page.pop_unblocked_next_task() + } else { + None + } + } + + #[must_use] + fn attempt_lock_for_task(&mut self, task: Task) -> Option<Task> { + let blocked_page_count = self.attempt_lock_pages(&task); + + if blocked_page_count.is_zero() { + // succeeded + Some(task) + } else { + // failed + task.set_blocked_page_count(&mut self.count_token, blocked_page_count); + None + } + } + + fn unlock_for_task(&mut self, task: &Task) { + for unlock_attempt in task.lock_attempts() { + let page = unlock_attempt.page_mut(&mut self.page_token); + let mut unblocked_task_from_page = Self::unlock_page(page, unlock_attempt); + + while let Some((task_with_unblocked_page, requested_usage)) = unblocked_task_from_page { + if let Some(task) = task_with_unblocked_page.try_unblock(&mut self.count_token) { + self.unblocked_task_queue.push_back(task); + } + + match Self::attempt_lock_page(page, requested_usage) { + LockResult::Ok(PageUsage::Unused) => unreachable!(), + LockResult::Ok(new_usage) => { + page.usage = new_usage; + // Try to further schedule blocked tasks for parallelism in the case of + // readonly usages + unblocked_task_from_page = if matches!(new_usage, PageUsage::Readonly(_)) { + page.pop_blocked_next_readonly_task() + } else { + None + }; + } + LockResult::Err(_) => panic!("should never fail in this context"), + } + } + } + } + + /// Creates a new task with [`SanitizedTransaction`] with all of its corresponding [`Page`]s + /// preloaded. + /// + /// The closure (`page_loader`) is used to delegate the (possibly multi-threaded) + /// implementation of [`Page`] look-up by [`pubkey`](Pubkey) to callers. It's the caller's + /// responsibility to ensure the same instance is returned from the closure, given a particular + /// pubkey. + pub fn create_task( + transaction: SanitizedTransaction, + index: usize, + page_loader: &mut impl FnMut(Pubkey) -> Page, + ) -> Task { + // this is safe bla bla + let locks = transaction.get_account_locks_unchecked(); + + let writable_locks = locks + .writable + .iter() + .map(|address| (address, RequestedUsage::Writable)); + let readonly_locks = locks + .readonly + .iter() + .map(|address| (address, RequestedUsage::Readonly)); + + let lock_attempts = writable_locks + .chain(readonly_locks) + .map(|(address, requested_usage)| { + LockAttempt::new(page_loader(**address), requested_usage) + }) + .collect(); + + Task::new(TaskInner { + transaction, + index, + lock_attempts, + blocked_page_count: TokenCell::new(ShortCounter::zero()), + }) + } + + /// Rewind the inactive state machine back to its initialized state. + /// + /// This isn't called _reset_ to indicate that it isn't safe to call at any arbitrary moment. + /// This panics if the state machine hasn't properly been finished (i.e. there should be no + /// active task) to uphold the invariants of [`Page`]s. + /// + /// This method is intended to reuse the SchedulingStateMachine instance (to avoid its `unsafe` + /// [constructor](SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling) + /// as much as possible) and its (possibly cached) associated [`Page`]s for processing other + /// slots. 
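+    ///
+    /// A small sketch of the intended call pattern between slots (`state_machine` is assumed to
+    /// be an existing instance whose tasks have all been descheduled already):
+    ///
+    /// ```ignore
+    /// assert!(state_machine.has_no_active_task());
+    /// state_machine.reinitialize();
+    /// // The same instance (and its cached `Page`s) can now serve the next slot without
+    /// // calling the `unsafe` constructor again.
+    /// ```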
+ pub fn reinitialize(&mut self) { + assert!(self.has_no_active_task()); + assert_eq!(self.unblocked_task_queue.len(), 0); + self.last_task_index = None; + self.active_task_count.reset_to_zero(); + self.handled_task_count.reset_to_zero(); + self.unblocked_task_count.reset_to_zero(); + self.total_task_count.reset_to_zero(); + } + + /// Creates a new instance of [`SchedulingStateMachine`] with its `unsafe` fields created as + /// well, thus carrying over `unsafe`. + /// + /// # Safety + /// Call this exactly once for each thread. See [`TokenCell`] for details. + #[must_use] + pub unsafe fn exclusively_initialize_current_thread_for_scheduling() -> Self { + Self { + last_task_index: None, + unblocked_task_queue: VecDeque::with_capacity(1024), + active_task_count: ShortCounter::zero(), + handled_task_count: ShortCounter::zero(), + unblocked_task_count: ShortCounter::zero(), + total_task_count: ShortCounter::zero(), + count_token: unsafe { BlockedPageCountToken::assume_exclusive_mutating_thread() }, + page_token: unsafe { PageToken::assume_exclusive_mutating_thread() }, + } + } +} + +#[cfg(test)] +mod tests { + use { + super::*, + assert_matches::assert_matches, + solana_sdk::{ + instruction::{AccountMeta, Instruction}, + message::Message, + pubkey::Pubkey, + signature::Signer, + signer::keypair::Keypair, + transaction::{SanitizedTransaction, Transaction}, + }, + std::{cell::RefCell, collections::HashMap, rc::Rc}, + }; + + fn simplest_transaction() -> SanitizedTransaction { + let payer = Keypair::new(); + let message = Message::new(&[], Some(&payer.pubkey())); + let unsigned = Transaction::new_unsigned(message); + SanitizedTransaction::from_transaction_for_tests(unsigned) + } + + fn transaction_with_readonly_address(address: Pubkey) -> SanitizedTransaction { + let instruction = Instruction { + program_id: Pubkey::default(), + accounts: vec![AccountMeta::new_readonly(address, false)], + data: vec![], + }; + let message = Message::new(&[instruction], Some(&Pubkey::new_unique())); + let unsigned = Transaction::new_unsigned(message); + SanitizedTransaction::from_transaction_for_tests(unsigned) + } + + fn transaction_with_writable_address(address: Pubkey) -> SanitizedTransaction { + let instruction = Instruction { + program_id: Pubkey::default(), + accounts: vec![AccountMeta::new(address, false)], + data: vec![], + }; + let message = Message::new(&[instruction], Some(&Pubkey::new_unique())); + let unsigned = Transaction::new_unsigned(message); + SanitizedTransaction::from_transaction_for_tests(unsigned) + } + + fn create_address_loader( + pages: Option>>>, + ) -> impl FnMut(Pubkey) -> Page { + let pages = pages.unwrap_or_default(); + move |address| pages.borrow_mut().entry(address).or_default().clone() + } + + #[test] + fn test_debug() { + // these are almost meaningless just to see eye-pleasing coverage report.... 
+ assert_eq!( + format!( + "{:?}", + LockResult::Ok(PageUsage::Readonly(ShortCounter::one())) + ), + "Ok(Readonly(ShortCounter(1)))" + ); + let sanitized = simplest_transaction(); + let task = SchedulingStateMachine::create_task(sanitized, 0, &mut |_| Page::default()); + assert!(format!("{:?}", task).contains("TaskInner")); + + assert_eq!( + format!("{:?}", PageInner::default()), + "PageInner { usage: Unused, blocked_tasks: [] }" + ) + } + + #[test] + fn test_scheduling_state_machine_creation() { + let state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_eq!(state_machine.active_task_count(), 0); + assert_eq!(state_machine.total_task_count(), 0); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_scheduling_state_machine_reinitialization() { + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + state_machine.total_task_count.increment_self(); + assert_eq!(state_machine.total_task_count(), 1); + state_machine.last_task_index = Some(1); + state_machine.reinitialize(); + assert_eq!(state_machine.total_task_count(), 0); + assert_eq!(state_machine.last_task_index, None); + } + + #[test] + fn test_create_task() { + let sanitized = simplest_transaction(); + let task = + SchedulingStateMachine::create_task(sanitized.clone(), 3, &mut |_| Page::default()); + assert_eq!(task.task_index(), 3); + assert_eq!(task.transaction(), &sanitized); + } + + #[test] + fn test_non_conflicting_task_related_counts() { + let sanitized = simplest_transaction(); + let address_loader = &mut create_address_loader(None); + let task = SchedulingStateMachine::create_task(sanitized.clone(), 3, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let task = state_machine.schedule_task(task).unwrap(); + assert_eq!(state_machine.active_task_count(), 1); + assert_eq!(state_machine.total_task_count(), 1); + state_machine.deschedule_task(&task); + assert_eq!(state_machine.active_task_count(), 0); + assert_eq!(state_machine.total_task_count(), 1); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_conflicting_task_related_counts() { + let sanitized = simplest_transaction(); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized.clone(), 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized.clone(), 103, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + + state_machine.deschedule_task(&task1); + assert!(state_machine.has_unblocked_task()); + assert_eq!(state_machine.unblocked_task_queue_count(), 1); + assert_eq!( + state_machine + .schedule_unblocked_task() + .unwrap() + .task_index(), + task2.task_index() + ); + assert!(!state_machine.has_unblocked_task()); + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + state_machine.deschedule_task(&task2); + + assert_matches!( + state_machine + .schedule_task(task3.clone()) + .map(|task| task.task_index()), + Some(103) + ); + state_machine.deschedule_task(&task3); + 
assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_unblocked_task_related_counts() { + let sanitized = simplest_transaction(); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized.clone(), 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + + state_machine.deschedule_task(&task1); + + assert_eq!(state_machine.unblocked_task_count(), 0); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(102) + ); + assert_eq!(state_machine.unblocked_task_count(), 1); + // there's no blocked task anymore; calling schedule_unblocked_task should be noop and + // shouldn't increment the unblocked_task_count(). + assert_matches!(state_machine.schedule_unblocked_task(), None); + assert_eq!(state_machine.unblocked_task_count(), 1); + + state_machine.deschedule_task(&task2); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_existing_blocking_task_then_newly_scheduled_task() { + let sanitized = simplest_transaction(); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized.clone(), 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized.clone(), 103, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + state_machine.deschedule_task(&task1); + assert_eq!(state_machine.unblocked_task_queue_count(), 1); + + // new task is arriving after task1 is already descheduled and task2 got unblocked + assert_matches!(state_machine.schedule_task(task3.clone()), None); + + assert_eq!(state_machine.unblocked_task_count(), 0); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(102) + ); + assert_eq!(state_machine.unblocked_task_count(), 1); + + state_machine.deschedule_task(&task2); + + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(103) + ); + assert_eq!(state_machine.unblocked_task_count(), 2); + + state_machine.deschedule_task(&task3); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_multiple_readonly_task_and_counts() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_readonly_address(conflicting_address); + let sanitized2 = transaction_with_readonly_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + // both of read-only tasks should be 
immediately runnable + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!( + state_machine + .schedule_task(task2.clone()) + .map(|t| t.task_index()), + Some(102) + ); + + assert_eq!(state_machine.active_task_count(), 2); + assert_eq!(state_machine.handled_task_count(), 0); + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + state_machine.deschedule_task(&task1); + assert_eq!(state_machine.active_task_count(), 1); + assert_eq!(state_machine.handled_task_count(), 1); + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + state_machine.deschedule_task(&task2); + assert_eq!(state_machine.active_task_count(), 0); + assert_eq!(state_machine.handled_task_count(), 2); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_all_blocking_redable_tasks_block_writable_task() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_readonly_address(conflicting_address); + let sanitized2 = transaction_with_readonly_address(conflicting_address); + let sanitized3 = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized3, 103, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!( + state_machine + .schedule_task(task2.clone()) + .map(|t| t.task_index()), + Some(102) + ); + assert_matches!(state_machine.schedule_task(task3.clone()), None); + + assert_eq!(state_machine.active_task_count(), 3); + assert_eq!(state_machine.handled_task_count(), 0); + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + state_machine.deschedule_task(&task1); + assert_eq!(state_machine.active_task_count(), 2); + assert_eq!(state_machine.handled_task_count(), 1); + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + assert_matches!(state_machine.schedule_unblocked_task(), None); + state_machine.deschedule_task(&task2); + assert_eq!(state_machine.active_task_count(), 1); + assert_eq!(state_machine.handled_task_count(), 2); + assert_eq!(state_machine.unblocked_task_queue_count(), 1); + // task3 is finally unblocked after all of readble tasks (task1 and task2) is finished. 
+ assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(103) + ); + state_machine.deschedule_task(&task3); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_readonly_then_writable_then_readonly_linearized() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_readonly_address(conflicting_address); + let sanitized2 = transaction_with_writable_address(conflicting_address); + let sanitized3 = transaction_with_readonly_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized3, 103, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + assert_matches!(state_machine.schedule_task(task3.clone()), None); + + assert_matches!(state_machine.schedule_unblocked_task(), None); + state_machine.deschedule_task(&task1); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(102) + ); + assert_matches!(state_machine.schedule_unblocked_task(), None); + state_machine.deschedule_task(&task2); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(103) + ); + assert_matches!(state_machine.schedule_unblocked_task(), None); + state_machine.deschedule_task(&task3); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_readonly_then_writable() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_readonly_address(conflicting_address); + let sanitized2 = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + + // descheduling read-locking task1 should equate to unblocking write-locking task2 + state_machine.deschedule_task(&task1); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(102) + ); + state_machine.deschedule_task(&task2); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_blocked_tasks_writable_2_readonly_then_writable() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_writable_address(conflicting_address); + let sanitized2 = transaction_with_readonly_address(conflicting_address); + let sanitized3 = transaction_with_readonly_address(conflicting_address); + let sanitized4 = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = 
SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized3, 103, address_loader); + let task4 = SchedulingStateMachine::create_task(sanitized4, 104, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + assert_matches!(state_machine.schedule_task(task3.clone()), None); + assert_matches!(state_machine.schedule_task(task4.clone()), None); + + state_machine.deschedule_task(&task1); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(102) + ); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(103) + ); + // the above deschedule_task(task1) call should only unblock task2 and task3 because these + // are read-locking. And shouldn't unblock task4 because it's write-locking + assert_matches!(state_machine.schedule_unblocked_task(), None); + + state_machine.deschedule_task(&task2); + // still task4 is blocked... + assert_matches!(state_machine.schedule_unblocked_task(), None); + + state_machine.deschedule_task(&task3); + // finally task4 should be unblocked + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(104) + ); + state_machine.deschedule_task(&task4); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_gradual_locking() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_writable_address(conflicting_address); + let sanitized2 = transaction_with_writable_address(conflicting_address); + let pages = Rc::new(RefCell::new(HashMap::new())); + let address_loader = &mut create_address_loader(Some(pages.clone())); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + let pages = pages.borrow_mut(); + let page = pages.get(&conflicting_address).unwrap(); + assert_matches!( + page.0.borrow_mut(&mut state_machine.page_token).usage, + PageUsage::Writable + ); + // task2's fee payer should have been locked already even if task2 is blocked still via the + // above the schedule_task(task2) call + let fee_payer = task2.transaction().message().fee_payer(); + let page = pages.get(fee_payer).unwrap(); + assert_matches!( + page.0.borrow_mut(&mut state_machine.page_token).usage, + PageUsage::Writable + ); + } + + #[test] + #[should_panic(expected = "internal error: entered unreachable code")] + fn test_unreachable_unlock_conditions1() { + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let page = Page::default(); + let _ = SchedulingStateMachine::unlock_page( + page.0.borrow_mut(&mut state_machine.page_token), + &LockAttempt::new(page, RequestedUsage::Writable), + ); + } + + #[test] + #[should_panic(expected = "internal error: entered unreachable code")] + fn test_unreachable_unlock_conditions2() { + 
let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let page = Page::default(); + page.0.borrow_mut(&mut state_machine.page_token).usage = PageUsage::Writable; + let _ = SchedulingStateMachine::unlock_page( + page.0.borrow_mut(&mut state_machine.page_token), + &LockAttempt::new(page, RequestedUsage::Readonly), + ); + } + + #[test] + #[should_panic(expected = "internal error: entered unreachable code")] + fn test_unreachable_unlock_conditions3() { + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let page = Page::default(); + page.0.borrow_mut(&mut state_machine.page_token).usage = + PageUsage::Readonly(ShortCounter::one()); + let _ = SchedulingStateMachine::unlock_page( + page.0.borrow_mut(&mut state_machine.page_token), + &LockAttempt::new(page, RequestedUsage::Writable), + ); + } + + #[test] + #[should_panic(expected = "bad new task index: 101 > 101")] + fn test_schedule_same_task() { + let conflicting_address = Pubkey::new_unique(); + let sanitized = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task = SchedulingStateMachine::create_task(sanitized, 101, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let _ = state_machine.schedule_task(task.clone()); + let _ = state_machine.schedule_task(task.clone()); + } + + #[test] + #[should_panic(expected = "bad new task index: 101 > 102")] + fn test_schedule_task_out_of_order() { + let conflicting_address = Pubkey::new_unique(); + let sanitized = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized.clone(), 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let _ = state_machine.schedule_task(task2.clone()); + let _ = state_machine.schedule_task(task1.clone()); + } + + #[test] + #[should_panic(expected = "task should have been scheduled")] + fn test_deschedule_new_task_wihout_scheduling() { + let conflicting_address = Pubkey::new_unique(); + let sanitized = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + state_machine.deschedule_task(&task); + } + + #[test] + #[should_panic(expected = "bad unblocked task index: 102 <= 101")] + fn test_deschedule_new_task_out_of_order() { + let conflicting_address = Pubkey::new_unique(); + let sanitized = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized.clone(), 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let _ = state_machine.schedule_task(task1.clone()); + state_machine.deschedule_task(&task2); + } } diff --git 
a/unified-scheduler-pool/Cargo.toml b/unified-scheduler-pool/Cargo.toml index 7626215b1e1126..bc2a33014ff266 100644 --- a/unified-scheduler-pool/Cargo.toml +++ b/unified-scheduler-pool/Cargo.toml @@ -11,17 +11,41 @@ edition = { workspace = true } [dependencies] assert_matches = { workspace = true } +cpu-time = { workspace = true } crossbeam-channel = { workspace = true } +dashmap = { workspace = true } derivative = { workspace = true } log = { workspace = true } +qualifier_attr = { workspace = true } +rustix = { workspace = true } +serde_json = { workspace = true } solana-ledger = { workspace = true } +solana-measure = { workspace = true } +solana-metrics = { workspace = true } solana-program-runtime = { workspace = true } solana-runtime = { workspace = true } solana-sdk = { workspace = true } solana-unified-scheduler-logic = { workspace = true } solana-vote = { workspace = true } +[target."cfg(target_os = \"linux\")".dependencies] +procfs = { workspace = true } + [dev-dependencies] -assert_matches = { workspace = true } +bincode = { workspace = true } +criterion = "0.5.1" +jemallocator = { workspace = true } +log = { workspace = true } +rand = { workspace = true } solana-logger = { workspace = true } +solana-nohash-hasher = { workspace = true } solana-runtime = { workspace = true, features = ["dev-context-only-utils"] } +# See order-crates-for-publishing.py for using this unusual `path = "."` +solana-unified-scheduler-pool = { path = ".", features = ["dev-context-only-utils"] } + +[[bench]] +name = "lib" +harness = false + +[features] +dev-context-only-utils = [] diff --git a/unified-scheduler-pool/benches/lib.rs b/unified-scheduler-pool/benches/lib.rs new file mode 100644 index 00000000000000..479eaf68e73f35 --- /dev/null +++ b/unified-scheduler-pool/benches/lib.rs @@ -0,0 +1,208 @@ +#![allow(unused_imports, dead_code)] +#![feature(test)] + +extern crate test; + +#[cfg(not(target_env = "msvc"))] +use jemallocator::Jemalloc; + +#[cfg(not(target_env = "msvc"))] +#[global_allocator] +static GLOBAL: Jemalloc = Jemalloc; + +use { + solana_program_runtime::timings::ExecuteTimings, + solana_runtime::{ + bank::Bank, + bank_forks::BankForks, + genesis_utils::{create_genesis_config, GenesisConfigInfo}, + installed_scheduler_pool::{ + DefaultScheduleExecutionArg, InstalledScheduler, SchedulingContext, + }, + prioritization_fee_cache::PrioritizationFeeCache, + }, + solana_sdk::{ + scheduling::SchedulingMode, + transaction::{Result, SanitizedTransaction}, + }, + solana_unified_scheduler_logic::{Page, SchedulingStateMachine}, + solana_unified_scheduler_pool::{ + HandlerContext, PooledScheduler, SchedulerPool, SpawnableScheduler, TaskHandler, + }, + std::sync::Arc, +}; + +#[derive(Debug, Clone)] +struct DummyTaskHandler; + +impl TaskHandler for DummyTaskHandler { + fn handle( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + _bank: &Arc, + _transaction: &SanitizedTransaction, + _index: usize, + _handler_context: &HandlerContext, + ) { + } + + fn create>( + _pool: &SchedulerPool, + ) -> Self { + Self + } +} + +fn setup_dummy_fork_graph(bank: Bank) -> Arc { + let slot = bank.slot(); + let bank_fork = BankForks::new_rw_arc(bank); + let bank = bank_fork.read().unwrap().get(slot).unwrap(); + bank.loaded_programs_cache + .write() + .unwrap() + .set_fork_graph(bank_fork); + bank +} + +use solana_sdk::{ + instruction::{AccountMeta, Instruction}, + message::Message, + pubkey::Pubkey, + signature::Signer, + signer::keypair::Keypair, + transaction::Transaction, +}; + +fn 
do_bench_tx_throughput(label: &str, bencher: &mut Criterion) { + solana_logger::setup(); + + /* + let GenesisConfigInfo { + genesis_config, + .. + } = create_genesis_config(10_000); + */ + let payer = Keypair::new(); + + let mut accounts = vec![]; + for i in 0..100 { + if i % 2 == 0 { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } else { + accounts.push(AccountMeta::new_readonly(Keypair::new().pubkey(), true)); + } + } + + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + /* + let bank = Bank::new_for_tests(&genesis_config); + let bank = setup_dummy_fork_graph(bank); + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::, _, _>::new( + None, + None, + None, + ignored_prioritization_fee_cache, + ); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + */ + + let (s, r) = crossbeam_channel::bounded(1000); + + use std::sync::atomic::AtomicUsize; + let i = Arc::new(AtomicUsize::default()); + use std::sync::Mutex; + let pages: Arc>> = + Arc::new(Mutex::new(std::collections::HashMap::new())); + /* + for _ in 0..5 { + std::thread::Builder::new() + .name("solScGen".to_owned()) + .spawn({ + let pages = pages.clone(); + let i = i.clone(); + let tx1 = tx0.clone(); + let s = s.clone(); + move || loop { + let tasks = std::iter::repeat_with(|| SchedulingStateMachine::create_task(tx1.clone(), i.fetch_add(1, std::sync::atomic::Ordering::Relaxed), &mut |address| { + pages.lock().unwrap().entry(address).or_default().clone() + })).take(100).collect::>(); + if s.send(tasks).is_err() { + break; + } + } + }) + .unwrap(); + } + std::thread::sleep(std::time::Duration::from_secs(5)); + */ + + //assert_eq!(bank.transaction_count(), 0); + //let mut scheduler = pool.do_take_scheduler(context); + + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + + let tasks = std::iter::repeat_with(|| { + SchedulingStateMachine::create_task( + tx0.clone(), + i.fetch_add(1, std::sync::atomic::Ordering::Relaxed), + &mut |address| pages.lock().unwrap().entry(address).or_default().clone(), + ) + }) + .take(100) + .collect::>(); + s.send(tasks).unwrap(); + + bencher.bench_function(label, |b| { + b.iter(|| { + for _ in 0..600 { + let mut first_task = None; + let tt = r.recv().unwrap(); + let mut new_tasks = Vec::with_capacity(tt.len()); + for t in tt { + /* + scheduler.schedule_task(t); + */ + if let Some(task) = scheduler.schedule_task(t) { + first_task = Some(task); + } + } + scheduler.deschedule_task(first_task.as_ref().unwrap()); + new_tasks.push(first_task.unwrap()); + while let Some(unblocked_task) = scheduler.schedule_unblocked_task() { + scheduler.deschedule_task(&unblocked_task); + new_tasks.push(unblocked_task); + } + assert!(scheduler.has_no_active_task()); + s.send(new_tasks).unwrap(); + } + /* + scheduler.pause_for_recent_blockhash(); + scheduler.clear_session_result_with_timings(); + scheduler.restart_session(); + */ + }) + }); +} + +fn bench_entrypoint(bencher: &mut Criterion) { + do_bench_tx_throughput("bench_tx_throughput", bencher) +} + +use criterion::{criterion_group, criterion_main, Criterion}; 
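Distilled from the bench loop above, the core API sequence is easy to lose among the channels and the commented-out code. A minimal sketch of one schedule/deschedule round trip, assuming the same APIs exercised above (create_task, schedule_task, schedule_unblocked_task, deschedule_task) and a pre-built SanitizedTransaction named tx:

use {
    solana_sdk::{pubkey::Pubkey, transaction::SanitizedTransaction},
    solana_unified_scheduler_logic::{Page, SchedulingStateMachine},
    std::collections::HashMap,
};

// One schedule -> (execute) -> deschedule round trip for a single task.
fn schedule_round_trip(tx: SanitizedTransaction) {
    // Addresses map to shared Pages so conflicting transactions observe the same lock state.
    let mut pages: HashMap<Pubkey, Page> = HashMap::new();
    let mut page_loader = |address| pages.entry(address).or_default().clone();
    let task = SchedulingStateMachine::create_task(tx, 0, &mut page_loader);

    // Safety: as in the bench above, exactly one state machine per thread.
    let mut state_machine = unsafe {
        SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling()
    };

    // A task with no conflicting locks is handed back immediately as runnable.
    let runnable = state_machine
        .schedule_task(task)
        .expect("a lone task cannot be blocked");
    // Descheduling after execution releases its locks...
    state_machine.deschedule_task(&runnable);
    // ...which may make previously blocked tasks runnable.
    while let Some(unblocked) = state_machine.schedule_unblocked_task() {
        state_machine.deschedule_task(&unblocked);
    }
    assert!(state_machine.has_no_active_task());
}

The bench above runs the same cycle over batches of 100 clones of one transaction, so every task after the first conflicts and has to flow through schedule_unblocked_task().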
+criterion_group!(benches, bench_entrypoint); +criterion_main!(benches); diff --git a/unified-scheduler-pool/benches/scheduler.rs b/unified-scheduler-pool/benches/scheduler.rs new file mode 100644 index 00000000000000..b61903d77df4a4 --- /dev/null +++ b/unified-scheduler-pool/benches/scheduler.rs @@ -0,0 +1,922 @@ +#![cfg(feature = "dummy")] +#![feature(test)] +#![allow(clippy::arithmetic_side_effects)] + +#[global_allocator] +static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; + +extern crate test; + +use { + assert_matches::assert_matches, + log::*, + rand::{thread_rng, Rng}, + solana_program_runtime::timings::ExecuteTimings, + solana_runtime::{ + bank::Bank, + genesis_utils::{create_genesis_config, GenesisConfigInfo}, + installed_scheduler_pool::{ + InstalledScheduler, ResultWithTimings, ScheduleExecutionArg, SchedulerId, + SchedulingContext, SchedulingMode, WithTransactionAndIndex, + }, + prioritization_fee_cache::PrioritizationFeeCache, + }, + solana_sdk::{ + scheduling::SchedulingMode, + system_transaction, + transaction::{Result, SanitizedTransaction}, + }, + solana_unified_scheduler_pool::{ + PooledScheduler, SchedulerPool, SpawnableScheduler, TaskHandler, + }, + std::{ + fmt::Debug, + marker::{PhantomData, Send, Sync}, + mem, + sync::Arc, + }, + test::Bencher, +}; + +const TX_COUNT: usize = 10_000; + +#[derive(Debug, Default, Clone)] +struct ScheduleExecutionArgForBench; + +// use Arc-ed transaction for very cheap .clone() so that the consumer is never starved for +// incoming transactions. +type TransactionWithIndexForBench = Arc<(SanitizedTransaction, usize)>; + +impl ScheduleExecutionArg for ScheduleExecutionArgForBench { + type TransactionWithIndex<'_tx> = TransactionWithIndexForBench; +} + +#[derive(Debug, Default, Clone)] +struct BenchFriendlyHandler( + PhantomData, +); + +impl TaskHandler + for BenchFriendlyHandler +{ + fn create>(_pool: &SchedulerPool) -> Self { + Self(PhantomData) + } + + fn handle>( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + bank: &Arc, + transaction: &SanitizedTransaction, + _index: usize, + _pool: &SchedulerPool, + ) { + //std::hint::black_box(bank.clone()); + let mut i = 0; + for _ in 0..10 { + if MUTATE_ARC { + //for _ in 0..2 { + std::hint::black_box((Arc::downgrade(bank)).upgrade().unwrap()); + //} + } + // call random one of Bank's lightweight-and-very-multi-threaded-friendly methods which take a + // transaction inside this artifical tight loop. + i += bank.get_fee_for_message_with_lamports_per_signature(transaction.message(), i) + } + std::hint::black_box(i); + } +} + +type BenchFriendlyHandlerWithArcMutation = BenchFriendlyHandler; +type BenchFriendlyHandlerWithoutArcMutation = + BenchFriendlyHandler; + +fn run_bench< + F: FnOnce(Arc>, SchedulingContext) -> I, + I: SpawnableScheduler, + TH: TaskHandler, +>( + bencher: &mut Bencher, + create_scheduler: F, +) { + solana_logger::setup(); + + let GenesisConfigInfo { + genesis_config, + mint_keypair, + .. 
+ } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + + let mut scheduler = create_scheduler(pool, context.clone()); + let tx0 = &SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &mint_keypair, + &solana_sdk::pubkey::new_rand(), + 2, + genesis_config.hash(), + )); + let tx_with_index = TransactionWithIndexForBench::new((tx0.clone(), 0)); + bencher.iter(|| { + for _ in 0..TX_COUNT { + scheduler.schedule_execution(tx_with_index.clone()); + } + assert_matches!(scheduler.wait_for_termination(false), Some((Ok(()), _))); + scheduler.replace_context(context.clone()); + }); +} + +mod blocking_ref { + use {super::*, solana_runtime::installed_scheduler_pool::DefaultScheduleExecutionArg}; + + #[bench] + fn bench_without_arc_mutation(bencher: &mut Bencher) { + solana_logger::setup(); + + let GenesisConfigInfo { + genesis_config, + mint_keypair, + .. + } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + + let mut scheduler = PooledScheduler::<_, DefaultScheduleExecutionArg>::do_spawn( + pool, + context.clone(), + BenchFriendlyHandler::<_, false>::default(), + ); + let tx0 = &SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &mint_keypair, + &solana_sdk::pubkey::new_rand(), + 2, + genesis_config.hash(), + )); + let tx_with_index = &(tx0, 0); + bencher.iter(|| { + for _ in 0..TX_COUNT { + scheduler.schedule_execution(tx_with_index); + } + assert_matches!(scheduler.wait_for_termination(false), Some((Ok(()), _))); + scheduler.replace_context(context.clone()); + }); + } +} + +mod blocking { + use super::*; + + type BlockingScheduler = PooledScheduler; + + #[bench] + fn bench_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + BlockingScheduler::do_spawn( + pool, + context, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + BlockingScheduler::do_spawn( + pool, + context, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } +} + +mod nonblocking { + use super::*; + + #[derive(Debug)] + pub(super) struct NonblockingScheduler + Clone> { + id: SchedulerId, + pub(crate) pool: Arc>, + transaction_sender: crossbeam_channel::Sender, + result_receiver: crossbeam_channel::Receiver<(Result<()>, ExecuteTimings, usize)>, + lane_count: usize, + context: SchedulingContext, + _phantom: PhantomData, + } + + enum ChainedChannel { + Payload(TransactionWithIndexForBench), + NextContext(SchedulingContext), + NextChannel(Box), + } + + type ChannelPair = ( + crossbeam_channel::Receiver, + crossbeam_channel::Sender<(Result<()>, ExecuteTimings, usize)>, + ); + + trait WithChannelPair { + fn unwrap_channel_pair(&mut self) -> ChannelPair; + } + + struct ChannelPairOption(Option); + + impl WithChannelPair for ChannelPairOption { + fn unwrap_channel_pair(&mut self) -> ChannelPair { + 
self.0.take().unwrap() + } + } + + impl + Clone> + SpawnableScheduler for NonblockingScheduler + { + fn spawn( + _pool: Arc>, + _initial_context: SchedulingContext, + _handler: H, + ) -> Self { + unimplemented!(); + } + + fn retire_if_stale(&mut self) -> bool { + unimplemented!(); + } + } + + impl + Clone> NonblockingScheduler { + pub(super) fn spawn( + pool: Arc>, + initial_context: SchedulingContext, + lane_count: usize, + handler: H, + ) -> Self { + let (transaction_sender, transaction_receiver) = + crossbeam_channel::unbounded::(); + let (result_sender, result_receiver) = crossbeam_channel::unbounded(); + + for _ in 0..lane_count { + let mut bank = Arc::clone(initial_context.bank()); + let mut transaction_receiver = transaction_receiver.clone(); + let mut result_sender = result_sender.clone(); + std::thread::spawn({ + let pool = pool.clone(); + let handler = handler.clone(); + move || { + let mut result = Ok(()); + let mut timings = ExecuteTimings::default(); + let mut count = 0; + while let Ok(message) = transaction_receiver.recv() { + match message { + ChainedChannel::Payload(with_transaction_and_index) => { + count += 1; + with_transaction_and_index.with_transaction_and_index( + |transaction, index| { + H::handle( + &handler, + &mut result, + &mut timings, + &bank, + transaction, + index, + &pool, + ); + }, + ); + } + ChainedChannel::NextContext(next_context) => { + bank = next_context.bank().clone(); + } + ChainedChannel::NextChannel(mut next_receiver_box) => { + result_sender + .send(( + mem::replace(&mut result, Ok(())), + mem::take(&mut timings), + mem::take(&mut count), + )) + .unwrap(); + (transaction_receiver, result_sender) = + next_receiver_box.unwrap_channel_pair(); + } + } + } + } + }); + } + + Self { + id: thread_rng().gen::(), + pool, + transaction_sender, + result_receiver, + lane_count, + context: initial_context, + _phantom: PhantomData, + } + } + } + impl + Clone> + InstalledScheduler for NonblockingScheduler + { + fn id(&self) -> SchedulerId { + self.id + } + + fn context(&self) -> &SchedulingContext { + &self.context + } + + fn schedule_execution(&self, transaction_with_index: TransactionWithIndexForBench) { + self.transaction_sender + .send(ChainedChannel::Payload(transaction_with_index)) + .unwrap(); + } + + fn wait_for_termination(&mut self, _is_dropped: bool) -> Option { + let (next_transaction_sender, next_transaction_receiver) = + crossbeam_channel::unbounded::(); + let (next_result_sender, next_result_receiver) = crossbeam_channel::unbounded(); + for _ in 0..self.lane_count { + let (next_transaction_receiver, next_result_sender) = ( + next_transaction_receiver.clone(), + next_result_sender.clone(), + ); + self.transaction_sender + .send(ChainedChannel::NextChannel(Box::new(ChannelPairOption( + Some((next_transaction_receiver, next_result_sender)), + )))) + .unwrap(); + } + self.transaction_sender = next_transaction_sender; + + let mut overall_result = Ok(()); + let mut overall_timings = ExecuteTimings::default(); + + while let Ok((result, timings, count)) = self.result_receiver.recv() { + match result { + Ok(()) => {} + Err(e) => overall_result = Err(e), + } + overall_timings.accumulate(&timings); + trace!("received: {count:?}"); + } + self.result_receiver = next_result_receiver; + + Some((overall_result, overall_timings)) + } + + /* + fn return_to_pool(self: Box) { + self.pool.clone().return_scheduler(self) + } + */ + fn pause_for_recent_blockhash(&mut self) { + todo!() + } + } + + #[bench] + fn bench_with_01_thread_with_arc_mutation(bencher: &mut Bencher) 
{ + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 1, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_01_thread_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 1, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_04_threads_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 4, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_04_threads_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 4, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_08_threads_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 8, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_08_threads_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 8, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_16_threads_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 16, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_16_threads_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 16, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } +} + +// demonstrate meaningfully differing performance profile regarding multi worker thread utilization +// with saturated transaction execution for each bench scenarios, with/without the existence of +// artificial and needless synchronizations. +// conversely, the whole InstallableScheduler machinery can be justified as it can eliminate these +// synchronizations altogether to bare minimum (i.e. bank freeze). 
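As a back-of-the-envelope model of the second scenario in the module below (an illustration only; the slices stand for per-transaction handler sleep durations in milliseconds):

// Two transaction chains, each internally serialized by a shared fee payer.
fn makespan_with_barrier(chain1: &[u64], chain2: &[u64]) -> u64 {
    // the interleaved synchronization drains the first chain before the second is scheduled
    chain1.iter().sum::<u64>() + chain2.iter().sum::<u64>()
}

fn makespan_without_barrier(chain1: &[u64], chain2: &[u64]) -> u64 {
    // independent chains occupy different worker threads and overlap fully
    chain1.iter().sum::<u64>().max(chain2.iter().sum::<u64>())
}

For the two chains of four 10 ms transfers used in bench_long_serialized_runs, this predicts roughly 80 ms per iteration with the interleaved synchronization versus roughly 40 ms without it (ignoring scheduling overhead), which is the gap these benches are meant to expose.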
+#[cfg(feature = "dummy")] +mod thread_utilization { + use { + super::*, + crate::nonblocking::NonblockingScheduler, + solana_nohash_hasher::IntSet, + solana_sdk::{ + signature::Signature, signer::keypair::Keypair, + system_instruction::SystemInstruction::Transfer, transaction::TransactionAccountLocks, + }, + std::{collections::HashMap, sync::Mutex, thread::sleep, time::Duration}, + }; + + #[derive(Debug, Clone)] + struct SleepyHandler; + + impl TaskHandler for SleepyHandler { + fn create>(_pool: &SchedulerPool) -> Self { + Self + } + + fn handle>( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + _bank: &Arc, + transaction: &SanitizedTransaction, + _index: usize, + _pool: &SchedulerPool, + ) { + let Ok(Transfer { lamports: sleep_ms }) = + bincode::deserialize(&transaction.message().instructions()[0].data) + else { + panic!() + }; + + sleep(Duration::from_millis(sleep_ms)); + } + } + + enum Step { + Batch(Vec), + // mimic periodic or contention-induced synchronization with this artificial blocking + MaySynchronize, + } + + const WORKER_THREAD_COUNT: usize = 10; + + fn simulate_synchronization_point>( + scheduler: &mut T, + context: SchedulingContext, + ) { + assert_matches!(scheduler.wait_for_termination(false), Some((Ok(()), _))); + scheduler.replace_context(context); + } + + fn run_scenario_and_finalize>( + bencher: &mut Bencher, + really_synchronize: bool, + scheduler: &mut T, + context: SchedulingContext, + create_scenario: impl Fn() -> Vec, + ) { + let scenario = &create_scenario(); + bencher.iter(|| { + for step in scenario { + match step { + Step::Batch(txes) => { + for tx in txes { + scheduler.schedule_execution(tx.clone()); + } + } + Step::MaySynchronize => { + if really_synchronize { + simulate_synchronization_point(scheduler, context.clone()); + } + } + } + } + simulate_synchronization_point(scheduler, context.clone()); + }) + } + + // frequent synchronization creates non-zero idling time among some of worker threads, given + // batches with mixed transactions. then, it adds up as these kinds synchronizations occurs over + // processing + fn bench_random_execution_durations(bencher: &mut Bencher, really_synchronize: bool) { + let GenesisConfigInfo { + genesis_config, + mint_keypair, + .. 
+ } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + + let create_tx_with_index = |index| { + let tx0 = + SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &mint_keypair, + &solana_sdk::pubkey::new_rand(), + // simulate somewhat realistic work load; txes finish at different timings + thread_rng().gen_range(1..10), + genesis_config.hash(), + )); + TransactionWithIndexForBench::new((tx0, index)) + }; + + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + let mut scheduler = + NonblockingScheduler::spawn(pool, context.clone(), WORKER_THREAD_COUNT, SleepyHandler); + + run_scenario_and_finalize(bencher, really_synchronize, &mut scheduler, context, || { + const TX_PER_BATCH: usize = 20; + const SYNCHRONIZATION_PER_BENCH_ITER: usize = 10; + + (0..SYNCHRONIZATION_PER_BENCH_ITER) + .flat_map(|_| { + [ + Step::Batch((0..TX_PER_BATCH).map(create_tx_with_index).collect()), + Step::MaySynchronize, + ] + }) + .collect() + }); + } + + #[bench] + fn bench_random_execution_durations_with_interleaved_synchronization(bencher: &mut Bencher) { + bench_random_execution_durations(bencher, true); + } + + #[bench] + fn bench_random_execution_durations_without_interleaved_synchronization(bencher: &mut Bencher) { + bench_random_execution_durations(bencher, false); + } + + #[derive(Debug, Clone)] + struct SleepyHandlerWithCompletionSignal(crossbeam_channel::Sender); + + impl TaskHandler for SleepyHandlerWithCompletionSignal { + fn create>(_pool: &SchedulerPool) -> Self { + // not needed for bench... + unimplemented!(); + } + + fn handle>( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + _bank: &Arc, + transaction: &SanitizedTransaction, + _index: usize, + _pool: &SchedulerPool, + ) { + let Ok(Transfer { lamports: sleep_ms }) = + bincode::deserialize(&transaction.message().instructions()[0].data) + else { + panic!() + }; + + sleep(Duration::from_millis(sleep_ms)); + + self.0.send(*transaction.signature()).unwrap(); + } + } + + // a wrapper InstallableScheduler to integrate with dep graph scheduling logic + #[derive(Debug)] + struct NonblockingSchedulerWithDepGraph { + inner_scheduler: NonblockingScheduler, + pending_transactions: Mutex>, + completion_receiver: crossbeam_channel::Receiver, + } + + impl InstalledScheduler for NonblockingSchedulerWithDepGraph { + fn id(&self) -> SchedulerId { + self.inner_scheduler.id() + } + + fn context(&self) -> &SchedulingContext { + self.inner_scheduler.context() + } + + fn schedule_execution(&self, transaction_with_index: TransactionWithIndexForBench) { + // just buffer all the txes to work with the dep graph outer loop nicely, which needs + // some buffering to schedule efficiently + // note taht the prompt execution as soon as entering into schedule_execution() isn't + // needed for these particular bench purposes. so, buffering is okay in that regard. + self.pending_transactions + .lock() + .unwrap() + .push(transaction_with_index.0.clone()); + } + + fn wait_for_termination(&mut self, is_dropped: bool) -> Option { + // execute all the pending transactions now! 
+ self.execute_batches( + self.context().bank(), + &std::mem::take(&mut *self.pending_transactions.lock().unwrap()), + &self.completion_receiver, + ) + .unwrap(); + + self.inner_scheduler.wait_for_termination(is_dropped) + } + + /* + fn return_to_pool(self: Box) { + Box::new(self.inner_scheduler).return_to_pool() + } + */ + } + + /* + impl InstallableScheduler for NonblockingSchedulerWithDepGraph { + fn replace_context(&mut self, context: SchedulingContext) { + self.inner_scheduler.replace_context(context) + } + } + */ + + // adapted from https://github.com/jito-foundation/jito-solana/pull/294; retained to be as-is + // as much as possible by the use of some wrapper type hackery. + impl NonblockingSchedulerWithDepGraph { + // for each index, builds a transaction dependency graph of indices that need to execute before + // the current one. + // The returned Vec> is a 1:1 mapping for the indices that need to be executed + // before that index can be executed + fn build_dependency_graph( + tx_account_locks: &[TransactionAccountLocks], + ) -> Vec> { + // build a map whose key is a pubkey + value is a sorted vector of all indices that + // lock that account + let mut indices_read_locking_account = HashMap::new(); + let mut indicies_write_locking_account = HashMap::new(); + tx_account_locks + .iter() + .enumerate() + .for_each(|(idx, tx_account_locks)| { + for account in &tx_account_locks.readonly { + indices_read_locking_account + .entry(**account) + .and_modify(|indices: &mut Vec| indices.push(idx)) + .or_insert_with(|| vec![idx]); + } + for account in &tx_account_locks.writable { + indicies_write_locking_account + .entry(**account) + .and_modify(|indices: &mut Vec| indices.push(idx)) + .or_insert_with(|| vec![idx]); + } + }); + + tx_account_locks + .iter() + .enumerate() + .map(|(idx, account_locks)| { + let mut dep_graph: IntSet = IntSet::default(); + + let readlock_conflict_accs = account_locks.writable.iter(); + let writelock_conflict_accs = account_locks + .readonly + .iter() + .chain(account_locks.writable.iter()); + + for acc in readlock_conflict_accs { + if let Some(indices) = indices_read_locking_account.get(acc) { + dep_graph.extend(indices.iter().take_while(|l_idx| **l_idx < idx)); + } + } + + for acc in writelock_conflict_accs { + if let Some(indices) = indicies_write_locking_account.get(acc) { + dep_graph.extend(indices.iter().take_while(|l_idx| **l_idx < idx)); + } + } + dep_graph + }) + .collect() + } + + fn execute_batches( + &self, + bank: &Arc, + pending_transactions: &[SanitizedTransaction], + receiver: &crossbeam_channel::Receiver, + ) -> Result<()> { + if pending_transactions.is_empty() { + return Ok(()); + } + + let mut tx_account_locks: Vec<_> = Vec::with_capacity(pending_transactions.len()); + for tx in pending_transactions { + tx_account_locks + .push(tx.get_account_locks(bank.get_transaction_account_lock_limit())?); + } + + // the dependency graph contains the indices that must be executed (marked with + // State::Done) before they can be executed + let dependency_graph = Self::build_dependency_graph(&tx_account_locks); + + #[derive(Clone)] + enum State { + Blocked, + Processing, + Done, + } + + let mut processing_states: Vec = vec![State::Blocked; dependency_graph.len()]; + let mut signature_indices: HashMap<&Signature, usize> = + HashMap::with_capacity(dependency_graph.len()); + signature_indices.extend( + pending_transactions + .iter() + .enumerate() + .map(|(idx, tx)| (tx.signature(), idx)), + ); + + loop { + let mut is_done = true; + for idx in 
0..processing_states.len() { + match processing_states[idx] { + State::Blocked => { + is_done = false; + + // if all the dependent txs are executed, this transaction can be + // scheduled for execution. + if dependency_graph[idx] + .iter() + .all(|idx| matches!(processing_states[*idx], State::Done)) + { + self.inner_scheduler.schedule_execution(Arc::new(( + pending_transactions[idx].clone(), + idx, + ))); + // this idx can be scheduled and moved to processing + processing_states[idx] = State::Processing; + } + } + State::Processing => { + is_done = false; + } + State::Done => {} + } + } + + if is_done { + break; + } + + let mut executor_responses: Vec<_> = vec![receiver.recv().unwrap()]; + executor_responses.extend(receiver.try_iter()); + for r in &executor_responses { + processing_states[*signature_indices.get(r).unwrap()] = State::Done; + } + } + Ok(()) + } + } + + // frequent synchronizations hampers efficient (= parallelizable) scheduling of several chunks + // of txes which are tied together for each common account locks. Ideally those independent chunks can be + // executed in parallel, which each is consuming one worker thread as a form of serialized runs + // of processing. However, should synchronizations occurs between boundaries of those chunks + // arrival, it cannot schedule the later-coming one because it firstly flush out the the first + // one + // in other words, this is just a re-manifestation of perf. issue coming from write barriers in + // general. + fn bench_long_serialized_runs(bencher: &mut Bencher, really_synchronize: bool) { + let GenesisConfigInfo { genesis_config, .. } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + let (kp1, kp2) = (Keypair::new(), Keypair::new()); + + let create_tx_of_serialized_run1 = || { + let tx0 = + SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &kp1, + &solana_sdk::pubkey::new_rand(), + 10, + genesis_config.hash(), + )); + TransactionWithIndexForBench::new((tx0, 0)) + }; + let create_tx_of_serialized_run2 = || { + let tx0 = + SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &kp2, + &solana_sdk::pubkey::new_rand(), + 10, + genesis_config.hash(), + )); + TransactionWithIndexForBench::new((tx0, 0)) + }; + + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + let (completion_sender, completion_receiver) = crossbeam_channel::unbounded(); + let handler = SleepyHandlerWithCompletionSignal(completion_sender); + let tx_lock_ignoring_scheduler = + NonblockingScheduler::spawn(pool, context.clone(), WORKER_THREAD_COUNT, handler); + let tx_lock_adhering_scheduler = NonblockingSchedulerWithDepGraph { + inner_scheduler: tx_lock_ignoring_scheduler, + pending_transactions: Mutex::new(Vec::default()), + completion_receiver, + }; + let mut scheduler = tx_lock_adhering_scheduler; + run_scenario_and_finalize(bencher, really_synchronize, &mut scheduler, context, || { + (0..1) + .flat_map(|_| { + [ + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::MaySynchronize, + Step::Batch(vec![create_tx_of_serialized_run2()]), + Step::Batch(vec![create_tx_of_serialized_run2()]), + 
Step::Batch(vec![create_tx_of_serialized_run2()]), + Step::Batch(vec![create_tx_of_serialized_run2()]), + Step::MaySynchronize, + ] + }) + .collect() + }); + } + + #[bench] + fn bench_long_serialized_runs_with_interleaved_synchronization(bencher: &mut Bencher) { + bench_long_serialized_runs(bencher, true); + } + + #[bench] + fn bench_long_serialized_runs_without_interleaved_synchronization(bencher: &mut Bencher) { + bench_long_serialized_runs(bencher, false); + } +} diff --git a/unified-scheduler-pool/src/lib.rs b/unified-scheduler-pool/src/lib.rs index 09ded82ee88e7d..b38a3cd329b2fb 100644 --- a/unified-scheduler-pool/src/lib.rs +++ b/unified-scheduler-pool/src/lib.rs @@ -8,35 +8,51 @@ //! and commits any side-effects (i.e. on-chain state changes) into the associated `Bank` via //! `solana-ledger`'s helper function called `execute_batch()`. +#[cfg(feature = "dev-context-only-utils")] +use qualifier_attr::qualifiers; use { assert_matches::assert_matches, - crossbeam_channel::{select, unbounded, Receiver, SendError, Sender}, + cpu_time::ThreadTime, + crossbeam_channel::{ + bounded, disconnected, never, select_biased, unbounded, Receiver, RecvError, + RecvTimeoutError, SendError, Sender, TryRecvError, + }, + dashmap::DashMap, derivative::Derivative, log::*, solana_ledger::blockstore_processor::{ execute_batch, TransactionBatchWithIndexes, TransactionStatusSender, }, + solana_measure::measure::Measure, + solana_metrics::datapoint_info_at, solana_program_runtime::timings::ExecuteTimings, solana_runtime::{ bank::Bank, + compute_budget_details::GetComputeBudgetDetails, installed_scheduler_pool::{ - InstalledScheduler, InstalledSchedulerBox, InstalledSchedulerPool, - InstalledSchedulerPoolArc, ResultWithTimings, SchedulerId, SchedulingContext, - UninstalledScheduler, UninstalledSchedulerBox, + DefaultScheduleExecutionArg, InstalledScheduler, InstalledSchedulerPool, + InstalledSchedulerPoolArc, ResultWithTimings, ScheduleExecutionArg, SchedulerId, + SchedulingContext, UninstalledScheduler, UninstalledSchedulerBox, + WithTransactionAndIndex, }, prioritization_fee_cache::PrioritizationFeeCache, }, - solana_sdk::transaction::{Result, SanitizedTransaction}, - solana_unified_scheduler_logic::Task, + solana_sdk::{ + clock::Slot, + pubkey::Pubkey, + transaction::{Result, SanitizedTransaction, TransactionError}, + }, + solana_unified_scheduler_logic::{Page, SchedulingStateMachine, Task}, solana_vote::vote_sender_types::ReplayVoteSender, std::{ + env, fmt::Debug, - marker::PhantomData, sync::{ atomic::{AtomicU64, Ordering::Relaxed}, - Arc, Mutex, OnceLock, Weak, + Arc, Mutex, OnceLock, RwLock, RwLockReadGuard, Weak, }, thread::{self, JoinHandle}, + time::{Duration, Instant, SystemTime}, }, }; @@ -46,7 +62,11 @@ type AtomicSchedulerId = AtomicU64; // contains some internal fields, whose types aren't available in solana-runtime (currently // TransactionStatusSender; also, PohRecorder in the future)... #[derive(Debug)] -pub struct SchedulerPool, TH: TaskHandler> { +pub struct SchedulerPool< + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +> { scheduler_inners: Mutex>, handler_count: usize, handler_context: HandlerContext, @@ -62,7 +82,11 @@ pub struct SchedulerPool, TH: TaskHandler> { // memory increase. weak_self: Weak, next_scheduler_id: AtomicSchedulerId, - _phantom: PhantomData, + // prune schedulers, stop idling scheduler's threads, sanity check on the + // address book after scheduler is returned. 
+ cleaner_sender: Sender>>>, + cleaner_exit_signal_sender: Sender<()>, + cleaner_thread: Mutex>>, } #[derive(Debug)] @@ -73,16 +97,121 @@ pub struct HandlerContext { prioritization_fee_cache: Arc, } -pub type DefaultSchedulerPool = - SchedulerPool, DefaultTaskHandler>; +pub type DefaultSchedulerPool = SchedulerPool< + PooledScheduler, + DefaultTaskHandler, + DefaultScheduleExecutionArg, +>; + +struct WatchedThreadManager +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + thread_manager: Weak>>, + #[cfg(target_os = "linux")] + tick: u64, + #[cfg(target_os = "linux")] + updated_at: Instant, +} -impl SchedulerPool +impl WatchedThreadManager where - S: SpawnableScheduler, - TH: TaskHandler, + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn new(thread_manager: Weak>>) -> Self { + Self { + thread_manager, + #[cfg(target_os = "linux")] + tick: 0, + #[cfg(target_os = "linux")] + updated_at: Instant::now(), + } + } + + fn retire_if_stale(&mut self) -> bool { + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + let Some(thread_manager) = self.thread_manager.upgrade() else { + return false; + }; + + // The following linux-only code implements an eager native thread reclaiming, which is + // only useful if the solana-validator sees many unrooted forks. Such hostile situations + // should NEVER happen on remotely-uncontrollable ledgers created by solana-test-validator. + // And it's generally not expected mainnet-beta validators (or any live clusters for that + // matter) to be run on non-linux OSes at all. + // + // Thus, this OS-specific implementation can be justified because this enables the hot-path + // (the scheduler main thread) to omit VDSO calls and timed-out futex syscalls by relying on + // this out-of-bound cleaner for a defensive thread reclaiming. + #[cfg(target_os = "linux")] + { + let Some(tid) = thread_manager.read().unwrap().active_tid_if_not_primary() else { + self.tick = 0; + self.updated_at = Instant::now(); + return true; + }; + + let pid = std::process::id(); + let task = procfs::process::Process::new(pid.try_into().unwrap()) + .unwrap() + .task_from_tid(tid) + .unwrap(); + let stat = task.stat().unwrap(); + let current_tick = stat.utime.checked_add(stat.stime).unwrap(); + if current_tick > self.tick { + self.tick = current_tick; + self.updated_at = Instant::now(); + } else { + // 5x of 400ms block time + const IDLE_DURATION_FOR_EAGER_THREAD_RECLAIM: Duration = Duration::from_secs(2); + + let elapsed = self.updated_at.elapsed(); + if elapsed > IDLE_DURATION_FOR_EAGER_THREAD_RECLAIM { + const BITS_PER_HEX_DIGIT: usize = 4; + let thread_manager = &mut thread_manager.write().unwrap(); + info!( + "[sch_{:0width$x}]: cleaner: retire_if_stale(): stopping thread manager ({tid}/{} <= {}/{:?})...", + thread_manager.scheduler_id, + current_tick, + self.tick, + elapsed, + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + thread_manager.suspend(); + self.tick = 0; + self.updated_at = Instant::now(); + } + } + } + + true + } +} + +impl Drop for SchedulerPool +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn drop(&mut self) { + info!("SchedulerPool::drop() is successfully called"); + } +} + +impl SchedulerPool +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, { // Some internal impl and test code want an actual concrete type, NOT the // `dyn InstalledSchedulerPool`. So don't merge this into `Self::new_dyn()`. 
+ #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] fn new( handler_count: Option, log_messages_bytes_limit: Option, @@ -90,12 +219,66 @@ where replay_vote_sender: Option, prioritization_fee_cache: Arc, ) -> Arc { - let handler_count = handler_count.unwrap_or(1); - // we're hard-coding the number of handler thread to 1, meaning this impl is currently - // single-threaded still. - assert_eq!(handler_count, 1); // replace this with assert!(handler_count >= 1) later + let handler_count = handler_count.unwrap_or(Self::default_handler_count()); + assert!(handler_count >= 1); + + let (scheduler_pool_sender, scheduler_pool_receiver) = bounded(1); + let (cleaner_sender, cleaner_receiver) = unbounded(); + let (cleaner_exit_signal_sender, cleaner_exit_signal_receiver) = unbounded(); + + let cleaner_main_loop = || { + move || { + let scheduler_pool: Arc = scheduler_pool_receiver.recv().unwrap(); + drop(scheduler_pool_receiver); + + let mut thread_managers: Vec> = vec![]; + + 'outer: loop { + let mut schedulers = scheduler_pool.scheduler_inners.lock().unwrap(); + let schedulers_len_pre_retain = schedulers.len(); + schedulers.retain_mut(|scheduler| scheduler.retire_if_stale()); + let schedulers_len_post_retain = schedulers.len(); + drop(schedulers); + + let thread_manager_len_pre_retain = thread_managers.len(); + thread_managers.retain_mut(|thread_manager| thread_manager.retire_if_stale()); + + let thread_manager_len_pre_push = thread_managers.len(); + 'inner: loop { + match cleaner_receiver.try_recv() { + Ok(thread_manager) => { + thread_managers.push(WatchedThreadManager::new(thread_manager)) + } + Err(TryRecvError::Disconnected) => break 'outer, + Err(TryRecvError::Empty) => break 'inner, + } + } + + info!( + "cleaner: unused schedulers in the pool: {} => {}, all thread managers: {} => {} => {}", + schedulers_len_pre_retain, + schedulers_len_post_retain, + thread_manager_len_pre_retain, + thread_manager_len_pre_push, + thread_managers.len(), + ); + // wait for signal with timeout here instead of recv_timeout() to write all the + // preceeding logs at once. 
+ match cleaner_exit_signal_receiver.recv_timeout(Duration::from_secs(1)) { + Ok(()) | Err(RecvTimeoutError::Disconnected) => break 'outer, + Err(RecvTimeoutError::Timeout) => continue, + } + } + info!("cleaner thread terminating!"); + } + }; - Arc::new_cyclic(|weak_self| Self { + let cleaner_thread = thread::Builder::new() + .name("solScCleaner".to_owned()) + .spawn(cleaner_main_loop()) + .unwrap(); + + let scheduler_pool = Arc::new_cyclic(|weak_self| Self { scheduler_inners: Mutex::default(), handler_count, handler_context: HandlerContext { @@ -105,9 +288,13 @@ where prioritization_fee_cache, }, weak_self: weak_self.clone(), - next_scheduler_id: AtomicSchedulerId::default(), - _phantom: PhantomData, - }) + next_scheduler_id: AtomicSchedulerId::new(PRIMARY_SCHEDULER_ID), + cleaner_thread: Mutex::new(Some(cleaner_thread)), + cleaner_sender, + cleaner_exit_signal_sender, + }); + scheduler_pool_sender.send(scheduler_pool.clone()).unwrap(); + scheduler_pool } // This apparently-meaningless wrapper is handy, because some callers explicitly want @@ -118,7 +305,7 @@ where transaction_status_sender: Option, replay_vote_sender: Option, prioritization_fee_cache: Arc, - ) -> InstalledSchedulerPoolArc { + ) -> InstalledSchedulerPoolArc { Self::new( handler_count, log_messages_bytes_limit, @@ -146,16 +333,21 @@ where .push(scheduler); } + #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] fn do_take_scheduler(&self, context: SchedulingContext) -> S { // pop is intentional for filo, expecting relatively warmed-up scheduler due to having been // returned recently - if let Some(inner) = self.scheduler_inners.lock().expect("not poisoned").pop() { - S::from_inner(inner, context) + if let Some(pooled_inner) = self.scheduler_inners.lock().expect("not poisoned").pop() { + S::from_inner(pooled_inner, context) } else { - S::spawn(self.self_arc(), context) + S::spawn(self.self_arc(), context, TH::create(self)) } } + fn register_to_cleaner(&self, thread_manager: Weak>>) { + self.cleaner_sender.send(thread_manager).unwrap(); + } + pub fn default_handler_count() -> usize { Self::calculate_default_handler_count( thread::available_parallelism() @@ -188,18 +380,41 @@ where } } -impl InstalledSchedulerPool for SchedulerPool +impl InstalledSchedulerPool for SchedulerPool where - S: SpawnableScheduler, - TH: TaskHandler, + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, { - fn take_scheduler(&self, context: SchedulingContext) -> InstalledSchedulerBox { + fn take_scheduler(&self, context: SchedulingContext) -> Box> { Box::new(self.do_take_scheduler(context)) } + + fn uninstalled_from_bank_forks(self: Arc) { + self.scheduler_inners.lock().unwrap().clear(); + self.cleaner_exit_signal_sender.send(()).unwrap(); + let () = self + .cleaner_thread + .lock() + .unwrap() + .take() + .unwrap() + .join() + .unwrap(); + info!( + "SchedulerPool::uninstalled_from_bank_forks(): joined cleaner thread at {:?}...", + thread::current() + ); + } } -pub trait TaskHandler: Send + Sync + Debug + Sized + 'static { +pub trait TaskHandler: + Send + Sync + Debug + Sized + Clone + 'static +{ + fn create>(pool: &SchedulerPool) -> Self; + fn handle( + &self, result: &mut Result<()>, timings: &mut ExecuteTimings, bank: &Arc, @@ -209,11 +424,16 @@ pub trait TaskHandler: Send + Sync + Debug + Sized + 'static { ); } -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct DefaultTaskHandler; -impl TaskHandler for DefaultTaskHandler { +impl TaskHandler for DefaultTaskHandler { + fn create>(_pool: &SchedulerPool) -> Self { 
+ Self + } + fn handle( + &self, result: &mut Result<()>, timings: &mut ExecuteTimings, bank: &Arc, @@ -244,15 +464,31 @@ impl TaskHandler for DefaultTaskHandler { struct ExecutedTask { task: Task, result_with_timings: ResultWithTimings, + slot: Slot, + thx: usize, + handler_timings: Option, +} + +pub struct HandlerTimings { + finish_time: SystemTime, + execution_us: u64, + execution_cpu_us: u128, } impl ExecutedTask { - fn new_boxed(task: Task) -> Box { + fn new_boxed(task: Task, thx: usize, slot: Slot) -> Box { Box::new(Self { task, result_with_timings: initialized_result_with_timings(), + slot, + thx, + handler_timings: None, }) } + + fn is_err(&self) -> bool { + self.result_with_timings.0.is_err() + } } // A very tiny generic message type to signal about opening and closing of subchannels, which are @@ -268,6 +504,7 @@ enum SubchanneledPayload { } type NewTaskPayload = SubchanneledPayload; +type ExecutedTaskPayload = SubchanneledPayload, ()>; // A tiny generic message type to synchronize multiple threads everytime some contextual data needs // to be switched (ie. SchedulingContext), just using a single communication channel. @@ -336,6 +573,10 @@ mod chained_channel { self.sender = chained_sender; Ok(()) } + + pub(super) fn len(&self) -> usize { + self.sender.len() + } } // P doesn't need to be `: Clone`, yet rustc derive can't handle it. @@ -386,55 +627,198 @@ mod chained_channel { } } +#[derive(Default, Debug)] +pub struct AddressBook { + book: DashMap, +} + +impl AddressBook { + pub fn load(&self, address: Pubkey) -> Page { + self.book.entry(address).or_default().clone() + } + + pub fn page_count(&self) -> usize { + self.book.len() + } + + pub fn clear(&self) { + self.book.clear(); + } +} + fn initialized_result_with_timings() -> ResultWithTimings { (Ok(()), ExecuteTimings::default()) } -// Currently, simplest possible implementation (i.e. single-threaded) -// this will be replaced with more proper implementation... -// not usable at all, especially for mainnet-beta #[derive(Debug)] -pub struct PooledScheduler { - inner: PooledSchedulerInner, +pub struct PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + inner: PooledSchedulerInner, context: SchedulingContext, } #[derive(Debug)] -pub struct PooledSchedulerInner, TH: TaskHandler> { - thread_manager: ThreadManager, +pub struct PooledSchedulerInner +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + thread_manager: Arc>>, + address_book: AddressBook, + pooled_at: Instant, +} + +impl PooledSchedulerInner +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn pooled_since(&self) -> Duration { + self.pooled_at.elapsed() + } + + fn suspend_thread_manager(&mut self) { + debug!("suspend_thread_manager()"); + self.thread_manager.write().unwrap().suspend(); + } + + fn id(&self) -> SchedulerId { + self.thread_manager.read().unwrap().scheduler_id + } } +type Tid = i32; +// The linux's tid (essentially is in the pid name space) is guaranteed to be non-zero; so +// using 0 for special purpose at user-land is totally safe. +#[cfg_attr(target_os = "linux", allow(dead_code))] +const DUMMY_TID: Tid = 0; + +#[derive(Default)] +struct LogInterval(usize); + +impl LogInterval { + fn increment(&mut self) -> bool { + let should_log = self.0 % 1000 == 0; + self.0 = self.0.checked_add(1).unwrap(); + should_log + } +} + +const PRIMARY_SCHEDULER_ID: SchedulerId = 0; + // This type manages the OS threads for scheduling and executing transactions. 
The term // `session` is consistently used to mean a group of Tasks scoped under a single SchedulingContext. // This is equivalent to a particular bank for block verification. However, new terms is introduced // here to mean some continuous time over multiple continuous banks/slots for the block production, // which is planned to be implemented in the future. #[derive(Debug)] -struct ThreadManager, TH: TaskHandler> { +struct ThreadManager +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ scheduler_id: SchedulerId, - pool: Arc>, + pool: Arc>, + handler: TH, new_task_sender: Sender, - new_task_receiver: Receiver, + new_task_receiver: Option>, session_result_sender: Sender>, session_result_receiver: Receiver>, session_result_with_timings: Option, - scheduler_thread: Option>, + scheduler_thread_and_tid: Option<(JoinHandle>, Tid)>, handler_threads: Vec>, + accumulator_thread: Option>, } -impl PooledScheduler { - fn do_spawn(pool: Arc>, initial_context: SchedulingContext) -> Self { +impl PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn do_spawn( + pool: Arc>, + initial_context: SchedulingContext, + handler: TH, + ) -> Self { Self::from_inner( - PooledSchedulerInner:: { - thread_manager: ThreadManager::new(pool), + PooledSchedulerInner { + thread_manager: Arc::new(RwLock::new(ThreadManager::new(pool.clone(), handler))), + address_book: AddressBook::default(), + pooled_at: Instant::now(), }, initial_context, ) } + + #[cfg(feature = "dev-context-only-utils")] + pub fn clear_session_result_with_timings(&mut self) { + assert_matches!( + self.inner + .thread_manager + .write() + .unwrap() + .take_session_result_with_timings(), + (Ok(_), _) + ); + } + + #[cfg(feature = "dev-context-only-utils")] + pub fn restart_session(&mut self) { + self.inner + .thread_manager + .write() + .unwrap() + .start_session(&self.context); + } + + #[cfg(feature = "dev-context-only-utils")] + pub fn schedule_task(&self, task: Task) { + self.inner.thread_manager.read().unwrap().send_task(task); + } + + fn ensure_thread_manager_resumed( + &self, + context: &SchedulingContext, + ) -> std::result::Result>, TransactionError> + { + let mut was_already_active = false; + loop { + let read = self.inner.thread_manager.read().unwrap(); + if !read.is_suspended() { + debug!( + "{}", + if was_already_active { + "ensure_thread_manager_resumed(): was already active." + } else { + "ensure_thread_manager_resumed(): wasn't already active..." 
+ } + ); + return Ok(read); + } else { + debug!("ensure_thread_manager_resumed(): will start threads..."); + drop(read); + let mut write = self.inner.thread_manager.write().unwrap(); + write.start_or_try_resume_threads(context)?; + drop(write); + was_already_active = false; + } + } + } } -impl, TH: TaskHandler> ThreadManager { - fn new(pool: Arc>) -> Self { +impl ThreadManager +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn new(pool: Arc>, handler: TH) -> Self { let (new_task_sender, new_task_receiver) = unbounded(); let (session_result_sender, session_result_receiver) = unbounded(); let handler_count = pool.handler_count; @@ -442,23 +826,40 @@ impl, TH: TaskHandler> ThreadManager { Self { scheduler_id: pool.new_scheduler_id(), pool, + handler, new_task_sender, - new_task_receiver, + new_task_receiver: Some(new_task_receiver), session_result_sender, session_result_receiver, session_result_with_timings: None, - scheduler_thread: None, + scheduler_thread_and_tid: None, handler_threads: Vec::with_capacity(handler_count), + accumulator_thread: None, } } + fn is_suspended(&self) -> bool { + self.scheduler_thread_and_tid.is_none() + } + + pub fn take_scheduler_thread(&mut self) -> Option>> { + self.scheduler_thread_and_tid + .take() + .map(|(thread, _tid)| thread) + } + fn execute_task_with_handler( + handler: &TH, bank: &Arc, executed_task: &mut Box, handler_context: &HandlerContext, + send_metrics: bool, ) { + let handler_timings = + send_metrics.then_some((Measure::start("process_message_time"), ThreadTime::now())); debug!("handling task at {:?}", thread::current()); TH::handle( + handler, &mut executed_task.result_with_timings.0, &mut executed_task.result_with_timings.1, bank, @@ -466,30 +867,77 @@ impl, TH: TaskHandler> ThreadManager { executed_task.task.task_index(), handler_context, ); + if let Some((mut wall_time, cpu_time)) = handler_timings { + executed_task.handler_timings = Some(HandlerTimings { + finish_time: SystemTime::now(), + execution_cpu_us: cpu_time.elapsed().as_micros(), + execution_us: { + // make wall time is longer than cpu time, always + wall_time.stop(); + wall_time.as_us() + }, + }); + } } fn accumulate_result_with_timings( - (result, timings): &mut ResultWithTimings, + (_result, timings): &mut ResultWithTimings, executed_task: Box, ) { - match executed_task.result_with_timings.0 { - Ok(()) => {} - Err(error) => { - error!("error is detected while accumulating....: {error:?}"); - // Override errors intentionally for simplicity, not retaining the - // first error unlike the block verification in the - // blockstore_processor. This will be addressed with more - // full-fledged impl later. 
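The numbered comment above summarizes the shape of the pipeline: the scheduler thread dispatches runnable tasks to handler threads, and each handler replies back with an executed task. A minimal self-contained sketch of that shape using plain std::thread and crossbeam_channel (DummyTask and the single handler lane are illustrative stand-ins, not the crate's real Task type or handler pool):

use {crossbeam_channel::unbounded, std::thread};

// Stand-in for the crate's Task type; it only carries an index here.
#[derive(Debug)]
struct DummyTask(usize);

fn main() {
    // scheduler -> handler: runnable tasks; handler -> scheduler: executed tasks
    let (runnable_sender, runnable_receiver) = unbounded::<DummyTask>();
    let (executed_sender, executed_receiver) = unbounded::<DummyTask>();

    // handler thread: process each dispatched task and reply back as an executed task
    let handler_thread = thread::spawn(move || {
        for task in runnable_receiver {
            // the real handler would run execute_batch() here and record timings
            executed_sender.send(task).unwrap();
        }
        // executed_sender is dropped here, ending the scheduler's receive loop below
    });

    // scheduler side: dispatch runnable tasks, then close the channel to end the session
    for index in 0..3 {
        runnable_sender.send(DummyTask(index)).unwrap();
    }
    drop(runnable_sender);

    // scheduler side: deschedule executed tasks and accumulate their results/timings
    for executed in executed_receiver {
        println!("executed task {}", executed.0);
    }
    handler_thread.join().unwrap();
}

In the crate itself the task channel is the chained channel seen earlier, which additionally lets the scheduler rotate in a new SchedulingContext and a fresh channel pair between sessions instead of simply dropping the sender.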
- *result = Err(error); - } - } + assert_matches!(executed_task.result_with_timings.0, Ok(())); timings.accumulate(&executed_task.result_with_timings.1); + + if let Some(handler_timings) = &executed_task.handler_timings { + let thread = format!("solScExLane{:02}", executed_task.thx); + let signature = executed_task.task.transaction().signature().to_string(); + let account_locks_in_json = serde_json::to_string( + &executed_task + .task + .transaction() + .get_account_locks_unchecked(), + ) + .unwrap(); + let status = format!("{:?}", executed_task.result_with_timings.0); + let compute_unit_price = executed_task + .task + .transaction() + .get_compute_budget_details(false) + .map(|d| d.compute_unit_price) + .unwrap_or_default(); + + datapoint_info_at!( + handler_timings.finish_time, + "transaction_timings", + ("slot", executed_task.slot, i64), + ("index", executed_task.task.task_index(), i64), + ("thread", thread, String), + ("signature", signature, String), + ("account_locks_in_json", account_locks_in_json, String), + ("status", status, String), + ("duration", handler_timings.execution_us, i64), + ("cpu_duration", handler_timings.execution_cpu_us, i64), + ("compute_units", 0 /*task.cu*/, i64), + ("priority", compute_unit_price, i64), // old name is kept for compat... + ); + } + + drop(executed_task); } fn take_session_result_with_timings(&mut self) -> ResultWithTimings { self.session_result_with_timings.take().unwrap() } + fn reset_session_on_error(&mut self) -> Result<()> { + let err = self + .session_result_with_timings + .replace(initialized_result_with_timings()) + .unwrap() + .0; + assert_matches!(err, Err(_)); + err + } + fn put_session_result_with_timings(&mut self, result_with_timings: ResultWithTimings) { assert_matches!( self.session_result_with_timings @@ -498,11 +946,38 @@ impl, TH: TaskHandler> ThreadManager { ); } - fn start_threads(&mut self, context: &SchedulingContext) { - let (mut runnable_task_sender, runnable_task_receiver) = + fn start_or_try_resume_threads(&mut self, context: &SchedulingContext) -> Result<()> { + if !self.is_suspended() { + // this can't be promoted to panic! as read => write upgrade isn't completely + // race-free in ensure_thread_manager_resumed()... + warn!("try_resume(): already resumed"); + return Ok(()); + } else if self + .session_result_with_timings + .as_ref() + .map(|(result, _)| result.is_err()) + .unwrap_or(false) + { + warn!("try_resume(): skipping resuming due to err, while resetting session result"); + return self.reset_session_on_error(); + } + debug!("try_resume(): doing now"); + + let send_metrics = env::var("SOLANA_TRANSACTION_TIMINGS").is_ok(); + + let (mut blocked_task_sender, blocked_task_receiver) = chained_channel::unbounded::(context.clone()); + let (idle_task_sender, idle_task_receiver) = unbounded::(); let (finished_task_sender, finished_task_receiver) = unbounded::>(); - + let (finished_idle_task_sender, finished_idle_task_receiver) = + unbounded::>(); + let (executed_task_sender, executed_task_receiver) = unbounded::(); + let (accumulated_result_sender, accumulated_result_receiver) = + unbounded::>(); + + let scheduler_id = self.scheduler_id; + let mut slot = context.bank().slot(); + let (tid_sender, tid_receiver) = bounded(1); let mut result_with_timings = self.session_result_with_timings.take(); // High-level flow of new tasks: @@ -512,13 +987,16 @@ impl, TH: TaskHandler> ThreadManager { // 4. the handler thread processes the dispatched task. // 5. the handler thread reply back to the scheduler thread as an executed task. 
// 6. the scheduler thread post-processes the executed task. + // 7. the scheduler thread send the executed task to the accumulator thread. + // 8. the accumulator thread examines the executed task's result and accumulate its timing, + // finally dropping the transaction inside the executed task. let scheduler_main_loop = || { let handler_count = self.pool.handler_count; let session_result_sender = self.session_result_sender.clone(); - let new_task_receiver = self.new_task_receiver.clone(); + let mut new_task_receiver = self.new_task_receiver.take().unwrap(); let mut session_ending = false; - let mut active_task_count: usize = 0; + let mut thread_suspending = false; // Now, this is the main loop for the scheduler thread, which is a special beast. // @@ -558,95 +1036,274 @@ impl, TH: TaskHandler> ThreadManager { // cycles out of the scheduler thread. Thus, any kinds of unessential overhead sources // like syscalls, VDSO, and even memory (de)allocation should be avoided at all costs // by design or by means of offloading at the last resort. - move || loop { - let mut is_finished = false; - while !is_finished { - select! { - recv(finished_task_receiver) -> executed_task => { - let executed_task = executed_task.unwrap(); - - active_task_count = active_task_count.checked_sub(1).unwrap(); - let result_with_timings = result_with_timings.as_mut().unwrap(); - Self::accumulate_result_with_timings(result_with_timings, executed_task); - }, - recv(new_task_receiver) -> message => { - assert!(!session_ending); - - match message.unwrap() { - NewTaskPayload::Payload(task) => { - // so, we're NOT scheduling at all here; rather, just execute - // tx straight off. the inter-tx locking deps aren't needed to - // be resolved in the case of single-threaded FIFO like this. - runnable_task_sender + move || { + const BITS_PER_HEX_DIGIT: usize = 4; + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let mut log_interval = LogInterval::default(); + // hint compiler about inline[never] and unlikely? + macro_rules! log_scheduler { + ($prefix:tt) => { + info!( + "[sch_{:0width$x}]: slot: {}[{:12}]({}{}): state_machine(({}(+{})=>{})/{}|{}) channels(<{} >{}+{} <{}+{})", + scheduler_id, slot, + (if ($prefix) == "step" { "interval" } else { $prefix }), + (if session_ending {"S"} else {"-"}), (if thread_suspending {"T"} else {"-"}), + state_machine.active_task_count(), state_machine.unblocked_task_queue_count(), state_machine.handled_task_count(), + state_machine.total_task_count(), + state_machine.unblocked_task_count(), + new_task_receiver.len(), + blocked_task_sender.len(), idle_task_sender.len(), + finished_task_receiver.len(), finished_idle_task_receiver.len(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + }; + } + + trace!("solScheduler thread is running at: {:?}", thread::current()); + tid_sender + .send({ + #[cfg(not(target_os = "linux"))] + let tid = DUMMY_TID; + #[cfg(target_os = "linux")] + let tid = rustix::thread::gettid().as_raw_nonzero().get(); + tid + }) + .unwrap(); + let (do_now, dont_now) = (&disconnected::<()>(), &never::<()>()); + log_scheduler!("S+T:started"); + + while !thread_suspending { + let mut is_finished = false; + while !is_finished { + let state_change = select_biased! { + recv(finished_task_receiver) -> executed_task => { + let executed_task = executed_task.unwrap(); + if executed_task.is_err() { + log_scheduler!("S+T:aborted"); + // MUST: clear the addressbook before reusing this scheduler + // ... 
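// Illustrative sketch, not part of the patch above: the thread/channel topology described by
// the numbered flow (new task -> scheduler -> handler -> scheduler -> accumulator), reduced to
// plain crossbeam channels and u64 "tasks". All names are hypothetical; the real code also
// routes scheduler-to-handler traffic through the chained_channel wrapper so handlers can
// switch scheduling contexts.
use crossbeam_channel::unbounded;
use std::thread;

fn pipeline_sketch() {
    let (new_task_tx, new_task_rx) = unbounded::<u64>(); // tasks enter here
    let (runnable_tx, runnable_rx) = unbounded::<u64>(); // scheduler -> handlers
    let (finished_tx, finished_rx) = unbounded::<u64>(); // handlers -> scheduler
    let (executed_tx, executed_rx) = unbounded::<u64>(); // scheduler -> accumulator

    let scheduler = thread::spawn(move || {
        // Dispatch runnable tasks, then post-process and forward each finished one.
        for task in new_task_rx {
            runnable_tx.send(task).unwrap();
            let done = finished_rx.recv().unwrap();
            executed_tx.send(done).unwrap();
        }
    });
    let handler = thread::spawn(move || {
        // "Execute" each dispatched task and reply back to the scheduler.
        for task in runnable_rx {
            finished_tx.send(task * 2).unwrap();
        }
    });
    let accumulator = thread::spawn(move || {
        // Fold results; dropping the task payload happens off the scheduler thread.
        executed_rx.iter().sum::<u64>()
    });

    for task in 1..=3 {
        new_task_tx.send(task).unwrap();
    }
    drop(new_task_tx); // the hang-up propagates through the pipeline and ends every loop below
    scheduler.join().unwrap();
    handler.join().unwrap();
    assert_eq!(accumulator.join().unwrap(), 12);
}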
+ session_result_sender.send(None).unwrap(); + // be explicit about specifically dropping this receiver + drop(new_task_receiver); + // this timings aren't for the accumulated one. but + // caller doesn't care. + return Some(executed_task.result_with_timings); + } else { + state_machine.deschedule_task(&executed_task.task); + executed_task_sender.send_buffered(ExecutedTaskPayload::Payload(executed_task)).unwrap(); + } + "step" + }, + recv(if state_machine.has_unblocked_task() { do_now } else { dont_now }) -> dummy_result => { + assert_matches!(dummy_result, Err(RecvError)); + + if let Some(task) = state_machine.schedule_unblocked_task() { + blocked_task_sender .send_payload(task) .unwrap(); - active_task_count = active_task_count.checked_add(1).unwrap(); } - NewTaskPayload::OpenSubchannel(context) => { - // signal about new SchedulingContext to handler threads - runnable_task_sender - .send_chained_channel(context, handler_count) - .unwrap(); - assert_matches!( - result_with_timings.replace(initialized_result_with_timings()), - None - ); + "step" + }, + recv(new_task_receiver) -> message => { + assert!(message.is_err() || (!session_ending && !thread_suspending)); + match message { + Ok(NewTaskPayload::Payload(task)) => { + if let Some(task) = state_machine.schedule_task(task) { + idle_task_sender.send(task).unwrap(); + } + "step" + } + Ok(NewTaskPayload::OpenSubchannel(context)) => { + slot = context.bank().slot(); + blocked_task_sender + .send_chained_channel(context, handler_count) + .unwrap(); + executed_task_sender + .send(ExecutedTaskPayload::OpenSubchannel(())) + .unwrap(); + "S:started" + } + Ok(NewTaskPayload::CloseSubchannel) => { + session_ending = true; + "S:ending" + } + Err(_) => { + assert!(!thread_suspending); + thread_suspending = true; + + // Err(_) on new_task_receiver guarantees + // that there's no live sender and no messages to be + // received anymore; so dropping by overriding it with + // never() should pose no possibility of missed messages. + new_task_receiver = never(); + + "T:suspending" + } } - NewTaskPayload::CloseSubchannel => { - session_ending = true; + }, + recv(finished_idle_task_receiver) -> executed_task => { + let executed_task = executed_task.unwrap(); + if executed_task.is_err() { + log_scheduler!("S+T:aborted"); + session_result_sender.send(None).unwrap(); + // be explicit about specifically dropping this receiver + drop(new_task_receiver); + // this timings aren't for the accumulated one. but + // caller doesn't care. + return Some(executed_task.result_with_timings); + } else { + state_machine.deschedule_task(&executed_task.task); + executed_task_sender.send_buffered(ExecutedTaskPayload::Payload(executed_task)).unwrap(); } - } - }, - }; + "step" + }, + }; + if state_change != "step" || log_interval.increment() { + log_scheduler!(state_change); + } - // a really simplistic termination condition, which only works under the - // assumption of single handler thread... 
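// Illustrative sketch, not part of the patch above: the `do_now` / `dont_now` trick turns a
// select arm on or off by pointing it at a receiver that is either permanently ready
// (disconnected) or permanently silent (`never()`). The `disconnected()` helper used above is
// not part of stock crossbeam-channel; an equivalent receiver can be built by dropping the
// sender, as below. The real loop uses a biased select for strict arm priority; plain `select!`
// is enough to show the mechanism.
use crossbeam_channel::{never, select, unbounded, Receiver};

fn disconnected_receiver() -> Receiver<()> {
    let (sender, receiver) = unbounded();
    drop(sender); // every recv() on `receiver` now fails immediately with RecvError
    receiver
}

fn next_item(buffered: &mut Vec<u64>, incoming: &Receiver<u64>) -> Option<u64> {
    let do_now = disconnected_receiver();
    let dont_now = never::<()>();
    select! {
        // Only selectable while there is buffered work; otherwise this arm waits on `never()`
        // forever and the `incoming` arm gets full attention.
        recv(if !buffered.is_empty() { &do_now } else { &dont_now }) -> dummy => {
            assert!(dummy.is_err());
            buffered.pop()
        },
        recv(incoming) -> item => item.ok(),
    }
}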
- is_finished = session_ending && active_task_count == 0; + is_finished = state_machine.has_no_active_task() + && (session_ending || thread_suspending); + } + + if session_ending { + log_scheduler!("S:ended"); + state_machine.reinitialize(); + log_interval = LogInterval::default(); + executed_task_sender + .send(ExecutedTaskPayload::CloseSubchannel) + .unwrap(); + session_result_sender + .send(Some( + accumulated_result_receiver + .recv() + .unwrap() + .unwrap_or_else(initialized_result_with_timings), + )) + .unwrap(); + if !thread_suspending { + session_ending = false; + } + } } - if session_ending { - session_result_sender - .send(Some( - result_with_timings - .take() - .unwrap_or_else(initialized_result_with_timings), - )) + log_scheduler!("T:suspended"); + let result_with_timings = if session_ending { + None + } else { + executed_task_sender + .send(ExecutedTaskPayload::CloseSubchannel) .unwrap(); - session_ending = false; - } + accumulated_result_receiver.recv().unwrap() + }; + trace!( + "solScheduler thread is terminating at: {:?}", + thread::current() + ); + result_with_timings } }; - let handler_main_loop = || { + let handler_main_loop = |thx| { let pool = self.pool.clone(); - let mut runnable_task_receiver = runnable_task_receiver.clone(); + let handler = self.handler.clone(); + let mut blocked_task_receiver = blocked_task_receiver.clone(); + let mut idle_task_receiver = idle_task_receiver.clone(); let finished_task_sender = finished_task_sender.clone(); + let finished_idle_task_sender = finished_idle_task_sender.clone(); - move || loop { - let (task, sender) = select! { - recv(runnable_task_receiver.for_select()) -> message => { - if let Some(task) = runnable_task_receiver.after_select(message.unwrap()) { - (task, &finished_task_sender) - } else { - continue; - } - }, - }; - let mut task = ExecutedTask::new_boxed(task); - Self::execute_task_with_handler( - runnable_task_receiver.context().bank(), - &mut task, - &pool.handler_context, + move || { + trace!( + "solScHandler{:02} thread is running at: {:?}", + thx, + thread::current() + ); + loop { + let (task, sender) = select_biased! 
{ + recv(blocked_task_receiver.for_select()) -> message => { + match message { + Ok(message) => { + if let Some(task) = blocked_task_receiver.after_select(message) { + (task, &finished_task_sender) + } else { + continue; + } + }, + Err(_) => break, + } + }, + recv(idle_task_receiver) -> task => { + if let Ok(task) = task { + (task, &finished_idle_task_sender) + } else { + idle_task_receiver = never(); + continue; + } + }, + }; + let bank = blocked_task_receiver.context().bank(); + let mut task = ExecutedTask::new_boxed(task, thx, bank.slot()); + Self::execute_task_with_handler( + &handler, + bank, + &mut task, + &pool.handler_context, + send_metrics, + ); + if sender.send(task).is_err() { + break; + } + } + trace!( + "solScHandler{:02} thread is terminating at: {:?}", + thx, + thread::current() ); - sender.send(task).unwrap(); } }; - self.scheduler_thread = Some( + let accumulator_main_loop = || { + move || 'outer: loop { + match executed_task_receiver.recv_timeout(Duration::from_millis(40)) { + Ok(ExecutedTaskPayload::Payload(executed_task)) => { + let result_with_timings = result_with_timings.as_mut().unwrap(); + Self::accumulate_result_with_timings(result_with_timings, executed_task); + } + Ok(ExecutedTaskPayload::OpenSubchannel(())) => { + assert_matches!( + result_with_timings.replace(initialized_result_with_timings()), + None + ); + } + Ok(ExecutedTaskPayload::CloseSubchannel) => { + if accumulated_result_sender + .send(result_with_timings.take()) + .is_err() + { + break 'outer; + } + } + Err(RecvTimeoutError::Disconnected) => break 'outer, + Err(RecvTimeoutError::Timeout) => continue, + } + } + }; + + self.scheduler_thread_and_tid = Some(( thread::Builder::new() .name("solScheduler".to_owned()) .spawn(scheduler_main_loop()) .unwrap(), + tid_receiver.recv().unwrap(), + )); + + self.accumulator_thread = Some( + thread::Builder::new() + .name("solScAccmltr".to_owned()) + .spawn(accumulator_main_loop()) + .unwrap(), ); self.handler_threads = (0..self.pool.handler_count) @@ -654,95 +1311,194 @@ impl, TH: TaskHandler> ThreadManager { |thx| { thread::Builder::new() .name(format!("solScHandler{:02}", thx)) - .spawn(handler_main_loop()) + .spawn(handler_main_loop(thx)) .unwrap() } }) .collect(); + Ok(()) } - fn send_task(&self, task: Task) { + fn send_task(&self, task: Task) -> bool { debug!("send_task()"); self.new_task_sender .send(NewTaskPayload::Payload(task)) - .unwrap() + .is_err() } fn end_session(&mut self) { - if self.session_result_with_timings.is_some() { + debug!("end_session(): will end session..."); + if self.is_suspended() { + debug!("end_session(): no threads.."); + assert_matches!(self.session_result_with_timings, Some(_)); + return; + } else if self.session_result_with_timings.is_some() { debug!("end_session(): already result resides within thread manager.."); return; } - debug!("end_session(): will end session..."); - self.new_task_sender + let mut abort_detected = self + .new_task_sender .send(NewTaskPayload::CloseSubchannel) - .unwrap(); + .is_err(); if let Some(result_with_timings) = self.session_result_receiver.recv().unwrap() { + assert!(!abort_detected); self.put_session_result_with_timings(result_with_timings); + } else { + abort_detected = true; + } + + if abort_detected { + self.suspend(); } } fn start_session(&mut self, context: &SchedulingContext) { - assert_matches!(self.session_result_with_timings, None); - self.new_task_sender - .send(NewTaskPayload::OpenSubchannel(context.clone())) - .unwrap(); + if !self.is_suspended() { + 
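// Illustrative sketch, not part of the patch above: the accumulator loop runs on
// `recv_timeout()` so it can tell "nothing to do right now" apart from "all senders are gone",
// and it treats a failed send of the accumulated result as its own shutdown signal. A reduced
// version with u64 payloads and shortened, hypothetical names.
use crossbeam_channel::{Receiver, RecvTimeoutError, Sender};
use std::time::Duration;

fn accumulator_loop(executed_rx: Receiver<u64>, result_tx: Sender<u64>) {
    let mut total = 0u64;
    loop {
        match executed_rx.recv_timeout(Duration::from_millis(40)) {
            Ok(value) => total += value,                  // fold one executed task's timings
            Err(RecvTimeoutError::Timeout) => continue,   // idle tick; keep waiting
            Err(RecvTimeoutError::Disconnected) => break, // scheduler side hung up: shut down
        }
    }
    // The receiving side may already be gone during teardown; ignoring the error here mirrors
    // the `is_err()` checks used above instead of unwrapping.
    let _ = result_tx.send(total);
}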
assert_matches!(self.session_result_with_timings, None); + self.new_task_sender + .send(NewTaskPayload::OpenSubchannel(context.clone())) + .unwrap(); + } else { + self.put_session_result_with_timings(initialized_result_with_timings()); + assert_matches!(self.start_or_try_resume_threads(context), Ok(())); + } + } + + fn suspend(&mut self) { + let Some(scheduler_thread) = self.take_scheduler_thread() else { + warn!("suspend(): already suspended..."); + return; + }; + debug!("suspend(): terminating threads by {:?}", thread::current()); + + let (s, r) = unbounded(); + (self.new_task_sender, self.new_task_receiver) = (s, Some(r)); + + let () = self.accumulator_thread.take().unwrap().join().unwrap(); + for thread in self.handler_threads.drain(..) { + debug!("joining...: {:?}", thread); + () = thread.join().unwrap(); + } + if let Some(result_with_timings) = scheduler_thread.join().unwrap() { + self.put_session_result_with_timings(result_with_timings); + } + + debug!( + "suspend(): successfully suspended threads by {:?}", + thread::current() + ); + } + + fn is_primary(&self) -> bool { + self.scheduler_id == PRIMARY_SCHEDULER_ID + } + + #[cfg(target_os = "linux")] + fn active_tid_if_not_primary(&self) -> Option { + if self.is_primary() { + // always exempt from cleaner... + None + } else { + self.scheduler_thread_and_tid.as_ref().map(|&(_, tid)| tid) + } } } -pub trait SpawnableScheduler: InstalledScheduler { - type Inner: Debug + Send + Sync; +pub trait SpawnableScheduler: InstalledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + type Inner: Debug + Send + Sync + RetirableSchedulerInner; fn into_inner(self) -> (ResultWithTimings, Self::Inner); fn from_inner(inner: Self::Inner, context: SchedulingContext) -> Self; - fn spawn(pool: Arc>, initial_context: SchedulingContext) -> Self + fn spawn( + pool: Arc>, + initial_context: SchedulingContext, + handler: TH, + ) -> Self where Self: Sized; } -impl SpawnableScheduler for PooledScheduler { - type Inner = PooledSchedulerInner; +pub trait RetirableSchedulerInner { + fn retire_if_stale(&mut self) -> bool; +} + +impl SpawnableScheduler for PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + type Inner = PooledSchedulerInner; - fn into_inner(mut self) -> (ResultWithTimings, Self::Inner) { + fn into_inner(self) -> (ResultWithTimings, Self::Inner) { let result_with_timings = { - let manager = &mut self.inner.thread_manager; + let manager = &mut self.inner.thread_manager.write().unwrap(); manager.end_session(); manager.take_session_result_with_timings() }; (result_with_timings, self.inner) } - fn from_inner(mut inner: Self::Inner, context: SchedulingContext) -> Self { - inner.thread_manager.start_session(&context); + fn from_inner(inner: Self::Inner, context: SchedulingContext) -> Self { + inner + .thread_manager + .write() + .unwrap() + .start_session(&context); Self { inner, context } } - fn spawn(pool: Arc>, initial_context: SchedulingContext) -> Self { - let mut scheduler = Self::do_spawn(pool, initial_context); - scheduler - .inner - .thread_manager - .start_threads(&scheduler.context); + fn spawn( + pool: Arc>, + initial_context: SchedulingContext, + handler: TH, + ) -> Self { + let scheduler = Self::do_spawn(pool.clone(), initial_context, handler); + pool.register_to_cleaner(Arc::downgrade(&scheduler.inner.thread_manager)); scheduler } } -impl InstalledScheduler for PooledScheduler { +impl InstalledScheduler for PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ fn id(&self) -> 
SchedulerId { - self.inner.thread_manager.scheduler_id + self.inner.id() } fn context(&self) -> &SchedulingContext { &self.context } - fn schedule_execution(&self, &(transaction, index): &(&SanitizedTransaction, usize)) { - let task = Task::create_task(transaction.clone(), index); - self.inner.thread_manager.send_task(task); + fn schedule_execution( + &self, + transaction_with_index: SEA::TransactionWithIndex<'_>, + ) -> Result<()> { + transaction_with_index.with_transaction_and_index(|transaction, index| { + let task = + SchedulingStateMachine::create_task(transaction.clone(), index, &mut |pubkey| { + self.inner.address_book.load(pubkey) + }); + let abort_detected = self + .ensure_thread_manager_resumed(&self.context)? + .send_task(task); + if abort_detected { + let thread_manager = &mut self.inner.thread_manager.write().unwrap(); + thread_manager.suspend(); + thread_manager.reset_session_on_error() + } else { + Ok(()) + } + }) } fn wait_for_termination( @@ -754,17 +1510,78 @@ impl InstalledScheduler for PooledScheduler { } fn pause_for_recent_blockhash(&mut self) { - self.inner.thread_manager.end_session(); + self.inner.thread_manager.write().unwrap().end_session(); } } -impl UninstalledScheduler for PooledSchedulerInner +impl UninstalledScheduler for PooledSchedulerInner where - S: SpawnableScheduler>, - TH: TaskHandler, + S: SpawnableScheduler>, + TH: TaskHandler, + SEA: ScheduleExecutionArg, { - fn return_to_pool(self: Box) { - self.thread_manager.pool.clone().return_scheduler(*self) + fn return_to_pool(mut self: Box) { + let pool = self.thread_manager.write().unwrap().pool.clone(); + self.pooled_at = Instant::now(); + pool.return_scheduler(*self) + } +} + +impl RetirableSchedulerInner for PooledSchedulerInner +where + S: SpawnableScheduler>, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn retire_if_stale(&mut self) -> bool { + // reap threads after 10mins of inactivity for any pooled (idle) schedulers. The primary + // scheduler is special-cased to empty its address book instead, for easier monitoring to + // accumulate os-level thread metrics. The duration is chosen based on the rough estimation + // from the frequency of short-lived forks on the mainnet-beta, with consideration of some + // increased forking at epoch boundaries. 
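// Illustrative sketch, not part of the patch above: the `suspend()` path taken when an abort is
// detected stops the worker threads without a dedicated stop message. Swapping in a fresh task
// channel drops the only live Sender, the worker's receive loop sees the disconnection and
// exits, and only then is the JoinHandle reaped; the fresh pair is kept for a later resume. A
// reduced version of that shape, with hypothetical names.
use crossbeam_channel::{unbounded, Receiver, Sender};
use std::thread::{self, JoinHandle};

struct Manager {
    task_sender: Sender<u64>,
    task_receiver: Option<Receiver<u64>>, // taken by the worker thread when it starts
    worker: Option<JoinHandle<u64>>,
}

impl Manager {
    fn start(&mut self) {
        let receiver = self.task_receiver.take().unwrap();
        self.worker = Some(thread::spawn(move || receiver.iter().sum()));
    }

    fn suspend(&mut self) -> Option<u64> {
        let worker = self.worker.take()?;
        // Replacing the channel drops the old sender, which is what lets the worker's receive
        // loop terminate; the new pair stays around so start() can be called again later.
        let (sender, receiver) = unbounded();
        self.task_sender = sender;
        self.task_receiver = Some(receiver);
        Some(worker.join().unwrap())
    }
}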
+ const IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM: Duration = Duration::from_secs(600); + + const BITS_PER_HEX_DIGIT: usize = 4; + let page_count = self.address_book.page_count(); + if page_count < 200_000 { + info!( + "[sch_{:0width$x}]: cleaner: address book size: {page_count}...", + self.id(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + } else if self.thread_manager.read().unwrap().is_primary() { + info!( + "[sch_{:0width$x}]: cleaner: too big address book size: {page_count}...; emptying the primary scheduler", + self.id(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + self.address_book.clear(); + return true; + } else { + info!( + "[sch_{:0width$x}]: cleaner: too big address book size: {page_count}...; retiring scheduler", + self.id(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + self.suspend_thread_manager(); + return false; + } + + let pooled_duration = self.pooled_since(); + if pooled_duration <= IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM { + true + } else if !self.thread_manager.read().unwrap().is_primary() { + info!( + "[sch_{:0width$x}]: cleaner: retiring unused scheduler after {:?}...", + self.id(), + pooled_duration, + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + self.suspend_thread_manager(); + false + } else { + true + } } } @@ -772,7 +1589,6 @@ where mod tests { use { super::*, - assert_matches::assert_matches, solana_runtime::{ bank::Bank, bank_forks::BankForks, @@ -783,11 +1599,12 @@ mod tests { solana_sdk::{ clock::MAX_PROCESSING_AGE, pubkey::Pubkey, + scheduling::SchedulingMode, signer::keypair::Keypair, system_transaction, transaction::{SanitizedTransaction, TransactionError}, }, - std::{sync::Arc, thread::JoinHandle}, + std::{mem, sync::Arc, thread::JoinHandle}, }; #[test] @@ -800,7 +1617,10 @@ mod tests { // this indirectly proves that there should be circular link because there's only one Arc // at this moment now - assert_eq!((Arc::strong_count(&pool), Arc::weak_count(&pool)), (1, 1)); + assert_eq!( + (Arc::strong_count(&pool), Arc::weak_count(&pool)), + (1 + 1 /* todo */, 1) + ); let debug = format!("{pool:#?}"); assert!(!debug.is_empty()); } @@ -813,7 +1633,7 @@ mod tests { let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); let bank = Arc::new(Bank::default_for_tests()); - let context = SchedulingContext::new(bank); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank); let scheduler = pool.take_scheduler(context); let debug = format!("{scheduler:#?}"); @@ -828,7 +1648,7 @@ mod tests { let pool = DefaultSchedulerPool::new(None, None, None, None, ignored_prioritization_fee_cache); let bank = Arc::new(Bank::default_for_tests()); - let context = &SchedulingContext::new(bank); + let context = &SchedulingContext::new(SchedulingMode::BlockVerification, bank); let scheduler1 = pool.do_take_scheduler(context.clone()); let scheduler_id1 = scheduler1.id(); @@ -857,7 +1677,7 @@ mod tests { let pool = DefaultSchedulerPool::new(None, None, None, None, ignored_prioritization_fee_cache); let bank = Arc::new(Bank::default_for_tests()); - let context = &SchedulingContext::new(bank); + let context = &SchedulingContext::new(SchedulingMode::BlockVerification, bank); let mut scheduler = pool.do_take_scheduler(context.clone()); // should never panic. 
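// Illustrative sketch, not part of the patch above: the cleaner policy implemented by
// `retire_if_stale()`, reduced to a pure decision function. The duration and page-count values
// are the ones used above; the threshold constant name is made up here. `true` means "keep the
// scheduler pooled", `false` means "retire it".
use std::time::Duration;

const IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM: Duration = Duration::from_secs(600);
const MAX_POOLED_ADDRESS_BOOK_PAGES: usize = 200_000;

fn should_keep_pooled(page_count: usize, pooled_duration: Duration, is_primary: bool) -> bool {
    if page_count >= MAX_POOLED_ADDRESS_BOOK_PAGES {
        // Oversized address book: the primary scheduler is only emptied (and kept); any other
        // scheduler is retired outright.
        return is_primary;
    }
    if pooled_duration <= IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM {
        // Recently used: keep it regardless of which scheduler it is.
        return true;
    }
    // Idle for too long: only the primary scheduler is exempt from thread reclamation.
    is_primary
}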
@@ -879,8 +1699,10 @@ mod tests { let new_bank = &Arc::new(Bank::default_for_tests()); assert!(!Arc::ptr_eq(old_bank, new_bank)); - let old_context = &SchedulingContext::new(old_bank.clone()); - let new_context = &SchedulingContext::new(new_bank.clone()); + let old_context = + &SchedulingContext::new(SchedulingMode::BlockVerification, old_bank.clone()); + let new_context = + &SchedulingContext::new(SchedulingMode::BlockVerification, new_bank.clone()); let scheduler = pool.do_take_scheduler(old_context.clone()); let scheduler_id = scheduler.id(); @@ -897,11 +1719,14 @@ mod tests { let bank = Bank::default_for_tests(); let bank_forks = BankForks::new_rw_arc(bank); - let mut bank_forks = bank_forks.write().unwrap(); + let mut bank_forks_write = bank_forks.write().unwrap(); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); - bank_forks.install_scheduler_pool(pool); + bank_forks_write.install_scheduler_pool(pool); + bank_forks_write.prepare_to_drop(); + drop(bank_forks_write); + drop::(Arc::into_inner(bank_forks).unwrap().into_inner().unwrap()); } #[test] @@ -968,11 +1793,11 @@ mod tests { let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); assert_eq!(bank.transaction_count(), 0); let scheduler = pool.take_scheduler(context); - scheduler.schedule_execution(&(tx0, 0)); + assert_matches!(scheduler.schedule_execution(&(tx0, 0)), Ok(())); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_matches!(bank.wait_for_completed_scheduler(), Some((Ok(()), _))); assert_eq!(bank.transaction_count(), 1); @@ -993,7 +1818,7 @@ mod tests { let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let mut scheduler = pool.take_scheduler(context); let unfunded_keypair = Keypair::new(); @@ -1005,9 +1830,9 @@ mod tests { genesis_config.hash(), )); assert_eq!(bank.transaction_count(), 0); - scheduler.schedule_execution(&(bad_tx, 0)); + assert_matches!(scheduler.schedule_execution(&(bad_tx, 0)), Ok(())); // simulate the task-sending thread is stalled for some reason. - std::thread::sleep(std::time::Duration::from_secs(1)); + thread::sleep(Duration::from_secs(1)); assert_eq!(bank.transaction_count(), 0); let good_tx_after_bad_tx = @@ -1023,25 +1848,26 @@ mod tests { .result, Ok(_) ); - scheduler.schedule_execution(&(good_tx_after_bad_tx, 0)); + thread::sleep(Duration::from_secs(3)); + assert_matches!( + scheduler.schedule_execution(&(good_tx_after_bad_tx, 0)), + Err(_) + ); + error!("last pause!"); scheduler.pause_for_recent_blockhash(); // transaction_count should remain same as scheduler should be bailing out. // That's because we're testing the serialized failing execution case in this test. - // However, currently threaded impl can't properly abort in this situtation.. - // so, 1 should be observed, intead of 0. 
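// Illustrative sketch, not part of the patch above: the test teardown uses
// `Arc::into_inner(..).unwrap()` as an assertion that no other strong reference (for example
// one held by a still-registered scheduler pool or cleaner) is left alive. A minimal
// standalone version of that pattern.
use std::sync::{Arc, RwLock};

fn assert_sole_owner_and_drop<T>(shared: Arc<RwLock<T>>) {
    // `Arc::into_inner` returns None if any other strong reference still exists, so the expect
    // doubles as a leak check; `RwLock::into_inner` then hands back the plain value to drop.
    let inner: T = Arc::into_inner(shared)
        .expect("no other Arc strong refs may remain at teardown")
        .into_inner()
        .expect("lock must not be poisoned");
    drop(inner);
}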
// Also note that bank.transaction_count() is generally racy by nature, because // blockstore_processor and unified_scheduler both tend to process non-conflicting batches // in parallel as part of the normal operation. - assert_eq!(bank.transaction_count(), 1); + assert_eq!(bank.transaction_count(), 0); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_matches!( bank.wait_for_completed_scheduler(), - Some(( - Err(solana_sdk::transaction::TransactionError::AccountNotFound), - _timings - )) + Some((Ok(()), _timings)) ); + pool.uninstalled_from_bank_forks(); } #[derive(Debug)] @@ -1049,7 +1875,7 @@ mod tests { Mutex, Mutex>>, SchedulingContext, - Arc>, + Arc>, ); impl AsyncScheduler { @@ -1068,7 +1894,7 @@ mod tests { } } - impl InstalledScheduler + impl InstalledScheduler for AsyncScheduler { fn id(&self) -> SchedulerId { @@ -1079,20 +1905,24 @@ mod tests { &self.2 } - fn schedule_execution(&self, &(transaction, index): &(&SanitizedTransaction, usize)) { + fn schedule_execution( + &self, + &(transaction, index): &(&SanitizedTransaction, usize), + ) -> Result<()> { let transaction_and_index = (transaction.clone(), index); let context = self.context().clone(); let pool = self.3.clone(); - self.1.lock().unwrap().push(std::thread::spawn(move || { + self.1.lock().unwrap().push(thread::spawn(move || { // intentionally sleep to simulate race condition where register_recent_blockhash // is handle before finishing executing scheduled transactions - std::thread::sleep(std::time::Duration::from_secs(1)); + thread::sleep(Duration::from_secs(1)); let mut result = Ok(()); let mut timings = ExecuteTimings::default(); - ::handle( + >::handle( + &DefaultTaskHandler, &mut result, &mut timings, context.bank(), @@ -1102,6 +1932,8 @@ mod tests { ); (result, timings) })); + + Ok(()) } fn wait_for_termination( @@ -1109,7 +1941,7 @@ mod tests { _is_dropped: bool, ) -> (ResultWithTimings, UninstalledSchedulerBox) { self.do_wait(); - let result_with_timings = std::mem::replace( + let result_with_timings = mem::replace( &mut *self.0.lock().unwrap(), initialized_result_with_timings(), ); @@ -1134,7 +1966,8 @@ mod tests { } } - impl SpawnableScheduler + impl + SpawnableScheduler for AsyncScheduler { // well, i wish i can use ! (never type)..... 
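// Illustrative sketch, not part of the patch above: the AsyncScheduler test double defers each
// scheduled execution onto its own thread and only joins them when termination is requested,
// which is what lets the test provoke the register_recent_blockhash race described above. A
// reduced version with hypothetical names and a String error type.
use std::{
    sync::Mutex,
    thread::{self, JoinHandle},
    time::Duration,
};

#[derive(Default)]
struct DeferredExecutor {
    threads: Mutex<Vec<JoinHandle<Result<(), String>>>>,
}

impl DeferredExecutor {
    fn schedule(&self, work: impl FnOnce() -> Result<(), String> + Send + 'static) {
        self.threads.lock().unwrap().push(thread::spawn(move || {
            // Deliberately delay, so the caller can advance other state (e.g. register a new
            // blockhash) before the "transaction" actually executes.
            thread::sleep(Duration::from_millis(100));
            work()
        }));
    }

    fn wait_for_termination(&self) -> Result<(), String> {
        self.threads
            .lock()
            .unwrap()
            .drain(..)
            .map(|handle| handle.join().unwrap())
            .collect()
    }
}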
@@ -1149,8 +1982,9 @@ mod tests { } fn spawn( - pool: Arc>, + pool: Arc>, initial_context: SchedulingContext, + _handler: DefaultTaskHandler, ) -> Self { AsyncScheduler::( Mutex::new(initialized_result_with_timings()), @@ -1161,6 +1995,14 @@ mod tests { } } + impl RetirableSchedulerInner + for AsyncScheduler + { + fn retire_if_stale(&mut self) -> bool { + unimplemented!(); + } + } + fn do_test_scheduler_schedule_execution_recent_blockhash_edge_case< const TRIGGER_RACE_CONDITION: bool, >() { @@ -1190,24 +2032,24 @@ mod tests { ); } let bank = setup_dummy_fork_graph(bank); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - SchedulerPool::, DefaultTaskHandler>::new_dyn( - None, - None, - None, - None, - ignored_prioritization_fee_cache, - ); + let pool = SchedulerPool::< + AsyncScheduler, + DefaultTaskHandler, + DefaultScheduleExecutionArg, + >::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); let scheduler = pool.take_scheduler(context); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_eq!(bank.transaction_count(), 0); // schedule but not immediately execute transaction - bank.schedule_transaction_executions([(&very_old_valid_tx, &0)].into_iter()); + assert_matches!( + bank.schedule_transaction_executions([(&very_old_valid_tx, &0)].into_iter()), + Ok(()) + ); // this calls register_recent_blockhash internally bank.fill_bank_with_ticks_for_tests();