diff --git a/Cargo.lock b/Cargo.lock index 9a61e27a12c5e9..823bcf8b0ca265 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -601,7 +601,7 @@ dependencies = [ "lazy_static", "lazycell", "peeking_take_while", - "prettyplease 0.2.4", + "prettyplease 0.2.16", "proc-macro2", "quote", "regex", @@ -610,6 +610,29 @@ dependencies = [ "syn 2.0.50", ] +[[package]] +name = "bindgen" +version = "0.69.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c69fae65a523209d34240b60abe0c42d33d1045d445c0839d8a4894a736e2d" +dependencies = [ + "bitflags 2.4.2", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "log", + "peeking_take_while", + "prettyplease 0.2.16", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.50", + "which", +] + [[package]] name = "bit-set" version = "0.5.2" @@ -1231,7 +1254,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c278839b831783b70278b14df4d45e1beb1aad306c07bb796637de9a0e323e8e" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1327,6 +1350,16 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "cpufeatures" version = "0.2.7" @@ -1397,10 +1430,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "176dc175b78f56c0f321911d9c8eb2b77a78a4860b9c19db83835fea1a46649b" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.19", ] [[package]] @@ -1411,7 +1443,7 @@ checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1420,7 +1452,7 @@ version = "0.9.5" source = "git+https://github.com/solana-labs/crossbeam?rev=fd279d707025f0e60951e429bf778b4813d1b6bf#fd279d707025f0e60951e429bf778b4813d1b6bf" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils", + "crossbeam-utils 0.8.18", "lazy_static", "memoffset 0.6.4", "scopeguard", @@ -1435,6 +1467,11 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" + [[package]] name = "crunchy" version = "0.2.2" @@ -1501,6 +1538,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "cty" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b365fabc795046672053e29c954733ec3b05e4be654ab130fe8f1f94d7051f35" + [[package]] name = "curve25519-dalek" version = "3.2.1" @@ -1583,6 +1626,15 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f32d04922c60427da6f9fef14d042d9edddef64cb9d4ce0d64d0685fbeb1fd3" +dependencies = [ + "powerfmt", +] + [[package]] name = "derivation-path" version = "0.2.0" @@ -2575,6 +2627,43 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "iai-callgrind" +version = "0.10.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e99bf26f496b13ac6273014f40afda46a233fbfb0289ce50fb4daaad2f2ffc80" +dependencies = [ + "bincode", + "bindgen 0.69.2", + "cc", + "cfg-if 1.0.0", + "cty", + "iai-callgrind-macros", + "iai-callgrind-runner", + "regex", +] + +[[package]] +name = "iai-callgrind-macros" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2a4bb39225592c0a28cfca6f70af52ebd8da23f533c2cdd0a3329c1fa252d56" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.50", +] + +[[package]] +name = "iai-callgrind-runner" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c23a951b9eccaa1e38556d27473d1462a9c247a27961812edcaac156af861282" +dependencies = [ + "serde", +] + [[package]] name = "iana-time-zone" version = "0.1.46" @@ -2942,7 +3031,7 @@ version = "0.11.0+8.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3386f101bcb4bd252d8e9d2fb41ec3b0862a15a62b478c355b2982efa469e3e" dependencies = [ - "bindgen", + "bindgen 0.65.1", "bzip2-sys", "cc", "glob", @@ -3450,15 +3539,6 @@ dependencies = [ "syn 2.0.50", ] -[[package]] -name = "num_threads" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97ba99ba6393e2c3734791401b66902d981cb03bf190af674ca69949b6d5fb15" -dependencies = [ - "libc", -] - [[package]] name = "number_prefix" version = "0.4.0" @@ -3884,6 +3964,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc59d1bcc64fc5d021d67521f818db868368028108d37f0e98d74e33f68297b5" +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.15" @@ -3938,9 +4024,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.4" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", "syn 2.0.50", @@ -4003,6 +4089,32 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "flate2", + "hex", + "lazy_static", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "hex", +] + [[package]] name = "proptest" version = "1.4.0" @@ -4317,7 +4429,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -4834,8 +4946,13 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" dependencies = [ + "base64 0.13.1", + "chrono", + "hex", "serde", + "serde_json", "serde_with_macros", + "time", 
] [[package]] @@ -7232,6 +7349,7 @@ dependencies = [ "itertools", "log", "percentage", + "rand 0.8.5", "rustc_version 0.4.0", "solana-bpf-loader-program", "solana-frozen-abi", @@ -7462,7 +7580,13 @@ dependencies = [ name = "solana-unified-scheduler-logic" version = "1.19.0" dependencies = [ + "assert_matches", + "iai-callgrind", + "qualifier_attr", "solana-sdk", + "solana-unified-scheduler-logic", + "static_assertions", + "triomphe", ] [[package]] @@ -7470,16 +7594,30 @@ name = "solana-unified-scheduler-pool" version = "1.19.0" dependencies = [ "assert_matches", + "bincode", + "cpu-time", + "criterion", "crossbeam-channel", + "dashmap", "derivative", "log", + "procfs", + "qualifier_attr", + "rand 0.8.5", + "rustix", + "serde_json", "solana-ledger", "solana-logger", + "solana-measure", + "solana-metrics", + "solana-nohash-hasher", "solana-program-runtime", "solana-runtime", "solana-sdk", "solana-unified-scheduler-logic", + "solana-unified-scheduler-pool", "solana-vote", + "tikv-jemallocator", ] [[package]] @@ -7975,6 +8113,12 @@ dependencies = [ "spl-program-error", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -8329,21 +8473,32 @@ dependencies = [ [[package]] name = "time" -version = "0.3.9" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +checksum = "c4a34ab300f2dee6e562c10a046fc05e358b29f9bf92277f30c3c8d82275f6f5" dependencies = [ + "deranged", "itoa", - "libc", - "num_threads", + "powerfmt", + "serde", + "time-core", "time-macros", ] +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + [[package]] name = "time-macros" -version = "0.2.4" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20" +dependencies = [ + "time-core", +] [[package]] name = "tiny-bip39" @@ -8713,6 +8868,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0de5f738ceab88e2491a94ddc33c3feeadfa95fedc60363ef110845df12f3878" +[[package]] +name = "triomphe" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859eb650cfee7434994602c3a68b25d77ad9e68c8a6cd491616ef86661382eb3" +dependencies = [ + "serde", + "stable_deref_trait", +] + [[package]] name = "try-lock" version = "0.2.3" diff --git a/Cargo.toml b/Cargo.toml index 0ec4b780fe13e4..eadfa6d02d16f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -179,6 +179,7 @@ console_error_panic_hook = "0.1.7" console_log = "0.2.2" const_format = "0.2.32" core_affinity = "0.5.10" +cpu-time = "1.0.0" criterion = "0.5.1" criterion-stats = "0.3.0" crossbeam-channel = "0.5.11" @@ -268,6 +269,7 @@ predicates = "2.1" pretty-hex = "0.3.0" prio-graph = "0.2.1" proc-macro2 = "1.0.78" +procfs = "0.16.0" proptest = "1.4" prost = "0.11.9" prost-build = "0.11.9" @@ -288,6 +290,7 @@ reqwest = { version = "0.11.23", default-features = false } rolling-file = "0.2.0" rpassword = "7.3" rustc_version = "0.4" +rustix = "0.38.21" rustls = { version = 
"0.21.10", default-features = false, features = ["quic"] } rustversion = "1.0.14" scopeguard = "1.2.0" @@ -439,6 +442,8 @@ zstd = "0.11.2" # for details, see https://github.com/solana-labs/crossbeam/commit/fd279d707025f0e60951e429bf778b4813d1b6bf crossbeam-epoch = { git = "https://github.com/solana-labs/crossbeam", rev = "fd279d707025f0e60951e429bf778b4813d1b6bf" } +crossbeam-channel = { git = "https://github.com/ryoqun/crossbeam", rev = "438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" } + # We include the following crates as our dependencies above from crates.io: # # * spl-associated-token-account diff --git a/ci/test-bench.sh b/ci/test-bench.sh index aacc82cffbb0a6..1444405bcccf5c 100755 --- a/ci/test-bench.sh +++ b/ci/test-bench.sh @@ -56,6 +56,10 @@ _ $cargoNightly bench --manifest-path gossip/Cargo.toml ${V:+--verbose} \ _ $cargoNightly bench --manifest-path poh/Cargo.toml ${V:+--verbose} \ -- -Z unstable-options --format=json | tee -a "$BENCH_FILE" +# Run scheduler-pool benches +_ $cargoNightly bench --manifest-path scheduler-pool/Cargo.toml ${V:+--verbose} \ + -- -Z unstable-options --format=json | tee -a "$BENCH_FILE" + # Run core benches _ $cargoNightly bench --manifest-path core/Cargo.toml ${V:+--verbose} \ -- -Z unstable-options --format=json | tee -a "$BENCH_FILE" diff --git a/core/src/drop_bank_service.rs b/core/src/drop_bank_service.rs index 0321643d6aab68..f65ae566a08411 100644 --- a/core/src/drop_bank_service.rs +++ b/core/src/drop_bank_service.rs @@ -1,11 +1,8 @@ use { crossbeam_channel::Receiver, solana_measure::measure::Measure, - solana_runtime::bank::Bank, - std::{ - sync::Arc, - thread::{self, Builder, JoinHandle}, - }, + solana_runtime::installed_scheduler_pool::BankWithScheduler, + std::thread::{self, Builder, JoinHandle}, }; pub struct DropBankService { @@ -13,7 +10,7 @@ pub struct DropBankService { } impl DropBankService { - pub fn new(bank_receiver: Receiver>>) -> Self { + pub fn new(bank_receiver: Receiver>) -> Self { let thread_hdl = Builder::new() .name("solDropBankSrvc".to_string()) .spawn(move || { diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index a80a04d47c1573..c8d744b30c202a 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -536,7 +536,7 @@ impl ReplayStage { cluster_slots_update_sender: ClusterSlotsUpdateSender, cost_update_sender: Sender, voting_sender: Sender, - drop_bank_sender: Sender>>, + drop_bank_sender: Sender>, block_metadata_notifier: Option, log_messages_bytes_limit: Option, prioritization_fee_cache: Arc, @@ -1618,7 +1618,7 @@ impl ReplayStage { // Grab the Slot and BankId's of the banks we need to purge, then clear the banks // from BankForks - let (slots_to_purge, removed_banks): (Vec<(Slot, BankId)>, Vec>) = { + let (slots_to_purge, removed_banks): (Vec<(Slot, BankId)>, Vec) = { let mut w_bank_forks = bank_forks.write().unwrap(); slot_descendants .iter() @@ -2275,7 +2275,7 @@ impl ReplayStage { replay_timing: &mut ReplayTiming, voting_sender: &Sender, epoch_slots_frozen_slots: &mut EpochSlotsFrozenSlots, - drop_bank_sender: &Sender>>, + drop_bank_sender: &Sender>, wait_to_vote_slot: Option, ) { if bank.is_empty() { @@ -4093,7 +4093,7 @@ impl ReplayStage { has_new_vote_been_rooted: &mut bool, voted_signatures: &mut Vec, epoch_slots_frozen_slots: &mut EpochSlotsFrozenSlots, - drop_bank_sender: &Sender>>, + drop_bank_sender: &Sender>, ) { bank_forks.read().unwrap().prune_program_cache(new_root); let removed_banks = bank_forks.write().unwrap().set_root( diff --git a/core/src/validator.rs 
b/core/src/validator.rs index 97ef0a01ef87ad..cb8456af27b37d 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -142,8 +142,8 @@ const WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT: u64 = 80; #[derive(Clone, EnumString, EnumVariantNames, Default, IntoStaticStr, Display)] #[strum(serialize_all = "kebab-case")] pub enum BlockVerificationMethod { - #[default] BlockstoreProcessor, + #[default] UnifiedScheduler, } @@ -1434,8 +1434,10 @@ impl Validator { // Used for notifying many nodes in parallel to exit pub fn exit(&mut self) { + info!("exit1"); self.validator_exit.write().unwrap().exit(); + info!("exit2"); // drop all signals in blockstore self.blockstore.drop_signal(); } @@ -1471,24 +1473,29 @@ impl Validator { } pub fn join(self) { - drop(self.bank_forks); + info!("join1"); drop(self.cluster_info); + info!("join2"); self.poh_service.join().expect("poh_service"); drop(self.poh_recorder); + info!("join3"); if let Some(json_rpc_service) = self.json_rpc_service { json_rpc_service.join().expect("rpc_service"); } + info!("join4"); if let Some(pubsub_service) = self.pubsub_service { pubsub_service.join().expect("pubsub_service"); } + info!("join5"); self.rpc_completed_slots_service .join() .expect("rpc_completed_slots_service"); + info!("join6"); if let Some(optimistically_confirmed_bank_tracker) = self.optimistically_confirmed_bank_tracker { @@ -1497,96 +1504,126 @@ impl Validator { .expect("optimistically_confirmed_bank_tracker"); } + info!("join7"); if let Some(transaction_status_service) = self.transaction_status_service { transaction_status_service .join() .expect("transaction_status_service"); } + info!("join8"); if let Some(rewards_recorder_service) = self.rewards_recorder_service { rewards_recorder_service .join() .expect("rewards_recorder_service"); } + info!("join9"); if let Some(cache_block_meta_service) = self.cache_block_meta_service { cache_block_meta_service .join() .expect("cache_block_meta_service"); } + info!("join10"); if let Some(system_monitor_service) = self.system_monitor_service { system_monitor_service .join() .expect("system_monitor_service"); } + info!("join11"); if let Some(sample_performance_service) = self.sample_performance_service { sample_performance_service .join() .expect("sample_performance_service"); } + info!("join12"); if let Some(entry_notifier_service) = self.entry_notifier_service { entry_notifier_service .join() .expect("entry_notifier_service"); } + info!("join13"); if let Some(s) = self.snapshot_packager_service { s.join().expect("snapshot_packager_service"); } + info!("join14"); self.gossip_service.join().expect("gossip_service"); if let Some(repair_quic_endpoint) = &self.repair_quic_endpoint { repair::quic_endpoint::close_quic_endpoint(repair_quic_endpoint); } + info!("join15"); self.serve_repair_service .join() .expect("serve_repair_service"); + info!("join15"); if let Some(repair_quic_endpoint_join_handle) = self.repair_quic_endpoint_join_handle { self.repair_quic_endpoint_runtime .map(|runtime| runtime.block_on(repair_quic_endpoint_join_handle)) .transpose() .unwrap(); }; + info!("join16"); self.stats_reporter_service .join() .expect("stats_reporter_service"); + info!("join17"); self.blockstore_metric_report_service .join() .expect("ledger_metric_report_service"); + info!("join18"); self.accounts_background_service .join() .expect("accounts_background_service"); + info!("join19"); self.accounts_hash_verifier .join() .expect("accounts_hash_verifier"); + info!("join20"); if let Some(turbine_quic_endpoint) = &self.turbine_quic_endpoint { 
solana_turbine::quic_endpoint::close_quic_endpoint(turbine_quic_endpoint); } + info!("join21"); self.tpu.join().expect("tpu"); + info!("join22"); self.tvu.join().expect("tvu"); + info!("join23"); if let Some(turbine_quic_endpoint_join_handle) = self.turbine_quic_endpoint_join_handle { self.turbine_quic_endpoint_runtime .map(|runtime| runtime.block_on(turbine_quic_endpoint_join_handle)) .transpose() .unwrap(); } + info!("join24"); self.completed_data_sets_service .join() .expect("completed_data_sets_service"); + info!("join25"); if let Some(ip_echo_server) = self.ip_echo_server { ip_echo_server.shutdown_background(); } + info!("join26"); if let Some(geyser_plugin_service) = self.geyser_plugin_service { geyser_plugin_service.join().expect("geyser_plugin_service"); } + info!("join27"); self.poh_timing_report_service .join() .expect("poh_timing_report_service"); + info!("join28"); + self.bank_forks.write().unwrap().prepare_to_drop(); + let sc = Arc::strong_count(&self.bank_forks); + if let Some(bank_forks) = Arc::into_inner(self.bank_forks) { + drop::(bank_forks.into_inner().unwrap()); + } else { + warn!("seems bankforks are leaking...{}:", sc); + } } } diff --git a/ledger-tool/src/ledger_utils.rs b/ledger-tool/src/ledger_utils.rs index 116b21527ae4d8..fffa823afcd529 100644 --- a/ledger-tool/src/ledger_utils.rs +++ b/ledger-tool/src/ledger_utils.rs @@ -304,6 +304,8 @@ pub fn load_and_process_ledger( } } BlockVerificationMethod::UnifiedScheduler => { + let unified_scheduler_handler_threads = + value_t!(arg_matches, "unified_scheduler_handler_threads", usize).ok(); let no_transaction_status_sender = None; let no_replay_vote_sender = None; let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); diff --git a/ledger-tool/src/main.rs b/ledger-tool/src/main.rs index 9b299cfadcbcf2..18ef7460bcdc88 100644 --- a/ledger-tool/src/main.rs +++ b/ledger-tool/src/main.rs @@ -1654,6 +1654,8 @@ fn main() { } exit_signal.store(true, Ordering::Relaxed); system_monitor_service.join().unwrap(); + bank_forks.write().unwrap().prepare_to_drop(); + drop::(Arc::into_inner(bank_forks).unwrap().into_inner().unwrap()); } ("graph", Some(arg_matches)) => { let output_file = value_t_or_exit!(arg_matches, "graph_filename", String); diff --git a/ledger/src/blockstore_processor.rs b/ledger/src/blockstore_processor.rs index 63edb23e01cc18..d07e6f21fe79da 100644 --- a/ledger/src/blockstore_processor.rs +++ b/ledger/src/blockstore_processor.rs @@ -338,8 +338,7 @@ fn process_batches( // scheduling always succeeds here without being blocked on actual transaction executions. // The transaction execution errors will be collected via the blocking fn called // BankWithScheduler::wait_for_completed_scheduler(), if any. 
- schedule_batches_for_execution(bank, batches); - Ok(()) + schedule_batches_for_execution(bank, batches) } else { debug!( "process_batches()/rebatch_and_execute_batches({} batches)", @@ -360,7 +359,7 @@ fn process_batches( fn schedule_batches_for_execution( bank: &BankWithScheduler, batches: &[TransactionBatchWithIndexes], -) { +) -> Result<()> { for TransactionBatchWithIndexes { batch, transaction_indexes, @@ -371,8 +370,9 @@ fn schedule_batches_for_execution( .sanitized_transactions() .iter() .zip(transaction_indexes.iter()), - ); + )?; } + Ok(()) } fn rebatch_transactions<'a>( @@ -440,9 +440,7 @@ fn rebatch_and_execute_batches( { let mut cost_tracker = bank.write_cost_tracker().unwrap(); for tx_cost in &tx_costs { - cost_tracker - .try_add(tx_cost) - .map_err(TransactionError::from)?; + cost_tracker.try_add(tx_cost)?; } } @@ -1960,6 +1958,7 @@ pub mod tests { instruction::{Instruction, InstructionError}, native_token::LAMPORTS_PER_SOL, pubkey::Pubkey, + scheduling::SchedulingMode, signature::{Keypair, Signer}, system_instruction::SystemError, system_transaction, @@ -4544,7 +4543,7 @@ pub mod tests { .. } = create_genesis_config_with_leader(500, &dummy_leader_pubkey, 100); let bank = Arc::new(Bank::new_for_tests(&genesis_config)); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let txs = create_test_transactions(&mint_keypair, &genesis_config.hash()); @@ -4559,7 +4558,7 @@ pub mod tests { mocked_scheduler .expect_schedule_execution() .times(txs.len()) - .returning(|_| ()); + .returning(|_| Ok(())); mocked_scheduler .expect_wait_for_termination() .with(mockall::predicate::eq(true)) diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs index 6f7de16df296b1..9a9cc9c4b093aa 100644 --- a/local-cluster/tests/local_cluster.rs +++ b/local-cluster/tests/local_cluster.rs @@ -4,7 +4,7 @@ use { crossbeam_channel::{unbounded, Receiver}, gag::BufferRedirect, log::*, - rand::seq::IteratorRandom, + rand::seq::SliceRandom, serial_test::serial, solana_accounts_db::{ hardened_unpack::open_genesis_config, utils::create_accounts_run_and_snapshot_dirs, @@ -5499,12 +5499,14 @@ fn test_randomly_mixed_block_verification_methods_between_bootstrap_and_not() { ); // Randomly switch to use unified scheduler - config - .validator_configs - .iter_mut() - .choose(&mut rand::thread_rng()) - .unwrap() - .block_verification_method = BlockVerificationMethod::UnifiedScheduler; + let mut methods = [ + BlockVerificationMethod::UnifiedScheduler, + BlockVerificationMethod::BlockstoreProcessor, + ]; + methods.shuffle(&mut rand::thread_rng()); + for (validator_config, method) in config.validator_configs.iter_mut().zip(methods) { + validator_config.block_verification_method = method; + } let local = LocalCluster::new(&mut config, SocketAddrSpace::Unspecified); cluster_tests::spend_and_verify_all_nodes( diff --git a/metrics/src/datapoint.rs b/metrics/src/datapoint.rs index e2740ce3aecc47..8a13a112da0636 100644 --- a/metrics/src/datapoint.rs +++ b/metrics/src/datapoint.rs @@ -60,6 +60,15 @@ impl DataPoint { } } + pub fn at(timestamp: SystemTime, name: &'static str) -> Self { + DataPoint { + name, + timestamp, + tags: vec![], + fields: vec![], + } + } + pub fn add_tag(&mut self, name: &'static str, value: &str) -> &mut Self { self.tags.push((name, value.to_string())); self @@ -160,6 +169,56 @@ macro_rules! create_datapoint { }; } +#[macro_export] +macro_rules! 
create_datapoint_at { + (@field $point:ident $name:expr, $string:expr, String) => { + $point.add_field_str($name, &$string); + }; + (@field $point:ident $name:expr, $value:expr, i64) => { + $point.add_field_i64($name, $value as i64); + }; + (@field $point:ident $name:expr, $value:expr, f64) => { + $point.add_field_f64($name, $value as f64); + }; + (@field $point:ident $name:expr, $value:expr, bool) => { + $point.add_field_bool($name, $value as bool); + }; + (@tag $point:ident $tag_name:expr, $tag_value:expr) => { + $point.add_tag($tag_name, &$tag_value); + }; + + (@fields $point:ident) => {}; + + // process tags + (@fields $point:ident $tag_name:expr => $tag_value:expr, $($rest:tt)*) => { + $crate::create_datapoint!(@tag $point $tag_name, $tag_value); + $crate::create_datapoint!(@fields $point $($rest)*); + }; + (@fields $point:ident $tag_name:expr => $tag_value:expr) => { + $crate::create_datapoint!(@tag $point $tag_name, $tag_value); + }; + + // process fields + (@fields $point:ident ($name:expr, $value:expr, $type:ident) , $($rest:tt)*) => { + $crate::create_datapoint!(@field $point $name, $value, $type); + $crate::create_datapoint!(@fields $point $($rest)*); + }; + (@fields $point:ident ($name:expr, $value:expr, $type:ident)) => { + $crate::create_datapoint!(@field $point $name, $value, $type); + }; + + (@point $name:expr, $at:expr, $($fields:tt)+) => { + { + let mut point = $crate::datapoint::DataPoint::at($at, &$name); + $crate::create_datapoint!(@fields point $($fields)+); + point + } + }; + (@point $name:expr, $at:expr) => { + $crate::datapoint::DataPoint::at($at, &$name) + }; +} + #[macro_export] macro_rules! datapoint { ($level:expr, $name:expr, $($fields:tt)+) => { @@ -168,6 +227,21 @@ macro_rules! datapoint { } }; } + +#[macro_export] +macro_rules! datapoint_at { + ($level:expr, $at:expr, $name:expr) => { + if log::log_enabled!($level) { + $crate::submit($crate::create_datapoint_at!(@point $name, $at), $level); + } + }; + ($level:expr, $at:expr, $name:expr, $($fields:tt)+) => { + if log::log_enabled!($level) { + $crate::submit($crate::create_datapoint_at!(@point $name, $at, $($fields)+), $level); + } + }; +} + #[macro_export] macro_rules! datapoint_error { ($name:expr, $($fields:tt)+) => { @@ -189,6 +263,16 @@ macro_rules! datapoint_info { }; } +#[macro_export] +macro_rules! datapoint_info_at { + ($at:expr, $name:expr) => { + $crate::datapoint_at!(log::Level::Info, $at, $name); + }; + ($at:expr, $name:expr, $($fields:tt)+) => { + $crate::datapoint_at!(log::Level::Info, $at, $name, $($fields)+); + }; +} + #[macro_export] macro_rules! 
datapoint_debug { ($name:expr, $($fields:tt)+) => { diff --git a/metrics/src/metrics.rs b/metrics/src/metrics.rs index b989ada6861fd1..ec847059bccd07 100644 --- a/metrics/src/metrics.rs +++ b/metrics/src/metrics.rs @@ -181,7 +181,7 @@ impl Default for MetricsAgent { Self::new( Arc::new(InfluxDbMetricsWriter::new()), - Duration::from_secs(10), + Duration::from_secs(1), max_points_per_sec, ) } diff --git a/program-runtime/src/loaded_programs.rs b/program-runtime/src/loaded_programs.rs index 6da84b0d1f0692..2845db21118584 100644 --- a/program-runtime/src/loaded_programs.rs +++ b/program-runtime/src/loaded_programs.rs @@ -673,6 +673,10 @@ impl LoadedProgramsForTxBatch { self.replenish(*key, entry.clone()); }) } + + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } } pub enum LoadedProgramMatchCriteria { @@ -700,6 +704,10 @@ impl LoadedPrograms { self.fork_graph = Some(fork_graph); } + pub fn unset_fork_graph(&mut self) { + self.fork_graph = None; + } + /// Returns the current environments depending on the given epoch pub fn get_environments_for_epoch(&self, epoch: Epoch) -> &ProgramRuntimeEnvironments { if epoch != self.latest_root_epoch { diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index 1b8d422d42ba7c..264494740abb94 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -1032,7 +1032,7 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c278839b831783b70278b14df4d45e1beb1aad306c07bb796637de9a0e323e8e" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1108,6 +1108,16 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "cpu-time" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" +dependencies = [ + "libc", + "winapi 0.3.9", +] + [[package]] name = "cpufeatures" version = "0.2.7" @@ -1129,10 +1139,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "176dc175b78f56c0f321911d9c8eb2b77a78a4860b9c19db83835fea1a46649b" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" dependencies = [ - "crossbeam-utils", + "crossbeam-utils 0.8.19", ] [[package]] @@ -1143,7 +1152,7 @@ checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -1153,7 +1162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils", + "crossbeam-utils 0.8.18", "lazy_static", "memoffset 0.6.4", "scopeguard", @@ -1168,6 +1177,11 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "git+https://github.com/ryoqun/crossbeam?rev=438ec7cdaf6c6a8f593e50344c725fef8a13c7a5#438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" + [[package]] name = "crunchy" version = "0.2.2" @@ -1286,6 +1300,15 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eb30d70a07a3b04884d2677f06bec33509dc67ca60d92949e5535352d3191dc" +dependencies = [ + "powerfmt", +] + [[package]] name = 
"derivation-path" version = "0.2.0" @@ -2031,6 +2054,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "histogram" version = "0.6.9" @@ -3092,15 +3121,6 @@ dependencies = [ "syn 2.0.50", ] -[[package]] -name = "num_threads" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba1801fb138d8e85e11d0fc70baf4fe1cdfffda7c6cd34a854905df588e5ed0" -dependencies = [ - "libc", -] - [[package]] name = "number_prefix" version = "0.4.0" @@ -3485,6 +3505,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc59d1bcc64fc5d021d67521f818db868368028108d37f0e98d74e33f68297b5" +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.8" @@ -3605,6 +3631,32 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "flate2", + "hex", + "lazy_static", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.2", + "chrono", + "hex", +] + [[package]] name = "prost" version = "0.11.9" @@ -3842,7 +3894,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", - "crossbeam-utils", + "crossbeam-utils 0.8.18", ] [[package]] @@ -4295,8 +4347,13 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" dependencies = [ + "base64 0.13.1", + "chrono", + "hex", "serde", + "serde_json", "serde_with_macros", + "time", ] [[package]] @@ -6307,6 +6364,7 @@ dependencies = [ "itertools", "log", "percentage", + "rand 0.8.5", "rustc_version", "solana-bpf-loader-program", "solana-frozen-abi", @@ -6471,7 +6529,10 @@ dependencies = [ name = "solana-unified-scheduler-logic" version = "1.19.0" dependencies = [ + "assert_matches", + "qualifier_attr", "solana-sdk", + "static_assertions", ] [[package]] @@ -6479,10 +6540,18 @@ name = "solana-unified-scheduler-pool" version = "1.19.0" dependencies = [ "assert_matches", + "cpu-time", "crossbeam-channel", + "dashmap", "derivative", "log", + "procfs", + "qualifier_attr", + "rustix", + "serde_json", "solana-ledger", + "solana-measure", + "solana-metrics", "solana-program-runtime", "solana-runtime", "solana-sdk", @@ -7235,21 +7304,32 @@ dependencies = [ [[package]] name = "time" -version = "0.3.9" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2702e08a7a860f005826c6815dcac101b19b5eb330c27fe4a5928fec1d20ddd" +checksum = "f657ba42c3f86e7680e53c8cd3af8abbe56b5491790b46e22e19c0d57463583e" 
dependencies = [ + "deranged", "itoa", - "libc", - "num_threads", + "powerfmt", + "serde", + "time-core", "time-macros", ] +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + [[package]] name = "time-macros" -version = "0.2.4" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42657b1a6f4d817cda8e7a0ace261fe0cc946cf3a80314390b22cc61ae080792" +checksum = "26197e33420244aeb70c3e8c78376ca46571bc4e701e4791c2cd9f57dcb3a43f" +dependencies = [ + "time-core", +] [[package]] name = "tiny-bip39" diff --git a/programs/sbf/Cargo.toml b/programs/sbf/Cargo.toml index 8a99a0f005471a..c4ae2f6c4bec08 100644 --- a/programs/sbf/Cargo.toml +++ b/programs/sbf/Cargo.toml @@ -169,6 +169,8 @@ members = [ targets = ["x86_64-unknown-linux-gnu"] [patch.crates-io] +crossbeam-channel = { git = "https://github.com/ryoqun/crossbeam", rev = "438ec7cdaf6c6a8f593e50344c725fef8a13c7a5" } + # We include the following crates as our dependencies from crates.io: # # * spl-associated-token-account diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index 7e051019c99871..d5ca601c8c78ce 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -4931,7 +4931,7 @@ impl Bank { programs_modified_by_tx, } = execution_result { - if details.status.is_ok() { + if details.status.is_ok() && !programs_modified_by_tx.is_empty() { let mut cache = self.loaded_programs_cache.write().unwrap(); cache.merge(programs_modified_by_tx); } diff --git a/runtime/src/bank_forks.rs b/runtime/src/bank_forks.rs index 668062c8d31cce..c9185967eef2e0 100644 --- a/runtime/src/bank_forks.rs +++ b/runtime/src/bank_forks.rs @@ -5,7 +5,8 @@ use { accounts_background_service::{AbsRequestSender, SnapshotRequest, SnapshotRequestKind}, bank::{epoch_accounts_hash_utils, Bank, SquashTiming}, installed_scheduler_pool::{ - BankWithScheduler, InstalledSchedulerPoolArc, SchedulingContext, + BankWithScheduler, DefaultScheduleExecutionArg, InstalledSchedulerPoolArc, + SchedulingContext, }, snapshot_config::SnapshotConfig, }, @@ -15,6 +16,7 @@ use { solana_sdk::{ clock::{Epoch, Slot}, hash::Hash, + scheduling::SchedulingMode, timing, }, std::{ @@ -73,7 +75,13 @@ pub struct BankForks { last_accounts_hash_slot: Slot, in_vote_only_mode: Arc, highest_slot_at_startup: Slot, - scheduler_pool: Option, + scheduler_pool: Option>, +} + +impl Drop for BankForks { + fn drop(&mut self) { + info!("BankForks::drop(): successfully dropped"); + } } impl Index for BankForks { @@ -212,7 +220,10 @@ impl BankForks { self[self.root()].clone() } - pub fn install_scheduler_pool(&mut self, pool: InstalledSchedulerPoolArc) { + pub fn install_scheduler_pool( + &mut self, + pool: InstalledSchedulerPoolArc, + ) { info!("Installed new scheduler_pool into bank_forks: {:?}", pool); assert!( self.scheduler_pool.replace(pool).is_none(), @@ -220,6 +231,26 @@ impl BankForks { ); } + pub fn uninstall_scheduler_pool(&mut self) { + // hint scheduler pool to cut circular references of Arc + if let Some(sp) = self.scheduler_pool.take() { + sp.uninstalled_from_bank_forks(); + } + } + + pub fn prepare_to_drop(&mut self) { + let root_bank = self.root_bank(); + // drop all non root BankWithScheduler, which causes all schedulers wind down. + self.banks.clear(); + self.uninstall_scheduler_pool(); + // this cuts circular references of BankForks... 
+ root_bank + .loaded_programs_cache + .write() + .unwrap() + .unset_fork_graph(); + } + pub fn insert(&mut self, mut bank: Bank) -> BankWithScheduler { if self.root.load(Ordering::Relaxed) < self.highest_slot_at_startup { bank.check_program_modification_slot(); @@ -227,7 +258,7 @@ impl BankForks { let bank = Arc::new(bank); let bank = if let Some(scheduler_pool) = &self.scheduler_pool { - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let scheduler = scheduler_pool.take_scheduler(context); BankWithScheduler::new(bank, Some(scheduler)) } else { @@ -248,7 +279,7 @@ impl BankForks { self.insert(bank) } - pub fn remove(&mut self, slot: Slot) -> Option> { + pub fn remove(&mut self, slot: Slot) -> Option { let bank = self.banks.remove(&slot)?; for parent in bank.proper_ancestors() { let Entry::Occupied(mut entry) = self.descendants.entry(parent) else { @@ -265,7 +296,7 @@ impl BankForks { if entry.get().is_empty() { entry.remove_entry(); } - Some(bank.clone_without_scheduler()) + Some(bank) } pub fn highest_slot(&self) -> Slot { @@ -285,7 +316,7 @@ impl BankForks { root: Slot, accounts_background_request_sender: &AbsRequestSender, highest_super_majority_root: Option, - ) -> (Vec>, SetRootMetrics) { + ) -> (Vec, SetRootMetrics) { let old_epoch = self.root_bank().epoch(); // To support `RootBankCache` (via `ReadOnlyAtomicSlot`) accessing `root` *without* locking // BankForks first *and* from a different thread, this store *must* be at least Release to @@ -464,7 +495,7 @@ impl BankForks { root: Slot, accounts_background_request_sender: &AbsRequestSender, highest_super_majority_root: Option, - ) -> Vec> { + ) -> Vec { let program_cache_prune_start = Instant::now(); let set_root_start = Instant::now(); let (removed_banks, set_root_metrics) = self.do_set_root_return_metrics( @@ -625,7 +656,7 @@ impl BankForks { &mut self, root: Slot, highest_super_majority_root: Option, - ) -> (Vec>, u64, u64) { + ) -> (Vec, u64, u64) { // Clippy doesn't like separating the two collects below, // but we want to collect timing separately, and the 2nd requires // a unique borrow to self which is already borrowed by self.banks diff --git a/runtime/src/installed_scheduler_pool.rs b/runtime/src/installed_scheduler_pool.rs index d39a18d567232a..ae7c410397d063 100644 --- a/runtime/src/installed_scheduler_pool.rs +++ b/runtime/src/installed_scheduler_pool.rs @@ -25,11 +25,13 @@ use { log::*, solana_program_runtime::timings::ExecuteTimings, solana_sdk::{ + clock::Slot, hash::Hash, - slot_history::Slot, + scheduling::{SchedulingMode, WithSchedulingMode}, transaction::{Result, SanitizedTransaction}, }, std::{ + borrow::Borrow, fmt::Debug, ops::Deref, sync::{Arc, RwLock}, @@ -38,8 +40,9 @@ use { #[cfg(feature = "dev-context-only-utils")] use {mockall::automock, qualifier_attr::qualifiers}; -pub trait InstalledSchedulerPool: Send + Sync + Debug { - fn take_scheduler(&self, context: SchedulingContext) -> InstalledSchedulerBox; +pub trait InstalledSchedulerPool: Send + Sync + Debug { + fn take_scheduler(&self, context: SchedulingContext) -> Box>; + fn uninstalled_from_bank_forks(self: Arc); } #[cfg_attr(doc, aquamarine::aquamarine)] @@ -97,15 +100,15 @@ pub trait InstalledSchedulerPool: Send + Sync + Debug { feature = "dev-context-only-utils", allow(unused_attributes, clippy::needless_lifetimes) )] -pub trait InstalledScheduler: Send + Sync + Debug + 'static { +pub trait InstalledScheduler: Send + Sync + Debug + 'static { fn id(&self) -> 
SchedulerId; fn context(&self) -> &SchedulingContext; // Calling this is illegal as soon as wait_for_termination is called. fn schedule_execution<'a>( &'a self, - transaction_with_index: &'a (&'a SanitizedTransaction, usize), - ); + transaction_with_index: SEA::TransactionWithIndex<'a>, + ) -> Result<()>; /// Wait for a scheduler to terminate after processing. /// @@ -135,13 +138,47 @@ pub trait UninstalledScheduler: Send + Sync + Debug + 'static { fn return_to_pool(self: Box); } -pub type InstalledSchedulerBox = Box; +pub type InstalledSchedulerBox = Box>; pub type UninstalledSchedulerBox = Box; -pub type InstalledSchedulerPoolArc = Arc; +pub type InstalledSchedulerPoolArc = Arc>; pub type SchedulerId = u64; +pub trait WithTransactionAndIndex: Send + Sync + Debug { + fn with_transaction_and_index( + &self, + callback: impl FnOnce(&SanitizedTransaction, usize) -> R, + ) -> R; +} + +impl< + T: Send + Sync + Debug + Borrow, + U: Send + Sync + Debug + Borrow, + Z: Send + Sync + Debug + Deref, + > WithTransactionAndIndex for Z +{ + fn with_transaction_and_index( + &self, + callback: impl FnOnce(&SanitizedTransaction, usize) -> R, + ) -> R { + callback(self.0.borrow(), *self.1.borrow()) + } +} + +pub trait ScheduleExecutionArg: Send + Sync + Debug + 'static { + // GAT is used to make schedule_execution parametric even supporting references + // under the object-safety req. of InstalledScheduler trait... + type TransactionWithIndex<'tx>: WithTransactionAndIndex; +} + +#[derive(Debug, Default, Clone)] +pub struct DefaultScheduleExecutionArg; + +impl ScheduleExecutionArg for DefaultScheduleExecutionArg { + type TransactionWithIndex<'tx> = &'tx (&'tx SanitizedTransaction, usize); +} + /// A small context to propagate a bank and its scheduling mode to the scheduler subsystem. /// /// Note that this isn't called `SchedulerContext` because the contexts aren't associated with @@ -153,13 +190,19 @@ pub type SchedulerId = u64; /// `SchedulingContext`s. #[derive(Clone, Debug)] pub struct SchedulingContext { - // mode: SchedulingMode, // this will be added later. + mode: SchedulingMode, bank: Arc, } +impl WithSchedulingMode for SchedulingContext { + fn mode(&self) -> SchedulingMode { + self.mode + } +} + impl SchedulingContext { - pub fn new(bank: Arc) -> Self { - Self { bank } + pub fn new(mode: SchedulingMode, bank: Arc) -> Self { + Self { mode, bank } } pub fn bank(&self) -> &Arc { @@ -246,9 +289,14 @@ impl BankWithScheduler { pub(crate) fn new(bank: Arc, scheduler: Option) -> Self { if let Some(bank_in_context) = scheduler .as_ref() - .map(|scheduler| scheduler.context().bank()) + .map(|scheduler| scheduler.context().bank().clone()) { - assert!(Arc::ptr_eq(&bank, bank_in_context)); + assert!( + Arc::ptr_eq(&bank, &bank_in_context), + "different bank!? {} {}", + bank.slot(), + bank_in_context.slot() + ); } Self { @@ -290,7 +338,7 @@ impl BankWithScheduler { pub fn schedule_transaction_executions<'a>( &self, transactions_with_indexes: impl ExactSizeIterator, - ) { + ) -> Result<()> { trace!( "schedule_transaction_executions(): {} txs", transactions_with_indexes.len() @@ -300,8 +348,10 @@ impl BankWithScheduler { let scheduler = scheduler_guard.as_ref().unwrap(); for (sanitized_transaction, &index) in transactions_with_indexes { - scheduler.schedule_execution(&(sanitized_transaction, index)); + scheduler.schedule_execution(&(sanitized_transaction, index))?; } + + Ok(()) } // take needless &mut only to communicate its semantic mutability to humans... 
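Note on the hunk above: the `ScheduleExecutionArg` trait uses a generic associated type so that `InstalledScheduler` stays object-safe (it is still handled as `Box<dyn ...>` / `Arc<dyn ...>` through the type aliases) while `schedule_execution` can accept a borrowed `(&SanitizedTransaction, usize)` pair via the `WithTransactionAndIndex` blanket impl, and its new `Result<()>` return value is what lets `schedule_transaction_executions` propagate scheduling failures with `?`. Keeping the type parameter on the trait rather than on the method is what preserves object safety, since a generic type parameter on the method would rule out `dyn InstalledScheduler`. The self-contained Rust sketch below only models that pattern and is not part of the patch: `Tx`, `ScheduleResult`, and `LoggingScheduler` are stand-ins invented for illustration (the real code uses `SanitizedTransaction` and `solana_sdk::transaction::Result`), and the traits are trimmed to the single method discussed here.

// --- illustrative sketch, not part of the patch ---
use std::{borrow::Borrow, fmt::Debug, ops::Deref};

/// Stand-in for solana_sdk::transaction::SanitizedTransaction in this sketch.
#[derive(Debug)]
struct Tx(&'static str);

/// Stand-in for solana_sdk::transaction::Result<()>.
type ScheduleResult = Result<(), String>;

/// Mirrors WithTransactionAndIndex: uniform access to the transaction and its
/// index regardless of how the pair is packaged.
trait WithTransactionAndIndex: Send + Sync + Debug {
    fn with_transaction_and_index<R>(&self, callback: impl FnOnce(&Tx, usize) -> R) -> R;
}

/// Blanket impl for anything that derefs to a (Borrow<Tx>, Borrow<usize>) pair,
/// e.g. `&(&Tx, usize)`, the shape used by DefaultScheduleExecutionArg.
impl<
        T: Send + Sync + Debug + Borrow<Tx>,
        U: Send + Sync + Debug + Borrow<usize>,
        Z: Send + Sync + Debug + Deref<Target = (T, U)>,
    > WithTransactionAndIndex for Z
{
    fn with_transaction_and_index<R>(&self, callback: impl FnOnce(&Tx, usize) -> R) -> R {
        callback(self.0.borrow(), *self.1.borrow())
    }
}

/// Mirrors ScheduleExecutionArg: the generic associated type lets the argument
/// carry borrowed data while the scheduler trait itself stays object-safe.
trait ScheduleExecutionArg: Send + Sync + Debug + 'static {
    type TransactionWithIndex<'tx>: WithTransactionAndIndex;
}

#[derive(Debug)]
struct DefaultScheduleExecutionArg;

impl ScheduleExecutionArg for DefaultScheduleExecutionArg {
    type TransactionWithIndex<'tx> = &'tx (&'tx Tx, usize);
}

/// Trimmed-down InstalledScheduler: schedule_execution returns a Result so
/// callers can propagate scheduling failures instead of silently dropping them.
trait InstalledScheduler<SEA: ScheduleExecutionArg>: Send + Sync + Debug {
    fn schedule_execution<'a>(
        &'a self,
        transaction_with_index: SEA::TransactionWithIndex<'a>,
    ) -> ScheduleResult;
}

/// Hypothetical scheduler used only to exercise the plumbing.
#[derive(Debug)]
struct LoggingScheduler;

impl InstalledScheduler<DefaultScheduleExecutionArg> for LoggingScheduler {
    // The GAT resolves to `&(&Tx, usize)` for DefaultScheduleExecutionArg.
    fn schedule_execution<'a>(
        &'a self,
        transaction_with_index: &'a (&'a Tx, usize),
    ) -> ScheduleResult {
        transaction_with_index.with_transaction_and_index(|tx, index| {
            println!("scheduling {tx:?} at index {index}");
            Ok(())
        })
    }
}

fn main() -> ScheduleResult {
    // The scheduler is held behind `Box<dyn ...>`, as BankWithScheduler does.
    let scheduler: Box<dyn InstalledScheduler<DefaultScheduleExecutionArg>> =
        Box::new(LoggingScheduler);
    let tx = Tx("transfer");
    // Pass a borrowed pair and bubble errors up with `?`, mirroring the new
    // schedule_transaction_executions() -> Result<()> in the hunk above.
    scheduler.schedule_execution(&(&tx, 0))?;
    Ok(())
}
// --- end of sketch ---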
@@ -359,22 +409,23 @@ impl BankWithSchedulerInner { ); let mut scheduler = scheduler.write().unwrap(); - let result_with_timings = + let (was_noop, result_with_timings) = if let Some(scheduler) = scheduler.as_mut().filter(|_| reason.is_paused()) { scheduler.pause_for_recent_blockhash(); - None + (false, None) } else if let Some(scheduler) = scheduler.take() { let (result_with_timings, uninstalled_scheduler) = scheduler.wait_for_termination(reason.is_dropped()); uninstalled_scheduler.return_to_pool(); - Some(result_with_timings) + (false, Some(result_with_timings)) } else { - None + (true, None) }; debug!( - "wait_for_scheduler_termination(slot: {}, reason: {:?}): finished with: {:?}...", + "wait_for_scheduler_termination(slot: {}, reason: {:?}): was_noop: {:?} finished with: {:?}...", bank.slot(), reason, + was_noop, result_with_timings.as_ref().map(|(result, _)| result), ); @@ -435,7 +486,7 @@ mod tests { fn setup_mocked_scheduler_with_extra( bank: Arc, is_dropped_flags: impl Iterator, - f: Option, + f: Option)>, ) -> InstalledSchedulerBox { let mut mock = MockInstalledScheduler::new(); let seq = Arc::new(Mutex::new(Sequence::new())); @@ -443,7 +494,10 @@ mod tests { mock.expect_context() .times(1) .in_sequence(&mut seq.lock().unwrap()) - .return_const(SchedulingContext::new(bank)); + .return_const(SchedulingContext::new( + SchedulingMode::BlockVerification, + bank, + )); for wait_reason in is_dropped_flags { let seq_cloned = seq.clone(); @@ -479,7 +533,7 @@ mod tests { setup_mocked_scheduler_with_extra( bank, is_dropped_flags, - None:: ()>, + None::) -> ()>, ) } @@ -535,12 +589,14 @@ mod tests { Some(setup_mocked_scheduler_with_extra( bank, [false].into_iter(), - Some(|mocked: &mut MockInstalledScheduler| { - mocked - .expect_pause_for_recent_blockhash() - .times(1) - .returning(|| ()); - }), + Some( + |mocked: &mut MockInstalledScheduler| { + mocked + .expect_pause_for_recent_blockhash() + .times(1) + .returning(|| ()); + }, + ), )), ); goto_end_of_slot_with_scheduler(&bank); @@ -566,15 +622,20 @@ mod tests { let mocked_scheduler = setup_mocked_scheduler_with_extra( bank.clone(), [true].into_iter(), - Some(|mocked: &mut MockInstalledScheduler| { - mocked - .expect_schedule_execution() - .times(1) - .returning(|(_, _)| ()); - }), + Some( + |mocked: &mut MockInstalledScheduler| { + mocked + .expect_schedule_execution() + .times(1) + .returning(|(_, _)| Ok(())); + }, + ), ); let bank = BankWithScheduler::new(bank, Some(mocked_scheduler)); - bank.schedule_transaction_executions([(&tx0, &0)].into_iter()); + assert_matches!( + bank.schedule_transaction_executions([(&tx0, &0)].into_iter()), + Ok(()) + ); } } diff --git a/sdk/Cargo.toml b/sdk/Cargo.toml index 57bf0738fa41eb..bc7c40a68080bb 100644 --- a/sdk/Cargo.toml +++ b/sdk/Cargo.toml @@ -72,7 +72,7 @@ serde = { workspace = true } serde_bytes = { workspace = true } serde_derive = { workspace = true } serde_json = { workspace = true, optional = true } -serde_with = { workspace = true, features = ["macros"] } +serde_with = { workspace = true, features = ["macros", "alloc"] } sha2 = { workspace = true } sha3 = { workspace = true, optional = true } siphasher = { workspace = true } diff --git a/sdk/src/lib.rs b/sdk/src/lib.rs index 7c6b643884e449..52eb4fd0e94841 100644 --- a/sdk/src/lib.rs +++ b/sdk/src/lib.rs @@ -98,6 +98,7 @@ pub mod rent_debits; pub mod reward_info; pub mod reward_type; pub mod rpc_port; +pub mod scheduling; pub mod secp256k1_instruction; pub mod shred_version; pub mod signature; diff --git a/sdk/src/scheduling.rs 
b/sdk/src/scheduling.rs new file mode 100644 index 00000000000000..aa39f7a8b08e8d --- /dev/null +++ b/sdk/src/scheduling.rs @@ -0,0 +1,11 @@ +//! Primitive types relevant to transaction scheduling +#![cfg(feature = "full")] + +#[derive(Debug, Clone, Copy)] +pub enum SchedulingMode { + BlockVerification, +} + +pub trait WithSchedulingMode { + fn mode(&self) -> SchedulingMode; +} diff --git a/sdk/src/transaction/sanitized.rs b/sdk/src/transaction/sanitized.rs index 4189f1b64b86e2..a735e0c8170dbd 100644 --- a/sdk/src/transaction/sanitized.rs +++ b/sdk/src/transaction/sanitized.rs @@ -36,11 +36,17 @@ pub struct SanitizedTransaction { } /// Set of accounts that must be locked for safe transaction processing -#[derive(Debug, Clone, Default, Eq, PartialEq)] +use serde_with::serde_as; +use serde_with::DisplayFromStr; + +#[serde_as] +#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize)] pub struct TransactionAccountLocks<'a> { /// List of readonly account key locks + #[serde_as(as = "Vec")] pub readonly: Vec<&'a Pubkey>, /// List of writable account key locks + #[serde_as(as = "Vec")] pub writable: Vec<&'a Pubkey>, } diff --git a/svm/Cargo.toml b/svm/Cargo.toml index ac672613c9c4fc..ffb3dc69e0bf57 100644 --- a/svm/Cargo.toml +++ b/svm/Cargo.toml @@ -13,6 +13,7 @@ edition = { workspace = true } itertools = { workspace = true } log = { workspace = true } percentage = { workspace = true } +rand = { workspace = true } solana-bpf-loader-program = { workspace = true } solana-frozen-abi = { workspace = true } solana-frozen-abi-macro = { workspace = true } diff --git a/svm/src/transaction_processor.rs b/svm/src/transaction_processor.rs index b58d178df4b963..0394ceed278742 100644 --- a/svm/src/transaction_processor.rs +++ b/svm/src/transaction_processor.rs @@ -288,14 +288,17 @@ impl TransactionBatchProcessor { execution_time.stop(); - const SHRINK_LOADED_PROGRAMS_TO_PERCENTAGE: u8 = 90; - self.loaded_programs_cache - .write() - .unwrap() - .evict_using_2s_random_selection( - Percentage::from(SHRINK_LOADED_PROGRAMS_TO_PERCENTAGE), - self.slot, - ); + use rand::Rng; + if rand::thread_rng().gen_range(0..1000) == 0 { + const SHRINK_LOADED_PROGRAMS_TO_PERCENTAGE: u8 = 90; + self.loaded_programs_cache + .write() + .unwrap() + .evict_using_2s_random_selection( + Percentage::from(SHRINK_LOADED_PROGRAMS_TO_PERCENTAGE), + self.slot, + ); + } debug!( "load: {}us execute: {}us txs_len={}", diff --git a/unified-scheduler-logic/Cargo.toml b/unified-scheduler-logic/Cargo.toml index b2e80c79c7a08f..e1dd176a2bd510 100644 --- a/unified-scheduler-logic/Cargo.toml +++ b/unified-scheduler-logic/Cargo.toml @@ -10,4 +10,26 @@ license = { workspace = true } edition = { workspace = true } [dependencies] +assert_matches = { workspace = true } +qualifier_attr = { workspace = true } solana-sdk = { workspace = true } +static_assertions = { workspace = true } +#[[bench]] +#name = "bench-with-iai-callgrind" +#harness = false + +[dev-dependencies] +# See order-crates-for-publishing.py for using this unusual `path = "."` +solana-unified-scheduler-logic = { path = ".", features = ["dev-context-only-utils"] } +triomphe = { version = "0.1.11" } + +[target."cfg(target_os = \"linux\")".dev-dependencies] +iai-callgrind = { version = "0.10.2", features = [ + "client_requests" +] } + +[target."cfg(not(target_os = \"linux\"))".dev-dependencies] +iai-callgrind = { version = "0.10.2" } + +[features] +dev-context-only-utils = [] diff --git a/unified-scheduler-logic/benches/bench-with-iai-callgrind.rs 
b/unified-scheduler-logic/benches/bench-with-iai-callgrind.rs new file mode 100644 index 00000000000000..b30d290de87855 --- /dev/null +++ b/unified-scheduler-logic/benches/bench-with-iai-callgrind.rs @@ -0,0 +1,668 @@ +#![cfg(feature = "dummy")] +#![allow(clippy::arithmetic_side_effects)] + +#[global_allocator] +static GLOBAL: B = B; + +struct A(T); + +unsafe impl std::marker::Sync for A {} + +static LOCAL_ALLOCATOR: A> = A(std::cell::UnsafeCell::new(BL::new())); + +struct BL { + cursor: *mut u8, + limit: *mut u8, + bytes: [u8; Self::BLOCK_SIZE], +} + +impl BL { + const BLOCK_SIZE: usize = 100_000_000; + + const fn new() -> Self { + Self { + cursor: usize::max_value() as _, + limit: usize::max_value() as _, + bytes: [0; Self::BLOCK_SIZE], + } + } + + #[inline(always)] + pub fn alloc2(&mut self, bytes: usize) -> *mut u8 { + loop { + self.cursor = unsafe { (((self.cursor.sub(bytes)) as usize) & !15) as _ }; + if self.cursor >= self.limit { + return self.cursor; + } else if self.limit == usize::max_value() as _ { + self.limit = self.bytes.as_mut_ptr(); + self.cursor = unsafe { self.limit.add(Self::BLOCK_SIZE) }; + continue; + } else { + panic!("out of memory form BL"); + } + } + } +} + +use std::{ + alloc::{GlobalAlloc, Layout}, + hint::black_box, +}; + +struct B; + +unsafe impl GlobalAlloc for B { + #[inline(always)] + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + (*LOCAL_ALLOCATOR.0.get()).alloc2(layout.size()) + } + + #[inline(always)] + unsafe fn dealloc(&self, _ptr: *mut u8, _layout: Layout) {} +} + +use { + assert_matches::assert_matches, + iai_callgrind::{ + client_requests::callgrind::toggle_collect, library_benchmark, library_benchmark_group, + main, + }, + solana_sdk::{ + instruction::{AccountMeta, Instruction}, + message::Message, + pubkey::Pubkey, + signature::Signer, + signer::keypair::Keypair, + transaction::{SanitizedTransaction, Transaction}, + }, + solana_unified_scheduler_logic::{Page, SchedulingStateMachine}, +}; + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_schedule_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for i in 0..account_count { + if i % 2 == 0 { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } else { + accounts.push(AccountMeta::new_readonly(Keypair::new().pubkey(), true)); + } + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + toggle_collect(); + let task = scheduler.schedule_task(task); + toggle_collect(); + task.unwrap(); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_drop_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = 
Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + + toggle_collect(); + drop(task); + toggle_collect(); +} + +#[library_benchmark] +#[bench::one(1)] +fn bench_insert_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + + let mut b = std::collections::BTreeMap::new(); + toggle_collect(); + b.insert(task.index, task.clone()); + b.insert(task.index + 1, task.clone()); + b.remove(&task.index); + b.remove(&(task.index + 1)); + //b.insert(task.index + 4, task); + toggle_collect(); + drop(b); +} + +#[library_benchmark] +#[bench::arc_new(1)] +#[bench::arc_new_and_clone(2)] +#[bench::rc_new(3)] +#[bench::rc_new_and_clone(4)] +fn bench_arc(account_count: usize) { + toggle_collect(); + + { + let b; + match account_count { + 1 => { + toggle_collect(); + b = black_box(std::sync::Arc::new(black_box(3_u32))); + } + 2 => { + b = black_box(std::sync::Arc::new(black_box(3_u32))); + toggle_collect(); + std::mem::forget(black_box(b.clone())); + } + _ => { + let b; + match account_count { + 3 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + } + 4 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + black_box(b.clone()); + } + _ => panic!(), + } + toggle_collect(); + drop(b); + return; + } + } + toggle_collect(); + drop(b); + } +} + +#[library_benchmark] +#[bench::arc_new(1)] +#[bench::arc_new_and_clone(2)] +#[bench::rc_new(3)] +#[bench::rc_new_and_clone(4)] +fn bench_triomphe_arc(account_count: usize) { + toggle_collect(); + + { + let b; + match account_count { + 1 => { + toggle_collect(); + b = black_box(triomphe::Arc::new(black_box(3_u32))); + } + 2 => { + b = black_box(triomphe::Arc::new(black_box(3_u32))); + toggle_collect(); + std::mem::forget(black_box(b.clone())); + } + _ => { + let b; + match account_count { + 3 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + } + 4 => { + toggle_collect(); + b = black_box(std::rc::Rc::new(black_box(3_u32))); + black_box(b.clone()); + } + _ => panic!(), + } + toggle_collect(); + drop(b); + return; + } + } + toggle_collect(); + drop(b); + } +} + +#[library_benchmark] +#[bench::one(1)] +fn bench_heaviest_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; 
+ let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + + let mut b = std::collections::BTreeMap::new(); + b.insert(task.index, task.clone()); + b.insert(task.index + 1, task.clone()); + b.insert(task.index + 2, task.clone()); + let mut c = std::collections::BTreeMap::new(); + c.insert(task.index + 3, task.clone()); + c.insert(task.index + 4, task.clone()); + c.insert(task.index + 5, task.clone()); + + toggle_collect(); + let d = b.first_key_value(); + let e = c.first_key_value(); + let f = std::cmp::min_by(d, e, |x, y| x.map(|x| x.0).cmp(&y.map(|y| y.0))).map(|x| x.1); + assert_matches!(f.map(|f| f.task_index()), Some(0)); + toggle_collect(); + dbg!(f); + + drop(b); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_schedule_task_conflicting(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + let task2 = task.clone(); + toggle_collect(); + assert_matches!(scheduler.schedule_task(task2), None); + toggle_collect(); + drop(task); +} + +#[library_benchmark] +#[bench::min(3, 0)] +#[bench::one(3, 1)] +#[bench::two(2, 2)] +#[bench::three(3, 3)] +#[bench::normal(3, 32)] +#[bench::large(3, 64)] +#[bench::large2(3, 128)] +#[bench::large3(3, 256)] +#[bench::large4(3, 1024)] +#[bench::large5(3, 2048)] +fn bench_schedule_task_conflicting_hot(account_count: usize, task_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + + let mut pages: std::collections::HashMap = + std::collections::HashMap::new(); + let task = SchedulingStateMachine::create_task(tx0.clone(), 0, &mut |address| { + pages.entry(address).or_default().clone() + }); + scheduler.schedule_task(task).unwrap(); + for i in 1..=task_count { + let task = 
SchedulingStateMachine::create_task(tx0.clone(), i, &mut |address| { + pages.entry(address).or_default().clone() + }); + assert_matches!(scheduler.schedule_task(task), None); + } + + let task = SchedulingStateMachine::create_task(tx0.clone(), task_count + 1, &mut |address| { + pages.entry(address).or_default().clone() + }); + let task2 = task.clone(); + + toggle_collect(); + assert_matches!(scheduler.schedule_task(task2), None); + toggle_collect(); + + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_deschedule_task_conflicting(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + assert_matches!(scheduler.schedule_task(task.clone()), None); + + toggle_collect(); + scheduler.deschedule_task(&task); + toggle_collect(); + + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_schedule_unblocked_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let mut pages: std::collections::HashMap = + std::collections::HashMap::new(); + let task = SchedulingStateMachine::create_task(tx0.clone(), 0, &mut |address| { + pages.entry(address).or_default().clone() + }); + let task2 = SchedulingStateMachine::create_task(tx0, 1, &mut |address| { + pages.entry(address).or_default().clone() + }); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + assert_matches!(scheduler.schedule_task(task2), None); + scheduler.deschedule_task(&task); + toggle_collect(); + let retried_task = scheduler.schedule_unblocked_task(); + toggle_collect(); + let retried_task = retried_task.unwrap(); + assert_eq!(task.transaction(), retried_task.transaction()); + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::small(16)] +#[bench::normal(32)] +#[bench::large(64)] +//#[bench::max(128)] +fn bench_end_to_end_worst(account_count: usize) { + 
toggle_collect(); + let mut accounts = vec![]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let mut pages: std::collections::HashMap = + std::collections::HashMap::new(); + let task = SchedulingStateMachine::create_task(tx0.clone(), 0, &mut |address| { + pages.entry(address).or_default().clone() + }); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + + let task = scheduler.schedule_task(task).unwrap(); + for i in 1..account_count { + let mut accounts = vec![memo_ix.accounts[i].clone()]; + //let mut accounts = vec![AccountMeta::new(Keypair::new().pubkey(), true)]; + for _ in 0..account_count { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let ixs = vec![memo_ix]; + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task2 = SchedulingStateMachine::create_task(tx0, i, &mut |address| { + pages.entry(address).or_default().clone() + }); + toggle_collect(); + let scheduled_task = scheduler.schedule_task(task2.clone()); + toggle_collect(); + drop(scheduled_task); + } + + toggle_collect(); + scheduler.deschedule_task(&task); + if let Some(_cc) = account_count.checked_sub(1) { + //assert_eq!(scheduler.unblocked_task_count(), cc); + //let mut c = 0; + while let Some(retried_task) = scheduler.schedule_unblocked_task() { + //c += 1; + //scheduler.deschedule_task(&retried_task); + toggle_collect(); + drop::(retried_task); + toggle_collect(); + } + //assert_eq!(c, cc); + } + toggle_collect(); + + //assert_eq!(task2.task_index(), retried_task.task_index()); + drop(task); +} + +#[library_benchmark] +#[bench::min(0)] +#[bench::one(1)] +#[bench::two(2)] +#[bench::three(3)] +#[bench::normal(32)] +#[bench::large(64)] +#[bench::max(128)] +fn bench_deschedule_task(account_count: usize) { + toggle_collect(); + let mut accounts = vec![]; + for i in 0..account_count { + if i % 2 == 0 { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } else { + accounts.push(AccountMeta::new_readonly(Keypair::new().pubkey(), true)); + } + } + + let payer = Keypair::new(); + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //panic!("{:?}", txn); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + let task = SchedulingStateMachine::create_task(tx0, 0, &mut |_| Page::default()); + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + let task = scheduler.schedule_task(task).unwrap(); + toggle_collect(); + 
scheduler.deschedule_task(&task); + toggle_collect(); + drop(task); +} + +library_benchmark_group!( + name = bench_scheduling_state_machine; + benchmarks = bench_end_to_end_worst, bench_arc, bench_triomphe_arc, bench_drop_task, bench_insert_task, bench_heaviest_task, bench_schedule_task, bench_schedule_task_conflicting, bench_schedule_task_conflicting_hot, bench_deschedule_task, bench_deschedule_task_conflicting, bench_schedule_unblocked_task + //benchmarks = bench_arc, bench_triomphe_arc + //benchmarks = bench_end_to_end_worst +); + +main!(library_benchmark_groups = bench_scheduling_state_machine); diff --git a/unified-scheduler-logic/src/lib.rs b/unified-scheduler-logic/src/lib.rs index 997c6c1745a7c9..6bffc9aaec5ac2 100644 --- a/unified-scheduler-logic/src/lib.rs +++ b/unified-scheduler-logic/src/lib.rs @@ -1,15 +1,315 @@ -use solana_sdk::transaction::SanitizedTransaction; +#![allow(rustdoc::private_intra_doc_links)] +//! The task (transaction) scheduling code for the unified scheduler +//! +//! ### High-level API and design +//! +//! The most important type is [`SchedulingStateMachine`]. It takes new tasks (= transactions) and +//! may return them back if runnable via +//! [`::schedule_task()`](SchedulingStateMachine::schedule_task) while maintaining the account +//! readonly/writable lock rules. Those returned runnable tasks are guaranteed to be safe to +//! execute in parallel. Lastly, `SchedulingStateMachine` should be notified about the completion +//! of the execution via [`::deschedule_task()`](SchedulingStateMachine::deschedule_task), so that +//! conflicting tasks can be returned from +//! [`::schedule_unblocked_task()`](SchedulingStateMachine::schedule_unblocked_task) as +//! newly-unblocked runnable ones. +//! +//! The design principle of this crate (`solana-unified-scheduler-logic`) is simplicity for the +//! sake of separation of concerns. It is interacted with only through a few of its public APIs by +//! `solana-unified-scheduler-pool`. This crate doesn't know about banks, slots, solana-runtime, +//! threads, or crossbeam-channel at all. Because of this, it's deterministic, easy to unit-test, +//! and its perf footprint is well understood. It really focuses on its single job: sorting +//! transactions in executable order. +//! +//! ### Algorithm +//! +//! The algorithm can be said to be based on per-address FIFO queues, which are updated both every +//! time a new task arrives (= called _scheduling_) and every time a runnable (= _post-scheduling_) +//! task is finished (= called _descheduling_). +//! +//! For the _non-conflicting scheduling_ case, the story is very simple; it just remembers that all +//! of the accessed addresses are write-locked or read-locked with the number of active (= +//! _currently-scheduled-and-not-descheduled-yet_) tasks. Correspondingly, descheduling does the +//! opposite book-keeping process, regardless of whether the finished task was conflicting or not. +//! +//! For the _conflicting scheduling_ case, each of the **non-conflicting addresses** is handled +//! like the non-conflicting case above. As for the **conflicting addresses**, the task is recorded in +//! the respective FIFO queues attached to the (conflicting) addresses. Importantly, the number of +//! conflicting addresses of the conflicting task is also remembered. +//! +//! The last missing piece is that the scheduler actually tries to reschedule previously blocked +//! tasks while descheduling, in addition to the above-mentioned book-keeping process. Namely, +//! 
when a given address becomes ready for fresh locking as a result of descheduling a task (i.e. the write +//! lock is released or the read lock count reaches zero), it pops out the first element of the +//! FIFO blocked-task queue of the address. Then, it immediately marks the address as relocked. It +//! also decrements the number of conflicting addresses of the popped-out task. As the final step, +//! if that number reaches zero, it means the task has fully finished locking all of its +//! addresses and is directly routed to be runnable. +//! +//! Put differently, this algorithm tries to gradually lock all of the addresses of tasks at different +//! timings while not deviating the execution order from the original task ingestion order. This +//! implies there are no locking retries in general, which is the primary source of non-linear perf +//! degradation. +//! +//! As a ballpark number from a synthesized micro benchmark on a usual CPU for `mainnet-beta` +//! validators, it takes roughly 100ns to schedule and deschedule a transaction with 10 accounts, +//! and 1us for a transaction with 100 accounts. Note that this excludes crossbeam communication +//! overhead entirely. That said, it's not unrealistic to say the whole unified scheduler can +//! attain 100k-1m tps overall, assuming those transaction executions aren't bottlenecked. +//! +//! ### Runtime performance characteristics and data structure arrangement +//! +//! Its algorithm is very fast for high throughput and real-time for low latency. The whole +//! unified-scheduler architecture is designed from the ground up to support the fastest execution of +//! this scheduling code. To that end, the unified scheduler pre-loads address-specific locking state +//! data structures (called [`Page`]) for all of a transaction's accounts, in order to offload the +//! job from the scheduler thread to other threads. This preloading is done inside +//! [`create_task()`](SchedulingStateMachine::create_task). In this way, the task scheduling +//! computational complexity is basically reduced to several word-sized loads and stores in the +//! scheduler thread (i.e. constant; no allocations nor syscalls), while being proportional to the +//! number of addresses in a given transaction. Note that this statement holds true regardless +//! of conflicts. This is because the preloading also pre-allocates some scratch-pad area +//! ([`blocked_tasks`](PageInner::blocked_tasks)) to stash blocked ones. So, a conflict only incurs +//! some additional fixed number of mem stores, within the error margin of the constant complexity. And +//! the additional memory allocation for the scratchpad could be said to be amortized, should such an unusual +//! event occur. +//! +//! [`Arc`] is used to implement this preloading mechanism, because `Page`s are shared across tasks +//! accessing the same account, and among threads due to the preloading. Also, interior mutability +//! is needed. However, `SchedulingStateMachine` doesn't use conventional locks like RwLock. +//! Leveraging the fact that it's the only state-mutating exclusive thread, it instead uses +//! `UnsafeCell`, which is sugar-coated by a tailored wrapper called [`TokenCell`]. `TokenCell` +//! imposes an overly restrictive aliasing rule via the Rust type system to maintain memory +//! safety. By localizing any synchronization to the message passing, the scheduling code itself +//! attains the maximally possible single-threaded execution without stalling cpu pipelines at all, only +//! 
constrained by mem access latency, while efficiently utilizing the L1-L3 cpu cache filled with +//! `Page`s. +//! +//! ### Buffer bloat insignificance +//! +//! The scheduler code itself doesn't care about the buffer bloat problem, which can occur in the +//! unified scheduler, where a run of heavily linearized and blocked tasks could be severely hampered +//! by a very large number of interleaved runnable tasks alongside. The reason is, again, the +//! separation of concerns. This is acceptable because the scheduling code itself isn't susceptible +//! to the buffer bloat problem by itself, as explained by the description and validated by the +//! benchmark mentioned above. Thus, this should be solved elsewhere, specifically at the scheduler +//! pool. +#[cfg(feature = "dev-context-only-utils")] +use qualifier_attr::field_qualifiers; +use { + crate::utils::{ShortCounter, Token, TokenCell}, + solana_sdk::{pubkey::Pubkey, transaction::SanitizedTransaction}, + static_assertions::const_assert_eq, + std::{collections::VecDeque, mem, sync::Arc}, +}; -pub struct Task { +/// Internal utilities. Namely this contains [`ShortCounter`] and [`TokenCell`]. +mod utils { + #[cfg(feature = "dev-context-only-utils")] + use qualifier_attr::qualifiers; + use std::{ + any::{self, TypeId}, + cell::{RefCell, UnsafeCell}, + collections::BTreeSet, + marker::PhantomData, + thread, + }; + + /// A really tiny counter to hide `.checked_{add,sub}` all over the place. + /// + /// It's the caller's responsibility to ensure this (backed by [`u32`]) never overflows. + #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] + #[derive(Debug, Clone, Copy)] + pub(super) struct ShortCounter(u32); + + impl ShortCounter { + pub(super) fn zero() -> Self { + Self(0) + } + + pub(super) fn one() -> Self { + Self(1) + } + + pub(super) fn is_one(&self) -> bool { + self.0 == 1 + } + + pub(super) fn is_zero(&self) -> bool { + self.0 == 0 + } + + pub(super) fn current(&self) -> u32 { + self.0 + } + + #[must_use] + pub(super) fn increment(self) -> Self { + Self(self.0.checked_add(1).unwrap()) + } + + #[must_use] + pub(super) fn decrement(self) -> Self { + Self(self.0.checked_sub(1).unwrap()) + } + + pub(super) fn increment_self(&mut self) -> &mut Self { + *self = self.increment(); + self + } + + pub(super) fn decrement_self(&mut self) -> &mut Self { + *self = self.decrement(); + self + } + + pub(super) fn reset_to_zero(&mut self) -> &mut Self { + self.0 = 0; + self + } + } + + /// A conditionally [`Send`]-able and [`Sync`]-able cell leveraging the scheduler's one-by-one data + /// access pattern with zero runtime synchronization cost. + /// + /// To comply with Rust's aliasing rules, these cells require a carefully-created [`Token`] to + /// be passed around to access the inner values. The token is a special-purpose phantom object + /// to get rid of the inherent `unsafe`-ness of [`UnsafeCell`], which is internally used for + /// the interior mutability. + /// + /// The final objective of [`Token`] is to ensure there's only one mutable reference to the + /// [`TokenCell`] at most _at any given moment_. To that end, it's `unsafe` to create it, + /// shifting the responsibility of binding the only singleton instance to a particular thread + /// and not creating more than one, onto the API consumers. And its constructor is non-`const`, + /// and the type is `!Clone` (and `!Copy` as well), `!Default`, `!Send` and `!Sync` to make it + /// relatively hard to cross thread boundaries accidentally. 
+ /// + /// In other words, the token semantically _owns_ all of its associated instances of + /// [`TokenCell`]s. And `&mut Token` is needed to access one of them, as if it were one of + /// [`Token`]'s `*_mut()` getters. Thus, the Rust aliasing rule for `UnsafeCell` can + /// transitively be proven to be satisfied simply based on the usual borrow checking of the + /// `&mut` reference of [`Token`] itself via [`::borrow_mut()`](TokenCell::borrow_mut). + /// + /// By extension, it's allowed to create _multiple_ tokens in a _single_ process as long as no + /// instance of [`TokenCell`] is shared by multiple instances of [`Token`]. + /// + /// Note that this is overly restrictive in that it's forbidden, yet, technically possible + /// to _have multiple mutable references to the inner values at the same time, if and only + /// if the respective cells aren't aliased to each other (i.e. different instances)_. This + /// artificial restriction is acceptable for its intended use by the unified scheduler's code + /// because its algorithm only needs to access each instance of [`TokenCell`]-ed data once at a + /// time. Finally, this restriction is traded off for restoration of the Rust aliasing rule at zero + /// runtime cost. Without this token mechanism, there's no way to realize this. + #[derive(Debug, Default)] + pub(super) struct TokenCell<V>(UnsafeCell<V>); + + impl<V> TokenCell<V> { + /// Creates a new `TokenCell` with the `value` typed as `V`. + /// + /// Note that this isn't parametric over its accompanying `Token`'s lifetime to avoid + /// complex handling of non-`'static` heaped data in general. Instead, it's manually + /// required to ensure this instance is accessed only via its associated Token for the + /// entire lifetime. + // non-const to forbid unprotected sharing via static variables among threads. + pub(super) fn new(value: V) -> Self { + Self(UnsafeCell::new(value)) + } + + /// Returns a mutable reference with its lifetime bound to the mutable reference of the + /// given token. + /// + /// In this way, any additional reborrow can never happen at the same time across all + /// instances of [`TokenCell`] conceptually owned by the instance of [`Token`] (a + /// particular thread), unless the previous borrow is released. After the release, the used + /// singleton token should be free to be reused for reborrows. + pub(super) fn borrow_mut<'t>(&self, _token: &'t mut Token<V>) -> &'t mut V { + unsafe { &mut *self.0.get() } + } + } + + // Safety: Access to TokenCell is assumed to be only from a single thread by proper use of + // Token once after TokenCell is sent to the thread from other threads; so, implementing both + // Send and Sync can be thought of as safe. + // + // In other words, TokenCell is technically still !Send and !Sync. But there should be no legal + // use happening which requires !Send or !Sync to avoid undefined behavior. + unsafe impl<V> Send for TokenCell<V> {} + unsafe impl<V> Sync for TokenCell<V> {} + + /// An auxiliary zero-sized type to enforce the aliasing rule for [`TokenCell`] via the Rust type system + /// + /// Token semantically owns a collection of `TokenCell` objects and governs the _unique_ + /// existence of mutable access over them by requiring the token itself to be mutably borrowed + /// to get a mutable reference to the internal value of `TokenCell`. 
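+    ///
+    /// A minimal sketch of the intended pattern (the `u32` payload and the variable names here
+    /// are illustrative assumptions, not taken from the scheduler itself):
+    ///
+    /// ```ignore
+    /// let cell = TokenCell::new(0_u32);
+    /// // Safety: this must be the only `Token<u32>` ever created on this thread.
+    /// let mut token = unsafe { Token::<u32>::assume_exclusive_mutating_thread() };
+    /// // The mutable borrow of the cell is tied to the mutable borrow of the token, so the
+    /// // ordinary borrow checker enforces exclusivity at zero runtime cost.
+    /// *cell.borrow_mut(&mut token) += 1;
+    /// assert_eq!(*cell.borrow_mut(&mut token), 1);
+    /// ```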
+ #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] + // *mut is used to make this type !Send and !Sync + pub(super) struct Token<V>(PhantomData<*mut V>); + + impl<V: 'static> Token<V> { + // Returns the token to acquire a mutable reference to the inner value of [TokenCell]. + // + // Safety: + // This method should be called exactly once for each thread at most. + #[must_use] + pub(super) unsafe fn assume_exclusive_mutating_thread() -> Self { + thread_local! { + static TOKENS: RefCell<BTreeSet<TypeId>> = const { RefCell::new(BTreeSet::new()) }; + } + assert!( + TOKENS.with_borrow_mut(|tokens| tokens.insert(TypeId::of::<Self>())), + "{:?} is wrongly initialized twice on {:?}", + any::type_name::<Self>(), + thread::current() + ); + + Self(PhantomData) + } + } + + #[cfg(test)] + mod tests { + use super::Token; + + #[test] + #[should_panic( + expected = "\"solana_unified_scheduler_logic::utils::Token\" is wrongly \ + initialized twice on Thread" + )] + fn test_second_creation_of_tokens_in_a_thread() { + unsafe { + let _ = Token::::assume_exclusive_mutating_thread(); + let _ = Token::::assume_exclusive_mutating_thread(); + } + } + } +} + +/// [`Result`] for locking a [page](Page) with a particular [usage](RequestedUsage). +type LockResult = Result<PageUsage, ()>; +const_assert_eq!(mem::size_of::<LockResult>(), 8); + +/// Something to be scheduled; usually a wrapper of [`SanitizedTransaction`]. +pub type Task = Arc<TaskInner>; +const_assert_eq!(mem::size_of::<Task>(), 8); + +/// [`Token`] for [`Page`]. +type PageToken = Token<PageInner>; +const_assert_eq!(mem::size_of::<PageToken>(), 0); + +/// [`Token`] for [task](Task)'s [internal mutable data](`TaskInner::blocked_page_count`). +type BlockedPageCountToken = Token<ShortCounter>; +const_assert_eq!(mem::size_of::<BlockedPageCountToken>(), 0); + +/// Internal scheduling data about a particular task. +#[cfg_attr(feature = "dev-context-only-utils", field_qualifiers(index(pub)))] +#[derive(Debug)] +pub struct TaskInner { transaction: SanitizedTransaction, index: usize, + lock_attempts: Vec<LockAttempt>, + blocked_page_count: TokenCell<ShortCounter>, } -impl Task { - pub fn create_task(transaction: SanitizedTransaction, index: usize) -> Self { - Task { transaction, index } - } - +impl TaskInner { pub fn task_index(&self) -> usize { self.index } @@ -17,4 +317,1025 @@ impl Task { pub fn transaction(&self) -> &SanitizedTransaction { &self.transaction } + + fn lock_attempts(&self) -> &Vec<LockAttempt> { + &self.lock_attempts + } + + fn blocked_page_count_mut<'t>( + &self, + token: &'t mut BlockedPageCountToken, + ) -> &'t mut ShortCounter { + self.blocked_page_count.borrow_mut(token) + } + + fn set_blocked_page_count(&self, token: &mut BlockedPageCountToken, count: ShortCounter) { + *self.blocked_page_count_mut(token) = count; + } + + #[must_use] + fn try_unblock(self: &Task, token: &mut BlockedPageCountToken) -> Option<Task> { + self.blocked_page_count_mut(token) + .decrement_self() + .is_zero() + .then(|| self.clone()) + } +} + +/// [`Task`]'s per-address attempt to use a [page](Page) with a [certain kind of +/// request](RequestedUsage). +#[derive(Debug)] +struct LockAttempt { + page: Page, + requested_usage: RequestedUsage, +} +const_assert_eq!(mem::size_of::<LockAttempt>(), 16); + +impl LockAttempt { + fn new(page: Page, requested_usage: RequestedUsage) -> Self { + Self { + page, + requested_usage, + } + } + + fn page_mut<'t>(&self, page_token: &'t mut PageToken) -> &'t mut PageInner { + self.page.0.borrow_mut(page_token) + } +} + +/// Status about how the [`Page`] is used currently. Unlike [`RequestedUsage`], it has an additional +/// variant of [`Unused`](`PageUsage::Unused`). 
+#[derive(Copy, Clone, Debug, Default)] +enum PageUsage { + #[default] + Unused, + Readonly(ShortCounter), + Writable, +} +const_assert_eq!(mem::size_of::<PageUsage>(), 8); + +impl PageUsage { + fn from_requested_usage(requested_usage: RequestedUsage) -> Self { + match requested_usage { + RequestedUsage::Readonly => PageUsage::Readonly(ShortCounter::one()), + RequestedUsage::Writable => PageUsage::Writable, + } + } +} + +/// Status about how a task is requesting to use a particular [`Page`]. Unlike [`PageUsage`], +/// it has only two unit variants. +#[derive(Clone, Copy, Debug)] +enum RequestedUsage { + Readonly, + Writable, +} + +/// Internal scheduling data about a particular address. +/// +/// Specifically, it holds the current [`PageUsage`] (or no usage with [`PageUsage::Unused`]) and +/// which [`Task`]s are blocked to be executed after the current task is notified to be finished +/// via [`::deschedule_task`](`SchedulingStateMachine::deschedule_task`). +#[derive(Debug)] +struct PageInner { + usage: PageUsage, + blocked_tasks: VecDeque<(Task, RequestedUsage)>, +} + +impl Default for PageInner { + fn default() -> Self { + Self { + usage: PageUsage::default(), + blocked_tasks: VecDeque::with_capacity(1024), + } + } +} + +impl PageInner { + fn push_blocked_task(&mut self, task: Task, requested_usage: RequestedUsage) { + self.blocked_tasks.push_back((task, requested_usage)); + } + + fn has_no_blocked_task(&self) -> bool { + self.blocked_tasks.is_empty() + } + + #[must_use] + fn pop_unblocked_next_task(&mut self) -> Option<(Task, RequestedUsage)> { + self.blocked_tasks.pop_front() + } + + #[must_use] + fn blocked_next_task(&self) -> Option<(&Task, RequestedUsage)> { + self.blocked_tasks + .front() + .map(|(task, requested_usage)| (task, *requested_usage)) + } + + #[must_use] + fn pop_blocked_next_readonly_task(&mut self) -> Option<(Task, RequestedUsage)> { + if matches!( + self.blocked_next_task(), + Some((_, RequestedUsage::Readonly)) + ) { + self.pop_unblocked_next_task() + } else { + None + } + } +} + +const_assert_eq!(mem::size_of::<TokenCell<PageInner>>(), 40); + +/// Scheduler's internal data for each address ([`Pubkey`](`solana_sdk::pubkey::Pubkey`)). A very +/// opaque wrapper type; it has no methods other than [`::clone()`](Clone::clone) and +/// [`::default()`](Default::default). +#[derive(Debug, Clone, Default)] +pub struct Page(Arc<TokenCell<PageInner>>); +const_assert_eq!(mem::size_of::<Page>(), 8); + +/// A high-level `struct`, managing the overall scheduling of [tasks](Task), to be used by +/// `solana-unified-scheduler-pool`. 
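+///
+/// A rough usage sketch, mirroring the unit tests below rather than the real wiring in
+/// `solana-unified-scheduler-pool`; the trivial `Page::default()` loader and the placeholder
+/// `tx: SanitizedTransaction` are assumptions for illustration only:
+///
+/// ```ignore
+/// let mut state_machine = unsafe {
+///     // Safety: called exactly once on this thread.
+///     SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling()
+/// };
+/// let task = SchedulingStateMachine::create_task(tx, 0, &mut |_pubkey| Page::default());
+/// if let Some(task) = state_machine.schedule_task(task) {
+///     // ... execute the runnable task, then notify completion:
+///     state_machine.deschedule_task(&task);
+/// }
+/// // Descheduling may have unblocked previously conflicting tasks:
+/// while let Some(unblocked) = state_machine.schedule_unblocked_task() {
+///     // ... execute it, then:
+///     state_machine.deschedule_task(&unblocked);
+/// }
+/// ```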
+#[cfg_attr(feature = "dev-context-only-utils", field_qualifiers(count_token(pub)))] +pub struct SchedulingStateMachine { + last_task_index: Option<usize>, + unblocked_task_queue: VecDeque<Task>, + active_task_count: ShortCounter, + handled_task_count: ShortCounter, + unblocked_task_count: ShortCounter, + total_task_count: ShortCounter, + count_token: BlockedPageCountToken, + page_token: PageToken, +} +const_assert_eq!(mem::size_of::<SchedulingStateMachine>(), 64); + +impl SchedulingStateMachine { + pub fn has_no_active_task(&self) -> bool { + self.active_task_count.is_zero() + } + + pub fn unblocked_task_queue_count(&self) -> usize { + self.unblocked_task_queue.len() + } + + pub fn active_task_count(&self) -> u32 { + self.active_task_count.current() + } + + pub fn handled_task_count(&self) -> u32 { + self.handled_task_count.current() + } + + pub fn unblocked_task_count(&self) -> u32 { + self.unblocked_task_count.current() + } + + pub fn total_task_count(&self) -> u32 { + self.total_task_count.current() + } + + #[must_use] + pub fn schedule_task(&mut self, task: Task) -> Option<Task> { + let new_task_index = task.task_index(); + if let Some(old_task_index) = self.last_task_index.replace(new_task_index) { + assert!( + new_task_index > old_task_index, + "bad new task index: {new_task_index} > {old_task_index}" + ); + } + self.total_task_count.increment_self(); + self.active_task_count.increment_self(); + self.attempt_lock_for_task(task) + } + + pub fn has_unblocked_task(&self) -> bool { + !self.unblocked_task_queue.is_empty() + } + + #[must_use] + pub fn schedule_unblocked_task(&mut self) -> Option<Task> { + self.unblocked_task_queue.pop_front().map(|task| { + self.unblocked_task_count.increment_self(); + task + }) + } + + pub fn deschedule_task(&mut self, task: &Task) { + let blocked_task_index = task.task_index(); + let largest_task_index = self + .last_task_index + .expect("task should have been scheduled"); + assert!( + blocked_task_index <= largest_task_index, + "bad unblocked task index: {blocked_task_index} <= {largest_task_index}" + ); + self.active_task_count.decrement_self(); + self.handled_task_count.increment_self(); + self.unlock_for_task(task); + } + + #[must_use] + fn attempt_lock_pages(&mut self, task: &Task) -> ShortCounter { + let mut blocked_page_count = ShortCounter::zero(); + + for attempt in task.lock_attempts() { + let page = attempt.page_mut(&mut self.page_token); + let lock_status = if page.has_no_blocked_task() { + Self::attempt_lock_page(page, attempt.requested_usage) + } else { + LockResult::Err(()) + }; + match lock_status { + LockResult::Ok(PageUsage::Unused) => unreachable!(), + LockResult::Ok(new_usage) => { + page.usage = new_usage; + } + LockResult::Err(()) => { + blocked_page_count.increment_self(); + page.push_blocked_task(task.clone(), attempt.requested_usage); + } + } + } + + blocked_page_count + } + + fn attempt_lock_page(page: &PageInner, requested_usage: RequestedUsage) -> LockResult { + match page.usage { + PageUsage::Unused => LockResult::Ok(PageUsage::from_requested_usage(requested_usage)), + PageUsage::Readonly(count) => match requested_usage { + RequestedUsage::Readonly => LockResult::Ok(PageUsage::Readonly(count.increment())), + RequestedUsage::Writable => LockResult::Err(()), + }, + PageUsage::Writable => LockResult::Err(()), + } + } + + #[must_use] + fn unlock_page(page: &mut PageInner, attempt: &LockAttempt) -> Option<(Task, RequestedUsage)> { + let mut is_unused_now = false; + match &mut page.usage { + PageUsage::Readonly(ref mut count) => match attempt.requested_usage { + 
RequestedUsage::Readonly => { + if count.is_one() { + is_unused_now = true; + } else { + count.decrement_self(); + } + } + RequestedUsage::Writable => unreachable!(), + }, + PageUsage::Writable => match attempt.requested_usage { + RequestedUsage::Writable => { + is_unused_now = true; + } + RequestedUsage::Readonly => unreachable!(), + }, + PageUsage::Unused => unreachable!(), + } + + if is_unused_now { + page.usage = PageUsage::Unused; + page.pop_unblocked_next_task() + } else { + None + } + } + + #[must_use] + fn attempt_lock_for_task(&mut self, task: Task) -> Option<Task> { + let blocked_page_count = self.attempt_lock_pages(&task); + + if blocked_page_count.is_zero() { + // succeeded + Some(task) + } else { + // failed + task.set_blocked_page_count(&mut self.count_token, blocked_page_count); + None + } + } + + fn unlock_for_task(&mut self, task: &Task) { + for unlock_attempt in task.lock_attempts() { + let page = unlock_attempt.page_mut(&mut self.page_token); + let mut unblocked_task_from_page = Self::unlock_page(page, unlock_attempt); + + while let Some((task_with_unblocked_page, requested_usage)) = unblocked_task_from_page { + if let Some(task) = task_with_unblocked_page.try_unblock(&mut self.count_token) { + self.unblocked_task_queue.push_back(task); + } + + match Self::attempt_lock_page(page, requested_usage) { + LockResult::Ok(PageUsage::Unused) => unreachable!(), + LockResult::Ok(new_usage) => { + page.usage = new_usage; + // Try to further schedule blocked tasks for parallelism in the case of + // readonly usages + unblocked_task_from_page = if matches!(new_usage, PageUsage::Readonly(_)) { + page.pop_blocked_next_readonly_task() + } else { + None + }; + } + LockResult::Err(_) => panic!("should never fail in this context"), + } + } + } + } + + /// Creates a new task with [`SanitizedTransaction`] with all of its corresponding [`Page`]s + /// preloaded. + /// + /// The closure (`page_loader`) is used to delegate the (possibly multi-threaded) + /// implementation of [`Page`] look-up by [`pubkey`](Pubkey) to callers. It's the caller's + /// responsibility to ensure the same instance is returned from the closure, given a particular + /// pubkey. + pub fn create_task( + transaction: SanitizedTransaction, + index: usize, + page_loader: &mut impl FnMut(Pubkey) -> Page, + ) -> Task { + // this is safe bla bla + let locks = transaction.get_account_locks_unchecked(); + + let writable_locks = locks + .writable + .iter() + .map(|address| (address, RequestedUsage::Writable)); + let readonly_locks = locks + .readonly + .iter() + .map(|address| (address, RequestedUsage::Readonly)); + + let lock_attempts = writable_locks + .chain(readonly_locks) + .map(|(address, requested_usage)| { + LockAttempt::new(page_loader(**address), requested_usage) + }) + .collect(); + + Task::new(TaskInner { + transaction, + index, + lock_attempts, + blocked_page_count: TokenCell::new(ShortCounter::zero()), + }) + } + + /// Rewind the inactive state machine back to its initialized state. + /// + /// This isn't called _reset_ to indicate that it isn't safe to call at any arbitrary moment. + /// This panics if the state machine hasn't properly been finished (i.e. there should be no + /// active task) to uphold the invariants of [`Page`]s. + /// + /// This method is intended to reuse the SchedulingStateMachine instance (to avoid its `unsafe` + /// [constructor](SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling) + /// as much as possible) and its (possibly cached) associated [`Page`]s for processing other + /// slots. 
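+    ///
+    /// A small sketch of the intended call pattern between slots (`state_machine` is assumed to
+    /// be an existing instance whose tasks have all been descheduled already):
+    ///
+    /// ```ignore
+    /// assert!(state_machine.has_no_active_task());
+    /// state_machine.reinitialize();
+    /// // The same instance (and its cached `Page`s) can now serve the next slot without
+    /// // calling the `unsafe` constructor again.
+    /// ```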
+ pub fn reinitialize(&mut self) { + assert!(self.has_no_active_task()); + assert_eq!(self.unblocked_task_queue.len(), 0); + self.last_task_index = None; + self.active_task_count.reset_to_zero(); + self.handled_task_count.reset_to_zero(); + self.unblocked_task_count.reset_to_zero(); + self.total_task_count.reset_to_zero(); + } + + /// Creates a new instance of [`SchedulingStateMachine`] with its `unsafe` fields created as + /// well, thus carrying over `unsafe`. + /// + /// # Safety + /// Call this exactly once for each thread. See [`TokenCell`] for details. + #[must_use] + pub unsafe fn exclusively_initialize_current_thread_for_scheduling() -> Self { + Self { + last_task_index: None, + unblocked_task_queue: VecDeque::with_capacity(1024), + active_task_count: ShortCounter::zero(), + handled_task_count: ShortCounter::zero(), + unblocked_task_count: ShortCounter::zero(), + total_task_count: ShortCounter::zero(), + count_token: unsafe { BlockedPageCountToken::assume_exclusive_mutating_thread() }, + page_token: unsafe { PageToken::assume_exclusive_mutating_thread() }, + } + } +} + +#[cfg(test)] +mod tests { + use { + super::*, + assert_matches::assert_matches, + solana_sdk::{ + instruction::{AccountMeta, Instruction}, + message::Message, + pubkey::Pubkey, + signature::Signer, + signer::keypair::Keypair, + transaction::{SanitizedTransaction, Transaction}, + }, + std::{cell::RefCell, collections::HashMap, rc::Rc}, + }; + + fn simplest_transaction() -> SanitizedTransaction { + let payer = Keypair::new(); + let message = Message::new(&[], Some(&payer.pubkey())); + let unsigned = Transaction::new_unsigned(message); + SanitizedTransaction::from_transaction_for_tests(unsigned) + } + + fn transaction_with_readonly_address(address: Pubkey) -> SanitizedTransaction { + let instruction = Instruction { + program_id: Pubkey::default(), + accounts: vec![AccountMeta::new_readonly(address, false)], + data: vec![], + }; + let message = Message::new(&[instruction], Some(&Pubkey::new_unique())); + let unsigned = Transaction::new_unsigned(message); + SanitizedTransaction::from_transaction_for_tests(unsigned) + } + + fn transaction_with_writable_address(address: Pubkey) -> SanitizedTransaction { + let instruction = Instruction { + program_id: Pubkey::default(), + accounts: vec![AccountMeta::new(address, false)], + data: vec![], + }; + let message = Message::new(&[instruction], Some(&Pubkey::new_unique())); + let unsigned = Transaction::new_unsigned(message); + SanitizedTransaction::from_transaction_for_tests(unsigned) + } + + fn create_address_loader( + pages: Option>>>, + ) -> impl FnMut(Pubkey) -> Page { + let pages = pages.unwrap_or_default(); + move |address| pages.borrow_mut().entry(address).or_default().clone() + } + + #[test] + fn test_debug() { + // these are almost meaningless just to see eye-pleasing coverage report.... 
+ assert_eq!( + format!( + "{:?}", + LockResult::Ok(PageUsage::Readonly(ShortCounter::one())) + ), + "Ok(Readonly(ShortCounter(1)))" + ); + let sanitized = simplest_transaction(); + let task = SchedulingStateMachine::create_task(sanitized, 0, &mut |_| Page::default()); + assert!(format!("{:?}", task).contains("TaskInner")); + + assert_eq!( + format!("{:?}", PageInner::default()), + "PageInner { usage: Unused, blocked_tasks: [] }" + ) + } + + #[test] + fn test_scheduling_state_machine_creation() { + let state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_eq!(state_machine.active_task_count(), 0); + assert_eq!(state_machine.total_task_count(), 0); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_scheduling_state_machine_reinitialization() { + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + state_machine.total_task_count.increment_self(); + assert_eq!(state_machine.total_task_count(), 1); + state_machine.last_task_index = Some(1); + state_machine.reinitialize(); + assert_eq!(state_machine.total_task_count(), 0); + assert_eq!(state_machine.last_task_index, None); + } + + #[test] + fn test_create_task() { + let sanitized = simplest_transaction(); + let task = + SchedulingStateMachine::create_task(sanitized.clone(), 3, &mut |_| Page::default()); + assert_eq!(task.task_index(), 3); + assert_eq!(task.transaction(), &sanitized); + } + + #[test] + fn test_non_conflicting_task_related_counts() { + let sanitized = simplest_transaction(); + let address_loader = &mut create_address_loader(None); + let task = SchedulingStateMachine::create_task(sanitized.clone(), 3, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let task = state_machine.schedule_task(task).unwrap(); + assert_eq!(state_machine.active_task_count(), 1); + assert_eq!(state_machine.total_task_count(), 1); + state_machine.deschedule_task(&task); + assert_eq!(state_machine.active_task_count(), 0); + assert_eq!(state_machine.total_task_count(), 1); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_conflicting_task_related_counts() { + let sanitized = simplest_transaction(); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized.clone(), 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized.clone(), 103, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + + state_machine.deschedule_task(&task1); + assert!(state_machine.has_unblocked_task()); + assert_eq!(state_machine.unblocked_task_queue_count(), 1); + assert_eq!( + state_machine + .schedule_unblocked_task() + .unwrap() + .task_index(), + task2.task_index() + ); + assert!(!state_machine.has_unblocked_task()); + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + state_machine.deschedule_task(&task2); + + assert_matches!( + state_machine + .schedule_task(task3.clone()) + .map(|task| task.task_index()), + Some(103) + ); + state_machine.deschedule_task(&task3); + 
assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_unblocked_task_related_counts() { + let sanitized = simplest_transaction(); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized.clone(), 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + + state_machine.deschedule_task(&task1); + + assert_eq!(state_machine.unblocked_task_count(), 0); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(102) + ); + assert_eq!(state_machine.unblocked_task_count(), 1); + // there's no blocked task anymore; calling schedule_unblocked_task should be noop and + // shouldn't increment the unblocked_task_count(). + assert_matches!(state_machine.schedule_unblocked_task(), None); + assert_eq!(state_machine.unblocked_task_count(), 1); + + state_machine.deschedule_task(&task2); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_existing_blocking_task_then_newly_scheduled_task() { + let sanitized = simplest_transaction(); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized.clone(), 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized.clone(), 103, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + state_machine.deschedule_task(&task1); + assert_eq!(state_machine.unblocked_task_queue_count(), 1); + + // new task is arriving after task1 is already descheduled and task2 got unblocked + assert_matches!(state_machine.schedule_task(task3.clone()), None); + + assert_eq!(state_machine.unblocked_task_count(), 0); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(102) + ); + assert_eq!(state_machine.unblocked_task_count(), 1); + + state_machine.deschedule_task(&task2); + + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(103) + ); + assert_eq!(state_machine.unblocked_task_count(), 2); + + state_machine.deschedule_task(&task3); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_multiple_readonly_task_and_counts() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_readonly_address(conflicting_address); + let sanitized2 = transaction_with_readonly_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + // both of read-only tasks should be 
immediately runnable + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!( + state_machine + .schedule_task(task2.clone()) + .map(|t| t.task_index()), + Some(102) + ); + + assert_eq!(state_machine.active_task_count(), 2); + assert_eq!(state_machine.handled_task_count(), 0); + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + state_machine.deschedule_task(&task1); + assert_eq!(state_machine.active_task_count(), 1); + assert_eq!(state_machine.handled_task_count(), 1); + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + state_machine.deschedule_task(&task2); + assert_eq!(state_machine.active_task_count(), 0); + assert_eq!(state_machine.handled_task_count(), 2); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_all_blocking_redable_tasks_block_writable_task() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_readonly_address(conflicting_address); + let sanitized2 = transaction_with_readonly_address(conflicting_address); + let sanitized3 = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized3, 103, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!( + state_machine + .schedule_task(task2.clone()) + .map(|t| t.task_index()), + Some(102) + ); + assert_matches!(state_machine.schedule_task(task3.clone()), None); + + assert_eq!(state_machine.active_task_count(), 3); + assert_eq!(state_machine.handled_task_count(), 0); + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + state_machine.deschedule_task(&task1); + assert_eq!(state_machine.active_task_count(), 2); + assert_eq!(state_machine.handled_task_count(), 1); + assert_eq!(state_machine.unblocked_task_queue_count(), 0); + assert_matches!(state_machine.schedule_unblocked_task(), None); + state_machine.deschedule_task(&task2); + assert_eq!(state_machine.active_task_count(), 1); + assert_eq!(state_machine.handled_task_count(), 2); + assert_eq!(state_machine.unblocked_task_queue_count(), 1); + // task3 is finally unblocked after all of readble tasks (task1 and task2) is finished. 
+ assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(103) + ); + state_machine.deschedule_task(&task3); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_readonly_then_writable_then_readonly_linearized() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_readonly_address(conflicting_address); + let sanitized2 = transaction_with_writable_address(conflicting_address); + let sanitized3 = transaction_with_readonly_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized3, 103, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + assert_matches!(state_machine.schedule_task(task3.clone()), None); + + assert_matches!(state_machine.schedule_unblocked_task(), None); + state_machine.deschedule_task(&task1); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(102) + ); + assert_matches!(state_machine.schedule_unblocked_task(), None); + state_machine.deschedule_task(&task2); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(103) + ); + assert_matches!(state_machine.schedule_unblocked_task(), None); + state_machine.deschedule_task(&task3); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_readonly_then_writable() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_readonly_address(conflicting_address); + let sanitized2 = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + + // descheduling read-locking task1 should equate to unblocking write-locking task2 + state_machine.deschedule_task(&task1); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(102) + ); + state_machine.deschedule_task(&task2); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_blocked_tasks_writable_2_readonly_then_writable() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_writable_address(conflicting_address); + let sanitized2 = transaction_with_readonly_address(conflicting_address); + let sanitized3 = transaction_with_readonly_address(conflicting_address); + let sanitized4 = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = 
SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + let task3 = SchedulingStateMachine::create_task(sanitized3, 103, address_loader); + let task4 = SchedulingStateMachine::create_task(sanitized4, 104, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + assert_matches!(state_machine.schedule_task(task3.clone()), None); + assert_matches!(state_machine.schedule_task(task4.clone()), None); + + state_machine.deschedule_task(&task1); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(102) + ); + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(103) + ); + // the above deschedule_task(task1) call should only unblock task2 and task3 because these + // are read-locking. And shouldn't unblock task4 because it's write-locking + assert_matches!(state_machine.schedule_unblocked_task(), None); + + state_machine.deschedule_task(&task2); + // still task4 is blocked... + assert_matches!(state_machine.schedule_unblocked_task(), None); + + state_machine.deschedule_task(&task3); + // finally task4 should be unblocked + assert_matches!( + state_machine + .schedule_unblocked_task() + .map(|t| t.task_index()), + Some(104) + ); + state_machine.deschedule_task(&task4); + assert!(state_machine.has_no_active_task()); + } + + #[test] + fn test_gradual_locking() { + let conflicting_address = Pubkey::new_unique(); + let sanitized1 = transaction_with_writable_address(conflicting_address); + let sanitized2 = transaction_with_writable_address(conflicting_address); + let pages = Rc::new(RefCell::new(HashMap::new())); + let address_loader = &mut create_address_loader(Some(pages.clone())); + let task1 = SchedulingStateMachine::create_task(sanitized1, 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized2, 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + assert_matches!( + state_machine + .schedule_task(task1.clone()) + .map(|t| t.task_index()), + Some(101) + ); + assert_matches!(state_machine.schedule_task(task2.clone()), None); + let pages = pages.borrow_mut(); + let page = pages.get(&conflicting_address).unwrap(); + assert_matches!( + page.0.borrow_mut(&mut state_machine.page_token).usage, + PageUsage::Writable + ); + // task2's fee payer should have been locked already even if task2 is blocked still via the + // above the schedule_task(task2) call + let fee_payer = task2.transaction().message().fee_payer(); + let page = pages.get(fee_payer).unwrap(); + assert_matches!( + page.0.borrow_mut(&mut state_machine.page_token).usage, + PageUsage::Writable + ); + } + + #[test] + #[should_panic(expected = "internal error: entered unreachable code")] + fn test_unreachable_unlock_conditions1() { + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let page = Page::default(); + let _ = SchedulingStateMachine::unlock_page( + page.0.borrow_mut(&mut state_machine.page_token), + &LockAttempt::new(page, RequestedUsage::Writable), + ); + } + + #[test] + #[should_panic(expected = "internal error: entered unreachable code")] + fn test_unreachable_unlock_conditions2() { + 
let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let page = Page::default(); + page.0.borrow_mut(&mut state_machine.page_token).usage = PageUsage::Writable; + let _ = SchedulingStateMachine::unlock_page( + page.0.borrow_mut(&mut state_machine.page_token), + &LockAttempt::new(page, RequestedUsage::Readonly), + ); + } + + #[test] + #[should_panic(expected = "internal error: entered unreachable code")] + fn test_unreachable_unlock_conditions3() { + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let page = Page::default(); + page.0.borrow_mut(&mut state_machine.page_token).usage = + PageUsage::Readonly(ShortCounter::one()); + let _ = SchedulingStateMachine::unlock_page( + page.0.borrow_mut(&mut state_machine.page_token), + &LockAttempt::new(page, RequestedUsage::Writable), + ); + } + + #[test] + #[should_panic(expected = "bad new task index: 101 > 101")] + fn test_schedule_same_task() { + let conflicting_address = Pubkey::new_unique(); + let sanitized = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task = SchedulingStateMachine::create_task(sanitized, 101, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let _ = state_machine.schedule_task(task.clone()); + let _ = state_machine.schedule_task(task.clone()); + } + + #[test] + #[should_panic(expected = "bad new task index: 101 > 102")] + fn test_schedule_task_out_of_order() { + let conflicting_address = Pubkey::new_unique(); + let sanitized = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized.clone(), 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let _ = state_machine.schedule_task(task2.clone()); + let _ = state_machine.schedule_task(task1.clone()); + } + + #[test] + #[should_panic(expected = "task should have been scheduled")] + fn test_deschedule_new_task_wihout_scheduling() { + let conflicting_address = Pubkey::new_unique(); + let sanitized = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + state_machine.deschedule_task(&task); + } + + #[test] + #[should_panic(expected = "bad unblocked task index: 102 <= 101")] + fn test_deschedule_new_task_out_of_order() { + let conflicting_address = Pubkey::new_unique(); + let sanitized = transaction_with_writable_address(conflicting_address); + let address_loader = &mut create_address_loader(None); + let task1 = SchedulingStateMachine::create_task(sanitized.clone(), 101, address_loader); + let task2 = SchedulingStateMachine::create_task(sanitized.clone(), 102, address_loader); + + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let _ = state_machine.schedule_task(task1.clone()); + state_machine.deschedule_task(&task2); + } } diff --git 
a/unified-scheduler-pool/Cargo.toml b/unified-scheduler-pool/Cargo.toml index 7626215b1e1126..bc2a33014ff266 100644 --- a/unified-scheduler-pool/Cargo.toml +++ b/unified-scheduler-pool/Cargo.toml @@ -11,17 +11,41 @@ edition = { workspace = true } [dependencies] assert_matches = { workspace = true } +cpu-time = { workspace = true } crossbeam-channel = { workspace = true } +dashmap = { workspace = true } derivative = { workspace = true } log = { workspace = true } +qualifier_attr = { workspace = true } +rustix = { workspace = true } +serde_json = { workspace = true } solana-ledger = { workspace = true } +solana-measure = { workspace = true } +solana-metrics = { workspace = true } solana-program-runtime = { workspace = true } solana-runtime = { workspace = true } solana-sdk = { workspace = true } solana-unified-scheduler-logic = { workspace = true } solana-vote = { workspace = true } +[target."cfg(target_os = \"linux\")".dependencies] +procfs = { workspace = true } + [dev-dependencies] -assert_matches = { workspace = true } +bincode = { workspace = true } +criterion = "0.5.1" +jemallocator = { workspace = true } +log = { workspace = true } +rand = { workspace = true } solana-logger = { workspace = true } +solana-nohash-hasher = { workspace = true } solana-runtime = { workspace = true, features = ["dev-context-only-utils"] } +# See order-crates-for-publishing.py for using this unusual `path = "."` +solana-unified-scheduler-pool = { path = ".", features = ["dev-context-only-utils"] } + +[[bench]] +name = "lib" +harness = false + +[features] +dev-context-only-utils = [] diff --git a/unified-scheduler-pool/benches/lib.rs b/unified-scheduler-pool/benches/lib.rs new file mode 100644 index 00000000000000..479eaf68e73f35 --- /dev/null +++ b/unified-scheduler-pool/benches/lib.rs @@ -0,0 +1,208 @@ +#![allow(unused_imports, dead_code)] +#![feature(test)] + +extern crate test; + +#[cfg(not(target_env = "msvc"))] +use jemallocator::Jemalloc; + +#[cfg(not(target_env = "msvc"))] +#[global_allocator] +static GLOBAL: Jemalloc = Jemalloc; + +use { + solana_program_runtime::timings::ExecuteTimings, + solana_runtime::{ + bank::Bank, + bank_forks::BankForks, + genesis_utils::{create_genesis_config, GenesisConfigInfo}, + installed_scheduler_pool::{ + DefaultScheduleExecutionArg, InstalledScheduler, SchedulingContext, + }, + prioritization_fee_cache::PrioritizationFeeCache, + }, + solana_sdk::{ + scheduling::SchedulingMode, + transaction::{Result, SanitizedTransaction}, + }, + solana_unified_scheduler_logic::{Page, SchedulingStateMachine}, + solana_unified_scheduler_pool::{ + HandlerContext, PooledScheduler, SchedulerPool, SpawnableScheduler, TaskHandler, + }, + std::sync::Arc, +}; + +#[derive(Debug, Clone)] +struct DummyTaskHandler; + +impl TaskHandler for DummyTaskHandler { + fn handle( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + _bank: &Arc, + _transaction: &SanitizedTransaction, + _index: usize, + _handler_context: &HandlerContext, + ) { + } + + fn create>( + _pool: &SchedulerPool, + ) -> Self { + Self + } +} + +fn setup_dummy_fork_graph(bank: Bank) -> Arc { + let slot = bank.slot(); + let bank_fork = BankForks::new_rw_arc(bank); + let bank = bank_fork.read().unwrap().get(slot).unwrap(); + bank.loaded_programs_cache + .write() + .unwrap() + .set_fork_graph(bank_fork); + bank +} + +use solana_sdk::{ + instruction::{AccountMeta, Instruction}, + message::Message, + pubkey::Pubkey, + signature::Signer, + signer::keypair::Keypair, + transaction::Transaction, +}; + +fn 
do_bench_tx_throughput(label: &str, bencher: &mut Criterion) { + solana_logger::setup(); + + /* + let GenesisConfigInfo { + genesis_config, + .. + } = create_genesis_config(10_000); + */ + let payer = Keypair::new(); + + let mut accounts = vec![]; + for i in 0..100 { + if i % 2 == 0 { + accounts.push(AccountMeta::new(Keypair::new().pubkey(), true)); + } else { + accounts.push(AccountMeta::new_readonly(Keypair::new().pubkey(), true)); + } + } + + let memo_ix = Instruction { + program_id: Pubkey::default(), + accounts, + data: vec![0x00], + }; + let mut ixs = vec![]; + for _ in 0..1 { + ixs.push(memo_ix.clone()); + } + let msg = Message::new(&ixs, Some(&payer.pubkey())); + let txn = Transaction::new_unsigned(msg); + //assert_eq!(wire_txn.len(), 3); + let tx0 = SanitizedTransaction::from_transaction_for_tests(txn); + /* + let bank = Bank::new_for_tests(&genesis_config); + let bank = setup_dummy_fork_graph(bank); + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::, _, _>::new( + None, + None, + None, + ignored_prioritization_fee_cache, + ); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + */ + + let (s, r) = crossbeam_channel::bounded(1000); + + use std::sync::atomic::AtomicUsize; + let i = Arc::new(AtomicUsize::default()); + use std::sync::Mutex; + let pages: Arc>> = + Arc::new(Mutex::new(std::collections::HashMap::new())); + /* + for _ in 0..5 { + std::thread::Builder::new() + .name("solScGen".to_owned()) + .spawn({ + let pages = pages.clone(); + let i = i.clone(); + let tx1 = tx0.clone(); + let s = s.clone(); + move || loop { + let tasks = std::iter::repeat_with(|| SchedulingStateMachine::create_task(tx1.clone(), i.fetch_add(1, std::sync::atomic::Ordering::Relaxed), &mut |address| { + pages.lock().unwrap().entry(address).or_default().clone() + })).take(100).collect::>(); + if s.send(tasks).is_err() { + break; + } + } + }) + .unwrap(); + } + std::thread::sleep(std::time::Duration::from_secs(5)); + */ + + //assert_eq!(bank.transaction_count(), 0); + //let mut scheduler = pool.do_take_scheduler(context); + + let mut scheduler = + unsafe { SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() }; + + let tasks = std::iter::repeat_with(|| { + SchedulingStateMachine::create_task( + tx0.clone(), + i.fetch_add(1, std::sync::atomic::Ordering::Relaxed), + &mut |address| pages.lock().unwrap().entry(address).or_default().clone(), + ) + }) + .take(100) + .collect::>(); + s.send(tasks).unwrap(); + + bencher.bench_function(label, |b| { + b.iter(|| { + for _ in 0..600 { + let mut first_task = None; + let tt = r.recv().unwrap(); + let mut new_tasks = Vec::with_capacity(tt.len()); + for t in tt { + /* + scheduler.schedule_task(t); + */ + if let Some(task) = scheduler.schedule_task(t) { + first_task = Some(task); + } + } + scheduler.deschedule_task(first_task.as_ref().unwrap()); + new_tasks.push(first_task.unwrap()); + while let Some(unblocked_task) = scheduler.schedule_unblocked_task() { + scheduler.deschedule_task(&unblocked_task); + new_tasks.push(unblocked_task); + } + assert!(scheduler.has_no_active_task()); + s.send(new_tasks).unwrap(); + } + /* + scheduler.pause_for_recent_blockhash(); + scheduler.clear_session_result_with_timings(); + scheduler.restart_session(); + */ + }) + }); +} + +fn bench_entrypoint(bencher: &mut Criterion) { + do_bench_tx_throughput("bench_tx_throughput", bencher) +} + +use criterion::{criterion_group, criterion_main, Criterion}; 
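Distilled from the bench loop above, the core API sequence is easy to lose among the channels and the commented-out code. A minimal sketch of one schedule/deschedule round trip, assuming the same APIs exercised above (create_task, schedule_task, schedule_unblocked_task, deschedule_task) and a pre-built SanitizedTransaction named tx:

use {
    solana_sdk::{pubkey::Pubkey, transaction::SanitizedTransaction},
    solana_unified_scheduler_logic::{Page, SchedulingStateMachine},
    std::collections::HashMap,
};

// One schedule -> (execute) -> deschedule round trip for a single task.
fn schedule_round_trip(tx: SanitizedTransaction) {
    // Addresses map to shared Pages so conflicting transactions observe the same lock state.
    let mut pages: HashMap<Pubkey, Page> = HashMap::new();
    let mut page_loader = |address| pages.entry(address).or_default().clone();
    let task = SchedulingStateMachine::create_task(tx, 0, &mut page_loader);

    // Safety: as in the bench above, exactly one state machine per thread.
    let mut state_machine = unsafe {
        SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling()
    };

    // A task with no conflicting locks is handed back immediately as runnable.
    let runnable = state_machine
        .schedule_task(task)
        .expect("a lone task cannot be blocked");
    // Descheduling after execution releases its locks...
    state_machine.deschedule_task(&runnable);
    // ...which may make previously blocked tasks runnable.
    while let Some(unblocked) = state_machine.schedule_unblocked_task() {
        state_machine.deschedule_task(&unblocked);
    }
    assert!(state_machine.has_no_active_task());
}

The bench above runs the same cycle over batches of 100 clones of one transaction, so every task after the first conflicts and has to flow through schedule_unblocked_task().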
+criterion_group!(benches, bench_entrypoint); +criterion_main!(benches); diff --git a/unified-scheduler-pool/benches/scheduler.rs b/unified-scheduler-pool/benches/scheduler.rs new file mode 100644 index 00000000000000..b61903d77df4a4 --- /dev/null +++ b/unified-scheduler-pool/benches/scheduler.rs @@ -0,0 +1,922 @@ +#![cfg(feature = "dummy")] +#![feature(test)] +#![allow(clippy::arithmetic_side_effects)] + +#[global_allocator] +static GLOBAL: jemallocator::Jemalloc = jemallocator::Jemalloc; + +extern crate test; + +use { + assert_matches::assert_matches, + log::*, + rand::{thread_rng, Rng}, + solana_program_runtime::timings::ExecuteTimings, + solana_runtime::{ + bank::Bank, + genesis_utils::{create_genesis_config, GenesisConfigInfo}, + installed_scheduler_pool::{ + InstalledScheduler, ResultWithTimings, ScheduleExecutionArg, SchedulerId, + SchedulingContext, SchedulingMode, WithTransactionAndIndex, + }, + prioritization_fee_cache::PrioritizationFeeCache, + }, + solana_sdk::{ + scheduling::SchedulingMode, + system_transaction, + transaction::{Result, SanitizedTransaction}, + }, + solana_unified_scheduler_pool::{ + PooledScheduler, SchedulerPool, SpawnableScheduler, TaskHandler, + }, + std::{ + fmt::Debug, + marker::{PhantomData, Send, Sync}, + mem, + sync::Arc, + }, + test::Bencher, +}; + +const TX_COUNT: usize = 10_000; + +#[derive(Debug, Default, Clone)] +struct ScheduleExecutionArgForBench; + +// use Arc-ed transaction for very cheap .clone() so that the consumer is never starved for +// incoming transactions. +type TransactionWithIndexForBench = Arc<(SanitizedTransaction, usize)>; + +impl ScheduleExecutionArg for ScheduleExecutionArgForBench { + type TransactionWithIndex<'_tx> = TransactionWithIndexForBench; +} + +#[derive(Debug, Default, Clone)] +struct BenchFriendlyHandler( + PhantomData, +); + +impl TaskHandler + for BenchFriendlyHandler +{ + fn create>(_pool: &SchedulerPool) -> Self { + Self(PhantomData) + } + + fn handle>( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + bank: &Arc, + transaction: &SanitizedTransaction, + _index: usize, + _pool: &SchedulerPool, + ) { + //std::hint::black_box(bank.clone()); + let mut i = 0; + for _ in 0..10 { + if MUTATE_ARC { + //for _ in 0..2 { + std::hint::black_box((Arc::downgrade(bank)).upgrade().unwrap()); + //} + } + // call random one of Bank's lightweight-and-very-multi-threaded-friendly methods which take a + // transaction inside this artifical tight loop. + i += bank.get_fee_for_message_with_lamports_per_signature(transaction.message(), i) + } + std::hint::black_box(i); + } +} + +type BenchFriendlyHandlerWithArcMutation = BenchFriendlyHandler; +type BenchFriendlyHandlerWithoutArcMutation = + BenchFriendlyHandler; + +fn run_bench< + F: FnOnce(Arc>, SchedulingContext) -> I, + I: SpawnableScheduler, + TH: TaskHandler, +>( + bencher: &mut Bencher, + create_scheduler: F, +) { + solana_logger::setup(); + + let GenesisConfigInfo { + genesis_config, + mint_keypair, + .. 
+ } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + + let mut scheduler = create_scheduler(pool, context.clone()); + let tx0 = &SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &mint_keypair, + &solana_sdk::pubkey::new_rand(), + 2, + genesis_config.hash(), + )); + let tx_with_index = TransactionWithIndexForBench::new((tx0.clone(), 0)); + bencher.iter(|| { + for _ in 0..TX_COUNT { + scheduler.schedule_execution(tx_with_index.clone()); + } + assert_matches!(scheduler.wait_for_termination(false), Some((Ok(()), _))); + scheduler.replace_context(context.clone()); + }); +} + +mod blocking_ref { + use {super::*, solana_runtime::installed_scheduler_pool::DefaultScheduleExecutionArg}; + + #[bench] + fn bench_without_arc_mutation(bencher: &mut Bencher) { + solana_logger::setup(); + + let GenesisConfigInfo { + genesis_config, + mint_keypair, + .. + } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + + let mut scheduler = PooledScheduler::<_, DefaultScheduleExecutionArg>::do_spawn( + pool, + context.clone(), + BenchFriendlyHandler::<_, false>::default(), + ); + let tx0 = &SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &mint_keypair, + &solana_sdk::pubkey::new_rand(), + 2, + genesis_config.hash(), + )); + let tx_with_index = &(tx0, 0); + bencher.iter(|| { + for _ in 0..TX_COUNT { + scheduler.schedule_execution(tx_with_index); + } + assert_matches!(scheduler.wait_for_termination(false), Some((Ok(()), _))); + scheduler.replace_context(context.clone()); + }); + } +} + +mod blocking { + use super::*; + + type BlockingScheduler = PooledScheduler; + + #[bench] + fn bench_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + BlockingScheduler::do_spawn( + pool, + context, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + BlockingScheduler::do_spawn( + pool, + context, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } +} + +mod nonblocking { + use super::*; + + #[derive(Debug)] + pub(super) struct NonblockingScheduler + Clone> { + id: SchedulerId, + pub(crate) pool: Arc>, + transaction_sender: crossbeam_channel::Sender, + result_receiver: crossbeam_channel::Receiver<(Result<()>, ExecuteTimings, usize)>, + lane_count: usize, + context: SchedulingContext, + _phantom: PhantomData, + } + + enum ChainedChannel { + Payload(TransactionWithIndexForBench), + NextContext(SchedulingContext), + NextChannel(Box), + } + + type ChannelPair = ( + crossbeam_channel::Receiver, + crossbeam_channel::Sender<(Result<()>, ExecuteTimings, usize)>, + ); + + trait WithChannelPair { + fn unwrap_channel_pair(&mut self) -> ChannelPair; + } + + struct ChannelPairOption(Option); + + impl WithChannelPair for ChannelPairOption { + fn unwrap_channel_pair(&mut self) -> ChannelPair { + 
self.0.take().unwrap() + } + } + + impl + Clone> + SpawnableScheduler for NonblockingScheduler + { + fn spawn( + _pool: Arc>, + _initial_context: SchedulingContext, + _handler: H, + ) -> Self { + unimplemented!(); + } + + fn retire_if_stale(&mut self) -> bool { + unimplemented!(); + } + } + + impl + Clone> NonblockingScheduler { + pub(super) fn spawn( + pool: Arc>, + initial_context: SchedulingContext, + lane_count: usize, + handler: H, + ) -> Self { + let (transaction_sender, transaction_receiver) = + crossbeam_channel::unbounded::(); + let (result_sender, result_receiver) = crossbeam_channel::unbounded(); + + for _ in 0..lane_count { + let mut bank = Arc::clone(initial_context.bank()); + let mut transaction_receiver = transaction_receiver.clone(); + let mut result_sender = result_sender.clone(); + std::thread::spawn({ + let pool = pool.clone(); + let handler = handler.clone(); + move || { + let mut result = Ok(()); + let mut timings = ExecuteTimings::default(); + let mut count = 0; + while let Ok(message) = transaction_receiver.recv() { + match message { + ChainedChannel::Payload(with_transaction_and_index) => { + count += 1; + with_transaction_and_index.with_transaction_and_index( + |transaction, index| { + H::handle( + &handler, + &mut result, + &mut timings, + &bank, + transaction, + index, + &pool, + ); + }, + ); + } + ChainedChannel::NextContext(next_context) => { + bank = next_context.bank().clone(); + } + ChainedChannel::NextChannel(mut next_receiver_box) => { + result_sender + .send(( + mem::replace(&mut result, Ok(())), + mem::take(&mut timings), + mem::take(&mut count), + )) + .unwrap(); + (transaction_receiver, result_sender) = + next_receiver_box.unwrap_channel_pair(); + } + } + } + } + }); + } + + Self { + id: thread_rng().gen::(), + pool, + transaction_sender, + result_receiver, + lane_count, + context: initial_context, + _phantom: PhantomData, + } + } + } + impl + Clone> + InstalledScheduler for NonblockingScheduler + { + fn id(&self) -> SchedulerId { + self.id + } + + fn context(&self) -> &SchedulingContext { + &self.context + } + + fn schedule_execution(&self, transaction_with_index: TransactionWithIndexForBench) { + self.transaction_sender + .send(ChainedChannel::Payload(transaction_with_index)) + .unwrap(); + } + + fn wait_for_termination(&mut self, _is_dropped: bool) -> Option { + let (next_transaction_sender, next_transaction_receiver) = + crossbeam_channel::unbounded::(); + let (next_result_sender, next_result_receiver) = crossbeam_channel::unbounded(); + for _ in 0..self.lane_count { + let (next_transaction_receiver, next_result_sender) = ( + next_transaction_receiver.clone(), + next_result_sender.clone(), + ); + self.transaction_sender + .send(ChainedChannel::NextChannel(Box::new(ChannelPairOption( + Some((next_transaction_receiver, next_result_sender)), + )))) + .unwrap(); + } + self.transaction_sender = next_transaction_sender; + + let mut overall_result = Ok(()); + let mut overall_timings = ExecuteTimings::default(); + + while let Ok((result, timings, count)) = self.result_receiver.recv() { + match result { + Ok(()) => {} + Err(e) => overall_result = Err(e), + } + overall_timings.accumulate(&timings); + trace!("received: {count:?}"); + } + self.result_receiver = next_result_receiver; + + Some((overall_result, overall_timings)) + } + + /* + fn return_to_pool(self: Box) { + self.pool.clone().return_scheduler(self) + } + */ + fn pause_for_recent_blockhash(&mut self) { + todo!() + } + } + + #[bench] + fn bench_with_01_thread_with_arc_mutation(bencher: &mut Bencher) 
{ + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 1, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_01_thread_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 1, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_04_threads_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 4, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_04_threads_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 4, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_08_threads_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 8, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_08_threads_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 8, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_16_threads_with_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 16, + BenchFriendlyHandlerWithArcMutation::default(), + ) + }); + } + + #[bench] + fn bench_with_16_threads_without_arc_mutation(bencher: &mut Bencher) { + run_bench(bencher, |pool, context| { + NonblockingScheduler::spawn( + pool, + context, + 16, + BenchFriendlyHandlerWithoutArcMutation::default(), + ) + }); + } +} + +// demonstrate meaningfully differing performance profile regarding multi worker thread utilization +// with saturated transaction execution for each bench scenarios, with/without the existence of +// artificial and needless synchronizations. +// conversely, the whole InstallableScheduler machinery can be justified as it can eliminate these +// synchronizations altogether to bare minimum (i.e. bank freeze). 
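As a back-of-the-envelope model of the second scenario in the module below (an illustration only; the slices stand for per-transaction handler sleep durations in milliseconds):

// Two transaction chains, each internally serialized by a shared fee payer.
fn makespan_with_barrier(chain1: &[u64], chain2: &[u64]) -> u64 {
    // the interleaved synchronization drains the first chain before the second is scheduled
    chain1.iter().sum::<u64>() + chain2.iter().sum::<u64>()
}

fn makespan_without_barrier(chain1: &[u64], chain2: &[u64]) -> u64 {
    // independent chains occupy different worker threads and overlap fully
    chain1.iter().sum::<u64>().max(chain2.iter().sum::<u64>())
}

For the two chains of four 10 ms transfers used in bench_long_serialized_runs, this predicts roughly 80 ms per iteration with the interleaved synchronization versus roughly 40 ms without it (ignoring scheduling overhead), which is the gap these benches are meant to expose.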
+#[cfg(feature = "dummy")] +mod thread_utilization { + use { + super::*, + crate::nonblocking::NonblockingScheduler, + solana_nohash_hasher::IntSet, + solana_sdk::{ + signature::Signature, signer::keypair::Keypair, + system_instruction::SystemInstruction::Transfer, transaction::TransactionAccountLocks, + }, + std::{collections::HashMap, sync::Mutex, thread::sleep, time::Duration}, + }; + + #[derive(Debug, Clone)] + struct SleepyHandler; + + impl TaskHandler for SleepyHandler { + fn create>(_pool: &SchedulerPool) -> Self { + Self + } + + fn handle>( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + _bank: &Arc, + transaction: &SanitizedTransaction, + _index: usize, + _pool: &SchedulerPool, + ) { + let Ok(Transfer { lamports: sleep_ms }) = + bincode::deserialize(&transaction.message().instructions()[0].data) + else { + panic!() + }; + + sleep(Duration::from_millis(sleep_ms)); + } + } + + enum Step { + Batch(Vec), + // mimic periodic or contention-induced synchronization with this artificial blocking + MaySynchronize, + } + + const WORKER_THREAD_COUNT: usize = 10; + + fn simulate_synchronization_point>( + scheduler: &mut T, + context: SchedulingContext, + ) { + assert_matches!(scheduler.wait_for_termination(false), Some((Ok(()), _))); + scheduler.replace_context(context); + } + + fn run_scenario_and_finalize>( + bencher: &mut Bencher, + really_synchronize: bool, + scheduler: &mut T, + context: SchedulingContext, + create_scenario: impl Fn() -> Vec, + ) { + let scenario = &create_scenario(); + bencher.iter(|| { + for step in scenario { + match step { + Step::Batch(txes) => { + for tx in txes { + scheduler.schedule_execution(tx.clone()); + } + } + Step::MaySynchronize => { + if really_synchronize { + simulate_synchronization_point(scheduler, context.clone()); + } + } + } + } + simulate_synchronization_point(scheduler, context.clone()); + }) + } + + // frequent synchronization creates non-zero idling time among some of worker threads, given + // batches with mixed transactions. then, it adds up as these kinds synchronizations occurs over + // processing + fn bench_random_execution_durations(bencher: &mut Bencher, really_synchronize: bool) { + let GenesisConfigInfo { + genesis_config, + mint_keypair, + .. 
+ } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + + let create_tx_with_index = |index| { + let tx0 = + SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &mint_keypair, + &solana_sdk::pubkey::new_rand(), + // simulate somewhat realistic work load; txes finish at different timings + thread_rng().gen_range(1..10), + genesis_config.hash(), + )); + TransactionWithIndexForBench::new((tx0, index)) + }; + + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + let mut scheduler = + NonblockingScheduler::spawn(pool, context.clone(), WORKER_THREAD_COUNT, SleepyHandler); + + run_scenario_and_finalize(bencher, really_synchronize, &mut scheduler, context, || { + const TX_PER_BATCH: usize = 20; + const SYNCHRONIZATION_PER_BENCH_ITER: usize = 10; + + (0..SYNCHRONIZATION_PER_BENCH_ITER) + .flat_map(|_| { + [ + Step::Batch((0..TX_PER_BATCH).map(create_tx_with_index).collect()), + Step::MaySynchronize, + ] + }) + .collect() + }); + } + + #[bench] + fn bench_random_execution_durations_with_interleaved_synchronization(bencher: &mut Bencher) { + bench_random_execution_durations(bencher, true); + } + + #[bench] + fn bench_random_execution_durations_without_interleaved_synchronization(bencher: &mut Bencher) { + bench_random_execution_durations(bencher, false); + } + + #[derive(Debug, Clone)] + struct SleepyHandlerWithCompletionSignal(crossbeam_channel::Sender); + + impl TaskHandler for SleepyHandlerWithCompletionSignal { + fn create>(_pool: &SchedulerPool) -> Self { + // not needed for bench... + unimplemented!(); + } + + fn handle>( + &self, + _result: &mut Result<()>, + _timings: &mut ExecuteTimings, + _bank: &Arc, + transaction: &SanitizedTransaction, + _index: usize, + _pool: &SchedulerPool, + ) { + let Ok(Transfer { lamports: sleep_ms }) = + bincode::deserialize(&transaction.message().instructions()[0].data) + else { + panic!() + }; + + sleep(Duration::from_millis(sleep_ms)); + + self.0.send(*transaction.signature()).unwrap(); + } + } + + // a wrapper InstallableScheduler to integrate with dep graph scheduling logic + #[derive(Debug)] + struct NonblockingSchedulerWithDepGraph { + inner_scheduler: NonblockingScheduler, + pending_transactions: Mutex>, + completion_receiver: crossbeam_channel::Receiver, + } + + impl InstalledScheduler for NonblockingSchedulerWithDepGraph { + fn id(&self) -> SchedulerId { + self.inner_scheduler.id() + } + + fn context(&self) -> &SchedulingContext { + self.inner_scheduler.context() + } + + fn schedule_execution(&self, transaction_with_index: TransactionWithIndexForBench) { + // just buffer all the txes to work with the dep graph outer loop nicely, which needs + // some buffering to schedule efficiently + // note taht the prompt execution as soon as entering into schedule_execution() isn't + // needed for these particular bench purposes. so, buffering is okay in that regard. + self.pending_transactions + .lock() + .unwrap() + .push(transaction_with_index.0.clone()); + } + + fn wait_for_termination(&mut self, is_dropped: bool) -> Option { + // execute all the pending transactions now! 
+ self.execute_batches( + self.context().bank(), + &std::mem::take(&mut *self.pending_transactions.lock().unwrap()), + &self.completion_receiver, + ) + .unwrap(); + + self.inner_scheduler.wait_for_termination(is_dropped) + } + + /* + fn return_to_pool(self: Box) { + Box::new(self.inner_scheduler).return_to_pool() + } + */ + } + + /* + impl InstallableScheduler for NonblockingSchedulerWithDepGraph { + fn replace_context(&mut self, context: SchedulingContext) { + self.inner_scheduler.replace_context(context) + } + } + */ + + // adapted from https://github.com/jito-foundation/jito-solana/pull/294; retained to be as-is + // as much as possible by the use of some wrapper type hackery. + impl NonblockingSchedulerWithDepGraph { + // for each index, builds a transaction dependency graph of indices that need to execute before + // the current one. + // The returned Vec> is a 1:1 mapping for the indices that need to be executed + // before that index can be executed + fn build_dependency_graph( + tx_account_locks: &[TransactionAccountLocks], + ) -> Vec> { + // build a map whose key is a pubkey + value is a sorted vector of all indices that + // lock that account + let mut indices_read_locking_account = HashMap::new(); + let mut indicies_write_locking_account = HashMap::new(); + tx_account_locks + .iter() + .enumerate() + .for_each(|(idx, tx_account_locks)| { + for account in &tx_account_locks.readonly { + indices_read_locking_account + .entry(**account) + .and_modify(|indices: &mut Vec| indices.push(idx)) + .or_insert_with(|| vec![idx]); + } + for account in &tx_account_locks.writable { + indicies_write_locking_account + .entry(**account) + .and_modify(|indices: &mut Vec| indices.push(idx)) + .or_insert_with(|| vec![idx]); + } + }); + + tx_account_locks + .iter() + .enumerate() + .map(|(idx, account_locks)| { + let mut dep_graph: IntSet = IntSet::default(); + + let readlock_conflict_accs = account_locks.writable.iter(); + let writelock_conflict_accs = account_locks + .readonly + .iter() + .chain(account_locks.writable.iter()); + + for acc in readlock_conflict_accs { + if let Some(indices) = indices_read_locking_account.get(acc) { + dep_graph.extend(indices.iter().take_while(|l_idx| **l_idx < idx)); + } + } + + for acc in writelock_conflict_accs { + if let Some(indices) = indicies_write_locking_account.get(acc) { + dep_graph.extend(indices.iter().take_while(|l_idx| **l_idx < idx)); + } + } + dep_graph + }) + .collect() + } + + fn execute_batches( + &self, + bank: &Arc, + pending_transactions: &[SanitizedTransaction], + receiver: &crossbeam_channel::Receiver, + ) -> Result<()> { + if pending_transactions.is_empty() { + return Ok(()); + } + + let mut tx_account_locks: Vec<_> = Vec::with_capacity(pending_transactions.len()); + for tx in pending_transactions { + tx_account_locks + .push(tx.get_account_locks(bank.get_transaction_account_lock_limit())?); + } + + // the dependency graph contains the indices that must be executed (marked with + // State::Done) before they can be executed + let dependency_graph = Self::build_dependency_graph(&tx_account_locks); + + #[derive(Clone)] + enum State { + Blocked, + Processing, + Done, + } + + let mut processing_states: Vec = vec![State::Blocked; dependency_graph.len()]; + let mut signature_indices: HashMap<&Signature, usize> = + HashMap::with_capacity(dependency_graph.len()); + signature_indices.extend( + pending_transactions + .iter() + .enumerate() + .map(|(idx, tx)| (tx.signature(), idx)), + ); + + loop { + let mut is_done = true; + for idx in 
0..processing_states.len() { + match processing_states[idx] { + State::Blocked => { + is_done = false; + + // if all the dependent txs are executed, this transaction can be + // scheduled for execution. + if dependency_graph[idx] + .iter() + .all(|idx| matches!(processing_states[*idx], State::Done)) + { + self.inner_scheduler.schedule_execution(Arc::new(( + pending_transactions[idx].clone(), + idx, + ))); + // this idx can be scheduled and moved to processing + processing_states[idx] = State::Processing; + } + } + State::Processing => { + is_done = false; + } + State::Done => {} + } + } + + if is_done { + break; + } + + let mut executor_responses: Vec<_> = vec![receiver.recv().unwrap()]; + executor_responses.extend(receiver.try_iter()); + for r in &executor_responses { + processing_states[*signature_indices.get(r).unwrap()] = State::Done; + } + } + Ok(()) + } + } + + // frequent synchronizations hampers efficient (= parallelizable) scheduling of several chunks + // of txes which are tied together for each common account locks. Ideally those independent chunks can be + // executed in parallel, which each is consuming one worker thread as a form of serialized runs + // of processing. However, should synchronizations occurs between boundaries of those chunks + // arrival, it cannot schedule the later-coming one because it firstly flush out the the first + // one + // in other words, this is just a re-manifestation of perf. issue coming from write barriers in + // general. + fn bench_long_serialized_runs(bencher: &mut Bencher, really_synchronize: bool) { + let GenesisConfigInfo { genesis_config, .. } = create_genesis_config(1_000_000_000); + let bank = &Arc::new(Bank::new_for_tests(&genesis_config)); + let (kp1, kp2) = (Keypair::new(), Keypair::new()); + + let create_tx_of_serialized_run1 = || { + let tx0 = + SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &kp1, + &solana_sdk::pubkey::new_rand(), + 10, + genesis_config.hash(), + )); + TransactionWithIndexForBench::new((tx0, 0)) + }; + let create_tx_of_serialized_run2 = || { + let tx0 = + SanitizedTransaction::from_transaction_for_tests(system_transaction::transfer( + &kp2, + &solana_sdk::pubkey::new_rand(), + 10, + genesis_config.hash(), + )); + TransactionWithIndexForBench::new((tx0, 0)) + }; + + let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); + let pool = SchedulerPool::new(None, None, None, ignored_prioritization_fee_cache); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); + let (completion_sender, completion_receiver) = crossbeam_channel::unbounded(); + let handler = SleepyHandlerWithCompletionSignal(completion_sender); + let tx_lock_ignoring_scheduler = + NonblockingScheduler::spawn(pool, context.clone(), WORKER_THREAD_COUNT, handler); + let tx_lock_adhering_scheduler = NonblockingSchedulerWithDepGraph { + inner_scheduler: tx_lock_ignoring_scheduler, + pending_transactions: Mutex::new(Vec::default()), + completion_receiver, + }; + let mut scheduler = tx_lock_adhering_scheduler; + run_scenario_and_finalize(bencher, really_synchronize, &mut scheduler, context, || { + (0..1) + .flat_map(|_| { + [ + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::Batch(vec![create_tx_of_serialized_run1()]), + Step::MaySynchronize, + Step::Batch(vec![create_tx_of_serialized_run2()]), + Step::Batch(vec![create_tx_of_serialized_run2()]), + 
Step::Batch(vec![create_tx_of_serialized_run2()]), + Step::Batch(vec![create_tx_of_serialized_run2()]), + Step::MaySynchronize, + ] + }) + .collect() + }); + } + + #[bench] + fn bench_long_serialized_runs_with_interleaved_synchronization(bencher: &mut Bencher) { + bench_long_serialized_runs(bencher, true); + } + + #[bench] + fn bench_long_serialized_runs_without_interleaved_synchronization(bencher: &mut Bencher) { + bench_long_serialized_runs(bencher, false); + } +} diff --git a/unified-scheduler-pool/src/lib.rs b/unified-scheduler-pool/src/lib.rs index 09ded82ee88e7d..b38a3cd329b2fb 100644 --- a/unified-scheduler-pool/src/lib.rs +++ b/unified-scheduler-pool/src/lib.rs @@ -8,35 +8,51 @@ //! and commits any side-effects (i.e. on-chain state changes) into the associated `Bank` via //! `solana-ledger`'s helper function called `execute_batch()`. +#[cfg(feature = "dev-context-only-utils")] +use qualifier_attr::qualifiers; use { assert_matches::assert_matches, - crossbeam_channel::{select, unbounded, Receiver, SendError, Sender}, + cpu_time::ThreadTime, + crossbeam_channel::{ + bounded, disconnected, never, select_biased, unbounded, Receiver, RecvError, + RecvTimeoutError, SendError, Sender, TryRecvError, + }, + dashmap::DashMap, derivative::Derivative, log::*, solana_ledger::blockstore_processor::{ execute_batch, TransactionBatchWithIndexes, TransactionStatusSender, }, + solana_measure::measure::Measure, + solana_metrics::datapoint_info_at, solana_program_runtime::timings::ExecuteTimings, solana_runtime::{ bank::Bank, + compute_budget_details::GetComputeBudgetDetails, installed_scheduler_pool::{ - InstalledScheduler, InstalledSchedulerBox, InstalledSchedulerPool, - InstalledSchedulerPoolArc, ResultWithTimings, SchedulerId, SchedulingContext, - UninstalledScheduler, UninstalledSchedulerBox, + DefaultScheduleExecutionArg, InstalledScheduler, InstalledSchedulerPool, + InstalledSchedulerPoolArc, ResultWithTimings, ScheduleExecutionArg, SchedulerId, + SchedulingContext, UninstalledScheduler, UninstalledSchedulerBox, + WithTransactionAndIndex, }, prioritization_fee_cache::PrioritizationFeeCache, }, - solana_sdk::transaction::{Result, SanitizedTransaction}, - solana_unified_scheduler_logic::Task, + solana_sdk::{ + clock::Slot, + pubkey::Pubkey, + transaction::{Result, SanitizedTransaction, TransactionError}, + }, + solana_unified_scheduler_logic::{Page, SchedulingStateMachine, Task}, solana_vote::vote_sender_types::ReplayVoteSender, std::{ + env, fmt::Debug, - marker::PhantomData, sync::{ atomic::{AtomicU64, Ordering::Relaxed}, - Arc, Mutex, OnceLock, Weak, + Arc, Mutex, OnceLock, RwLock, RwLockReadGuard, Weak, }, thread::{self, JoinHandle}, + time::{Duration, Instant, SystemTime}, }, }; @@ -46,7 +62,11 @@ type AtomicSchedulerId = AtomicU64; // contains some internal fields, whose types aren't available in solana-runtime (currently // TransactionStatusSender; also, PohRecorder in the future)... #[derive(Debug)] -pub struct SchedulerPool, TH: TaskHandler> { +pub struct SchedulerPool< + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +> { scheduler_inners: Mutex>, handler_count: usize, handler_context: HandlerContext, @@ -62,7 +82,11 @@ pub struct SchedulerPool, TH: TaskHandler> { // memory increase. weak_self: Weak, next_scheduler_id: AtomicSchedulerId, - _phantom: PhantomData, + // prune schedulers, stop idling scheduler's threads, sanity check on the + // address book after scheduler is returned. 
+ cleaner_sender: Sender>>>, + cleaner_exit_signal_sender: Sender<()>, + cleaner_thread: Mutex>>, } #[derive(Debug)] @@ -73,16 +97,121 @@ pub struct HandlerContext { prioritization_fee_cache: Arc, } -pub type DefaultSchedulerPool = - SchedulerPool, DefaultTaskHandler>; +pub type DefaultSchedulerPool = SchedulerPool< + PooledScheduler, + DefaultTaskHandler, + DefaultScheduleExecutionArg, +>; + +struct WatchedThreadManager +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + thread_manager: Weak>>, + #[cfg(target_os = "linux")] + tick: u64, + #[cfg(target_os = "linux")] + updated_at: Instant, +} -impl SchedulerPool +impl WatchedThreadManager where - S: SpawnableScheduler, - TH: TaskHandler, + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn new(thread_manager: Weak>>) -> Self { + Self { + thread_manager, + #[cfg(target_os = "linux")] + tick: 0, + #[cfg(target_os = "linux")] + updated_at: Instant::now(), + } + } + + fn retire_if_stale(&mut self) -> bool { + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + let Some(thread_manager) = self.thread_manager.upgrade() else { + return false; + }; + + // The following linux-only code implements an eager native thread reclaiming, which is + // only useful if the solana-validator sees many unrooted forks. Such hostile situations + // should NEVER happen on remotely-uncontrollable ledgers created by solana-test-validator. + // And it's generally not expected mainnet-beta validators (or any live clusters for that + // matter) to be run on non-linux OSes at all. + // + // Thus, this OS-specific implementation can be justified because this enables the hot-path + // (the scheduler main thread) to omit VDSO calls and timed-out futex syscalls by relying on + // this out-of-bound cleaner for a defensive thread reclaiming. + #[cfg(target_os = "linux")] + { + let Some(tid) = thread_manager.read().unwrap().active_tid_if_not_primary() else { + self.tick = 0; + self.updated_at = Instant::now(); + return true; + }; + + let pid = std::process::id(); + let task = procfs::process::Process::new(pid.try_into().unwrap()) + .unwrap() + .task_from_tid(tid) + .unwrap(); + let stat = task.stat().unwrap(); + let current_tick = stat.utime.checked_add(stat.stime).unwrap(); + if current_tick > self.tick { + self.tick = current_tick; + self.updated_at = Instant::now(); + } else { + // 5x of 400ms block time + const IDLE_DURATION_FOR_EAGER_THREAD_RECLAIM: Duration = Duration::from_secs(2); + + let elapsed = self.updated_at.elapsed(); + if elapsed > IDLE_DURATION_FOR_EAGER_THREAD_RECLAIM { + const BITS_PER_HEX_DIGIT: usize = 4; + let thread_manager = &mut thread_manager.write().unwrap(); + info!( + "[sch_{:0width$x}]: cleaner: retire_if_stale(): stopping thread manager ({tid}/{} <= {}/{:?})...", + thread_manager.scheduler_id, + current_tick, + self.tick, + elapsed, + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + thread_manager.suspend(); + self.tick = 0; + self.updated_at = Instant::now(); + } + } + } + + true + } +} + +impl Drop for SchedulerPool +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn drop(&mut self) { + info!("SchedulerPool::drop() is successfully called"); + } +} + +impl SchedulerPool +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, { // Some internal impl and test code want an actual concrete type, NOT the // `dyn InstalledSchedulerPool`. So don't merge this into `Self::new_dyn()`. 
+ #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] fn new( handler_count: Option, log_messages_bytes_limit: Option, @@ -90,12 +219,66 @@ where replay_vote_sender: Option, prioritization_fee_cache: Arc, ) -> Arc { - let handler_count = handler_count.unwrap_or(1); - // we're hard-coding the number of handler thread to 1, meaning this impl is currently - // single-threaded still. - assert_eq!(handler_count, 1); // replace this with assert!(handler_count >= 1) later + let handler_count = handler_count.unwrap_or(Self::default_handler_count()); + assert!(handler_count >= 1); + + let (scheduler_pool_sender, scheduler_pool_receiver) = bounded(1); + let (cleaner_sender, cleaner_receiver) = unbounded(); + let (cleaner_exit_signal_sender, cleaner_exit_signal_receiver) = unbounded(); + + let cleaner_main_loop = || { + move || { + let scheduler_pool: Arc = scheduler_pool_receiver.recv().unwrap(); + drop(scheduler_pool_receiver); + + let mut thread_managers: Vec> = vec![]; + + 'outer: loop { + let mut schedulers = scheduler_pool.scheduler_inners.lock().unwrap(); + let schedulers_len_pre_retain = schedulers.len(); + schedulers.retain_mut(|scheduler| scheduler.retire_if_stale()); + let schedulers_len_post_retain = schedulers.len(); + drop(schedulers); + + let thread_manager_len_pre_retain = thread_managers.len(); + thread_managers.retain_mut(|thread_manager| thread_manager.retire_if_stale()); + + let thread_manager_len_pre_push = thread_managers.len(); + 'inner: loop { + match cleaner_receiver.try_recv() { + Ok(thread_manager) => { + thread_managers.push(WatchedThreadManager::new(thread_manager)) + } + Err(TryRecvError::Disconnected) => break 'outer, + Err(TryRecvError::Empty) => break 'inner, + } + } + + info!( + "cleaner: unused schedulers in the pool: {} => {}, all thread managers: {} => {} => {}", + schedulers_len_pre_retain, + schedulers_len_post_retain, + thread_manager_len_pre_retain, + thread_manager_len_pre_push, + thread_managers.len(), + ); + // wait for signal with timeout here instead of recv_timeout() to write all the + // preceeding logs at once. 
+ match cleaner_exit_signal_receiver.recv_timeout(Duration::from_secs(1)) { + Ok(()) | Err(RecvTimeoutError::Disconnected) => break 'outer, + Err(RecvTimeoutError::Timeout) => continue, + } + } + info!("cleaner thread terminating!"); + } + }; - Arc::new_cyclic(|weak_self| Self { + let cleaner_thread = thread::Builder::new() + .name("solScCleaner".to_owned()) + .spawn(cleaner_main_loop()) + .unwrap(); + + let scheduler_pool = Arc::new_cyclic(|weak_self| Self { scheduler_inners: Mutex::default(), handler_count, handler_context: HandlerContext { @@ -105,9 +288,13 @@ where prioritization_fee_cache, }, weak_self: weak_self.clone(), - next_scheduler_id: AtomicSchedulerId::default(), - _phantom: PhantomData, - }) + next_scheduler_id: AtomicSchedulerId::new(PRIMARY_SCHEDULER_ID), + cleaner_thread: Mutex::new(Some(cleaner_thread)), + cleaner_sender, + cleaner_exit_signal_sender, + }); + scheduler_pool_sender.send(scheduler_pool.clone()).unwrap(); + scheduler_pool } // This apparently-meaningless wrapper is handy, because some callers explicitly want @@ -118,7 +305,7 @@ where transaction_status_sender: Option, replay_vote_sender: Option, prioritization_fee_cache: Arc, - ) -> InstalledSchedulerPoolArc { + ) -> InstalledSchedulerPoolArc { Self::new( handler_count, log_messages_bytes_limit, @@ -146,16 +333,21 @@ where .push(scheduler); } + #[cfg_attr(feature = "dev-context-only-utils", qualifiers(pub))] fn do_take_scheduler(&self, context: SchedulingContext) -> S { // pop is intentional for filo, expecting relatively warmed-up scheduler due to having been // returned recently - if let Some(inner) = self.scheduler_inners.lock().expect("not poisoned").pop() { - S::from_inner(inner, context) + if let Some(pooled_inner) = self.scheduler_inners.lock().expect("not poisoned").pop() { + S::from_inner(pooled_inner, context) } else { - S::spawn(self.self_arc(), context) + S::spawn(self.self_arc(), context, TH::create(self)) } } + fn register_to_cleaner(&self, thread_manager: Weak>>) { + self.cleaner_sender.send(thread_manager).unwrap(); + } + pub fn default_handler_count() -> usize { Self::calculate_default_handler_count( thread::available_parallelism() @@ -188,18 +380,41 @@ where } } -impl InstalledSchedulerPool for SchedulerPool +impl InstalledSchedulerPool for SchedulerPool where - S: SpawnableScheduler, - TH: TaskHandler, + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, { - fn take_scheduler(&self, context: SchedulingContext) -> InstalledSchedulerBox { + fn take_scheduler(&self, context: SchedulingContext) -> Box> { Box::new(self.do_take_scheduler(context)) } + + fn uninstalled_from_bank_forks(self: Arc) { + self.scheduler_inners.lock().unwrap().clear(); + self.cleaner_exit_signal_sender.send(()).unwrap(); + let () = self + .cleaner_thread + .lock() + .unwrap() + .take() + .unwrap() + .join() + .unwrap(); + info!( + "SchedulerPool::uninstalled_from_bank_forks(): joined cleaner thread at {:?}...", + thread::current() + ); + } } -pub trait TaskHandler: Send + Sync + Debug + Sized + 'static { +pub trait TaskHandler: + Send + Sync + Debug + Sized + Clone + 'static +{ + fn create>(pool: &SchedulerPool) -> Self; + fn handle( + &self, result: &mut Result<()>, timings: &mut ExecuteTimings, bank: &Arc, @@ -209,11 +424,16 @@ pub trait TaskHandler: Send + Sync + Debug + Sized + 'static { ); } -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct DefaultTaskHandler; -impl TaskHandler for DefaultTaskHandler { +impl TaskHandler for DefaultTaskHandler { + fn create>(_pool: &SchedulerPool) -> Self { 
+ Self + } + fn handle( + &self, result: &mut Result<()>, timings: &mut ExecuteTimings, bank: &Arc, @@ -244,15 +464,31 @@ impl TaskHandler for DefaultTaskHandler { struct ExecutedTask { task: Task, result_with_timings: ResultWithTimings, + slot: Slot, + thx: usize, + handler_timings: Option, +} + +pub struct HandlerTimings { + finish_time: SystemTime, + execution_us: u64, + execution_cpu_us: u128, } impl ExecutedTask { - fn new_boxed(task: Task) -> Box { + fn new_boxed(task: Task, thx: usize, slot: Slot) -> Box { Box::new(Self { task, result_with_timings: initialized_result_with_timings(), + slot, + thx, + handler_timings: None, }) } + + fn is_err(&self) -> bool { + self.result_with_timings.0.is_err() + } } // A very tiny generic message type to signal about opening and closing of subchannels, which are @@ -268,6 +504,7 @@ enum SubchanneledPayload { } type NewTaskPayload = SubchanneledPayload; +type ExecutedTaskPayload = SubchanneledPayload, ()>; // A tiny generic message type to synchronize multiple threads everytime some contextual data needs // to be switched (ie. SchedulingContext), just using a single communication channel. @@ -336,6 +573,10 @@ mod chained_channel { self.sender = chained_sender; Ok(()) } + + pub(super) fn len(&self) -> usize { + self.sender.len() + } } // P doesn't need to be `: Clone`, yet rustc derive can't handle it. @@ -386,55 +627,198 @@ mod chained_channel { } } +#[derive(Default, Debug)] +pub struct AddressBook { + book: DashMap, +} + +impl AddressBook { + pub fn load(&self, address: Pubkey) -> Page { + self.book.entry(address).or_default().clone() + } + + pub fn page_count(&self) -> usize { + self.book.len() + } + + pub fn clear(&self) { + self.book.clear(); + } +} + fn initialized_result_with_timings() -> ResultWithTimings { (Ok(()), ExecuteTimings::default()) } -// Currently, simplest possible implementation (i.e. single-threaded) -// this will be replaced with more proper implementation... -// not usable at all, especially for mainnet-beta #[derive(Debug)] -pub struct PooledScheduler { - inner: PooledSchedulerInner, +pub struct PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + inner: PooledSchedulerInner, context: SchedulingContext, } #[derive(Debug)] -pub struct PooledSchedulerInner, TH: TaskHandler> { - thread_manager: ThreadManager, +pub struct PooledSchedulerInner +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + thread_manager: Arc>>, + address_book: AddressBook, + pooled_at: Instant, +} + +impl PooledSchedulerInner +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn pooled_since(&self) -> Duration { + self.pooled_at.elapsed() + } + + fn suspend_thread_manager(&mut self) { + debug!("suspend_thread_manager()"); + self.thread_manager.write().unwrap().suspend(); + } + + fn id(&self) -> SchedulerId { + self.thread_manager.read().unwrap().scheduler_id + } } +type Tid = i32; +// The linux's tid (essentially is in the pid name space) is guaranteed to be non-zero; so +// using 0 for special purpose at user-land is totally safe. +#[cfg_attr(target_os = "linux", allow(dead_code))] +const DUMMY_TID: Tid = 0; + +#[derive(Default)] +struct LogInterval(usize); + +impl LogInterval { + fn increment(&mut self) -> bool { + let should_log = self.0 % 1000 == 0; + self.0 = self.0.checked_add(1).unwrap(); + should_log + } +} + +const PRIMARY_SCHEDULER_ID: SchedulerId = 0; + // This type manages the OS threads for scheduling and executing transactions. 
The term // `session` is consistently used to mean a group of Tasks scoped under a single SchedulingContext. // This is equivalent to a particular bank for block verification. However, new terms is introduced // here to mean some continuous time over multiple continuous banks/slots for the block production, // which is planned to be implemented in the future. #[derive(Debug)] -struct ThreadManager, TH: TaskHandler> { +struct ThreadManager +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ scheduler_id: SchedulerId, - pool: Arc>, + pool: Arc>, + handler: TH, new_task_sender: Sender, - new_task_receiver: Receiver, + new_task_receiver: Option>, session_result_sender: Sender>, session_result_receiver: Receiver>, session_result_with_timings: Option, - scheduler_thread: Option>, + scheduler_thread_and_tid: Option<(JoinHandle>, Tid)>, handler_threads: Vec>, + accumulator_thread: Option>, } -impl PooledScheduler { - fn do_spawn(pool: Arc>, initial_context: SchedulingContext) -> Self { +impl PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn do_spawn( + pool: Arc>, + initial_context: SchedulingContext, + handler: TH, + ) -> Self { Self::from_inner( - PooledSchedulerInner:: { - thread_manager: ThreadManager::new(pool), + PooledSchedulerInner { + thread_manager: Arc::new(RwLock::new(ThreadManager::new(pool.clone(), handler))), + address_book: AddressBook::default(), + pooled_at: Instant::now(), }, initial_context, ) } + + #[cfg(feature = "dev-context-only-utils")] + pub fn clear_session_result_with_timings(&mut self) { + assert_matches!( + self.inner + .thread_manager + .write() + .unwrap() + .take_session_result_with_timings(), + (Ok(_), _) + ); + } + + #[cfg(feature = "dev-context-only-utils")] + pub fn restart_session(&mut self) { + self.inner + .thread_manager + .write() + .unwrap() + .start_session(&self.context); + } + + #[cfg(feature = "dev-context-only-utils")] + pub fn schedule_task(&self, task: Task) { + self.inner.thread_manager.read().unwrap().send_task(task); + } + + fn ensure_thread_manager_resumed( + &self, + context: &SchedulingContext, + ) -> std::result::Result>, TransactionError> + { + let mut was_already_active = false; + loop { + let read = self.inner.thread_manager.read().unwrap(); + if !read.is_suspended() { + debug!( + "{}", + if was_already_active { + "ensure_thread_manager_resumed(): was already active." + } else { + "ensure_thread_manager_resumed(): wasn't already active..." 
+ } + ); + return Ok(read); + } else { + debug!("ensure_thread_manager_resumed(): will start threads..."); + drop(read); + let mut write = self.inner.thread_manager.write().unwrap(); + write.start_or_try_resume_threads(context)?; + drop(write); + was_already_active = false; + } + } + } } -impl, TH: TaskHandler> ThreadManager { - fn new(pool: Arc>) -> Self { +impl ThreadManager +where + S: SpawnableScheduler, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn new(pool: Arc>, handler: TH) -> Self { let (new_task_sender, new_task_receiver) = unbounded(); let (session_result_sender, session_result_receiver) = unbounded(); let handler_count = pool.handler_count; @@ -442,23 +826,40 @@ impl, TH: TaskHandler> ThreadManager { Self { scheduler_id: pool.new_scheduler_id(), pool, + handler, new_task_sender, - new_task_receiver, + new_task_receiver: Some(new_task_receiver), session_result_sender, session_result_receiver, session_result_with_timings: None, - scheduler_thread: None, + scheduler_thread_and_tid: None, handler_threads: Vec::with_capacity(handler_count), + accumulator_thread: None, } } + fn is_suspended(&self) -> bool { + self.scheduler_thread_and_tid.is_none() + } + + pub fn take_scheduler_thread(&mut self) -> Option>> { + self.scheduler_thread_and_tid + .take() + .map(|(thread, _tid)| thread) + } + fn execute_task_with_handler( + handler: &TH, bank: &Arc, executed_task: &mut Box, handler_context: &HandlerContext, + send_metrics: bool, ) { + let handler_timings = + send_metrics.then_some((Measure::start("process_message_time"), ThreadTime::now())); debug!("handling task at {:?}", thread::current()); TH::handle( + handler, &mut executed_task.result_with_timings.0, &mut executed_task.result_with_timings.1, bank, @@ -466,30 +867,77 @@ impl, TH: TaskHandler> ThreadManager { executed_task.task.task_index(), handler_context, ); + if let Some((mut wall_time, cpu_time)) = handler_timings { + executed_task.handler_timings = Some(HandlerTimings { + finish_time: SystemTime::now(), + execution_cpu_us: cpu_time.elapsed().as_micros(), + execution_us: { + // make wall time is longer than cpu time, always + wall_time.stop(); + wall_time.as_us() + }, + }); + } } fn accumulate_result_with_timings( - (result, timings): &mut ResultWithTimings, + (_result, timings): &mut ResultWithTimings, executed_task: Box, ) { - match executed_task.result_with_timings.0 { - Ok(()) => {} - Err(error) => { - error!("error is detected while accumulating....: {error:?}"); - // Override errors intentionally for simplicity, not retaining the - // first error unlike the block verification in the - // blockstore_processor. This will be addressed with more - // full-fledged impl later. 
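The numbered comment above summarizes the shape of the pipeline: the scheduler thread dispatches runnable tasks to handler threads, and each handler replies back with an executed task. A minimal self-contained sketch of that shape using plain std::thread and crossbeam_channel (DummyTask and the single handler lane are illustrative stand-ins, not the crate's real Task type or handler pool):

use {crossbeam_channel::unbounded, std::thread};

// Stand-in for the crate's Task type; it only carries an index here.
#[derive(Debug)]
struct DummyTask(usize);

fn main() {
    // scheduler -> handler: runnable tasks; handler -> scheduler: executed tasks
    let (runnable_sender, runnable_receiver) = unbounded::<DummyTask>();
    let (executed_sender, executed_receiver) = unbounded::<DummyTask>();

    // handler thread: process each dispatched task and reply back as an executed task
    let handler_thread = thread::spawn(move || {
        for task in runnable_receiver {
            // the real handler would run execute_batch() here and record timings
            executed_sender.send(task).unwrap();
        }
        // executed_sender is dropped here, ending the scheduler's receive loop below
    });

    // scheduler side: dispatch runnable tasks, then close the channel to end the session
    for index in 0..3 {
        runnable_sender.send(DummyTask(index)).unwrap();
    }
    drop(runnable_sender);

    // scheduler side: deschedule executed tasks and accumulate their results/timings
    for executed in executed_receiver {
        println!("executed task {}", executed.0);
    }
    handler_thread.join().unwrap();
}

In the crate itself the task channel is the chained channel seen earlier, which additionally lets the scheduler rotate in a new SchedulingContext and a fresh channel pair between sessions instead of simply dropping the sender.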
- *result = Err(error); - } - } + assert_matches!(executed_task.result_with_timings.0, Ok(())); timings.accumulate(&executed_task.result_with_timings.1); + + if let Some(handler_timings) = &executed_task.handler_timings { + let thread = format!("solScExLane{:02}", executed_task.thx); + let signature = executed_task.task.transaction().signature().to_string(); + let account_locks_in_json = serde_json::to_string( + &executed_task + .task + .transaction() + .get_account_locks_unchecked(), + ) + .unwrap(); + let status = format!("{:?}", executed_task.result_with_timings.0); + let compute_unit_price = executed_task + .task + .transaction() + .get_compute_budget_details(false) + .map(|d| d.compute_unit_price) + .unwrap_or_default(); + + datapoint_info_at!( + handler_timings.finish_time, + "transaction_timings", + ("slot", executed_task.slot, i64), + ("index", executed_task.task.task_index(), i64), + ("thread", thread, String), + ("signature", signature, String), + ("account_locks_in_json", account_locks_in_json, String), + ("status", status, String), + ("duration", handler_timings.execution_us, i64), + ("cpu_duration", handler_timings.execution_cpu_us, i64), + ("compute_units", 0 /*task.cu*/, i64), + ("priority", compute_unit_price, i64), // old name is kept for compat... + ); + } + + drop(executed_task); } fn take_session_result_with_timings(&mut self) -> ResultWithTimings { self.session_result_with_timings.take().unwrap() } + fn reset_session_on_error(&mut self) -> Result<()> { + let err = self + .session_result_with_timings + .replace(initialized_result_with_timings()) + .unwrap() + .0; + assert_matches!(err, Err(_)); + err + } + fn put_session_result_with_timings(&mut self, result_with_timings: ResultWithTimings) { assert_matches!( self.session_result_with_timings @@ -498,11 +946,38 @@ impl, TH: TaskHandler> ThreadManager { ); } - fn start_threads(&mut self, context: &SchedulingContext) { - let (mut runnable_task_sender, runnable_task_receiver) = + fn start_or_try_resume_threads(&mut self, context: &SchedulingContext) -> Result<()> { + if !self.is_suspended() { + // this can't be promoted to panic! as read => write upgrade isn't completely + // race-free in ensure_thread_manager_resumed()... + warn!("try_resume(): already resumed"); + return Ok(()); + } else if self + .session_result_with_timings + .as_ref() + .map(|(result, _)| result.is_err()) + .unwrap_or(false) + { + warn!("try_resume(): skipping resuming due to err, while resetting session result"); + return self.reset_session_on_error(); + } + debug!("try_resume(): doing now"); + + let send_metrics = env::var("SOLANA_TRANSACTION_TIMINGS").is_ok(); + + let (mut blocked_task_sender, blocked_task_receiver) = chained_channel::unbounded::(context.clone()); + let (idle_task_sender, idle_task_receiver) = unbounded::(); let (finished_task_sender, finished_task_receiver) = unbounded::>(); - + let (finished_idle_task_sender, finished_idle_task_receiver) = + unbounded::>(); + let (executed_task_sender, executed_task_receiver) = unbounded::(); + let (accumulated_result_sender, accumulated_result_receiver) = + unbounded::>(); + + let scheduler_id = self.scheduler_id; + let mut slot = context.bank().slot(); + let (tid_sender, tid_receiver) = bounded(1); let mut result_with_timings = self.session_result_with_timings.take(); // High-level flow of new tasks: @@ -512,13 +987,16 @@ impl, TH: TaskHandler> ThreadManager { // 4. the handler thread processes the dispatched task. // 5. the handler thread reply back to the scheduler thread as an executed task. 
// 6. the scheduler thread post-processes the executed task. + // 7. the scheduler thread send the executed task to the accumulator thread. + // 8. the accumulator thread examines the executed task's result and accumulate its timing, + // finally dropping the transaction inside the executed task. let scheduler_main_loop = || { let handler_count = self.pool.handler_count; let session_result_sender = self.session_result_sender.clone(); - let new_task_receiver = self.new_task_receiver.clone(); + let mut new_task_receiver = self.new_task_receiver.take().unwrap(); let mut session_ending = false; - let mut active_task_count: usize = 0; + let mut thread_suspending = false; // Now, this is the main loop for the scheduler thread, which is a special beast. // @@ -558,95 +1036,274 @@ impl, TH: TaskHandler> ThreadManager { // cycles out of the scheduler thread. Thus, any kinds of unessential overhead sources // like syscalls, VDSO, and even memory (de)allocation should be avoided at all costs // by design or by means of offloading at the last resort. - move || loop { - let mut is_finished = false; - while !is_finished { - select! { - recv(finished_task_receiver) -> executed_task => { - let executed_task = executed_task.unwrap(); - - active_task_count = active_task_count.checked_sub(1).unwrap(); - let result_with_timings = result_with_timings.as_mut().unwrap(); - Self::accumulate_result_with_timings(result_with_timings, executed_task); - }, - recv(new_task_receiver) -> message => { - assert!(!session_ending); - - match message.unwrap() { - NewTaskPayload::Payload(task) => { - // so, we're NOT scheduling at all here; rather, just execute - // tx straight off. the inter-tx locking deps aren't needed to - // be resolved in the case of single-threaded FIFO like this. - runnable_task_sender + move || { + const BITS_PER_HEX_DIGIT: usize = 4; + let mut state_machine = unsafe { + SchedulingStateMachine::exclusively_initialize_current_thread_for_scheduling() + }; + let mut log_interval = LogInterval::default(); + // hint compiler about inline[never] and unlikely? + macro_rules! log_scheduler { + ($prefix:tt) => { + info!( + "[sch_{:0width$x}]: slot: {}[{:12}]({}{}): state_machine(({}(+{})=>{})/{}|{}) channels(<{} >{}+{} <{}+{})", + scheduler_id, slot, + (if ($prefix) == "step" { "interval" } else { $prefix }), + (if session_ending {"S"} else {"-"}), (if thread_suspending {"T"} else {"-"}), + state_machine.active_task_count(), state_machine.unblocked_task_queue_count(), state_machine.handled_task_count(), + state_machine.total_task_count(), + state_machine.unblocked_task_count(), + new_task_receiver.len(), + blocked_task_sender.len(), idle_task_sender.len(), + finished_task_receiver.len(), finished_idle_task_receiver.len(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + }; + } + + trace!("solScheduler thread is running at: {:?}", thread::current()); + tid_sender + .send({ + #[cfg(not(target_os = "linux"))] + let tid = DUMMY_TID; + #[cfg(target_os = "linux")] + let tid = rustix::thread::gettid().as_raw_nonzero().get(); + tid + }) + .unwrap(); + let (do_now, dont_now) = (&disconnected::<()>(), &never::<()>()); + log_scheduler!("S+T:started"); + + while !thread_suspending { + let mut is_finished = false; + while !is_finished { + let state_change = select_biased! { + recv(finished_task_receiver) -> executed_task => { + let executed_task = executed_task.unwrap(); + if executed_task.is_err() { + log_scheduler!("S+T:aborted"); + // MUST: clear the addressbook before reusing this scheduler + // ... 
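// Illustrative sketch, not part of the patch above: the thread/channel topology described by
// the numbered flow (new task -> scheduler -> handler -> scheduler -> accumulator), reduced to
// plain crossbeam channels and u64 "tasks". All names are hypothetical; the real code also
// routes scheduler-to-handler traffic through the chained_channel wrapper so handlers can
// switch scheduling contexts.
use crossbeam_channel::unbounded;
use std::thread;

fn pipeline_sketch() {
    let (new_task_tx, new_task_rx) = unbounded::<u64>(); // tasks enter here
    let (runnable_tx, runnable_rx) = unbounded::<u64>(); // scheduler -> handlers
    let (finished_tx, finished_rx) = unbounded::<u64>(); // handlers -> scheduler
    let (executed_tx, executed_rx) = unbounded::<u64>(); // scheduler -> accumulator

    let scheduler = thread::spawn(move || {
        // Dispatch runnable tasks, then post-process and forward each finished one.
        for task in new_task_rx {
            runnable_tx.send(task).unwrap();
            let done = finished_rx.recv().unwrap();
            executed_tx.send(done).unwrap();
        }
    });
    let handler = thread::spawn(move || {
        // "Execute" each dispatched task and reply back to the scheduler.
        for task in runnable_rx {
            finished_tx.send(task * 2).unwrap();
        }
    });
    let accumulator = thread::spawn(move || {
        // Fold results; dropping the task payload happens off the scheduler thread.
        executed_rx.iter().sum::<u64>()
    });

    for task in 1..=3 {
        new_task_tx.send(task).unwrap();
    }
    drop(new_task_tx); // the hang-up propagates through the pipeline and ends every loop below
    scheduler.join().unwrap();
    handler.join().unwrap();
    assert_eq!(accumulator.join().unwrap(), 12);
}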
+ session_result_sender.send(None).unwrap(); + // be explicit about specifically dropping this receiver + drop(new_task_receiver); + // this timings aren't for the accumulated one. but + // caller doesn't care. + return Some(executed_task.result_with_timings); + } else { + state_machine.deschedule_task(&executed_task.task); + executed_task_sender.send_buffered(ExecutedTaskPayload::Payload(executed_task)).unwrap(); + } + "step" + }, + recv(if state_machine.has_unblocked_task() { do_now } else { dont_now }) -> dummy_result => { + assert_matches!(dummy_result, Err(RecvError)); + + if let Some(task) = state_machine.schedule_unblocked_task() { + blocked_task_sender .send_payload(task) .unwrap(); - active_task_count = active_task_count.checked_add(1).unwrap(); } - NewTaskPayload::OpenSubchannel(context) => { - // signal about new SchedulingContext to handler threads - runnable_task_sender - .send_chained_channel(context, handler_count) - .unwrap(); - assert_matches!( - result_with_timings.replace(initialized_result_with_timings()), - None - ); + "step" + }, + recv(new_task_receiver) -> message => { + assert!(message.is_err() || (!session_ending && !thread_suspending)); + match message { + Ok(NewTaskPayload::Payload(task)) => { + if let Some(task) = state_machine.schedule_task(task) { + idle_task_sender.send(task).unwrap(); + } + "step" + } + Ok(NewTaskPayload::OpenSubchannel(context)) => { + slot = context.bank().slot(); + blocked_task_sender + .send_chained_channel(context, handler_count) + .unwrap(); + executed_task_sender + .send(ExecutedTaskPayload::OpenSubchannel(())) + .unwrap(); + "S:started" + } + Ok(NewTaskPayload::CloseSubchannel) => { + session_ending = true; + "S:ending" + } + Err(_) => { + assert!(!thread_suspending); + thread_suspending = true; + + // Err(_) on new_task_receiver guarantees + // that there's no live sender and no messages to be + // received anymore; so dropping by overriding it with + // never() should pose no possibility of missed messages. + new_task_receiver = never(); + + "T:suspending" + } } - NewTaskPayload::CloseSubchannel => { - session_ending = true; + }, + recv(finished_idle_task_receiver) -> executed_task => { + let executed_task = executed_task.unwrap(); + if executed_task.is_err() { + log_scheduler!("S+T:aborted"); + session_result_sender.send(None).unwrap(); + // be explicit about specifically dropping this receiver + drop(new_task_receiver); + // this timings aren't for the accumulated one. but + // caller doesn't care. + return Some(executed_task.result_with_timings); + } else { + state_machine.deschedule_task(&executed_task.task); + executed_task_sender.send_buffered(ExecutedTaskPayload::Payload(executed_task)).unwrap(); } - } - }, - }; + "step" + }, + }; + if state_change != "step" || log_interval.increment() { + log_scheduler!(state_change); + } - // a really simplistic termination condition, which only works under the - // assumption of single handler thread... 
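// Illustrative sketch, not part of the patch above: the `do_now` / `dont_now` trick turns a
// select arm on or off by pointing it at a receiver that is either permanently ready
// (disconnected) or permanently silent (`never()`). The `disconnected()` helper used above is
// not part of stock crossbeam-channel; an equivalent receiver can be built by dropping the
// sender, as below. The real loop uses a biased select for strict arm priority; plain `select!`
// is enough to show the mechanism.
use crossbeam_channel::{never, select, unbounded, Receiver};

fn disconnected_receiver() -> Receiver<()> {
    let (sender, receiver) = unbounded();
    drop(sender); // every recv() on `receiver` now fails immediately with RecvError
    receiver
}

fn next_item(buffered: &mut Vec<u64>, incoming: &Receiver<u64>) -> Option<u64> {
    let do_now = disconnected_receiver();
    let dont_now = never::<()>();
    select! {
        // Only selectable while there is buffered work; otherwise this arm waits on `never()`
        // forever and the `incoming` arm gets full attention.
        recv(if !buffered.is_empty() { &do_now } else { &dont_now }) -> dummy => {
            assert!(dummy.is_err());
            buffered.pop()
        },
        recv(incoming) -> item => item.ok(),
    }
}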
- is_finished = session_ending && active_task_count == 0; + is_finished = state_machine.has_no_active_task() + && (session_ending || thread_suspending); + } + + if session_ending { + log_scheduler!("S:ended"); + state_machine.reinitialize(); + log_interval = LogInterval::default(); + executed_task_sender + .send(ExecutedTaskPayload::CloseSubchannel) + .unwrap(); + session_result_sender + .send(Some( + accumulated_result_receiver + .recv() + .unwrap() + .unwrap_or_else(initialized_result_with_timings), + )) + .unwrap(); + if !thread_suspending { + session_ending = false; + } + } } - if session_ending { - session_result_sender - .send(Some( - result_with_timings - .take() - .unwrap_or_else(initialized_result_with_timings), - )) + log_scheduler!("T:suspended"); + let result_with_timings = if session_ending { + None + } else { + executed_task_sender + .send(ExecutedTaskPayload::CloseSubchannel) .unwrap(); - session_ending = false; - } + accumulated_result_receiver.recv().unwrap() + }; + trace!( + "solScheduler thread is terminating at: {:?}", + thread::current() + ); + result_with_timings } }; - let handler_main_loop = || { + let handler_main_loop = |thx| { let pool = self.pool.clone(); - let mut runnable_task_receiver = runnable_task_receiver.clone(); + let handler = self.handler.clone(); + let mut blocked_task_receiver = blocked_task_receiver.clone(); + let mut idle_task_receiver = idle_task_receiver.clone(); let finished_task_sender = finished_task_sender.clone(); + let finished_idle_task_sender = finished_idle_task_sender.clone(); - move || loop { - let (task, sender) = select! { - recv(runnable_task_receiver.for_select()) -> message => { - if let Some(task) = runnable_task_receiver.after_select(message.unwrap()) { - (task, &finished_task_sender) - } else { - continue; - } - }, - }; - let mut task = ExecutedTask::new_boxed(task); - Self::execute_task_with_handler( - runnable_task_receiver.context().bank(), - &mut task, - &pool.handler_context, + move || { + trace!( + "solScHandler{:02} thread is running at: {:?}", + thx, + thread::current() + ); + loop { + let (task, sender) = select_biased! 
{ + recv(blocked_task_receiver.for_select()) -> message => { + match message { + Ok(message) => { + if let Some(task) = blocked_task_receiver.after_select(message) { + (task, &finished_task_sender) + } else { + continue; + } + }, + Err(_) => break, + } + }, + recv(idle_task_receiver) -> task => { + if let Ok(task) = task { + (task, &finished_idle_task_sender) + } else { + idle_task_receiver = never(); + continue; + } + }, + }; + let bank = blocked_task_receiver.context().bank(); + let mut task = ExecutedTask::new_boxed(task, thx, bank.slot()); + Self::execute_task_with_handler( + &handler, + bank, + &mut task, + &pool.handler_context, + send_metrics, + ); + if sender.send(task).is_err() { + break; + } + } + trace!( + "solScHandler{:02} thread is terminating at: {:?}", + thx, + thread::current() ); - sender.send(task).unwrap(); } }; - self.scheduler_thread = Some( + let accumulator_main_loop = || { + move || 'outer: loop { + match executed_task_receiver.recv_timeout(Duration::from_millis(40)) { + Ok(ExecutedTaskPayload::Payload(executed_task)) => { + let result_with_timings = result_with_timings.as_mut().unwrap(); + Self::accumulate_result_with_timings(result_with_timings, executed_task); + } + Ok(ExecutedTaskPayload::OpenSubchannel(())) => { + assert_matches!( + result_with_timings.replace(initialized_result_with_timings()), + None + ); + } + Ok(ExecutedTaskPayload::CloseSubchannel) => { + if accumulated_result_sender + .send(result_with_timings.take()) + .is_err() + { + break 'outer; + } + } + Err(RecvTimeoutError::Disconnected) => break 'outer, + Err(RecvTimeoutError::Timeout) => continue, + } + } + }; + + self.scheduler_thread_and_tid = Some(( thread::Builder::new() .name("solScheduler".to_owned()) .spawn(scheduler_main_loop()) .unwrap(), + tid_receiver.recv().unwrap(), + )); + + self.accumulator_thread = Some( + thread::Builder::new() + .name("solScAccmltr".to_owned()) + .spawn(accumulator_main_loop()) + .unwrap(), ); self.handler_threads = (0..self.pool.handler_count) @@ -654,95 +1311,194 @@ impl, TH: TaskHandler> ThreadManager { |thx| { thread::Builder::new() .name(format!("solScHandler{:02}", thx)) - .spawn(handler_main_loop()) + .spawn(handler_main_loop(thx)) .unwrap() } }) .collect(); + Ok(()) } - fn send_task(&self, task: Task) { + fn send_task(&self, task: Task) -> bool { debug!("send_task()"); self.new_task_sender .send(NewTaskPayload::Payload(task)) - .unwrap() + .is_err() } fn end_session(&mut self) { - if self.session_result_with_timings.is_some() { + debug!("end_session(): will end session..."); + if self.is_suspended() { + debug!("end_session(): no threads.."); + assert_matches!(self.session_result_with_timings, Some(_)); + return; + } else if self.session_result_with_timings.is_some() { debug!("end_session(): already result resides within thread manager.."); return; } - debug!("end_session(): will end session..."); - self.new_task_sender + let mut abort_detected = self + .new_task_sender .send(NewTaskPayload::CloseSubchannel) - .unwrap(); + .is_err(); if let Some(result_with_timings) = self.session_result_receiver.recv().unwrap() { + assert!(!abort_detected); self.put_session_result_with_timings(result_with_timings); + } else { + abort_detected = true; + } + + if abort_detected { + self.suspend(); } } fn start_session(&mut self, context: &SchedulingContext) { - assert_matches!(self.session_result_with_timings, None); - self.new_task_sender - .send(NewTaskPayload::OpenSubchannel(context.clone())) - .unwrap(); + if !self.is_suspended() { + 
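// Illustrative sketch, not part of the patch above: the accumulator loop runs on
// `recv_timeout()` so it can tell "nothing to do right now" apart from "all senders are gone",
// and it treats a failed send of the accumulated result as its own shutdown signal. A reduced
// version with u64 payloads and shortened, hypothetical names.
use crossbeam_channel::{Receiver, RecvTimeoutError, Sender};
use std::time::Duration;

fn accumulator_loop(executed_rx: Receiver<u64>, result_tx: Sender<u64>) {
    let mut total = 0u64;
    loop {
        match executed_rx.recv_timeout(Duration::from_millis(40)) {
            Ok(value) => total += value,                  // fold one executed task's timings
            Err(RecvTimeoutError::Timeout) => continue,   // idle tick; keep waiting
            Err(RecvTimeoutError::Disconnected) => break, // scheduler side hung up: shut down
        }
    }
    // The receiving side may already be gone during teardown; ignoring the error here mirrors
    // the `is_err()` checks used above instead of unwrapping.
    let _ = result_tx.send(total);
}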
assert_matches!(self.session_result_with_timings, None); + self.new_task_sender + .send(NewTaskPayload::OpenSubchannel(context.clone())) + .unwrap(); + } else { + self.put_session_result_with_timings(initialized_result_with_timings()); + assert_matches!(self.start_or_try_resume_threads(context), Ok(())); + } + } + + fn suspend(&mut self) { + let Some(scheduler_thread) = self.take_scheduler_thread() else { + warn!("suspend(): already suspended..."); + return; + }; + debug!("suspend(): terminating threads by {:?}", thread::current()); + + let (s, r) = unbounded(); + (self.new_task_sender, self.new_task_receiver) = (s, Some(r)); + + let () = self.accumulator_thread.take().unwrap().join().unwrap(); + for thread in self.handler_threads.drain(..) { + debug!("joining...: {:?}", thread); + () = thread.join().unwrap(); + } + if let Some(result_with_timings) = scheduler_thread.join().unwrap() { + self.put_session_result_with_timings(result_with_timings); + } + + debug!( + "suspend(): successfully suspended threads by {:?}", + thread::current() + ); + } + + fn is_primary(&self) -> bool { + self.scheduler_id == PRIMARY_SCHEDULER_ID + } + + #[cfg(target_os = "linux")] + fn active_tid_if_not_primary(&self) -> Option { + if self.is_primary() { + // always exempt from cleaner... + None + } else { + self.scheduler_thread_and_tid.as_ref().map(|&(_, tid)| tid) + } } } -pub trait SpawnableScheduler: InstalledScheduler { - type Inner: Debug + Send + Sync; +pub trait SpawnableScheduler: InstalledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + type Inner: Debug + Send + Sync + RetirableSchedulerInner; fn into_inner(self) -> (ResultWithTimings, Self::Inner); fn from_inner(inner: Self::Inner, context: SchedulingContext) -> Self; - fn spawn(pool: Arc>, initial_context: SchedulingContext) -> Self + fn spawn( + pool: Arc>, + initial_context: SchedulingContext, + handler: TH, + ) -> Self where Self: Sized; } -impl SpawnableScheduler for PooledScheduler { - type Inner = PooledSchedulerInner; +pub trait RetirableSchedulerInner { + fn retire_if_stale(&mut self) -> bool; +} + +impl SpawnableScheduler for PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + type Inner = PooledSchedulerInner; - fn into_inner(mut self) -> (ResultWithTimings, Self::Inner) { + fn into_inner(self) -> (ResultWithTimings, Self::Inner) { let result_with_timings = { - let manager = &mut self.inner.thread_manager; + let manager = &mut self.inner.thread_manager.write().unwrap(); manager.end_session(); manager.take_session_result_with_timings() }; (result_with_timings, self.inner) } - fn from_inner(mut inner: Self::Inner, context: SchedulingContext) -> Self { - inner.thread_manager.start_session(&context); + fn from_inner(inner: Self::Inner, context: SchedulingContext) -> Self { + inner + .thread_manager + .write() + .unwrap() + .start_session(&context); Self { inner, context } } - fn spawn(pool: Arc>, initial_context: SchedulingContext) -> Self { - let mut scheduler = Self::do_spawn(pool, initial_context); - scheduler - .inner - .thread_manager - .start_threads(&scheduler.context); + fn spawn( + pool: Arc>, + initial_context: SchedulingContext, + handler: TH, + ) -> Self { + let scheduler = Self::do_spawn(pool.clone(), initial_context, handler); + pool.register_to_cleaner(Arc::downgrade(&scheduler.inner.thread_manager)); scheduler } } -impl InstalledScheduler for PooledScheduler { +impl InstalledScheduler for PooledScheduler +where + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ fn id(&self) -> 
SchedulerId { - self.inner.thread_manager.scheduler_id + self.inner.id() } fn context(&self) -> &SchedulingContext { &self.context } - fn schedule_execution(&self, &(transaction, index): &(&SanitizedTransaction, usize)) { - let task = Task::create_task(transaction.clone(), index); - self.inner.thread_manager.send_task(task); + fn schedule_execution( + &self, + transaction_with_index: SEA::TransactionWithIndex<'_>, + ) -> Result<()> { + transaction_with_index.with_transaction_and_index(|transaction, index| { + let task = + SchedulingStateMachine::create_task(transaction.clone(), index, &mut |pubkey| { + self.inner.address_book.load(pubkey) + }); + let abort_detected = self + .ensure_thread_manager_resumed(&self.context)? + .send_task(task); + if abort_detected { + let thread_manager = &mut self.inner.thread_manager.write().unwrap(); + thread_manager.suspend(); + thread_manager.reset_session_on_error() + } else { + Ok(()) + } + }) } fn wait_for_termination( @@ -754,17 +1510,78 @@ impl InstalledScheduler for PooledScheduler { } fn pause_for_recent_blockhash(&mut self) { - self.inner.thread_manager.end_session(); + self.inner.thread_manager.write().unwrap().end_session(); } } -impl UninstalledScheduler for PooledSchedulerInner +impl UninstalledScheduler for PooledSchedulerInner where - S: SpawnableScheduler>, - TH: TaskHandler, + S: SpawnableScheduler>, + TH: TaskHandler, + SEA: ScheduleExecutionArg, { - fn return_to_pool(self: Box) { - self.thread_manager.pool.clone().return_scheduler(*self) + fn return_to_pool(mut self: Box) { + let pool = self.thread_manager.write().unwrap().pool.clone(); + self.pooled_at = Instant::now(); + pool.return_scheduler(*self) + } +} + +impl RetirableSchedulerInner for PooledSchedulerInner +where + S: SpawnableScheduler>, + TH: TaskHandler, + SEA: ScheduleExecutionArg, +{ + fn retire_if_stale(&mut self) -> bool { + // reap threads after 10mins of inactivity for any pooled (idle) schedulers. The primary + // scheduler is special-cased to empty its address book instead, for easier monitoring to + // accumulate os-level thread metrics. The duration is chosen based on the rough estimation + // from the frequency of short-lived forks on the mainnet-beta, with consideration of some + // increased forking at epoch boundaries. 
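// Illustrative sketch, not part of the patch above: the `suspend()` path taken when an abort is
// detected stops the worker threads without a dedicated stop message. Swapping in a fresh task
// channel drops the only live Sender, the worker's receive loop sees the disconnection and
// exits, and only then is the JoinHandle reaped; the fresh pair is kept for a later resume. A
// reduced version of that shape, with hypothetical names.
use crossbeam_channel::{unbounded, Receiver, Sender};
use std::thread::{self, JoinHandle};

struct Manager {
    task_sender: Sender<u64>,
    task_receiver: Option<Receiver<u64>>, // taken by the worker thread when it starts
    worker: Option<JoinHandle<u64>>,
}

impl Manager {
    fn start(&mut self) {
        let receiver = self.task_receiver.take().unwrap();
        self.worker = Some(thread::spawn(move || receiver.iter().sum()));
    }

    fn suspend(&mut self) -> Option<u64> {
        let worker = self.worker.take()?;
        // Replacing the channel drops the old sender, which is what lets the worker's receive
        // loop terminate; the new pair stays around so start() can be called again later.
        let (sender, receiver) = unbounded();
        self.task_sender = sender;
        self.task_receiver = Some(receiver);
        Some(worker.join().unwrap())
    }
}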
+ const IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM: Duration = Duration::from_secs(600); + + const BITS_PER_HEX_DIGIT: usize = 4; + let page_count = self.address_book.page_count(); + if page_count < 200_000 { + info!( + "[sch_{:0width$x}]: cleaner: address book size: {page_count}...", + self.id(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + } else if self.thread_manager.read().unwrap().is_primary() { + info!( + "[sch_{:0width$x}]: cleaner: too big address book size: {page_count}...; emptying the primary scheduler", + self.id(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + self.address_book.clear(); + return true; + } else { + info!( + "[sch_{:0width$x}]: cleaner: too big address book size: {page_count}...; retiring scheduler", + self.id(), + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + self.suspend_thread_manager(); + return false; + } + + let pooled_duration = self.pooled_since(); + if pooled_duration <= IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM { + true + } else if !self.thread_manager.read().unwrap().is_primary() { + info!( + "[sch_{:0width$x}]: cleaner: retiring unused scheduler after {:?}...", + self.id(), + pooled_duration, + width = SchedulerId::BITS as usize / BITS_PER_HEX_DIGIT, + ); + self.suspend_thread_manager(); + false + } else { + true + } } } @@ -772,7 +1589,6 @@ where mod tests { use { super::*, - assert_matches::assert_matches, solana_runtime::{ bank::Bank, bank_forks::BankForks, @@ -783,11 +1599,12 @@ mod tests { solana_sdk::{ clock::MAX_PROCESSING_AGE, pubkey::Pubkey, + scheduling::SchedulingMode, signer::keypair::Keypair, system_transaction, transaction::{SanitizedTransaction, TransactionError}, }, - std::{sync::Arc, thread::JoinHandle}, + std::{mem, sync::Arc, thread::JoinHandle}, }; #[test] @@ -800,7 +1617,10 @@ mod tests { // this indirectly proves that there should be circular link because there's only one Arc // at this moment now - assert_eq!((Arc::strong_count(&pool), Arc::weak_count(&pool)), (1, 1)); + assert_eq!( + (Arc::strong_count(&pool), Arc::weak_count(&pool)), + (1 + 1 /* todo */, 1) + ); let debug = format!("{pool:#?}"); assert!(!debug.is_empty()); } @@ -813,7 +1633,7 @@ mod tests { let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); let bank = Arc::new(Bank::default_for_tests()); - let context = SchedulingContext::new(bank); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank); let scheduler = pool.take_scheduler(context); let debug = format!("{scheduler:#?}"); @@ -828,7 +1648,7 @@ mod tests { let pool = DefaultSchedulerPool::new(None, None, None, None, ignored_prioritization_fee_cache); let bank = Arc::new(Bank::default_for_tests()); - let context = &SchedulingContext::new(bank); + let context = &SchedulingContext::new(SchedulingMode::BlockVerification, bank); let scheduler1 = pool.do_take_scheduler(context.clone()); let scheduler_id1 = scheduler1.id(); @@ -857,7 +1677,7 @@ mod tests { let pool = DefaultSchedulerPool::new(None, None, None, None, ignored_prioritization_fee_cache); let bank = Arc::new(Bank::default_for_tests()); - let context = &SchedulingContext::new(bank); + let context = &SchedulingContext::new(SchedulingMode::BlockVerification, bank); let mut scheduler = pool.do_take_scheduler(context.clone()); // should never panic. 
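// Illustrative sketch, not part of the patch above: the cleaner policy implemented by
// `retire_if_stale()`, reduced to a pure decision function. The duration and page-count values
// are the ones used above; the threshold constant name is made up here. `true` means "keep the
// scheduler pooled", `false` means "retire it".
use std::time::Duration;

const IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM: Duration = Duration::from_secs(600);
const MAX_POOLED_ADDRESS_BOOK_PAGES: usize = 200_000;

fn should_keep_pooled(page_count: usize, pooled_duration: Duration, is_primary: bool) -> bool {
    if page_count >= MAX_POOLED_ADDRESS_BOOK_PAGES {
        // Oversized address book: the primary scheduler is only emptied (and kept); any other
        // scheduler is retired outright.
        return is_primary;
    }
    if pooled_duration <= IDLE_DURATION_FOR_LAZY_THREAD_RECLAIM {
        // Recently used: keep it regardless of which scheduler it is.
        return true;
    }
    // Idle for too long: only the primary scheduler is exempt from thread reclamation.
    is_primary
}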
@@ -879,8 +1699,10 @@ mod tests { let new_bank = &Arc::new(Bank::default_for_tests()); assert!(!Arc::ptr_eq(old_bank, new_bank)); - let old_context = &SchedulingContext::new(old_bank.clone()); - let new_context = &SchedulingContext::new(new_bank.clone()); + let old_context = + &SchedulingContext::new(SchedulingMode::BlockVerification, old_bank.clone()); + let new_context = + &SchedulingContext::new(SchedulingMode::BlockVerification, new_bank.clone()); let scheduler = pool.do_take_scheduler(old_context.clone()); let scheduler_id = scheduler.id(); @@ -897,11 +1719,14 @@ mod tests { let bank = Bank::default_for_tests(); let bank_forks = BankForks::new_rw_arc(bank); - let mut bank_forks = bank_forks.write().unwrap(); + let mut bank_forks_write = bank_forks.write().unwrap(); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); - bank_forks.install_scheduler_pool(pool); + bank_forks_write.install_scheduler_pool(pool); + bank_forks_write.prepare_to_drop(); + drop(bank_forks_write); + drop::(Arc::into_inner(bank_forks).unwrap().into_inner().unwrap()); } #[test] @@ -968,11 +1793,11 @@ mod tests { let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); assert_eq!(bank.transaction_count(), 0); let scheduler = pool.take_scheduler(context); - scheduler.schedule_execution(&(tx0, 0)); + assert_matches!(scheduler.schedule_execution(&(tx0, 0)), Ok(())); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_matches!(bank.wait_for_completed_scheduler(), Some((Ok(()), _))); assert_eq!(bank.transaction_count(), 1); @@ -993,7 +1818,7 @@ mod tests { let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); let pool = DefaultSchedulerPool::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let mut scheduler = pool.take_scheduler(context); let unfunded_keypair = Keypair::new(); @@ -1005,9 +1830,9 @@ mod tests { genesis_config.hash(), )); assert_eq!(bank.transaction_count(), 0); - scheduler.schedule_execution(&(bad_tx, 0)); + assert_matches!(scheduler.schedule_execution(&(bad_tx, 0)), Ok(())); // simulate the task-sending thread is stalled for some reason. - std::thread::sleep(std::time::Duration::from_secs(1)); + thread::sleep(Duration::from_secs(1)); assert_eq!(bank.transaction_count(), 0); let good_tx_after_bad_tx = @@ -1023,25 +1848,26 @@ mod tests { .result, Ok(_) ); - scheduler.schedule_execution(&(good_tx_after_bad_tx, 0)); + thread::sleep(Duration::from_secs(3)); + assert_matches!( + scheduler.schedule_execution(&(good_tx_after_bad_tx, 0)), + Err(_) + ); + error!("last pause!"); scheduler.pause_for_recent_blockhash(); // transaction_count should remain same as scheduler should be bailing out. // That's because we're testing the serialized failing execution case in this test. - // However, currently threaded impl can't properly abort in this situtation.. - // so, 1 should be observed, intead of 0. 
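// Illustrative sketch, not part of the patch above: the test teardown uses
// `Arc::into_inner(..).unwrap()` as an assertion that no other strong reference (for example
// one held by a still-registered scheduler pool or cleaner) is left alive. A minimal
// standalone version of that pattern.
use std::sync::{Arc, RwLock};

fn assert_sole_owner_and_drop<T>(shared: Arc<RwLock<T>>) {
    // `Arc::into_inner` returns None if any other strong reference still exists, so the expect
    // doubles as a leak check; `RwLock::into_inner` then hands back the plain value to drop.
    let inner: T = Arc::into_inner(shared)
        .expect("no other Arc strong refs may remain at teardown")
        .into_inner()
        .expect("lock must not be poisoned");
    drop(inner);
}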
// Also note that bank.transaction_count() is generally racy by nature, because // blockstore_processor and unified_scheduler both tend to process non-conflicting batches // in parallel as part of the normal operation. - assert_eq!(bank.transaction_count(), 1); + assert_eq!(bank.transaction_count(), 0); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_matches!( bank.wait_for_completed_scheduler(), - Some(( - Err(solana_sdk::transaction::TransactionError::AccountNotFound), - _timings - )) + Some((Ok(()), _timings)) ); + pool.uninstalled_from_bank_forks(); } #[derive(Debug)] @@ -1049,7 +1875,7 @@ mod tests { Mutex, Mutex>>, SchedulingContext, - Arc>, + Arc>, ); impl AsyncScheduler { @@ -1068,7 +1894,7 @@ mod tests { } } - impl InstalledScheduler + impl InstalledScheduler for AsyncScheduler { fn id(&self) -> SchedulerId { @@ -1079,20 +1905,24 @@ mod tests { &self.2 } - fn schedule_execution(&self, &(transaction, index): &(&SanitizedTransaction, usize)) { + fn schedule_execution( + &self, + &(transaction, index): &(&SanitizedTransaction, usize), + ) -> Result<()> { let transaction_and_index = (transaction.clone(), index); let context = self.context().clone(); let pool = self.3.clone(); - self.1.lock().unwrap().push(std::thread::spawn(move || { + self.1.lock().unwrap().push(thread::spawn(move || { // intentionally sleep to simulate race condition where register_recent_blockhash // is handle before finishing executing scheduled transactions - std::thread::sleep(std::time::Duration::from_secs(1)); + thread::sleep(Duration::from_secs(1)); let mut result = Ok(()); let mut timings = ExecuteTimings::default(); - ::handle( + >::handle( + &DefaultTaskHandler, &mut result, &mut timings, context.bank(), @@ -1102,6 +1932,8 @@ mod tests { ); (result, timings) })); + + Ok(()) } fn wait_for_termination( @@ -1109,7 +1941,7 @@ mod tests { _is_dropped: bool, ) -> (ResultWithTimings, UninstalledSchedulerBox) { self.do_wait(); - let result_with_timings = std::mem::replace( + let result_with_timings = mem::replace( &mut *self.0.lock().unwrap(), initialized_result_with_timings(), ); @@ -1134,7 +1966,8 @@ mod tests { } } - impl SpawnableScheduler + impl + SpawnableScheduler for AsyncScheduler { // well, i wish i can use ! (never type)..... 
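// Illustrative sketch, not part of the patch above: the AsyncScheduler test double defers each
// scheduled execution onto its own thread and only joins them when termination is requested,
// which is what lets the test provoke the register_recent_blockhash race described above. A
// reduced version with hypothetical names and a String error type.
use std::{
    sync::Mutex,
    thread::{self, JoinHandle},
    time::Duration,
};

#[derive(Default)]
struct DeferredExecutor {
    threads: Mutex<Vec<JoinHandle<Result<(), String>>>>,
}

impl DeferredExecutor {
    fn schedule(&self, work: impl FnOnce() -> Result<(), String> + Send + 'static) {
        self.threads.lock().unwrap().push(thread::spawn(move || {
            // Deliberately delay, so the caller can advance other state (e.g. register a new
            // blockhash) before the "transaction" actually executes.
            thread::sleep(Duration::from_millis(100));
            work()
        }));
    }

    fn wait_for_termination(&self) -> Result<(), String> {
        self.threads
            .lock()
            .unwrap()
            .drain(..)
            .map(|handle| handle.join().unwrap())
            .collect()
    }
}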
@@ -1149,8 +1982,9 @@ mod tests { } fn spawn( - pool: Arc>, + pool: Arc>, initial_context: SchedulingContext, + _handler: DefaultTaskHandler, ) -> Self { AsyncScheduler::( Mutex::new(initialized_result_with_timings()), @@ -1161,6 +1995,14 @@ mod tests { } } + impl RetirableSchedulerInner + for AsyncScheduler + { + fn retire_if_stale(&mut self) -> bool { + unimplemented!(); + } + } + fn do_test_scheduler_schedule_execution_recent_blockhash_edge_case< const TRIGGER_RACE_CONDITION: bool, >() { @@ -1190,24 +2032,24 @@ mod tests { ); } let bank = setup_dummy_fork_graph(bank); - let context = SchedulingContext::new(bank.clone()); + let context = SchedulingContext::new(SchedulingMode::BlockVerification, bank.clone()); let ignored_prioritization_fee_cache = Arc::new(PrioritizationFeeCache::new(0u64)); - let pool = - SchedulerPool::, DefaultTaskHandler>::new_dyn( - None, - None, - None, - None, - ignored_prioritization_fee_cache, - ); + let pool = SchedulerPool::< + AsyncScheduler, + DefaultTaskHandler, + DefaultScheduleExecutionArg, + >::new_dyn(None, None, None, None, ignored_prioritization_fee_cache); let scheduler = pool.take_scheduler(context); let bank = BankWithScheduler::new(bank, Some(scheduler)); assert_eq!(bank.transaction_count(), 0); // schedule but not immediately execute transaction - bank.schedule_transaction_executions([(&very_old_valid_tx, &0)].into_iter()); + assert_matches!( + bank.schedule_transaction_executions([(&very_old_valid_tx, &0)].into_iter()), + Ok(()) + ); // this calls register_recent_blockhash internally bank.fill_bank_with_ticks_for_tests();