diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index da027798f962..cf04c0331877 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -16,9 +16,7 @@
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
-.\" Copyright (c) 2024, Klara, Inc.
-.\"
-.Dd November 1, 2024
+.Dd November 13, 2024
 .Dt ZFS 4
 .Os
 .
@@ -75,6 +73,17 @@ When set to
 .Sy 0
 the array is dynamically sized based on total system memory.
 .
+.It Sy dbuf_evict_parallel Ns = Ns Sy 0 Pq uint
+When set to 1, ZFS will use up to
+.Sy dbuf_evict_threads
+threads to evict dbuf data in parallel, improving the responsiveness
+of ZFS to memory pressure.
+.
+.It Sy dbuf_evict_threads Ns = Ns Sy 0 Pq uint
+Sets the maximum number of dbuf eviction threads to be used.
+When set to 0, ZFS uses one-eighth of the available CPUs,
+with a minimum of 2 and a maximum of 16.
+.
 .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint
 dnode slots allocated in a single operation as a power of 2.
 The default value minimizes lock contention for the bulk operation performed.
@@ -678,449 +687,449 @@ When the number of bytes consumed by dnodes in the ARC exceeds this number
 of bytes, try to unpin some of it in response to demand for non-metadata.
 This value acts as a ceiling to the amount of dnode metadata, and defaults to
 .Sy 0 ,
-which indicates that a percent which is based on
-.Sy zfs_arc_dnode_limit_percent
-of the ARC meta buffers that may be used for dnodes.
-.It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq u64
-Percentage that can be consumed by dnodes of ARC meta buffers.
-.Pp
-See also
-.Sy zfs_arc_dnode_limit ,
-which serves a similar purpose but has a higher priority if nonzero.
-.
-.It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq u64
-Percentage of ARC dnodes to try to scan in response to demand for non-metadata
-when the number of bytes consumed by dnodes exceeds
-.Sy zfs_arc_dnode_limit .
-.
-.It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint
-The ARC's buffer hash table is sized based on the assumption of an average
-block size of this value.
-This works out to roughly 1 MiB of hash table per 1 GiB of physical memory
-with 8-byte pointers.
-For configurations with a known larger average block size,
-this value can be increased to reduce the memory footprint.
-.
-.It Sy zfs_arc_eviction_pct Ns = Ns Sy 200 Ns % Pq uint
-When
-.Fn arc_is_overflowing ,
-.Fn arc_get_data_impl
-waits for this percent of the requested amount of data to be evicted.
-For example, by default, for every
-.Em 2 KiB
-that's evicted,
-.Em 1 KiB
-of it may be "reused" by a new allocation.
-Since this is above
-.Sy 100 Ns % ,
-it ensures that progress is made towards getting
-.Sy arc_size No under Sy arc_c .
-Since this is finite, it ensures that allocations can still happen,
-even during the potentially long time that
-.Sy arc_size No is more than Sy arc_c .
-.
-.It Sy zfs_arc_evict_batch_limit Ns = Ns Sy 10 Pq uint
-Number ARC headers to evict per sub-list before proceeding to another sub-list.
-This batch-style operation prevents entire sub-lists from being evicted at once
-but comes at a cost of additional unlocking and locking.
-.
-.It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
-If set to a non zero value, it will replace the
-.Sy arc_grow_retry
-value with this value.
-The
-.Sy arc_grow_retry
-.No value Pq default Sy 5 Ns s
-is the number of seconds the ARC will wait before
-trying to resume growth after a memory pressure event.
-.
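The dbuf_evict_threads default described in the new entries above ("one-eighth of the available CPUs, with a minimum of 2 and a maximum of 16") corresponds to the MAX(2, MIN(16, max_ncpus >> 3)) expression added to dbuf_init() later in this patch. A minimal user-space sketch of that clamping, with illustrative CPU counts:

/* Sketch only: mirrors the documented default for dbuf_evict_threads. */
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static unsigned int
default_evict_threads(unsigned int ncpus)
{
	/* one-eighth of the CPUs, clamped to the range [2, 16] */
	return (MAX(2, MIN(16, ncpus >> 3)));
}

int
main(void)
{
	unsigned int cpus[] = { 4, 16, 64, 256 };

	for (int i = 0; i < 4; i++)
		printf("%3u CPUs -> %u eviction threads\n",
		    cpus[i], default_evict_threads(cpus[i]));
	return (0);	/* prints 2, 2, 8, 16 */
}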
-.It Sy zfs_arc_lotsfree_percent Ns = Ns Sy 10 Ns % Pq int -Throttle I/O when free system memory drops below this percentage of total -system memory. -Setting this value to -.Sy 0 -will disable the throttle. -. -.It Sy zfs_arc_max Ns = Ns Sy 0 Ns B Pq u64 -Max size of ARC in bytes. -If -.Sy 0 , -then the max size of ARC is determined by the amount of system memory installed. -The larger of -.Sy all_system_memory No \- Sy 1 GiB -and -.Sy 5/8 No \(mu Sy all_system_memory -will be used as the limit. -This value must be at least -.Sy 67108864 Ns B Pq 64 MiB . -.Pp -This value can be changed dynamically, with some caveats. -It cannot be set back to -.Sy 0 -while running, and reducing it below the current ARC size will not cause -the ARC to shrink without memory pressure to induce shrinking. -. -.It Sy zfs_arc_meta_balance Ns = Ns Sy 500 Pq uint -Balance between metadata and data on ghost hits. -Values above 100 increase metadata caching by proportionally reducing effect -of ghost data hits on target data/metadata rate. -. -.It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq u64 -Min size of ARC in bytes. -.No If set to Sy 0 , arc_c_min -will default to consuming the larger of -.Sy 32 MiB -and -.Sy all_system_memory No / Sy 32 . -. -.It Sy zfs_arc_min_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 1s Pc Pq uint -Minimum time prefetched blocks are locked in the ARC. -. -.It Sy zfs_arc_min_prescient_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 6s Pc Pq uint -Minimum time "prescient prefetched" blocks are locked in the ARC. -These blocks are meant to be prefetched fairly aggressively ahead of -the code that may use them. -. -.It Sy zfs_arc_prune_task_threads Ns = Ns Sy 1 Pq int -Number of arc_prune threads. -.Fx -does not need more than one. -Linux may theoretically use one per mount point up to number of CPUs, -but that was not proven to be useful. -. -.It Sy zfs_max_missing_tvds Ns = Ns Sy 0 Pq int -Number of missing top-level vdevs which will be allowed during -pool import (only in read-only mode). -. -.It Sy zfs_max_nvlist_src_size Ns = Sy 0 Pq u64 -Maximum size in bytes allowed to be passed as -.Sy zc_nvlist_src_size -for ioctls on -.Pa /dev/zfs . -This prevents a user from causing the kernel to allocate -an excessive amount of memory. -When the limit is exceeded, the ioctl fails with -.Sy EINVAL -and a description of the error is sent to the -.Pa zfs-dbgmsg -log. -This parameter should not need to be touched under normal circumstances. -If -.Sy 0 , -equivalent to a quarter of the user-wired memory limit under -.Fx -and to -.Sy 134217728 Ns B Pq 128 MiB -under Linux. -. -.It Sy zfs_multilist_num_sublists Ns = Ns Sy 0 Pq uint -To allow more fine-grained locking, each ARC state contains a series -of lists for both data and metadata objects. -Locking is performed at the level of these "sub-lists". -This parameters controls the number of sub-lists per ARC state, -and also applies to other uses of the multilist data structure. -.Pp -If -.Sy 0 , -equivalent to the greater of the number of online CPUs and -.Sy 4 . -. -.It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int -The ARC size is considered to be overflowing if it exceeds the current -ARC target size -.Pq Sy arc_c -by thresholds determined by this parameter. -Exceeding by -.Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No / Sy 2 -starts ARC reclamation process. -If that appears insufficient, exceeding by -.Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No \(mu Sy 1.5 -blocks new buffer allocation until the reclaim thread catches up. 
-Started reclamation process continues till ARC size returns below the -target size. -.Pp -The default value of -.Sy 8 -causes the ARC to start reclamation if it exceeds the target size by -.Em 0.2% -of the target size, and block allocations by -.Em 0.6% . -. -.It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq uint -If nonzero, this will update -.Sy arc_shrink_shift Pq default Sy 7 -with the new value. -. -.It Sy zfs_arc_pc_percent Ns = Ns Sy 0 Ns % Po off Pc Pq uint -Percent of pagecache to reclaim ARC to. -.Pp -This tunable allows the ZFS ARC to play more nicely -with the kernel's LRU pagecache. -It can guarantee that the ARC size won't collapse under scanning -pressure on the pagecache, yet still allows the ARC to be reclaimed down to -.Sy zfs_arc_min -if necessary. -This value is specified as percent of pagecache size (as measured by -.Sy NR_FILE_PAGES ) , -where that percent may exceed -.Sy 100 . -This -only operates during memory pressure/reclaim. -. -.It Sy zfs_arc_shrinker_limit Ns = Ns Sy 10000 Pq int -This is a limit on how many pages the ARC shrinker makes available for -eviction in response to one page allocation attempt. -Note that in practice, the kernel's shrinker can ask us to evict -up to about four times this for one allocation attempt. -To reduce OOM risk, this limit is applied for kswapd reclaims only. -.Pp -The default limit of -.Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages -limits the amount of time spent attempting to reclaim ARC memory to -less than 100 ms per allocation attempt, -even with a small average compressed block size of ~8 KiB. -.Pp -The parameter can be set to 0 (zero) to disable the limit, -and only applies on Linux. -. -.It Sy zfs_arc_shrinker_seeks Ns = Ns Sy 2 Pq int -Relative cost of ARC eviction on Linux, AKA number of seeks needed to -restore evicted page. -Bigger values make ARC more precious and evictions smaller, comparing to -other kernel subsystems. -Value of 4 means parity with page cache. -. -.It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq u64 -The target number of bytes the ARC should leave as free memory on the system. -If zero, equivalent to the bigger of -.Sy 512 KiB No and Sy all_system_memory/64 . -. -.It Sy zfs_autoimport_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int -Disable pool import at module load by ignoring the cache file -.Pq Sy spa_config_path . -. -.It Sy zfs_checksum_events_per_second Ns = Ns Sy 20 Ns /s Pq uint -Rate limit checksum events to this many per second. -Note that this should not be set below the ZED thresholds + which indicates that a percent which is based on + .Sy zfs_arc_dnode_limit_percent + of the ARC meta buffers that may be used for dnodes. + .It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq u64 + Percentage that can be consumed by dnodes of ARC meta buffers. + .Pp + See also + .Sy zfs_arc_dnode_limit , + which serves a similar purpose but has a higher priority if nonzero. + . + .It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq u64 + Percentage of ARC dnodes to try to scan in response to demand for non-metadata + when the number of bytes consumed by dnodes exceeds + .Sy zfs_arc_dnode_limit . + . + .It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint + The ARC's buffer hash table is sized based on the assumption of an average + block size of this value. + This works out to roughly 1 MiB of hash table per 1 GiB of physical memory + with 8-byte pointers. 
+ For configurations with a known larger average block size, + this value can be increased to reduce the memory footprint. + . + .It Sy zfs_arc_eviction_pct Ns = Ns Sy 200 Ns % Pq uint + When + .Fn arc_is_overflowing , + .Fn arc_get_data_impl + waits for this percent of the requested amount of data to be evicted. + For example, by default, for every + .Em 2 KiB + that's evicted, + .Em 1 KiB + of it may be "reused" by a new allocation. + Since this is above + .Sy 100 Ns % , + it ensures that progress is made towards getting + .Sy arc_size No under Sy arc_c . + Since this is finite, it ensures that allocations can still happen, + even during the potentially long time that + .Sy arc_size No is more than Sy arc_c . + . + .It Sy zfs_arc_evict_batch_limit Ns = Ns Sy 10 Pq uint + Number ARC headers to evict per sub-list before proceeding to another sub-list. + This batch-style operation prevents entire sub-lists from being evicted at once + but comes at a cost of additional unlocking and locking. + . + .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint + If set to a non zero value, it will replace the + .Sy arc_grow_retry + value with this value. + The + .Sy arc_grow_retry + .No value Pq default Sy 5 Ns s + is the number of seconds the ARC will wait before + trying to resume growth after a memory pressure event. + . + .It Sy zfs_arc_lotsfree_percent Ns = Ns Sy 10 Ns % Pq int + Throttle I/O when free system memory drops below this percentage of total + system memory. + Setting this value to + .Sy 0 + will disable the throttle. + . + .It Sy zfs_arc_max Ns = Ns Sy 0 Ns B Pq u64 + Max size of ARC in bytes. + If + .Sy 0 , + then the max size of ARC is determined by the amount of system memory installed. + The larger of + .Sy all_system_memory No \- Sy 1 GiB + and + .Sy 5/8 No \(mu Sy all_system_memory + will be used as the limit. + This value must be at least + .Sy 67108864 Ns B Pq 64 MiB . + .Pp + This value can be changed dynamically, with some caveats. + It cannot be set back to + .Sy 0 + while running, and reducing it below the current ARC size will not cause + the ARC to shrink without memory pressure to induce shrinking. + . + .It Sy zfs_arc_meta_balance Ns = Ns Sy 500 Pq uint + Balance between metadata and data on ghost hits. + Values above 100 increase metadata caching by proportionally reducing effect + of ghost data hits on target data/metadata rate. + . + .It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq u64 + Min size of ARC in bytes. + .No If set to Sy 0 , arc_c_min + will default to consuming the larger of + .Sy 32 MiB + and + .Sy all_system_memory No / Sy 32 . + . + .It Sy zfs_arc_min_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 1s Pc Pq uint + Minimum time prefetched blocks are locked in the ARC. + . + .It Sy zfs_arc_min_prescient_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 6s Pc Pq uint + Minimum time "prescient prefetched" blocks are locked in the ARC. + These blocks are meant to be prefetched fairly aggressively ahead of + the code that may use them. + . + .It Sy zfs_arc_prune_task_threads Ns = Ns Sy 1 Pq int + Number of arc_prune threads. + .Fx + does not need more than one. + Linux may theoretically use one per mount point up to number of CPUs, + but that was not proven to be useful. + . + .It Sy zfs_max_missing_tvds Ns = Ns Sy 0 Pq int + Number of missing top-level vdevs which will be allowed during + pool import (only in read-only mode). + . 
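As a worked example of the zfs_arc_max default above (the larger of all_system_memory − 1 GiB and 5/8 × all_system_memory, never below 64 MiB), here is a small illustrative sketch; the memory sizes are hypothetical and this is not the in-kernel code:

/* Sketch of the documented zfs_arc_max fallback computation. */
#include <stdio.h>
#include <inttypes.h>

#define MiB (1024ULL * 1024)
#define GiB (1024ULL * MiB)

static uint64_t
default_arc_max(uint64_t allmem)
{
	uint64_t minus_1g = allmem > GiB ? allmem - GiB : 0;
	uint64_t five_eighths = allmem / 8 * 5;
	uint64_t limit = minus_1g > five_eighths ? minus_1g : five_eighths;

	return (limit < 64 * MiB ? 64 * MiB : limit);	/* 64 MiB floor */
}

int
main(void)
{
	uint64_t mem[] = { 2 * GiB, 4 * GiB, 64 * GiB };

	for (int i = 0; i < 3; i++)
		printf("%6" PRIu64 " MiB RAM -> arc_max %6" PRIu64 " MiB\n",
		    mem[i] / MiB, default_arc_max(mem[i]) / MiB);
	return (0);	/* 1280, 3072 and 64512 MiB respectively */
}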
+ .It Sy zfs_max_nvlist_src_size Ns = Sy 0 Pq u64 + Maximum size in bytes allowed to be passed as + .Sy zc_nvlist_src_size + for ioctls on + .Pa /dev/zfs . + This prevents a user from causing the kernel to allocate + an excessive amount of memory. + When the limit is exceeded, the ioctl fails with + .Sy EINVAL + and a description of the error is sent to the + .Pa zfs-dbgmsg + log. + This parameter should not need to be touched under normal circumstances. + If + .Sy 0 , + equivalent to a quarter of the user-wired memory limit under + .Fx + and to + .Sy 134217728 Ns B Pq 128 MiB + under Linux. + . + .It Sy zfs_multilist_num_sublists Ns = Ns Sy 0 Pq uint + To allow more fine-grained locking, each ARC state contains a series + of lists for both data and metadata objects. + Locking is performed at the level of these "sub-lists". + This parameters controls the number of sub-lists per ARC state, + and also applies to other uses of the multilist data structure. + .Pp + If + .Sy 0 , + equivalent to the greater of the number of online CPUs and + .Sy 4 . + . + .It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int + The ARC size is considered to be overflowing if it exceeds the current + ARC target size + .Pq Sy arc_c + by thresholds determined by this parameter. + Exceeding by + .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No / Sy 2 + starts ARC reclamation process. + If that appears insufficient, exceeding by + .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No \(mu Sy 1.5 + blocks new buffer allocation until the reclaim thread catches up. + Started reclamation process continues till ARC size returns below the + target size. + .Pp + The default value of + .Sy 8 + causes the ARC to start reclamation if it exceeds the target size by + .Em 0.2% + of the target size, and block allocations by + .Em 0.6% . + . + .It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq uint + If nonzero, this will update + .Sy arc_shrink_shift Pq default Sy 7 + with the new value. + . + .It Sy zfs_arc_pc_percent Ns = Ns Sy 0 Ns % Po off Pc Pq uint + Percent of pagecache to reclaim ARC to. + .Pp + This tunable allows the ZFS ARC to play more nicely + with the kernel's LRU pagecache. + It can guarantee that the ARC size won't collapse under scanning + pressure on the pagecache, yet still allows the ARC to be reclaimed down to + .Sy zfs_arc_min + if necessary. + This value is specified as percent of pagecache size (as measured by + .Sy NR_FILE_PAGES ) , + where that percent may exceed + .Sy 100 . + This + only operates during memory pressure/reclaim. + . + .It Sy zfs_arc_shrinker_limit Ns = Ns Sy 10000 Pq int + This is a limit on how many pages the ARC shrinker makes available for + eviction in response to one page allocation attempt. + Note that in practice, the kernel's shrinker can ask us to evict + up to about four times this for one allocation attempt. + To reduce OOM risk, this limit is applied for kswapd reclaims only. + .Pp + The default limit of + .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages + limits the amount of time spent attempting to reclaim ARC memory to + less than 100 ms per allocation attempt, + even with a small average compressed block size of ~8 KiB. + .Pp + The parameter can be set to 0 (zero) to disable the limit, + and only applies on Linux. + . + .It Sy zfs_arc_shrinker_seeks Ns = Ns Sy 2 Pq int + Relative cost of ARC eviction on Linux, AKA number of seeks needed to + restore evicted page. 
+ Bigger values make ARC more precious and evictions smaller, comparing to + other kernel subsystems. + Value of 4 means parity with page cache. + . + .It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq u64 + The target number of bytes the ARC should leave as free memory on the system. + If zero, equivalent to the bigger of + .Sy 512 KiB No and Sy all_system_memory/64 . + . + .It Sy zfs_autoimport_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int + Disable pool import at module load by ignoring the cache file + .Pq Sy spa_config_path . + . + .It Sy zfs_checksum_events_per_second Ns = Ns Sy 20 Ns /s Pq uint + Rate limit checksum events to this many per second. + Note that this should not be set below the ZED thresholds (currently 10 checksums over 10 seconds) -or else the daemon may not trigger any action. -. -.It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint -This controls the amount of time that a ZIL block (lwb) will remain "open" -when it isn't "full", and it has a thread waiting for it to be committed to -stable storage. -The timeout is scaled based on a percentage of the last lwb -latency to avoid significantly impacting the latency of each individual -transaction record (itx). -. -.It Sy zfs_condense_indirect_commit_entry_delay_ms Ns = Ns Sy 0 Ns ms Pq int -Vdev indirection layer (used for device removal) sleeps for this many -milliseconds during mapping generation. -Intended for use with the test suite to throttle vdev removal speed. -. -.It Sy zfs_condense_indirect_obsolete_pct Ns = Ns Sy 25 Ns % Pq uint -Minimum percent of obsolete bytes in vdev mapping required to attempt to -condense -.Pq see Sy zfs_condense_indirect_vdevs_enable . -Intended for use with the test suite -to facilitate triggering condensing as needed. -. -.It Sy zfs_condense_indirect_vdevs_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int -Enable condensing indirect vdev mappings. -When set, attempt to condense indirect vdev mappings -if the mapping uses more than -.Sy zfs_condense_min_mapping_bytes -bytes of memory and if the obsolete space map object uses more than -.Sy zfs_condense_max_obsolete_bytes -bytes on-disk. -The condensing process is an attempt to save memory by removing obsolete -mappings. -. -.It Sy zfs_condense_max_obsolete_bytes Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 -Only attempt to condense indirect vdev mappings if the on-disk size -of the obsolete space map object is greater than this number of bytes -.Pq see Sy zfs_condense_indirect_vdevs_enable . -. -.It Sy zfs_condense_min_mapping_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq u64 -Minimum size vdev mapping to attempt to condense -.Pq see Sy zfs_condense_indirect_vdevs_enable . -. -.It Sy zfs_dbgmsg_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int -Internally ZFS keeps a small log to facilitate debugging. -The log is enabled by default, and can be disabled by unsetting this option. -The contents of the log can be accessed by reading -.Pa /proc/spl/kstat/zfs/dbgmsg . -Writing -.Sy 0 -to the file clears the log. -.Pp -This setting does not influence debug prints due to -.Sy zfs_flags . -. -.It Sy zfs_dbgmsg_maxsize Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint -Maximum size of the internal ZFS debug log. -. -.It Sy zfs_dbuf_state_index Ns = Ns Sy 0 Pq int -Historically used for controlling what reporting was available under -.Pa /proc/spl/kstat/zfs . -No effect. -. -.It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64 -Check time in milliseconds. 
-This defines the frequency at which we check for hung I/O requests -and potentially invoke the -.Sy zfs_deadman_failmode -behavior. -. -.It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int -When a pool sync operation takes longer than -.Sy zfs_deadman_synctime_ms , -or when an individual I/O operation takes longer than -.Sy zfs_deadman_ziotime_ms , -then the operation is considered to be "hung". -If -.Sy zfs_deadman_enabled -is set, then the deadman behavior is invoked as described by -.Sy zfs_deadman_failmode . -By default, the deadman is enabled and set to -.Sy wait -which results in "hung" I/O operations only being logged. -The deadman is automatically disabled when a pool gets suspended. -. -.It Sy zfs_deadman_events_per_second Ns = Ns Sy 1 Ns /s Pq int -Rate limit deadman zevents (which report hung I/O operations) to this many per -second. -. -.It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp -Controls the failure behavior when the deadman detects a "hung" I/O operation. -Valid values are: -.Bl -tag -compact -offset 4n -width "continue" -.It Sy wait -Wait for a "hung" operation to complete. -For each "hung" operation a "deadman" event will be posted -describing that operation. -.It Sy continue -Attempt to recover from a "hung" operation by re-dispatching it -to the I/O pipeline if possible. -.It Sy panic -Panic the system. -This can be used to facilitate automatic fail-over -to a properly configured fail-over partner. -.El -. -.It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64 -Interval in milliseconds after which the deadman is triggered and also -the interval after which a pool sync operation is considered to be "hung". -Once this limit is exceeded the deadman will be invoked every -.Sy zfs_deadman_checktime_ms -milliseconds until the pool sync completes. -. -.It Sy zfs_deadman_ziotime_ms Ns = Ns Sy 300000 Ns ms Po 5 min Pc Pq u64 -Interval in milliseconds after which the deadman is triggered and an -individual I/O operation is considered to be "hung". -As long as the operation remains "hung", -the deadman will be invoked every -.Sy zfs_deadman_checktime_ms -milliseconds until the operation completes. -. -.It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int -Enable prefetching dedup-ed blocks which are going to be freed. -. -.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint -Maximum number of dedup log flush passes (iterations) each transaction. -.Pp -At the start of each transaction, OpenZFS will estimate how many entries it -needs to flush out to keep up with the change rate, taking the amount and time -taken to flush on previous txgs into account (see -.Sy zfs_dedup_log_flush_flow_rate_txgs ) . -It will spread this amount into a number of passes. -At each pass, it will use the amount already flushed and the total time taken -by flushing and by other IO to recompute how much it should do for the remainder -of the txg. -.Pp -Reducing the max number of passes will make flushing more aggressive, flushing -out more entries on each pass. -This can be faster, but also more likely to compete with other IO. -Increasing the max number of passes will put fewer entries onto each pass, -keeping the overhead of dedup changes to a minimum but possibly causing a large -number of changes to be dumped on the last pass, which can blow out the txg -sync time beyond -.Sy zfs_txg_timeout . -. -.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint -Minimum time to spend on dedup log flush each transaction. 
-.Pp -At least this long will be spent flushing dedup log entries each transaction, -up to -.Sy zfs_txg_timeout . -This occurs even if doing so would delay the transaction, that is, other IO -completes under this time. -. -.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint -Flush at least this many entries each transaction. -.Pp -OpenZFS will estimate how many entries it needs to flush each transaction to -keep up with the ingest rate (see -.Sy zfs_dedup_log_flush_flow_rate_txgs ) . -This sets the minimum for that estimate. -Raising it can force OpenZFS to flush more aggressively, keeping the log small -and so reducing pool import times, but can make it less able to back off if -log flushing would compete with other IO too much. -. -.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint -Number of transactions to use to compute the flow rate. -.Pp -OpenZFS will estimate how many entries it needs to flush each transaction by -monitoring the number of entries changed (ingest rate), number of entries -flushed (flush rate) and time spent flushing (flush time rate) and combining -these into an overall "flow rate". -It will use an exponential weighted moving average over some number of recent -transactions to compute these rates. -This sets the number of transactions to compute these averages over. -Setting it higher can help to smooth out the flow rate in the face of spiky -workloads, but will take longer for the flow rate to adjust to a sustained -change in the ingress rate. -. -.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint -Max transactions to before starting to flush dedup logs. -.Pp -OpenZFS maintains two dedup logs, one receiving new changes, one flushing. -If there is nothing to flush, it will accumulate changes for no more than this -many transactions before switching the logs and starting to flush entries out. -. -.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64 -Max memory to use for dedup logs. -.Pp -OpenZFS will spend no more than this much memory on maintaining the in-memory -dedup log. -Flushing will begin when around half this amount is being spent on logs. -The default value of -.Sy 0 -will cause it to be set by -.Sy zfs_dedup_log_mem_max_percent -instead. -. -.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint -Max memory to use for dedup logs, as a percentage of total memory. -.Pp -If -.Sy zfs_dedup_log_mem_max -is not set, it will be initialised as a percentage of the total memory in the -system. -. -.It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint -Start to delay each transaction once there is this amount of dirty data, -expressed as a percentage of -.Sy zfs_dirty_data_max . -This value should be at least -.Sy zfs_vdev_async_write_active_max_dirty_percent . -.No See Sx ZFS TRANSACTION DELAY . -. -.It Sy zfs_delay_scale Ns = Ns Sy 500000 Pq int -This controls how quickly the transaction delay approaches infinity. -Larger values cause longer delays for a given amount of dirty data. -.Pp -For the smoothest delay, this value should be about 1 billion divided + or else the daemon may not trigger any action. + . + .It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint + This controls the amount of time that a ZIL block (lwb) will remain "open" + when it isn't "full", and it has a thread waiting for it to be committed to + stable storage. + The timeout is scaled based on a percentage of the last lwb + latency to avoid significantly impacting the latency of each individual + transaction record (itx). + . 
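To make the zfs_commit_timeout_pct scaling above concrete: the open-lwb timeout is a percentage of the previous lwb commit latency. A hedged sketch (the helper name and the 1 ms latency are illustrative, not taken from zil.c):

/* Illustration of "timeout = last lwb latency * zfs_commit_timeout_pct / 100". */
#include <stdio.h>

static unsigned long long
lwb_open_timeout_ns(unsigned long long last_lwb_latency_ns, unsigned int pct)
{
	return (last_lwb_latency_ns * pct / 100);
}

int
main(void)
{
	/* if the previous lwb took 1 ms to commit, with the default 10% ... */
	printf("%llu ns\n", lwb_open_timeout_ns(1000000ULL, 10));
	return (0);	/* 100000 ns, i.e. the block stays open up to ~100 us */
}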
+ .It Sy zfs_condense_indirect_commit_entry_delay_ms Ns = Ns Sy 0 Ns ms Pq int + Vdev indirection layer (used for device removal) sleeps for this many + milliseconds during mapping generation. + Intended for use with the test suite to throttle vdev removal speed. + . + .It Sy zfs_condense_indirect_obsolete_pct Ns = Ns Sy 25 Ns % Pq uint + Minimum percent of obsolete bytes in vdev mapping required to attempt to + condense + .Pq see Sy zfs_condense_indirect_vdevs_enable . + Intended for use with the test suite + to facilitate triggering condensing as needed. + . + .It Sy zfs_condense_indirect_vdevs_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int + Enable condensing indirect vdev mappings. + When set, attempt to condense indirect vdev mappings + if the mapping uses more than + .Sy zfs_condense_min_mapping_bytes + bytes of memory and if the obsolete space map object uses more than + .Sy zfs_condense_max_obsolete_bytes + bytes on-disk. + The condensing process is an attempt to save memory by removing obsolete + mappings. + . + .It Sy zfs_condense_max_obsolete_bytes Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 + Only attempt to condense indirect vdev mappings if the on-disk size + of the obsolete space map object is greater than this number of bytes + .Pq see Sy zfs_condense_indirect_vdevs_enable . + . + .It Sy zfs_condense_min_mapping_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq u64 + Minimum size vdev mapping to attempt to condense + .Pq see Sy zfs_condense_indirect_vdevs_enable . + . + .It Sy zfs_dbgmsg_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int + Internally ZFS keeps a small log to facilitate debugging. + The log is enabled by default, and can be disabled by unsetting this option. + The contents of the log can be accessed by reading + .Pa /proc/spl/kstat/zfs/dbgmsg . + Writing + .Sy 0 + to the file clears the log. + .Pp + This setting does not influence debug prints due to + .Sy zfs_flags . + . + .It Sy zfs_dbgmsg_maxsize Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint + Maximum size of the internal ZFS debug log. + . + .It Sy zfs_dbuf_state_index Ns = Ns Sy 0 Pq int + Historically used for controlling what reporting was available under + .Pa /proc/spl/kstat/zfs . + No effect. + . + .It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64 + Check time in milliseconds. + This defines the frequency at which we check for hung I/O requests + and potentially invoke the + .Sy zfs_deadman_failmode + behavior. + . + .It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int + When a pool sync operation takes longer than + .Sy zfs_deadman_synctime_ms , + or when an individual I/O operation takes longer than + .Sy zfs_deadman_ziotime_ms , + then the operation is considered to be "hung". + If + .Sy zfs_deadman_enabled + is set, then the deadman behavior is invoked as described by + .Sy zfs_deadman_failmode . + By default, the deadman is enabled and set to + .Sy wait + which results in "hung" I/O operations only being logged. + The deadman is automatically disabled when a pool gets suspended. + . + .It Sy zfs_deadman_events_per_second Ns = Ns Sy 1 Ns /s Pq int + Rate limit deadman zevents (which report hung I/O operations) to this many per + second. + . + .It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp + Controls the failure behavior when the deadman detects a "hung" I/O operation. + Valid values are: + .Bl -tag -compact -offset 4n -width "continue" + .It Sy wait + Wait for a "hung" operation to complete. + For each "hung" operation a "deadman" event will be posted + describing that operation. 
+ .It Sy continue + Attempt to recover from a "hung" operation by re-dispatching it + to the I/O pipeline if possible. + .It Sy panic + Panic the system. + This can be used to facilitate automatic fail-over + to a properly configured fail-over partner. + .El + . + .It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64 + Interval in milliseconds after which the deadman is triggered and also + the interval after which a pool sync operation is considered to be "hung". + Once this limit is exceeded the deadman will be invoked every + .Sy zfs_deadman_checktime_ms + milliseconds until the pool sync completes. + . + .It Sy zfs_deadman_ziotime_ms Ns = Ns Sy 300000 Ns ms Po 5 min Pc Pq u64 + Interval in milliseconds after which the deadman is triggered and an + individual I/O operation is considered to be "hung". + As long as the operation remains "hung", + the deadman will be invoked every + .Sy zfs_deadman_checktime_ms + milliseconds until the operation completes. + . + .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int + Enable prefetching dedup-ed blocks which are going to be freed. + . + .It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint + Maximum number of dedup log flush passes (iterations) each transaction. + .Pp + At the start of each transaction, OpenZFS will estimate how many entries it + needs to flush out to keep up with the change rate, taking the amount and time + taken to flush on previous txgs into account (see + .Sy zfs_dedup_log_flush_flow_rate_txgs ) . + It will spread this amount into a number of passes. + At each pass, it will use the amount already flushed and the total time taken + by flushing and by other IO to recompute how much it should do for the remainder + of the txg. + .Pp + Reducing the max number of passes will make flushing more aggressive, flushing + out more entries on each pass. + This can be faster, but also more likely to compete with other IO. + Increasing the max number of passes will put fewer entries onto each pass, + keeping the overhead of dedup changes to a minimum but possibly causing a large + number of changes to be dumped on the last pass, which can blow out the txg + sync time beyond + .Sy zfs_txg_timeout . + . + .It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint + Minimum time to spend on dedup log flush each transaction. + .Pp + At least this long will be spent flushing dedup log entries each transaction, + up to + .Sy zfs_txg_timeout . + This occurs even if doing so would delay the transaction, that is, other IO + completes under this time. + . + .It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint + Flush at least this many entries each transaction. + .Pp + OpenZFS will estimate how many entries it needs to flush each transaction to + keep up with the ingest rate (see + .Sy zfs_dedup_log_flush_flow_rate_txgs ) . + This sets the minimum for that estimate. + Raising it can force OpenZFS to flush more aggressively, keeping the log small + and so reducing pool import times, but can make it less able to back off if + log flushing would compete with other IO too much. + . + .It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint + Number of transactions to use to compute the flow rate. + .Pp + OpenZFS will estimate how many entries it needs to flush each transaction by + monitoring the number of entries changed (ingest rate), number of entries + flushed (flush rate) and time spent flushing (flush time rate) and combining + these into an overall "flow rate". 
+ It will use an exponential weighted moving average over some number of recent + transactions to compute these rates. + This sets the number of transactions to compute these averages over. + Setting it higher can help to smooth out the flow rate in the face of spiky + workloads, but will take longer for the flow rate to adjust to a sustained + change in the ingress rate. + . + .It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint + Max transactions to before starting to flush dedup logs. + .Pp + OpenZFS maintains two dedup logs, one receiving new changes, one flushing. + If there is nothing to flush, it will accumulate changes for no more than this + many transactions before switching the logs and starting to flush entries out. + . + .It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64 + Max memory to use for dedup logs. + .Pp + OpenZFS will spend no more than this much memory on maintaining the in-memory + dedup log. + Flushing will begin when around half this amount is being spent on logs. + The default value of + .Sy 0 + will cause it to be set by + .Sy zfs_dedup_log_mem_max_percent + instead. + . + .It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint + Max memory to use for dedup logs, as a percentage of total memory. + .Pp + If + .Sy zfs_dedup_log_mem_max + is not set, it will be initialised as a percentage of the total memory in the + system. + . + .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint + Start to delay each transaction once there is this amount of dirty data, + expressed as a percentage of + .Sy zfs_dirty_data_max . + This value should be at least + .Sy zfs_vdev_async_write_active_max_dirty_percent . + .No See Sx ZFS TRANSACTION DELAY . + . + .It Sy zfs_delay_scale Ns = Ns Sy 500000 Pq int + This controls how quickly the transaction delay approaches infinity. + Larger values cause longer delays for a given amount of dirty data. + .Pp + For the smoothest delay, this value should be about 1 billion divided by the maximum number of operations per second. This will smoothly handle between ten times and a tenth of this number. .No See Sx ZFS TRANSACTION DELAY . diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index cbd07d19a7f9..a74f09a942f1 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -183,6 +183,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); static kmem_cache_t *dbuf_kmem_cache; kmem_cache_t *dbuf_dirty_kmem_cache; static taskq_t *dbu_evict_taskq; +static taskq_t *dbuf_evict_taskq; static kthread_t *dbuf_cache_evict_thread; static kmutex_t dbuf_evict_lock; @@ -237,6 +238,20 @@ static uint_t dbuf_metadata_cache_shift = 6; /* Set the dbuf hash mutex count as log2 shift (dynamic by default) */ static uint_t dbuf_mutex_cache_shift = 0; +/* + * Number of dbuf_evict threads + */ +static uint_t dbuf_evict_threads = 0; + +/* + * The minimum number of bytes we can evict at once is a block size. + * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task. + * We use this value to compute a scaling factor for the eviction tasks. + */ +#define DBUF_MIN_EVICT_PERTASK_SHIFT (SPA_MAXBLOCKSHIFT) + +static uint_t dbuf_evict_parallel = 0; + static unsigned long dbuf_cache_target_bytes(void); static unsigned long dbuf_metadata_cache_target_bytes(void); @@ -768,26 +783,47 @@ dbuf_cache_above_lowater(void) } /* - * Evict the oldest eligible dbuf from the dbuf cache. + * Evict the oldest eligible dbufs from the dbuf cache. + * Use the multilist sublist (mls) with the provided index #idx. 
*/ static void -dbuf_evict_one(void) +dbuf_evict_many(uint64_t bytes, unsigned int idx) { - int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache); + int64_t evicted = 0; + dmu_buf_impl_t *marker = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); + marker->db_objset = NULL; + + ASSERT3U(idx, <, multilist_get_num_sublists( + &dbuf_caches[DB_DBUF_CACHE].cache)); + multilist_sublist_t *mls = multilist_sublist_lock_idx( &dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); dmu_buf_impl_t *db = multilist_sublist_tail(mls); - while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { - db = multilist_sublist_prev(mls, db); - } + multilist_sublist_insert_after(mls, db, marker); + + while (db != NULL && evicted < bytes) { + int skip = 0; + while (db != NULL && (db->db_objset == NULL || + mutex_tryenter(&db->db_mtx) == 0)) { + db = multilist_sublist_prev(mls, db); + if (skip == 0) + skip = 1; + } - DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, - multilist_sublist_t *, mls); + if (db == NULL) + break; + + if (skip) { + multilist_sublist_remove(mls, marker); + multilist_sublist_insert_before(mls, db, marker); + } + + DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, + multilist_sublist_t *, mls); - if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); uint64_t size = db->db.db_size; @@ -803,9 +839,121 @@ dbuf_evict_one(void) db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); DBUF_STAT_BUMP(cache_total_evicts); - } else { - multilist_sublist_unlock(mls); + evicted += size + usize; + + mls = multilist_sublist_lock_idx( + &dbuf_caches[DB_DBUF_CACHE].cache, idx); + db = multilist_sublist_prev(mls, marker); } + + multilist_sublist_remove(mls, marker); + multilist_sublist_unlock(mls); + kmem_cache_free(dbuf_kmem_cache, marker); +} + +typedef struct evict_arg { + taskq_ent_t tqe; + unsigned idx; + uint64_t bytes; +} evict_arg_t; + +static void +dbuf_evict_task(void *arg) +{ + evict_arg_t *eva = arg; + dbuf_evict_many(eva->bytes, eva->idx); +} + +static void +dbuf_evict(void) +{ + int64_t bytes = (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) - + dbuf_cache_lowater_bytes()); + + if (bytes <= 0) + return; + + unsigned idx = multilist_get_random_index( + &dbuf_caches[DB_DBUF_CACHE].cache); + + if (!dbuf_evict_parallel) + return (dbuf_evict_many(bytes, idx)); + + /* + * Go to the parallel eviction. + */ + unsigned int num_sublists = multilist_get_num_sublists( + &dbuf_caches[DB_DBUF_CACHE].cache); + evict_arg_t *evarg = kmem_zalloc(sizeof (*evarg) * num_sublists, + KM_SLEEP); + /* + * How we scale + * + * Example 1, # of chunks less than # of tasks. + * We have: + * - 4 tasks + * - 3 chunks + * - 3 full col + * - 0 low cols. + * + * The first low col index is 3. + * The tasks #0-#2 evict 1 chunk each. + * + * 0 | 1 | 2 | 3 | + * +===+===+===+===+ + * | x | x | x | | + * +---+---+---+---+ + * + * Example 2, # of chunks more than # of tasks. + * We have: + * - 4 tasks + * - 9 chunks + * - 1 full col + * - 3 low cols + * + * The first low col index is 1. + * The task #0 evicts 3 chunks, the others evict 2 chunks each. + * + * 0 | 1 | 2 | 3 | + * +===+===+===+===+ + * | x | x | x | x | + * +---+---+---+---+ + * | x | x | x | x | + * +---+---+---+---+ + * | x | | | | + * +---+---+---+---+ + */ + + /* + * Compute number of tasks to run (n), first low col index (k), + * normal and low bytes per task. + */ + uint64_t nchunks = ((bytes - 1) >> DBUF_MIN_EVICT_PERTASK_SHIFT) + 1; + unsigned n = nchunks < num_sublists ? 
nchunks : num_sublists; + uint64_t fullrows = nchunks / n; + unsigned lastrowcols = nchunks % n; + unsigned k = (lastrowcols ? lastrowcols : n); + + uint64_t bytes_pertask_low = fullrows << DBUF_MIN_EVICT_PERTASK_SHIFT; + uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ? + (1 << DBUF_MIN_EVICT_PERTASK_SHIFT) : 0); + + for (unsigned i = 0; i < n; i++) { + uint64_t evict = i < k ? bytes_pertask : bytes_pertask_low; + + evarg[i].idx = idx; + evarg[i].bytes = evict; + + taskq_dispatch_ent(dbuf_evict_taskq, dbuf_evict_task, + &evarg[i], 0, &evarg[i].tqe); + + /* wrap idx */ + if (++idx >= num_sublists) + idx = 0; + } + + taskq_wait(dbuf_evict_taskq); + kmem_free(evarg, sizeof (*evarg) * num_sublists); } /* @@ -839,7 +987,7 @@ dbuf_evict_thread(void *unused) * minimize lock contention. */ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { - dbuf_evict_one(); + dbuf_evict(); } mutex_enter(&dbuf_evict_lock); @@ -866,7 +1014,7 @@ dbuf_evict_notify(uint64_t size) */ if (size > dbuf_cache_target_bytes()) { if (size > dbuf_cache_hiwater_bytes()) - dbuf_evict_one(); + dbuf_evict(); cv_signal(&dbuf_evict_cv); } } @@ -975,11 +1123,16 @@ dbuf_init(void) dbuf_stats_init(h); + if (dbuf_evict_threads == 0) + dbuf_evict_threads = MAX(2, MIN(16, max_ncpus >> 3)); /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. */ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); + dbuf_evict_taskq = taskq_create("dbuf_evict", + MIN(dbuf_evict_threads, max_ncpus), defclsyspri, + MIN(dbuf_evict_threads, max_ncpus), max_ncpus, TASKQ_PREPOPULATE); for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { multilist_create(&dbuf_caches[dcs].cache, @@ -1047,6 +1200,8 @@ dbuf_fini(void) kmem_cache_destroy(dbuf_kmem_cache); kmem_cache_destroy(dbuf_dirty_kmem_cache); taskq_destroy(dbu_evict_taskq); + taskq_wait(dbuf_evict_taskq); + taskq_destroy(dbuf_evict_taskq); mutex_enter(&dbuf_evict_lock); dbuf_evict_thread_exit = B_TRUE; @@ -4106,7 +4261,7 @@ dmu_buf_rele(dmu_buf_t *db, const void *tag) * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() * ^ | * | | - * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * +-----dbuf_destroy()<--dbuf_evict()<------------+ * */ void @@ -5441,3 +5596,9 @@ ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD, "Set size of dbuf cache mutex array as log2 shift."); + +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_parallel, UINT, ZMOD_RW, + "Evict from the dbuf cache in parallel using a taskq"); + +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, evict_threads, UINT, ZMOD_RW, + "Maximum number of dbuf_evict threads");
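The work-splitting arithmetic added to dbuf_evict() above (nchunks, n, k, bytes_pertask) can be checked in isolation. The sketch below reproduces the two examples from the block comment, counting whole chunks instead of bytes so the DBUF_MIN_EVICT_PERTASK_SHIFT scaling drops out; it is a verification aid, not code from the patch:

/* Each of the first k tasks gets one extra chunk, so totals sum to nchunks. */
#include <stdio.h>

static void
split(unsigned int nchunks, unsigned int num_sublists)
{
	unsigned int n = nchunks < num_sublists ? nchunks : num_sublists;
	unsigned int fullrows = nchunks / n;
	unsigned int lastrowcols = nchunks % n;
	unsigned int k = lastrowcols ? lastrowcols : n;
	unsigned int total = 0;

	printf("%u chunks over %u sublists:", nchunks, num_sublists);
	for (unsigned int i = 0; i < n; i++) {
		unsigned int chunks = fullrows + (i < k && lastrowcols ? 1 : 0);

		total += chunks;
		printf(" %u", chunks);
	}
	printf(" (sum %u)\n", total);
}

int
main(void)
{
	split(3, 4);	/* example 1: 1 1 1   (three tasks, one chunk each) */
	split(9, 4);	/* example 2: 3 2 2 2 (task 0 gets the extra chunk) */
	return (0);
}

With dbuf_evict_parallel left at its default of 0, none of this partitioning runs: dbuf_evict() falls straight through to a single dbuf_evict_many() call on one randomly chosen sublist, preserving the previous single-threaded eviction behaviour.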