From b97b51c4e3463e4495b2a0b87c2207bb487b574c Mon Sep 17 00:00:00 2001 From: Sean Doran Date: Tue, 5 Dec 2023 22:21:47 +0000 Subject: [PATCH] fix abd, have taskq_wait_synced() wait for threads to be created taskq_wait_synced() did a VERIFY() on whether the taskq's threads were the requested number, but taskq_create() can ultimately return early because taskq_thread_create() is allowed to return when two desired threads are created. fix this race panic. also, taskq_wait_synced() may fail if if num_ecores is nonzero (on Apple Silicon), so create a flag that lets taskq_create_common() deal with the max_ncpus. Make boot_ncpus a variable that's MAX(1, (int)max_ncores - num_ecores). boot_ncpus is used in common code. Modify the alignments and quanta/import sizes of the abd kmem and vmem cache creations. Make DEBUG builds work with KMF_LITE | KMF_BUFCTL on the abd kmem caches. Signed-off-by: Sean Doran --- include/os/windows/spl/sys/sysmacros.h | 3 ++- include/os/windows/spl/sys/taskq.h | 1 + module/os/windows/spl/spl-seg_kmem.c | 32 ++++++++++++-------------- module/os/windows/spl/spl-taskq.c | 19 +++++++++++++-- module/os/windows/spl/spl-windows.c | 7 ++++++ module/os/windows/zfs/abd_os.c | 29 +++++++++++++++++++++-- 6 files changed, 69 insertions(+), 22 deletions(-) diff --git a/include/os/windows/spl/sys/sysmacros.h b/include/os/windows/spl/sys/sysmacros.h index 1bedb83b59f5..3f26be5ebe20 100644 --- a/include/os/windows/spl/sys/sysmacros.h +++ b/include/os/windows/spl/sys/sysmacros.h @@ -61,6 +61,8 @@ extern uint32_t cpu_number(void); #define is_system_labeled() 0 extern unsigned int max_ncpus; +extern unsigned int boot_ncpus; +extern unsigned int num_ecores; #ifndef RLIM64_INFINITY #define RLIM64_INFINITY (~0ULL) @@ -133,7 +135,6 @@ extern uint32_t zone_get_hostid(void *zone); extern void spl_setup(void); extern void spl_cleanup(void); -#define boot_ncpus max_ncpus #define SET_ERROR(err) \ (__set_error(__FILE__, __func__, __LINE__, err), err) diff --git a/include/os/windows/spl/sys/taskq.h b/include/os/windows/spl/sys/taskq.h index b50a1c3d061a..5977cc91edb4 100644 --- a/include/os/windows/spl/sys/taskq.h +++ b/include/os/windows/spl/sys/taskq.h @@ -60,6 +60,7 @@ struct proc; #ifdef _WIN32 #define TASKQ_TIMESHARE 0x0020 /* macOS dynamic thread priority */ #define TASKQ_REALLY_DYNAMIC 0x0040 /* don't filter out TASKQ_DYNAMIC */ +#define TASKQ_CREATE_SYNCED 0x0080 /* don't deflate ncpus */ #endif /* * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as diff --git a/module/os/windows/spl/spl-seg_kmem.c b/module/os/windows/spl/spl-seg_kmem.c index 1afdfa2872f7..6bfcc336951a 100644 --- a/module/os/windows/spl/spl-seg_kmem.c +++ b/module/os/windows/spl/spl-seg_kmem.c @@ -227,31 +227,27 @@ segkmem_abd_init() /* * OpenZFS does not segregate the abd kmem cache out of the general * heap, leading to large numbers of short-lived slabs exchanged - * between the kmem cache and it's parent. XNU absorbs this with a - * qcache, following its history of absorbing the pre-ABD zio file and - * metadata caches being qcached (which raises the exchanges with the - * general heap from PAGESIZE to 256k). + * between the kmem cache and it's parent. XNU absorbs this with a a + * large minimum request to the parent vmem_caches on large-memory + * MacOS systems. */ extern vmem_t *spl_heap_arena; -#define BIG_SLAB 131072 -#ifdef __arm64__ -#define BIG_BIG_SLAB (BIG_SLAB * 2) -#else -#define BIG_BIG_SLAB BIG_SLAB -#endif +#define BIG_SLAB (PAGESIZE * 16) #define SMALL_RAM_MACHINE (4ULL * 1024ULL * 1024ULL * 1024ULL) if (total_memory >= SMALL_RAM_MACHINE) { abd_arena = vmem_create("abd_cache", NULL, 0, - PAGESIZE, vmem_alloc_impl, vmem_free_impl, spl_heap_arena, - BIG_BIG_SLAB, VM_SLEEP | VMC_NO_QCACHE); + sizeof (void *), + vmem_alloc_impl, vmem_free_impl, spl_heap_arena, + BIG_SLAB, VM_SLEEP | VMC_NO_QCACHE); } else { abd_arena = vmem_create("abd_cache", NULL, 0, - PAGESIZE, vmem_alloc_impl, vmem_free_impl, spl_heap_arena, - 131072, VM_SLEEP | VMC_NO_QCACHE); + sizeof (void *), + vmem_alloc_impl, vmem_free_impl, spl_heap_arena, + PAGESIZE, VM_SLEEP | VMC_NO_QCACHE); } VERIFY3P(abd_arena, !=, NULL); @@ -268,13 +264,15 @@ segkmem_abd_init() if (total_memory >= SMALL_RAM_MACHINE) { abd_subpage_arena = vmem_create("abd_subpage_cache", NULL, 0, - sizeof (void *), vmem_alloc_impl, vmem_free_impl, + sizeof (void *), + vmem_alloc_impl, vmem_free_impl, spl_heap_arena, BIG_SLAB, VM_SLEEP | VMC_NO_QCACHE); } else { abd_subpage_arena = vmem_create("abd_subpage_cache", NULL, 0, - 512, vmem_alloc_impl, vmem_free_impl, abd_arena, - 131072, VM_SLEEP | VMC_NO_QCACHE); + sizeof (void *), + vmem_alloc_impl, vmem_free_impl, abd_arena, + PAGESIZE, VM_SLEEP | VMC_NO_QCACHE); } VERIFY3P(abd_subpage_arena, !=, NULL); diff --git a/module/os/windows/spl/spl-taskq.c b/module/os/windows/spl/spl-taskq.c index 30944f85ff36..44a8b53768e6 100644 --- a/module/os/windows/spl/spl-taskq.c +++ b/module/os/windows/spl/spl-taskq.c @@ -2359,6 +2359,8 @@ taskq_create_common(const char *name, int instance, int nthreads, pri_t pri, taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_SLEEP); #ifdef _WIN32 uint_t ncpus = max_ncpus; + if (!(flags & TASKQ_CREATE_SYNCED)) + ncpus = boot_ncpus; /* possibly deflated by num_ecores */ #else uint_t ncpus = ((boot_max_ncpus == -1) ? max_ncpus : boot_max_ncpus); #endif @@ -2820,9 +2822,22 @@ taskq_create_synced(const char *name, int nthreads, pri_t pri, flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH); tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX, - flags | TASKQ_PREPOPULATE); + flags | TASKQ_PREPOPULATE | TASKQ_CREATE_SYNCED); + VERIFY(tq != NULL); - VERIFY(tq->tq_nthreads == nthreads); + + /* wait until our minalloc (nthreads) threads are created */ + mutex_enter(&tq->tq_lock); + for (int i = 1; tq->tq_nthreads != nthreads; i++) { + dprintf("SPL: %s:%d: waiting for tq_nthreads (%d)" + " to be nthreads (%d), (target = %d, pass %d)\n", + __func__, __LINE__, + tq->tq_nthreads, tq->tq_nthreads_target, nthreads, i); + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + } + mutex_exit(&tq->tq_lock); + + VERIFY3U(tq->tq_nthreads, ==, nthreads); /* spawn all syncthreads */ for (int i = 0; i < nthreads; i++) { diff --git a/module/os/windows/spl/spl-windows.c b/module/os/windows/spl/spl-windows.c index 453067dbe3bf..0218fbc12e77 100644 --- a/module/os/windows/spl/spl-windows.c +++ b/module/os/windows/spl/spl-windows.c @@ -45,6 +45,7 @@ static utsname_t utsname_static = { { 0 } }; unsigned int max_ncpus = 0; +unsigned int boot_ncpus = 0; uint64_t total_memory = 0; uint64_t real_total_memory = 0; @@ -495,6 +496,12 @@ spl_start(PUNICODE_STRING RegistryPath) max_ncpus = KeQueryActiveProcessorCountEx(ALL_PROCESSOR_GROUPS); if (!max_ncpus) max_ncpus = 1; dprintf("SPL: total ncpu %d\n", max_ncpus); +#if defined(__arm64__) + num_ecores = (max_ncpus > 4) ? 4 : 0; // Apple has 4 eCores, fixme + boot_ncpus = MAX(1, (int)max_ncpus - (int)num_ecores); +#else + boot_ncpus = max_ncpus; +#endif // Not sure how to get physical RAM size in a Windows Driver // So until then, pull some numbers out of the aether. Next diff --git a/module/os/windows/zfs/abd_os.c b/module/os/windows/zfs/abd_os.c index 9cdde5d9cef0..70a1386de622 100644 --- a/module/os/windows/zfs/abd_os.c +++ b/module/os/windows/zfs/abd_os.c @@ -385,9 +385,34 @@ abd_init(void) /* check if we guessed ABD_PGSIZE correctly */ ASSERT3U(ABD_PGSIZE, ==, PAGE_SIZE); +#ifdef DEBUG + /* + * KMF_BUFTAG | KMF_LITE on the abd kmem_caches causes them to waste + * up to 50% of their memory for redzone. Even in DEBUG builds this + * therefore should be KMC_NOTOUCH unless there are concerns about + * overruns, UAFs, etc involving abd chunks or subpage chunks. + * + * Additionally these KMF_ + * flags require the definitions from + */ + + /* + * DEBUGGING: do this + * const int cflags = KMF_BUFTAG | KMF_LITE; + * or + * const int cflags = KMC_ARENA_SLAB; + * (the latter tests larger exchanges of memory with the kernel) + */ + + int cflags = KMF_BUFTAG | KMF_LITE; + // int cflags = KMC_ARENA_SLAB; +#else + int cflags = KMC_NOTOUCH; +#endif + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, - ABD_PGSIZE, - NULL, NULL, NULL, NULL, abd_arena, KMC_NOTOUCH); + sizeof (void *), + NULL, NULL, NULL, NULL, abd_arena, cflags); wmsum_init(&abd_sums.abdstat_struct_size, 0); wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);