From a27086177c23899fd5ca15553908d0ba205051b2 Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Fri, 31 Jul 2020 10:32:26 +0900 Subject: [PATCH 001/231] Windows: bring in SPL sourcefiles cstyle still needs to be applied --- include/os/windows/spl/rpc/types.h | 7 + include/os/windows/spl/rpc/xdr.h | 170 + include/os/windows/spl/spl-bmalloc.h | 218 + include/os/windows/spl/spl-ctl.h | 45 + include/os/windows/spl/spl-debug.h | 64 + include/os/windows/spl/spl-device.h | 29 + include/os/windows/spl/spl-trace.h | 29 + include/os/windows/spl/strings.h | 4 + include/os/windows/spl/sys/acl.h | 118 + include/os/windows/spl/sys/acl_impl.h | 5 + include/os/windows/spl/sys/atomic.h | 178 + include/os/windows/spl/sys/attr.h | 7 + include/os/windows/spl/sys/avl.h | 316 + include/os/windows/spl/sys/avl_impl.h | 144 + include/os/windows/spl/sys/bitmap.h | 5 + include/os/windows/spl/sys/bootconf.h | 5 + include/os/windows/spl/sys/bootprops.h | 5 + include/os/windows/spl/sys/buf.h | 5 + include/os/windows/spl/sys/byteorder.h | 64 + include/os/windows/spl/sys/callb.h | 67 + include/os/windows/spl/sys/cmn_err.h | 54 + include/os/windows/spl/sys/compress.h | 5 + include/os/windows/spl/sys/condvar.h | 11 + include/os/windows/spl/sys/conf.h | 7 + include/os/windows/spl/sys/console.h | 5 + include/os/windows/spl/sys/cpupart.h | 5 + include/os/windows/spl/sys/cpuvar.h | 5 + include/os/windows/spl/sys/crc32.h | 5 + include/os/windows/spl/sys/cred.h | 69 + include/os/windows/spl/sys/ctype.h | 7 + include/os/windows/spl/sys/ddi.h | 5 + include/os/windows/spl/sys/debug.h | 202 + include/os/windows/spl/sys/dirent.h | 35 + include/os/windows/spl/sys/disp.h | 5 + include/os/windows/spl/sys/dkio.h | 15 + include/os/windows/spl/sys/dklabel.h | 5 + include/os/windows/spl/sys/dnlc.h | 24 + include/os/windows/spl/sys/dumphdr.h | 4 + include/os/windows/spl/sys/efi_partition.h | 5 + include/os/windows/spl/sys/errno.h | 158 + include/os/windows/spl/sys/extdirent.h | 7 + include/os/windows/spl/sys/fcntl.h | 30 + include/os/windows/spl/sys/file.h | 30 + include/os/windows/spl/sys/fs/swapnode.h | 5 + include/os/windows/spl/sys/idmap.h | 7 + include/os/windows/spl/sys/int_limits.h | 5 + include/os/windows/spl/sys/int_types.h | 7 + include/os/windows/spl/sys/inttypes.h | 5 + include/os/windows/spl/sys/isa_defs.h | 125 + include/os/windows/spl/sys/kidmap.h | 7 + include/os/windows/spl/sys/kmem.h | 157 + include/os/windows/spl/sys/kmem_impl.h | 503 ++ include/os/windows/spl/sys/kobj.h | 48 + include/os/windows/spl/sys/kstat.h | 277 + include/os/windows/spl/sys/list.h | 149 + include/os/windows/spl/sys/md5.h | 70 + include/os/windows/spl/sys/md5_consts.h | 133 + include/os/windows/spl/sys/mkdev.h | 5 + include/os/windows/spl/sys/mntent.h | 5 + include/os/windows/spl/sys/mode.h | 5 + include/os/windows/spl/sys/mount.h | 128 + include/os/windows/spl/sys/mutex.h | 98 + include/os/windows/spl/sys/note.h | 5 + include/os/windows/spl/sys/old-taskq.h | 186 + include/os/windows/spl/sys/open.h | 5 + include/os/windows/spl/sys/param.h | 17 + include/os/windows/spl/sys/pathname.h | 12 + include/os/windows/spl/sys/policy.h | 120 + include/os/windows/spl/sys/pool.h | 7 + include/os/windows/spl/sys/priv.h | 533 ++ include/os/windows/spl/sys/priv_impl.h | 5 + include/os/windows/spl/sys/proc.h | 14 + include/os/windows/spl/sys/processor.h | 11 + include/os/windows/spl/sys/pset.h | 15 + include/os/windows/spl/sys/random.h | 37 + include/os/windows/spl/sys/refstr.h | 5 + include/os/windows/spl/sys/resource.h | 7 + include/os/windows/spl/sys/rwlock.h | 
67 + include/os/windows/spl/sys/sdt.h | 5 + include/os/windows/spl/sys/seg_kmem.h | 150 + include/os/windows/spl/sys/sid.h | 103 + include/os/windows/spl/sys/signal.h | 80 + include/os/windows/spl/sys/stat.h | 65 + include/os/windows/spl/sys/stropts.h | 173 + include/os/windows/spl/sys/sunddi.h | 197 + include/os/windows/spl/sys/sunldi.h | 34 + include/os/windows/spl/sys/sysdc.h | 4 + include/os/windows/spl/sys/sysevent.h | 36 + .../os/windows/spl/sys/sysevent/eventdefs.h | 135 + include/os/windows/spl/sys/sysmacros.h | 266 + include/os/windows/spl/sys/systeminfo.h | 14 + include/os/windows/spl/sys/systm.h | 71 + include/os/windows/spl/sys/t_lock.h | 10 + include/os/windows/spl/sys/taskq.h | 102 + include/os/windows/spl/sys/taskq_impl.h | 179 + include/os/windows/spl/sys/thread.h | 109 + include/os/windows/spl/sys/time.h | 117 + include/os/windows/spl/sys/timer.h | 78 + include/os/windows/spl/sys/tsd.h | 53 + include/os/windows/spl/sys/types.h | 224 + include/os/windows/spl/sys/types32.h | 13 + include/os/windows/spl/sys/ubc.h | 5 + include/os/windows/spl/sys/uio.h | 189 + include/os/windows/spl/sys/unistd.h | 7 + include/os/windows/spl/sys/utsname.h | 46 + include/os/windows/spl/sys/va_list.h | 5 + include/os/windows/spl/sys/varargs.h | 7 + include/os/windows/spl/sys/vfs.h | 85 + include/os/windows/spl/sys/vfs_opreg.h | 5 + include/os/windows/spl/sys/vmem.h | 174 + include/os/windows/spl/sys/vmem_impl.h | 154 + include/os/windows/spl/sys/vmsystm.h | 7 + include/os/windows/spl/sys/vnode.h | 583 ++ include/os/windows/spl/sys/zmod.h | 123 + include/os/windows/spl/sys/zone.h | 11 + include/os/windows/spl/unistd.h | 4 + include/os/windows/zfs/sys/spa.h | 1192 +++ include/os/windows/zfs/zfs_config.h | 95 + module/os/windows/PORTING_NOTES.txt | 89 + module/os/windows/README.md | 430 + module/os/windows/driver.c | 335 + module/os/windows/spl/CMakeLists.txt | 32 + module/os/windows/spl/spl-atomic.c | 53 + module/os/windows/spl/spl-avl.c | 1077 +++ module/os/windows/spl/spl-condvar.c | 281 + module/os/windows/spl/spl-cred.c | 165 + module/os/windows/spl/spl-ddi.c | 655 ++ module/os/windows/spl/spl-debug.c | 28 + module/os/windows/spl/spl-err.c | 67 + module/os/windows/spl/spl-kmem.c | 7067 +++++++++++++++++ module/os/windows/spl/spl-kstat.c | 1943 +++++ module/os/windows/spl/spl-list.c | 197 + module/os/windows/spl/spl-md5.c | 667 ++ module/os/windows/spl/spl-mount.c | 94 + module/os/windows/spl/spl-mutex.c | 191 + module/os/windows/spl/spl-policy.c | 880 ++ module/os/windows/spl/spl-proc.c | 26 + module/os/windows/spl/spl-processor.c | 48 + module/os/windows/spl/spl-rwlock.c | 238 + module/os/windows/spl/spl-seg_kmem.c | 291 + module/os/windows/spl/spl-taskq.c | 2296 ++++++ module/os/windows/spl/spl-thread.c | 142 + module/os/windows/spl/spl-time.c | 113 + module/os/windows/spl/spl-tsd.c | 387 + module/os/windows/spl/spl-uio.c | 312 + module/os/windows/spl/spl-vmem.c | 3813 +++++++++ module/os/windows/spl/spl-vnode.c | 1818 +++++ module/os/windows/spl/spl-windows.c | 647 ++ module/os/windows/spl/spl-xdr.c | 523 ++ module/os/windows/spl/spl-zlib.c | 199 + module/os/windows/zfs/CMakeLists.txt | 129 + 151 files changed, 35111 insertions(+) create mode 100644 include/os/windows/spl/rpc/types.h create mode 100644 include/os/windows/spl/rpc/xdr.h create mode 100644 include/os/windows/spl/spl-bmalloc.h create mode 100644 include/os/windows/spl/spl-ctl.h create mode 100644 include/os/windows/spl/spl-debug.h create mode 100644 include/os/windows/spl/spl-device.h create mode 100644 
include/os/windows/spl/spl-trace.h create mode 100644 include/os/windows/spl/strings.h create mode 100644 include/os/windows/spl/sys/acl.h create mode 100644 include/os/windows/spl/sys/acl_impl.h create mode 100644 include/os/windows/spl/sys/atomic.h create mode 100644 include/os/windows/spl/sys/attr.h create mode 100644 include/os/windows/spl/sys/avl.h create mode 100644 include/os/windows/spl/sys/avl_impl.h create mode 100644 include/os/windows/spl/sys/bitmap.h create mode 100644 include/os/windows/spl/sys/bootconf.h create mode 100644 include/os/windows/spl/sys/bootprops.h create mode 100644 include/os/windows/spl/sys/buf.h create mode 100644 include/os/windows/spl/sys/byteorder.h create mode 100644 include/os/windows/spl/sys/callb.h create mode 100644 include/os/windows/spl/sys/cmn_err.h create mode 100644 include/os/windows/spl/sys/compress.h create mode 100644 include/os/windows/spl/sys/condvar.h create mode 100644 include/os/windows/spl/sys/conf.h create mode 100644 include/os/windows/spl/sys/console.h create mode 100644 include/os/windows/spl/sys/cpupart.h create mode 100644 include/os/windows/spl/sys/cpuvar.h create mode 100644 include/os/windows/spl/sys/crc32.h create mode 100644 include/os/windows/spl/sys/cred.h create mode 100644 include/os/windows/spl/sys/ctype.h create mode 100644 include/os/windows/spl/sys/ddi.h create mode 100644 include/os/windows/spl/sys/debug.h create mode 100644 include/os/windows/spl/sys/dirent.h create mode 100644 include/os/windows/spl/sys/disp.h create mode 100644 include/os/windows/spl/sys/dkio.h create mode 100644 include/os/windows/spl/sys/dklabel.h create mode 100644 include/os/windows/spl/sys/dnlc.h create mode 100644 include/os/windows/spl/sys/dumphdr.h create mode 100644 include/os/windows/spl/sys/efi_partition.h create mode 100644 include/os/windows/spl/sys/errno.h create mode 100644 include/os/windows/spl/sys/extdirent.h create mode 100644 include/os/windows/spl/sys/fcntl.h create mode 100644 include/os/windows/spl/sys/file.h create mode 100644 include/os/windows/spl/sys/fs/swapnode.h create mode 100644 include/os/windows/spl/sys/idmap.h create mode 100644 include/os/windows/spl/sys/int_limits.h create mode 100644 include/os/windows/spl/sys/int_types.h create mode 100644 include/os/windows/spl/sys/inttypes.h create mode 100644 include/os/windows/spl/sys/isa_defs.h create mode 100644 include/os/windows/spl/sys/kidmap.h create mode 100644 include/os/windows/spl/sys/kmem.h create mode 100644 include/os/windows/spl/sys/kmem_impl.h create mode 100644 include/os/windows/spl/sys/kobj.h create mode 100644 include/os/windows/spl/sys/kstat.h create mode 100644 include/os/windows/spl/sys/list.h create mode 100644 include/os/windows/spl/sys/md5.h create mode 100644 include/os/windows/spl/sys/md5_consts.h create mode 100644 include/os/windows/spl/sys/mkdev.h create mode 100644 include/os/windows/spl/sys/mntent.h create mode 100644 include/os/windows/spl/sys/mode.h create mode 100644 include/os/windows/spl/sys/mount.h create mode 100644 include/os/windows/spl/sys/mutex.h create mode 100644 include/os/windows/spl/sys/note.h create mode 100644 include/os/windows/spl/sys/old-taskq.h create mode 100644 include/os/windows/spl/sys/open.h create mode 100644 include/os/windows/spl/sys/param.h create mode 100644 include/os/windows/spl/sys/pathname.h create mode 100644 include/os/windows/spl/sys/policy.h create mode 100644 include/os/windows/spl/sys/pool.h create mode 100644 include/os/windows/spl/sys/priv.h create mode 100644 
include/os/windows/spl/sys/priv_impl.h create mode 100644 include/os/windows/spl/sys/proc.h create mode 100644 include/os/windows/spl/sys/processor.h create mode 100644 include/os/windows/spl/sys/pset.h create mode 100644 include/os/windows/spl/sys/random.h create mode 100644 include/os/windows/spl/sys/refstr.h create mode 100644 include/os/windows/spl/sys/resource.h create mode 100644 include/os/windows/spl/sys/rwlock.h create mode 100644 include/os/windows/spl/sys/sdt.h create mode 100644 include/os/windows/spl/sys/seg_kmem.h create mode 100644 include/os/windows/spl/sys/sid.h create mode 100644 include/os/windows/spl/sys/signal.h create mode 100644 include/os/windows/spl/sys/stat.h create mode 100644 include/os/windows/spl/sys/stropts.h create mode 100644 include/os/windows/spl/sys/sunddi.h create mode 100644 include/os/windows/spl/sys/sunldi.h create mode 100644 include/os/windows/spl/sys/sysdc.h create mode 100644 include/os/windows/spl/sys/sysevent.h create mode 100644 include/os/windows/spl/sys/sysevent/eventdefs.h create mode 100644 include/os/windows/spl/sys/sysmacros.h create mode 100644 include/os/windows/spl/sys/systeminfo.h create mode 100644 include/os/windows/spl/sys/systm.h create mode 100644 include/os/windows/spl/sys/t_lock.h create mode 100644 include/os/windows/spl/sys/taskq.h create mode 100644 include/os/windows/spl/sys/taskq_impl.h create mode 100644 include/os/windows/spl/sys/thread.h create mode 100644 include/os/windows/spl/sys/time.h create mode 100644 include/os/windows/spl/sys/timer.h create mode 100644 include/os/windows/spl/sys/tsd.h create mode 100644 include/os/windows/spl/sys/types.h create mode 100644 include/os/windows/spl/sys/types32.h create mode 100644 include/os/windows/spl/sys/ubc.h create mode 100644 include/os/windows/spl/sys/uio.h create mode 100644 include/os/windows/spl/sys/unistd.h create mode 100644 include/os/windows/spl/sys/utsname.h create mode 100644 include/os/windows/spl/sys/va_list.h create mode 100644 include/os/windows/spl/sys/varargs.h create mode 100644 include/os/windows/spl/sys/vfs.h create mode 100644 include/os/windows/spl/sys/vfs_opreg.h create mode 100644 include/os/windows/spl/sys/vmem.h create mode 100644 include/os/windows/spl/sys/vmem_impl.h create mode 100644 include/os/windows/spl/sys/vmsystm.h create mode 100644 include/os/windows/spl/sys/vnode.h create mode 100644 include/os/windows/spl/sys/zmod.h create mode 100644 include/os/windows/spl/sys/zone.h create mode 100644 include/os/windows/spl/unistd.h create mode 100644 include/os/windows/zfs/sys/spa.h create mode 100644 include/os/windows/zfs/zfs_config.h create mode 100644 module/os/windows/PORTING_NOTES.txt create mode 100644 module/os/windows/README.md create mode 100644 module/os/windows/driver.c create mode 100644 module/os/windows/spl/CMakeLists.txt create mode 100644 module/os/windows/spl/spl-atomic.c create mode 100644 module/os/windows/spl/spl-avl.c create mode 100644 module/os/windows/spl/spl-condvar.c create mode 100644 module/os/windows/spl/spl-cred.c create mode 100644 module/os/windows/spl/spl-ddi.c create mode 100644 module/os/windows/spl/spl-debug.c create mode 100644 module/os/windows/spl/spl-err.c create mode 100644 module/os/windows/spl/spl-kmem.c create mode 100644 module/os/windows/spl/spl-kstat.c create mode 100644 module/os/windows/spl/spl-list.c create mode 100644 module/os/windows/spl/spl-md5.c create mode 100644 module/os/windows/spl/spl-mount.c create mode 100644 module/os/windows/spl/spl-mutex.c create mode 100644 
module/os/windows/spl/spl-policy.c create mode 100644 module/os/windows/spl/spl-proc.c create mode 100644 module/os/windows/spl/spl-processor.c create mode 100644 module/os/windows/spl/spl-rwlock.c create mode 100644 module/os/windows/spl/spl-seg_kmem.c create mode 100644 module/os/windows/spl/spl-taskq.c create mode 100644 module/os/windows/spl/spl-thread.c create mode 100644 module/os/windows/spl/spl-time.c create mode 100644 module/os/windows/spl/spl-tsd.c create mode 100644 module/os/windows/spl/spl-uio.c create mode 100644 module/os/windows/spl/spl-vmem.c create mode 100644 module/os/windows/spl/spl-vnode.c create mode 100644 module/os/windows/spl/spl-windows.c create mode 100644 module/os/windows/spl/spl-xdr.c create mode 100644 module/os/windows/spl/spl-zlib.c create mode 100644 module/os/windows/zfs/CMakeLists.txt diff --git a/include/os/windows/spl/rpc/types.h b/include/os/windows/spl/rpc/types.h new file mode 100644 index 000000000000..4602ededc064 --- /dev/null +++ b/include/os/windows/spl/rpc/types.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_RPC_TYPES_H +#define _SPL_RPC_TYPES_H + +typedef int bool_t; + +#endif /* SPL_RPC_TYPES_H */ diff --git a/include/os/windows/spl/rpc/xdr.h b/include/os/windows/spl/rpc/xdr.h new file mode 100644 index 000000000000..70f006927926 --- /dev/null +++ b/include/os/windows/spl/rpc/xdr.h @@ -0,0 +1,170 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright (c) 1989, 2011, Oracle and/or its affiliates. All rights reserved. + */ +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Portions of this source code were derived from Berkeley + * 4.3 BSD under license from the Regents of the University of + * California. + */ + +/* + * xdr.h, External Data Representation Serialization Routines. + * + */ + +#ifndef _SPL_RPC_XDR_H +#define _SPL_RPC_XDR_H + + +#include +#include + +/* + * XDR enums and types. + */ +enum xdr_op { + XDR_ENCODE, + XDR_DECODE +}; + +struct xdr_ops; + +typedef struct { + struct xdr_ops *x_ops; /* Also used to let caller know if + xdrmem_create() succeeds (sigh..) 
*/ + caddr_t x_addr; /* Current buffer addr */ + caddr_t x_addr_end; /* End of the buffer */ + enum xdr_op x_op; /* Stream direction */ +} XDR; + +typedef bool_t (*xdrproc_t)(XDR *xdrs, void *ptr); + +struct xdr_ops { + bool_t (*xdr_control)(XDR *, int, void *); + + bool_t (*xdr_char)(XDR *, char *); + bool_t (*xdr_u_short)(XDR *, unsigned short *); + bool_t (*xdr_u_int)(XDR *, unsigned *); + bool_t (*xdr_u_longlong_t)(XDR *, u_longlong_t *); + + bool_t (*xdr_opaque)(XDR *, caddr_t, const uint_t); + bool_t (*xdr_string)(XDR *, char **, const uint_t); + bool_t (*xdr_array)(XDR *, caddr_t *, uint_t *, const uint_t, + const uint_t, const xdrproc_t); +}; + +/* + * XDR control operator. + */ +#define XDR_GET_BYTES_AVAIL 1 + +struct xdr_bytesrec { + bool_t xc_is_last_record; + size_t xc_num_avail; +}; + +/* + * XDR functions. + */ +void xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op); +#define xdr_destroy(xdrs) ((void) 0) /* Currently not needed. If needed later, + we'll add it to struct xdr_ops */ + +#define xdr_control(xdrs, req, info) (xdrs)->x_ops->xdr_control((xdrs), \ + (req), (info)) + +/* + * For precaution, the following are defined as static inlines instead of macros + * to get some amount of type safety. + * + * Also, macros wouldn't work in the case where typecasting is done, because it + * must be possible to reference the functions' addresses by these names. + */ +static inline bool_t xdr_char(XDR *xdrs, char *cp) +{ + return xdrs->x_ops->xdr_char(xdrs, cp); +} + +static inline bool_t xdr_u_short(XDR *xdrs, unsigned short *usp) +{ + return xdrs->x_ops->xdr_u_short(xdrs, usp); +} + +static inline bool_t xdr_short(XDR *xdrs, short *sp) +{ + //BUILD_BUG_ON(sizeof(short) != 2); + return xdrs->x_ops->xdr_u_short(xdrs, (unsigned short *) sp); +} + +static inline bool_t xdr_u_int(XDR *xdrs, unsigned *up) +{ + return xdrs->x_ops->xdr_u_int(xdrs, up); +} + +static inline bool_t xdr_int(XDR *xdrs, int *ip) +{ + //BUILD_BUG_ON(sizeof(int) != 4); + return xdrs->x_ops->xdr_u_int(xdrs, (unsigned *) ip); +} + +static inline bool_t xdr_u_longlong_t(XDR *xdrs, u_longlong_t *ullp) +{ + return xdrs->x_ops->xdr_u_longlong_t(xdrs, ullp); +} + +static inline bool_t xdr_longlong_t(XDR *xdrs, longlong_t *llp) +{ + //BUILD_BUG_ON(sizeof(longlong_t) != 8); + return xdrs->x_ops->xdr_u_longlong_t(xdrs, (u_longlong_t *) llp); +} + +/* + * Fixed-length opaque data. + */ +static inline bool_t xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + return xdrs->x_ops->xdr_opaque(xdrs, cp, cnt); +} + +/* + * Variable-length string. + * The *sp buffer must have (maxsize + 1) bytes. + */ +static inline bool_t xdr_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + return xdrs->x_ops->xdr_string(xdrs, sp, maxsize); +} + +/* + * Variable-length arrays. + */ +static inline bool_t xdr_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, + const uint_t maxsize, const uint_t elsize, const xdrproc_t elproc) +{ + return xdrs->x_ops->xdr_array(xdrs, arrp, sizep, maxsize, elsize, + elproc); +} + +#endif /* SPL_RPC_XDR_H */ diff --git a/include/os/windows/spl/spl-bmalloc.h b/include/os/windows/spl/spl-bmalloc.h new file mode 100644 index 000000000000..7fd9f4b8f76e --- /dev/null +++ b/include/os/windows/spl/spl-bmalloc.h @@ -0,0 +1,218 @@ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * Copyright 2014 Brendon Humphrey (brendon.humphrey@mac.com) + * + * CDDL HEADER END + */ + +#ifndef BMALLOC_H +#define BMALLOC_H + +#include + +/* + * Knobs and controls + */ + +/* + * Place the allocator in thread-safe mode. If you have an application where the + * allocator does not have to be thread safe, then removing the mutexes will + * improve the allocator performance by about 30%. + */ +#define THREAD_SAFE 1 + +/* + * Provide extra locking around the slice lists, as under some conditions, + * memory handling errors in the application can interfere with the locking + * strategy used. + */ +// #define SLICE_SPINLOCK 1 + +/* + * Turn on counting of the number of allocations made to each allocator. Major + * performance killer. Keep turned off. + */ +// #define COUNT_ALLOCATIONS 1 + +/* + * Borrow an idea from the Linux kernel SLUB allocator - namely, have the Slice + * Allocator simply forget about full slices. They are "found" again when a free + * occurs from the full slice, and added to the partial list again. This saves a + * small amount of list processing overhead and storage space. (The performance + * difference is probably purely academic.) + * + * You will want to enable this if hunting memory leaks. + */ +// #define SLICE_ALLOCATOR_TRACK_FULL_SLABS 1 + +// #define DEBUG 1 + +#ifdef DEBUG + +/* Select a logging mechanism. */ +// #define REPORT_PANIC 1 +#define REPORT_LOG 1 + +/* + * Check whether an application is writing beyond the number of bytes allocated + * in a call to bmalloc(). Implemented using buffer poisoning. + */ +#define SLICE_CHECK_BOUNDS_WRITE 1 + +/* + * Check for writes to memory after free. Works in part by poisoning the user + * memory on free. The idea is that if a buffer is not fully poisoned on + * allocate, there is evidence of use after free. This may have the side effect + * of causing other failures - if an application relies on valid data in the + * memory after free, bad things can happen. + */ +#define SLICE_CHECK_WRITE_AFTER_FREE 1 + +/* Check integrity of slice row headers. */ +#define SLICE_CHECK_ROW_HEADERS 1 + +/* + * Check that the number of bytes passed to bmalloc to release matches the + * number of bytes allocated. + */ +#define SLICE_CHECK_FREE_SIZE 1 + +/* + * Instrument the Slice object to detect concurrent threads accessing the data + * structures - indicative of a serious programming error. + */ +#define SLICE_CHECK_THREADS 1 + +/* + * Have the SA check that any operations performed on a slice are performed on a + * slice that the the SA actually owns. + */ +#define SA_CHECK_SLICE_SIZE 1 + +/* Select correct dependencies based on debug flags. */ + +#ifdef SLICE_CHECK_WRITE_AFTER_FREE +/* Poison user allocatable portions of slice rows on free. 
*/ +#define SLICE_POISON_USER_SPACE 1 +#endif /* SLICE_CHECK_WRITE_AFTER_FREE */ + +#endif /* DEBUG */ + +/* + * Data Types + */ +typedef uint64_t sa_size_t; +typedef uint8_t sa_byte_t; +typedef uint8_t sa_bool_t; +typedef uint64_t sa_hrtime_t; +typedef uint32_t large_offset_t; + +typedef struct slice_allocator { + + /* + * Statistics + */ + uint64_t slices_created; /* slices added to sa */ + uint64_t slices_destroyed; /* empty slices freed */ + uint64_t slice_alloc; /* allocation count */ + uint64_t slice_free; /* free count */ + uint64_t slice_alloc_fail; /* num failed allocs */ + uint64_t free_slices; /* number of empty slices cached */ + + /* + * State + */ + + uint64_t flags; + sa_size_t slice_size; + list_t free; + list_t partial; +#ifdef SLICE_ALLOCATOR_TRACK_FULL_SLABS + list_t full; +#endif /* SLICE_ALLOCATOR_TRACK_FULL_SLABS */ + /* Max alloc size for slice */ + sa_size_t max_alloc_size; + /* Number of rows to be allocated in the Slices */ + sa_size_t num_allocs_per_slice; + lck_spin_t *spinlock; +} slice_allocator_t; + +// Convenient way to access kernel_memory_allocate and kmem_free +void * osif_malloc(sa_size_t size); +void osif_free(void* buf, sa_size_t size); + +// +// Initialises the allocator, must be called before any other function. +// +void bmalloc_init(); + +// +// Allocate bytes of memory for the application +// +void* bmalloc(uint64_t size, int flags); +void* bzmalloc(uint64_t size, int flags); + +// +// Release memory from the application +// +void bfree(void* buf, uint64_t size); + +// +// Attempt to release pages of +// memory from the free memory block collection. +// Returns number of pages released. +uint64_t bmalloc_release_pages(uint64_t num_pages); + +// +// Manages from free memory within the allocator. +// Should be called periodically (say at least +// every 10 seconds). +// Returns the number of pages released as a result +uint64_t bmalloc_garbage_collect(); + +// +// Release all remaining memory and allocator resources +// +void bmalloc_fini(); + +/* + * Slice allocator interfaces for kmem to use as "slabs" for its caches + */ + +void +slice_allocator_init(slice_allocator_t *sa, sa_size_t max_alloc_size); + +void * +slice_allocator_alloc(slice_allocator_t *sa, sa_size_t size); + +void +slice_allocator_free(slice_allocator_t *sa, void *buf, sa_size_t size); + +void +slice_allocator_garbage_collect(slice_allocator_t *sa); + +uint64_t +slice_allocator_release_pages(slice_allocator_t *sa, uint64_t num_pages); + +void +slice_allocator_fini(slice_allocator_t *sa); + + +#endif diff --git a/include/os/windows/spl/spl-ctl.h b/include/os/windows/spl/spl-ctl.h new file mode 100644 index 000000000000..9db13907348f --- /dev/null +++ b/include/os/windows/spl/spl-ctl.h @@ -0,0 +1,45 @@ +/*****************************************************************************\ + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . +\*****************************************************************************/ + +#ifndef _DEBUG_CTL_H +#define _DEBUG_CTL_H + +/* + * Contains shared definitions which both the user space + * and kernel space portions of splat must agree on. + */ +typedef struct spl_debug_header { + int ph_len; + int ph_flags; + int ph_subsys; + int ph_mask; + int ph_cpu_id; + int ph_sec; + long ph_usec; + int ph_stack; + int ph_pid; + int ph_line_num; +} spl_debug_header_t; + +#endif /* _DEBUG_CTL_H */ diff --git a/include/os/windows/spl/spl-debug.h b/include/os/windows/spl/spl-debug.h new file mode 100644 index 000000000000..2cf871edfcba --- /dev/null +++ b/include/os/windows/spl/spl-debug.h @@ -0,0 +1,64 @@ +/*****************************************************************************\ + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . +\*****************************************************************************/ + +/* + * Available debug functions. These function should be used by any + * package which needs to integrate with the SPL log infrastructure. + * + * SDEBUG() - Log debug message with specified mask. + * SDEBUG_LIMIT() - Log just 1 debug message with specified mask. + * SWARN() - Log a warning message. + * SERROR() - Log an error message. + * SEMERG() - Log an emergency error message. + * SCONSOLE() - Log a generic message to the console. + * + * SENTRY - Log entry point to a function. + * SEXIT - Log exit point from a function. + * SRETURN(x) - Log return from a function. + * SGOTO(x, y) - Log goto within a function. 
+ */ + +#ifndef _SPL_DEBUG_INTERNAL_H +#define _SPL_DEBUG_INTERNAL_H + +//#include +//#include +//#include +#include +#ifdef __cplusplus +// To make C++ happier about strnlen in kcdata.h +extern "C" { +#endif +#include +#ifdef __cplusplus +} +#endif + + + +void spl_backtrace(char *thesignal); +int getpcstack(uintptr_t *pcstack, int pcstack_limit); +void print_symbol(uintptr_t symbol); + +#endif /* SPL_DEBUG_INTERNAL_H */ diff --git a/include/os/windows/spl/spl-device.h b/include/os/windows/spl/spl-device.h new file mode 100644 index 000000000000..51e8b3e04813 --- /dev/null +++ b/include/os/windows/spl/spl-device.h @@ -0,0 +1,29 @@ +/*****************************************************************************\ + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . +\*****************************************************************************/ + +#ifndef _SPL_DEVICE_H +#define _SPL_DEVICE_H + + +#endif /* _SPL_DEVICE_H */ diff --git a/include/os/windows/spl/spl-trace.h b/include/os/windows/spl/spl-trace.h new file mode 100644 index 000000000000..e89bbe08c7b6 --- /dev/null +++ b/include/os/windows/spl/spl-trace.h @@ -0,0 +1,29 @@ +/*****************************************************************************\ + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+\*****************************************************************************/ + +#ifndef _SPL_TRACE_H +#define _SPL_TRACE_H + + +#endif /* SPL_TRACE_H */ diff --git a/include/os/windows/spl/strings.h b/include/os/windows/spl/strings.h new file mode 100644 index 000000000000..697ac4db61ce --- /dev/null +++ b/include/os/windows/spl/strings.h @@ -0,0 +1,4 @@ +#ifndef _SPL_STRINGS_H +#define _SPL_STRINGS_H + +#endif /* SPL_STRINGS_H */ diff --git a/include/os/windows/spl/sys/acl.h b/include/os/windows/spl/sys/acl.h new file mode 100644 index 000000000000..c44750975c8f --- /dev/null +++ b/include/os/windows/spl/sys/acl.h @@ -0,0 +1,118 @@ +/* +* CDDL HEADER START +* +* The contents of this file are subject to the terms of the +* Common Development and Distribution License (the "License"). +* You may not use this file except in compliance with the License. +* +* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +* or http://www.opensolaris.org/os/licensing. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* When distributing Covered Code, include this CDDL HEADER in each +* file and include the License file at usr/src/OPENSOLARIS.LICENSE. +* If applicable, add the following below this CDDL HEADER, with the +* fields enclosed by brackets "[]" replaced with your own identifying +* information: Portions Copyright [yyyy] [name of copyright owner] +* +* CDDL HEADER END +*/ +/* +* Copyright 2009 Sun Microsystems, Inc. All rights reserved. +* Use is subject to license terms. +*/ + +#ifndef _SPL_ACL_H +#define _SPL_ACL_H + +#include + +typedef struct ace { + uid_t a_who; + uint32_t a_access_mask; + uint16_t a_flags; + uint16_t a_type; +} ace_t; + +typedef struct ace_object { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... 
*/ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ + uint8_t a_obj_type[16]; /* obj type */ + uint8_t a_inherit_obj_type[16]; /* inherit obj */ +} ace_object_t; + +#define MAX_ACL_ENTRIES 1024 + +#define ACE_READ_DATA 0x00000001 +#define ACE_LIST_DIRECTORY 0x00000001 +#define ACE_WRITE_DATA 0x00000002 +#define ACE_ADD_FILE 0x00000002 +#define ACE_APPEND_DATA 0x00000004 +#define ACE_ADD_SUBDIRECTORY 0x00000004 +#define ACE_READ_NAMED_ATTRS 0x00000008 +#define ACE_WRITE_NAMED_ATTRS 0x00000010 +#define ACE_EXECUTE 0x00000020 +#define ACE_DELETE_CHILD 0x00000040 +#define ACE_READ_ATTRIBUTES 0x00000080 +#define ACE_WRITE_ATTRIBUTES 0x00000100 +#define ACE_DELETE 0x00010000 +#define ACE_READ_ACL 0x00020000 +#define ACE_WRITE_ACL 0x00040000 +#define ACE_WRITE_OWNER 0x00080000 +#define ACE_SYNCHRONIZE 0x00100000 + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +#define ACL_AUTO_INHERIT 0x0001 +#define ACL_PROTECTED 0x0002 +#define ACL_DEFAULTED 0x0004 +#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED|ACL_DEFAULTED) + +#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E +#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 + +#define ACE_ALL_TYPES 0x001F + +#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE|ACE_IDENTIFIER_GROUP) + +#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_SYNCHRONIZE) + +#define VSA_ACE 0x0010 +#define VSA_ACECNT 0x0020 +#define VSA_ACE_ALLTYPES 0x0040 +#define VSA_ACE_ACLFLAGS 0x0080 + +#endif /* _SPL_ACL_H */ diff --git a/include/os/windows/spl/sys/acl_impl.h b/include/os/windows/spl/sys/acl_impl.h new file mode 100644 index 000000000000..663e78cc0f6d --- /dev/null +++ b/include/os/windows/spl/sys/acl_impl.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_ACL_IMPL_H +#define _SPL_ACL_IMPL_H + +#endif /* _SPL_ACL_IMPL_H */ diff --git a/include/os/windows/spl/sys/atomic.h b/include/os/windows/spl/sys/atomic.h new file mode 100644 index 000000000000..21ffbfead78b --- /dev/null +++ b/include/os/windows/spl/sys/atomic.h @@ -0,0 +1,178 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * OSX Atomic functions using GCC builtins. + * + * Jorgen Lundman + * + */ + +#ifndef _SPL_ATOMIC_H +#define _SPL_ATOMIC_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * + * GCC atomic versions. These are preferrable once we sort out compatibility + * issues with GCC versions? + */ + +/* The _nv variants return the NewValue */ + +/* + * Increment target + */ +static inline void atomic_inc_32(volatile uint32_t *target) +{ + InterlockedIncrement((volatile LONG *)target); +} +static inline void atomic_inc_64(volatile uint64_t *target) +{ + InterlockedIncrement64((volatile LONG64 *)target); +} +static inline int32_t atomic_inc_32_nv(volatile uint32_t *target) +{ + return InterlockedIncrement((volatile LONG *)target); +} +static inline int64_t atomic_inc_64_nv(volatile uint64_t *target) +{ + return InterlockedIncrement64((volatile LONG64 *)target); +} + + + +/* + * Decrement target + */ +static inline void atomic_dec_32(volatile uint32_t *target) +{ + InterlockedDecrement((volatile LONG *)target); +} +static inline void atomic_dec_64(volatile uint64_t *target) +{ + InterlockedDecrement64((volatile LONG64 *)target); +} +static inline int32_t atomic_dec_32_nv(volatile uint32_t *target) +{ + return InterlockedDecrement((volatile LONG *)target); +} +static inline int64_t atomic_dec_64_nv(volatile uint64_t *target) +{ + return InterlockedDecrement64((volatile LONG64 *)target); +} + + + + +/* + * Add delta to target + */ +static inline void +atomic_add_32(volatile uint32_t *target, int32_t delta) +{ + InterlockedExchangeAdd((volatile LONG *)target, delta); +} +static inline uint32_t +atomic_add_32_nv(volatile uint32_t *target, int32_t delta) +{ + return InterlockedExchangeAdd((volatile LONG *)target, delta) + delta; +} +static inline void +atomic_add_64(volatile uint64_t *target, int64_t delta) +{ + InterlockedExchangeAdd64((volatile LONG64 *)target, delta); +} +static inline uint64_t +atomic_add_64_nv(volatile uint64_t *target, int64_t delta) +{ + return InterlockedExchangeAdd64((volatile LONG64 *)target, delta) + delta; +} + + +/* + * Subtract delta to target + */ +static inline void +atomic_sub_32(volatile uint32_t *target, int32_t delta) +{ + InterlockedExchangeAdd((volatile LONG *)target, -delta); +} +static inline void +atomic_sub_64(volatile uint64_t *target, int64_t delta) +{ + InterlockedExchangeAdd64((volatile LONG64 *)target, -delta); +} +static inline uint64_t +atomic_sub_64_nv(volatile uint64_t *target, int64_t delta) +{ + return InterlockedExchangeAdd64((volatile LONG64 *)target, -delta) - delta; +} + + +/* + * logical OR bits with target + */ + +/* + * logical AND bits with target + */ + + +/* + * Compare And Set + * if *arg1 == arg2, then set *arg1 = arg3; return old value. 
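/*
 * Illustrative sketch (editor's addition, not part of the patch): a
 * minimal reference count built on the Interlocked-backed wrappers
 * declared above. The obj_refcount_t, obj_hold() and obj_rele() names
 * are hypothetical and exist only for this example.
 */
typedef struct obj_refcount {
	volatile uint32_t rc_count;
} obj_refcount_t;

static inline void
obj_hold(obj_refcount_t *rc)
{
	atomic_inc_32(&rc->rc_count);
}

static inline int
obj_rele(obj_refcount_t *rc)
{
	/* atomic_dec_32_nv() returns the new value; 0 means last reference. */
	return (atomic_dec_32_nv(&rc->rc_count) == 0);
}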
+ */ + +static inline uint32_t +atomic_cas_32(volatile uint32_t *_target, uint32_t _cmp, uint32_t _new) +{ + return InterlockedCompareExchange((volatile LONG *)_target, _new, _cmp); +} +static inline uint64_t +atomic_cas_64(volatile uint64_t *_target, uint64_t _cmp, uint64_t _new) +{ + return InterlockedCompareExchange64((volatile LONG64 *)_target, _new, _cmp); +} + +static inline uint64_t +atomic_swap_64(volatile uint64_t *_target, uint64_t _new) +{ + return InterlockedExchange64((volatile LONG64 *)_target, _new); +} + +extern void *atomic_cas_ptr(volatile void *_target, void *_cmp, void *_new); + +static inline void membar_producer(void) { _mm_mfence(); } + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_ATOMIC_H */ diff --git a/include/os/windows/spl/sys/attr.h b/include/os/windows/spl/sys/attr.h new file mode 100644 index 000000000000..93995b449449 --- /dev/null +++ b/include/os/windows/spl/sys/attr.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_ATTR_H +#define _SPL_ATTR_H + +//#include_next + +#endif /* SPL_ATTR_H */ diff --git a/include/os/windows/spl/sys/avl.h b/include/os/windows/spl/sys/avl.h new file mode 100644 index 000000000000..1f5304283c1b --- /dev/null +++ b/include/os/windows/spl/sys/avl.h @@ -0,0 +1,316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _AVL_H +#define _AVL_H + +/* + * This is a private header file. Applications should not directly include + * this file. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* + * This is a generic implemenatation of AVL trees for use in the Solaris kernel. + * The interfaces provide an efficient way of implementing an ordered set of + * data structures. + * + * AVL trees provide an alternative to using an ordered linked list. Using AVL + * trees will usually be faster, however they requires more storage. An ordered + * linked list in general requires 2 pointers in each data structure. The + * AVL tree implementation uses 3 pointers. The following chart gives the + * approximate performance of operations with the different approaches: + * + * Operation Link List AVL tree + * --------- -------- -------- + * lookup O(n) O(log(n)) + * + * insert 1 node constant constant + * + * delete 1 node constant between constant and O(log(n)) + * + * delete all nodes O(n) O(n) + * + * visit the next + * or prev node constant between constant and O(log(n)) + * + * + * The data structure nodes are anchored at an "avl_tree_t" (the equivalent + * of a list header) and the individual nodes will have a field of + * type "avl_node_t" (corresponding to list pointers). 
+ * + * The type "avl_index_t" is used to indicate a position in the list for + * certain calls. + * + * The usage scenario is generally: + * + * 1. Create the list/tree with: avl_create() + * + * followed by any mixture of: + * + * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert() + * + * 2b. Visited elements with: + * avl_first() - returns the lowest valued node + * avl_last() - returns the highest valued node + * AVL_NEXT() - given a node go to next higher one + * AVL_PREV() - given a node go to previous lower one + * + * 2c. Find the node with the closest value either less than or greater + * than a given value with avl_nearest(). + * + * 2d. Remove individual nodes from the list/tree with avl_remove(). + * + * and finally when the list is being destroyed + * + * 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes. + * Note that once you use avl_destroy_nodes(), you can no longer + * use any routine except avl_destroy_nodes() and avl_destoy(). + * + * 4. Use avl_destroy() to destroy the AVL tree itself. + * + * Any locking for multiple thread access is up to the user to provide, just + * as is needed for any linked list implementation. + */ + + /* + * AVL comparator helpers + */ +#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define AVL_PCMP(a, b) \ + (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) + +/* + * Type used for the root of the AVL tree. + */ +typedef struct avl_tree avl_tree_t; + +/* + * The data nodes in the AVL tree must have a field of this type. + */ +typedef struct avl_node avl_node_t; + +/* + * An opaque type used to locate a position in the tree where a node + * would be inserted. + */ +typedef uintptr_t avl_index_t; + + +/* + * Direction constants used for avl_nearest(). + */ +#define AVL_BEFORE (0) +#define AVL_AFTER (1) + + +/* + * Prototypes + * + * Where not otherwise mentioned, "void *" arguments are a pointer to the + * user data structure which must contain a field of type avl_node_t. + * + * Also assume the user data structures looks like: + * stuct my_type { + * ... + * avl_node_t my_link; + * ... + * }; + */ + +/* + * Initialize an AVL tree. Arguments are: + * + * tree - the tree to be initialized + * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 + * -1 for <, 0 for ==, and +1 for > + * size - the value of sizeof(struct my_type) + * offset - the value of OFFSETOF(struct my_type, my_link) + */ +extern void avl_create(avl_tree_t *tree, + int (*compar) (const void *, const void *), uint32_t size, uint32_t offset); + + +/* + * Find a node with a matching value in the tree. Returns the matching node + * found. If not found, it returns NULL and then if "where" is not NULL it sets + * "where" for use with avl_insert() or avl_nearest(). + * + * node - node that has the value being looked for + * where - position for use with avl_nearest() or avl_insert(), may be NULL + */ +extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where); + +/* + * Insert a node into the tree. + * + * node - the node to insert + * where - position as returned from avl_find() + */ +extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where); + +/* + * Insert "new_data" in "tree" in the given "direction" either after + * or before the data "here". + * + * This might be usefull for avl clients caching recently accessed + * data to avoid doing avl_find() again for insertion. 
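/*
 * Illustrative sketch (editor's addition, not part of the patch): the
 * create / find / insert usage scenario described in the comment block
 * above, using the "struct my_type" layout from that comment. The
 * my_key field, my_compare(), my_tree_init() and my_insert_if_absent()
 * names are hypothetical.
 */
#include <stddef.h>	/* offsetof() */
#include <sys/avl.h>	/* the header added by this hunk */

struct my_type {
	int my_key;
	avl_node_t my_link;
};

static int
my_compare(const void *a, const void *b)
{
	const struct my_type *l = a;
	const struct my_type *r = b;

	return (AVL_CMP(l->my_key, r->my_key));
}

static void
my_tree_init(avl_tree_t *tree)
{
	/* Comparator, size of the node type, offset of its avl_node_t. */
	avl_create(tree, my_compare, sizeof (struct my_type),
	    offsetof(struct my_type, my_link));
}

static void
my_insert_if_absent(avl_tree_t *tree, struct my_type *node)
{
	avl_index_t where;

	/* avl_find() fills in "where" when no matching node exists. */
	if (avl_find(tree, node, &where) == NULL)
		avl_insert(tree, node, where);
}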
+ * + * new_data - new data to insert + * here - existing node in "tree" + * direction - either AVL_AFTER or AVL_BEFORE the data "here". + */ +extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here, + int direction); + + +/* + * Return the first or last valued node in the tree. Will return NULL + * if the tree is empty. + * + */ +extern void *avl_first(avl_tree_t *tree); +extern void *avl_last(avl_tree_t *tree); + + +/* + * Return the next or previous valued node in the tree. + * AVL_NEXT() will return NULL if at the last node. + * AVL_PREV() will return NULL if at the first node. + * + * node - the node from which the next or previous node is found + */ +#define AVL_NEXT(tree, node) avl_walk(tree, node, AVL_AFTER) +#define AVL_PREV(tree, node) avl_walk(tree, node, AVL_BEFORE) + + +/* + * Find the node with the nearest value either greater or less than + * the value from a previous avl_find(). Returns the node or NULL if + * there isn't a matching one. + * + * where - position as returned from avl_find() + * direction - either AVL_BEFORE or AVL_AFTER + * + * EXAMPLE get the greatest node that is less than a given value: + * + * avl_tree_t *tree; + * struct my_data look_for_value = {....}; + * struct my_data *node; + * struct my_data *less; + * avl_index_t where; + * + * node = avl_find(tree, &look_for_value, &where); + * if (node != NULL) + * less = AVL_PREV(tree, node); + * else + * less = avl_nearest(tree, where, AVL_BEFORE); + */ +extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction); + + +/* + * Add a single node to the tree. + * The node must not be in the tree, and it must not + * compare equal to any other node already in the tree. + * + * node - the node to add + */ +extern void avl_add(avl_tree_t *tree, void *node); + + +/* + * Remove a single node from the tree. The node must be in the tree. + * + * node - the node to remove + */ +extern void avl_remove(avl_tree_t *tree, void *node); + +/* + * Reinsert a node only if its order has changed relative to its nearest + * neighbors. To optimize performance avl_update_lt() checks only the previous + * node and avl_update_gt() checks only the next node. Use avl_update_lt() and + * avl_update_gt() only if you know the direction in which the order of the + * node may change. + */ +extern boolean_t avl_update(avl_tree_t *, void *); +extern boolean_t avl_update_lt(avl_tree_t *, void *); +extern boolean_t avl_update_gt(avl_tree_t *, void *); + +/* + * Return the number of nodes in the tree + */ +extern ulong_t avl_numnodes(avl_tree_t *tree); + +/* + * Return B_TRUE if there are zero nodes in the tree, B_FALSE otherwise. + */ +extern boolean_t avl_is_empty(avl_tree_t *tree); + +/* + * Used to destroy any remaining nodes in a tree. The cookie argument should + * be initialized to NULL before the first call. Returns a node that has been + * removed from the tree and may be free()'d. Returns NULL when the tree is + * empty. + * + * Once you call avl_destroy_nodes(), you can only continuing calling it and + * finally avl_destroy(). No other AVL routines will be valid. + * + * cookie - a "void *" used to save state between calls to avl_destroy_nodes() + * + * EXAMPLE: + * avl_tree_t *tree; + * struct my_data *node; + * void *cookie; + * + * cookie = NULL; + * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) + * free(node); + * avl_destroy(tree); + */ +extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie); + + +/* + * Final destroy of an AVL tree. 
Arguments are: + * + * tree - the empty tree to destroy + */ +extern void avl_destroy(avl_tree_t *tree); + +extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2); + +#ifdef __cplusplus +} +#endif + +#endif /* _AVL_H */ diff --git a/include/os/windows/spl/sys/avl_impl.h b/include/os/windows/spl/sys/avl_impl.h new file mode 100644 index 000000000000..8c2aa34df054 --- /dev/null +++ b/include/os/windows/spl/sys/avl_impl.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _AVL_IMPL_H +#define _AVL_IMPL_H + + + +/* + * This is a private header file. Applications should not directly include + * this file. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * generic AVL tree implementation for kernel use + * + * There are 5 pieces of information stored for each node in an AVL tree + * + * pointer to less than child + * pointer to greater than child + * a pointer to the parent of this node + * an indication [0/1] of which child I am of my parent + * a "balance" (-1, 0, +1) indicating which child tree is taller + * + * Since they only need 3 bits, the last two fields are packed into the + * bottom bits of the parent pointer on 64 bit machines to save on space. + */ + + +/* + * for 64 bit machines, avl_pcb contains parent pointer, balance and child_index + * values packed in the following manner: + * + * |63 3| 2 |1 0 | + * |-------------------------------------|-----------------|-------------| + * | avl_parent hi order bits | avl_child_index | avl_balance | + * | | | + 1 | + * |-------------------------------------|-----------------|-------------| + * + */ +struct avl_node { + struct avl_node *avl_child[2]; /* left/right children nodes */ + uintptr_t avl_pcb; /* parent, child_index, balance */ +}; + +/* + * macros to extract/set fields in avl_pcb + * + * pointer to the parent of the current node is the high order bits + */ +#define AVL_XPARENT(n) ((struct avl_node *)((n)->avl_pcb & ~7)) +#define AVL_SETPARENT(n, p) \ + ((n)->avl_pcb = (((n)->avl_pcb & 7) | (uintptr_t)(p))) + +/* + * index of this node in its parent's avl_child[]: bit #2 + */ +#define AVL_XCHILD(n) (((n)->avl_pcb >> 2) & 1) +#define AVL_SETCHILD(n, c) \ + ((n)->avl_pcb = (uintptr_t)(((n)->avl_pcb & ~4) | ((c) << 2))) + +/* + * balance indication for a node, lowest 2 bits. A valid balance is + * -1, 0, or +1, and is encoded by adding 1 to the value to get the + * unsigned values of 0, 1, 2. 
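/*
 * Illustrative sketch (editor's addition, not part of the patch): how a
 * parent pointer, child index and balance are packed into avl_pcb and
 * recovered through the accessor macros in this header. Assumes a 64-bit
 * build where struct avl_node is at least 8-byte aligned, so the low
 * three bits of the parent pointer are free.
 */
static void
avl_pcb_example(struct avl_node *node, struct avl_node *parent)
{
	node->avl_pcb = 0;
	AVL_SETPARENT(node, parent);	/* bits 63..3: parent pointer   */
	AVL_SETCHILD(node, 1);		/* bit 2: right child of parent */
	AVL_SETBALANCE(node, -1);	/* bits 1..0: -1 stored as 0    */

	ASSERT(AVL_XPARENT(node) == parent);
	ASSERT(AVL_XCHILD(node) == 1);
	ASSERT(AVL_XBALANCE(node) == -1);
}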
+ */ +#define AVL_XBALANCE(n) ((int)(((n)->avl_pcb & 3) - 1)) +#define AVL_SETBALANCE(n, b) \ + ((n)->avl_pcb = (uintptr_t)((((n)->avl_pcb & ~3) | ((b) + 1)))) + + + + +/* + * switch between a node and data pointer for a given tree + * the value of "o" is tree->avl_offset + */ +#define AVL_NODE2DATA(n, o) ((void *)((uintptr_t)(n) - (o))) +#define AVL_DATA2NODE(d, o) ((struct avl_node *)((uintptr_t)(d) + (o))) + + + +/* + * macros used to create/access an avl_index_t + */ +#define AVL_INDEX2NODE(x) ((avl_node_t *)((x) & ~1)) +#define AVL_INDEX2CHILD(x) ((x) & 1) +#define AVL_MKINDEX(n, c) ((avl_index_t)(n) | (c)) + + +/* + * The tree structure. The fields avl_root, avl_compar, and avl_offset come + * first since they are needed for avl_find(). We want them to fit into + * a single 64 byte cache line to make avl_find() as fast as possible. + */ +struct avl_tree { + struct avl_node *avl_root; /* root node in tree */ + int (*avl_compar)(const void *, const void *); + uint32_t avl_offset; /* offsetof(type, avl_link_t field) */ + ulong_t avl_numnodes; /* number of nodes in the tree */ + uint32_t avl_size; /* sizeof user type struct */ +}; + + +/* + * This will only by used via AVL_NEXT() or AVL_PREV() + */ +extern void *avl_walk(struct avl_tree *, void *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _AVL_IMPL_H */ diff --git a/include/os/windows/spl/sys/bitmap.h b/include/os/windows/spl/sys/bitmap.h new file mode 100644 index 000000000000..f04d679e86a0 --- /dev/null +++ b/include/os/windows/spl/sys/bitmap.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_BITMAP_H +#define _SPL_BITMAP_H + +#endif /* SPL_BITMAP_H */ diff --git a/include/os/windows/spl/sys/bootconf.h b/include/os/windows/spl/sys/bootconf.h new file mode 100644 index 000000000000..853b9804db0a --- /dev/null +++ b/include/os/windows/spl/sys/bootconf.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_BOOTCONF_H +#define _SPL_BOOTCONF_H + +#endif /* SPL_BOOTCONF_H */ diff --git a/include/os/windows/spl/sys/bootprops.h b/include/os/windows/spl/sys/bootprops.h new file mode 100644 index 000000000000..2ea8b06670ec --- /dev/null +++ b/include/os/windows/spl/sys/bootprops.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_BOOTPROPS_H +#define _SPL_BOOTPROPS_H + +#endif /* SPL_BOOTPROPS_H */ diff --git a/include/os/windows/spl/sys/buf.h b/include/os/windows/spl/sys/buf.h new file mode 100644 index 000000000000..755cee52d74f --- /dev/null +++ b/include/os/windows/spl/sys/buf.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_BUF_H +#define _SPL_BUF_H + +#endif /* SPL_BUF_H */ diff --git a/include/os/windows/spl/sys/byteorder.h b/include/os/windows/spl/sys/byteorder.h new file mode 100644 index 000000000000..3ccb3c62938e --- /dev/null +++ b/include/os/windows/spl/sys/byteorder.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#ifndef _SPL_BYTEORDER_H +#define _SPL_BYTEORDER_H + +//#include +//#include +#include + +#define LE_16(x) (x) +#define LE_32(x) (x) +#define LE_64(x) (x) +#define BE_16(x) _byteswap_ushort(x) +#define BE_32(x) _byteswap_ulong(x) +#define BE_64(x) _byteswap_uint64(x) + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + + +#if !defined(htonll) +#define htonll(x) _byteswap_uint64(x) +#endif +#if !defined(ntohll) +#define ntohll(x) _byteswap_uint64(x) +#endif + + +// I'm going to assume windows in LE for now +#define _LITTLE_ENDIAN + + +#endif /* SPL_BYTEORDER_H */ diff --git a/include/os/windows/spl/sys/callb.h b/include/os/windows/spl/sys/callb.h new file mode 100644 index 000000000000..f94c56834ac0 --- /dev/null +++ b/include/os/windows/spl/sys/callb.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPL_CALLB_H +#define _SPL_CALLB_H + +#include + +#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); + +typedef struct callb_cpr { + kmutex_t *cc_lockp; +} callb_cpr_t; + +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + (cp)->cc_lockp = lockp; \ +} + +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + CALLB_CPR_ASSERT(cp); \ +} + +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + CALLB_CPR_ASSERT(cp); \ +} + +#define CALLB_CPR_EXIT(cp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ + mutex_exit((cp)->cc_lockp); \ +} + + +#define CALLOUT_FLAG_ROUNDUP 0x1 +#define CALLOUT_FLAG_ABSOLUTE 0x2 +#define CALLOUT_FLAG_HRESTIME 0x4 +#define CALLOUT_FLAG_32BIT 0x8 + +/* Move me to more correct "sys/callo.h" file when convenient. */ +#define CALLOUT_NORMAL 1 +typedef uint64_t callout_id_t; +callout_id_t timeout_generic(int, void (*)(void *), void *, hrtime_t, hrtime_t, + int); + + +#endif /* _SPL_CALLB_H */ diff --git a/include/os/windows/spl/sys/cmn_err.h b/include/os/windows/spl/sys/cmn_err.h new file mode 100644 index 000000000000..921d7381f43f --- /dev/null +++ b/include/os/windows/spl/sys/cmn_err.h @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
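To show how the byteorder.h macros above behave on a little-endian Windows build, here is a small illustrative sketch (byteorder_demo is a hypothetical function): BE_IN16/BE_IN32 assemble a big-endian on-disk value byte by byte, while BE_32 byte-swaps a native value and LE_32 is a no-op.

    /* Sketch only: assumes byteorder.h above is in scope. */
    static void
    byteorder_demo(void)
    {
            uint8_t disk[4] = { 0x12, 0x34, 0x56, 0x78 };  /* big-endian on disk */

            uint16_t a = BE_IN16(disk);      /* (0x12 << 8) | 0x34 == 0x1234 */
            uint32_t b = BE_IN32(disk);      /* 0x12345678 */

            uint32_t c = BE_32(0x12345678);  /* byte-swapped: 0x78563412 */
            uint32_t d = LE_32(0x12345678);  /* unchanged on little-endian */

            (void) a; (void) b; (void) c; (void) d;
    }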
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _SPL_CMN_ERR_H +#define _SPL_CMN_ERR_H + +#include +#include + +#define CE_CONT 0 /* continuation */ +#define CE_NOTE 1 /* notice */ +#define CE_WARN 2 /* warning */ +#define CE_PANIC 3 /* panic */ +#define CE_IGNORE 4 /* print nothing */ + +#ifdef _KERNEL + +extern void vcmn_err(int, const char *, __va_list); +extern void cmn_err(int, const char *, ...); + +#endif /* _KERNEL */ + +#define fm_panic panic + +#endif /* SPL_CMN_ERR_H */ diff --git a/include/os/windows/spl/sys/compress.h b/include/os/windows/spl/sys/compress.h new file mode 100644 index 000000000000..13967e5e5f81 --- /dev/null +++ b/include/os/windows/spl/sys/compress.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_COMPRESS_H +#define _SPL_COMPRESS_H + +#endif /* SPL_COMPRESS_H */ diff --git a/include/os/windows/spl/sys/condvar.h b/include/os/windows/spl/sys/condvar.h new file mode 100644 index 000000000000..b1a3d9863942 --- /dev/null +++ b/include/os/windows/spl/sys/condvar.h @@ -0,0 +1,11 @@ + +#ifndef _SPL_CONDVAR_H +#define _SPL_CONDVAR_H + +#include +//#include +#include +#include +#include + +#endif /* _SPL_CONDVAR_H */ diff --git a/include/os/windows/spl/sys/conf.h b/include/os/windows/spl/sys/conf.h new file mode 100644 index 000000000000..c9bdd0cf20ce --- /dev/null +++ b/include/os/windows/spl/sys/conf.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_CONF_H +#define _SPL_CONF_H + +//#include_next + +#endif /* SPL_CONF_H */ diff --git a/include/os/windows/spl/sys/console.h b/include/os/windows/spl/sys/console.h new file mode 100644 index 000000000000..a798b153ee00 --- /dev/null +++ b/include/os/windows/spl/sys/console.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_CONSOLE_H +#define _SPL_CONSOLE_H + +#endif /* _SPL_CONSOLE_H */ diff --git a/include/os/windows/spl/sys/cpupart.h b/include/os/windows/spl/sys/cpupart.h new file mode 100644 index 000000000000..8c7b303f8aa6 --- /dev/null +++ b/include/os/windows/spl/sys/cpupart.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_CPUPART_H +#define _SPL_CPUPART_H + +#endif /* SPL_CPUPART_H */ diff --git a/include/os/windows/spl/sys/cpuvar.h b/include/os/windows/spl/sys/cpuvar.h new file mode 100644 index 000000000000..aeae4644b8df --- /dev/null +++ b/include/os/windows/spl/sys/cpuvar.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_CPUVAR_H +#define _SPL_CPUVAR_H + +#endif /* SPL_CPUVAR_H */ diff --git a/include/os/windows/spl/sys/crc32.h b/include/os/windows/spl/sys/crc32.h new file mode 100644 index 000000000000..2eee7303bfc2 --- /dev/null +++ b/include/os/windows/spl/sys/crc32.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_CRC32_H +#define _SPL_CRC32_H + +#endif /* SPL_CRC32_H */ diff --git 
a/include/os/windows/spl/sys/cred.h b/include/os/windows/spl/sys/cred.h new file mode 100644 index 000000000000..d6e17c3269ba --- /dev/null +++ b/include/os/windows/spl/sys/cred.h @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#ifndef _SPL_CRED_H +#define _SPL_CRED_H + +#include +#include +//#include + +struct ucred; // fixme +typedef struct ucred cred_t; + +#define kcred (cred_t *)NULL +#define CRED() (cred_t *)NULL + +//#include + +// Older OSX API +#if !(MAC_OS_X_VERSION_MIN_REQUIRED >= 1070) +#define kauth_cred_getruid(x) (x)->cr_ruid +#define kauth_cred_getrgid(x) (x)->cr_rgid +#define kauth_cred_getsvuid(x) (x)->cr_svuid +#define kauth_cred_getsvgid(x) (x)->cr_svgid +#endif + + +extern void crhold(cred_t *cr); +extern void crfree(cred_t *cr); +extern uid_t crgetuid(const cred_t *cr); +extern uid_t crgetruid(const cred_t *cr); +extern uid_t crgetsuid(const cred_t *cr); +extern uid_t crgetfsuid(const cred_t *cr); +extern gid_t crgetgid(const cred_t *cr); +extern gid_t crgetrgid(const cred_t *cr); +extern gid_t crgetsgid(const cred_t *cr); +extern gid_t crgetfsgid(const cred_t *cr); +extern int crgetngroups(const cred_t *cr); +extern gid_t * crgetgroups(const cred_t *cr); +extern void crgetgroupsfree(gid_t *gids); +extern int spl_cred_ismember_gid(cred_t *cr, gid_t gid); + +#define crgetsid(cred, i) (NULL) + +#endif /* _SPL_CRED_H */ diff --git a/include/os/windows/spl/sys/ctype.h b/include/os/windows/spl/sys/ctype.h new file mode 100644 index 000000000000..6011fc308403 --- /dev/null +++ b/include/os/windows/spl/sys/ctype.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_CTYPE_H +#define _SPL_CTYPE_H + +//#include_next + +#endif /* SPL_CTYPE_H */ diff --git a/include/os/windows/spl/sys/ddi.h b/include/os/windows/spl/sys/ddi.h new file mode 100644 index 000000000000..1c5827e67755 --- /dev/null +++ b/include/os/windows/spl/sys/ddi.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_DDI_H +#define _SPL_DDI_H + +#endif /* SPL_DDI_H */ diff --git a/include/os/windows/spl/sys/debug.h b/include/os/windows/spl/sys/debug.h new file mode 100644 index 000000000000..0bfbe2469a26 --- /dev/null +++ b/include/os/windows/spl/sys/debug.h @@ -0,0 +1,202 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
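Because cred.h above currently stubs cred_t out (kcred and CRED() are both NULL for now), callers are expected to treat the credential as an opaque handle and only go through the accessor functions. A minimal, hypothetical sketch of the intended call pattern (cred_demo and its root-only check are illustrative, not part of the patch):

    /* Sketch only: illustrative caller of the cred.h accessors. */
    static int
    cred_demo(cred_t *cr)
    {
            uid_t uid = crgetuid(cr);       /* effective uid of the caller */
            gid_t gid = crgetgid(cr);       /* effective gid of the caller */

            /* Hypothetical policy: allow only root. */
            if (uid != 0)
                    return (EPERM);

            (void) gid;
            return (0);
    }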
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Available Solaris debug functions. All of the ASSERT() macros are + * compiled out in non-DBG (release) builds; on Windows, assertions are + * enabled only in DBG builds. + * The VERIFY() functions are never compiled out and cannot be disabled. + * + * PANIC() - Panic the node and print message. + * ASSERT() - Assert X is true, if not panic. + * ASSERTF() - Assert X is true, if not panic and print message. + * ASSERTV() - Wraps a variable declaration which is only used by ASSERT(). + * ASSERT3S() - Assert signed X OP Y is true, if not panic. + * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. + * ASSERT3P() - Assert pointer X OP Y is true, if not panic. + * ASSERT0() - Assert value is zero, if not panic. + * VERIFY() - Verify X is true, if not panic. + * VERIFY3S() - Verify signed X OP Y is true, if not panic. + * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. + * VERIFY3P() - Verify pointer X OP Y is true, if not panic. + * VERIFY0() - Verify value is zero, if not panic. + */ + +#ifndef _SPL_DEBUG_H +#define _SPL_DEBUG_H + +#include +#include + +#define panic(...) do { KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, __VA_ARGS__)); DbgBreakPoint(); windows_delay(hz); } while (1) + +extern void printBuffer(const char *fmt, ...); + +#define LUDICROUS_SPEED // use circular buffer +// xprintf is always printed +// dprintf is printed in DEBUG builds +// IOLog is printed in DEBUG builds (legacy from osx) +// +#ifdef DBG /* Debugging Enabled */ + #ifdef LUDICROUS_SPEED + #define dprintf(...) printBuffer(__VA_ARGS__) + #define IOLog(...) printBuffer(__VA_ARGS__) + #define xprintf(...) KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, __VA_ARGS__)) + #define TraceEvent(x, ...) printBuffer(__VA_ARGS__) + #else + #undef KdPrintEx + #define KdPrintEx(_x_) DbgPrintEx _x_ + #define dprintf(...) KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, __VA_ARGS__)) + #define IOLog(...) KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, __VA_ARGS__)) + #define xprintf(...) KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, __VA_ARGS__)) + #define TraceEvent(level, ...) KdPrintEx((DPFLTR_IHVDRIVER_ID, level, __VA_ARGS__)) + //#define dprintf(...) + //#define IOLog(...) + #endif + #define PANIC(fmt, ...) \ + do { \ + xprintf(fmt, __VA_ARGS__); \ + DbgBreakPoint(); \ + } while (0) +#else + //#undef KdPrintEx + //#define KdPrintEx(_x_) DbgPrintEx _x_ + //#define dprintf(...) KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, __VA_ARGS__)) + //#define IOLog(...) KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, __VA_ARGS__)) + #define TraceEvent(x, ...) + #define xprintf(...) DbgPrintEx(DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, __VA_ARGS__) + #define dprintf(...) + #define IOLog(...) + #define PANIC(fmt, ...)
\ + do { \ + xprintf(fmt, __VA_ARGS__); \ + } while (0) +#endif + +#ifdef DBG /* Debugging Enabled */ + + +/* Define SPL_DEBUG_STR to make clear which ASSERT definitions are used */ +#define SPL_DEBUG_STR " (DEBUG mode)" + +/* ASSERTION that is safe to use within the debug system */ +#define __ASSERT(cond) \ +do { \ + if (!(cond)) { \ + xprintf("ASSERTION(" #cond ") failed\n"); \ + DbgBreakPoint(); \ + } \ +} while (0) + +#define ASSERTF(cond, fmt, ...) \ +do { \ + if (!(cond)) \ + PANIC("ASSERTION(" #cond ") failed: " fmt, __VA_ARGS__); \ +} while (0) + +#define ASSERT3B(x,y,z) VERIFY3B(x, y, z) +#define ASSERT3S(x,y,z) VERIFY3S(x, y, z) +#define ASSERT3U(x,y,z) VERIFY3U(x, y, z) +#define ASSERT3P(x,y,z) VERIFY3P(x, y, z) +#define ASSERT0(x) VERIFY0(x) + +#define ASSERTV(x) x + +#else /* Debugging Disabled */ + +/* Define SPL_DEBUG_STR to make clear which ASSERT definitions are used */ +#define SPL_DEBUG_STR "" + +#define __ASSERT(x) ((void)0) +#define ASSERTF(cond, fmt, ...) ((void)0) +#define ASSERTV(x) + +#define ASSERT3B(x,y,z) ((void)0) +#define ASSERT3S(x,y,z) ((void)0) +#define ASSERT3U(x,y,z) ((void)0) +#define ASSERT3P(x,y,z) ((void)0) +#define ASSERT0(x) ((void)0) + +#endif /* DBG */ + +#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE, FMT, CAST) \ + do { \ + TYPE _verify3_left = (TYPE)(LEFT); \ + TYPE _verify3_right = (TYPE)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + PANIC("VERIFY3( %s " #OP " %s ) " \ + "failed (" FMT " " #OP " " FMT ")\n", \ + #LEFT, #RIGHT, \ + CAST (_verify3_left), CAST (_verify3_right)); \ + } while (0) + +#define VERIFY3B(x,y,z) VERIFY3_IMPL(x, y, z, int64_t, "%lld", (boolean_t)) +#define VERIFY3S(x,y,z) VERIFY3_IMPL(x, y, z, int64_t, "%lld", (long long)) +#define VERIFY3U(x,y,z) VERIFY3_IMPL(x, y, z, uint64_t, "%llu", \ + (unsigned long long)) +#define VERIFY3P(x,y,z) VERIFY3_IMPL(x, y, z, uintptr_t, "%p", (void *)) +#define VERIFY0(x) VERIFY3_IMPL(0, ==, x, int64_t, "%lld", (long long)) + +#define VERIFY(EX) do { if (!(EX)) panic("PANIC: %s %s:%d\n", #EX, __FILE__, __LINE__); } while(0) + +/* + * IMPLY and EQUIV are assertions of the form: + * + * if (a) then (b) + * and + * if (a) then (b) *AND* if (b) then (a) + */ +#if 0 /* panic() is a statement macro on Windows, so these stay compiled out */ +#define IMPLY(A, B) \ + ((void)(((!(A)) || (B)) || \ + panic("(" #A ") implies (" #B ")", __FILE__, __LINE__))) +#define EQUIV(A, B) \ + ((void)((!!(A) == !!(B)) || \ + panic("(" #A ") is equivalent to (" #B ")", __FILE__, __LINE__))) +#else +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) +#endif + + +/* + * Compile-time assertion. The condition 'x' must be constant. + */ +#define CTASSERT_GLOBAL(x) _CTASSERT(x, __LINE__) +#define CTASSERT(x) { _CTASSERT(x, __LINE__); } +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) \ + typedef char __attribute__ ((unused)) \ + __compile_time_assertion__ ## y[(x) ?
1 : -1] + +#endif /* SPL_DEBUG_H */ diff --git a/include/os/windows/spl/sys/dirent.h b/include/os/windows/spl/sys/dirent.h new file mode 100644 index 000000000000..663bf5bdaa34 --- /dev/null +++ b/include/os/windows/spl/sys/dirent.h @@ -0,0 +1,35 @@ + +#ifndef _SPL_DIRENT_H +#define _SPL_DIRENT_H + +//#include_next + +#define MAXNAMLEN 255 + +/* + * File types + */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + +struct dirent { + uint64_t d_ino; /* file number of entry */ + uint64_t d_seekoff; /* seek offset (optional, used by servers) */ + uint16_t d_reclen; /* length of this record */ + uint16_t d_namlen; /* length of string in d_name */ + uint8_t d_type; /* file type, see below */ + char d_name[MAXPATHLEN]; /* entry name (up to MAXPATHLEN bytes) */ +}; + +#define IFTODT(mode) (((mode) & 0170000) >> 12) +#define DTTOIF(dirtype) ((dirtype) << 12) + + +#endif /* SPL_DIRENT_H */ diff --git a/include/os/windows/spl/sys/disp.h b/include/os/windows/spl/sys/disp.h new file mode 100644 index 000000000000..e8606ede4a4b --- /dev/null +++ b/include/os/windows/spl/sys/disp.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_DISP_H +#define _SPL_DISP_H + +#endif /* SPL_DISP_H */ diff --git a/include/os/windows/spl/sys/dkio.h b/include/os/windows/spl/sys/dkio.h new file mode 100644 index 000000000000..731c1f753895 --- /dev/null +++ b/include/os/windows/spl/sys/dkio.h @@ -0,0 +1,15 @@ + +#ifndef _SPL_DKIO_H +#define _SPL_DKIO_H + +struct dk_callback { + void (*dkc_callback)(void *dkc_cookie, int error); + void *dkc_cookie; + int dkc_flag; +}; + +#define DKIOC (0x04 << 8) +#define DKIOCFLUSHWRITECACHE (DKIOC | 34) +#define DKIOCTRIM (DKIOC | 35) + +#endif /* _SPL_DKIO_H */ diff --git a/include/os/windows/spl/sys/dklabel.h b/include/os/windows/spl/sys/dklabel.h new file mode 100644 index 000000000000..9584d832d6a1 --- /dev/null +++ b/include/os/windows/spl/sys/dklabel.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_DKLABEL_H +#define _SPL_DKLABEL_H + +#endif /* _SPL_DKLABEL_H */ diff --git a/include/os/windows/spl/sys/dnlc.h b/include/os/windows/spl/sys/dnlc.h new file mode 100644 index 000000000000..6f378a61f168 --- /dev/null +++ b/include/os/windows/spl/sys/dnlc.h @@ -0,0 +1,24 @@ + +#ifndef _SPL_DNLC_H +#define _SPL_DNLC_H + +/* + * Reduce the dcache and icache then reap the free'd slabs. Note the + * interface takes a reclaim percentage but we don't have easy access to + * the total number of entries to calculate the reclaim count. However, + * in practice this doesn't need to be even close to correct. We simply + * need to reclaim some useful fraction of the cache. The caller can + * determine if more needs to be done. 
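As a usage sketch for the debug.h macros above (range_check is a hypothetical function, not part of the patch): ASSERT3* compiles away in non-DBG builds, VERIFY3* always stays, so cheap invariants go in ASSERT3* and checks that must survive release builds go in VERIFY3*.

    /* Sketch only: assumes debug.h above is in scope. */
    static void
    range_check(uint64_t offset, uint64_t size, uint64_t blocksize)
    {
            /* Compiled out unless this is a DBG build. */
            ASSERT3U(blocksize, >, 0);

            /* Always present; panics and prints both values on failure. */
            VERIFY3U(offset + size, <=, blocksize);

            /* Compile-time check; the condition must be a constant. */
            CTASSERT(sizeof (uint64_t) == 8);
    }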
+ */ +static inline void +dnlc_reduce_cache(void *reduce_percent) +{ +#if 0 + int nr = (uintptr_t)reduce_percent * 10000; + shrink_dcache_memory(nr, GFP_KERNEL); + shrink_icache_memory(nr, GFP_KERNEL); + kmem_reap(); +#endif +} + +#endif /* SPL_DNLC_H */ diff --git a/include/os/windows/spl/sys/dumphdr.h b/include/os/windows/spl/sys/dumphdr.h new file mode 100644 index 000000000000..fe40873174a9 --- /dev/null +++ b/include/os/windows/spl/sys/dumphdr.h @@ -0,0 +1,4 @@ +#ifndef _SPL_DUMPHDR_H +#define _SPL_DUMPHDR_H + +#endif /* SPL_DUMPHDR_H */ diff --git a/include/os/windows/spl/sys/efi_partition.h b/include/os/windows/spl/sys/efi_partition.h new file mode 100644 index 000000000000..bbbaef1e6145 --- /dev/null +++ b/include/os/windows/spl/sys/efi_partition.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_EFI_PARTITION_H +#define _SPL_EFI_PARTITION_H + +#endif /* SPL_EFI_PARTITION_H */ diff --git a/include/os/windows/spl/sys/errno.h b/include/os/windows/spl/sys/errno.h new file mode 100644 index 000000000000..4b83690c9666 --- /dev/null +++ b/include/os/windows/spl/sys/errno.h @@ -0,0 +1,158 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2000 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + +#ifndef _SYS_ERRNO_H +#define _SYS_ERRNO_H + + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +#define ERESTART (-1) /* restart syscall */ +#define EJUSTRETURN (-2) /* don't modify regs, just return */ +#endif + + +/* + * Error codes + */ + +#define EPERM 1 /* Not super-user */ +#define ENOENT 2 /* No such file or directory */ +#define ESRCH 3 /* No such process */ +#define EINTR 4 /* interrupted system call */ +#define EIO 5 /* I/O error */ +#define ENXIO 6 /* No such device or address */ +#define E2BIG 7 /* Arg list too long */ +#define ENOEXEC 8 /* Exec format error */ +#define EBADF 9 /* Bad file number */ +#define ECHILD 10 /* No children */ +#define EAGAIN 11 /* Resource temporarily unavailable */ +#define ENOMEM 12 /* Not enough core */ +#define EACCES 13 /* Permission denied */ +#define EFAULT 14 /* Bad address */ +#define ENOTBLK 15 /* Block device required */ +#define EBUSY 16 /* Mount device busy */ +#define EEXIST 17 /* File exists */ +#define EXDEV 18 /* Cross-device link */ +#define ENODEV 19 /* No such device */ +#define ENOTDIR 20 /* Not a directory */ +#define EISDIR 21 /* Is a directory */ +#define EINVAL 22 /* Invalid argument */ +#define ENFILE 23 /* File table overflow */ +#define EMFILE 24 /* Too many open files */ +#define ENOTTY 25 /* Inappropriate ioctl for device */ +#define EFBIG 27 /* File too large */ +#define ENOSPC 28 /* No space left on device */ +#define ESPIPE 29 /* Illegal seek */ +#define EROFS 30 /* Read only file system */ +#define EMLINK 31 /* Too many links */ +#define EPIPE 32 /* Broken pipe */ +#define EDOM 33 /* Math arg out of domain of func */ +#define ERANGE 34 /* Math result not representable */ + +#define EDEADLK 36 +#define ENAMETOOLONG 38 +#define ENOLCK 39 +#define ENOSYS 40 +#define ENOTEMPTY 41 +#define EILSEQ 42 /* Illegal byte sequence */ + +#define EDQUOT 49 /* Disc quota exceeded */ +#define EBADE 50 /* invalid exchange */ +#define ESHUTDOWN 58 /* Can't send after socket shutdown */ +#define ESTALE 70 /* Stale NFS file handle */ + +#ifndef _KERNEL +#define ERESTART 85 /* Interrupted system call should be restarted */ +#endif + +#define EADDRINUSE 100 +#define EADDRNOTAVAIL 101 +#define EAFNOSUPPORT 102 +#define EALREADY 103 +#define EBADMSG 104 +#define ECANCELED 105 +#define ECONNABORTED 106 +#define ECONNREFUSED 107 +#define ECONNRESET 108 +#define EDESTADDRREQ 109 +#define EHOSTUNREACH 110 +#define EIDRM 111 +#define EINPROGRESS 112 +#define EISCONN 113 +#define ELOOP 114 +#define EMSGSIZE 115 +#define ENETDOWN 116 +#define ENETRESET 117 +#define ENETUNREACH 118 +#define ENOBUFS 119 +#define ENODATA 120 +#define ENOLINK 121 +#define ENOMSG 122 +#define ENOPROTOOPT 123 +#define ENOSR 124 +#define ENOSTR 125 +#define ENOTCONN 126 +#define ENOTRECOVERABLE 127 +#define ENOTSOCK 128 +#define ENOTSUP 129 +#define EOPNOTSUPP 130 +#define EOTHER 131 +#define EOVERFLOW 132 +#define EOWNERDEAD 133 +#define EPROTO 134 +#define EPROTONOSUPPORT 135 +#define EPROTOTYPE 136 +#define ETIME 137 +#define ETIMEDOUT 138 +#define ETXTBSY 139 +#define EWOULDBLOCK 140 + +#define ENOTACTIVE 142 +#define ECHRNG 143 +#define EREMOTEIO 144 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ERRNO_H */ diff --git a/include/os/windows/spl/sys/extdirent.h b/include/os/windows/spl/sys/extdirent.h new file mode 100644 index 000000000000..aa2278f52fa3 --- /dev/null +++ b/include/os/windows/spl/sys/extdirent.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_EXTDIRENT_H +#define _SPL_EXTDIRENT_H + +#define ED_CASE_CONFLICT 0x10 + +#endif /* _SPL_EXTDIRENT_H */ 
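One detail of errno.h above worth illustrating: inside the kernel, ERESTART and EJUSTRETURN are negative so they can never collide with the positive error numbers handed back to user space (outside the kernel, ERESTART is the conventional 85). A hedged sketch of how a dispatcher might use that convention; dispatch_with_restart and zfs_ioctl_once are hypothetical names:

    /* Sketch only: zfs_ioctl_once() is a hypothetical worker function. */
    extern int zfs_ioctl_once(void *);

    static int
    dispatch_with_restart(void *arg)
    {
            int error;

            do {
                    error = zfs_ioctl_once(arg);
            } while (error == ERESTART);    /* -1 in kernel builds: retry internally */

            if (error == EJUSTRETURN)       /* -2: nothing more to report to the caller */
                    return (0);

            return (error);                 /* 0 or a positive errno for user space */
    }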
diff --git a/include/os/windows/spl/sys/fcntl.h b/include/os/windows/spl/sys/fcntl.h new file mode 100644 index 000000000000..593cc2f532b0 --- /dev/null +++ b/include/os/windows/spl/sys/fcntl.h @@ -0,0 +1,30 @@ +#ifndef _SPL_FCNTL_H +#define _SPL_FCNTL_H + +#include +#include + +#define F_FREESP 11 + +#define F_RDLCK 1 /* shared or read lock */ +#define F_UNLCK 2 /* unlock */ +#define F_WRLCK 3 /* exclusive or write lock */ +#ifdef KERNEL +#define F_WAIT 0x010 /* Wait until lock is granted */ +#define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ +#define F_POSIX 0x040 /* Use POSIX semantics for lock */ +#define F_PROV 0x080 /* Non-coalesced provisional lock */ +#define F_WAKE1_SAFE 0x100 /* its safe to only wake one waiter */ +#define F_ABORT 0x200 /* lock attempt aborted (force umount) */ +#define F_OFD_LOCK 0x400 /* Use "OFD" semantics for lock */ +#endif + +struct flock { + off_t l_start; /* starting offset */ + off_t l_len; /* len = 0 means until end of file */ + pid_t l_pid; /* lock owner */ + short l_type; /* lock type: read/write, etc. */ + short l_whence; /* type of l_start */ +}; + +#endif /* _SPL_FCNTL_H */ \ No newline at end of file diff --git a/include/os/windows/spl/sys/file.h b/include/os/windows/spl/sys/file.h new file mode 100644 index 000000000000..d6907e27c978 --- /dev/null +++ b/include/os/windows/spl/sys/file.h @@ -0,0 +1,30 @@ + +#ifndef _SPL_FILE_H +#define _SPL_FILE_H + +#define FIGNORECASE 0x00080000 +#define FKIOCTL 0x80000000 +#define FCOPYSTR 0x40000000 + +#include + +struct spl_fileproc { + void *f_vnode; // this points to the "fd" so we can look it up. + list_node_t f_next; /* next zfsdev_state_t link */ + uint64_t f_fd; + uint64_t f_offset; + void *f_proc; + void *f_fp; + int f_writes; + uint64_t f_file; // Minor of the file +}; + +//typedef struct spl_fileproc file_t; +#define file_t struct spl_fileproc + +void *getf(uint64_t fd); +void releasef(uint64_t fd); +/* O3X extended - get vnode from previos getf() */ +struct vnode *getf_vnode(void *fp); + +#endif /* SPL_FILE_H */ diff --git a/include/os/windows/spl/sys/fs/swapnode.h b/include/os/windows/spl/sys/fs/swapnode.h new file mode 100644 index 000000000000..0a24f21d0c08 --- /dev/null +++ b/include/os/windows/spl/sys/fs/swapnode.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_SWAPNODE_H +#define _SPL_SWAPNODE_H + +#endif /* SPL_SWAPNODE_H */ diff --git a/include/os/windows/spl/sys/idmap.h b/include/os/windows/spl/sys/idmap.h new file mode 100644 index 000000000000..581d6c8026b3 --- /dev/null +++ b/include/os/windows/spl/sys/idmap.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_IDMAP_H +#define _SPL_IDMAP_H + +#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U + +#endif /* SPL_IDMAP_H */ diff --git a/include/os/windows/spl/sys/int_limits.h b/include/os/windows/spl/sys/int_limits.h new file mode 100644 index 000000000000..ce860dc9befa --- /dev/null +++ b/include/os/windows/spl/sys/int_limits.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_INT_LIMITS_H +#define _SPL_INT_LIMITS_H + +#endif /* SPL_INT_LIMITS_H */ diff --git a/include/os/windows/spl/sys/int_types.h b/include/os/windows/spl/sys/int_types.h new file mode 100644 index 000000000000..47f96f6635df --- /dev/null +++ b/include/os/windows/spl/sys/int_types.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_INT_TYPES_H +#define _SPL_INT_TYPES_H + +#include + +#endif /* SPL_INT_TYPES_H */ diff --git a/include/os/windows/spl/sys/inttypes.h b/include/os/windows/spl/sys/inttypes.h new file mode 100644 index 000000000000..a28d05c8cb0e --- /dev/null +++ b/include/os/windows/spl/sys/inttypes.h @@ -0,0 +1,5 @@ + 
+#ifndef _SPL_INTTYPES_H +#define _SPL_INTTYPES_H + +#endif /* SPL_INTTYPES_H */ diff --git a/include/os/windows/spl/sys/isa_defs.h b/include/os/windows/spl/sys/isa_defs.h new file mode 100644 index 000000000000..51c434102c5d --- /dev/null +++ b/include/os/windows/spl/sys/isa_defs.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + + +#ifndef _SPL_ISA_DEFS_H +#define _SPL_ISA_DEFS_H + +/* x86_64 arch specific defines */ +#if defined(__x86_64) || defined(__x86_64__) + +#if !defined(__x86_64) +#define __x86_64 +#endif + +#if !defined(__amd64) +#define __amd64 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +#if !defined(_LP64) +#define _LP64 +#endif + +/* i386 arch specific defines */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +#if !defined(_ILP32) +#define _ILP32 +#endif + +/* powerpc (ppc64) arch specific defines */ +#elif defined(__powerpc) || defined(__powerpc__) + +#if !defined(__powerpc) +#define __powerpc +#endif + +#if !defined(__powerpc__) +#define __powerpc__ +#endif + +#if !defined(_LP64) +#define _LP64 +#endif + +/* arm arch specific defines */ +#elif defined(__arm) || defined(__arm__) + +#if !defined(__arm) +#define __arm +#endif + +#if !defined(__arm__) +#define __arm__ +#endif + +#if defined(__ARMEL__) +#define _LITTLE_ENDIAN +#else +#define _BIG_ENDIAN +#endif + +#else /* Currently only x86_64, i386, arm, and powerpc arches supported */ +#error "Unsupported ISA type" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#include + +#if defined(__LITTLE_ENDIAN) && !defined(_LITTLE_ENDIAN) +#define _LITTLE_ENDIAN __LITTLE_ENDIAN +#endif + +#if defined(__BIG_ENDIAN) && !defined(_BIG_ENDIAN) +#define _BIG_ENDIAN __BIG_ENDIAN +#endif + +#if defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) +#error "Both _LITTLE_ENDIAN and _BIG_ENDIAN are defined" +#endif + +#if !defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) +#error "Neither _LITTLE_ENDIAN or _BIG_ENDIAN are defined" +#endif + +#endif /* _SPL_ISA_DEFS_H */ diff --git a/include/os/windows/spl/sys/kidmap.h b/include/os/windows/spl/sys/kidmap.h new file mode 100644 index 000000000000..63b1ccba0b01 --- /dev/null +++ b/include/os/windows/spl/sys/kidmap.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_KIDMAP_H +#define _SPL_KIDMAP_H + +#include + +#endif /* SPL_KIDMAP_H */ diff --git a/include/os/windows/spl/sys/kmem.h b/include/os/windows/spl/sys/kmem.h new file mode 100644 index 000000000000..923450013ca9 --- /dev/null +++ b/include/os/windows/spl/sys/kmem.h @@ -0,0 +1,157 @@ +/* + * CDDL 
HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS Project + * Copyright (C) 2013 Jorgen Lundman + * Copyright (C) 2017 Sean Doran + * + */ + +#ifndef _SPL_KMEM_H +#define _SPL_KMEM_H + +#include +#include +#include +#include +//#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// XNU total amount of memory +extern uint64_t physmem; + +#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */ +#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ +#define KM_PANIC 0x0002 /* if memory cannot be allocated, panic */ +#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ +#define KM_NORMALPRI 0x0008 /* with KM_NOSLEEP, lower priority allocation */ +#define KM_NODEBUG 0x0010 /* NOT IMPLEMENTED ON OSX */ +#define KM_NO_VBA 0x0020 /* OSX: don't descend to the bucket layer */ +#define KM_VMFLAGS 0x00ff /* flags that must match VM_* flags */ + +#define KM_FLAGS 0xffff /* all settable kmem flags */ + + /* + * Kernel memory allocator: DDI interfaces. + * See kmem_alloc(9F) for details. 
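A short, hedged sketch of how the KM_* flags above are meant to be used with the allocation entry points declared a little further on in this header (kmem_alloc/kmem_zalloc/kmem_free, which map to the zfs_kmem_* functions): KM_SLEEP may block until memory is available and is guaranteed to succeed, while KM_NOSLEEP may return NULL and must be checked. kmem_flags_demo is a hypothetical function.

    /* Sketch only: assumes kmem.h is in scope. */
    static int
    kmem_flags_demo(void)
    {
            /* May block; success guaranteed, never returns NULL. */
            char *buf = kmem_zalloc(128, KM_SLEEP);

            /* Must not block; can fail, so the result has to be checked. */
            char *tmp = kmem_alloc(64, KM_NOSLEEP);
            if (tmp == NULL) {
                    kmem_free(buf, 128);
                    return (ENOMEM);
            }

            kmem_free(tmp, 64);     /* size passed to kmem_free must match the alloc */
            kmem_free(buf, 128);
            return (0);
    }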
+ */ + //MALLOC(fs, struct free_slab *, sizeof(struct free_slab), + // M_TEMP, M_WAITOK); +#define MALLOC(A,C,S,T,F) \ + (A) = (C)ExAllocatePoolWithTag(NonPagedPoolNx, (S), '!SFZ') +#define FREE(A,T) \ + ExFreePoolWithTag((A), '!SFZ') + +// Work around symbol collisions in XNU +#define kmem_alloc(size, kmflags) zfs_kmem_alloc((size), (kmflags)) +#define kmem_zalloc(size, kmflags) zfs_kmem_zalloc((size), (kmflags)) +#define kmem_free(buf, size) zfs_kmem_free((buf), (size)) + + void* zfs_kmem_alloc(uint32_t size, int kmflags); + void* zfs_kmem_zalloc(uint32_t size, int kmflags); + void zfs_kmem_free(void *buf, uint32_t size); + + void spl_kmem_init(uint64_t); + void spl_kmem_thread_init(); + void spl_kmem_mp_init(); + void spl_kmem_thread_fini(); + void spl_kmem_fini(); + + uint64_t kmem_size(void); + uint64_t kmem_used(void); + int64_t kmem_avail(void); + uint64_t kmem_num_pages_wanted(); + int spl_vm_pool_low(void); + int64_t spl_minimal_physmem_p(void); + int64_t spl_adjust_pressure(int64_t); + int64_t spl_free_wrapper(void); + int64_t spl_free_manual_pressure_wrapper(void); + boolean_t spl_free_fast_pressure_wrapper(void); + void spl_free_set_pressure(int64_t); + void spl_free_set_fast_pressure(boolean_t); + uint64_t spl_free_last_pressure_wrapper(void); + +#define KMC_NOTOUCH 0x00010000 +#define KMC_NODEBUG 0x00020000 +#define KMC_NOMAGAZINE 0x00040000 +#define KMC_NOHASH 0x00080000 +#define KMC_QCACHE 0x00100000 +#define KMC_KMEM_ALLOC 0x00200000 /* internal use only */ +#define KMC_IDENTIFIER 0x00400000 /* internal use only */ +#define KMC_PREFILL 0x00800000 +#define KMC_ARENA_SLAB 0x01000000 /* use a bigger kmem cache */ + + struct kmem_cache; + + typedef struct kmem_cache kmem_cache_t; + + /* Client response to kmem move callback */ + typedef enum kmem_cbrc { + KMEM_CBRC_YES, + KMEM_CBRC_NO, + KMEM_CBRC_LATER, + KMEM_CBRC_DONT_NEED, + KMEM_CBRC_DONT_KNOW + } kmem_cbrc_t; + +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + + kmem_cache_t *kmem_cache_create(char *name, uint32_t bufsize, uint32_t align, + int (*constructor)(void *, void *, int), + void (*destructor)(void *, void *), + void (*reclaim)(void *), + void *_private, struct vmem *vmp, int cflags); + void kmem_cache_destroy(kmem_cache_t *cache); + void *kmem_cache_alloc(kmem_cache_t *cache, int flags); + void kmem_cache_free(kmem_cache_t *cache, void *buf); + void kmem_cache_free_to_slab(kmem_cache_t *cache, void *buf); + void kmem_cache_reap_now(kmem_cache_t *cache); + void kmem_depot_ws_zero(kmem_cache_t *cache); + void kmem_reap(void); + void kmem_reap_idspace(void); + kmem_cache_t *kmem_cache_buf_in_cache(kmem_cache_t *, void *); + + int kmem_debugging(void); + void kmem_cache_set_move(kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, uint32_t, void *)); + + // void *calloc(uint32_t n, uint32_t s); + char *kmem_asprintf(const char *fmt, ...); + void strfree(char *str); + char *kmem_vasprintf(const char *fmt, va_list ap); + char *kmem_strstr(const char *in, const char *str); + void strident_canon(char *s, uint32_t n); + + boolean_t spl_arc_no_grow(uint32_t, boolean_t, kmem_cache_t **); + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_KMEM_H */ diff --git a/include/os/windows/spl/sys/kmem_impl.h b/include/os/windows/spl/sys/kmem_impl.h new file mode 100644 index 000000000000..c973462b602b --- /dev/null +++ b/include/os/windows/spl/sys/kmem_impl.h @@ -0,0 +1,503 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to 
the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_KMEM_IMPL_H +#define _SYS_KMEM_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include +#include +//#include + +#ifdef __cplusplus +extern "C" { +#endif + +#pragma pack(2) + + /* + * kernel memory allocator: implementation-private data structures + * + * Lock order: + * 1. cache_lock + * 2. cc_lock in order by CPU ID + * 3. cache_depot_lock + * + * Do not call kmem_cache_alloc() or taskq_dispatch() while holding any of the + * above locks. + */ + +#define KMF_AUDIT 0x00000001 /* transaction auditing */ +#define KMF_DEADBEEF 0x00000002 /* deadbeef checking */ +#define KMF_REDZONE 0x00000004 /* redzone checking */ +#define KMF_CONTENTS 0x00000008 /* freed-buffer content logging */ +#define KMF_STICKY 0x00000010 /* if set, override /etc/system */ +#define KMF_NOMAGAZINE 0x00000020 /* disable per-cpu magazines */ +#define KMF_FIREWALL 0x00000040 /* put all bufs before unmapped pages */ +#define KMF_LITE 0x00000100 /* lightweight debugging */ + +#define KMF_HASH 0x00000200 /* cache has hash table */ +#define KMF_RANDOMIZE 0x00000400 /* randomize other kmem_flags */ + +#define KMF_DUMPDIVERT 0x00001000 /* use alternate memory at dump time */ +#define KMF_DUMPUNSAFE 0x00002000 /* flag caches used at dump time */ +#define KMF_PREFILL 0x00004000 /* Prefill the slab when created. */ + +#define KMF_BUFTAG (KMF_DEADBEEF | KMF_REDZONE) +#define KMF_TOUCH (KMF_BUFTAG | KMF_LITE | KMF_CONTENTS) +#define KMF_RANDOM (KMF_TOUCH | KMF_AUDIT | KMF_NOMAGAZINE) +#define KMF_DEBUG (KMF_RANDOM | KMF_FIREWALL) + +#define KMEM_STACK_DEPTH 15 + +#define KMEM_FREE_PATTERN 0xdeadbeefdeadbeefULL +#define KMEM_UNINITIALIZED_PATTERN 0xbaddcafebaddcafeULL +#define KMEM_REDZONE_PATTERN 0xfeedfacefeedfaceULL +#define KMEM_REDZONE_BYTE 0xbb + + /* + * Upstream platforms handle size == 0 as valid alloc, we + * can not return NULL, as that invalidates KM_SLEEP. So + * we return a valid hardcoded address, instead of actually taking up + * memory by fudging size to 1 byte. If read/writes are + * attempted, we will get page fault (which is correct, they + * asked for zero bytes after all) + */ +#define KMEM_ZERO_SIZE_PTR ((void *)16) + + /* + * Redzone size encodings for kmem_alloc() / kmem_free(). We encode the + * allocation size, rather than storing it directly, so that kmem_free() + * can distinguish frees of the wrong size from redzone violations. + * + * A size of zero is never valid. 
+ */ +#define KMEM_SIZE_ENCODE(x) (251 * (x) + 1) +#define KMEM_SIZE_DECODE(x) ((x) / 251) +#define KMEM_SIZE_VALID(x) ((x) % 251 == 1 && (x) != 1) + + +#define KMEM_ALIGN 8 /* min guaranteed alignment */ +#define KMEM_ALIGN_SHIFT 3 /* log2(KMEM_ALIGN) */ +#define KMEM_VOID_FRACTION 8 /* never waste more than 1/8 of slab */ + +#define KMEM_SLAB_IS_PARTIAL(sp) \ +((sp)->slab_refcnt > 0 && (sp)->slab_refcnt < (sp)->slab_chunks) +#define KMEM_SLAB_IS_ALL_USED(sp) \ +((sp)->slab_refcnt == (sp)->slab_chunks) + + /* + * The bufctl (buffer control) structure keeps some minimal information + * about each buffer: its address, its slab, and its current linkage, + * which is either on the slab's freelist (if the buffer is free), or + * on the cache's buf-to-bufctl hash table (if the buffer is allocated). + * In the case of non-hashed, or "raw", caches (the common case), only + * the freelist linkage is necessary: the buffer address is at a fixed + * offset from the bufctl address, and the slab is at the end of the page. + * + * NOTE: bc_next must be the first field; raw buffers have linkage only. + */ + typedef struct kmem_bufctl { + struct kmem_bufctl *bc_next; /* next bufctl struct */ + void *bc_addr; /* address of buffer */ + struct kmem_slab *bc_slab; /* controlling slab */ + } kmem_bufctl_t; + + /* + * The KMF_AUDIT version of the bufctl structure. The beginning of this + * structure must be identical to the normal bufctl structure so that + * pointers are interchangeable. + */ + typedef struct kmem_bufctl_audit { + struct kmem_bufctl *bc_next; /* next bufctl struct */ + void *bc_addr; /* address of buffer */ + struct kmem_slab *bc_slab; /* controlling slab */ + kmem_cache_t *bc_cache; /* controlling cache */ + hrtime_t bc_timestamp; /* transaction time */ + kthread_t *bc_thread; /* thread doing transaction */ + struct kmem_bufctl *bc_lastlog; /* last log entry */ + void *bc_contents; /* contents at last free */ + int bc_depth; /* stack depth */ + pc_t bc_stack[KMEM_STACK_DEPTH]; /* pc stack */ + } kmem_bufctl_audit_t; + + /* + * A kmem_buftag structure is appended to each buffer whenever any of the + * KMF_BUFTAG flags (KMF_DEADBEEF, KMF_REDZONE, KMF_VERIFY) are set. + */ + typedef struct kmem_buftag { + uint64_t bt_redzone; /* 64-bit redzone pattern */ + kmem_bufctl_t *bt_bufctl; /* bufctl */ + intptr_t bt_bxstat; /* bufctl ^ (alloc/free) */ + } kmem_buftag_t; + + /* + * A variant of the kmem_buftag structure used for KMF_LITE caches. + * Previous callers are stored in reverse chronological order. (i.e. most + * recent first) + */ + typedef struct kmem_buftag_lite { + kmem_buftag_t bt_buftag; /* a normal buftag */ + pc_t bt_history[1]; /* zero or more callers */ + } kmem_buftag_lite_t; + +#define KMEM_BUFTAG_LITE_SIZE(f) \ +(offsetof(kmem_buftag_lite_t, bt_history[f])) + +#define KMEM_BUFTAG(cp, buf) \ +((kmem_buftag_t *)((char *)(buf) + (cp)->cache_buftag)) + +#define KMEM_BUFCTL(cp, buf) \ +((kmem_bufctl_t *)((char *)(buf) + (cp)->cache_bufctl)) + +#define KMEM_BUF(cp, bcp) \ +((void *)((char *)(bcp) - (cp)->cache_bufctl)) + +#define KMEM_SLAB(cp, buf) \ +((kmem_slab_t *)P2END((uintptr_t)(buf), (cp)->cache_slabsize) - 1) + + /* + * Test for using alternate memory at dump time. + */ +#define KMEM_DUMP(cp) ((cp)->cache_flags & KMF_DUMPDIVERT) +#define KMEM_DUMPCC(ccp) ((ccp)->cc_flags & KMF_DUMPDIVERT) + + /* + * The "CPU" macro loads a cpu_t that refers to the cpu that the current + * thread is running on at the time the macro is executed. 
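The KMEM_SIZE_ENCODE/DECODE scheme above is easier to see with a worked example: for a 100-byte allocation the encoded value is 251 * 100 + 1 = 25101; decoding gives 25101 / 251 = 100, and the validity test holds because 25101 % 251 == 1 and the value is not 1. A free of the wrong size still decodes to a valid (but different) size, whereas a redzone overwrite almost never leaves a value that satisfies the "% 251 == 1" property, which is how kmem_free() tells the two apart. A minimal sketch (redzone_encode_demo is hypothetical):

    /* Sketch only: worked example of the size encoding above. */
    static void
    redzone_encode_demo(void)
    {
            uint64_t enc = KMEM_SIZE_ENCODE(100);           /* 251 * 100 + 1 == 25101 */

            ASSERT3U(KMEM_SIZE_DECODE(enc), ==, 100);       /* 25101 / 251 == 100 */
            VERIFY(KMEM_SIZE_VALID(enc));                   /* 25101 % 251 == 1 */
            VERIFY(!KMEM_SIZE_VALID(enc + 1));              /* corrupted value fails */
    }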
A context switch + * may occur immediately after loading this data structure, leaving this + * thread pointing at the cpu_t for the previous cpu. This is not a problem; + * we'd just end up checking the previous cpu's per-cpu cache, and then check + * the other layers of the kmem cache if need be. + * + * It's not even a problem if the old cpu gets DR'ed out during the context + * switch. The cpu-remove DR operation bzero()s the cpu_t, but doesn't free + * it. So the cpu_t's cpu_cache_offset would read as 0, causing us to use + * cpu 0's per-cpu cache. + * + * So, there is no need to disable kernel preemption while using the CPU macro + * below since if we have been context switched, there will not be any + * correctness problem, just a momentary use of a different per-cpu cache. + */ + +#define KMEM_CPU_CACHE(cp) \ +(&cp->cache_cpu[cpu_number()]) + +#define KMOM_MAGAZINE_VALID(cp, mp) \ +(((kmem_slab_t *)P2END((uintptr_t)(mp), PAGESIZE) - 1)->slab_cache == \ +(cp)->cache_magtype->mt_cache) + +#define KMEM_MAGAZINE_VALID(cp, mp) \ +(((kmem_slab_t *)P2END((uintptr_t)(mp), PAGESIZE) - 1)->slab_cache == \ +(cp)->cache_magtype->mt_cache) + +#define KMEM_SLAB_OFFSET(sp, buf) \ +((uint32_t)((uintptr_t)(buf) - (uintptr_t)((sp)->slab_base))) + +#define KMEM_SLAB_MEMBER(sp, buf) \ +(KMEM_SLAB_OFFSET(sp, buf) < (sp)->slab_cache->cache_slabsize) + +#define KMEM_BUFTAG_ALLOC 0xa110c8edUL +#define KMEM_BUFTAG_FREE 0xf4eef4eeUL + + /* slab_later_count thresholds */ +#define KMEM_DISBELIEF 3 + + /* slab_flags */ +#define KMEM_SLAB_NOMOVE 0x1 +#define KMEM_SLAB_MOVE_PENDING 0x2 + + typedef struct kmem_slab { + struct kmem_cache *slab_cache; /* controlling cache */ + void *slab_base; /* base of allocated memory */ + avl_node_t slab_link; /* slab linkage */ + struct kmem_bufctl *slab_head; /* first free buffer */ + long slab_refcnt; /* outstanding allocations */ + long slab_chunks; /* chunks (bufs) in this slab */ + uint32_t slab_stuck_offset; /* unmoved buffer offset */ + uint16_t slab_later_count; /* cf KMEM_CBRC_LATER */ + uint16_t slab_flags; /* bits to mark the slab */ + hrtime_t slab_create_time; /* when was slab created? 
*/ + } kmem_slab_t; + +#define KMEM_HASH_INITIAL 64 + +#define KMEM_HASH(cp, buf) \ +((cp)->cache_hash_table + \ +(((uintptr_t)(buf) >> (cp)->cache_hash_shift) & (cp)->cache_hash_mask)) + +#define KMEM_CACHE_NAMELEN 31 + + typedef struct kmem_magazine { + void *mag_next; + void *mag_round[1]; /* one or more rounds */ + } kmem_magazine_t; + + /* + * The magazine types for fast per-cpu allocation + */ + typedef struct kmem_magtype { + short mt_magsize; /* magazine size (number of rounds) */ + int mt_align; /* magazine alignment */ + uint32_t mt_minbuf; /* all smaller buffers qualify */ + uint32_t mt_maxbuf; /* no larger buffers qualify */ + kmem_cache_t *mt_cache; /* magazine cache */ + } kmem_magtype_t; + +#define KMEM_CPU_CACHE_SIZE 128 /* must be power of 2 */ +#define KMEM_CPU_PAD (KMEM_CPU_CACHE_SIZE - sizeof (kmutex_t) - \ + 2 * sizeof (uint64_t) - 2 * sizeof (void *) - sizeof (int) - \ + 5 * sizeof (short)) +#define KMEM_CACHE_SIZE(ncpus) \ + offsetof(kmem_cache_t, cache_cpu[ncpus]) + + /* Offset from kmem_cache->cache_cpu for per cpu caches */ +#define KMEM_CPU_CACHE_OFFSET(cpuid) \ + offsetof(kmem_cache_t, cache_cpu[cpuid]) - \ + offsetof(kmem_cache_t, cache_cpu) + +// ((uint32_t)(&((kmem_cache_t *)0)->cache_cpu[cpuid]) - \ +// (uint32_t)(&((kmem_cache_t *)0)->cache_cpu)) + + /* + * Per CPU cache data + */ + typedef struct kmem_cpu_cache { + kmutex_t cc_lock; /* protects this cpu's local cache */ + uint64_t cc_alloc; /* allocations from this cpu */ + uint64_t cc_free; /* frees to this cpu */ + kmem_magazine_t *cc_loaded; /* the currently loaded magazine */ + kmem_magazine_t *cc_ploaded; /* the previously loaded magazine */ + int cc_flags; /* CPU-local copy of cache_flags */ + short cc_rounds; /* number of objects in loaded mag */ + short cc_prounds; /* number of objects in previous mag */ + short cc_magsize; /* number of rounds in a full mag */ + short cc_dump_rounds; /* dump time copy of cc_rounds */ + short cc_dump_prounds; /* dump time copy of cc_prounds */ + char cc_pad[KMEM_CPU_PAD]; /* for nice alignment */ + } kmem_cpu_cache_t; + + /* + * The magazine lists used in the depot. 
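The KMEM_CACHE_SIZE()/KMEM_CPU_CACHE_OFFSET() macros above, together with the cache_cpu[] array at the end of struct kmem_cache (declared later in this header), implement the usual variable-length-structure trick: a kmem_cache_t is over-allocated so that one padded kmem_cpu_cache_t slot exists per CPU, and KMEM_CPU_CACHE() indexes into that space by cpu id. A hedged sketch of the size arithmetic (cache_size_demo is hypothetical, and it assumes sizeof (kmem_cpu_cache_t) comes out to the intended 128 bytes):

    /* Sketch only: illustrates the size arithmetic, not real allocation code. */
    static void
    cache_size_demo(uint32_t ncpus)
    {
            /* Total allocation: header plus one padded per-CPU slot per CPU. */
            uint32_t alloc_size = KMEM_CACHE_SIZE(ncpus);

            /* Each CPU's slot sits at a fixed 128-byte stride from cache_cpu[0]. */
            uint32_t last_off = KMEM_CPU_CACHE_OFFSET(ncpus - 1);

            ASSERT3U(last_off, ==, (ncpus - 1) * KMEM_CPU_CACHE_SIZE);
            ASSERT3U(alloc_size, ==, offsetof(kmem_cache_t, cache_cpu) +
                ncpus * sizeof (kmem_cpu_cache_t));
    }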
+ */ + typedef struct kmem_maglist { + kmem_magazine_t *ml_list; /* magazine list */ + long ml_total; /* number of magazines */ + long ml_min; /* min since last update */ + long ml_reaplimit; /* max reapable magazines */ + uint64_t ml_alloc; /* allocations from this list */ + } kmem_maglist_t; + + typedef struct kmem_defrag { + /* + * Statistics + */ + uint64_t kmd_callbacks; /* move callbacks */ + uint64_t kmd_yes; /* KMEM_CBRC_YES responses */ + uint64_t kmd_no; /* NO responses */ + uint64_t kmd_later; /* LATER responses */ + uint64_t kmd_dont_need; /* DONT_NEED responses */ + uint64_t kmd_dont_know; /* DONT_KNOW responses */ + uint64_t kmd_hunt_found; /* DONT_KNOW: # found in mag */ + uint64_t kmd_slabs_freed; /* slabs freed by moves */ + uint64_t kmd_defrags; /* kmem_cache_defrag() */ + uint64_t kmd_scans; /* kmem_cache_scan() */ + + /* + * Consolidator fields + */ + avl_tree_t kmd_moves_pending; /* buffer moves pending */ + list_t kmd_deadlist; /* deferred slab frees */ + uint32_t kmd_deadcount; /* # of slabs in kmd_deadlist */ + uint8_t kmd_reclaim_numer; /* slab usage threshold */ + uint8_t kmd_pad1; /* compiler padding */ + uint16_t kmd_consolidate; /* triggers consolidator */ + uint32_t kmd_pad2; /* compiler padding */ + uint32_t kmd_slabs_sought; /* reclaimable slabs sought */ + uint32_t kmd_slabs_found; /* reclaimable slabs found */ + uint32_t kmd_tries; /* nth scan interval counter */ + /* + * Fields used to ASSERT that the client does not kmem_cache_free() + * objects passed to the move callback. + */ + void *kmd_from_buf; /* object to move */ + void *kmd_to_buf; /* move destination */ + kthread_t *kmd_thread; /* thread calling move */ + } kmem_defrag_t; + + /* + * Cache callback function types + */ + typedef int (*constructor_fn_t)(void*, void*, int); + typedef void (*destructor_fn_t)(void*, void*); + typedef void (*reclaim_fn_t)(void*); + + /* + * Cache + */ + struct kmem_cache { + + /* + * Statistics + */ + uint64_t cache_slab_create; /* slab creates */ + uint64_t cache_slab_destroy; /* slab destroys */ + uint64_t cache_slab_alloc; /* slab layer allocations */ + uint64_t cache_slab_free; /* slab layer frees */ + uint64_t cache_alloc_fail; /* total failed allocations */ + uint64_t cache_buftotal; /* total buffers */ + uint64_t cache_bufmax; /* max buffers ever */ + uint64_t cache_bufslab; /* buffers free in slab layer */ + uint64_t cache_reap; /* cache reaps */ + uint64_t cache_rescale; /* hash table rescales */ + uint64_t cache_lookup_depth; /* hash lookup depth */ + uint64_t cache_depot_contention; /* mutex contention count */ + uint64_t cache_depot_contention_prev; /* previous snapshot */ + uint64_t cache_alloc_count; /* Number of allocations in cache */ + uint64_t no_vba_success; /* successful calls with KM_NO_VBA flag set */ + uint64_t no_vba_fail; + uint64_t arc_no_grow_set; /* number of times we set arc growth suppression time */ + uint64_t arc_no_grow; /* number of times spl_zio_is_suppressed returned true for this cache */ + + /* + * Cache properties + */ + char cache_name[KMEM_CACHE_NAMELEN + 1]; + uint32_t cache_bufsize; /* object size */ + uint32_t cache_align; /* object alignment */ + int (*cache_constructor)(void *, void *, int); + void (*cache_destructor)(void *, void *); + void (*cache_reclaim)(void *); + kmem_cbrc_t (*cache_move)(void *, void *, uint32_t, void *); + void *cache_private; /* opaque arg to callbacks */ + vmem_t *cache_arena; /* vmem source for slabs */ + int cache_cflags; /* cache creation flags */ + int cache_flags; /* various cache state info 
*/ + uint32_t cache_mtbf; /* induced alloc failure rate */ + uint32_t cache_pad1; /* compiler padding */ + kstat_t *cache_kstat; /* exported statistics */ + list_node_t cache_link; /* cache linkage */ + + /* + * Slab layer + */ + kmutex_t cache_lock; /* protects slab layer */ + + uint32_t cache_chunksize; /* buf + alignment [+ debug] */ + uint32_t cache_slabsize; /* size of a slab */ + uint32_t cache_maxchunks; /* max buffers per slab */ + uint32_t cache_bufctl; /* buf-to-bufctl distance */ + uint32_t cache_buftag; /* buf-to-buftag distance */ + uint32_t cache_verify; /* bytes to verify */ + uint32_t cache_contents; /* bytes of saved content */ + uint32_t cache_color; /* next slab color */ + uint32_t cache_mincolor; /* maximum slab color */ + uint32_t cache_maxcolor; /* maximum slab color */ + uint32_t cache_hash_shift; /* get to interesting bits */ + uint32_t cache_hash_mask; /* hash table mask */ + list_t cache_complete_slabs; /* completely allocated slabs */ + uint32_t cache_complete_slab_count; + avl_tree_t cache_partial_slabs; /* partial slab freelist */ + uint32_t cache_partial_binshift; /* for AVL sort bins */ + kmem_cache_t *cache_bufctl_cache; /* source of bufctls */ + kmem_bufctl_t **cache_hash_table; /* hash table base */ + kmem_defrag_t *cache_defrag; /* slab consolidator fields */ + + /* + * Depot layer + */ + kmutex_t cache_depot_lock; /* protects depot */ + kmem_magtype_t *cache_magtype; /* magazine type */ + kmem_maglist_t cache_full; /* full magazines */ + kmem_maglist_t cache_empty; /* empty magazines */ + void *cache_dumpfreelist; /* heap during crash dump */ + void *cache_dumplog; /* log entry during dump */ + + /* + * Per CPU structures + */ + // XNU adjust to suit offsetof + kmem_cpu_cache_t cache_cpu[1]; /* per-cpu data */ + + } ; + + typedef struct kmem_cpu_log_header { + kmutex_t clh_lock; + char *clh_current; + uint32_t clh_avail; + int clh_chunk; + int clh_hits; +#if defined (SPL_DEBUG_MUTEX) + char clh_pad[128 - sizeof (kmutex_t) - sizeof (char *) - + sizeof (uint32_t) - 2 * sizeof (int)]; +#else + char clh_pad[128 - sizeof (kmutex_t) - sizeof (char *) - + sizeof (uint32_t) - 2 * sizeof (int)]; +#endif + } kmem_cpu_log_header_t; + + typedef struct kmem_log_header { + kmutex_t lh_lock; + char *lh_base; + int *lh_free; + uint32_t lh_chunksize; + int lh_nchunks; + int lh_head; + int lh_tail; + int lh_hits; + kmem_cpu_log_header_t lh_cpu[1]; /* ncpus actually allocated */ + } kmem_log_header_t; + + /* kmem_move kmm_flags */ +#define KMM_DESPERATE 0x1 +#define KMM_NOTIFY 0x2 +#define KMM_DEBUG 0x4 + + typedef struct kmem_move { + kmem_slab_t *kmm_from_slab; + void *kmm_from_buf; + void *kmm_to_buf; + avl_node_t kmm_entry; + int kmm_flags; + } kmem_move_t; + + /* + * In order to consolidate partial slabs, it must be possible for the cache to + * have partial slabs. + */ +#define KMEM_IS_MOVABLE(cp) \ + (((cp)->cache_chunksize * 2) <= (cp)->cache_slabsize) + +#pragma pack() + +#endif diff --git a/include/os/windows/spl/sys/kobj.h b/include/os/windows/spl/sys/kobj.h new file mode 100644 index 000000000000..fb30531f4fa3 --- /dev/null +++ b/include/os/windows/spl/sys/kobj.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPL_KOBJ_H +#define _SPL_KOBJ_H + +#include + + +struct _buf { + intptr_t _fd; +}; + +struct bootstat { + uint64_t st_size; +}; + +//typedef struct _buf buf_t; + +extern struct _buf *kobj_open_file(char *name); +extern void kobj_close_file(struct _buf *file); +extern int kobj_read_file(struct _buf *file, char *buf, + ssize_t size, offset_t off); +extern int kobj_get_filesize(struct _buf *file, uint64_t *size); + +#endif /* SPL_KOBJ_H */ diff --git a/include/os/windows/spl/sys/kstat.h b/include/os/windows/spl/sys/kstat.h new file mode 100644 index 000000000000..19b864de5d2b --- /dev/null +++ b/include/os/windows/spl/sys/kstat.h @@ -0,0 +1,277 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
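The kobj_* interface above is the traditional Solaris way to read a file from kernel context (ZFS uses it for things like the pool cache file). A hedged usage sketch follows; kobj_read_demo is hypothetical, and the (struct _buf *)-1 failure check mirrors the long-standing Solaris convention rather than anything specified in this header:

    /* Sketch only: the path and buffer handling are illustrative. */
    static int
    kobj_read_demo(char *path, char *buf, ssize_t buflen)
    {
            struct _buf *file;
            uint64_t size = 0;

            file = kobj_open_file(path);
            if (file == (struct _buf *)-1)          /* traditional failure value */
                    return (ENOENT);

            if (kobj_get_filesize(file, &size) != 0 || size > (uint64_t)buflen) {
                    kobj_close_file(file);
                    return (EIO);
            }

            /* Read the whole file from offset 0 into the caller's buffer. */
            kobj_read_file(file, buf, (ssize_t)size, 0);
            kobj_close_file(file);
            return (0);
    }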
+ */ + +#ifndef _SPL_KSTAT_H +#define _SPL_KSTAT_H + +#include +#include +#include +//#include +#include + +/* +* Kernel statistics driver (/dev/zfs) ioctls +* Defined outside the ZFS ioctls, and handled separately in zfs_vnops_windows.c +*/ + +#define KSTAT_IOC_CHAIN_ID CTL_CODE(ZFSIOCTL_TYPE, 0x7FD, METHOD_NEITHER, FILE_ANY_ACCESS) +#define KSTAT_IOC_READ CTL_CODE(ZFSIOCTL_TYPE, 0x7FE, METHOD_NEITHER, FILE_ANY_ACCESS) +#define KSTAT_IOC_WRITE CTL_CODE(ZFSIOCTL_TYPE, 0x7FF, METHOD_NEITHER, FILE_ANY_ACCESS) + + +#define KSTAT_STRLEN 31 + +#if defined(_KERNEL) + +#define KSTAT_ENTER(k) \ + { kmutex_t *lp = (k)->ks_lock; if (lp) mutex_enter(lp); } + +#define KSTAT_EXIT(k) \ + { kmutex_t *lp = (k)->ks_lock; if (lp) mutex_exit(lp); } + +#define KSTAT_UPDATE(k, rw) (*(k)->ks_update)((k), (rw)) + +#define KSTAT_SNAPSHOT(k, buf, rw) (*(k)->ks_snapshot)((k), (buf), (rw)) + +#endif /* defined(_KERNEL) */ + +/* For reference valid classes are: + * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc + */ + +#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ +#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ +#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ +#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ +#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ +#define KSTAT_TYPE_TXG 5 /* txg sync; ks_ndata >= 1 */ +#define KSTAT_NUM_TYPES 6 + +#define KSTAT_DATA_CHAR 0 +#define KSTAT_DATA_INT32 1 +#define KSTAT_DATA_UINT32 2 +#define KSTAT_DATA_INT64 3 +#define KSTAT_DATA_UINT64 4 +#define KSTAT_DATA_LONG 5 +#define KSTAT_DATA_ULONG 6 +#define KSTAT_DATA_STRING 7 +#define KSTAT_NUM_DATAS 8 + +#define KSTAT_INTR_HARD 0 +#define KSTAT_INTR_SOFT 1 +#define KSTAT_INTR_WATCHDOG 2 +#define KSTAT_INTR_SPURIOUS 3 +#define KSTAT_INTR_MULTSVC 4 +#define KSTAT_NUM_INTRS 5 + +#define KSTAT_FLAG_VIRTUAL 0x01 +#define KSTAT_FLAG_VAR_SIZE 0x02 +#define KSTAT_FLAG_WRITABLE 0x04 +#define KSTAT_FLAG_PERSISTENT 0x08 +#define KSTAT_FLAG_DORMANT 0x10 +#define KSTAT_FLAG_UNSUPPORTED (KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_WRITABLE | \ +KSTAT_FLAG_PERSISTENT | KSTAT_FLAG_DORMANT) +#define KSTAT_FLAG_INVALID 0x20 +#define KSTAT_FLAG_LONGSTRINGS 0x40 + +#define KS_MAGIC 0x9d9d9d9d + +#define KSTAT_NAMED_PTR(kptr) ((kstat_named_t *)(kptr)->ks_data) + + +/* Dynamic updates */ +#define KSTAT_READ 0 +#define KSTAT_WRITE 1 + +struct kstat; + +typedef int kid_t; /* unique kstat id */ +typedef int kstat_update_t(struct kstat *, int); /* dynamic update cb */ + +#pragma pack(4) +typedef struct kstat { + /* + * Fields relevant to both kernel and user + */ + hrtime_t ks_crtime; /* creation time (from gethrtime()) */ + struct kstat *ks_next; /* kstat chain linkage */ + kid_t ks_kid; /* unique kstat ID */ + char ks_module[KSTAT_STRLEN]; /* provider module name */ + uchar_t ks_resv; /* reserved, currently just padding */ + int ks_instance; /* provider module's instance */ + char ks_name[KSTAT_STRLEN]; /* kstat name */ + uchar_t ks_type; /* kstat data type */ + char ks_class[KSTAT_STRLEN]; /* kstat class */ + uchar_t ks_flags; /* kstat flags */ + void *ks_data; /* kstat type-specific data */ + uint_t ks_ndata; /* # of type-specific data records */ + size_t ks_data_size; /* total size of kstat data section */ + hrtime_t ks_snaptime; /* time of last data snapshot */ + /* + * Fields relevant to kernel only + */ + int(*ks_update)(struct kstat *, int); /* dynamic update */ + void *ks_private; /* arbitrary provider-private data */ + int(*ks_snapshot)(struct kstat *, void *, int); + void
*ks_lock; /* protects this kstat's data */ + + int ks_returnvalue; + int ks_errnovalue; +} kstat_t; +#pragma pack() + +#pragma pack(4) +typedef struct kstat_named { + char name[KSTAT_STRLEN]; /* name of counter */ + uchar_t data_type; /* data type */ + union { + char c[16]; /* enough for 128-bit ints */ + int32_t i32; + uint32_t ui32; + struct { + union { + char *ptr; /* NULL-term string */ +#if defined(_KERNEL) && defined(_MULTI_DATAMODEL) + caddr32_t ptr32; +#endif + char __pad[8]; /* 64-bit padding */ + } addr; + uint32_t len; /* # bytes for strlen + '\0' */ + } str; + /* + * The int64_t and uint64_t types are not valid for a maximally conformant + * 32-bit compilation environment (cc -Xc) using compilers prior to the + * introduction of C99 conforming compiler (reference ISO/IEC 9899:1990). + * In these cases, the visibility of i64 and ui64 is only permitted for + * 64-bit compilation environments or 32-bit non-maximally conformant + * C89 or C90 ANSI C compilation environments (cc -Xt and cc -Xa). In the + * C99 ANSI C compilation environment, the long long type is supported. + * The _INT64_TYPE is defined by the implementation (see sys/int_types.h). + */ + int64_t i64; + uint64_t ui64; + + long l; + ulong_t ul; + + /* These structure members are obsolete */ + + longlong_t ll; + u_longlong_t ull; + float f; + double d; + } value; /* value of counter */ +} kstat_named_t; +#pragma pack() + + +#define KSTAT_NAMED_PTR(kptr) ((kstat_named_t *)(kptr)->ks_data) + +/* +* Retrieve the pointer of the string contained in the given named kstat. +*/ +#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.str.addr.ptr) + +/* +* Retrieve the length of the buffer required to store the string in the given +* named kstat. +*/ +#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.str.len) + +typedef struct kstat_intr { + uint_t intrs[KSTAT_NUM_INTRS]; +} kstat_intr_t; + +typedef struct kstat_io { + u_longlong_t nread; /* number of bytes read */ + u_longlong_t nwritten; /* number of bytes written */ + uint_t reads; /* number of read operations */ + uint_t writes; /* number of write operations */ + hrtime_t wtime; /* cumulative wait (pre-service) time */ + hrtime_t wlentime; /* cumulative wait length*time product*/ + hrtime_t wlastupdate; /* last time wait queue changed */ + hrtime_t rtime; /* cumulative run (service) time */ + hrtime_t rlentime; /* cumulative run length*time product */ + hrtime_t rlastupdate; /* last time run queue changed */ + uint_t wcnt; /* count of elements in wait state */ + uint_t rcnt; /* count of elements in run state */ +} kstat_io_t; + +typedef struct kstat_timer { + char name[KSTAT_STRLEN+1]; /* event name */ + u_longlong_t num_events; /* number of events */ + hrtime_t elapsed_time; /* cumulative elapsed time */ + hrtime_t min_time; /* shortest event duration */ + hrtime_t max_time; /* longest event duration */ + hrtime_t start_time; /* previous event start time */ + hrtime_t stop_time; /* previous event stop time */ +} kstat_timer_t; + +void spl_kstat_init(void); +void spl_kstat_fini(void); + +typedef uint64_t zoneid_t; +#define ALL_ZONES 0 + +extern kstat_t *kstat_create(const char *, int, const char *, const char *, + uchar_t, uint_t, uchar_t); +extern kstat_t *kstat_create_zone(const char *, int, const char *, + const char *, uchar_t, uint_t, uchar_t, zoneid_t); +extern void kstat_install(kstat_t *); +extern void kstat_delete(kstat_t *); +extern void kstat_named_setstr(kstat_named_t *knp, const char *src); +extern void kstat_set_string(char *, const char *); +extern void 
kstat_delete_byname(const char *, int, const char *); +extern void kstat_delete_byname_zone(const char *, int, const char *, zoneid_t); +extern void kstat_named_init(kstat_named_t *, const char *, uchar_t); +extern void kstat_timer_init(kstat_timer_t *, const char *); +extern void kstat_waitq_enter(kstat_io_t *); +extern void kstat_waitq_exit(kstat_io_t *); +extern void kstat_runq_enter(kstat_io_t *); +extern void kstat_runq_exit(kstat_io_t *); +extern void kstat_waitq_to_runq(kstat_io_t *); +extern void kstat_runq_back_to_waitq(kstat_io_t *); +extern void kstat_timer_start(kstat_timer_t *); +extern void kstat_timer_stop(kstat_timer_t *); + +extern void kstat_zone_add(kstat_t *, zoneid_t); +extern void kstat_zone_remove(kstat_t *, zoneid_t); +extern int kstat_zone_find(kstat_t *, zoneid_t); + +extern kstat_t *kstat_hold_bykid(kid_t kid, zoneid_t); +extern kstat_t *kstat_hold_byname(const char *, int, const char *, zoneid_t); +extern void kstat_rele(kstat_t *); + +extern void kstat_set_raw_ops(kstat_t *ksp, + int(*headers)(char *buf, size_t size), + int(*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, off_t index)); + +int spl_kstat_chain_id(PDEVICE_OBJECT DiskDevice, PIRP Irp, PIO_STACK_LOCATION IrpSp); +int spl_kstat_read(PDEVICE_OBJECT DiskDevice, PIRP Irp, PIO_STACK_LOCATION IrpSp); +int spl_kstat_write(PDEVICE_OBJECT DiskDevice, PIRP Irp, PIO_STACK_LOCATION IrpSp); + +#endif /* _SPL_KSTAT_H */ diff --git a/include/os/windows/spl/sys/list.h b/include/os/windows/spl/sys/list.h new file mode 100644 index 000000000000..cfb9a7a92abd --- /dev/null +++ b/include/os/windows/spl/sys/list.h @@ -0,0 +1,149 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _SPL_LIST_H +#define _SPL_LIST_H + +#include + +/* + * NOTE: I have implemented the Solaris list API in terms of the native + * linux API. This has certain advantages in terms of leveraging the linux + * list debugging infrastructure, but it also means that the internals of a + * list differ slightly than on Solaris. This is not a problem as long as + * all callers stick to the published API. The two major differences are: + * + * 1) A list_node_t is mapped to a linux list_head struct which changes + * the name of the list_next/list_prev pointers to next/prev respectively. + * + * 2) A list_node_t which is not attached to a list on Solaris is denoted + * by having its list_next/list_prev pointers set to NULL. Under linux + * the next/prev pointers are set to LIST_POISON1 and LIST_POISON2 + * respectively. At this moment this only impacts the implementation + * of the list_link_init() and list_link_active() functions. 
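+ *
+ * As a usage sketch only (foo_t, its foo_link member, some_foo and the
+ * do_something()/free_foo() helpers are hypothetical names used purely for
+ * illustration), a caller embeds a list_node_t in its own structure and
+ * hands list_create() the size of that structure plus the offset of the
+ * embedded node:
+ *
+ *	typedef struct foo {
+ *		int		foo_data;
+ *		list_node_t	foo_link;
+ *	} foo_t;
+ *
+ *	list_t foolist;
+ *	foo_t *f;
+ *
+ *	list_create(&foolist, sizeof (foo_t), offsetof(foo_t, foo_link));
+ *	list_insert_tail(&foolist, some_foo);
+ *	for (f = list_head(&foolist); f != NULL; f = list_next(&foolist, f))
+ *		do_something(f);
+ *	while ((f = list_remove_head(&foolist)) != NULL)
+ *		free_foo(f);
+ *	list_destroy(&foolist);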
+ */ + +//typedef struct list_head list_node_t; +//#pragma pack(4) +typedef struct list_node { + struct list_node *list_next; + struct list_node *list_prev; +} list_node_t; + + + +typedef struct list { + size_t list_size; + size_t list_offset; + list_node_t list_head; +} list_t; +//#pragma pack() + +void list_create(list_t *, size_t, size_t); +void list_destroy(list_t *); + +void list_insert_after(list_t *, void *, void *); +void list_insert_before(list_t *, void *, void *); +void list_insert_head(list_t *, void *); +void list_insert_tail(list_t *, void *); +void list_remove(list_t *, void *); +void list_move_tail(list_t *, list_t *); + +void *list_head(list_t *); +void *list_tail(list_t *); +void *list_next(list_t *, void *); +void *list_prev(list_t *, void *); + +int list_link_active(list_node_t *); +int list_is_empty(list_t *); + +#define LIST_POISON1 NULL +#define LIST_POISON2 NULL + +//#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +//#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) +#define list_d2l(a, obj) ((list_node_t *)(((uint64_t)obj) + (uint64_t)(a)->list_offset)) +#define list_object(a, node) ((void *)(((uint64_t)node) - (uint64_t)(a)->list_offset)) +#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) + + +static inline void +list_link_init(list_node_t *node) +{ + node->list_next = LIST_POISON1; + node->list_prev = LIST_POISON2; +} + +static inline void +__list_del(list_node_t * prev, list_node_t * next) +{ + next->list_prev = prev; + prev->list_next = next; +} + +static inline void list_del(list_node_t *entry) +{ + __list_del(entry->list_prev, entry->list_next); + entry->list_next = LIST_POISON1; + entry->list_prev = LIST_POISON2; +} + +static inline void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.list_next; + if (head == &list->list_head) + return NULL; + + list_del(head); + return list_object(list, head); +} + +static inline void * +list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.list_prev; + if (tail == &list->list_head) + return NULL; + + list_del(tail); + return list_object(list, tail); +} + +static inline void +list_link_replace(list_node_t *old_node, list_node_t *new_node) +{ + ASSERT(list_link_active(old_node)); + ASSERT(!list_link_active(new_node)); + + new_node->list_next = old_node->list_next; + new_node->list_prev = old_node->list_prev; + old_node->list_prev->list_next = new_node; + old_node->list_next->list_prev = new_node; + list_link_init(old_node); +} + +#endif /* SPL_LIST_H */ diff --git a/include/os/windows/spl/sys/md5.h b/include/os/windows/spl/sys/md5.h new file mode 100644 index 000000000000..1781654cc96b --- /dev/null +++ b/include/os/windows/spl/sys/md5.h @@ -0,0 +1,70 @@ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Cleaned up version of the md5.h header file from RFC 1321. + */ + +/* + * MD5.H - header file for MD5C.C + */ + +/* + * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All + * rights reserved. + * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. 
MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. + * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. + */ + +#ifndef _SYS_MD5_H +#define _SYS_MD5_H + +#include /* for uint_* */ + +/* + * Definitions for MD5 hashing functions, conformant to RFC 1321 + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define MD5_DIGEST_LENGTH 16 + +/* MD5 context. */ +typedef struct { + uint32_t state[4]; /* state (ABCD) */ + uint32_t count[2]; /* number of bits, modulo 2^64 (lsb first) */ + union { + uint8_t buf8[64]; /* undigested input */ + uint32_t buf32[16]; /* realigned input */ + } buf_un; +} MD5_CTX; + +void MD5Init(MD5_CTX *); +void MD5Update(MD5_CTX *, const void *, unsigned int); +void MD5Final(void *, MD5_CTX *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MD5_H */ diff --git a/include/os/windows/spl/sys/md5_consts.h b/include/os/windows/spl/sys/md5_consts.h new file mode 100644 index 000000000000..e767cc3bd909 --- /dev/null +++ b/include/os/windows/spl/sys/md5_consts.h @@ -0,0 +1,133 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) by 1998 Sun Microsystems, Inc. + * All rights reserved. 
+ */ + +#ifndef _SYS_MD5_CONSTS_H +#define _SYS_MD5_CONSTS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* constants, as provided in RFC 1321 */ + +#define MD5_CONST_0 (uint32_t)0xd76aa478 +#define MD5_CONST_1 (uint32_t)0xe8c7b756 +#define MD5_CONST_2 (uint32_t)0x242070db +#define MD5_CONST_3 (uint32_t)0xc1bdceee +#define MD5_CONST_4 (uint32_t)0xf57c0faf +#define MD5_CONST_5 (uint32_t)0x4787c62a +#define MD5_CONST_6 (uint32_t)0xa8304613 +#define MD5_CONST_7 (uint32_t)0xfd469501 +#define MD5_CONST_8 (uint32_t)0x698098d8 +#define MD5_CONST_9 (uint32_t)0x8b44f7af +#define MD5_CONST_10 (uint32_t)0xffff5bb1 +#define MD5_CONST_11 (uint32_t)0x895cd7be +#define MD5_CONST_12 (uint32_t)0x6b901122 +#define MD5_CONST_13 (uint32_t)0xfd987193 +#define MD5_CONST_14 (uint32_t)0xa679438e +#define MD5_CONST_15 (uint32_t)0x49b40821 +#define MD5_CONST_16 (uint32_t)0xf61e2562 +#define MD5_CONST_17 (uint32_t)0xc040b340 +#define MD5_CONST_18 (uint32_t)0x265e5a51 +#define MD5_CONST_19 (uint32_t)0xe9b6c7aa +#define MD5_CONST_20 (uint32_t)0xd62f105d +#define MD5_CONST_21 (uint32_t)0x2441453 +#define MD5_CONST_22 (uint32_t)0xd8a1e681 +#define MD5_CONST_23 (uint32_t)0xe7d3fbc8 +#define MD5_CONST_24 (uint32_t)0x21e1cde6 +#define MD5_CONST_25 (uint32_t)0xc33707d6 +#define MD5_CONST_26 (uint32_t)0xf4d50d87 +#define MD5_CONST_27 (uint32_t)0x455a14ed +#define MD5_CONST_28 (uint32_t)0xa9e3e905 +#define MD5_CONST_29 (uint32_t)0xfcefa3f8 +#define MD5_CONST_30 (uint32_t)0x676f02d9 +#define MD5_CONST_31 (uint32_t)0x8d2a4c8a +#define MD5_CONST_32 (uint32_t)0xfffa3942 +#define MD5_CONST_33 (uint32_t)0x8771f681 +#define MD5_CONST_34 (uint32_t)0x6d9d6122 +#define MD5_CONST_35 (uint32_t)0xfde5380c +#define MD5_CONST_36 (uint32_t)0xa4beea44 +#define MD5_CONST_37 (uint32_t)0x4bdecfa9 +#define MD5_CONST_38 (uint32_t)0xf6bb4b60 +#define MD5_CONST_39 (uint32_t)0xbebfbc70 +#define MD5_CONST_40 (uint32_t)0x289b7ec6 +#define MD5_CONST_41 (uint32_t)0xeaa127fa +#define MD5_CONST_42 (uint32_t)0xd4ef3085 +#define MD5_CONST_43 (uint32_t)0x4881d05 +#define MD5_CONST_44 (uint32_t)0xd9d4d039 +#define MD5_CONST_45 (uint32_t)0xe6db99e5 +#define MD5_CONST_46 (uint32_t)0x1fa27cf8 +#define MD5_CONST_47 (uint32_t)0xc4ac5665 +#define MD5_CONST_48 (uint32_t)0xf4292244 +#define MD5_CONST_49 (uint32_t)0x432aff97 +#define MD5_CONST_50 (uint32_t)0xab9423a7 +#define MD5_CONST_51 (uint32_t)0xfc93a039 +#define MD5_CONST_52 (uint32_t)0x655b59c3 +#define MD5_CONST_53 (uint32_t)0x8f0ccc92 +#define MD5_CONST_54 (uint32_t)0xffeff47d +#define MD5_CONST_55 (uint32_t)0x85845dd1 +#define MD5_CONST_56 (uint32_t)0x6fa87e4f +#define MD5_CONST_57 (uint32_t)0xfe2ce6e0 +#define MD5_CONST_58 (uint32_t)0xa3014314 +#define MD5_CONST_59 (uint32_t)0x4e0811a1 +#define MD5_CONST_60 (uint32_t)0xf7537e82 +#define MD5_CONST_61 (uint32_t)0xbd3af235 +#define MD5_CONST_62 (uint32_t)0x2ad7d2bb +#define MD5_CONST_63 (uint32_t)0xeb86d391 + +/* initialization constants, as given in RFC 1321. used in MD5Init */ + +#define MD5_INIT_CONST_1 (uint32_t)0x67452301 +#define MD5_INIT_CONST_2 (uint32_t)0xefcdab89 +#define MD5_INIT_CONST_3 (uint32_t)0x98badcfe +#define MD5_INIT_CONST_4 (uint32_t)0x10325476 + +/* shift constants, as given in RFC 1321. 
used in MD5Transform */ + +#define MD5_SHIFT_11 7 +#define MD5_SHIFT_12 12 +#define MD5_SHIFT_13 17 +#define MD5_SHIFT_14 22 +#define MD5_SHIFT_21 5 +#define MD5_SHIFT_22 9 +#define MD5_SHIFT_23 14 +#define MD5_SHIFT_24 20 +#define MD5_SHIFT_31 4 +#define MD5_SHIFT_32 11 +#define MD5_SHIFT_33 16 +#define MD5_SHIFT_34 23 +#define MD5_SHIFT_41 6 +#define MD5_SHIFT_42 10 +#define MD5_SHIFT_43 15 +#define MD5_SHIFT_44 21 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MD5_CONSTS_H */ diff --git a/include/os/windows/spl/sys/mkdev.h b/include/os/windows/spl/sys/mkdev.h new file mode 100644 index 000000000000..5cecac9b6fb0 --- /dev/null +++ b/include/os/windows/spl/sys/mkdev.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_MKDEV_H +#define _SPL_MKDEV_H + +#endif /* SPL_MKDEV_H */ diff --git a/include/os/windows/spl/sys/mntent.h b/include/os/windows/spl/sys/mntent.h new file mode 100644 index 000000000000..6b79d1edf547 --- /dev/null +++ b/include/os/windows/spl/sys/mntent.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_MNTENT_H +#define _SPL_MNTENT_H + +#endif /* SPL_MNTENT_H */ diff --git a/include/os/windows/spl/sys/mode.h b/include/os/windows/spl/sys/mode.h new file mode 100644 index 000000000000..c2b091d37803 --- /dev/null +++ b/include/os/windows/spl/sys/mode.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_MODE_H +#define _SPL_MODE_H + +#endif /* SPL_MODE_H */ diff --git a/include/os/windows/spl/sys/mount.h b/include/os/windows/spl/sys/mount.h new file mode 100644 index 000000000000..318221529671 --- /dev/null +++ b/include/os/windows/spl/sys/mount.h @@ -0,0 +1,128 @@ + +#ifndef _SPL_MOUNT_H +#define _SPL_MOUNT_H + +//#undef vnode_t +//#include_next +//#define vnode_t struct vnode +#define MNT_WAIT 1 /* synchronized I/O file integrity completion */ +#define MNT_NOWAIT 2 /* start all I/O, but do not wait for it */ + +#define MNT_RDONLY 0x00000001 /* read only filesystem */ +#define MNT_SYNCHRONOUS 0x00000002 /* file system written synchronously */ +#define MNT_NOEXEC 0x00000004 /* can't exec from filesystem */ +#define MNT_NOSUID 0x00000008 /* don't honor setuid bits on fs */ +#define MNT_NODEV 0x00000010 /* don't interpret special files */ +#define MNT_UNION 0x00000020 /* union with underlying filesystem */ +#define MNT_ASYNC 0x00000040 /* file system written asynchronously */ +#define MNT_CPROTECT 0x00000080 /* file system supports content protection */ + +#define MNT_LOCAL 0x00001000 /* filesystem is stored locally */ +#define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ +#define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ +#define MNT_DOVOLFS 0x00008000 /* FS supports volfs (deprecated flag in Mac OS X 10.5) */ + +#define MNT_DONTBROWSE 0x00100000 /* file system is not appropriate path to user data */ +#define MNT_IGNORE_OWNERSHIP 0x00200000 /* VFS will ignore ownership information on filesystem objects */ +#define MNT_AUTOMOUNTED 0x00400000 /* filesystem was mounted by automounter */ +#define MNT_JOURNALED 0x00800000 /* filesystem is journaled */ +#define MNT_NOUSERXATTR 0x01000000 /* Don't allow user extended attributes */ +#define MNT_DEFWRITE 0x02000000 /* filesystem should defer writes */ +#define MNT_MULTILABEL 0x04000000 /* MAC support for individual labels */ +#define MNT_NOATIME 0x10000000 /* disable update of file access time */ + +#define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ +#define MNT_NOBLOCK 0x00020000 /* don't block unmount if not responding */ +#define MNT_RELOAD 0x00040000 /* reload filesystem data */ +#define MNT_FORCE 0x00080000 /* force unmount or 
readonly change */ +#define MNT_CMDFLAGS (MNT_UPDATE|MNT_NOBLOCK|MNT_RELOAD|MNT_FORCE) + +#define MNT_UNKNOWNPERMISSIONS MNT_IGNORE_OWNERSHIP + +#define MFSTYPENAMELEN 16 + +// Undo this OSX legacy +typedef struct fsid { int32_t val[2]; } fsid_t; + +//#pragma pack(4) + +struct vfsstatfs { + uint32_t f_bsize; /* fundamental file system block size */ + size_t f_iosize; /* optimal transfer block size */ + uint64_t f_blocks; /* total data blocks in file system */ + uint64_t f_bfree; /* free blocks in fs */ + uint64_t f_bavail; /* free blocks avail to non-superuser */ + uint64_t f_bused; /* used blocks in fs */ + uint64_t f_files; /* total file nodes in file system */ + uint64_t f_ffree; /* free file nodes in fs */ + fsid_t f_fsid; /* file system id */ + uid_t f_owner; /* user that mounted the filesystem */ + uint64_t f_flags; /* copy of mount exported flags */ + char f_fstypename[MFSTYPENAMELEN];/* fs type name inclus */ + char f_mntonname[MAXPATHLEN];/* directory on which mounted */ + char f_mntfromname[MAXPATHLEN];/* mounted filesystem */ + uint32_t f_fssubtype; /* fs sub-type (flavor) */ + void *f_reserved[2]; /* For future use == 0 */ +}; + +//#pragma pack() + +//enum mount_type { +// MOUNT_TYPE_DCB = 231, // diskObject (most entries not used, should be own struct?) +// MOUNT_TYPE_VCB // fsObject +//}; + +typedef enum _FSD_IDENTIFIER_TYPE { + MOUNT_TYPE_DGL = ':DGL', // Dokan Global + MOUNT_TYPE_DCB = ':DCB', // Disk Control Block + MOUNT_TYPE_VCB = ':VCB', // Volume Control Block + MOUNT_TYPE_FCB = ':FCB', // File Control Block + MOUNT_TYPE_CCB = ':CCB', // Context Control Block +} FSD_IDENTIFIER_TYPE; + + +typedef enum mount_type mount_type_t; + +struct mount +{ + FSD_IDENTIFIER_TYPE type; + ULONG size; +// mount_type_t type; + void *fsprivate; + void *parent_device; // Only set so vcd can find dcb + PDEVICE_OBJECT deviceObject; + PDEVICE_OBJECT diskDeviceObject; + UNICODE_STRING bus_name; + UNICODE_STRING device_name; + UNICODE_STRING symlink_name; + UNICODE_STRING fs_name; + UNICODE_STRING name; + UNICODE_STRING uuid; + UNICODE_STRING mountpoint; + boolean_t justDriveLetter; + uint64_t volume_opens; + PVPB vpb; + + uint64_t mountflags; + + // NotifySync is used by notify directory change + PNOTIFY_SYNC NotifySync; + LIST_ENTRY DirNotifyList; +}; +typedef struct mount mount_t; +#define LK_NOWAIT 1 + +int vfs_busy(mount_t *mp, int flags); +void vfs_unbusy(mount_t *mp); +int vfs_isrdonly(mount_t *mp); +void * vfs_fsprivate(mount_t *mp); +void vfs_setfsprivate(mount_t *mp, void *mntdata); +void vfs_clearflags(mount_t *mp, uint64_t flags); +void vfs_setflags(mount_t *mp, uint64_t flags); +struct vfsstatfs * vfs_statfs(mount_t *mp); +uint64_t vfs_flags(mount_t *mp); +void vfs_setlocklocal(mount_t *mp); +int vfs_typenum(mount_t *mp); +void vfs_getnewfsid(struct mount *mp); +int vfs_isunmount(mount_t *mp); +#endif /* SPL_MOUNT_H */ diff --git a/include/os/windows/spl/sys/mutex.h b/include/os/windows/spl/sys/mutex.h new file mode 100644 index 000000000000..c2b38a84dfe1 --- /dev/null +++ b/include/os/windows/spl/sys/mutex.h @@ -0,0 +1,98 @@ +/* +* CDDL HEADER START +* +* The contents of this file are subject to the terms of the +* Common Development and Distribution License (the "License"). +* You may not use this file except in compliance with the License. +* +* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +* or http://www.opensolaris.org/os/licensing. +* See the License for the specific language governing permissions +* and limitations under the License.
+* +* When distributing Covered Code, include this CDDL HEADER in each +* file and include the License file at usr/src/OPENSOLARIS.LICENSE. +* If applicable, add the following below this CDDL HEADER, with the +* fields enclosed by brackets "[]" replaced with your own identifying +* information: Portions Copyright [yyyy] [name of copyright owner] +* +* CDDL HEADER END +*/ + +/* +* +* Copyright (C) 2017 Jorgen Lundman +* +*/ + +#ifndef OSX_MUTEX_H +#define OSX_MUTEX_H + +#include <../spl_config.h> // For SPL_DEBUG_MUTEX + +#ifdef _KERNEL +//#include +//#include +//#include +#include +#include + +//#include +//#include +//#include +#include + + +typedef enum { + MUTEX_ADAPTIVE = 0, /* spin if owner is running, otherwise block */ + MUTEX_SPIN = 1, /* block interrupts and spin */ + MUTEX_DRIVER = 4, /* driver (DDI) mutex */ + MUTEX_DEFAULT = 6 /* kernel default mutex */ +} kmutex_type_t; + +typedef struct { + KEVENT opaque; +} mutex_t; + +/* + * Solaris kmutex defined. + * + * and is embedded into ZFS structures (see dbuf) so we need to match the + * size carefully. It appears to be 32 bytes. Or rather, it needs to be + * aligned. + */ + +typedef struct kmutex { + mutex_t m_lock; + void *m_owner; + unsigned int initialised; + unsigned int set_event_guard; +} kmutex_t; + + +#define MUTEX_HELD(x) (mutex_owned(x)) +#define MUTEX_NOT_HELD(x) (!mutex_owned(x)) + +#define mutex_init spl_mutex_init +void spl_mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc); + +#define mutex_enter spl_mutex_enter +void spl_mutex_enter(kmutex_t *mp); + +#define mutex_destroy spl_mutex_destroy +#define mutex_exit spl_mutex_exit +#define mutex_tryenter spl_mutex_tryenter +#define mutex_owned spl_mutex_owned +#define mutex_owner spl_mutex_owner + +void spl_mutex_destroy(kmutex_t *mp); +void spl_mutex_exit(kmutex_t *mp); +int spl_mutex_tryenter(kmutex_t *mp); +int spl_mutex_owned(kmutex_t *mp); +struct kthread *spl_mutex_owner(kmutex_t *mp); + +int spl_mutex_subsystem_init(void); +void spl_mutex_subsystem_fini(void); + +#endif // KERNEL +#endif diff --git a/include/os/windows/spl/sys/note.h b/include/os/windows/spl/sys/note.h new file mode 100644 index 000000000000..020031e55625 --- /dev/null +++ b/include/os/windows/spl/sys/note.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_NOTE_H +#define _SPL_NOTE_H + +#endif /* SPL_NOTE_H */ diff --git a/include/os/windows/spl/sys/old-taskq.h b/include/os/windows/spl/sys/old-taskq.h new file mode 100644 index 000000000000..b6cf4d4d8c4c --- /dev/null +++ b/include/os/windows/spl/sys/old-taskq.h @@ -0,0 +1,186 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + + +#ifndef _SPL_TASKQ_H +#define _SPL_TASKQ_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define TASKQ_NAMELEN 31 + +#define TASKQ_PREPOPULATE 0x00000001 +#define TASKQ_CPR_SAFE 0x00000002 +#define TASKQ_DYNAMIC 0x00000004 +#define TASKQ_THREADS_CPU_PCT 0x00000008 +#define TASKQ_DC_BATCH 0x00000010 + +typedef struct taskq taskq_t; + +typedef unsigned long taskqid_t; +typedef void (task_func_t)(void *); + +#define TQENT_FLAG_PREALLOC 0x1 + +/* + * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as + * KM_SLEEP/KM_NOSLEEP. TQ_NOQUEUE/TQ_NOALLOC are set particularly + * large so as not to conflict with already used GFP_* defines. + */ +#define TQ_SLEEP 0x00000000 +#define TQ_NOSLEEP 0x00000001 +#define TQ_PUSHPAGE 0x00000002 +#define TQ_NOQUEUE 0x01000000 +#define TQ_NOALLOC 0x02000000 +#define TQ_NEW 0x04000000 +#define TQ_FRONT 0x08000000 +#define TQ_ACTIVE 0x80000000 + +int spl_taskq_init(void); +void spl_taskq_fini(void); + +typedef struct taskq_bucket taskq_bucket_t; + +typedef struct taskq_ent { + struct taskq_ent *tqent_next; + struct taskq_ent *tqent_prev; + task_func_t *tqent_func; + void *tqent_arg; + uintptr_t tqent_flags; /* On Solaris this and next */ + taskq_bucket_t *tqent_bucket; /* is a union */ + kthread_t *tqent_thread; + kcondvar_t tqent_cv; + kmutex_t tqent_thread_lock; + kcondvar_t tqent_thread_cv; +} taskq_ent_t; + +/* + * Per-CPU hash bucket manages taskq_bent_t structures using freelist. + */ +struct taskq_bucket { + kmutex_t tqbucket_lock; + taskq_t *tqbucket_taskq; /* Enclosing taskq */ + taskq_ent_t tqbucket_freelist; + uint_t tqbucket_nalloc; /* # of allocated entries */ + uint_t tqbucket_nfree; /* # of free entries */ + kcondvar_t tqbucket_cv; + ushort_t tqbucket_flags; + hrtime_t tqbucket_totaltime; +}; + +/* + * Bucket flags. + */ +#define TQBUCKET_CLOSE 0x01 +#define TQBUCKET_SUSPEND 0x02 + +/* + * taskq implementation flags: bit range 16-31 + */ +#define TASKQ_ACTIVE 0x00010000 +#define TASKQ_SUSPENDED 0x00020000 +#define TASKQ_NOINSTANCE 0x00040000 + +struct taskq { + char tq_name[TASKQ_NAMELEN + 1]; + kmutex_t tq_lock; + krwlock_t tq_threadlock; + kcondvar_t tq_dispatch_cv; + kcondvar_t tq_wait_cv; + uint_t tq_flags; + int tq_active; + int tq_nthreads; + int tq_nalloc; + int tq_minalloc; + int tq_maxalloc; + taskq_ent_t *tq_freelist; + taskq_ent_t tq_task; + int tq_maxsize; + pri_t tq_pri; /* Scheduling priority */ + taskq_bucket_t *tq_buckets; /* Per-cpu array of buckets */ +#ifndef __APPLE__ + int tq_instance; +#endif /*!__APPLE__*/ + uint_t tq_nbuckets; /* # of buckets (2^n) */ + union { + kthread_t *_tq_thread; + kthread_t **_tq_threadlist; + } tq_thr; + /* + * Statistics. 
+ */ + hrtime_t tq_totaltime; /* Time spent processing tasks */ + int tq_tasks; /* Total # of tasks posted */ + int tq_executed; /* Total # of tasks executed */ + int tq_maxtasks; /* Max number of tasks in the queue */ + int tq_tcreates; + int tq_tdeaths; +}; + +#define tq_thread tq_thr._tq_thread +#define tq_threadlist tq_thr._tq_threadlist + +extern taskq_t *system_taskq; + +extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, + uint_t, clock_t); + +extern void nulltask(void *); // Maybe we don't need this? +extern void taskq_destroy(taskq_t *); +extern void taskq_wait(taskq_t *); +extern void taskq_suspend(taskq_t *); +extern int taskq_suspended(taskq_t *); +extern void taskq_resume(taskq_t *); +extern int taskq_member(taskq_t *, kthread_t *); + +#define taskq_create_proc(a, b, c, d, e, p, f) \ + (taskq_create(a, b, c, d, e, f)) +#define taskq_create_sysdc(a, b, d, e, p, dc, f) \ + (taskq_create(a, b, maxclsyspri, d, e, f)) + +extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); +extern int taskq_empty_ent(taskq_ent_t *); +extern void taskq_init_ent(taskq_ent_t *); + + + + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_TASKQ_H */ diff --git a/include/os/windows/spl/sys/open.h b/include/os/windows/spl/sys/open.h new file mode 100644 index 000000000000..791791d4e5a3 --- /dev/null +++ b/include/os/windows/spl/sys/open.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_OPEN_H +#define _SPL_OPEN_H + +#endif /* SPL_OPEN_H */ diff --git a/include/os/windows/spl/sys/param.h b/include/os/windows/spl/sys/param.h new file mode 100644 index 000000000000..cf1fbf0d26e6 --- /dev/null +++ b/include/os/windows/spl/sys/param.h @@ -0,0 +1,17 @@ + +#ifndef _SPL_PARAM_H +#define _SPL_PARAM_H + +//#include_next +//#include + +/* Pages to bytes and back */ +#define ptob(pages) (pages << PAGE_SHIFT) +#define btop(bytes) (bytes >> PAGE_SHIFT) +#ifndef howmany +#define howmany(x, y) ((((x) % (y)) == 0) ? ((x) / (y)) : (((x) / (y)) + 1)) +#endif + +#define MAXUID UINT32_MAX + +#endif /* SPL_PARAM_H */ diff --git a/include/os/windows/spl/sys/pathname.h b/include/os/windows/spl/sys/pathname.h new file mode 100644 index 000000000000..cbad71a7db15 --- /dev/null +++ b/include/os/windows/spl/sys/pathname.h @@ -0,0 +1,12 @@ + +#ifndef _SPL_PATHNAME_H +#define _SPL_PATHNAME_H + +typedef struct pathname { + char *pn_buf; /* underlying storage */ + char *pn_path; /* remaining pathname */ + size_t pn_pathlen; /* remaining length */ + size_t pn_bufsize; /* total size of pn_buf */ +} pathname_t; + +#endif /* SPL_PATHNAME_H */ diff --git a/include/os/windows/spl/sys/policy.h b/include/os/windows/spl/sys/policy.h new file mode 100644 index 000000000000..1b011782d1a1 --- /dev/null +++ b/include/os/windows/spl/sys/policy.h @@ -0,0 +1,120 @@ +/*****************************************************************************\ + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . 
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . +\*****************************************************************************/ + + + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#ifndef _SPL_POLICY_H +#define _SPL_POLICY_H + +#ifdef _KERNEL + +#include +#include + +struct vattr; + +int secpolicy_fs_unmount(cred_t *, struct mount *); +int secpolicy_nfs(const cred_t *); +int secpolicy_sys_config(const cred_t *, boolean_t); +int secpolicy_zfs(const cred_t *); +int secpolicy_zinject(const cred_t *); +//int secpolicy_vnode_setids_setgids(const cred_t *, gid_t); +//int secpolicy_vnode_setid_retain(const cred_t *, boolean_t); +//void secpolicy_setid_clear(struct vattr *, cred_t *); +int secpolicy_vnode_any_access(const cred_t *, struct vnode *, uid_t); +int secpolicy_vnode_access2(const cred_t *, struct vnode *, uid_t, mode_t, mode_t); +//int secpolicy_vnode_chown(const cred_t *, uid_t); +//int secpolicy_vnode_setdac(const cred_t *, uid_t); +//int secpolicy_vnode_remove(const cred_t *); +/* + * This function to be called from xxfs_setattr(). + * Must be called with the node's attributes read-write locked. + * + * cred_t * - acting credentials + * struct vnode * - vnode we're operating on + * struct vattr *va - new attributes, va_mask may be + * changed on return from a call + * struct vattr *oldva - old attributes, need include owner + * and mode only + * int flags - setattr flags + * int iaccess(void *node, int mode, cred_t *cr) + * - non-locking internal access function + * mode be checked + * w/ VREAD|VWRITE|VEXEC, not fs + * internal mode encoding. 
+ * + * void *node - internal node (inode, tmpnode) to + * pass as arg to iaccess + */ +int secpolicy_vnode_setattr(cred_t *, struct vnode *, vattr_t *, + const vattr_t *, int, int (void *, int, cred_t *), void *); + +//int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); +int secpolicy_vnode_stky_modify(const cred_t *); +int secpolicy_setid_setsticky_clear(struct vnode *vp, vattr_t *vap, + const vattr_t *ovap, cred_t *cr); +//int secpolicy_basic_link(const cred_t *); + +int secpolicy_vnode_remove(struct vnode *, const cred_t *); +int secpolicy_vnode_create_gid(const cred_t *); +int secpolicy_vnode_setids_setgids(struct vnode *, const cred_t *, gid_t); +int secpolicy_vnode_setdac(struct vnode *, const cred_t *, uid_t); +int secpolicy_vnode_chown(struct vnode *, const cred_t *, uid_t); +int secpolicy_vnode_setid_retain(struct vnode *, const cred_t *, boolean_t); +int secpolicy_xvattr(struct vnode *, vattr_t *, uid_t, const cred_t *, enum vtype); +int secpolicy_setid_clear(vattr_t *, struct vnode *, const cred_t *); +int secpolicy_basic_link(struct vnode *, const cred_t *); +int secpolicy_fs_mount_clearopts(const cred_t *, struct mount *); +int secpolicy_fs_mount(const cred_t *, struct vnode *, struct mount *); + +#endif /* _KERNEL */ + +#endif /* SPL_POLICY_H */ diff --git a/include/os/windows/spl/sys/pool.h b/include/os/windows/spl/sys/pool.h new file mode 100644 index 000000000000..e8fb9c75ec8d --- /dev/null +++ b/include/os/windows/spl/sys/pool.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_POOL_H +#define _SPL_POOL_H + +#include + +#endif /* SPL_POOL_H */ diff --git a/include/os/windows/spl/sys/priv.h b/include/os/windows/spl/sys/priv.h new file mode 100644 index 000000000000..8a3ddd543035 --- /dev/null +++ b/include/os/windows/spl/sys/priv.h @@ -0,0 +1,533 @@ +/*- + * Copyright (c) 2006 nCircle Network Security, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson for the TrustedBSD + * Project under contract to nCircle Network Security, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, + * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Privilege checking interface for BSD kernel. + */ +#ifndef _SPL_PRIV_H +#define _SPL_PRIV_H + +/* + * Privilege list, sorted loosely by kernel subsystem. 
+ * + * Think carefully before adding or reusing one of these privileges -- are + * there existing instances referring to the same privilege? Third party + * vendors may request the assignment of privileges to be used in loadable + * modules. Particular numeric privilege assignments are part of the + * loadable kernel module ABI, and should not be changed across minor + * releases. + * + * When adding a new privilege, remember to determine if it's appropriate for + * use in jail, and update the privilege switch in kern_jail.c as necessary. + */ + +/* + * Track beginning of privilege list. + */ +#define _PRIV_LOWEST 1 + +/* + * The remaining privileges typically correspond to one or a small + * number of specific privilege checks, and have (relatively) precise + * meanings. They are loosely sorted into a set of base system + * privileges, such as the ability to reboot, and then loosely by + * subsystem, indicated by a subsystem name. + */ +#define _PRIV_ROOT 1 /* Removed. */ +#define PRIV_ACCT 2 /* Manage process accounting. */ +#define PRIV_MAXFILES 3 /* Exceed system open files limit. */ +#define PRIV_MAXPROC 4 /* Exceed system processes limit. */ +#define PRIV_KTRACE 5 /* Set/clear KTRFAC_ROOT on ktrace. */ +#define PRIV_SETDUMPER 6 /* Configure dump device. */ +#define PRIV_REBOOT 8 /* Can reboot system. */ +#define PRIV_SWAPON 9 /* Can swapon(). */ +#define PRIV_SWAPOFF 10 /* Can swapoff(). */ +#define PRIV_MSGBUF 11 /* Can read kernel message buffer. */ +#define PRIV_IO 12 /* Can perform low-level I/O. */ +#define PRIV_KEYBOARD 13 /* Reprogram keyboard. */ +#define PRIV_DRIVER 14 /* Low-level driver privilege. */ +#define PRIV_ADJTIME 15 /* Set time adjustment. */ +#define PRIV_NTP_ADJTIME 16 /* Set NTP time adjustment. */ +#define PRIV_CLOCK_SETTIME 17 /* Can call clock_settime. */ +#define PRIV_SETTIMEOFDAY 18 /* Can call settimeofday. */ +#define _PRIV_SETHOSTID 19 /* Removed. */ +#define _PRIV_SETDOMAINNAME 20 /* Removed. */ + +/* + * Audit subsystem privileges. + */ +#define PRIV_AUDIT_CONTROL 40 /* Can configure audit. */ +#define PRIV_AUDIT_FAILSTOP 41 /* Can run during audit fail stop. */ +#define PRIV_AUDIT_GETAUDIT 42 /* Can get proc audit properties. */ +#define PRIV_AUDIT_SETAUDIT 43 /* Can set proc audit properties. */ +#define PRIV_AUDIT_SUBMIT 44 /* Can submit an audit record. */ + +/* + * Credential management privileges. + */ +#define PRIV_CRED_SETUID 50 /* setuid. */ +#define PRIV_CRED_SETEUID 51 /* seteuid to !ruid and !svuid. */ +#define PRIV_CRED_SETGID 52 /* setgid. */ +#define PRIV_CRED_SETEGID 53 /* setgid to !rgid and !svgid. */ +#define PRIV_CRED_SETGROUPS 54 /* Set process additional groups. */ +#define PRIV_CRED_SETREUID 55 /* setreuid. */ +#define PRIV_CRED_SETREGID 56 /* setregid. */ +#define PRIV_CRED_SETRESUID 57 /* setresuid. */ +#define PRIV_CRED_SETRESGID 58 /* setresgid. */ +#define PRIV_SEEOTHERGIDS 59 /* Exempt bsd.seeothergids. */ +#define PRIV_SEEOTHERUIDS 60 /* Exempt bsd.seeotheruids. */ + +/* + * Debugging privileges. + */ +#define PRIV_DEBUG_DIFFCRED 80 /* Exempt debugging other users. */ +#define PRIV_DEBUG_SUGID 81 /* Exempt debugging setuid proc. */ +#define PRIV_DEBUG_UNPRIV 82 /* Exempt unprivileged debug limit. */ +#define PRIV_DEBUG_DENIED 83 /* Exempt P2_NOTRACE. */ + +/* + * Dtrace privileges. + */ +#define PRIV_DTRACE_KERNEL 90 /* Allow use of DTrace on the kernel. */ +#define PRIV_DTRACE_PROC 91 /* Allow attaching DTrace to process. */ +#define PRIV_DTRACE_USER 92 /* Process may submit DTrace events. 
*/ + +/* + * Firmware privilegs. + */ +#define PRIV_FIRMWARE_LOAD 100 /* Can load firmware. */ + +/* + * Jail privileges. + */ +#define PRIV_JAIL_ATTACH 110 /* Attach to a jail. */ +#define PRIV_JAIL_SET 111 /* Set jail parameters. */ +#define PRIV_JAIL_REMOVE 112 /* Remove a jail. */ + +/* + * Kernel environment priveleges. + */ +#define PRIV_KENV_SET 120 /* Set kernel env. variables. */ +#define PRIV_KENV_UNSET 121 /* Unset kernel env. variables. */ + +/* + * Loadable kernel module privileges. + */ +#define PRIV_KLD_LOAD 130 /* Load a kernel module. */ +#define PRIV_KLD_UNLOAD 131 /* Unload a kernel module. */ + +/* + * Privileges associated with the MAC Framework and specific MAC policy + * modules. + */ +#define PRIV_MAC_PARTITION 140 /* Privilege in mac_partition policy. */ +#define PRIV_MAC_PRIVS 141 /* Privilege in the mac_privs policy. */ + +/* + * Process-related privileges. + */ +#define PRIV_PROC_LIMIT 160 /* Exceed user process limit. */ +#define PRIV_PROC_SETLOGIN 161 /* Can call setlogin. */ +#define PRIV_PROC_SETRLIMIT 162 /* Can raise resources limits. */ +#define PRIV_PROC_SETLOGINCLASS 163 /* Can call setloginclass(2). */ + +/* + * System V IPC privileges. + */ +#define PRIV_IPC_READ 170 /* Can override IPC read perm. */ +#define PRIV_IPC_WRITE 171 /* Can override IPC write perm. */ +#define PRIV_IPC_ADMIN 172 /* Can override IPC owner-only perm. */ +#define PRIV_IPC_MSGSIZE 173 /* Exempt IPC message queue limit. */ + +/* + * POSIX message queue privileges. + */ +#define PRIV_MQ_ADMIN 180 /* Can override msgq owner-only perm. */ + +/* + * Performance monitoring counter privileges. + */ +#define PRIV_PMC_MANAGE 190 /* Can administer PMC. */ +#define PRIV_PMC_SYSTEM 191 /* Can allocate a system-wide PMC. */ + +/* + * Scheduling privileges. + */ +#define PRIV_SCHED_DIFFCRED 200 /* Exempt scheduling other users. */ +#define PRIV_SCHED_SETPRIORITY 201 /* Can set lower nice value for proc. */ +#define PRIV_SCHED_RTPRIO 202 /* Can set real time scheduling. */ +#define PRIV_SCHED_SETPOLICY 203 /* Can set scheduler policy. */ +#define PRIV_SCHED_SET 204 /* Can set thread scheduler. */ +#define PRIV_SCHED_SETPARAM 205 /* Can set thread scheduler params. */ +#define PRIV_SCHED_CPUSET 206 /* Can manipulate cpusets. */ +#define PRIV_SCHED_CPUSET_INTR 207 /* Can adjust IRQ to CPU binding. */ + +/* + * POSIX semaphore privileges. + */ +#define PRIV_SEM_WRITE 220 /* Can override sem write perm. */ + +/* + * Signal privileges. + */ +#define PRIV_SIGNAL_DIFFCRED 230 /* Exempt signalling other users. */ +#define PRIV_SIGNAL_SUGID 231 /* Non-conserv signal setuid proc. */ + +/* + * Sysctl privileges. + */ +#define PRIV_SYSCTL_DEBUG 240 /* Can invoke sysctl.debug. */ +#define PRIV_SYSCTL_WRITE 241 /* Can write sysctls. */ +#define PRIV_SYSCTL_WRITEJAIL 242 /* Can write sysctls, jail permitted. */ + +/* + * TTY privileges. + */ +#define PRIV_TTY_CONSOLE 250 /* Set console to tty. */ +#define PRIV_TTY_DRAINWAIT 251 /* Set tty drain wait time. */ +#define PRIV_TTY_DTRWAIT 252 /* Set DTR wait on tty. */ +#define PRIV_TTY_EXCLUSIVE 253 /* Override tty exclusive flag. */ +#define _PRIV_TTY_PRISON 254 /* Removed. */ +#define PRIV_TTY_STI 255 /* Simulate input on another tty. */ +#define PRIV_TTY_SETA 256 /* Set tty termios structure. */ + +/* + * UFS-specific privileges. + */ +#define PRIV_UFS_EXTATTRCTL 270 /* Can configure EAs on UFS1. */ +#define PRIV_UFS_QUOTAOFF 271 /* quotaoff(). */ +#define PRIV_UFS_QUOTAON 272 /* quotaon(). */ +#define PRIV_UFS_SETUSE 273 /* setuse(). 
*/ + +/* + * ZFS-specific privileges. + */ +#define PRIV_ZFS_POOL_CONFIG 280 /* Can configure ZFS pools. */ +#define PRIV_ZFS_INJECT 281 /* Can inject faults in the ZFS fault + injection framework. */ +#define PRIV_ZFS_JAIL 282 /* Can attach/detach ZFS file systems + to/from jails. */ + +/* + * NFS-specific privileges. + */ +#define PRIV_NFS_DAEMON 290 /* Can become the NFS daemon. */ +#define PRIV_NFS_LOCKD 291 /* Can become NFS lock daemon. */ + +/* + * VFS privileges. + */ +#define PRIV_VFS_READ 310 /* Override vnode DAC read perm. */ +#define PRIV_VFS_WRITE 311 /* Override vnode DAC write perm. */ +#define PRIV_VFS_ADMIN 312 /* Override vnode DAC admin perm. */ +#define PRIV_VFS_EXEC 313 /* Override vnode DAC exec perm. */ +#define PRIV_VFS_LOOKUP 314 /* Override vnode DAC lookup perm. */ +#define PRIV_VFS_BLOCKRESERVE 315 /* Can use free block reserve. */ +#define PRIV_VFS_CHFLAGS_DEV 316 /* Can chflags() a device node. */ +#define PRIV_VFS_CHOWN 317 /* Can set user; group to non-member. */ +#define PRIV_VFS_CHROOT 318 /* chroot(). */ +#define PRIV_VFS_RETAINSUGID 319 /* Can retain sugid bits on change. */ +#define PRIV_VFS_EXCEEDQUOTA 320 /* Exempt from quota restrictions. */ +#define PRIV_VFS_EXTATTR_SYSTEM 321 /* Operate on system EA namespace. */ +#define PRIV_VFS_FCHROOT 322 /* fchroot(). */ +#define PRIV_VFS_FHOPEN 323 /* Can fhopen(). */ +#define PRIV_VFS_FHSTAT 324 /* Can fhstat(). */ +#define PRIV_VFS_FHSTATFS 325 /* Can fhstatfs(). */ +#define PRIV_VFS_GENERATION 326 /* stat() returns generation number. */ +#define PRIV_VFS_GETFH 327 /* Can retrieve file handles. */ +#define PRIV_VFS_GETQUOTA 328 /* getquota(). */ +#define PRIV_VFS_LINK 329 /* bsd.hardlink_check_uid */ +#define PRIV_VFS_MKNOD_BAD 330 /* Can mknod() to mark bad inodes. */ +#define PRIV_VFS_MKNOD_DEV 331 /* Can mknod() to create dev nodes. */ +#define PRIV_VFS_MKNOD_WHT 332 /* Can mknod() to create whiteout. */ +#define PRIV_VFS_MOUNT 333 /* Can mount(). */ +#define PRIV_VFS_MOUNT_OWNER 334 /* Can manage other users' file systems. */ +#define PRIV_VFS_MOUNT_EXPORTED 335 /* Can set MNT_EXPORTED on mount. */ +#define PRIV_VFS_MOUNT_PERM 336 /* Override dev node perms at mount. */ +#define PRIV_VFS_MOUNT_SUIDDIR 337 /* Can set MNT_SUIDDIR on mount. */ +#define PRIV_VFS_MOUNT_NONUSER 338 /* Can perform a non-user mount. */ +#define PRIV_VFS_SETGID 339 /* Can setgid if not in group. */ +#define PRIV_VFS_SETQUOTA 340 /* setquota(). */ +#define PRIV_VFS_STICKYFILE 341 /* Can set sticky bit on file. */ +#define PRIV_VFS_SYSFLAGS 342 /* Can modify system flags. */ +#define PRIV_VFS_UNMOUNT 343 /* Can unmount(). */ +#define PRIV_VFS_STAT 344 /* Override vnode MAC stat perm. */ + +/* + * Virtual memory privileges. + */ +#define PRIV_VM_MADV_PROTECT 360 /* Can set MADV_PROTECT. */ +#define PRIV_VM_MLOCK 361 /* Can mlock(), mlockall(). */ +#define PRIV_VM_MUNLOCK 362 /* Can munlock(), munlockall(). */ +#define PRIV_VM_SWAP_NOQUOTA 363 /* + * Can override the global + * swap reservation limits. + */ +#define PRIV_VM_SWAP_NORLIMIT 364 /* + * Can override the per-uid + * swap reservation limits. + */ + +/* + * Device file system privileges. + */ +#define PRIV_DEVFS_RULE 370 /* Can manage devfs rules. */ +#define PRIV_DEVFS_SYMLINK 371 /* Can create symlinks in devfs. */ + +/* + * Random number generator privileges. + */ +#define PRIV_RANDOM_RESEED 380 /* Closing /dev/random reseeds. */ + +/* + * Network stack privileges. + */ +#define PRIV_NET_BRIDGE 390 /* Administer bridge. */ +#define PRIV_NET_GRE 391 /* Administer GRE. 
*/ +#define _PRIV_NET_PPP 392 /* Removed. */ +#define _PRIV_NET_SLIP 393 /* Removed. */ +#define PRIV_NET_BPF 394 /* Monitor BPF. */ +#define PRIV_NET_RAW 395 /* Open raw socket. */ +#define PRIV_NET_ROUTE 396 /* Administer routing. */ +#define PRIV_NET_TAP 397 /* Can open tap device. */ +#define PRIV_NET_SETIFMTU 398 /* Set interface MTU. */ +#define PRIV_NET_SETIFFLAGS 399 /* Set interface flags. */ +#define PRIV_NET_SETIFCAP 400 /* Set interface capabilities. */ +#define PRIV_NET_SETIFNAME 401 /* Set interface name. */ +#define PRIV_NET_SETIFMETRIC 402 /* Set interface metrics. */ +#define PRIV_NET_SETIFPHYS 403 /* Set interface physical layer prop. */ +#define PRIV_NET_SETIFMAC 404 /* Set interface MAC label. */ +#define PRIV_NET_ADDMULTI 405 /* Add multicast addr. to ifnet. */ +#define PRIV_NET_DELMULTI 406 /* Delete multicast addr. from ifnet. */ +#define PRIV_NET_HWIOCTL 407 /* Issue hardware ioctl on ifnet. */ +#define PRIV_NET_SETLLADDR 408 /* Set interface link-level address. */ +#define PRIV_NET_ADDIFGROUP 409 /* Add new interface group. */ +#define PRIV_NET_DELIFGROUP 410 /* Delete interface group. */ +#define PRIV_NET_IFCREATE 411 /* Create cloned interface. */ +#define PRIV_NET_IFDESTROY 412 /* Destroy cloned interface. */ +#define PRIV_NET_ADDIFADDR 413 /* Add protocol addr to interface. */ +#define PRIV_NET_DELIFADDR 414 /* Delete protocol addr on interface. */ +#define PRIV_NET_LAGG 415 /* Administer lagg interface. */ +#define PRIV_NET_GIF 416 /* Administer gif interface. */ +#define PRIV_NET_SETIFVNET 417 /* Move interface to vnet. */ +#define PRIV_NET_SETIFDESCR 418 /* Set interface description. */ +#define PRIV_NET_SETIFFIB 419 /* Set interface fib. */ +#define PRIV_NET_VXLAN 420 /* Administer vxlan. */ + +/* + * 802.11-related privileges. + */ +#define PRIV_NET80211_GETKEY 440 /* Query 802.11 keys. */ +#define PRIV_NET80211_MANAGE 441 /* Administer 802.11. */ + +/* + * Placeholder for AppleTalk privileges, not supported anymore. + */ +#define _PRIV_NETATALK_RESERVEDPORT 450 /* Bind low port number. */ + +/* + * ATM privileges. + */ +#define PRIV_NETATM_CFG 460 +#define PRIV_NETATM_ADD 461 +#define PRIV_NETATM_DEL 462 +#define PRIV_NETATM_SET 463 + +/* + * Bluetooth privileges. + */ +#define PRIV_NETBLUETOOTH_RAW 470 /* Open raw bluetooth socket. */ + +/* + * Netgraph and netgraph module privileges. + */ +#define PRIV_NETGRAPH_CONTROL 480 /* Open netgraph control socket. */ +#define PRIV_NETGRAPH_TTY 481 /* Configure tty for netgraph. */ + +/* + * IPv4 and IPv6 privileges. + */ +#define PRIV_NETINET_RESERVEDPORT 490 /* Bind low port number. */ +#define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ +#define PRIV_NETINET_DIVERT 492 /* Open IP divert socket. */ +#define PRIV_NETINET_PF 493 /* Administer pf firewall. */ +#define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. */ +#define PRIV_NETINET_CARP 495 /* Administer CARP. */ +#define PRIV_NETINET_MROUTE 496 /* Administer multicast routing. */ +#define PRIV_NETINET_RAW 497 /* Open netinet raw socket. */ +#define PRIV_NETINET_GETCRED 498 /* Query netinet pcb credentials. */ +#define PRIV_NETINET_ADDRCTRL6 499 /* Administer IPv6 address scopes. */ +#define PRIV_NETINET_ND6 500 /* Administer IPv6 neighbor disc. */ +#define PRIV_NETINET_SCOPE6 501 /* Administer IPv6 address scopes. */ +#define PRIV_NETINET_ALIFETIME6 502 /* Administer IPv6 address lifetimes. */ +#define PRIV_NETINET_IPSEC 503 /* Administer IPSEC. */ +#define PRIV_NETINET_REUSEPORT 504 /* Allow [rapid] port/address reuse. 
*/ +#define PRIV_NETINET_SETHDROPTS 505 /* Set certain IPv4/6 header options. */ +#define PRIV_NETINET_BINDANY 506 /* Allow bind to any address. */ +#define PRIV_NETINET_HASHKEY 507 /* Get and set hash keys for IPv4/6. */ + +/* + * Placeholders for IPX/SPX privileges, not supported any more. + */ +#define _PRIV_NETIPX_RESERVEDPORT 520 /* Bind low port number. */ +#define _PRIV_NETIPX_RAW 521 /* Open netipx raw socket. */ + +/* + * NCP privileges. + */ +#define PRIV_NETNCP 530 /* Use another user's connection. */ + +/* + * SMB privileges. + */ +#define PRIV_NETSMB 540 /* Use another user's connection. */ + +/* + * VM86 privileges. + */ +#define PRIV_VM86_INTCALL 550 /* Allow invoking vm86 int handlers. */ + +/* + * Set of reserved privilege values, which will be allocated to code as + * needed, in order to avoid renumbering later privileges due to insertion. + */ +#define _PRIV_RESERVED0 560 +#define _PRIV_RESERVED1 561 +#define _PRIV_RESERVED2 562 +#define _PRIV_RESERVED3 563 +#define _PRIV_RESERVED4 564 +#define _PRIV_RESERVED5 565 +#define _PRIV_RESERVED6 566 +#define _PRIV_RESERVED7 567 +#define _PRIV_RESERVED8 568 +#define _PRIV_RESERVED9 569 +#define _PRIV_RESERVED10 570 +#define _PRIV_RESERVED11 571 +#define _PRIV_RESERVED12 572 +#define _PRIV_RESERVED13 573 +#define _PRIV_RESERVED14 574 +#define _PRIV_RESERVED15 575 + +/* + * Define a set of valid privilege numbers that can be used by loadable + * modules that don't yet have privilege reservations. Ideally, these should + * not be used, since their meaning is opaque to any policies that are aware + * of specific privileges, such as jail, and as such may be arbitrarily + * denied. + */ +#define PRIV_MODULE0 600 +#define PRIV_MODULE1 601 +#define PRIV_MODULE2 602 +#define PRIV_MODULE3 603 +#define PRIV_MODULE4 604 +#define PRIV_MODULE5 605 +#define PRIV_MODULE6 606 +#define PRIV_MODULE7 607 +#define PRIV_MODULE8 608 +#define PRIV_MODULE9 609 +#define PRIV_MODULE10 610 +#define PRIV_MODULE11 611 +#define PRIV_MODULE12 612 +#define PRIV_MODULE13 613 +#define PRIV_MODULE14 614 +#define PRIV_MODULE15 615 + +/* + * DDB(4) privileges. + */ +#define PRIV_DDB_CAPTURE 620 /* Allow reading of DDB capture log. */ + +/* + * Arla/nnpfs privileges. + */ +#define PRIV_NNPFS_DEBUG 630 /* Perforn ARLA_VIOC_NNPFSDEBUG. */ + +/* + * cpuctl(4) privileges. + */ +#define PRIV_CPUCTL_WRMSR 640 /* Write model-specific register. */ +#define PRIV_CPUCTL_UPDATE 641 /* Update cpu microcode. */ + +/* + * Capi4BSD privileges. + */ +#define PRIV_C4B_RESET_CTLR 650 /* Load firmware, reset controller. */ +#define PRIV_C4B_TRACE 651 /* Unrestricted CAPI message tracing. */ + +/* + * OpenAFS privileges. + */ +#define PRIV_AFS_ADMIN 660 /* Can change AFS client settings. */ +#define PRIV_AFS_DAEMON 661 /* Can become the AFS daemon. */ + +/* + * Resource Limits privileges. + */ +#define PRIV_RCTL_GET_RACCT 670 +#define PRIV_RCTL_GET_RULES 671 +#define PRIV_RCTL_GET_LIMITS 672 +#define PRIV_RCTL_ADD_RULE 673 +#define PRIV_RCTL_REMOVE_RULE 674 + +/* + * mem(4) privileges. + */ +#define PRIV_KMEM_READ 680 /* Open mem/kmem for reading. */ +#define PRIV_KMEM_WRITE 681 /* Open mem/kmem for writing. */ + +/* + * Track end of privilege list. + */ +#define _PRIV_HIGHEST 682 + +/* + * Validate that a named privilege is known by the privilege system. Invalid + * privileges presented to the privilege system by a priv_check interface + * will result in a panic. This is only approximate due to sparse allocation + * of the privilege space. 
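+ * For example, PRIV_VALID(PRIV_KMEM_WRITE) (681) is true while
+ * PRIV_VALID(_PRIV_HIGHEST) (682) is not; note that values falling in an
+ * unallocated gap between the ranges above (e.g. 345) still pass the check,
+ * which is why it is only approximate.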
+ */ +#define PRIV_VALID(x) ((x) > _PRIV_LOWEST && (x) < _PRIV_HIGHEST) + +#ifdef _KERNEL +/* + * Privilege check interfaces, modeled after historic suser() interfaces, but + * with the addition of a specific privilege name. No flags are currently + * defined for the API. Historically, flags specified using the real uid + * instead of the effective uid, and whether or not the check should be + * allowed in jail. + */ +struct thread; +struct ucred; +int priv_check(struct thread *td, int priv); +int priv_check_cred(struct ucred *cred, int priv, int flags); +#endif + +#endif /* _SPL_PRIV_H */ diff --git a/include/os/windows/spl/sys/priv_impl.h b/include/os/windows/spl/sys/priv_impl.h new file mode 100644 index 000000000000..dadbf32f021f --- /dev/null +++ b/include/os/windows/spl/sys/priv_impl.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_PRIV_IMPL_H +#define _SPL_PRIV_IMPL_H + +#endif /* _SPL_PRIV_IMPL_H */ diff --git a/include/os/windows/spl/sys/proc.h b/include/os/windows/spl/sys/proc.h new file mode 100644 index 000000000000..16b160960505 --- /dev/null +++ b/include/os/windows/spl/sys/proc.h @@ -0,0 +1,14 @@ + +#ifndef _SPL_PROC_H +#define _SPL_PROC_H + +//#include +//#include_next +//#include +typedef struct proc { void *something; } proc_t; +extern proc_t p0; /* process 0 */ +#define current_proc PsGetCurrentProcess + +#define current_proc PsGetCurrentProcess + +#endif /* SPL_PROC_H */ diff --git a/include/os/windows/spl/sys/processor.h b/include/os/windows/spl/sys/processor.h new file mode 100644 index 000000000000..5088f327becd --- /dev/null +++ b/include/os/windows/spl/sys/processor.h @@ -0,0 +1,11 @@ + +#ifndef _SPL_PROCESSOR_H +#define _SPL_PROCESSOR_H + +#include + +extern uint32_t getcpuid(); + +typedef int processorid_t; + +#endif /* _SPL_PROCESSOR_H */ diff --git a/include/os/windows/spl/sys/pset.h b/include/os/windows/spl/sys/pset.h new file mode 100644 index 000000000000..4406e1c1071a --- /dev/null +++ b/include/os/windows/spl/sys/pset.h @@ -0,0 +1,15 @@ + +#ifndef _SPL_PSET_H +#define _SPL_PSET_H + +typedef int psetid_t; + +/* special processor set id's */ +#define PS_NONE -1 +#define PS_QUERY -2 +#define PS_MYID -3 +#define PS_SOFT -4 +#define PS_HARD -5 +#define PS_QUERY_TYPE -6 + +#endif /* SPL_PSET_H */ diff --git a/include/os/windows/spl/sys/random.h b/include/os/windows/spl/sys/random.h new file mode 100644 index 000000000000..a7070280e511 --- /dev/null +++ b/include/os/windows/spl/sys/random.h @@ -0,0 +1,37 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#ifndef _SPL_RANDOM_H +#define _SPL_RANDOM_H + +//#include_next + + +extern int random_get_bytes(uint8_t *ptr, uint32_t len); +#define random_get_pseudo_bytes random_get_bytes + +#endif /* _SPL_RANDOM_H */ diff --git a/include/os/windows/spl/sys/refstr.h b/include/os/windows/spl/sys/refstr.h new file mode 100644 index 000000000000..2b640f193574 --- /dev/null +++ b/include/os/windows/spl/sys/refstr.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_REFSTR_H +#define _SPL_REFSTR_H + +#endif /* SPL_REFSTR_H */ diff --git a/include/os/windows/spl/sys/resource.h b/include/os/windows/spl/sys/resource.h new file mode 100644 index 000000000000..e6234c3d014a --- /dev/null +++ b/include/os/windows/spl/sys/resource.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_RESOURCE_H +#define _SPL_RESOURCE_H + +//#include_next + +#endif /* SPL_RESOURCE_H */ diff --git a/include/os/windows/spl/sys/rwlock.h b/include/os/windows/spl/sys/rwlock.h new file mode 100644 index 000000000000..7c050328b4b9 --- /dev/null +++ b/include/os/windows/spl/sys/rwlock.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2017 Jorgen Lundman + */ + +#ifndef _SPL_RWLOCK_H +#define _SPL_RWLOCK_H + +#include +//#include + +typedef enum { + RW_DRIVER = 2, + RW_DEFAULT = 4 +} krw_type_t; + +typedef enum { + RW_NONE = 0, + RW_WRITER = 1, + RW_READER = 2 +} krw_t; + +struct krwlock { + ERESOURCE rw_lock; /* opaque data */ + void *rw_owner; /* writer (exclusive) lock only */ + int rw_readers; /* reader lock only */ + int rw_pad; /* */ +}; +typedef struct krwlock krwlock_t; + +#define RW_WRITE_HELD(x) (rw_write_held((x))) +#define RW_LOCK_HELD(x) (rw_lock_held((x))) + +extern void rw_init(krwlock_t *, char *, krw_type_t, void *); +extern void rw_destroy(krwlock_t *); +extern void rw_enter(krwlock_t *, krw_t); +extern int rw_tryenter(krwlock_t *, krw_t); +extern void rw_exit(krwlock_t *); +extern void rw_downgrade(krwlock_t *); +extern int rw_tryupgrade(krwlock_t *); +extern int rw_write_held(krwlock_t *); +extern int rw_lock_held(krwlock_t *); +extern int rw_isinit(krwlock_t *); + +int spl_rwlock_init(void); +void spl_rwlock_fini(void); + +#endif /* _SPL_RWLOCK_H */ diff --git a/include/os/windows/spl/sys/sdt.h b/include/os/windows/spl/sys/sdt.h new file mode 100644 index 000000000000..d6a826c58c8b --- /dev/null +++ b/include/os/windows/spl/sys/sdt.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_SDT_H +#define _SPL_SDT_H + +#endif /* SPL_SDT_H */ diff --git a/include/os/windows/spl/sys/seg_kmem.h b/include/os/windows/spl/sys/seg_kmem.h new file mode 100644 index 000000000000..a8861d9edd9f --- /dev/null +++ b/include/os/windows/spl/sys/seg_kmem.h @@ -0,0 +1,150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _VM_SEG_KMEM_H +#define _VM_SEG_KMEM_H + + +#ifdef __cplusplus +extern "C" { +#endif + +//#include +//#include +#include +//#include +//#include +//#include + + /* + * VM - Kernel Segment Driver + */ + +#if defined(_KERNEL) + + extern uint64_t segkmem_total_allocated; + +// extern char *kernelheap; /* start of primary kernel heap */ +// extern char *ekernelheap; /* end of primary kernel heap */ +// extern char *heap_lp_base; /* start of kernel large page heap arena */ +// extern char *heap_lp_end; /* end of kernel large page heap arena */ +// extern struct seg kvseg; /* primary kernel heap segment */ +// extern struct seg kvseg_core; /* "core" kernel heap segment */ +// extern struct seg kzioseg; /* Segment for zio mappings */ +// extern vmem_t *heap_lp_arena; /* kernel large page heap arena */ +// extern vmem_t *heap_arena; /* primary kernel heap arena */ +// extern vmem_t *hat_memload_arena; /* HAT translation arena */ +// extern struct seg kvseg32; /* 32-bit kernel heap segment */ +// extern vmem_t *heap32_arena; /* 32-bit kernel heap arena */ +// extern vmem_t *heaptext_arena; /* kernel text arena, from heap */ +// extern struct as kas; /* kernel address space */ +// extern int segkmem_reloc; /* enable/disable segkmem relocatable pages */ +// extern vmem_t *static_arena; /* arena for caches to import static memory */ +// extern vmem_t *static_alloc_arena; /* arena for allocating static memory */ + extern vmem_t *zio_arena_parent; /* qcaching for zio arenas and abd arena */ + extern vmem_t *zio_arena; /* arena for zio caches for file blocks */ + extern vmem_t *zio_metadata_arena; /* arena for zio caches for (zfs) metadata blocks */ +// extern struct vnode kvps[]; + /* + * segkmem page vnodes + */ +#define kvp (kvps[KV_KVP]) +#define zvp (kvps[KV_ZVP]) +#if defined(__sparc) +#define mpvp (kvps[KV_MPVP]) +#define promvp (kvps[KV_PROMVP]) +#endif /* __sparc */ + +// extern int segkmem_create(struct seg *); +// extern page_t *segkmem_page_create(void *, uint32_t, int, void *); +// extern void *segkmem_xalloc(vmem_t *, void *, uint32_t, int, uint_t, +// page_t *(*page_create_func)(void *, uint32_t, int, void *), void *); + void *segkmem_alloc(vmem_t *, uint32_t, int); +// extern void *segkmem_alloc_permanent(vmem_t *, uint32_t, int); + extern void segkmem_free(vmem_t *, void *, uint32_t); +// extern void segkmem_xfree(vmem_t *, void *, uint32_t, void (*)(page_t *)); + +// extern void *boot_alloc(void *, uint32_t, uint_t); +// extern void boot_mapin(caddr_t addr, uint32_t size); + extern void kernelheap_init(void); + extern void kernelheap_fini(void); +// extern void segkmem_gc(void); + + extern void *segkmem_zio_alloc(vmem_t *, uint32_t, int); +// extern int segkmem_zio_create(struct seg *); + extern void segkmem_zio_free(vmem_t *, void *, uint32_t); + extern void segkmem_zio_init(void); + extern void segkmem_zio_fini(void); + + /* + * Flags for segkmem_xalloc(). + * + * SEGKMEM_SHARELOCKED requests pages which are locked SE_SHARED to be + * returned rather than unlocked which is now the default. Note that + * memory returned by SEGKMEM_SHARELOCKED cannot be freed by segkmem_free(). + * This is a hack for seg_dev that should be cleaned up in the future. 
+ */ +#define SEGKMEM_SHARELOCKED 0x20000 + + /* + * Large page for kmem caches support + */ +// typedef struct segkmem_lpcb { +// kmutex_t lp_lock; +// kcondvar_t lp_cv; +// uint_t lp_wait; +// uint_t lp_uselp; +// ulong_t lp_throttle; + + /* stats */ +// uint64_t sleep_allocs_failed; +// uint64_t nosleep_allocs_failed; +// uint64_t allocs_throttled; +// uint64_t allocs_limited; +// uint64_t alloc_bytes_failed; +// } segkmem_lpcb_t; + +// extern void *segkmem_alloc_lp(vmem_t *, uint32_t *, uint32_t, int); +// extern void segkmem_free_lp(vmem_t *, void *, uint32_t); +// extern int segkmem_lpsetup(); +// extern void segkmem_heap_lp_init(void); + +// extern uint32_t segkmem_lpsize; +// extern int segkmem_lpszc; +// extern uint32_t segkmem_heaplp_quantum; +// extern uint32_t segkmem_kmemlp_max; + +#define SEGKMEM_USE_LARGEPAGES (segkmem_lpsize > PAGESIZE) + +#define IS_KMEM_VA_LARGEPAGE(vaddr) \ +(((vaddr) >= heap_lp_base) && ((vaddr) < heap_lp_end)) + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _VM_SEG_KMEM_H */ diff --git a/include/os/windows/spl/sys/sid.h b/include/os/windows/spl/sys/sid.h new file mode 100644 index 000000000000..e254259ed8ef --- /dev/null +++ b/include/os/windows/spl/sys/sid.h @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + +#ifndef _SPL_SID_H +#define _SPL_SID_H + +#ifdef __cplusplus +extern "C" { +#endif + + +typedef struct ksiddomain { + char *kd_name; +} ksiddomain_t; + +typedef enum ksid_index { + KSID_USER, + KSID_GROUP, + KSID_OWNER, + KSID_COUNT +} ksid_index_t; + +typedef int ksid_t; + +/* Should be in kidmap.h */ +typedef int32_t idmap_stat; + +static inline ksiddomain_t * +ksid_lookupdomain(const char *dom) +{ + ksiddomain_t *kd; + int len = strlen(dom); + + kd = (ksiddomain_t *)kmem_zalloc(sizeof(ksiddomain_t), KM_SLEEP); + kd->kd_name = (char *)kmem_zalloc(len + 1, KM_SLEEP); + memcpy(kd->kd_name, dom, len); + + return (kd); +} + +static inline void +ksiddomain_rele(ksiddomain_t *ksid) +{ + kmem_free(ksid->kd_name, strlen(ksid->kd_name) + 1); + kmem_free(ksid, sizeof(ksiddomain_t)); +} + +#define UID_NOBODY 65534 +#define GID_NOBODY 65534 + +static __inline uint_t +ksid_getid(ksid_t *ks) +{ + panic("%s has been unexpectedly called", __func__); + return 0; +} + +static __inline const char * +ksid_getdomain(ksid_t *ks) +{ + panic("%s has been unexpectedly called", __func__); + return 0; +} + +static __inline uint_t +ksid_getrid(ksid_t *ks) +{ + panic("%s has been unexpectedly called", __func__); + return 0; +} + +#define kidmap_getsidbyuid(zone, uid, sid_prefix, rid) (1) +#define kidmap_getsidbygid(zone, gid, sid_prefix, rid) (1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SPL_SID_H */ diff --git a/include/os/windows/spl/sys/signal.h b/include/os/windows/spl/sys/signal.h new file mode 100644 index 000000000000..b4b865e6d39e --- /dev/null +++ b/include/os/windows/spl/sys/signal.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + + +#ifndef _SPL_SIGNAL_H +#define _SPL_SIGNAL_H + +#include +//#include +#include +#include +//#include_next + +#define FORREAL 0 /* Usual side-effects */ +#define JUSTLOOKING 1 /* Don't stop the process */ + +struct proc; + +//extern int +//thread_issignal(struct proc *, thread_t, sigset_t); + +/* The "why" argument indicates the allowable side-effects of the call: + * + * FORREAL: Extract the next pending signal from p_sig into p_cursig; + * stop the process if a stop has been requested or if a traced signal + * is pending. 
+ * + * JUSTLOOKING: Don't stop the process, just indicate whether or not + * a signal might be pending (FORREAL is needed to tell for sure). + */ +#define threadmask (sigmask(SIGILL)|sigmask(SIGTRAP)|\ + sigmask(SIGIOT)|sigmask(SIGEMT)|\ + sigmask(SIGFPE)|sigmask(SIGBUS)|\ + sigmask(SIGSEGV)|sigmask(SIGSYS)|\ + sigmask(SIGPIPE)|sigmask(SIGKILL)|\ + sigmask(SIGTERM)|sigmask(SIGINT)) + +static inline int +issig(int why) +{ + return 0; +} + +#endif /* SPL_SIGNAL_H */ diff --git a/include/os/windows/spl/sys/stat.h b/include/os/windows/spl/sys/stat.h new file mode 100644 index 000000000000..9a842aeccaf1 --- /dev/null +++ b/include/os/windows/spl/sys/stat.h @@ -0,0 +1,65 @@ +#ifndef _SPL_STAT_H +#define _SPL_STAT_H + +//#include_next + +#ifndef S_IFMT +/* File type */ +#define S_IFMT 0170000 /* [XSI] type of file mask */ +#define S_IFIFO 0010000 /* [XSI] named pipe (fifo) */ +#define S_IFCHR 0020000 /* [XSI] character special */ +#define S_IFDIR 0040000 /* [XSI] directory */ +#define S_IFBLK 0060000 /* [XSI] block special */ +#define S_IFREG 0100000 /* [XSI] regular */ +#define S_IFLNK 0120000 /* [XSI] symbolic link */ +#define S_IFSOCK 0140000 /* [XSI] socket */ +#if !defined(_POSIX_C_SOURCE) +#define S_IFWHT 0160000 /* OBSOLETE: whiteout */ +#endif +/* File mode */ +/* Read, write, execute/search by owner */ +#define S_IRWXU 0000700 /* [XSI] RWX mask for owner */ +#define S_IRUSR 0000400 /* [XSI] R for owner */ +#define S_IWUSR 0000200 /* [XSI] W for owner */ +#define S_IXUSR 0000100 /* [XSI] X for owner */ +/* Read, write, execute/search by group */ +#define S_IRWXG 0000070 /* [XSI] RWX mask for group */ +#define S_IRGRP 0000040 /* [XSI] R for group */ +#define S_IWGRP 0000020 /* [XSI] W for group */ +#define S_IXGRP 0000010 /* [XSI] X for group */ +/* Read, write, execute/search by others */ +#define S_IRWXO 0000007 /* [XSI] RWX mask for other */ +#define S_IROTH 0000004 /* [XSI] R for other */ +#define S_IWOTH 0000002 /* [XSI] W for other */ +#define S_IXOTH 0000001 /* [XSI] X for other */ + +#define S_ISUID 0004000 /* [XSI] set user id on execution */ +#define S_ISGID 0002000 /* [XSI] set group id on execution */ +#define S_ISVTX 0001000 /* [XSI] directory restrcted delete */ + +#if !defined(_POSIX_C_SOURCE) +#define S_ISTXT S_ISVTX /* sticky bit: not supported */ +#define S_IREAD S_IRUSR /* backward compatability */ +#define S_IWRITE S_IWUSR /* backward compatability */ +#define S_IEXEC S_IXUSR /* backward compatability */ +#endif +#endif /* !S_IFMT */ + +/* + * [XSI] The following macros shall be provided to test whether a file is + * of the specified type. The value m supplied to the macros is the value + * of st_mode from a stat structure. The macro shall evaluate to a non-zero + * value if the test is true; 0 if the test is false. 
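+ * For example, given the definitions above, S_ISDIR(0040755) and
+ * S_ISREG(0100644) evaluate to non-zero, while S_ISLNK(0100644) is 0.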
+ */ +#define S_ISBLK(m) (((m)& S_IFMT) == S_IFBLK) /* block special */ +#define S_ISCHR(m) (((m)& S_IFMT) == S_IFCHR) /* char special */ +#define S_ISDIR(m) (((m)& S_IFMT) == S_IFDIR) /* directory */ +#define S_ISFIFO(m) (((m)& S_IFMT) == S_IFIFO) /* fifo or socket */ +#define S_ISREG(m) (((m)& S_IFMT) == S_IFREG) /* regular file */ +#define S_ISLNK(m) (((m)& S_IFMT) == S_IFLNK) /* symbolic link */ +#define S_ISSOCK(m) (((m)& S_IFMT) == S_IFSOCK) /* socket */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define S_ISWHT(m) (((m)& S_IFMT) == S_IFWHT) /* OBSOLETE: whiteout */ +#endif + +#endif /* SPL_STAT_H */ diff --git a/include/os/windows/spl/sys/stropts.h b/include/os/windows/spl/sys/stropts.h new file mode 100644 index 000000000000..6fcb9634d086 --- /dev/null +++ b/include/os/windows/spl/sys/stropts.h @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2013 Jorgen Lundman + * + */ + + +#ifndef _SPL_STROPTS_H +#define _SPL_STROPTS_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define isprint(c) ((c) >= ' ' && (c) <= '~') + +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + * High order bit is 31 (or 63 in _LP64 kernel). + */ +static inline int +highbit64(unsigned long long i) +{ + register int h = 1; + + if (i == 0) + return (0); + if (i & 0xffffffff00000000ull) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} + +static inline int +highbit(unsigned long long i) +{ + register int h = 1; + + if (i == 0) + return (0); + if (i & 0xffffffff00000000ull) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} + +/* + * Find lowest one bit set. + * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. + * Low order bit is 0. 
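+ * For example, lowbit(0x18) returns 4 (bit 3 is the lowest set bit) and
+ * highbit(0x18) above returns 5 (bit 4 is the highest set bit); both
+ * return 0 when passed 0.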
+ */ +static inline int +lowbit(unsigned long long i) +{ + register int h = 1; + + if (i == 0) + return (0); + + if (!(i & 0xffffffff)) { + h += 32; i >>= 32; + } + if (!(i & 0xffff)) { + h += 16; i >>= 16; + } + if (!(i & 0xff)) { + h += 8; i >>= 8; + } + if (!(i & 0xf)) { + h += 4; i >>= 4; + } + if (!(i & 0x3)) { + h += 2; i >>= 2; + } + if (!(i & 0x1)) { + h += 1; + } + return (h); +} + + + +static inline int +is_ascii_str(const char * str) +{ + unsigned char ch; + + while ((ch = (unsigned char)*str++) != '\0') { + if (ch >= 0x80) + return (0); + } + return (1); +} + + +static inline void * +kmemchr(const void *s, int c, size_t n) +{ + if (n != 0) { + const unsigned char *p = (const unsigned char *)s; + do { + if (*p++ == (unsigned char)c) + return ((void *)(uintptr_t)(p - 1)); + } while (--n != 0); + } + return (NULL); +} + +#define LONG_BIT 64 +#define IDX(c) ((unsigned char)(c) / LONG_BIT) +#define BIT(c) ((unsigned long)1 << ((unsigned char)(c) % LONG_BIT)) + + +#ifdef __cplusplus +} +#endif + +#endif /* SPL_STROPTS_H */ diff --git a/include/os/windows/spl/sys/sunddi.h b/include/os/windows/spl/sys/sunddi.h new file mode 100644 index 000000000000..189678f40f65 --- /dev/null +++ b/include/os/windows/spl/sys/sunddi.h @@ -0,0 +1,197 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Garrett D'Amore . All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
+ */ + + + +#ifndef _SPL_SUNDDI_H +#define _SPL_SUNDDI_H + +#include +#include +#include +#include +#include +#include +#include +#include +//#include + +typedef int ddi_devid_t; + +#define DDI_DEV_T_NONE ((dev_t)-1) +#define DDI_DEV_T_ANY ((dev_t)-2) +#define DI_MAJOR_T_UNKNOWN ((major_t)0) + +#define DDI_PROP_DONTPASS 0x0001 +#define DDI_PROP_CANSLEEP 0x0002 + +#define DDI_SUCCESS 0 +#define DDI_FAILURE -1 + +#define ddi_prop_lookup_string(x1,x2,x3,x4,x5) (*x5 = NULL) +#define ddi_prop_free(x) (void)0 +#define ddi_root_node() (void)0 + +#define isalnum(ch) (isalpha(ch) || isdigit(ch)) +#define isalpha(ch) (isupper(ch) || islower(ch)) +#define isdigit(ch) ((ch) >= '0' && (ch) <= '9') +#define islower(ch) ((ch) >= 'a' && (ch) <= 'z') +#define isspace(ch) (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \ + ((ch) == '\t') || ((ch) == '\f')) +#define isupper(ch) ((ch) >= 'A' && (ch) <= 'Z') +#define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \ + ((ch) >= 'A' && (ch) <= 'F')) +#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E) +#define ispunct(C) (((C) >= 0x21 && (C) <= 0x2F) || \ + ((C) >= 0x3A && (C) <= 0x40) || \ + ((C) >= 0x5B && (C) <= 0x60) || \ + ((C) >= 0x7B && (C) <= 0x7E)) + +#define DIGIT(x) \ + (isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A') + +#define MBASE ('z' - 'a' + 1 + 10) +/* +* The following macro is a version of isalnum() that limits alphabetic +* characters to the ranges a-z and A-Z; locale dependent characters will not +* return 1. The members of a-z and A-Z are assumed to be in ascending order +* and contiguous. +*/ +#define lisalnum(x) \ + (isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z')) + +//extern int ddi_strtoul(const char *, char **, int, unsigned long *); +//extern int ddi_strtol(const char *, char **, int, long *); +//extern int ddi_strtoull(const char *, char **, int, unsigned long long *); +//extern int ddi_strtoll(const char *, char **, int, long long *); + +// Define proper Solaris API calls, and clean ZFS up to use +int ddi_copyin(const void *from, void *to, size_t len, int flags); +int ddi_copyout(const void *from, void *to, size_t len, int flags); +int ddi_copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done); + +int ddi_copysetup(void *to, size_t len, void **out_buffer, PMDL *out_mdl); + + +extern int ddi_strtol(const char *str, char **nptr, int base, long *result); + + +#ifndef OTYPCNT +#define OTYPCNT 5 +#define OTYP_BLK 0 +#define OTYP_MNT 1 +#define OTYP_CHR 2 +#define OTYP_SWP 3 +#define OTYP_LYR 4 +#endif + +#define P2END(x, align) (-(~((uint64_t)x) & -((uint64_t)align))) + +#define ddi_name_to_major(name) devsw_name2blk(name, NULL, 0) + +struct dev_info { + dev_t dev; // Major / Minor + void *devc; + void *devb; +}; +typedef struct dev_info dev_info_t; + +int ddi_strtoull(const char *, char **, int, unsigned long long *); +int ddi_strtoll(const char *, char **, int, long long *); +int ddi_strtoul(const char *, char **, int, unsigned long *); +int ddi_strtol(const char *, char **, int, long *); +int ddi_soft_state_init(void **, uint32_t, uint32_t); +int ddi_soft_state_zalloc(void *, int); +void *ddi_get_soft_state(void *, int); +void ddi_soft_state_free(void *, int); +void ddi_soft_state_fini(void **); +int ddi_create_minor_node(dev_info_t *, char *, int, + minor_t, char *, int); +void ddi_remove_minor_node(dev_info_t *, char *); + +int ddi_driver_major(dev_info_t *); + +typedef void *ldi_ident_t; + +#define DDI_SUCCESS 0 +#define DDI_FAILURE -1 + +#define DDI_PSEUDO 
"" + +#define ddi_prop_update_int64(a, b, c, d) DDI_SUCCESS +#define ddi_prop_update_string(a, b, c, d) DDI_SUCCESS + +#define bioerror(bp, er) (buf_seterror((bp), (er))) +#define biodone(bp) buf_biodone(bp) + +static inline long ddi_fls(long mask) { \ + /*Algorithm courtesy of Steve Chessin.*/ \ + while (mask) { \ + long nx; \ + if ((nx = (mask & (mask - 1))) == 0) \ + break; \ + mask = nx; \ + } \ + return (__lzcnt(mask)); \ +} + +// find_first_bits_de_bruijn(unsigned nums[ARRAY_SIZE]) +static inline long ddi_ffs(long mask) { \ + static const int MultiplyDeBruijnBitPosition[32] = { \ + 1, 2, 29, 3, 30, 15, 25, 4, 31, 23, 21, 16, 26, 18, 5, 9, \ + 32, 28, 14, 24, 22, 20, 17, 8, 27, 13, 19, 7, 12, 6, 11, 10 \ + }; \ + return MultiplyDeBruijnBitPosition[ \ + ((unsigned)((mask & -mask) * 0x077CB531U)) >> 27]; \ +} + + + + +#define getminor(X) minor((X)) + + + +/* + * This data structure is entirely private to the soft state allocator. + */ +struct i_ddi_soft_state { + void **array; /* the array of pointers */ + kmutex_t lock; /* serialize access to this struct */ + uint32_t size; /* how many bytes per state struct */ + uint32_t n_items; /* how many structs herein */ + struct i_ddi_soft_state *next; /* 'dirty' elements */ +}; + +#define MIN_N_ITEMS 8 /* 8 void *'s == 32 bytes */ + + +uint32_t +ddi_strcspn(const char * __restrict s, const char * __restrict charset); +#define strcspn ddi_strcspn + +#endif /* SPL_SUNDDI_H */ diff --git a/include/os/windows/spl/sys/sunldi.h b/include/os/windows/spl/sys/sunldi.h new file mode 100644 index 000000000000..85eb04bd2c2d --- /dev/null +++ b/include/os/windows/spl/sys/sunldi.h @@ -0,0 +1,34 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + */ + + + +#ifndef _SPL_SUNLDI_H +#define _SPL_SUNLDI_H + +#include + +#define SECTOR_SIZE 512 + +#endif /* SPL_SUNLDI_H */ diff --git a/include/os/windows/spl/sys/sysdc.h b/include/os/windows/spl/sys/sysdc.h new file mode 100644 index 000000000000..e27f31fdd078 --- /dev/null +++ b/include/os/windows/spl/sys/sysdc.h @@ -0,0 +1,4 @@ +#ifndef _SPL_SYSDC_H +#define _SPL_SYSDC_H + +#endif /* SPL_SYSDC_H */ diff --git a/include/os/windows/spl/sys/sysevent.h b/include/os/windows/spl/sys/sysevent.h new file mode 100644 index 000000000000..8389ac483bda --- /dev/null +++ b/include/os/windows/spl/sys/sysevent.h @@ -0,0 +1,36 @@ +/* +* CDDL HEADER START +* +* The contents of this file are subject to the terms of the +* Common Development and Distribution License, Version 1.0 only +* (the "License"). You may not use this file except in compliance +* with the License. 
+* +* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +* or http://www.opensolaris.org/os/licensing. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* When distributing Covered Code, include this CDDL HEADER in each +* file and include the License file at usr/src/OPENSOLARIS.LICENSE. +* If applicable, add the following below this CDDL HEADER, with the +* fields enclosed by brackets "[]" replaced with your own identifying +* information: Portions Copyright [yyyy] [name of copyright owner] +* +* CDDL HEADER END +*/ +/* +* Copyright 2006 Sun Microsystems, Inc. All rights reserved. +* Use is subject to license terms. +*/ + +#ifndef _SYS_SYSEVENT_H +#define _SYS_SYSEVENT_H + +#include + +typedef struct sysevent { + nvlist_t *resource; +} sysevent_t; + +#endif diff --git a/include/os/windows/spl/sys/sysevent/eventdefs.h b/include/os/windows/spl/sys/sysevent/eventdefs.h new file mode 100644 index 000000000000..8ed67c266270 --- /dev/null +++ b/include/os/windows/spl/sys/sysevent/eventdefs.h @@ -0,0 +1,135 @@ +/* +* CDDL HEADER START +* +* The contents of this file are subject to the terms of the +* Common Development and Distribution License (the "License"). +* You may not use this file except in compliance with the License. +* +* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +* or http://www.opensolaris.org/os/licensing. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* When distributing Covered Code, include this CDDL HEADER in each +* file and include the License file at usr/src/OPENSOLARIS.LICENSE. +* If applicable, add the following below this CDDL HEADER, with the +* fields enclosed by brackets "[]" replaced with your own identifying +* information: Portions Copyright [yyyy] [name of copyright owner] +* +* CDDL HEADER END +*/ +/* +* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. +* Copyright 2015 Nexenta Systems, Inc. All rights reserved. +* Copyright 2017 Joyent, Inc. +*/ + +#ifndef _SYS_SYSEVENT_EVENTDEFS_H +#define _SYS_SYSEVENT_EVENTDEFS_H + +#ifdef __cplusplus +extern "C" { +#endif + + /* + * eventdefs.h contains public definitions for sysevent types (classes + * and subclasses). All additions/removal/changes are subject + * to PSARC approval. 
+ */ + + /* Sysevent Class definitions */ +#define EC_NONE "EC_none" +#define EC_PRIV "EC_priv" +#define EC_PLATFORM "EC_platform" /* events private to platform */ +#define EC_DR "EC_dr" /* Dynamic reconfiguration event class */ +#define EC_ENV "EC_env" /* Environmental monitor event class */ +#define EC_DOMAIN "EC_domain" /* Domain event class */ +#define EC_AP_DRIVER "EC_ap_driver" /* Alternate Pathing event class */ +#define EC_IPMP "EC_ipmp" /* IP Multipathing event class */ +#define EC_DEV_ADD "EC_dev_add" /* device add event class */ +#define EC_DEV_REMOVE "EC_dev_remove" /* device remove event class */ +#define EC_DEV_BRANCH "EC_dev_branch" /* device tree branch event class */ +#define EC_DEV_STATUS "EC_dev_status" /* device status event class */ +#define EC_FM "EC_fm" /* FMA error report event */ +#define EC_ZFS "EC_zfs" /* ZFS event */ +#define EC_DATALINK "EC_datalink" /* datalink event */ +#define EC_VRRP "EC_vrrp" /* VRRP event */ + + /* + * EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes + * (name/value pairs) are found in sys/sysevent/dev.h + */ +#define ESC_DISK "disk" /* disk device */ +#define ESC_NETWORK "network" /* network interface */ +#define ESC_PRINTER "printer" /* printer device */ +#define ESC_LOFI "lofi" /* lofi device */ + + /* + * EC_DEV_BRANCH subclass definitions - supporting attributes (name/value pairs) + * are found in sys/sysevent/dev.h + */ + + /* device tree branch added */ +#define ESC_DEV_BRANCH_ADD "dev_branch_add" + + /* device tree branch removed */ +#define ESC_DEV_BRANCH_REMOVE "dev_branch_remove" + + /* + * EC_DEV_STATUS subclass definitions + * + * device capacity dynamically changed + */ +#define ESC_DEV_DLE "dev_dle" + + /* LUN has received an eject request from the user */ +#define ESC_DEV_EJECT_REQUEST "dev_eject_request" + + /* FMA Fault and Error event protocol subclass */ +#define ESC_FM_ERROR "error" +#define ESC_FM_ERROR_REPLAY "error_replay" + + /* + * ZFS subclass definitions. supporting attributes (name/value paris) are found + * in sys/fs/zfs.h + */ +#define ESC_ZFS_RESILVER_START "resilver_start" +#define ESC_ZFS_RESILVER_FINISH "resilver_finish" +#define ESC_ZFS_VDEV_REMOVE "vdev_remove" +#define ESC_ZFS_VDEV_REMOVE_AUX "vdev_remove_aux" +#define ESC_ZFS_VDEV_REMOVE_DEV "vdev_remove_dev" +#define ESC_ZFS_POOL_CREATE "pool_create" +#define ESC_ZFS_POOL_DESTROY "pool_destroy" +#define ESC_ZFS_POOL_IMPORT "pool_import" +#define ESC_ZFS_POOL_EXPORT "pool_export" +#define ESC_ZFS_VDEV_ADD "vdev_add" +#define ESC_ZFS_VDEV_ATTACH "vdev_attach" +#define ESC_ZFS_VDEV_CLEAR "vdev_clear" +#define ESC_ZFS_VDEV_CHECK "vdev_check" +#define ESC_ZFS_VDEV_ONLINE "vdev_online" +#define ESC_ZFS_CONFIG_SYNC "config_sync" +#define ESC_ZFS_SCRUB_START "scrub_start" +#define ESC_ZFS_SCRUB_FINISH "scrub_finish" +#define ESC_ZFS_SCRUB_ABORT "scrub_abort" +#define ESC_ZFS_SCRUB_RESUME "scrub_resume" +#define ESC_ZFS_SCRUB_PAUSED "scrub_paused" +#define ESC_ZFS_VDEV_SPARE "vdev_spare" +#define ESC_ZFS_VDEV_AUTOEXPAND "vdev_autoexpand" +#define ESC_ZFS_BOOTFS_VDEV_ATTACH "bootfs_vdev_attach" +#define ESC_ZFS_POOL_REGUID "pool_reguid" +#define ESC_ZFS_HISTORY_EVENT "history_event" +#define ESC_ZFS_TRIM_START "trim_start" +#define ESC_ZFS_TRIM_FINISH "trim_finish" +#define ESC_ZFS_TRIM_CANCEL "trim_cancel" +#define ESC_ZFS_TRIM_RESUME "trim_resume" +#define ESC_ZFS_TRIM_SUSPEND "trim_suspend" + /* + * datalink subclass definitions. 
+ */ +#define ESC_DATALINK_PHYS_ADD "datalink_phys_add" /* new physical link */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_EVENTDEFS_H */ \ No newline at end of file diff --git a/include/os/windows/spl/sys/sysmacros.h b/include/os/windows/spl/sys/sysmacros.h new file mode 100644 index 000000000000..7a642ac62131 --- /dev/null +++ b/include/os/windows/spl/sys/sysmacros.h @@ -0,0 +1,266 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#ifndef _SPL_SYSMACROS_H +#define _SPL_SYSMACROS_H + +#include +#include +#include +#include +#include + +#ifndef _KERNEL +#define _KERNEL __KERNEL__ +#endif + +#define FALSE 0 +#define TRUE 1 + +#if 0 +#define INT8_MAX (127) +#define INT8_MIN (-128) +#define UINT8_MAX (255) +#define UINT8_MIN (0) + +#define INT16_MAX (32767) +#define INT16_MIN (-32768) +#define UINT16_MAX (65535) +#define UINT16_MIN (0) + +#define INT32_MAX INT_MAX +#define INT32_MIN INT_MIN +#define UINT32_MAX UINT_MAX +#define UINT32_MIN UINT_MIN + +#define INT64_MAX LLONG_MAX +#define INT64_MIN LLONG_MIN +#define UINT64_MAX ULLONG_MAX +#define UINT64_MIN ULLONG_MIN + +#define NBBY 8 +#define MAXBSIZE 8192 +#endif +#define NBBY 8 + //#define ENOTSUP EOPNOTSUPP + +#define MAXMSGLEN 256 +#define MAXNAMELEN 256 +#define MAXPATHLEN 1024 +#define MAXOFFSET_T LLONG_MAX +#define DEV_BSIZE 512 +#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ + +#define proc_pageout NULL +#define curproc (struct proc *)PsGetCurrentProcess() +//#define max_ncpus num_possible_cpus() +//#define CPU_SEQID smp_processor_id() +extern uint32_t cpu_number(void); +#define CPU_SEQID (cpu_number()) +#define _NOTE(x) +#define is_system_labeled() 0 + +extern unsigned int max_ncpus; + +#ifndef RLIM64_INFINITY +#define RLIM64_INFINITY (~0ULL) +#endif + +/* 0..MAX_PRIO-1: Process priority + * 0..MAX_RT_PRIO-1: RT priority tasks + * MAX_RT_PRIO..MAX_PRIO-1: SCHED_NORMAL tasks + * + * Treat shim tasks as SCHED_NORMAL tasks + */ +//#define minclsyspri (MAX_RT_PRIO) +//#define maxclsyspri (MAX_PRIO-1) +/* + * In OSX, the kernel thread priorities start at 81 and goes to + * 95 MAXPRI_KERNEL. BASEPRI_REALTIME starts from 96. Since + * swap priority is at 92. Most ZFS priorities should probably + * stay below this, but kmem_reap needs to be higher. 
+ */ +#define minclsyspri 81 /* BASEPRI_KERNEL */ +#define defclsyspri 81 /* BASEPRI_KERNEL */ +#define maxclsyspri 89 + + +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) + +/* Missing macros + */ +#define PAGESIZE PAGE_SIZE + +/* from Solaris sys/byteorder.h */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + + +/* Dtrace probes do not exist in the linux kernel */ +#ifdef DTRACE_PROBE +#undef DTRACE_PROBE +#endif /* DTRACE_PROBE */ +#define DTRACE_PROBE(a) ((void)0) + +#ifdef DTRACE_PROBE1 +#undef DTRACE_PROBE1 +#endif /* DTRACE_PROBE1 */ +#define DTRACE_PROBE1(a, b, c) ((void)0) + +#ifdef DTRACE_PROBE2 +#undef DTRACE_PROBE2 +#endif /* DTRACE_PROBE2 */ +#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) + +#ifdef DTRACE_PROBE3 +#undef DTRACE_PROBE3 +#endif /* DTRACE_PROBE3 */ +#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) + +#ifdef DTRACE_PROBE4 +#undef DTRACE_PROBE4 +#endif /* DTRACE_PROBE4 */ +#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) + +/* Missing globals */ +extern char spl_version[32]; +extern char hw_serial[11]; + +/* Missing misc functions */ +//extern int highbit(unsigned long long i); +//extern int lowbit(unsigned long long i); +extern uint32_t zone_get_hostid(void *zone); +extern void spl_setup(void); +extern void spl_cleanup(void); + +#define NBITSMINOR 20 +#define MINORMASK ((1UL<> NBITSMINOR) +#define minor(x) ((x) & MINORMASK) + +#define makedev(x, y) (((x) << NBITSMINOR) | ((y) & MINORMASK)) +#define makedevice(maj,min) makedev(maj,min) + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#endif + +#ifndef roundup +#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#endif + +/* + * Compatibility macros/typedefs needed for Solaris -> Linux port + * For some reason Windows makes some of these signed, and everything goes to hell. + * But may have put in too many (uint64_t), check this + */ +#define P2ALIGN(x, align) (((uint64_t)x) & -((uint64_t)align)) +#define P2CROSS(x, y, align) ((((uint64_t)x) ^ ((uint64_t)y)) > ((uint64_t)align) - 1) +#define P2ROUNDUP(x, align) (-(-((uint64_t)x) & -((uint64_t)align))) +#define P2PHASE(x, align) (((uint64_t)x) & (((uint64_t)align) - 1)) +#define P2NPHASE(x, align) (-((uint64_t)x) & (((uint64_t)align) - 1)) +#define ISP2(x) ((((uint64_t)x) & (((uint64_t)x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1))==0) +#define P2BOUNDARY(off, len, align) \ + ((((uint64_t)off) ^ (((uint64_t)off) + ((uint64_t)len) - 1)) > ((uint64_t)align) - 1) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. 
For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)((uint64_t)x) & -(type)((uint64_t)align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)((uint64_t)x) & ((type)((uint64_t)align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)((uint64_t)x) & ((type)((uint64_t)align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + (-(-(type)((uint64_t)x) & -(type)((uint64_t)align))) +#define P2END_TYPED(x, align, type) \ + (-(~(type)((uint64_t)x) & -(type)((uint64_t)align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)((uint64_t)phase) - (((type)((uint64_t)phase) - (type)((uint64_t)x)) & -(type)((uint64_t)align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)((uint64_t)x) ^ (type)((uint64_t)y)) > (type)((uint64_t)align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)((uint64_t)x) ^ (type)((uint64_t)y)) < ((type)((uint64_t)x) & (type)((uint64_t)y))) + +/* + * P2* Macros from Illumos + */ + +/* + * return x rounded up to the next phase (offset) within align. + * phase should be < align. + * eg, P2PHASEUP(0x1234, 0x100, 0x10) == 0x1310 (0x13*align + phase) + * eg, P2PHASEUP(0x5600, 0x100, 0x10) == 0x5610 (0x56*align + phase) + */ +#define P2PHASEUP(x, align, phase) (((uint64_t)phase) - ((((uint64_t)phase) - ((uint64_t)x)) & -((uint64_t)align))) + +/* + * Return TRUE if they have the same highest bit set. + * eg, P2SAMEHIGHBIT(0x1234, 0x1001) == TRUE (the high bit is 0x1000) + * eg, P2SAMEHIGHBIT(0x1234, 0x3010) == FALSE (high bit of 0x3010 is 0x2000) + */ +#define P2SAMEHIGHBIT(x, y) ((((uint64_t)x) ^ ((uint64_t)y)) < (((uint64_t)x) & ((uint64_t)y))) + +/* + * End Illumos copy-fest + */ + +/* avoid any possibility of clashing with version */ +#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) + /* Use the correct builtin mechanism. The Traditional macro is + not safe on this platform. */ +// #define offsetof(s, m) __builtin_offsetof(s, m) +#include +#endif + +#define SET_ERROR(X) (X) + +#endif /* _SPL_SYSMACROS_H */ diff --git a/include/os/windows/spl/sys/systeminfo.h b/include/os/windows/spl/sys/systeminfo.h new file mode 100644 index 000000000000..8076ef9f1ce8 --- /dev/null +++ b/include/os/windows/spl/sys/systeminfo.h @@ -0,0 +1,14 @@ + +#ifndef _SPL_SYSTEMINFO_H +#define _SPL_SYSTEMINFO_H + +#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ +#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ + /* to hold a decimal or hex */ + /* hostid string */ + +const char *spl_panicstr(void); +int spl_system_inshutdown(void); + + +#endif /* SPL_SYSTEMINFO_H */ diff --git a/include/os/windows/spl/sys/systm.h b/include/os/windows/spl/sys/systm.h new file mode 100644 index 000000000000..fd5194792b67 --- /dev/null +++ b/include/os/windows/spl/sys/systm.h @@ -0,0 +1,71 @@ + +#ifndef _SPL_SYSTM_H +#define _SPL_SYSTM_H + +//#include_next +#include + +typedef uintptr_t pc_t; + + +// Find a header to place this? 
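+/*
+ * The wrapper below emulates the BSD timeout(9)/untimeout(9) interface on
+ * top of a KTIMER; see bsd_timeout() and bsd_untimeout() further down.
+ * A minimal usage sketch, with purely illustrative names (my_poll and
+ * my_poll_tmo are not part of this header):
+ *
+ *	static struct bsd_timeout_wrapper my_poll_tmo;	// zero-initialized
+ *
+ *	static void my_poll(void *arg)	// arg is &my_poll_tmo
+ *	{
+ *		struct timespec ts = { 1, 0 };
+ *		bsd_timeout(my_poll, &my_poll_tmo, &ts);	// re-arm: 1 second
+ *	}
+ *
+ * Cancel with bsd_untimeout(my_poll, &my_poll_tmo) before unload.
+ */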
+struct bsd_timeout_wrapper { + uint32_t flag; // Must be first + uint32_t init; + void(*func)(void *); + void *arg; + KTIMER timer; +}; + +/* bsd_timeout will create a new thread, and the new thread will +* first sleep the desired duration, then call the wanted function +*/ +static inline void bsd_timeout_handler(void *arg) +{ + struct bsd_timeout_wrapper *btw = arg; + KeWaitForSingleObject(&btw->timer, Executive, KernelMode, TRUE, NULL); + if (btw->init == 0x42994299) btw->func(btw->arg); + thread_exit(); +} + +#define BSD_TIMEOUT_MAGIC 0x42994299 +static inline void bsd_untimeout(void(*func)(void *), void *ID) +{ + /* + * Unfortunately, calling KeSetTimer() does not Signal (or abort) any thread + * sitting in KeWaitForSingleObject() so they would wait forever. Instead we + * change the timeout to be now, so that the threads can exit. + */ + struct bsd_timeout_wrapper *btw = (struct bsd_timeout_wrapper *)ID; + LARGE_INTEGER p = { -1 }; + btw->init = 0; + // Investigate why this assert triggers on Unload + //ASSERT(btw->init == BSD_TIMEOUT_MAGIC); // timer was not initialized + if(btw->init == BSD_TIMEOUT_MAGIC) + KeSetTimer(&btw->timer, p, NULL); +} + +static inline void bsd_timeout(void *FUNC, void *ID, struct timespec *TIM) +{ + LARGE_INTEGER duetime; + struct bsd_timeout_wrapper *btw = (struct bsd_timeout_wrapper *)ID; + void(*func)(void *) = FUNC; + ASSERT(btw != NULL); + duetime.QuadPart = -((int64_t)(SEC2NSEC100(TIM->tv_sec) + NSEC2NSEC100(TIM->tv_nsec))); + btw->func = func; + btw->arg = ID; + /* Global vars are guaranteed set to 0, still is this secure enough? */ + if (btw->init != BSD_TIMEOUT_MAGIC) { + btw->init = BSD_TIMEOUT_MAGIC; + KeInitializeTimer(&btw->timer); + } + if (!KeSetTimer(&btw->timer, duetime, NULL)) { + func((ID)); + } else { + /* Another option would have been to use taskq, as it can cancel */ + thread_create(NULL, 0, bsd_timeout_handler, ID, 0, &p0, + TS_RUN, minclsyspri); + } +} + +#endif /* SPL_SYSTM_H */ diff --git a/include/os/windows/spl/sys/t_lock.h b/include/os/windows/spl/sys/t_lock.h new file mode 100644 index 000000000000..d04712d57780 --- /dev/null +++ b/include/os/windows/spl/sys/t_lock.h @@ -0,0 +1,10 @@ + +#ifndef _SPL_T_LOCK_H +#define _SPL_T_LOCK_H + +#include +#include +#include +#include + +#endif /* SPL_T_LOCK_H */ diff --git a/include/os/windows/spl/sys/taskq.h b/include/os/windows/spl/sys/taskq.h new file mode 100644 index 000000000000..42d971a008e4 --- /dev/null +++ b/include/os/windows/spl/sys/taskq.h @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ +/* + * Copyright (C) 2015 Jorgen Lundman + */ + +#ifndef _SYS_TASKQ_H +#define _SYS_TASKQ_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define TASKQ_NAMELEN 31 + +typedef struct taskq taskq_t; +typedef uintptr_t taskqid_t; +typedef void (task_func_t)(void *); + +struct proc; + +/* + * Public flags for taskq_create(): bit range 0-15 + */ +#define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */ +#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ +#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */ +#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ + +/* + * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as + * KM_SLEEP/KM_NOSLEEP. + */ +#define TQ_SLEEP 0x00 /* Can block for memory */ +#define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */ +#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ +#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ +#define TQ_FRONT 0x08 /* Put task at the front of the queue */ + +#ifdef _KERNEL + +extern taskq_t *system_taskq; + +extern int spl_taskq_init(void); +extern void spl_taskq_fini(void); +extern void taskq_mp_init(void); + +extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, + int, uint_t); +extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, + proc_t *, uint_t); +extern taskq_t *taskq_create_sysdc(const char *, int, int, int, + proc_t *, uint_t, uint_t); +extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern void nulltask(void *); +extern void taskq_destroy(taskq_t *); +extern void taskq_wait(taskq_t *); +extern void taskq_suspend(taskq_t *); +extern int taskq_suspended(taskq_t *); +extern void taskq_resume(taskq_t *); +extern int taskq_member(taskq_t *, struct kthread *); + +#define taskq_wait_outstanding(T, D) taskq_wait((T)) + +extern void system_taskq_init(void); +extern void system_taskq_fini(void); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TASKQ_H */ diff --git a/include/os/windows/spl/sys/taskq_impl.h b/include/os/windows/spl/sys/taskq_impl.h new file mode 100644 index 000000000000..205d2bbb0a5a --- /dev/null +++ b/include/os/windows/spl/sys/taskq_impl.h @@ -0,0 +1,179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+ */ +/* + * Copyright (C) 2015 Jorgen Lundman + */ + + +#ifndef _SYS_TASKQ_IMPL_H +#define _SYS_TASKQ_IMPL_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct taskq_bucket taskq_bucket_t; + +typedef struct taskq_ent { + struct taskq_ent *tqent_next; + struct taskq_ent *tqent_prev; + task_func_t *tqent_func; + void *tqent_arg; + union { + taskq_bucket_t *tqent_bucket; + uintptr_t tqent_flags; + } tqent_un; + kthread_t *tqent_thread; + kcondvar_t tqent_cv; + /* Used to simulate TS_STOPPED */ + kmutex_t tqent_thread_lock; + kcondvar_t tqent_thread_cv; +} taskq_ent_t; + +#define TQENT_FLAG_PREALLOC 0x1 + +/* + * Taskq Statistics fields are not protected by any locks. + */ +typedef struct tqstat { + uint_t tqs_hits; + uint_t tqs_misses; + uint_t tqs_overflow; /* no threads to allocate */ + uint_t tqs_tcreates; /* threads created */ + uint_t tqs_tdeaths; /* threads died */ + uint_t tqs_maxthreads; /* max # of alive threads */ + uint_t tqs_nomem; /* # of times there were no memory */ + uint_t tqs_disptcreates; +} tqstat_t; + +/* + * Per-CPU hash bucket manages taskq_bent_t structures using freelist. + */ +struct taskq_bucket { + kmutex_t tqbucket_lock; + taskq_t *tqbucket_taskq; /* Enclosing taskq */ + taskq_ent_t tqbucket_freelist; + uint_t tqbucket_nalloc; /* # of allocated entries */ + uint_t tqbucket_nfree; /* # of free entries */ + kcondvar_t tqbucket_cv; + ushort_t tqbucket_flags; + hrtime_t tqbucket_totaltime; + tqstat_t tqbucket_stat; +}; + +/* + * Bucket flags. + */ +#define TQBUCKET_CLOSE 0x01 +#define TQBUCKET_SUSPEND 0x02 + +#define TASKQ_INTERFACE_FLAGS 0x0000ffff /* defined in */ + +/* + * taskq implementation flags: bit range 16-31 + */ +#define TASKQ_CHANGING 0x00010000 /* nthreads != target */ +#define TASKQ_SUSPENDED 0x00020000 /* taskq is suspended */ +#define TASKQ_NOINSTANCE 0x00040000 /* no instance number */ +#define TASKQ_THREAD_CREATED 0x00080000 /* a thread has been created */ +#define TASKQ_DUTY_CYCLE 0x00100000 /* using the SDC class */ + +struct taskq { + char tq_name[TASKQ_NAMELEN + 1]; + kmutex_t tq_lock; + krwlock_t tq_threadlock; + kcondvar_t tq_dispatch_cv; + kcondvar_t tq_wait_cv; + kcondvar_t tq_exit_cv; + pri_t tq_pri; /* Scheduling priority */ + uint_t tq_flags; + int tq_active; + int tq_nthreads; + int tq_nthreads_target; + int tq_nthreads_max; + int tq_threads_ncpus_pct; + int tq_nalloc; + int tq_minalloc; + int tq_maxalloc; + kcondvar_t tq_maxalloc_cv; + int tq_maxalloc_wait; + taskq_ent_t *tq_freelist; + taskq_ent_t tq_task; + int tq_maxsize; + taskq_bucket_t *tq_buckets; /* Per-cpu array of buckets */ + int tq_instance; + uint_t tq_nbuckets; /* # of buckets (2^n) */ + union { + kthread_t *_tq_thread; + kthread_t **_tq_threadlist; + } tq_thr; + + list_node_t tq_cpupct_link; /* linkage for taskq_cpupct_list */ + proc_t *tq_proc; /* process for taskq threads */ + int tq_cpupart; /* cpupart id bound to */ + uint_t tq_DC; /* duty cycle for SDC */ + + /* + * Statistics. + */ + kstat_t *tq_kstat; /* Exported statistics */ + hrtime_t tq_totaltime; /* Time spent processing tasks */ + uint64_t tq_tasks; /* Total # of tasks posted */ + uint64_t tq_executed; /* Total # of tasks executed */ + int tq_maxtasks; /* Max number of tasks in the queue */ + int tq_tcreates; + int tq_tdeaths; +}; + +/* Special form of taskq dispatch that uses preallocated entries. 
*/ +void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); + + +#define tq_thread tq_thr._tq_thread +#define tq_threadlist tq_thr._tq_threadlist + +/* The MAX guarantees we have at least one thread */ +#define TASKQ_THREADS_PCT(ncpus, pct) MAX(((ncpus) * (pct)) / 100, 1) + +/* Extra ZOL / Apple */ +extern void taskq_init_ent(taskq_ent_t *t); +extern taskqid_t taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time); + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TASKQ_IMPL_H */ diff --git a/include/os/windows/spl/sys/thread.h b/include/os/windows/spl/sys/thread.h new file mode 100644 index 000000000000..8497513e64c8 --- /dev/null +++ b/include/os/windows/spl/sys/thread.h @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_THREAD_H +#define _SPL_THREAD_H + +#include +#include +#include +#include +//#include + + +//struct kthread { +// void *something; +//}; +typedef struct _KTHREAD kthread_t; +typedef struct _KTHREAD thread_t; + +/* + * Thread interfaces + */ +#define TP_MAGIC 0x53535353 + +#define TS_FREE 0x00 /* Thread at loose ends */ +#define TS_SLEEP 0x01 /* Awaiting an event */ +#define TS_RUN 0x02 /* Runnable, but not yet on a processor */ +#define TS_ONPROC 0x04 /* Thread is being run on a processor */ +#define TS_ZOMB 0x08 /* Thread has died but hasn't been reaped */ +#define TS_STOPPED 0x10 /* Stopped, initial state */ +#define TS_WAIT 0x20 /* Waiting to become runnable */ + + +typedef void (*thread_func_t)(void *); + +//HANDLE PsGetCurrentThreadId(); + +// This should be ThreadId, but that dies in taskq_member, +// for now, dsl_pool_sync_context calls it instead. +#define current_thread PsGetCurrentThread +#define curthread ((void *)current_thread()) /* current thread pointer */ +#define curproj (ttoproj(curthread)) /* current project pointer */ + +#define thread_join(t) VERIFY(0) + +// Drop the p0 argument, not used. 
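To make the taskq surface above concrete, a short sketch using only declarations from sys/taskq.h and sys/taskq_impl.h above; my_worker, my_count and my_tq are illustrative names, and ASSERT/minclsyspri are assumed available from the surrounding SPL headers. It shows how TASKQ_THREADS_CPU_PCT relates to TASKQ_THREADS_PCT() and how dispatch, drain and destroy pair up:

static int my_count;

/* task_func_t: takes a single untyped argument */
static void
my_worker(void *arg)
{
	int *counter = arg;
	(*counter)++;
}

static void
my_taskq_example(void)
{
	/*
	 * With TASKQ_THREADS_CPU_PCT the nthreads argument is a percentage of
	 * ncpus; internally TASKQ_THREADS_PCT(ncpus, 50) turns that into a
	 * real thread count and guarantees at least one thread.
	 */
	taskq_t *tq = taskq_create("my_tq", 50, minclsyspri, 50, INT_MAX,
	    TASKQ_THREADS_CPU_PCT | TASKQ_PREPOPULATE);
	taskqid_t id;

	id = taskq_dispatch(tq, my_worker, &my_count, TQ_SLEEP);
	ASSERT(id != 0);	/* only TQ_NOSLEEP/TQ_NOALLOC dispatches are expected to fail */

	taskq_wait(tq);		/* drain everything dispatched so far */
	taskq_destroy(tq);
}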
+ +#ifdef SPL_DEBUG_THREAD + +#define thread_create(A,B,C,D,E,F,G,H) spl_thread_create(A,B,C,D,E,G,__FILE__, __LINE__, H) +extern kthread_t *spl_thread_create(caddr_t stk, size_t stksize, + void (*proc)(void *), void *arg, size_t len, /*proc_t *pp,*/ int state, + char *, int, pri_t pri); + +#else + +#define thread_create(A,B,C,D,E,F,G,H) spl_thread_create(A,B,C,D,E,G,H) +extern kthread_t *spl_thread_create(caddr_t stk, size_t stksize, + void (*proc)(void *), void *arg, size_t len, /*proc_t *pp,*/ int state, + pri_t pri); + +#endif + +#define thread_exit spl_thread_exit +extern void spl_thread_exit(void); + +extern kthread_t *spl_current_thread(void); + +#define delay windows_delay +#define IOSleep windows_delay +extern void windows_delay(int); + + +#define KPREEMPT_SYNC 0 +static inline void kpreempt(int flags) +{ + (void)flags; + //ZwYieldExecution(); + LARGE_INTEGER interval; + interval.QuadPart = 0; + KeDelayExecutionThread(KernelMode, FALSE, &interval); +} + +#endif /* _SPL_THREAD_H */ diff --git a/include/os/windows/spl/sys/time.h b/include/os/windows/spl/sys/time.h new file mode 100644 index 000000000000..7fe7768edaad --- /dev/null +++ b/include/os/windows/spl/sys/time.h @@ -0,0 +1,117 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_TIME_H +#define _SPL_TIME_H + +#include +//#include_next +#include +//#include +#include +struct timespec; + +#if defined(CONFIG_64BIT) +#define TIME_MAX INT64_MAX +#define TIME_MIN INT64_MIN +#else +#define TIME_MAX INT32_MAX +#define TIME_MIN INT32_MIN +#endif + +#define SEC 1 +#define MILLISEC 1000 +#define MICROSEC 1000000 +#define NANOSEC 1000000000 + +#define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) +#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) + +/* Already defined in include/linux/time.h */ +#undef CLOCK_THREAD_CPUTIME_ID +#undef CLOCK_REALTIME +#undef CLOCK_MONOTONIC +#undef CLOCK_PROCESS_CPUTIME_ID + +typedef enum clock_type { + __CLOCK_REALTIME0 = 0, /* obsolete; same as CLOCK_REALTIME */ + CLOCK_VIRTUAL = 1, /* thread's user-level CPU clock */ + CLOCK_THREAD_CPUTIME_ID = 2, /* thread's user+system CPU clock */ + CLOCK_REALTIME = 3, /* wall clock */ + CLOCK_MONOTONIC = 4, /* high resolution monotonic clock */ + CLOCK_PROCESS_CPUTIME_ID = 5, /* process's user+system CPU clock */ + CLOCK_HIGHRES = CLOCK_MONOTONIC, /* alternate name */ + CLOCK_PROF = CLOCK_THREAD_CPUTIME_ID,/* alternate name */ +} clock_type_t; + +#if 0 +#define hz \ +({ \ + ASSERT(HZ >= 100 && HZ <= MICROSEC); \ + HZ; \ +}) +#endif + +#define TIMESPEC_OVERFLOW(ts) \ + ((ts)->tv_sec < TIME_MIN || (ts)->tv_sec > TIME_MAX) + +typedef long long hrtime_t; + +extern hrtime_t gethrtime(void); +extern void gethrestime(struct timespec *); +extern time_t gethrestime_sec(void); +extern void hrt2ts(hrtime_t hrt, struct timespec *tsp); + +#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) +#define USEC2NSEC(u) ((hrtime_t)(u) * (NANOSEC / MICROSEC)) +#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) + +// Windows 100NS +#define SEC2NSEC100(n) ((n) * 10000000ULL) +#define NSEC2NSEC100(n) ((n) / 100ULL) + +// ZFS time is 2* 64bit values, which are seconds, and nanoseconds since 1970 +// Windows time is 1 64bit value; representing the number of 100-nanosecond intervals since January 1, 1601 (UTC). +// There's 116444736000000000 100ns between 1601 and 1970 + +// I think these functions handle sec correctly, but nsec should be */100 +#define TIME_WINDOWS_TO_UNIX(WT, UT) do { \ + uint64_t unixepoch = (WT) - 116444736000000000ULL; \ + (UT)[0] = /* seconds */ unixepoch / 10000000ULL; \ + (UT)[1] = /* remainding nsec */ unixepoch - ((UT)[0] * 10000000ULL); \ + } while(0) + +#define TIME_UNIX_TO_WINDOWS(UT, WT) do { \ + (WT) = ((UT)[1]) + ((UT)[0] * 10000000ULL) + 116444736000000000ULL; \ + } while(0) + +#define TIME_UNIX_TO_WINDOWS_EX(SEC, USEC, WT) do { \ + (WT) = (USEC) + ((SEC) * 10000000ULL) + 116444736000000000ULL; \ + } while(0) + +#endif /* _SPL_TIME_H */ diff --git a/include/os/windows/spl/sys/timer.h b/include/os/windows/spl/sys/timer.h new file mode 100644 index 000000000000..eb522038e306 --- /dev/null +++ b/include/os/windows/spl/sys/timer.h @@ -0,0 +1,78 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
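A worked example of the 1601/1970 epoch conversion macros above. It assumes, as the header comment notes, that the second element of the Unix pair is kept in 100 ns units rather than true nanoseconds; the function name is illustrative only:

static void
time_conversion_example(void)
{
	uint64_t ft;		/* Windows count of 100 ns intervals since 1601 */
	uint64_t ut[2];		/* [0] seconds since 1970, [1] remainder in 100 ns units */

	/* One second after the Unix epoch, expressed as a Windows timestamp. */
	ft = 116444736000000000ULL + 10000000ULL;

	TIME_WINDOWS_TO_UNIX(ft, ut);
	/* now ut[0] == 1 and ut[1] == 0 */

	TIME_UNIX_TO_WINDOWS(ut, ft);
	/* ft == 116444736000000000 + 10000000 again: the conversion round-trips */
}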
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + +#ifndef _SPL_TIMER_H +#define _SPL_TIMER_H + +#include +// Typical timespec is smaller, but we need to retain the precision +// to copy time between Unix and Windows. +struct timespec { + uint64_t tv_sec; + uint64_t tv_nsec; +}; + +//#define USEC_PER_SEC 1000000 /* microseconds per second */ + +/* Open Solaris lbolt is in hz */ +static inline uint64_t +zfs_lbolt(void) +{ + uint64_t lbolt_hz; + LARGE_INTEGER ticks; + KeQueryTickCount(&ticks); + lbolt_hz = ticks.QuadPart * KeQueryTimeIncrement(); + lbolt_hz /= (10000000 / 119); // Solaris hz ? + return (lbolt_hz); +} + + +#define lbolt zfs_lbolt() +#define lbolt64 zfs_lbolt() + +#define ddi_get_lbolt() (zfs_lbolt()) +#define ddi_get_lbolt64() (zfs_lbolt()) + +#define typecheck(type,x) \ + ({ type __dummy; \ + typeof(x) __dummy2; \ + (void)(&__dummy == &__dummy2); \ + 1; \ + }) + + +#pragma error( disable: 4296 ) +#define ddi_time_before(a, b) ((a) - (b) < 0) +#define ddi_time_after(a, b) ddi_time_before(b, a) + +#define ddi_time_before64(a, b) ((a) - (b) < 0) +#define ddi_time_after64(a, b) ddi_time_before64(b, a) +#pragma error( default: 4296 ) + + +#endif /* _SPL_TIMER_H */ diff --git a/include/os/windows/spl/sys/tsd.h b/include/os/windows/spl/sys/tsd.h new file mode 100644 index 000000000000..fce2a16d3e3d --- /dev/null +++ b/include/os/windows/spl/sys/tsd.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2008 MacZFS + * Copyright (C) 2013 Jorgen Lundman + * + */ + + +#ifndef _SPL_TSD_H +#define _SPL_TSD_H + +#include + +#define TSD_HASH_TABLE_BITS_DEFAULT 9 +#define TSD_KEYS_MAX 32768 +#define DTOR_PID (PID_MAX_LIMIT+1) +#define PID_KEY (TSD_KEYS_MAX+1) + +typedef void (*dtor_func_t)(void *); + +extern int tsd_set(uint_t, void *); +extern void *tsd_get(uint_t); +extern void tsd_create(uint_t *, dtor_func_t); +extern void tsd_destroy(uint_t *); +extern void tsd_exit(void); + +uint64_t spl_tsd_size(void); +void tsd_thread_exit(void); +int spl_tsd_init(void); +void spl_tsd_fini(void); + +#endif /* _SPL_TSD_H */ diff --git a/include/os/windows/spl/sys/types.h b/include/os/windows/spl/sys/types.h new file mode 100644 index 000000000000..e0322dcc7e8f --- /dev/null +++ b/include/os/windows/spl/sys/types.h @@ -0,0 +1,224 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ +#ifndef _SPL_TYPES_H +#define _SPL_TYPES_H + + +// Linux kernel optimization, ignore them for now on OSX. +#define unlikely +#define likely +// use ntintsafe.h ? 
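A minimal sketch of the thread-specific-data interface above, assuming the usual create/set/get/destroy lifecycle. The my_* names are hypothetical, and kmem_zalloc()/kmem_free()/KM_SLEEP are the usual SPL kmem interfaces, assumed available here:

static uint_t my_tsd_key;

static void
my_tsd_dtor(void *value)
{
	/* invoked for a thread's stored value when that thread exits */
	kmem_free(value, sizeof (uint64_t));
}

static void
my_tsd_example_init(void)
{
	tsd_create(&my_tsd_key, my_tsd_dtor);
}

static void
my_per_thread_bump(void)
{
	uint64_t *v = (uint64_t *)tsd_get(my_tsd_key);

	if (v == NULL) {
		v = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		if (tsd_set(my_tsd_key, v) != 0) {
			kmem_free(v, sizeof (uint64_t));
			return;
		}
	}
	(*v)++;		/* per-thread counter */
}

static void
my_tsd_example_fini(void)
{
	tsd_destroy(&my_tsd_key);
}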
+typedef enum { B_FALSE = 0, B_TRUE = 1 } boolean_t; +typedef short pri_t; +typedef int int32_t; +typedef unsigned long ulong; +typedef unsigned long ulong_t; +typedef unsigned long long uint64_t; +typedef unsigned long long u_longlong_t; +typedef unsigned long long rlim64_t; +typedef unsigned long long loff_t; +#define _CLOCK_T_DEFINED +typedef unsigned long long clock_t; +typedef long long int64_t; +typedef long long longlong_t; +typedef unsigned char uchar_t; +typedef unsigned int uint_t; +typedef unsigned int uint32_t; +typedef unsigned short ushort_t; +typedef void *spinlock_t; +typedef long long offset_t; +typedef long long off_t; +typedef struct timespec timestruc_t; /* definition per SVr4 */ +typedef struct timespec timespec_t; +typedef ulong_t pgcnt_t; +typedef unsigned int mode_t ; +//typedef uint32_t dev32_t; +#define NODEV32 (dev32_t)(-1) +typedef uint_t minor_t; +typedef char *caddr_t; +typedef unsigned char uint8_t; +typedef char int8_t; +typedef short int int16_t; +typedef unsigned short int uint16_t; +//#include_next +#include +#include +//#include +typedef unsigned long long uid_t; +typedef unsigned long long gid_t; +typedef unsigned int pid_t; +// size_t is 32bit on IllumOS, but 64bit on windows, so changed to uint32_t +typedef uintptr_t pc_t; +typedef uint64_t ssize_t; +typedef uint64_t vm_offset_t; +typedef uint64_t dev_t; +#define NGROUPS 16 + +typedef unsigned short umode_t; +typedef uint64_t user_addr_t; +typedef uint64_t user_size_t; +typedef uint64_t ino64_t; + +typedef unsigned long u_long; +typedef unsigned char uuid_t[16]; + + +// Yeah nothing is going to work until we fix this atomic stuff +#define _Atomic + +#define PATH_MAX 1024 +#define Z_OK 0 + +struct buf; +typedef struct buf buf_t; +typedef unsigned int uInt; +#include +#include +#include +#include +#include +#include + + +#define snprintf _snprintf +#define vprintf(...) vKdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, __VA_ARGS__)) +#define vsnprintf _vsnprintf + +#ifndef ULLONG_MAX +#define ULLONG_MAX (~0ULL) +#endif + +#ifndef LLONG_MAX +#define LLONG_MAX ((long long)(~0ULL>>1)) +#endif + +#if 0 +typedef unsigned long intptr_t; +typedef unsigned long long u_offset_t; +typedef struct task_struct kthread_t; +typedef struct task_struct proc_t; +typedef struct vmem { } vmem_t; +typedef struct timespec timestruc_t; /* definition per SVr4 */ +typedef struct timespec timespec_t; +typedef u_longlong_t len_t; +typedef longlong_t diskaddr_t; +typedef ushort_t o_bcopy; +typedef uint_t major_t; +typedef ulong_t pfn_t; +typedef long spgcnt_t; +typedef short index_t; +typedef int id_t; +typedef unsigned short mode_t; +extern proc_t p0; +#endif + +#include +#define FREAD 0x0001 +#define FWRITE 0x0002 + +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FEXCL O_EXCL +#define FNOCTTY O_NOCTTY +//#define FASYNC O_SYNC +#define FNOFOLLOW O_NOFOLLOW +#define FAPPEND O_APPEND + + +#define FSYNC 0x10 /* file (data+inode) integrity while writing */ +#define FDSYNC 0x40 /* file data only integrity while writing */ +#define FRSYNC 0x8000 /* sync read operations at same level of */ + /* integrity as specified for writes by */ + /* FSYNC and FDSYNC flags */ +#define FOFFMAX 0x2000 /* large file */ + +#define EXPORT_SYMBOL(X) +#define module_param(X,Y,Z) +#define MODULE_PARM_DESC(X,Y) + +#define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ +#define B_READ 0x00000001 /* Read buffer. */ +#define B_ASYNC 0x00000002 /* Start I/O, do not wait. */ +#define B_NOCACHE 0x00000004 /* Do not cache block after use. 
*/ +#define B_PHYS 0x00000020 /* I/O to user memory. */ +#define B_PASSIVE 0x00000800 /* PASSIVE I/Os are ignored by THROTTLE I/O */ +#define B_BUSY B_PHYS + +#ifdef __GNUC__ +#define member_type(type, member) __typeof__ (((type *)0)->member) +#else +#define member_type(type, member) void +#endif + +#define container_of(ptr, type, member) ((type *)( \ + (char *)(member_type(type, member) *){ ptr } - offsetof(type, member))) + +#define bzero(b,len) (memset((b), '\0', (len))) +#define bcopy(b1,b2,len) (memmove((b2), (b1), (len))) +#define bcmp(b1, b2, len) (memcmp((b2), (b1), (len))) +//int snprintf(char *s, int l, char *fmt, ...); + +extern uint32_t strlcpy(register char* s, register const char* t, register uint32_t n); +extern uint32_t strlcat(register char* s, register const char* t, register uint32_t n); +#define strtok_r strtok_s +#define strcasecmp _stricmp + +#define htonl _byteswap_ulong + +struct mount; +typedef struct mount mount_t; + +#define always_inline __forceinline +#define __attribute__ + +struct kauth_cred; +typedef struct kauth_cred kauth_cred_t; +struct kauth_acl; +typedef struct kauth_acl kauth_acl_t; +#define KAUTH_FILESEC_NONE ((kauth_filesec_t)0) + +struct kauth_ace_rights; +typedef struct kauth_ace_rights kauth_ace_rights_t; + +extern int groupmember(gid_t gid, kauth_cred_t *cred); + +typedef struct { +#define KAUTH_GUID_SIZE 16 /* 128-bit identifier */ + unsigned char g_guid[KAUTH_GUID_SIZE]; +} guid_t; + +#pragma warning( disable: 4296 ) // expression is always true +#pragma error( disable: 4296 ) // expression is always true +#pragma warning( disable: 4703 ) // potentially uninitialized local pointer variable + +#define LINK_MAX 32767 /* max file link count */ + +#define FNV1_32A_INIT ((uint32_t)0x811c9dc5) +uint32_t fnv_32a_str(const char *str, uint32_t hval); +uint32_t fnv_32a_buf(void *buf, size_t len, uint32_t hval); + +#endif /* _SPL_TYPES_H */ diff --git a/include/os/windows/spl/sys/types32.h b/include/os/windows/spl/sys/types32.h new file mode 100644 index 000000000000..44634fad098f --- /dev/null +++ b/include/os/windows/spl/sys/types32.h @@ -0,0 +1,13 @@ + +#ifndef _SPL_TYPES32_H +#define _SPL_TYPES32_H + +#include +#include + +typedef uint32_t caddr32_t; +typedef int32_t daddr32_t; +typedef int32_t time32_t; +typedef uint32_t size32_t; + +#endif /* _SPL_TYPES32_H */ diff --git a/include/os/windows/spl/sys/ubc.h b/include/os/windows/spl/sys/ubc.h new file mode 100644 index 000000000000..5aaa3314dc97 --- /dev/null +++ b/include/os/windows/spl/sys/ubc.h @@ -0,0 +1,5 @@ +#ifndef UBC_H_INCLUDED +#define UBC_H_INCLUDED + + +#endif diff --git a/include/os/windows/spl/sys/uio.h b/include/os/windows/spl/sys/uio.h new file mode 100644 index 000000000000..006a0132a358 --- /dev/null +++ b/include/os/windows/spl/sys/uio.h @@ -0,0 +1,189 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014 Garrett D'Amore + * + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2017 Jorgen Lundman + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + + /* + * Copyright (c) 2017 Jorgen Lundman + */ + + +#ifndef _SYS_UIO_H +#define _SYS_UIO_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* + * I/O parameter information. A uio structure describes the I/O which + * is to be performed by an operation. Typically the data movement will + * be performed by a routine such as uiomove(), which updates the uio + * structure to reflect what was done. + */ + +typedef struct iovec { + void *iov_base; + size_t iov_len; +} iovec_t; + + +/* +* I/O direction. +*/ +typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t; + +/* + * Segment flag values. + */ +typedef enum uio_seg { UIO_USERSPACE, UIO_SYSSPACE, UIO_USERISPACE } uio_seg_t; + + +typedef struct uio { + iovec_t *uio_iov; /* pointer to array of iovecs */ + int uio_iovcnt; /* number of iovecs */ + loff_t uio_offset; /* file offset */ + uio_seg_t uio_segflg; /* address space (kernel or user) */ + loff_t uio_limit; /* u-limit (maximum byte offset) */ + ssize_t uio_resid; /* residual count */ + enum uio_rw uio_rw; + int uio_max_iovs; /* max number of iovecs this uio_t can hold */ + uint32_t uio_index; /* Current index */ +} uio_t; + + +#if defined(_KERNEL) || defined(_FAKE_KERNEL) + +uio_t *uio_create(int iovcount, off_t offset, int spacetype, int iodirection); +void uio_free(uio_t *uio); +int uio_addiov(uio_t *uio, user_addr_t baseaddr, user_size_t length); +int uio_isuserspace(uio_t *uio); +int uio_getiov(uio_t *uio, int index, user_addr_t *baseaddr, user_size_t *length); +int uio_iovcnt(uio_t *uio); +off_t uio_offset(uio_t *uio); +void uio_update(uio_t *uio, user_size_t count); +uint64_t uio_resid(uio_t *uio); +user_addr_t uio_curriovbase(uio_t *uio); +user_size_t uio_curriovlen(uio_t *uio); +void uio_setoffset(uio_t *uio, off_t a_offset); +uio_t *uio_duplicate(uio_t *uio); +int uio_rw(uio_t *a_uio); +void uio_setrw(uio_t *a_uio, int a_value); + +int uiomove(void *, uint32_t, enum uio_rw, struct uio *); +int spl_uiomove(const uint8_t *, uint32_t, struct uio *); +int uiocopy(void *, uint32_t, enum uio_rw, struct uio *, uint64_t *); +void uioskip(struct uio *, uint32_t); +int uiodup(struct uio *, struct uio *, iovec_t *, int); + +#endif /* defined(_KERNEL) */ + +// xuio struct is not used in this platform, but we define it +// to allow compilation and easier patching +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY, +} xuio_type_t; + + +#define UIOA_IOV_MAX 16 + +typedef struct uioa_page_s { + int uioa_pfncnt; + void **uioa_ppp; + caddr_t uioa_base; + size_t uioa_len; +} uioa_page_t; + +typedef struct xuio { + uio_t *xu_uio; + enum xuio_type xu_type; 
+ union { + struct { + uint32_t xu_a_state; + ssize_t xu_a_mbytes; + uioa_page_t *xu_a_lcur; + void **xu_a_lppp; + void *xu_a_hwst[4]; + uioa_page_t xu_a_locked[UIOA_IOV_MAX]; + } xu_aio; + struct { + int xu_zc_rw; + void *xu_zc_priv; + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv +#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + +/* +* same as uiomove() but doesn't modify uio structure. +* return in cbytes how many bytes were copied. +*/ +static inline int uiocopy(const char *p, uint32_t n, enum uio_rw rw, struct uio *uio, uint64_t *cbytes) \ +{ \ +int result; \ +struct uio *nuio = uio_duplicate(uio); \ + unsigned long long x = uio_resid(uio); \ + if (!nuio) return ENOMEM; \ + uio_setrw(nuio, rw); +\ +result = spl_uiomove(p, n, nuio); \ +*cbytes = (x - uio_resid(nuio)); \ +uio_free(nuio); \ +return result; \ +} + +// Apple's uiomove puts the uio_rw in uio_create +#define uiomove(A,B,C,D) spl_uiomove((A),(B),(D)) +#define uioskip(A,B) uio_update((A), (B)) + + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UIO_H */ diff --git a/include/os/windows/spl/sys/unistd.h b/include/os/windows/spl/sys/unistd.h new file mode 100644 index 000000000000..dd7c6c2d883d --- /dev/null +++ b/include/os/windows/spl/sys/unistd.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_UNISTD_H +#define _SPL_UNISTD_H + +//#include_next + +#endif /* SPL_UNISTD_H */ diff --git a/include/os/windows/spl/sys/utsname.h b/include/os/windows/spl/sys/utsname.h new file mode 100644 index 000000000000..0ded2a3d3957 --- /dev/null +++ b/include/os/windows/spl/sys/utsname.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + +#ifndef _SPL_UTSNAME_H +#define _SPL_UTSNAME_H + +#define _SYS_NMLN 257 +struct utsname { + char sysname[_SYS_NMLN]; + char nodename[_SYS_NMLN]; + char release[_SYS_NMLN]; + char version[_SYS_NMLN]; + char machine[_SYS_NMLN]; +}; + +extern struct utsname utsname; + +#endif /* SPL_UTSNAME_H */ diff --git a/include/os/windows/spl/sys/va_list.h b/include/os/windows/spl/sys/va_list.h new file mode 100644 index 000000000000..a35023cf3033 --- /dev/null +++ b/include/os/windows/spl/sys/va_list.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_VA_LIST_H +#define _SPL_VA_LIST_H + +#endif /* SPL_VA_LIST_H */ diff --git a/include/os/windows/spl/sys/varargs.h b/include/os/windows/spl/sys/varargs.h new file mode 100644 index 000000000000..6a65b9d0657a --- /dev/null +++ b/include/os/windows/spl/sys/varargs.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_VARARGS_H +#define _SPL_VARARGS_H + +#define __va_list va_list + +#endif /* SPL_VARARGS_H */ diff --git a/include/os/windows/spl/sys/vfs.h b/include/os/windows/spl/sys/vfs.h new file mode 100644 index 000000000000..cf068c2625f9 --- /dev/null +++ b/include/os/windows/spl/sys/vfs.h @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef _SPL_ZFS_H +#define _SPL_ZFS_H + +#include +#include + +#define MAXFIDSZ 64 + +typedef struct mount vfs_t; + +//#define LK_NOWAIT 0x00000010 /* do not sleep to await lock */ +#define vn_vfswlock(vp) (0) +#define vn_vfsunlock(vp) +#define VFS_HOLD(vfsp) +#define VFS_RELE(vfsp) + + + +/* + * File identifier. Should be unique per filesystem on a single + * machine. This is typically called by a stateless file server + * in order to generate "file handles". + * + * Do not change the definition of struct fid ... fid_t without + * letting the CacheFS group know about it! They will have to do at + * least two things, in the same change that changes this structure: + * 1. change CFSVERSION in usr/src/uts/common/sys/fs/cachefs_fs.h + * 2. put the old version # in the canupgrade array + * in cachfs_upgrade() in usr/src/cmd/fs.d/cachefs/fsck/fsck.c + * This is necessary because CacheFS stores FIDs on disk. + * + * Many underlying file systems cast a struct fid into other + * file system dependent structures which may require 4 byte alignment. + * Because a fid starts with a short it may not be 4 byte aligned, the + * fid_pad will force the alignment. 
+ */ +#define MAXFIDSZ 64 +#define OLD_MAXFIDSZ 16 + +typedef struct fid { + union { + long fid_pad; + struct { + ushort_t len; /* length of data in bytes */ + char data[MAXFIDSZ]; /* data (variable len) */ + } _fid; + } un; +} fid_t; + + +extern void (*mountroot_post_hook)(void); + +#endif /* SPL_ZFS_H */ diff --git a/include/os/windows/spl/sys/vfs_opreg.h b/include/os/windows/spl/sys/vfs_opreg.h new file mode 100644 index 000000000000..b795ae71d17d --- /dev/null +++ b/include/os/windows/spl/sys/vfs_opreg.h @@ -0,0 +1,5 @@ + +#ifndef _SPL_OPREG_H +#define _SPL_OPREG_H + +#endif /* SPL_OPREG_H */ diff --git a/include/os/windows/spl/sys/vmem.h b/include/os/windows/spl/sys/vmem.h new file mode 100644 index 000000000000..9ff8b26e0d8a --- /dev/null +++ b/include/os/windows/spl/sys/vmem.h @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VMEM_H +#define _SYS_VMEM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +// BGH - Back to 512k users reporting glitching, beachballing etc. +//#define KMEM_QUANTUM (PAGESIZE<<7) + +#define KMEM_QUANTUM (PAGESIZE) // (<<5, 128k, has been running for months (as of 23 sept 2015), lower does glitch + + + + /* + * Per-allocation flags + */ +#define VM_SLEEP 0x00000000 /* same as KM_SLEEP */ +#define VM_NOSLEEP 0x00000001 /* same as KM_NOSLEEP */ +#define VM_PANIC 0x00000002 /* same as KM_PANIC */ +#define VM_PUSHPAGE 0x00000004 /* same as KM_PUSHPAGE */ +#define VM_NORMALPRI 0x00000008 /* same as KM_NORMALPRI */ +#define VM_NODEBUG 0x00000010 /* matches KM_NODE~BUG, not implemented on OSX */ +#define VM_NO_VBA 0x00000020 /* OSX: do not descend to the bucket layer */ +#define VM_KMFLAGS 0x000000ff /* flags that must match KM_* flags */ + +#define VM_BESTFIT 0x00000100 +#define VM_FIRSTFIT 0x00000200 +#define VM_NEXTFIT 0x00000400 + + /* + * The following flags are restricted for use only within the kernel. + * VM_MEMLOAD is for use by the HAT to avoid infinite recursion. + * VM_NORELOC is used by the kernel when static VA->PA mappings are required. + */ +#define VM_MEMLOAD 0x00000800 +#define VM_NORELOC 0x00001000 + + /* + * VM_ABORT requests that vmem_alloc() *ignore* the VM_SLEEP/VM_NOSLEEP flags + * and forgo reaping if the allocation or attempted import, fails. This + * flag is a segkmem-specific flag, and should not be used by anyone else. + */ +#define VM_ABORT 0x00002000 + + /* + * VM_ENDALLOC requests that large addresses be preferred in allocations. + * Has no effect if VM_NEXTFIT is active. 
+ */ +#define VM_ENDALLOC 0x00004000 + +#define VM_FLAGS 0x0000FFFF + + /* + * Arena creation flags + */ +#define VMC_POPULATOR 0x00010000 +#define VMC_NO_QCACHE 0x00020000 /* cannot use quantum caches */ +#define VMC_IDENTIFIER 0x00040000 /* not backed by memory */ + // VMC_XALLOC 0x00080000 below + // VMC_XALIGN 0x00100000 below +#define VMC_DUMPSAFE 0x00200000 /* can use alternate dump memory */ + // KMC_IDENTIFIER == 0x00400000 + // KMC_PREFILL == 0x00800000 +#define VMC_TIMEFREE 0x01000000 /* keep span creation time, newest spans to front */ +#define VMC_OLDFIRST 0x02000000 /* must accompany VMC_TIMEFREE, oldest spans to front */ + /* + * internal use only; the import function uses the vmem_ximport_t interface + * and may increase the request size if it so desires. + * VMC_XALIGN, for use with vmem_xcreate, specifies that + * the address returned by the import function will be + * aligned according to the alignment argument. + */ +#define VMC_XALLOC 0x00080000 +#define VMC_XALIGN 0x00100000 +#define VMC_FLAGS 0xFFFF0000 + + /* + * Public segment types + */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 + + /* + * Implementation-private segment types + */ +#define VMEM_SPAN 0x10 +#define VMEM_ROTOR 0x20 +#define VMEM_WALKER 0x40 + + /* + * VMEM_REENTRANT indicates to vmem_walk() that the callback routine may + * call back into the arena being walked, so vmem_walk() must drop the + * arena lock before each callback. The caveat is that since the arena + * isn't locked, its state can change. Therefore it is up to the callback + * routine to handle cases where the segment isn't of the expected type. + * For example, we use this to walk heap_arena when generating a crash dump; + * see segkmem_dump() for sample usage. + */ +#define VMEM_REENTRANT 0x80000000 + + struct vmem; + + typedef struct vmem vmem_t; + typedef void *(vmem_alloc_t)(vmem_t *, uint32_t, int); + typedef void (vmem_free_t)(vmem_t *, void *, uint32_t); + + /* + * Alternate import style; the requested size is passed in a pointer, + * which can be increased by the import function if desired. 
+ */ + typedef void *(vmem_ximport_t)(vmem_t *, uint32_t *, uint32_t, int); + +#ifdef _KERNEL + extern vmem_t *vmem_init(const char *, void *, uint32_t, uint32_t, + vmem_alloc_t *, vmem_free_t *); + extern void vmem_fini(vmem_t *); + extern void vmem_update(void *); + extern int vmem_is_populator(); + extern uint32_t vmem_seg_size; +#endif + + extern vmem_t *vmem_create(const char *, void *, uint32_t, uint32_t, + vmem_alloc_t *, vmem_free_t *, vmem_t *, uint32_t, int); + extern vmem_t *vmem_xcreate(const char *, void *, uint32_t, uint32_t, + vmem_ximport_t *, vmem_free_t *, vmem_t *, uint32_t, int); + extern void vmem_destroy(vmem_t *); + extern void *vmem_alloc(vmem_t *, uint32_t, int); + extern void *vmem_xalloc(vmem_t *, uint32_t, uint32_t, uint32_t, uint32_t, + void *, void *, int); + extern void vmem_free(vmem_t *, void *, uint32_t); + extern void vmem_xfree(vmem_t *, void *, uint32_t); + extern void *vmem_add(vmem_t *, void *, uint32_t, int); + extern int vmem_contains(vmem_t *, void *, uint32_t); + extern void vmem_walk(vmem_t *, int, void (*)(void *, void *, uint32_t), void *); + extern uint32_t vmem_size(vmem_t *, int); + extern uint32_t vmem_size_locked(vmem_t *, int); + extern uint32_t vmem_size_semi_atomic(vmem_t *, int); + extern void vmem_qcache_reap(vmem_t *vmp); + extern int64_t vmem_buckets_size(int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VMEM_H */ diff --git a/include/os/windows/spl/sys/vmem_impl.h b/include/os/windows/spl/sys/vmem_impl.h new file mode 100644 index 000000000000..95a6777f7413 --- /dev/null +++ b/include/os/windows/spl/sys/vmem_impl.h @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1999-2001, 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VMEM_IMPL_H +#define _SYS_VMEM_IMPL_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + + typedef struct vmem_seg vmem_seg_t; + +#define VMEM_STACK_DEPTH 20 + + struct vmem_seg { + /* + * The first four fields must match vmem_freelist_t exactly. + */ + uintptr_t vs_start; /* start of segment (inclusive) */ + uintptr_t vs_end; /* end of segment (exclusive) */ + vmem_seg_t *vs_knext; /* next of kin (alloc, free, span) */ + vmem_seg_t *vs_kprev; /* prev of kin */ + + vmem_seg_t *vs_anext; /* next in arena */ + vmem_seg_t *vs_aprev; /* prev in arena */ + uint8_t vs_type; /* alloc, free, span */ + uint8_t vs_import; /* non-zero if segment was imported */ + uint8_t vs_depth; /* stack depth if KMF_AUDIT active */ + /* + * if VM_FREESORT is set on the arena, then + * this field is set at span creation time. 
+ */ + hrtime_t vs_span_createtime; + /* + * The following fields are present only when KMF_AUDIT is set. + */ + kthread_t *vs_thread; + hrtime_t vs_timestamp; + pc_t vs_stack[VMEM_STACK_DEPTH]; + }; + + typedef struct vmem_freelist { + uintptr_t vs_start; /* always zero */ + uintptr_t vs_end; /* segment size */ + vmem_seg_t *vs_knext; /* next of kin */ + vmem_seg_t *vs_kprev; /* prev of kin */ + } vmem_freelist_t; + +#define VS_SIZE(vsp) ((vsp)->vs_end - (vsp)->vs_start) + + /* + * Segment hashing + */ +#define VMEM_HASH_INDEX(a, s, q, m) \ +((((a) + ((a) >> (s)) + ((a) >> ((s) << 1))) >> (q)) & (m)) + +#define VMEM_HASH(vmp, addr) \ +(&(vmp)->vm_hash_table[VMEM_HASH_INDEX(addr, \ +(vmp)->vm_hash_shift, (vmp)->vm_qshift, (vmp)->vm_hash_mask)]) + +#define VMEM_QCACHE_SLABSIZE(max) \ + MAX(1 << highbit(3 * (max)), 64) + +#define VMEM_NAMELEN 30 +#define VMEM_HASH_INITIAL 16 +#define VMEM_NQCACHE_MAX 16 +#define VMEM_FREELISTS (sizeof (void *) * 8) + + typedef struct vmem_kstat { + kstat_named_t vk_mem_inuse; /* memory in use */ + kstat_named_t vk_mem_import; /* memory imported */ + kstat_named_t vk_mem_total; /* total memory in arena */ + kstat_named_t vk_source_id; /* vmem id of vmem source */ + kstat_named_t vk_alloc; /* number of allocations */ + kstat_named_t vk_free; /* number of frees */ + kstat_named_t vk_wait; /* number of allocations that waited */ + kstat_named_t vk_fail; /* number of allocations that failed */ + kstat_named_t vk_lookup; /* hash lookup count */ + kstat_named_t vk_search; /* freelist search count */ + kstat_named_t vk_populate_fail; /* populates that failed */ + kstat_named_t vk_contains; /* vmem_contains() calls */ + kstat_named_t vk_contains_search; /* vmem_contains() search cnt */ + kstat_named_t vk_parent_alloc; /* called the source allocator */ + kstat_named_t vk_parent_free; /* called the source free function */ + kstat_named_t vk_threads_waiting; /* threads in cv_wait in vmem allocator function */ + kstat_named_t vk_excess; /* count of retained excess imports */ + } vmem_kstat_t; + + struct vmem { + char vm_name[VMEM_NAMELEN]; /* arena name */ + kcondvar_t vm_cv; /* cv for blocking allocations */ + kmutex_t vm_lock; /* arena lock */ + uint32_t vm_id; /* vmem id */ + hrtime_t vm_createtime; + uint32_t vm_mtbf; /* induced alloc failure rate */ + int vm_cflags; /* arena creation flags */ + int vm_qshift; /* log2(vm_quantum) */ + uint32_t vm_quantum; /* vmem quantum */ + uint32_t vm_qcache_max; /* maximum size to front by kmem */ + uint32_t vm_min_import; /* smallest amount to import */ + void *(*vm_source_alloc)(vmem_t *, uint32_t, int); + void (*vm_source_free)(vmem_t *, void *, uint32_t); + vmem_t *vm_source; /* vmem source for imported memory */ + vmem_t *vm_next; /* next in vmem_list */ + kstat_t *vm_ksp; /* kstat */ + ssize_t vm_nsegfree; /* number of free vmem_seg_t's */ + vmem_seg_t *vm_segfree; /* free vmem_seg_t list */ + vmem_seg_t **vm_hash_table; /* allocated-segment hash table */ + uint32_t vm_hash_mask; /* hash_size - 1 */ + uint32_t vm_hash_shift; /* log2(vm_hash_mask + 1) */ + ulong_t vm_freemap; /* bitmap of non-empty freelists */ + vmem_seg_t vm_seg0; /* anchor segment */ + vmem_seg_t vm_rotor; /* rotor for VM_NEXTFIT allocations */ + vmem_seg_t *vm_hash0[VMEM_HASH_INITIAL]; /* initial hash table */ + void *vm_qcache[VMEM_NQCACHE_MAX]; /* quantum caches */ + vmem_freelist_t vm_freelist[VMEM_FREELISTS + 1]; /* power-of-2 flists */ + vmem_kstat_t vm_kstat; /* kstat data */ + }; + typedef struct vmem vmem_t; +#ifdef __cplusplus +} +#endif + 
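To illustrate the arena interfaces declared in sys/vmem.h above, here is a hypothetical VMC_IDENTIFIER arena used as an ID allocator: the "addresses" it hands out are not backed by memory, only by the number space given at creation. The my_id_* names are illustrative only:

static vmem_t *my_id_arena;

static void
my_id_init(void)
{
	/* IDs 1..999, quantum of 1, no backing source arena. */
	my_id_arena = vmem_create("my_ids", (void *)(uintptr_t)1, 999, 1,
	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
}

static uint32_t
my_id_alloc(void)
{
	/* The returned "address" is the identifier itself. */
	return ((uint32_t)(uintptr_t)vmem_alloc(my_id_arena, 1, VM_SLEEP));
}

static void
my_id_free(uint32_t id)
{
	vmem_free(my_id_arena, (void *)(uintptr_t)id, 1);
}

static void
my_id_fini(void)
{
	vmem_destroy(my_id_arena);
}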
+#endif /* _SYS_VMEM_IMPL_H */ diff --git a/include/os/windows/spl/sys/vmsystm.h b/include/os/windows/spl/sys/vmsystm.h new file mode 100644 index 000000000000..c9569412c88c --- /dev/null +++ b/include/os/windows/spl/sys/vmsystm.h @@ -0,0 +1,7 @@ + +#ifndef _SPL_VMSYSTM_H +#define _SPL_VMSYSTM_H + +#include + +#endif /* SPL_VMSYSTM_H */ diff --git a/include/os/windows/spl/sys/vnode.h b/include/os/windows/spl/sys/vnode.h new file mode 100644 index 000000000000..88f88f2ab43a --- /dev/null +++ b/include/os/windows/spl/sys/vnode.h @@ -0,0 +1,583 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#ifndef _SPL_VNODE_H +#define _SPL_VNODE_H + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#include +#include + +/* Enable to track all IOCOUNT */ +//#define DEBUG_IOCOUNT + + +/* + * Lets define a vnode struct that will hold everything needed for Windows + * request to be handled. + */ +#define VNODE_DEAD 1 +#define VNODE_MARKTERM 2 +#define VNODE_NEEDINACTIVE 4 +#define VNODE_MARKROOT 8 +#define VNODE_SIZECHANGE 16 +#define VNODE_EASIZE 32 +#define VNODE_FLUSHING 64 +#define VNODE_VALIDBITS 127 + +/* v_unlink flags */ +#define UNLINK_DELETE_ON_CLOSE (1 << 0) // 1 +#define UNLINK_DELETED (1 << 1) // 2 + +#include +typedef struct vnode_fileobjects { + avl_node_t avlnode; + void *fileobject; +} vnode_fileobjects_t; + + +#pragma pack(8) +struct vnode { + // Windows specific header, has to be first. + FSRTL_ADVANCED_FCB_HEADER FileHeader; + // Mutex for locking access to FileHeader. + FAST_MUTEX AdvancedFcbHeaderMutex; + // mmap file access struct + SECTION_OBJECT_POINTERS SectionObjectPointers; + + // Our implementation data fields + // KSPIN_LOCK v_spinlock; + kmutex_t v_mutex; + + mount_t *v_mount; + uint32_t v_flags; + uint32_t v_iocount; // Short term holds + uint32_t v_usecount; // Long term holds + uint32_t v_type; + uint32_t v_unlink; + uint32_t v_unused; + void *v_data; + uint64_t v_id; + uint64_t v_easize; + hrtime_t v_age; // How long since entered DEAD + + // Other Windows entries + // Must be 8byte aligned + ERESOURCE resource; // Holder for FileHeader.Resource + ERESOURCE pageio_resource; // Holder for FileHeader.PageIoResource + FILE_LOCK lock; + SECURITY_DESCRIPTOR *security_descriptor; + SHARE_ACCESS share_access; + + list_node_t v_list; // vnode_all_list member node. 
+ + avl_tree_t v_fileobjects; // All seen FOs that point to this +}; +typedef struct vnode vnode_t; +#pragma pack() + +struct vfs_context; +typedef struct vfs_context vfs_context_t; + +struct caller_context; +typedef struct caller_context caller_context_t; +typedef int vcexcl_t; + +enum vcexcl { NONEXCL, EXCL }; + +#define VSUID 0x800 /*04000*/ /* set user id on execution */ +#define VSGID 0x400 /*02000*/ /* set group id on execution */ +#define VSVTX 0x200 /*01000*/ /* save swapped text even after use */ +#define VREAD 0x100 /*00400*/ /* read, write, execute permissions */ +#define VWRITE 0x080 /*00200*/ +#define VEXEC 0x040 /*00100*/ + +/* +* Vnode types. VNON means no type. +*/ +enum vtype { + /* 0 */ + VNON, + /* 1 - 5 */ + VREG, VDIR, VBLK, VCHR, VLNK, + /* 6 - 10 */ + VSOCK, VFIFO, VBAD, VSTR, VCPLX +}; + +extern enum vtype iftovt_tab[]; +extern int vttoif_tab[]; + +#define IFTOVT(mode) (iftovt_tab[((mode)& S_IFMT) >> 12]) +#define VTTOIF(indx) (vttoif_tab[(int)(indx)]) +#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) + + +/* + * OSX uses separate vnop getxattr and setxattr to deal with XATTRs, so + * we never get vop&XVATTR set from VFS. All internal checks for it in + * ZFS is not required. + */ +#define ATTR_XVATTR 0 +#define AT_XVATTR ATTR_XVATTR + +#define B_INVAL 0x01 +#define B_TRUNC 0x02 + +#define DNLC_NO_VNODE (struct vnode *)(-1) + + +#define IS_DEVVP(vp) \ + (vnode_ischr(vp) || vnode_isblk(vp) || vnode_isfifo(vp)) + + + +#define VNODE_ATTR_va_rdev (1LL << 0) /* 00000001 */ +#define VNODE_ATTR_va_nlink (1LL << 1) /* 00000002 */ +#define VNODE_ATTR_va_total_size (1LL << 2) /* 00000004 */ +#define VNODE_ATTR_va_total_alloc (1LL << 3) /* 00000008 */ +#define VNODE_ATTR_va_data_size (1LL << 4) /* 00000010 */ +#define VNODE_ATTR_va_data_alloc (1LL << 5) /* 00000020 */ +#define VNODE_ATTR_va_iosize (1LL << 6) /* 00000040 */ +#define VNODE_ATTR_va_uid (1LL << 7) /* 00000080 */ +#define VNODE_ATTR_va_gid (1LL << 8) /* 00000100 */ +#define VNODE_ATTR_va_mode (1LL << 9) /* 00000200 */ +#define VNODE_ATTR_va_flags (1LL << 10) /* 00000400 */ +#define VNODE_ATTR_va_acl (1LL << 11) /* 00000800 */ +#define VNODE_ATTR_va_create_time (1LL << 12) /* 00001000 */ +#define VNODE_ATTR_va_access_time (1LL << 13) /* 00002000 */ +#define VNODE_ATTR_va_modify_time (1LL << 14) /* 00004000 */ +#define VNODE_ATTR_va_change_time (1LL << 15) /* 00008000 */ +#define VNODE_ATTR_va_backup_time (1LL << 16) /* 00010000 */ +#define VNODE_ATTR_va_fileid (1LL << 17) /* 00020000 */ +#define VNODE_ATTR_va_linkid (1LL << 18) /* 00040000 */ +#define VNODE_ATTR_va_parentid (1LL << 19) /* 00080000 */ +#define VNODE_ATTR_va_fsid (1LL << 20) /* 00100000 */ +#define VNODE_ATTR_va_filerev (1LL << 21) /* 00200000 */ +#define VNODE_ATTR_va_gen (1LL << 22) /* 00400000 */ +#define VNODE_ATTR_va_encoding (1LL << 23) /* 00800000 */ +#define VNODE_ATTR_va_type (1LL << 24) /* 01000000 */ +#define VNODE_ATTR_va_name (1LL << 25) /* 02000000 */ +#define VNODE_ATTR_va_uuuid (1LL << 26) /* 04000000 */ +#define VNODE_ATTR_va_guuid (1LL << 27) /* 08000000 */ +#define VNODE_ATTR_va_nchildren (1LL << 28) /* 10000000 */ +#define VNODE_ATTR_va_dirlinkcount (1LL << 29) /* 20000000 */ +#define VNODE_ATTR_va_addedtime (1LL << 30) /* 40000000 */ + +enum rm { RMFILE, RMDIRECTORY }; /* rm or rmdir (remove) */ +enum create { CRCREAT, CRMKNOD, CRMKDIR }; /* reason for create */ + +#define va_mask va_active +#define va_nodeid va_fileid +#define va_nblocks va_filerev + + +/* + * vnode attr translations + */ +#define AT_TYPE 
VNODE_ATTR_va_type +#define AT_MODE VNODE_ATTR_va_mode +#define AT_ACL VNODE_ATTR_va_acl +#define AT_UID VNODE_ATTR_va_uid +#define AT_GID VNODE_ATTR_va_gid +#define AT_ATIME VNODE_ATTR_va_access_time +#define AT_MTIME VNODE_ATTR_va_modify_time +#define AT_CTIME VNODE_ATTR_va_change_time +#define AT_CRTIME VNODE_ATTR_va_create_time +#define AT_SIZE VNODE_ATTR_va_data_size +#define AT_NOSET 0 + +#define va_size va_data_size +#define va_atime va_access_time +#define va_mtime va_modify_time +#define va_ctime va_change_time +#define va_crtime va_create_time +#define va_bytes va_data_size + + +// TBD - this comes from XNU, to assist with compiling right now, but +// this struct should be replaced with whatever we cook up for Windows +struct vnode_attr { + uint64_t va_supported; + uint64_t va_active; + int va_vaflags; + dev_t va_rdev; /* device id (device nodes only) */ + uint64_t va_nlink; /* number of references to this file */ + uint64_t va_total_size; /* size in bytes of all forks */ + uint64_t va_total_alloc; /* disk space used by all forks */ + uint64_t va_data_size; /* size in bytes of the fork managed by current vnode */ + uint64_t va_data_alloc; /* disk space used by the fork managed by current vnode */ + uint32_t va_iosize; /* optimal I/O blocksize */ + + /* file security information */ + uid_t va_uid; /* owner UID */ + gid_t va_gid; /* owner GID */ + mode_t va_mode; /* posix permissions */ + uint32_t va_flags; /* file flags */ + struct kauth_acl *va_acl; /* access control list */ + + struct timespec va_create_time; /* time of creation */ + struct timespec va_access_time; /* time of last access */ + struct timespec va_modify_time; /* time of last data modification */ + struct timespec va_change_time; /* time of last metadata change */ + struct timespec va_backup_time; /* time of last backup */ + + uint64_t va_fileid; /* file unique ID in filesystem */ + uint64_t va_linkid; /* file link unique ID */ + uint64_t va_parentid; /* parent ID */ + uint32_t va_fsid; /* filesystem ID */ + uint64_t va_filerev; /* file revision counter */ /* XXX */ + + enum vtype va_type; /* file type (create only) */ + char * va_name; /* Name for ATTR_CMN_NAME; MAXPATHLEN bytes */ + +}; +typedef struct vnode_attr vattr; +typedef struct vnode_attr vattr_t; + +/* vsa_mask values */ +#define VSA_ACL 0x0001 +#define VSA_ACLCNT 0x0002 +#define VSA_DFACL 0x0004 +#define VSA_DFACLCNT 0x0008 +#define VSA_ACE 0x0010 +#define VSA_ACECNT 0x0020 +#define VSA_ACE_ALLTYPES 0x0040 +#define VSA_ACE_ACLFLAGS 0x0080 /* get/set ACE ACL flags */ + +/* + * component name operations (for VNOP_LOOKUP) + */ +// Unfortunately 'DELETE' is a Win32 define as well. +// We should consider moving all these to VN_* +#define LOOKUP 0 /* perform name lookup only */ +#define CREATE 1 /* setup for file creation */ +#define VN_DELETE 2 /* setup for file deletion */ +#define RENAME 3 /* setup for file renaming */ +#define OPMASK 3 /* mask for operation */ + +/* + * component name operational modifier flags + */ +#define FOLLOW 0x00000040 /* follow symbolic links */ +#define NOTRIGGER 0x10000000 /* don't trigger automounts */ + +/* + * component name parameter descriptors. + */ +#define ISDOTDOT 0x00002000 /* current component name is .. 
*/ +#define MAKEENTRY 0x00004000 /* entry is to be added to name cache */ +#define ISLASTCN 0x00008000 /* this is last component of pathname */ +#define ISWHITEOUT 0x00020000 /* OBSOLETE: found whiteout */ +#define DOWHITEOUT 0x00040000 /* OBSOLETE: do whiteouts */ + + +struct componentname { + uint32_t cn_nameiop; + uint32_t cn_flags; + char *cn_pnbuf; + int cn_pnlen; + char *cn_nameptr; + int cn_namelen; +}; + + + + +extern struct vnode *vn_alloc(int flag); + +extern int vn_open(char *pnamep, enum uio_seg seg, int filemode, + int createmode, + struct vnode **vpp, enum create crwhy, mode_t umask); + +extern int vn_openat(char *pnamep, enum uio_seg seg, int filemode, + int createmode, struct vnode **vpp, enum create crwhy, + mode_t umask, struct vnode *startvp); + +#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) +#define vn_free(vp) do { } while (0) +#define vn_pages_remove(vp,fl,op) do { } while (0) + + + +// OSX kernel has a vn_rdwr, let's work around it. +extern int zfs_vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, + ssize_t len, offset_t offset, enum uio_seg seg, + int ioflag, rlim64_t ulimit, cred_t *cr, + ssize_t *residp); + +#define vn_rdwr(rw, vp, base, len, off, seg, flg, limit, cr, resid) \ + zfs_vn_rdwr((rw), (vp), (base), (len), (off), (seg), (flg), (limit), (cr), (resid)) + +extern int vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag); +extern int vn_rename(char *from, char *to, enum uio_seg seg); + +#define LK_RETRY 0 +#define LK_SHARED 0 +#define VN_UNLOCK( vp ) +static inline int vn_lock(struct vnode *vp, int fl) { return 0; } + + +// KERNEL + +#ifdef DEBUG_IOCOUNT +#define VN_HOLD(vp) vnode_getwithref(vp, __FILE__, __LINE__) +#define VN_RELE(vp) \ + do { \ + if ((vp) && (vp) != DNLC_NO_VNODE) \ + vnode_put(vp, __FILE__, __LINE__); \ + } while (0) +#define vnode_getwithvid(V, ID) vnode_debug_getwithvid((V), (ID), __FILE__, __LINE__) + +#else + +#define VN_HOLD(vp) vnode_getwithref(vp) +#define VN_RELE(vp) \ + do { \ + if ((vp) && (vp) != DNLC_NO_VNODE) \ + vnode_put(vp); \ + } while (0) + +#endif + + + +void spl_rele_async(void *arg); +void vn_rele_async(struct vnode *vp, void *taskq); + +#define VN_RELE_ASYNC(vp,tq) vn_rele_async((vp),(tq)) + +#define vn_exists(vp) +#define vn_is_readonly(vp) vnode_vfsisrdonly(vp) + +#define VATTR_NULL(v) do { } while(0) + +extern int +VOP_CLOSE(struct vnode *vp, int flag, int count, offset_t off, void *cr, void *); +extern int +VOP_FSYNC(struct vnode *vp, int flags, void* unused, void *); +extern int +VOP_SPACE(HANDLE h, int cmd, struct flock *fl, int flags, offset_t off, + cred_t *cr, void *ctx); + +extern int VOP_GETATTR(struct vnode *vp, vattr_t *vap, int flags, void *x3, void *x4); + +#define VOP_UNLOCK(vp,fl) do { } while(0) + +void vfs_mountedfrom(struct mount *vfsp, char *osname); + +extern struct vnode *dnlc_lookup ( struct vnode *dvp, char *name ); +extern int dnlc_purge_vfsp ( struct mount *mp, int flags ); +extern void dnlc_remove ( struct vnode *vp, char *name ); +extern void dnlc_update ( struct vnode *vp, char *name, + struct vnode *tp); + +//#define build_path(A, B, C, D, E, F) spl_build_path(A,B,C,D,E,F) +//extern int spl_build_path(struct vnode *vp, char *buff, int buflen, int *outlen, +// int flags, vfs_context_t ctx); + + +extern struct vnode *rootdir; + +static inline int +chklock(struct vnode *vp, int iomode, unsigned long long offset, ssize_t len, int fmode, void *ct) +{ + return (0); +} + +#define vn_has_cached_data(VP) 0 /*(VTOZ(VP)->z_is_mapped || vnode_isswap(VP) || 
win_has_cached_data(VP))*/ + +static inline int win_has_cached_data(struct vnode *vp) +{ + int ret = 0; + PFILE_OBJECT pfo = CcGetFileObjectFromSectionPtrsRef(&vp->SectionObjectPointers); + if (pfo) { + // Although peeking in this macro, it only looks at SectionPointers, so maybe + // we should do that directly, and skip the FileObject extra? + ret = CcIsFileCached(pfo); + ObDereferenceObject(pfo); + } + return ret; +} + +#if 0 +// Since CcGetFileObjectFromSectionPtrsRef is vista and up, we store FileObject in vp now. +#define vnode_pager_setsize(vp, sz) do { \ + PFILE_OBJECT fileObject = CcGetFileObjectFromSectionPtrsRef(&vp->SectionObjectPointers); \ + if (fileObject != NULL) { \ + CC_FILE_SIZES ccfs; \ + vp->FileHeader.AllocationSize.QuadPart = P2ROUNDUP((sz), PAGE_SIZE); \ + vp->FileHeader.FileSize.QuadPart = (sz); \ + vp->FileHeader.ValidDataLength.QuadPart = (sz); \ + ccfs.AllocationSize = vp->FileHeader.AllocationSize; \ + ccfs.FileSize = vp->FileHeader.FileSize; \ + ccfs.ValidDataLength = vp->FileHeader.ValidDataLength; \ + CcSetFileSizes(fileObject, &ccfs); \ + ObDereferenceObject(fileObject); \ + } \ + } while(0) +#else +#define vnode_pager_setsize(vp, sz) do { \ + vp->FileHeader.AllocationSize.QuadPart = P2ROUNDUP((sz), PAGE_SIZE); \ + vp->FileHeader.FileSize.QuadPart = (sz); \ + vp->FileHeader.ValidDataLength.QuadPart = (sz); \ + vnode_setsizechange(vp, 1); \ + } while(0) +#endif + +#define vn_ismntpt(vp) (vnode_mountedhere(vp) != NULL) + +#if 0 +extern errno_t VOP_LOOKUP (struct vnode *, struct vnode **, + struct componentname *, vfs_context_t); +extern errno_t VOP_MKDIR (struct vnode *, struct vnode **, + struct componentname *, struct vnode_attr *, + vfs_context_t); +extern errno_t VOP_REMOVE (struct vnode *, struct vnode *, + struct componentname *, int, vfs_context_t); +extern errno_t VOP_SYMLINK (struct vnode *, struct vnode **, + struct componentname *, struct vnode_attr *, + char *, vfs_context_t); +#endif +void spl_vnode_fini(void); +int spl_vnode_init(void); + + +extern int spl_vfs_root(mount_t *mount, struct vnode **vp); +#define VFS_ROOT(V, L, VP) spl_vfs_root((V), (VP)) + +extern void cache_purgevfs(mount_t mp); + +int spl_vn_rdwr( + enum uio_rw rw, + struct vnode *vp, + caddr_t base, + ssize_t len, + offset_t offset, + enum uio_seg seg, + int ioflag, + rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */ + cred_t *cr, + ssize_t *residp); + +//vfs_context_t vfs_context_kernel(void); +//vfs_context_t spl_vfs_context_kernel(void); +extern int spl_vnode_notify(struct vnode *vp, uint32_t type, struct vnode_attr *vap); +extern int spl_vfs_get_notify_attributes(struct vnode_attr *vap); +extern void spl_hijack_mountroot(void *func); +extern void spl_setrootvnode(struct vnode *vp); + +struct vnode *getrootdir(void); +void spl_vfs_start(void); + +int vnode_vfsisrdonly(vnode_t *vp); +uint64_t vnode_vid(vnode_t *vp); +int vnode_isreg(vnode_t *vp); +int vnode_isdir(vnode_t *vp); +#ifdef DEBUG_IOCOUNT +int vnode_debug_getwithvid(vnode_t *, uint64_t, char *, int); +int vnode_getwithref(vnode_t *vp, char *, int); +int vnode_put(vnode_t *vp, char *, int); +void vnode_check_iocount(void); +#else +int vnode_getwithvid(vnode_t *, uint64_t); +int vnode_getwithref(vnode_t *vp); +int vnode_put(vnode_t *vp); +#endif + + +void *vnode_fsnode(struct vnode *dvp); +enum vtype vnode_vtype(vnode_t *vp); +int vnode_isblk(vnode_t *vp); +int vnode_ischr(vnode_t *vp); +int vnode_isswap(vnode_t *vp); +int vnode_isfifo(vnode_t *vp); +int vnode_islnk(vnode_t *vp); +mount_t 
*vnode_mountedhere(vnode_t *vp); +void ubc_setsize(struct vnode *, uint64_t); +int vnode_isinuse(vnode_t *vp, uint64_t refcnt); +int vnode_isidle(vnode_t *vp); +int vnode_recycle(vnode_t *vp); +int vnode_isvroot(vnode_t *vp); +mount_t *vnode_mount(vnode_t *vp); +void vnode_clearfsnode(vnode_t *vp); +void vnode_create(mount_t *, void *v_data, int type, int flags, struct vnode **vpp); +int vnode_ref(vnode_t *vp); +void vnode_rele(vnode_t *vp); +void *vnode_sectionpointer(vnode_t *vp); +void *vnode_security(vnode_t *vp); +void vnode_setsecurity(vnode_t *vp, void *sd); +void vnode_couplefileobject(vnode_t *vp, FILE_OBJECT *fileobject, uint64_t size); +void vnode_decouplefileobject(vnode_t *vp, FILE_OBJECT *fileobject); +void vnode_setsizechange(vnode_t *vp, int set); +int vnode_sizechange(vnode_t *vp); +int vnode_isrecycled(vnode_t *vp); + +#define VNODE_READDIR_EXTENDED 1 + +#define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ +#define FORCECLOSE 0x0002 /* vflush: force file closeure */ +#define WRITECLOSE 0x0004 /* vflush: only close writeable files */ +#define SKIPSWAP 0x0008 /* vflush: skip vnodes marked VSWAP */ +#define SKIPROOT 0x0010 /* vflush: skip root vnodes marked VROOT */ +#define VNODELOCKED 0x0100 /* vflush: vnode already locked call to recycle */ +#define NULLVP NULL + +int vflush(struct mount *mp, struct vnode *skipvp, int flags); +int vnode_fileobject_add(vnode_t *vp, void *fo); +int vnode_fileobject_remove(vnode_t *vp, void *fo); +int vnode_fileobject_empty(vnode_t *vp, int locked); + +void vnode_lock(vnode_t *vp); +void vnode_unlock(vnode_t *vp); +int vnode_drain_delayclose(int); +int vnode_easize(struct vnode *vp, uint64_t *size); +void vnode_set_easize(struct vnode *vp, uint64_t size); +void vnode_clear_easize(struct vnode *vp); +int vnode_flushcache(vnode_t *vp, FILE_OBJECT *fileobject, boolean_t ); + +int kernel_ioctl(PDEVICE_OBJECT DeviceObject, long cmd, void *inbuf, uint32_t inlen, + void *outbuf, uint32_t outlen); + +/* Linux TRIM API */ +int blk_queue_discard(PDEVICE_OBJECT dev); +int blk_queue_discard_secure(PDEVICE_OBJECT dev); +int blk_queue_nonrot(PDEVICE_OBJECT dev); +int blkdev_issue_discard_bytes(PDEVICE_OBJECT dev, uint64_t offset, uint64_t size, uint32_t flags); + +#endif /* SPL_VNODE_H */ diff --git a/include/os/windows/spl/sys/zmod.h b/include/os/windows/spl/sys/zmod.h new file mode 100644 index 000000000000..5d9db0a4c031 --- /dev/null +++ b/include/os/windows/spl/sys/zmod.h @@ -0,0 +1,123 @@ +/*****************************************************************************\ + * zlib.h -- interface of the 'zlib' general purpose compression library + * version 1.2.5, April 19th, 2010 + * + * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. 
This notice may not be removed or altered from any source distribution. + * + * Jean-loup Gailly + * Mark Adler +\*****************************************************************************/ + +#ifndef _SPL_ZMOD_H +#define _SPL_ZMOD_H + + +#include +#include +#include + + +struct _zmemheader { + uint64_t length; + char data[0]; +}; + +static inline void * +zfs_zalloc(void* opaque, uInt items, uInt size) +{ + struct _zmemheader *hdr; + uint32_t alloc_size = (items * size) + sizeof (uint64_t); + hdr = kmem_zalloc(alloc_size, KM_SLEEP); + hdr->length = alloc_size; + return (&hdr->data); +} + +static inline void +zfs_zfree(void *opaque, void *addr) +{ + struct _zmemheader *hdr; + hdr = addr; + hdr--; + kmem_free(hdr, (uint32_t)hdr->length); +} + +/* + * Uncompress the buffer 'src' into the buffer 'dst'. The caller must store + * the expected decompressed data size externally so it can be passed in. + * The resulting decompressed size is then returned through dstlen. This + * function return Z_OK on success, or another error code on failure. + */ +static inline int + z_uncompress(void *dst, uint32_t *dstlen, const void *src, uint32_t srclen) +{ + z_stream zs; + int err; + + bzero(&zs, sizeof (zs)); + zs.next_in = (uchar_t *)src; + zs.avail_in = srclen; + zs.next_out = dst; + zs.avail_out = *dstlen; + zs.zalloc = zfs_zalloc; + zs.zfree = zfs_zfree; + if ((err = inflateInit(&zs)) != Z_OK) + return (err); + if ((err = inflate(&zs, Z_FINISH)) != Z_STREAM_END) { + (void) inflateEnd(&zs); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *dstlen = zs.total_out; + return (inflateEnd(&zs)); +} + +static inline int +z_compress_level(void *dst, uint32_t *dstlen, const void *src, uint32_t srclen, + int level) +{ + z_stream zs; + int err; + bzero(&zs, sizeof (zs)); + zs.next_in = (uchar_t *)src; + zs.avail_in = srclen; + zs.next_out = dst; + zs.avail_out = *dstlen; + zs.zalloc = zfs_zalloc; + zs.zfree = zfs_zfree; + if ((err = deflateInit(&zs, level)) != Z_OK) + return (err); + if ((err = deflate(&zs, Z_FINISH)) != Z_STREAM_END) { + (void) deflateEnd(&zs); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *dstlen = zs.total_out; + return (deflateEnd(&zs)); +} + +static inline int +z_compress(void *dst, uint32_t *dstlen, const void *src, uint32_t srclen) +{ + return (z_compress_level(dst, dstlen, src, srclen, + Z_DEFAULT_COMPRESSION)); +} + + +int spl_zlib_init(void); +void spl_zlib_fini(void); + +#endif /* SPL_ZMOD_H */ diff --git a/include/os/windows/spl/sys/zone.h b/include/os/windows/spl/sys/zone.h new file mode 100644 index 000000000000..18b06215e567 --- /dev/null +++ b/include/os/windows/spl/sys/zone.h @@ -0,0 +1,11 @@ + +#ifndef _SPL_ZONE_H +#define _SPL_ZONE_H + +#include + +#define zone_dataset_visible(x, y) (1) +#define INGLOBALZONE(z) (1) + +static inline unsigned long long getzoneid(void) { return 0; } +#endif /* SPL_ZONE_H */ diff --git a/include/os/windows/spl/unistd.h b/include/os/windows/spl/unistd.h new file mode 100644 index 000000000000..c6b298a342f1 --- /dev/null +++ b/include/os/windows/spl/unistd.h @@ -0,0 +1,4 @@ +#ifndef _SPL_UNISTD_H +#define _SPL_UNISTD_H + +#endif /* SPL_UNISTD_H */ diff --git a/include/os/windows/zfs/sys/spa.h b/include/os/windows/zfs/sys/spa.h new file mode 100644 index 000000000000..7738112c5882 --- /dev/null +++ b/include/os/windows/zfs/sys/spa.h @@ -0,0 +1,1192 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. + */ + +#ifndef _SYS_SPA_H +#define _SYS_SPA_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Forward references that lots of things need. + */ +typedef struct spa spa_t; +typedef struct vdev vdev_t; +typedef struct metaslab metaslab_t; +typedef struct metaslab_group metaslab_group_t; +typedef struct metaslab_class metaslab_class_t; +typedef struct zio zio_t; +typedef struct zilog zilog_t; +typedef struct spa_aux_vdev spa_aux_vdev_t; +typedef struct ddt ddt_t; +typedef struct ddt_entry ddt_entry_t; +typedef struct zbookmark_phys zbookmark_phys_t; + +struct dsl_pool; +struct dsl_dataset; +struct dsl_crypto_params; + +/* + * General-purpose 32-bit and 64-bit bitfield encodings. + */ +#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) +#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) +#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) +#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) + +#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) +#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) + +#define BF32_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1U << (len)); \ + ASSERT3U(low + len, <=, 32); \ + (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ +_NOTE(CONSTCOND) } while (0) + +#define BF64_SET(x, low, len, val) do { \ + ASSERT3U(val, <, 1ULL << (len)); \ + ASSERT3U(low + len, <=, 64); \ + ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ +_NOTE(CONSTCOND) } while (0) + +#define BF32_GET_SB(x, low, len, shift, bias) \ + ((BF32_GET(x, low, len) + (bias)) << (shift)) +#define BF64_GET_SB(x, low, len, shift, bias) \ + ((BF64_GET(x, low, len) + (bias)) << (shift)) + +#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) +#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) + +/* + * We currently support block sizes from 512 bytes to 16MB. 
+ * The benefits of larger blocks, and thus larger IO, need to be weighed + * against the cost of COWing a giant block to modify one byte, and the + * large latency of reading or writing a large block. + * + * Note that although blocks up to 16MB are supported, the recordsize + * property can not be set larger than zfs_max_recordsize (default 1MB). + * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * + * Note that although the LSIZE field of the blkptr_t can store sizes up + * to 32MB, the dnode's dn_datablkszsec can only store sizes up to + * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. + */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_OLD_MAXBLOCKSHIFT 17 +#define SPA_MAXBLOCKSHIFT 24 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +/* + * Size of block to hold the configuration data (a packed nvlist) + */ +#define SPA_CONFIG_BLOCKSIZE (1ULL << 14) + +/* + * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. + * The ASIZE encoding should be at least 64 times larger (6 more bits) + * to support up to 4-way RAID-Z mirror mode with worst-case gang block + * overhead, three DVAs per bp, plus one more bit in case we do anything + * else that expands the ASIZE. + */ +#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ +#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ +#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ + +#define SPA_COMPRESSBITS 7 +#define SPA_VDEVBITS 24 + +/* + * All SPA data is represented by 128-bit data virtual addresses (DVAs). + * The members of the dva_t should be considered opaque outside the SPA. + */ +typedef struct dva { + uint64_t dva_word[2]; +} dva_t; + +/* + * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. + */ +typedef struct zio_cksum { + uint64_t zc_word[4]; +} zio_cksum_t; + +/* + * Some checksums/hashes need a 256-bit initialization salt. This salt is kept + * secret and is suitable for use in MAC algorithms as the key. + */ +typedef struct zio_cksum_salt { + uint8_t zcs_bytes[32]; +} zio_cksum_salt_t; + +/* + * Each block is described by its DVAs, time of birth, checksum, etc. 
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | pad | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | pad | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | pad | vdev3 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 |G| offset3 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | physical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | checksum[2] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | checksum[3] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * vdev virtual device ID + * offset offset into virtual device + * LSIZE logical size + * PSIZE physical size (after compression) + * ASIZE allocated size (including RAID-Z parity and gang block headers) + * GRID RAID-Z layout information (reserved for future use) + * cksum checksum function + * comp compression function + * G gang block indicator + * B byteorder (endianness) + * D dedup + * X encryption + * E blkptr_t contains embedded data (see below) + * lvl level of indirection + * type DMU object type + * phys birth txg when dva[0] was written; zero if same as logical birth txg + * note that typically all the dva's would be written in this + * txg, but they could be different if they were moved by + * device removal. + * log. birth transaction group in which the block was logically born + * fill count number of non-zero blocks under this bp + * checksum[4] 256-bit checksum of the data this bp describes + */ + +/* + * The blkptr_t's of encrypted blocks also need to store the encryption + * parameters so that the block can be decrypted. 
This layout is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | salt | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 | IV1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | physical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | IV2 | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | MAC[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | MAC[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * salt Salt for generating encryption keys + * IV1 First 64 bits of encryption IV + * X Block requires encryption handling (set to 1) + * E blkptr_t contains embedded data (set to 0, see below) + * fill count number of non-zero blocks under this bp (truncated to 32 bits) + * IV2 Last 32 bits of encryption IV + * checksum[2] 128-bit checksum of the data this bp describes + * MAC[2] 128-bit message authentication code for this data + * + * The X bit being set indicates that this block is one of 3 types. If this is + * a level 0 block with an encrypted object type, the block is encrypted + * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted + * object type, this block is authenticated with an HMAC (see + * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC + * words to store a checksum-of-MACs from the level below (see + * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED() + * refers to both encrypted and authenticated blocks and BP_USES_CRYPT() + * refers to any of these 3 kinds of blocks. + * + * The additional encryption parameters are the salt, IV, and MAC which are + * explained in greater detail in the block comment at the top of zio_crypt.c. + * The MAC occupies half of the checksum space since it serves a very similar + * purpose: to prevent data corruption on disk. The only functional difference + * is that the checksum is used to detect on-disk corruption whether or not the + * encryption key is loaded and the MAC provides additional protection against + * malicious disk tampering. We use the 3rd DVA to store the salt and first + * 64 bits of the IV. As a result encrypted blocks can only have 2 copies + * maximum instead of the normal 3. The last 32 bits of the IV are stored in + * the upper bits of what is usually the fill count. 
Note that only blocks at + * level 0 or -2 are ever encrypted, which allows us to guarantee that these + * 32 bits are not trampled over by other code (see zio_crypt.c for details). + * The salt and IV are not used for authenticated bps or bps with an indirect + * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits + * for the fill count. + */ + +/* + * "Embedded" blkptr_t's don't actually point to a block, instead they + * have a data payload embedded in the blkptr_t itself. See the comment + * in blkptr.c for more details. + * + * The blkptr_t is laid out as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | payload | + * 1 | payload | + * 2 | payload | + * 3 | payload | + * 4 | payload | + * 5 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | payload | + * 8 | payload | + * 9 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | payload | + * c | payload | + * d | payload | + * e | payload | + * f | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * payload contains the embedded data + * B (byteorder) byteorder (endianness) + * D (dedup) padding (set to zero) + * X encryption (set to zero; see above) + * E (embedded) set to one + * lvl indirection level + * type DMU object type + * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) + * comp compression function of payload + * PSIZE size of payload after compression, in bytes + * LSIZE logical size of payload, in bytes + * note that 25 bits is enough to store the largest + * "normal" BP's LSIZE (2^16 * 2^9) in bytes + * log. birth transaction group in which the block was logically born + * + * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded + * bp's they are stored in units of SPA_MINBLOCKSHIFT. + * Generally, the generic BP_GET_*() macros can be used on embedded BP's. + * The B, D, X, lvl, type, and comp fields are stored the same as with normal + * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must + * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before + * other macros, as they assert that they are only used on BP's of the correct + * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use + * the payload space for encryption parameters (see the comment above on + * how encryption parameters are stored). 
+ */ + +#define BPE_GET_ETYPE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET((bp)->blk_prop, 40, 8)) +#define BPE_SET_ETYPE(bp, t) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop, 40, 8, t); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_LSIZE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) +#define BPE_SET_LSIZE(bp, x) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_PSIZE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) +#define BPE_SET_PSIZE(bp, x) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +typedef enum bp_embedded_type { + BP_EMBEDDED_TYPE_DATA, + BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */ + NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED +} bp_embedded_type_t; + +#define BPE_NUM_WORDS 14 +#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) +#define BPE_IS_PAYLOADWORD(bp, wp) \ + ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + +#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ +#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ +#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ + +/* + * A block is a hole when it has either 1) never been written to, or + * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads + * without physically allocating disk space. Holes are represented in the + * blkptr_t structure by zeroed blk_dva. Correct checking for holes is + * done through the BP_IS_HOLE macro. For holes, the logical size, level, + * DMU object type, and birth times are all also stored for holes that + * were written to at some point (i.e. were punched after having been filled). + */ +typedef struct blkptr { + dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[2]; /* Extra space for the future */ + uint64_t blk_phys_birth; /* txg when block was allocated */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + +/* + * Macros to get and set fields in a bp or DVA. + */ +#define DVA_GET_ASIZE(dva) \ + BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_ASIZE(dva, x) \ + BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ + SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) +#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) + +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) +#define DVA_SET_VDEV(dva, x) \ + BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) + +#define DVA_GET_OFFSET(dva) \ + BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_OFFSET(dva, x) \ + BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) +#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) + +#define BP_GET_LSIZE(bp) \ + (BP_IS_EMBEDDED(bp) ? \ + (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? 
BPE_GET_LSIZE(bp) : 0): \ + BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_LSIZE(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, \ + 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_PSIZE(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_PSIZE(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, \ + 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_COMPRESS(bp) \ + BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) +#define BP_SET_COMPRESS(bp, x) \ + BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) + +#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) +#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) + +#define BP_GET_CHECKSUM(bp) \ + (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ + BF64_GET((bp)->blk_prop, 40, 8)) +#define BP_SET_CHECKSUM(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop, 40, 8, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) + +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) + +/* encrypted, authenticated, and MAC cksum bps use the same bit */ +#define BP_USES_CRYPT(bp) BF64_GET((bp)->blk_prop, 61, 1) +#define BP_SET_CRYPT(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) + +#define BP_IS_ENCRYPTED(bp) \ + (BP_USES_CRYPT(bp) && \ + BP_GET_LEVEL(bp) <= 0 && \ + DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) + +#define BP_IS_AUTHENTICATED(bp) \ + (BP_USES_CRYPT(bp) && \ + BP_GET_LEVEL(bp) <= 0 && \ + !DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) + +#define BP_HAS_INDIRECT_MAC_CKSUM(bp) \ + (BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0) + +#define BP_IS_PROTECTED(bp) \ + (BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp)) + +#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) +#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) + +#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_PHYSICAL_BIRTH(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) + +#define BP_SET_BIRTH(bp, logical, physical) \ +{ \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + (bp)->blk_birth = (logical); \ + (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ +} + +#define BP_GET_FILL(bp) \ + ((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \ + ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)) + +#define BP_SET_FILL(bp, fill) \ +{ \ + if (BP_IS_ENCRYPTED(bp)) \ + BF64_SET((bp)->blk_fill, 0, 32, fill); \ + else \ + (bp)->blk_fill = fill; \ +} + +#define BP_GET_IV2(bp) \ + (ASSERT(BP_IS_ENCRYPTED(bp)), \ + BF64_GET((bp)->blk_fill, 32, 32)) +#define BP_SET_IV2(bp, iv2) \ +{ \ + ASSERT(BP_IS_ENCRYPTED(bp)); \ + BF64_SET((bp)->blk_fill, 32, 32, iv2); \ +} + +#define BP_IS_METADATA(bp) \ + (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) + +#define BP_GET_ASIZE(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + (DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) + +#define BP_GET_UCSIZE(bp) \ + (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) + +#define BP_GET_NDVAS(bp) \ + (BP_IS_EMBEDDED(bp) ? 
0 : \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + (!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) + +#define BP_COUNT_GANG(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ + DVA_GET_GANG(&(bp)->blk_dva[1]) + \ + (DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))) + +#define DVA_EQUAL(dva1, dva2) \ + ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ + (dva1)->dva_word[0] == (dva2)->dva_word[0]) + +#define BP_EQUAL(bp1, bp2) \ + (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ + (bp1)->blk_birth == (bp2)->blk_birth && \ + DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ + DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ + DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) + +#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ + (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ + ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ + ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ + ((zc1).zc_word[3] - (zc2).zc_word[3]))) + +#define ZIO_CHECKSUM_MAC_EQUAL(zc1, zc2) \ + (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ + ((zc1).zc_word[1] - (zc2).zc_word[1]))) + +#define ZIO_CHECKSUM_IS_ZERO(zc) \ + (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \ + (zc)->zc_word[2] | (zc)->zc_word[3])) + +#define ZIO_CHECKSUM_BSWAP(zcp) \ +{ \ + (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \ + (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \ + (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \ + (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \ +} + +#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) + +#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ +{ \ + (zcp)->zc_word[0] = w0; \ + (zcp)->zc_word[1] = w1; \ + (zcp)->zc_word[2] = w2; \ + (zcp)->zc_word[3] = w3; \ +} + +#define MAX_DATA_MAC_LEN 16 +#define MAX_DATA_IV_LEN 12 + +#define ZIO_SET_MAC(bp, mac) \ + bcopy((mac), &(bp)->blk_cksum.zc_word[2], MAX_DATA_MAC_LEN); + +#define ZIO_SET_IV(bp, iv) \ + bcopy((iv), (bp)->blk_iv, MAX_DATA_IV_LEN); + +#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) +#define BP_IS_GANG(bp) \ + (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) +#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ + (dva)->dva_word[1] == 0ULL) +#define BP_IS_HOLE(bp) \ + (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) + +/* BP_IS_RAIDZ(bp) assumes no block compression */ +#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ + BP_GET_PSIZE(bp)) + +#define BP_ZERO(bp) \ +{ \ + (bp)->blk_dva[0].dva_word[0] = 0; \ + (bp)->blk_dva[0].dva_word[1] = 0; \ + (bp)->blk_dva[1].dva_word[0] = 0; \ + (bp)->blk_dva[1].dva_word[1] = 0; \ + (bp)->blk_dva[2].dva_word[0] = 0; \ + (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_prop = 0; \ + (bp)->blk_pad[0] = 0; \ + (bp)->blk_pad[1] = 0; \ + (bp)->blk_phys_birth = 0; \ + (bp)->blk_birth = 0; \ + (bp)->blk_fill = 0; \ + ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ +} + +#ifdef _BIG_ENDIAN +#define ZFS_HOST_BYTEORDER (0ULL) +#else +#define ZFS_HOST_BYTEORDER (1ULL) +#endif + +#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) + +#define BP_SPRINTF_LEN 400 + +/* + * This macro allows code sharing between zfs, libzpool, and mdb. + * 'func' is either snprintf() or mdb_snprintf(). + * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
+ */ +#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, crypt_type, \ + compress) \ +{ \ + static const char *copyname[] = \ + { "zero", "single", "double", "triple" }; \ + int len = 0; \ + int copies = 0; \ + int d; \ + \ + if (bp == NULL) { \ + len += func(buf + len, size - len, ""); \ + } else if (BP_IS_HOLE(bp)) { \ + len += func(buf + len, size - len, \ + "HOLE [L%llu %s] " \ + "size=%llxL birth=%lluL", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)bp->blk_birth); \ + } else if (BP_IS_EMBEDDED(bp)) { \ + len = func(buf + len, size - len, \ + "EMBEDDED [L%llu %s] et=%u %s " \ + "size=%llxL/%llxP birth=%lluL", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + (int)BPE_GET_ETYPE(bp), \ + compress, \ + (u_longlong_t)BPE_GET_LSIZE(bp), \ + (u_longlong_t)BPE_GET_PSIZE(bp), \ + (u_longlong_t)bp->blk_birth); \ + } else { \ + for (d = 0; d < BP_GET_NDVAS(bp); d++) { \ + const dva_t *dva = &bp->blk_dva[d]; \ + if (DVA_IS_VALID(dva)) \ + copies++; \ + len += func(buf + len, size - len, \ + "DVA[%d]=<%llu:%llx:%llx>%c", d, \ + (u_longlong_t)DVA_GET_VDEV(dva), \ + (u_longlong_t)DVA_GET_OFFSET(dva), \ + (u_longlong_t)DVA_GET_ASIZE(dva), \ + ws); \ + } \ + if (BP_IS_ENCRYPTED(bp)) { \ + len += func(buf + len, size - len, \ + "salt=%llx iv=%llx:%llx%c", \ + (u_longlong_t)bp->blk_dva[2].dva_word[0], \ + (u_longlong_t)bp->blk_dva[2].dva_word[1], \ + (u_longlong_t)BP_GET_IV2(bp), \ + ws); \ + } \ + if (BP_IS_GANG(bp) && \ + DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ + DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ + copies--; \ + len += func(buf + len, size - len, \ + "[L%llu %s] %s %s %s %s %s %s %s%c" \ + "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ + "cksum=%llx:%llx:%llx:%llx", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + checksum, \ + compress, \ + crypt_type, \ + BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ + BP_IS_GANG(bp) ? "gang" : "contiguous", \ + BP_GET_DEDUP(bp) ? "dedup" : "unique", \ + copyname[copies], \ + ws, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)BP_GET_PSIZE(bp), \ + (u_longlong_t)bp->blk_birth, \ + (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ + (u_longlong_t)BP_GET_FILL(bp), \ + ws, \ + (u_longlong_t)bp->blk_cksum.zc_word[0], \ + (u_longlong_t)bp->blk_cksum.zc_word[1], \ + (u_longlong_t)bp->blk_cksum.zc_word[2], \ + (u_longlong_t)bp->blk_cksum.zc_word[3]); \ + } \ + ASSERT(len < size); \ +} + +#include + +#define BP_GET_BUFC_TYPE(bp) \ + (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) + +typedef enum spa_import_type { + SPA_IMPORT_EXISTING, + SPA_IMPORT_ASSEMBLE +} spa_import_type_t; + +// Hold module busy to stop unregister until all exported. +extern uint64_t zfs_module_busy; + +/* + * Send TRIM commands in-line during normal pool operation while deleting. + * OFF: no + * ON: yes + */ +typedef enum { + SPA_AUTOTRIM_OFF = 0, /* default */ + SPA_AUTOTRIM_ON +} spa_autotrim_t; + +/* + * Reason TRIM command was issued, used internally for accounting purposes. 
+ */ +typedef enum trim_type { + TRIM_TYPE_MANUAL = 0, + TRIM_TYPE_AUTO = 1, +} trim_type_t; + +/* state manipulation functions */ +extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_open_rewind(const char *pool, spa_t **, void *tag, + nvlist_t *policy, nvlist_t **config); +extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, + size_t buflen); +extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, + nvlist_t *zplprops, struct dsl_crypto_params *dcp); +extern int spa_import_rootpool(char *devpath, char *devid); +extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, + uint64_t flags); +extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); +extern int spa_destroy(char *pool); +extern int spa_checkpoint(const char *pool); +extern int spa_checkpoint_discard(const char *pool); +extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, + boolean_t hardforce); +extern int spa_reset(char *pool); +extern void spa_async_request(spa_t *spa, int flag); +extern void spa_async_unrequest(spa_t *spa, int flag); +extern void spa_async_suspend(spa_t *spa); +extern void spa_async_resume(spa_t *spa); +extern spa_t *spa_inject_addref(char *pool); +extern void spa_inject_delref(spa_t *spa); +extern void spa_scan_stat_init(spa_t *spa); +extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); + +#define SPA_ASYNC_CONFIG_UPDATE 0x01 +#define SPA_ASYNC_REMOVE 0x02 +#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_RESILVER_DONE 0x08 +#define SPA_ASYNC_RESILVER 0x10 +#define SPA_ASYNC_AUTOEXPAND 0x20 +#define SPA_ASYNC_REMOVE_DONE 0x40 +#define SPA_ASYNC_REMOVE_STOP 0x80 +#define SPA_ASYNC_INITIALIZE_RESTART 0x100 +#define SPA_ASYNC_TRIM_RESTART 0x200 +#define SPA_ASYNC_AUTOTRIM_RESTART 0x400 + +/* + * Controls the behavior of spa_vdev_remove(). 
+ */ +#define SPA_REMOVE_UNSPARE 0x01 +#define SPA_REMOVE_DONE 0x02 + +/* device manipulation */ +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, + int replacing); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, + int replace_done); +extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern boolean_t spa_vdev_remove_active(spa_t *spa); +extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t guid, + nvlist_t *vdev_errlist); +extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, + uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist); +extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); +extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); +extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp); + +/* spare state (which is global across all pools) */ +extern void spa_spare_add(vdev_t *vd); +extern void spa_spare_remove(vdev_t *vd); +extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); +extern void spa_spare_activate(vdev_t *vd); + +/* L2ARC state (which is global across all pools) */ +extern void spa_l2cache_add(vdev_t *vd); +extern void spa_l2cache_remove(vdev_t *vd); +extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); +extern void spa_l2cache_activate(vdev_t *vd); +extern void spa_l2cache_drop(spa_t *spa); + +/* scanning */ +extern int spa_scan(spa_t *spa, pool_scan_func_t func); +extern int spa_scan_stop(spa_t *spa); +extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); + +/* spa syncing */ +extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ +extern void spa_sync_allpools(void); + +extern int zfs_sync_pass_deferred_free; + +/* spa namespace global mutex */ +extern kmutex_t spa_namespace_lock; + +/* + * SPA configuration functions in spa_config.c + */ + +#define SPA_CONFIG_UPDATE_POOL 0 +#define SPA_CONFIG_UPDATE_VDEVS 1 + +extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); +extern void spa_config_load(void); +extern nvlist_t *spa_all_configs(uint64_t *); +extern void spa_config_set(spa_t *spa, nvlist_t *config); +extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, + int getstats); +extern void spa_config_update(spa_t *spa, int what); + +/* + * Miscellaneous SPA routines in spa_misc.c + */ + +/* Namespace manipulation */ +extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); +extern void spa_remove(spa_t *spa); +extern spa_t *spa_next(spa_t *prev); + +/* Refcount functions */ +extern void spa_open_ref(spa_t *spa, void *tag); +extern void spa_close(spa_t *spa, void *tag); +extern void spa_async_close(spa_t *spa, void *tag); +extern boolean_t spa_refcount_zero(spa_t *spa); + +#define SCL_NONE 0x00 +#define SCL_CONFIG 0x01 +#define SCL_STATE 0x02 +#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ +#define SCL_ALLOC 0x08 +#define SCL_ZIO 0x10 +#define SCL_FREE 0x20 +#define SCL_VDEV 0x40 +#define SCL_LOCKS 7 +#define SCL_ALL ((1 << SCL_LOCKS) - 1) +#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) + +/* Historical pool statistics */ +typedef struct spa_stats_history { + kmutex_t lock; + uint64_t count; + uint64_t size; + kstat_t *kstat; + void *_private; + list_t list; +} spa_stats_history_t; + +typedef struct spa_stats { + 
spa_stats_history_t read_history; + spa_stats_history_t txg_history; + spa_stats_history_t tx_assign_histogram; + spa_stats_history_t io_history; + spa_stats_history_t mmp_history; + spa_stats_history_t iostats; +} spa_stats_t; + +typedef enum txg_state { + TXG_STATE_BIRTH = 0, + TXG_STATE_OPEN = 1, + TXG_STATE_QUIESCED = 2, + TXG_STATE_WAIT_FOR_SYNC = 3, + TXG_STATE_SYNCED = 4, + TXG_STATE_COMMITTED = 5, +} txg_state_t; + +/* Assorted pool IO kstats */ +typedef struct spa_iostats { + kstat_named_t trim_extents_written; + kstat_named_t trim_bytes_written; + kstat_named_t trim_extents_skipped; + kstat_named_t trim_bytes_skipped; + kstat_named_t trim_extents_failed; + kstat_named_t trim_bytes_failed; + kstat_named_t autotrim_extents_written; + kstat_named_t autotrim_bytes_written; + kstat_named_t autotrim_extents_skipped; + kstat_named_t autotrim_bytes_skipped; + kstat_named_t autotrim_extents_failed; + kstat_named_t autotrim_bytes_failed; +} spa_iostats_t; + +extern void spa_stats_init(spa_t *spa); +extern void spa_stats_destroy(spa_t *spa); +extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, + uint32_t aflags); +extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); +extern int spa_txg_history_set(spa_t *spa, uint64_t txg, + txg_state_t completed_state, hrtime_t completed_time); +extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, + uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty); +extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); +extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id); +extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, + hrtime_t duration); +extern void *spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, + uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, + int error); +extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, + uint64_t extents_written, uint64_t bytes_written, + uint64_t extents_skipped, uint64_t bytes_skipped, + uint64_t extents_failed, uint64_t bytes_failed); + +/* Pool configuration locks */ +extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_exit(spa_t *spa, int locks, void *tag); +extern int spa_config_held(spa_t *spa, int locks, krw_t rw); + +/* Pool vdev add/remove lock */ +extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_config_enter(spa_t *spa); +extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, + int error, char *tag); +extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); + +/* Pool vdev state change lock */ +extern void spa_vdev_state_enter(spa_t *spa, int oplock); +extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); + +/* Log state */ +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +extern spa_log_state_t spa_get_log_state(spa_t *spa); +extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); +extern int spa_reset_logs(spa_t *spa); + +/* Log claim callback */ +extern void spa_claim_notify(zio_t *zio); +extern void spa_deadman(void *); + +/* Accessor functions */ +extern boolean_t spa_shutting_down(spa_t *spa); +extern struct dsl_pool *spa_get_dsl(spa_t *spa); +extern boolean_t 
spa_is_initializing(spa_t *spa); +extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); +extern blkptr_t *spa_get_rootblkptr(spa_t *spa); +extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); +extern void spa_altroot(spa_t *, char *, size_t); +extern int spa_sync_pass(spa_t *spa); +extern char *spa_name(spa_t *spa); +extern uint64_t spa_guid(spa_t *spa); +extern uint64_t spa_load_guid(spa_t *spa); +extern uint64_t spa_last_synced_txg(spa_t *spa); +extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_syncing_txg(spa_t *spa); +extern uint64_t spa_final_dirty_txg(spa_t *spa); +extern uint64_t spa_version(spa_t *spa); +extern pool_state_t spa_state(spa_t *spa); +extern spa_load_state_t spa_load_state(spa_t *spa); +extern uint64_t spa_freeze_txg(spa_t *spa); +extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_get_dspace(spa_t *spa); +extern uint64_t spa_get_checkpoint_space(spa_t *spa); +extern uint64_t spa_get_slop_space(spa_t *spa); +extern void spa_update_dspace(spa_t *spa); +extern uint64_t spa_version(spa_t *spa); +extern boolean_t spa_deflate(spa_t *spa); +extern metaslab_class_t *spa_normal_class(spa_t *spa); +extern metaslab_class_t *spa_log_class(spa_t *spa); +extern metaslab_class_t *spa_special_class(spa_t *spa); +extern metaslab_class_t *spa_dedup_class(spa_t *spa); +extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, + dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); + +extern void spa_evicting_os_register(spa_t *, objset_t *os); +extern void spa_evicting_os_deregister(spa_t *, objset_t *os); +extern void spa_evicting_os_wait(spa_t *spa); +extern int spa_max_replication(spa_t *spa); +extern int spa_prev_software_version(spa_t *spa); +extern uint8_t spa_get_failmode(spa_t *spa); +extern boolean_t spa_suspended(spa_t *spa); +extern uint64_t spa_bootfs(spa_t *spa); +extern uint64_t spa_delegation(spa_t *spa); +extern objset_t *spa_meta_objset(spa_t *spa); +extern uint64_t spa_deadman_synctime(spa_t *spa); +extern spa_autotrim_t spa_get_autotrim(spa_t *spa); + +/* Miscellaneous support routines */ +extern void spa_load_failed(spa_t *spa, const char *fmt, ...); +extern void spa_load_note(spa_t *spa, const char *fmt, ...); +extern void spa_activate_mos_feature(spa_t *spa, const char *feature, + dmu_tx_t *tx); +extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); +extern int spa_rename(const char *oldname, const char *newname); +extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); +extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); +extern char *spa_strdup(const char *); +extern void spa_strfree(char *); +extern uint64_t spa_get_random(uint64_t range); +extern uint64_t spa_generate_guid(spa_t *spa); +extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); +extern void spa_freeze(spa_t *spa); +extern int spa_change_guid(spa_t *spa); +extern void spa_upgrade(spa_t *spa, uint64_t version); +extern void spa_evict_all(void); +extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, + boolean_t l2cache); +extern boolean_t spa_has_spare(spa_t *, uint64_t guid); +extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); +extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); +extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); +extern boolean_t spa_has_slogs(spa_t *spa); +extern boolean_t spa_is_root(spa_t *spa); +extern boolean_t spa_writeable(spa_t *spa); +extern boolean_t 
spa_has_pending_synctask(spa_t *spa); +extern int spa_maxblocksize(spa_t *spa); +extern int spa_minashift(spa_t *spa); +extern int spa_maxdnodesize(spa_t *spa); +extern boolean_t spa_has_checkpoint(spa_t *spa); +extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); +extern boolean_t spa_suspend_async_destroy(spa_t *spa); +extern uint64_t spa_min_claim_txg(spa_t *spa); +extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); +extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, + const blkptr_t *bp); +typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, + void *arg); +extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, + spa_remap_cb_t callback, void *arg); +extern uint64_t spa_get_last_removal_txg(spa_t *spa); +extern boolean_t spa_trust_config(spa_t *spa); +extern uint64_t spa_missing_tvds_allowed(spa_t *spa); +extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); +extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); +extern boolean_t spa_multihost(spa_t *spa); +extern unsigned long spa_get_hostid(void); +extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); + +extern int spa_mode(spa_t *spa); +extern uint64_t zfs_strtonum(const char *str, char **nptr); + +extern char *spa_his_ievent_table[]; + +extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); +extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, + char *his_buf); +extern int spa_history_log(spa_t *spa, const char *his_buf); +extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); +extern void spa_history_log_version(spa_t *spa, const char *operation); +extern void spa_history_log_internal(spa_t *spa, const char *operation, + dmu_tx_t *tx, const char *fmt, ...); +extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, + dmu_tx_t *tx, const char *fmt, ...); +extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, + dmu_tx_t *tx, const char *fmt, ...); + +/* error handling */ +struct zbookmark_phys; +extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); +extern void zfs_ereport_post(const char *_class, spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, + uint64_t length); +extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, + const char *name, nvlist_t *aux); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); +extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); +extern uint64_t spa_get_errlog_size(spa_t *spa); +extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); +extern void spa_errlog_rotate(spa_t *spa); +extern void spa_errlog_drain(spa_t *spa); +extern void spa_errlog_sync(spa_t *spa, uint64_t txg); +extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); + +/* vdev cache */ +extern void vdev_cache_stat_init(void); +extern void vdev_cache_stat_fini(void); + +/* Initialization and termination */ +extern void spa_init(int flags); +extern void spa_fini(void); +extern void spa_boot_init(void); + +/* properties */ +extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); +extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); +extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); + +/* asynchronous event notification */ +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t 
*hist_nvl, + const char *name); +extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, + const char *name); +extern void spa_event_post(sysevent_t *ev); +extern void spa_event_discard(sysevent_t *ev); + +#ifdef ZFS_DEBUG +#define dprintf_bp(bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ + dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_bp(bp, fmt, ...) +#endif + +extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_H */ diff --git a/include/os/windows/zfs/zfs_config.h b/include/os/windows/zfs/zfs_config.h new file mode 100644 index 000000000000..1593f5cfe334 --- /dev/null +++ b/include/os/windows/zfs/zfs_config.h @@ -0,0 +1,95 @@ +/* zfs_config.h. Generated from zfs_config.h.in by configure. */ +/* zfs_config.h.in. Generated from configure.ac by autoheader. */ + +/* Define to 1 to enabled dmu tx validation */ +/* #undef DEBUG_DMU_TX */ + +/* Path where the Filesystems bundle is installed. */ +#define FILESYSTEMS_PREFIX "/Library/Filesystems" + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define if you have libblkid */ +/* #undef HAVE_LIBBLKID */ + +/* Define if you have libuuid */ +#define HAVE_LIBUUID 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `mlockall' function. */ +#define HAVE_MLOCKALL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define if you have zlib */ +#define HAVE_ZLIB 1 + +/* Path where the kernel module is installed. */ +#define KERNEL_MODPREFIX "/Library/Extensions" + +/* Define to the sub-directory where libtool stores uninstalled libraries. */ +#define LT_OBJDIR ".libs/" + +/* Define to a directory where mount(2) will look for mount_zfs. */ +#define MOUNTEXECDIR "${exec_prefix}/sbin" + +/* Define ZFS_BOOT to enable kext load at boot */ +#define ZFS_BOOT 1 + +/* zfs debugging enabled */ +/* #undef ZFS_DEBUG */ + +/* Define the project author. */ +#define ZFS_META_AUTHOR "OpenZFS on OS X" + +/* Define the project release date. */ +/* #undef ZFS_META_DATA */ + +/* Define the project license. */ +#define ZFS_META_LICENSE "CDDL" + +/* Define the libtool library 'age' version information. */ +/* #undef ZFS_META_LT_AGE */ + +/* Define the libtool library 'current' version information. */ +/* #undef ZFS_META_LT_CURRENT */ + +/* Define the libtool library 'revision' version information. */ +/* #undef ZFS_META_LT_REVISION */ + +/* Define the project name. */ +#define ZFS_META_NAME "zfs" + +/* Define the project release. */ +#define ZFS_META_RELEASE "1" + +/* Define the project version. */ +#define ZFS_META_VERSION "0.2.4" + +/* Define the project alias string. 
*/ +#define ZFS_META_ALIAS "zfs-" ZFS_META_VERSION "-" ZFS_META_RELEASE diff --git a/module/os/windows/PORTING_NOTES.txt b/module/os/windows/PORTING_NOTES.txt new file mode 100644 index 000000000000..e598441f53af --- /dev/null +++ b/module/os/windows/PORTING_NOTES.txt @@ -0,0 +1,89 @@ +=== Unix to Windows porting notes === + + + +All the IO Request Packets (IRP) come in through the same set of +function handlers, which can get a bit noisy. So ioctls from userland +to, say, list datasets come in to the same place as requests to +list a directory, and volume creation notifications. We split these +out in the tail end of zfs_vnops_windows.c in the function `dispatcher`. + +To trigger a mount, we add two new ZFS ioctls, for mount and unmount. +In mount we will create a new fake disk, then create a filesystem that +we attach to the fake disk. Then we attach the filesystem to the mount +point desired. When IRP requests come in, we will immediately split them +into three: diskDevice, which handles the fake disk related requests, +ioctlDevice, which handles the ZFS ioctls from userland, and finally +fsDevice, which gets the vnop requests from the mounted ZFS filesystem. + + +IRP_MJ_CREATE appears to serve the purpose of vnop_lookup, vnop_open, +vnop_mkdir, and vnop_create. The "create" in this context is more along +the lines of "create a handle to file/dir" - existing, or creating, +entries. It has a flag to open the "parent" of a file entry as well. + +We will use "fscontext" for the vnode *vp pointer, which gets you the +znode_t pointer via the VTOZ() macro. This project has created its own +vnode struct, to work more closely to Unix. The refcounting is done +internally to the project, and is separate from any OS related +refcounting (unlike that of `FileObject`). Some Windows specific +variables are also contained in the vnode struct. + +It is expected that before using a struct vnode, a reference is taken +using `VN_HOLD(vp)` and released with `VN_RELE(vp)`, for any access +of the vnode struct and/or znode. These are short term holds; for +long term holds (like those of directory handles, or the mountpoint) use +`vnode_ref()` and `vnode_rele()` respectively. + +Directory listings come in the form of IRP_MJ_DIRECTORY_CONTROL + +IRP_MN_QUERY_DIRECTORY. It comes with a structure "type" requested, +one of nine, although it appears mostly three are used, and those are +implemented. Add more as needed... + +Each return struct has an "offset to next node", relative to each +node, and the final one is zero to indicate the last entry in the buffer. Each +struct is followed by the filename in typical Windows fashion, in 2-byte +wide chars. Due to the variable length filename, the next struct start has to +be aligned to 8 bytes. + +As long as there are valid entries in the return buf, it needs to +return STATUS_SUCCESS, even when EOF has been reached. So EOF has to +be remembered until the next call, at which time it should return +STATUS_NO_MORE_FILES. The directory query can also pass along a pattern +to match against, which is only supplied in the first call, and +needs to be remembered. Similarly the index (in OpenZFS terms, the +directory offset) needs to be saved. These are stored in the +"fscontext2" void* ptr assigned to the directory in MJ_CREATE, often +called the "Ccb" in Windows. There is no Unix equivalent; there the offset +is stored in the `UIO` offset passed to `vnop_readdir()`.
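 + +As a rough illustration of the buffer layout rules above, filling in one +FILE_DIRECTORY_INFORMATION entry could look something like the sketch below. +This is a hedged sketch only, not the actual zfs_vnops_windows.c code, and +"outbuf", "used", "name" and "more_entries" are hypothetical locals: + + ULONG namebytes = (ULONG)(wcslen(name) * sizeof (WCHAR)); + ULONG rawlen = FIELD_OFFSET(FILE_DIRECTORY_INFORMATION, FileName) + + namebytes; + ULONG padded = (rawlen + 7) & ~7; /* next entry must start 8-byte aligned */ + FILE_DIRECTORY_INFORMATION *fdi = + (FILE_DIRECTORY_INFORMATION *)((uint8_t *)outbuf + used); + fdi->FileNameLength = namebytes; /* length in bytes, not characters */ + RtlCopyMemory(fdi->FileName, name, namebytes); + fdi->NextEntryOffset = more_entries ? padded : 0; /* zero marks last entry */ + used += padded; + +Deleting directories is handled a little differently.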
It calls +IRP_MJ_CREATE to open a handle to the directory, then calls +IRP_SET_INFORMATION with type `FileDispositionInformation`, which has a +single boolean `delete`. Then it calls IRP_MJ_CLOSE, and eventually +IRP_MJ_CLEANUP. And if this is the final reference to the directory, +we call `vnop_rmdir`. + +For files, it calls IRP_MJ_CREATE with the flag DELETE_ON_CLOSE, and +closes the file. The IRP_MJ_CLEANUP handler is eventually called. The +"delete flag" is stored in the vnode struct, using the new +vnode_setunlink() and vnode_unlink() API calls. + +Many IRP calls that take structs will check the input size matches +that of sizeof(struct). A few structs will paste variable length +information, like that of Filename, at the end of the struct. A few +observations have come up; + +* Often the struct has WCHAR filename[1], which means you can always +fit the first (wide) character of the name, and the returned Length +needs to be adjusted to be one character less than the filename +length. + +* Those structs that take a variable name will also check if the full +name will fit, and if it does not, returns STATUS_BUFFER_OVERFLOW. +But, it is expected to fill in as much of the name that fits. Other +data, like in the case of FileAllInformation struct, need to be valid, +even though we return "error" and partial filename. + +* FileNameLength should be set to "required" length, and Information +size should be the same (no bigger) than input size. diff --git a/module/os/windows/README.md b/module/os/windows/README.md new file mode 100644 index 000000000000..cf5420a958e0 --- /dev/null +++ b/module/os/windows/README.md @@ -0,0 +1,430 @@ + +[![Build status](https://ci.appveyor.com/api/projects/status/dcw734sl0prmolwr/branch/master?svg=true)](https://ci.appveyor.com/project/lundman/zfsin/branch/master) + + +# To setup a development environment for compiling ZFS. + + +Download free development Windows 10 image from Microsoft. + +https://developer.microsoft.com/en-us/windows/downloads/virtual-machines + +and create two VMs. + +* Host (running Visual Studio and Kernel Debugger) +* Target (runs the compiled kernel module) + +The VM images comes with Visual Studio 2017, which we use to compile the driver. + +It is recommended that the VMs are placed on static IP, as they can +change IP with all the crashes, and you have to configure the remote +kernel development again. + +Go download the Windows Driver Kit 10 + +https://developer.microsoft.com/en-us/windows/hardware/windows-driver-kit + +and install on both VMs. You will need both the SDK and WDK: +Download the SDK with the Visual Studio 2017 community edition first and install it. +It will update the already installed Visual Studio. +Then install the WDK. At the end of the installer, allow it to install the Visual Studio extension. + + +On Target VM, complete the guide specified here, under +section "Prepare the target computer for provisioning". + +https://msdn.microsoft.com/windows/hardware/drivers/gettingstarted/provision-a-target-computer-wdk-8-1?f=255&MSPPError=-2147217396 + +Which mostly entails running: + +C:\Program Files (x86)\Windows Kits\10\Remote\x64\WDK Test Target Setup x64-x64_en-us.msi + +* reboot Target VM + + +On the Host VM, continue the guide to configure Visual Studio 2017. + +* Load Visual Studio 2017, there is no need to load the project yet. 
+* Menu > Driver > Test > Configure Devices +* Click "Add New Device" +* In "Display name:" enter "Target" +* In "Device type:" leave as "Computer" +* In "Network host name:" enter the IP of the Target VM, for me "172.16.248.103" +* Provisioning options: o Provision device and choose debugger settings. +* Click "Next >" + +It now confirms that it talked to the Target. Note here that +"Host IP" is that of the Host VM, for me "172.16.248.102", and is not to +be confused with the Target IP entered on the previous screen. + +* Click "Next >" + +Watch and wait as remote items are installed on the Target VM. It +will most likely reboot the Target VM as well. + +I've had dialog boxes pop up asking me to agree to installation, but I am not +sure they are supposed to. They probably shouldn't; it would seem it +failed to put WDKRemoteUser in the Administrators group. If that happens, +use "lusrmgr.msc" to correct it. + +The task "Creating system restore point" will most likely fail and +that is acceptable; however, if other tasks fail, you may need to +retry until they work. + +At the end of the run, the output window offers a link to the full +log, which is worth reading if you encounter issues. + +When things fail, I start a CMD prompt as Administrator and paste in +the failing commands from the log file. It would be nice if this +process just worked though. + +If your version of .NET is newer, just move along. + +The Target VM should reboot, and log in as "WDKRemoteUser". + +It is recommended you get Git Bash for Windows and install it: + +https://git-scm.com/downloads + +--- + +Handling configuration errors with Visual Studio 2019 & WDK 10: + +There are some issues with Visual Studio 2019 which can cause the following problem when setting up kernel debugging: +ERROR: Task "Configuring kernel debugger settings (possible reboot)" failed to complete successfully. Look at the logs in the driver test group explorer for more details on the failure. + +This problem is related to an MSVC debug tool location mismatch; as a workaround, use the following steps to mitigate it: + +As Administrator, run the Developer Command Prompt for VS 2019 in your Host VM. +Run the following commands in the VS Developer Command Prompt: + +cd /d %VCToolsRedistDir%\debug_nonredist +MKLINK /J x86\Microsoft.VC141.DebugCRT x86\Microsoft.VC142.DebugCRT +MKLINK /J x64\Microsoft.VC141.DebugCRT x64\Microsoft.VC142.DebugCRT + +Retry configuration by following the guide to configure Visual Studio 2017 mentioned above. + +--- + + +Host and Target VMs are now configured. + +The first time you load the project it might default to + +Debug : ARM + +You probably want to change ARM ==> X64. + +* Load ZFSin solution +* Menu > Debug > ZFSin Properties +* Configuration Properties > Debugging +"Debugging tools for Windows - Kernel Debugger" +Remote Computer Name: Target + +* Configuration Properties > Driver Install > Deployment +Target Device Name: Target +[Tick] Remove previous driver versions +O Hardware ID Driver Update +Root\ZFSin + + +You can run DbgView on the Target VM to see the kernel prints on that VM. + + +Run the compiled Target + +* Compile solution +* Menu > Debug > Start Debugging (F5) + +Wait a while for VS2017 to deploy the .sys file on the Target and start it. + + + + + +Target VM optionals. 
+ +If you find it frustrating to do development work when Windows Defender or +Windows Updates run, you can disable those in gpedit.msc + +* Computer Configuration > Administrative Templates > + Windows Components > + Windows Defender + Windows Updates + + +--- + +# Milestones + + + ✅ Compile SPL sources + * Godzillion warnings yet to be addressed + + ✅ Port SPL sources, atomics, mutex, kmem, condvars + * C11 _Atomics in kmem not yet handled + + ✅ Compile ZFS sources, stubbing out code as needed + + ✅ Include kernel zlib library + + ✅ Load and Unload SPL and ZFS code + + ✅ Port kernel `zfs_ioctl.c` to accept ioctls from userland + + ✅ Compile userland libspl, libzpool, libzfs, ... + + ✅ Include pthread wrapper library + * Replaced with thin pthread.h file + + ✅ Include userland zlib library + + ✅ Compile cmd/zpool + + ✅ Port functions in libzpool, libzfs. Iterate disks, ioctl + + ✅ Test ioctl from zpool to talk to kernel + + ✅ Port kernel `vdev_disk.c` / `vdev_file.c` to issue IO + + ✅ Port over cmd/zfs + + ✅ Add ioctl calls to MOUNT and create Volume to attach + + ✅ Add ioctl calls to UNMOUNT and detach and delete Volume + + ✅ Port kernel `zfs_vnops.c` / `zfs_vnops_windows.c` + * Many special cases missing, flags to create/read/etc + + ✅ Correct file information (dates, size, etc) + + ✅ Basic DOS usage + + ✅ Simple Notepad text edit, executables also work. + + ✅ Basic drag'n'drop in Explorer + + ✅ zfs send / recv, file and pipe. + + ✅ ZVOL support + + ✅ git clone ZFS repo on ZFS mounted fs + + ✅ Compile ZFS on top of ZFS + + ⌛ Scrooge McDuck style swim in cash + +--- + +# Design issues that need addressing. + +* Windows does not handle EFI labels; for now they are parsed with +libefi, and we send the offset and size with the filename, which both +libzfs and the kernel will parse out and use. This works for a proof +of concept. + +Possibly a more proper solution would be to write a thin virtual +hard disk driver, which reads the EFI label and presents just the +partitions. + +* vdev_disk.c spawns a thread, which sleeps until signalled, to get around +IoCompletionRoutine being called in a different context. Is there +a better way to do async in Windows? + +* ThreadId should be checked using PsGetCurrentThreadId(), but +it makes zio_taskq_member(taskq_member()) crash. Investigate. + +* Functions in posix.c need sustenance. + +* The Volume created for MOUNT has something wrong with it; we are + unable to query it for its mountpoint, and currently have to string compare + against a list of all mounts. Possibly also related is that we can not call + any of the functions to set the mountpoint to change it. This needs to + be researched. + +* Find a way to get the system RAM size in SPL, so we can size up the kmem as +expected. Currently it looks up the information in the Registry (one possible +alternative is sketched at the end of this section). +kmem should also use the Windows event +"\KernelObjects\LowMemoryCondition" to sense pressure. + +Thinking on the mount structure. Second design: + +Add a dataset property WinDriveLetter, which is ignored on Unix systems. +So for a simple drive letter dataset: + +zfs set driveletter=Z pool + +The default when creating a new pool, AND when importing a Unix pool, would be to +set the root dataset to + +driveletter=?: + +So it is assigned the first available drive letter. All lower datasets +will be mounted inside the drive letter. If the pool's WinDriveLetter is +not set, it will mount "/pool" as "C:/pool". 
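 + +One possible way to get the total physical RAM from inside the kernel, as an +alternative to the Registry lookup mentioned in the list above, is sketched +below. This is only a hedged sketch, not what the port currently does: +ZwQuerySystemInformation is only semi-documented, SYSTEM_BASIC_INFORMATION is +not in the public WDK headers (the partial layout below is an assumption), and +`spl_query_total_memory` is a hypothetical helper name. + +``` +/* Partial, assumed layout; only PageSize and NumberOfPhysicalPages are used. */ +typedef struct { + ULONG Reserved; + ULONG TimerResolution; + ULONG PageSize; + ULONG NumberOfPhysicalPages; + ULONG LowestPhysicalPageNumber; + ULONG HighestPhysicalPageNumber; + ULONG AllocationGranularity; + ULONG_PTR MinimumUserModeAddress; + ULONG_PTR MaximumUserModeAddress; + KAFFINITY ActiveProcessorsAffinityMask; + CCHAR NumberOfProcessors; +} SPL_SYSTEM_BASIC_INFORMATION; + +/* May already be declared in newer WDK headers; declared here for the sketch. */ +NTSYSAPI NTSTATUS NTAPI ZwQuerySystemInformation( + ULONG SystemInformationClass, PVOID SystemInformation, + ULONG SystemInformationLength, PULONG ReturnLength); + +static uint64_t +spl_query_total_memory(void) +{ + SPL_SYSTEM_BASIC_INFORMATION sbi; + ULONG len = 0; + + /* SystemBasicInformation is information class 0 */ + if (NT_SUCCESS(ZwQuerySystemInformation(0, &sbi, sizeof (sbi), &len))) + return ((uint64_t)sbi.NumberOfPhysicalPages * sbi.PageSize); + return (0); +} +``` 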
+ +--- + +# Installing a binary release + +Latest binary files are available at [GitHub releases](https://github.com/openzfsonwindows/ZFSin/releases) + + +If you are running windows 10 with secure boot on and/or installing an older release you will need to enable unsigned drivers from an elevated CMD: + + +* `bcdedit.exe -set testsigning on ` +* Then **reboot**. After restart it should have _Test Mode_ bottom right corner of the screen. + + +After that either + +* Run OpenZFSOnWindows.exe installer to install +* *Would you like to install device software?* should pop up, click install + * If installing an unsigned release, click "Install anyway" in the "unknown developer" popup + +Or if you do not want to run the Installer, run this command by hand from elevated CMD: +* `zfsinstaller.exe install .\ZFSin.inf` +* *Would you like to install device software?* should pop up, click install + * If installing an unsigned release, click "Install anyway" in the "unknown developer" popup + +Run `zpool.exe status` to confirm it can talk to the kernel + +Failure would be: +``` +Unable to open \\.\ZFS: No error. +``` + +Success would be: +``` +No pools available +``` + +--- + +# Creating your first pool. + +The basic syntax to creating a pool is as below. We use the pool name +"tank" here as with Open ZFS documentation. Feel free to pick your own +pool name. + +``` +# zpool create [options] tank disk + - Create single disk pool + +# zpool create [options] tank mirror disk1 disk2 + - Create mirrored pool ("raid1") + +# zpool create [options] tank raidz disk1 disk2 disk3 .... diskn + - Create raidz ("raid5") pool of multiple disks + +``` + +The default _options_ will "mostly" work in Windows, but for best compatibility +should use a case insensitive filesystem. + +The recommended _options_ string for Windows is currently: + +``` +zpool create -O casesensitivity=insensitive -O compression=lz4 \ + -O atime=off -o ashift=12 tank disk +``` + +* Creating filebased pools would look like: +``` +# fsutil file createnew C:\poolfile.bin 200000000 +# zpool.exe create tank \\?\C:\poolfile.bin + +Note that "\\?\C:\" needs to be escaped in bash shell, ie +"\\\\?\\C:\\". + + TEST ONLINE 0 0 0 + \??\C:\poolfile.bin ONLINE 0 0 0 +``` + +* Creating a HDD pool + +First, locate disk name + +``` +# wmic diskdrive list brief +VMware, VMware Virtual S SCSI Disk Device \\.\PHYSICALDRIVE2 VMware, VMware Virtual S SCSI Disk Device 0 5362882560 +# zpool create tank PHYSICALDRIVE2 +``` + +# Creating a ZVOL virtual hard disk + +Creating a virtual hard disk (ZVOL) is done by passing "-V " to the "zfs create" command. +``` +# zfs create -V 2g tank/hello +``` + +Which would create a disk of 2GB in size, called "tank/hello". +Confirm it was created with: + +``` +# wmic diskdrive list brief +Caption DeviceID Model Partitions Size +ZVOL tank/hello SCSI Disk Device \\.\PHYSICALDRIVE2 ZVOL tank/hello SCSI DiskDevice 0 2105671680 +``` + + +# Exporting the pool + +If you have finished with ZFS, or want to eject the USB or HDD that the +pool resides on, it must first be _exported_. Similar to "ejecting" a +USB device before unplugging it. + +``` +# zpool export tank +``` + +# Importing a pool + +If a zpool has been created on a disk partition from a different system make +sure the partition label contains "zfs". Otherwise `zpool import` won't +recognize the pool and will fail with "no pools available to import". 
+ +``` +# zpool import tank +``` + +# Uninstalling the driver + +If you used the Installer, you can browse to "C:\Program Files (x86)\OpenZFS On Windows" +and run the "uninst000.exe" Uninstaller program. + +You can also use "Add Remove Programs" from the Settings menu, and +click on "OpenZFS On Windows-debug version x.xx" and select Uninstall. + +If you did not use the Installer, you can manually uninstall it: + +``` +zfsinstaller uninstall .\ZFSin.inf +``` + +To verify that the driver got uninstalled properly you can check "zpool.exe status". + +When uninstalled with success, "zpool.exe status" should return: +``` +Unable to open \\.\ZFS: No error. +``` + +If the driver is still there, it would be: +``` +No pools available +``` + +A reboot might be necessary to uninstall it completely. + +# Tuning + +You can use the [registry](https://openzfsonosx.org/wiki/Windows_Registry) to tune various parameters. +Also, there is [`kstat`](https://openzfsonosx.org/wiki/Windows_kstat) to dynamically change parameters. + +# Nightly builds + +There are nightly builds available at [AppVeyor](https://ci.appveyor.com/project/lundman/zfsin/branch/master/artifacts) +- These builds are currently not signed and therefore require test mode to be enabled. + +There also are test builds [available here](https://openzfsonosx.org/wiki/Windows_builds). These are "hotfix" builds for allowing people to test specific fixes before they are ready for a release. diff --git a/module/os/windows/driver.c b/module/os/windows/driver.c new file mode 100644 index 000000000000..dfc740ee1c37 --- /dev/null +++ b/module/os/windows/driver.c @@ -0,0 +1,335 @@ +#include +#include + +#include +#include +//#include + +#include + +#include "Trace.h" + +DRIVER_INITIALIZE DriverEntry; +//EVT_WDF_DRIVER_DEVICE_ADD ZFSin_Init; + +extern int initDbgCircularBuffer(void); +extern int finiDbgCircularBuffer(void); +extern int spl_start(void); +extern int spl_stop(void); +extern int zfs_start(void); +extern void zfs_stop(void); +extern void windows_delay(int ticks); + +PDRIVER_OBJECT WIN_DriverObject = NULL; +PDRIVER_UNLOAD STOR_DriverUnload; +PDRIVER_DISPATCH STOR_MajorFunction[IRP_MJ_MAXIMUM_FUNCTION + 1]; + +wzvolDriverInfo STOR_wzvolDriverInfo; + +DRIVER_UNLOAD ZFSin_Fini; +void ZFSin_Fini(PDRIVER_OBJECT DriverObject) +{ + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "ZFSin_Fini\n")); + zfs_stop(); + if (STOR_DriverUnload != NULL) { + STOR_DriverUnload(DriverObject); + STOR_DriverUnload = NULL; + } + + kstat_osx_fini(); + spl_stop(); + finiDbgCircularBuffer(); + + if (STOR_wzvolDriverInfo.zvContextArray) { + ExFreePoolWithTag(STOR_wzvolDriverInfo.zvContextArray, MP_TAG_GENERAL); + STOR_wzvolDriverInfo.zvContextArray = NULL; + } + ZFSWppCleanup(DriverObject); +} + +/* + * Setup a Storage Miniport Driver, used only by ZVOL to create virtual disks. + */ +NTSTATUS DriverEntry(_In_ PDRIVER_OBJECT DriverObject, _In_ PUNICODE_STRING pRegistryPath) +{ + NTSTATUS status; + ZFSWppInit(DriverObject, pRegistryPath); + + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_INFO_LEVEL, "ZFSin: DriverEntry\n")); + + // Setup global so zfs_ioctl.c can setup devnode + WIN_DriverObject = DriverObject; + + /* Setup print buffer, since we print from SPL */ + initDbgCircularBuffer(); + + spl_start(); + + kstat_osx_init(pRegistryPath); + + /* + * Initialise storport for the ZVOL virtual disks. This also + * sets the Driver Callbacks, so we make a copy of them, so + * that Dispatcher can use them. 
+ */ + status = zvol_start(DriverObject, pRegistryPath); + + if (STATUS_SUCCESS != status) { + /* If we failed, we carryon without ZVOL support. */ + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "ZFSin: StorPortInitialize() failed, no ZVOL for you. %d/0x%x\n", status, status)); + memset(STOR_MajorFunction, 0, sizeof(STOR_MajorFunction)); + STOR_DriverUnload = NULL; + } else { + /* Make a copy of the Driver Callbacks for miniport */ + memcpy(STOR_MajorFunction, WIN_DriverObject->MajorFunction, sizeof(STOR_MajorFunction)); + STOR_DriverUnload = WIN_DriverObject->DriverUnload; + } + + /* Now set the Driver Callbacks to dispatcher and start ZFS */ + WIN_DriverObject->DriverUnload = ZFSin_Fini; + + zfs_start(); + + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "ZFSin: Started\n")); + return STATUS_SUCCESS; +} + +//extern unsigned long spl_hostid; +extern int random_get_bytes(void *ptr, unsigned long len); + +void spl_create_hostid(HANDLE h, PUNICODE_STRING pRegistryPath) +{ + NTSTATUS Status; + + UNICODE_STRING AttachKey; + RtlInitUnicodeString(&AttachKey, L"hostid"); + + random_get_bytes(&spl_hostid, sizeof(spl_hostid)); + + Status = ZwSetValueKey( + h, + &AttachKey, + 0, + REG_DWORD, + &spl_hostid, + sizeof(spl_hostid) + ); + + if (!NT_SUCCESS(Status)) { + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "%s: Unable to create Registry %wZ/hostid: 0x%x. hostid unset.\n", __func__, pRegistryPath, Status)); + spl_hostid = 0; + } + + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "SPL: created hostid 0x%04x\n", spl_hostid)); +} + +// Whenever we start up, write the version string to registry. +#include <../zfs_config.h> + +void spl_update_version(HANDLE h, PUNICODE_STRING pRegistryPath) +{ + NTSTATUS Status; + + UNICODE_STRING AttachKey; + UNICODE_STRING ValueKey; + RtlInitUnicodeString(&AttachKey, L"version"); + RtlInitUnicodeString(&ValueKey, L""ZFS_META_VERSION "-" ZFS_META_RELEASE); + + Status = ZwSetValueKey( + h, + &AttachKey, + 0, + REG_SZ, + ValueKey.Buffer, + ValueKey.Length + ); + + if (!NT_SUCCESS(Status)) { + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "%s: Unable to create Registry %wZ/version: 0x%x. hostid unset.\n", __func__, pRegistryPath, Status)); + } +} + +int spl_check_assign_types(kstat_named_t *kold, PKEY_VALUE_FULL_INFORMATION regBuffer) +{ + + switch (kold->data_type) { + + case KSTAT_DATA_UINT64: + case KSTAT_DATA_INT64: + { + if (regBuffer->Type != REG_QWORD || + regBuffer->DataLength != sizeof(uint64_t)) { + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "%s: registry '%s' matched in kstat. Type needs to be REG_QWORD. (8 bytes)\n", __func__, + kold->name)); + return 0; + } + uint64_t newvalue = *(uint64_t *)((uint8_t *)regBuffer + regBuffer->DataOffset); + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "%s: kstat '%s': 0x%llx -> 0x%llx\n", __func__, + kold->name, + kold->value.ui64, + newvalue + )); + kold->value.ui64 = newvalue; + return 1; + } + + case KSTAT_DATA_UINT32: + case KSTAT_DATA_INT32: + { + if (regBuffer->Type != REG_DWORD || + regBuffer->DataLength != sizeof(uint32_t)) { + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "%s: registry '%s' matched in kstat. Type needs to be REG_DWORD. 
(4 bytes)\n", __func__, + kold->name)); + return 0; + } + uint32_t newvalue = *(uint32_t *)((uint8_t *)regBuffer + regBuffer->DataOffset); + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "%s: kstat '%s': 0x%lx -> 0x%lx\n", __func__, + kold->name, + kold->value.ui32, + newvalue + )); + kold->value.ui32 = newvalue; + return 1; + } + default: + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "%s: registry '%s' matched in kstat of unsupported type. Only INT32 and INT64 types supported.\n", __func__, + kold->name)); + } + return 0; +} + +// +// kstat_osx_init(): +// read kstat values +// spl_kstat_registry(this): +// open registry +// for each registry entry +// match name in kstat - assign value +// close registry +// return 0 (OK) +// write kstat values (if OK) +// +extern wchar_t zfs_vdev_protection_filter[64]; +int spl_kstat_registry(PUNICODE_STRING pRegistryPath, kstat_t *ksp) +{ + OBJECT_ATTRIBUTES ObjectAttributes; + HANDLE h; + NTSTATUS Status; + + InitializeObjectAttributes(&ObjectAttributes, + pRegistryPath, + OBJ_KERNEL_HANDLE | OBJ_CASE_INSENSITIVE, + NULL, + NULL); + + Status = ZwOpenKey(&h, // KeyHandle + KEY_ALL_ACCESS, // DesiredAccess + &ObjectAttributes);// ObjectAttributes + + if (!NT_SUCCESS(Status)) { + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "%s: Unable to open Registry %wZ: 0x%x. Going with defaults.\n", __func__, pRegistryPath, Status)); + return 0; + } + + // Iterate all Registry entries. + NTSTATUS status = 0; + ULONG index = 0; + ULONG length; + PKEY_VALUE_FULL_INFORMATION regBuffer; + char keyname[KSTAT_STRLEN + 1]; + int changed = 0; + + for (index = 0; status != STATUS_NO_MORE_ENTRIES; index++) { + // Get the buffer size necessary + status = ZwEnumerateValueKey(h, index, KeyValueFullInformation, NULL, 0, &length); + + if ((status != STATUS_BUFFER_TOO_SMALL) && (status != STATUS_BUFFER_OVERFLOW)) + break; // Something is wrong - or we finished + + // Allocate space to hold + regBuffer = (PKEY_VALUE_FULL_INFORMATION)ExAllocatePoolWithTag(NonPagedPoolNx, length, 'zfsr'); + + if (regBuffer == NULL) + continue; + + status = ZwEnumerateValueKey(h, index, KeyValueFullInformation, regBuffer, length, &length); + if (!NT_SUCCESS(status)) { + ExFreePool(regBuffer); + continue; + } + + // Convert name to straight ascii so we compare with kstat + ULONG outlen; + status = RtlUnicodeToUTF8N(keyname, KSTAT_STRLEN, &outlen, + regBuffer->Name, regBuffer->NameLength); + + // Conversion failed? move along.. + if (status != STATUS_SUCCESS && + status != STATUS_SOME_NOT_MAPPED) { + ExFreePool(regBuffer); + continue; + } + + // Output string is only null terminated if input is, so do so now. + keyname[outlen] = 0; + + // Support for registry values that are not tunable through kstat. Those bypass the kstat name matching loop + // and get directly set in the corresponding code variable. + // + if (strcasecmp("zfs_vdev_protection_filter", keyname) == 0) { + if (regBuffer->Type != REG_SZ || + regBuffer->DataLength > (sizeof(zfs_vdev_protection_filter)-sizeof(wchar_t))) { // will be NULL terminated + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "%s: registry '%s'. 
Type needs to be REG_SZ (63 wchar max)\n", __func__, + keyname)); + ExFreePool(regBuffer); + continue; + } + char* newvalue = (char*)((uint8_t*)regBuffer + regBuffer->DataOffset); + KdPrintEx((DPFLTR_IHVDRIVER_ID, DPFLTR_ERROR_LEVEL, "%s: registry '%s': %S\n", __func__, + keyname, newvalue)); + + bzero(zfs_vdev_protection_filter, sizeof(zfs_vdev_protection_filter)); + bcopy(newvalue, zfs_vdev_protection_filter, min(sizeof(zfs_vdev_protection_filter), regBuffer->DataLength)); + } + else { + // Now iterate kstats and attempt to match name with 'keyname'. + kstat_named_t *kold; + kold = ksp->ks_data; + for (unsigned int i = 0; i < ksp->ks_ndata; i++, kold++) { + + // Find name? + if (kold->name != NULL && + !strcasecmp(kold->name, keyname)) { + + // Check types match and are supported + if (!spl_check_assign_types(kold, regBuffer)) + break; + + // Special case 'hostid' is automatically generated if not + // set, so if we read it in, signal to not set it. + // KSTAT_UPDATE is called after this function completes. + if (spl_hostid == 0 && + strcasecmp("hostid", keyname) == 0) + spl_hostid = 1; // Non-zero + + changed++; + break; + } + } + } + + ExFreePool(regBuffer); + } // for() all keys + + // Now check that hostid was read it, if it wasn't, make up a random one. + if (spl_hostid == 0) { + spl_create_hostid(h, pRegistryPath); + } + + // Make sure version is updated + spl_update_version(h, pRegistryPath); + + ZwClose(h); + return (changed); +} + diff --git a/module/os/windows/spl/CMakeLists.txt b/module/os/windows/spl/CMakeLists.txt new file mode 100644 index 000000000000..644773c29236 --- /dev/null +++ b/module/os/windows/spl/CMakeLists.txt @@ -0,0 +1,32 @@ +wdk_add_library(splkern + spl-atomic.c + spl-avl.c + spl-condvar.c + spl-cred.c + spl-ddi.c + spl-debug.c + spl-err.c + spl-kmem.c + spl-kobj.c + spl-kstat.c + spl-list.c + spl-md5.c + spl-mount.c + spl-mutex.c + spl-policy.c + spl-proc.c + spl-processor.c + spl-rwlock.c + spl-seg_kmem.c + spl-taskq.c + spl-thread.c + spl-time.c + spl-tsd.c + spl-uio.c + spl-vmem.c + spl-vnode.c + spl-windows.c + spl-xdr.c +) + +target_include_directories(splkern BEFORE PUBLIC ../../include) \ No newline at end of file diff --git a/module/os/windows/spl/spl-atomic.c b/module/os/windows/spl/spl-atomic.c new file mode 100644 index 000000000000..897adcb544e5 --- /dev/null +++ b/module/os/windows/spl/spl-atomic.c @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + ***************************************************************************** + * Solaris Porting Layer (SPL) Atomic Implementation. 
+ ***************************************************************************** + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#include +//#include + + +//#include +#include +//#include +#include + + +#ifdef _KERNEL + +/* nothing */ + + +void *atomic_cas_ptr(volatile void *_target, void *_cmp, void *_new) +{ + return InterlockedCompareExchangePointer((volatile PVOID *)_target, _new, _cmp); +} + + +#endif diff --git a/module/os/windows/spl/spl-avl.c b/module/os/windows/spl/spl-avl.c new file mode 100644 index 000000000000..6a5fdd39a242 --- /dev/null +++ b/module/os/windows/spl/spl-avl.c @@ -0,0 +1,1077 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * AVL - generic AVL tree implementation for kernel use + * + * A complete description of AVL trees can be found in many CS textbooks. + * + * Here is a very brief overview. An AVL tree is a binary search tree that is + * almost perfectly balanced. By "almost" perfectly balanced, we mean that at + * any given node, the left and right subtrees are allowed to differ in height + * by at most 1 level. + * + * This relaxation from a perfectly balanced binary tree allows doing + * insertion and deletion relatively efficiently. Searching the tree is + * still a fast operation, roughly O(log(N)). + * + * The key to insertion and deletion is a set of tree maniuplations called + * rotations, which bring unbalanced subtrees back into the semi-balanced state. + * + * This implementation of AVL trees has the following peculiarities: + * + * - The AVL specific data structures are physically embedded as fields + * in the "using" data structures. To maintain generality the code + * must constantly translate between "avl_node_t *" and containing + * data structure "void *"s by adding/subracting the avl_offset. + * + * - Since the AVL data is always embedded in other structures, there is + * no locking or memory allocation in the AVL routines. This must be + * provided for by the enclosing data structure's semantics. Typically, + * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of + * exclusive write lock. Other operations require a read lock. + * + * - The implementation uses iteration instead of explicit recursion, + * since it is intended to run on limited size kernel stacks. Since + * there is no recursion stack present to move "up" in the tree, + * there is an explicit "parent" link in the avl_node_t. + * + * - The left/right children pointers of a node are in an array. + * In the code, variables (instead of constants) are used to represent + * left and right indices. 
The implementation is written as if it only + * dealt with left handed manipulations. By changing the value assigned + * to "left", the code also works for right handed trees. The + * following variables/terms are frequently used: + * + * int left; // 0 when dealing with left children, + * // 1 for dealing with right children + * + * int left_heavy; // -1 when left subtree is taller at some node, + * // +1 when right subtree is taller + * + * int right; // will be the opposite of left (0 or 1) + * int right_heavy;// will be the opposite of left_heavy (-1 or 1) + * + * int direction; // 0 for "<" (ie. left child); 1 for ">" (right) + * + * Though it is a little more confusing to read the code, the approach + * allows using half as much code (and hence cache footprint) for tree + * manipulations and eliminates many conditional branches. + * + * - The avl_index_t is an opaque "cookie" used to find nodes at or + * adjacent to where a new value would be inserted in the tree. The value + * is a modified "avl_node_t *". The bottom bit (normally 0 for a + * pointer) is set to indicate if that the new node has a value greater + * than the value of the indicated "avl_node_t *". + */ + +#include +#include +#include +#include +#include + +/* + * Small arrays to translate between balance (or diff) values and child indeces. + * + * Code that deals with binary tree data structures will randomly use + * left and right children when examining a tree. C "if()" statements + * which evaluate randomly suffer from very poor hardware branch prediction. + * In this code we avoid some of the branch mispredictions by using the + * following translation arrays. They replace random branches with an + * additional memory reference. Since the translation arrays are both very + * small the data should remain efficiently in cache. + */ +static const int avl_child2balance[2] = {-1, 1}; +static const int avl_balance2child[] = {0, 0, 1}; + + +/* + * Walk from one node to the previous valued node (ie. an infix walk + * towards the left). At any given node we do one of 2 things: + * + * - If there is a left child, go to it, then to it's rightmost descendant. + * + * - otherwise we return thru parent nodes until we've come from a right child. + * + * Return Value: + * NULL - if at the end of the nodes + * otherwise next node + */ +void * +avl_walk(avl_tree_t *tree, void *oldnode, int left) +{ + uint32_t off = tree->avl_offset; + avl_node_t *node = AVL_DATA2NODE(oldnode, off); + int right = 1 - left; + int was_child; + + + /* + * nowhere to walk to if tree is empty + */ + if (node == NULL) + return (NULL); + + /* + * Visit the previous valued node. There are two possibilities: + * + * If this node has a left child, go down one left, then all + * the way right. + */ + if (node->avl_child[left] != NULL) { + for (node = node->avl_child[left]; + node->avl_child[right] != NULL; + node = node->avl_child[right]) + ; + /* + * Otherwise, return thru left children as far as we can. + */ + } else { + for (;;) { + was_child = AVL_XCHILD(node); + node = AVL_XPARENT(node); + if (node == NULL) + return (NULL); + if (was_child == right) + break; + } + } + + return (AVL_NODE2DATA(node, off)); +} + +/* + * Return the lowest valued node in a tree or NULL. 
+ * (leftmost child from root of tree) + */ +void * +avl_first(avl_tree_t *tree) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + uint32_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; node = node->avl_child[0]) + prev = node; + + if (prev != NULL) + return (AVL_NODE2DATA(prev, off)); + return (NULL); +} + +/* + * Return the highest valued node in a tree or NULL. + * (rightmost child from root of tree) + */ +void * +avl_last(avl_tree_t *tree) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + uint32_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; node = node->avl_child[1]) + prev = node; + + if (prev != NULL) + return (AVL_NODE2DATA(prev, off)); + return (NULL); +} + +/* + * Access the node immediately before or after an insertion point. + * + * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child + * + * Return value: + * NULL: no node in the given direction + * "void *" of the found tree node + */ +void * +avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) +{ + int child = AVL_INDEX2CHILD(where); + avl_node_t *node = AVL_INDEX2NODE(where); + void *data; + uint32_t off = tree->avl_offset; + + if (node == NULL) { + ASSERT(tree->avl_root == NULL); + return (NULL); + } + data = AVL_NODE2DATA(node, off); + if (child != direction) + return (data); + + return (avl_walk(tree, data, direction)); +} + + +/* + * Search for the node which contains "value". The algorithm is a + * simple binary tree search. + * + * return value: + * NULL: the value is not in the AVL tree + * *where (if not NULL) is set to indicate the insertion point + * "void *" of the found tree node + */ +void * +avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + int child = 0; + int diff; + uint32_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; + node = node->avl_child[child]) { + + prev = node; + + diff = tree->avl_compar(value, AVL_NODE2DATA(node, off)); + ASSERT(-1 <= diff && diff <= 1); + if (diff == 0) { +#ifdef DEBUG + if (where != NULL) + *where = 0; +#endif + return (AVL_NODE2DATA(node, off)); + } + child = avl_balance2child[1 + diff]; + + } + + if (where != NULL) + *where = AVL_MKINDEX(prev, child); + + return (NULL); +} + + +/* + * Perform a rotation to restore balance at the subtree given by depth. + * + * This routine is used by both insertion and deletion. The return value + * indicates: + * 0 : subtree did not change height + * !0 : subtree was reduced in height + * + * The code is written as if handling left rotations, right rotations are + * symmetric and handled by swapping values of variables right/left[_heavy] + * + * On input balance is the "new" balance at "node". This value is either + * -2 or +2. + */ +static int +avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance) +{ + int left = !(balance < 0); /* when balance = -2, left will be 0 */ + int right = 1 - left; + int left_heavy = balance >> 1; + int right_heavy = -left_heavy; + avl_node_t *parent = AVL_XPARENT(node); + avl_node_t *child = node->avl_child[left]; + avl_node_t *cright; + avl_node_t *gchild; + avl_node_t *gright; + avl_node_t *gleft; + int which_child = AVL_XCHILD(node); + int child_bal = AVL_XBALANCE(child); + + /* BEGIN CSTYLED */ + /* + * case 1 : node is overly left heavy, the left child is balanced or + * also left heavy. This requires the following rotation. 
+ * + * (node bal:-2) + * / \ + * / \ + * (child bal:0 or -1) + * / \ + * / \ + * cright + * + * becomes: + * + * (child bal:1 or 0) + * / \ + * / \ + * (node bal:-1 or 0) + * / \ + * / \ + * cright + * + * we detect this situation by noting that child's balance is not + * right_heavy. + */ + /* END CSTYLED */ + if (child_bal != right_heavy) { + + /* + * compute new balance of nodes + * + * If child used to be left heavy (now balanced) we reduced + * the height of this sub-tree -- used in "return...;" below + */ + child_bal += right_heavy; /* adjust towards right */ + + /* + * move "cright" to be node's left child + */ + cright = child->avl_child[right]; + node->avl_child[left] = cright; + if (cright != NULL) { + AVL_SETPARENT(cright, node); + AVL_SETCHILD(cright, left); + } + + /* + * move node to be child's right child + */ + child->avl_child[right] = node; + AVL_SETBALANCE(node, -child_bal); + AVL_SETCHILD(node, right); + AVL_SETPARENT(node, child); + + /* + * update the pointer into this subtree + */ + AVL_SETBALANCE(child, child_bal); + AVL_SETCHILD(child, which_child); + AVL_SETPARENT(child, parent); + if (parent != NULL) + parent->avl_child[which_child] = child; + else + tree->avl_root = child; + + return (child_bal == 0); + } + + /* BEGIN CSTYLED */ + /* + * case 2 : When node is left heavy, but child is right heavy we use + * a different rotation. + * + * (node b:-2) + * / \ + * / \ + * / \ + * (child b:+1) + * / \ + * / \ + * (gchild b: != 0) + * / \ + * / \ + * gleft gright + * + * becomes: + * + * (gchild b:0) + * / \ + * / \ + * / \ + * (child b:?) (node b:?) + * / \ / \ + * / \ / \ + * gleft gright + * + * computing the new balances is more complicated. As an example: + * if gchild was right_heavy, then child is now left heavy + * else it is balanced + */ + /* END CSTYLED */ + gchild = child->avl_child[right]; + gleft = gchild->avl_child[left]; + gright = gchild->avl_child[right]; + + /* + * move gright to left child of node and + * + * move gleft to right child of node + */ + node->avl_child[left] = gright; + if (gright != NULL) { + AVL_SETPARENT(gright, node); + AVL_SETCHILD(gright, left); + } + + child->avl_child[right] = gleft; + if (gleft != NULL) { + AVL_SETPARENT(gleft, child); + AVL_SETCHILD(gleft, right); + } + + /* + * move child to left child of gchild and + * + * move node to right child of gchild and + * + * fixup parent of all this to point to gchild + */ + balance = AVL_XBALANCE(gchild); + gchild->avl_child[left] = child; + AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0)); + AVL_SETPARENT(child, gchild); + AVL_SETCHILD(child, left); + + gchild->avl_child[right] = node; + AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0)); + AVL_SETPARENT(node, gchild); + AVL_SETCHILD(node, right); + + AVL_SETBALANCE(gchild, 0); + AVL_SETPARENT(gchild, parent); + AVL_SETCHILD(gchild, which_child); + if (parent != NULL) + parent->avl_child[which_child] = gchild; + else + tree->avl_root = gchild; + + return (1); /* the new tree is always shorter */ +} + + +/* + * Insert a new node into an AVL tree at the specified (from avl_find()) place. + * + * Newly inserted nodes are always leaf nodes in the tree, since avl_find() + * searches out to the leaf positions. The avl_index_t indicates the node + * which will be the parent of the new node. + * + * After the node is inserted, a single rotation further up the tree may + * be necessary to maintain an acceptable AVL balance. 
+ */ +void +avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) +{ + avl_node_t *node; + avl_node_t *parent = AVL_INDEX2NODE(where); + int old_balance; + int new_balance; + int which_child = AVL_INDEX2CHILD(where); + uint32_t off = tree->avl_offset; + + ASSERT(tree); +#ifdef _LP64 + ASSERT(((uintptr_t)new_data & 0x7) == 0); +#endif + + node = AVL_DATA2NODE(new_data, off); + + /* + * First, add the node to the tree at the indicated position. + */ + ++tree->avl_numnodes; + + node->avl_child[0] = NULL; + node->avl_child[1] = NULL; + + AVL_SETCHILD(node, which_child); + AVL_SETBALANCE(node, 0); + AVL_SETPARENT(node, parent); + if (parent != NULL) { + ASSERT(parent->avl_child[which_child] == NULL); + parent->avl_child[which_child] = node; + } else { + ASSERT(tree->avl_root == NULL); + tree->avl_root = node; + } + /* + * Now, back up the tree modifying the balance of all nodes above the + * insertion point. If we get to a highly unbalanced ancestor, we + * need to do a rotation. If we back out of the tree we are done. + * If we brought any subtree into perfect balance (0), we are also done. + */ + for (;;) { + node = parent; + if (node == NULL) + return; + + /* + * Compute the new balance + */ + old_balance = AVL_XBALANCE(node); + new_balance = old_balance + avl_child2balance[which_child]; + + /* + * If we introduced equal balance, then we are done immediately + */ + if (new_balance == 0) { + AVL_SETBALANCE(node, 0); + return; + } + + /* + * If both old and new are not zero we went + * from -1 to -2 balance, do a rotation. + */ + if (old_balance != 0) + break; + + AVL_SETBALANCE(node, new_balance); + parent = AVL_XPARENT(node); + which_child = AVL_XCHILD(node); + } + + /* + * perform a rotation to fix the tree and return + */ + (void) avl_rotation(tree, node, new_balance); +} + +/* + * Insert "new_data" in "tree" in the given "direction" either after or + * before (AVL_AFTER, AVL_BEFORE) the data "here". + * + * Insertions can only be done at empty leaf points in the tree, therefore + * if the given child of the node is already present we move to either + * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since + * every other node in the tree is a leaf, this always works. + * + * To help developers using this interface, we assert that the new node + * is correctly ordered at every step of the way in DEBUG kernels. + */ +void +avl_insert_here( + avl_tree_t *tree, + void *new_data, + void *here, + int direction) +{ + avl_node_t *node; + int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */ +#ifdef DEBUG + int diff; +#endif + + ASSERT(tree != NULL); + ASSERT(new_data != NULL); + ASSERT(here != NULL); + ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER); + + /* + * If corresponding child of node is not NULL, go to the neighboring + * node and reverse the insertion direction. + */ + node = AVL_DATA2NODE(here, tree->avl_offset); + +#ifdef DEBUG + diff = tree->avl_compar(new_data, here); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? child == 1 : child == 0); +#endif + + if (node->avl_child[child] != NULL) { + node = node->avl_child[child]; + child = 1 - child; + while (node->avl_child[child] != NULL) { +#ifdef DEBUG + diff = tree->avl_compar(new_data, + AVL_NODE2DATA(node, tree->avl_offset)); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? 
child == 1 : child == 0); +#endif + node = node->avl_child[child]; + } +#ifdef DEBUG + diff = tree->avl_compar(new_data, + AVL_NODE2DATA(node, tree->avl_offset)); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? child == 1 : child == 0); +#endif + } + ASSERT(node->avl_child[child] == NULL); + + avl_insert(tree, new_data, AVL_MKINDEX(node, child)); +} + +/* + * Add a new node to an AVL tree. + */ +void +avl_add(avl_tree_t *tree, void *new_node) +{ + avl_index_t where = 0; + + /* + * This is unfortunate. We want to call panic() here, even for + * non-DEBUG kernels. In userland, however, we can't depend on anything + * in libc or else the rtld build process gets confused. So, all we can + * do in userland is resort to a normal ASSERT(). + */ + if (avl_find(tree, new_node, &where) != NULL) +#ifdef _KERNEL + panic("avl_find() succeeded inside avl_add()"); +#else + ASSERT(0); +#endif + avl_insert(tree, new_node, where); +} + +/* + * Delete a node from the AVL tree. Deletion is similar to insertion, but + * with 2 complications. + * + * First, we may be deleting an interior node. Consider the following subtree: + * + * d c c + * / \ / \ / \ + * b e b e b e + * / \ / \ / + * a c a a + * + * When we are deleting node (d), we find and bring up an adjacent valued leaf + * node, say (c), to take the interior node's place. In the code this is + * handled by temporarily swapping (d) and (c) in the tree and then using + * common code to delete (d) from the leaf position. + * + * Secondly, an interior deletion from a deep tree may require more than one + * rotation to fix the balance. This is handled by moving up the tree through + * parents and applying rotations as needed. The return value from + * avl_rotation() is used to detect when a subtree did not change overall + * height due to a rotation. + */ +void +avl_remove(avl_tree_t *tree, void *data) +{ + avl_node_t *delete; + avl_node_t *parent; + avl_node_t *node; + avl_node_t tmp; + int old_balance; + int new_balance; + int left; + int right; + int which_child; + size_t off = tree->avl_offset; + + ASSERT(tree); + + delete = AVL_DATA2NODE(data, off); + + /* + * Deletion is easiest with a node that has at most 1 child. + * We swap a node with 2 children with a sequentially valued + * neighbor node. That node will have at most 1 child. Note this + * has no effect on the ordering of the remaining nodes. + * + * As an optimization, we choose the greater neighbor if the tree + * is right heavy, otherwise the left neighbor. This reduces the + * number of rotations needed. + */ + if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) { + + /* + * choose node to swap from whichever side is taller + */ + old_balance = AVL_XBALANCE(delete); + left = avl_balance2child[old_balance + 1]; + right = 1 - left; + + /* + * get to the previous value'd node + * (down 1 left, as far as possible right) + */ + for (node = delete->avl_child[left]; + node->avl_child[right] != NULL; + node = node->avl_child[right]) + ; + + /* + * create a temp placeholder for 'node' + * move 'node' to delete's spot in the tree + */ + tmp = *node; + + *node = *delete; + if (node->avl_child[left] == node) + node->avl_child[left] = &tmp; + + parent = AVL_XPARENT(node); + if (parent != NULL) + parent->avl_child[AVL_XCHILD(node)] = node; + else + tree->avl_root = node; + AVL_SETPARENT(node->avl_child[left], node); + AVL_SETPARENT(node->avl_child[right], node); + + /* + * Put tmp where node used to be (just temporary). 
+ * It always has a parent and at most 1 child. + */ + delete = &tmp; + parent = AVL_XPARENT(delete); + parent->avl_child[AVL_XCHILD(delete)] = delete; + which_child = (delete->avl_child[1] != 0); + if (delete->avl_child[which_child] != NULL) + AVL_SETPARENT(delete->avl_child[which_child], delete); + } + + + /* + * Here we know "delete" is at least partially a leaf node. It can + * be easily removed from the tree. + */ + ASSERT(tree->avl_numnodes > 0); + --tree->avl_numnodes; + parent = AVL_XPARENT(delete); + which_child = AVL_XCHILD(delete); + if (delete->avl_child[0] != NULL) + node = delete->avl_child[0]; + else + node = delete->avl_child[1]; + + /* + * Connect parent directly to node (leaving out delete). + */ + if (node != NULL) { + AVL_SETPARENT(node, parent); + AVL_SETCHILD(node, which_child); + } + if (parent == NULL) { + tree->avl_root = node; + return; + } + parent->avl_child[which_child] = node; + + + /* + * Since the subtree is now shorter, begin adjusting parent balances + * and performing any needed rotations. + */ + do { + + /* + * Move up the tree and adjust the balance + * + * Capture the parent and which_child values for the next + * iteration before any rotations occur. + */ + node = parent; + old_balance = AVL_XBALANCE(node); + new_balance = old_balance - avl_child2balance[which_child]; + parent = AVL_XPARENT(node); + which_child = AVL_XCHILD(node); + + /* + * If a node was in perfect balance but isn't anymore then + * we can stop, since the height didn't change above this point + * due to a deletion. + */ + if (old_balance == 0) { + AVL_SETBALANCE(node, new_balance); + break; + } + + /* + * If the new balance is zero, we don't need to rotate + * else + * need a rotation to fix the balance. + * If the rotation doesn't change the height + * of the sub-tree we have finished adjusting. 
+ */ + if (new_balance == 0) + AVL_SETBALANCE(node, new_balance); + else if (!avl_rotation(tree, node, new_balance)) + break; + } while (parent != NULL); +} + +#define AVL_REINSERT(tree, obj) \ + avl_remove((tree), (obj)); \ + avl_add((tree), (obj)) + +boolean_t +avl_update_lt(avl_tree_t *t, void *obj) +{ + void *neighbor; + + ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) || + (t->avl_compar(obj, neighbor) <= 0)); + + neighbor = AVL_PREV(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +avl_update_gt(avl_tree_t *t, void *obj) +{ + void *neighbor; + + ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) || + (t->avl_compar(obj, neighbor) >= 0)); + + neighbor = AVL_NEXT(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +avl_update(avl_tree_t *t, void *obj) +{ + void *neighbor; + + neighbor = AVL_PREV(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + neighbor = AVL_NEXT(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +void +avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) +{ + avl_node_t *temp_node; + ulong_t temp_numnodes; + + // ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); + ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); + ASSERT3U(tree1->avl_size, ==, tree2->avl_size); + temp_node = tree1->avl_root; + temp_numnodes = tree1->avl_numnodes; + tree1->avl_root = tree2->avl_root; + tree1->avl_numnodes = tree2->avl_numnodes; + tree2->avl_root = temp_node; + tree2->avl_numnodes = temp_numnodes; +} + +/* + * initialize a new AVL tree + */ +void +avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), + uint32_t size, uint32_t offset) +{ + ASSERT(tree); + ASSERT(compar); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (avl_node_t)); +#ifdef _LP64 + ASSERT((offset & 0x7) == 0); +#endif + + tree->avl_compar = compar; + tree->avl_root = NULL; + tree->avl_numnodes = 0; + tree->avl_size = size; + tree->avl_offset = offset; +} + +/* + * Delete a tree. + */ +/* ARGSUSED */ +void +avl_destroy(avl_tree_t *tree) +{ + ASSERT(tree); + ASSERT(tree->avl_numnodes == 0); + ASSERT(tree->avl_root == NULL); +} + + +/* + * Return the number of nodes in an AVL tree. + */ +ulong_t +avl_numnodes(avl_tree_t *tree) +{ + ASSERT(tree); + return (tree->avl_numnodes); +} + +boolean_t +avl_is_empty(avl_tree_t *tree) +{ + ASSERT(tree); + return (tree->avl_numnodes == 0); +} + +#define CHILDBIT (1L) + +/* + * Post-order tree walk used to visit all tree nodes and destroy the tree + * in post order. This is used for destroying a tree w/o paying any cost + * for rebalancing it. + * + * example: + * + * void *cookie = NULL; + * my_data_t *node; + * + * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) + * free(node); + * avl_destroy(tree); + * + * The cookie is really an avl_node_t to the current node's parent and + * an indication of which child you looked at last. + * + * On input, a cookie value of CHILDBIT indicates the tree is done. + */ +void * +avl_destroy_nodes(avl_tree_t *tree, void **cookie) +{ + avl_node_t *node; + avl_node_t *parent; + int child; + void *first; + uint32_t off = tree->avl_offset; + + /* + * Initial calls go to the first node or it's right descendant. 
+ */ + if (*cookie == NULL) { + first = avl_first(tree); + + /* + * deal with an empty tree + */ + if (first == NULL) { + *cookie = (void *)CHILDBIT; + return (NULL); + } + + node = AVL_DATA2NODE(first, off); + parent = AVL_XPARENT(node); + goto check_right_side; + } + + /* + * If there is no parent to return to we are done. + */ + parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT); + if (parent == NULL) { + if (tree->avl_root != NULL) { + ASSERT(tree->avl_numnodes == 1); + tree->avl_root = NULL; + tree->avl_numnodes = 0; + } + return (NULL); + } + + /* + * Remove the child pointer we just visited from the parent and tree. + */ + child = (uintptr_t)(*cookie) & CHILDBIT; + parent->avl_child[child] = NULL; + ASSERT(tree->avl_numnodes > 1); + --tree->avl_numnodes; + + /* + * If we just did a right child or there isn't one, go up to parent. + */ + if (child == 1 || parent->avl_child[1] == NULL) { + node = parent; + parent = AVL_XPARENT(parent); + goto done; + } + + /* + * Do parent's right child, then leftmost descendent. + */ + node = parent->avl_child[1]; + while (node->avl_child[0] != NULL) { + parent = node; + node = node->avl_child[0]; + } + + /* + * If here, we moved to a left child. It may have one + * child on the right (when balance == +1). + */ +check_right_side: + if (node->avl_child[1] != NULL) { + ASSERT(AVL_XBALANCE(node) == 1); + parent = node; + node = node->avl_child[1]; + ASSERT(node->avl_child[0] == NULL && + node->avl_child[1] == NULL); + } else { + ASSERT(AVL_XBALANCE(node) <= 0); + } + +done: + if (parent == NULL) { + *cookie = (void *)CHILDBIT; + ASSERT(node == tree->avl_root); + } else { + *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node)); + } + + return (AVL_NODE2DATA(node, off)); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +#if 0 + +static int avl_init(void) { return 0; } +static int avl_fini(void) { return 0; } + +spl_module_init(avl_init); +spl_module_exit(avl_fini); + +MODULE_DESCRIPTION("Generic AVL tree implementation"); +MODULE_AUTHOR(ZFS_META_AUTHOR); +MODULE_LICENSE(ZFS_META_LICENSE); +MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); + +EXPORT_SYMBOL(avl_create); +EXPORT_SYMBOL(avl_find); +EXPORT_SYMBOL(avl_insert); +EXPORT_SYMBOL(avl_insert_here); +EXPORT_SYMBOL(avl_walk); +EXPORT_SYMBOL(avl_first); +EXPORT_SYMBOL(avl_last); +EXPORT_SYMBOL(avl_nearest); +EXPORT_SYMBOL(avl_add); +EXPORT_SYMBOL(avl_remove); +EXPORT_SYMBOL(avl_numnodes); +EXPORT_SYMBOL(avl_destroy_nodes); +EXPORT_SYMBOL(avl_destroy); +#endif +#endif diff --git a/module/os/windows/spl/spl-condvar.c b/module/os/windows/spl/spl-condvar.c new file mode 100644 index 000000000000..f00123cdf101 --- /dev/null +++ b/module/os/windows/spl/spl-condvar.c @@ -0,0 +1,281 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + * Following the guide at http://www.cs.wustl.edu/~schmidt/win32-cv-1.html and implementing the + * second-to-last suggestion, albeit in kernel mode, and replacing CriticalSection with Atomics. + * At some point, we should perhaps look at the final "SignalObjectAndWait" solution, presumably + * by using the Wait argument to Mutex, and call WaitForObject. + */ + +#include +#include +//#include +#include + +#ifdef SPL_DEBUG_MUTEX +void spl_wdlist_settime(void *mpleak, uint64_t value); +#endif + +#define CONDVAR_INIT 0x12345678 + +void +spl_cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) +{ + (void) cvp; (void) name; (void) type; (void) arg; + //DbgBreakPoint(); + KeInitializeEvent(&cvp->kevent[CV_SIGNAL], SynchronizationEvent, FALSE); + KeInitializeEvent(&cvp->kevent[CV_BROADCAST], NotificationEvent, FALSE); +// KeInitializeSpinLock(&cvp->waiters_count_lock); + cvp->waiters_count = 0; + cvp->initialised = CONDVAR_INIT; +} + +void +spl_cv_destroy(kcondvar_t *cvp) +{ + if (cvp->initialised != CONDVAR_INIT) + panic("%s: not initialised", __func__); + // We have probably already signalled the waiters, but we need to + // kick around long enough for them to wake. + while (cvp->waiters_count > 0) + cv_broadcast(cvp); + ASSERT0(cvp->waiters_count); + cvp->initialised = 0; +} + +void +spl_cv_signal(kcondvar_t *cvp) +{ + if (cvp->initialised != CONDVAR_INIT) + panic("%s: not initialised", __func__); + KIRQL oldIrq; + +// KeAcquireSpinLock(&cvp->waiters_count_lock, &oldIrq); + uint32_t have_waiters = cvp->waiters_count > 0; +// KeReleaseSpinLock(&cvp->waiters_count_lock, oldIrq); + + if (have_waiters) + KeSetEvent(&cvp->kevent[CV_SIGNAL], 0, FALSE); +} + +// WakeConditionVariable or WakeAllConditionVariable function. + +void +spl_cv_broadcast(kcondvar_t *cvp) +{ + if (cvp->initialised != CONDVAR_INIT) + panic("%s: not initialised", __func__); + KIRQL oldIrq; + +// KeAcquireSpinLock(&cvp->waiters_count_lock, &oldIrq); + int have_waiters = cvp->waiters_count > 0; +// KeReleaseSpinLock(&cvp->waiters_count_lock, oldIrq); + + if (have_waiters) + KeSetEvent(&cvp->kevent[CV_BROADCAST], 0, FALSE); +} + +/* + * Block on the indicated condition variable and + * release the associated mutex while blocked. + */ +void +spl_cv_wait(kcondvar_t *cvp, kmutex_t *mp, int flags, const char *msg) +{ + int result; + if (cvp->initialised != CONDVAR_INIT) + panic("%s: not initialised", __func__); + + if (msg != NULL && msg[0] == '&') + ++msg; /* skip over '&' prefixes */ +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, 0); +#endif + KIRQL oldIrq; +// KeAcquireSpinLock(&cvp->waiters_count_lock, &oldIrq); + atomic_inc_32(&cvp->waiters_count); +// KeReleaseSpinLock(&cvp->waiters_count_lock, oldIrq); + mutex_exit(mp); + void *locks[CV_MAX_EVENTS] = { &cvp->kevent[CV_SIGNAL], &cvp->kevent[CV_BROADCAST] }; + result = KeWaitForMultipleObjects(2, locks, WaitAny, Executive, KernelMode, FALSE, NULL, NULL); + +// KeAcquireSpinLock(&cvp->waiters_count_lock, &oldIrq); + // If last listener, clear BROADCAST event. (Even if it was SIGNAL + // overclearing will not hurt?) 
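+	// Note: KeWaitForMultipleObjects() with WaitAny returns
+	// STATUS_WAIT_0 + the index of the object that satisfied the wait,
+	// so (result == STATUS_WAIT_0 + CV_BROADCAST) identifies a wakeup
+	// caused by the broadcast event rather than the signal event.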
+ if (cvp->waiters_count == 1) + KeClearEvent(&cvp->kevent[CV_BROADCAST]); + + atomic_dec_32(&cvp->waiters_count); + + //int last_waiter = + // result == STATUS_WAIT_0 + CV_BROADCAST + // && cvp->waiters_count == 0; +// KeReleaseSpinLock(&cvp->waiters_count_lock, oldIrq); + + //if (last_waiter) + // KeClearEvent(&cvp->kevent[CV_BROADCAST]); + + mutex_enter(mp); + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, gethrestime_sec()); +#endif +} + +/* + * Same as cv_wait except the thread will unblock at 'tim' + * (an absolute time) if it hasn't already unblocked. + * + * Returns the amount of time left from the original 'tim' value + * when it was unblocked. + */ +int +spl_cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flags, + const char *msg) +{ + int result; + clock_t timenow; + LARGE_INTEGER timeout; + (void) cvp; (void) flags; + + if (cvp->initialised != CONDVAR_INIT) + panic("%s: not initialised", __func__); + + if (msg != NULL && msg[0] == '&') + ++msg; /* skip over '&' prefixes */ + + timenow = zfs_lbolt(); + + // Check for events already in the past + if (tim < timenow) + tim = timenow; + + /* + * Pointer to a time-out value that specifies the absolute or + * relative time, in 100-nanosecond units, at which the wait is to + * be completed. A positive value specifies an absolute time, + * relative to January 1, 1601. A negative value specifies an + * interval relative to the current time. + */ + timeout.QuadPart = -100000 * MAX(1, (tim - timenow) / hz); + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, 0); +#endif + KIRQL oldIrq; +// KeAcquireSpinLock(&cvp->waiters_count_lock, &oldIrq); + atomic_inc_32(&cvp->waiters_count); +// KeReleaseSpinLock(&cvp->waiters_count_lock, oldIrq); + mutex_exit(mp); + + void *locks[CV_MAX_EVENTS] = { &cvp->kevent[CV_SIGNAL], &cvp->kevent[CV_BROADCAST] }; + result = KeWaitForMultipleObjects(2, locks, WaitAny, Executive, KernelMode, FALSE, &timeout, NULL); + +// KeAcquireSpinLock(&cvp->waiters_count_lock, &oldIrq); + + int last_waiter = + result == STATUS_WAIT_0 + CV_BROADCAST + && cvp->waiters_count == 1; +// KeReleaseSpinLock(&cvp->waiters_count_lock, oldIrq); + + if (last_waiter) + KeClearEvent(&cvp->kevent[CV_BROADCAST]); + + atomic_dec_32(&cvp->waiters_count); + + mutex_enter(mp); + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, gethrestime_sec()); +#endif + return (result == STATUS_TIMEOUT ? -1 : 0); + +} + + +/* +* Compatibility wrapper for the cv_timedwait_hires() Illumos interface. +*/ +clock_t +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + int result; + LARGE_INTEGER timeout; + + if (cvp->initialised != CONDVAR_INIT) + panic("%s: not initialised", __func__); + ASSERT(cvp->initialised == CONDVAR_INIT); + + if (res > 1) { + /* + * Align expiration to the specified resolution. + */ + if (flag & CALLOUT_FLAG_ROUNDUP) + tim += res - 1; + tim = (tim / res) * res; + } + + if (flag & CALLOUT_FLAG_ABSOLUTE) { + // 'tim' here is absolute UNIX time (from gethrtime()) so convert it to + // absolute Windows time + hrtime_t now = gethrtime(); + + tim -= now; // Remove the ticks, what remains should be "sleep" amount. 
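+		// At this point 'tim' is a relative interval in nanoseconds
+		// (hrtime_t); below it is divided by 100 to express it in
+		// 100-nanosecond units and negated, which
+		// KeWaitForMultipleObjects() interprets as a relative timeout.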
+ } + timeout.QuadPart = -tim / 100; + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, 0); +#endif + KIRQL oldIrq; +// KeAcquireSpinLock(&cvp->waiters_count_lock, &oldIrq); + atomic_inc_32(&cvp->waiters_count); +// KeReleaseSpinLock(&cvp->waiters_count_lock, oldIrq); + mutex_exit(mp); + + void *locks[CV_MAX_EVENTS] = { &cvp->kevent[CV_SIGNAL], &cvp->kevent[CV_BROADCAST] }; + result = KeWaitForMultipleObjects(2, locks, WaitAny, Executive, KernelMode, FALSE, &timeout, NULL); + +// KeAcquireSpinLock(&cvp->waiters_count_lock, &oldIrq); + + int last_waiter = + result == STATUS_WAIT_0 + CV_BROADCAST + && cvp->waiters_count == 1; +// KeReleaseSpinLock(&cvp->waiters_count_lock, oldIrq); + + if (last_waiter) + KeClearEvent(&cvp->kevent[CV_BROADCAST]); + + atomic_dec_32(&cvp->waiters_count); + + mutex_enter(mp); + +#ifdef SPL_DEBUG_MUTEX + spl_wdlist_settime(mp->leak, gethrestime_sec()); +#endif + + return (result == STATUS_TIMEOUT ? -1 : 0); +} diff --git a/module/os/windows/spl/spl-cred.c b/module/os/windows/spl/spl-cred.c new file mode 100644 index 000000000000..50fafbe8bc64 --- /dev/null +++ b/module/os/windows/spl/spl-cred.c @@ -0,0 +1,165 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#include +#include +//#include + +/* Return the effective user id */ +uid_t +crgetuid(const cred_t *cr) +{ + if (!cr) return 0; + // return kauth_cred_getuid((kauth_cred_t)cr); + return (uint64_t)-1; +} + + +/* Return the real user id */ +uid_t +crgetruid(const cred_t *cr) +{ + if (!cr) return 0; + //return kauth_cred_getruid((kauth_cred_t)cr); + return (uint64_t)-1; +} + +/* Return the saved user id */ +uid_t +crgetsuid(const cred_t *cr) +{ + if (!cr) return 0; + //return kauth_cred_getsvuid((kauth_cred_t)cr); + return (uint64_t)-1; +} + +/* Return the filesystem user id */ +uid_t +crgetfsuid(const cred_t *cr) +{ + if (!cr) return 0; + return (uint64_t)-1; +} + +/* Return the effective group id */ +gid_t +crgetgid(const cred_t *cr) +{ + if (!cr) return 0; +// return kauth_cred_getgid((kauth_cred_t)cr); + return (uint64_t)-1; +} + +/* Return the real group id */ +gid_t +crgetrgid(const cred_t *cr) +{ + if (!cr) return 0; +// return kauth_cred_getrgid((kauth_cred_t)cr); + return (uint64_t)-1; +} + +/* Return the saved group id */ +gid_t +crgetsgid(const cred_t *cr) +{ + if (!cr) return 0; +// return kauth_cred_getsvgid((kauth_cred_t)cr); + return (uint64_t)-1; +} + +/* Return the filesystem group id */ +gid_t +crgetfsgid(const cred_t *cr) +{ + (void)cr; + return (uint64_t)-1; +} + + +/* + * Unfortunately, to get the count of groups, we have to call XNU which + * memcpy's them over. No real clean way to get around that, but at least + * these calls are done sparingly. 
+ */ +int crgetngroups(const cred_t *cr) +{ + (void)cr; + // gid_t gids[NGROUPS]; +// int count = NGROUPS; +// int ret; +// +// ret = kauth_cred_getgroups((kauth_cred_t) cr, gids, &count); +// +// if (!ret) return count; + + return 0; +} + + +/* + * We always allocate NGROUPs here, since we don't know how many there will + * be until after the call. Unlike IllumOS, the ptr returned is allocated + * and must be returned by a call to crgetgroupsfree(). + */ +gid_t *crgetgroups(const cred_t *cr) +{ + gid_t *gids; + int count = NGROUPS; + (void)cr; + + gids = kmem_zalloc(sizeof(gid_t) * count, KM_SLEEP); + if (!gids) return NULL; + + //kauth_cred_getgroups((kauth_cred_t) cr, gids, &count); + + return gids; +} + +void crgetgroupsfree(gid_t *gids) +{ + if (!gids) return; + kmem_free(gids, sizeof(gid_t) * NGROUPS); +} + +/* + * Return true if "cr" belongs in group "gid". + */ +int spl_cred_ismember_gid(cred_t *cr, gid_t gid) +{ + int ret = 0; // Is not member. + (void)cr; (void)gid; + //kauth_cred_ismember_gid((kauth_cred_t)cr, gid, &ret); + if (ret == 1) + return TRUE; + return FALSE; +} + +int groupmember(gid_t gid, kauth_cred_t *cred) +{ + return 0; +} \ No newline at end of file diff --git a/module/os/windows/spl/spl-ddi.c b/module/os/windows/spl/spl-ddi.c new file mode 100644 index 000000000000..c0b2ed361038 --- /dev/null +++ b/module/os/windows/spl/spl-ddi.c @@ -0,0 +1,655 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +//#include + +/* + * Allocate a set of pointers to 'n_items' objects of size 'size' + * bytes. Each pointer is initialized to nil. + * + * The 'size' and 'n_items' values are stashed in the opaque + * handle returned to the caller. + * + * This implementation interprets 'set of pointers' to mean 'array + * of pointers' but note that nothing in the interface definition + * precludes an implementation that uses, for example, a linked list. + * However there should be a small efficiency gain from using an array + * at lookup time. + * + * NOTE As an optimization, we make our growable array allocations in + * powers of two (bytes), since that's how much kmem_alloc (currently) + * gives us anyway. It should save us some free/realloc's .. + * + * As a further optimization, we make the growable array start out + * with MIN_N_ITEMS in it. 
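+ *
+ * A minimal usage sketch (hypothetical driver code -- my_statep,
+ * my_soft_t and instance are illustrative names -- using the
+ * interfaces implemented below):
+ *
+ *	static void *my_statep;
+ *
+ *	ddi_soft_state_init(&my_statep, sizeof (my_soft_t), 4);
+ *	if (ddi_soft_state_zalloc(my_statep, instance) == DDI_SUCCESS) {
+ *		my_soft_t *sp = ddi_get_soft_state(my_statep, instance);
+ *		...
+ *		ddi_soft_state_free(my_statep, instance);
+ *	}
+ *	ddi_soft_state_fini(&my_statep);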
+ */ + + +int +ddi_soft_state_init(void **state_p, uint32_t size, uint32_t n_items) +{ + struct i_ddi_soft_state *ss; + + if (state_p == NULL || *state_p != NULL || size == 0) + return (EINVAL); + + ss = kmem_zalloc(sizeof (*ss), KM_SLEEP); + mutex_init(&ss->lock, NULL, MUTEX_DRIVER, NULL); + ss->size = size; + + if (n_items < MIN_N_ITEMS) + ss->n_items = MIN_N_ITEMS; + else { + int bitlog; + + if ((bitlog = ddi_fls(n_items)) == ddi_ffs(n_items)) + bitlog--; + ss->n_items = 1 << bitlog; + } + + ASSERT(ss->n_items >= n_items); + + ss->array = kmem_zalloc(ss->n_items * sizeof (void *), KM_SLEEP); + + *state_p = ss; + + return (0); +} + + +/* + * Allocate a state structure of size 'size' to be associated + * with item 'item'. + * + * In this implementation, the array is extended to + * allow the requested offset, if needed. + */ +int +ddi_soft_state_zalloc(void *state, int item) +{ + struct i_ddi_soft_state *ss; + void **array; + void *new_element; + + if ((ss = state) == NULL || item < 0) + return (DDI_FAILURE); + + mutex_enter(&ss->lock); + if (ss->size == 0) { + mutex_exit(&ss->lock); + cmn_err(CE_WARN, "ddi_soft_state_zalloc: bad handle"); + return (DDI_FAILURE); + } + + array = ss->array; /* NULL if ss->n_items == 0 */ + ASSERT(ss->n_items != 0 && array != NULL); + + /* + * refuse to tread on an existing element + */ + if (item < ss->n_items && array[item] != NULL) { + mutex_exit(&ss->lock); + return (DDI_FAILURE); + } + + /* + * Allocate a new element to plug in + */ + new_element = kmem_zalloc(ss->size, KM_SLEEP); + + /* + * Check if the array is big enough, if not, grow it. + */ + if (item >= ss->n_items) { + void **new_array; + uint32_t new_n_items; + struct i_ddi_soft_state *dirty; + + /* + * Allocate a new array of the right length, copy + * all the old pointers to the new array, then + * if it exists at all, put the old array on the + * dirty list. + * + * Note that we can't kmem_free() the old array. + * + * Why -- well the 'get' operation is 'mutex-free', so we + * can't easily catch a suspended thread that is just about + * to dereference the array we just grew out of. So we + * cons up a header and put it on a list of 'dirty' + * pointer arrays. (Dirty in the sense that there may + * be suspended threads somewhere that are in the middle + * of referencing them). Fortunately, we -can- garbage + * collect it all at ddi_soft_state_fini time. + */ + new_n_items = ss->n_items; + while (new_n_items < (1 + item)) + new_n_items <<= 1; /* double array size .. */ + + ASSERT(new_n_items >= (1 + item)); /* sanity check! */ + + new_array = kmem_zalloc(new_n_items * sizeof (void *), + KM_SLEEP); + /* + * Copy the pointers into the new array + */ + bcopy(array, new_array, ss->n_items * sizeof (void *)); + + /* + * Save the old array on the dirty list + */ + dirty = kmem_zalloc(sizeof (*dirty), KM_SLEEP); + dirty->array = ss->array; + dirty->n_items = ss->n_items; + dirty->next = ss->next; + ss->next = dirty; + + ss->array = (array = new_array); + ss->n_items = new_n_items; + } + + ASSERT(array != NULL && item < ss->n_items && array[item] == NULL); + + array[item] = new_element; + + mutex_exit(&ss->lock); + return (DDI_SUCCESS); +} + + +/* + * Fetch a pointer to the allocated soft state structure. + * + * This is designed to be cheap. + * + * There's an argument that there should be more checking for + * nil pointers and out of bounds on the array.. but we do a lot + * of that in the alloc/free routines. 
+ * + * An array has the convenience that we don't need to lock read-access + * to it c.f. a linked list. However our "expanding array" strategy + * means that we should hold a readers lock on the i_ddi_soft_state + * structure. + * + * However, from a performance viewpoint, we need to do it without + * any locks at all -- this also makes it a leaf routine. The algorithm + * is 'lock-free' because we only discard the pointer arrays at + * ddi_soft_state_fini() time. + */ +void * +ddi_get_soft_state(void *state, int item) +{ + struct i_ddi_soft_state *ss = state; + + ASSERT(ss != NULL && item >= 0); + + if (item < ss->n_items && ss->array != NULL) + return (ss->array[item]); + return (NULL); +} + +/* + * Free the state structure corresponding to 'item.' Freeing an + * element that has either gone or was never allocated is not + * considered an error. Note that we free the state structure, but + * we don't shrink our pointer array, or discard 'dirty' arrays, + * since even a few pointers don't really waste too much memory. + * + * Passing an item number that is out of bounds, or a null pointer will + * provoke an error message. + */ +void +ddi_soft_state_free(void *state, int item) +{ + struct i_ddi_soft_state *ss; + void **array; + void *element; + static char msg[] = "ddi_soft_state_free:"; + + if ((ss = state) == NULL) { + cmn_err(CE_WARN, "%s null handle", + msg); + return; + } + + element = NULL; + + mutex_enter(&ss->lock); + + if ((array = ss->array) == NULL || ss->size == 0) { + cmn_err(CE_WARN, "%s bad handle", + msg); + } else if (item < 0 || item >= ss->n_items) { + cmn_err(CE_WARN, "%s item %d not in range [0..%lu]", + msg, item, ss->n_items - 1); + } else if (array[item] != NULL) { + element = array[item]; + array[item] = NULL; + } + + mutex_exit(&ss->lock); + + if (element) + kmem_free(element, ss->size); +} + + +/* + * Free the entire set of pointers, and any + * soft state structures contained therein. + * + * Note that we don't grab the ss->lock mutex, even though + * we're inspecting the various fields of the data structure. + * + * There is an implicit assumption that this routine will + * never run concurrently with any of the above on this + * particular state structure i.e. by the time the driver + * calls this routine, there should be no other threads + * running in the driver. 
+ */ +void +ddi_soft_state_fini(void **state_p) +{ + struct i_ddi_soft_state *ss, *dirty; + int item; + static char msg[] = "ddi_soft_state_fini:"; + + if (state_p == NULL || (ss = *state_p) == NULL) { + //cmn_err(CE_WARN, "%s null handle", + // msg); + return; + } + + if (ss->size == 0) { + cmn_err(CE_WARN, "%s bad handle", + msg); + return; + } + + if (ss->n_items > 0) { + for (item = 0; item < ss->n_items; item++) + ddi_soft_state_free(ss, item); + kmem_free(ss->array, ss->n_items * sizeof (void *)); + } + + /* + * Now delete any dirty arrays from previous 'grow' operations + */ + for (dirty = ss->next; dirty; dirty = ss->next) { + ss->next = dirty->next; + kmem_free(dirty->array, dirty->n_items * sizeof (void *)); + kmem_free(dirty, sizeof (*dirty)); + } + + mutex_destroy(&ss->lock); + kmem_free(ss, sizeof (*ss)); + + *state_p = NULL; +} + +int +ddi_create_minor_node(dev_info_t *dip, char *name, int spec_type, + minor_t minor_num, char *node_type, int flag) +{ + dev_t dev; + int error=0; + char *r, *dup; + + //printf("ddi_create_minor_node: name %s: %d,%d\n", name, flag, minor_num); + + //dev = makedev(flag, minor_num); + dev = minor_num; + dip->dev = dev; + + /* + * http://lists.apple.com/archives/darwin-kernel/2007/Nov/msg00038.html + * + * devfs_make_name() has an off-by-one error when using directories + * and it appears Apple does not want to fix it. + * + * We then change "/" to "_" and create more Apple-like /dev names + * + */ + MALLOC(dup, char *, strlen(name)+1, M_TEMP, M_WAITOK); + if (dup == NULL) return ENOMEM; + bcopy(name, dup, strlen(name)); + dup[strlen(name)] = '\0'; + + for (r = dup; + (r=strchr(r, '/')); + *r = '_') /* empty */ ; + + dip->devc = NULL; + dip->devb = NULL; +#if 0 + if (spec_type == S_IFCHR) + dip->devc = devfs_make_node(dev, DEVFS_CHAR, /* Make the node */ + UID_ROOT, GID_OPERATOR, + 0600, "rdisk_%s", dup); + //0600, "rdisk3", dup); + else + dip->devb = devfs_make_node(dev, DEVFS_BLOCK, /* Make the node */ + UID_ROOT, GID_OPERATOR, + 0600, "disk_%s", dup); + //0600, "disk3", dup); +#endif + //printf("ddi_create_minor: devfs_make_name '%s'\n", dup ); + + FREE(dup, M_TEMP); + + return error; +} + + +void +ddi_remove_minor_node(dev_info_t *dip, char *name) +{ + //printf("zvol: remove minor: '%s'\n", name ? 
name : ""); + if (dip->devc) { +// devfs_remove(dip->devc); + dip->devc = NULL; + } + if (dip->devb) { + // devfs_remove(dip->devb); + dip->devb = NULL; + } +} + + + +int +ddi_strtol(const char *str, char **nptr, int base, long *result) +{ + long val; + int c; + int xx; + int neg = 0; + long multmin; + long limit; + const char **ptr = (const char **)nptr; + const unsigned char *ustr = (const unsigned char *)str; + + if (ptr != (const char **)0) + *ptr = (char *)ustr; /* in case no number is formed */ + if (base < 0 || base > MBASE || base == 1) { + /* base is invalid -- should be a fatal error */ + return (EINVAL); + } + if (!isalnum(c = *ustr)) { + while (isspace(c)) + c = *++ustr; + switch (c) { + case '-': + neg++; + /* FALLTHROUGH */ + case '+': + c = *++ustr; + } + } + if (base == 0) + if (c != '0') + base = 10; + else if (ustr[1] == 'x' || ustr[1] == 'X') + base = 16; + else + base = 8; + /* + * for any base > 10, the digits incrementally following + * 9 are assumed to be "abc...z" or "ABC...Z" + */ + if (!lisalnum(c) || (xx = DIGIT(c)) >= base) { + /* no number formed */ + return (EINVAL); + } + if (base == 16 && c == '0' && (ustr[1] == 'x' || ustr[1] == 'X') && + isxdigit(ustr[2])) + c = *(ustr += 2); /* skip over leading "0x" or "0X" */ + + /* this code assumes that abs(LONG_MIN) >= abs(LONG_MAX) */ + if (neg) + limit = LONG_MIN; + else + limit = -LONG_MAX; + multmin = limit / (long)base; + val = -DIGIT(c); + for (c = *++ustr; lisalnum(c) && (xx = DIGIT(c)) < base; ) { + /* accumulate neg avoids surprises near LONG_MAX */ + if (val < multmin) + goto overflow; + val *= base; + if (val < limit + xx) + goto overflow; + val -= xx; + c = *++ustr; + } + if (ptr != (const char **)0) + *ptr = (char *)ustr; + *result = neg ? val : -val; + return (0); + +overflow: + for (c = *++ustr; lisalnum(c) && (xx = DIGIT(c)) < base; (c = *++ustr)) + ; + if (ptr != (const char **)0) + *ptr = (char *)ustr; + return (ERANGE); +} + +char * __cdecl +strpbrk(const char *s, const char *b) +{ + const char *p; + + do { + for (p = b; *p != '\0' && *p != *s; ++p) + ; + if (*p != '\0') + return ((char *)s); + } while (*s++); + return (NULL); +} + +int +ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result) +{ + *result = (unsigned long)_strtoui64(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == (unsigned long)ULONG_MAX) + return (ERANGE); + return (0); +} + +int +ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result) +{ + *result = (unsigned long long)_strtoui64(str, nptr, base); + if (*result == 0) + return (EINVAL); + else if (*result == ULLONG_MAX) + return (ERANGE); + return (0); +} + +int +ddi_strtoll(const char *str, char **nptr, int base, long long *result) +{ + long long val; + int c; + int xx; + int neg = 0; + long long multmin; + long long limit; + const char **ptr = (const char **)nptr; + const unsigned char *ustr = (const unsigned char *)str; + + if (ptr != (const char **)0) + *ptr = (char *)ustr; /* in case no number is formed */ + if (base < 0 || base > MBASE || base == 1) { + /* base is invalid -- should be a fatal error */ + return (EINVAL); + } + if (!isalnum(c = *ustr)) { + while (isspace(c)) + c = *++ustr; + switch (c) { + case '-': + neg++; + /* FALLTHROUGH */ + case '+': + c = *++ustr; + } + } + if (base == 0) + if (c != '0') + base = 10; + else if (ustr[1] == 'x' || ustr[1] == 'X') + base = 16; + else + base = 8; + /* + * for any base > 10, the digits incrementally following + * 9 are assumed to be "abc...z" or 
"ABC...Z" + */ + if (!lisalnum(c) || (xx = DIGIT(c)) >= base) { + /* no number formed */ + return (EINVAL); + } + if (base == 16 && c == '0' && (ustr[1] == 'x' || ustr[1] == 'X') && + isxdigit(ustr[2])) + c = *(ustr += 2); /* skip over leading "0x" or "0X" */ + + /* this code assumes that abs(LONG_MIN) >= abs(LONG_MAX) */ + if (neg) + limit = LONGLONG_MIN; + else + limit = -LONGLONG_MAX; + multmin = limit / (long)base; + val = -DIGIT(c); + for (c = *++ustr; lisalnum(c) && (xx = DIGIT(c)) < base; ) { + /* accumulate neg avoids surprises near LONG_MAX */ + if (val < multmin) + goto overflow; + val *= base; + if (val < limit + xx) + goto overflow; + val -= xx; + c = *++ustr; + } + if (ptr != (const char **)0) + *ptr = (char *)ustr; + *result = neg ? val : -val; + return (0); + +overflow: + for (c = *++ustr; lisalnum(c) && (xx = DIGIT(c)) < base; (c = *++ustr)) + ; + if (ptr != (const char **)0) + *ptr = (char *)ustr; + return (ERANGE); +} + +uint32_t +ddi_strcspn(const char * __restrict s, const char * __restrict charset) +{ + /* + * NB: idx and bit are temporaries whose use causes gcc 3.4.2 to + * generate better code. Without them, gcc gets a little confused. + */ + const char *s1; + u_long bit; + u_long tbl[(255 + 1) / LONG_BIT]; + int idx; + if (*s == '\0') + return (0); + + // 64bit code + tbl[0] = 1; + tbl[3] = tbl[2] = tbl[1] = 0; + for (; *charset != '\0'; charset++) { + idx = IDX(*charset); + bit = BIT(*charset); + tbl[idx] |= bit; + + } + + for (s1 = s; ; s1++) { + idx = IDX(*s1); + bit = BIT(*s1); + if ((tbl[idx] & bit) != 0) + break; + } + return (uint32_t)(s1 - s); +} + +extern uint32_t +strlcpy(register char* s, register const char* t, register uint32_t n) +{ + const char* o = t; + + if (n) + do + { + if (!--n) + { + *s = 0; + break; + } + } while (*s++ = *t++); + if (!n) + while (*t++); + return (uint32_t)(t - o - 1); +} + +extern uint32_t +strlcat(register char* s, register const char* t, register uint32_t n) +{ + register size_t m; + const char* o = t; + + if (m = n) + { + while (n && *s) + { + n--; + s++; + } + m -= n; + if (n) + do + { + if (!--n) + { + *s = 0; + break; + } + } while (*s++ = *t++); + else + *s = 0; + } + if (!n) + while (*t++); + return (t - o) + m - 1; +} diff --git a/module/os/windows/spl/spl-debug.c b/module/os/windows/spl/spl-debug.c new file mode 100644 index 000000000000..e8d05c9f435c --- /dev/null +++ b/module/os/windows/spl/spl-debug.c @@ -0,0 +1,28 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include + + +/* Debug log support enabled */ diff --git a/module/os/windows/spl/spl-err.c b/module/os/windows/spl/spl-err.c new file mode 100644 index 000000000000..d51c047f81bd --- /dev/null +++ b/module/os/windows/spl/spl-err.c @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#include +#include +#include + +#include + +void +vcmn_err(int ce, const char *fmt, va_list ap) +{ + char msg[MAXMSGLEN]; + + _vsnprintf(msg, MAXMSGLEN - 1, fmt, ap); + + switch (ce) { + case CE_IGNORE: + break; + case CE_CONT: + dprintf("%s", msg); + break; + case CE_NOTE: + dprintf("SPL: Notice: %s\n", msg); + break; + case CE_WARN: + TraceEvent(TRACE_WARNING, "SPL: Warning: %s\n", msg); + break; + case CE_PANIC: + PANIC("%s", msg); + break; + } +} /* vcmn_err() */ + +void +cmn_err(int ce, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vcmn_err(ce, fmt, ap); + va_end(ap); +} /* cmn_err() */ diff --git a/module/os/windows/spl/spl-kmem.c b/module/os/windows/spl/spl-kmem.c new file mode 100644 index 000000000000..adea91dd1b36 --- /dev/null +++ b/module/os/windows/spl/spl-kmem.c @@ -0,0 +1,7067 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * Copyright (C) 2017 Brendon Humphrey + * Copyright (C) 2017 Sean Doran + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
+ * + */ + +#include +//#include +#include +#include +//#include +#include +#include +#include +//#include +#include +#include +#include +#include +//#include +#include +//#include + +#include + +// =============================================================== +// Options +// =============================================================== +// #define PRINT_CACHE_STATS 1 + +// Uncomment to turn on kmems' debug features. +//#define DEBUG +// =============================================================== +// OS Interface +// =============================================================== + +// This variable is a count of the number of threads +// blocked waiting for memory pages to become free. +// We are using wake indications on this event as a +// indication of paging activity, and therefore as a +// proxy to the machine experiencing memory pressure. +// +// xnu vm variables +extern volatile unsigned int vm_page_free_wanted; // 0 by default smd +extern unsigned int vm_page_free_min; // 3500 by default smd kern.vm_page_free_min, rarely changes +extern volatile unsigned int vm_page_free_count; // will tend to vm_page_free_min smd + +#define SMALL_PRESSURE_INCURSION_PAGES (vm_page_free_min >> 5) + +static kcondvar_t spl_free_thread_cv; +static kmutex_t spl_free_thread_lock; +static boolean_t spl_free_thread_exit; +static volatile _Atomic int64_t spl_free; +int64_t spl_free_delta_ema; + +static boolean_t spl_event_thread_exit = FALSE; +static PKEVENT low_mem_event = NULL; + +static volatile _Atomic int64_t spl_free_manual_pressure = 0; +static volatile _Atomic boolean_t spl_free_fast_pressure = FALSE; +static _Atomic boolean_t spl_free_maybe_reap_flag = FALSE; +static _Atomic uint64_t spl_free_last_pressure = 0; + +// Start and end address of kernel memory +extern vm_offset_t virtual_space_start; +extern vm_offset_t virtual_space_end; + +// Can be polled to determine if the VM is experiecing +// a shortage of free pages. +extern int vm_pool_low(void); + +// Which CPU are we executing on? +extern uint32_t cpu_number(); + +// Invoke the kernel debugger +extern void Debugger(const char *message); + +// =============================================================== +// Non Illumos Variables +// =============================================================== + +// Flag to cause tasks and threads to terminate as +// the kmem module is preparing to unload. 
+static int shutting_down = 0; + +// Amount of RAM in pages, in machine that ZFS can use +uint64_t physmem = 0; + +// Size in bytes of the memory allocated in seg_kmem +extern uint64_t segkmem_total_mem_allocated; + +// Number of active threads +extern uint64_t zfs_threads; +extern uint64_t zfs_active_mutex; +extern uint64_t zfs_active_rwlock; + +// Amount of RAM in bytes, in machine that ZFS can use +extern uint64_t total_memory; + +// Amount of RAM in bytes, in host machine (Windows) +extern uint64_t real_total_memory; + +#define MULT 1 + +static const char *KMEM_VA_PREFIX = "kmem_va"; +static const char *KMEM_MAGAZINE_PREFIX = "kmem_magazine_"; + +static char kext_version[64] = SPL_META_VERSION "-" SPL_META_RELEASE SPL_DEBUG_STR; + +//struct sysctl_oid_list sysctl__spl_children; +//SYSCTL_DECL(_spl); +//SYSCTL_NODE(, OID_AUTO, spl, CTLFLAG_RD, 0, ""); +//SYSCTL_STRING(_spl, OID_AUTO, kext_version, +// CTLFLAG_RD | CTLFLAG_LOCKED, +// kext_version, 0, "SPL KEXT Version"); + +extern void kstat_init(void); + + +// =============================================================== +// Illumos Variables +// =============================================================== + +struct kmem_cache_kstat { + kstat_named_t kmc_buf_size; + kstat_named_t kmc_align; + kstat_named_t kmc_chunk_size; + kstat_named_t kmc_slab_size; + kstat_named_t kmc_alloc; + kstat_named_t kmc_alloc_fail; + kstat_named_t kmc_free; + kstat_named_t kmc_depot_alloc; + kstat_named_t kmc_depot_free; + kstat_named_t kmc_depot_contention; + kstat_named_t kmc_slab_alloc; + kstat_named_t kmc_slab_free; + kstat_named_t kmc_buf_constructed; + kstat_named_t kmc_buf_avail; + kstat_named_t kmc_buf_inuse; + kstat_named_t kmc_buf_total; + kstat_named_t kmc_buf_max; + kstat_named_t kmc_slab_create; + kstat_named_t kmc_slab_destroy; + kstat_named_t kmc_vmem_source; + kstat_named_t kmc_hash_size; + kstat_named_t kmc_hash_lookup_depth; + kstat_named_t kmc_hash_rescale; + kstat_named_t kmc_full_magazines; + kstat_named_t kmc_empty_magazines; + kstat_named_t kmc_magazine_size; + kstat_named_t kmc_reap; /* number of kmem_cache_reap() calls */ + kstat_named_t kmc_defrag; /* attempts to defrag all partial slabs */ + kstat_named_t kmc_scan; /* attempts to defrag one partial slab */ + kstat_named_t kmc_move_callbacks; /* sum of yes, no, later, dn, dk */ + kstat_named_t kmc_move_yes; + kstat_named_t kmc_move_no; + kstat_named_t kmc_move_later; + kstat_named_t kmc_move_dont_need; + kstat_named_t kmc_move_dont_know; /* obj unrecognized by client ... */ + kstat_named_t kmc_move_hunt_found; /* ... 
but found in mag layer */ + kstat_named_t kmc_move_slabs_freed; /* slabs freed by consolidator */ + kstat_named_t kmc_move_reclaimable; /* buffers, if consolidator ran */ + kstat_named_t kmc_no_vba_success; + kstat_named_t kmc_no_vba_fail; + kstat_named_t kmc_arc_no_grow_set; + kstat_named_t kmc_arc_no_grow; +} kmem_cache_kstat = { + { "buf_size", KSTAT_DATA_UINT64 }, + { "align", KSTAT_DATA_UINT64 }, + { "chunk_size", KSTAT_DATA_UINT64 }, + { "slab_size", KSTAT_DATA_UINT64 }, + { "alloc", KSTAT_DATA_UINT64 }, + { "alloc_fail", KSTAT_DATA_UINT64 }, + { "free", KSTAT_DATA_UINT64 }, + { "depot_alloc", KSTAT_DATA_UINT64 }, + { "depot_free", KSTAT_DATA_UINT64 }, + { "depot_contention", KSTAT_DATA_UINT64 }, + { "slab_alloc", KSTAT_DATA_UINT64 }, + { "slab_free", KSTAT_DATA_UINT64 }, + { "buf_constructed", KSTAT_DATA_UINT64 }, + { "buf_avail", KSTAT_DATA_UINT64 }, + { "buf_inuse", KSTAT_DATA_UINT64 }, + { "buf_total", KSTAT_DATA_UINT64 }, + { "buf_max", KSTAT_DATA_UINT64 }, + { "slab_create", KSTAT_DATA_UINT64 }, + { "slab_destroy", KSTAT_DATA_UINT64 }, + { "vmem_source", KSTAT_DATA_UINT64 }, + { "hash_size", KSTAT_DATA_UINT64 }, + { "hash_lookup_depth", KSTAT_DATA_UINT64 }, + { "hash_rescale", KSTAT_DATA_UINT64 }, + { "full_magazines", KSTAT_DATA_UINT64 }, + { "empty_magazines", KSTAT_DATA_UINT64 }, + { "magazine_size", KSTAT_DATA_UINT64 }, + { "reap", KSTAT_DATA_UINT64 }, + { "defrag", KSTAT_DATA_UINT64 }, + { "scan", KSTAT_DATA_UINT64 }, + { "move_callbacks", KSTAT_DATA_UINT64 }, + { "move_yes", KSTAT_DATA_UINT64 }, + { "move_no", KSTAT_DATA_UINT64 }, + { "move_later", KSTAT_DATA_UINT64 }, + { "move_dont_need", KSTAT_DATA_UINT64 }, + { "move_dont_know", KSTAT_DATA_UINT64 }, + { "move_hunt_found", KSTAT_DATA_UINT64 }, + { "move_slabs_freed", KSTAT_DATA_UINT64 }, + { "move_reclaimable", KSTAT_DATA_UINT64 }, + { "no_vba_success", KSTAT_DATA_UINT64 }, + { "no_vba_fail", KSTAT_DATA_UINT64 }, + { "arc_no_grow_set", KSTAT_DATA_UINT64 }, + { "arc_no_grow", KSTAT_DATA_UINT64 }, +}; + +static kmutex_t kmem_cache_kstat_lock; + +/* + * The default set of caches to back kmem_alloc(). + * These sizes should be reevaluated periodically. + * + * We want allocations that are multiples of the coherency granularity + * (64 bytes) to be satisfied from a cache which is a multiple of 64 + * bytes, so that it will be 64-byte aligned. For all multiples of 64, + * the next 1 greater than or equal to it must be a + * multiple of 64. + * + * We split the table into two sections: size <= 4k and size > 4k. This + * saves a lot of space and cache footprint in our cache tables. 
+ */ +static const int kmem_alloc_sizes[] = { + 1 * 8, + 2 * 8, + 3 * 8, + 4 * 8, 5 * 8, 6 * 8, 7 * 8, + 4 * 16, 5 * 16, 6 * 16, 7 * 16, + 4 * 32, 5 * 32, 6 * 32, 7 * 32, + 4 * 64, 5 * 64, 6 * 64, 7 * 64, + 4 * 128, 9*64, 5 * 128, 6 * 128, 13*64, 7 * 128, + P2ALIGN(8192 / 8, 64), + P2ALIGN(8192 / 7, 64), + P2ALIGN(8192 / 6, 64), + P2ALIGN(8192 / 5, 64), + P2ALIGN(8192 / 4, 64), + P2ALIGN(8192 / 3, 64), + P2ALIGN(8192 / 2, 64), +}; + +static const int kmem_big_alloc_sizes[] = { + 2 * 4096, 3 * 4096, + 2 * 8192, 3 * 8192, + 4 * 8192, 5 * 8192, 6 * 8192, 7 * 8192, + 8 * 8192, 9 * 8192, 10 * 8192, 11 * 8192, + 12 * 8192, 13 * 8192, 14 * 8192, 15 * 8192, + 16 * 8192 +}; + +#define KMEM_MAXBUF 4096 +#define KMEM_BIG_MAXBUF_32BIT 32768 +#define KMEM_BIG_MAXBUF 131072 + +#define KMEM_BIG_MULTIPLE 4096 /* big_alloc_sizes must be a multiple */ +#define KMEM_BIG_SHIFT 12 /* lg(KMEM_BIG_MULTIPLE) */ + +static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT]; +static kmem_cache_t *kmem_big_alloc_table[KMEM_BIG_MAXBUF >> KMEM_BIG_SHIFT]; + +#define KMEM_ALLOC_TABLE_MAX (KMEM_MAXBUF >> KMEM_ALIGN_SHIFT) +static uint32_t kmem_big_alloc_table_max = 0; /* # of filled elements */ + +static kmem_magtype_t kmem_magtype[] = { + { 1, 8, 3200, 65536 }, + { 3, 16, 256, 32768 }, + { 7, 32, 64, 16384 }, + { 15, 64, 0, 8192 }, + { 31, 64, 0, 4096 }, + { 47, 64, 0, 2048 }, + { 63, 64, 0, 1024 }, + { 95, 64, 0, 512 }, + { 143, 64, 0, 0 }, +}; + +#ifdef _WIN32 +static struct bsd_timeout_wrapper kmem_update_timer; +static struct bsd_timeout_wrapper kmem_reaping; +static struct bsd_timeout_wrapper kmem_reaping_idspace; +#else +static uint32_t kmem_reaping; +static uint32_t kmem_reaping_idspace; +#endif + +/* + * kmem tunables + */ +static struct timespec kmem_reap_interval = {5, 0}; + +int kmem_depot_contention = 3; /* max failed tryenters per real interval */ +pgcnt_t kmem_reapahead = 0; /* start reaping N pages before pageout */ +int kmem_panic = 1; /* whether to panic on error */ +int kmem_logging = 0; /* kmem_log_enter() override */ +uint32_t kmem_mtbf = 0; /* mean time between failures [default: off] */ +uint32_t kmem_transaction_log_size; /* transaction log size [2% of memory] */ +uint32_t kmem_content_log_size; /* content log size [2% of memory] */ +uint32_t kmem_failure_log_size; /* failure log [4 pages per CPU] */ +uint32_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */ +uint32_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */ +uint32_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */ +uint32_t kmem_lite_maxalign = 8192; /* maximum buffer alignment for KMF_LITE */ +int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ +uint32_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ +uint32_t kmem_minfirewall; /* hardware-enforced redzone threshold */ + +uint32_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ + +/* + * Be aware that KMF_AUDIT does not release memory, and you will eventually + * grind to a halt. But it is useful to enable if you can trigger a memory + * fault, and wish to see the calling stack. 
+ */ +#ifdef DBG +// can be 0 or KMF_LITE +// or KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS +// with or without KMF_AUDIT +int kmem_flags = KMF_DEADBEEF | KMF_REDZONE; +#else +int kmem_flags = 0; +#endif +int kmem_ready; + +static kmem_cache_t *kmem_slab_cache; +static kmem_cache_t *kmem_bufctl_cache; +static kmem_cache_t *kmem_bufctl_audit_cache; + +static kmutex_t kmem_cache_lock; /* inter-cache linkage only */ +static list_t kmem_caches; +extern vmem_t *heap_arena; +static taskq_t *kmem_taskq; +static kmutex_t kmem_flags_lock; +static vmem_t *kmem_metadata_arena; +static vmem_t *kmem_msb_arena; /* arena for metadata caches */ +static vmem_t *kmem_cache_arena; +static vmem_t *kmem_hash_arena; +static vmem_t *kmem_log_arena; +static vmem_t *kmem_oversize_arena; +static vmem_t *kmem_va_arena; +static vmem_t *kmem_default_arena; +static vmem_t *kmem_firewall_arena; + +/* + * Define KMEM_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define KMEM_STATS +#endif /* DEBUG */ + +#ifdef KMEM_STATS +#define KMEM_STAT_ADD(stat) ((stat)++) +#define KMEM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++)) +#else +#define KMEM_STAT_ADD(stat) /* nothing */ +#define KMEM_STAT_COND_ADD(cond, stat) /* nothing */ +#endif /* KMEM_STATS */ + +/* + * kmem slab consolidator thresholds (tunables) + */ +uint32_t kmem_frag_minslabs = 101; /* minimum total slabs */ +uint32_t kmem_frag_numer = 1; /* free buffers (numerator) */ +uint32_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */ +/* + * Maximum number of slabs from which to move buffers during a single + * maintenance interval while the system is not low on memory. + */ +uint32_t kmem_reclaim_max_slabs = 4; // smd 1 +/* + * Number of slabs to scan backwards from the end of the partial slab list + * when searching for buffers to relocate. + */ +uint32_t kmem_reclaim_scan_range = 48; // smd 12 + +#ifdef KMEM_STATS +static struct { + uint64_t kms_callbacks; + uint64_t kms_yes; + uint64_t kms_no; + uint64_t kms_later; + uint64_t kms_dont_need; + uint64_t kms_dont_know; + uint64_t kms_hunt_found_mag; + uint64_t kms_hunt_found_slab; + uint64_t kms_hunt_alloc_fail; + uint64_t kms_hunt_lucky; + uint64_t kms_notify; + uint64_t kms_notify_callbacks; + uint64_t kms_disbelief; + uint64_t kms_already_pending; + uint64_t kms_callback_alloc_fail; + uint64_t kms_callback_taskq_fail; + uint64_t kms_endscan_slab_dead; + uint64_t kms_endscan_slab_destroyed; + uint64_t kms_endscan_nomem; + uint64_t kms_endscan_refcnt_changed; + uint64_t kms_endscan_nomove_changed; + uint64_t kms_endscan_freelist; + uint64_t kms_avl_update; + uint64_t kms_avl_noupdate; + uint64_t kms_no_longer_reclaimable; + uint64_t kms_notify_no_longer_reclaimable; + uint64_t kms_notify_slab_dead; + uint64_t kms_notify_slab_destroyed; + uint64_t kms_alloc_fail; + uint64_t kms_constructor_fail; + uint64_t kms_dead_slabs_freed; + uint64_t kms_defrags; + uint64_t kms_scans; + uint64_t kms_scan_depot_ws_reaps; + uint64_t kms_debug_reaps; + uint64_t kms_debug_scans; +} kmem_move_stats; +#endif /* KMEM_STATS */ + +/* consolidator knobs */ +static boolean_t kmem_move_noreap; +static boolean_t kmem_move_blocked; +static boolean_t kmem_move_fulltilt; +static boolean_t kmem_move_any_partial; + +#ifdef DEBUG +/* + * kmem consolidator debug tunables: + * Ensure code coverage by occasionally running the consolidator even when the + * caches are not fragmented (they may never be). 
These intervals are mean time + * in cache maintenance intervals (kmem_cache_update). + */ +uint32_t kmem_mtb_move = 20; /* defrag 1 slab (~15min) */ // smd: 60=15m, 20=5min +uint32_t kmem_mtb_reap = 240; /* defrag all slabs (~7.5hrs) */ // 1800=7.5h, 720=3h, 240=1h +uint32_t kmem_mtb_reap_count = 0; // how many times have we done an mtb reap? +#endif /* DEBUG */ + +static kmem_cache_t *kmem_defrag_cache; +static kmem_cache_t *kmem_move_cache; +static taskq_t *kmem_move_taskq; + +static void kmem_cache_scan(kmem_cache_t *); +static void kmem_cache_defrag(kmem_cache_t *); +static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *); + + +kmem_log_header_t *kmem_transaction_log; +kmem_log_header_t *kmem_content_log; +kmem_log_header_t *kmem_failure_log; +kmem_log_header_t *kmem_slab_log; + +static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */ + +#define KMEM_BUFTAG_LITE_ENTER(bt, count, caller) \ +if ((count) > 0) { \ +pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history; \ +pc_t *_e; \ +/* memmove() the old entries down one notch */ \ +for (_e = &_s[(count) - 1]; _e > _s; _e--) \ +*_e = *(_e - 1); \ +*_s = (uintptr_t)(caller); \ +} + +#define KMERR_MODIFIED 0 /* buffer modified while on freelist */ +#define KMERR_REDZONE 1 /* redzone violation (write past end of buf) */ +#define KMERR_DUPFREE 2 /* freed a buffer twice */ +#define KMERR_BADADDR 3 /* freed a bad (unallocated) address */ +#define KMERR_BADBUFTAG 4 /* buftag corrupted */ +#define KMERR_BADBUFCTL 5 /* bufctl corrupted */ +#define KMERR_BADCACHE 6 /* freed a buffer to the wrong cache */ +#define KMERR_BADSIZE 7 /* alloc size != free size */ +#define KMERR_BADBASE 8 /* buffer base address wrong */ + +struct { + hrtime_t kmp_timestamp; /* timestamp of panic */ + int kmp_error; /* type of kmem error */ + void *kmp_buffer; /* buffer that induced panic */ + void *kmp_realbuf; /* real start address for buffer */ + kmem_cache_t *kmp_cache; /* buffer's cache according to client */ + kmem_cache_t *kmp_realcache; /* actual cache containing buffer */ + kmem_slab_t *kmp_slab; /* slab accoring to kmem_findslab() */ + kmem_bufctl_t *kmp_bufctl; /* bufctl */ +} kmem_panic_info; + +extern uint64_t stat_osif_malloc_success; +extern uint64_t stat_osif_malloc_bytes; +extern uint64_t stat_osif_free; +extern uint64_t stat_osif_free_bytes; + +extern uint64_t spl_bucket_non_pow2_allocs; + +// stats for spl_root_allocator(); +extern uint64_t spl_root_allocator_calls; +extern uint64_t spl_root_allocator_large_bytes_asked; +extern uint64_t spl_root_allocator_small_bytes_asked; +extern uint64_t spl_root_allocator_minalloc_bytes_asked; +extern uint64_t spl_root_allocator_extra_pass; +extern uint64_t spl_root_allocator_recovered; +extern uint64_t spl_root_allocator_recovered_bytes; + +extern uint64_t spl_vmem_unconditional_allocs; +extern uint64_t spl_vmem_unconditional_alloc_bytes; +extern uint64_t spl_vmem_conditional_allocs; +extern uint64_t spl_vmem_conditional_alloc_bytes; +extern uint64_t spl_vmem_conditional_alloc_deny; +extern uint64_t spl_vmem_conditional_alloc_deny_bytes; + +extern uint64_t spl_xat_success; +extern uint64_t spl_xat_late_success; +extern uint64_t spl_xat_late_success_nosleep; +extern uint64_t spl_xat_pressured; +extern uint64_t spl_xat_bailed; +extern uint64_t spl_xat_bailed_contended; +extern uint64_t spl_xat_lastalloc; +extern uint64_t spl_xat_lastfree; +extern uint64_t spl_xat_forced; +extern uint64_t spl_xat_sleep; +extern uint64_t spl_xat_late_deny; +extern uint64_t spl_xat_no_waiters; +extern uint64_t spl_xft_wait; 
+ +extern uint64_t spl_vba_parent_memory_appeared; +extern uint64_t spl_vba_parent_memory_blocked; +extern uint64_t spl_vba_hiprio_blocked; +extern uint64_t spl_vba_cv_timeout; +extern uint64_t spl_vba_loop_timeout; +extern uint64_t spl_vba_cv_timeout_blocked; +extern uint64_t spl_vba_loop_timeout_blocked; +extern uint64_t spl_vba_sleep; +extern uint64_t spl_vba_loop_entries; + +extern uint64_t spl_bucket_tunable_large_span; +extern uint64_t spl_bucket_tunable_small_span; +extern void spl_set_bucket_tunable_large_span(uint64_t); +extern void spl_set_bucket_tunable_small_span(uint64_t); + +extern _Atomic uint64_t spl_arc_no_grow_bits; +extern uint64_t spl_arc_no_grow_count; + +extern uint64_t spl_frag_max_walk; +extern uint64_t spl_frag_walked_out; +extern uint64_t spl_frag_walk_cnt; + +uint64_t spl_buckets_mem_free = 0; +uint64_t spl_arc_reclaim_avoided = 0; + +uint64_t kmem_free_to_slab_when_fragmented = 0; + +typedef struct spl_stats { + kstat_named_t spl_os_alloc; + kstat_named_t spl_active_threads; + kstat_named_t spl_active_mutex; + kstat_named_t spl_active_rwlock; + kstat_named_t spl_active_tsd; + kstat_named_t spl_free_wake_count; + kstat_named_t spl_spl_free; + kstat_named_t spl_spl_free_manual_pressure; + kstat_named_t spl_spl_free_fast_pressure; + kstat_named_t spl_spl_free_delta_ema; + kstat_named_t spl_spl_free_negative_count; + kstat_named_t spl_osif_malloc_success; + kstat_named_t spl_osif_malloc_bytes; + kstat_named_t spl_osif_free; + kstat_named_t spl_osif_free_bytes; + kstat_named_t spl_bucket_non_pow2_allocs; + + kstat_named_t spl_vmem_unconditional_allocs; + kstat_named_t spl_vmem_unconditional_alloc_bytes; + kstat_named_t spl_vmem_conditional_allocs; + kstat_named_t spl_vmem_conditional_alloc_bytes; + kstat_named_t spl_vmem_conditional_alloc_deny; + kstat_named_t spl_vmem_conditional_alloc_deny_bytes; + + kstat_named_t spl_xat_success; + kstat_named_t spl_xat_late_success; + kstat_named_t spl_xat_late_success_nosleep; + kstat_named_t spl_xat_pressured; + kstat_named_t spl_xat_bailed; + kstat_named_t spl_xat_bailed_contended; + kstat_named_t spl_xat_lastalloc; + kstat_named_t spl_xat_lastfree; + kstat_named_t spl_xat_forced; + kstat_named_t spl_xat_sleep; + kstat_named_t spl_xat_late_deny; + kstat_named_t spl_xat_no_waiters; + kstat_named_t spl_xft_wait; + + kstat_named_t spl_vba_parent_memory_appeared; + kstat_named_t spl_vba_parent_memory_blocked; + kstat_named_t spl_vba_hiprio_blocked; + kstat_named_t spl_vba_cv_timeout; + kstat_named_t spl_vba_loop_timeout; + kstat_named_t spl_vba_cv_timeout_blocked; + kstat_named_t spl_vba_loop_timeout_blocked; + kstat_named_t spl_vba_sleep; + kstat_named_t spl_vba_loop_entries; + + kstat_named_t spl_bucket_tunable_large_span; + kstat_named_t spl_bucket_tunable_small_span; + + kstat_named_t spl_buckets_mem_free; + kstat_named_t spl_arc_no_grow_bits; + kstat_named_t spl_arc_no_grow_count; + kstat_named_t spl_frag_max_walk; + kstat_named_t spl_frag_walked_out; + kstat_named_t spl_frag_walk_cnt; + kstat_named_t spl_arc_reclaim_avoided; + + kstat_named_t kmem_free_to_slab_when_fragmented; + +} spl_stats_t; + +static spl_stats_t spl_stats = { + {"os_mem_alloc", KSTAT_DATA_UINT64}, + {"active_threads", KSTAT_DATA_UINT64}, + {"active_mutex", KSTAT_DATA_UINT64}, + {"active_rwlock", KSTAT_DATA_UINT64}, + {"active_tsd", KSTAT_DATA_UINT64}, + {"spl_free_wake_count", KSTAT_DATA_UINT64}, + {"spl_spl_free", KSTAT_DATA_INT64}, + {"spl_spl_free_manual_pressure", KSTAT_DATA_UINT64}, + {"spl_spl_free_fast_pressure", KSTAT_DATA_UINT64}, + 
{"spl_spl_free_delta_ema", KSTAT_DATA_UINT64}, + {"spl_spl_free_negative_count", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_success", KSTAT_DATA_UINT64}, + {"spl_osif_malloc_bytes", KSTAT_DATA_UINT64}, + {"spl_osif_free", KSTAT_DATA_UINT64}, + {"spl_osif_free_bytes", KSTAT_DATA_UINT64}, + {"spl_bucket_non_pow2_allocs", KSTAT_DATA_UINT64}, + + {"vmem_unconditional_allocs", KSTAT_DATA_UINT64}, + {"vmem_unconditional_alloc_bytes", KSTAT_DATA_UINT64}, + {"vmem_conditional_allocs", KSTAT_DATA_UINT64}, + {"vmem_conditional_alloc_bytes", KSTAT_DATA_UINT64}, + {"vmem_conditional_alloc_deny", KSTAT_DATA_UINT64}, + {"vmem_conditional_alloc_deny_bts", KSTAT_DATA_UINT64}, + + {"spl_xat_success", KSTAT_DATA_UINT64}, + {"spl_xat_late_success", KSTAT_DATA_UINT64}, + {"spl_xat_late_success_nosleep", KSTAT_DATA_UINT64}, + {"spl_xat_pressured", KSTAT_DATA_UINT64}, + {"spl_xat_bailed", KSTAT_DATA_UINT64}, + {"spl_xat_bailed_contended", KSTAT_DATA_UINT64}, + {"spl_xat_lastalloc", KSTAT_DATA_UINT64}, + {"spl_xat_lastfree", KSTAT_DATA_UINT64}, + {"spl_xat_forced", KSTAT_DATA_UINT64}, + {"spl_xat_sleep", KSTAT_DATA_UINT64}, + {"spl_xat_late_deny", KSTAT_DATA_UINT64}, + {"spl_xat_no_waiters", KSTAT_DATA_UINT64}, + {"spl_xft_wait", KSTAT_DATA_UINT64}, + + {"spl_vba_parent_memory_appeared", KSTAT_DATA_UINT64}, + {"spl_vba_parent_memory_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_hiprio_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_cv_timeout", KSTAT_DATA_UINT64}, + {"spl_vba_loop_timeout", KSTAT_DATA_UINT64}, + {"spl_vba_cv_timeout_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_loop_timeout_blocked", KSTAT_DATA_UINT64}, + {"spl_vba_sleep", KSTAT_DATA_UINT64}, + {"spl_vba_loop_entries", KSTAT_DATA_UINT64}, + + {"spl_tunable_large_span", KSTAT_DATA_UINT64}, + {"spl_tunable_small_span", KSTAT_DATA_UINT64}, + + {"spl_buckets_mem_free", KSTAT_DATA_UINT64}, + {"spl_arc_no_grow_bits", KSTAT_DATA_UINT64}, + {"spl_arc_no_grow_count", KSTAT_DATA_UINT64}, + + {"spl_vmem_frag_max_walk", KSTAT_DATA_UINT64}, + {"spl_vmem_frag_walked_out", KSTAT_DATA_UINT64}, + {"spl_vmem_frag_walk_cnt", KSTAT_DATA_UINT64}, + {"spl_arc_reclaim_avoided", KSTAT_DATA_UINT64}, + + {"kmem_free_to_slab_when_frgmnted", KSTAT_DATA_UINT64}, +}; + +static kstat_t *spl_ksp = 0; + +// Stub out caller() +caddr_t +caller() +{ + return ((caddr_t)(0)); +} + +//void * +//calloc(uint32_t n, uint32_t s) +//{ +// return (zfs_kmem_zalloc(n * s, KM_NOSLEEP)); +//} + +#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9') + +#define IS_ALPHA(c) \ +(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + +/* + * Get bytes from the /dev/random generator. Returns 0 + * on success. Returns EAGAIN if there is insufficient entropy. + */ +int +random_get_bytes(uint8_t *ptr, uint32_t len) +{ + //read_random(ptr, len); + LARGE_INTEGER TickCount; + ULONG r; + PULONG b; + int i; + + KeQueryTickCount(&TickCount); + + b = (PULONG) ptr; + + for (i = 0; i < len / sizeof(ULONG); i++) + b[i] = RtlRandomEx(&TickCount.LowPart); + + len &= (sizeof(ULONG) - 1); + if (len > 0) { + r = RtlRandomEx(&TickCount.LowPart); + RtlCopyMemory(&b[i], &r, len); + } + return (0); +} + +/* + * BGH - Missing from OSX? + * + * Convert a string into a valid C identifier by replacing invalid + * characters with '_'. Also makes sure the string is nul-terminated + * and takes up at most n bytes. 
+ */ +void +strident_canon(char *s, uint32_t n) +{ + char c; + char *end = s + n - 1; + + if ((c = *s) == 0) + return; + + if (!IS_ALPHA(c) && c != '_') + *s = '_'; + + while (s < end && ((c = *(++s)) != 0)) { + if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_') + *s = '_'; + } + *s = 0; +} + +int +strident_valid(const char *id) +{ + int c = *id++; + + if (!IS_ALPHA(c) && c != '_') + return (0); + while ((c = *id++) != 0) { + if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_') + return (0); + } + return (1); +} + +static void +copy_pattern(uint64_t pattern, void *buf_arg, uint32_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf = buf_arg; + + while (buf < bufend) + *buf++ = pattern; +} + +static void * +verify_pattern(uint64_t pattern, void *buf_arg, uint32_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf; + + for (buf = buf_arg; buf < bufend; buf++) + if (*buf != pattern) + return (buf); + return (NULL); +} + +static void * +verify_and_copy_pattern(uint64_t old, uint64_t new, void *buf_arg, uint32_t size) +{ + uint64_t *bufend = (uint64_t *)((char *)buf_arg + size); + uint64_t *buf; + + for (buf = buf_arg; buf < bufend; buf++) { + if (*buf != old) { + copy_pattern(old, buf_arg, + (uint32_t) ((char *)buf - (char *)buf_arg)); + return (buf); + } + *buf = new; + } + + return (NULL); +} + +static void +kmem_cache_applyall(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag) +{ + kmem_cache_t *cp; + + mutex_enter(&kmem_cache_lock); + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) + if (tq != NULL) + (void) taskq_dispatch(tq, (task_func_t *)func, cp, + tqflag); + else + func(cp); + mutex_exit(&kmem_cache_lock); +} + +static void +kmem_cache_applyall_id(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag) +{ + kmem_cache_t *cp; + + mutex_enter(&kmem_cache_lock); + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (!(cp->cache_cflags & KMC_IDENTIFIER)) + continue; + if (tq != NULL) + (void) taskq_dispatch(tq, (task_func_t *)func, cp, + tqflag); + else + func(cp); + } + mutex_exit(&kmem_cache_lock); +} + +/* + * Debugging support. Given a buffer address, find its slab. 
+ */ +static kmem_slab_t * +kmem_findslab(kmem_cache_t *cp, void *buf) +{ + kmem_slab_t *sp; + + mutex_enter(&cp->cache_lock); + for (sp = list_head(&cp->cache_complete_slabs); sp != NULL; + sp = list_next(&cp->cache_complete_slabs, sp)) { + if (KMEM_SLAB_MEMBER(sp, buf)) { + mutex_exit(&cp->cache_lock); + return (sp); + } + } + for (sp = avl_first(&cp->cache_partial_slabs); sp != NULL; + sp = AVL_NEXT(&cp->cache_partial_slabs, sp)) { + if (KMEM_SLAB_MEMBER(sp, buf)) { + mutex_exit(&cp->cache_lock); + return (sp); + } + } + mutex_exit(&cp->cache_lock); + + return (NULL); +} + +static void +kmem_error(int error, kmem_cache_t *cparg, void *bufarg) +{ + kmem_buftag_t *btp = NULL; + kmem_bufctl_t *bcp = NULL; + kmem_cache_t *cp = cparg; + kmem_slab_t *sp; + uint64_t *off; + void *buf = bufarg; + + kmem_logging = 0; /* stop logging when a bad thing happens */ + + kmem_panic_info.kmp_timestamp = gethrtime(); + + sp = kmem_findslab(cp, buf); + if (sp == NULL) { + for (cp = list_tail(&kmem_caches); cp != NULL; + cp = list_prev(&kmem_caches, cp)) { + if ((sp = kmem_findslab(cp, buf)) != NULL) + break; + } + } + + if (sp == NULL) { + cp = NULL; + error = KMERR_BADADDR; + } else { + if (cp != cparg) + error = KMERR_BADCACHE; + else + buf = (char *)bufarg - + ((uintptr_t)bufarg - + (uintptr_t)sp->slab_base) % cp->cache_chunksize; + if (buf != bufarg) + error = KMERR_BADBASE; + if (cp->cache_flags & KMF_BUFTAG) + btp = KMEM_BUFTAG(cp, buf); + if (cp->cache_flags & KMF_HASH) { + mutex_enter(&cp->cache_lock); + for (bcp = *KMEM_HASH(cp, buf); bcp; bcp = bcp->bc_next) + if (bcp->bc_addr == buf) + break; + mutex_exit(&cp->cache_lock); + if (bcp == NULL && btp != NULL) + bcp = btp->bt_bufctl; + if (kmem_findslab(cp->cache_bufctl_cache, bcp) == + NULL || P2PHASE((uintptr_t)bcp, KMEM_ALIGN) || + bcp->bc_addr != buf) { + error = KMERR_BADBUFCTL; + bcp = NULL; + } + } + } + + kmem_panic_info.kmp_error = error; + kmem_panic_info.kmp_buffer = bufarg; + kmem_panic_info.kmp_realbuf = buf; + kmem_panic_info.kmp_cache = cparg; + kmem_panic_info.kmp_realcache = cp; + kmem_panic_info.kmp_slab = sp; + kmem_panic_info.kmp_bufctl = bcp; + + dprintf("SPL: kernel memory allocator: "); + + switch (error) { + + case KMERR_MODIFIED: + TraceEvent(TRACE_ERROR, "buffer modified after being freed\n"); + off = verify_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); + if (off == NULL) /* shouldn't happen */ + off = buf; + TraceEvent(TRACE_ERROR, "SPL: modification occurred at offset 0x%lx " + "(0x%llx replaced by 0x%llx)\n", + (uintptr_t)off - (uintptr_t)buf, + (longlong_t)KMEM_FREE_PATTERN, (longlong_t)*off); + break; + + case KMERR_REDZONE: + TraceEvent(TRACE_ERROR, "redzone violation: write past end of buffer\n"); + break; + + case KMERR_BADADDR: + TraceEvent(TRACE_ERROR, "invalid free: buffer not in cache\n"); + break; + + case KMERR_DUPFREE: + TraceEvent(TRACE_ERROR, "duplicate free: buffer freed twice\n"); + break; + + case KMERR_BADBUFTAG: + TraceEvent(TRACE_ERROR, "boundary tag corrupted\n"); + TraceEvent(TRACE_ERROR, "SPL: bcp ^ bxstat = %lx, should be %lx\n", + (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat, + KMEM_BUFTAG_FREE); + break; + + case KMERR_BADBUFCTL: + TraceEvent(TRACE_ERROR, "bufctl corrupted\n"); + break; + + case KMERR_BADCACHE: + TraceEvent(TRACE_ERROR, "buffer freed to wrong cache\n"); + TraceEvent(TRACE_ERROR, "SPL: buffer was allocated from %s,\n", cp->cache_name); + TraceEvent(TRACE_ERROR, "SPL: caller attempting free to %s.\n", cparg->cache_name); + break; + + case KMERR_BADSIZE: + 
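+		/*
+		 * Both sizes live in the buffer tag: word 0 holds the size
+		 * passed to the offending free (stored by zfs_kmem_free()
+		 * just before it reports the error), word 1 the size recorded
+		 * by zfs_kmem_alloc() when the buffer was handed out.
+		 */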
TraceEvent(TRACE_ERROR, "bad free: free size (%u) != alloc size (%u)\n", + KMEM_SIZE_DECODE(((uint32_t *)btp)[0]), + KMEM_SIZE_DECODE(((uint32_t *)btp)[1])); + break; + + case KMERR_BADBASE: + TraceEvent(TRACE_ERROR, "bad free: free address (%p) != alloc address (%p)\n", + bufarg, buf); + break; + } + + dprintf("SPL: buffer=%p bufctl=%p cache: %s\n", + bufarg, (void *)bcp, cparg->cache_name); + + if (bcp != NULL && (cp->cache_flags & KMF_AUDIT) && + error != KMERR_BADBUFCTL) { + int d; + timestruc_t ts = {0, 0}; + kmem_bufctl_audit_t *bcap = (kmem_bufctl_audit_t *)bcp; + + hrt2ts(kmem_panic_info.kmp_timestamp - bcap->bc_timestamp, &ts); + dprintf("SPL: previous transaction on buffer %p:\n", buf); + dprintf("SPL: thread=%p time=T-%ld.%09ld slab=%p cache: %s\n", + (void *)bcap->bc_thread, ts.tv_sec, ts.tv_nsec, + (void *)sp, cp->cache_name); + +//#include <../um/DbgHelp.h> +//#pragma comment(lib, "Dbghelp.lib") +// HANDLE process; +// process = GetCurrentProcess(); +// SymInitialize(process, NULL, TRUE); +// SYMBOL_INFO symbol; + + for (d = 0; d < MIN(bcap->bc_depth, KMEM_STACK_DEPTH); d++) { + +// SymFromAddr(process, (DWORD64)(bcap->bc_stack[d]), 0, NULL); + dprintf(" : %p\n, ", bcap->bc_stack[d]); + } + } + + if (kmem_panic > 0) { + //delay(hz); + //IOSleep(1000000); + DbgBreakPoint(); + panic("kernel heap corruption detected"); + } + + // if (kmem_panic == 0) { + // debug_enter(NULL); + // Debugger("Kernel heap corruption detected"); + // } + DbgBreakPoint(); + + kmem_logging = 1; /* resume logging */ +} + +static kmem_log_header_t * +kmem_log_init(uint32_t logsize) +{ + kmem_log_header_t *lhp; + unsigned int nchunks = 4 * max_ncpus; + uint32_t lhsize = (uint32_t)(uint64_t)(&((kmem_log_header_t *)0)->lh_cpu[max_ncpus]); + unsigned int i; + + /* + * Make sure that lhp->lh_cpu[] is nicely aligned + * to prevent FALSE sharing of cache lines. 
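+ * Each CPU then logs through its own kmem_cpu_log_header_t (private lock,
+ * private chunk of lh_base), so concurrent kmem_log_enter() calls normally
+ * touch disjoint cache lines and only take lh_lock when a per-CPU chunk
+ * fills up.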
+ */ + lhsize = P2ROUNDUP(lhsize, KMEM_ALIGN); + lhp = vmem_xalloc(kmem_log_arena, lhsize, 64, P2NPHASE(lhsize, 64), 0, + NULL, NULL, VM_SLEEP); + bzero(lhp, lhsize); + + mutex_init(&lhp->lh_lock, NULL, MUTEX_DEFAULT, NULL); + lhp->lh_nchunks = nchunks; + lhp->lh_chunksize = P2ROUNDUP(logsize / nchunks + 1, PAGESIZE); + lhp->lh_base = vmem_alloc(kmem_log_arena, + lhp->lh_chunksize * nchunks, VM_SLEEP); + lhp->lh_free = vmem_alloc(kmem_log_arena, + nchunks * sizeof (int), VM_SLEEP); + bzero(lhp->lh_base, lhp->lh_chunksize * nchunks); + + for (i = 0; i < max_ncpus; i++) { + kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[i]; + mutex_init(&clhp->clh_lock, NULL, MUTEX_DEFAULT, NULL); + clhp->clh_chunk = i; + } + + for (i = max_ncpus; i < nchunks; i++) + lhp->lh_free[i] = i; + + lhp->lh_head = max_ncpus; + lhp->lh_tail = 0; + + return (lhp); +} + + +static void +kmem_log_fini(kmem_log_header_t *lhp) +{ + int nchunks = 4 * max_ncpus; + uint32_t lhsize = (uint32_t)(uint64_t)(&((kmem_log_header_t *)0)->lh_cpu[max_ncpus]); + unsigned int i; + + + + for (i = 0; i < max_ncpus; i++) { + kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[i]; + mutex_destroy(&clhp->clh_lock); + } + + vmem_free(kmem_log_arena, + lhp->lh_free, + nchunks * sizeof (int)); + + vmem_free(kmem_log_arena, + lhp->lh_base, + lhp->lh_chunksize * nchunks); + + mutex_destroy(&lhp->lh_lock); + + lhsize = P2ROUNDUP(lhsize, KMEM_ALIGN); + vmem_xfree(kmem_log_arena, + lhp, + lhsize); +} + + +static void * +kmem_log_enter(kmem_log_header_t *lhp, void *data, uint32_t size) +{ + void *logspace; + + kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[cpu_number()]; + + // if (lhp == NULL || kmem_logging == 0 || panicstr) + if (lhp == NULL || kmem_logging == 0) + return (NULL); + + mutex_enter(&clhp->clh_lock); + clhp->clh_hits++; + if (size > clhp->clh_avail) { + mutex_enter(&lhp->lh_lock); + lhp->lh_hits++; + lhp->lh_free[lhp->lh_tail] = clhp->clh_chunk; + lhp->lh_tail = (lhp->lh_tail + 1) % lhp->lh_nchunks; + clhp->clh_chunk = lhp->lh_free[lhp->lh_head]; + lhp->lh_head = (lhp->lh_head + 1) % lhp->lh_nchunks; + clhp->clh_current = lhp->lh_base + + clhp->clh_chunk * lhp->lh_chunksize; + clhp->clh_avail = lhp->lh_chunksize; + if (size > lhp->lh_chunksize) + size = lhp->lh_chunksize; + mutex_exit(&lhp->lh_lock); + } + logspace = clhp->clh_current; + clhp->clh_current += size; + clhp->clh_avail -= size; + bcopy(data, logspace, size); + mutex_exit(&clhp->clh_lock); + return (logspace); +} + +#define KMEM_AUDIT(lp, cp, bcp) \ +{ \ +kmem_bufctl_audit_t *_bcp = (kmem_bufctl_audit_t *)(bcp); \ +_bcp->bc_timestamp = gethrtime(); \ +_bcp->bc_thread = spl_current_thread(); \ +_bcp->bc_depth = getpcstack(_bcp->bc_stack, KMEM_STACK_DEPTH); \ +_bcp->bc_lastlog = kmem_log_enter((lp), _bcp, sizeof (*_bcp)); \ +} + +static void +kmem_log_event(kmem_log_header_t *lp, kmem_cache_t *cp, + kmem_slab_t *sp, void *addr) +{ + kmem_bufctl_audit_t bca; + + bzero(&bca, sizeof (kmem_bufctl_audit_t)); + bca.bc_addr = addr; + bca.bc_slab = sp; + KMEM_AUDIT(lp, cp, &bca); +} + +/* + * Create a new slab for cache cp. 
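+ * The starting offset (slab color) advances by cache_align on each new
+ * slab and wraps between cache_mincolor and cache_maxcolor, staggering
+ * the buffers of successive slabs across cache lines.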
+ */ +static kmem_slab_t * +kmem_slab_create(kmem_cache_t *cp, int kmflag) +{ + uint32_t slabsize = cp->cache_slabsize; + uint32_t chunksize = cp->cache_chunksize; + int cache_flags = cp->cache_flags; + uint32_t color, chunks; + char *buf, *slab; + kmem_slab_t *sp; + kmem_bufctl_t *bcp; + vmem_t *vmp = cp->cache_arena; + + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + color = cp->cache_color + cp->cache_align; + if (color > cp->cache_maxcolor) + color = cp->cache_mincolor; + cp->cache_color = color; + + slab = vmem_alloc(vmp, slabsize, kmflag & KM_VMFLAGS); + + if (slab == NULL) + goto vmem_alloc_failure; + + ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0); + + /* + * Reverify what was already checked in kmem_cache_set_move(), since the + * consolidator depends (for correctness) on slabs being initialized + * with the 0xbaddcafe memory pattern (setting a low order bit usable by + * clients to distinguish uninitialized memory from known objects). + */ + ASSERT((cp->cache_move == NULL) || !(cp->cache_cflags & KMC_NOTOUCH)); + if (!(cp->cache_cflags & KMC_NOTOUCH)) + copy_pattern(KMEM_UNINITIALIZED_PATTERN, slab, slabsize); + + if (cache_flags & KMF_HASH) { + if ((sp = kmem_cache_alloc(kmem_slab_cache, kmflag)) == NULL) + goto slab_alloc_failure; + chunks = (slabsize - color) / chunksize; + } else { + sp = KMEM_SLAB(cp, slab); + chunks = (slabsize - sizeof (kmem_slab_t) - color) / chunksize; + } + sp->slab_cache = cp; + sp->slab_head = NULL; + sp->slab_refcnt = 0; + sp->slab_base = buf = slab + color; + sp->slab_chunks = (long)chunks; + sp->slab_stuck_offset = (uint32_t)-1; + sp->slab_later_count = 0; + sp->slab_flags = 0; + sp->slab_create_time = gethrtime(); + + ASSERT(chunks > 0); + while (chunks-- != 0) { + if (cache_flags & KMF_HASH) { + bcp = kmem_cache_alloc(cp->cache_bufctl_cache, kmflag); + if (bcp == NULL) + goto bufctl_alloc_failure; + if (cache_flags & KMF_AUDIT) { + kmem_bufctl_audit_t *bcap = + (kmem_bufctl_audit_t *)bcp; + bzero(bcap, sizeof (kmem_bufctl_audit_t)); + bcap->bc_cache = cp; + } + bcp->bc_addr = buf; + bcp->bc_slab = sp; + } else { + bcp = KMEM_BUFCTL(cp, buf); + } + if (cache_flags & KMF_BUFTAG) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + btp->bt_redzone = KMEM_REDZONE_PATTERN; + btp->bt_bufctl = bcp; + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE; + if (cache_flags & KMF_DEADBEEF) { + copy_pattern(KMEM_FREE_PATTERN, buf, + cp->cache_verify); + } + } + bcp->bc_next = sp->slab_head; + sp->slab_head = bcp; + buf += chunksize; + } + + kmem_log_event(kmem_slab_log, cp, sp, slab); + + return (sp); + +bufctl_alloc_failure: + + while ((bcp = sp->slab_head) != NULL) { + sp->slab_head = bcp->bc_next; + kmem_cache_free(cp->cache_bufctl_cache, bcp); + } + kmem_cache_free(kmem_slab_cache, sp); + +slab_alloc_failure: + + vmem_free(vmp, slab, slabsize); + +vmem_alloc_failure: + + if (0 == (kmflag & KM_NO_VBA)) { + kmem_log_event(kmem_failure_log, cp, NULL, NULL); + atomic_inc_64(&cp->cache_alloc_fail); + } + + return (NULL); +} + +/* + * Destroy a slab. 
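+ * For hashed caches the external bufctls (and the kmem_slab_t itself)
+ * are returned to their own caches first; the slab memory then goes back
+ * to the cache's vmem arena.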
+ */ +static void +kmem_slab_destroy(kmem_cache_t *cp, kmem_slab_t *sp) +{ + vmem_t *vmp = cp->cache_arena; + void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum); + + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(sp->slab_refcnt == 0); + + if (cp->cache_flags & KMF_HASH) { + kmem_bufctl_t *bcp; + while ((bcp = sp->slab_head) != NULL) { + sp->slab_head = bcp->bc_next; + kmem_cache_free(cp->cache_bufctl_cache, bcp); + } + kmem_cache_free(kmem_slab_cache, sp); + } + kpreempt(KPREEMPT_SYNC); + vmem_free(vmp, slab, cp->cache_slabsize); +} + +static void * +kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp, boolean_t prefill) +{ + kmem_bufctl_t *bcp, **hash_bucket; + void *buf; + boolean_t new_slab = (sp->slab_refcnt == 0); + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + /* + * kmem_slab_alloc() drops cache_lock when it creates a new slab, so we + * can't ASSERT(avl_is_empty(&cp->cache_partial_slabs)) here when the + * slab is newly created. + */ + ASSERT(new_slab || (KMEM_SLAB_IS_PARTIAL(sp) && + (sp == avl_first(&cp->cache_partial_slabs)))); + ASSERT(sp->slab_cache == cp); + + cp->cache_slab_alloc++; + cp->cache_bufslab--; + sp->slab_refcnt++; + bcp = sp->slab_head; + sp->slab_head = bcp->bc_next; + + if (cp->cache_flags & KMF_HASH) { + /* + * Add buffer to allocated-address hash table. + */ + buf = bcp->bc_addr; + hash_bucket = KMEM_HASH(cp, buf); + bcp->bc_next = *hash_bucket; + *hash_bucket = bcp; + if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) { + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + } else { + buf = KMEM_BUF(cp, bcp); + } + + ASSERT(KMEM_SLAB_MEMBER(sp, buf)); + + if (sp->slab_head == NULL) { + ASSERT(KMEM_SLAB_IS_ALL_USED(sp)); + if (new_slab) { + ASSERT(sp->slab_chunks == 1); + } else { + ASSERT(sp->slab_chunks > 1); /* the slab was partial */ + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_later_count = 0; /* clear history */ + sp->slab_flags &= ~KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = (uint32_t)-1; + } + list_insert_head(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count++; + return (buf); + } + + ASSERT(KMEM_SLAB_IS_PARTIAL(sp)); + /* + * Peek to see if the magazine layer is enabled before + * we prefill. We're not holding the cpu cache lock, + * so the peek could be wrong, but there's no harm in it. + */ + if (new_slab && prefill && (cp->cache_flags & KMF_PREFILL) && + (KMEM_CPU_CACHE(cp)->cc_magsize != 0)) { + kmem_slab_prefill(cp, sp); + return (buf); + } + + if (new_slab) { + avl_add(&cp->cache_partial_slabs, sp); + return (buf); + } + + /* + * The slab is now more allocated than it was, so the + * order remains unchanged. + */ + ASSERT(!avl_update(&cp->cache_partial_slabs, sp)); + return (buf); +} + +/* + * Allocate a raw (unconstructed) buffer from cp's slab layer. + */ +static void * +kmem_slab_alloc(kmem_cache_t *cp, int kmflag) +{ + kmem_slab_t *sp; + void *buf; + boolean_t test_destructor; + + mutex_enter(&cp->cache_lock); + test_destructor = (cp->cache_slab_alloc == 0); + sp = avl_first(&cp->cache_partial_slabs); + + if (sp == NULL) { + ASSERT(cp->cache_bufslab == 0); + + /* + * The freelist is empty. Create a new slab. 
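+ * cache_lock is dropped around kmem_slab_create() because slab creation
+ * allocates (bufctls, vmem) and may sleep depending on kmflag; the cache
+ * counters below are only updated after the lock is re-taken.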
+ */ + mutex_exit(&cp->cache_lock); + if ((sp = kmem_slab_create(cp, kmflag)) == NULL) { + return (NULL); + } + mutex_enter(&cp->cache_lock); + cp->cache_slab_create++; + if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax) + cp->cache_bufmax = cp->cache_buftotal; + cp->cache_bufslab += sp->slab_chunks; + } + + buf = kmem_slab_alloc_impl(cp, sp, B_TRUE); + ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == + (cp->cache_complete_slab_count + + avl_numnodes(&cp->cache_partial_slabs) + + (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount))); + mutex_exit(&cp->cache_lock); + + if (test_destructor && cp->cache_destructor != NULL) { + copy_pattern(KMEM_UNINITIALIZED_PATTERN, buf, + cp->cache_bufsize); + if (cp->cache_flags & KMF_DEADBEEF) { + copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); + } + } + + return (buf); +} + +static void kmem_slab_move_yes(kmem_cache_t *, kmem_slab_t *, void *); + +/* + * Free a raw (unconstructed) buffer to cp's slab layer. + */ +static void +kmem_slab_free(kmem_cache_t *cp, void *buf) +{ + kmem_slab_t *sp = NULL; + kmem_bufctl_t *bcp, **prev_bcpp; + + ASSERT(buf != NULL); + + mutex_enter(&cp->cache_lock); + cp->cache_slab_free++; + + if (cp->cache_flags & KMF_HASH) { + /* + * Look up buffer in allocated-address hash table. + */ + prev_bcpp = KMEM_HASH(cp, buf); + while ((bcp = *prev_bcpp) != NULL) { + if (bcp->bc_addr == buf) { + *prev_bcpp = bcp->bc_next; + sp = bcp->bc_slab; + break; + } + cp->cache_lookup_depth++; + prev_bcpp = &bcp->bc_next; + } + } else { + bcp = KMEM_BUFCTL(cp, buf); + sp = KMEM_SLAB(cp, buf); + } + + if (bcp == NULL || sp->slab_cache != cp || !KMEM_SLAB_MEMBER(sp, buf)) { + mutex_exit(&cp->cache_lock); + kmem_error(KMERR_BADADDR, cp, buf); + return; + } + + if (KMEM_SLAB_OFFSET(sp, buf) == sp->slab_stuck_offset) { + /* + * If this is the buffer that prevented the consolidator from + * clearing the slab, we can reset the slab flags now that the + * buffer is freed. (It makes sense to do this in + * kmem_cache_free(), where the client gives up ownership of the + * buffer, but on the hot path the test is too expensive.) + */ + kmem_slab_move_yes(cp, sp, buf); + } + + if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) { + if (cp->cache_flags & KMF_CONTENTS) + ((kmem_bufctl_audit_t *)bcp)->bc_contents = + kmem_log_enter(kmem_content_log, buf, + cp->cache_contents); + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + + bcp->bc_next = sp->slab_head; + sp->slab_head = bcp; + + cp->cache_bufslab++; + ASSERT(sp->slab_refcnt >= 1); + + if (--sp->slab_refcnt == 0) { + /* + * There are no outstanding allocations from this slab, + * so we can reclaim the memory. + */ + if (sp->slab_chunks == 1) { + list_remove(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count--; + } else { + avl_remove(&cp->cache_partial_slabs, sp); + } + + cp->cache_buftotal -= sp->slab_chunks; + cp->cache_bufslab -= sp->slab_chunks; + /* + * Defer releasing the slab to the virtual memory subsystem + * while there is a pending move callback, since we guarantee + * that buffers passed to the move callback have only been + * touched by kmem or by the client itself. Since the memory + * patterns baddcafe (uninitialized) and deadbeef (freed) both + * set at least one of the two lowest order bits, the client can + * test those bits in the move callback to determine whether or + * not it knows about the buffer (assuming that the client also + * sets one of those low order bits whenever it frees a buffer). 
+ */ + if (cp->cache_defrag == NULL || + (avl_is_empty(&cp->cache_defrag->kmd_moves_pending) && + !(sp->slab_flags & KMEM_SLAB_MOVE_PENDING))) { + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + } else { + list_t *deadlist = &cp->cache_defrag->kmd_deadlist; + /* + * Slabs are inserted at both ends of the deadlist to + * distinguish between slabs freed while move callbacks + * are pending (list head) and a slab freed while the + * lock is dropped in kmem_move_buffers() (list tail) so + * that in both cases slab_destroy() is called from the + * right context. + */ + if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) { + list_insert_tail(deadlist, sp); + } else { + list_insert_head(deadlist, sp); + } + cp->cache_defrag->kmd_deadcount++; + mutex_exit(&cp->cache_lock); + } + return; + } + + if (bcp->bc_next == NULL) { + /* Transition the slab from completely allocated to partial. */ + ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1)); + ASSERT(sp->slab_chunks > 1); + list_remove(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count--; + avl_add(&cp->cache_partial_slabs, sp); + } else { +#ifdef DEBUG + if (avl_update_gt(&cp->cache_partial_slabs, sp)) { + KMEM_STAT_ADD(kmem_move_stats.kms_avl_update); + } else { + KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate); + } +#else + (void) avl_update_gt(&cp->cache_partial_slabs, sp); +#endif + } + + ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) == + (cp->cache_complete_slab_count + + avl_numnodes(&cp->cache_partial_slabs) + + (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount))); + mutex_exit(&cp->cache_lock); +} + +/* + * Return -1 if kmem_error, 1 if constructor fails, 0 if successful. + */ +static int +kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct, + caddr_t caller) +{ + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl; + uint32_t mtbf; + + if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) { + kmem_error(KMERR_BADBUFTAG, cp, buf); + return (-1); + } + + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_ALLOC; + + if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) { + kmem_error(KMERR_BADBUFCTL, cp, buf); + return (-1); + } + + if (cp->cache_flags & KMF_DEADBEEF) { + if (!construct && (cp->cache_flags & KMF_LITE)) { + if (*(uint64_t *)buf != KMEM_FREE_PATTERN) { + kmem_error(KMERR_MODIFIED, cp, buf); + return (-1); + } + if (cp->cache_constructor != NULL) + *(uint64_t *)buf = btp->bt_redzone; + else + *(uint64_t *)buf = KMEM_UNINITIALIZED_PATTERN; + } else { + construct = 1; + if (verify_and_copy_pattern(KMEM_FREE_PATTERN, + KMEM_UNINITIALIZED_PATTERN, buf, + cp->cache_verify)) { + kmem_error(KMERR_MODIFIED, cp, buf); + return (-1); + } + } + } + btp->bt_redzone = KMEM_REDZONE_PATTERN; + + if ((mtbf = kmem_mtbf | cp->cache_mtbf) != 0 && + gethrtime() % mtbf == 0 && + (kmflag & (KM_NOSLEEP | KM_PANIC)) == KM_NOSLEEP) { + kmem_log_event(kmem_failure_log, cp, NULL, NULL); + if (!construct && cp->cache_destructor != NULL) + cp->cache_destructor(buf, cp->cache_private); + } else { + mtbf = 0; + } + + if (mtbf || (construct && cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, kmflag) != 0)) { + atomic_inc_64(&cp->cache_alloc_fail); + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE; + if (cp->cache_flags & KMF_DEADBEEF) + copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); + kmem_slab_free(cp, buf); + return (1); + } + + if (cp->cache_flags & KMF_AUDIT) { + 
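+		/*
+		 * Record the allocating thread, call stack and timestamp in
+		 * the transaction log (see the KMEM_AUDIT() macro above).
+		 */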
KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + + if ((cp->cache_flags & KMF_LITE) && + !(cp->cache_cflags & KMC_KMEM_ALLOC)) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller); + } + + return (0); +} + +static int +kmem_cache_free_debug(kmem_cache_t *cp, void *buf, caddr_t caller) +{ + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl; + kmem_slab_t *sp; + + if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_ALLOC)) { + if (btp->bt_bxstat == ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) { + kmem_error(KMERR_DUPFREE, cp, buf); + return (-1); + } + sp = kmem_findslab(cp, buf); + if (sp == NULL || sp->slab_cache != cp) + kmem_error(KMERR_BADADDR, cp, buf); + else + kmem_error(KMERR_REDZONE, cp, buf); + return (-1); + } + + btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE; + + if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) { + kmem_error(KMERR_BADBUFCTL, cp, buf); + return (-1); + } + + if (btp->bt_redzone != KMEM_REDZONE_PATTERN) { + kmem_error(KMERR_REDZONE, cp, buf); + return (-1); + } + + if (cp->cache_flags & KMF_AUDIT) { + if (cp->cache_flags & KMF_CONTENTS) + bcp->bc_contents = kmem_log_enter(kmem_content_log, + buf, cp->cache_contents); + KMEM_AUDIT(kmem_transaction_log, cp, bcp); + } + + if ((cp->cache_flags & KMF_LITE) && + !(cp->cache_cflags & KMC_KMEM_ALLOC)) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller); + } + + if (cp->cache_flags & KMF_DEADBEEF) { + if (cp->cache_flags & KMF_LITE) + btp->bt_redzone = *(uint64_t *)buf; + else if (cp->cache_destructor != NULL) + cp->cache_destructor(buf, cp->cache_private); + + copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify); + } + + return (0); +} + +/* + * Free each object in magazine mp to cp's slab layer, and free mp itself. + */ +static void +kmem_magazine_destroy(kmem_cache_t *cp, kmem_magazine_t *mp, int nrounds) +{ + int round; + + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); + + for (round = 0; round < nrounds; round++) { + void *buf = mp->mag_round[round]; + + if (cp->cache_flags & KMF_DEADBEEF) { + if (verify_pattern(KMEM_FREE_PATTERN, buf, + cp->cache_verify) != NULL) { + kmem_error(KMERR_MODIFIED, cp, buf); + continue; + } + if ((cp->cache_flags & KMF_LITE) && + cp->cache_destructor != NULL) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + *(uint64_t *)buf = btp->bt_redzone; + cp->cache_destructor(buf, cp->cache_private); + *(uint64_t *)buf = KMEM_FREE_PATTERN; + } + } else if (cp->cache_destructor != NULL) { + cp->cache_destructor(buf, cp->cache_private); + } + + kmem_slab_free(cp, buf); + kpreempt(KPREEMPT_SYNC); + } + ASSERT(KMEM_MAGAZINE_VALID(cp, mp)); + kmem_cache_free(cp->cache_magtype->mt_cache, mp); +} + +/* + * Allocate a magazine from the depot. + */ +static kmem_magazine_t * +kmem_depot_alloc(kmem_cache_t *cp, kmem_maglist_t *mlp) +{ + kmem_magazine_t *mp; + + /* + * If we can't get the depot lock without contention, + * update our contention count. We use the depot + * contention rate to determine whether we need to + * increase the magazine size for better scalability. + */ + if (!mutex_tryenter(&cp->cache_depot_lock)) { + mutex_enter(&cp->cache_depot_lock); + cp->cache_depot_contention++; + } + + if ((mp = mlp->ml_list) != NULL) { + ASSERT(KMEM_MAGAZINE_VALID(cp, mp)); + mlp->ml_list = mp->mag_next; + if (--mlp->ml_total < mlp->ml_min) + mlp->ml_min = mlp->ml_total; + mlp->ml_alloc++; + } + + mutex_exit(&cp->cache_depot_lock); + + return (mp); +} + +/* + * Free a magazine to the depot. 
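+ * Unlike kmem_depot_alloc(), the depot lock is taken unconditionally here;
+ * contention is only sampled on the alloc side, and that sample is what
+ * later drives magazine resizing in kmem_cache_update().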
+ */ +static void +kmem_depot_free(kmem_cache_t *cp, kmem_maglist_t *mlp, kmem_magazine_t *mp) +{ + mutex_enter(&cp->cache_depot_lock); + ASSERT(KMEM_MAGAZINE_VALID(cp, mp)); + mp->mag_next = mlp->ml_list; + mlp->ml_list = mp; + mlp->ml_total++; + mutex_exit(&cp->cache_depot_lock); +} + +/* + * Update the working set statistics for cp's depot. + */ +static void +kmem_depot_ws_update(kmem_cache_t *cp) +{ + mutex_enter(&cp->cache_depot_lock); + cp->cache_full.ml_reaplimit = cp->cache_full.ml_min; + cp->cache_full.ml_min = cp->cache_full.ml_total; + cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_min; + cp->cache_empty.ml_min = cp->cache_empty.ml_total; + mutex_exit(&cp->cache_depot_lock); +} + +/* + * Set the working set statistics for cp's depot to zero. (Everything is + * eligible for reaping.) + */ +void +kmem_depot_ws_zero(kmem_cache_t *cp) +{ + mutex_enter(&cp->cache_depot_lock); + cp->cache_full.ml_reaplimit = cp->cache_full.ml_total; + cp->cache_full.ml_min = cp->cache_full.ml_total; + cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_total; + cp->cache_empty.ml_min = cp->cache_empty.ml_total; + mutex_exit(&cp->cache_depot_lock); +} + +/* + * The number of bytes to reap before we call kpreempt(). The default (1MB) + * causes us to preempt reaping up to hundres of times per second. Using a + * larger value (1GB) causes this to have virtually no effect. + */ +//size_t kmem_reap_preempt_bytes = 1024 * 1024 * 1024; +uint32_t kmem_reap_preempt_bytes = 64 * 1024 * 1024; + + +/* + * Reap all magazines that have fallen out of the depot's working set. + */ +static void +kmem_depot_ws_reap(kmem_cache_t *cp) +{ + uint32_t bytes = 0; + long reap; + kmem_magazine_t *mp; + + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); + + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); + while (reap-- && + (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL) { + kmem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize); + bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize; + if (bytes > kmem_reap_preempt_bytes) { + kpreempt(KPREEMPT_SYNC); + bytes = 0; + } + } + + reap = MIN(cp->cache_empty.ml_reaplimit, cp->cache_empty.ml_min); + while (reap-- && + (mp = kmem_depot_alloc(cp, &cp->cache_empty)) != NULL) { + kmem_magazine_destroy(cp, mp, 0); + bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize; + if (bytes > kmem_reap_preempt_bytes) { + kpreempt(KPREEMPT_SYNC); + bytes = 0; + } + } +} + +static void +kmem_cpu_reload(kmem_cpu_cache_t *ccp, kmem_magazine_t *mp, int rounds) +{ + ASSERT((ccp->cc_loaded == NULL && ccp->cc_rounds == -1) || + (ccp->cc_loaded && ccp->cc_rounds + rounds == ccp->cc_magsize)); + ASSERT(ccp->cc_magsize > 0); + + ccp->cc_ploaded = ccp->cc_loaded; + ccp->cc_prounds = ccp->cc_rounds; + ccp->cc_loaded = mp; + ccp->cc_rounds = (short)rounds; +} + +/* + * Intercept kmem alloc/free calls during crash dump in order to avoid + * changing kmem state while memory is being saved to the dump device. + * Otherwise, ::kmem_verify will report "corrupt buffers". Note that + * there are no locks because only one CPU calls kmem during a crash + * dump. To enable this feature, first create the associated vmem + * arena with VMC_DUMPSAFE. 
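+ * As a rough illustration only (the arena name and the backing functions
+ * shown here are placeholders, not definitions from this file):
+ *
+ *	arena = vmem_create("dumpsafe_arena", NULL, 0, PAGESIZE,
+ *	    segkmem_alloc, segkmem_free, heap_arena, 0,
+ *	    VMC_DUMPSAFE | VM_SLEEP);
+ *
+ * Caches built on top of such an arena divert their allocations to the
+ * pre-reserved dump heap (kmem_cache_alloc_dump()) while a dump runs.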
+ */ +static void *kmem_dump_start; /* start of pre-reserved heap */ +static void *kmem_dump_end; /* end of heap area */ +static void *kmem_dump_curr; /* current free heap pointer */ +static uint32_t kmem_dump_size; /* size of heap area */ + +/* append to each buf created in the pre-reserved heap */ +typedef struct kmem_dumpctl { + void *kdc_next; /* cache dump free list linkage */ +} kmem_dumpctl_t; + +#define KMEM_DUMPCTL(cp, buf) \ +((kmem_dumpctl_t *)P2ROUNDUP((uintptr_t)(buf) + (cp)->cache_bufsize, \ +sizeof (void *))) + +/* Keep some simple stats. */ +#define KMEM_DUMP_LOGS (100) + +typedef struct kmem_dump_log { + kmem_cache_t *kdl_cache; + uint_t kdl_allocs; /* # of dump allocations */ + uint_t kdl_frees; /* # of dump frees */ + uint_t kdl_alloc_fails; /* # of allocation failures */ + uint_t kdl_free_nondump; /* # of non-dump frees */ + uint_t kdl_unsafe; /* cache was used, but unsafe */ +} kmem_dump_log_t; + +static kmem_dump_log_t *kmem_dump_log; +static int kmem_dump_log_idx; + +#define KDI_LOG(cp, stat) { \ +kmem_dump_log_t *kdl; \ +if ((kdl = (kmem_dump_log_t *)((cp)->cache_dumplog)) != NULL) { \ +kdl->stat++; \ +} else if (kmem_dump_log_idx < KMEM_DUMP_LOGS) { \ +kdl = &kmem_dump_log[kmem_dump_log_idx++]; \ +kdl->stat++; \ +kdl->kdl_cache = (cp); \ +(cp)->cache_dumplog = kdl; \ +} \ +} + +/* set non zero for full report */ +uint_t kmem_dump_verbose = 0; + +/* stats for overize heap */ +uint_t kmem_dump_oversize_allocs = 0; +uint_t kmem_dump_oversize_max = 0; + +static void +kmem_dumppr(char **pp, char *e, const char *format, ...) +{ + char *p = *pp; + + if (p < e) { + int n; + va_list ap; + + va_start(ap, format); + n = _vsnprintf(p, e - p, format, ap); + va_end(ap); + *pp = p + n; + } +} + +/* + * Called when dumpadm(1M) configures dump parameters. + */ +void +kmem_dump_init(uint32_t size) +{ + if (kmem_dump_start != NULL) + zfs_kmem_free(kmem_dump_start, kmem_dump_size); + + if (kmem_dump_log == NULL) + kmem_dump_log = + (kmem_dump_log_t *)zfs_kmem_zalloc( + KMEM_DUMP_LOGS * sizeof (kmem_dump_log_t), KM_SLEEP); + + kmem_dump_start = zfs_kmem_alloc(size, KM_SLEEP); + + if (kmem_dump_start != NULL) { + kmem_dump_size = size; + kmem_dump_curr = kmem_dump_start; + kmem_dump_end = (void *)((char *)kmem_dump_start + size); + copy_pattern(KMEM_UNINITIALIZED_PATTERN, kmem_dump_start, size); + } else { + kmem_dump_size = 0; + kmem_dump_curr = NULL; + kmem_dump_end = NULL; + } +} + +/* + * Set flag for each kmem_cache_t if is safe to use alternate dump + * memory. Called just before panic crash dump starts. Set the flag + * for the calling CPU. 
+ */ +void +kmem_dump_begin(void) +{ + if (kmem_dump_start != NULL) { + kmem_cache_t *cp; + + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + if (cp->cache_arena->vm_cflags & VMC_DUMPSAFE) { + cp->cache_flags |= KMF_DUMPDIVERT; + ccp->cc_flags |= KMF_DUMPDIVERT; + ccp->cc_dump_rounds = ccp->cc_rounds; + ccp->cc_dump_prounds = ccp->cc_prounds; + ccp->cc_rounds = ccp->cc_prounds = -1; + } else { + cp->cache_flags |= KMF_DUMPUNSAFE; + ccp->cc_flags |= KMF_DUMPUNSAFE; + } + } + } +} + +/* + * finished dump intercept + * print any warnings on the console + * return verbose information to dumpsys() in the given buffer + */ +uint64_t +kmem_dump_finish(char *buf, uint32_t size) +{ + int kdi_idx; + int kdi_end = kmem_dump_log_idx; + int percent = 0; + int header = 0; + int warn = 0; + uint32_t used; + kmem_cache_t *cp; + kmem_dump_log_t *kdl; + char *e = buf + size; + char *p = buf; + + if (kmem_dump_size == 0 || kmem_dump_verbose == 0) + return (0); + + used = (uint32_t) ((char *)kmem_dump_curr - (char *)kmem_dump_start); + percent = (int)((used * 100LL) / kmem_dump_size); + + kmem_dumppr(&p, e, "%% heap used,%d\n", percent); + kmem_dumppr(&p, e, "used bytes,%ld\n", used); + kmem_dumppr(&p, e, "heap size,%ld\n", kmem_dump_size); + kmem_dumppr(&p, e, "Oversize allocs,%d\n", + kmem_dump_oversize_allocs); + kmem_dumppr(&p, e, "Oversize max size,%ld\n", + kmem_dump_oversize_max); + + for (kdi_idx = 0; kdi_idx < kdi_end; kdi_idx++) { + kdl = &kmem_dump_log[kdi_idx]; + cp = kdl->kdl_cache; + if (cp == NULL) + break; + if (kdl->kdl_alloc_fails) + ++warn; + if (header == 0) { + kmem_dumppr(&p, e, + "Cache Name,Allocs,Frees,Alloc Fails," + "Nondump Frees,Unsafe Allocs/Frees\n"); + header = 1; + } + kmem_dumppr(&p, e, "%s,%d,%d,%d,%d,%d\n", + cp->cache_name, kdl->kdl_allocs, kdl->kdl_frees, + kdl->kdl_alloc_fails, kdl->kdl_free_nondump, + kdl->kdl_unsafe); + } + + /* return buffer size used */ + if (p < e) + bzero(p, e - p); + return (p - buf); +} + +/* + * Allocate a constructed object from alternate dump memory. + */ +void * +kmem_cache_alloc_dump(kmem_cache_t *cp, int kmflag) +{ + void *buf; + void *curr; + char *bufend; + + /* return a constructed object */ + if ((buf = cp->cache_dumpfreelist) != NULL) { + cp->cache_dumpfreelist = KMEM_DUMPCTL(cp, buf)->kdc_next; + KDI_LOG(cp, kdl_allocs); + return (buf); + } + + /* create a new constructed object */ + curr = kmem_dump_curr; + buf = (void *)P2ROUNDUP((uintptr_t)curr, cp->cache_align); + bufend = (char *)KMEM_DUMPCTL(cp, buf) + sizeof (kmem_dumpctl_t); + + /* hat layer objects cannot cross a page boundary */ + if (cp->cache_align < PAGESIZE) { + char *page = (char *)P2ROUNDUP((uintptr_t)buf, PAGESIZE); + if (bufend > page) { + bufend += page - (char *)buf; + buf = (void *)page; + } + } + + /* fall back to normal alloc if reserved area is used up */ + if (bufend > (char *)kmem_dump_end) { + kmem_dump_curr = kmem_dump_end; + KDI_LOG(cp, kdl_alloc_fails); + return (NULL); + } + + /* + * Must advance curr pointer before calling a constructor that + * may also allocate memory. 
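+ * Otherwise a nested kmem_cache_alloc_dump() made from inside the
+ * constructor could hand out the same region of the dump heap twice.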
+ */ + kmem_dump_curr = bufend; + + /* run constructor */ + if (cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, kmflag) + != 0) { +#ifdef DEBUG + dprintf("name='%s' cache=0x%p: kmem cache constructor failed\n", + cp->cache_name, (void *)cp); +#endif + /* reset curr pointer iff no allocs were done */ + if (kmem_dump_curr == bufend) + kmem_dump_curr = curr; + + /* fall back to normal alloc if the constructor fails */ + KDI_LOG(cp, kdl_alloc_fails); + return (NULL); + } + + KDI_LOG(cp, kdl_allocs); + return (buf); +} + +/* + * Free a constructed object in alternate dump memory. + */ +int +kmem_cache_free_dump(kmem_cache_t *cp, void *buf) +{ + /* save constructed buffers for next time */ + if ((char *)buf >= (char *)kmem_dump_start && + (char *)buf < (char *)kmem_dump_end) { + KMEM_DUMPCTL(cp, buf)->kdc_next = cp->cache_dumpfreelist; + cp->cache_dumpfreelist = buf; + KDI_LOG(cp, kdl_frees); + return (0); + } + + /* count all non-dump buf frees */ + KDI_LOG(cp, kdl_free_nondump); + + /* just drop buffers that were allocated before dump started */ + if (kmem_dump_curr < kmem_dump_end) + return (0); + + /* fall back to normal free if reserved area is used up */ + return (1); +} + +/* + * Allocate a constructed object from cache cp. + */ +void * +kmem_cache_alloc(kmem_cache_t *cp, int kmflag) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + kmem_magazine_t *fmp; + void *buf; + mutex_enter(&ccp->cc_lock); + for (;;) { + /* + * If there's an object available in the current CPU's + * loaded magazine, just take it and return. + */ + if (ccp->cc_rounds > 0) { + buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds]; + ccp->cc_alloc++; + mutex_exit(&ccp->cc_lock); + if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & + KMF_DUMPDIVERT)); + KDI_LOG(cp, kdl_unsafe); + } + if ((ccp->cc_flags & KMF_BUFTAG) && + kmem_cache_alloc_debug(cp, buf, kmflag, 0, + caller()) != 0) { + if (kmflag & KM_NOSLEEP) + return (NULL); + mutex_enter(&ccp->cc_lock); + continue; + } + } + return (buf); + } + + /* + * The loaded magazine is empty. If the previously loaded + * magazine was full, exchange them and try again. + */ + if (ccp->cc_prounds > 0) { + kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds); + continue; + } + + /* + * Return an alternate buffer at dump time to preserve + * the heap. + */ + if (ccp->cc_flags & (KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); + /* log it so that we can warn about it */ + KDI_LOG(cp, kdl_unsafe); + } else { + if ((buf = kmem_cache_alloc_dump(cp, kmflag)) != + NULL) { + mutex_exit(&ccp->cc_lock); + return (buf); + } + break; /* fall back to slab layer */ + } + } + + /* + * If the magazine layer is disabled, break out now. + */ + if (ccp->cc_magsize == 0) + break; + + /* + * Try to get a full magazine from the depot. + */ + fmp = kmem_depot_alloc(cp, &cp->cache_full); + if (fmp != NULL) { + if (ccp->cc_ploaded != NULL) + kmem_depot_free(cp, &cp->cache_empty, + ccp->cc_ploaded); + kmem_cpu_reload(ccp, fmp, ccp->cc_magsize); + continue; + } + + /* + * There are no full magazines in the depot, + * so fall through to the slab layer. + */ + break; + } + mutex_exit(&ccp->cc_lock); + + /* + * We couldn't allocate a constructed object from the magazine layer, + * so get a raw buffer from the slab layer and apply its constructor. 
+ */ + buf = kmem_slab_alloc(cp, kmflag); + + if (buf == NULL) + return (NULL); + + if (cp->cache_flags & KMF_BUFTAG) { + /* + * Make kmem_cache_alloc_debug() apply the constructor for us. + */ + int rc = kmem_cache_alloc_debug(cp, buf, kmflag, 1, caller()); + if (rc != 0) { + if (kmflag & KM_NOSLEEP) + return (NULL); + /* + * kmem_cache_alloc_debug() detected corruption + * but didn't panic (kmem_panic <= 0). We should not be + * here because the constructor failed (indicated by a + * return code of 1). Try again. + */ + ASSERT(rc == -1); + return (kmem_cache_alloc(cp, kmflag)); + } + return (buf); + } + + if (cp->cache_constructor != NULL && + cp->cache_constructor(buf, cp->cache_private, kmflag) != 0) { + atomic_inc_64(&cp->cache_alloc_fail); + kmem_slab_free(cp, buf); + return (NULL); + } + + return (buf); +} + +/* + * The freed argument tells whether or not kmem_cache_free_debug() has already + * been called so that we can avoid the duplicate free error. For example, a + * buffer on a magazine has already been freed by the client but is still + * constructed. + */ +static void +kmem_slab_free_constructed(kmem_cache_t *cp, void *buf, boolean_t freed) +{ + if (!freed && (cp->cache_flags & KMF_BUFTAG)) + if (kmem_cache_free_debug(cp, buf, caller()) == -1) + return; + + /* + * Note that if KMF_DEADBEEF is in effect and KMF_LITE is not, + * kmem_cache_free_debug() will have already applied the destructor. + */ + if ((cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) != KMF_DEADBEEF && + cp->cache_destructor != NULL) { + if (cp->cache_flags & KMF_DEADBEEF) { /* KMF_LITE implied */ + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + *(uint64_t *)buf = btp->bt_redzone; + cp->cache_destructor(buf, cp->cache_private); + *(uint64_t *)buf = KMEM_FREE_PATTERN; + } else { + cp->cache_destructor(buf, cp->cache_private); + } + } + + kmem_slab_free(cp, buf); +} + +/* + * Used when there's no room to free a buffer to the per-CPU cache. + * Drops and re-acquires &ccp->cc_lock, and returns non-zero if the + * caller should try freeing to the per-CPU cache again. + * Note that we don't directly install the magazine in the cpu cache, + * since its state may have changed wildly while the lock was dropped. + */ +static int +kmem_cpucache_magazine_alloc(kmem_cpu_cache_t *ccp, kmem_cache_t *cp) +{ + kmem_magazine_t *emp; + kmem_magtype_t *mtp; + + ASSERT(MUTEX_HELD(&ccp->cc_lock)); + ASSERT(((uint_t)ccp->cc_rounds == (uint_t)ccp->cc_magsize || + ((uint_t)ccp->cc_rounds == (uint_t)-1)) && + ((uint_t)ccp->cc_prounds == (uint_t)ccp->cc_magsize || + ((uint_t)ccp->cc_prounds == (uint_t)-1))); + + emp = kmem_depot_alloc(cp, &cp->cache_empty); + if (emp != NULL) { + if (ccp->cc_ploaded != NULL) + kmem_depot_free(cp, &cp->cache_full, + ccp->cc_ploaded); + kmem_cpu_reload(ccp, emp, 0); + return (1); + } + /* + * There are no empty magazines in the depot, + * so try to allocate a new one. We must drop all locks + * across kmem_cache_alloc() because lower layers may + * attempt to allocate from this cache. + */ + mtp = cp->cache_magtype; + mutex_exit(&ccp->cc_lock); + emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP); + mutex_enter(&ccp->cc_lock); + + if (emp != NULL) { + /* + * We successfully allocated an empty magazine. + * However, we had to drop ccp->cc_lock to do it, + * so the cache's magazine size may have changed. + * If so, free the magazine and try again. 
+ */ + if (ccp->cc_magsize != mtp->mt_magsize) { + mutex_exit(&ccp->cc_lock); + kmem_cache_free(mtp->mt_cache, emp); + mutex_enter(&ccp->cc_lock); + return (1); + } + + /* + * We got a magazine of the right size. Add it to + * the depot and try the whole dance again. + */ + kmem_depot_free(cp, &cp->cache_empty, emp); + return (1); + } + + /* + * We couldn't allocate an empty magazine, + * so fall through to the slab layer. + */ + return (0); +} + +/* + * If the cache's parent arena is a leaf arena (i.e., it imports all its memory) + * then we can consider it fragmented if either there is 1 GiB free in the arena + * or one eighth of the arena is free. + * + * This is useful in kmem_cache_free{_debug} to determine whether to free to the + * slab layer if the loaded magazine is full. + */ +static inline boolean_t +kmem_cache_parent_arena_fragmented(kmem_cache_t *cp) +{ + const vmem_kstat_t *kp = &cp->cache_arena->vm_kstat; + const int64_t vk_import = kp->vk_mem_import.value.ui64; + const int64_t vk_inuse = kp->vk_mem_inuse.value.ui64; + const int64_t vk_total = kp->vk_mem_total.value.ui64; + + if (vk_import == vk_total && vk_inuse < vk_total) { + const int64_t vk_free = vk_total - vk_inuse; + const int64_t highthresh = 1024LL*1024LL*1024LL; + // we are fragmented if we have 1GiB free + if (vk_free >= highthresh) + return (B_TRUE); + // we are fragmented if at least 1/8 of the + // total arena space is free + if (vk_free > 0 && vk_total > 0) { + const int64_t eighth_total= vk_total / 8; + if (vk_free >= eighth_total) + return (B_TRUE); + } + } + return (B_FALSE); +} + +/* + * Free a constructed object to cache cp. + */ +void +kmem_cache_free(kmem_cache_t *cp, void *buf) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + /* + * The client must not free either of the buffers passed to the move + * callback function. + */ + ASSERT(cp->cache_defrag == NULL || + cp->cache_defrag->kmd_thread != spl_current_thread() || + (buf != cp->cache_defrag->kmd_from_buf && + buf != cp->cache_defrag->kmd_to_buf)); + + if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); + /* log it so that we can warn about it */ + KDI_LOG(cp, kdl_unsafe); + } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) { + return; + } + if (ccp->cc_flags & KMF_BUFTAG) { + if (kmem_cache_free_debug(cp, buf, caller()) == -1) + return; + } + } + + mutex_enter(&ccp->cc_lock); + /* + * Any changes to this logic should be reflected in kmem_slab_prefill() + */ + for (;;) { + /* + * If there's a slot available in the current CPU's + * loaded magazine, just put the object there and return. + */ + if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) { + ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf; + ccp->cc_free++; + mutex_exit(&ccp->cc_lock); + return; + } + + /* + * If the magazine layer is disabled, break out now. + */ + if (ccp->cc_magsize == 0) { + break; + } + + /* + * The magazine layer is on, but the loaded magazine is now + * full (of allocatable constructed elements). + * + * If the cache's arena is badly fragmented, break out now; + * this frees to the slab layer. + * + * Note: this is not reflected in kmem_slab_prefill() which + * deals with a freshly allocated slab. + */ + + if (kmem_free_to_slab_when_fragmented == 1 && + kmem_cache_parent_arena_fragmented(cp)) + break; + + /* + * The loaded magazine is full. If the previously loaded + * magazine was empty, exchange them and try again. 
+ */ + if (ccp->cc_prounds == 0) { + kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds); + continue; + } + + if (!kmem_cpucache_magazine_alloc(ccp, cp)) { + /* + * We couldn't free our constructed object to the + * magazine layer, so apply its destructor and free it + * to the slab layer. + */ + break; + } + } + mutex_exit(&ccp->cc_lock); + kpreempt(KPREEMPT_SYNC); + kmem_slab_free_constructed(cp, buf, B_TRUE); +} + +/* + * Free a constructed object to cache cp. + * Do not free to the magazine layer. + * This is essentially just kmem_cache_free() without + * the for(;;) loop or the ccp critical section. + */ +void +kmem_cache_free_to_slab(kmem_cache_t *cp, void *buf) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + /* + * The client must not free either of the buffers passed to the move + * callback function. + */ + ASSERT(cp->cache_defrag == NULL || + cp->cache_defrag->kmd_thread != spl_current_thread() || + (buf != cp->cache_defrag->kmd_from_buf && + buf != cp->cache_defrag->kmd_to_buf)); + + if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) { + if (ccp->cc_flags & KMF_DUMPUNSAFE) { + ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT)); + /* log it so that we can warn about it */ + KDI_LOG(cp, kdl_unsafe); + } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) { + return; + } + if (ccp->cc_flags & KMF_BUFTAG) { + if (kmem_cache_free_debug(cp, buf, caller()) == -1) + return; + } + } + + /* omitted the for(;;) loop from kmem_cache_free */ + /* also do not take ccp mutex */ + + kmem_slab_free_constructed(cp, buf, B_TRUE); +} + +static void +kmem_slab_prefill(kmem_cache_t *cp, kmem_slab_t *sp) +{ + kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp); + + kmem_bufctl_t *next, *head; + uint32_t nbufs; + + /* + * Completely allocate the newly created slab and put the pre-allocated + * buffers in magazines. Any of the buffers that cannot be put in + * magazines must be returned to the slab. + */ + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(cp->cache_constructor == NULL); + ASSERT(sp->slab_cache == cp); + ASSERT(sp->slab_refcnt == 1); + ASSERT(sp->slab_head != NULL && sp->slab_chunks > sp->slab_refcnt); + ASSERT(avl_find(&cp->cache_partial_slabs, sp, NULL) == NULL); + + head = sp->slab_head; + nbufs = (sp->slab_chunks - sp->slab_refcnt); + sp->slab_head = NULL; + sp->slab_refcnt += (long)nbufs; + cp->cache_bufslab -= nbufs; + cp->cache_slab_alloc += nbufs; + list_insert_head(&cp->cache_complete_slabs, sp); + cp->cache_complete_slab_count++; + mutex_exit(&cp->cache_lock); + mutex_enter(&ccp->cc_lock); + + while (head != NULL) { + void *buf = KMEM_BUF(cp, head); + /* + * If there's a slot available in the current CPU's + * loaded magazine, just put the object there and + * continue. + */ + if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) { + ccp->cc_loaded->mag_round[ccp->cc_rounds++] = + buf; + ccp->cc_free++; + nbufs--; + head = head->bc_next; + continue; + } + + /* + * The loaded magazine is full. If the previously + * loaded magazine was empty, exchange them and try + * again. + */ + if (ccp->cc_prounds == 0) { + kmem_cpu_reload(ccp, ccp->cc_ploaded, + ccp->cc_prounds); + continue; + } + + /* + * If the magazine layer is disabled, break out now. 
+ */ + + if (ccp->cc_magsize == 0) { + break; + } + + if (!kmem_cpucache_magazine_alloc(ccp, cp)) + break; + } + mutex_exit(&ccp->cc_lock); + if (nbufs != 0) { + ASSERT(head != NULL); + + /* + * If there was a failure, return remaining objects to + * the slab + */ + while (head != NULL) { + ASSERT(nbufs != 0); + next = head->bc_next; + head->bc_next = NULL; + kmem_slab_free(cp, KMEM_BUF(cp, head)); + head = next; + nbufs--; + } + } + ASSERT(head == NULL); + ASSERT(nbufs == 0); + mutex_enter(&cp->cache_lock); +} + +void * +zfs_kmem_zalloc(uint32_t size, int kmflag) +{ + uint32_t index; + void *buf; + + if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) { + kmem_cache_t *cp = kmem_alloc_table[index]; + buf = kmem_cache_alloc(cp, kmflag); + if (buf != NULL) { + if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE; + ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size); + + if (cp->cache_flags & KMF_LITE) { + KMEM_BUFTAG_LITE_ENTER(btp, + kmem_lite_count, caller()); + } + } + bzero(buf, size); + } + } else { + buf = zfs_kmem_alloc(size, kmflag); + if (buf != NULL) + bzero(buf, size); + } + return (buf); +} + +void * +zfs_kmem_alloc(uint32_t size, int kmflag) +{ + uint32_t index; + kmem_cache_t *cp; + void *buf; + + if (size == 0) return KMEM_ZERO_SIZE_PTR; + + if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) { + cp = kmem_alloc_table[index]; + /* fall through to kmem_cache_alloc() */ + + } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) < + kmem_big_alloc_table_max) { + cp = kmem_big_alloc_table[index]; + /* fall through to kmem_cache_alloc() */ + + } else { + + buf = vmem_alloc(kmem_oversize_arena, size, + kmflag & KM_VMFLAGS); + if (buf == NULL) + kmem_log_event(kmem_failure_log, NULL, NULL, + (void *)size); + else if (KMEM_DUMP(kmem_slab_cache)) { + /* stats for dump intercept */ + kmem_dump_oversize_allocs++; + if (size > kmem_dump_oversize_max) + kmem_dump_oversize_max = size; + } + return (buf); + } + + buf = kmem_cache_alloc(cp, kmflag); + if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp) && buf != NULL) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE; + ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size); + + if (cp->cache_flags & KMF_LITE) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller()); + } + } + return (buf); +} + +void +zfs_kmem_free(void *buf, uint32_t size) +{ + uint32_t index; + kmem_cache_t *cp; + + if (size == 0 || buf == KMEM_ZERO_SIZE_PTR || buf == NULL) + return; + + if ((index = (size - 1) >> KMEM_ALIGN_SHIFT) < KMEM_ALLOC_TABLE_MAX) { + cp = kmem_alloc_table[index]; + /* fall through to kmem_cache_free() */ + + } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) < + kmem_big_alloc_table_max) { + cp = kmem_big_alloc_table[index]; + /* fall through to kmem_cache_free() */ + + } else { + vmem_free(kmem_oversize_arena, buf, size); + return; + } + + if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) { + kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf); + uint32_t *ip = (uint32_t *)btp; + if (ip[1] != KMEM_SIZE_ENCODE(size)) { + if (*(uint64_t *)buf == KMEM_FREE_PATTERN) { + kmem_error(KMERR_DUPFREE, cp, buf); + return; + } + if (KMEM_SIZE_VALID(ip[1])) { + ip[0] = KMEM_SIZE_ENCODE(size); + kmem_error(KMERR_BADSIZE, cp, buf); + } else { + kmem_error(KMERR_REDZONE, cp, buf); + } + return; + } + if (((uint8_t *)buf)[size] != KMEM_REDZONE_BYTE) { + kmem_error(KMERR_REDZONE, cp, buf); + return; + } + 
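+		/*
+		 * Size word and trailing redzone byte both check out, so
+		 * re-arm the full redzone pattern before handing the buffer
+		 * back to the cache layer.
+		 */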
btp->bt_redzone = KMEM_REDZONE_PATTERN; + if (cp->cache_flags & KMF_LITE) { + KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, + caller()); + } + } + kmem_cache_free(cp, buf); +} + +/* + * Try to allocate at least `size' bytes of memory without sleeping or + * panicking. Return actual allocated size in `asize'. If allocation failed, + * try final allocation with sleep or panic allowed. + */ +void * +kmem_alloc_tryhard(uint32_t size, uint32_t *asize, int kmflag) +{ + void *p; + + *asize = P2ROUNDUP(size, KMEM_ALIGN); + do { + p = kmem_alloc(*asize, (kmflag | KM_NOSLEEP) & ~KM_PANIC); + if (p != NULL) + return (p); + *asize += KMEM_ALIGN; + } while (*asize <= PAGESIZE); + + *asize = P2ROUNDUP(size, KMEM_ALIGN); + return (zfs_kmem_alloc(*asize, kmflag)); +} + +/* + * Reclaim all unused memory from a cache. + */ +static void +kmem_cache_reap(kmem_cache_t *cp) +{ + ASSERT(taskq_member(kmem_taskq, curthread)); + + cp->cache_reap++; + + /* + * Ask the cache's owner to free some memory if possible. + * The idea is to handle things like the inode cache, which + * typically sits on a bunch of memory that it doesn't truly + * *need*. Reclaim policy is entirely up to the owner; this + * callback is just an advisory plea for help. + */ + if (cp->cache_reclaim != NULL) { + long delta; + + /* + * Reclaimed memory should be reapable (not included in the + * depot's working set). + */ + delta = cp->cache_full.ml_total; + cp->cache_reclaim(cp->cache_private); + delta = cp->cache_full.ml_total - delta; + if (delta > 0) { + mutex_enter(&cp->cache_depot_lock); + cp->cache_full.ml_reaplimit += delta; + cp->cache_full.ml_min += delta; + mutex_exit(&cp->cache_depot_lock); + } + } + + kmem_depot_ws_reap(cp); + + if (cp->cache_defrag != NULL && !kmem_move_noreap) { + kmem_cache_defrag(cp); + } +} + + +static void +kmem_reap_timeout(void *flag_arg) +{ + uint32_t *flag = (uint32_t *)flag_arg; + + ASSERT(flag == (void *)&kmem_reaping || flag == (void *)&kmem_reaping_idspace); + *flag = 0; +} + +static void +kmem_reap_done(void *flag) +{ + (void) bsd_timeout(kmem_reap_timeout, flag, &kmem_reap_interval); +} + +static void +kmem_reap_start(void *flag) +{ + ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace); + + if (flag == &kmem_reaping) { + kmem_cache_applyall(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP); + /* + * if we have segkp under heap, reap segkp cache. + */ + } + else + kmem_cache_applyall_id(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP); + + /* + * We use taskq_dispatch() to schedule a timeout to clear + * the flag so that kmem_reap() becomes self-throttling: + * we won't reap again until the current reap completes *and* + * at least kmem_reap_interval ticks have elapsed. + */ + if (!taskq_dispatch(kmem_taskq, kmem_reap_done, flag, TQ_NOSLEEP)) + kmem_reap_done(flag); +} + +static void +kmem_reap_common(void *flag_arg) +{ + uint32_t *flag = (uint32_t *)flag_arg; + + + if (MUTEX_HELD(&kmem_cache_lock) || kmem_taskq == NULL || + atomic_cas_32(flag, 0, 1) != 0) + return; + + /* + * It may not be kosher to do memory allocation when a reap is called + * (for example, if vmem_populate() is in the call chain). + * So we start the reap going with a TQ_NOALLOC dispatch. If the + * dispatch fails, we reset the flag, and the next reap will try again. + */ + if (!taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC)) + *flag = 0; +} + +/* + * Reclaim all unused memory from all caches. Called from the VM system + * when memory gets tight.
+ */ +void +kmem_reap(void) +{ + kmem_reap_common(&kmem_reaping); +} + +/* + * Reclaim all unused memory from identifier arenas, called when a vmem + * arena not backed by memory is exhausted. Since reaping memory-backed caches + * cannot help with identifier exhaustion, we avoid both a large amount of + * work and unwanted side-effects from reclaim callbacks. + */ +void +kmem_reap_idspace(void) +{ + kmem_reap_common(&kmem_reaping_idspace); +} + +/* + * Purge all magazines from a cache and set its magazine limit to zero. + * All calls are serialized by the kmem_taskq lock, except for the final + * call from kmem_cache_destroy(). + */ +static void +kmem_cache_magazine_purge(kmem_cache_t *cp) +{ + kmem_cpu_cache_t *ccp; + kmem_magazine_t *mp, *pmp; + int rounds, prounds, cpu_seqid; + + ASSERT(!list_link_active(&cp->cache_link) || + taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + for (cpu_seqid = 0; cpu_seqid < (int)max_ncpus; cpu_seqid++) { + ccp = &cp->cache_cpu[cpu_seqid]; + + mutex_enter(&ccp->cc_lock); + mp = ccp->cc_loaded; + pmp = ccp->cc_ploaded; + rounds = ccp->cc_rounds; + prounds = ccp->cc_prounds; + ccp->cc_loaded = NULL; + ccp->cc_ploaded = NULL; + ccp->cc_rounds = -1; + ccp->cc_prounds = -1; + ccp->cc_magsize = 0; + mutex_exit(&ccp->cc_lock); + + if (mp) + kmem_magazine_destroy(cp, mp, rounds); + + if (pmp) + kmem_magazine_destroy(cp, pmp, prounds); + } + + kmem_depot_ws_zero(cp); + kmem_depot_ws_reap(cp); +} + +/* + * Enable per-cpu magazines on a cache. + */ +static void +kmem_cache_magazine_enable(kmem_cache_t *cp) +{ + int cpu_seqid; + + if (cp->cache_flags & KMF_NOMAGAZINE) + return; + + for (cpu_seqid = 0; cpu_seqid < (int)max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + mutex_enter(&ccp->cc_lock); + ccp->cc_magsize = cp->cache_magtype->mt_magsize; + mutex_exit(&ccp->cc_lock); + } + +} + +static void +kmem_cache_magazine_disable(kmem_cache_t *cp) +{ + int cpu_seqid; + + if (cp->cache_flags & KMF_NOMAGAZINE) + return; + + for (cpu_seqid = 0; cpu_seqid < (int)max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + mutex_enter(&ccp->cc_lock); + ccp->cc_magsize = 0; + mutex_exit(&ccp->cc_lock); + } + +} + +/* + * Reap (almost) everything right now. + */ +void +kmem_cache_reap_now(kmem_cache_t *cp) +{ + ASSERT(list_link_active(&cp->cache_link)); + + kmem_depot_ws_zero(cp); + + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_depot_ws_reap, cp, TQ_SLEEP); + taskq_wait(kmem_taskq); +} + +/* + * Recompute a cache's magazine size. The trade-off is that larger magazines + * provide a higher transfer rate with the depot, while smaller magazines + * reduce memory consumption. Magazine resizing is an expensive operation; + * it should not be done frequently. + * + * Changes to the magazine size are serialized by the kmem_taskq lock. + * + * Note: at present this only grows the magazine size. It might be useful + * to allow shrinkage too. + */ +static void +kmem_cache_magazine_resize(kmem_cache_t *cp) +{ + kmem_magtype_t *mtp = cp->cache_magtype; + + ASSERT(taskq_member(kmem_taskq, curthread)); + + if (cp->cache_chunksize < mtp->mt_maxbuf) { + kmem_cache_magazine_purge(cp); + mutex_enter(&cp->cache_depot_lock); + cp->cache_magtype = ++mtp; + cp->cache_depot_contention_prev = + cp->cache_depot_contention + INT_MAX; + mutex_exit(&cp->cache_depot_lock); + kmem_cache_magazine_enable(cp); + } +} + +/* + * Rescale a cache's hash table, so that the table size is roughly the + * cache size.
We want the average lookup time to be extremely small. + */ +static void +kmem_hash_rescale(kmem_cache_t *cp) +{ + kmem_bufctl_t **old_table, **new_table, *bcp; + uint32_t old_size, new_size, h; + + ASSERT(taskq_member(kmem_taskq, curthread)); + + new_size = MAX(KMEM_HASH_INITIAL, + 1 << (highbit(3 * cp->cache_buftotal + 4) - 2)); + old_size = cp->cache_hash_mask + 1; + + if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) + return; + + new_table = vmem_alloc(kmem_hash_arena, new_size * sizeof (void *), + VM_NOSLEEP); + if (new_table == NULL) + return; + bzero(new_table, new_size * sizeof (void *)); + + mutex_enter(&cp->cache_lock); + + old_size = cp->cache_hash_mask + 1; + old_table = cp->cache_hash_table; + + cp->cache_hash_mask = new_size - 1; + cp->cache_hash_table = new_table; + cp->cache_rescale++; + + for (h = 0; h < old_size; h++) { + bcp = old_table[h]; + while (bcp != NULL) { + void *addr = bcp->bc_addr; + kmem_bufctl_t *next_bcp = bcp->bc_next; + kmem_bufctl_t **hash_bucket = KMEM_HASH(cp, addr); + bcp->bc_next = *hash_bucket; + *hash_bucket = bcp; + bcp = next_bcp; + } + } + + mutex_exit(&cp->cache_lock); + + vmem_free(kmem_hash_arena, old_table, old_size * sizeof (void *)); +} + +/* + * Perform periodic maintenance on a cache: hash rescaling, depot working-set + * update, magazine resizing, and slab consolidation. + */ +static void +kmem_cache_update(kmem_cache_t *cp) +{ + int need_hash_rescale = 0; + int need_magazine_resize = 0; + + /* + * If the cache has become much larger or smaller than its hash table, + * fire off a request to rescale the hash table. + */ + mutex_enter(&cp->cache_lock); + + if ((cp->cache_flags & KMF_HASH) && + (cp->cache_buftotal > (cp->cache_hash_mask << 1) || + (cp->cache_buftotal < (cp->cache_hash_mask >> 1) && + cp->cache_hash_mask > KMEM_HASH_INITIAL))) + need_hash_rescale = 1; + + mutex_exit(&cp->cache_lock); + + /* + * Update the depot working set statistics. + */ + kmem_depot_ws_update(cp); + + /* + * If there's a lot of contention in the depot, + * increase the magazine size. 
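kmem_hash_rescale() above sizes the table so that buckets roughly track buffers: new_size = 1 << (highbit(3 * cache_buftotal + 4) - 2), clamped below by KMEM_HASH_INITIAL, and the change is only committed when the size would move by more than a factor of two. A stand-alone sketch of that computation (highbit_() is a portable stand-in for the kernel's highbit()):

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for highbit(): 1-based index of the highest set bit. */
    static int
    highbit_(uint64_t v)
    {
            int h = 0;

            while (v != 0) {
                    h++;
                    v >>= 1;
            }
            return (h);
    }

    int
    main(void)
    {
            uint64_t buftotal = 10000;      /* buffers currently in the cache */
            uint64_t new_size = 1ULL << (highbit_(3 * buftotal + 4) - 2);

            /* 3*10000 + 4 = 30004, highbit = 15, so new_size = 1 << 13 = 8192 */
            printf("%llu buffers -> %llu hash buckets\n",
                (unsigned long long)buftotal, (unsigned long long)new_size);
            return (0);
    }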
+ */ + mutex_enter(&cp->cache_depot_lock); + + if (cp->cache_chunksize < cp->cache_magtype->mt_maxbuf && + (int)(cp->cache_depot_contention - + cp->cache_depot_contention_prev) > kmem_depot_contention) + need_magazine_resize = 1; + + cp->cache_depot_contention_prev = cp->cache_depot_contention; + + mutex_exit(&cp->cache_depot_lock); + + if (need_hash_rescale) + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_hash_rescale, cp, TQ_NOSLEEP); + + if (need_magazine_resize) + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_magazine_resize, + cp, TQ_NOSLEEP); + + // smd : the following if is only TRUE for the dnode cache + if (cp->cache_defrag != NULL) + (void) taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_scan, cp, TQ_NOSLEEP); + +#ifdef DEBUG + else { + // for every other cache, duplicate some of the logic from + // kmem_cache_scan() below + // run reap occasionally even if there is plenty of memory + uint16_t debug_rand; + + (void) random_get_bytes((uint8_t *)&debug_rand, 2); + if (!kmem_move_noreap && + ((debug_rand % kmem_mtb_reap) == 0)) { + // no mutex above, so no need to give it up as in kmem_cache_scan() + // dprintf("SPL: kmem_cache_update random debug reap %u, doing %s\n", + // ++kmem_mtb_reap_count, cp->cache_name); + // kmem_cache_reap(cp); // XXX + } + } +#endif + +} + +static void kmem_update_timeout(void *dummy); + +static void +kmem_update(void *dummy) +{ + kmem_cache_applyall(kmem_cache_update, NULL, TQ_NOSLEEP); + + /* + * We use taskq_dispatch() to reschedule the timeout so that + * kmem_update() becomes self-throttling: it won't schedule + * new tasks until all previous tasks have completed. + */ + if (!taskq_dispatch(kmem_taskq, kmem_update_timeout, dummy, TQ_NOSLEEP)) + kmem_update_timeout(&kmem_update_timer); + +} + +static void +kmem_update_timeout(void *dummy) +{ + (void) bsd_timeout(kmem_update, dummy, &kmem_reap_interval); +} + +static int +kmem_cache_kstat_update(kstat_t *ksp, int rw) +{ + struct kmem_cache_kstat *kmcp = &kmem_cache_kstat; + kmem_cache_t *cp = ksp->ks_private; + uint64_t cpu_buf_avail; + uint64_t buf_avail = 0; + int cpu_seqid; + long reap; + + if (rw == KSTAT_WRITE) + return (EACCES); + + mutex_enter(&cp->cache_lock); + + kmcp->kmc_alloc_fail.value.ui64 = cp->cache_alloc_fail; + kmcp->kmc_alloc.value.ui64 = cp->cache_slab_alloc; + kmcp->kmc_free.value.ui64 = cp->cache_slab_free; + kmcp->kmc_slab_alloc.value.ui64 = cp->cache_slab_alloc; + kmcp->kmc_slab_free.value.ui64 = cp->cache_slab_free; + kmcp->kmc_no_vba_success.value.ui64 = cp->no_vba_success; + kmcp->kmc_no_vba_fail.value.ui64 = cp->no_vba_fail; + kmcp->kmc_arc_no_grow_set.value.ui64 = cp->arc_no_grow_set; + kmcp->kmc_arc_no_grow.value.ui64 = cp->arc_no_grow; + + for (cpu_seqid = 0; cpu_seqid < (int)max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + + mutex_enter(&ccp->cc_lock); + + cpu_buf_avail = 0; + if (ccp->cc_rounds > 0) + cpu_buf_avail += ccp->cc_rounds; + if (ccp->cc_prounds > 0) + cpu_buf_avail += ccp->cc_prounds; + + kmcp->kmc_alloc.value.ui64 += ccp->cc_alloc; + kmcp->kmc_free.value.ui64 += ccp->cc_free; + buf_avail += cpu_buf_avail; + + mutex_exit(&ccp->cc_lock); + } + + mutex_enter(&cp->cache_depot_lock); + + kmcp->kmc_depot_alloc.value.ui64 = cp->cache_full.ml_alloc; + kmcp->kmc_depot_free.value.ui64 = cp->cache_empty.ml_alloc; + kmcp->kmc_depot_contention.value.ui64 = cp->cache_depot_contention; + kmcp->kmc_full_magazines.value.ui64 = cp->cache_full.ml_total; + kmcp->kmc_empty_magazines.value.ui64 = 
cp->cache_empty.ml_total; + kmcp->kmc_magazine_size.value.ui64 = + (cp->cache_flags & KMF_NOMAGAZINE) ? + 0 : cp->cache_magtype->mt_magsize; + + kmcp->kmc_alloc.value.ui64 += cp->cache_full.ml_alloc; + kmcp->kmc_free.value.ui64 += cp->cache_empty.ml_alloc; + buf_avail += cp->cache_full.ml_total * cp->cache_magtype->mt_magsize; + + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); + reap = MIN(reap, cp->cache_full.ml_total); + + mutex_exit(&cp->cache_depot_lock); + + kmcp->kmc_buf_size.value.ui64 = cp->cache_bufsize; + kmcp->kmc_align.value.ui64 = cp->cache_align; + kmcp->kmc_chunk_size.value.ui64 = cp->cache_chunksize; + kmcp->kmc_slab_size.value.ui64 = cp->cache_slabsize; + kmcp->kmc_buf_constructed.value.ui64 = buf_avail; + buf_avail += cp->cache_bufslab; + kmcp->kmc_buf_avail.value.ui64 = buf_avail; + kmcp->kmc_buf_inuse.value.ui64 = cp->cache_buftotal - buf_avail; + kmcp->kmc_buf_total.value.ui64 = cp->cache_buftotal; + kmcp->kmc_buf_max.value.ui64 = cp->cache_bufmax; + kmcp->kmc_slab_create.value.ui64 = cp->cache_slab_create; + kmcp->kmc_slab_destroy.value.ui64 = cp->cache_slab_destroy; + kmcp->kmc_hash_size.value.ui64 = (cp->cache_flags & KMF_HASH) ? + cp->cache_hash_mask + 1 : 0; + kmcp->kmc_hash_lookup_depth.value.ui64 = cp->cache_lookup_depth; + kmcp->kmc_hash_rescale.value.ui64 = cp->cache_rescale; + kmcp->kmc_vmem_source.value.ui64 = cp->cache_arena->vm_id; + kmcp->kmc_reap.value.ui64 = cp->cache_reap; + + if (cp->cache_defrag == NULL) { + kmcp->kmc_move_callbacks.value.ui64 = 0; + kmcp->kmc_move_yes.value.ui64 = 0; + kmcp->kmc_move_no.value.ui64 = 0; + kmcp->kmc_move_later.value.ui64 = 0; + kmcp->kmc_move_dont_need.value.ui64 = 0; + kmcp->kmc_move_dont_know.value.ui64 = 0; + kmcp->kmc_move_hunt_found.value.ui64 = 0; + kmcp->kmc_move_slabs_freed.value.ui64 = 0; + kmcp->kmc_defrag.value.ui64 = 0; + kmcp->kmc_scan.value.ui64 = 0; + kmcp->kmc_move_reclaimable.value.ui64 = 0; + } else { + int64_t reclaimable; + + kmem_defrag_t *kd = cp->cache_defrag; + kmcp->kmc_move_callbacks.value.ui64 = kd->kmd_callbacks; + kmcp->kmc_move_yes.value.ui64 = kd->kmd_yes; + kmcp->kmc_move_no.value.ui64 = kd->kmd_no; + kmcp->kmc_move_later.value.ui64 = kd->kmd_later; + kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need; + kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know; + kmcp->kmc_move_hunt_found.value.ui64 = kd->kmd_hunt_found; + kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed; + kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags; + kmcp->kmc_scan.value.ui64 = kd->kmd_scans; + + reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1); + reclaimable = MAX(reclaimable, 0); + reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize); + kmcp->kmc_move_reclaimable.value.ui64 = reclaimable; + } + + mutex_exit(&cp->cache_lock); + return (0); +} + +/* + * Return a named statistic about a particular cache. + * This shouldn't be called very often, so it's currently designed for + * simplicity (leverages existing kstat support) rather than efficiency. 
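kmem_cache_stat(), defined just below, is a convenience lookup by kstat name. Assuming the conventional illumos names carried by kmem_cache_kstat (for example "buf_inuse" and "buf_total"; the name table itself is declared earlier in this file and is not visible in this hunk), a caller could poll a cache's occupancy like this:

    /* Hedged example: occasional occupancy check, not a hot-path API. */
    uint64_t inuse = kmem_cache_stat(cp, "buf_inuse");
    uint64_t total = kmem_cache_stat(cp, "buf_total");

    if (total != 0)
            dprintf("SPL: cache %s is %llu%% occupied\n",
                cp->cache_name, (inuse * 100ULL) / total);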
+ */ +uint64_t +kmem_cache_stat(kmem_cache_t *cp, char *name) +{ + uint_t i; + kstat_t *ksp = cp->cache_kstat; + kstat_named_t *knp = (kstat_named_t *)&kmem_cache_kstat; + uint64_t value = 0; + + if (ksp != NULL) { + mutex_enter(&kmem_cache_kstat_lock); + (void) kmem_cache_kstat_update(ksp, KSTAT_READ); + for (i = 0; i < ksp->ks_ndata; i++) { + if (strcmp(knp[i].name, name) == 0) { + value = knp[i].value.ui64; + break; + } + } + mutex_exit(&kmem_cache_kstat_lock); + } + return (value); +} + +// TRUE if we have more than a critical minimum of memory +// used in arc_memory_throttle; if FALSE, we throttle +static inline boolean_t +spl_minimal_physmem_p_logic() +{ + // Are we using more than ZFS has? + if (segkmem_total_mem_allocated > total_memory) { + vm_page_free_wanted = (segkmem_total_mem_allocated - total_memory) / PAGE_SIZE; + vm_page_free_count = 0; + } else { + vm_page_free_wanted = 0; + vm_page_free_count = (total_memory - segkmem_total_mem_allocated) / PAGE_SIZE; + } + + // do we have enough memory to avoid throttling? + if (vm_page_free_wanted > 0) + return (FALSE); + if (vm_page_free_count < (vm_page_free_min + 512)) + // 512 pages above 3500 (normal vm_page_free_min) + // 2MiB above 13 MiB + return (FALSE); + return (TRUE); +} + +/* + * Windows pressure events come from "\KernelObjects\HighMemoryCondition" and + * "\KernelObjects\LowMemoryCondition" + */ + +int64_t +spl_minimal_physmem_p(void) +{ + + // arc will throttle throttle if we are paging, otherwise + // we want a small bit of pressure here so that we can compete + // a little with the xnu buffer cache + + return (spl_free > -1024LL); +} + +/* + * Return the maximum amount of memory that is (in theory) allocatable + * from the heap. This may be used as an estimate only since there + * is no guarentee this space will still be available when an allocation + * request is made, nor that the space may be allocated in one big request + * due to kernel heap fragmentation. + */ +uint64_t +kmem_maxavail(void) +{ +#ifndef APPLE + // spgcnt_t pmem = availrmem - tune.t_minarmem; + // spgcnt_t vmem = btop(vmem_size(heap_arena, VMEM_FREE)); + // + // return ((uint32_t)ptob(MAX(MIN(pmem, vmem), 0))); +#endif + return (physmem * PAGE_SIZE); +} + +/* + * Indicate whether memory-intensive kmem debugging is enabled. + */ +int +kmem_debugging(void) +{ + return (kmem_flags & (KMF_AUDIT | KMF_REDZONE)); +} + +/* binning function, sorts finely at the two extremes */ +#define KMEM_PARTIAL_SLAB_WEIGHT(sp, binshift) \ +((((sp)->slab_refcnt <= (binshift)) || \ +(((sp)->slab_chunks - (sp)->slab_refcnt) <= (binshift))) \ +? -(sp)->slab_refcnt \ +: -((binshift) + ((sp)->slab_refcnt >> (binshift)))) + +/* + * Minimizing the number of partial slabs on the freelist minimizes + * fragmentation (the ratio of unused buffers held by the slab layer). There are + * two ways to get a slab off of the freelist: 1) free all the buffers on the + * slab, and 2) allocate all the buffers on the slab. It follows that we want + * the most-used slabs at the front of the list where they have the best chance + * of being completely allocated, and the least-used slabs at a safe distance + * from the front to improve the odds that the few remaining buffers will all be + * freed before another allocation can tie up the slab. For that reason a slab + * with a higher slab_refcnt sorts less than than a slab with a lower + * slab_refcnt. 
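The binning macro above turns a slab's occupancy into a negative weight: slabs that are nearly full or nearly empty keep their exact reference count (fine sorting at the two extremes), while everything in between collapses into coarse bins so middling slabs are not constantly reshuffled. A stand-alone sketch with illustrative numbers (binshift 4 corresponds to a cache with 64 chunks per slab, per the cache_partial_binshift computation in kmem_cache_create() below):

    #include <stdio.h>

    struct slab { int slab_refcnt; int slab_chunks; };

    #define PARTIAL_SLAB_WEIGHT(sp, binshift)                         \
            ((((sp)->slab_refcnt <= (binshift)) ||                    \
            (((sp)->slab_chunks - (sp)->slab_refcnt) <= (binshift)))  \
            ? -(sp)->slab_refcnt                                      \
            : -((binshift) + ((sp)->slab_refcnt >> (binshift))))

    int
    main(void)
    {
            struct slab slabs[] = {
                    { 62, 64 },     /* nearly full:  weight -62, sorts first */
                    { 40, 64 },     /* middling:     weight -(4 + 2) = -6    */
                    { 20, 64 },     /* middling:     weight -(4 + 1) = -5    */
                    {  3, 64 },     /* nearly empty: weight -3, sorts last   */
            };

            for (int i = 0; i < 4; i++)
                    printf("refcnt %2d -> weight %d\n",
                        slabs[i].slab_refcnt,
                        PARTIAL_SLAB_WEIGHT(&slabs[i], 4));
            return (0);
    }

Lower (more negative) weights sort toward the front of the partial-slab list, matching the comparator that follows.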
+ * + * However, if a slab has at least one buffer that is deemed unfreeable, we + * would rather have that slab at the front of the list regardless of + * slab_refcnt, since even one unfreeable buffer makes the entire slab + * unfreeable. If the client returns KMEM_CBRC_NO in response to a cache_move() + * callback, the slab is marked unfreeable for as long as it remains on the + * freelist. + */ +static int +kmem_partial_slab_cmp(const void *pp0, const void *pp1) +{ + const kmem_cache_t *cp; + const kmem_slab_t *s0 = pp0; + const kmem_slab_t *s1 = pp1; + int w0, w1; + uint32_t binshift; + + ASSERT(KMEM_SLAB_IS_PARTIAL(s0)); + ASSERT(KMEM_SLAB_IS_PARTIAL(s1)); + ASSERT(s0->slab_cache == s1->slab_cache); + cp = s1->slab_cache; + ASSERT(MUTEX_HELD((struct kmutex *)&cp->cache_lock)); + binshift = cp->cache_partial_binshift; + + /* weight of first slab */ + w0 = KMEM_PARTIAL_SLAB_WEIGHT(s0, (int)binshift); + if (s0->slab_flags & KMEM_SLAB_NOMOVE) { + w0 -= cp->cache_maxchunks; + } + + /* weight of second slab */ + w1 = KMEM_PARTIAL_SLAB_WEIGHT(s1, (int)binshift); + if (s1->slab_flags & KMEM_SLAB_NOMOVE) { + w1 -= cp->cache_maxchunks; + } + + if (w0 < w1) + return (-1); + if (w0 > w1) + return (1); + + // compare slab age if available + hrtime_t c0 = s0->slab_create_time, c1 = s1->slab_create_time; + if (c0 !=0 && c1 != 0 && c0 != c1) { + // higher time is newer; newer sorts before older + if (c0 < c1) // c0 is older than c1 + return (1); // so c0 sorts after c1 + if (c0 > c1) + return (-1); + } + + /* compare pointer values */ + if ((uintptr_t)s0 < (uintptr_t)s1) + return (-1); + if ((uintptr_t)s0 > (uintptr_t)s1) + return (1); + + return (0); +} + +/* + * It must be valid to call the destructor (if any) on a newly created object. + * That is, the constructor (if any) must leave the object in a valid state for + * the destructor. + */ +kmem_cache_t * +kmem_cache_create( + char *name, /* descriptive name for this cache */ + uint32_t bufsize, /* size of the objects it manages */ + uint32_t align, /* required object alignment */ + int (*constructor)(void *, void *, int), /* object constructor */ + void (*destructor)(void *, void *), /* object destructor */ + void (*reclaim)(void *), /* memory reclaim callback */ + void *private, /* pass-thru arg for constr/destr/reclaim */ + vmem_t *vmp, /* vmem source for slab allocation */ + int cflags) /* cache creation flags */ +{ + int cpu_seqid; + uint32_t chunksize; + kmem_cache_t *cp; + kmem_magtype_t *mtp; + uint32_t csize = KMEM_CACHE_SIZE(max_ncpus); + +#ifdef DEBUG + /* + * Cache names should conform to the rules for valid C identifiers + */ + if (!strident_valid(name)) { + cmn_err(CE_CONT, + "kmem_cache_create: '%s' is an invalid cache name\n" + "cache names must conform to the rules for " + "C identifiers\n", name); + } +#endif /* DEBUG */ + + if (vmp == NULL) + vmp = kmem_default_arena; + + /* + * If this kmem cache has an identifier vmem arena as its source, mark + * it such to allow kmem_reap_idspace(). + */ + ASSERT(!(cflags & KMC_IDENTIFIER)); /* consumer should not set this */ + if (vmp->vm_cflags & VMC_IDENTIFIER) + cflags |= KMC_IDENTIFIER; + + /* + * Get a kmem_cache structure. We arrange that cp->cache_cpu[] + * is aligned on a KMEM_CPU_CACHE_SIZE boundary to prevent + * FALSE sharing of per-CPU data. 
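Before the allocation and flag plumbing below, it may help to see what a client of this interface looks like. A hypothetical cache of small nodes (the znode_example names, types, and sizes are illustrative only, not taken from this patch):

    /* Hypothetical object type and callbacks. */
    typedef struct znode_example {
            uint64_t ze_id;
            kmutex_t ze_lock;
    } znode_example_t;

    static int
    znode_example_cons(void *buf, void *private, int kmflags)
    {
            znode_example_t *zp = buf;

            mutex_init(&zp->ze_lock, NULL, MUTEX_DEFAULT, NULL);
            return (0);
    }

    static void
    znode_example_dest(void *buf, void *private)
    {
            znode_example_t *zp = buf;

            /* must be safe immediately after the constructor, per the rule above */
            mutex_destroy(&zp->ze_lock);
    }

    static kmem_cache_t *znode_example_cache;

    void
    znode_example_init(void)
    {
            znode_example_cache = kmem_cache_create("znode_example_cache",
                sizeof (znode_example_t), 0, znode_example_cons,
                znode_example_dest, NULL, NULL, NULL, 0);
    }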
+ */ + cp = vmem_xalloc(kmem_cache_arena, csize, + KMEM_CPU_CACHE_SIZE, + P2NPHASE(csize, KMEM_CPU_CACHE_SIZE), + 0, NULL, NULL, VM_SLEEP); + bzero(cp, csize); + list_link_init(&cp->cache_link); + + if (align == 0) + align = KMEM_ALIGN; + + /* + * If we're not at least KMEM_ALIGN aligned, we can't use free + * memory to hold bufctl information (because we can't safely + * perform word loads and stores on it). + */ + if (align < KMEM_ALIGN) + cflags |= KMC_NOTOUCH; + + if ((align & (align - 1)) != 0 || align > vmp->vm_quantum) + panic("kmem_cache_create: bad alignment %lu", align); + + mutex_enter(&kmem_flags_lock); + if (kmem_flags & KMF_RANDOMIZE) + kmem_flags = (((kmem_flags | ~KMF_RANDOM) + 1) & KMF_RANDOM) | + KMF_RANDOMIZE; + cp->cache_flags = (kmem_flags | cflags) & KMF_DEBUG; + mutex_exit(&kmem_flags_lock); + + /* + * Make sure all the various flags are reasonable. + */ + ASSERT(!(cflags & KMC_NOHASH) || !(cflags & KMC_NOTOUCH)); + + if (cp->cache_flags & KMF_LITE) { + if (bufsize >= kmem_lite_minsize && + align <= kmem_lite_maxalign && + P2PHASE(bufsize, kmem_lite_maxalign) != 0) { + cp->cache_flags |= KMF_BUFTAG; + cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL); + } else { + cp->cache_flags &= ~KMF_DEBUG; + } + } + + if (cp->cache_flags & KMF_DEADBEEF) + cp->cache_flags |= KMF_REDZONE; + + if ((cflags & KMC_QCACHE) && (cp->cache_flags & KMF_AUDIT)) + cp->cache_flags |= KMF_NOMAGAZINE; + + if (cflags & KMC_NODEBUG) + cp->cache_flags &= ~KMF_DEBUG; + + if (cflags & KMC_NOTOUCH) + cp->cache_flags &= ~KMF_TOUCH; + + if (cflags & KMC_PREFILL) + cp->cache_flags |= KMF_PREFILL; + + if (cflags & KMC_NOHASH) + cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL); + + if (cflags & KMC_NOMAGAZINE) + cp->cache_flags |= KMF_NOMAGAZINE; + + if ((cp->cache_flags & KMF_AUDIT) && !(cflags & KMC_NOTOUCH)) + cp->cache_flags |= KMF_REDZONE; + + if (!(cp->cache_flags & KMF_AUDIT)) + cp->cache_flags &= ~KMF_CONTENTS; + + if ((cp->cache_flags & KMF_BUFTAG) && bufsize >= kmem_minfirewall && + !(cp->cache_flags & KMF_LITE) && !(cflags & KMC_NOHASH)) + cp->cache_flags |= KMF_FIREWALL; + + if (vmp != kmem_default_arena || kmem_firewall_arena == NULL) + cp->cache_flags &= ~KMF_FIREWALL; + + if (cp->cache_flags & KMF_FIREWALL) { + cp->cache_flags &= ~KMF_BUFTAG; + cp->cache_flags |= KMF_NOMAGAZINE; + ASSERT(vmp == kmem_default_arena); + vmp = kmem_firewall_arena; + } + + /* + * Set cache properties. + */ + (void) strncpy(cp->cache_name, name, KMEM_CACHE_NAMELEN); + strident_canon(cp->cache_name, KMEM_CACHE_NAMELEN + 1); + cp->cache_bufsize = bufsize; + cp->cache_align = align; + cp->cache_constructor = constructor; + cp->cache_destructor = destructor; + cp->cache_reclaim = reclaim; + cp->cache_private = private; + cp->cache_arena = vmp; + cp->cache_cflags = cflags; + + /* + * Determine the chunk size. 
+ */ + chunksize = bufsize; + + if (align >= KMEM_ALIGN) { + chunksize = P2ROUNDUP(chunksize, KMEM_ALIGN); + cp->cache_bufctl = chunksize - KMEM_ALIGN; + } + + if (cp->cache_flags & KMF_BUFTAG) { + cp->cache_bufctl = chunksize; + cp->cache_buftag = chunksize; + if (cp->cache_flags & KMF_LITE) + chunksize += KMEM_BUFTAG_LITE_SIZE(kmem_lite_count); + else + chunksize += sizeof (kmem_buftag_t); + } + + if (cp->cache_flags & KMF_DEADBEEF) { + cp->cache_verify = MIN(cp->cache_buftag, kmem_maxverify); + if (cp->cache_flags & KMF_LITE) + cp->cache_verify = sizeof (uint64_t); + } + + cp->cache_contents = MIN(cp->cache_bufctl, kmem_content_maxsave); + + cp->cache_chunksize = chunksize = P2ROUNDUP(chunksize, align); + + /* + * Now that we know the chunk size, determine the optimal slab size. + */ + + size_t vquantum = vmp->vm_quantum; + + if ((cflags & KMC_ARENA_SLAB) == KMC_ARENA_SLAB) { + VERIFY3U((vmp->vm_cflags & VMC_NO_QCACHE),==,VMC_NO_QCACHE); + VERIFY3U(vmp->vm_min_import,>,0); + VERIFY3U(vmp->vm_min_import,>=,(2 * vmp->vm_quantum)); + VERIFY(ISP2(vmp->vm_min_import)); + vquantum = vmp->vm_min_import >> 1; + } + + if (vmp == kmem_firewall_arena) { + cp->cache_slabsize = P2ROUNDUP(chunksize, vquantum); + cp->cache_mincolor = cp->cache_slabsize - chunksize; + cp->cache_maxcolor = cp->cache_mincolor; + cp->cache_flags |= KMF_HASH; + ASSERT(!(cp->cache_flags & KMF_BUFTAG)); + } else if ((cflags & KMC_NOHASH) || (!(cflags & KMC_NOTOUCH) && + !(cp->cache_flags & KMF_AUDIT) && + chunksize < vquantum / + KMEM_VOID_FRACTION)) { + cp->cache_slabsize = vquantum; + cp->cache_mincolor = 0; + cp->cache_maxcolor = + (cp->cache_slabsize - sizeof (kmem_slab_t)) % chunksize; + ASSERT(chunksize + sizeof (kmem_slab_t) <= cp->cache_slabsize); + ASSERT(!(cp->cache_flags & KMF_AUDIT)); + } else { + uint32_t chunks, bestfit, waste, slabsize; + uint32_t minwaste = LONG_MAX; + + for (chunks = 1; chunks <= KMEM_VOID_FRACTION; chunks++) { + slabsize = P2ROUNDUP(chunksize * chunks, + vquantum); + chunks = slabsize / chunksize; + waste = (slabsize % chunksize) / chunks; + if (waste < minwaste) { + minwaste = waste; + bestfit = slabsize; + } + } + if (cflags & KMC_QCACHE) + bestfit = VMEM_QCACHE_SLABSIZE(vmp->vm_qcache_max); + cp->cache_slabsize = bestfit; + cp->cache_mincolor = 0; + cp->cache_maxcolor = bestfit % chunksize; + cp->cache_flags |= KMF_HASH; + } + + cp->cache_maxchunks = (cp->cache_slabsize / cp->cache_chunksize); + cp->cache_partial_binshift = highbit(cp->cache_maxchunks / 16) + 1; + + /* + * Disallowing prefill when either the DEBUG or HASH flag is set or when + * there is a constructor avoids some tricky issues with debug setup + * that may be revisited later. We cannot allow prefill in a + * metadata cache because of potential recursion. + */ + if (vmp == kmem_msb_arena || + cp->cache_flags & (KMF_HASH | KMF_BUFTAG) || + cp->cache_constructor != NULL) + cp->cache_flags &= ~KMF_PREFILL; + + if (cp->cache_flags & KMF_HASH) { + ASSERT(!(cflags & KMC_NOHASH)); + cp->cache_bufctl_cache = (cp->cache_flags & KMF_AUDIT) ? + kmem_bufctl_audit_cache : kmem_bufctl_cache; + } + + if (cp->cache_maxcolor >= vquantum) + cp->cache_maxcolor = vquantum - 1; + + cp->cache_color = cp->cache_mincolor; + + /* + * Initialize the rest of the slab layer. 
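For hashed caches, the search above looks for the slab size that wastes the least space per chunk: it considers slabs holding 1..KMEM_VOID_FRACTION chunks, rounds each candidate up to the arena quantum, and keeps the one with the smallest leftover per chunk. A stand-alone sketch of the same idea (quantum and chunk size are example values, a VOID_FRACTION of 4 is an assumption mirroring the illumos ancestor, and the sketch keeps the candidate count separate from the recomputed chunk count for clarity):

    #include <stdint.h>
    #include <stdio.h>

    #define P2ROUNDUP_(x, align)    (-(-(x) & -(align)))
    #define VOID_FRACTION           4       /* assumed KMEM_VOID_FRACTION */

    static uint32_t
    best_slabsize(uint32_t chunksize, uint32_t quantum)
    {
            uint32_t bestfit = 0, minwaste = UINT32_MAX;

            for (uint32_t n = 1; n <= VOID_FRACTION; n++) {
                    uint32_t slabsize = P2ROUNDUP_(chunksize * n, quantum);
                    uint32_t chunks = slabsize / chunksize;
                    uint32_t waste = (slabsize % chunksize) / chunks;

                    if (waste < minwaste) {
                            minwaste = waste;
                            bestfit = slabsize;
                    }
            }
            return (bestfit);
    }

    int
    main(void)
    {
            /* e.g. a 1100-byte chunk on a 4096-byte quantum */
            printf("best slab size: %u\n", best_slabsize(1100, 4096));
            return (0);
    }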
+ */ + mutex_init(&cp->cache_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&cp->cache_partial_slabs, kmem_partial_slab_cmp, + sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link)); + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t)); + /* reuse partial slab AVL linkage for complete slab list linkage */ + list_create(&cp->cache_complete_slabs, + sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link)); + + if (cp->cache_flags & KMF_HASH) { + cp->cache_hash_table = vmem_alloc(kmem_hash_arena, + KMEM_HASH_INITIAL * sizeof (void *), + VM_SLEEP); + bzero(cp->cache_hash_table, + KMEM_HASH_INITIAL * sizeof (void *)); + cp->cache_hash_mask = KMEM_HASH_INITIAL - 1; + cp->cache_hash_shift = highbit((ulong_t)chunksize) - 1; + } + + /* + * Initialize the depot. + */ + mutex_init(&cp->cache_depot_lock, NULL, MUTEX_DEFAULT, NULL); + + for (mtp = kmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) + continue; + + cp->cache_magtype = mtp; + + /* + * Initialize the CPU layer. + */ + for (cpu_seqid = 0; cpu_seqid < (int)max_ncpus; cpu_seqid++) { + kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid]; + mutex_init(&ccp->cc_lock, NULL, MUTEX_DEFAULT, NULL); // XNU + ccp->cc_flags = cp->cache_flags; + ccp->cc_rounds = -1; + ccp->cc_prounds = -1; + } + + /* + * Create the cache's kstats. + */ + if ((cp->cache_kstat = kstat_create("unix", 0, cp->cache_name, + "kmem_cache", KSTAT_TYPE_NAMED, + sizeof (kmem_cache_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + cp->cache_kstat->ks_data = &kmem_cache_kstat; + cp->cache_kstat->ks_update = kmem_cache_kstat_update; + cp->cache_kstat->ks_private = cp; + cp->cache_kstat->ks_lock = &kmem_cache_kstat_lock; + kstat_install(cp->cache_kstat); + } + + /* + * Add the cache to the global list. This makes it visible + * to kmem_update(), so the cache must be ready for business. + */ + mutex_enter(&kmem_cache_lock); + list_insert_tail(&kmem_caches, cp); + mutex_exit(&kmem_cache_lock); + + if (kmem_ready) + kmem_cache_magazine_enable(cp); + + return (cp); +} + +static int +kmem_move_cmp(const void *buf, const void *p) +{ + const kmem_move_t *kmm = p; + uintptr_t v1 = (uintptr_t)buf; + uintptr_t v2 = (uintptr_t)kmm->kmm_from_buf; + return (v1 < v2 ? -1 : (v1 > v2 ? 1 : 0)); +} + +static void +kmem_reset_reclaim_threshold(kmem_defrag_t *kmd) +{ + kmd->kmd_reclaim_numer = 1; +} + +/* + * Initially, when choosing candidate slabs for buffers to move, we want to be + * very selective and take only slabs that are less than + * (1 / KMEM_VOID_FRACTION) allocated. If we have difficulty finding candidate + * slabs, then we raise the allocation ceiling incrementally. The reclaim + * threshold is reset to (1 / KMEM_VOID_FRACTION) as soon as the cache is no + * longer fragmented. 
+ */ +static void +kmem_adjust_reclaim_threshold(kmem_defrag_t *kmd, int direction) +{ + if (direction > 0) { + /* make it easier to find a candidate slab */ + if (kmd->kmd_reclaim_numer < (KMEM_VOID_FRACTION - 1)) { + kmd->kmd_reclaim_numer++; + } + } else { + /* be more selective */ + if (kmd->kmd_reclaim_numer > 1) { + kmd->kmd_reclaim_numer--; + } + } +} + +void +kmem_cache_set_move(kmem_cache_t *cp, + kmem_cbrc_t (*move)(void *, void *, uint32_t, void *)) +{ + kmem_defrag_t *defrag; + + ASSERT(move != NULL); + /* + * The consolidator does not support NOTOUCH caches because kmem cannot + * initialize their slabs with the 0xbaddcafe memory pattern, which sets + * a low order bit usable by clients to distinguish uninitialized memory + * from known objects (see kmem_slab_create). + */ + ASSERT(!(cp->cache_cflags & KMC_NOTOUCH)); + ASSERT(!(cp->cache_cflags & KMC_IDENTIFIER)); + + /* + * We should not be holding anyone's cache lock when calling + * kmem_cache_alloc(), so allocate in all cases before acquiring the + * lock. + */ + defrag = kmem_cache_alloc(kmem_defrag_cache, KM_SLEEP); + + mutex_enter(&cp->cache_lock); + + if (KMEM_IS_MOVABLE(cp)) { + if (cp->cache_move == NULL) { + ASSERT(cp->cache_slab_alloc == 0); + + cp->cache_defrag = defrag; + defrag = NULL; /* nothing to free */ + bzero(cp->cache_defrag, sizeof (kmem_defrag_t)); + avl_create(&cp->cache_defrag->kmd_moves_pending, + kmem_move_cmp, sizeof (kmem_move_t), + offsetof(kmem_move_t, kmm_entry)); + /* LINTED: E_TRUE_LOGICAL_EXPR */ + ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t)); + /* reuse the slab's AVL linkage for deadlist linkage */ + list_create(&cp->cache_defrag->kmd_deadlist, + sizeof (kmem_slab_t), + offsetof(kmem_slab_t, slab_link)); + kmem_reset_reclaim_threshold(cp->cache_defrag); + } + cp->cache_move = move; + } + + mutex_exit(&cp->cache_lock); + + if (defrag != NULL) { + kmem_cache_free(kmem_defrag_cache, defrag); /* unused */ + } +} + +void +kmem_qcache_destroy() +{ + kmem_cache_t *cp; + kmem_cache_t* cache_to_destroy = NULL; + + do { + cache_to_destroy = NULL; + mutex_enter(&kmem_cache_lock); + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (cp->cache_cflags & KMC_QCACHE) { + cache_to_destroy = cp; + break; + } + } + mutex_exit(&kmem_cache_lock); + + if (cache_to_destroy) { + kmem_cache_destroy(cache_to_destroy); + } + } while (cache_to_destroy); +} + +void +kmem_cache_destroy(kmem_cache_t *cp) +{ + int cpu_seqid; + + /* + * Remove the cache from the global cache list so that no one else + * can schedule tasks on its behalf, wait for any pending tasks to + * complete, purge the cache, and then destroy it. + */ + mutex_enter(&kmem_cache_lock); + list_remove(&kmem_caches, cp); + mutex_exit(&kmem_cache_lock); + + if (kmem_taskq != NULL) + taskq_wait(kmem_taskq); + + if (kmem_move_taskq != NULL) + taskq_wait(kmem_move_taskq); + + kmem_cache_magazine_purge(cp); + + mutex_enter(&cp->cache_lock); + + if (cp->cache_buftotal != 0) + cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty", + cp->cache_name, (void *)cp); + if (cp->cache_defrag != NULL) { + avl_destroy(&cp->cache_defrag->kmd_moves_pending); + list_destroy(&cp->cache_defrag->kmd_deadlist); + kmem_cache_free(kmem_defrag_cache, cp->cache_defrag); + cp->cache_defrag = NULL; + } + /* + * The cache is now dead. There should be no further activity. We + * enforce this by setting land mines in the constructor, destructor, + * reclaim, and move routines that induce a kernel text fault if + * invoked. 
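kmem_cache_set_move() above is how a client opts a cache into the defragmentation machinery. Continuing the hypothetical znode_example cache from the earlier sketch, a move callback following the KMEM_CBRC_* contract described in the surrounding comments might look like this (the znode_example_is_* helpers are purely illustrative):

    /*
     * Hypothetical move callback: copy the object from `old' to `new'.
     * The client may refuse (KMEM_CBRC_NO), defer (KMEM_CBRC_LATER),
     * report that the buffer is already being freed (KMEM_CBRC_DONT_NEED),
     * or admit it cannot tell (KMEM_CBRC_DONT_KNOW).
     */
    static kmem_cbrc_t
    znode_example_move(void *old, void *new, uint32_t size, void *private)
    {
            znode_example_t *ozp = old, *nzp = new;

            if (!znode_example_is_ours(ozp))        /* hypothetical check */
                    return (KMEM_CBRC_DONT_KNOW);
            if (znode_example_is_busy(ozp))         /* hypothetical check */
                    return (KMEM_CBRC_LATER);

            nzp->ze_id = ozp->ze_id;
            return (KMEM_CBRC_YES);
    }

    void
    znode_example_enable_move(void)
    {
            /* typically registered right after kmem_cache_create() */
            kmem_cache_set_move(znode_example_cache, znode_example_move);
    }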
+ */ + cp->cache_constructor = (int (*)(void *, void *, int))1; + cp->cache_destructor = (void (*)(void *, void *))2; + cp->cache_reclaim = (void (*)(void *))3; + cp->cache_move = (kmem_cbrc_t (*)(void *, void *, uint32_t, void *))4; + mutex_exit(&cp->cache_lock); + + kstat_delete(cp->cache_kstat); + + if (cp->cache_hash_table != NULL) + vmem_free(kmem_hash_arena, cp->cache_hash_table, + (cp->cache_hash_mask + 1) * sizeof (void *)); + + for (cpu_seqid = 0; cpu_seqid < (int)max_ncpus; cpu_seqid++) + mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock); // XNU + + mutex_destroy(&cp->cache_depot_lock); + mutex_destroy(&cp->cache_lock); + + vmem_free(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus)); +} + +static void +kmem_alloc_caches_create(const int *array, uint32_t count, + kmem_cache_t **alloc_table, uint32_t maxbuf, + uint_t shift) +{ + char name[KMEM_CACHE_NAMELEN + 1]; + uint32_t table_unit = (1 << shift); /* range of one alloc_table entry */ + uint32_t size = table_unit; + int i; + + for (i = 0; i < (int)count; i++) { + uint32_t cache_size = array[i]; + uint32_t align = KMEM_ALIGN; + kmem_cache_t *cp; + + /* if the table has an entry for maxbuf, we're done */ + if (size > maxbuf) + break; + + /* cache size must be a multiple of the table unit */ + ASSERT(P2PHASE(cache_size, table_unit) == 0); + + /* + * If they allocate a multiple of the coherency granularity, + * they get a coherency-granularity-aligned address. + */ + if (IS_P2ALIGNED(cache_size, 64)) + align = 64; + if (IS_P2ALIGNED(cache_size, PAGESIZE)) + align = PAGESIZE; + (void) snprintf(name, sizeof (name), + "kmem_alloc_%lu", cache_size); + cp = kmem_cache_create(name, cache_size, align, + NULL, NULL, NULL, NULL, NULL, KMC_KMEM_ALLOC | KMF_HASH); + + while (size <= cache_size) { + alloc_table[(size - 1) >> shift] = cp; + size += table_unit; + } + } + + ASSERT(size > maxbuf); /* i.e. maxbuf <= max(cache_size) */ +} + +static void +kmem_alloc_caches_destroy() +{ + kmem_cache_t *cache_to_destroy = NULL; + kmem_cache_t *cp = NULL; + + do { + cache_to_destroy = NULL; + + // Locate the first cache that has the KMC_KMEM_ALLOC flag. + mutex_enter(&kmem_cache_lock); + + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (cp->cache_cflags & KMC_KMEM_ALLOC) { + cache_to_destroy = cp; + break; + } + } + + mutex_exit(&kmem_cache_lock); + + // Destroy the cache + if (cache_to_destroy) { + kmem_cache_destroy(cache_to_destroy); + } + + } while (cache_to_destroy); +} + +static void +kmem_destroy_cache_by_name(const char *substr) +{ + kmem_cache_t *cache_to_destroy = NULL; + kmem_cache_t *cp = NULL; + + do { + cache_to_destroy = NULL; + + // Locate the first cache that has the KMC_KMEM_ALLOC flag. 
+ mutex_enter(&kmem_cache_lock); + + for (cp = list_head(&kmem_caches); cp != NULL; + cp = list_next(&kmem_caches, cp)) { + if (kmem_strstr(cp->cache_name, substr)) { + cache_to_destroy = cp; + break; + } + } + + mutex_exit(&kmem_cache_lock); + + // Destroy the cache + if (cache_to_destroy) { + kmem_cache_destroy(cache_to_destroy); + } + + } while (cache_to_destroy); +} + +static void +kmem_cache_init(int pass, int use_large_pages) +{ + int i; + uint32_t maxbuf; + kmem_magtype_t *mtp; + + for (i = 0; i < sizeof (kmem_magtype) / sizeof (*mtp); i++) { + char name[KMEM_CACHE_NAMELEN + 1]; + + mtp = &kmem_magtype[i]; + (void) snprintf(name, KMEM_CACHE_NAMELEN, "%s%d", + KMEM_MAGAZINE_PREFIX, + mtp->mt_magsize); + mtp->mt_cache = kmem_cache_create( + name, + (mtp->mt_magsize + 1) * sizeof (void *), + mtp->mt_align, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + } + + kmem_slab_cache = kmem_cache_create("kmem_slab_cache", + sizeof (kmem_slab_t), 0, NULL, NULL, + NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + kmem_bufctl_cache = kmem_cache_create("kmem_bufctl_cache", + sizeof (kmem_bufctl_t), 0, + NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + kmem_bufctl_audit_cache = kmem_cache_create("kmem_bufctl_audit_cache", + sizeof (kmem_bufctl_audit_t), + 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + if (pass == 2) { + kmem_va_arena = vmem_create(KMEM_VA_PREFIX, + NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, + 2 * PAGESIZE, VM_SLEEP); + + kmem_default_arena = vmem_create("kmem_default", + NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, kmem_va_arena, + 0, VMC_DUMPSAFE | VM_SLEEP); + + /* Figure out what our maximum cache size is */ + maxbuf = kmem_max_cached; + if (maxbuf <= KMEM_MAXBUF) { + maxbuf = 0; + kmem_max_cached = KMEM_MAXBUF; + } else { + uint32_t size = 0; + uint32_t max = + sizeof (kmem_big_alloc_sizes) / sizeof (int); + /* + * Round maxbuf up to an existing cache size. If maxbuf + * is larger than the largest cache, we truncate it to + * the largest cache's size. + */ + for (i = 0; i < (int)max; i++) { + size = kmem_big_alloc_sizes[i]; + if (maxbuf <= size) + break; + } + kmem_max_cached = maxbuf = size; + } + + /* + * The big alloc table may not be completely overwritten, so + * we clear out any stale cache pointers from the first pass. + */ + bzero(kmem_big_alloc_table, sizeof (kmem_big_alloc_table)); + } else { + /* + * During the first pass, the kmem_alloc_* caches + * are treated as metadata. 
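kmem_alloc_caches_create() above fills a lookup table so that an allocation size maps to its backing cache with a shift and an array index instead of a search. The kmem_alloc()/kmem_zalloc() consumers are outside this hunk, so the following is a sketch of the conventional illumos-style lookup such a table supports, using only the table names and shifts visible here:

    /*
     * Sketch (assumed consumer): find the cache backing a `size'-byte
     * allocation.  Small sizes index kmem_alloc_table in KMEM_ALIGN-sized
     * steps; larger sizes index kmem_big_alloc_table in KMEM_BIG_SHIFT steps.
     */
    static kmem_cache_t *
    alloc_cache_for_size(uint32_t size)
    {
            if (size <= KMEM_MAXBUF)
                    return (kmem_alloc_table[(size - 1) >> KMEM_ALIGN_SHIFT]);
            if (((size - 1) >> KMEM_BIG_SHIFT) < kmem_big_alloc_table_max)
                    return (kmem_big_alloc_table[(size - 1) >> KMEM_BIG_SHIFT]);
            return (NULL);  /* too big for a cache: fall back to vmem */
    }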
+ */ + kmem_default_arena = kmem_msb_arena; + maxbuf = KMEM_BIG_MAXBUF_32BIT; + } + + /* + * Set up the default caches to back kmem_alloc() + */ + kmem_alloc_caches_create( + kmem_alloc_sizes, sizeof (kmem_alloc_sizes) / sizeof (int), + kmem_alloc_table, KMEM_MAXBUF, KMEM_ALIGN_SHIFT); + + kmem_alloc_caches_create( + kmem_big_alloc_sizes, sizeof (kmem_big_alloc_sizes) / sizeof (int), + kmem_big_alloc_table, maxbuf, KMEM_BIG_SHIFT); + + kmem_big_alloc_table_max = maxbuf >> KMEM_BIG_SHIFT; +} + +struct free_slab { + vmem_t *vmp; + uint32_t slabsize; + void *slab; + list_node_t next; +}; + +static list_t freelist; + + +void +kmem_cache_build_slablist(kmem_cache_t *cp) +{ + int cpu_seqid; + + vmem_t *vmp = cp->cache_arena; + kmem_slab_t *sp; + struct free_slab *fs; + + for (sp = list_head(&cp->cache_complete_slabs); sp != NULL; + sp = list_next(&cp->cache_complete_slabs, sp)) { + + MALLOC(fs, struct free_slab *, sizeof(struct free_slab), + M_TEMP, M_WAITOK); + fs->vmp = vmp; + fs->slabsize = cp->cache_slabsize; + fs->slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum); + list_link_init(&fs->next); + list_insert_tail(&freelist, fs); + } + + for (sp = avl_first(&cp->cache_partial_slabs); sp != NULL; + sp = AVL_NEXT(&cp->cache_partial_slabs, sp)) { + + MALLOC(fs, struct free_slab *, sizeof(struct free_slab), M_TEMP, M_WAITOK); + fs->vmp = vmp; + fs->slabsize = cp->cache_slabsize; + fs->slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum); + list_link_init(&fs->next); + list_insert_tail(&freelist, fs); + + } + + + kstat_delete(cp->cache_kstat); + + if (cp->cache_hash_table != NULL) + vmem_free(kmem_hash_arena, cp->cache_hash_table, + (cp->cache_hash_mask + 1) * sizeof (void *)); + + for (cpu_seqid = 0; cpu_seqid < (int)max_ncpus; cpu_seqid++) + mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock); // XNU + + mutex_destroy(&cp->cache_depot_lock); + mutex_destroy(&cp->cache_lock); + + vmem_free(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus)); +} + + +static void +kmem_cache_fini() +{ + kmem_cache_t *cp; + int i; + struct free_slab *fs; + + list_create(&freelist, sizeof (struct free_slab), + offsetof(struct free_slab, next)); + + mutex_enter(&kmem_cache_lock); + + while ((cp = list_head(&kmem_caches))) { + list_remove(&kmem_caches, cp); + mutex_exit(&kmem_cache_lock); + kmem_cache_build_slablist(cp); + mutex_enter(&kmem_cache_lock); + } + + mutex_exit(&kmem_cache_lock); + + i = 0; + while ((fs = list_head(&freelist))) { + i++; + list_remove(&freelist, fs); + vmem_free(fs->vmp, fs->slab, fs->slabsize); + FREE(fs, M_TEMP); + + } + dprintf("SPL: Released %u slabs\n", i); + list_destroy(&freelist); +} + + +// this is intended to substitute for kmem_avail() in arc.c +int64_t +spl_free_wrapper(void) +{ + return (spl_free); +} + +// this is intended to substitute for kmem_avail() in arc.c +// when arc_reclaim_thread() calls spl_free_set_pressure(0); +int64_t +spl_free_manual_pressure_wrapper(void) +{ + return (spl_free_manual_pressure); +} + +uint64_t +spl_free_last_pressure_wrapper(void) +{ + return (spl_free_last_pressure); +} + +int64_t +spl_free_set_and_wait_pressure(int64_t new_p, boolean_t fast, clock_t check_interval) +{ + + int64_t snapshot_pressure = 0; + + if (new_p <= 0) + return (0); + + spl_free_fast_pressure = fast; + + if (spl_free_manual_pressure >= 0) + spl_free_manual_pressure += new_p; + else + spl_free_manual_pressure = new_p; + + // wait for another thread to reset pressure + const uint64_t start = zfs_lbolt(); + const uint64_t end_by = start + (hz*60); + const 
uint64_t double_at = start + (hz/2); + const uint64_t double_again_at = start + hz; + boolean_t doubled = FALSE, doubled_again = FALSE; + uint64_t now; + + spl_free_last_pressure = start; + + for (; spl_free_manual_pressure != 0; ) { + // has another thread set spl_free_manual_pressure? + if (spl_free_manual_pressure < new_p) + spl_free_manual_pressure = new_p; + snapshot_pressure = spl_free_manual_pressure; + mutex_enter(&spl_free_thread_lock); + cv_timedwait_hires(&spl_free_thread_cv, + &spl_free_thread_lock, check_interval, 0, 0); + mutex_exit(&spl_free_thread_lock); + now = zfs_lbolt(); + if (now > end_by) { + TraceEvent(TRACE_ERROR, "%s: timed out after one minute!\n", __func__); + break; + } else if (now > double_again_at && !doubled_again) { + doubled_again = TRUE; + new_p *= 2; + } else if (now > double_at) { + doubled = TRUE; + new_p *= 2; + } + } + return (snapshot_pressure); +} + +// routinely called by arc_reclaim_thread() with new_p == 0 +void +spl_free_set_pressure(int64_t new_p) +{ + if (new_p > spl_free_manual_pressure || new_p <= 0) + spl_free_manual_pressure = new_p; + if (new_p == 0) { + spl_free_fast_pressure = FALSE; + // wake up both spl_free_thread() to recalculate spl_free + // and any spl_free_set_and_wait_pressure() threads + cv_broadcast(&spl_free_thread_cv); + } + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_set_pressure_both(int64_t new_p, boolean_t fast) +{ + spl_free_fast_pressure = fast; + if (new_p > spl_free_manual_pressure || new_p <= 0) + spl_free_manual_pressure = new_p; + spl_free_last_pressure = zfs_lbolt(); +} + +void spl_free_maybe_reap(void); + +void +spl_free_set_emergency_pressure(int64_t new_p) +{ + spl_free_fast_pressure = TRUE; + if (new_p > spl_free_manual_pressure || new_p <= 0) + spl_free_manual_pressure = new_p; + spl_free_maybe_reap(); + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_set_emergency_pressure_additive(int64_t new_p) +{ + spl_free_fast_pressure = TRUE; + spl_free_manual_pressure += new_p; + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_set_pressure_additive(int64_t new_p) +{ + spl_free_manual_pressure += new_p; + spl_free_last_pressure = zfs_lbolt(); +} + +boolean_t +spl_free_fast_pressure_wrapper() +{ + return (spl_free_fast_pressure); +} + +void +spl_free_set_fast_pressure(boolean_t state) +{ + spl_free_fast_pressure = state; + spl_free_last_pressure = zfs_lbolt(); +} + +void +spl_free_reap_caches(void) +{ + // note: this may take some time + vmem_qcache_reap(zio_arena_parent); + kmem_reap(); + vmem_qcache_reap(kmem_va_arena); +} + +void +spl_free_maybe_reap(void) +{ + static _Atomic uint64_t last_reap = 0; + const uint64_t lockout_time = 60 * hz; + + uint64_t now = zfs_lbolt(); + if (now > last_reap + lockout_time) { + last_reap = now; + spl_free_maybe_reap_flag = TRUE; + } +} + +boolean_t +spl_maybe_send_large_pressure(uint64_t now, uint64_t minutes, boolean_t full) +{ + static volatile _Atomic uint64_t spl_last_large_pressure = 0; + const uint64_t interval_ticks = minutes * 60ULL * (uint64_t)hz; + + if (spl_last_large_pressure + interval_ticks > now) + return (FALSE); + + spl_last_large_pressure = now; + + const int64_t sixteenth_physmem = (int64_t)real_total_memory / 16LL; + const int64_t sixtyfourth_physmem = sixteenth_physmem / 4LL; + int64_t howmuch = sixteenth_physmem; + + if (full == FALSE) + howmuch = sixtyfourth_physmem; + + + dprintf("SPL: %s: %lld bytes at time %llu\n", + __func__, howmuch, now); + + spl_free_set_emergency_pressure(howmuch); + + return(TRUE); +} + +static 
void +spl_free_thread(void *notused) +{ + callb_cpr_t cpr; + uint64_t last_update = zfs_lbolt(); + int64_t last_spl_free; + uint64_t ema_new = 0; + uint64_t ema_old = 0; + uint64_t alpha; + + CALLB_CPR_INIT(&cpr, &spl_free_thread_lock, callb_generic_cpr, FTAG); + + spl_free = (int64_t)PAGESIZE * + (int64_t)(vm_page_free_count - vm_page_free_min); + + mutex_enter(&spl_free_thread_lock); + + dprintf("SPL: beginning spl_free_thread() loop, spl_free == %lld\n", + spl_free); + + uint64_t recent_lowmem = 0; + uint64_t last_disequilibrium = 0; + + while (!spl_free_thread_exit) { + mutex_exit(&spl_free_thread_lock); + boolean_t lowmem = FALSE; + boolean_t emergency_lowmem = FALSE; + int64_t base; + int64_t new_spl_free = 0LL; + + spl_stats.spl_free_wake_count.value.ui64++; + + if (spl_free_maybe_reap_flag == TRUE) { + spl_free_maybe_reap_flag = FALSE; + spl_free_reap_caches(); + } + + uint64_t time_now = zfs_lbolt(); + uint64_t time_now_seconds = 0; + if (time_now > hz) + time_now_seconds = time_now / hz; + + last_spl_free = spl_free; + + new_spl_free = 0LL; + + // if there is pressure that has not yet reached arc_reclaim_thread() + // then start with a negative new_spl_free + if (spl_free_manual_pressure > 0) { + int64_t old_pressure = spl_free_manual_pressure; + new_spl_free -= old_pressure * 2LL; + lowmem = TRUE; + if (spl_free_fast_pressure) { + emergency_lowmem = TRUE; + new_spl_free -= old_pressure * 4LL; + } + } + + // can we allocate at least a 64 MiB segment from spl_heap_arena? + // this probes the reserve and also the largest imported spans, + // which vmem_alloc can fragment if needed. + + boolean_t reserve_low = FALSE; + extern vmem_t *spl_heap_arena; + const uint64_t sixtyfour = 64ULL*1024ULL*1024ULL; + const uint64_t rvallones = (sixtyfour << 1ULL) - 1ULL; + const uint64_t rvmask = ~rvallones; + uint64_t rvfreebits = spl_heap_arena->vm_freemap; + + if ((rvfreebits & rvmask) == 0) { + reserve_low = TRUE; + } else { + new_spl_free += (int64_t) sixtyfour; + } + + // do we have lots of memory in the spl_heap_arena ? + + boolean_t early_lots_free = FALSE; + const uint64_t onetwentyeight = 128ULL*1024ULL*1024ULL; + const uint64_t sixteen = 16ULL*1024ULL*1024ULL; + if (!reserve_low) { + early_lots_free = TRUE; + } else if (vmem_size_semi_atomic(spl_heap_arena, VMEM_FREE) > onetwentyeight) { + early_lots_free = TRUE; + new_spl_free += (int64_t) sixteen; + } + + // do we have lots of memory in the bucket_arenas ? 
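The 64 MiB probe above reads spl_heap_arena->vm_freemap directly. Assuming the usual vmem convention that bit n of the freemap is set when freelist n (free segments sized in [2^n, 2^(n+1))) is non-empty, masking off every bit at or below the target size's own bit asks whether any freelist whose segments are guaranteed to satisfy the request still has entries. A stand-alone sketch of that test:

    #include <stdint.h>

    /*
     * Does the arena still hold a free segment comfortably larger than
     * `want' (a power of two, like the 64 MiB probe above)?  Assumes the
     * freemap convention described in the lead-in; no freelist walk needed.
     */
    static int
    arena_has_big_segment(uint64_t freemap, uint64_t want)
    {
            uint64_t allones = (want << 1) - 1;   /* bits 0 .. highbit(want) */

            return ((freemap & ~allones) != 0);
    }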
+ + extern int64_t vmem_buckets_size(int); // non-locking + int64_t buckets_free = vmem_buckets_size(VMEM_FREE); + if ((uint64_t)buckets_free != spl_buckets_mem_free) + spl_buckets_mem_free = (uint64_t) buckets_free; + + if (buckets_free >= 512LL*1024LL*1024LL) { + early_lots_free = TRUE; + new_spl_free += (int64_t) sixteen; + } + if (buckets_free >= 1024LL*1024LL*1024LL) { + reserve_low = FALSE; + new_spl_free += (int64_t) sixteen; + } + + // if we have neither alloced or freed in several minutes, + // then we do not need to shrink back if there is a momentary + // transient memory spike (i.e., one that lasts less than a second) + + boolean_t memory_equilibrium = FALSE; + const uint64_t five_minutes = 300ULL; + const uint64_t one_minute = 60ULL; + uint64_t last_xat_alloc_seconds = spl_xat_lastalloc; + uint64_t last_xat_free_seconds = spl_xat_lastfree; + + if (last_xat_alloc_seconds + five_minutes > time_now_seconds && + last_xat_free_seconds + five_minutes > time_now_seconds) { + if (last_disequilibrium + one_minute > time_now_seconds) { + memory_equilibrium = TRUE; + last_disequilibrium = 0; + } + } else { + last_disequilibrium = time_now_seconds; + } + + boolean_t just_alloced = FALSE; + if (last_xat_alloc_seconds + 1 > time_now_seconds) + just_alloced = TRUE; + + // this is a sign of a period of time of low system memory, however + // XNU's generation of this variable is not very predictable, + // but generally it should be taken seriously when it's positive + // (it is often FALSEly 0) + + if ((vm_page_free_wanted > 0 && reserve_low && !early_lots_free && + !memory_equilibrium && !just_alloced) || + vm_page_free_wanted >= 1024) { + int64_t bminus = (int64_t)vm_page_free_wanted * (int64_t)PAGESIZE * -16LL; + if (bminus > -16LL*1024LL*1024LL) + bminus = -16LL*1024LL*1024LL; + new_spl_free += bminus; + lowmem = TRUE; + emergency_lowmem = TRUE; + // atomic swaps to set these variables used in .../zfs/arc.c + int64_t previous_highest_pressure = 0; + int64_t new_p = -bminus; + previous_highest_pressure = spl_free_manual_pressure; + if (new_p > previous_highest_pressure || new_p <= 0) { + boolean_t fast = FALSE; + if (vm_page_free_wanted > vm_page_free_min / 8) + fast = TRUE; + spl_free_set_pressure_both(-16LL * new_spl_free, fast); + } + last_disequilibrium = time_now_seconds; + } else if (vm_page_free_wanted > 0) { + int64_t bytes_wanted = (int64_t)vm_page_free_wanted * (int64_t)PAGESIZE; + new_spl_free -= bytes_wanted; + if (reserve_low && !early_lots_free) { + lowmem = TRUE; + if (recent_lowmem == 0) { + recent_lowmem = time_now; + } + if (!memory_equilibrium) { + last_disequilibrium = time_now_seconds; + } + } + } + + // these variables are reliably maintained by XNU + // if vm_page_free_count > vm_page_free_min, then XNU + // is scanning pages and we may want to try to free some memory up + + int64_t above_min_free_pages = (int64_t)vm_page_free_count - (int64_t)vm_page_free_min; + int64_t above_min_free_bytes = (int64_t)PAGESIZE * above_min_free_pages; + + // vm_page_free_min normally 3500, page free target normally 4000 but not exported + // so we are not scanning if we are 500 pages above vm_page_free_min. 
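As a worked example of that guard band (assuming 4 KiB pages): with vm_page_free_min at 3500 and vm_page_free_count at 5000, above_min_free_pages is 1500, roughly 6 MiB above the floor, which exceeds the 500-page (about 2 MiB) band, so this test alone does not set lowmem. At 3800 free pages the margin is only 300 pages (about 1.2 MiB), so lowmem is set and a reap becomes eligible, provided the reserve is also low and none of the escape conditions (early_lots_free, memory_equilibrium) apply.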
+ + // even if we're scanning we may have plenty of space in the reserve arena, + // in which case we should not react too strongly + + // if we have been in memory equilibrium, also don't react too strongly + + if (above_min_free_bytes < (int64_t)PAGESIZE * 500LL && reserve_low + && !early_lots_free && !memory_equilibrium) { + // trigger a reap below + lowmem = TRUE; + } + extern volatile unsigned int vm_page_speculative_count; + if ((above_min_free_bytes < 0LL && reserve_low && !early_lots_free && + !memory_equilibrium && !just_alloced) || + above_min_free_bytes <= -4LL*1024LL*1024LL) { + int64_t new_p = -1LL * above_min_free_bytes; + boolean_t fast = FALSE; + emergency_lowmem = TRUE; + lowmem = TRUE; + recent_lowmem = time_now; + last_disequilibrium = time_now_seconds; + int64_t spec_bytes = (int64_t)vm_page_speculative_count * (int64_t)PAGESIZE; + if (vm_page_free_wanted > 0 || new_p > spec_bytes) { + // force a stronger reaction from ARC if we are also low on + // speculative pages (xnu prefetched file blocks with no clients yet) + fast = TRUE; + } + spl_free_set_pressure_both(new_p, fast); + } else if (above_min_free_bytes < 0LL && !early_lots_free) { + lowmem = TRUE; + if (recent_lowmem == 0) + recent_lowmem = time_now; + if (!memory_equilibrium) + last_disequilibrium = time_now_seconds; + } + + new_spl_free += above_min_free_bytes; + + // If we have already detected a memory shortage and we + // have not reaped in a while (a short while for emergency_lowmem), + // then do a kmem_reap() now. + // See http://comments.gmane.org/gmane.os.illumos.devel/22552 + // (notably Richard Elling's "A kernel module can call kmem_reap() whenever + // it wishes and some modules, like zfs, do so." + // If we reap, stop processing spl_free on this pass, to + // let the reaps (and arc, if pressure has been set above) + // do their job for a few milliseconds. + if (emergency_lowmem || lowmem) { + static uint64_t last_reap = 0; + uint64_t now = time_now; + uint64_t elapsed = 60*hz; + if (emergency_lowmem) + elapsed = 15*hz; // minimum frequency from kmem_reap_interval + if (now - last_reap > elapsed) { + last_reap = now; + // spl_free_reap_caches() calls functions that will + // acquire locks and can take a while + // so set spl_free to a small positive value + // to stop arc shrinking too much during this period + // when we expect to be freeing up arc-usable memory, + // but low enough that arc_no_grow likely will be set. + const int64_t two_spamax = 32LL * 1024LL * 1024LL; + if (spl_free < two_spamax) + spl_free = two_spamax; // atomic! + spl_free_reap_caches(); + // we do not have any lock now, so we can jump + // to just before the thread-suspending code + goto justwait; + } + } + + // a number or exceptions to reverse the lowmem / emergency_lowmem states + // if we have recently reaped. 
we also take the strong reaction sting + // out of the set pressure by turning off spl_free_fast_pressure, since + // that automatically provokes an arc shrink and arc reap + + if (!reserve_low || early_lots_free || memory_equilibrium || just_alloced) { + lowmem = FALSE; + emergency_lowmem = FALSE; + spl_free_fast_pressure = FALSE; + } + + if (vm_page_speculative_count > 0) { + // speculative memory can be squeezed a bit; it is file blocks that + // have been prefetched by xnu but are not (yet) in use by any + // consumer + if (vm_page_speculative_count / 4 + vm_page_free_count > vm_page_free_min) { + emergency_lowmem = FALSE; + spl_free_fast_pressure = FALSE; + } + if (vm_page_speculative_count / 2 + vm_page_free_count > vm_page_free_min) { + lowmem = FALSE; + spl_free_fast_pressure = FALSE; + } + } + + // Stay in a low memory condition for several seconds after we + // first detect that we are in it, giving the system (arc, xnu and userland) + // time to adapt + + if (!lowmem && recent_lowmem > 0) { + if (recent_lowmem + 4*hz < time_now) + lowmem = TRUE; + else + recent_lowmem = 0; + } + + // if we are in a lowmem "hangover", cure it with pressure, then wait + // for the pressure to take effect in arc.c code + + // triggered when we have had at least one lowmem in the previous + // few seconds -- possibly two (one that causes a reap, one + // that falls through to the 4 second hold above). + + if (recent_lowmem == time_now && early_lots_free && reserve_low) { + // we can't grab 64 MiB as a single segment, + // but otherwise have ample memory brought in from xnu, + // but recently we had lowmem... and still have lowmem. + // cure this condition with a dose of pressure. + if (above_min_free_bytes < 0) { + int64_t old_p = spl_free_manual_pressure; + if (old_p <= -above_min_free_bytes) { + recent_lowmem = 0; + spl_free_manual_pressure = -above_min_free_bytes; + goto justwait; + } + } + } + + base = new_spl_free; + + // adjust for available memory in spl_heap_arena + // cf arc_available_memory() + if (!emergency_lowmem) { + extern vmem_t *spl_default_arena; + int64_t heap_free = (int64_t)vmem_size_semi_atomic(spl_heap_arena, VMEM_FREE); + // grabbed buckets_free up above; we are OK with change to it in the meanwhile, + // it'll get an update on the next run. + int64_t combined_free = heap_free + buckets_free; + + if (combined_free != 0) { + const int64_t mb = 1024*1024; + if (!lowmem && above_min_free_bytes > (int64_t)PAGESIZE * 10000LL) { + if (above_min_free_bytes < 64LL * mb) + new_spl_free += combined_free / 16; + else if (above_min_free_bytes < 128LL * mb) + new_spl_free += combined_free / 8; + else if (above_min_free_bytes < 256LL * mb) + new_spl_free += combined_free / 4; + else + new_spl_free += combined_free / 2; + } else { + new_spl_free -= 16LL * mb; + } + } + + // memory footprint has gotten really big, decrease spl_free substantially + int64_t total_mem_used = (int64_t) segkmem_total_mem_allocated; + int64_t mem_used_perc = (int64_t)(segkmem_total_mem_allocated * 100LL / real_total_memory); + if (mem_used_perc > 70 && mem_used_perc <= 75) { + new_spl_free -= total_mem_used / 64; + } else if (mem_used_perc > 75) { + new_spl_free -= total_mem_used / 32; + lowmem = TRUE; + } + } + + // Adjust in the face of a large ARC. + // We don't treat (zfs) metadata and non-metadata + // differently here, and leave policy with respect + // to the relative value of each up to arc.c. + // O3X arc.c does not (yet) take these arena sizes into + // account like Illumos's does. 
+ uint64_t zio_size = vmem_size_semi_atomic(zio_arena_parent, VMEM_ALLOC | VMEM_FREE); + // wrap this in a basic block for lexical scope SSA convenience + if (zio_size > 0) { + static uint64_t zio_last_too_big = 0; + static int64_t imposed_cap = 75; + const uint64_t seconds_of_lower_cap = 10*hz; + uint64_t now = time_now; + uint32_t zio_pct = (uint32_t)(zio_size * 100ULL / real_total_memory); + // if not hungry for memory, shrink towards a + // 75% total memory cap on zfs_file_data + if (!lowmem && !emergency_lowmem && zio_pct > 75 && + (now > zio_last_too_big + seconds_of_lower_cap)) { + new_spl_free -= zio_size / 64; + zio_last_too_big = now; + imposed_cap = 75; + } else if (lowmem || emergency_lowmem) { + // shrink towards stricter caps if we are hungry for memory + const uint32_t lowmem_cap = 25; + const uint32_t emergency_lowmem_cap = 5; + // we don't want the lowest cap to be so low that + // we will not make any use of the fixed size reserve + if (lowmem && zio_pct > lowmem_cap) { + new_spl_free -= zio_size / 32; + zio_last_too_big = now; + imposed_cap = lowmem_cap; + } + if (emergency_lowmem && zio_pct > emergency_lowmem_cap) { + new_spl_free -= zio_size / 8; + zio_last_too_big = now; + imposed_cap = emergency_lowmem_cap; + } + } + if (zio_last_too_big != now && + now < zio_last_too_big + seconds_of_lower_cap && + zio_pct > imposed_cap) { + new_spl_free -= zio_size / 64; + } + } + + // try to get 1/64 of spl_heap_arena freed up + if (emergency_lowmem && new_spl_free >= 0LL) { + extern vmem_t *spl_root_arena; + uint64_t root_size = vmem_size_semi_atomic(spl_heap_arena, VMEM_ALLOC | VMEM_FREE); + uint64_t root_free = vmem_size_semi_atomic(spl_heap_arena, VMEM_FREE); + int64_t difference = root_size - root_free; + int64_t target = root_size / 64; + if (difference < target) { + new_spl_free -= target; + } + // and we should definitely not be returning positive now + if (new_spl_free >= 0LL) + new_spl_free = -1024LL; + } + + uint64_t delta = (uint64_t)new_spl_free - (uint64_t)last_spl_free; + + boolean_t spl_free_is_negative = FALSE; + + if (new_spl_free < 0LL) { + spl_stats.spl_spl_free_negative_count.value.ui64++; + spl_free_is_negative = TRUE; + } + + // NOW set spl_free from calculated new_spl_free + spl_free = new_spl_free; + // the direct equivalent of : + // __c11_atomic_store(&spl_free, new_spl_free, __ATOMIC_SEQ_CST); + + + // Because we're already negative, arc is likely to have been + // signalled already. We can rely on the _maybe_ in + // spl-vmem.c:xnu_alloc_throttled() [XAT] to try to give arc a + // kick with greater probability. + // However, if we've gone negative several times, and have not + // tried a full kick in a long time, do so now; if the full kick + // is refused because there has been a kick too few minutes ago, + // try a gentler kick. + // We do this outside the lock, as spl_maybe_send_large_pressure + // may need to take a mutex, and we forbid further mutex entry when + // spl_free_lock is held. 
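To put numbers on the file-data caps above: on a machine with 16 GiB of real memory, zio_pct crosses the relaxed 75% cap once the zio arena holds 12 GiB; outside a low-memory episode that trims new_spl_free by zio_size/64, roughly 192 MiB per pass. Under lowmem the cap tightens to 25% with a zio_size/32 deduction, and under emergency_lowmem to 5% (about 0.8 GiB here) with the much stronger zio_size/8 deduction, i.e. 128 MiB for every GiB the arena still holds.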
+ + if (spl_free_is_negative) { + static volatile _Atomic uint32_t negatives_since_last_kick = 0; + + if (negatives_since_last_kick++ > 8) { + if (spl_maybe_send_large_pressure(time_now, 360, TRUE) || + spl_maybe_send_large_pressure(time_now, 60, FALSE)) { + negatives_since_last_kick = 0; + } + } + } + + if (lowmem) + recent_lowmem = time_now; + + // maintain an exponential moving average for the ema kstat + if (last_update > hz) + alpha = 1; + else { + uint64_t td_tick = (uint64_t)(time_now - last_update); + alpha = td_tick / (uint64_t)(hz*50); // roughly 0.02 + } + + ema_new = (alpha * delta) + (1 - alpha)*ema_old; + spl_free_delta_ema = ema_new; + ema_old = ema_new; + + justwait: + mutex_enter(&spl_free_thread_lock); + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_hires(&spl_free_thread_cv, &spl_free_thread_lock, + MSEC2NSEC(10), 0, 0); + CALLB_CPR_SAFE_END(&cpr, &spl_free_thread_lock); + } + spl_free_thread_exit = FALSE; + dprintf("SPL: spl_free_thread_exit set to FALSE " \ + "and exiting: cv_broadcasting\n"); + spl_free_manual_pressure = 0; + cv_broadcast(&spl_free_thread_cv); + CALLB_CPR_EXIT(&cpr); + dprintf("SPL: %s thread_exit\n", __func__); + thread_exit(); +} + +static void +spl_event_thread(void *notused) +{ + callb_cpr_t cpr; + NTSTATUS Status; + + DECLARE_CONST_UNICODE_STRING(low_mem_name, L"\\KernelObjects\\LowMemoryCondition"); + HANDLE low_mem_handle; + low_mem_event = IoCreateNotificationEvent((PUNICODE_STRING)&low_mem_name, &low_mem_handle); + if (low_mem_event == NULL) { + TraceEvent(TRACE_ERROR, "%s: failed IoCreateNotificationEvent(\\KernelObjects\\LowMemoryCondition)", __func__); + thread_exit(); + } + KeClearEvent(low_mem_event); + + dprintf("SPL: beginning spl_event_thread() loop\n"); + + while (!spl_event_thread_exit) { + + /* Don't busy loop */ + delay(hz); + + /* Sleep forever waiting for event */ + Status = KeWaitForSingleObject(low_mem_event, Executive, KernelMode, FALSE, NULL); + KeClearEvent(low_mem_event); + + xprintf("%s: LOWMEMORY EVENT *** 0x%x (memusage: %llu)\n", __func__, Status, segkmem_total_mem_allocated); + /* We were signalled */ + //vm_page_free_wanted = vm_page_free_min; + spl_free_set_pressure(vm_page_free_min); + vm_page_free_wanted = vm_page_free_min; + cv_broadcast(&spl_free_thread_cv); + } + + ZwClose(low_mem_handle); + + spl_event_thread_exit = FALSE; + dprintf("SPL: %s thread_exit\n", __func__); + thread_exit(); +} + + + + +static int +spl_kstat_update(kstat_t *ksp, int rw) +{ + spl_stats_t *ks = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + + if (ks->spl_spl_free_manual_pressure.value.i64 != spl_free_manual_pressure) { + spl_free_set_pressure(ks->spl_spl_free_manual_pressure.value.i64 * 1024 *1024); + if (ks->spl_spl_free_manual_pressure.value.i64 > 0) { + spl_free_reap_caches(); + } + } + + if (ks->spl_spl_free_fast_pressure.value.i64 != spl_free_fast_pressure) { + if (spl_free_wrapper() != 0) { + spl_free_set_fast_pressure(TRUE); + } + } + + if (ks->spl_bucket_tunable_large_span.value.ui64 != spl_bucket_tunable_large_span) { + spl_set_bucket_tunable_large_span(ks->spl_bucket_tunable_large_span.value.ui64); + } + + if (ks->spl_bucket_tunable_small_span.value.ui64 != spl_bucket_tunable_small_span) { + spl_set_bucket_tunable_small_span(ks->spl_bucket_tunable_small_span.value.ui64); + } + + if (ks->spl_frag_max_walk.value.ui64 != spl_frag_max_walk) { + spl_frag_max_walk = ks->spl_frag_max_walk.value.ui64; + } + + if (ks->kmem_free_to_slab_when_fragmented.value.ui64 != + kmem_free_to_slab_when_fragmented) { + 
kmem_free_to_slab_when_fragmented = + ks->kmem_free_to_slab_when_fragmented.value.ui64; + } + + } else { + ks->spl_os_alloc.value.ui64 = segkmem_total_mem_allocated; + ks->spl_active_threads.value.ui64 = zfs_threads; + ks->spl_active_mutex.value.ui64 = zfs_active_mutex; + ks->spl_active_rwlock.value.ui64 = zfs_active_rwlock; + ks->spl_active_tsd.value.ui64 = spl_tsd_size(); + ks->spl_spl_free.value.i64 = spl_free; + ks->spl_spl_free_manual_pressure.value.i64 = spl_free_manual_pressure; + ks->spl_spl_free_fast_pressure.value.i64 = spl_free_fast_pressure; + ks->spl_spl_free_delta_ema.value.i64 = spl_free_delta_ema; + ks->spl_osif_malloc_success.value.ui64 = stat_osif_malloc_success; + ks->spl_osif_malloc_bytes.value.ui64 = stat_osif_malloc_bytes; + ks->spl_osif_free.value.ui64 = stat_osif_free; + ks->spl_osif_free_bytes.value.ui64 = stat_osif_free_bytes; + ks->spl_bucket_non_pow2_allocs.value.ui64 = spl_bucket_non_pow2_allocs; + + ks->spl_vmem_unconditional_allocs.value.ui64 = spl_vmem_unconditional_allocs; + ks->spl_vmem_unconditional_alloc_bytes.value.ui64 = spl_vmem_unconditional_alloc_bytes; + ks->spl_vmem_conditional_allocs.value.ui64 = spl_vmem_conditional_allocs; + ks->spl_vmem_conditional_alloc_bytes.value.ui64 = spl_vmem_conditional_alloc_bytes; + ks->spl_vmem_conditional_alloc_deny.value.ui64 = spl_vmem_conditional_alloc_deny; + ks->spl_vmem_conditional_alloc_deny_bytes.value.ui64 = spl_vmem_conditional_alloc_deny_bytes; + + ks->spl_xat_success.value.ui64 = spl_xat_success; + ks->spl_xat_late_success.value.ui64 = spl_xat_late_success; + ks->spl_xat_late_success_nosleep.value.ui64 = spl_xat_late_success_nosleep; + ks->spl_xat_pressured.value.ui64 = spl_xat_pressured; + ks->spl_xat_bailed.value.ui64 = spl_xat_bailed; + ks->spl_xat_bailed_contended.value.ui64 = spl_xat_bailed_contended; + ks->spl_xat_lastalloc.value.ui64 = spl_xat_lastalloc; + ks->spl_xat_lastfree.value.ui64 = spl_xat_lastfree; + ks->spl_xat_forced.value.ui64 = spl_xat_forced; + ks->spl_xat_sleep.value.ui64 = spl_xat_sleep; + ks->spl_xat_late_deny.value.ui64 = spl_xat_late_deny; + ks->spl_xat_no_waiters.value.ui64 = spl_xat_no_waiters; + ks->spl_xft_wait.value.ui64 = spl_xft_wait; + + ks->spl_vba_parent_memory_appeared.value.ui64 = spl_vba_parent_memory_appeared; + ks->spl_vba_parent_memory_blocked.value.ui64 = spl_vba_parent_memory_blocked; + ks->spl_vba_hiprio_blocked.value.ui64 = spl_vba_hiprio_blocked; + ks->spl_vba_cv_timeout.value.ui64 = spl_vba_cv_timeout; + ks->spl_vba_loop_timeout.value.ui64 = spl_vba_loop_timeout; + ks->spl_vba_cv_timeout_blocked.value.ui64 = spl_vba_cv_timeout_blocked; + ks->spl_vba_loop_timeout_blocked.value.ui64 = spl_vba_loop_timeout_blocked; + ks->spl_vba_sleep.value.ui64 = spl_vba_sleep; + ks->spl_vba_loop_entries.value.ui64 = spl_vba_loop_entries; + + ks->spl_bucket_tunable_large_span.value.ui64 = spl_bucket_tunable_large_span; + ks->spl_bucket_tunable_small_span.value.ui64 = spl_bucket_tunable_small_span; + + ks->spl_buckets_mem_free.value.ui64 = spl_buckets_mem_free; + ks->spl_arc_no_grow_bits.value.ui64 = spl_arc_no_grow_bits; + ks->spl_arc_no_grow_count.value.ui64 = spl_arc_no_grow_count; + + ks->spl_frag_max_walk.value.ui64 = spl_frag_max_walk; + ks->spl_frag_walked_out.value.ui64 = spl_frag_walked_out; + + ks->spl_frag_walk_cnt.value.ui64 = spl_frag_walk_cnt; + + ks->spl_arc_reclaim_avoided.value.ui64 = spl_arc_reclaim_avoided; + + ks->kmem_free_to_slab_when_fragmented.value.ui64 = kmem_free_to_slab_when_fragmented; + } + + return (0); +} + +void +spl_kmem_init(uint64_t 
xtotal_memory) +{ + int old_kmem_flags = kmem_flags; + int use_large_pages = 0; + uint32_t maxverify, minfirewall; + + dprintf("SPL: Total memory %llu\n", xtotal_memory); + + //sysctl_register_oid(&sysctl__spl); + //sysctl_register_oid(&sysctl__spl_kext_version); + + // Initialise the kstat lock + mutex_init(&kmem_cache_lock, "kmem_cache_lock", MUTEX_DEFAULT, NULL); // XNU + mutex_init(&kmem_flags_lock, "kmem_flags_lock", MUTEX_DEFAULT, NULL); // XNU + mutex_init(&kmem_cache_kstat_lock, "kmem_kstat_lock", MUTEX_DEFAULT, NULL); // XNU + + /* kstat mutex init */ + spl_kstat_init(); + + /* start vm_init */ + kernelheap_init(); + + /* kstat init */ + kstat_init(); + + /* + * Small-memory systems (< 24 MB) can't handle kmem_flags overhead. + */ + if (physmem < btop(24 << 20) && !(old_kmem_flags & KMF_STICKY)) + kmem_flags = 0; + + /* + * Don't do firewalled allocations if the heap is less than 1TB + * (i.e. on a 32-bit kernel) + * The resulting VM_NEXTFIT allocations would create too much + * fragmentation in a small heap. + */ + maxverify = minfirewall = PAGESIZE / 2; + + + /* LINTED */ + ASSERT(sizeof (kmem_cpu_cache_t) == KMEM_CPU_CACHE_SIZE); + + list_create(&kmem_caches, sizeof (kmem_cache_t), + offsetof(kmem_cache_t, cache_link)); + + kmem_metadata_arena = vmem_create("kmem_metadata", NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, 8 * PAGESIZE, + VM_SLEEP | VMC_NO_QCACHE); + + kmem_msb_arena = vmem_create("kmem_msb", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, kmem_metadata_arena, 0, + VMC_DUMPSAFE | VM_SLEEP); + + kmem_cache_arena = vmem_create("kmem_cache", NULL, 0, KMEM_ALIGN, + vmem_alloc, vmem_free, kmem_metadata_arena, 0, VM_SLEEP); + + kmem_hash_arena = vmem_create("kmem_hash", NULL, 0, KMEM_ALIGN, + vmem_alloc, vmem_free, kmem_metadata_arena, 0, VM_SLEEP); + + kmem_log_arena = vmem_create("kmem_log", NULL, 0, KMEM_ALIGN, + vmem_alloc, vmem_free, kmem_metadata_arena, 0, VM_SLEEP); + + /* temporary oversize arena for mod_read_system_file */ + kmem_oversize_arena = vmem_create("kmem_oversize", NULL, 0, PAGESIZE, + vmem_alloc, vmem_free, heap_arena, 0, VM_SLEEP); + + // statically declared above kmem_reap_interval = 15 * hz; + + /* + * Read /etc/system. This is a chicken-and-egg problem because + * kmem_flags may be set in /etc/system, but mod_read_system_file() + * needs to use the allocator. The simplest solution is to create + * all the standard kmem caches, read /etc/system, destroy all the + * caches we just created, and then create them all again in light + * of the (possibly) new kmem_flags and other kmem tunables. + */ + + if (old_kmem_flags & KMF_STICKY) + kmem_flags = old_kmem_flags; + + if (!(kmem_flags & KMF_AUDIT)) + vmem_seg_size = offsetof(vmem_seg_t, vs_thread); + + if (kmem_maxverify == 0) + kmem_maxverify = maxverify; + + if (kmem_minfirewall == 0) + kmem_minfirewall = minfirewall; + + /* + * give segkmem a chance to figure out if we are using large pages + * for the kernel heap + */ + // use_large_pages = segkmem_lpsetup(); + use_large_pages = 0; + + /* + * To protect against corruption, we keep the actual number of callers + * KMF_LITE records seperate from the tunable. We arbitrarily clamp + * to 16, since the overhead for small buffers quickly gets out of + * hand. + * + * The real limit would depend on the needs of the largest KMC_NOHASH + * cache. 
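+ * (Concretely: kmem_lite_pcs is clamped into [0, 16] just below and
+ * written back, so the tunable always reflects the count in use.)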
+ */ + kmem_lite_count = MIN(MAX(0, kmem_lite_pcs), 16); + kmem_lite_pcs = kmem_lite_count; + + kmem_cache_init(2, use_large_pages); + + if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) { + if (kmem_transaction_log_size == 0) + kmem_transaction_log_size = (uint32_t) MIN(kmem_maxavail() / 50ULL, + PAGESIZE<<4); + kmem_transaction_log = kmem_log_init(kmem_transaction_log_size); + } + + if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) { + if (kmem_content_log_size == 0) + kmem_content_log_size = (uint32_t) MIN(kmem_maxavail() / 50ULL, + PAGESIZE<<4); + kmem_content_log = kmem_log_init(kmem_content_log_size); + } + + kmem_failure_log = kmem_log_init(kmem_failure_log_size); + + kmem_slab_log = kmem_log_init(kmem_slab_log_size); + + + /* + * Warn about invalid or dangerous values of kmem_flags. + * Always warn about unsupported values. + */ + if (((kmem_flags & ~(KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | + KMF_CONTENTS | KMF_LITE)) != 0) || + ((kmem_flags & KMF_LITE) && kmem_flags != KMF_LITE)) + cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x. " + "See the Solaris Tunable Parameters Reference Manual.", + kmem_flags); + +#ifdef DEBUG + if ((kmem_flags & KMF_DEBUG) == 0) + cmn_err(CE_NOTE, "kmem debugging disabled."); +#else + /* + * For non-debug kernels, the only "normal" flags are 0, KMF_LITE, + * KMF_REDZONE, and KMF_CONTENTS (the last because it is only enabled + * if KMF_AUDIT is set). We should warn the user about the performance + * penalty of KMF_AUDIT or KMF_DEADBEEF if they are set and KMF_LITE + * isn't set (since that disables AUDIT). + */ + if (!(kmem_flags & KMF_LITE) && + (kmem_flags & (KMF_AUDIT | KMF_DEADBEEF)) != 0) + cmn_err(CE_WARN, "High-overhead kmem debugging features " + "enabled (kmem_flags = 0x%x). Performance degradation " + "and large memory overhead possible. See the Solaris " + "Tunable Parameters Reference Manual.", kmem_flags); +#endif /* not DEBUG */ + + segkmem_zio_init(); + + kmem_cache_applyall(kmem_cache_magazine_enable, NULL, TQ_SLEEP); + + kmem_ready = 1; + + // Install spl kstats + spl_ksp = kstat_create("spl", 0, "spl_misc", "misc", KSTAT_TYPE_NAMED, + sizeof (spl_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); + + if (spl_ksp != NULL) { + spl_ksp->ks_data = &spl_stats; + spl_ksp->ks_update = spl_kstat_update; + kstat_install(spl_ksp); + } +} + +void +spl_kmem_fini(void) +{ + //sysctl_unregister_oid(&sysctl__spl_kext_version); + //sysctl_unregister_oid(&sysctl__spl); + + kmem_cache_applyall(kmem_cache_magazine_disable, NULL, TQ_SLEEP); + + kstat_delete(spl_ksp); + + kmem_log_fini(kmem_slab_log); + kmem_log_fini(kmem_failure_log); + + if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) { + if (kmem_content_log_size == 0) + kmem_content_log_size = (uint32_t) kmem_maxavail() / 50; + kmem_log_fini(kmem_content_log); + } + + if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) { + if (kmem_transaction_log_size == 0) + kmem_transaction_log_size = (uint32_t) kmem_maxavail() / 50; + kmem_log_fini(kmem_transaction_log); + } + + // Destroy all the "general allocation" caches + kmem_alloc_caches_destroy(); + + // Destroy the VA associated caches + kmem_destroy_cache_by_name(KMEM_VA_PREFIX); + + kmem_qcache_destroy(); + // Destroy metadata caches + kmem_cache_destroy(kmem_bufctl_cache); + kmem_cache_destroy(kmem_bufctl_audit_cache); + kmem_cache_destroy(kmem_slab_cache); // Dont think this one + + // Some caches cannot be destroyed as + // they mutually reference each other. + // So we explicitly pull them apart piece-by-piece. 
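+ // (kmem_cache_fini() below does exactly that; the arenas carved out
+ // of kmem_metadata_arena are then destroyed before kmem_metadata_arena
+ // itself, and kernelheap_fini() finally undoes kernelheap_init().)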
+ kmem_cache_fini(); + + segkmem_zio_fini(); + + // Now destroy the vmem arenas used by kmem. + vmem_destroy(kmem_default_arena); + vmem_destroy(kmem_va_arena); + vmem_destroy(kmem_oversize_arena); + vmem_destroy(kmem_log_arena); + vmem_destroy(kmem_hash_arena); + vmem_destroy(kmem_cache_arena); + vmem_destroy(kmem_msb_arena); + vmem_destroy(kmem_metadata_arena); + + kernelheap_fini(); + + list_destroy(&kmem_caches); + + mutex_destroy(&kmem_cache_kstat_lock); + mutex_destroy(&kmem_flags_lock); + mutex_destroy(&kmem_cache_lock); +} + +static void +kmem_move_init(void) +{ + kmem_defrag_cache = kmem_cache_create("kmem_defrag_cache", + sizeof (kmem_defrag_t), 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + kmem_move_cache = kmem_cache_create("kmem_move_cache", + sizeof (kmem_move_t), 0, NULL, NULL, NULL, NULL, + kmem_msb_arena, KMC_NOHASH); + + /* + * kmem guarantees that move callbacks are sequential and that even + * across multiple caches no two moves ever execute simultaneously. + * Move callbacks are processed on a separate taskq so that client code + * does not interfere with internal maintenance tasks. + */ + kmem_move_taskq = taskq_create("kmem_move_taskq", 1, + minclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE); +} + +void +kmem_move_fini(void) +{ + + taskq_wait(kmem_move_taskq); + taskq_destroy(kmem_move_taskq); + kmem_move_taskq = 0; + + kmem_cache_destroy(kmem_move_cache); + kmem_cache_destroy(kmem_defrag_cache); + +} + +void +spl_kmem_thread_init(void) +{ + kmem_move_init(); + + // Initialize the spl_free locks + mutex_init(&spl_free_thread_lock, "spl_free_thead_lock", MUTEX_DEFAULT, NULL); + + kmem_taskq = taskq_create("kmem_taskq", 1, minclsyspri, + 300, INT_MAX, TASKQ_PREPOPULATE); + + spl_free_thread_exit = FALSE; + // zfsin/212 + (void) cv_init(&spl_free_thread_cv, NULL, CV_DEFAULT, NULL); + (void) thread_create(NULL, 0, spl_free_thread, 0, 0, 0, 0, 92); + + spl_event_thread_exit = FALSE; + (void)thread_create(NULL, 0, spl_event_thread, 0, 0, 0, 0, 92); +} + +void +spl_kmem_thread_fini(void) +{ + shutting_down = 1; + + if (low_mem_event != NULL) { + dprintf("SPL: stopping spl_event_thread\n"); + spl_event_thread_exit = TRUE; + KeSetEvent(low_mem_event, 0, FALSE); + while (spl_event_thread_exit) { + delay(hz >> 4); + } + dprintf("SPL: stopped spl_event_thread\n"); + } + + dprintf("SPL: stop spl_free_thread\n"); + mutex_enter(&spl_free_thread_lock); + dprintf("SPL: stop spl_free_thread, lock acquired, setting exit variable and waiting\n"); + spl_free_thread_exit = TRUE; + while (spl_free_thread_exit) { + cv_signal(&spl_free_thread_cv); + cv_wait(&spl_free_thread_cv, &spl_free_thread_lock); + } + dprintf("SPL: spl_free_thread stop: while loop ended, dropping mutex\n"); + mutex_exit(&spl_free_thread_lock); + dprintf("SPL: spl_free_thread stop: destroying cv and mutex\n"); + cv_destroy(&spl_free_thread_cv); + mutex_destroy(&spl_free_thread_lock); + + dprintf("SPL: bsd_untimeout\n"); + + //bsd_untimeout(kmem_update, 0); + //bsd_untimeout(kmem_reap_timeout, &kmem_reaping); + //bsd_untimeout(kmem_reap_timeout, &kmem_reaping_idspace); + bsd_untimeout(kmem_update, &kmem_update_timer); + bsd_untimeout(kmem_reap_timeout, &kmem_reaping); + bsd_untimeout(kmem_reap_timeout, &kmem_reaping_idspace); + +#if 0 + KeCancelTimer(&kmem_update_timer.timer); + KeCancelTimer(&kmem_reaping.timer); + KeCancelTimer(&kmem_reaping_idspace.timer); + kmem_free(kmem_reaping.timer_callback, sizeof(KDPC)); + kmem_free(kmem_reaping_idspace.timer_callback, sizeof(KDPC)); +#endif + + dprintf("SPL: wait for 
taskqs to empty\n"); + taskq_wait(kmem_taskq); + + dprintf("SPL: destroy taskq\n"); + taskq_destroy(kmem_taskq); + kmem_taskq = 0; + + kmem_move_fini(); + +} + +void +spl_kmem_mp_init(void) +{ + kmem_update_timeout(&kmem_update_timer); +} + +/* + * Return the slab of the allocated buffer, or NULL if the buffer is not + * allocated. This function may be called with a known slab address to determine + * whether or not the buffer is allocated, or with a NULL slab address to obtain + * an allocated buffer's slab. + */ +static kmem_slab_t * +kmem_slab_allocated(kmem_cache_t *cp, kmem_slab_t *sp, void *buf) +{ + kmem_bufctl_t *bcp, *bufbcp; + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(sp == NULL || KMEM_SLAB_MEMBER(sp, buf)); + + if (cp->cache_flags & KMF_HASH) { + for (bcp = *KMEM_HASH(cp, buf); + (bcp != NULL) && (bcp->bc_addr != buf); + bcp = bcp->bc_next) { + continue; + } + ASSERT(sp != NULL && bcp != NULL ? sp == bcp->bc_slab : 1); + return (bcp == NULL ? NULL : bcp->bc_slab); + } + + if (sp == NULL) { + DbgBreakPoint(); + sp = KMEM_SLAB(cp, buf); + } + bufbcp = KMEM_BUFCTL(cp, buf); + for (bcp = sp->slab_head; + (bcp != NULL) && (bcp != bufbcp); + bcp = bcp->bc_next) { + continue; + } + return (bcp == NULL ? sp : NULL); +} + +static boolean_t +kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags) +{ + long refcnt = sp->slab_refcnt; + + ASSERT(cp->cache_defrag != NULL); + + /* + * For code coverage we want to be able to move an object within the + * same slab (the only partial slab) even if allocating the destination + * buffer resulted in a completely allocated slab. + */ + if (flags & KMM_DEBUG) { + return ((flags & KMM_DESPERATE) || + ((sp->slab_flags & KMEM_SLAB_NOMOVE) == 0)); + } + + /* If we're desperate, we don't care if the client said NO. */ + if (flags & KMM_DESPERATE) { + return (refcnt < sp->slab_chunks); /* any partial */ + } + + if (sp->slab_flags & KMEM_SLAB_NOMOVE) { + return (B_FALSE); + } + + if ((refcnt == 1) || kmem_move_any_partial) { + return (refcnt < sp->slab_chunks); + } + + /* + * The reclaim threshold is adjusted at each kmem_cache_scan() so that + * slabs with a progressively higher percentage of used buffers can be + * reclaimed until the cache as a whole is no longer fragmented. + * + * sp->slab_refcnt kmd_reclaim_numer + * --------------- < ------------------ + * sp->slab_chunks KMEM_VOID_FRACTION + */ + return ((refcnt * KMEM_VOID_FRACTION) < + (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer)); +} + +static void * +kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf, + void *tbuf) +{ + int i; /* magazine round index */ + + for (i = 0; i < n; i++) { + if (buf == m->mag_round[i]) { + if (cp->cache_flags & KMF_BUFTAG) { + (void) kmem_cache_free_debug(cp, tbuf, + caller()); + } + m->mag_round[i] = tbuf; + return (buf); + } + } + + return (NULL); +} + +/* + * Hunt the magazine layer for the given buffer. If found, the buffer is + * removed from the magazine layer and returned, otherwise NULL is returned. + * The state of the returned buffer is freed and constructed. + */ +static void * +kmem_hunt_mags(kmem_cache_t *cp, void *buf) +{ + kmem_cpu_cache_t *ccp; + kmem_magazine_t *m; + int cpu_seqid; + int n; /* magazine rounds */ + void *tbuf; /* temporary swap buffer */ + + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + /* + * Allocated a buffer to swap with the one we hope to pull out of a + * magazine when found. 
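+ * (The swap keeps every magazine full: kmem_hunt_mag() above writes
+ * tbuf into the round slot it takes the wanted buffer from, so no
+ * magazine is ever left one round short.)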
+ */ + tbuf = kmem_cache_alloc(cp, KM_NOSLEEP); + if (tbuf == NULL) { + KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail); + return (NULL); + } + if (tbuf == buf) { + KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky); + if (cp->cache_flags & KMF_BUFTAG) { + (void) kmem_cache_free_debug(cp, buf, caller()); + } + return (buf); + } + + /* Hunt the depot. */ + mutex_enter(&cp->cache_depot_lock); + n = cp->cache_magtype->mt_magsize; + for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) { + if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { + mutex_exit(&cp->cache_depot_lock); + return (buf); + } + } + mutex_exit(&cp->cache_depot_lock); + + /* Hunt the per-CPU magazines. */ + for (cpu_seqid = 0; cpu_seqid < (int)max_ncpus; cpu_seqid++) { + ccp = &cp->cache_cpu[cpu_seqid]; + + mutex_enter(&ccp->cc_lock); + m = ccp->cc_loaded; + n = ccp->cc_rounds; + if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { + mutex_exit(&ccp->cc_lock); + return (buf); + } + m = ccp->cc_ploaded; + n = ccp->cc_prounds; + if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) { + mutex_exit(&ccp->cc_lock); + return (buf); + } + mutex_exit(&ccp->cc_lock); + } + + kmem_cache_free(cp, tbuf); + return (NULL); +} + +/* + * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(), + * or when the buffer is freed. + */ +static void +kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf) +{ + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, from_buf)); + + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + return; + } + + if (sp->slab_flags & KMEM_SLAB_NOMOVE) { + if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) { + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_flags &= ~KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = (uint32_t)-1; + avl_add(&cp->cache_partial_slabs, sp); + } + } else { + sp->slab_later_count = 0; + sp->slab_stuck_offset = (uint32_t)-1; + } +} + +static void +kmem_slab_move_no(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf) +{ + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, from_buf)); + + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + return; + } + + avl_remove(&cp->cache_partial_slabs, sp); + sp->slab_later_count = 0; + sp->slab_flags |= KMEM_SLAB_NOMOVE; + sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, from_buf); + avl_add(&cp->cache_partial_slabs, sp); +} + +static void kmem_move_end(kmem_cache_t *, kmem_move_t *); + +/* + * The move callback takes two buffer addresses, the buffer to be moved, and a + * newly allocated and constructed buffer selected by kmem as the destination. + * It also takes the size of the buffer and an optional user argument specified + * at cache creation time. kmem guarantees that the buffer to be moved has not + * been unmapped by the virtual memory subsystem. Beyond that, it cannot + * guarantee the present whereabouts of the buffer to be moved, so it is up to + * the client to safely determine whether or not it is still using the buffer. + * The client must not free either of the buffers passed to the move callback, + * since kmem wants to free them directly to the slab layer. 
The client response + * tells kmem which of the two buffers to free: + * + * YES kmem frees the old buffer (the move was successful) + * NO kmem frees the new buffer, marks the slab of the old buffer + * non-reclaimable to avoid bothering the client again + * LATER kmem frees the new buffer, increments slab_later_count + * DONT_KNOW kmem frees the new buffer, searches mags for the old buffer + * DONT_NEED kmem frees both the old buffer and the new buffer + * + * The pending callback argument now being processed contains both of the + * buffers (old and new) passed to the move callback function, the slab of the + * old buffer, and flags related to the move request, such as whether or not the + * system was desperate for memory. + * + * Slabs are not freed while there is a pending callback, but instead are kept + * on a deadlist, which is drained after the last callback completes. This means + * that slabs are safe to access until kmem_move_end(), no matter how many of + * their buffers have been freed. Once slab_refcnt reaches zero, it stays at + * zero for as long as the slab remains on the deadlist and until the slab is + * freed. + */ +static void +kmem_move_buffer(kmem_move_t *callback) +{ + kmem_cbrc_t response; + kmem_slab_t *sp = callback->kmm_from_slab; + kmem_cache_t *cp = sp->slab_cache; + boolean_t free_on_slab; + + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf)); + + /* + * The number of allocated buffers on the slab may have changed since we + * last checked the slab's reclaimability (when the pending move was + * enqueued), or the client may have responded NO when asked to move + * another buffer on the same slab. + */ + if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) { + KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable); + KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), + kmem_move_stats.kms_notify_no_longer_reclaimable); + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + /* + * Hunting magazines is expensive, so we'll wait to do that until the + * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer + * is cheap, so we might as well do that here in case we can avoid + * bothering the client. + */ + mutex_enter(&cp->cache_lock); + free_on_slab = (kmem_slab_allocated(cp, sp, + callback->kmm_from_buf) == NULL); + mutex_exit(&cp->cache_lock); + + if (free_on_slab) { + KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab); + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + if (cp->cache_flags & KMF_BUFTAG) { + /* + * Make kmem_cache_alloc_debug() apply the constructor for us. 
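+ * (If that fails, the pending move is abandoned via kmem_move_end()
+ * just below rather than calling the client's move callback.)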
+ */ + if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf, + KM_NOSLEEP, 1, caller()) != 0) { + KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail); + kmem_move_end(cp, callback); + return; + } + } else if (cp->cache_constructor != NULL && + cp->cache_constructor(callback->kmm_to_buf, cp->cache_private, + KM_NOSLEEP) != 0) { + atomic_inc_64(&cp->cache_alloc_fail); + KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail); + kmem_slab_free(cp, callback->kmm_to_buf); + kmem_move_end(cp, callback); + return; + } + + KMEM_STAT_ADD(kmem_move_stats.kms_callbacks); + KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY), + kmem_move_stats.kms_notify_callbacks); + cp->cache_defrag->kmd_callbacks++; + cp->cache_defrag->kmd_thread = spl_current_thread(); + cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf; + cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf; + DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *, + callback); + + response = cp->cache_move(callback->kmm_from_buf, + callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private); + + DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *, + callback, kmem_cbrc_t, response); + cp->cache_defrag->kmd_thread = NULL; + cp->cache_defrag->kmd_from_buf = NULL; + cp->cache_defrag->kmd_to_buf = NULL; + + if (response == KMEM_CBRC_YES) { + KMEM_STAT_ADD(kmem_move_stats.kms_yes); + cp->cache_defrag->kmd_yes++; + kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); + /* slab safe to access until kmem_move_end() */ + if (sp->slab_refcnt == 0) + cp->cache_defrag->kmd_slabs_freed++; + mutex_enter(&cp->cache_lock); + kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + kmem_move_end(cp, callback); + return; + } + + switch (response) { + case KMEM_CBRC_NO: + KMEM_STAT_ADD(kmem_move_stats.kms_no); + cp->cache_defrag->kmd_no++; + mutex_enter(&cp->cache_lock); + kmem_slab_move_no(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_LATER: + KMEM_STAT_ADD(kmem_move_stats.kms_later); + cp->cache_defrag->kmd_later++; + mutex_enter(&cp->cache_lock); + if (!KMEM_SLAB_IS_PARTIAL(sp)) { + mutex_exit(&cp->cache_lock); + break; + } + + if (++sp->slab_later_count >= KMEM_DISBELIEF) { + KMEM_STAT_ADD(kmem_move_stats.kms_disbelief); + kmem_slab_move_no(cp, sp, callback->kmm_from_buf); + } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) { + sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, + callback->kmm_from_buf); + } + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_DONT_NEED: + KMEM_STAT_ADD(kmem_move_stats.kms_dont_need); + cp->cache_defrag->kmd_dont_need++; + kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE); + if (sp->slab_refcnt == 0) + cp->cache_defrag->kmd_slabs_freed++; + mutex_enter(&cp->cache_lock); + kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + break; + case KMEM_CBRC_DONT_KNOW: + KMEM_STAT_ADD(kmem_move_stats.kms_dont_know); + cp->cache_defrag->kmd_dont_know++; + if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) { + KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag); + cp->cache_defrag->kmd_hunt_found++; + kmem_slab_free_constructed(cp, callback->kmm_from_buf, + B_TRUE); + if (sp->slab_refcnt == 0) + cp->cache_defrag->kmd_slabs_freed++; + mutex_enter(&cp->cache_lock); + kmem_slab_move_yes(cp, sp, callback->kmm_from_buf); + mutex_exit(&cp->cache_lock); + } + break; + default: + panic("'%s' (%p) unexpected move callback response %d\n", + cp->cache_name, (void *)cp, response); + } + + 
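+ // For the NO, LATER, DONT_NEED and DONT_KNOW responses we fall
+ // through to here and free the unused destination buffer;
+ // KMEM_CBRC_YES returned earlier after freeing the old buffer.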
kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE); + kmem_move_end(cp, callback); +} + +/* Return B_FALSE if there is insufficient memory for the move request. */ +static boolean_t +kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags) +{ + void *to_buf; + avl_index_t index; + kmem_move_t *callback, *pending; + ulong_t n; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + + callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP); + if (callback == NULL) { + KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail); + return (B_FALSE); + } + + callback->kmm_from_slab = sp; + callback->kmm_from_buf = buf; + callback->kmm_flags = flags; + + mutex_enter(&cp->cache_lock); + + n = avl_numnodes(&cp->cache_partial_slabs); + if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) { + mutex_exit(&cp->cache_lock); + kmem_cache_free(kmem_move_cache, callback); + return (B_TRUE); /* there is no need for the move request */ + } + + pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index); + if (pending != NULL) { + /* + * If the move is already pending and we're desperate now, + * update the move flags. + */ + if (flags & KMM_DESPERATE) { + pending->kmm_flags |= KMM_DESPERATE; + } + mutex_exit(&cp->cache_lock); + KMEM_STAT_ADD(kmem_move_stats.kms_already_pending); + kmem_cache_free(kmem_move_cache, callback); + return (B_TRUE); + } + + to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs), + B_FALSE); + callback->kmm_to_buf = to_buf; + avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index); + + mutex_exit(&cp->cache_lock); + + if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer, + callback, TQ_NOSLEEP)) { + KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail); + mutex_enter(&cp->cache_lock); + avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); + mutex_exit(&cp->cache_lock); + kmem_slab_free(cp, to_buf); + kmem_cache_free(kmem_move_cache, callback); + return (B_FALSE); + } + + return (B_TRUE); +} + +static void +kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback) +{ + avl_index_t index; + + ASSERT(cp->cache_defrag != NULL); + ASSERT(taskq_member(kmem_move_taskq, curthread)); + ASSERT(MUTEX_NOT_HELD(&cp->cache_lock)); + + mutex_enter(&cp->cache_lock); + VERIFY(avl_find(&cp->cache_defrag->kmd_moves_pending, + callback->kmm_from_buf, &index) != NULL); + avl_remove(&cp->cache_defrag->kmd_moves_pending, callback); + if (avl_is_empty(&cp->cache_defrag->kmd_moves_pending)) { + list_t *deadlist = &cp->cache_defrag->kmd_deadlist; + kmem_slab_t *sp; + + /* + * The last pending move completed. Release all slabs from the + * front of the dead list except for any slab at the tail that + * needs to be released from the context of kmem_move_buffers(). + * kmem deferred unmapping the buffers on these slabs in order + * to guarantee that buffers passed to the move callback have + * been touched only by kmem or by the client itself. 
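+ * (A slab still flagged KMEM_SLAB_MOVE_PENDING is put back at the
+ * tail and the drain stops there; kmem_move_buffers() deals with it
+ * after clearing that flag.)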
+ */ + while ((sp = list_remove_head(deadlist)) != NULL) { + if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) { + list_insert_tail(deadlist, sp); + break; + } + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); + mutex_enter(&cp->cache_lock); + } + } + mutex_exit(&cp->cache_lock); + kmem_cache_free(kmem_move_cache, callback); +} + +/* + * Move buffers from least used slabs first by scanning backwards from the end + * of the partial slab list. Scan at most max_scan candidate slabs and move + * buffers from at most max_slabs slabs (0 for all partial slabs in both cases). + * If desperate to reclaim memory, move buffers from any partial slab, otherwise + * skip slabs with a ratio of allocated buffers at or above the current + * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the + * scan is aborted) so that the caller can adjust the reclaimability threshold + * depending on how many reclaimable slabs it finds. + * + * kmem_move_buffers() drops and reacquires cache_lock every time it issues a + * move request, since it is not valid for kmem_move_begin() to call + * kmem_cache_alloc() or taskq_dispatch() with cache_lock held. + */ +static int +kmem_move_buffers(kmem_cache_t *cp, uint32_t max_scan, uint32_t max_slabs, + int flags) +{ + kmem_slab_t *sp; + void *buf; + int i, j; /* slab index, buffer index */ + int s; /* reclaimable slabs */ + int b; /* allocated (movable) buffers on reclaimable slab */ + boolean_t success; + int refcnt; + int nomove; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(MUTEX_HELD(&cp->cache_lock)); + ASSERT(kmem_move_cache != NULL); + ASSERT(cp->cache_move != NULL && cp->cache_defrag != NULL); + ASSERT((flags & KMM_DEBUG) ? !avl_is_empty(&cp->cache_partial_slabs) : + avl_numnodes(&cp->cache_partial_slabs) > 1); + + if (kmem_move_blocked) { + return (0); + } + + if (kmem_move_fulltilt) { + flags |= KMM_DESPERATE; + } + + if (max_scan == 0 || (flags & KMM_DESPERATE)) { + /* + * Scan as many slabs as needed to find the desired number of + * candidate slabs. + */ + max_scan = (uint32_t)-1; + } + + if (max_slabs == 0 || (flags & KMM_DESPERATE)) { + /* Find as many candidate slabs as possible. */ + max_slabs = (uint32_t)-1; + } + + sp = avl_last(&cp->cache_partial_slabs); + ASSERT(KMEM_SLAB_IS_PARTIAL(sp)); + for (i = 0, s = 0; (i < (int)max_scan) && (s < (int)max_slabs) && (sp != NULL) && + ((sp != avl_first(&cp->cache_partial_slabs)) || + (flags & KMM_DEBUG)); + sp = AVL_PREV(&cp->cache_partial_slabs, sp), i++) { + + if (!kmem_slab_is_reclaimable(cp, sp, flags)) { + continue; + } + s++; + + /* Look for allocated buffers to move. */ + for (j = 0, b = 0, buf = sp->slab_base; + (j < sp->slab_chunks) && (b < sp->slab_refcnt); + buf = (((char *)buf) + cp->cache_chunksize), j++) { + + if (kmem_slab_allocated(cp, sp, buf) == NULL) { + continue; + } + + b++; + + /* + * Prevent the slab from being destroyed while we drop + * cache_lock and while the pending move is not yet + * registered. Flag the pending move while + * kmd_moves_pending may still be empty, since we can't + * yet rely on a non-zero pending move count to prevent + * the slab from being destroyed. 
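+ * (The flag is set right below and cleared again once
+ * kmem_move_begin() has returned and cache_lock has been re-taken.)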
+ */ + ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING)); + sp->slab_flags |= KMEM_SLAB_MOVE_PENDING; + /* + * Recheck refcnt and nomove after reacquiring the lock, + * since these control the order of partial slabs, and + * we want to know if we can pick up the scan where we + * left off. + */ + refcnt = sp->slab_refcnt; + nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE); + mutex_exit(&cp->cache_lock); + + success = kmem_move_begin(cp, sp, buf, flags); + + /* + * Now, before the lock is reacquired, kmem could + * process all pending move requests and purge the + * deadlist, so that upon reacquiring the lock, sp has + * been remapped. Or, the client may free all the + * objects on the slab while the pending moves are still + * on the taskq. Therefore, the KMEM_SLAB_MOVE_PENDING + * flag causes the slab to be put at the end of the + * deadlist and prevents it from being destroyed, since + * we plan to destroy it here after reacquiring the + * lock. + */ + mutex_enter(&cp->cache_lock); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING; + + if (sp->slab_refcnt == 0) { + list_t *deadlist = + &cp->cache_defrag->kmd_deadlist; + list_remove(deadlist, sp); + + if (!avl_is_empty( + &cp->cache_defrag->kmd_moves_pending)) { + /* + * A pending move makes it unsafe to + * destroy the slab, because even though + * the move is no longer needed, the + * context where that is determined + * requires the slab to exist. + * Fortunately, a pending move also + * means we don't need to destroy the + * slab here, since it will get + * destroyed along with any other slabs + * on the deadlist after the last + * pending move completes. + */ + list_insert_head(deadlist, sp); + KMEM_STAT_ADD(kmem_move_stats. + kms_endscan_slab_dead); + return (-1); + } + + /* + * Destroy the slab now if it was completely + * freed while we dropped cache_lock and there + * are no pending moves. Since slab_refcnt + * cannot change once it reaches zero, no new + * pending moves from that slab are possible. + */ + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + KMEM_STAT_ADD(kmem_move_stats. + kms_dead_slabs_freed); + KMEM_STAT_ADD(kmem_move_stats. + kms_endscan_slab_destroyed); + mutex_enter(&cp->cache_lock); + /* + * Since we can't pick up the scan where we left + * off, abort the scan and say nothing about the + * number of reclaimable slabs. + */ + return (-1); + } + + if (!success) { + /* + * Abort the scan if there is not enough memory + * for the request and say nothing about the + * number of reclaimable slabs. + */ + KMEM_STAT_COND_ADD(s < (int)max_slabs, + kmem_move_stats.kms_endscan_nomem); + return (-1); + } + + /* + * The slab's position changed while the lock was + * dropped, so we don't know where we are in the + * sequence any more. + */ + if (sp->slab_refcnt != refcnt) { + /* + * If this is a KMM_DEBUG move, the slab_refcnt + * may have changed because we allocated a + * destination buffer on the same slab. In that + * case, we're not interested in counting it. 
+ */ + KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && + (s < (int)max_slabs), + kmem_move_stats.kms_endscan_refcnt_changed); + return (-1); + } + if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) { + KMEM_STAT_COND_ADD(s < (int)max_slabs, + kmem_move_stats.kms_endscan_nomove_changed); + return (-1); + } + + /* + * Generating a move request allocates a destination + * buffer from the slab layer, bumping the first partial + * slab if it is completely allocated. If the current + * slab becomes the first partial slab as a result, we + * can't continue to scan backwards. + * + * If this is a KMM_DEBUG move and we allocated the + * destination buffer from the last partial slab, then + * the buffer we're moving is on the same slab and our + * slab_refcnt has changed, causing us to return before + * reaching here if there are no partial slabs left. + */ + ASSERT(!avl_is_empty(&cp->cache_partial_slabs)); + if (sp == avl_first(&cp->cache_partial_slabs)) { + /* + * We're not interested in a second KMM_DEBUG + * move. + */ + goto end_scan; + } + } + } +end_scan: + + KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) && + (s < (int)max_slabs) && + (sp == avl_first(&cp->cache_partial_slabs)), + kmem_move_stats.kms_endscan_freelist); + + return (s); +} + +typedef struct kmem_move_notify_args { + kmem_cache_t *kmna_cache; + void *kmna_buf; +} kmem_move_notify_args_t; + +static void +kmem_cache_move_notify_task(void *arg) +{ + kmem_move_notify_args_t *args = arg; + kmem_cache_t *cp = args->kmna_cache; + void *buf = args->kmna_buf; + kmem_slab_t *sp; + + ASSERT(taskq_member(kmem_taskq, curthread)); + ASSERT(list_link_active(&cp->cache_link)); + + zfs_kmem_free(args, sizeof (kmem_move_notify_args_t)); + mutex_enter(&cp->cache_lock); + sp = kmem_slab_allocated(cp, NULL, buf); + + /* Ignore the notification if the buffer is no longer allocated. */ + if (sp == NULL) { + mutex_exit(&cp->cache_lock); + return; + } + + /* Ignore the notification if there's no reason to move the buffer. */ + if (avl_numnodes(&cp->cache_partial_slabs) > 1) { + /* + * So far the notification is not ignored. Ignore the + * notification if the slab is not marked by an earlier refusal + * to move a buffer. + */ + if (!(sp->slab_flags & KMEM_SLAB_NOMOVE) && + (sp->slab_later_count == 0)) { + mutex_exit(&cp->cache_lock); + return; + } + + kmem_slab_move_yes(cp, sp, buf); + ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING)); + sp->slab_flags |= KMEM_SLAB_MOVE_PENDING; + mutex_exit(&cp->cache_lock); + /* see kmem_move_buffers() about dropping the lock */ + (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY); + mutex_enter(&cp->cache_lock); + ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING); + sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING; + if (sp->slab_refcnt == 0) { + list_t *deadlist = &cp->cache_defrag->kmd_deadlist; + list_remove(deadlist, sp); + + if (!avl_is_empty( + &cp->cache_defrag->kmd_moves_pending)) { + list_insert_head(deadlist, sp); + mutex_exit(&cp->cache_lock); + KMEM_STAT_ADD(kmem_move_stats. 
+ kms_notify_slab_dead); + return; + } + + cp->cache_defrag->kmd_deadcount--; + cp->cache_slab_destroy++; + mutex_exit(&cp->cache_lock); + kmem_slab_destroy(cp, sp); + KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed); + KMEM_STAT_ADD(kmem_move_stats.kms_notify_slab_destroyed); + return; + } + } else { + kmem_slab_move_yes(cp, sp, buf); + } + mutex_exit(&cp->cache_lock); +} + +void +kmem_cache_move_notify(kmem_cache_t *cp, void *buf) +{ + kmem_move_notify_args_t *args; + + KMEM_STAT_ADD(kmem_move_stats.kms_notify); + args = zfs_kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP); + if (args != NULL) { + args->kmna_cache = cp; + args->kmna_buf = buf; + if (!taskq_dispatch(kmem_taskq, + (task_func_t *)kmem_cache_move_notify_task, args, + TQ_NOSLEEP)) + zfs_kmem_free(args, sizeof (kmem_move_notify_args_t)); + } +} + +static void +kmem_cache_defrag(kmem_cache_t *cp) +{ + uint32_t n; + + ASSERT(cp->cache_defrag != NULL); + + mutex_enter(&cp->cache_lock); + n = avl_numnodes(&cp->cache_partial_slabs); + if (n > 1) { + /* kmem_move_buffers() drops and reacquires cache_lock */ + KMEM_STAT_ADD(kmem_move_stats.kms_defrags); + cp->cache_defrag->kmd_defrags++; + (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE); + } + mutex_exit(&cp->cache_lock); +} + +/* Is this cache above the fragmentation threshold? */ +static boolean_t +kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree) +{ + /* + * nfree kmem_frag_numer + * ------------------ > --------------- + * cp->cache_buftotal kmem_frag_denom + */ + return ((nfree * kmem_frag_denom) > + (cp->cache_buftotal * kmem_frag_numer)); +} + +static boolean_t +kmem_cache_is_fragmented(kmem_cache_t *cp, boolean_t *doreap) +{ + boolean_t fragmented; + uint64_t nfree; + + ASSERT(MUTEX_HELD(&cp->cache_lock)); + *doreap = B_FALSE; + + if (kmem_move_fulltilt) { + if (avl_numnodes(&cp->cache_partial_slabs) > 1) { + return (B_TRUE); + } + } else { + if ((cp->cache_complete_slab_count + avl_numnodes( + &cp->cache_partial_slabs)) < kmem_frag_minslabs) { + return (B_FALSE); + } + } + + nfree = cp->cache_bufslab; + fragmented = ((avl_numnodes(&cp->cache_partial_slabs) > 1) && + kmem_cache_frag_threshold(cp, nfree)); + + /* + * Free buffers in the magazine layer appear allocated from the point of + * view of the slab layer. We want to know if the slab layer would + * appear fragmented if we included free buffers from magazines that + * have fallen out of the working set. + */ + if (!fragmented) { + long reap; + + mutex_enter(&cp->cache_depot_lock); + reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min); + reap = MIN(reap, cp->cache_full.ml_total); + mutex_exit(&cp->cache_depot_lock); + + nfree += ((uint64_t)reap * cp->cache_magtype->mt_magsize); + if (kmem_cache_frag_threshold(cp, nfree)) { + *doreap = B_TRUE; + } + } + + return (fragmented); +} + +/* Called periodically from kmem_taskq */ +static void +kmem_cache_scan(kmem_cache_t *cp) +{ + boolean_t reap = B_FALSE; + kmem_defrag_t *kmd; + + ASSERT(taskq_member(kmem_taskq, curthread)); + + mutex_enter(&cp->cache_lock); + + kmd = cp->cache_defrag; + if (kmd->kmd_consolidate > 0) { + kmd->kmd_consolidate--; + mutex_exit(&cp->cache_lock); + kmem_cache_reap(cp); + return; + } + + if (kmem_cache_is_fragmented(cp, &reap)) { + uint32_t slabs_found; + + /* + * Consolidate reclaimable slabs from the end of the partial + * slab list (scan at most kmem_reclaim_scan_range slabs to find + * reclaimable slabs). 
Keep track of how many candidate slabs we + * looked for and how many we actually found so we can adjust + * the definition of a candidate slab if we're having trouble + * finding them. + * + * kmem_move_buffers() drops and reacquires cache_lock. + */ + KMEM_STAT_ADD(kmem_move_stats.kms_scans); + kmd->kmd_scans++; + slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range, + kmem_reclaim_max_slabs, 0); + kmd->kmd_slabs_sought += kmem_reclaim_max_slabs; + kmd->kmd_slabs_found += slabs_found; + + if (++kmd->kmd_tries >= kmem_reclaim_scan_range) { + kmd->kmd_tries = 0; + + /* + * If we had difficulty finding candidate slabs in + * previous scans, adjust the threshold so that + * candidates are easier to find. + */ + if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) { + kmem_adjust_reclaim_threshold(kmd, -1); + } else if ((kmd->kmd_slabs_found * 2) < + kmd->kmd_slabs_sought) { + kmem_adjust_reclaim_threshold(kmd, 1); + } + kmd->kmd_slabs_sought = 0; + kmd->kmd_slabs_found = 0; + } + } else { + kmem_reset_reclaim_threshold(cp->cache_defrag); +#ifdef DEBUG + if (!avl_is_empty(&cp->cache_partial_slabs)) { + /* + * In a debug kernel we want the consolidator to + * run occasionally even when there is plenty of + * memory. + */ + uint16_t debug_rand; + + // smd: note that this only gets called for the dnode cache + // because only the dnode cache has kmem_cache_set_move() applied to it + // brendon says move is voluntary and "tricky" + // the reason this is not called is because the source is + // kmem_cache_update(), that only calls this function (kmem_cache_scan()) + // if there is a move/defrag (same thing) associated with it + // so hoist some of this code up to to kmem_cache_update + + (void) random_get_bytes((uint8_t *)&debug_rand, 2); + // dprintf("SPL: kmem_cache_scan debug_rand = %u, kmem_mtb_reap = %u, kmem_mtb_move = %u, mod1 %u, mod2 %u\n", + // debug_rand, kmem_mtb_reap, kmem_mtb_move, (debug_rand % kmem_mtb_reap), (debug_rand % kmem_mtb_move)); + if (!kmem_move_noreap && + ((debug_rand % kmem_mtb_reap) == 0)) { + mutex_exit(&cp->cache_lock); + KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps); + kmem_mtb_reap_count++; + // dprintf("SPL: kmem_cache_scan random debug reap %llu\n", kmem_move_stats.kms_debug_reaps); + // kmem_cache_reap(cp); // XXX + return; + } else if ((debug_rand % kmem_mtb_move) == 0) { + KMEM_STAT_ADD(kmem_move_stats.kms_scans); + KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans); + // dprintf("SPL: kmem_cache_scan random debug move scans=%llu debug_scans=%llu\n", + // kmem_move_stats.kms_scans, kmem_move_stats.kms_debug_scans); + kmd->kmd_scans++; + (void) kmem_move_buffers(cp, + kmem_reclaim_scan_range, 1, KMM_DEBUG); + } + } +#endif /* DEBUG */ + } + + mutex_exit(&cp->cache_lock); + + if (reap) { + KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps); + // kmem_depot_ws_reap(cp); // XXX + } +} + +// =============================================================== +// Status +// =============================================================== + + +uint64_t +kmem_size(void) +{ + return (total_memory); // smd +} + +// this is used in arc_reclaim_needed. if 1, reclaim is needed. +// returning 1 has the effect of throttling ARC, so be careful. 
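+ // (spl_vm_pool_low() simply inverts spl_minimal_physmem_p_logic():
+ // it returns 0, i.e. no reclaim needed, when that predicate is true,
+ // and 1 otherwise.)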
+int +spl_vm_pool_low(void) +{ + boolean_t m = spl_minimal_physmem_p_logic(); + + if (m) + return (0); + else + return (1); +} + +// =============================================================== +// String handling +// =============================================================== + +void +strfree(char *str) +{ + zfs_kmem_free(str, strlen(str) + 1); +} + +char * +kvasdprintf(const char *fmt, va_list ap) +{ + unsigned int len; + char *p = NULL; + va_list aq; + +// va_copy(aq, ap); +// len = _vsndprintf(NULL, 0, fmt, aq); +// va_end(aq); +// p = zfs_kmem_alloc(len+1, KM_SLEEP); + if (!p) + return (NULL); + + //_vsndprintf(p, len+1, fmt, ap); + + return (p); +} + +char * +kmem_vasdprintf(const char *fmt, va_list ap) +{ + va_list aq; + char *ptr = NULL; + +// do { +// va_copy(aq, ap); +// ptr = kvasdprintf(fmt, aq); +// va_end(aq); +// } while (ptr == NULL); + + return (ptr); +} + +char * +kmem_asdprintf(const char *fmt, ...) +{ + va_list ap; + char *ptr; + + do { + va_start(ap, fmt); + ptr = kvasdprintf(fmt, ap); + va_end(ap); + } while (ptr == NULL); + + return (ptr); +} + +char * +kmem_asprintf(const char *fmt, ...) +{ + int size; + va_list adx; + char *buf; + + va_start(adx, fmt); + size = _vsnprintf(NULL, 0, fmt, adx) + 1; + va_end(adx); + + buf = kmem_alloc(size, KM_SLEEP); + + va_start(adx, fmt); + (void)_vsnprintf(buf, size, fmt, adx); + va_end(adx); + + return (buf); +} + +/* Copyright (C) 2014 insane coder (http://insanecoding.blogspot.com/, http://asprintf.insanecoding.org/) */ +char * +kmem_vasprintf(const char *fmt, va_list ap) +{ + char *ptr; + int size; + int r = -1; + + size = vsnprintf(NULL, 0, fmt, ap); + if ((size >= 0) && (size < INT_MAX)) { + ptr = (char *)kmem_alloc(size + 1, KM_SLEEP); //+1 for null + if (ptr) { + r = vsnprintf(ptr, size + 1, fmt, ap); //+1 for null + if ((r < 0) || (r > size)) { + kmem_free(ptr, size); + r = -1; + } + } + } else { + ptr = 0; + } + + return(ptr); +} + +char * +kmem_strstr(const char *in, const char *str) +{ + char c; + uint32_t len; + + c = *str++; + if (!c) + return ((char *) in); // Trivial empty string case + + len = (uint32_t)strlen(str); + do { + char sc; + + do { + sc = *in++; + if (!sc) + return ((char *) 0); + } while (sc != c); + } while (strncmp(in, str, len) != 0); + + return ((char *) (in - 1)); +} + + +// suppress timer and related logic for this kmem cache can live here +// three new per-kmem-cache stats: counters: non-vba-success non-vba-fail; flag: arc_no_grow + +// from zfs/include/sys/spa.h + +#define SPA_MINBLOCKSHIFT 9 +#define SPA_MAXBLOCKSHIFT 24 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +typedef struct { + //_Atomic (kmem_cache_t *)cp_metadata; + //_Atomic (kmem_cache_t *)cp_filedata; + _Atomic kmem_cache_t *cp_metadata; + _Atomic kmem_cache_t *cp_filedata; + uint16_t pointed_to; + _Atomic int64_t suppress_count; + _Atomic uint64_t last_bumped; +} ksupp_t; + +typedef struct { + ksupp_t *ks_entry; +} iksupp_t; + +ksupp_t ksvec[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT] = { { NULL, NULL, FALSE, 0, 0 } }; +iksupp_t iksvec[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT] = { { NULL } }; + +static boolean_t spl_zio_no_grow_inited = FALSE; + +/* + * Test that cp is in ks->cp_metadata or ks->cp_filedata; if so just return + * otherwise, choose the first (and possibly second) NULL + * and try to set it to cp. + * If successful, return. 
otherwise, sanity check that + * nobody has set ks->cp_metadata or ks->cp_filedata to cp already, and + * that ks->cp_metadata != ks->cp_filedata. + */ + +static void +ks_set_cp(ksupp_t *ks, kmem_cache_t *cp, const uint32_t cachenum) +{ + + ASSERT(cp != NULL); + ASSERT(ks != NULL); + + if (ks->cp_metadata == cp || ks->cp_filedata == cp) + return; + + const uint64_t b = cachenum; + + boolean_t cp_is_metadata = FALSE; + + vmem_t *vmp = cp->cache_arena; + + ASSERT(vmp == zio_metadata_arena || vmp == zio_arena); + + if (vmp == zio_metadata_arena) + cp_is_metadata = TRUE; + + if (cp_is_metadata) { + for (uint32_t i = 0; ; i++) { + if (i >= 1000000) { + panic("SPL: %s: iterated out trying to set ks->cp_metadata (%s)\n", + __func__, cp->cache_name); + } + kmem_cache_t *expected = NULL; + //if (__c11_atomic_compare_exchange_strong(&ks->cp_metadata, &expected, cp, + // __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + if (InterlockedCompareExchangePointer(&ks->cp_metadata, cp, expected) == expected) { + dprintf("SPL: %s: set iskvec[%llu].ks->cp_metadata (%s) OK\n", + __func__, b, cp->cache_name); + return; + } else if (ks->cp_metadata == cp) { + return; + } else if (ks->cp_metadata == NULL) { + continue; + } else { + panic("%s: CAS failed for iksvec[%llu].ks->cp_metadata: %s wanted %s set\n", + __func__, b, cp->cache_name, ks->cp_metadata->cache_name); + } + } + } else { + for (int32_t j = 0; ; j++) { + if (j >= 1000000) { + panic("SPL: %s: iterated out trying to set ks->cp_filedata (%s)\n", + __func__, cp->cache_name); + } + kmem_cache_t *expected = NULL; + //if (__c11_atomic_compare_exchange_strong(&ks->cp_filedata, &expected, cp, + // __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + if (InterlockedCompareExchangePointer(&ks->cp_filedata, cp, expected) == expected) { + dprintf("SPL: %s: set iskvec[%llu].ks->cp_filedata (%s) OK\n", + __func__, b, cp->cache_name); + return; + } else if (ks->cp_filedata == cp) { + return; + } else if (ks->cp_filedata == NULL) { + continue; + } else { + panic("%s: CAS failed for iksvec[%llu].ks->cp_metadata: %s wanted %s set\n", + __func__, b, cp->cache_name, ks->cp_filedata->cache_name); + } + } + } +} + +void +spl_zio_no_grow_init(void) +{ + // this is the logic from zio.c:zio_init() + + ASSERT(spl_zio_no_grow_inited == FALSE); + + uint32_t c = 0; + + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + uint32_t size = (c+1) << SPA_MINBLOCKSHIFT; + uint32_t p2 = size; + uint32_t align = 0; + + while (!ISP2(p2)) + p2 &= p2 - 1; + + if (size <= 4 * SPA_MINBLOCKSIZE) { + align = SPA_MINBLOCKSIZE; + } else if (size <= 128 * 1024 && IS_P2ALIGNED(size, p2 >> 4)) { + align = MIN(p2 >> 4, PAGESIZE); + } else if (IS_P2ALIGNED(size, p2 >> 3)) { + align = MIN(p2 >> 3, PAGESIZE); + } + + if (align != 0) { + iksvec[c].ks_entry = &ksvec[c]; + iksvec[c].ks_entry->pointed_to++; + } + } + + while (--c != 0) { + ASSERT(iksvec[c].ks_entry != NULL); + ASSERT(iksvec[c].ks_entry->pointed_to > 0); + if (iksvec[c - 1].ks_entry == NULL) { + iksvec[c - 1].ks_entry = iksvec[c].ks_entry; + iksvec[c - 1].ks_entry->pointed_to++; + } + } + + spl_zio_no_grow_inited = TRUE; + + dprintf("SPL: %s done.\n", __func__); +} + +static void +spl_zio_no_grow_clear() +{ + for (uint32_t c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + ksupp_t *ks = iksvec[c].ks_entry; + ks->cp_metadata = NULL; + ks->cp_filedata = NULL; + ks->pointed_to = FALSE; + ks->suppress_count = 0; + ks->last_bumped = 0; + iksvec[c].ks_entry = NULL; + } +} + +void +spl_zio_no_grow_fini(void) +{ + // zio_fini() is at its end, so the 
kmem_caches are gone, + // consequently this is safe. + spl_zio_no_grow_inited = FALSE; + spl_zio_no_grow_clear(); + spl_zio_no_grow_init(); +} + +static void +spl_zio_set_no_grow(const uint32_t size, kmem_cache_t *cp, const uint32_t cachenum) +{ + ASSERT(spl_zio_no_grow_inited == TRUE); + ASSERT(iksvec[cachenum].ks_entry != NULL); + + ksupp_t *ks = iksvec[cachenum].ks_entry; + + // maybe update size->cp mapping vector + + ks_set_cp(ks, cp, cachenum); + + if (ks->cp_metadata != cp && ks->cp_filedata != cp) { + panic("ks_cp_set bad for %s", cp->cache_name); + } + + // suppress the bucket for two allocations (is _Atomic) + ks->suppress_count += 2; + ks->last_bumped = zfs_lbolt(); +} + +boolean_t +spl_zio_is_suppressed(const uint32_t size, const uint64_t now, const boolean_t buf_is_metadata, + kmem_cache_t **zp) +{ + + ASSERT(spl_zio_no_grow_inited == TRUE); + + const uint32_t cachenum = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(cachenum, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + ksupp_t *ks = iksvec[cachenum].ks_entry; + + if (ks == NULL) { + return (FALSE); + } else if (ks->pointed_to < 1) { + ASSERT(ks->pointed_to > 0); // throw an assertion + TraceEvent(TRACE_ERROR, "SPL: %s: ERROR: iksvec[%llu].ks_entry->pointed_to == %u for size %llu\n", + __func__, (uint64_t)cachenum, ks->pointed_to, (uint64_t)size); + return (FALSE); + } else if (ks->suppress_count == 0) { + return (FALSE); + } else { + const uint64_t two_minutes = 120 * hz; + if (ks->last_bumped + two_minutes >= now) { + ks->suppress_count = 0; + ks->last_bumped = now; + return (FALSE); + } else { + ks->suppress_count--; + } + if (buf_is_metadata) { + if (ks->cp_metadata == NULL) { + ks_set_cp(ks, zp[cachenum], cachenum); + if (ks->cp_metadata != NULL) { + atomic_inc_64(&ks->cp_metadata->arc_no_grow); + } else { + TraceEvent(TRACE_WARNING, "WARNING: %s: " + "ks_set_cp->metadata == NULL after ks_set_cp !" + "size = %lu\n", + __func__, size); + } + } else { + atomic_inc_64(&ks->cp_metadata->arc_no_grow); + } + } else { + if (ks->cp_filedata == NULL) { + ks_set_cp(ks, zp[cachenum], cachenum); + if (ks->cp_filedata != NULL) { + atomic_inc_64(&ks->cp_filedata->arc_no_grow); + } else { + TraceEvent(TRACE_WARNING, "WARNING: %s: " + "ks_set_cp->filedata == NULL after ks_set_cp !" + "size = %lu\n", + __func__, size); + } + } else { + atomic_inc_64(&ks->cp_filedata->arc_no_grow); + } + + } + return (TRUE); + } +} + + +/* + * spl_zio_kmem_cache_alloc(): try to get an allocation without descending to the bucket layer, + * and if that fails, set a flag for spl_arc_no_grow() then perform the allocation normally. + */ + +void * +spl_zio_kmem_cache_alloc(kmem_cache_t *cp, int kmflag, uint32_t size, uint32_t cachenum) +{ + // called by: + // spl_zio_kmem_cache_alloc(zio_buf_cache[size], kmflag, size, cachenum) or + // spl_zio_kmem_cache_alloc(zio_data_buf_cache[size], kmflag, size, cachenum) + // those are e.g. + // kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPAMINBLOCKSHIFT] + // and are indexed as uint32_t cachenum = (size - 1) >> SPA_MIN~BLOCKSHIFT + // VERIFY3U(cachenum, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + // try to get memory from no lower than the bucket_heap + void *m = kmem_cache_alloc(cp, kmflag | KM_NO_VBA | KM_NOSLEEP); + + if (m != NULL) { + atomic_inc_64(&cp->no_vba_success); + return (m); + } + + atomic_inc_64(&cp->no_vba_fail); + + // we will have to go below the bucket_heap to a bucket arena. 
+ // if the bucket arena cannot obviously satisfy the allocation, + // and xnu is tight for memory, then we turn on the no_grow suppression + + extern vmem_t *spl_vmem_bucket_arena_by_size(uint32_t); + extern uint64_t vmem_xnu_useful_bytes_free(void); + extern int vmem_canalloc_atomic(vmem_t *, uint32_t); + + vmem_t *bvmp = spl_vmem_bucket_arena_by_size(size); + + if (! vmem_canalloc_atomic(bvmp, size) && + vmem_xnu_useful_bytes_free() < 16ULL*1024ULL*1024ULL) { + spl_zio_set_no_grow(size, cp, cachenum); + atomic_inc_64(&cp->arc_no_grow_set); + } + + // perform the allocation as requested + void *n = kmem_cache_alloc(cp, kmflag); + + return(n); +} + + + +/* +* return true if the reclaim thread should be awakened +* because we do not have enough memory on hand +*/ +boolean_t +spl_arc_reclaim_needed(const size_t bytes, kmem_cache_t **zp) +{ + + /* + * fast path: + * if our argument is 0, then do the equivalent of + * if (arc_available_memory() < 0) return (B_TRUE); + * which is traditional arc.c appraoch + * so we can arc_reclaim_needed() -> spl_arc_reclaim_needed(0) + * if we desire. + */ + if (bytes == 0 && spl_free < 0) { + return (B_TRUE); + } + + // copy some code from zio_buf_alloc() + size_t c = (bytes - 1) >> SPA_MINBLOCKSHIFT; + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + // if there is free memory in the kmem cache slab layer + // then we do not have to reclaim + + if (zp[c]->cache_bufslab > 1) { + if (spl_free < 0) + atomic_inc_64(&spl_arc_reclaim_avoided); + return (B_FALSE); + } + + extern uint64_t vmem_xnu_useful_bytes_free(void); + const uint64_t min_threshold = 64ULL * 1024ULL * 1024ULL; + const uint64_t pm_pct = real_total_memory >> 8; + const uint64_t high_threshold = MAX(min_threshold, (uint64_t)pm_pct); + const uint64_t low_threshold = bytes; + + const uint64_t f = vmem_xnu_useful_bytes_free(); + + if (f <= low_threshold) { + return (B_TRUE); + } + else if (f > high_threshold) { + if (spl_free < 0) + atomic_inc_64(&spl_arc_reclaim_avoided); + return (B_FALSE); + } + + if (spl_free < 0) { + return (B_TRUE); + } + else { + return (B_FALSE); + } +} + +/* small auxiliary function since we do not export struct kmem_cache to zfs */ +size_t +kmem_cache_bufsize(kmem_cache_t *cp) +{ + return (cp->cache_bufsize); +} + +/* + * check that we would not have KMERR_BADCACHE error in the event + * we did kmem_cache_free(cp, buf) in a DEBUG setting + * + * returns: NULL if the buf is not found in any cache + * cparg if the buf is found in cparg + * a pointer to the cache the buf is found in, if not cparg + */ + +kmem_cache_t * +kmem_cache_buf_in_cache(kmem_cache_t *cparg, void *bufarg) +{ + kmem_cache_t *cp = cparg; + kmem_slab_t *sp; + void *buf = bufarg; + + sp = kmem_findslab(cp, buf); + if (sp == NULL) { + for (cp = list_tail(&kmem_caches); cp != NULL; + cp = list_prev(&kmem_caches, cp)) { + if ((sp = kmem_findslab(cp, buf)) != NULL) + break; + } + } + + if (sp == NULL) { + TraceEvent(TRACE_ERROR, "SPL: %s: KMERR_BADADDR orig cache = %s\n", + __func__, cparg->cache_name); + return (NULL); + } + + if (cp == NULL) { + TraceEvent(TRACE_ERROR, "SPL: %s: ERROR cp == NULL; cparg == %s", + __func__, cparg->cache_name); + return (NULL); + } + + if (cp != cparg) { + TraceEvent(TRACE_ERROR, "SPL: %s: KMERR_BADCACHE arg cache = %s but found in %s instead\n", + __func__, cparg->cache_name, cp->cache_name); + return(cp); + } + + ASSERT(cp==cparg); + + return (cp); +} diff --git a/module/os/windows/spl/spl-kstat.c b/module/os/windows/spl/spl-kstat.c new file mode 100644 index 
000000000000..2895206c3116 --- /dev/null +++ b/module/os/windows/spl/spl-kstat.c @@ -0,0 +1,1943 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +/* + * Provides an implementation of kstat that is backed by whatever windows has ? + */ + +#include +#include +#include +#include +#include + +/* kstat_fr.c */ + +/* +* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. +* Copyright 2014, Joyent, Inc. All rights reserved. +* Copyright 2015 Nexenta Systems, Inc. All rights reserved. +*/ + +/* +* Kernel statistics framework +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include +#include +//#include +#include +#include +#include +//#include +#include +#include +#include +//#include +#include +#include +//#include +//#include +#include +#include + +#include + +/* +* Global lock to protect the AVL trees and kstat_chain_id. +*/ +static kmutex_t kstat_chain_lock; + +/* +* Every install/delete kstat bumps kstat_chain_id. This is used by: +* +* (1) /dev/kstat, to detect changes in the kstat chain across ioctls; +* +* (2) kstat_create(), to assign a KID (kstat ID) to each new kstat. +* /dev/kstat uses the KID as a cookie for kstat lookups. +* +* We reserve the first two IDs because some kstats are created before +* the well-known ones (kstat_headers = 0, kstat_types = 1). +* +* We also bump the kstat_chain_id if a zone is gaining or losing visibility +* into a particular kstat, which is logically equivalent to a kstat being +* installed/deleted. +*/ + +kid_t kstat_chain_id = 2; + +/* +* As far as zones are concerned, there are 3 types of kstat: +* +* 1) Those which have a well-known name, and which should return per-zone data +* depending on which zone is doing the kstat_read(). sockfs:0:sock_unix_list +* is an example of this type of kstat. +* +* 2) Those which should only be exported to a particular list of zones. +* For example, in the case of nfs:*:mntinfo, we don't want zone A to be +* able to see NFS mounts associated with zone B, while we want the +* global zone to be able to see all mounts on the system. +* +* 3) Those that can be exported to all zones. Most system-related +* kstats fall within this category. +* +* An ekstat_t thus contains a list of kstats that the zone is to be +* exported to. The lookup of a name:instance:module thus translates to a +* lookup of name:instance:module:myzone; if the kstat is not exported +* to all zones, and does not have the caller's zoneid explicitly +* enumerated in the list of zones to be exported to, it is the same as +* if the kstat didn't exist. 
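+*
+* (In this Windows port kstats are normally created through
+* kstat_create(), which passes ALL_ZONES, so in practice the zone
+* checks below always match; the machinery is carried over from the
+* upstream kstat_fr.c.)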
+* +* Writing to kstats is currently disallowed from within a non-global +* zone, although this restriction could be removed in the future. +*/ +typedef struct kstat_zone { + zoneid_t zoneid; + struct kstat_zone *next; +} kstat_zone_t; + +/* +* Extended kstat structure -- for internal use only. +*/ +typedef struct ekstat { + kstat_t e_ks; /* the kstat itself */ + size_t e_size; /* total allocation size */ + kthread_t *e_owner; /* thread holding this kstat */ + kcondvar_t e_cv; /* wait for owner == NULL */ + avl_node_t e_avl_bykid; /* AVL tree to sort by KID */ + avl_node_t e_avl_byname; /* AVL tree to sort by name */ + kstat_zone_t e_zone; /* zone to export stats to */ +} ekstat_t; + +static uint64_t kstat_initial[8192]; +static void *kstat_initial_ptr = kstat_initial; +static size_t kstat_initial_avail = sizeof(kstat_initial); +static vmem_t *kstat_arena; + +#define KSTAT_ALIGN (sizeof (uint64_t)) + +static avl_tree_t kstat_avl_bykid; +static avl_tree_t kstat_avl_byname; + +/* +* Various pointers we need to create kstats at boot time in kstat_init() +*/ +extern kstat_named_t *segmapcnt_ptr; +extern uint_t segmapcnt_ndata; +extern int segmap_kstat_update(kstat_t *, int); +extern kstat_named_t *biostats_ptr; +extern uint_t biostats_ndata; +extern kstat_named_t *pollstats_ptr; +extern uint_t pollstats_ndata; + +extern int vac; +extern uint_t nproc; +extern time_t boot_time; + +static struct { + char name[KSTAT_STRLEN]; + size_t size; + uint_t min_ndata; + uint_t max_ndata; +} kstat_data_type[KSTAT_NUM_TYPES] = { + { "raw", 1, 0, INT_MAX }, + { "name=value", sizeof(kstat_named_t), 0, INT_MAX }, + { "interrupt", sizeof(kstat_intr_t), 1, 1 }, + { "i/o", sizeof(kstat_io_t), 1, 1 }, + { "event_timer", sizeof(kstat_timer_t), 0, INT_MAX }, +}; + +static int header_kstat_update(kstat_t *, int); +static int header_kstat_snapshot(kstat_t *, void *, int); + +int +kstat_zone_find(kstat_t *k, zoneid_t zoneid) +{ + ekstat_t *e = (ekstat_t *)k; + kstat_zone_t *kz; + + ASSERT(MUTEX_HELD(&kstat_chain_lock)); + for (kz = &e->e_zone; kz != NULL; kz = kz->next) { + if (zoneid == ALL_ZONES || kz->zoneid == ALL_ZONES) + return (1); + if (zoneid == kz->zoneid) + return (1); + } + return (0); +} + +void +kstat_zone_remove(kstat_t *k, zoneid_t zoneid) +{ + ekstat_t *e = (ekstat_t *)k; + kstat_zone_t *kz, *t = NULL; + + mutex_enter(&kstat_chain_lock); + if (zoneid == e->e_zone.zoneid) { + kz = e->e_zone.next; + ASSERT(kz != NULL); + e->e_zone.zoneid = kz->zoneid; + e->e_zone.next = kz->next; + goto out; + } + for (kz = &e->e_zone; kz->next != NULL; kz = kz->next) { + if (kz->next->zoneid == zoneid) { + t = kz->next; + kz->next = t->next; + break; + } + } + ASSERT(t != NULL); /* we removed something */ + kz = t; +out: + kstat_chain_id++; + mutex_exit(&kstat_chain_lock); + kmem_free(kz, sizeof(*kz)); +} + +void +kstat_zone_add(kstat_t *k, zoneid_t zoneid) +{ + ekstat_t *e = (ekstat_t *)k; + kstat_zone_t *kz; + + kz = kmem_alloc(sizeof(*kz), KM_NOSLEEP); + if (kz == NULL) + return; + mutex_enter(&kstat_chain_lock); + kz->zoneid = zoneid; + kz->next = e->e_zone.next; + e->e_zone.next = kz; + kstat_chain_id++; + mutex_exit(&kstat_chain_lock); +} + +/* +* Compare the list of zones for the given kstats, returning 0 if they match +* (ie, one list contains ALL_ZONES or both lists contain the same zoneid). +* In practice, this is called indirectly by kstat_hold_byname(), so one of the +* two lists always has one element, and this is an O(n) operation rather than +* O(n^2). 
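+* A return of 0 makes the two entries compare equal as far as the AVL
+* comparators below are concerned, so zone visibility serves as the
+* final tie-breaker in both the by-KID and by-name orderings.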
+*/ +static int +kstat_zone_compare(ekstat_t *e1, ekstat_t *e2) +{ + kstat_zone_t *kz1, *kz2; + + ASSERT(MUTEX_HELD(&kstat_chain_lock)); + for (kz1 = &e1->e_zone; kz1 != NULL; kz1 = kz1->next) { + for (kz2 = &e2->e_zone; kz2 != NULL; kz2 = kz2->next) { + if (kz1->zoneid == ALL_ZONES || + kz2->zoneid == ALL_ZONES) + return (0); + if (kz1->zoneid == kz2->zoneid) + return (0); + } + } + return (e1->e_zone.zoneid < e2->e_zone.zoneid ? -1 : 1); +} + +/* +* Support for keeping kstats sorted in AVL trees for fast lookups. +*/ +static int +kstat_compare_bykid(const void *a1, const void *a2) +{ + const kstat_t *k1 = a1; + const kstat_t *k2 = a2; + + if (k1->ks_kid < k2->ks_kid) + return (-1); + if (k1->ks_kid > k2->ks_kid) + return (1); + return (kstat_zone_compare((ekstat_t *)k1, (ekstat_t *)k2)); +} + +static int +kstat_compare_byname(const void *a1, const void *a2) +{ + const kstat_t *k1 = a1; + const kstat_t *k2 = a2; + int s; + + s = strcmp(k1->ks_module, k2->ks_module); + if (s > 0) + return (1); + if (s < 0) + return (-1); + + if (k1->ks_instance < k2->ks_instance) + return (-1); + if (k1->ks_instance > k2->ks_instance) + return (1); + + s = strcmp(k1->ks_name, k2->ks_name); + if (s > 0) + return (1); + if (s < 0) + return (-1); + + return (kstat_zone_compare((ekstat_t *)k1, (ekstat_t *)k2)); +} + +static kstat_t * +kstat_hold(avl_tree_t *t, ekstat_t *template) +{ + kstat_t *ksp; + ekstat_t *e; + + mutex_enter(&kstat_chain_lock); + for (;;) { + ksp = avl_find(t, template, NULL); + if (ksp == NULL) + break; + e = (ekstat_t *)ksp; + if (e->e_owner == NULL) { + e->e_owner = (void *)curthread; + break; + } + cv_wait(&e->e_cv, &kstat_chain_lock); + } + mutex_exit(&kstat_chain_lock); + return (ksp); +} + +void +kstat_rele(kstat_t *ksp) +{ + ekstat_t *e = (ekstat_t *)ksp; + + mutex_enter(&kstat_chain_lock); + ASSERT(e->e_owner == (void *)curthread); + e->e_owner = NULL; + cv_broadcast(&e->e_cv); + mutex_exit(&kstat_chain_lock); +} + +kstat_t * +kstat_hold_bykid(kid_t kid, zoneid_t zoneid) +{ + ekstat_t e; + + e.e_ks.ks_kid = kid; + e.e_zone.zoneid = zoneid; + e.e_zone.next = NULL; + + return (kstat_hold(&kstat_avl_bykid, &e)); +} + +kstat_t * +kstat_hold_byname(const char *ks_module, int ks_instance, const char *ks_name, + zoneid_t ks_zoneid) +{ + ekstat_t e; + + kstat_set_string(e.e_ks.ks_module, ks_module); + e.e_ks.ks_instance = ks_instance; + kstat_set_string(e.e_ks.ks_name, ks_name); + e.e_zone.zoneid = ks_zoneid; + e.e_zone.next = NULL; + return (kstat_hold(&kstat_avl_byname, &e)); +} + +static ekstat_t * +kstat_alloc(size_t size) +{ + ekstat_t *e = NULL; + + size = P2ROUNDUP(sizeof(ekstat_t) + size, KSTAT_ALIGN); + + if (kstat_arena == NULL) { + if (size <= kstat_initial_avail) { + e = kstat_initial_ptr; + kstat_initial_ptr = (char *)kstat_initial_ptr + size; + kstat_initial_avail -= size; + } + } else { + e = vmem_alloc(kstat_arena, size, VM_NOSLEEP); + } + + if (e != NULL) { + bzero(e, size); + e->e_size = size; + cv_init(&e->e_cv, NULL, CV_DEFAULT, NULL); + } + + return (e); +} + +static void +kstat_free(ekstat_t *e) +{ + cv_destroy(&e->e_cv); + vmem_free(kstat_arena, e, e->e_size); +} + +extern vmem_t *heap_arena; +void *segkmem_alloc(vmem_t *vmp, uint32_t size, int vmflag); +void segkmem_free(vmem_t *vmp, void *inaddr, uint32_t size); + +/* +* Create various system kstats. +*/ +void +kstat_init(void) +{ + kstat_t *ksp; + ekstat_t *e; + avl_tree_t *t = &kstat_avl_bykid; + + /* + * Set up the kstat vmem arena. 
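+ *
+ * The arena is layered over the same static kstat_initial[] buffer
+ * that kstat_alloc() hands out from before this point, and it imports
+ * further space from heap_arena via segkmem_alloc() once that buffer
+ * is exhausted. The vmem_xalloc() loop below marks any early
+ * allocations as in-use so the arena never hands them out again.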
+ */ + kstat_arena = vmem_create("kstat", + (void *)kstat_initial, sizeof(kstat_initial), KSTAT_ALIGN, + segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP); + + /* + * Make initial kstats appear as though they were allocated. + */ + for (e = avl_first(t); e != NULL; e = avl_walk(t, e, AVL_AFTER)) + (void) vmem_xalloc(kstat_arena, e->e_size, KSTAT_ALIGN, + 0, 0, e, (char *)e + e->e_size, + VM_NOSLEEP | VM_BESTFIT | VM_PANIC); + + /* + * The mother of all kstats. The first kstat in the system, which + * always has KID 0, has the headers for all kstats (including itself) + * as its data. Thus, the kstat driver does not need any special + * interface to extract the kstat chain. + */ + kstat_chain_id = 0; + ksp = kstat_create("unix", 0, "kstat_headers", "kstat", KSTAT_TYPE_RAW, + 0, KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_VAR_SIZE); + if (ksp) { + ksp->ks_lock = &kstat_chain_lock; + ksp->ks_update = header_kstat_update; + ksp->ks_snapshot = header_kstat_snapshot; + kstat_install(ksp); + } else { + panic("cannot create kstat 'kstat_headers'"); + } + + ksp = kstat_create("unix", 0, "kstat_types", "kstat", + KSTAT_TYPE_NAMED, KSTAT_NUM_TYPES, 0); + if (ksp) { + int i; + kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); + + for (i = 0; i < KSTAT_NUM_TYPES; i++) { + kstat_named_init(&kn[i], kstat_data_type[i].name, + KSTAT_DATA_ULONG); + kn[i].value.ul = i; + } + kstat_install(ksp); + } + +} + +/* +* Caller of this should ensure that the string pointed by src +* doesn't change while kstat's lock is held. Not doing so defeats +* kstat's snapshot strategy as explained in +*/ +void +kstat_named_setstr(kstat_named_t *knp, const char *src) +{ + if (knp->data_type != KSTAT_DATA_STRING) + panic("kstat_named_setstr('%p', '%p'): " + "named kstat is not of type KSTAT_DATA_STRING", + (void *)knp, (void *)src); + + KSTAT_NAMED_STR_PTR(knp) = (char *)src; + if (src != NULL) + KSTAT_NAMED_STR_BUFLEN(knp) = strlen(src) + 1; + else + KSTAT_NAMED_STR_BUFLEN(knp) = 0; +} + +void +kstat_set_string(char *dst, const char *src) +{ + bzero(dst, KSTAT_STRLEN); + (void)strncpy(dst, src, KSTAT_STRLEN - 1); +} + +void +kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type) +{ + kstat_set_string(knp->name, name); + knp->data_type = data_type; + + if (data_type == KSTAT_DATA_STRING) + kstat_named_setstr(knp, NULL); +} + +void +kstat_timer_init(kstat_timer_t *ktp, const char *name) +{ + kstat_set_string(ktp->name, name); +} + +/* ARGSUSED */ +static int +default_kstat_update(kstat_t *ksp, int rw) +{ + uint_t i; + size_t len = 0; + kstat_named_t *knp; + + /* + * Named kstats with variable-length long strings have a standard + * way of determining how much space is needed to hold the snapshot: + */ + if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED && + (ksp->ks_flags & (KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_LONGSTRINGS))) { + + /* + * Add in the space required for the strings + */ + knp = KSTAT_NAMED_PTR(ksp); + for (i = 0; i < ksp->ks_ndata; i++, knp++) { + if (knp->data_type == KSTAT_DATA_STRING) + len += KSTAT_NAMED_STR_BUFLEN(knp); + } + ksp->ks_data_size = + ksp->ks_ndata * sizeof(kstat_named_t) + len; + } + return (0); +} + +static int +default_kstat_snapshot(kstat_t *ksp, void *buf, int rw) +{ + kstat_io_t *kiop; + hrtime_t cur_time; + size_t namedsz; + + ksp->ks_snaptime = cur_time = gethrtime(); + + if (rw == KSTAT_WRITE) { + if (!(ksp->ks_flags & KSTAT_FLAG_WRITABLE)) + return (EACCES); + bcopy(buf, ksp->ks_data, ksp->ks_data_size); + return (0); + } + + /* + * KSTAT_TYPE_NAMED kstats are defined to have 
ks_ndata + * number of kstat_named_t structures, followed by an optional + * string segment. The ks_data generally holds only the + * kstat_named_t structures. So we copy it first. The strings, + * if any, are copied below. For other kstat types, ks_data holds the + * entire buffer. + */ + + namedsz = sizeof(kstat_named_t) * ksp->ks_ndata; + if (ksp->ks_type == KSTAT_TYPE_NAMED && ksp->ks_data_size > namedsz) + bcopy(ksp->ks_data, buf, namedsz); + else + bcopy(ksp->ks_data, buf, ksp->ks_data_size); + + /* + * Apply kstat type-specific data massaging + */ + switch (ksp->ks_type) { + + case KSTAT_TYPE_IO: + /* + * Normalize time units and deal with incomplete transactions + */ +#if 0 + kiop = (kstat_io_t *)buf; + + scalehrtime(&kiop->wtime); + scalehrtime(&kiop->wlentime); + scalehrtime(&kiop->wlastupdate); + scalehrtime(&kiop->rtime); + scalehrtime(&kiop->rlentime); + scalehrtime(&kiop->rlastupdate); + + if (kiop->wcnt != 0) { + /* like kstat_waitq_exit */ + hrtime_t wfix = cur_time - kiop->wlastupdate; + kiop->wlastupdate = cur_time; + kiop->wlentime += kiop->wcnt * wfix; + kiop->wtime += wfix; + } + + if (kiop->rcnt != 0) { + /* like kstat_runq_exit */ + hrtime_t rfix = cur_time - kiop->rlastupdate; + kiop->rlastupdate = cur_time; + kiop->rlentime += kiop->rcnt * rfix; + kiop->rtime += rfix; + } +#endif + break; + + case KSTAT_TYPE_NAMED: + /* + * Massage any long strings in at the end of the buffer + */ + if (ksp->ks_data_size > namedsz) { + uint_t i; + kstat_named_t *knp = buf; + char *dst = (char *)(knp + ksp->ks_ndata); + /* + * Copy strings and update pointers + */ + for (i = 0; i < ksp->ks_ndata; i++, knp++) { + if (knp->data_type == KSTAT_DATA_STRING && + KSTAT_NAMED_STR_PTR(knp) != NULL) { + bcopy(KSTAT_NAMED_STR_PTR(knp), dst, + KSTAT_NAMED_STR_BUFLEN(knp)); + KSTAT_NAMED_STR_PTR(knp) = dst; + dst += KSTAT_NAMED_STR_BUFLEN(knp); + } + } + ASSERT(dst <= ((char *)buf + ksp->ks_data_size)); + } + break; + } + return (0); +} + +static int +header_kstat_update(kstat_t *header_ksp, int rw) +{ + int nkstats = 0; + ekstat_t *e; + avl_tree_t *t = &kstat_avl_bykid; + zoneid_t zoneid; + + if (rw == KSTAT_WRITE) + return (EACCES); + + ASSERT(MUTEX_HELD(&kstat_chain_lock)); + + zoneid = getzoneid(); + for (e = avl_first(t); e != NULL; e = avl_walk(t, e, AVL_AFTER)) { + if (kstat_zone_find((kstat_t *)e, zoneid) && + (e->e_ks.ks_flags & KSTAT_FLAG_INVALID) == 0) { + nkstats++; + } + } + header_ksp->ks_ndata = nkstats; + header_ksp->ks_data_size = nkstats * sizeof(kstat_t); + return (0); +} + +/* +* Copy out the data section of kstat 0, which consists of the list +* of all kstat headers. By specification, these headers must be +* copied out in order of increasing KID. 
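+* Walking kstat_avl_bykid from avl_first() with AVL_AFTER, as done
+* below, yields exactly that ascending-KID order.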
+*/ +static int +header_kstat_snapshot(kstat_t *header_ksp, void *buf, int rw) +{ + ekstat_t *e; + avl_tree_t *t = &kstat_avl_bykid; + zoneid_t zoneid; + + header_ksp->ks_snaptime = gethrtime(); + + if (rw == KSTAT_WRITE) + return (EACCES); + + ASSERT(MUTEX_HELD(&kstat_chain_lock)); + + zoneid = getzoneid(); + for (e = avl_first(t); e != NULL; e = avl_walk(t, e, AVL_AFTER)) { + if (kstat_zone_find((kstat_t *)e, zoneid) && + (e->e_ks.ks_flags & KSTAT_FLAG_INVALID) == 0) { + bcopy(&e->e_ks, buf, sizeof(kstat_t)); + buf = (char *)buf + sizeof(kstat_t); + } + } + + return (0); +} + +kstat_t * +kstat_create(const char *ks_module, int ks_instance, const char *ks_name, + const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags) +{ + return (kstat_create_zone(ks_module, ks_instance, ks_name, ks_class, + ks_type, ks_ndata, ks_flags, ALL_ZONES)); +} + +/* +* Allocate and initialize a kstat structure. Or, if a dormant kstat with +* the specified name exists, reactivate it. Returns a pointer to the kstat +* on success, NULL on failure. The kstat will not be visible to the +* kstat driver until kstat_install(). +*/ +kstat_t * +kstat_create_zone(const char *ks_module, int ks_instance, const char *ks_name, + const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags, + zoneid_t ks_zoneid) +{ + size_t ks_data_size; + kstat_t *ksp; + ekstat_t *e; + avl_index_t where; + char namebuf[KSTAT_STRLEN + 16]; + + if (avl_numnodes(&kstat_avl_bykid) == 0) { + avl_create(&kstat_avl_bykid, kstat_compare_bykid, + sizeof(ekstat_t), offsetof(struct ekstat, e_avl_bykid)); + + avl_create(&kstat_avl_byname, kstat_compare_byname, + sizeof(ekstat_t), offsetof(struct ekstat, e_avl_byname)); + } + + /* + * If ks_name == NULL, set the ks_name to . + */ + if (ks_name == NULL) { + char buf[KSTAT_STRLEN]; + kstat_set_string(buf, ks_module); + (void)sprintf(namebuf, "%s%d", buf, ks_instance); + ks_name = namebuf; + } + + /* + * Make sure it's a valid kstat data type + */ + if (ks_type >= KSTAT_NUM_TYPES) { + cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " + "invalid kstat type %d", + ks_module, ks_instance, ks_name, ks_type); + return (NULL); + } + + /* + * Don't allow persistent virtual kstats -- it makes no sense. + * ks_data points to garbage when the client goes away. + */ + if ((ks_flags & KSTAT_FLAG_PERSISTENT) && + (ks_flags & KSTAT_FLAG_VIRTUAL)) { + cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " + "cannot create persistent virtual kstat", + ks_module, ks_instance, ks_name); + return (NULL); + } + + /* + * Don't allow variable-size physical kstats, since the framework's + * memory allocation for physical kstat data is fixed at creation time. + */ + if ((ks_flags & KSTAT_FLAG_VAR_SIZE) && + !(ks_flags & KSTAT_FLAG_VIRTUAL)) { + cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " + "cannot create variable-size physical kstat", + ks_module, ks_instance, ks_name); + return (NULL); + } + + /* + * Make sure the number of data fields is within legal range + */ + if (ks_ndata < kstat_data_type[ks_type].min_ndata || + ks_ndata > kstat_data_type[ks_type].max_ndata) { + cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " + "ks_ndata=%d out of range [%d, %d]", + ks_module, ks_instance, ks_name, (int)ks_ndata, + kstat_data_type[ks_type].min_ndata, + kstat_data_type[ks_type].max_ndata); + return (NULL); + } + + ks_data_size = kstat_data_type[ks_type].size * ks_ndata; + + /* + * If the named kstat already exists and is dormant, reactivate it. 
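+ * (A kstat becomes dormant when a KSTAT_FLAG_PERSISTENT kstat is
+ * kstat_delete()d: its header and data are kept around so that a later
+ * kstat_create() with the same name can adopt them here.)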
+ */ + ksp = kstat_hold_byname(ks_module, ks_instance, ks_name, ks_zoneid); + if (ksp != NULL) { + if (!(ksp->ks_flags & KSTAT_FLAG_DORMANT)) { + /* + * The named kstat exists but is not dormant -- + * this is a kstat namespace collision. + */ + kstat_rele(ksp); + cmn_err(CE_WARN, + "kstat_create('%s', %d, '%s'): namespace collision", + ks_module, ks_instance, ks_name); + return (NULL); + } + if ((strcmp(ksp->ks_class, ks_class) != 0) || + (ksp->ks_type != ks_type) || + (ksp->ks_ndata != ks_ndata) || + (ks_flags & KSTAT_FLAG_VIRTUAL)) { + /* + * The name is the same, but the other key parameters + * differ from those of the dormant kstat -- bogus. + */ + kstat_rele(ksp); + cmn_err(CE_WARN, "kstat_create('%s', %d, '%s'): " + "invalid reactivation of dormant kstat", + ks_module, ks_instance, ks_name); + return (NULL); + } + /* + * Return dormant kstat pointer to caller. As usual, + * the kstat is marked invalid until kstat_install(). + */ + ksp->ks_flags |= KSTAT_FLAG_INVALID; + kstat_rele(ksp); + return (ksp); + } + + /* + * Allocate memory for the new kstat header and, if this is a physical + * kstat, the data section. + */ + e = kstat_alloc(ks_flags & KSTAT_FLAG_VIRTUAL ? 0 : ks_data_size); + if (e == NULL) { + cmn_err(CE_NOTE, "kstat_create('%s', %d, '%s'): " + "insufficient kernel memory", + ks_module, ks_instance, ks_name); + return (NULL); + } + + /* + * Initialize as many fields as we can. The caller may reset + * ks_lock, ks_update, ks_private, and ks_snapshot as necessary. + * Creators of virtual kstats may also reset ks_data. It is + * also up to the caller to initialize the kstat data section, + * if necessary. All initialization must be complete before + * calling kstat_install(). + */ + e->e_zone.zoneid = ks_zoneid; + e->e_zone.next = NULL; + + ksp = &e->e_ks; + ksp->ks_crtime = gethrtime(); + kstat_set_string(ksp->ks_module, ks_module); + ksp->ks_instance = ks_instance; + kstat_set_string(ksp->ks_name, ks_name); + ksp->ks_type = ks_type; + kstat_set_string(ksp->ks_class, ks_class); + ksp->ks_flags = ks_flags | KSTAT_FLAG_INVALID; + if (ks_flags & KSTAT_FLAG_VIRTUAL) + ksp->ks_data = NULL; + else + ksp->ks_data = (void *)(e + 1); + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_data_size; + ksp->ks_snaptime = ksp->ks_crtime; + ksp->ks_update = default_kstat_update; + ksp->ks_private = NULL; + ksp->ks_snapshot = default_kstat_snapshot; + ksp->ks_lock = NULL; + + mutex_enter(&kstat_chain_lock); + + /* + * Add our kstat to the AVL trees. + */ + if (avl_find(&kstat_avl_byname, e, &where) != NULL) { + mutex_exit(&kstat_chain_lock); + cmn_err(CE_WARN, + "kstat_create('%s', %d, '%s'): namespace collision", + ks_module, ks_instance, ks_name); + kstat_free(e); + return (NULL); + } + avl_insert(&kstat_avl_byname, e, where); + + /* + * Loop around until we find an unused KID. + */ + do { + ksp->ks_kid = kstat_chain_id++; + } while (avl_find(&kstat_avl_bykid, e, &where) != NULL); + avl_insert(&kstat_avl_bykid, e, where); + + mutex_exit(&kstat_chain_lock); + + return (ksp); +} + +/* +* Activate a fully initialized kstat and make it visible to /dev/kstat. +*/ +void +kstat_install(kstat_t *ksp) +{ + zoneid_t zoneid = ((ekstat_t *)ksp)->e_zone.zoneid; + + /* + * If this is a variable-size kstat, it MUST provide kstat data locking + * to prevent data-size races with kstat readers. 
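+ * (read_kstat_data() sizes its copy buffer from ks_data_size while
+ * holding ks_lock across KSTAT_UPDATE() and KSTAT_SNAPSHOT(); without
+ * a provider-supplied lock the size could change between those calls.)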
+ */ + if ((ksp->ks_flags & KSTAT_FLAG_VAR_SIZE) && ksp->ks_lock == NULL) { + panic("kstat_install('%s', %d, '%s'): " + "cannot create variable-size kstat without data lock", + ksp->ks_module, ksp->ks_instance, ksp->ks_name); + } + + if (kstat_hold_bykid(ksp->ks_kid, zoneid) != ksp) { + cmn_err(CE_WARN, "kstat_install(%p): does not exist", + (void *)ksp); + return; + } + + if (ksp->ks_type == KSTAT_TYPE_NAMED && ksp->ks_data != NULL) { + uint_t i; + kstat_named_t *knp = KSTAT_NAMED_PTR(ksp); + + for (i = 0; i < ksp->ks_ndata; i++, knp++) { + if (knp->data_type == KSTAT_DATA_STRING) { + ksp->ks_flags |= KSTAT_FLAG_LONGSTRINGS; + break; + } + } + /* + * The default snapshot routine does not handle KSTAT_WRITE + * for long strings. + */ + if ((ksp->ks_flags & KSTAT_FLAG_LONGSTRINGS) && + (ksp->ks_flags & KSTAT_FLAG_WRITABLE) && + (ksp->ks_snapshot == default_kstat_snapshot)) { + panic("kstat_install('%s', %d, '%s'): " + "named kstat containing KSTAT_DATA_STRING " + "is writable but uses default snapshot routine", + ksp->ks_module, ksp->ks_instance, ksp->ks_name); + } + } + + if (ksp->ks_flags & KSTAT_FLAG_DORMANT) { + + /* + * We are reactivating a dormant kstat. Initialize the + * caller's underlying data to the value it had when the + * kstat went dormant, and mark the kstat as active. + * Grab the provider's kstat lock if it's not already held. + */ + kmutex_t *lp = ksp->ks_lock; + if (lp != NULL && MUTEX_NOT_HELD(lp)) { + mutex_enter(lp); + (void)KSTAT_UPDATE(ksp, KSTAT_WRITE); + mutex_exit(lp); + } else { + (void)KSTAT_UPDATE(ksp, KSTAT_WRITE); + } + ksp->ks_flags &= ~KSTAT_FLAG_DORMANT; + } + + /* + * Now that the kstat is active, make it visible to the kstat driver. + * When copying out kstats the count is determined in + * header_kstat_update() and actually copied into kbuf in + * header_kstat_snapshot(). kstat_chain_lock is held across the two + * calls to ensure that this list doesn't change. Thus, we need to + * also take the lock to ensure that the we don't copy the new kstat + * in the 2nd pass and overrun the buf. + */ + mutex_enter(&kstat_chain_lock); + ksp->ks_flags &= ~KSTAT_FLAG_INVALID; + mutex_exit(&kstat_chain_lock); + kstat_rele(ksp); +} + +/* +* Remove a kstat from the system. Or, if it's a persistent kstat, +* just update the data and mark it as dormant. +*/ +void +kstat_delete(kstat_t *ksp) +{ + kmutex_t *lp; + ekstat_t *e = (ekstat_t *)ksp; + zoneid_t zoneid; + kstat_zone_t *kz; + + ASSERT(ksp != NULL); + + if (ksp == NULL) + return; + + zoneid = e->e_zone.zoneid; + + lp = ksp->ks_lock; + + if (lp != NULL && MUTEX_HELD(lp)) { + panic("kstat_delete(%p): caller holds data lock %p", + (void *)ksp, (void *)lp); + } + + if (kstat_hold_bykid(ksp->ks_kid, zoneid) != ksp) { + cmn_err(CE_WARN, "kstat_delete(%p): does not exist", + (void *)ksp); + return; + } + + if (ksp->ks_flags & KSTAT_FLAG_PERSISTENT) { + /* + * Update the data one last time, so that all activity + * prior to going dormant has been accounted for. + */ + KSTAT_ENTER(ksp); + (void)KSTAT_UPDATE(ksp, KSTAT_READ); + KSTAT_EXIT(ksp); + + /* + * Mark the kstat as dormant and restore caller-modifiable + * fields to default values, so the kstat is readable during + * the dormant phase. 
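+ * (The matching reactivation path is the dormant-kstat branch in
+ * kstat_create_zone() above.)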
+ */ + ksp->ks_flags |= KSTAT_FLAG_DORMANT; + ksp->ks_lock = NULL; + ksp->ks_update = default_kstat_update; + ksp->ks_private = NULL; + ksp->ks_snapshot = default_kstat_snapshot; + kstat_rele(ksp); + return; + } + + /* + * Remove the kstat from the framework's AVL trees, + * free the allocated memory, and increment kstat_chain_id so + * /dev/kstat clients can detect the event. + */ + mutex_enter(&kstat_chain_lock); + avl_remove(&kstat_avl_bykid, e); + avl_remove(&kstat_avl_byname, e); + kstat_chain_id++; + mutex_exit(&kstat_chain_lock); + + kz = e->e_zone.next; + while (kz != NULL) { + kstat_zone_t *t = kz; + + kz = kz->next; + kmem_free(t, sizeof(*t)); + } + kstat_rele(ksp); + kstat_free(e); +} + +void +kstat_delete_byname_zone(const char *ks_module, int ks_instance, + const char *ks_name, zoneid_t ks_zoneid) +{ + kstat_t *ksp; + + ksp = kstat_hold_byname(ks_module, ks_instance, ks_name, ks_zoneid); + if (ksp != NULL) { + kstat_rele(ksp); + kstat_delete(ksp); + } +} + +void +kstat_delete_byname(const char *ks_module, int ks_instance, const char *ks_name) +{ + kstat_delete_byname_zone(ks_module, ks_instance, ks_name, ALL_ZONES); +} + +/* +* The sparc V9 versions of these routines can be much cheaper than +* the poor 32-bit compiler can comprehend, so they're in sparcv9_subr.s. +* For simplicity, however, we always feed the C versions to lint. +*/ +#if !defined(__sparc) || defined(lint) || defined(__lint) + +void +kstat_waitq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt++; + if (wcnt != 0) { + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; + } +} + +void +kstat_waitq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt--; + ASSERT((int)wcnt > 0); + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; +} + +void +kstat_runq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt++; + if (rcnt != 0) { + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; + } +} + +void +kstat_runq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt--; + ASSERT((int)rcnt > 0); + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; +} + +void +kstat_waitq_to_runq(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt, rcnt; + + new = gethrtime(); + + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt--; + ASSERT((int)wcnt > 0); + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; + + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt++; + if (rcnt != 0) { + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; + } +} + +void +kstat_runq_back_to_waitq(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt, rcnt; + + new = gethrtime(); + + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt--; + ASSERT((int)rcnt > 0); + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; + + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt++; + if (wcnt != 0) { + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; + } +} + +#endif + +void +kstat_timer_start(kstat_timer_t *ktp) +{ + ktp->start_time = 
gethrtime(); +} + +void +kstat_timer_stop(kstat_timer_t *ktp) +{ + hrtime_t etime; + u_longlong_t num_events; + + ktp->stop_time = etime = gethrtime(); + etime -= ktp->start_time; + num_events = ktp->num_events; + if (etime < ktp->min_time || num_events == 0) + ktp->min_time = etime; + if (etime > ktp->max_time) + ktp->max_time = etime; + ktp->elapsed_time += etime; + ktp->num_events = num_events + 1; +} + +/* io/kstat.c */ + +/* + * kernel statistics driver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include +//#include +#include +#include +#include +#include +#include +//#include +#include +#include +#include +#include +#include + +static dev_info_t *kstat_devi; + +static int +read_kstat_data(int *rvalp, void *user_ksp, int flag) +{ + kstat_t user_kstat, *ksp; +#ifdef _MULTI_DATAMODEL + kstat32_t user_kstat32; +#endif + void *kbuf = NULL; + size_t kbufsize, ubufsize, copysize, allocsize; + int error = 0; + uint_t model; + +#define DDI_MODEL_NONE 0 +// switch (model = ddi_model_convert_from(flag & FMODELS)) { + switch (model = DDI_MODEL_NONE) { +#ifdef _MULTI_DATAMODEL + case DDI_MODEL_ILP32: + if (copyin(user_ksp, &user_kstat32, sizeof(kstat32_t)) != 0) + return (EFAULT); + user_kstat.ks_kid = user_kstat32.ks_kid; + user_kstat.ks_data = (void *)(uintptr_t)user_kstat32.ks_data; + user_kstat.ks_data_size = (size_t)user_kstat32.ks_data_size; + break; +#endif + default: + case DDI_MODEL_NONE: + if (ddi_copyin(user_ksp, &user_kstat, sizeof(kstat_t), 0) != 0) + return (EFAULT); + } + + ksp = kstat_hold_bykid(user_kstat.ks_kid, getzoneid()); + if (ksp == NULL) { + /* + * There is no kstat with the specified KID + */ + return (ENXIO); + } + if (ksp->ks_flags & KSTAT_FLAG_INVALID) { + /* + * The kstat exists, but is momentarily in some + * indeterminate state (e.g. the data section is not + * yet initialized). Try again in a few milliseconds. + */ + kstat_rele(ksp); + return (EAGAIN); + } + + /* + * If it's a fixed-size kstat, allocate the buffer now, so we + * don't have to do it under the kstat's data lock. (If it's a + * var-size kstat or one with long strings, we don't know the size + * until after the update routine is called, so we can't do this + * optimization.) + * The allocator relies on this behavior to prevent recursive + * mutex_enter in its (fixed-size) kstat update routine. + * It's a zalloc to prevent unintentional exposure of random + * juicy morsels of (old) kernel data. + */ + if (!(ksp->ks_flags & (KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_LONGSTRINGS))) { + kbufsize = ksp->ks_data_size; + allocsize = kbufsize + 1; + kbuf = kmem_zalloc(allocsize, KM_NOSLEEP); + if (kbuf == NULL) { + kstat_rele(ksp); + return (EAGAIN); + } + } + KSTAT_ENTER(ksp); + if ((error = KSTAT_UPDATE(ksp, KSTAT_READ)) != 0) { + KSTAT_EXIT(ksp); + kstat_rele(ksp); + if (kbuf != NULL) + kmem_free(kbuf, allocsize); + return (error); + } + + kbufsize = ksp->ks_data_size; + ubufsize = user_kstat.ks_data_size; + + if (ubufsize < kbufsize) { + error = ENOMEM; + } else { + if (kbuf == NULL) { + allocsize = kbufsize + 1; + kbuf = kmem_zalloc(allocsize, KM_NOSLEEP); + } + if (kbuf == NULL) { + error = EAGAIN; + } else { + error = KSTAT_SNAPSHOT(ksp, kbuf, KSTAT_READ); + } + } + + /* + * The following info must be returned to user level, + * even if the the update or snapshot failed. This allows + * kstat readers to get a handle on variable-size kstats, + * detect dormant kstats, etc. 
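+ * (On Windows the chain id cannot be handed back through *rvalp, so it
+ * is stored in ks_returnvalue of the header that is copied back out to
+ * userland at the end of this function.)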
+ */ + user_kstat.ks_ndata = ksp->ks_ndata; + user_kstat.ks_data_size = kbufsize; + user_kstat.ks_flags = ksp->ks_flags; + user_kstat.ks_snaptime = ksp->ks_snaptime; +#ifndef _WIN32 + *rvalp = kstat_chain_id; +#else + // The above doesn't work, as rvalp refers to the userland struct, before copyin() + // and we need to write value to kernel version. + user_kstat.ks_returnvalue = kstat_chain_id; +#endif + KSTAT_EXIT(ksp); + kstat_rele(ksp); + + if (kbuf == NULL) + goto out; + + /* + * Copy the buffer containing the kstat back to userland. + */ + copysize = kbufsize; + + switch (model) { + int i; +#ifdef _MULTI_DATAMODEL + kstat32_t *k32; + kstat_t *k; + + case DDI_MODEL_ILP32: + + if (ksp->ks_type == KSTAT_TYPE_NAMED) { + kstat_named_t *kn = kbuf; + char *strbuf = (char *)((kstat_named_t *)kn + + ksp->ks_ndata); + + for (i = 0; i < user_kstat.ks_ndata; kn++, i++) + switch (kn->data_type) { + /* + * Named statistics have fields of type 'long'. + * For a 32-bit application looking at a 64-bit + * kernel, forcibly truncate these 64-bit + * quantities to 32-bit values. + */ + case KSTAT_DATA_LONG: + kn->value.i32 = (int32_t)kn->value.l; + kn->data_type = KSTAT_DATA_INT32; + break; + case KSTAT_DATA_ULONG: + kn->value.ui32 = (uint32_t)kn->value.ul; + kn->data_type = KSTAT_DATA_UINT32; + break; + /* + * Long strings must be massaged before being + * copied out to userland. Do that here. + */ + case KSTAT_DATA_STRING: + if (KSTAT_NAMED_STR_PTR(kn) == NULL) + break; + /* + * If the string lies outside of kbuf + * copy it there and update the pointer. + */ + if (KSTAT_NAMED_STR_PTR(kn) < + (char *)kbuf || + KSTAT_NAMED_STR_PTR(kn) + + KSTAT_NAMED_STR_BUFLEN(kn) > + (char *)kbuf + kbufsize + 1) { + bcopy(KSTAT_NAMED_STR_PTR(kn), + strbuf, + KSTAT_NAMED_STR_BUFLEN(kn)); + + KSTAT_NAMED_STR_PTR(kn) = + strbuf; + strbuf += + KSTAT_NAMED_STR_BUFLEN(kn); + ASSERT(strbuf <= + (char *)kbuf + + kbufsize + 1); + } + /* + * The offsets within the buffers are + * the same, so add the offset to the + * beginning of the new buffer to fix + * the pointer. + */ + KSTAT_NAMED_STR_PTR(kn) = + (char *)user_kstat.ks_data + + (KSTAT_NAMED_STR_PTR(kn) - + (char *)kbuf); + /* + * Make sure the string pointer lies + * within the allocated buffer. + */ + ASSERT(KSTAT_NAMED_STR_PTR(kn) + + KSTAT_NAMED_STR_BUFLEN(kn) <= + ((char *)user_kstat.ks_data + + ubufsize)); + ASSERT(KSTAT_NAMED_STR_PTR(kn) >= + (char *)((kstat_named_t *) + user_kstat.ks_data + + user_kstat.ks_ndata)); + /* + * Cast 64-bit ptr to 32-bit. + */ + kn->value.str.addr.ptr32 = + (caddr32_t)(uintptr_t) + KSTAT_NAMED_STR_PTR(kn); + break; + default: + break; + } + } + + if (user_kstat.ks_kid != 0) + break; + + /* + * This is the special case of the kstat header + * list for the entire system. Reshape the + * array in place, then copy it out. + */ + k32 = kbuf; + k = kbuf; + for (i = 0; i < user_kstat.ks_ndata; k32++, k++, i++) { + k32->ks_crtime = k->ks_crtime; + k32->ks_next = 0; + k32->ks_kid = k->ks_kid; + (void)strcpy(k32->ks_module, k->ks_module); + k32->ks_resv = k->ks_resv; + k32->ks_instance = k->ks_instance; + (void)strcpy(k32->ks_name, k->ks_name); + k32->ks_type = k->ks_type; + (void)strcpy(k32->ks_class, k->ks_class); + k32->ks_flags = k->ks_flags; + k32->ks_data = 0; + k32->ks_ndata = k->ks_ndata; + if (k->ks_data_size > UINT32_MAX) { + error = EOVERFLOW; + break; + } + k32->ks_data_size = (size32_t)k->ks_data_size; + k32->ks_snaptime = k->ks_snaptime; + } + + /* + * XXX In this case we copy less data than is + * claimed in the header. 
+ */ + copysize = user_kstat.ks_ndata * sizeof(kstat32_t); + break; +#endif /* _MULTI_DATAMODEL */ + default: + case DDI_MODEL_NONE: + if (ksp->ks_type == KSTAT_TYPE_NAMED) { + kstat_named_t *kn = kbuf; + char *strbuf = (char *)((kstat_named_t *)kn + + ksp->ks_ndata); + + for (i = 0; i < user_kstat.ks_ndata; kn++, i++) + switch (kn->data_type) { +#ifdef _LP64 + case KSTAT_DATA_LONG: + kn->data_type = + KSTAT_DATA_INT64; + break; + case KSTAT_DATA_ULONG: + kn->data_type = + KSTAT_DATA_UINT64; + break; +#endif /* _LP64 */ + case KSTAT_DATA_STRING: + if (KSTAT_NAMED_STR_PTR(kn) == NULL) + break; + /* + * If the string lies outside of kbuf + * copy it there and update the pointer. + */ + if (KSTAT_NAMED_STR_PTR(kn) < + (char *)kbuf || + KSTAT_NAMED_STR_PTR(kn) + + KSTAT_NAMED_STR_BUFLEN(kn) > + (char *)kbuf + kbufsize + 1) { + bcopy(KSTAT_NAMED_STR_PTR(kn), + strbuf, + KSTAT_NAMED_STR_BUFLEN(kn)); + + KSTAT_NAMED_STR_PTR(kn) = + strbuf; + strbuf += + KSTAT_NAMED_STR_BUFLEN(kn); + ASSERT(strbuf <= + (char *)kbuf + + kbufsize + 1); + } + + KSTAT_NAMED_STR_PTR(kn) = + (char *)user_kstat.ks_data + + (KSTAT_NAMED_STR_PTR(kn) - + (char *)kbuf); + ASSERT(KSTAT_NAMED_STR_PTR(kn) + + KSTAT_NAMED_STR_BUFLEN(kn) <= + ((char *)user_kstat.ks_data + + ubufsize)); + ASSERT(KSTAT_NAMED_STR_PTR(kn) >= + (char *)((kstat_named_t *) + user_kstat.ks_data + + user_kstat.ks_ndata)); + break; + default: + break; + } + } + break; + } + + if (error == 0 && + ddi_copyout(kbuf, user_kstat.ks_data, copysize, 0)) + error = EFAULT; + kmem_free(kbuf, allocsize); + +out: + /* + * We have modified the ks_ndata, ks_data_size, ks_flags, and + * ks_snaptime fields of the user kstat; now copy it back to userland. + */ + switch (model) { +#ifdef _MULTI_DATAMODEL + case DDI_MODEL_ILP32: + if (kbufsize > UINT32_MAX) { + error = EOVERFLOW; + break; + } + user_kstat32.ks_ndata = user_kstat.ks_ndata; + user_kstat32.ks_data_size = (size32_t)kbufsize; + user_kstat32.ks_flags = user_kstat.ks_flags; + user_kstat32.ks_snaptime = user_kstat.ks_snaptime; + if (copyout(&user_kstat32, user_ksp, sizeof(kstat32_t)) && + error == 0) + error = EFAULT; + break; +#endif + default: + case DDI_MODEL_NONE: + // If we have an errorcode, set it in ks_errnovalue + // Above sets returnvalue with *rval = + // Must be done before this copyout() + user_kstat.ks_errnovalue = 0; + if (error) { + user_kstat.ks_errnovalue = error; + user_kstat.ks_returnvalue = -1; + } + if (ddi_copyout(&user_kstat, user_ksp, sizeof(kstat_t), 0) && + error == 0) + error = EFAULT; + break; + } + + return (error); +} + +static int +write_kstat_data(int *rvalp, void *user_ksp, int flag, cred_t *cred) +{ + kstat_t user_kstat, *ksp; + void *buf = NULL; + size_t bufsize; + int error = 0; + + if (secpolicy_sys_config(cred, B_FALSE) != 0) + return (EPERM); + +// switch (ddi_model_convert_from(flag & FMODELS)) { + switch (DDI_MODEL_NONE) { +#ifdef _MULTI_DATAMODEL + kstat32_t user_kstat32; + + case DDI_MODEL_ILP32: + if (copyin(user_ksp, &user_kstat32, sizeof(kstat32_t))) + return (EFAULT); + /* + * These are the only fields we actually look at. 
+ */ + user_kstat.ks_kid = user_kstat32.ks_kid; + user_kstat.ks_data = (void *)(uintptr_t)user_kstat32.ks_data; + user_kstat.ks_data_size = (size_t)user_kstat32.ks_data_size; + user_kstat.ks_ndata = user_kstat32.ks_ndata; + break; +#endif + default: + case DDI_MODEL_NONE: + if (ddi_copyin(user_ksp, &user_kstat, sizeof(kstat_t), 0)) + return (EFAULT); + } + + bufsize = user_kstat.ks_data_size; + buf = kmem_alloc(bufsize + 1, KM_NOSLEEP); + if (buf == NULL) + return (EAGAIN); + + if (ddi_copyin(user_kstat.ks_data, buf, bufsize, 0)) { + kmem_free(buf, bufsize + 1); + return (EFAULT); + } + + ksp = kstat_hold_bykid(user_kstat.ks_kid, getzoneid()); + if (ksp == NULL) { + kmem_free(buf, bufsize + 1); + return (ENXIO); + } + if (ksp->ks_flags & KSTAT_FLAG_INVALID) { + kstat_rele(ksp); + kmem_free(buf, bufsize + 1); + return (EAGAIN); + } + if (!(ksp->ks_flags & KSTAT_FLAG_WRITABLE)) { + kstat_rele(ksp); + kmem_free(buf, bufsize + 1); + return (EACCES); + } + + /* + * With KSTAT_FLAG_VAR_SIZE, one must call the kstat's update callback + * routine to ensure ks_data_size is up to date. + * In this case it makes sense to do it anyhow, as it will be shortly + * followed by a KSTAT_SNAPSHOT(). + */ + KSTAT_ENTER(ksp); + error = KSTAT_UPDATE(ksp, KSTAT_READ); + if (error || user_kstat.ks_data_size != ksp->ks_data_size || + user_kstat.ks_ndata != ksp->ks_ndata) { + KSTAT_EXIT(ksp); + kstat_rele(ksp); + kmem_free(buf, bufsize + 1); + return (error ? error : EINVAL); + } + + /* + * We have to ensure that we don't accidentally change the type of + * existing kstat_named statistics when writing over them. + * Since read_kstat_data() modifies some of the types on their way + * out, we need to be sure to handle these types seperately. + */ + if (ksp->ks_type == KSTAT_TYPE_NAMED) { + void *kbuf; + kstat_named_t *kold; + kstat_named_t *knew = buf; + int i; + +#ifdef _MULTI_DATAMODEL + int model = ddi_model_convert_from(flag & FMODELS); +#endif + + /* + * Since ksp->ks_data may be NULL, we need to take a snapshot + * of the published data to look at the types. + */ + kbuf = kmem_alloc(bufsize + 1, KM_NOSLEEP); + if (kbuf == NULL) { + KSTAT_EXIT(ksp); + kstat_rele(ksp); + kmem_free(buf, bufsize + 1); + return (EAGAIN); + } + error = KSTAT_SNAPSHOT(ksp, kbuf, KSTAT_READ); + if (error) { + KSTAT_EXIT(ksp); + kstat_rele(ksp); + kmem_free(kbuf, bufsize + 1); + kmem_free(buf, bufsize + 1); + return (error); + } + kold = kbuf; + + /* + * read_kstat_data() changes the types of + * KSTAT_DATA_LONG / KSTAT_DATA_ULONG, so we need to + * make sure that these (modified) types are considered + * valid. 
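+ * (When _MULTI_DATAMODEL is not defined, as is presumably the case for
+ * this port, only the KSTAT_DATA_STRING pointer checks in the loop
+ * below and the type-equality pass after it actually take effect.)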
+ */ + for (i = 0; i < ksp->ks_ndata; i++, kold++, knew++) { + switch (kold->data_type) { +#ifdef _MULTI_DATAMODEL + case KSTAT_DATA_LONG: + switch (model) { + case DDI_MODEL_ILP32: + if (knew->data_type == + KSTAT_DATA_INT32) { + knew->value.l = + (long)knew->value.i32; + knew->data_type = + KSTAT_DATA_LONG; + } + break; + default: + case DDI_MODEL_NONE: +#ifdef _LP64 + if (knew->data_type == + KSTAT_DATA_INT64) { + knew->value.l = + (long)knew->value.i64; + knew->data_type = + KSTAT_DATA_LONG; + } +#endif /* _LP64 */ + break; + } + break; + case KSTAT_DATA_ULONG: + switch (model) { + case DDI_MODEL_ILP32: + if (knew->data_type == + KSTAT_DATA_UINT32) { + knew->value.ul = + (ulong_t)knew->value.ui32; + knew->data_type = + KSTAT_DATA_ULONG; + } + break; + default: + case DDI_MODEL_NONE: +#ifdef _LP64 + if (knew->data_type == + KSTAT_DATA_UINT64) { + knew->value.ul = + (ulong_t)knew->value.ui64; + knew->data_type = + KSTAT_DATA_ULONG; + } +#endif /* _LP64 */ + break; + } + break; +#endif /* _MULTI_DATAMODEL */ + case KSTAT_DATA_STRING: + if (knew->data_type != KSTAT_DATA_STRING) { + KSTAT_EXIT(ksp); + kstat_rele(ksp); + kmem_free(kbuf, bufsize + 1); + kmem_free(buf, bufsize + 1); + return (EINVAL); + } + +#ifdef _MULTI_DATAMODEL + if (model == DDI_MODEL_ILP32) + KSTAT_NAMED_STR_PTR(knew) = + (char *)(uintptr_t) + knew->value.str.addr.ptr32; +#endif + /* + * Nothing special for NULL + */ + if (KSTAT_NAMED_STR_PTR(knew) == NULL) + break; + + /* + * Check to see that the pointers all point + * to within the buffer and after the array + * of kstat_named_t's. + */ + if (KSTAT_NAMED_STR_PTR(knew) < + (char *) + ((kstat_named_t *)user_kstat.ks_data + + ksp->ks_ndata)) { + KSTAT_EXIT(ksp); + kstat_rele(ksp); + kmem_free(kbuf, bufsize + 1); + kmem_free(buf, bufsize + 1); + return (EINVAL); + } + if (KSTAT_NAMED_STR_PTR(knew) + + KSTAT_NAMED_STR_BUFLEN(knew) > + ((char *)user_kstat.ks_data + + ksp->ks_data_size)) { + KSTAT_EXIT(ksp); + kstat_rele(ksp); + kmem_free(kbuf, bufsize + 1); + kmem_free(buf, bufsize + 1); + return (EINVAL); + } + + /* + * Update the pointers within the buffer + */ + KSTAT_NAMED_STR_PTR(knew) = + (char *)buf + + (KSTAT_NAMED_STR_PTR(knew) - + (char *)user_kstat.ks_data); + break; + default: + break; + } + } + + kold = kbuf; + knew = buf; + + /* + * Now make sure the types are what we expected them to be. + */ + for (i = 0; i < ksp->ks_ndata; i++, kold++, knew++) + if (kold->data_type != knew->data_type) { + KSTAT_EXIT(ksp); + kstat_rele(ksp); + kmem_free(kbuf, bufsize + 1); + kmem_free(buf, bufsize + 1); + return (EINVAL); + } + + kmem_free(kbuf, bufsize + 1); + } + + error = KSTAT_SNAPSHOT(ksp, buf, KSTAT_WRITE); + if (!error) + error = KSTAT_UPDATE(ksp, KSTAT_WRITE); +#ifndef _WIN32 + *rvalp = kstat_chain_id; +#else + // The above doesn't work, as rvalp refers to the userland struct, before copyin() + // and we need to write value to kernel version. + user_kstat.ks_returnvalue = kstat_chain_id; + // We need to copyout() so userland will get the return values. +#endif + + KSTAT_EXIT(ksp); + kstat_rele(ksp); + kmem_free(buf, bufsize + 1); + return (error); +} + +/* spl-kstat.c */ + +void +spl_kstat_init() +{ + /* + * Create the kstat root OID + */ + mutex_init(&kstat_chain_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +spl_kstat_fini() +{ + /* + * Destroy the kstat module/class/name tree + * + * Done in two passes, first unregisters all + * of the oids, second releases all the memory. 
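+ * (There is no OID tree to walk here; vmem_fini() below releases the
+ * kstat arena in one pass and mutex_destroy() drops the chain lock.)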
+ */ + + vmem_fini(kstat_arena); + mutex_destroy(&kstat_chain_lock); +} + + +void kstat_set_raw_ops(kstat_t *ksp, + int(*headers)(char *buf, size_t size), + int(*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, off_t index)) +{ +} + +int spl_kstat_chain_id(PDEVICE_OBJECT DiskDevice, PIRP Irp, PIO_STACK_LOCATION IrpSp) +{ + kstat_t ksp = { 0 }; + ksp.ks_returnvalue = kstat_chain_id; + ASSERT3U(IrpSp->Parameters.DeviceIoControl.OutputBufferLength, >=, sizeof(ksp)); + ddi_copyout(&ksp, IrpSp->Parameters.DeviceIoControl.Type3InputBuffer, + sizeof(ksp), 0); + dprintf("%s: returning kstat_chain_id %d\n", __func__, kstat_chain_id); + return 0; +} + +int spl_kstat_read(PDEVICE_OBJECT DiskDevice, PIRP Irp, PIO_STACK_LOCATION IrpSp) +{ + int rval, rc; + kstat_t *ksp; + ksp = (kstat_t *)IrpSp->Parameters.DeviceIoControl.Type3InputBuffer; + rc = read_kstat_data(&ksp->ks_returnvalue, (void *)ksp, 0); + return 0; +} + +int spl_kstat_write(PDEVICE_OBJECT DiskDevice, PIRP Irp, PIO_STACK_LOCATION IrpSp) +{ + int rval, rc; + kstat_t *ksp; + ksp = (kstat_t *)IrpSp->Parameters.DeviceIoControl.Type3InputBuffer; + rc = write_kstat_data(&ksp->ks_returnvalue, (void *)ksp, 0, NULL); + return 0; +} diff --git a/module/os/windows/spl/spl-list.c b/module/os/windows/spl/spl-list.c new file mode 100644 index 000000000000..64740b1d67e5 --- /dev/null +++ b/module/os/windows/spl/spl-list.c @@ -0,0 +1,197 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* + * Generic doubly-linked list implementation + */ + +#include +#include +#include +#include + + +#define list_insert_after_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_prev = node; \ + lnew->list_next = node->list_next; \ + node->list_next->list_prev = lnew; \ + node->list_next = lnew; \ +} + +#define list_insert_before_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_next = node; \ + lnew->list_prev = node->list_prev; \ + node->list_prev->list_next = lnew; \ + node->list_prev = lnew; \ +} + +void +list_create(list_t *list, size_t size, size_t offset) +{ + ASSERT(list); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (list_node_t)); + + list->list_size = size; + list->list_offset = offset; + list->list_head.list_next = list->list_head.list_prev = + &list->list_head; +} + +void +list_destroy(list_t *list) +{ + list_node_t *node = &list->list_head; + + ASSERT(list); + ASSERT(list->list_head.list_next == node); + ASSERT(list->list_head.list_prev == node); + + node->list_next = node->list_prev = NULL; +} + +void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_head(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_after_node(list, lold, nobject); + } +} + +void +list_insert_before(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_tail(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_before_node(list, lold, nobject); + } +} + +void +list_insert_head(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_after_node(list, lold, object); +} + +void +list_insert_tail(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_before_node(list, lold, object); +} + +void +list_remove(list_t *list, void *object) +{ + list_node_t *lold = list_d2l(list, object); + ASSERT(!list_empty(list)); + ASSERT(lold->list_next != NULL); + lold->list_prev->list_next = lold->list_next; + lold->list_next->list_prev = lold->list_prev; + lold->list_next = lold->list_prev = NULL; +} + + +void * +list_head(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_next)); +} + +void * +list_tail(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_prev)); +} + +void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_next != &list->list_head) + return (list_object(list, node->list_next)); + + return (NULL); +} + +void * +list_prev(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_prev != &list->list_head) + return (list_object(list, node->list_prev)); + + return (NULL); +} + +/* + * Insert src list after dst list. Empty src list thereafter. 
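+ * The splice below rewires four links: the old dst tail's next points
+ * at src's first element, that element's prev points back at the old
+ * dst tail, the dst head's prev becomes src's last element, and that
+ * element's next becomes the dst head.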
+ */ +void +list_move_tail(list_t *dst, list_t *src) +{ + list_node_t *dstnode = &dst->list_head; + list_node_t *srcnode = &src->list_head; + + ASSERT(dst->list_size == src->list_size); + ASSERT(dst->list_offset == src->list_offset); + + if (list_empty(src)) + return; + + dstnode->list_prev->list_next = srcnode->list_next; + srcnode->list_next->list_prev = dstnode->list_prev; + dstnode->list_prev = srcnode->list_prev; + srcnode->list_prev->list_next = dstnode; + + /* empty src list */ + srcnode->list_next = srcnode->list_prev = srcnode; +} + +int +list_link_active(list_node_t *link) +{ + return (link->list_next != NULL); +} + +int +list_is_empty(list_t *list) +{ + return (list_empty(list)); +} diff --git a/module/os/windows/spl/spl-md5.c b/module/os/windows/spl/spl-md5.c new file mode 100644 index 000000000000..c7577e12ab61 --- /dev/null +++ b/module/os/windows/spl/spl-md5.c @@ -0,0 +1,667 @@ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Cleaned-up and optimized version of MD5, based on the reference + * implementation provided in RFC 1321. See RSA Copyright information + * below. + */ + +/* + * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + */ + +/* + * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All + * rights reserved. + * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. + * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. + */ + +#ifndef _KERNEL +#include +#endif /* _KERNEL */ + +#include +#include +#include /* MD5_CONST() optimization */ +//#include "md5_byteswap.h" +#if !defined(_KERNEL) || defined(_BOOT) +#include +#endif /* !_KERNEL || _BOOT */ + +#ifdef _KERNEL +#include +#endif /* _KERNEL */ + +static void Encode(uint8_t *, const uint32_t *, size_t); + +static void MD5Transform(uint32_t, uint32_t, uint32_t, uint32_t, MD5_CTX *, + const uint8_t [64]); + +static uint8_t PADDING[64] = { 0x80, /* all zeros */ }; + +/* + * F, G, H and I are the basic MD5 functions. + */ +#define F(b, c, d) (((b) & (c)) | ((~b) & (d))) +#define G(b, c, d) (((b) & (d)) | ((c) & (~d))) +#define H(b, c, d) ((b) ^ (c) ^ (d)) +#define I(b, c, d) ((c) ^ ((b) | (~d))) + +/* + * ROTATE_LEFT rotates x left n bits. + */ +#define ROTATE_LEFT(x, n) \ + (((x) << (n)) | ((x) >> ((sizeof (x) << 3) - (n)))) + +/* + * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. + * Rotation is separate from addition to prevent recomputation. 
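+ * Each macro performs one step of the form
+ * a = b + ROTATE_LEFT(a + f(b, c, d) + x[k] + T[i], s)
+ * where f is F, G, H or I for rounds 1 through 4 respectively, as in
+ * RFC 1321.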
+ */ + +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \ + (a) = ROTATE_LEFT((a), (s)); \ + (a) += (b); \ + } + +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \ + (a) = ROTATE_LEFT((a), (s)); \ + (a) += (b); \ + } + +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \ + (a) = ROTATE_LEFT((a), (s)); \ + (a) += (b); \ + } + +#define II(a, b, c, d, x, s, ac) { \ + (a) += I((b), (c), (d)) + (x) + ((unsigned long long)(ac)); \ + (a) = ROTATE_LEFT((a), (s)); \ + (a) += (b); \ + } + +/* + * Loading 32-bit constants on a RISC is expensive since it involves both a + * `sethi' and an `or'. thus, we instead have the compiler generate `ld's to + * load the constants from an array called `md5_consts'. however, on intel + * (and other CISC processors), it is cheaper to load the constant + * directly. thus, the c code in MD5Transform() uses the macro MD5_CONST() + * which either expands to a constant or an array reference, depending on the + * architecture the code is being compiled for. + * + * Right now, i386 and amd64 are the CISC exceptions. + * If we get another CISC ISA, we'll have to change the ifdef. + */ + +#if defined(__i386) || defined(__amd64) + +#define MD5_CONST(x) (MD5_CONST_ ## x) +#define MD5_CONST_e(x) MD5_CONST(x) +#define MD5_CONST_o(x) MD5_CONST(x) + +#else +/* + * sparc/RISC optimization: + * + * while it is somewhat counter-intuitive, on sparc (and presumably other RISC + * machines), it is more efficient to place all the constants used in this + * function in an array and load the values out of the array than to manually + * load the constants. this is because setting a register to a 32-bit value + * takes two ops in most cases: a `sethi' and an `or', but loading a 32-bit + * value from memory only takes one `ld' (or `lduw' on v9). while this + * increases memory usage, the compiler can find enough other things to do + * while waiting to keep the pipeline does not stall. additionally, it is + * likely that many of these constants are cached so that later accesses do + * not even go out to the bus. + * + * this array is declared `static' to keep the compiler from having to + * bcopy() this array onto the stack frame of MD5Transform() each time it is + * called -- which is unacceptably expensive. + * + * the `const' is to ensure that callers are good citizens and do not try to + * munge the array. since these routines are going to be called from inside + * multithreaded kernelland, this is a good safety check. -- `constants' will + * end up in .rodata. + * + * unfortunately, loading from an array in this manner hurts performance under + * intel (and presumably other CISC machines). so, there is a macro, + * MD5_CONST(), used in MD5Transform(), that either expands to a reference to + * this array, or to the actual constant, depending on what platform this code + * is compiled for. 
+ */ + +#ifdef sun4v + +/* + * Going to load these consts in 8B chunks, so need to enforce 8B alignment + */ + +/* CSTYLED */ +#pragma align 64 (md5_consts) +#define _MD5_CHECK_ALIGNMENT + +#endif /* sun4v */ + +static const uint32_t md5_consts[] = { + MD5_CONST_0, MD5_CONST_1, MD5_CONST_2, MD5_CONST_3, + MD5_CONST_4, MD5_CONST_5, MD5_CONST_6, MD5_CONST_7, + MD5_CONST_8, MD5_CONST_9, MD5_CONST_10, MD5_CONST_11, + MD5_CONST_12, MD5_CONST_13, MD5_CONST_14, MD5_CONST_15, + MD5_CONST_16, MD5_CONST_17, MD5_CONST_18, MD5_CONST_19, + MD5_CONST_20, MD5_CONST_21, MD5_CONST_22, MD5_CONST_23, + MD5_CONST_24, MD5_CONST_25, MD5_CONST_26, MD5_CONST_27, + MD5_CONST_28, MD5_CONST_29, MD5_CONST_30, MD5_CONST_31, + MD5_CONST_32, MD5_CONST_33, MD5_CONST_34, MD5_CONST_35, + MD5_CONST_36, MD5_CONST_37, MD5_CONST_38, MD5_CONST_39, + MD5_CONST_40, MD5_CONST_41, MD5_CONST_42, MD5_CONST_43, + MD5_CONST_44, MD5_CONST_45, MD5_CONST_46, MD5_CONST_47, + MD5_CONST_48, MD5_CONST_49, MD5_CONST_50, MD5_CONST_51, + MD5_CONST_52, MD5_CONST_53, MD5_CONST_54, MD5_CONST_55, + MD5_CONST_56, MD5_CONST_57, MD5_CONST_58, MD5_CONST_59, + MD5_CONST_60, MD5_CONST_61, MD5_CONST_62, MD5_CONST_63 +}; + + +#ifdef sun4v +/* + * To reduce the number of loads, load consts in 64-bit + * chunks and then split. + * + * No need to mask upper 32-bits, as just interested in + * low 32-bits (saves an & operation and means that this + * optimization doesn't increases the icount. + */ +#define MD5_CONST_e(x) (md5_consts64[x/2] >> 32) +#define MD5_CONST_o(x) (md5_consts64[x/2]) + +#else + +#define MD5_CONST_e(x) (md5_consts[x]) +#define MD5_CONST_o(x) (md5_consts[x]) + +#endif /* sun4v */ + +#endif + +/* + * MD5Init() + * + * purpose: initializes the md5 context and begins and md5 digest operation + * input: MD5_CTX * : the context to initialize. + * output: void + */ + +void +MD5Init(MD5_CTX *ctx) +{ + ctx->count[0] = ctx->count[1] = 0; + + /* load magic initialization constants */ + ctx->state[0] = MD5_INIT_CONST_1; + ctx->state[1] = MD5_INIT_CONST_2; + ctx->state[2] = MD5_INIT_CONST_3; + ctx->state[3] = MD5_INIT_CONST_4; +} + +/* + * MD5Update() + * + * purpose: continues an md5 digest operation, using the message block + * to update the context. + * input: MD5_CTX * : the context to update + * uint8_t * : the message block + * uint32_t : the length of the message block in bytes + * output: void + * + * MD5 crunches in 64-byte blocks. All numeric constants here are related to + * that property of MD5. + */ + +void +MD5Update(MD5_CTX *ctx, const void *inpp, unsigned int input_len) +{ + uint32_t i, buf_index, buf_len; +#ifdef sun4v + uint32_t old_asi; +#endif /* sun4v */ +#if defined(__amd64) + uint32_t block_count; +#endif /* !defined(__amd64) */ + const unsigned char *input = (const unsigned char *)inpp; + + /* compute (number of bytes computed so far) mod 64 */ + buf_index = (ctx->count[0] >> 3) & 0x3F; + + /* update number of bits hashed into this MD5 computation so far */ + if ((ctx->count[0] += (input_len << 3)) < (input_len << 3)) + ctx->count[1]++; + ctx->count[1] += (input_len >> 29); + + buf_len = 64 - buf_index; + + /* transform as many times as possible */ + i = 0; + if (input_len >= buf_len) { + + /* + * general optimization: + * + * only do initial bcopy() and MD5Transform() if + * buf_index != 0. if buf_index == 0, we're just + * wasting our time doing the bcopy() since there + * wasn't any data left over from a previous call to + * MD5Update(). + */ + +#ifdef sun4v + /* + * For N1 use %asi register. 
However, costly to repeatedly set + * in MD5Transform. Therefore, set once here. + * Should probably restore the old value afterwards... + */ + old_asi = get_little(); + set_little(0x88); +#endif /* sun4v */ + + if (buf_index) { + bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len); + + MD5Transform(ctx->state[0], ctx->state[1], + ctx->state[2], ctx->state[3], ctx, + ctx->buf_un.buf8); + + i = buf_len; + } + + for (; i + 63 < input_len; i += 64) + MD5Transform(ctx->state[0], ctx->state[1], + ctx->state[2], ctx->state[3], ctx, &input[i]); + + +#ifdef sun4v + /* + * Restore old %ASI value + */ + set_little(old_asi); +#endif /* sun4v */ + + /* + * general optimization: + * + * if i and input_len are the same, return now instead + * of calling bcopy(), since the bcopy() in this + * case will be an expensive nop. + */ + + if (input_len == i) + return; + + buf_index = 0; + } + + /* buffer remaining input */ + bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i); +} + +/* + * MD5Final() + * + * purpose: ends an md5 digest operation, finalizing the message digest and + * zeroing the context. + * input: uchar_t * : a buffer to store the digest in + * : The function actually uses void* because many + * : callers pass things other than uchar_t here. + * MD5_CTX * : the context to finalize, save, and zero + * output: void + */ + +void +MD5Final(void *digest, MD5_CTX *ctx) +{ + uint8_t bitcount_le[sizeof (ctx->count)]; + uint32_t index = (ctx->count[0] >> 3) & 0x3f; + + /* store bit count, little endian */ + Encode(bitcount_le, ctx->count, sizeof (bitcount_le)); + + /* pad out to 56 mod 64 */ + MD5Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index); + + /* append length (before padding) */ + MD5Update(ctx, bitcount_le, sizeof (bitcount_le)); + + /* store state in digest */ + Encode(digest, ctx->state, sizeof (ctx->state)); + + /* zeroize sensitive information */ + bzero(ctx, sizeof (*ctx)); +} + +#ifndef _KERNEL + +void +md5_calc(unsigned char *output, unsigned char *input, unsigned int inlen) +{ + MD5_CTX context; + + MD5Init(&context); + MD5Update(&context, input, inlen); + MD5Final(output, &context); +} + +#endif /* !_KERNEL */ + +/* + * sparc register window optimization: + * + * `a', `b', `c', and `d' are passed into MD5Transform explicitly + * since it increases the number of registers available to the + * compiler. under this scheme, these variables can be held in + * %i0 - %i3, which leaves more local and out registers available. + */ + +/* + * MD5Transform() + * + * purpose: md5 transformation -- updates the digest based on `block' + * input: uint32_t : bytes 1 - 4 of the digest + * uint32_t : bytes 5 - 8 of the digest + * uint32_t : bytes 9 - 12 of the digest + * uint32_t : bytes 12 - 16 of the digest + * MD5_CTX * : the context to update + * uint8_t [64]: the block to use to update the digest + * output: void + */ + +static void +MD5Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d, + MD5_CTX *ctx, const uint8_t block[64]) +{ + /* + * general optimization: + * + * use individual integers instead of using an array. this is a + * win, although the amount it wins by seems to vary quite a bit. 
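+ *
+ * for reference, the entry points defined above are used in the usual
+ * three-step sequence (md5_calc() above is exactly this wrapper), e.g.
+ * for some caller-supplied buf/buflen (sketch):
+ *
+ *	MD5_CTX ctx;
+ *	uint8_t digest[16];
+ *
+ *	MD5Init(&ctx);
+ *	MD5Update(&ctx, buf, buflen);
+ *	MD5Final(digest, &ctx);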
+ */ + + register uint32_t x_0, x_1, x_2, x_3, x_4, x_5, x_6, x_7; + register uint32_t x_8, x_9, x_10, x_11, x_12, x_13, x_14, x_15; +#ifdef sun4v + unsigned long long *md5_consts64; + + /* LINTED E_BAD_PTR_CAST_ALIGN */ + md5_consts64 = (unsigned long long *) md5_consts; +#endif /* sun4v */ + + /* + * general optimization: + * + * the compiler (at least SC4.2/5.x) generates better code if + * variable use is localized. in this case, swapping the integers in + * this order allows `x_0 'to be swapped nearest to its first use in + * FF(), and likewise for `x_1' and up. note that the compiler + * prefers this to doing each swap right before the FF() that + * uses it. + */ + + /* + * sparc v9/v8plus optimization: + * + * if `block' is already aligned on a 4-byte boundary, use the + * optimized load_little_32() directly. otherwise, bcopy() + * into a buffer that *is* aligned on a 4-byte boundary and + * then do the load_little_32() on that buffer. benchmarks + * have shown that using the bcopy() is better than loading + * the bytes individually and doing the endian-swap by hand. + * + * even though it's quite tempting to assign to do: + * + * blk = bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32)); + * + * and only have one set of LOAD_LITTLE_32()'s, the compiler (at least + * SC4.2/5.x) *does not* like that, so please resist the urge. + */ + +#ifdef _MD5_CHECK_ALIGNMENT + if ((uintptr_t)block & 0x3) { /* not 4-byte aligned? */ + bcopy(block, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32)); + +#ifdef sun4v + x_15 = LOAD_LITTLE_32_f(ctx->buf_un.buf32); + x_14 = LOAD_LITTLE_32_e(ctx->buf_un.buf32); + x_13 = LOAD_LITTLE_32_d(ctx->buf_un.buf32); + x_12 = LOAD_LITTLE_32_c(ctx->buf_un.buf32); + x_11 = LOAD_LITTLE_32_b(ctx->buf_un.buf32); + x_10 = LOAD_LITTLE_32_a(ctx->buf_un.buf32); + x_9 = LOAD_LITTLE_32_9(ctx->buf_un.buf32); + x_8 = LOAD_LITTLE_32_8(ctx->buf_un.buf32); + x_7 = LOAD_LITTLE_32_7(ctx->buf_un.buf32); + x_6 = LOAD_LITTLE_32_6(ctx->buf_un.buf32); + x_5 = LOAD_LITTLE_32_5(ctx->buf_un.buf32); + x_4 = LOAD_LITTLE_32_4(ctx->buf_un.buf32); + x_3 = LOAD_LITTLE_32_3(ctx->buf_un.buf32); + x_2 = LOAD_LITTLE_32_2(ctx->buf_un.buf32); + x_1 = LOAD_LITTLE_32_1(ctx->buf_un.buf32); + x_0 = LOAD_LITTLE_32_0(ctx->buf_un.buf32); +#else + x_15 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 15); + x_14 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 14); + x_13 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 13); + x_12 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 12); + x_11 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 11); + x_10 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 10); + x_9 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 9); + x_8 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 8); + x_7 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 7); + x_6 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 6); + x_5 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 5); + x_4 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 4); + x_3 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 3); + x_2 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 2); + x_1 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 1); + x_0 = LOAD_LITTLE_32(ctx->buf_un.buf32 + 0); +#endif /* sun4v */ + } else +#endif + { + +#ifdef sun4v + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_15 = LOAD_LITTLE_32_f(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_14 = LOAD_LITTLE_32_e(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_13 = LOAD_LITTLE_32_d(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_12 = LOAD_LITTLE_32_c(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_11 = LOAD_LITTLE_32_b(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_10 = LOAD_LITTLE_32_a(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_9 = 
LOAD_LITTLE_32_9(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_8 = LOAD_LITTLE_32_8(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_7 = LOAD_LITTLE_32_7(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_6 = LOAD_LITTLE_32_6(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_5 = LOAD_LITTLE_32_5(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_4 = LOAD_LITTLE_32_4(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_3 = LOAD_LITTLE_32_3(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_2 = LOAD_LITTLE_32_2(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_1 = LOAD_LITTLE_32_1(block); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + x_0 = LOAD_LITTLE_32_0(block); +#else +#define LOAD_LITTLE_32(addr) (*(uint32_t *)(void *)(addr)) + x_15 = LOAD_LITTLE_32(block + 60); + x_14 = LOAD_LITTLE_32(block + 56); + x_13 = LOAD_LITTLE_32(block + 52); + x_12 = LOAD_LITTLE_32(block + 48); + x_11 = LOAD_LITTLE_32(block + 44); + x_10 = LOAD_LITTLE_32(block + 40); + x_9 = LOAD_LITTLE_32(block + 36); + x_8 = LOAD_LITTLE_32(block + 32); + x_7 = LOAD_LITTLE_32(block + 28); + x_6 = LOAD_LITTLE_32(block + 24); + x_5 = LOAD_LITTLE_32(block + 20); + x_4 = LOAD_LITTLE_32(block + 16); + x_3 = LOAD_LITTLE_32(block + 12); + x_2 = LOAD_LITTLE_32(block + 8); + x_1 = LOAD_LITTLE_32(block + 4); + x_0 = LOAD_LITTLE_32(block + 0); +#endif /* sun4v */ + } + + /* round 1 */ + FF(a, b, c, d, x_0, MD5_SHIFT_11, MD5_CONST_e(0)); /* 1 */ + FF(d, a, b, c, x_1, MD5_SHIFT_12, MD5_CONST_o(1)); /* 2 */ + FF(c, d, a, b, x_2, MD5_SHIFT_13, MD5_CONST_e(2)); /* 3 */ + FF(b, c, d, a, x_3, MD5_SHIFT_14, MD5_CONST_o(3)); /* 4 */ + FF(a, b, c, d, x_4, MD5_SHIFT_11, MD5_CONST_e(4)); /* 5 */ + FF(d, a, b, c, x_5, MD5_SHIFT_12, MD5_CONST_o(5)); /* 6 */ + FF(c, d, a, b, x_6, MD5_SHIFT_13, MD5_CONST_e(6)); /* 7 */ + FF(b, c, d, a, x_7, MD5_SHIFT_14, MD5_CONST_o(7)); /* 8 */ + FF(a, b, c, d, x_8, MD5_SHIFT_11, MD5_CONST_e(8)); /* 9 */ + FF(d, a, b, c, x_9, MD5_SHIFT_12, MD5_CONST_o(9)); /* 10 */ + FF(c, d, a, b, x_10, MD5_SHIFT_13, MD5_CONST_e(10)); /* 11 */ + FF(b, c, d, a, x_11, MD5_SHIFT_14, MD5_CONST_o(11)); /* 12 */ + FF(a, b, c, d, x_12, MD5_SHIFT_11, MD5_CONST_e(12)); /* 13 */ + FF(d, a, b, c, x_13, MD5_SHIFT_12, MD5_CONST_o(13)); /* 14 */ + FF(c, d, a, b, x_14, MD5_SHIFT_13, MD5_CONST_e(14)); /* 15 */ + FF(b, c, d, a, x_15, MD5_SHIFT_14, MD5_CONST_o(15)); /* 16 */ + + /* round 2 */ + GG(a, b, c, d, x_1, MD5_SHIFT_21, MD5_CONST_e(16)); /* 17 */ + GG(d, a, b, c, x_6, MD5_SHIFT_22, MD5_CONST_o(17)); /* 18 */ + GG(c, d, a, b, x_11, MD5_SHIFT_23, MD5_CONST_e(18)); /* 19 */ + GG(b, c, d, a, x_0, MD5_SHIFT_24, MD5_CONST_o(19)); /* 20 */ + GG(a, b, c, d, x_5, MD5_SHIFT_21, MD5_CONST_e(20)); /* 21 */ + GG(d, a, b, c, x_10, MD5_SHIFT_22, MD5_CONST_o(21)); /* 22 */ + GG(c, d, a, b, x_15, MD5_SHIFT_23, MD5_CONST_e(22)); /* 23 */ + GG(b, c, d, a, x_4, MD5_SHIFT_24, MD5_CONST_o(23)); /* 24 */ + GG(a, b, c, d, x_9, MD5_SHIFT_21, MD5_CONST_e(24)); /* 25 */ + GG(d, a, b, c, x_14, MD5_SHIFT_22, MD5_CONST_o(25)); /* 26 */ + GG(c, d, a, b, x_3, MD5_SHIFT_23, MD5_CONST_e(26)); /* 27 */ + GG(b, c, d, a, x_8, MD5_SHIFT_24, MD5_CONST_o(27)); /* 28 */ + GG(a, b, c, d, x_13, MD5_SHIFT_21, MD5_CONST_e(28)); /* 29 */ + GG(d, a, b, c, x_2, MD5_SHIFT_22, MD5_CONST_o(29)); /* 30 */ + GG(c, d, a, b, x_7, MD5_SHIFT_23, MD5_CONST_e(30)); /* 31 */ + GG(b, c, d, a, x_12, MD5_SHIFT_24, MD5_CONST_o(31)); /* 32 */ + + /* round 3 */ + HH(a, b, c, d, x_5, MD5_SHIFT_31, MD5_CONST_e(32)); /* 33 */ + HH(d, a, b, c, x_8, MD5_SHIFT_32, MD5_CONST_o(33)); /* 34 */ + HH(c, d, a, b, x_11, MD5_SHIFT_33, 
MD5_CONST_e(34)); /* 35 */ + HH(b, c, d, a, x_14, MD5_SHIFT_34, MD5_CONST_o(35)); /* 36 */ + HH(a, b, c, d, x_1, MD5_SHIFT_31, MD5_CONST_e(36)); /* 37 */ + HH(d, a, b, c, x_4, MD5_SHIFT_32, MD5_CONST_o(37)); /* 38 */ + HH(c, d, a, b, x_7, MD5_SHIFT_33, MD5_CONST_e(38)); /* 39 */ + HH(b, c, d, a, x_10, MD5_SHIFT_34, MD5_CONST_o(39)); /* 40 */ + HH(a, b, c, d, x_13, MD5_SHIFT_31, MD5_CONST_e(40)); /* 41 */ + HH(d, a, b, c, x_0, MD5_SHIFT_32, MD5_CONST_o(41)); /* 42 */ + HH(c, d, a, b, x_3, MD5_SHIFT_33, MD5_CONST_e(42)); /* 43 */ + HH(b, c, d, a, x_6, MD5_SHIFT_34, MD5_CONST_o(43)); /* 44 */ + HH(a, b, c, d, x_9, MD5_SHIFT_31, MD5_CONST_e(44)); /* 45 */ + HH(d, a, b, c, x_12, MD5_SHIFT_32, MD5_CONST_o(45)); /* 46 */ + HH(c, d, a, b, x_15, MD5_SHIFT_33, MD5_CONST_e(46)); /* 47 */ + HH(b, c, d, a, x_2, MD5_SHIFT_34, MD5_CONST_o(47)); /* 48 */ + + /* round 4 */ + II(a, b, c, d, x_0, MD5_SHIFT_41, MD5_CONST_e(48)); /* 49 */ + II(d, a, b, c, x_7, MD5_SHIFT_42, MD5_CONST_o(49)); /* 50 */ + II(c, d, a, b, x_14, MD5_SHIFT_43, MD5_CONST_e(50)); /* 51 */ + II(b, c, d, a, x_5, MD5_SHIFT_44, MD5_CONST_o(51)); /* 52 */ + II(a, b, c, d, x_12, MD5_SHIFT_41, MD5_CONST_e(52)); /* 53 */ + II(d, a, b, c, x_3, MD5_SHIFT_42, MD5_CONST_o(53)); /* 54 */ + II(c, d, a, b, x_10, MD5_SHIFT_43, MD5_CONST_e(54)); /* 55 */ + II(b, c, d, a, x_1, MD5_SHIFT_44, MD5_CONST_o(55)); /* 56 */ + II(a, b, c, d, x_8, MD5_SHIFT_41, MD5_CONST_e(56)); /* 57 */ + II(d, a, b, c, x_15, MD5_SHIFT_42, MD5_CONST_o(57)); /* 58 */ + II(c, d, a, b, x_6, MD5_SHIFT_43, MD5_CONST_e(58)); /* 59 */ + II(b, c, d, a, x_13, MD5_SHIFT_44, MD5_CONST_o(59)); /* 60 */ + II(a, b, c, d, x_4, MD5_SHIFT_41, MD5_CONST_e(60)); /* 61 */ + II(d, a, b, c, x_11, MD5_SHIFT_42, MD5_CONST_o(61)); /* 62 */ + II(c, d, a, b, x_2, MD5_SHIFT_43, MD5_CONST_e(62)); /* 63 */ + II(b, c, d, a, x_9, MD5_SHIFT_44, MD5_CONST_o(63)); /* 64 */ + + ctx->state[0] += a; + ctx->state[1] += b; + ctx->state[2] += c; + ctx->state[3] += d; + + /* + * zeroize sensitive information -- compiler will optimize + * this out if everything is kept in registers + */ + + x_0 = x_1 = x_2 = x_3 = x_4 = x_5 = x_6 = x_7 = x_8 = 0; + x_9 = x_10 = x_11 = x_12 = x_13 = x_14 = x_15 = 0; +} + +/* + * Encode() + * + * purpose: to convert a list of numbers from big endian to little endian + * input: uint8_t * : place to store the converted little endian numbers + * uint32_t * : place to get numbers to convert from + * size_t : the length of the input in bytes + * output: void + */ + +static void +Encode(uint8_t * output, const uint32_t * input, + size_t input_len) +{ + size_t i, j; + + for (i = 0, j = 0; j < input_len; i++, j += sizeof (uint32_t)) { + +#ifdef _LITTLE_ENDIAN + +#ifdef _MD5_CHECK_ALIGNMENT + if ((uintptr_t)output & 0x3) /* Not 4-byte aligned */ + bcopy(input + i, output + j, 4); + else *(uint32_t *)(output + j) = input[i]; +#else + /*LINTED E_BAD_PTR_CAST_ALIGN*/ + *(uint32_t *)(output + j) = input[i]; +#endif /* _MD5_CHECK_ALIGNMENT */ + +#else /* big endian -- will work on little endian, but slowly */ + + output[j] = input[i] & 0xff; + output[j + 1] = (input[i] >> 8) & 0xff; + output[j + 2] = (input[i] >> 16) & 0xff; + output[j + 3] = (input[i] >> 24) & 0xff; +#endif + } +} diff --git a/module/os/windows/spl/spl-mount.c b/module/os/windows/spl/spl-mount.c new file mode 100644 index 000000000000..9e7517a8f2a2 --- /dev/null +++ b/module/os/windows/spl/spl-mount.c @@ -0,0 +1,94 @@ +/* +* CDDL HEADER START +* +* The contents of this file are subject to the terms of the +* Common Development and 
Distribution License (the "License"). +* You may not use this file except in compliance with the License. +* +* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +* or http://www.opensolaris.org/os/licensing. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* When distributing Covered Code, include this CDDL HEADER in each +* file and include the License file at usr/src/OPENSOLARIS.LICENSE. +* If applicable, add the following below this CDDL HEADER, with the +* fields enclosed by brackets "[]" replaced with your own identifying +* information: Portions Copyright [yyyy] [name of copyright owner] +* +* CDDL HEADER END +*/ + +/* +* +* Copyright (C) 2017 Jorgen Lundman +* +*/ + +#include +#include +#include + +int vfs_busy(mount_t *mp, int flags) +{ + return 0; +} + +void vfs_unbusy(mount_t *mp) +{ +} + +int vfs_isrdonly(mount_t *mp) +{ + return (mp->mountflags & MNT_RDONLY); +} + +void *vfs_fsprivate(mount_t *mp) +{ + return mp->fsprivate; +} + +void vfs_setfsprivate(mount_t *mp, void *mntdata) +{ + mp->fsprivate = mntdata; +} + +void vfs_clearflags(mount_t *mp, uint64_t flags) +{ + mp->mountflags &= ~flags; +} + +void vfs_setflags(mount_t *mp, uint64_t flags) +{ + mp->mountflags |= flags; +} + +uint64_t vfs_flags(mount_t *mp) +{ + return mp->mountflags; +} + +struct vfsstatfs * vfs_statfs(mount_t *mp) +{ + return NULL; +} + +void vfs_setlocklocal(mount_t *mp) +{ +} + +int vfs_typenum(mount_t *mp) +{ + return 0; +} + +void vfs_getnewfsid(struct mount *mp) +{ +} + +int vfs_isunmount(mount_t *mp) +{ + return 0; +} + + diff --git a/module/os/windows/spl/spl-mutex.c b/module/os/windows/spl/spl-mutex.c new file mode 100644 index 000000000000..46055be46f3c --- /dev/null +++ b/module/os/windows/spl/spl-mutex.c @@ -0,0 +1,191 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017,2019 Jorgen Lundman + * + */ + + /* + * Implementation details. + * Using SynchronizationEvent that autoresets. When in 'Signaled' + * state the mutex is considered FREE/Available to be locked. + * Call KeWaitForSingleObject() to wait for it to be made + * 'available' (either blocking, or polling for *Try method) + * Calling KeSetEvent() sets event to Signaled, and wakes 'one' + * waiter, before Clearing it again. + * We attempt to avoid calling KeWaitForSingleObject() by + * using atomic CAS on m_owner, in the simple cases. 
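+ *
+ * A minimal caller-side sketch, assuming the usual mutex_init()/
+ * mutex_enter()/mutex_exit()/mutex_destroy() wrappers in sys/mutex.h
+ * resolve to the spl_mutex_*() functions below:
+ *
+ *	kmutex_t m;
+ *
+ *	mutex_init(&m, NULL, MUTEX_DEFAULT, NULL);
+ *	mutex_enter(&m);
+ *	... critical section ...
+ *	mutex_exit(&m);
+ *	mutex_destroy(&m);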
+ */ + +#include +#include +#include +#include +#include +#include + +uint64_t zfs_active_mutex = 0; + +#define MUTEX_INITIALISED 0x23456789 +#define MUTEX_DESTROYED 0x98765432 + +int spl_mutex_subsystem_init(void) +{ + return 0; +} + +void spl_mutex_subsystem_fini(void) +{ + +} + +void spl_mutex_init(kmutex_t *mp, char *name, kmutex_type_t type, void *ibc) +{ + (void)name; + ASSERT(type != MUTEX_SPIN); + ASSERT(ibc == NULL); + + if (mp->initialised == MUTEX_INITIALISED) + panic("%s: mutex already initialised\n", __func__); + mp->initialised = MUTEX_INITIALISED; + mp->set_event_guard = 0; + + mp->m_owner = NULL; + + // Initialise it to 'Signaled' as mutex is 'free'. + KeInitializeEvent((PRKEVENT)&mp->m_lock, SynchronizationEvent, TRUE); + atomic_inc_64(&zfs_active_mutex); +} + +void spl_mutex_destroy(kmutex_t *mp) +{ + if (!mp) return; + + if (mp->initialised != MUTEX_INITIALISED) + panic("%s: mutex not initialised\n", __func__); + + // Make sure any call to KeSetEvent() has completed. + while (mp->set_event_guard != 0) { + kpreempt(KPREEMPT_SYNC); + } + + mp->initialised = MUTEX_DESTROYED; + + if (mp->m_owner != 0) + panic("SPL: releasing held mutex"); + + // There is no FREE member for events + // KeDeleteEvent(); + + atomic_dec_64(&zfs_active_mutex); +} + +void spl_mutex_enter(kmutex_t *mp) +{ + NTSTATUS Status; + kthread_t *thisthread = current_thread(); + + if (mp->initialised != MUTEX_INITIALISED) + panic("%s: mutex not initialised\n", __func__); + + if (mp->m_owner == thisthread) + panic("mutex_enter: locking against myself!"); + + VERIFY3P(mp->m_owner, != , 0xdeadbeefdeadbeef); + + // Test if "m_owner" is NULL, if so, set it to "thisthread". + // Returns original value, so if NULL, it succeeded. +again: + if (InterlockedCompareExchangePointer(&mp->m_owner, + thisthread, NULL) != NULL) { + + // Failed to CAS-in 'thisthread', as owner was not NULL + // Wait forever for event to be signaled. + Status = KeWaitForSingleObject( + (PRKEVENT)&mp->m_lock, + Executive, + KernelMode, + FALSE, + NULL + ); + + // We waited, but someone else may have beaten us to it + // so we need to attempt CAS again + goto again; + } + + ASSERT(mp->m_owner == thisthread); +} + +void spl_mutex_exit(kmutex_t *mp) +{ + if (mp->m_owner != current_thread()) + panic("%s: releasing not held/not our lock?\n", __func__); + + VERIFY3P(mp->m_owner, != , 0xdeadbeefdeadbeef); + + atomic_inc_32(&mp->set_event_guard); + + mp->m_owner = NULL; + + VERIFY3U(KeGetCurrentIrql(), <= , DISPATCH_LEVEL); + + // Wake up one waiter now that it is available. + KeSetEvent((PRKEVENT)&mp->m_lock, SEMAPHORE_INCREMENT, FALSE); + atomic_dec_32(&mp->set_event_guard); +} + +int spl_mutex_tryenter(kmutex_t *mp) +{ + LARGE_INTEGER timeout; + NTSTATUS Status; + kthread_t *thisthread = current_thread(); + + if (mp->initialised != MUTEX_INITIALISED) + panic("%s: mutex not initialised\n", __func__); + + if (mp->m_owner == thisthread) + panic("mutex_tryenter: locking against myself!"); + + // Test if "m_owner" is NULL, if so, set it to "thisthread". + // Returns original value, so if NULL, it succeeded. + if (InterlockedCompareExchangePointer(&mp->m_owner, + thisthread, NULL) != NULL) { + return 0; // Not held. 
+ } + + ASSERT(mp->m_owner == thisthread); + + // held + return (1); +} + +int spl_mutex_owned(kmutex_t *mp) +{ + return (mp->m_owner == current_thread()); +} + +struct kthread *spl_mutex_owner(kmutex_t *mp) +{ + return (mp->m_owner); +} diff --git a/module/os/windows/spl/spl-policy.c b/module/os/windows/spl/spl-policy.c new file mode 100644 index 000000000000..77e9c561480b --- /dev/null +++ b/module/os/windows/spl/spl-policy.c @@ -0,0 +1,880 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +int +spl_priv_check_cred(const void *cred, int priv, /*__unused*/ int flags) +{ + int error = 0; + (void)flags; +// if (kauth_cred_getuid(cred) == 0) { +// error = 0; +// goto out; +// } + + /* + * The default is deny, so if no policies have granted it, reject + * with a privilege error here. + */ + // Assuming everything is root for now, fix me. WIN32 + //error = EPERM; +//out: + return (error); +} + +//secpolicy_fs_unmount +#ifdef illumos +/* + * Does the policy computations for "ownership" of a mount; + * here ownership is defined as the ability to "mount" + * the filesystem originally. The rootvfs doesn't cover any + * vnodes; we attribute its ownership to the rootvp. + */ +static int +secpolicy_fs_owner(cred_t *cr, const struct vfs *vfsp) +{ + vnode_t *mvp; + + if (vfsp == NULL) + mvp = NULL; + else if (vfsp == rootvfs) + mvp = rootvp; + else + mvp = vfsp->vfs_vnodecovered; + + return (secpolicy_fs_common(cr, mvp, vfsp, NULL)); +} + +int +secpolicy_fs_unmount(cred_t *cr, struct vfs *vfsp) +{ + return (secpolicy_fs_owner(cr, vfsp)); +} +#elif defined(__FreeBSD__) +int +secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused) +{ + + return (priv_check_cred(cr, PRIV_VFS_UNMOUNT, 0)); +} +#elif defined(__APPLE__) +int +secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_VFS_UNMOUNT, 0)); +} +#endif /* illumos */ + +//secpolicy_nfs +#ifdef illumos +/* + * Checks for operations that are either client-only or are used by + * both clients and servers. + */ +int +secpolicy_nfs(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_NFS, B_FALSE, EPERM, NULL)); +} +#elif defined(__FreeBSD__) +int +secpolicy_nfs(cred_t *cr) +{ + + return (priv_check_cred(cr, PRIV_NFS_DAEMON, 0)); +} +#elif defined(__APPLE__) +int +secpolicy_nfs(const cred_t *cr) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_NFS_DAEMON, 0)); +} +#endif /* illumos */ + +//secpolicy_sys_config +#ifdef illumos +/* + * Catch all system configuration. + */ +int +secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) +{ + if (checkonly) { + return (PRIV_POLICY_ONLY(cr, PRIV_SYS_CONFIG, B_FALSE) ? 
0 : + EPERM); + } else { + return (PRIV_POLICY(cr, PRIV_SYS_CONFIG, B_FALSE, EPERM, NULL)); + } +} +#elif defined(__FreeBSD__) +int +secpolicy_sys_config(cred_t *cr, int checkonly __unused) +{ + + return (priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG, 0)); +} +#elif defined(__APPLE__) +int +secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) +{ + return (spl_priv_check_cred((kauth_cred_t)cr, PRIV_ZFS_POOL_CONFIG, 0)); +} +#elif defined(_WIN32) +int +secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) +{ + return (spl_priv_check_cred((void *)cr, PRIV_ZFS_POOL_CONFIG, 0)); +} +#endif /* illumos */ + +//secpolicy_zfs +#ifdef illumos +/* + * secpolicy_zfs + * + * Determine if the subject has permission to manipulate ZFS datasets + * (not pools). Equivalent to the SYS_MOUNT privilege. + */ +int +secpolicy_zfs(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_MOUNT, B_FALSE, EPERM, NULL)); +} +#elif defined(__FreeBSD__) +int +secpolicy_zfs(cred_t *cr) +{ + + return (priv_check_cred(cr, PRIV_VFS_MOUNT, 0)); +} +#elif defined(_WIN32) +int +secpolicy_zfs(const cred_t *cr) +{ + return (spl_priv_check_cred((kauth_cred_t *)cr, PRIV_VFS_MOUNT, 0)); +} +#endif /* illumos */ + +//secpolicy_zinject +#ifdef illumos +/* + * secpolicy_zinject + * + * Determine if the subject can inject faults in the ZFS fault injection + * framework. Requires all privileges. + */ +int +secpolicy_zinject(const cred_t *cr) +{ + return (secpolicy_require_set(cr, PRIV_FULLSET, NULL, KLPDARG_NONE)); +} +#elif defined(__FreeBSD__) +int +secpolicy_zinject(cred_t *cr) +{ + + return (priv_check_cred(cr, PRIV_ZFS_INJECT, 0)); +} +#elif defined(_WIN32) +int +secpolicy_zinject(const cred_t *cr) +{ + return (spl_priv_check_cred((kauth_cred_t *)cr, PRIV_ZFS_INJECT, 0)); +} +#endif /* illumos */ + +//secpolicy_vnode_any_access +#ifdef illumos +/* + * This is a special routine for ZFS; it is used to determine whether + * any of the privileges in effect allow any form of access to the + * file. There's no reason to audit this or any reason to record + * this. More work is needed to do the "KPLD" stuff. 
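+ *
+ * Like the other checks in this file it is used by callers as a simple
+ * gate, e.g. (sketch):
+ *
+ *	if (secpolicy_vnode_any_access(cr, vp, owner) != 0)
+ *		return (EPERM);
+ *
+ * The Windows variant further below is currently a FIXME stub that
+ * always returns 0, i.e. always grants the access.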
+ */ +int +secpolicy_vnode_any_access(const cred_t *cr, vnode_t *vp, uid_t owner) +{ + static int privs[] = { + PRIV_FILE_OWNER, + PRIV_FILE_CHOWN, + PRIV_FILE_DAC_READ, + PRIV_FILE_DAC_WRITE, + PRIV_FILE_DAC_EXECUTE, + PRIV_FILE_DAC_SEARCH, + }; + int i; + + /* Same as secpolicy_vnode_setdac */ + if (owner == cr->cr_uid) + return (0); + + for (i = 0; i < sizeof (privs)/sizeof (int); i++) { + boolean_t allzone = B_FALSE; + int priv; + + switch (priv = privs[i]) { + case PRIV_FILE_DAC_EXECUTE: + if (vp->v_type == VDIR) + continue; + break; + case PRIV_FILE_DAC_SEARCH: + if (vp->v_type != VDIR) + continue; + break; + case PRIV_FILE_DAC_WRITE: + case PRIV_FILE_OWNER: + case PRIV_FILE_CHOWN: + /* We know here that if owner == 0, that cr_uid != 0 */ + allzone = owner == 0; + break; + } + if (PRIV_POLICY_CHOICE(cr, priv, allzone)) + return (0); + } + return (EPERM); +} +#elif defined(__FreeBSD__) +int +secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner) +{ + static int privs[] = { + PRIV_VFS_ADMIN, + PRIV_VFS_READ, + PRIV_VFS_WRITE, + PRIV_VFS_EXEC, + PRIV_VFS_LOOKUP + }; + int i; + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* Same as secpolicy_vnode_setdac */ + if (owner == cr->cr_uid) + return (0); + + for (i = 0; i < sizeof (privs)/sizeof (int); i++) { + boolean_t allzone = B_FALSE; + int priv; + + switch (priv = privs[i]) { + case PRIV_VFS_EXEC: + if (vp->v_type == VDIR) + continue; + break; + case PRIV_VFS_LOOKUP: + if (vp->v_type != VDIR) + continue; + break; + } + if (priv_check_cred(cr, priv, 0) == 0) + return (0); + } + return (EPERM); +} +#elif defined(_WIN32) +int +secpolicy_vnode_any_access(const cred_t *cr, vnode_t *vp, uid_t owner) +{ + // FIXME + return (0); +} +#endif /* illumos */ + +//secpolicy_vnode_access2 +#ifdef illumos +/* + * Like secpolicy_vnode_access() but we get the actual wanted mode and the + * current mode of the file, not the missing bits. + */ +int +secpolicy_vnode_access2(const cred_t *cr, vnode_t *vp, uid_t owner, + mode_t curmode, mode_t wantmode) +{ + mode_t mode; + + /* Inline the basic privileges tests. */ + if ((wantmode & VREAD) && + !PRIV_ISASSERT(&CR_OEPRIV(cr), PRIV_FILE_READ) && + priv_policy_va(cr, PRIV_FILE_READ, B_FALSE, EACCES, NULL, + KLPDARG_VNODE, vp, (char *)NULL, KLPDARG_NOMORE) != 0) { + return (EACCES); + } + + if ((wantmode & VWRITE) && + !PRIV_ISASSERT(&CR_OEPRIV(cr), PRIV_FILE_WRITE) && + priv_policy_va(cr, PRIV_FILE_WRITE, B_FALSE, EACCES, NULL, + KLPDARG_VNODE, vp, (char *)NULL, KLPDARG_NOMORE) != 0) { + return (EACCES); + } + + mode = ~curmode & wantmode; + + if (mode == 0) + return (0); + + if ((mode & VREAD) && priv_policy_va(cr, PRIV_FILE_DAC_READ, B_FALSE, + EACCES, NULL, KLPDARG_VNODE, vp, (char *)NULL, + KLPDARG_NOMORE) != 0) { + return (EACCES); + } + + if (mode & VWRITE) { + boolean_t allzone; + + if (owner == 0 && cr->cr_uid != 0) + allzone = B_TRUE; + else + allzone = B_FALSE; + if (priv_policy_va(cr, PRIV_FILE_DAC_WRITE, allzone, EACCES, + NULL, KLPDARG_VNODE, vp, (char *)NULL, + KLPDARG_NOMORE) != 0) { + return (EACCES); + } + } + + if (mode & VEXEC) { + /* + * Directories use file_dac_search to override the execute bit. + */ + int p = vp->v_type == VDIR ? 
PRIV_FILE_DAC_SEARCH : + PRIV_FILE_DAC_EXECUTE; + + return (priv_policy_va(cr, p, B_FALSE, EACCES, NULL, + KLPDARG_VNODE, vp, (char *)NULL, KLPDARG_NOMORE)); + } + return (0); +} +#elif defined(__FreeBSD__) +int +secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + if ((accmode & VREAD) && priv_check_cred(cr, PRIV_VFS_READ, 0) != 0) + return (EACCES); + if ((accmode & VWRITE) && + priv_check_cred(cr, PRIV_VFS_WRITE, 0) != 0) { + return (EACCES); + } + if (accmode & VEXEC) { + if (vp->v_type == VDIR) { + if (priv_check_cred(cr, PRIV_VFS_LOOKUP, 0) != 0) + return (EACCES); + } else { + if (priv_check_cred(cr, PRIV_VFS_EXEC, 0) != 0) + return (EACCES); + } + } + return (0); +} + + +/* + * Like secpolicy_vnode_access() but we get the actual wanted mode and the + * current mode of the file, not the missing bits. + */ +int +secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner, + accmode_t curmode, accmode_t wantmode) +{ + accmode_t mode; + + mode = ~curmode & wantmode; + + if (mode == 0) + return (0); + + return (secpolicy_vnode_access(cr, vp, owner, mode)); +} +#elif defined(_WIN32) +int +secpolicy_vnode_access2(const cred_t *cr, vnode_t *vp, uid_t owner, + mode_t curmode, mode_t wantmode) +{ + // FIXME + return (0); +} +#endif /* illumos */ + +//secpolicy_vnode_setattr +#ifdef illumos +/* + * This function checks the policy decisions surrounding the + * vop setattr call. + * + * It should be called after sufficient locks have been established + * on the underlying data structures. No concurrent modifications + * should be allowed. + * + * The caller must pass in unlocked version of its vaccess function + * this is required because vop_access function should lock the + * node for reading. A three argument function should be defined + * which accepts the following argument: + * A pointer to the internal "node" type (inode *) + * vnode access bits (VREAD|VWRITE|VEXEC) + * a pointer to the credential + * + * This function makes the following policy decisions: + * + * - change permissions + * - permission to change file mode if not owner + * - permission to add sticky bit to non-directory + * - permission to add set-gid bit + * + * The ovap argument should include AT_MODE|AT_UID|AT_GID. + * + * If the vap argument does not include AT_MODE, the mode will be copied from + * ovap. In certain situations set-uid/set-gid bits need to be removed; + * this is done by marking vap->va_mask to include AT_MODE and va_mode + * is updated to the newly computed mode. + */ + +int +secpolicy_vnode_setattr(cred_t *cr, struct vnode *vp, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, cred_t *), + void *node) +{ + int mask = vap->va_mask; + int error = 0; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + if (mask & AT_SIZE) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto out; + } + + /* + * If ATTR_NOACLCHECK is set in the flags, then we don't + * perform the secondary unlocked_access() call since the + * ACL (if any) is being checked there. + */ + if (skipaclchk == B_FALSE) { + error = unlocked_access(node, VWRITE, cr); + if (error) + goto out; + } + } + if (mask & AT_MODE) { + /* + * If not the owner of the file then check privilege + * for two things: the privilege to set the mode at all + * and, if we're setting setuid, we also need permissions + * to add the set-uid bit, if we're not the owner. 
+ * In the specific case of creating a set-uid root + * file, we need even more permissions. + */ + if ((error = secpolicy_vnode_setdac(cr, ovap->va_uid)) != 0) + goto out; + + if ((error = secpolicy_setid_setsticky_clear(vp, vap, + ovap, cr)) != 0) + goto out; + } else + vap->va_mode = ovap->va_mode; + + if (mask & (AT_UID|AT_GID)) { + boolean_t checkpriv = B_FALSE; + + /* + * Chowning files. + * + * If you are the file owner: + * chown to other uid FILE_CHOWN_SELF + * chown to gid (non-member) FILE_CHOWN_SELF + * chown to gid (member) + * + * Instead of PRIV_FILE_CHOWN_SELF, FILE_CHOWN is also + * acceptable but the first one is reported when debugging. + * + * If you are not the file owner: + * chown from root PRIV_FILE_CHOWN + zone + * chown from other to any PRIV_FILE_CHOWN + * + */ + if (cr->cr_uid != ovap->va_uid) { + checkpriv = B_TRUE; + } else { + if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || + ((mask & AT_GID) && vap->va_gid != ovap->va_gid && + !groupmember(vap->va_gid, cr))) { + checkpriv = B_TRUE; + } + } + /* + * If necessary, check privilege to see if update can be done. + */ + if (checkpriv && + (error = secpolicy_vnode_chown(cr, ovap->va_uid)) != 0) { + goto out; + } + + /* + * If the file has either the set UID or set GID bits + * set and the caller can set the bits, then leave them. + */ + secpolicy_setid_clear(vap, cr); + } + if (mask & (AT_ATIME|AT_MTIME)) { + /* + * If not the file owner and not otherwise privileged, + * always return an error when setting the + * time other than the current (ATTR_UTIME flag set). + * If setting the current time (ATTR_UTIME not set) then + * unlocked_access will check permissions according to policy. + */ + if (cr->cr_uid != ovap->va_uid) { + if (flags & ATTR_UTIME) + error = secpolicy_vnode_utime_modify(cr); + else if (skipaclchk == B_FALSE) { + error = unlocked_access(node, VWRITE, cr); + if (error == EACCES && + secpolicy_vnode_utime_modify(cr) == 0) + error = 0; + } + if (error) + goto out; + } + } + + /* + * Check for optional attributes here by checking the following: + */ + if (mask & AT_XVATTR) + error = secpolicy_xvattr((xvattr_t *)vap, ovap->va_uid, cr, + vp->v_type); +out: + return (error); +} +#elif defined(__FreeBSD__) +int +secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, cred_t *), void *node) +{ + int mask = vap->va_mask; + int error; + + if (mask & AT_SIZE) { + if (vp->v_type == VDIR) + return (EISDIR); + error = unlocked_access(node, VWRITE, cr); + if (error) + return (error); + } + if (mask & AT_MODE) { + /* + * If not the owner of the file then check privilege + * for two things: the privilege to set the mode at all + * and, if we're setting setuid, we also need permissions + * to add the set-uid bit, if we're not the owner. + * In the specific case of creating a set-uid root + * file, we need even more permissions. + */ + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error) + return (error); + error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr); + if (error) + return (error); + } else { + vap->va_mode = ovap->va_mode; + } + if (mask & (AT_UID | AT_GID)) { + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error) + return (error); + + /* + * To change the owner of a file, or change the group of a file to a + * group of which we are not a member, the caller must have + * privilege. 
+ */ + if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || + ((mask & AT_GID) && vap->va_gid != ovap->va_gid && + !groupmember(vap->va_gid, cr))) { + if (secpolicy_fs_owner(vp->v_mount, cr) != 0) { + error = priv_check_cred(cr, PRIV_VFS_CHOWN, 0); + if (error) + return (error); + } + } + + if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || + ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) { + secpolicy_setid_clear(vap, vp, cr); + } + } + if (mask & (AT_ATIME | AT_MTIME)) { + /* + * From utimes(2): + * If times is NULL, ... The caller must be the owner of + * the file, have permission to write the file, or be the + * super-user. + * If times is non-NULL, ... The caller must be the owner of + * the file or be the super-user. + */ + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error && (vap->va_vaflags & VA_UTIMES_NULL)) + error = unlocked_access(node, VWRITE, cr); + if (error) + return (error); + } + return (0); +} +#elif defined(__APPLE__) +int +secpolicy_vnode_setattr(cred_t *cr, struct vnode *vp, vattr_t *vap, + const vattr_t *ovap, int flags, + int unlocked_access(void *, int, cred_t *), + void *node) +{ + // FIXME + return (0); +} +#endif /* illumos */ + +//secpolicy_vnode_stky_modify +#ifdef illumos +/* + * Name: secpolicy_vnode_stky_modify() + * + * Normal: verify that subject can make a file a "sticky". + * + * Output: EPERM - if access denied. + */ + +int +secpolicy_vnode_stky_modify(const cred_t *cred) +{ + return (PRIV_POLICY(cred, PRIV_SYS_CONFIG, B_FALSE, EPERM, + "set file sticky")); +} +#elif defined(__FreeBSD__) +int +secpolicy_vnode_stky_modify(cred_t *cr) +{ + + return (EPERM); +} +#elif defined(_WIN32) +int +secpolicy_vnode_stky_modify(const cred_t *cred) +{ + return (EPERM); +} +#endif /* illumos */ + +//secpolicy_setid_setsticky_clear +#ifdef illumos +/* + * Name: secpolicy_vnode_setids_setgids() + * + * Normal: verify that subject can set the file setgid flag. + * + * Output: EPERM - if not privileged + */ + +int +secpolicy_vnode_setids_setgids(const cred_t *cred, gid_t gid) +{ + if (!groupmember(gid, cred)) + return (PRIV_POLICY(cred, PRIV_FILE_SETID, B_FALSE, EPERM, + NULL)); + return (0); +} + +int +secpolicy_setid_setsticky_clear(vnode_t *vp, vattr_t *vap, const vattr_t *ovap, + cred_t *cr) +{ + int error; + + if ((vap->va_mode & S_ISUID) != 0 && + (error = secpolicy_vnode_setid_modify(cr, + ovap->va_uid)) != 0) { + return (error); + } + + /* + * Check privilege if attempting to set the + * sticky bit on a non-directory. + */ + if (vp->v_type != VDIR && (vap->va_mode & S_ISVTX) != 0 && + secpolicy_vnode_stky_modify(cr) != 0) { + vap->va_mode &= ~S_ISVTX; + } + + /* + * Check for privilege if attempting to set the + * group-id bit. + */ + if ((vap->va_mode & S_ISGID) != 0 && + secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) { + vap->va_mode &= ~S_ISGID; + } + + return (0); +} +#elif defined(__FreeBSD__) +int +secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid) +{ + + if (groupmember(gid, cr)) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (priv_check_cred(cr, PRIV_VFS_SETGID, 0)); +} + +int +secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, cred_t *cr) +{ + int error; + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* + * Privileged processes may set the sticky bit on non-directories, + * as well as set the setgid bit on a file with a group that the process + * is not a member of. Both of these are allowed in jail(8). 
+ */ + if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) { + if (priv_check_cred(cr, PRIV_VFS_STICKYFILE, 0)) + return (EFTYPE); + } + /* + * Check for privilege if attempting to set the + * group-id bit. + */ + if ((vap->va_mode & S_ISGID) != 0) { + error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid); + if (error) + return (error); + } + /* + * Deny setting setuid if we are not the file owner. + */ + if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) { + error = priv_check_cred(cr, PRIV_VFS_ADMIN, 0); + if (error) + return (error); + } + return (0); +} +#elif defined(__APPLE__) +int +secpolicy_setid_setsticky_clear(vnode_t *vp, vattr_t *vap, const vattr_t *ovap, + cred_t *cr) +{ + // FIXME + return (0); +} +#endif /* illumos */ + +int +secpolicy_vnode_remove(struct vnode *vp, const cred_t *cr) +{ + return (0); +} + +int +secpolicy_vnode_create_gid(const cred_t *cred) +{ + return (0); +} + +int secpolicy_vnode_setids_setgids(struct vnode *vp, const cred_t *cr, + gid_t gid) +{ + return (0); +} + +int secpolicy_vnode_setdac(struct vnode *vp, const cred_t *cr, uid_t u) +{ + return (0); +} + +int secpolicy_vnode_chown( struct vnode *vp, const cred_t *cr, uid_t u) +{ + return (0); +} + +int secpolicy_vnode_setid_retain( struct vnode *vp, const cred_t *cr, + int fal) +{ + return (0); +} + +int secpolicy_xvattr(struct vnode *dvp, vattr_t *vap, uid_t uid, + const cred_t *cr, enum vtype ty) +{ + return (0); +} + +int secpolicy_setid_clear(vattr_t *vap, struct vnode *vp, + const cred_t *cr) +{ + return (0); +} + +int secpolicy_basic_link(struct vnode *svp, const cred_t *cr) +{ + return (0); +} + +int secpolicy_fs_mount_clearopts(const cred_t *cr, struct mount *mp) +{ + return (0); +} + +int secpolicy_fs_mount(const cred_t *cr, struct vnode *vp, struct mount *mp) +{ + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT, 0)); +} + +int +secpolicy_vnode_setattr(cred_t *cr, struct vnode *vp, vattr_t *vap, + const vattr_t *ovap, int flags, + int unlocked_access(void *, int, cred_t *), + void *node) +{ + // FIXME + return (0); +} + +int +secpolicy_setid_setsticky_clear(vnode_t *vp, vattr_t *vap, const vattr_t *ovap, + cred_t *cr) +{ + // FIXME + return (0); +} diff --git a/module/os/windows/spl/spl-proc.c b/module/os/windows/spl/spl-proc.c new file mode 100644 index 000000000000..d18f49b133bf --- /dev/null +++ b/module/os/windows/spl/spl-proc.c @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include
+#include
+#include
+
+proc_t p0 = {NULL};	/* process 0 */
diff --git a/module/os/windows/spl/spl-processor.c b/module/os/windows/spl/spl-processor.c
new file mode 100644
index 000000000000..5c32401f8849
--- /dev/null
+++ b/module/os/windows/spl/spl-processor.c
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ *
+ * Copyright (C) 2017 Jorgen Lundman
+ *
+ */
+
+#include
+
+uint32_t
+cpu_number(void)
+{
+	uint32_t cpuid;
+	cpuid = (uint32_t)KeGetCurrentProcessorIndex();
+	return (cpuid % max_ncpus);
+}
+
+uint32_t
+getcpuid()
+{
+	uint32_t cpuid;
+	cpuid = (uint32_t)KeGetCurrentProcessorIndex();
+	return (cpuid % max_ncpus);
+}
diff --git a/module/os/windows/spl/spl-rwlock.c b/module/os/windows/spl/spl-rwlock.c
new file mode 100644
index 000000000000..9bbb8bbcecfa
--- /dev/null
+++ b/module/os/windows/spl/spl-rwlock.c
@@ -0,0 +1,238 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ *
+ * Copyright (C) 2018 Jorgen Lundman
+ *
+ */
+
+#include
+//#include
+#include
+#include
+
+uint64_t zfs_active_rwlock = 0;
+
+/* We run rwlock with DEBUG on for now, as it protects against
+ * uninitialised access etc, at almost no cost.
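+ *
+ * A minimal caller-side sketch (RW_DEFAULT is the usual type constant
+ * from sys/rwlock.h; RW_DRIVER is rejected below):
+ *
+ *	krwlock_t lk;
+ *
+ *	rw_init(&lk, NULL, RW_DEFAULT, NULL);
+ *	rw_enter(&lk, RW_READER);	(RW_WRITER for exclusive access)
+ *	... read the protected state ...
+ *	rw_exit(&lk);
+ *	rw_destroy(&lk);
+ *
+ * With DEBUG set, rw_init() below stamps rw_pad and rw_destroy()
+ * clobbers it, so using an uninitialised or already destroyed lock
+ * panics instead of silently corrupting memory.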
+ */ +#ifndef DEBUG +#define DEBUG +#endif + +#ifdef DEBUG +int rw_isinit(krwlock_t *rwlp) +{ + if (rwlp->rw_pad != 0x012345678) + return 0; + return 1; +} +#endif + + +void +rw_init(krwlock_t *rwlp, char *name, krw_type_t type, /*__unused*/ void *arg) +{ + ASSERT(type != RW_DRIVER); + +#ifdef DEBUG + VERIFY3U(rwlp->rw_pad, != , 0x012345678); +#endif + ExInitializeResourceLite(&rwlp->rw_lock); + rwlp->rw_owner = NULL; + rwlp->rw_readers = 0; +#ifdef DEBUG + rwlp->rw_pad = 0x012345678; +#endif + atomic_inc_64(&zfs_active_rwlock); +} + +void +rw_destroy(krwlock_t *rwlp) +{ + // Confirm it was initialised, and is unlocked, and not already destroyed. +#ifdef DEBUG + VERIFY3U(rwlp->rw_pad, == , 0x012345678); +#endif + VERIFY3U(rwlp->rw_owner, ==, 0); + VERIFY3U(rwlp->rw_readers, ==, 0); + + // This has caused panic due to IRQL panic, from taskq->zap_evict->rw_destroy + ExDeleteResourceLite(&rwlp->rw_lock); +#ifdef DEBUG + rwlp->rw_pad = 0x99; +#endif + atomic_dec_64(&zfs_active_rwlock); +} + +void +rw_enter(krwlock_t *rwlp, krw_t rw) +{ +#ifdef DEBUG + if (rwlp->rw_pad != 0x012345678) + panic("rwlock %p not initialised\n", rwlp); +#endif + + if (rw == RW_READER) { + ExAcquireResourceSharedLite(&rwlp->rw_lock, TRUE); + atomic_inc_32((volatile uint32_t *)&rwlp->rw_readers); + ASSERT(rwlp->rw_owner == 0); + } else { + if (rwlp->rw_owner == current_thread()) + panic("rw_enter: locking against myself!"); + ExAcquireResourceExclusiveLite(&rwlp->rw_lock, TRUE); + ASSERT(rwlp->rw_owner == 0); + ASSERT(rwlp->rw_readers == 0); + rwlp->rw_owner = current_thread(); + } +} + +/* + * kernel private from osfmk/kern/locks.h + */ + +int +rw_tryenter(krwlock_t *rwlp, krw_t rw) +{ + int held = 0; + +#ifdef DEBUG + if (rwlp->rw_pad != 0x012345678) + panic("rwlock %p not initialised\n", rwlp); +#endif + + if (rw == RW_READER) { + held = ExAcquireResourceSharedLite(&rwlp->rw_lock, FALSE); + if (held) + atomic_inc_32((volatile uint32_t *)&rwlp->rw_readers); + } else { + if (rwlp->rw_owner == current_thread()) + panic("rw_tryenter: locking against myself!"); + + held = ExAcquireResourceExclusiveLite(&rwlp->rw_lock, FALSE); + if (held) + rwlp->rw_owner = current_thread(); + } + + return (held); +} + + + +/* + * It appears a difference between Darwin's + * lck_rw_lock_shared_to_exclusive() and Solaris's rw_tryupgrade() and + * FreeBSD's sx_try_upgrade() is that on failure to upgrade, the prior + * held shared/reader lock is lost on Darwin, but retained on + * Solaris/FreeBSD. We could re-acquire the lock in this situation, + * but it enters a possibility of blocking, when tryupgrade is meant + * to be non-blocking. + * Also note that XNU's lck_rw_lock_shared_to_exclusive() is always + * blocking (when waiting on readers), which means we can not use it. + */ +int +rw_tryupgrade(krwlock_t *rwlp) +{ + int held = 0; + + if (rwlp->rw_owner == current_thread()) + panic("rw_enter: locking against myself!"); + + /* More readers than us? give up */ + if (rwlp->rw_readers != 1) return 0; + + /* + * It is ON. We need to drop our READER lock, and try to + * grab the WRITER as quickly as possible. 
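+ *
+ * Because the reader hold is retained on failure (it is re-acquired
+ * below before returning 0), a typical caller falls back roughly like
+ * this sketch, where need_write stands for whatever caller-specific
+ * condition requires the exclusive lock:
+ *
+ *	rw_enter(&lk, RW_READER);
+ *	if (need_write && !rw_tryupgrade(&lk)) {
+ *		rw_exit(&lk);
+ *		rw_enter(&lk, RW_WRITER);
+ *		(re-check the state: it may have changed while unlocked)
+ *	}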
+ */ + atomic_dec_32((volatile uint32_t *)&rwlp->rw_readers); + ExReleaseResourceLite(&rwlp->rw_lock); + + /* Grab the WRITER lock */ + held = ExAcquireResourceExclusiveLite(&rwlp->rw_lock, FALSE); + + if (held) { + /* Looks like we won */ + rwlp->rw_owner = current_thread(); + ASSERT(rwlp->rw_readers == 0); + return 1; + } + + /* + * The worst has happened, we failed to grab WRITE lock, either + * due to another WRITER lock, or, some READER came along. + * IllumOS implementation returns with the READER lock again + * so we need to grab it. + */ + rw_enter(rwlp, RW_READER); + return 0; + +} + +void +rw_exit(krwlock_t *rwlp) +{ + if (rwlp->rw_owner == current_thread()) { + rwlp->rw_owner = NULL; + ASSERT(rwlp->rw_readers == 0); + ExReleaseResourceLite(&rwlp->rw_lock); + } else { + atomic_dec_32((volatile uint32_t *)&rwlp->rw_readers); + ASSERT(rwlp->rw_owner == 0); + ExReleaseResourceLite(&rwlp->rw_lock); + } +} + + +int +rw_lock_held(krwlock_t *rwlp) +{ + /* + * ### not sure about this one ### + */ + return (rwlp->rw_owner == current_thread() || rwlp->rw_readers > 0); +} + +int +rw_write_held(krwlock_t *rwlp) +{ + return (rwlp->rw_owner == current_thread()); +} + +void +rw_downgrade(krwlock_t *rwlp) +{ + if (rwlp->rw_owner != current_thread()) + panic("SPL: rw_downgrade not WRITE lock held\n"); + rw_exit(rwlp); + rw_enter(rwlp, RW_READER); +} + +int spl_rwlock_init(void) +{ + return 0; +} + +void spl_rwlock_fini(void) +{ + ASSERT(zfs_active_rwlock == 0); +} diff --git a/module/os/windows/spl/spl-seg_kmem.c b/module/os/windows/spl/spl-seg_kmem.c new file mode 100644 index 000000000000..36e6ac582356 --- /dev/null +++ b/module/os/windows/spl/spl-seg_kmem.c @@ -0,0 +1,291 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include + +#include +#include +// ugly: smd +#ifdef kmem_free +#undef kmem_free +#endif +#include + +#include +#include +#include + +//#include + +/* + * seg_kmem is the primary kernel memory segment driver. It + * maps the kernel heap [kernelheap, ekernelheap), module text, + * and all memory which was allocated before the VM was initialized + * into kas. + * + * Pages which belong to seg_kmem are hashed into &kvp vnode at + * an offset equal to (u_offset_t)virt_addr, and have p_lckcnt >= 1. + * They must never be paged out since segkmem_fault() is a no-op to + * prevent recursive faults. + * + * Currently, seg_kmem pages are sharelocked (p_sharelock == 1) on + * __x86 and are unlocked (p_sharelock == 0) on __sparc. Once __x86 + * supports relocation the #ifdef kludges can be removed. 
+ * + * seg_kmem pages may be subject to relocation by page_relocate(), + * provided that the HAT supports it; if this is so, segkmem_reloc + * will be set to a nonzero value. All boot time allocated memory as + * well as static memory is considered off limits to relocation. + * Pages are "relocatable" if p_state does not have P_NORELOC set, so + * we request P_NORELOC pages for memory that isn't safe to relocate. + * + * The kernel heap is logically divided up into four pieces: + * + * heap32_arena is for allocations that require 32-bit absolute + * virtual addresses (e.g. code that uses 32-bit pointers/offsets). + * + * heap_core is for allocations that require 2GB *relative* + * offsets; in other words all memory from heap_core is within + * 2GB of all other memory from the same arena. This is a requirement + * of the addressing modes of some processors in supervisor code. + * + * heap_arena is the general heap arena. + * + * static_arena is the static memory arena. Allocations from it + * are not subject to relocation so it is safe to use the memory + * physical address as well as the virtual address (e.g. the VA to + * PA translations are static). Caches may import from static_arena; + * all other static memory allocations should use static_alloc_arena. + * + * On some platforms which have limited virtual address space, seg_kmem + * may share [kernelheap, ekernelheap) with seg_kp; if this is so, + * segkp_bitmap is non-NULL, and each bit represents a page of virtual + * address space which is actually seg_kp mapped. + */ + +/* + * Rough stubbed Port for XNU. + * + * Copyright (c) 2014 Brendon Humphrey (brendon.humphrey@mac.com) + */ + + +#ifdef _KERNEL +#define XNU_KERNEL_PRIVATE +//#include + +#include + +//extern vm_map_t kernel_map; + +/* + * These extern prototypes has to be carefully checked against XNU source + * in case Apple changes them. They are not defined in the "allowed" parts + * of the kernel.framework + */ +typedef uint8_t vm_tag_t; + +/* + * Tag we use to identify memory we have allocated + * + * (VM_KERN_MEMORY_KEXT - mach_vm_statistics.h) + */ +#define SPL_TAG 6 + +/* + * In kernel lowlevel form of malloc. 
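+ *
+ * On this port the pair is osif_malloc()/osif_free() below, thin
+ * wrappers around ExAllocatePoolWithTag()/ExFreePoolWithTag().  Unlike
+ * user-level free(), the caller must pass the allocation size back so
+ * the byte counters stay balanced, e.g. (sketch):
+ *
+ *	void *p = osif_malloc(size);
+ *	if (p != NULL) {
+ *		...
+ *		osif_free(p, size);
+ *	}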
+ */ + +/* + * Free memory + */ + +#endif /* _KERNEL */ + +typedef int page_t; + +void *segkmem_alloc(vmem_t *vmp, size_t size, int vmflag); +void segkmem_free(vmem_t *vmp, void *inaddr, size_t size); + + +uint64_t segkmem_total_mem_allocated = 0; /* Total memory held allocated */ +vmem_t *heap_arena; /* primary kernel heap arena */ +vmem_t *zio_arena_parent; /* qcaches for zio and abd arenas */ +vmem_t *zio_arena; /* arena for allocating file data */ +vmem_t *zio_metadata_arena; /* and for allocation of zfs metadata */ + +#ifdef _KERNEL +extern uint64_t total_memory; +uint64_t stat_osif_malloc_success = 0; +uint64_t stat_osif_free = 0; +uint64_t stat_osif_malloc_bytes = 0; +uint64_t stat_osif_free_bytes = 0; +#endif + +void * +osif_malloc(uint64_t size) +{ +#ifdef _KERNEL + + void *tr; + //kern_return_t kr = kernel_memory_allocate(kernel_map, + // &tr, size, PAGESIZE, 0, SPL_TAG); + tr = ExAllocatePoolWithTag(NonPagedPoolNx, size, '!SFZ'); + ASSERT(P2PHASE(tr, PAGE_SIZE) == 0); + if (tr != NULL) { + atomic_inc_64(&stat_osif_malloc_success); + atomic_add_64(&segkmem_total_mem_allocated, size); + atomic_add_64(&stat_osif_malloc_bytes, size); + return(tr); + } else { + dprintf("%s: ExAllocatePoolWithTag failed (memusage: %llu)\n", __func__, segkmem_total_mem_allocated); + ASSERT(0); + extern volatile unsigned int vm_page_free_wanted; + extern volatile unsigned int vm_page_free_min; + spl_free_set_pressure(vm_page_free_min); + vm_page_free_wanted = vm_page_free_min; + return(NULL); + } +#else + return(malloc(size)); +#endif +} + +void +osif_free(void* buf, uint64_t size) +{ +#ifdef _KERNEL + //kmem_free(kernel_map, buf, size); + ExFreePoolWithTag(buf, '!SFZ'); + atomic_inc_64(&stat_osif_free); + atomic_sub_64(&segkmem_total_mem_allocated, size); + atomic_add_64(&stat_osif_free_bytes, size); +#else + free(buf); +#endif /* _KERNEL */ +} + +/* + * Configure vmem, such that the heap arena is fed, + * and drains to the kernel low level allocator. + */ + extern vmem_t *vmem_init(const char *, void *, uint32_t, uint32_t, + vmem_alloc_t *, vmem_free_t *); + +void +kernelheap_init() +{ + heap_arena = vmem_init("heap", NULL, 0, PAGESIZE, + (vmem_alloc_t *)segkmem_alloc, (vmem_free_t *)segkmem_free); +} + + +void kernelheap_fini(void) +{ + vmem_fini(heap_arena); +} + +void * +segkmem_alloc(vmem_t * vmp, size_t size, int maybe_unmasked_vmflag) +{ + return(osif_malloc(size)); +} + +void +segkmem_free(vmem_t *vmp, void *inaddr, size_t size) +{ + osif_free(inaddr, size); + //since this is mainly called by spl_root_arena and free_arena, + //do we really want to wake up a waiter, just because we have + //transferred from one to the other? + //we already have vmem_add_a_gibibyte waking up waiters + //so specializing here seems wasteful + //(originally included in vmem_experiments) + //cv_signal(&vmp->vm_cv); +} + +/* + * OSX does not use separate heaps for the ZIO buffers, + * the ZFS code is structured such that the zio caches will + * fallback to using the kmem_default arena same + * as all the other caches. + */ +// smd: we nevertheless plumb in an arena with heap as parent, so that +// we can track stats and maintain the VM_ / qc settings differently +void +segkmem_zio_init() +{ + + // note: from startup.c and vm_machparam: SEGZIOMINSIZE = 512M. + // and SEGZSIOMAXSIZE = 512G; if physmem is between the two, then + // segziosize is (physmem - SEGZIOMAXSIZE) / 2. + + // Illumos does not segregate zio_metadata_arena out of heap, + // almost exclusively for reasons involving panic dump data + // retention. 
However, parenting zio_metadata_arena to + // spl_root_arena and giving it its own qcaches provides better + // kstat observability *and* noticeably better performance in + // realworld (zfs/dmu) metadata-heavy activity. Additionally, + // the qcaches pester spl_heap_arena only for slabs 256k and bigger, + // and each of the qcache entries (powers of two from PAGESIZE to + // 64k) are *exact-fit* and therefore dramatically reduce internal + // fragmentation and more than pay off for the extra code and (tiny) + // extra data for holding the arenas' segment tables. + + extern vmem_t *spl_heap_arena; + + zio_arena_parent = vmem_create("zfs_qcache", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, spl_heap_arena, + 16 * 1024, VM_SLEEP | VMC_TIMEFREE); + + ASSERT(zio_arena_parent != NULL); + + zio_arena = vmem_create("zfs_file_data", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, zio_arena_parent, + 0, VM_SLEEP); + + zio_metadata_arena = vmem_create("zfs_metadata", NULL, 0, + PAGESIZE, vmem_alloc, vmem_free, zio_arena_parent, + 0, VM_SLEEP); + + ASSERT(zio_arena != NULL); + ASSERT(zio_metadata_arena != NULL); + + extern void spl_zio_no_grow_init(void); + spl_zio_no_grow_init(); +} + +void +segkmem_zio_fini(void) +{ + if (zio_arena) { + vmem_destroy(zio_arena); + } + if (zio_metadata_arena) { + vmem_destroy(zio_metadata_arena); + } + if (zio_arena_parent) { + vmem_destroy(zio_arena_parent); + } +} diff --git a/module/os/windows/spl/spl-taskq.c b/module/os/windows/spl/spl-taskq.c new file mode 100644 index 000000000000..42e52aad1b5d --- /dev/null +++ b/module/os/windows/spl/spl-taskq.c @@ -0,0 +1,2296 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + */ + +/* + * Copyright (C) 2017 Jorgen Lundman + */ + +/* + * Kernel task queues: general-purpose asynchronous task scheduling. + * + * A common problem in kernel programming is the need to schedule tasks + * to be performed later, by another thread. There are several reasons + * you may want or need to do this: + * + * (1) The task isn't time-critical, but your current code path is. + * + * (2) The task may require grabbing locks that you already hold. + * + * (3) The task may need to block (e.g. to wait for memory), but you + * cannot block in your current context. + * + * (4) Your code path can't complete because of some condition, but you can't + * sleep or fail, so you queue the task for later execution when condition + * disappears. + * + * (5) You just want a simple way to launch multiple tasks in parallel. + * + * Task queues provide such a facility. 
In its simplest form (used when + * performance is not a critical consideration) a task queue consists of a + * single list of tasks, together with one or more threads to service the + * list. There are some cases when this simple queue is not sufficient: + * + * (1) The task queues are very hot and there is a need to avoid data and lock + * contention over global resources. + * + * (2) Some tasks may depend on other tasks to complete, so they can't be put in + * the same list managed by the same thread. + * + * (3) Some tasks may block for a long time, and this should not block other + * tasks in the queue. + * + * To provide useful service in such cases we define a "dynamic task queue" + * which has an individual thread for each of the tasks. These threads are + * dynamically created as they are needed and destroyed when they are not in + * use. The API for managing task pools is the same as for managing task queues + * with the exception of a taskq creation flag TASKQ_DYNAMIC which tells that + * dynamic task pool behavior is desired. + * + * Dynamic task queues may also place tasks in the normal queue (called "backing + * queue") when task pool runs out of resources. Users of task queues may + * disallow such queued scheduling by specifying TQ_NOQUEUE in the dispatch + * flags. + * + * The backing task queue is also used for scheduling internal tasks needed for + * dynamic task queue maintenance. + * + * INTERFACES ================================================================== + * + * taskq_t *taskq_create(name, nthreads, pri, minalloc, maxall, flags); + * + * Create a taskq with specified properties. + * Possible 'flags': + * + * TASKQ_DYNAMIC: Create task pool for task management. If this flag is + * specified, 'nthreads' specifies the maximum number of threads in + * the task queue. Task execution order for dynamic task queues is + * not predictable. + * + * If this flag is not specified (default case) a + * single-list task queue is created with 'nthreads' threads + * servicing it. Entries in this queue are managed by + * taskq_ent_alloc() and taskq_ent_free() which try to keep the + * task population between 'minalloc' and 'maxalloc', but the + * latter limit is only advisory for TQ_SLEEP dispatches and the + * former limit is only advisory for TQ_NOALLOC dispatches. If + * TASKQ_PREPOPULATE is set in 'flags', the taskq will be + * prepopulated with 'minalloc' task structures. + * + * Since non-DYNAMIC taskqs are queues, tasks are guaranteed to be + * executed in the order they are scheduled if nthreads == 1. + * If nthreads > 1, task execution order is not predictable. + * + * TASKQ_PREPOPULATE: Prepopulate task queue with threads. + * Also prepopulate the task queue with 'minalloc' task structures. + * + * TASKQ_THREADS_CPU_PCT: This flag specifies that 'nthreads' should be + * interpreted as a percentage of the # of online CPUs on the + * system. The taskq subsystem will automatically adjust the + * number of threads in the taskq in response to CPU online + * and offline events, to keep the ratio. nthreads must be in + * the range [0,100]. + * + * The calculation used is: + * + * MAX((ncpus_online * percentage)/100, 1) + * + * This flag is not supported for DYNAMIC task queues. + * This flag is not compatible with TASKQ_CPR_SAFE. + * + * TASKQ_CPR_SAFE: This flag specifies that users of the task queue will + * use their own protocol for handling CPR issues. This flag is not + * supported for DYNAMIC task queues. This flag is not compatible + * with TASKQ_THREADS_CPU_PCT. 
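+ *
+ *	As an illustrative example only (the numbers and the queue name
+ *	are hypothetical, not taken from this file): on a system with
+ *	8 CPUs online,
+ *
+ *	    tq = taskq_create("example_tq", 75, minclsyspri,
+ *	        minalloc, maxalloc, TASKQ_THREADS_CPU_PCT);
+ *
+ *	requests MAX((8 * 75) / 100, 1) = 6 threads, and that target is
+ *	re-evaluated whenever CPUs go online or offline.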
+ * + * The 'pri' field specifies the default priority for the threads that + * service all scheduled tasks. + * + * taskq_t *taskq_create_instance(name, instance, nthreads, pri, minalloc, + * maxall, flags); + * + * Like taskq_create(), but takes an instance number (or -1 to indicate + * no instance). + * + * taskq_t *taskq_create_proc(name, nthreads, pri, minalloc, maxall, proc, + * flags); + * + * Like taskq_create(), but creates the taskq threads in the specified + * system process. If proc != &p0, this must be called from a thread + * in that process. + * + * taskq_t *taskq_create_sysdc(name, nthreads, minalloc, maxall, proc, + * dc, flags); + * + * Like taskq_create_proc(), but the taskq threads will use the + * System Duty Cycle (SDC) scheduling class with a duty cycle of dc. + * + * void taskq_destroy(tap): + * + * Waits for any scheduled tasks to complete, then destroys the taskq. + * Caller should guarantee that no new tasks are scheduled in the closing + * taskq. + * + * taskqid_t taskq_dispatch(tq, func, arg, flags): + * + * Dispatches the task "func(arg)" to taskq. The 'flags' indicates whether + * the caller is willing to block for memory. The function returns an + * opaque value which is zero iff dispatch fails. If flags is TQ_NOSLEEP + * or TQ_NOALLOC and the task can't be dispatched, taskq_dispatch() fails + * and returns (taskqid_t)0. + * + * ASSUMES: func != NULL. + * + * Possible flags: + * TQ_NOSLEEP: Do not wait for resources; may fail. + * + * TQ_NOALLOC: Do not allocate memory; may fail. May only be used with + * non-dynamic task queues. + * + * TQ_NOQUEUE: Do not enqueue a task if it can't dispatch it due to + * lack of available resources and fail. If this flag is not + * set, and the task pool is exhausted, the task may be scheduled + * in the backing queue. This flag may ONLY be used with dynamic + * task queues. + * + * NOTE: This flag should always be used when a task queue is used + * for tasks that may depend on each other for completion. + * Enqueueing dependent tasks may create deadlocks. + * + * TQ_SLEEP: May block waiting for resources. May still fail for + * dynamic task queues if TQ_NOQUEUE is also specified, otherwise + * always succeed. + * + * TQ_FRONT: Puts the new task at the front of the queue. Be careful. + * + * NOTE: Dynamic task queues are much more likely to fail in + * taskq_dispatch() (especially if TQ_NOQUEUE was specified), so it + * is important to have backup strategies handling such failures. + * + * void taskq_dispatch_ent(tq, func, arg, flags, tqent) + * + * This is a light-weight form of taskq_dispatch(), that uses a + * preallocated taskq_ent_t structure for scheduling. As a + * result, it does not perform allocations and cannot ever fail. + * Note especially that it cannot be used with TASKQ_DYNAMIC + * taskqs. The memory for the tqent must not be modified or used + * until the function (func) is called. (However, func itself + * may safely modify or free this memory, once it is called.) + * Note that the taskq framework will NOT free this memory. + * + * void taskq_wait(tq): + * + * Waits for all previously scheduled tasks to complete. + * + * NOTE: It does not stop any new task dispatches. + * Do NOT call taskq_wait() from a task: it will cause deadlock. + * + * void taskq_suspend(tq) + * + * Suspend all task execution. Tasks already scheduled for a dynamic task + * queue will still be executed, but all new scheduled tasks will be + * suspended until taskq_resume() is called. 
+ * + * int taskq_suspended(tq) + * + * Returns 1 if taskq is suspended and 0 otherwise. It is intended to + * ASSERT that the task queue is suspended. + * + * void taskq_resume(tq) + * + * Resume task queue execution. + * + * int taskq_member(tq, thread) + * + * Returns 1 if 'thread' belongs to taskq 'tq' and 0 otherwise. The + * intended use is to ASSERT that a given function is called in taskq + * context only. + * + * system_taskq + * + * Global system-wide dynamic task queue for common uses. It may be used by + * any subsystem that needs to schedule tasks and does not need to manage + * its own task queues. It is initialized quite early during system boot. + * + * IMPLEMENTATION ============================================================== + * + * This is schematic representation of the task queue structures. + * + * taskq: + * +-------------+ + * | tq_lock | +---< taskq_ent_free() + * +-------------+ | + * |... | | tqent: tqent: + * +-------------+ | +------------+ +------------+ + * | tq_freelist |-->| tqent_next |--> ... ->| tqent_next | + * +-------------+ +------------+ +------------+ + * |... | | ... | | ... | + * +-------------+ +------------+ +------------+ + * | tq_task | | + * | | +-------------->taskq_ent_alloc() + * +--------------------------------------------------------------------------+ + * | | | tqent tqent | + * | +---------------------+ +--> +------------+ +--> +------------+ | + * | | ... | | | func, arg | | | func, arg | | + * +>+---------------------+ <---|-+ +------------+ <---|-+ +------------+ | + * | tq_taskq.tqent_next | ----+ | | tqent_next | --->+ | | tqent_next |--+ + * +---------------------+ | +------------+ ^ | +------------+ + * +-| tq_task.tqent_prev | +--| tqent_prev | | +--| tqent_prev | ^ + * | +---------------------+ +------------+ | +------------+ | + * | |... | | ... | | | ... | | + * | +---------------------+ +------------+ | +------------+ | + * | ^ | | + * | | | | + * +--------------------------------------+--------------+ TQ_APPEND() -+ + * | | | + * |... | taskq_thread()-----+ + * +-------------+ + * | tq_buckets |--+-------> [ NULL ] (for regular task queues) + * +-------------+ | + * | DYNAMIC TASK QUEUES: + * | + * +-> taskq_bucket[nCPU] taskq_bucket_dispatch() + * +-------------------+ ^ + * +--->| tqbucket_lock | | + * | +-------------------+ +--------+ +--------+ + * | | tqbucket_freelist |-->| tqent |-->...| tqent | ^ + * | +-------------------+<--+--------+<--...+--------+ | + * | | ... | | thread | | thread | | + * | +-------------------+ +--------+ +--------+ | + * | +-------------------+ | + * taskq_dispatch()--+--->| tqbucket_lock | TQ_APPEND()------+ + * TQ_HASH() | +-------------------+ +--------+ +--------+ + * | | tqbucket_freelist |-->| tqent |-->...| tqent | + * | +-------------------+<--+--------+<--...+--------+ + * | | ... | | thread | | thread | + * | +-------------------+ +--------+ +--------+ + * +---> ... + * + * + * Task queues use tq_task field to link new entry in the queue. The queue is a + * circular doubly-linked list. Entries are put in the end of the list with + * TQ_APPEND() and processed from the front of the list by taskq_thread() in + * FIFO order. Task queue entries are cached in the free list managed by + * taskq_ent_alloc() and taskq_ent_free() functions. + * + * All threads used by task queues mark t_taskq field of the thread to + * point to the task queue. 
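+ *
+ * To tie the INTERFACES above together, a minimal usage sketch follows.
+ * It is illustrative only; example_func, example_arg and "example_tq"
+ * are not defined anywhere in this file:
+ *
+ *	tq = taskq_create("example_tq", 4, minclsyspri, 50, INT_MAX,
+ *	    TASKQ_PREPOPULATE);
+ *	if (taskq_dispatch(tq, example_func, example_arg, TQ_NOSLEEP) == 0)
+ *		example_func(example_arg);
+ *	taskq_wait(tq);
+ *	taskq_destroy(tq);
+ *
+ * A failed TQ_NOSLEEP dispatch (return value 0) is handled by running
+ * the function inline; taskq_wait() then acts as a barrier before the
+ * queue is destroyed.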
+ * + * Taskq Thread Management ----------------------------------------------------- + * + * Taskq's non-dynamic threads are managed with several variables and flags: + * + * * tq_nthreads - The number of threads in taskq_thread() for the + * taskq. + * + * * tq_active - The number of threads not waiting on a CV in + * taskq_thread(); includes newly created threads + * not yet counted in tq_nthreads. + * + * * tq_nthreads_target + * - The number of threads desired for the taskq. + * + * * tq_flags & TASKQ_CHANGING + * - Indicates that tq_nthreads != tq_nthreads_target. + * + * * tq_flags & TASKQ_THREAD_CREATED + * - Indicates that a thread is being created in the taskq. + * + * During creation, tq_nthreads and tq_active are set to 0, and + * tq_nthreads_target is set to the number of threads desired. The + * TASKQ_CHANGING flag is set, and taskq_thread_create() is called to + * create the first thread. taskq_thread_create() increments tq_active, + * sets TASKQ_THREAD_CREATED, and creates the new thread. + * + * Each thread starts in taskq_thread(), clears the TASKQ_THREAD_CREATED + * flag, and increments tq_nthreads. It stores the new value of + * tq_nthreads as its "thread_id", and stores its thread pointer in the + * tq_threadlist at the (thread_id - 1). We keep the thread_id space + * densely packed by requiring that only the largest thread_id can exit during + * normal adjustment. The exception is during the destruction of the + * taskq; once tq_nthreads_target is set to zero, no new threads will be created + * for the taskq queue, so every thread can exit without any ordering being + * necessary. + * + * Threads will only process work if their thread id is <= tq_nthreads_target. + * + * When TASKQ_CHANGING is set, threads will check the current thread target + * whenever they wake up, and do whatever they can to apply its effects. + * + * TASKQ_THREAD_CPU_PCT -------------------------------------------------------- + * + * When a taskq is created with TASKQ_THREAD_CPU_PCT, we store their requested + * percentage in tq_threads_ncpus_pct, start them off with the correct thread + * target, and add them to the taskq_cpupct_list for later adjustment. + * + * We register taskq_cpu_setup() to be called whenever a CPU changes state. It + * walks the list of TASKQ_THREAD_CPU_PCT taskqs, adjusts their nthread_target + * if need be, and wakes up all of the threads to process the change. + * + * Dynamic Task Queues Implementation ------------------------------------------ + * + * For a dynamic task queues there is a 1-to-1 mapping between a thread and + * taskq_ent_structure. Each entry is serviced by its own thread and each thread + * is controlled by a single entry. + * + * Entries are distributed over a set of buckets. To avoid using modulo + * arithmetics the number of buckets is 2^n and is determined as the nearest + * power of two roundown of the number of CPUs in the system. Tunable + * variable 'taskq_maxbuckets' limits the maximum number of buckets. Each entry + * is attached to a bucket for its lifetime and can't migrate to other buckets. + * + * Entries that have scheduled tasks are not placed in any list. The dispatch + * function sets their "func" and "arg" fields and signals the corresponding + * thread to execute the task. Once the thread executes the task it clears the + * "func" field and places an entry on the bucket cache of free entries pointed + * by "tqbucket_freelist" field. ALL entries on the free list should have "func" + * field equal to NULL. 
The free list is a circular doubly-linked list identical + * in structure to the tq_task list above, but entries are taken from it in LIFO + * order - the last freed entry is the first to be allocated. The + * taskq_bucket_dispatch() function gets the most recently used entry from the + * free list, sets its "func" and "arg" fields and signals a worker thread. + * + * After executing each task a per-entry thread taskq_d_thread() places its + * entry on the bucket free list and goes to a timed sleep. If it wakes up + * without getting new task it removes the entry from the free list and destroys + * itself. The thread sleep time is controlled by a tunable variable + * `taskq_thread_timeout'. + * + * There are various statistics kept in the bucket which allows for later + * analysis of taskq usage patterns. Also, a global copy of taskq creation and + * death statistics is kept in the global taskq data structure. Since thread + * creation and death happen rarely, updating such global data does not present + * a performance problem. + * + * NOTE: Threads are not bound to any CPU and there is absolutely no association + * between the bucket and actual thread CPU, so buckets are used only to + * split resources and reduce resource contention. Having threads attached + * to the CPU denoted by a bucket may reduce number of times the job + * switches between CPUs. + * + * Current algorithm creates a thread whenever a bucket has no free + * entries. It would be nice to know how many threads are in the running + * state and don't create threads if all CPUs are busy with existing + * tasks, but it is unclear how such strategy can be implemented. + * + * Currently buckets are created statically as an array attached to task + * queue. On some system with nCPUs < max_ncpus it may waste system + * memory. One solution may be allocation of buckets when they are first + * touched, but it is not clear how useful it is. + * + * SUSPEND/RESUME implementation ----------------------------------------------- + * + * Before executing a task taskq_thread() (executing non-dynamic task + * queues) obtains taskq's thread lock as a reader. The taskq_suspend() + * function gets the same lock as a writer blocking all non-dynamic task + * execution. The taskq_resume() function releases the lock allowing + * taskq_thread to continue execution. + * + * For dynamic task queues, each bucket is marked as TQBUCKET_SUSPEND by + * taskq_suspend() function. After that taskq_bucket_dispatch() always + * fails, so that taskq_dispatch() will either enqueue tasks for a + * suspended backing queue or fail if TQ_NOQUEUE is specified in dispatch + * flags. + * + * NOTE: taskq_suspend() does not immediately block any tasks already + * scheduled for dynamic task queues. It only suspends new tasks + * scheduled after taskq_suspend() was called. + * + * taskq_member() function works by comparing a thread t_taskq pointer with + * the passed thread pointer. + * + * LOCKS and LOCK Hierarchy ---------------------------------------------------- + * + * There are three locks used in task queues: + * + * 1) The taskq_t's tq_lock, protecting global task queue state. + * + * 2) Each per-CPU bucket has a lock for bucket management. + * + * 3) The global taskq_cpupct_lock, which protects the list of + * TASKQ_THREADS_CPU_PCT taskqs. + * + * If both (1) and (2) are needed, tq_lock should be taken *after* the bucket + * lock. + * + * If both (1) and (3) are needed, tq_lock should be taken *after* + * taskq_cpupct_lock. 
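+ *
+ * For example (illustrative only), code that needs both a bucket lock
+ * and tq_lock is expected to acquire them in that order:
+ *
+ *	mutex_enter(&b->tqbucket_lock);
+ *	mutex_enter(&tq->tq_lock);
+ *	...
+ *	mutex_exit(&tq->tq_lock);
+ *	mutex_exit(&b->tqbucket_lock);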
+ * + * DEBUG FACILITIES ------------------------------------------------------------ + * + * For DEBUG kernels it is possible to induce random failures to + * taskq_dispatch() function when it is given TQ_NOSLEEP argument. The value of + * taskq_dmtbf and taskq_smtbf tunables control the mean time between induced + * failures for dynamic and static task queues respectively. + * + * Setting TASKQ_STATISTIC to 0 will disable per-bucket statistics. + * + * TUNABLES -------------------------------------------------------------------- + * + * system_taskq_size - Size of the global system_taskq. + * This value is multiplied by nCPUs to determine + * actual size. + * Default value: 64 + * + * taskq_minimum_nthreads_max + * - Minimum size of the thread list for a taskq. + * Useful for testing different thread pool + * sizes by overwriting tq_nthreads_target. + * + * taskq_thread_timeout - Maximum idle time for taskq_d_thread() + * Default value: 5 minutes + * + * taskq_maxbuckets - Maximum number of buckets in any task queue + * Default value: 128 + * + * taskq_search_depth - Maximum # of buckets searched for a free entry + * Default value: 4 + * + * taskq_dmtbf - Mean time between induced dispatch failures + * for dynamic task queues. + * Default value: UINT_MAX (no induced failures) + * + * taskq_smtbf - Mean time between induced dispatch failures + * for static task queues. + * Default value: UINT_MAX (no induced failures) + * + * CONDITIONAL compilation ----------------------------------------------------- + * + * TASKQ_STATISTIC - If set will enable bucket statistic (default). + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For throttlefree */ +#include +#include +#include +#include +#include +#include + +#include + +static kmem_cache_t *taskq_ent_cache, *taskq_cache; + +/* + * Pseudo instance numbers for taskqs without explicitly provided instance. + */ +static vmem_t *taskq_id_arena; + +/* Global system task queue for common use */ +taskq_t *system_taskq = NULL; + +/* + * Maximum number of entries in global system taskq is + * system_taskq_size * max_ncpus + */ +#define SYSTEM_TASKQ_SIZE 64 +int system_taskq_size = SYSTEM_TASKQ_SIZE; + +/* + * Minimum size for tq_nthreads_max; useful for those who want to play around + * with increasing a taskq's tq_nthreads_target. + */ +int taskq_minimum_nthreads_max = 1; + +/* + * We want to ensure that when taskq_create() returns, there is at least + * one thread ready to handle requests. To guarantee this, we have to wait + * for the second thread, since the first one cannot process requests until + * the second thread has been created. + */ +#define TASKQ_CREATE_ACTIVE_THREADS 2 + +/* Maximum percentage allowed for TASKQ_THREADS_CPU_PCT */ +#define TASKQ_CPUPCT_MAX_PERCENT 1000 +int taskq_cpupct_max_percent = TASKQ_CPUPCT_MAX_PERCENT; + +/* + * Dynamic task queue threads that don't get any work within + * taskq_thread_timeout destroy themselves + */ +#define TASKQ_THREAD_TIMEOUT (60 * 5) +int taskq_thread_timeout = TASKQ_THREAD_TIMEOUT; + +#define TASKQ_MAXBUCKETS 128 +int taskq_maxbuckets = TASKQ_MAXBUCKETS; + +/* + * When a bucket has no available entries another buckets are tried. + * taskq_search_depth parameter limits the amount of buckets that we search + * before failing. This is mostly useful in systems with many CPUs where we may + * spend too much time scanning busy buckets. 
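+ *
+ * For example, with the default taskq_search_depth of 4, a failed
+ * dispatch probes at most MIN(4, nbuckets) further buckets beyond the
+ * hashed one before giving up and falling back to extending the
+ * original bucket and/or using the backing queue.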
+ */ +#define TASKQ_SEARCH_DEPTH 4 +int taskq_search_depth = TASKQ_SEARCH_DEPTH; + +/* + * Hashing function: mix various bits of x. May be pretty much anything. + */ +#define TQ_HASH(x) ((x) ^ ((x) >> 11) ^ ((x) >> 17) ^ ((x) ^ 27)) + +/* + * We do not create any new threads when the system is low on memory and start + * throttling memory allocations. The following macro tries to estimate such + * condition. + */ +#define ENOUGH_MEMORY() (spl_vm_pool_low()) + +/* + * Static functions. + */ +static taskq_t *taskq_create_common(const char *, int, int, pri_t, int, + int, proc_t *, uint_t, uint_t); +static void taskq_thread(void *); +static void taskq_d_thread(taskq_ent_t *); +static void taskq_bucket_extend(void *); +static int taskq_constructor(void *, void *, int); +static void taskq_destructor(void *, void *); +static int taskq_ent_constructor(void *, void *, int); +static void taskq_ent_destructor(void *, void *); +static taskq_ent_t *taskq_ent_alloc(taskq_t *, int); +static void taskq_ent_free(taskq_t *, taskq_ent_t *); +static int taskq_ent_exists(taskq_t *, task_func_t, void *); +static taskq_ent_t *taskq_bucket_dispatch(taskq_bucket_t *, task_func_t, + void *); + +/* + * Task queues kstats. + */ +struct taskq_kstat { + kstat_named_t tq_pid; + kstat_named_t tq_tasks; + kstat_named_t tq_executed; + kstat_named_t tq_maxtasks; + kstat_named_t tq_totaltime; + kstat_named_t tq_nalloc; + kstat_named_t tq_nactive; + kstat_named_t tq_pri; + kstat_named_t tq_nthreads; +} taskq_kstat = { + { "pid", KSTAT_DATA_UINT64 }, + { "tasks", KSTAT_DATA_UINT64 }, + { "executed", KSTAT_DATA_UINT64 }, + { "maxtasks", KSTAT_DATA_UINT64 }, + { "totaltime", KSTAT_DATA_UINT64 }, + { "nactive", KSTAT_DATA_UINT64 }, + { "nalloc", KSTAT_DATA_UINT64 }, + { "priority", KSTAT_DATA_UINT64 }, + { "threads", KSTAT_DATA_UINT64 }, +}; + +struct taskq_d_kstat { + kstat_named_t tqd_pri; + kstat_named_t tqd_btasks; + kstat_named_t tqd_bexecuted; + kstat_named_t tqd_bmaxtasks; + kstat_named_t tqd_bnalloc; + kstat_named_t tqd_bnactive; + kstat_named_t tqd_btotaltime; + kstat_named_t tqd_hits; + kstat_named_t tqd_misses; + kstat_named_t tqd_overflows; + kstat_named_t tqd_tcreates; + kstat_named_t tqd_tdeaths; + kstat_named_t tqd_maxthreads; + kstat_named_t tqd_nomem; + kstat_named_t tqd_disptcreates; + kstat_named_t tqd_totaltime; + kstat_named_t tqd_nalloc; + kstat_named_t tqd_nfree; +} taskq_d_kstat = { + { "priority", KSTAT_DATA_UINT64 }, + { "btasks", KSTAT_DATA_UINT64 }, + { "bexecuted", KSTAT_DATA_UINT64 }, + { "bmaxtasks", KSTAT_DATA_UINT64 }, + { "bnalloc", KSTAT_DATA_UINT64 }, + { "bnactive", KSTAT_DATA_UINT64 }, + { "btotaltime", KSTAT_DATA_UINT64 }, + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "overflows", KSTAT_DATA_UINT64 }, + { "tcreates", KSTAT_DATA_UINT64 }, + { "tdeaths", KSTAT_DATA_UINT64 }, + { "maxthreads", KSTAT_DATA_UINT64 }, + { "nomem", KSTAT_DATA_UINT64 }, + { "disptcreates", KSTAT_DATA_UINT64 }, + { "totaltime", KSTAT_DATA_UINT64 }, + { "nalloc", KSTAT_DATA_UINT64 }, + { "nfree", KSTAT_DATA_UINT64 }, +}; + +static kmutex_t taskq_kstat_lock; +static kmutex_t taskq_d_kstat_lock; +static int taskq_kstat_update(kstat_t *, int); +static int taskq_d_kstat_update(kstat_t *, int); + +/* + * List of all TASKQ_THREADS_CPU_PCT taskqs. + */ +static list_t taskq_cpupct_list; /* protected by cpu_lock */ + +/* + * Collect per-bucket statistic when TASKQ_STATISTIC is defined. 
+ */ +#define TASKQ_STATISTIC 1 + +#if TASKQ_STATISTIC +#define TQ_STAT(b, x) b->tqbucket_stat.x++ +#else +#define TQ_STAT(b, x) +#endif + +/* + * Random fault injection. + */ +uint_t taskq_random; +uint_t taskq_dmtbf = UINT_MAX; /* mean time between injected failures */ +uint_t taskq_smtbf = UINT_MAX; /* mean time between injected failures */ + +/* + * TQ_NOSLEEP dispatches on dynamic task queues are always allowed to fail. + * + * TQ_NOSLEEP dispatches on static task queues can't arbitrarily fail because + * they could prepopulate the cache and make sure that they do not use more + * then minalloc entries. So, fault injection in this case insures that + * either TASKQ_PREPOPULATE is not set or there are more entries allocated + * than is specified by minalloc. TQ_NOALLOC dispatches are always allowed + * to fail, but for simplicity we treat them identically to TQ_NOSLEEP + * dispatches. + */ +#ifdef DEBUG +#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag) \ + taskq_random = (taskq_random * 2416 + 374441) % 1771875;\ + if ((flag & TQ_NOSLEEP) && \ + taskq_random < 1771875 / taskq_dmtbf) { \ + return (0); \ + } + +#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag) \ + taskq_random = (taskq_random * 2416 + 374441) % 1771875;\ + if ((flag & (TQ_NOSLEEP | TQ_NOALLOC)) && \ + (!(tq->tq_flags & TASKQ_PREPOPULATE) || \ + (tq->tq_nalloc > tq->tq_minalloc)) && \ + (taskq_random < (1771875 / taskq_smtbf))) { \ + mutex_exit(&tq->tq_lock); \ + return (0); \ + } +#else +#define TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flag) +#define TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flag) +#endif + +#define IS_EMPTY(l) (((l).tqent_prev == (l).tqent_next) && \ + ((l).tqent_prev == &(l))) + +/* + * Append `tqe' in the end of the doubly-linked list denoted by l. + */ +#define TQ_APPEND(l, tqe) { \ + tqe->tqent_next = &l; \ + tqe->tqent_prev = l.tqent_prev; \ + tqe->tqent_next->tqent_prev = tqe; \ + tqe->tqent_prev->tqent_next = tqe; \ +} +/* + * Prepend 'tqe' to the beginning of l + */ +#define TQ_PREPEND(l, tqe) { \ + tqe->tqent_next = l.tqent_next; \ + tqe->tqent_prev = &l; \ + tqe->tqent_next->tqent_prev = tqe; \ + tqe->tqent_prev->tqent_next = tqe; \ +} + +/* + * Schedule a task specified by func and arg into the task queue entry tqe. + */ +#define TQ_DO_ENQUEUE(tq, tqe, func, arg, front) { \ + ASSERT(MUTEX_HELD(&tq->tq_lock)); \ + _NOTE(CONSTCOND) \ + if (front) { \ + TQ_PREPEND(tq->tq_task, tqe); \ + } else { \ + TQ_APPEND(tq->tq_task, tqe); \ + } \ + tqe->tqent_func = (func); \ + tqe->tqent_arg = (arg); \ + tq->tq_tasks++; \ + if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks) \ + tq->tq_maxtasks = (int)(tq->tq_tasks - tq->tq_executed); \ + cv_signal(&tq->tq_dispatch_cv); \ + DTRACE_PROBE2(taskq__enqueue, taskq_t *, tq, taskq_ent_t *, tqe); \ +} + +#define TQ_ENQUEUE(tq, tqe, func, arg) \ + TQ_DO_ENQUEUE(tq, tqe, func, arg, 0) + +#define TQ_ENQUEUE_FRONT(tq, tqe, func, arg) \ + TQ_DO_ENQUEUE(tq, tqe, func, arg, 1) + +/* + * Do-nothing task which may be used to prepopulate thread caches. 
+ */ +/*ARGSUSED*/ +void +nulltask(void *unused) +{ +} + +/*ARGSUSED*/ +static int +taskq_constructor(void *buf, void *cdrarg, int kmflags) +{ + taskq_t *tq = buf; + + bzero(tq, sizeof (taskq_t)); + + mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); + cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_exit_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL); + + tq->tq_task.tqent_next = &tq->tq_task; + tq->tq_task.tqent_prev = &tq->tq_task; + + return (0); +} + +/*ARGSUSED*/ +static void +taskq_destructor(void *buf, void *cdrarg) +{ + taskq_t *tq = buf; + + ASSERT(tq->tq_nthreads == 0); + ASSERT(tq->tq_buckets == NULL); + ASSERT(tq->tq_tcreates == 0); + ASSERT(tq->tq_tdeaths == 0); + + mutex_destroy(&tq->tq_lock); + rw_destroy(&tq->tq_threadlock); + cv_destroy(&tq->tq_dispatch_cv); + cv_destroy(&tq->tq_exit_cv); + cv_destroy(&tq->tq_wait_cv); + cv_destroy(&tq->tq_maxalloc_cv); +} + +/*ARGSUSED*/ +static int +taskq_ent_constructor(void *buf, void *cdrarg, int kmflags) +{ + taskq_ent_t *tqe = buf; + + tqe->tqent_thread = NULL; + cv_init(&tqe->tqent_cv, NULL, CV_DEFAULT, NULL); + /* Simulate TS_STOPPED */ + mutex_init(&tqe->tqent_thread_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&tqe->tqent_thread_cv, NULL, CV_DEFAULT, NULL); + + return (0); +} + +/*ARGSUSED*/ +static void +taskq_ent_destructor(void *buf, void *cdrarg) +{ + taskq_ent_t *tqe = buf; + + ASSERT(tqe->tqent_thread == NULL); + cv_destroy(&tqe->tqent_cv); + /* See comment in taskq_d_thread(). */ + mutex_destroy(&tqe->tqent_thread_lock); + cv_destroy(&tqe->tqent_thread_cv); +} + +int +spl_taskq_init(void) +{ + taskq_ent_cache = kmem_cache_create("taskq_ent_cache", + sizeof (taskq_ent_t), 0, taskq_ent_constructor, + taskq_ent_destructor, NULL, NULL, NULL, 0); + taskq_cache = kmem_cache_create("taskq_cache", sizeof (taskq_t), + 0, taskq_constructor, taskq_destructor, NULL, NULL, NULL, 0); + taskq_id_arena = vmem_create("taskq_id_arena", + (void *)1, INT32_MAX, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + list_create(&taskq_cpupct_list, sizeof (taskq_t), + offsetof(taskq_t, tq_cpupct_link)); + + mutex_init(&taskq_kstat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&taskq_d_kstat_lock, NULL, MUTEX_DEFAULT, NULL); + + return 0; +} + +void +spl_taskq_fini(void) +{ + if (taskq_cache) { + kmem_cache_destroy(taskq_cache); + taskq_cache = NULL; + } + if (taskq_ent_cache) { + kmem_cache_destroy(taskq_ent_cache); + taskq_ent_cache = NULL; + } + + list_destroy(&taskq_cpupct_list); + + mutex_destroy(&taskq_d_kstat_lock); + mutex_destroy(&taskq_kstat_lock); + + vmem_destroy(taskq_id_arena); +} + + + + +static void +taskq_update_nthreads(taskq_t *tq, uint_t ncpus) +{ + uint_t newtarget = TASKQ_THREADS_PCT(ncpus, tq->tq_threads_ncpus_pct); + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + /* We must be going from non-zero to non-zero; no exiting. */ + ASSERT3U(tq->tq_nthreads_target, !=, 0); + ASSERT3U(newtarget, !=, 0); + + ASSERT3U(newtarget, <=, tq->tq_nthreads_max); + if (newtarget != tq->tq_nthreads_target) { + tq->tq_flags |= TASKQ_CHANGING; + tq->tq_nthreads_target = newtarget; + cv_broadcast(&tq->tq_dispatch_cv); + cv_broadcast(&tq->tq_exit_cv); + } +} + + +/* + * Create global system dynamic task queue. 
+ */ +void +system_taskq_init(void) +{ + system_taskq = taskq_create_common("system_taskq", 0, + system_taskq_size * max_ncpus, minclsyspri, 4, 512, &p0, 0, + TASKQ_DYNAMIC | TASKQ_PREPOPULATE); +} + + +void +system_taskq_fini(void) +{ + if (system_taskq) + taskq_destroy(system_taskq); + system_taskq = NULL; +} + +/* + * taskq_ent_alloc() + * + * Allocates a new taskq_ent_t structure either from the free list or from the + * cache. Returns NULL if it can't be allocated. + * + * Assumes: tq->tq_lock is held. + */ +static taskq_ent_t * +taskq_ent_alloc(taskq_t *tq, int flags) +{ + int kmflags = (flags & TQ_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + taskq_ent_t *tqe; + clock_t wait_time; + clock_t wait_rv; + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + /* + * TQ_NOALLOC allocations are allowed to use the freelist, even if + * we are below tq_minalloc. + */ +again: if ((tqe = tq->tq_freelist) != NULL && + ((flags & TQ_NOALLOC) || tq->tq_nalloc >= tq->tq_minalloc)) { + tq->tq_freelist = tqe->tqent_next; + } else { + if (flags & TQ_NOALLOC) + return (NULL); + + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (kmflags & KM_NOSLEEP) + return (NULL); + + /* + * We don't want to exceed tq_maxalloc, but we can't + * wait for other tasks to complete (and thus free up + * task structures) without risking deadlock with + * the caller. So, we just delay for one second + * to throttle the allocation rate. If we have tasks + * complete before one second timeout expires then + * taskq_ent_free will signal us and we will + * immediately retry the allocation (reap free). + */ + wait_time = ddi_get_lbolt() + hz; + while (tq->tq_freelist == NULL) { + tq->tq_maxalloc_wait++; + wait_rv = cv_timedwait(&tq->tq_maxalloc_cv, + &tq->tq_lock, wait_time); + tq->tq_maxalloc_wait--; + if (wait_rv == -1) + break; + } + if (tq->tq_freelist) + goto again; /* reap freelist */ + + } + mutex_exit(&tq->tq_lock); + + tqe = kmem_cache_alloc(taskq_ent_cache, kmflags); + + mutex_enter(&tq->tq_lock); + if (tqe != NULL) + tq->tq_nalloc++; + } + return (tqe); +} + +/* + * taskq_ent_free() + * + * Free taskq_ent_t structure by either putting it on the free list or freeing + * it to the cache. + * + * Assumes: tq->tq_lock is held. + */ +static void +taskq_ent_free(taskq_t *tq, taskq_ent_t *tqe) +{ + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + if (tq->tq_nalloc <= tq->tq_minalloc) { + tqe->tqent_next = tq->tq_freelist; + tq->tq_freelist = tqe; + } else { + tq->tq_nalloc--; + mutex_exit(&tq->tq_lock); + kmem_cache_free(taskq_ent_cache, tqe); + mutex_enter(&tq->tq_lock); + } + + if (tq->tq_maxalloc_wait) + cv_signal(&tq->tq_maxalloc_cv); +} + +/* + * taskq_ent_exists() + * + * Return 1 if taskq already has entry for calling 'func(arg)'. + * + * Assumes: tq->tq_lock is held. + */ +static int +taskq_ent_exists(taskq_t *tq, task_func_t func, void *arg) +{ + taskq_ent_t *tqe; + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + + for (tqe = tq->tq_task.tqent_next; tqe != &tq->tq_task; + tqe = tqe->tqent_next) + if ((tqe->tqent_func == func) && (tqe->tqent_arg == arg)) + return (1); + return (0); +} + +/* + * Dispatch a task "func(arg)" to a free entry of bucket b. + * + * Assumes: no bucket locks is held. + * + * Returns: a pointer to an entry if dispatch was successful. + * NULL if there are no free entries or if the bucket is suspended. 
+ */ +static taskq_ent_t * +taskq_bucket_dispatch(taskq_bucket_t *b, task_func_t func, void *arg) +{ + taskq_ent_t *tqe; + + ASSERT(MUTEX_NOT_HELD(&b->tqbucket_lock)); + ASSERT(func != NULL); + + mutex_enter(&b->tqbucket_lock); + + ASSERT(b->tqbucket_nfree != 0 || IS_EMPTY(b->tqbucket_freelist)); + ASSERT(b->tqbucket_nfree == 0 || !IS_EMPTY(b->tqbucket_freelist)); + + /* + * Get en entry from the freelist if there is one. + * Schedule task into the entry. + */ + if ((b->tqbucket_nfree != 0) && + !(b->tqbucket_flags & TQBUCKET_SUSPEND)) { + tqe = b->tqbucket_freelist.tqent_prev; + + ASSERT(tqe != &b->tqbucket_freelist); + ASSERT(tqe->tqent_thread != NULL); + + tqe->tqent_prev->tqent_next = tqe->tqent_next; + tqe->tqent_next->tqent_prev = tqe->tqent_prev; + b->tqbucket_nalloc++; + b->tqbucket_nfree--; + tqe->tqent_func = func; + tqe->tqent_arg = arg; + TQ_STAT(b, tqs_hits); + cv_signal(&tqe->tqent_cv); + DTRACE_PROBE2(taskq__d__enqueue, taskq_bucket_t *, b, + taskq_ent_t *, tqe); + } else { + tqe = NULL; + TQ_STAT(b, tqs_misses); + } + mutex_exit(&b->tqbucket_lock); + return (tqe); +} + +/* + * Dispatch a task. + * + * Assumes: func != NULL + * + * Returns: NULL if dispatch failed. + * non-NULL if task dispatched successfully. + * Actual return value is the pointer to taskq entry that was used to + * dispatch a task. This is useful for debugging. + */ +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_bucket_t *bucket = NULL; /* Which bucket needs extension */ + taskq_ent_t *tqe = NULL; + taskq_ent_t *tqe1; + uint_t bsize; + + ASSERT(tq != NULL); + ASSERT(func != NULL); + + if (!(tq->tq_flags & TASKQ_DYNAMIC)) { + /* + * TQ_NOQUEUE flag can't be used with non-dynamic task queues. + */ + ASSERT(!(flags & TQ_NOQUEUE)); + /* + * Enqueue the task to the underlying queue. + */ + mutex_enter(&tq->tq_lock); + + TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flags); + + if ((tqe = taskq_ent_alloc(tq, flags)) == NULL) { + mutex_exit(&tq->tq_lock); + return (0); + } + /* Make sure we start without any flags */ + tqe->tqent_un.tqent_flags = 0; + + if (flags & TQ_FRONT) { + TQ_ENQUEUE_FRONT(tq, tqe, func, arg); + } else { + TQ_ENQUEUE(tq, tqe, func, arg); + } + mutex_exit(&tq->tq_lock); + return ((taskqid_t)tqe); + } + + /* + * Dynamic taskq dispatching. + */ + ASSERT(!(flags & (TQ_NOALLOC | TQ_FRONT))); + TASKQ_D_RANDOM_DISPATCH_FAILURE(tq, flags); + + bsize = tq->tq_nbuckets; + + if (bsize == 1) { + /* + * In a single-CPU case there is only one bucket, so get + * entry directly from there. + */ + if ((tqe = taskq_bucket_dispatch(tq->tq_buckets, func, arg)) + != NULL) + return ((taskqid_t)tqe); /* Fastpath */ + bucket = tq->tq_buckets; + } else { + int loopcount; + taskq_bucket_t *b; + //uintptr_t h = ((uintptr_t)CPU + (uintptr_t)arg) >> 3; + uintptr_t h = ((uintptr_t)(cpu_number()<<3) + (uintptr_t)arg) >> 3; + + h = TQ_HASH(h); + + /* + * The 'bucket' points to the original bucket that we hit. If we + * can't allocate from it, we search other buckets, but only + * extend this one. + */ + b = &tq->tq_buckets[h & (bsize - 1)]; + ASSERT(b->tqbucket_taskq == tq); /* Sanity check */ + + /* + * Do a quick check before grabbing the lock. If the bucket does + * not have free entries now, chances are very small that it + * will after we take the lock, so we just skip it. 
+ */ + if (b->tqbucket_nfree != 0) { + if ((tqe = taskq_bucket_dispatch(b, func, arg)) != NULL) + return ((taskqid_t)tqe); /* Fastpath */ + } else { + TQ_STAT(b, tqs_misses); + } + + bucket = b; + loopcount = MIN(taskq_search_depth, bsize); + /* + * If bucket dispatch failed, search loopcount number of buckets + * before we give up and fail. + */ + do { + b = &tq->tq_buckets[++h & (bsize - 1)]; + ASSERT(b->tqbucket_taskq == tq); /* Sanity check */ + loopcount--; + + if (b->tqbucket_nfree != 0) { + tqe = taskq_bucket_dispatch(b, func, arg); + } else { + TQ_STAT(b, tqs_misses); + } + } while ((tqe == NULL) && (loopcount > 0)); + } + + /* + * At this point we either scheduled a task and (tqe != NULL) or failed + * (tqe == NULL). Try to recover from fails. + */ + + /* + * For KM_SLEEP dispatches, try to extend the bucket and retry dispatch. + */ + if ((tqe == NULL) && !(flags & TQ_NOSLEEP)) { + /* + * taskq_bucket_extend() may fail to do anything, but this is + * fine - we deal with it later. If the bucket was successfully + * extended, there is a good chance that taskq_bucket_dispatch() + * will get this new entry, unless someone is racing with us and + * stealing the new entry from under our nose. + * taskq_bucket_extend() may sleep. + */ + taskq_bucket_extend(bucket); + TQ_STAT(bucket, tqs_disptcreates); + if ((tqe = taskq_bucket_dispatch(bucket, func, arg)) != NULL) + return ((taskqid_t)tqe); + } + + ASSERT(bucket != NULL); + + /* + * Since there are not enough free entries in the bucket, add a + * taskq entry to extend it in the background using backing queue + * (unless we already have a taskq entry to perform that extension). + */ + mutex_enter(&tq->tq_lock); + if (!taskq_ent_exists(tq, taskq_bucket_extend, bucket)) { + if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) { + TQ_ENQUEUE_FRONT(tq, tqe1, taskq_bucket_extend, bucket); + } else { + TQ_STAT(bucket, tqs_nomem); + } + } + + /* + * Dispatch failed and we can't find an entry to schedule a task. + * Revert to the backing queue unless TQ_NOQUEUE was asked. + */ + if ((tqe == NULL) && !(flags & TQ_NOQUEUE)) { + if ((tqe = taskq_ent_alloc(tq, flags)) != NULL) { + TQ_ENQUEUE(tq, tqe, func, arg); + } else { + TQ_STAT(bucket, tqs_nomem); + } + } + mutex_exit(&tq->tq_lock); + + return ((taskqid_t)tqe); +} + +/* + * FIXME, Linux has added the ability to start taskq with a given + * delay. + */ +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + return taskq_dispatch(tq, func, arg, flags); +} + +void +taskq_init_ent(taskq_ent_t *t) +{ + memset(t, 0, sizeof(*t)); +} + + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *tqe) +{ + ASSERT(func != NULL); + ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. + */ + tqe->tqent_un.tqent_flags |= TQENT_FLAG_PREALLOC; + /* + * Enqueue the task to the underlying queue. + */ + mutex_enter(&tq->tq_lock); + + if (flags & TQ_FRONT) { + TQ_ENQUEUE_FRONT(tq, tqe, func, arg); + } else { + TQ_ENQUEUE(tq, tqe, func, arg); + } + mutex_exit(&tq->tq_lock); +} + +/* + * Wait for all pending tasks to complete. + * Calling taskq_wait from a task will cause deadlock. 
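+ *
+ * A typical (purely illustrative) barrier pattern, where example_func
+ * and example_args are not defined in this file:
+ *
+ *	for (i = 0; i < n; i++)
+ *		(void) taskq_dispatch(tq, example_func, &example_args[i],
+ *		    TQ_SLEEP);
+ *	taskq_wait(tq);
+ *
+ * After taskq_wait() returns, all n dispatched tasks have completed.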
+ */ +void +taskq_wait(taskq_t *tq) +{ + mutex_enter(&tq->tq_lock); + while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + mutex_exit(&tq->tq_lock); + + if (tq->tq_flags & TASKQ_DYNAMIC) { + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + mutex_enter(&b->tqbucket_lock); + while (b->tqbucket_nalloc > 0) + cv_wait(&b->tqbucket_cv, &b->tqbucket_lock); + mutex_exit(&b->tqbucket_lock); + } + } +} + +/* + * Suspend execution of tasks. + * + * Tasks in the queue part will be suspended immediately upon return from this + * function. Pending tasks in the dynamic part will continue to execute, but all + * new tasks will be suspended. + */ +void +taskq_suspend(taskq_t *tq) +{ + rw_enter(&tq->tq_threadlock, RW_WRITER); + + if (tq->tq_flags & TASKQ_DYNAMIC) { + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + mutex_enter(&b->tqbucket_lock); + b->tqbucket_flags |= TQBUCKET_SUSPEND; + mutex_exit(&b->tqbucket_lock); + } + } + /* + * Mark task queue as being suspended. Needed for taskq_suspended(). + */ + mutex_enter(&tq->tq_lock); + ASSERT(!(tq->tq_flags & TASKQ_SUSPENDED)); + tq->tq_flags |= TASKQ_SUSPENDED; + mutex_exit(&tq->tq_lock); +} + +/* + * returns: 1 if tq is suspended, 0 otherwise. + */ +int +taskq_suspended(taskq_t *tq) +{ + return ((tq->tq_flags & TASKQ_SUSPENDED) != 0); +} + +/* + * Resume taskq execution. + */ +void +taskq_resume(taskq_t *tq) +{ + ASSERT(RW_WRITE_HELD(&tq->tq_threadlock)); + + if (tq->tq_flags & TASKQ_DYNAMIC) { + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + mutex_enter(&b->tqbucket_lock); + b->tqbucket_flags &= ~TQBUCKET_SUSPEND; + mutex_exit(&b->tqbucket_lock); + } + } + mutex_enter(&tq->tq_lock); + ASSERT(tq->tq_flags & TASKQ_SUSPENDED); + tq->tq_flags &= ~TASKQ_SUSPENDED; + mutex_exit(&tq->tq_lock); + + rw_exit(&tq->tq_threadlock); +} + +int +taskq_member(taskq_t *tq, struct kthread *thread) +{ + int i; + + mutex_enter(&tq->tq_lock); + if (tq->tq_thread != NULL) /* nthreads==1 case */ + if (tq->tq_thread == (void *)thread) { + mutex_exit(&tq->tq_lock); + return 1; + } + + for (i = 0;i < tq->tq_nthreads; i++) + if (tq->tq_threadlist[i] == (void *)thread) { + mutex_exit(&tq->tq_lock); + return (1); + } + mutex_exit(&tq->tq_lock); + return (0); +} + +/* + * Creates a thread in the taskq. We only allow one outstanding create at + * a time. We drop and reacquire the tq_lock in order to avoid blocking other + * taskq activity while thread_create() or lwp_kernel_create() run. + * + * The first time we're called, we do some additional setup, and do not + * return until there are enough threads to start servicing requests. + */ +static void +taskq_thread_create(taskq_t *tq) +{ + kthread_t *t; + const boolean_t first = (tq->tq_nthreads == 0); + + ASSERT(MUTEX_HELD(&tq->tq_lock)); + ASSERT(tq->tq_flags & TASKQ_CHANGING); + ASSERT(tq->tq_nthreads < tq->tq_nthreads_target); + ASSERT(!(tq->tq_flags & TASKQ_THREAD_CREATED)); + + + tq->tq_flags |= TASKQ_THREAD_CREATED; + tq->tq_active++; + mutex_exit(&tq->tq_lock); + + /* + * With TASKQ_DUTY_CYCLE the new thread must have an LWP + * as explained in ../disp/sysdc.c (for the msacct data). + * Otherwise simple kthreads are preferred. 
+ */ + if ((tq->tq_flags & TASKQ_DUTY_CYCLE) != 0) { + /* Enforced in taskq_create_common */ + dprintf("SPL: taskq_thread_create(TASKQ_DUTY_CYCLE) seen\n"); + } else { + t = thread_create(NULL, 0, taskq_thread, tq, 0, tq->tq_proc, + TS_RUN, tq->tq_pri); + } + + if (!first) { + mutex_enter(&tq->tq_lock); + return; + } + + /* + * We know the thread cannot go away, since tq cannot be + * destroyed until creation has completed. We can therefore + * safely dereference t. + */ + if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) { + mutex_enter(&tq->tq_lock); + taskq_update_nthreads(tq, max_ncpus); + mutex_exit(&tq->tq_lock); + } + mutex_enter(&tq->tq_lock); + + /* Wait until we can service requests. */ + while (tq->tq_nthreads != tq->tq_nthreads_target && + tq->tq_nthreads < TASKQ_CREATE_ACTIVE_THREADS) { + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + } + +} + +/* + * Common "sleep taskq thread" function, which handles CPR stuff, as well + * as giving a nice common point for debuggers to find inactive threads. + */ +static clock_t +taskq_thread_wait(taskq_t *tq, kmutex_t *mx, kcondvar_t *cv, + callb_cpr_t *cprinfo, clock_t timeout) +{ + clock_t ret = 0; + + if (!(tq->tq_flags & TASKQ_CPR_SAFE)) { + CALLB_CPR_SAFE_BEGIN(cprinfo); + } + if ((signed long)timeout < 0) + cv_wait(cv, mx); + else + ret = cv_reltimedwait(cv, mx, timeout, TR_CLOCK_TICK); + + if (!(tq->tq_flags & TASKQ_CPR_SAFE)) { + CALLB_CPR_SAFE_END(cprinfo, mx); + } + + return (ret); +} + +/* + * Worker thread for processing task queue. + */ +static void +taskq_thread(void *arg) +{ + int thread_id; + + taskq_t *tq = arg; + taskq_ent_t *tqe; + callb_cpr_t cprinfo; + hrtime_t start, end; + boolean_t freeit; + + CALLB_CPR_INIT(&cprinfo, &tq->tq_lock, callb_generic_cpr, + tq->tq_name); + + mutex_enter(&tq->tq_lock); + thread_id = ++tq->tq_nthreads; + ASSERT(tq->tq_flags & TASKQ_THREAD_CREATED); + ASSERT(tq->tq_flags & TASKQ_CHANGING); + tq->tq_flags &= ~TASKQ_THREAD_CREATED; + + VERIFY3S(thread_id, <=, tq->tq_nthreads_max); + + if (tq->tq_nthreads_max == 1) + tq->tq_thread = (kthread_t *)curthread; + else + tq->tq_threadlist[thread_id - 1] = (kthread_t *)curthread; + + /* Allow taskq_create_common()'s taskq_thread_create() to return. */ + if (tq->tq_nthreads == TASKQ_CREATE_ACTIVE_THREADS) + cv_broadcast(&tq->tq_wait_cv); + + for (;;) { + if (tq->tq_flags & TASKQ_CHANGING) { + /* See if we're no longer needed */ + if (thread_id > tq->tq_nthreads_target) { + /* + * To preserve the one-to-one mapping between + * thread_id and thread, we must exit from + * highest thread ID to least. + * + * However, if everyone is exiting, the order + * doesn't matter, so just exit immediately. + * (this is safe, since you must wait for + * nthreads to reach 0 after setting + * tq_nthreads_target to 0) + */ + if (thread_id == tq->tq_nthreads || + tq->tq_nthreads_target == 0) + break; + + /* Wait for higher thread_ids to exit */ + (void) taskq_thread_wait(tq, &tq->tq_lock, + &tq->tq_exit_cv, &cprinfo, -1); + continue; + } + + /* + * If no thread is starting taskq_thread(), we can + * do some bookkeeping. 
+ */ + if (!(tq->tq_flags & TASKQ_THREAD_CREATED)) { + /* Check if we've reached our target */ + if (tq->tq_nthreads == tq->tq_nthreads_target) { + tq->tq_flags &= ~TASKQ_CHANGING; + cv_broadcast(&tq->tq_wait_cv); + } + /* Check if we need to create a thread */ + if (tq->tq_nthreads < tq->tq_nthreads_target) { + taskq_thread_create(tq); + continue; /* tq_lock was dropped */ + } + } + } + if ((tqe = tq->tq_task.tqent_next) == &tq->tq_task) { + if (--tq->tq_active == 0) + cv_broadcast(&tq->tq_wait_cv); + (void) taskq_thread_wait(tq, &tq->tq_lock, + &tq->tq_dispatch_cv, &cprinfo, -1); + tq->tq_active++; + continue; + } + + tqe->tqent_prev->tqent_next = tqe->tqent_next; + tqe->tqent_next->tqent_prev = tqe->tqent_prev; + mutex_exit(&tq->tq_lock); + + /* + * For prealloc'd tasks, we don't free anything. We + * have to check this now, because once we call the + * function for a prealloc'd taskq, we can't touch the + * tqent any longer (calling the function returns the + * ownershp of the tqent back to caller of + * taskq_dispatch.) + */ + if ((!(tq->tq_flags & TASKQ_DYNAMIC)) && + (tqe->tqent_un.tqent_flags & TQENT_FLAG_PREALLOC)) { + /* clear pointers to assist assertion checks */ + tqe->tqent_next = tqe->tqent_prev = NULL; + freeit = B_FALSE; + } else { + freeit = B_TRUE; + } + + rw_enter(&tq->tq_threadlock, RW_READER); + start = gethrtime(); + DTRACE_PROBE2(taskq__exec__start, taskq_t *, tq, + taskq_ent_t *, tqe); + tqe->tqent_func(tqe->tqent_arg); + DTRACE_PROBE2(taskq__exec__end, taskq_t *, tq, + taskq_ent_t *, tqe); + end = gethrtime(); + rw_exit(&tq->tq_threadlock); + + mutex_enter(&tq->tq_lock); + tq->tq_totaltime += end - start; + tq->tq_executed++; + + if (freeit) + taskq_ent_free(tq, tqe); + } + + if (tq->tq_nthreads_max == 1) + tq->tq_thread = NULL; + else + tq->tq_threadlist[thread_id - 1] = NULL; + + /* We're exiting, and therefore no longer active */ + ASSERT(tq->tq_active > 0); + tq->tq_active--; + + ASSERT(tq->tq_nthreads > 0); + tq->tq_nthreads--; + + /* Wake up anyone waiting for us to exit */ + cv_broadcast(&tq->tq_exit_cv); + if (tq->tq_nthreads == tq->tq_nthreads_target) { + if (!(tq->tq_flags & TASKQ_THREAD_CREATED)) + tq->tq_flags &= ~TASKQ_CHANGING; + + cv_broadcast(&tq->tq_wait_cv); + } + + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); +} + +/* + * Worker per-entry thread for dynamic dispatches. + */ +static void +taskq_d_thread(taskq_ent_t *tqe) +{ + taskq_bucket_t *bucket = tqe->tqent_un.tqent_bucket; + taskq_t *tq = bucket->tqbucket_taskq; + kmutex_t *lock = &bucket->tqbucket_lock; + kcondvar_t *cv = &tqe->tqent_cv; + callb_cpr_t cprinfo; + clock_t w; + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, tq->tq_name); + + /* + * There's no way in Mac OS X KPI to create a thread + * in a suspended state (TS_STOPPED). So instead we + * use tqent_thread as a flag and wait for it to get + * initialized. + */ + mutex_enter(&tqe->tqent_thread_lock); + while (tqe->tqent_thread == (kthread_t *)0xCEDEC0DE) + cv_wait(&tqe->tqent_thread_cv, &tqe->tqent_thread_lock); + mutex_exit(&tqe->tqent_thread_lock); + + mutex_enter(lock); + + for (;;) { + /* + * If a task is scheduled (func != NULL), execute it, otherwise + * sleep, waiting for a job. + */ + if (tqe->tqent_func != NULL) { + hrtime_t start; + hrtime_t end; + + ASSERT(bucket->tqbucket_nalloc > 0); + + /* + * It is possible to free the entry right away before + * actually executing the task so that subsequent + * dispatches may immediately reuse it. 
But this, + * effectively, creates a two-length queue in the entry + * and may lead to a deadlock if the execution of the + * current task depends on the execution of the next + * scheduled task. So, we keep the entry busy until the + * task is processed. + */ + + mutex_exit(lock); + start = gethrtime(); + DTRACE_PROBE3(taskq__d__exec__start, taskq_t *, tq, + taskq_bucket_t *, bucket, taskq_ent_t *, tqe); + tqe->tqent_func(tqe->tqent_arg); + DTRACE_PROBE3(taskq__d__exec__end, taskq_t *, tq, + taskq_bucket_t *, bucket, taskq_ent_t *, tqe); + end = gethrtime(); + mutex_enter(lock); + bucket->tqbucket_totaltime += end - start; + + /* + * Return the entry to the bucket free list. + */ + tqe->tqent_func = NULL; + TQ_APPEND(bucket->tqbucket_freelist, tqe); + bucket->tqbucket_nalloc--; + bucket->tqbucket_nfree++; + ASSERT(!IS_EMPTY(bucket->tqbucket_freelist)); + /* + * taskq_wait() waits for nalloc to drop to zero on + * tqbucket_cv. + */ + cv_signal(&bucket->tqbucket_cv); + } + + /* + * At this point the entry must be in the bucket free list - + * either because it was there initially or because it just + * finished executing a task and put itself on the free list. + */ + ASSERT(bucket->tqbucket_nfree > 0); + /* + * Go to sleep unless we are closing. + * If a thread is sleeping too long, it dies. + */ + if (! (bucket->tqbucket_flags & TQBUCKET_CLOSE)) { + w = taskq_thread_wait(tq, lock, cv, + &cprinfo, taskq_thread_timeout * hz); + } + + /* + * At this point we may be in two different states: + * + * (1) tqent_func is set which means that a new task is + * dispatched and we need to execute it. + * + * (2) Thread is sleeping for too long or we are closing. In + * both cases destroy the thread and the entry. + */ + + /* If func is NULL we should be on the freelist. */ + ASSERT((tqe->tqent_func != NULL) || + (bucket->tqbucket_nfree > 0)); + /* If func is non-NULL we should be allocated */ + ASSERT((tqe->tqent_func == NULL) || + (bucket->tqbucket_nalloc > 0)); + + /* Check freelist consistency */ + ASSERT((bucket->tqbucket_nfree > 0) || + IS_EMPTY(bucket->tqbucket_freelist)); + ASSERT((bucket->tqbucket_nfree == 0) || + !IS_EMPTY(bucket->tqbucket_freelist)); + + if ((tqe->tqent_func == NULL) && + ((w == -1) || (bucket->tqbucket_flags & TQBUCKET_CLOSE))) { + /* + * This thread is sleeping for too long or we are + * closing - time to die. + * Thread creation/destruction happens rarely, + * so grabbing the lock is not a big performance issue. + * The bucket lock is dropped by CALLB_CPR_EXIT(). + */ + + /* Remove the entry from the free list. */ + tqe->tqent_prev->tqent_next = tqe->tqent_next; + tqe->tqent_next->tqent_prev = tqe->tqent_prev; + ASSERT(bucket->tqbucket_nfree > 0); + bucket->tqbucket_nfree--; + + TQ_STAT(bucket, tqs_tdeaths); + cv_signal(&bucket->tqbucket_cv); + tqe->tqent_thread = NULL; + mutex_enter(&tq->tq_lock); + tq->tq_tdeaths++; + mutex_exit(&tq->tq_lock); + CALLB_CPR_EXIT(&cprinfo); +// DbgBreakPoint(); + kmem_cache_free(taskq_ent_cache, tqe); + thread_exit(); + } + } +} + + +/* + * Taskq creation. May sleep for memory. + * Always use automatically generated instances to avoid kstat name space + * collisions. + */ + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, int minalloc, + int maxalloc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + + return (taskq_create_common(name, 0, nthreads, pri, minalloc, + maxalloc, &p0, 0, flags | TASKQ_NOINSTANCE)); +} + +/* + * Create an instance of task queue. 
It is legal to create task queues with the + * same name and different instances. + * + * taskq_create_instance is used by ddi_taskq_create() where it gets the + * instance from ddi_get_instance(). In some cases the instance is not + * initialized and is set to -1. This case is handled as if no instance was + * passed at all. + */ +taskq_t * +taskq_create_instance(const char *name, int instance, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + ASSERT((instance >= 0) || (instance == -1)); + + if (instance < 0) { + flags |= TASKQ_NOINSTANCE; + } + + return (taskq_create_common(name, instance, nthreads, + pri, minalloc, maxalloc, &p0, 0, flags)); +} + +taskq_t * +taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc, + int maxalloc, proc_t *proc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + return (taskq_create_common(name, 0, nthreads, pri, minalloc, + maxalloc, proc, 0, flags | TASKQ_NOINSTANCE)); +} + +taskq_t * +taskq_create_sysdc(const char *name, int nthreads, int minalloc, + int maxalloc, proc_t *proc, uint_t dc, uint_t flags) +{ + ASSERT((flags & ~TASKQ_INTERFACE_FLAGS) == 0); + return (taskq_create_common(name, 0, nthreads, minclsyspri, minalloc, + maxalloc, proc, dc, flags | TASKQ_NOINSTANCE | TASKQ_DUTY_CYCLE)); +} + +static taskq_t * +taskq_create_common(const char *name, int instance, int nthreads, pri_t pri, + int minalloc, int maxalloc, proc_t *proc, uint_t dc, uint_t flags) +{ + taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_SLEEP); + uint_t ncpus = max_ncpus; + uint_t bsize; /* # of buckets - always power of 2 */ + int max_nthreads; + + /* + * TASKQ_DYNAMIC, TASKQ_CPR_SAFE and TASKQ_THREADS_CPU_PCT are all + * mutually incompatible. + */ + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_CPR_SAFE)); + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_THREADS_CPU_PCT)); + IMPLY((flags & TASKQ_CPR_SAFE), !(flags & TASKQ_THREADS_CPU_PCT)); + + /* Cannot have DYNAMIC with DUTY_CYCLE */ + IMPLY((flags & TASKQ_DYNAMIC), !(flags & TASKQ_DUTY_CYCLE)); + + /* Cannot have DUTY_CYCLE with a p0 kernel process */ + IMPLY((flags & TASKQ_DUTY_CYCLE), proc != &p0); + + /* Cannot have DC_BATCH without DUTY_CYCLE */ + ASSERT((flags & (TASKQ_DUTY_CYCLE|TASKQ_DC_BATCH)) != TASKQ_DC_BATCH); + + //ASSERT(proc != NULL); + + bsize = 1 << (highbit(ncpus) - 1); + ASSERT(bsize >= 1); + bsize = MIN(bsize, taskq_maxbuckets); + + if (flags & TASKQ_DYNAMIC) { + ASSERT3S(nthreads, >=, 1); + tq->tq_maxsize = nthreads; + + /* For dynamic task queues use just one backup thread */ + nthreads = max_nthreads = 1; + + } else if (flags & TASKQ_THREADS_CPU_PCT) { + uint_t pct; + ASSERT3S(nthreads, >=, 0); + pct = nthreads; + + if (pct > taskq_cpupct_max_percent) + pct = taskq_cpupct_max_percent; + + /* + * If you're using THREADS_CPU_PCT, the process for the + * taskq threads must be curproc. This allows any pset + * binding to be inherited correctly. If proc is &p0, + * we won't be creating LWPs, so new threads will be assigned + * to the default processor set. 
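+		 *
+		 * Here nthreads is a percentage of max_ncpus rather than a
+		 * thread count; e.g. nthreads == 75 sizes the queue at
+		 * roughly three quarters of the available CPUs.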
+ */ + /*ASSERT(curproc == proc || proc == &p0);*/ + tq->tq_threads_ncpus_pct = pct; + nthreads = 1; /* corrected in taskq_thread_create() */ + max_nthreads = TASKQ_THREADS_PCT(max_ncpus, pct); + + } else { + ASSERT3S(nthreads, >=, 1); + max_nthreads = nthreads; + } + + if (max_nthreads < taskq_minimum_nthreads_max) + max_nthreads = taskq_minimum_nthreads_max; + + /* + * Make sure the name is 0-terminated, and conforms to the rules for + * C indentifiers + */ + (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1); + strident_canon(tq->tq_name, TASKQ_NAMELEN + 1); + + tq->tq_flags = flags | TASKQ_CHANGING; + tq->tq_active = 0; + tq->tq_instance = instance; + tq->tq_nthreads_target = nthreads; + tq->tq_nthreads_max = max_nthreads; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_nbuckets = bsize; + tq->tq_proc = proc; + tq->tq_pri = pri; + tq->tq_DC = dc; + list_link_init(&tq->tq_cpupct_link); + + if (max_nthreads > 1) + tq->tq_threadlist = kmem_alloc( + sizeof (kthread_t *) * max_nthreads, KM_SLEEP); + + mutex_enter(&tq->tq_lock); + if (flags & TASKQ_PREPOPULATE) { + while (minalloc-- > 0) + taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP)); + } + + /* + * Before we start creating threads for this taskq, take a + * zone hold so the zone can't go away before taskq_destroy + * makes sure all the taskq threads are gone. This hold is + * similar in purpose to those taken by zthread_create(). + */ + /* + * Create the first thread, which will create any other threads + * necessary. taskq_thread_create will not return until we have + * enough threads to be able to process requests. + */ + taskq_thread_create(tq); + mutex_exit(&tq->tq_lock); + + if (flags & TASKQ_DYNAMIC) { + taskq_bucket_t *bucket = kmem_zalloc(sizeof (taskq_bucket_t) * + bsize, KM_SLEEP); + int b_id; + + tq->tq_buckets = bucket; + + /* Initialize each bucket */ + for (b_id = 0; b_id < bsize; b_id++, bucket++) { + mutex_init(&bucket->tqbucket_lock, NULL, MUTEX_DEFAULT, + NULL); + cv_init(&bucket->tqbucket_cv, NULL, CV_DEFAULT, NULL); + bucket->tqbucket_taskq = tq; + bucket->tqbucket_freelist.tqent_next = + bucket->tqbucket_freelist.tqent_prev = + &bucket->tqbucket_freelist; + if (flags & TASKQ_PREPOPULATE) + taskq_bucket_extend(bucket); + } + } + + /* + * Install kstats. + * We have two cases: + * 1) Instance is provided to taskq_create_instance(). In this case it + * should be >= 0 and we use it. + * + * 2) Instance is not provided and is automatically generated + */ + if (flags & TASKQ_NOINSTANCE) { + instance = tq->tq_instance = + (int)(uintptr_t)vmem_alloc(taskq_id_arena, 1, VM_SLEEP); + } + + if (flags & TASKQ_DYNAMIC) { + if ((tq->tq_kstat = kstat_create("unix", instance, + tq->tq_name, "taskq_d", KSTAT_TYPE_NAMED, + sizeof (taskq_d_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + tq->tq_kstat->ks_lock = &taskq_d_kstat_lock; + tq->tq_kstat->ks_data = &taskq_d_kstat; + tq->tq_kstat->ks_update = taskq_d_kstat_update; + tq->tq_kstat->ks_private = tq; + kstat_install(tq->tq_kstat); + } + } else { + if ((tq->tq_kstat = kstat_create("unix", instance, tq->tq_name, + "taskq", KSTAT_TYPE_NAMED, + sizeof (taskq_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + tq->tq_kstat->ks_lock = &taskq_kstat_lock; + tq->tq_kstat->ks_data = &taskq_kstat; + tq->tq_kstat->ks_update = taskq_kstat_update; + tq->tq_kstat->ks_private = tq; + kstat_install(tq->tq_kstat); + } + } + + return (tq); +} + +/* + * taskq_destroy(). 
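+ *
+ * Waits for pending entries to complete, tears down the worker threads
+ * and, for dynamic queues, each bucket, then returns the taskq_t to
+ * taskq_cache.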
+ * + * Assumes: by the time taskq_destroy is called no one will use this task queue + * in any way and no one will try to dispatch entries in it. + */ +void +taskq_destroy(taskq_t *tq) +{ + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + + ASSERT(! (tq->tq_flags & TASKQ_CPR_SAFE)); + + /* + * Destroy kstats. + */ + if (tq->tq_kstat != NULL) { + kstat_delete(tq->tq_kstat); + tq->tq_kstat = NULL; + } + + /* + * Destroy instance if needed. + */ + if (tq->tq_flags & TASKQ_NOINSTANCE) { + vmem_free(taskq_id_arena, (void *)(uintptr_t)(tq->tq_instance), + 1); + tq->tq_instance = 0; + } + + /* + * Unregister from the cpupct list. + */ + + /* + * Wait for any pending entries to complete. + */ + taskq_wait(tq); + + mutex_enter(&tq->tq_lock); + ASSERT((tq->tq_task.tqent_next == &tq->tq_task) && + (tq->tq_active == 0)); + + /* notify all the threads that they need to exit */ + tq->tq_nthreads_target = 0; + + tq->tq_flags |= TASKQ_CHANGING; + cv_broadcast(&tq->tq_dispatch_cv); + cv_broadcast(&tq->tq_exit_cv); + + while (tq->tq_nthreads != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); // this crashes, sometimes. + + if (tq->tq_nthreads_max != 1) + kmem_free(tq->tq_threadlist, sizeof (kthread_t *) * + tq->tq_nthreads_max); + + tq->tq_minalloc = 0; + while (tq->tq_nalloc != 0) + taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP)); + + mutex_exit(&tq->tq_lock); + + /* + * Mark each bucket as closing and wakeup all sleeping threads. + */ + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + taskq_ent_t *tqe; + + mutex_enter(&b->tqbucket_lock); + + b->tqbucket_flags |= TQBUCKET_CLOSE; + /* Wakeup all sleeping threads */ + + for (tqe = b->tqbucket_freelist.tqent_next; + tqe != &b->tqbucket_freelist; tqe = tqe->tqent_next) + cv_signal(&tqe->tqent_cv); + + ASSERT(b->tqbucket_nalloc == 0); + + /* + * At this point we waited for all pending jobs to complete (in + * both the task queue and the bucket and no new jobs should + * arrive. Wait for all threads to die. + */ + while (b->tqbucket_nfree > 0) + cv_wait(&b->tqbucket_cv, &b->tqbucket_lock); + mutex_exit(&b->tqbucket_lock); + mutex_destroy(&b->tqbucket_lock); + cv_destroy(&b->tqbucket_cv); + } + + if (tq->tq_buckets != NULL) { + ASSERT(tq->tq_flags & TASKQ_DYNAMIC); + kmem_free(tq->tq_buckets, + sizeof (taskq_bucket_t) * tq->tq_nbuckets); + + /* Cleanup fields before returning tq to the cache */ + tq->tq_buckets = NULL; + tq->tq_tcreates = 0; + tq->tq_tdeaths = 0; + } else { + ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC)); + } + + /* + * Now that all the taskq threads are gone, we can + * drop the zone hold taken in taskq_create_common + */ + + tq->tq_threads_ncpus_pct = 0; + tq->tq_totaltime = 0; + tq->tq_tasks = 0; + tq->tq_maxtasks = 0; + tq->tq_executed = 0; + kmem_cache_free(taskq_cache, tq); +} + +/* + * Extend a bucket with a new entry on the free list and attach a worker thread + * to it. + * + * Argument: pointer to the bucket. + * + * This function may quietly fail. It is only used by taskq_dispatch() which + * handles such failures properly. + */ +static void +taskq_bucket_extend(void *arg) +{ + taskq_ent_t *tqe; + taskq_bucket_t *b = (taskq_bucket_t *)arg; + taskq_t *tq = b->tqbucket_taskq; + int nthreads; + kthread_t *thread; + + if (! ENOUGH_MEMORY()) { + TQ_STAT(b, tqs_nomem); + return; + } + + mutex_enter(&tq->tq_lock); + + /* + * Observe global taskq limits on the number of threads. 
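+	 *
+	 * tq_tcreates - tq_tdeaths is the number of live bucket threads; if
+	 * that already exceeds tq_maxsize, undo the optimistic increment
+	 * below and return without extending the bucket.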
+ */ + if (tq->tq_tcreates++ - tq->tq_tdeaths > tq->tq_maxsize) { + tq->tq_tcreates--; + mutex_exit(&tq->tq_lock); + return; + } + mutex_exit(&tq->tq_lock); + tqe = kmem_cache_alloc(taskq_ent_cache, KM_NOSLEEP); + + if (tqe == NULL) { + mutex_enter(&tq->tq_lock); + TQ_STAT(b, tqs_nomem); + tq->tq_tcreates--; + mutex_exit(&tq->tq_lock); + return; + } + + ASSERT(tqe->tqent_thread == NULL); + + tqe->tqent_un.tqent_bucket = b; + + /* + * There's no way in Mac OS X KPI to create a thread + * in a suspended state (TS_STOPPED). So instead we + * use tqent_thread as a flag and the thread must wait + * for it to be initialized (below). + */ + tqe->tqent_thread = (kthread_t *)0xCEDEC0DE; + thread = thread_create(NULL, 0, (void (*)(void *))taskq_d_thread, tqe, 0, pp0, TS_RUN, + tq->tq_pri); + + /* + * Once the entry is ready, link it to the the bucket free list. + */ + mutex_enter(&b->tqbucket_lock); + tqe->tqent_func = NULL; + TQ_APPEND(b->tqbucket_freelist, tqe); + b->tqbucket_nfree++; + TQ_STAT(b, tqs_tcreates); + +#if TASKQ_STATISTIC + nthreads = b->tqbucket_stat.tqs_tcreates - + b->tqbucket_stat.tqs_tdeaths; + b->tqbucket_stat.tqs_maxthreads = MAX(nthreads, + b->tqbucket_stat.tqs_maxthreads); +#endif + + mutex_exit(&b->tqbucket_lock); + /* + * Start the stopped thread. + */ + mutex_enter(&tqe->tqent_thread_lock); + tqe->tqent_thread = thread; + cv_signal(&tqe->tqent_thread_cv); + mutex_exit(&tqe->tqent_thread_lock); +} + +static int +taskq_kstat_update(kstat_t *ksp, int rw) +{ + struct taskq_kstat *tqsp = &taskq_kstat; + taskq_t *tq = ksp->ks_private; + + if (rw == KSTAT_WRITE) + return (EACCES); + + tqsp->tq_pid.value.ui64 = 0; /* kernel_task'd pid is 0 */ + tqsp->tq_tasks.value.ui64 = tq->tq_tasks; + tqsp->tq_executed.value.ui64 = tq->tq_executed; + tqsp->tq_maxtasks.value.ui64 = tq->tq_maxtasks; + tqsp->tq_totaltime.value.ui64 = tq->tq_totaltime; + tqsp->tq_nactive.value.ui64 = tq->tq_active; + tqsp->tq_nalloc.value.ui64 = tq->tq_nalloc; + tqsp->tq_pri.value.ui64 = tq->tq_pri; + tqsp->tq_nthreads.value.ui64 = tq->tq_nthreads; + return (0); +} + +static int +taskq_d_kstat_update(kstat_t *ksp, int rw) +{ + struct taskq_d_kstat *tqsp = &taskq_d_kstat; + taskq_t *tq = ksp->ks_private; + taskq_bucket_t *b = tq->tq_buckets; + int bid = 0; + + if (rw == KSTAT_WRITE) + return (EACCES); + + ASSERT(tq->tq_flags & TASKQ_DYNAMIC); + + tqsp->tqd_btasks.value.ui64 = tq->tq_tasks; + tqsp->tqd_bexecuted.value.ui64 = tq->tq_executed; + tqsp->tqd_bmaxtasks.value.ui64 = tq->tq_maxtasks; + tqsp->tqd_bnalloc.value.ui64 = tq->tq_nalloc; + tqsp->tqd_bnactive.value.ui64 = tq->tq_active; + tqsp->tqd_btotaltime.value.ui64 = tq->tq_totaltime; + tqsp->tqd_pri.value.ui64 = tq->tq_pri; + + tqsp->tqd_hits.value.ui64 = 0; + tqsp->tqd_misses.value.ui64 = 0; + tqsp->tqd_overflows.value.ui64 = 0; + tqsp->tqd_tcreates.value.ui64 = 0; + tqsp->tqd_tdeaths.value.ui64 = 0; + tqsp->tqd_maxthreads.value.ui64 = 0; + tqsp->tqd_nomem.value.ui64 = 0; + tqsp->tqd_disptcreates.value.ui64 = 0; + tqsp->tqd_totaltime.value.ui64 = 0; + tqsp->tqd_nalloc.value.ui64 = 0; + tqsp->tqd_nfree.value.ui64 = 0; + + for (; (b != NULL) && (bid < tq->tq_nbuckets); b++, bid++) { + tqsp->tqd_hits.value.ui64 += b->tqbucket_stat.tqs_hits; + tqsp->tqd_misses.value.ui64 += b->tqbucket_stat.tqs_misses; + tqsp->tqd_overflows.value.ui64 += b->tqbucket_stat.tqs_overflow; + tqsp->tqd_tcreates.value.ui64 += b->tqbucket_stat.tqs_tcreates; + tqsp->tqd_tdeaths.value.ui64 += b->tqbucket_stat.tqs_tdeaths; + tqsp->tqd_maxthreads.value.ui64 += + 
b->tqbucket_stat.tqs_maxthreads; + tqsp->tqd_nomem.value.ui64 += b->tqbucket_stat.tqs_nomem; + tqsp->tqd_disptcreates.value.ui64 += + b->tqbucket_stat.tqs_disptcreates; + tqsp->tqd_totaltime.value.ui64 += b->tqbucket_totaltime; + tqsp->tqd_nalloc.value.ui64 += b->tqbucket_nalloc; + tqsp->tqd_nfree.value.ui64 += b->tqbucket_nfree; + } + return (0); +} diff --git a/module/os/windows/spl/spl-thread.c b/module/os/windows/spl/spl-thread.c new file mode 100644 index 000000000000..ab8d5c7147f9 --- /dev/null +++ b/module/os/windows/spl/spl-thread.c @@ -0,0 +1,142 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2019 Jorgen Lundman + * + */ + +#include +//#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +uint64_t zfs_threads = 0; + +kthread_t * +spl_thread_create( + caddr_t stk, + size_t stksize, + void (*proc)(void *), + void *arg, + size_t len, + /*struct proc *pp,*/ + int state, +#ifdef SPL_DEBUG_THREAD + char *filename, + int line, +#endif + pri_t pri) +{ + NTSTATUS result; + struct _KTHREAD *thread; + +#ifdef SPL_DEBUG_THREAD + dprintf("Start thread pri %d by '%s':%d\n", pri, + filename, line); +#endif + result = PsCreateSystemThread( + &thread, + 0, // DesiredAccess, + NULL, // ObjectAttributes, + NULL, // ProcessHandle, + 0, // ClientId, + proc, // StartRoutine, + arg // StartContext + ); + + + if (result != STATUS_SUCCESS) + return (NULL); + + /* Improve the priority when asked to do so */ + /* Thread priorities range from 0 to 31, where 0 is the lowest + * priority and 31 is the highest*/ + + if (pri > minclsyspri) { + //thread_precedence_policy_data_t policy; + //policy.importance = pri - minclsyspri; + + //thread_policy_set(thread, + // THREAD_PRECEDENCE_POLICY, + // (thread_policy_t)&policy, + // THREAD_PRECEDENCE_POLICY_COUNT); + + + // TODO: Windows thread priority? + + // why is this call missing? + //KeSetBasePriorityThread(thread, 1); + } + //thread_deallocate(thread); + + atomic_inc_64(&zfs_threads); + int threadid; + PETHREAD eThread; + ObReferenceObjectByHandle(thread, 0, 0, KernelMode, &eThread, 0); + // Perhaps threadid should move to 64bit. 
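+	// Resolve the Windows handle to its numeric thread id, release the
+	// references taken above, and return that id cast to a kthread_t *.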
+ threadid = (int)(uintptr_t) PsGetThreadId(eThread); + ObDereferenceObject(eThread); + ZwClose(thread); + return ((kthread_t *)threadid); +} + +kthread_t * +spl_current_thread(void) +{ + thread_t *cur_thread = current_thread(); + return ((kthread_t *)cur_thread); +} + +void spl_thread_exit(void) +{ + atomic_dec_64(&zfs_threads); + + tsd_thread_exit(); + (void) PsTerminateSystemThread(0); +} + + +/* + * IllumOS has callout.c - place it here until we find a better place + */ +callout_id_t +timeout_generic(int type, void (*func)(void *), void *arg, + hrtime_t expiration, hrtime_t resolution, int flags) +{ +// struct timespec ts; + // hrt2ts(expiration, &ts); + //bsd_timeout(func, arg, &ts); + /* bsd_untimeout() requires func and arg to cancel the timeout, so + * pass it back as the callout_id. If we one day were to implement + * untimeout_generic() they would pass it back to us + */ + return (callout_id_t)arg; +} diff --git a/module/os/windows/spl/spl-time.c b/module/os/windows/spl/spl-time.c new file mode 100644 index 000000000000..22d5a48c6c1c --- /dev/null +++ b/module/os/windows/spl/spl-time.c @@ -0,0 +1,113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * +*/ + +#include +#include +#include +//#include + + + +/* + * gethrtime() provides high-resolution timestamps with machine-dependent origin +. + * Hence its primary use is to specify intervals. + */ + +static hrtime_t +zfs_abs_to_nano(uint64_t elapsed) +{ + return elapsed * KeQueryTimeIncrement() * 100; +} + + +hrtime_t gethrtime(void) +{ + //static uint64_t start = 0; + static LARGE_INTEGER start = { 0 }; + LARGE_INTEGER now; + if (start.QuadPart == 0) { + KeQueryTickCount(&start); + start.QuadPart--; + } + KeQueryTickCount(&now); + ASSERT((now.QuadPart != start.QuadPart)); + return zfs_abs_to_nano(now.QuadPart - start.QuadPart); +} + + +void +gethrestime(struct timespec *ts) +{ + LARGE_INTEGER now; + uint64_t tv[2]; +#if _WIN32_WINNT >= 0x0602 + KeQuerySystemTimePrecise(&now); +#else + KeQuerySystemTime(&now); +#endif + TIME_WINDOWS_TO_UNIX(now.QuadPart, tv); + // change macro to take 2 dst args, "sec and nsec" to avoid this step? 
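+	// TIME_WINDOWS_TO_UNIX presumably converts the 100ns-since-1601
+	// system time into the seconds/nanoseconds pair in tv[0]/tv[1].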
+ ts->tv_sec = tv[0]; + ts->tv_nsec = tv[1]; +} + +time_t +gethrestime_sec(void) +{ + struct timespec tv; + gethrestime(&tv); + return (tv.tv_sec); +} + +#if 0 +void +hrt2ts(hrtime_t hrt, struct timespec *tsp) +{ + uint32_t sec, nsec, tmp; + + tmp = (uint32_t)(hrt >> 30); + sec = tmp - (tmp >> 2); + sec = tmp - (sec >> 5); + sec = tmp + (sec >> 1); + sec = tmp - (sec >> 6) + 7; + sec = tmp - (sec >> 3); + sec = tmp + (sec >> 1); + sec = tmp + (sec >> 3); + sec = tmp + (sec >> 4); + tmp = (sec << 7) - sec - sec - sec; + tmp = (tmp << 7) - tmp - tmp - tmp; + tmp = (tmp << 7) - tmp - tmp - tmp; + nsec = (uint32_t)hrt - (tmp << 9); + while (nsec >= NANOSEC) { + nsec -= NANOSEC; + sec++; + } + tsp->tv_sec = (time_t)sec; + tsp->tv_nsec = nsec; +} +#endif \ No newline at end of file diff --git a/module/os/windows/spl/spl-tsd.c b/module/os/windows/spl/spl-tsd.c new file mode 100644 index 000000000000..263f6191b85f --- /dev/null +++ b/module/os/windows/spl/spl-tsd.c @@ -0,0 +1,387 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2014,2017 Jorgen Lundman + * + * A thread will call tsd_create(&key, dtor) to allocate a new + * "variable" placement, called a "key". In IllumOS, this is the index + * into an array of dtors. (If dtor is passed as NULL, TSD internally + * set it to an empty function). So if the dtor array[i] is NULL, it + * is "free" and can be allocated. (returned as *key = i). + * IllumOS will grow this dtor array with realloc when required. + * Then Any Thread can set a value on this "key index", and this value + * is specific to each thread by calling tsd_set(key, value). + * And can be retrieved with tsd_get(key). + * When tsd_destroy(key) is called, we need to loop through all + * threads different "values", and call the dtor on each one. + * Likewise, we need to know when a thread exists, so we can clean up + * the values (by calling dtor for each one) so we patch into the + * thread_exit() call, to also call tsd_thread_exit(). + * + * In OsX, we build an array of the dtors, and return the key index, + * this is to store the dtor, and know which "key" values are valid. + * Then we build an AVL tree, indexed by , to store + * each thread's value. This allows us to do key access quick. + * On thread_exit, we iterate the dtor array, and for each key + * remove . + * On tsd_destroy(key), we use AVL find nearest with , then + * avl_next as long as key remains the same, to remove each thread value. + * + * Note a key of "0" is considered "invalid" in IllumOS, so we return + * a "1" based index, even though internally it is 0 based. 
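+ *
+ * For example, if tsd_create(&key, dtor) hands back key == 3, then slot 2
+ * of the dtor array is in use; tsd_set(3, value) from a thread stores an
+ * AVL node keyed by <2, that thread>, and tsd_get(3) on the same thread
+ * finds it again.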
+ * + */ + +#include +#include +#include +#include +#include + +#include + +/* Initial size of array, and realloc growth size */ +#define TSD_ALLOC_SIZE 5 + +/* array of dtors, allocated in init */ +static dtor_func_t *tsd_dtor_array = NULL; +static uint32_t tsd_dtor_size = 0; + +static avl_tree_t tsd_tree; + +struct spl_tsd_node_s +{ + /* The index/key */ + uint_t tsd_key; + thread_t *tsd_thread; + + /* The payload */ + void *tsd_value; + + /* Internal mumbo */ + avl_node_t tsd_link_node; +}; +typedef struct spl_tsd_node_s spl_tsd_node_t; + +static kmutex_t spl_tsd_mutex; + + +/* + * tsd_set - set thread specific data + * @key: lookup key + * @value: value to set + * + * Caller must prevent racing tsd_create() or tsd_destroy(), protected + * from racing tsd_get() or tsd_set() because it is thread specific. + * This function has been optimized to be fast for the update case. + * When setting the tsd initially it will be slower due to additional + * required locking and potential memory allocations. + * If the value is set to NULL, we also release it. + */ +int +tsd_set(uint_t key, void *value) +{ + spl_tsd_node_t *entry = NULL; + spl_tsd_node_t search; + avl_index_t loc; + uint_t i; + + /* Invalid key values? */ + if ((key < 1) || + (key >= tsd_dtor_size)) { + return EINVAL; + } + + i = key - 1; + + /* First handle the easy case, already has a node/value + * so we just need to find it, update it. + */ + + search.tsd_key = i; + search.tsd_thread = current_thread(); + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + mutex_exit(&spl_tsd_mutex); + + if (entry) { + + /* If value is set to NULL, release it as well */ + if (value == NULL) { + mutex_enter(&spl_tsd_mutex); + avl_remove(&tsd_tree, entry); + mutex_exit(&spl_tsd_mutex); + + kmem_free(entry, sizeof(*entry)); + return 0; + } + + entry->tsd_value = value; + return 0; + } + + /* No node, we need to create a new one and insert it. */ + /* But if the value is NULL, then why create one eh? */ + if (value == NULL) + return 0; + + entry = kmem_alloc(sizeof(spl_tsd_node_t), KM_SLEEP); + + entry->tsd_key = i; + entry->tsd_thread = current_thread(); + entry->tsd_value = value; + + mutex_enter(&spl_tsd_mutex); + avl_add(&tsd_tree, entry); + mutex_exit(&spl_tsd_mutex); + + return 0; +} + +/* + * tsd_get - get thread specific data + * @key: lookup key + * + * Caller must prevent racing tsd_create() or tsd_destroy(). This + * implementation is designed to be fast and scalable, it does not + * lock the entire table only a single hash bin. + */ +void * +tsd_get(uint_t key) +{ + spl_tsd_node_t *entry = NULL; + spl_tsd_node_t search; + avl_index_t loc; + uint_t i; + + /* Invalid key values? */ + if ((key < 1) || + (key >= tsd_dtor_size)) { + return NULL; + } + + i = key - 1; + + search.tsd_key = i; + search.tsd_thread = current_thread(); + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + mutex_exit(&spl_tsd_mutex); + + return entry ? entry->tsd_value : NULL; +} + + +static void +tsd_internal_dtor(void *value) +{ +} + +/* + * Create TSD for a pid and fill in key with unique value, remember the dtor + * + * We cheat and create an entry with pid=0, to keep the dtor. + */ +void +tsd_create(uint_t *keyp, dtor_func_t dtor) +{ + uint_t i; + + if (*keyp) return; // Should be 0 + + // Iterate the dtor_array, looking for first NULL + for (i = 0; i < TSD_ALLOC_SIZE; i++) { + if (tsd_dtor_array[i] == NULL) break; + } + + /* Do we need to grow the list? 
*/ + if (i >= tsd_dtor_size) { + dprintf("SPL: tsd list growing not implemented\n"); + return; + } + + if (dtor == NULL) + dtor = tsd_internal_dtor; + + tsd_dtor_array[i] = dtor; + + *keyp = i + 1; +} + +void +tsd_destroy(uint_t *keyp) +{ + spl_tsd_node_t *entry = NULL, *next = NULL; + spl_tsd_node_t search; + avl_index_t loc; + dtor_func_t dtor = NULL; + uint_t i; + + /* Invalid key values? */ + if ((*keyp < 1) || + (*keyp >= tsd_dtor_size)) { + return; + } + + i = *keyp - 1; + *keyp = 0; + + ASSERT(tsd_dtor_array[i] != NULL); + + dtor = tsd_dtor_array[i]; + tsd_dtor_array[i] = NULL; + + /* + * For each thread; + * if it has a value + * call the dtor + */ + search.tsd_key = i; + search.tsd_thread = NULL; + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + + /* "entry" should really be NULL here, as we searched for the + * NULL thread */ + if (entry == NULL) + entry = avl_nearest(&tsd_tree, loc, AVL_AFTER); + + /* Now, free node, and go to next, as long as the key matches */ + while (entry && (entry->tsd_key == i)) { + next = AVL_NEXT(&tsd_tree, entry); + + /* If we have a value, call the dtor for this thread */ + if (entry->tsd_value) + dtor(entry->tsd_value); + + avl_remove(&tsd_tree, entry); + + kmem_free(entry, sizeof(*entry)); + + entry = next; + } + + mutex_exit(&spl_tsd_mutex); + +} + + + +/* + * A thread is exiting, clear out any tsd values it might have. + */ +void tsd_thread_exit(void) +{ + spl_tsd_node_t *entry = NULL; + spl_tsd_node_t search; + avl_index_t loc; + int i; + + search.tsd_thread = current_thread(); + + /* For all defined dtor/values */ + for (i = 0; i < tsd_dtor_size; i++) { + + /* If not allocated, skip */ + if (tsd_dtor_array[i] == NULL) continue; + + /* Find out of this thread has a value */ + search.tsd_key = i; + + mutex_enter(&spl_tsd_mutex); + entry = avl_find(&tsd_tree, &search, &loc); + if (entry) avl_remove(&tsd_tree, entry); + mutex_exit(&spl_tsd_mutex); + + if (entry == NULL) continue; + + /* If we have a value, call dtor */ + if (entry->tsd_value) + tsd_dtor_array[i](entry->tsd_value); + + kmem_free(entry, sizeof(*entry)); + } // for all i +} + + + + +static int tsd_tree_cmp(const void *arg1, const void *arg2) +{ + const spl_tsd_node_t *node1 = arg1; + const spl_tsd_node_t *node2 = arg2; + if (node1->tsd_key > node2->tsd_key) + return 1; + if (node1->tsd_key < node2->tsd_key) + return -1; + if (node1->tsd_thread > node2->tsd_thread) + return 1; + if (node1->tsd_thread < node2->tsd_thread) + return -1; + return 0; +} + +int +spl_tsd_init(void) +{ + tsd_dtor_array = kmem_zalloc(sizeof(dtor_func_t) * TSD_ALLOC_SIZE, + KM_SLEEP); + tsd_dtor_size = TSD_ALLOC_SIZE; + + mutex_init(&spl_tsd_mutex, NULL, MUTEX_DEFAULT, NULL); + avl_create(&tsd_tree, tsd_tree_cmp, + sizeof (spl_tsd_node_t), + offsetof(spl_tsd_node_t, tsd_link_node)); + return 0; +} + + +uint64_t spl_tsd_size(void) +{ + return avl_numnodes(&tsd_tree); +} + +void +spl_tsd_fini(void) +{ + spl_tsd_node_t *entry = NULL; + void *cookie = NULL; + + dprintf("SPL: tsd unloading %llu\n", spl_tsd_size() ); + + mutex_enter(&spl_tsd_mutex); + cookie = NULL; + while((entry = avl_destroy_nodes(&tsd_tree, &cookie))) { + kmem_free(entry, sizeof(*entry)); + } + mutex_exit(&spl_tsd_mutex); + + avl_destroy(&tsd_tree); + mutex_destroy(&spl_tsd_mutex); + + kmem_free(tsd_dtor_array, sizeof(dtor_func_t) * tsd_dtor_size); + tsd_dtor_size = 0; +} diff --git a/module/os/windows/spl/spl-uio.c b/module/os/windows/spl/spl-uio.c new file mode 100644 index 000000000000..2b159a021223 --- /dev/null 
+++ b/module/os/windows/spl/spl-uio.c @@ -0,0 +1,312 @@ +/* +* CDDL HEADER START +* +* The contents of this file are subject to the terms of the +* Common Development and Distribution License (the "License"). +* You may not use this file except in compliance with the License. +* +* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +* or http://www.opensolaris.org/os/licensing. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* When distributing Covered Code, include this CDDL HEADER in each +* file and include the License file at usr/src/OPENSOLARIS.LICENSE. +* If applicable, add the following below this CDDL HEADER, with the +* fields enclosed by brackets "[]" replaced with your own identifying +* information: Portions Copyright [yyyy] [name of copyright owner] +* +* CDDL HEADER END +*/ + +/* +* +* Copyright (C) 2017 Jorgen Lundman +* +*/ + +/* +* Provides an implementation of the union of Illumos and OSX UIO struct +* and API calls. That is to say the OsX API calls are kept, to keep +* the UIO structure as opaque, but the internals are more like Illumos +* to avoid the OsX 32bit vs 64bit logic. +*/ + +#include + + +uio_t *uio_create(int iovcount, off_t offset, int spacetype, int iodirection) +{ + void * my_buf_p; + uint64_t my_size; + uio_t *my_uio; + + // Future, make sure the uio struct is aligned, and do one alloc for uio and iovec + my_size = sizeof(uio_t); + my_uio = kmem_alloc((uint32_t)my_size, KM_SLEEP); + + memset(my_uio, 0, my_size); + //my_uio->uio_size = my_size; + my_uio->uio_segflg = spacetype; + + if (iovcount > 0) { + my_uio->uio_iov = kmem_alloc(iovcount * sizeof(iovec_t), KM_SLEEP); + memset(my_uio->uio_iov, 0, iovcount * sizeof(iovec_t)); + } + else { + my_uio->uio_iov = NULL; + } + my_uio->uio_max_iovs = iovcount; + my_uio->uio_offset = offset; + my_uio->uio_rw = iodirection; + + return (my_uio); +} + +void uio_free(uio_t *uio) +{ + ASSERT(uio != NULL); + ASSERT(uio->uio_iov != NULL); + + kmem_free(uio->uio_iov, uio->uio_max_iovs * sizeof(iovec_t)); + kmem_free(uio, sizeof(uio_t)); + +} + +int uio_addiov(uio_t *uio, user_addr_t baseaddr, user_size_t length) +{ + ASSERT(uio != NULL); + ASSERT(uio->uio_iov != NULL); + + for (int i = 0; i < uio->uio_max_iovs; i++) { + if (uio->uio_iov[i].iov_len == 0 && uio->uio_iov[i].iov_base == 0) { + uio->uio_iov[i].iov_len = (uint64_t)length; + uio->uio_iov[i].iov_base = (void *)(user_addr_t)baseaddr; + uio->uio_iovcnt++; + uio->uio_resid += length; + return(0); + } + } + + return(-1); +} + +int uio_isuserspace(uio_t *uio) +{ + ASSERT(uio != NULL); + if (uio->uio_segflg == UIO_USERSPACE) + return 1; + return 0; +} + +int uio_getiov(uio_t *uio, int index, user_addr_t *baseaddr, user_size_t *length) +{ + ASSERT(uio != NULL); + ASSERT(uio->uio_iov != NULL); + + if (index < 0 || index >= uio->uio_iovcnt) { + return(-1); + } + + if (baseaddr != NULL) { + *baseaddr = (user_addr_t) uio->uio_iov[index].iov_base; + } + if (length != NULL) { + *length = uio->uio_iov[index].iov_len; + } + + return 0; +} + +int uio_iovcnt(uio_t *uio) +{ + if (uio == NULL) { + return(0); + } + + return(uio->uio_iovcnt); +} + + +off_t uio_offset(uio_t *uio) +{ + ASSERT(uio != NULL); + ASSERT(uio->uio_iov != NULL); + + if (uio == NULL) { + return(0); + } + + return(uio->uio_offset); +} + +/* + * This function is modelled after OsX, which means you can only pass + * in a value between 0 and current "iov_len". Any larger number will + * ignore the extra bytes. 
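+ *
+ * A count of 0 only skips past exhausted iovecs; a non-zero count
+ * consumes up to the current iovec's iov_len bytes and advances
+ * uio_offset / reduces uio_resid to match (spl_uiomove() below relies
+ * on both behaviours).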
+*/ +void uio_update(uio_t *uio, user_size_t count) +{ + uint32_t ind; + + if (uio == NULL || uio->uio_iovcnt < 1) { + return; + } + + ASSERT(uio->uio_index < uio->uio_iovcnt); + + ind = uio->uio_index; + + if (count) { + if (count > uio->uio_iov->iov_len) { + (uintptr_t)uio->uio_iov[ind].iov_base += uio->uio_iov[ind].iov_len; + uio->uio_iov[ind].iov_len = 0; + } + else { + (uintptr_t)uio->uio_iov[ind].iov_base += count; + uio->uio_iov[ind].iov_len -= count; + } + if (count > (user_size_t)uio->uio_resid) { + uio->uio_offset += uio->uio_resid; + uio->uio_resid = 0; + } + else { + uio->uio_offset += count; + uio->uio_resid -= count; + } + } + + while (uio->uio_iovcnt > 0 && uio->uio_iov[ind].iov_len == 0) { + uio->uio_iovcnt--; + if (uio->uio_iovcnt > 0) { + uio->uio_index = (ind++); + } + } +} + + +uint64_t uio_resid(uio_t *uio) +{ + if (uio == NULL) { + return(0); + } + + return(uio->uio_resid); +} + +user_addr_t uio_curriovbase(uio_t *uio) +{ + if (uio == NULL || uio->uio_iovcnt < 1) { + return(0); + } + + return((user_addr_t)uio->uio_iov[uio->uio_index].iov_base); +} + +user_size_t uio_curriovlen(uio_t *a_uio) +{ + if (a_uio == NULL || a_uio->uio_iovcnt < 1) { + return(0); + } + + return((user_size_t)a_uio->uio_iov[a_uio->uio_index].iov_len); +} + +void uio_setoffset(uio_t *uio, off_t offset) +{ + if (uio == NULL) { + return; + } + uio->uio_offset = offset; +} + +int uio_rw(uio_t *a_uio) +{ + if (a_uio == NULL) { + return(-1); + } + return(a_uio->uio_rw); +} + +void uio_setrw(uio_t *a_uio, int a_value) +{ + if (a_uio == NULL) { + return; + } + + if (a_value == UIO_READ || a_value == UIO_WRITE) { + a_uio->uio_rw = a_value; + } + return; +} + +int uio_spacetype(uio_t *a_uio) +{ + if (a_uio == NULL) { + return(-1); + } + + return(a_uio->uio_segflg); +} + + +uio_t *uio_duplicate(uio_t *a_uio) +{ + uio_t *my_uio; + int i; + + if (a_uio == NULL) { + return(NULL); + } + + my_uio = uio_create(a_uio->uio_max_iovs, + uio_offset(a_uio), + uio_spacetype(a_uio), + uio_rw(a_uio)); + if (my_uio == 0) { + panic("%s :%d - allocation failed\n", __FILE__, __LINE__); + } + + bcopy((void *)a_uio->uio_iov, (void *)my_uio->uio_iov, a_uio->uio_max_iovs * sizeof(iovec_t)); + my_uio->uio_index = a_uio->uio_index; + my_uio->uio_resid = a_uio->uio_resid; + my_uio->uio_iovcnt = a_uio->uio_iovcnt; + + return(my_uio); +} + +int spl_uiomove(const uint8_t *c_cp, uint32_t n, struct uio *uio) +{ + const uint8_t *cp = c_cp; + uint64_t acnt; + int error = 0; + + while (n > 0 && uio_resid(uio)) { + uio_update(uio, 0); + acnt = uio_curriovlen(uio); + if (acnt == 0) { + continue; + } + if (n > 0 && acnt > (uint64_t)n) + acnt = n; + + switch ((int)uio->uio_segflg) { + case UIO_SYSSPACE: + if (uio->uio_rw == UIO_READ) + /*error =*/ bcopy(cp, uio->uio_iov[uio->uio_index].iov_base, + acnt); + else + /*error =*/ bcopy(uio->uio_iov[uio->uio_index].iov_base, (void *)cp, + acnt); + break; + default: + break; + } + uio_update(uio, acnt); + cp += acnt; + n -= (uint32_t)acnt; + } + ASSERT0(n); + return (error); +} + diff --git a/module/os/windows/spl/spl-vmem.c b/module/os/windows/spl/spl-vmem.c new file mode 100644 index 000000000000..58bca9a658dd --- /dev/null +++ b/module/os/windows/spl/spl-vmem.c @@ -0,0 +1,3813 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2017 Sean Doran + */ + +/* + * Big Theory Statement for the virtual memory allocator. + * + * For a more complete description of the main ideas, see: + * + * Jeff Bonwick and Jonathan Adams, + * + * Magazines and vmem: Extending the Slab Allocator to Many CPUs and + * Arbitrary Resources. + * + * Proceedings of the 2001 Usenix Conference. + * Available as http://www.usenix.org/event/usenix01/bonwick.html + * + * + * 1. General Concepts + * ------------------- + * + * 1.1 Overview + * ------------ + * We divide the kernel address space into a number of logically distinct + * pieces, or *arenas*: text, data, heap, stack, and so on. Within these + * arenas we often subdivide further; for example, we use heap addresses + * not only for the kernel heap (kmem_alloc() space), but also for DVMA, + * bp_mapin(), /dev/kmem, and even some device mappings like the TOD chip. + * The kernel address space, therefore, is most accurately described as + * a tree of arenas in which each node of the tree *imports* some subset + * of its parent. The virtual memory allocator manages these arenas and + * supports their natural hierarchical structure. + * + * 1.2 Arenas + * ---------- + * An arena is nothing more than a set of integers. These integers most + * commonly represent virtual addresses, but in fact they can represent + * anything at all. For example, we could use an arena containing the + * integers minpid through maxpid to allocate process IDs. vmem_create() + * and vmem_destroy() create and destroy vmem arenas. In order to + * differentiate between arenas used for adresses and arenas used for + * identifiers, the VMC_IDENTIFIER flag is passed to vmem_create(). This + * prevents identifier exhaustion from being diagnosed as general memory + * failure. + * + * 1.3 Spans + * --------- + * We represent the integers in an arena as a collection of *spans*, or + * contiguous ranges of integers. For example, the kernel heap consists + * of just one span: [kernelheap, ekernelheap). Spans can be added to an + * arena in two ways: explicitly, by vmem_add(), or implicitly, by + * importing, as described in Section 1.5 below. + * + * 1.4 Segments + * ------------ + * Spans are subdivided into *segments*, each of which is either allocated + * or free. A segment, like a span, is a contiguous range of integers. + * Each allocated segment [addr, addr + size) represents exactly one + * vmem_alloc(size) that returned addr. Free segments represent the space + * between allocated segments. If two free segments are adjacent, we + * coalesce them into one larger segment; that is, if segments [a, b) and + * [b, c) are both free, we merge them into a single segment [a, c). 
+ * The segments within a span are linked together in increasing-address order + * so we can easily determine whether coalescing is possible. + * + * Segments never cross span boundaries. When all segments within + * an imported span become free, we return the span to its source. + * + * 1.5 Imported Memory + * ------------------- + * As mentioned in the overview, some arenas are logical subsets of + * other arenas. For example, kmem_va_arena (a virtual address cache + * that satisfies most kmem_slab_create() requests) is just a subset + * of heap_arena (the kernel heap) that provides caching for the most + * common slab sizes. When kmem_va_arena runs out of virtual memory, + * it *imports* more from the heap; we say that heap_arena is the + * *vmem source* for kmem_va_arena. vmem_create() allows you to + * specify any existing vmem arena as the source for your new arena. + * Topologically, since every arena is a child of at most one source, + * the set of all arenas forms a collection of trees. + * + * 1.6 Constrained Allocations + * --------------------------- + * Some vmem clients are quite picky about the kind of address they want. + * For example, the DVMA code may need an address that is at a particular + * phase with respect to some alignment (to get good cache coloring), or + * that lies within certain limits (the addressable range of a device), + * or that doesn't cross some boundary (a DMA counter restriction) -- + * or all of the above. vmem_xalloc() allows the client to specify any + * or all of these constraints. + * + * 1.7 The Vmem Quantum + * -------------------- + * Every arena has a notion of 'quantum', specified at vmem_create() time, + * that defines the arena's minimum unit of currency. Most commonly the + * quantum is either 1 or PAGESIZE, but any power of 2 is legal. + * All vmem allocations are guaranteed to be quantum-aligned. + * + * 1.8 Quantum Caching + * ------------------- + * A vmem arena may be so hot (frequently used) that the scalability of vmem + * allocation is a significant concern. We address this by allowing the most + * common allocation sizes to be serviced by the kernel memory allocator, + * which provides low-latency per-cpu caching. The qcache_max argument to + * vmem_create() specifies the largest allocation size to cache. + * + * 1.9 Relationship to Kernel Memory Allocator + * ------------------------------------------- + * Every kmem cache has a vmem arena as its slab supplier. The kernel memory + * allocator uses vmem_alloc() and vmem_free() to create and destroy slabs. + * + * + * 2. Implementation + * ----------------- + * + * 2.1 Segment lists and markers + * ----------------------------- + * The segment structure (vmem_seg_t) contains two doubly-linked lists. + * + * The arena list (vs_anext/vs_aprev) links all segments in the arena. + * In addition to the allocated and free segments, the arena contains + * special marker segments at span boundaries. Span markers simplify + * coalescing and importing logic by making it easy to tell both when + * we're at a span boundary (so we don't coalesce across it), and when + * a span is completely free (its neighbors will both be span markers). + * + * Imported spans will have vs_import set. + * + * The next-of-kin list (vs_knext/vs_kprev) links segments of the same type: + * (1) for allocated segments, vs_knext is the hash chain linkage; + * (2) for free segments, vs_knext is the freelist linkage; + * (3) for span marker segments, vs_knext is the next span marker. 
+ * + * 2.2 Allocation hashing + * ---------------------- + * We maintain a hash table of all allocated segments, hashed by address. + * This allows vmem_free() to discover the target segment in constant time. + * vmem_update() periodically resizes hash tables to keep hash chains short. + * + * 2.3 Freelist management + * ----------------------- + * We maintain power-of-2 freelists for free segments, i.e. free segments + * of size >= 2^n reside in vmp->vm_freelist[n]. To ensure constant-time + * allocation, vmem_xalloc() looks not in the first freelist that *might* + * satisfy the allocation, but in the first freelist that *definitely* + * satisfies the allocation (unless VM_BESTFIT is specified, or all larger + * freelists are empty). For example, a 1000-byte allocation will be + * satisfied not from the 512..1023-byte freelist, whose members *might* + * contains a 1000-byte segment, but from a 1024-byte or larger freelist, + * the first member of which will *definitely* satisfy the allocation. + * This ensures that vmem_xalloc() works in constant time. + * + * We maintain a bit map to determine quickly which freelists are non-empty. + * vmp->vm_freemap & (1 << n) is non-zero iff vmp->vm_freelist[n] is non-empty. + * + * The different freelists are linked together into one large freelist, + * with the freelist heads serving as markers. Freelist markers simplify + * the maintenance of vm_freemap by making it easy to tell when we're taking + * the last member of a freelist (both of its neighbors will be markers). + * + * 2.4 Vmem Locking + * ---------------- + * For simplicity, all arena state is protected by a per-arena lock. + * For very hot arenas, use quantum caching for scalability. + * + * 2.5 Vmem Population + * ------------------- + * Any internal vmem routine that might need to allocate new segment + * structures must prepare in advance by calling vmem_populate(), which + * will preallocate enough vmem_seg_t's to get is through the entire + * operation without dropping the arena lock. + * + * 2.6 Auditing + * ------------ + * If KMF_AUDIT is set in kmem_flags, we audit vmem allocations as well. + * Since virtual addresses cannot be scribbled on, there is no equivalent + * in vmem to redzone checking, deadbeef, or other kmem debugging features. + * Moreover, we do not audit frees because segment coalescing destroys the + * association between an address and its segment structure. Auditing is + * thus intended primarily to keep track of who's consuming the arena. + * Debugging support could certainly be extended in the future if it proves + * necessary, but we do so much live checking via the allocation hash table + * that even non-DEBUG systems get quite a bit of sanity checking already. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +//#include + +#include + +#define VMEM_INITIAL 21 /* early vmem arenas */ +#define VMEM_SEG_INITIAL 800 +//200 //400 /* early segments */ + +/* + * Adding a new span to an arena requires two segment structures: one to + * represent the span, and one to represent the free segment it contains. + */ +#define VMEM_SEGS_PER_SPAN_CREATE 2 + +/* + * Allocating a piece of an existing segment requires 0-2 segment structures + * depending on how much of the segment we're allocating. + * + * To allocate the entire segment, no new segment structures are needed; we + * simply move the existing segment structure from the freelist to the + * allocation hash table. 
+ * + * To allocate a piece from the left or right end of the segment, we must + * split the segment into two pieces (allocated part and remainder), so we + * need one new segment structure to represent the remainder. + * + * To allocate from the middle of a segment, we need two new segment strucures + * to represent the remainders on either side of the allocated part. + */ +#define VMEM_SEGS_PER_EXACT_ALLOC 0 +#define VMEM_SEGS_PER_LEFT_ALLOC 1 +#define VMEM_SEGS_PER_RIGHT_ALLOC 1 +#define VMEM_SEGS_PER_MIDDLE_ALLOC 2 + +/* + * vmem_populate() preallocates segment structures for vmem to do its work. + * It must preallocate enough for the worst case, which is when we must import + * a new span and then allocate from the middle of it. + */ +#define VMEM_SEGS_PER_ALLOC_MAX \ +(VMEM_SEGS_PER_SPAN_CREATE + VMEM_SEGS_PER_MIDDLE_ALLOC) + +/* + * The segment structures themselves are allocated from vmem_seg_arena, so + * we have a recursion problem when vmem_seg_arena needs to populate itself. + * We address this by working out the maximum number of segment structures + * this act will require, and multiplying by the maximum number of threads + * that we'll allow to do it simultaneously. + * + * The worst-case segment consumption to populate vmem_seg_arena is as + * follows (depicted as a stack trace to indicate why events are occurring): + * + * (In order to lower the fragmentation in the heap_arena, we specify a + * minimum import size for the vmem_metadata_arena which is the same size + * as the kmem_va quantum cache allocations. This causes the worst-case + * allocation from the vmem_metadata_arena to be 3 segments.) + * + * vmem_alloc(vmem_seg_arena) -> 2 segs (span create + exact alloc) + * segkmem_alloc(vmem_metadata_arena) + * vmem_alloc(vmem_metadata_arena) -> 3 segs (span create + left alloc) + * vmem_alloc(heap_arena) -> 1 seg (left alloc) + * page_create() + * hat_memload() + * kmem_cache_alloc() + * kmem_slab_create() + * vmem_alloc(hat_memload_arena) -> 2 segs (span create + exact alloc) + * segkmem_alloc(heap_arena) + * vmem_alloc(heap_arena) -> 1 seg (left alloc) + * page_create() + * hat_memload() -> (hat layer won't recurse further) + * + * The worst-case consumption for each arena is 3 segment structures. + * Of course, a 3-seg reserve could easily be blown by multiple threads. + * Therefore, we serialize all allocations from vmem_seg_arena (which is OK + * because they're rare). We cannot allow a non-blocking allocation to get + * tied up behind a blocking allocation, however, so we use separate locks + * for VM_SLEEP and VM_NOSLEEP allocations. Similarly, VM_PUSHPAGE allocations + * must not block behind ordinary VM_SLEEPs. In addition, if the system is + * panicking then we must keep enough resources for panic_thread to do its + * work. Thus we have at most four threads trying to allocate from + * vmem_seg_arena, and each thread consumes at most three segment structures, + * so we must maintain a 12-seg reserve. + */ +#define VMEM_POPULATE_RESERVE 12 + +/* + * vmem_populate() ensures that each arena has VMEM_MINFREE seg structures + * so that it can satisfy the worst-case allocation *and* participate in + * worst-case allocation from vmem_seg_arena. 
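+ *
+ * With the constants above this works out to 12 + (2 + 2) = 16
+ * preallocated segment structures per arena.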
+ */ +#define VMEM_MINFREE (VMEM_POPULATE_RESERVE + VMEM_SEGS_PER_ALLOC_MAX) + +static vmem_t vmem0[VMEM_INITIAL]; +static vmem_t *vmem_populator[VMEM_INITIAL]; +static uint32_t vmem_id; +static uint32_t vmem_populators; +static vmem_seg_t vmem_seg0[VMEM_SEG_INITIAL]; +static vmem_seg_t *vmem_segfree; +static kmutex_t vmem_list_lock; +static kmutex_t vmem_segfree_lock; +static kmutex_t vmem_sleep_lock; +static kmutex_t vmem_nosleep_lock; +static kmutex_t vmem_pushpage_lock; +static kmutex_t vmem_panic_lock; +static kmutex_t vmem_xnu_alloc_lock; +static vmem_t *vmem_list; +static vmem_t *vmem_metadata_arena; +static vmem_t *vmem_seg_arena; +static vmem_t *vmem_hash_arena; +static vmem_t *vmem_vmem_arena; +vmem_t *spl_default_arena; // The bottom-most arena for SPL +static vmem_t *spl_default_arena_parent; // dummy arena as a placeholder +#define VMEM_BUCKETS 13 +#define VMEM_BUCKET_LOWBIT 12 +#define VMEM_BUCKET_HIBIT 24 +static vmem_t *vmem_bucket_arena[VMEM_BUCKETS]; +vmem_t *spl_heap_arena; +static void *spl_heap_arena_initial_alloc; +static uint32_t spl_heap_arena_initial_alloc_size = 0; +#define NUMBER_OF_ARENAS_IN_VMEM_INIT 21 +//static struct timespec vmem_update_interval = {15, 0}; /* vmem_update() every 15 seconds */ +uint32_t vmem_mtbf; /* mean time between failures [default: off] */ +uint32_t vmem_seg_size = sizeof (vmem_seg_t); + +// must match with include/sys/vmem_impl.h +static vmem_kstat_t vmem_kstat_template = { + { "mem_inuse", KSTAT_DATA_UINT64 }, + { "mem_import", KSTAT_DATA_UINT64 }, + { "mem_total", KSTAT_DATA_UINT64 }, + { "vmem_source", KSTAT_DATA_UINT32 }, + { "alloc", KSTAT_DATA_UINT64 }, + { "free", KSTAT_DATA_UINT64 }, + { "wait", KSTAT_DATA_UINT64 }, + { "fail", KSTAT_DATA_UINT64 }, + { "lookup", KSTAT_DATA_UINT64 }, + { "search", KSTAT_DATA_UINT64 }, + { "populate_fail", KSTAT_DATA_UINT64 }, + { "contains", KSTAT_DATA_UINT64 }, + { "contains_search", KSTAT_DATA_UINT64 }, + { "parent_alloc", KSTAT_DATA_UINT64 }, + { "parent_free", KSTAT_DATA_UINT64 }, + { "threads_waiting", KSTAT_DATA_UINT64 }, + { "excess", KSTAT_DATA_UINT64 }, +}; + + +/* + * Insert/delete from arena list (type 'a') or next-of-kin list (type 'k'). 
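+ *
+ * E.g. VMEM_INSERT(vprev, vsp, k) splices vsp into the next-of-kin list
+ * immediately after vprev by rewriting the vs_knext/vs_kprev pointers of
+ * vsp and of its new neighbours.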
+ */ +#define VMEM_INSERT(vprev, vsp, type) \ +{ \ +vmem_seg_t *_vnext = (vprev)->vs_##type##next; \ +(vsp)->vs_##type##next = (_vnext); \ +(vsp)->vs_##type##prev = (vprev); \ +(vprev)->vs_##type##next = (vsp); \ +(_vnext)->vs_##type##prev = (vsp); \ +} + +#define VMEM_DELETE(vsp, type) \ +{ \ +vmem_seg_t *_vprev = (vsp)->vs_##type##prev; \ +vmem_seg_t *_vnext = (vsp)->vs_##type##next; \ +(_vprev)->vs_##type##next = (_vnext); \ +(_vnext)->vs_##type##prev = (_vprev); \ +} + +/// vmem thread block count +uint64_t spl_vmem_threads_waiting = 0; + +// number of allocations > minalloc +uint64_t spl_bucket_non_pow2_allocs = 0; + +// allocator kstats +uint64_t spl_vmem_unconditional_allocs = 0; +uint64_t spl_vmem_unconditional_alloc_bytes = 0; +uint64_t spl_vmem_conditional_allocs = 0; +uint64_t spl_vmem_conditional_alloc_bytes = 0; +uint64_t spl_vmem_conditional_alloc_deny = 0; +uint64_t spl_vmem_conditional_alloc_deny_bytes = 0; + +// bucket allocator kstat +uint64_t spl_xat_success = 0; +uint64_t spl_xat_late_success = 0; +uint64_t spl_xat_late_success_nosleep = 0; +uint64_t spl_xat_pressured = 0; +uint64_t spl_xat_bailed = 0; +uint64_t spl_xat_bailed_contended = 0; +uint64_t spl_xat_lastalloc = 0; +uint64_t spl_xat_lastfree = 0; +uint64_t spl_xat_forced = 0; +uint64_t spl_xat_sleep = 0; +uint64_t spl_xat_late_deny = 0; +uint64_t spl_xat_no_waiters = 0; +uint64_t spl_xft_wait = 0; + +uint64_t spl_vba_parent_memory_appeared = 0; +uint64_t spl_vba_parent_memory_blocked = 0; +uint64_t spl_vba_hiprio_blocked = 0; +uint64_t spl_vba_cv_timeout = 0; +uint64_t spl_vba_loop_timeout = 0; +uint64_t spl_vba_cv_timeout_blocked = 0; +uint64_t spl_vba_loop_timeout_blocked = 0; +uint64_t spl_vba_sleep = 0; +uint64_t spl_vba_loop_entries = 0; + +// bucket minimum span size tunables +uint64_t spl_bucket_tunable_large_span = 0; +uint64_t spl_bucket_tunable_small_span = 0; + +// for XAT & XATB visibility into VBA queue +static _Atomic uint32_t spl_vba_threads[VMEM_BUCKETS] = { 0 }; +static uint32_t vmem_bucket_id_to_bucket_number[NUMBER_OF_ARENAS_IN_VMEM_INIT] = { 0 }; +boolean_t spl_arc_no_grow(uint32_t, boolean_t, kmem_cache_t **); +_Atomic uint64_t spl_arc_no_grow_bits = 0; +uint64_t spl_arc_no_grow_count = 0; + +uint64_t spl_frag_max_walk = 1000; // compare span ages this many steps from the head of the freelist +uint64_t spl_frag_walked_out = 0; +uint64_t spl_frag_walk_cnt = 0; + +extern void spl_free_set_emergency_pressure(int64_t p); +extern uint64_t segkmem_total_mem_allocated; +extern uint64_t total_memory; + +/* + * Get a vmem_seg_t from the global segfree list. + */ +static vmem_seg_t * +vmem_getseg_global(void) +{ + vmem_seg_t *vsp; + + mutex_enter(&vmem_segfree_lock); + if ((vsp = vmem_segfree) != NULL) + vmem_segfree = vsp->vs_knext; + mutex_exit(&vmem_segfree_lock); + + if (vsp != NULL) + vsp->vs_span_createtime = 0; + + return (vsp); +} + +/* + * Put a vmem_seg_t on the global segfree list. + */ +static void +vmem_putseg_global(vmem_seg_t *vsp) +{ + mutex_enter(&vmem_segfree_lock); + vsp->vs_knext = vmem_segfree; + vmem_segfree = vsp; + mutex_exit(&vmem_segfree_lock); +} + +/* + * Get a vmem_seg_t from vmp's segfree list. + */ +static vmem_seg_t * +vmem_getseg(vmem_t *vmp) +{ + vmem_seg_t *vsp; + + ASSERT(vmp->vm_nsegfree > 0); + + vsp = vmp->vm_segfree; + vmp->vm_segfree = vsp->vs_knext; + vmp->vm_nsegfree--; + + return (vsp); +} + +/* + * Put a vmem_seg_t on vmp's segfree list. 
+ */
+static void
+vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp)
+{
+    vsp->vs_knext = vmp->vm_segfree;
+    vmp->vm_segfree = vsp;
+    vmp->vm_nsegfree++;
+}
+
+
+/*
+ * Add vsp to the appropriate freelist, at the appropriate location,
+ * keeping the freelist sorted by age.
+ */
+
+
+/*
+ * return true when we continue the for loop in
+ * vmem_freelist_insert_sort_by_time
+ */
+static inline boolean_t
+flist_sort_compare(boolean_t newfirst,
+    const vmem_seg_t *vhead,
+    const vmem_seg_t *nextlist,
+    vmem_seg_t *p, vmem_seg_t *to_insert)
+{
+    /*
+     * to_insert is the segment we are inserting into the freelist;
+     * p is either the freelist head or an element inside a non-empty
+     * freelist. If we return false, then to_insert is inserted
+     * immediately after p.
+     */
+
+    // always enter the for loop if we're at the front of a flist
+    if (p == vhead)
+        return (B_TRUE);
+
+
+    const vmem_seg_t *n = p->vs_knext;
+
+    if (n == nextlist || n == NULL) {
+        // if we are at the tail of the flist, then
+        // insert to_insert between p and n
+        return (B_FALSE);
+    }
+
+    if (n->vs_import == B_TRUE && to_insert->vs_import == B_FALSE) {
+        /* put non-imported segments before imported segments
+         * no matter what their respective create times are,
+         * thereby making imported segments more likely to "age out"
+         */
+        return (B_FALSE); // inserts to_insert between p and n
+    }
+
+    if (newfirst == B_TRUE) {
+        if (n->vs_span_createtime < to_insert->vs_span_createtime) {
+            // n is older than me, so insert me between p and n
+            return (B_FALSE);
+        }
+    } else {
+        if (n->vs_span_createtime > to_insert->vs_span_createtime) {
+            // n is newer than me, so insert me between p and n
+            return (B_FALSE);
+        }
+    }
+    // continue iterating
+    return (B_TRUE);
+}
+
+static void
+vmem_freelist_insert_sort_by_time(vmem_t *vmp, vmem_seg_t *vsp)
+{
+    ASSERT(vmp->vm_cflags & VMC_TIMEFREE);
+    ASSERT(vsp->vs_span_createtime > 0);
+
+    const boolean_t newfirst = 0 == (vmp->vm_cflags & VMC_OLDFIRST);
+
+    const uint64_t abs_max_walk_steps = 1ULL << 30ULL;
+    uint32_t max_walk_steps = (uint32_t)MIN(spl_frag_max_walk, abs_max_walk_steps);
+
+    vmem_seg_t *vprev;
+
+    ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp);
+
+    // in vmem_create_common() the freelists are arranged:
+    // freelist[0].vs_kprev = NULL, freelist[VMEM_FREELISTS].vs_knext = NULL
+    // freelist[1].vs_kprev = freelist[0], freelist[1].vs_knext = freelist[2] ...
+
+    // from vmem_freelist_insert():
+    // VS_SIZE is the segment size (->vs_end - ->vs_start), so say 8k-512
+    // highbit is the highest bit set PLUS 1, so in this case would be the 16k list.
+    // so below, vprev is therefore pointing to the 8k list
+
+    // in vmem_alloc, the unconstrained allocation takes, for a 8k-512 block:
+    // vsp = flist[8k].vs_knext
+    // and calls vmem_seg_create() which sends any leftovers from vsp to vmem_freelist_insert
+
+    // vmem_freelist_insert would take the seg (as above, 8k-512 size), vprev points to the
+    // 16k list, and VMEM_INSERT(vprev, vsp, k) inserts the segment immediately after
+
+    // so vmem_seg_create(...8k-512...) pushes to the head of the 8k list,
+    // and vmem_alloc(...8k-512...) will pull from the head of the 8k list
+
+    // below we may want to push to the TAIL of the 8k list, which is
+    // just before flist[16k].
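+    // Net effect of flist_sort_compare() above:
+    // with VMC_TIMEFREE alone (newfirst == B_TRUE) each freelist is kept
+    // roughly newest-span-first, with VMC_OLDFIRST it is kept roughly
+    // oldest-span-first, and a segment being inserted never walks past
+    // an imported segment unless it is itself imported.
+    //
+    // For example, inserting a segment whose span_createtime is 90 into
+    // a newest-first list head -> [100] -> [70] -> tail places it
+    // between the 100 and 70 entries.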
+ + vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1]; + + int my_listnum = highbit(VS_SIZE(vsp)) - 1; + + ASSERT(my_listnum >= 1); + ASSERT(my_listnum < VMEM_FREELISTS); + + int next_listnum = my_listnum + 1; + + const vmem_seg_t *nextlist = (vmem_seg_t *)&vmp->vm_freelist[next_listnum]; + + ASSERT(vsp->vs_span_createtime != 0); + if (vsp->vs_span_createtime == 0) { + TraceEvent(TRACE_WARNING, "SPL: %s: WARNING: vsp->vs_span_createtime == 0 (%s)!\n", + __func__, vmp->vm_name); + } + + // continuing our example, starts with p at flist[8k] + // and n at the following freelist entry + + const vmem_seg_t *vhead = vprev; + vmem_seg_t *p = vprev; + vmem_seg_t *n = p->vs_knext; + + // walk from the freelist head looking for + // a segment whose creation time is earlier than + // the segment to be inserted's creation time, + // then insert before that segment. + + for (uint32_t step = 0; + flist_sort_compare(newfirst, vhead, nextlist, p, vsp) == B_TRUE; + step++) { + // iterating while predecessor pointer p was created + // at a later tick than funcarg vsp. + // + // below we set p to n and update n. + ASSERT(n != NULL); + if (n == nextlist) { + //dprintf("SPL: %s: at marker (%s)(steps: %u) p->vs_start, end == %lu, %lu\n", + // __func__, vmp->vm_name, step, + // (uintptr_t)p->vs_start, (uintptr_t)p->vs_end); + // IOSleep(1); + // the next entry is the next marker (e.g. 16k marker) + break; + } + if (n->vs_start == 0) { + // from vmem_freelist_delete, this is a head + //dprintf("SPL: %s: n->vs_start == 0 (%s)(steps: %u) p->vs_start, end == %lu, %lu\n", + // __func__, vmp->vm_name, step, + // (uintptr_t)p->vs_start, (uintptr_t)p->vs_end); + // IOSleep(1); + break; + } + if (step >= max_walk_steps) { + ASSERT(nextlist->vs_kprev != NULL); + // we have walked far enough. + // put this segment at the tail of the freelist. + if (nextlist->vs_kprev != NULL) { + n = (vmem_seg_t *)nextlist; + p = nextlist->vs_kprev; + } + //dprintf("SPL: %s: walked out (%s)\n", __func__, vmp->vm_name); + // IOSleep(1); + atomic_inc_64(&spl_frag_walked_out); + break; + } + if (n->vs_knext == NULL) { + //dprintf("SPL: %s: n->vs_knext == NULL (my_listnum == %d)\n", + // __func__, my_listnum); + // IOSleep(1); + break; + } + p = n; + n = n->vs_knext; + atomic_inc_64(&spl_frag_walk_cnt); + } + + ASSERT(p != NULL); + + // insert segment between p and n + + vsp->vs_type = VMEM_FREE; + vmp->vm_freemap |= VS_SIZE(vprev); + VMEM_INSERT(p, vsp, k); + + cv_broadcast(&vmp->vm_cv); +} + +/* + * Add vsp to the appropriate freelist. + */ +static void +vmem_freelist_insert(vmem_t *vmp, vmem_seg_t *vsp) +{ + + if (vmp->vm_cflags & VMC_TIMEFREE) { + vmem_freelist_insert_sort_by_time(vmp, vsp); + return; + } + + vmem_seg_t *vprev; + + ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); + + vprev = (vmem_seg_t *)&vmp->vm_freelist[highbit(VS_SIZE(vsp)) - 1]; + vsp->vs_type = VMEM_FREE; + vmp->vm_freemap |= VS_SIZE(vprev); + VMEM_INSERT(vprev, vsp, k); + + cv_broadcast(&vmp->vm_cv); +} + +/* + * Take vsp from the freelist. + */ +static void +vmem_freelist_delete(vmem_t *vmp, vmem_seg_t *vsp) +{ + ASSERT(*VMEM_HASH(vmp, vsp->vs_start) != vsp); + ASSERT(vsp->vs_type == VMEM_FREE); + + if (vsp->vs_knext->vs_start == 0 && vsp->vs_kprev->vs_start == 0) { + /* + * The segments on both sides of 'vsp' are freelist heads, + * so taking vsp leaves the freelist at vsp->vs_kprev empty. 
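+         *
+         * Freelist heads are initialized (in vmem_create_common())
+         * with vs_start == 0 and vs_end == (1 << list index), so
+         * VS_SIZE() of a head is exactly the power-of-two bit that
+         * represents this freelist in vm_freemap; clearing that bit
+         * below marks the freelist empty.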
+ */ + ASSERT(vmp->vm_freemap & VS_SIZE(vsp->vs_kprev)); + vmp->vm_freemap ^= VS_SIZE(vsp->vs_kprev); + } + VMEM_DELETE(vsp, k); +} + +/* + * Add vsp to the allocated-segment hash table and update kstats. + */ +static void +vmem_hash_insert(vmem_t *vmp, vmem_seg_t *vsp) +{ + vmem_seg_t **bucket; + + vsp->vs_type = VMEM_ALLOC; + bucket = VMEM_HASH(vmp, vsp->vs_start); + vsp->vs_knext = *bucket; + *bucket = vsp; + + if (vmem_seg_size == sizeof (vmem_seg_t)) { + // vsp->vs_depth = (uint8_t)getpcstack(vsp->vs_stack, + // VMEM_STACK_DEPTH); + // vsp->vs_thread = curthread; + + vsp->vs_depth = 0; + vsp->vs_thread = 0; + vsp->vs_timestamp = gethrtime(); + } else { + vsp->vs_depth = 0; + } + + vmp->vm_kstat.vk_alloc.value.ui64++; + vmp->vm_kstat.vk_mem_inuse.value.ui64 += VS_SIZE(vsp); +} + +/* + * Remove vsp from the allocated-segment hash table and update kstats. + */ +static vmem_seg_t * +vmem_hash_delete(vmem_t *vmp, uintptr_t addr, uint32_t size) +{ + vmem_seg_t *vsp, **prev_vspp; + + prev_vspp = VMEM_HASH(vmp, addr); + while ((vsp = *prev_vspp) != NULL) { + if (vsp->vs_start == addr) { + *prev_vspp = vsp->vs_knext; + break; + } + vmp->vm_kstat.vk_lookup.value.ui64++; + prev_vspp = &vsp->vs_knext; + } + + if (vsp == NULL) + panic("vmem_hash_delete(%p, %lx, %lu): bad free (name: %s, addr, size)", + (void *)vmp, addr, size, vmp->vm_name); + if (VS_SIZE(vsp) != size) + panic("vmem_hash_delete(%p, %lx, %lu): (%s) wrong size (expect %lu)", + (void *)vmp, addr, size, vmp->vm_name, VS_SIZE(vsp)); + + vmp->vm_kstat.vk_free.value.ui64++; + vmp->vm_kstat.vk_mem_inuse.value.ui64 -= size; + + return (vsp); +} + +/* + * Create a segment spanning the range [start, end) and add it to the arena. + */ +static vmem_seg_t * +vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end) +{ + vmem_seg_t *newseg = vmem_getseg(vmp); + + newseg->vs_start = start; + newseg->vs_end = end; + newseg->vs_type = 0; + newseg->vs_import = 0; + newseg->vs_span_createtime = 0; + + VMEM_INSERT(vprev, newseg, a); + + return (newseg); +} + +/* + * Remove segment vsp from the arena. + */ +static void +vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp) +{ + ASSERT(vsp->vs_type != VMEM_ROTOR); + VMEM_DELETE(vsp, a); + + vmem_putseg(vmp, vsp); +} + +/* + * Add the span [vaddr, vaddr + size) to vmp and update kstats. + */ +static vmem_seg_t * +vmem_span_create(vmem_t *vmp, void *vaddr, uint32_t size, uint8_t import) +{ + vmem_seg_t *newseg, *span; + uintptr_t start = (uintptr_t)vaddr; + uintptr_t end = start + size; + + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + + if ((start | end) & (vmp->vm_quantum - 1)) + panic("vmem_span_create(%p, %p, %lu): misaligned (%s)", + (void *)vmp, vaddr, size, vmp->vm_name); + + span = vmem_seg_create(vmp, vmp->vm_seg0.vs_aprev, start, end); + span->vs_type = VMEM_SPAN; + span->vs_import = import; + + hrtime_t t = 0; + if (vmp->vm_cflags & VMC_TIMEFREE) { + t = gethrtime(); + } + span->vs_span_createtime = t; + + VMEM_INSERT(vmp->vm_seg0.vs_kprev, span, k); + + newseg = vmem_seg_create(vmp, span, start, end); + newseg->vs_span_createtime = t; + + vmem_freelist_insert(vmp, newseg); + + if (import) + vmp->vm_kstat.vk_mem_import.value.ui64 += size; + vmp->vm_kstat.vk_mem_total.value.ui64 += size; + + return (newseg); +} + +/* + * Remove span vsp from vmp and update kstats. 
+ */ +static void +vmem_span_destroy(vmem_t *vmp, vmem_seg_t *vsp) +{ + vmem_seg_t *span = vsp->vs_aprev; + uint32_t size = (uint32_t) VS_SIZE(vsp); + + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + ASSERT(span->vs_type == VMEM_SPAN); + + if (span->vs_import) + vmp->vm_kstat.vk_mem_import.value.ui64 -= size; + vmp->vm_kstat.vk_mem_total.value.ui64 -= size; + + VMEM_DELETE(span, k); + + vmem_seg_destroy(vmp, vsp); + vmem_seg_destroy(vmp, span); +} + +/* + * Allocate the subrange [addr, addr + size) from segment vsp. + * If there are leftovers on either side, place them on the freelist. + * Returns a pointer to the segment representing [addr, addr + size). + */ +static vmem_seg_t * +vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, uint32_t size) +{ + uintptr_t vs_start = vsp->vs_start; + uintptr_t vs_end = vsp->vs_end; + uint32_t vs_size = (uint32_t)(vs_end - vs_start); + uint32_t realsize = P2ROUNDUP(size, vmp->vm_quantum); + uintptr_t addr_end = addr + realsize; + + ASSERT(P2PHASE(vs_start, vmp->vm_quantum) == 0); + ASSERT(P2PHASE(addr, vmp->vm_quantum) == 0); + ASSERT(vsp->vs_type == VMEM_FREE); + ASSERT(addr >= vs_start && addr_end - 1 <= vs_end - 1); + ASSERT(addr - 1 <= addr_end - 1); + + hrtime_t parent_seg_span_createtime = vsp->vs_span_createtime; + + /* + * If we're allocating from the start of the segment, and the + * remainder will be on the same freelist, we can save quite + * a bit of work. + */ + if (P2SAMEHIGHBIT(vs_size, vs_size - realsize) && addr == vs_start) { + ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); + vsp->vs_start = addr_end; + vsp = vmem_seg_create(vmp, vsp->vs_aprev, addr, addr + size); + vsp->vs_span_createtime = parent_seg_span_createtime; + vmem_hash_insert(vmp, vsp); + return (vsp); + } + + vmem_freelist_delete(vmp, vsp); + + if (vs_end != addr_end) { + vmem_seg_t *v = vmem_seg_create(vmp, vsp, addr_end, vs_end); + v->vs_span_createtime = parent_seg_span_createtime; + vmem_freelist_insert(vmp, v); + } + + if (vs_start != addr) { + vmem_seg_t *v = vmem_seg_create(vmp, vsp->vs_aprev, vs_start, addr); + v->vs_span_createtime = parent_seg_span_createtime; + vmem_freelist_insert(vmp, v); + } + + vsp->vs_start = addr; + vsp->vs_end = addr + size; + + vsp->vs_span_createtime = parent_seg_span_createtime; + + vmem_hash_insert(vmp, vsp); + return (vsp); +} + +/* + * Returns 1 if we are populating, 0 otherwise. + * Call it if we want to prevent recursion from HAT. + */ +int +vmem_is_populator() +{ + return (mutex_owner(&vmem_sleep_lock) == curthread || + mutex_owner(&vmem_nosleep_lock) == curthread || + mutex_owner(&vmem_pushpage_lock) == curthread || + mutex_owner(&vmem_panic_lock) == curthread); +} + +/* + * Populate vmp's segfree list with VMEM_MINFREE vmem_seg_t structures. + */ +static int +vmem_populate(vmem_t *vmp, int vmflag) +{ + char *p; + vmem_seg_t *vsp; + uint32_t nseg; + uint32_t size; + kmutex_t *lp; + int i; + + while (vmp->vm_nsegfree < VMEM_MINFREE && + (vsp = vmem_getseg_global()) != NULL) + vmem_putseg(vmp, vsp); + + if (vmp->vm_nsegfree >= VMEM_MINFREE) + return (1); + + /* + * If we're already populating, tap the reserve. 
+ */ + if (vmem_is_populator()) { + ASSERT(vmp->vm_cflags & VMC_POPULATOR); + return (1); + } + + mutex_exit(&vmp->vm_lock); + + // if (panic_thread == curthread) + // lp = &vmem_panic_lock; + // else + + if (vmflag & VM_NOSLEEP) + lp = &vmem_nosleep_lock; + else if (vmflag & VM_PUSHPAGE) + lp = &vmem_pushpage_lock; + else + lp = &vmem_sleep_lock; + + mutex_enter(lp); + + nseg = VMEM_MINFREE + vmem_populators * VMEM_POPULATE_RESERVE; + size = P2ROUNDUP(nseg * vmem_seg_size, vmem_seg_arena->vm_quantum); + nseg = size / vmem_seg_size; + + /* + * The following vmem_alloc() may need to populate vmem_seg_arena + * and all the things it imports from. When doing so, it will tap + * each arena's reserve to prevent recursion (see the block comment + * above the definition of VMEM_POPULATE_RESERVE). + */ + p = vmem_alloc(vmem_seg_arena, size, vmflag & VM_KMFLAGS); + if (p == NULL) { + mutex_exit(lp); + mutex_enter(&vmp->vm_lock); + vmp->vm_kstat.vk_populate_fail.value.ui64++; + return (0); + } + + /* + * Restock the arenas that may have been depleted during population. + */ + for (i = 0; i < (int)vmem_populators; i++) { + mutex_enter(&vmem_populator[i]->vm_lock); + while (vmem_populator[i]->vm_nsegfree < VMEM_POPULATE_RESERVE) + vmem_putseg(vmem_populator[i], + (vmem_seg_t *)(p + --nseg * vmem_seg_size)); + mutex_exit(&vmem_populator[i]->vm_lock); + } + + mutex_exit(lp); + mutex_enter(&vmp->vm_lock); + + /* + * Now take our own segments. + */ + ASSERT(nseg >= VMEM_MINFREE); + while (vmp->vm_nsegfree < VMEM_MINFREE) + vmem_putseg(vmp, (vmem_seg_t *)(p + --nseg * vmem_seg_size)); + + /* + * Give the remainder to charity. + */ + while (nseg > 0) + vmem_putseg_global((vmem_seg_t *)(p + --nseg * vmem_seg_size)); + + return (1); +} + +/* + * Advance a walker from its previous position to 'afterme'. + * Note: may drop and reacquire vmp->vm_lock. + */ +static void +vmem_advance(vmem_t *vmp, vmem_seg_t *walker, vmem_seg_t *afterme) +{ + vmem_seg_t *vprev = walker->vs_aprev; + vmem_seg_t *vnext = walker->vs_anext; + vmem_seg_t *vsp = NULL; + + VMEM_DELETE(walker, a); + + if (afterme != NULL) + VMEM_INSERT(afterme, walker, a); + + /* + * The walker segment's presence may have prevented its neighbors + * from coalescing. If so, coalesce them now. + */ + if (vprev->vs_type == VMEM_FREE) { + if (vnext->vs_type == VMEM_FREE) { + ASSERT(vprev->vs_end == vnext->vs_start); + ASSERT(vprev->vs_span_createtime == vnext->vs_span_createtime); + vmem_freelist_delete(vmp, vnext); + vmem_freelist_delete(vmp, vprev); + vprev->vs_end = vnext->vs_end; + vmem_freelist_insert(vmp, vprev); + vmem_seg_destroy(vmp, vnext); + } + vsp = vprev; + } else if (vnext->vs_type == VMEM_FREE) { + vsp = vnext; + } + + /* + * vsp could represent a complete imported span, + * in which case we must return it to the source. + */ + if (vsp != NULL && vsp->vs_aprev->vs_import && + vmp->vm_source_free != NULL && + vsp->vs_aprev->vs_type == VMEM_SPAN && + vsp->vs_anext->vs_type == VMEM_SPAN) { + void *vaddr = (void *)vsp->vs_start; + uint32_t size = (uint32_t)VS_SIZE(vsp); + ASSERT(size == VS_SIZE(vsp->vs_aprev)); + vmem_freelist_delete(vmp, vsp); + vmem_span_destroy(vmp, vsp); + vmp->vm_kstat.vk_parent_free.value.ui64++; + mutex_exit(&vmp->vm_lock); + vmp->vm_source_free(vmp->vm_source, vaddr, size); + mutex_enter(&vmp->vm_lock); + } +} + +/* + * VM_NEXTFIT allocations deliberately cycle through all virtual addresses + * in an arena, so that we avoid reusing addresses for as long as possible. + * This helps to catch used-after-freed bugs. 
It's also the perfect policy + * for allocating things like process IDs, where we want to cycle through + * all values in order. + */ +static void * +vmem_nextfit_alloc(vmem_t *vmp, uint32_t size, int vmflag) +{ + vmem_seg_t *vsp, *rotor; + uintptr_t addr; + uint32_t realsize = P2ROUNDUP(size, vmp->vm_quantum); + uint32_t vs_size; + + mutex_enter(&vmp->vm_lock); + + if (vmp->vm_nsegfree < VMEM_MINFREE && !vmem_populate(vmp, vmflag)) { + mutex_exit(&vmp->vm_lock); + return (NULL); + } + + /* + * The common case is that the segment right after the rotor is free, + * and large enough that extracting 'size' bytes won't change which + * freelist it's on. In this case we can avoid a *lot* of work. + * Instead of the normal vmem_seg_alloc(), we just advance the start + * address of the victim segment. Instead of moving the rotor, we + * create the new segment structure *behind the rotor*, which has + * the same effect. And finally, we know we don't have to coalesce + * the rotor's neighbors because the new segment lies between them. + */ + rotor = &vmp->vm_rotor; + vsp = rotor->vs_anext; + if (vsp->vs_type == VMEM_FREE && (vs_size = (uint32_t)VS_SIZE(vsp)) > realsize && + P2SAMEHIGHBIT(vs_size, vs_size - realsize)) { + ASSERT(highbit(vs_size) == highbit(vs_size - realsize)); + addr = vsp->vs_start; + vsp->vs_start = addr + realsize; + hrtime_t t = vsp->vs_span_createtime; + vmem_hash_insert(vmp, + vmem_seg_create(vmp, rotor->vs_aprev, addr, addr + size)); + vsp->vs_span_createtime = t; + mutex_exit(&vmp->vm_lock); + return ((void *)addr); + } + + /* + * Starting at the rotor, look for a segment large enough to + * satisfy the allocation. + */ + for (;;) { + atomic_inc_64(&vmp->vm_kstat.vk_search.value.ui64); + if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) + break; + vsp = vsp->vs_anext; + if (vsp == rotor) { + /* + * We've come full circle. One possibility is that the + * there's actually enough space, but the rotor itself + * is preventing the allocation from succeeding because + * it's sitting between two free segments. Therefore, + * we advance the rotor and see if that liberates a + * suitable segment. + */ + vmem_advance(vmp, rotor, rotor->vs_anext); + vsp = rotor->vs_aprev; + if (vsp->vs_type == VMEM_FREE && VS_SIZE(vsp) >= size) + break; + /* + * If there's a lower arena we can import from, or it's + * a VM_NOSLEEP allocation, let vmem_xalloc() handle it. + * Otherwise, wait until another thread frees something. + */ + if (vmp->vm_source_alloc != NULL || + (vmflag & VM_NOSLEEP)) { + mutex_exit(&vmp->vm_lock); + return (vmem_xalloc(vmp, size, vmp->vm_quantum, + 0, 0, NULL, NULL, vmflag & (VM_KMFLAGS | VM_NEXTFIT))); + } + atomic_inc_64(&vmp->vm_kstat.vk_wait.value.ui64); + atomic_inc_64(&vmp->vm_kstat.vk_threads_waiting.value.ui64); + atomic_inc_64(&spl_vmem_threads_waiting); + if (spl_vmem_threads_waiting > 1) + dprintf("SPL: %s: waiting for %lu sized alloc after full circle of %s, " + "waiting threads %llu, total threads waiting = %llu.\n", + __func__, size, vmp->vm_name, + vmp->vm_kstat.vk_threads_waiting.value.ui64, + spl_vmem_threads_waiting); + cv_wait(&vmp->vm_cv, &vmp->vm_lock); + atomic_dec_64(&spl_vmem_threads_waiting); + atomic_dec_64(&vmp->vm_kstat.vk_threads_waiting.value.ui64); + vsp = rotor->vs_anext; + } + } + + /* + * We found a segment. Extract enough space to satisfy the allocation. 
+ */ + addr = vsp->vs_start; + vsp = vmem_seg_alloc(vmp, vsp, addr, size); + ASSERT(vsp->vs_type == VMEM_ALLOC && + vsp->vs_start == addr && vsp->vs_end == addr + size); + + /* + * Advance the rotor to right after the newly-allocated segment. + * That's where the next VM_NEXTFIT allocation will begin searching. + */ + vmem_advance(vmp, rotor, vsp); + mutex_exit(&vmp->vm_lock); + return ((void *)addr); +} + +/* + * Checks if vmp is guaranteed to have a size-byte buffer somewhere on its + * freelist. If size is not a power-of-2, it can return a FALSE-negative. + * + * Used to decide if a newly imported span is superfluous after re-acquiring + * the arena lock. + */ +static int +vmem_canalloc(vmem_t *vmp, uint32_t size) +{ + int hb; + int flist = 0; + ASSERT(MUTEX_HELD(&vmp->vm_lock)); + + if ((size & (size - 1)) == 0) + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + else if ((hb = highbit(size)) < VMEM_FREELISTS) + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1ULL << hb)); + + return (flist); +} + +// Convenience functions for use when gauging +// allocation ability when not holding the lock. +// These are unreliable because vmp->vm_freemap is +// liable to change immediately after being examined. +int +vmem_canalloc_lock(vmem_t *vmp, uint32_t size) +{ + mutex_enter(&vmp->vm_lock); + int i = vmem_canalloc(vmp, size); + mutex_exit(&vmp->vm_lock); + return (i); +} + +int +vmem_canalloc_atomic(vmem_t *vmp, uint32_t size) +{ + int hb; + int flist = 0; + + //ulong_t freemap = __c11_atomic_load((_Atomic ulong_t *)&vmp->vm_freemap, __ATOMIC_SEQ_CST); + ulong_t freemap = InterlockedOr(&vmp->vm_freemap, 0); + + if (ISP2(size)) + flist = lowbit(P2ALIGN(freemap, size)); + else if ((hb = highbit(size)) < VMEM_FREELISTS) + flist = lowbit(P2ALIGN(freemap, 1ULL << hb)); + + return (flist); +} + +static inline uint64_t +spl_vmem_xnu_useful_bytes_free(void) +{ + extern volatile unsigned int vm_page_free_wanted; + extern volatile unsigned int vm_page_free_count; + extern volatile unsigned int vm_page_free_min; + + if (vm_page_free_wanted > 0) + return (0); + + uint64_t bytes_free = (uint64_t)vm_page_free_count * (uint64_t)PAGESIZE; + uint64_t bytes_min = (uint64_t)vm_page_free_min * (uint64_t)PAGESIZE; + + if (bytes_free <= bytes_min) + return (0); + + uint64_t useful_free = bytes_free - bytes_min; + + return (useful_free); +} + +uint64_t +vmem_xnu_useful_bytes_free(void) +{ + return(spl_vmem_xnu_useful_bytes_free()); +} + + +static void * +spl_vmem_malloc_unconditionally_unlocked(uint32_t size) +{ + extern void *osif_malloc(uint64_t); + atomic_inc_64(&spl_vmem_unconditional_allocs); + atomic_add_64(&spl_vmem_unconditional_alloc_bytes, size); + return(osif_malloc(size)); +} + +static void * +spl_vmem_malloc_unconditionally(uint32_t size) +{ + mutex_enter(&vmem_xnu_alloc_lock); + void *m = spl_vmem_malloc_unconditionally_unlocked(size); + mutex_exit(&vmem_xnu_alloc_lock); + return (m); +} + +static void * +spl_vmem_malloc_if_no_pressure(uint32_t size) +{ + // The mutex serializes concurrent callers, providing time for + // the variables in spl_vmem_xnu_useful_bytes_free() to be updated. 
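+    //
+    // Gate (see the size check below): the allocation is attempted only
+    // when xnu reports more usable free memory (beyond the
+    // vm_page_free_min floor) than MAX(size, 16 MiB), so even small
+    // requests are refused while overall free memory is scarce.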
+ mutex_enter(&vmem_xnu_alloc_lock); + if (spl_vmem_xnu_useful_bytes_free() > (MAX(size,16ULL*1024ULL*1024ULL))) { + extern void *osif_malloc(uint64_t); + void *p = osif_malloc(size); + if (p != NULL) { + spl_vmem_conditional_allocs++; + spl_vmem_conditional_alloc_bytes += size; + } + mutex_exit(&vmem_xnu_alloc_lock); + return (p); + } else { + spl_vmem_conditional_alloc_deny++; + spl_vmem_conditional_alloc_deny_bytes += size; + mutex_exit(&vmem_xnu_alloc_lock); + return (NULL); + } +} + +/* + * Allocate size bytes at offset phase from an align boundary such that the + * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr) + * that does not straddle a nocross-aligned boundary. + */ +void * +vmem_xalloc(vmem_t *vmp, uint32_t size, uint32_t align_arg, uint32_t phase, + uint32_t nocross, void *minaddr, void *maxaddr, int vmflag) +{ + vmem_seg_t *vsp; + vmem_seg_t *vbest = NULL; + uintptr_t addr, taddr, start, end; + uintptr_t align = (align_arg != 0) ? align_arg : vmp->vm_quantum; + void *vaddr, *xvaddr = NULL; + uint32_t xsize; + int hb, flist, resv; + uint32_t mtbf; + + if ((align | phase | nocross) & (vmp->vm_quantum - 1)) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "parameters not vm_quantum aligned", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + + if (nocross != 0 && + (align > nocross || P2ROUNDUP(phase + size, align) > nocross)) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "overconstrained allocation", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + + if (phase >= align || (align & (align - 1)) != 0 || + (nocross & (nocross - 1)) != 0) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "parameters inconsistent or invalid", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + + if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && + (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) + return (NULL); + + mutex_enter(&vmp->vm_lock); + for (;;) { + if (vmp->vm_nsegfree < VMEM_MINFREE && + !vmem_populate(vmp, vmflag)) + break; + do_alloc: + /* + * highbit() returns the highest bit + 1, which is exactly + * what we want: we want to search the first freelist whose + * members are *definitely* large enough to satisfy our + * allocation. However, there are certain cases in which we + * want to look at the next-smallest freelist (which *might* + * be able to satisfy the allocation): + * + * (1) The size is exactly a power of 2, in which case + * the smaller freelist is always big enough; + * + * (2) All other freelists are empty; + * + * (3) We're in the highest possible freelist, which is + * always empty (e.g. the 4GB freelist on 32-bit systems); + * + * (4) We're doing a best-fit or first-fit allocation. + */ + if ((size & (size - 1)) == 0) { + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + } else { + hb = highbit(size); + if ((vmp->vm_freemap >> hb) == 0 || + hb == VMEM_FREELISTS || + (vmflag & (VM_BESTFIT | VM_FIRSTFIT))) + hb--; + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); + } + + for (vbest = NULL, vsp = (flist == 0) ? NULL : + vmp->vm_freelist[flist - 1].vs_knext; + vsp != NULL; vsp = vsp->vs_knext) { + atomic_inc_64(&vmp->vm_kstat.vk_search.value.ui64); + if (vsp->vs_start == 0) { + /* + * We're moving up to a larger freelist, + * so if we've already found a candidate, + * the fit can't possibly get any better. + */ + if (vbest != NULL) + break; + /* + * Find the next non-empty freelist. 
+ */ + flist = lowbit(P2ALIGN(vmp->vm_freemap, + VS_SIZE(vsp))); + if (flist-- == 0) + break; + vsp = (vmem_seg_t *)&vmp->vm_freelist[flist]; + ASSERT(vsp->vs_knext->vs_type == VMEM_FREE); + continue; + } + if (vsp->vs_end - 1 < (uintptr_t)minaddr) + continue; + if (vsp->vs_start > (uintptr_t)maxaddr - 1) + continue; + start = MAX(vsp->vs_start, (uintptr_t)minaddr); + end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1; + taddr = P2PHASEUP(start, align, phase); + if (P2BOUNDARY(taddr, size, nocross)) + taddr += + P2ROUNDUP(P2NPHASE(taddr, nocross), align); + if ((taddr - start) + size > end - start || + (vbest != NULL && VS_SIZE(vsp) >= VS_SIZE(vbest))) + continue; + vbest = vsp; + addr = taddr; + if (!(vmflag & VM_BESTFIT) || VS_SIZE(vbest) == size) + break; + } + if (vbest != NULL) + break; + ASSERT(xvaddr == NULL); + if (size == 0) + panic("vmem_xalloc(): size == 0"); + if (vmp->vm_source_alloc != NULL && nocross == 0 && + minaddr == NULL && maxaddr == NULL) { + uint32_t aneeded, asize; + uint32_t aquantum = MAX(vmp->vm_quantum, + vmp->vm_source->vm_quantum); + uint32_t aphase = phase; + if ((align > aquantum) && + !(vmp->vm_cflags & VMC_XALIGN)) { + aphase = (uint32_t) ((P2PHASE(phase, aquantum) != 0) ? + align - vmp->vm_quantum : align - aquantum); + ASSERT(aphase >= phase); + } + aneeded = MAX(size + aphase, vmp->vm_min_import); + asize = P2ROUNDUP(aneeded, aquantum); + + if (asize < size) { + /* + * The rounding induced overflow; return NULL + * if we are permitted to fail the allocation + * (and explicitly panic if we aren't). + */ + if ((vmflag & VM_NOSLEEP) && + !(vmflag & VM_PANIC)) { + mutex_exit(&vmp->vm_lock); + return (NULL); + } + + panic("vmem_xalloc(): size overflow"); + } + + /* + * Determine how many segment structures we'll consume. + * The calculation must be precise because if we're + * here on behalf of vmem_populate(), we are taking + * segments from a very limited reserve. + */ + if (size == asize && !(vmp->vm_cflags & VMC_XALLOC)) + resv = VMEM_SEGS_PER_SPAN_CREATE + + VMEM_SEGS_PER_EXACT_ALLOC; + else if (phase == 0 && + align <= vmp->vm_source->vm_quantum) + resv = VMEM_SEGS_PER_SPAN_CREATE + + VMEM_SEGS_PER_LEFT_ALLOC; + else + resv = VMEM_SEGS_PER_ALLOC_MAX; + + ASSERT(vmp->vm_nsegfree >= resv); + vmp->vm_nsegfree -= resv; /* reserve our segs */ + mutex_exit(&vmp->vm_lock); + if (vmp->vm_cflags & VMC_XALLOC) { + //uint32_t oasize = asize; + vaddr = ((vmem_ximport_t *) + vmp->vm_source_alloc)(vmp->vm_source, + &asize, (uint32_t) align, vmflag & VM_KMFLAGS); + //ASSERT(asize >= oasize); + ASSERT(P2PHASE(asize, + vmp->vm_source->vm_quantum) == 0); + ASSERT(!(vmp->vm_cflags & VMC_XALIGN) || + IS_P2ALIGNED(vaddr, align)); + } else { + atomic_inc_64(&vmp->vm_kstat.vk_parent_alloc.value.ui64); + vaddr = vmp->vm_source_alloc(vmp->vm_source, + asize, vmflag & (VM_KMFLAGS | VM_NEXTFIT)); + } + mutex_enter(&vmp->vm_lock); + vmp->vm_nsegfree += resv; /* claim reservation */ + aneeded = size + (uint32_t)align - vmp->vm_quantum; + aneeded = P2ROUNDUP(aneeded, vmp->vm_quantum); + if (vaddr != NULL) { + /* + * Since we dropped the vmem lock while + * calling the import function, other + * threads could have imported space + * and made our import unnecessary. In + * order to save space, we return + * excess imports immediately. + */ + // but if there are threads waiting below, + // do not return the excess import, rather + // wake those threads up so they can use it. 
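+                // Three outcomes are handled below:
+                //  1. nobody waiting and the arena can now satisfy the
+                //     request without the import: stash the import in
+                //     xvaddr/xsize, retry the in-arena allocation, and
+                //     return the import to the parent on the way out;
+                //  2. threads waiting: keep the whole import (count
+                //     vk_excess, broadcast vm_cv) and fall through to 3;
+                //  3. otherwise: turn the import into a span and
+                //     allocate from it.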
+ if (asize > aneeded && + vmp->vm_source_free != NULL && + vmp->vm_kstat.vk_threads_waiting.value.ui64 == 0 && + vmem_canalloc(vmp, aneeded)) { + ASSERT(resv >= + VMEM_SEGS_PER_MIDDLE_ALLOC); + xvaddr = vaddr; + xsize = asize; + goto do_alloc; + } else if (vmp->vm_kstat.vk_threads_waiting.value.ui64 > 0) { + vmp->vm_kstat.vk_excess.value.ui64++; + cv_broadcast(&vmp->vm_cv); + } + vbest = vmem_span_create(vmp, vaddr, asize, 1); + addr = P2PHASEUP(vbest->vs_start, align, phase); + break; + } else if (vmem_canalloc(vmp, aneeded)) { + /* + * Our import failed, but another thread + * added sufficient free memory to the arena + * to satisfy our request. Go back and + * grab it. + */ + ASSERT(resv >= VMEM_SEGS_PER_MIDDLE_ALLOC); + goto do_alloc; + } + } + + /* + * If the requestor chooses to fail the allocation attempt + * rather than reap wait and retry - get out of the loop. + */ + if (vmflag & VM_ABORT) + break; + mutex_exit(&vmp->vm_lock); + + if (vmp->vm_cflags & VMC_IDENTIFIER) + kmem_reap_idspace(); + else + kmem_reap(); + + mutex_enter(&vmp->vm_lock); + if (vmflag & VM_NOSLEEP) + break; + atomic_inc_64(&vmp->vm_kstat.vk_wait.value.ui64); + atomic_inc_64(&vmp->vm_kstat.vk_threads_waiting.value.ui64); + atomic_inc_64(&spl_vmem_threads_waiting); + if (spl_vmem_threads_waiting > 0) { + dprintf("SPL: %s: vmem waiting for %lu sized alloc for %s, " + "waiting threads %llu, total threads waiting = %llu\n", + __func__, size, vmp->vm_name, + vmp->vm_kstat.vk_threads_waiting.value.ui64, + spl_vmem_threads_waiting); + extern int64_t spl_free_set_and_wait_pressure(int64_t, boolean_t, clock_t); + extern int64_t spl_free_manual_pressure_wrapper(void); + mutex_exit(&vmp->vm_lock); + spl_free_set_pressure(0); // release other waiting threads + int64_t target_pressure = size * spl_vmem_threads_waiting; + int64_t delivered_pressure = spl_free_set_and_wait_pressure(target_pressure, + TRUE, USEC2NSEC(500)); + dprintf("SPL: %s: pressure %lld targeted, %lld delivered\n", + __func__, target_pressure, delivered_pressure); + mutex_enter(&vmp->vm_lock); + } + cv_wait(&vmp->vm_cv, &vmp->vm_lock); + atomic_dec_64(&spl_vmem_threads_waiting); + atomic_dec_64(&vmp->vm_kstat.vk_threads_waiting.value.ui64); + } + if (vbest != NULL) { + ASSERT(vbest->vs_type == VMEM_FREE); + ASSERT(vbest->vs_knext != vbest); + /* re-position to end of buffer */ + if (vmflag & VM_ENDALLOC) { + addr += ((vbest->vs_end - (addr + size)) / align) * + align; + } + (void) vmem_seg_alloc(vmp, vbest, addr, size); + mutex_exit(&vmp->vm_lock); + if (xvaddr) { + atomic_inc_64(&vmp->vm_kstat.vk_parent_free.value.ui64); + vmp->vm_source_free(vmp->vm_source, xvaddr, xsize); + } + ASSERT(P2PHASE(addr, align) == phase); + ASSERT(!P2BOUNDARY(addr, size, nocross)); + ASSERT(addr >= (uintptr_t)minaddr); + ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1); + return ((void *)addr); + } + if (0 == (vmflag & VM_NO_VBA)) { + vmp->vm_kstat.vk_fail.value.ui64++; + } + mutex_exit(&vmp->vm_lock); + if (vmflag & VM_PANIC) + panic("vmem_xalloc(%p, %lu, %lu, %lu, %lu, %p, %p, %x): " + "cannot satisfy mandatory allocation", + (void *)vmp, size, align_arg, phase, nocross, + minaddr, maxaddr, vmflag); + ASSERT(xvaddr == NULL); + return (NULL); +} + +/* + * Free the segment [vaddr, vaddr + size), where vaddr was a constrained + * allocation. vmem_xalloc() and vmem_xfree() must always be paired because + * both routines bypass the quantum caches. 
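+ *
+ * For example (illustrative only), a constrained allocation and its
+ * matching free look like:
+ *
+ *     void *p = vmem_xalloc(arena, 8192, 8192, 0, 0, NULL, NULL, VM_SLEEP);
+ *     ...
+ *     vmem_xfree(arena, p, 8192);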
+ */ +void +vmem_xfree(vmem_t *vmp, void *vaddr, uint32_t size) +{ + vmem_seg_t *vsp, *vnext, *vprev; + + mutex_enter(&vmp->vm_lock); + + vsp = vmem_hash_delete(vmp, (uintptr_t)vaddr, size); + vsp->vs_end = P2ROUNDUP(vsp->vs_end, vmp->vm_quantum); + + /* + * Attempt to coalesce with the next segment. + */ + vnext = vsp->vs_anext; + if (vnext->vs_type == VMEM_FREE) { + ASSERT(vsp->vs_end == vnext->vs_start); + vmem_freelist_delete(vmp, vnext); + vsp->vs_end = vnext->vs_end; + vmem_seg_destroy(vmp, vnext); + } + + /* + * Attempt to coalesce with the previous segment. + */ + vprev = vsp->vs_aprev; + if (vprev->vs_type == VMEM_FREE) { + ASSERT(vprev->vs_end == vsp->vs_start); + vmem_freelist_delete(vmp, vprev); + vprev->vs_end = vsp->vs_end; + vmem_seg_destroy(vmp, vsp); + vsp = vprev; + } + + /* + * If the entire span is free, return it to the source. + */ + if (vsp->vs_aprev->vs_import && vmp->vm_source_free != NULL && + vsp->vs_aprev->vs_type == VMEM_SPAN && + vsp->vs_anext->vs_type == VMEM_SPAN) { + vaddr = (void *)vsp->vs_start; + size = (uint32_t) VS_SIZE(vsp); + ASSERT(size == VS_SIZE(vsp->vs_aprev)); + vmem_span_destroy(vmp, vsp); + vmp->vm_kstat.vk_parent_free.value.ui64++; + mutex_exit(&vmp->vm_lock); + vmp->vm_source_free(vmp->vm_source, vaddr, size); + } else { + vmem_freelist_insert(vmp, vsp); + mutex_exit(&vmp->vm_lock); + } +} + +/* + * Allocate size bytes from arena vmp. Returns the allocated address + * on success, NULL on failure. vmflag specifies VM_SLEEP or VM_NOSLEEP, + * and may also specify best-fit, first-fit, or next-fit allocation policy + * instead of the default instant-fit policy. VM_SLEEP allocations are + * guaranteed to succeed. + */ +void * +vmem_alloc(vmem_t *vmp, uint32_t size, int vmflag) +{ + vmem_seg_t *vsp; + uintptr_t addr; + int hb; + int flist = 0; + uint32_t mtbf; + + if (size - 1 < vmp->vm_qcache_max) + return (kmem_cache_alloc(vmp->vm_qcache[(size - 1) >> + vmp->vm_qshift], vmflag & VM_KMFLAGS)); + + if ((mtbf = vmem_mtbf | vmp->vm_mtbf) != 0 && gethrtime() % mtbf == 0 && + (vmflag & (VM_NOSLEEP | VM_PANIC)) == VM_NOSLEEP) + return (NULL); + + if (vmflag & VM_NEXTFIT) + return (vmem_nextfit_alloc(vmp, size, vmflag)); + + if (vmflag & (VM_BESTFIT | VM_FIRSTFIT)) + return (vmem_xalloc(vmp, size, vmp->vm_quantum, 0, 0, + NULL, NULL, vmflag)); + if (vmp->vm_cflags & VM_NEXTFIT) + return (vmem_nextfit_alloc(vmp, size, vmflag)); + + /* + * Unconstrained instant-fit allocation from the segment list. + */ + mutex_enter(&vmp->vm_lock); + + if (vmp->vm_nsegfree >= VMEM_MINFREE || vmem_populate(vmp, vmflag)) { + if ((size & (size - 1)) == 0) + flist = lowbit(P2ALIGN(vmp->vm_freemap, size)); + else if ((hb = highbit(size)) < VMEM_FREELISTS) + flist = lowbit(P2ALIGN(vmp->vm_freemap, 1UL << hb)); + } + + if (flist-- == 0) { + mutex_exit(&vmp->vm_lock); + return (vmem_xalloc(vmp, size, vmp->vm_quantum, + 0, 0, NULL, NULL, vmflag)); + } + + ASSERT(size <= (1UL << flist)); + vsp = vmp->vm_freelist[flist].vs_knext; + addr = vsp->vs_start; + if (vmflag & VM_ENDALLOC) { + addr += vsp->vs_end - (addr + size); + } + (void) vmem_seg_alloc(vmp, vsp, addr, size); + mutex_exit(&vmp->vm_lock); + return ((void *)addr); +} + +/* + * Free the segment [vaddr, vaddr + size). + */ +void +vmem_free(vmem_t *vmp, void *vaddr, uint32_t size) +{ + if (size - 1 < vmp->vm_qcache_max) + kmem_cache_free(vmp->vm_qcache[(size - 1) >> vmp->vm_qshift], + vaddr); + else + vmem_xfree(vmp, vaddr, size); +} + +/* + * Determine whether arena vmp contains the segment [vaddr, vaddr + size). 
+ */ +int +vmem_contains(vmem_t *vmp, void *vaddr, uint32_t size) +{ + uintptr_t start = (uintptr_t)vaddr; + uintptr_t end = start + size; + vmem_seg_t *vsp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + + mutex_enter(&vmp->vm_lock); + vmp->vm_kstat.vk_contains.value.ui64++; + for (vsp = seg0->vs_knext; vsp != seg0; vsp = vsp->vs_knext) { + vmp->vm_kstat.vk_contains_search.value.ui64++; + ASSERT(vsp->vs_type == VMEM_SPAN); + if (start >= vsp->vs_start && end - 1 <= vsp->vs_end - 1) + break; + } + mutex_exit(&vmp->vm_lock); + return (vsp != seg0); +} + +/* + * Add the span [vaddr, vaddr + size) to arena vmp. + */ +void * +vmem_add(vmem_t *vmp, void *vaddr, uint32_t size, int vmflag) +{ + if (vaddr == NULL || size == 0) + panic("vmem_add(%p, %p, %lu): bad arguments", + (void *)vmp, vaddr, size); + + ASSERT(!vmem_contains(vmp, vaddr, size)); + + mutex_enter(&vmp->vm_lock); + if (vmem_populate(vmp, vmflag)) + (void) vmem_span_create(vmp, vaddr, size, 0); + else + vaddr = NULL; + mutex_exit(&vmp->vm_lock); + return (vaddr); +} + +/* + * Walk the vmp arena, applying func to each segment matching typemask. + * If VMEM_REENTRANT is specified, the arena lock is dropped across each + * call to func(); otherwise, it is held for the duration of vmem_walk() + * to ensure a consistent snapshot. Note that VMEM_REENTRANT callbacks + * are *not* necessarily consistent, so they may only be used when a hint + * is adequate. + */ +void +vmem_walk(vmem_t *vmp, int typemask, + void (*func)(void *, void *, uint32_t), void *arg) +{ + vmem_seg_t *vsp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + vmem_seg_t walker; + + if (typemask & VMEM_WALKER) + return; + + bzero(&walker, sizeof (walker)); + walker.vs_type = VMEM_WALKER; + + mutex_enter(&vmp->vm_lock); + VMEM_INSERT(seg0, &walker, a); + for (vsp = seg0->vs_anext; vsp != seg0; vsp = vsp->vs_anext) { + if (vsp->vs_type & typemask) { + void *start = (void *)vsp->vs_start; + uint32_t size = (uint32_t) VS_SIZE(vsp); + if (typemask & VMEM_REENTRANT) { + vmem_advance(vmp, &walker, vsp); + mutex_exit(&vmp->vm_lock); + func(arg, start, size); + mutex_enter(&vmp->vm_lock); + vsp = &walker; + } else { + func(arg, start, size); + } + } + } + vmem_advance(vmp, &walker, NULL); + mutex_exit(&vmp->vm_lock); +} + +/* + * Return the total amount of memory whose type matches typemask. Thus: + * + * typemask VMEM_ALLOC yields total memory allocated (in use). + * typemask VMEM_FREE yields total memory free (available). + * typemask (VMEM_ALLOC | VMEM_FREE) yields total arena size. 
+ */ +uint32_t +vmem_size(vmem_t *vmp, int typemask) +{ + int64_t size = 0; + + if (typemask & VMEM_ALLOC) + size += (int64_t)vmp->vm_kstat.vk_mem_inuse.value.ui64; + if (typemask & VMEM_FREE) + size += (int64_t)vmp->vm_kstat.vk_mem_total.value.ui64 - + (int64_t)vmp->vm_kstat.vk_mem_inuse.value.ui64; + if (size < 0) + size = 0; + + return ((uint32_t)size); +} + +uint32_t +vmem_size_locked(vmem_t *vmp, int typemask) +{ + boolean_t m = (mutex_owner(&vmp->vm_lock) == curthread); + + if (!m) + mutex_enter(&vmp->vm_lock); + uint32_t s = vmem_size(vmp, typemask); + if (!m) + mutex_exit(&vmp->vm_lock); + return (s); +} + +uint32_t +vmem_size_semi_atomic(vmem_t *vmp, int typemask) +{ + int64_t size = 0; + uint64_t inuse = 0; + uint64_t total = 0; + + //__sync_swap(&total, vmp->vm_kstat.vk_mem_total.value.ui64); + //__sync_swap(&inuse, vmp->vm_kstat.vk_mem_inuse.value.ui64); + InterlockedExchange64(&total, vmp->vm_kstat.vk_mem_total.value.ui64); + InterlockedExchange64(&inuse, vmp->vm_kstat.vk_mem_inuse.value.ui64); + + int64_t inuse_signed = (int64_t)inuse; + int64_t total_signed = (int64_t)total; + + if (typemask & VMEM_ALLOC) + size += inuse_signed; + if (typemask & VMEM_FREE) + size += total_signed - inuse_signed; + + if (size < 0) + size = 0; + + return ((uint32_t) size); +} + +uint32_t +spl_vmem_size(vmem_t *vmp, int typemask) +{ + return(vmem_size_locked(vmp, typemask)); +} + +/* + * Create an arena called name whose initial span is [base, base + size). + * The arena's natural unit of currency is quantum, so vmem_alloc() + * guarantees quantum-aligned results. The arena may import new spans + * by invoking afunc() on source, and may return those spans by invoking + * ffunc() on source. To make small allocations fast and scalable, + * the arena offers high-performance caching for each integer multiple + * of quantum up to qcache_max. 
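+ *
+ * For example (illustrative only), a child arena that imports page-aligned
+ * spans from spl_default_arena and quantum-caches sizes up to 8 pages could
+ * be created as:
+ *
+ *     vmem_t *a = vmem_create("example_arena", NULL, 0, PAGESIZE,
+ *         vmem_alloc, vmem_free, spl_default_arena, 8 * PAGESIZE, VM_SLEEP);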
+ */ +static vmem_t * +vmem_create_common(const char *name, void *base, uint32_t size, uint32_t quantum, + void *(*afunc)(vmem_t *, uint32_t, int), + void (*ffunc)(vmem_t *, void *, uint32_t), + vmem_t *source, uint32_t qcache_max, int vmflag) +{ + int i; + uint32_t nqcache; + vmem_t *vmp, *cur, **vmpp; + vmem_seg_t *vsp; + vmem_freelist_t *vfp; + uint32_t id = atomic_inc_32_nv(&vmem_id); + + if (vmem_vmem_arena != NULL) { + vmp = vmem_alloc(vmem_vmem_arena, sizeof (vmem_t), + vmflag & VM_KMFLAGS); + } else { + ASSERT(id <= VMEM_INITIAL); + vmp = &vmem0[id - 1]; + } + + /* An identifier arena must inherit from another identifier arena */ + ASSERT(source == NULL || ((source->vm_cflags & VMC_IDENTIFIER) == + (vmflag & VMC_IDENTIFIER))); + + if (vmp == NULL) + return (NULL); + bzero(vmp, sizeof (vmem_t)); + + (void) snprintf(vmp->vm_name, VMEM_NAMELEN, "%s", name); + mutex_init(&vmp->vm_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vmp->vm_cv, NULL, CV_DEFAULT, NULL); + vmp->vm_cflags = vmflag; + vmflag &= VM_KMFLAGS; + + hrtime_t hrnow = gethrtime(); + + vmp->vm_createtime = hrnow; + + vmp->vm_quantum = quantum; + vmp->vm_qshift = highbit(quantum) - 1; + nqcache = MIN(qcache_max >> vmp->vm_qshift, VMEM_NQCACHE_MAX); + + for (i = 0; i <= VMEM_FREELISTS; i++) { + vfp = &vmp->vm_freelist[i]; + vfp->vs_end = 1ULL << i; + vfp->vs_knext = (vmem_seg_t *)(vfp + 1); + vfp->vs_kprev = (vmem_seg_t *)(vfp - 1); + } + + vmp->vm_freelist[0].vs_kprev = NULL; + vmp->vm_freelist[VMEM_FREELISTS].vs_knext = NULL; + vmp->vm_freelist[VMEM_FREELISTS].vs_end = 0; + vmp->vm_hash_table = vmp->vm_hash0; + vmp->vm_hash_mask = VMEM_HASH_INITIAL - 1; + vmp->vm_hash_shift = highbit(vmp->vm_hash_mask); + + vsp = &vmp->vm_seg0; + vsp->vs_anext = vsp; + vsp->vs_aprev = vsp; + vsp->vs_knext = vsp; + vsp->vs_kprev = vsp; + vsp->vs_type = VMEM_SPAN; + vsp->vs_span_createtime = hrnow; + + vsp = &vmp->vm_rotor; + vsp->vs_type = VMEM_ROTOR; + VMEM_INSERT(&vmp->vm_seg0, vsp, a); + + bcopy(&vmem_kstat_template, &vmp->vm_kstat, sizeof (vmem_kstat_t)); + + vmp->vm_id = id; + if (source != NULL) + vmp->vm_kstat.vk_source_id.value.ui32 = source->vm_id; + vmp->vm_source = source; + vmp->vm_source_alloc = afunc; + vmp->vm_source_free = ffunc; + + /* + * Some arenas (like vmem_metadata and kmem_metadata) cannot + * use quantum caching to lower fragmentation. Instead, we + * increase their imports, giving a similar effect. 
+ */ + if (vmp->vm_cflags & VMC_NO_QCACHE) { + if (qcache_max > VMEM_NQCACHE_MAX && ISP2(qcache_max)) { + vmp->vm_min_import = qcache_max; + } else { + vmp->vm_min_import = + VMEM_QCACHE_SLABSIZE(nqcache << vmp->vm_qshift); + } + nqcache = 0; + } + + if (nqcache != 0) { + ASSERT(!(vmflag & VM_NOSLEEP)); + vmp->vm_qcache_max = nqcache << vmp->vm_qshift; + for (i = 0; i < (int)nqcache; i++) { + char buf[VMEM_NAMELEN + 21]; + (void) snprintf(buf, VMEM_NAMELEN + 20, "%s_%lu", vmp->vm_name, + (i + 1) * quantum); + vmp->vm_qcache[i] = kmem_cache_create(buf, + (i + 1) * quantum, quantum, NULL, NULL, NULL, + NULL, vmp, KMC_QCACHE | KMC_NOTOUCH); + } + } + + if ((vmp->vm_ksp = kstat_create("vmem", vmp->vm_id, vmp->vm_name, + "vmem", KSTAT_TYPE_NAMED, sizeof (vmem_kstat_t) / + sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) != NULL) { + vmp->vm_ksp->ks_data = &vmp->vm_kstat; + kstat_install(vmp->vm_ksp); + } + + mutex_enter(&vmem_list_lock); + vmpp = &vmem_list; + while ((cur = *vmpp) != NULL) + vmpp = &cur->vm_next; + *vmpp = vmp; + mutex_exit(&vmem_list_lock); + + if (vmp->vm_cflags & VMC_POPULATOR) { + ASSERT(vmem_populators < VMEM_INITIAL); + vmem_populator[atomic_inc_32_nv(&vmem_populators) - 1] = vmp; + mutex_enter(&vmp->vm_lock); + (void) vmem_populate(vmp, vmflag | VM_PANIC); + mutex_exit(&vmp->vm_lock); + } + + if ((base || size) && vmem_add(vmp, base, size, vmflag) == NULL) { + vmem_destroy(vmp); + return (NULL); + } + + return (vmp); +} + +vmem_t * +vmem_xcreate(const char *name, void *base, uint32_t size, uint32_t quantum, + vmem_ximport_t *afunc, vmem_free_t *ffunc, vmem_t *source, + uint32_t qcache_max, int vmflag) +{ + ASSERT(!(vmflag & (VMC_POPULATOR | VMC_XALLOC))); + vmflag &= ~(VMC_POPULATOR | VMC_XALLOC); + + return (vmem_create_common(name, base, size, quantum, + (vmem_alloc_t *)afunc, ffunc, source, qcache_max, + vmflag | VMC_XALLOC)); +} + +vmem_t * +vmem_create(const char *name, void *base, uint32_t size, uint32_t quantum, + vmem_alloc_t *afunc, vmem_free_t *ffunc, vmem_t *source, + uint32_t qcache_max, int vmflag) +{ + ASSERT(!(vmflag & (VMC_XALLOC | VMC_XALIGN))); + vmflag &= ~(VMC_XALLOC | VMC_XALIGN); + + return (vmem_create_common(name, base, size, quantum, + afunc, ffunc, source, qcache_max, vmflag)); +} + +/* + * Destroy arena vmp. + */ +void +vmem_destroy(vmem_t *vmp) +{ + vmem_t *cur, **vmpp; + vmem_seg_t *seg0 = &vmp->vm_seg0; + vmem_seg_t *vsp, *anext; + uint32_t leaked; + + /* + * set vm_nsegfree to zero because vmem_free_span_list + * would have already freed vm_segfree. + */ + vmp->vm_nsegfree = 0; + mutex_enter(&vmem_list_lock); + vmpp = &vmem_list; + while ((cur = *vmpp) != vmp) + vmpp = &cur->vm_next; + *vmpp = vmp->vm_next; + mutex_exit(&vmem_list_lock); + + leaked = vmem_size(vmp, VMEM_ALLOC); + if (leaked != 0) + dprintf( "SPL: vmem_destroy('%s'): leaked %lu %s\n", + vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? + "identifiers" : "bytes"); + + if (vmp->vm_hash_table != vmp->vm_hash0) + vmem_free(vmem_hash_arena, vmp->vm_hash_table, + (vmp->vm_hash_mask + 1) * sizeof (void *)); + + /* + * Give back the segment structures for anything that's left in the + * arena, e.g. the primary spans and their free segments. 
+     */
+    VMEM_DELETE(&vmp->vm_rotor, a);
+    for (vsp = seg0->vs_anext; vsp != seg0; vsp = anext) {
+        anext = vsp->vs_anext;
+        vmem_putseg_global(vsp);
+    }
+
+    while (vmp->vm_nsegfree > 0)
+        vmem_putseg_global(vmem_getseg(vmp));
+
+    kstat_delete(vmp->vm_ksp);
+
+    mutex_destroy(&vmp->vm_lock);
+    cv_destroy(&vmp->vm_cv);
+    vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t));
+}
+
+
+/*
+ * Destroy arena vmp.
+ */
+void
+vmem_destroy_internal(vmem_t *vmp)
+{
+    vmem_t *cur, **vmpp;
+    vmem_seg_t *seg0 = &vmp->vm_seg0;
+    vmem_seg_t *vsp, *anext;
+    uint32_t leaked;
+
+    mutex_enter(&vmem_list_lock);
+    vmpp = &vmem_list;
+    while ((cur = *vmpp) != vmp)
+        vmpp = &cur->vm_next;
+    *vmpp = vmp->vm_next;
+    mutex_exit(&vmem_list_lock);
+
+    leaked = vmem_size(vmp, VMEM_ALLOC);
+    if (leaked != 0)
+        dprintf("SPL: vmem_destroy('%s'): leaked %lu %s\n",
+            vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ?
+            "identifiers" : "bytes");
+
+    if (vmp->vm_hash_table != vmp->vm_hash0)
+        if (vmem_hash_arena != NULL)
+            vmem_free(vmem_hash_arena, vmp->vm_hash_table,
+                (vmp->vm_hash_mask + 1) * sizeof (void *));
+
+    /*
+     * Give back the segment structures for anything that's left in the
+     * arena, e.g. the primary spans and their free segments.
+     */
+    VMEM_DELETE(&vmp->vm_rotor, a);
+    for (vsp = seg0->vs_anext; vsp != seg0; vsp = anext) {
+        anext = vsp->vs_anext;
+        vmem_putseg_global(vsp);
+    }
+
+    while (vmp->vm_nsegfree > 0)
+        vmem_putseg_global(vmem_getseg(vmp));
+
+    if (!(vmp->vm_cflags & VMC_IDENTIFIER) && vmem_size(vmp, VMEM_ALLOC) != 0)
+        dprintf("SPL: vmem_destroy('%s'): STILL %lu bytes at kstat_delete() time\n",
+            vmp->vm_name, vmem_size(vmp, VMEM_ALLOC));
+
+    kstat_delete(vmp->vm_ksp);
+
+    mutex_destroy(&vmp->vm_lock);
+    cv_destroy(&vmp->vm_cv);
+
+    // Alas, to free, requires access to "vmem_vmem_arena" the very thing
+    // we release first.
+    //vmem_free(vmem_vmem_arena, vmp, sizeof (vmem_t));
+}
+
+/*
+ * Only shrink vmem hashtable if it is 1<<vmem_rescale_minshift times
+ * larger than necessary.
+ */
+int vmem_rescale_minshift = 3;
+
+/*
+ * Rescale the hash table to keep the hash chains short.
+ */
+static void
+vmem_hash_rescale(vmem_t *vmp)
+{
+    vmem_seg_t **old_table, **new_table, *vsp;
+    uint32_t old_size, new_size, h, nseg;
+
+    nseg = (uint32_t)(vmp->vm_kstat.vk_alloc.value.ui64 -
+        vmp->vm_kstat.vk_free.value.ui64);
+
+    new_size = MAX(VMEM_HASH_INITIAL, 1 << (highbit(3 * nseg + 4) - 2));
+    old_size = vmp->vm_hash_mask + 1;
+
+    if ((old_size >> vmem_rescale_minshift) <= new_size &&
+        new_size <= (old_size << 1))
+        return;
+
+    new_table = vmem_alloc(vmem_hash_arena, new_size * sizeof (void *),
+        VM_NOSLEEP);
+    if (new_table == NULL)
+        return;
+    bzero(new_table, new_size * sizeof (void *));
+
+    mutex_enter(&vmp->vm_lock);
+
+    old_size = vmp->vm_hash_mask + 1;
+    old_table = vmp->vm_hash_table;
+
+    vmp->vm_hash_mask = new_size - 1;
+    vmp->vm_hash_table = new_table;
+    vmp->vm_hash_shift = highbit(vmp->vm_hash_mask);
+
+    for (h = 0; h < old_size; h++) {
+        vsp = old_table[h];
+        while (vsp != NULL) {
+            uintptr_t addr = vsp->vs_start;
+            vmem_seg_t *next_vsp = vsp->vs_knext;
+            vmem_seg_t **hash_bucket = VMEM_HASH(vmp, addr);
+            vsp->vs_knext = *hash_bucket;
+            *hash_bucket = vsp;
+            vsp = next_vsp;
+        }
+    }
+
+    mutex_exit(&vmp->vm_lock);
+
+    if (old_table != vmp->vm_hash0)
+        vmem_free(vmem_hash_arena, old_table,
+            old_size * sizeof (void *));
+}
+
+/*
+ * Perform periodic maintenance on all vmem arenas.
+ */
+
+void
+vmem_update(void *dummy)
+{
+    vmem_t *vmp;
+
+    mutex_enter(&vmem_list_lock);
+    for (vmp = vmem_list; vmp != NULL; vmp = vmp->vm_next) {
+        /*
+         * If threads are waiting for resources, wake them up
+         * periodically so they can issue another kmem_reap()
+         * to reclaim resources cached by the slab allocator.
+         */
+        cv_broadcast(&vmp->vm_cv);
+
+        /*
+         * Rescale the hash table to keep the hash chains short.
+ */ + vmem_hash_rescale(vmp); + } + mutex_exit(&vmem_list_lock); + +// (void) bsd_timeout(vmem_update, dummy, &vmem_update_interval); +} + +void +vmem_qcache_reap(vmem_t *vmp) +{ + int i; + + /* + * Reap any quantum caches that may be part of this vmem. + */ + for (i = 0; i < VMEM_NQCACHE_MAX; i++) + if (vmp->vm_qcache[i]) + kmem_cache_reap_now(vmp->vm_qcache[i]); +} + +/* given a size, return the appropriate vmem_bucket_arena[] entry */ + +static inline uint16_t +vmem_bucket_number(uint32_t size) +{ + // For VMEM_BUCKET_HIBIT == 12, + // vmem_bucket_arena[n] holds allocations from 2^[n+11]+1 to 2^[n+12], + // so for [n] = 0, 2049-4096, for [n]=5 65537-131072, for [n]=7 (256k+1)-512k + + // set hb: 512k == 19, 256k+1 == 19, 256k == 18, ... + const int hb = highbit(size-1); + + int bucket = hb - VMEM_BUCKET_LOWBIT; + + // very large allocations go into the 16 MiB bucket + if (hb > VMEM_BUCKET_HIBIT) + bucket = VMEM_BUCKET_HIBIT - VMEM_BUCKET_LOWBIT; + + // very small allocations go into the 4 kiB bucket + if (bucket < 0) + bucket = 0; + + return (uint16_t)(bucket); +} + +static inline vmem_t * +vmem_bucket_arena_by_size(uint32_t size) +{ + uint16_t bucket = vmem_bucket_number(size); + + return(vmem_bucket_arena[bucket]); +} + +vmem_t * +spl_vmem_bucket_arena_by_size(uint32_t size) +{ + return(vmem_bucket_arena_by_size(size)); +} + +static inline void +vmem_bucket_wake_all_waiters(void) +{ + for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) { + const int bucket = i - VMEM_BUCKET_LOWBIT; + vmem_t *bvmp = vmem_bucket_arena[bucket]; + cv_broadcast(&bvmp->vm_cv); + } + cv_broadcast(&spl_heap_arena->vm_cv); +} + +/* + * xnu_alloc_throttled_bail() : spin looking for memory + * + */ + +static inline void * +xnu_alloc_throttled_bail(uint64_t now_ticks, vmem_t *calling_vmp, uint32_t size, int vmflags) +{ + + // spin looking for memory + + const uint64_t bigtarget = MAX(size,16ULL*1024ULL*1024ULL); + + static volatile _Atomic uint64_t alloc_lock = FALSE; + + static volatile _Atomic uint64_t force_time = 0; + + uint64_t timeout_ticks = hz / 2; + if (vmflags & VM_PUSHPAGE) + timeout_ticks = hz / 4; + + uint64_t timeout_time = now_ticks + timeout_ticks; + + for (uint32_t suspends = 0, blocked_suspends = 0, try_no_pressure = 0; ; ) { + if (force_time + timeout_ticks > timeout_time) { + // another thread has forced an allocation + // by timing out. push our deadline into the future. + timeout_time = force_time + timeout_ticks; + } + if (alloc_lock) { + blocked_suspends++; + IOSleep(1); + } else if (spl_vmem_xnu_useful_bytes_free() >= bigtarget) { + // if alloc_lock == f then alloc_lock = TRUE and result is TRUE + // otherwise result is FALSE and f = TRUE + //if ( ! __c11_atomic_compare_exchange_strong(&alloc_lock, &f, TRUE, + // __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + if (InterlockedCompareExchange64(&alloc_lock, TRUE, FALSE) != FALSE) { + // avoid (highly unlikely) data race on alloc_lock. + // if alloc_lock has become TRUE while we were in the + // else if expression then we effectively optimize away + // the (relaxed) load of alloc_lock (== TRUE) into f and + // continue. 
+ continue; + } + // alloc_lock is now visible as TRUE to all threads + try_no_pressure++; + void *m = spl_vmem_malloc_if_no_pressure(size); + if (m != NULL) { + uint64_t ticks = zfs_lbolt() - now_ticks; + xprintf("SPL: %s returning %llu bytes after " + "%llu ticks (hz=%u, seconds = %llu), " + "%u suspends, %u blocked, %u tries (%s)\n", + __func__, (uint64_t)size, + ticks, hz, ticks/hz, suspends, + blocked_suspends, try_no_pressure, calling_vmp->vm_name); + alloc_lock = FALSE; // atomic seq cst, so is published to all threads + return(m); + } else { + alloc_lock = FALSE; + spl_free_set_emergency_pressure(bigtarget); + suspends++; + IOSleep(1); + } + } else if (zfs_lbolt() > timeout_time) { + //if ( ! __c11_atomic_compare_exchange_strong(&alloc_lock, &f, TRUE, + // __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)) { + if (InterlockedCompareExchange64(&alloc_lock, TRUE, FALSE) != FALSE) { + // avoid (highly unlikely) data race on alloc_lock as above + continue; + } + void *mp = spl_vmem_malloc_unconditionally(size); + uint64_t now = zfs_lbolt(); + uint64_t ticks = now - now_ticks; + force_time = now; + xprintf("SPL: %s TIMEOUT %llu bytes after " + "%llu ticks (hz=%u, seconds=%llu), " + "%u suspends, %u blocked, %u tries (%s)\n", + __func__, (uint64_t)size, + ticks, hz, ticks/hz, suspends, + blocked_suspends, try_no_pressure, calling_vmp->vm_name); + alloc_lock = FALSE; + atomic_inc_64(&spl_xat_forced); + return(mp); + } else { + spl_free_set_emergency_pressure(bigtarget); + suspends++; + IOSleep(1); + } + } +} + +static void * +xnu_alloc_throttled(vmem_t *bvmp, uint32_t size, int vmflag) +{ + + // the caller is one of the bucket arenas. + // null_vmp will be spl_default_arena_parent, which is just a placeholder. + + uint64_t now = zfs_lbolt(); + const uint64_t entry_now = now; + + void *m = spl_vmem_malloc_if_no_pressure(size); + + if (m != NULL) { + atomic_inc_64(&spl_xat_success); + spl_xat_lastalloc = gethrtime(); + // wake up waiters on all the arena condvars + // since there is apparently no memory shortage. + vmem_bucket_wake_all_waiters(); + return (m); + } else { + spl_free_set_emergency_pressure((int64_t)size); + } + + if (vmflag & VM_PANIC) { + // force an allocation now to avoid a panic + spl_xat_lastalloc = gethrtime(); + spl_free_set_emergency_pressure(4LL * (int64_t)size); + void *p = spl_vmem_malloc_unconditionally(size); + // p cannot be NULL (unconditional kernel malloc always works or panics) + // therefore: success, wake all waiters on alloc|free condvar + // wake up arena waiters to let them know there is memory + // available in the arena; let waiters on other bucket arenas + // continue sleeping. + cv_broadcast(&bvmp->vm_cv); + return (p); + } + + if (vmflag & VM_NOSLEEP) { + spl_free_set_emergency_pressure(MAX(2LL * (int64_t)size,16LL*1024LL*1024LL)); + kpreempt(KPREEMPT_SYNC); /* cheating a bit, but not really waiting */ + void *p = spl_vmem_malloc_if_no_pressure(size); + if (p != NULL) { + atomic_inc_64(&spl_xat_late_success_nosleep); + cv_broadcast(&bvmp->vm_cv); + spl_xat_lastalloc = gethrtime(); + } + // if p == NULL, then there will be an increment in the fail kstat + return (p); + } + + /* + * Loop for a while trying to satisfy VM_SLEEP allocations. + * + * If we are able to allocate memory, then return the pointer. + * + * We return NULL if some other thread's activity has caused + * sufficient memory to appear in this arena that we can satisfy + * the allocation. 
+ *
+ * We call xnu_alloc_throttled_bail() after a few milliseconds of waiting;
+ * it will either return a pointer to newly allocated memory or NULL. We
+ * return the result.
+ *
+ */
+
+    const uint32_t bucket_number = vmem_bucket_id_to_bucket_number[bvmp->vm_id];
+
+    static volatile _Atomic uint32_t waiters = 0;
+
+    atomic_inc_32(&waiters);
+
+    if (waiters == 1UL)
+        atomic_inc_64(&spl_xat_no_waiters);
+
+    static _Atomic uint32_t max_waiters_seen = 0;
+
+    if (waiters > max_waiters_seen) {
+        max_waiters_seen = waiters;
+        xprintf("SPL: %s: max_waiters_seen increased to %u\n", __func__, max_waiters_seen);
+    }
+
+    boolean_t local_xat_pressured = FALSE;
+
+    for (; ;) {
+        clock_t wait_time = USEC2NSEC(500UL * MAX(waiters,1UL));
+        mutex_enter(&bvmp->vm_lock);
+        spl_xat_sleep++;
+        if (local_xat_pressured) {
+            spl_xat_pressured++;
+            local_xat_pressured = FALSE;
+        }
+        (void) cv_timedwait_hires(&bvmp->vm_cv, &bvmp->vm_lock,
+            wait_time, 0, 0);
+        mutex_exit(&bvmp->vm_lock);
+        now = zfs_lbolt();
+        // We may be here because of a broadcast to &vmp->vm_cv,
+        // causing xnu to schedule all the sleepers in priority-weighted FIFO
+        // order. Because of the mutex_exit(), the sections below here may
+        // be entered concurrently.
+
+        // spl_vmem_malloc_if_no_pressure does a mutex, so avoid calling it
+        // unless there is a chance it will succeed.
+        if (spl_vmem_xnu_useful_bytes_free() > (MAX(size,16ULL*1024ULL*1024ULL))) {
+            void *a = spl_vmem_malloc_if_no_pressure(size);
+            if (a != NULL) {
+                atomic_inc_64(&spl_xat_late_success);
+                spl_xat_lastalloc = gethrtime();
+                atomic_dec_32(&waiters);
+                // Wake up all waiters on the bucket arena locks,
+                // since the system apparently has memory again.
+                vmem_bucket_wake_all_waiters();
+                return (a);
+            } else {
+                // Probably vm_page_free_count changed while we were
+                // in the mutex queue in spl_vmem_malloc_if_no_pressure().
+                // There is therefore no point in doing the bail-out check
+                // below, so go back to the top of the for loop.
+                atomic_inc_64(&spl_xat_late_deny);
+                continue;
+            }
+        }
+        if (now > entry_now + hz / 4 || spl_vba_threads[bucket_number] > 1UL) {
+            // If there are other threads waiting for us in vba()
+            // then when we satisfy this allocation, we satisfy more than one
+            // thread, so invoke XATB().
+            // Otherwise, if we have had no luck for 250 ms, then
+            // switch to XATB() which is much more aggressive.
+            if (spl_vba_threads[bucket_number] > 1UL)
+                atomic_inc_64(&spl_xat_bailed_contended);
+            atomic_inc_64(&spl_xat_bailed);
+            static _Atomic uint32_t bailing_threads = 0, max_bailers_seen = 0;
+            atomic_inc_32(&bailing_threads);
+            if (bailing_threads > max_bailers_seen) {
+                max_bailers_seen = bailing_threads;
+                xprintf("SPL: %s: max_bailers_seen increased to %u\n",
+                    __func__, max_bailers_seen);
+            }
+            void *b = xnu_alloc_throttled_bail(now, bvmp, size, vmflag);
+            atomic_dec_32(&bailing_threads);
+            spl_xat_lastalloc = gethrtime();
+            // wake up waiters on the arena lock,
+            // since they now have memory they can use.
+            cv_broadcast(&bvmp->vm_cv);
+            // open turnstile after having bailed, rather than before
+            atomic_dec_32(&waiters);
+            return (b);
+        } else if (now - entry_now > 0 && ((now - entry_now) % (hz/10))) {
+            spl_free_set_emergency_pressure(MAX(size,16LL*1024LL*1024LL));
+            local_xat_pressured = TRUE;
+        }
+    }
+}
+
+static void
+xnu_free_throttled(vmem_t *vmp, void *vaddr, uint32_t size)
+{
+    extern void osif_free(void *, uint64_t);
+
+    // Serialize behind a (short) spin-sleep delay, giving
+    // xnu time to do freelist management and
+    // PT teardowns
+
+    // In the usual case there is only one thread in this function,
+    // so we proceed waitlessly to osif_free().
+
+    // When there are multiple threads here, we delay the 2nd and later.
+
+    // Explicit race:
+    // The osif_free() is not protected by the vmem_xnu_alloc_lock
+    // mutex; that is just used for implementing the delay. Consequently,
+    // the waiters on the same lock in spl_vmem_malloc_if_no_pressure may
+    // falsely see too small a value for vm_page_free_count. We don't
+    // care in part because xnu performs poorly when doing
+    // free-then-allocate anyway.
+
+    // a_waiters gauges the loop exit checking and sleep duration;
+    // it is a count of the number of threads trying to do work
+    // in this function.
+    static volatile _Atomic uint32_t a_waiters = 0;
+
+    // is_freeing protects the osif_free() call; see comment below
+    static volatile _Atomic uint64_t is_freeing = FALSE;
+
+    atomic_inc_32(&a_waiters); // generates "lock incl ..."
+
+    static _Atomic uint32_t max_waiters_seen = 0;
+
+    if (a_waiters > max_waiters_seen) {
+        max_waiters_seen = a_waiters;
+        dprintf("SPL: %s: max_waiters_seen increased to %u\n", __func__, max_waiters_seen);
+    }
+
+    for (uint32_t iter = 0; a_waiters > 1UL; iter++) {
+        // there is more than one thread here, so suspend and sleep for 1 ms
+        atomic_inc_64(&spl_xft_wait);
+        IOSleep(1);
+        // If we are growing old in this loop, then see if
+        // anyone else is still in osif_free. If not, we can exit.
+        if (iter >= a_waiters) {
+            // if is_freeing == f, then set is_freeing to TRUE with
+            // release semantics (i.e. "push" it to other cores) then break;
+            // otherwise, set f to TRUE relaxedly (i.e., optimize it out)
+            //uint64_t f = FALSE;
+            //if (__c11_atomic_compare_exchange_weak(&is_freeing, &f, TRUE,
+            //    __ATOMIC_RELEASE, __ATOMIC_RELAXED)) {
+            //
+            // The Interlocked call returns the previous value of
+            // is_freeing, so we may break (and take over the free)
+            // only when that previous value was FALSE.
+            if (InterlockedCompareExchange64(&is_freeing, TRUE, FALSE) == FALSE) {
+                break;
+            }
+        }
+    }
+    // If there is more than one thread in this function, osif_free() is
+    // protected by is_freeing. Release it after the osif_free()
+    // call has been made and the lastfree bookkeeping has been done.
+    osif_free(vaddr, size);
+    spl_xat_lastfree = gethrtime();
+    is_freeing = B_FALSE;
+    atomic_dec_32(&a_waiters);
+    kpreempt(KPREEMPT_SYNC);
+    // since we just gave back xnu enough to satisfy an allocation
+    // in at least the smaller buckets, let's wake up anyone in
+    // the cv_wait() in vmem_xalloc([bucket_#], ...)
+    vmem_bucket_wake_all_waiters();
+}
+
+// Return B_TRUE if the bit was unset before the atomic OR (i.e., we acquired it).
+static inline boolean_t
+vba_atomic_lock_bucket(volatile _Atomic uint16_t *bbap, uint16_t bucket_bit)
+{
+
+    // We use a test-and-set of the appropriate bit
+    // in buckets_busy_allocating; if it was not set,
+    // then break out of the loop.
+    //
+    // This compiles into an orl, cmpxchgw instruction pair.
+    // The return from __c11_atomic_fetch_or() is the
+    // previous value of buckets_busy_allocating.
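+    // The matching release is the InterlockedAnd16(&buckets_busy_allocating,
+    // ~bucket_bit) in vmem_bucket_alloc() below, performed after its
+    // vmem_alloc() call returns; acquire and release together make each
+    // bucket a one-VM_SLEEP-allocator-at-a-time gate.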
+ + //uint16_t prev = __c11_atomic_fetch_or(bbap, bucket_bit, __ATOMIC_SEQ_CST); + uint16_t prev = InterlockedOr16(bbap, bucket_bit); + if (prev & bucket_bit) + return (FALSE); // we did not acquire the bit lock here + else + return (TRUE); // we turned the bit from 0 to 1 +} + +static void * +vmem_bucket_alloc(vmem_t *null_vmp, uint32_t size, const int vmflags) +{ + + if (vmflags & VM_NO_VBA) + return (NULL); + + // caller is spl_heap_arena looking for memory. + // null_vmp will be spl_default_arena_parent, and so is just a placeholder. + + vmem_t *calling_arena = spl_heap_arena; + + static volatile _Atomic uint32_t hipriority_allocators = 0; // Windosed + boolean_t local_hipriority_allocator = FALSE; + + if (0 != (vmflags & (VM_PUSHPAGE | VM_NOSLEEP | VM_PANIC | VM_ABORT))) { + local_hipriority_allocator = TRUE; + //hipriority_allocators++; + atomic_inc_32(&hipriority_allocators); + } + + if (!ISP2(size)) + atomic_inc_64(&spl_bucket_non_pow2_allocs); + + vmem_t *bvmp = vmem_bucket_arena_by_size(size); + + // there are 13 buckets, so use a 16-bit scalar to hold + // a set of bits, where each bit corresponds to an in-progress + // vmem_alloc(bucket, ...) below. + + static volatile _Atomic uint16_t buckets_busy_allocating = 0; + const uint16_t bucket_number = vmem_bucket_number(size); + const uint16_t bucket_bit = (uint16_t)1 << bucket_number; + + //spl_vba_threads[bucket_number]++; + atomic_inc_32(&spl_vba_threads[bucket_number]); + + static volatile _Atomic uint32_t waiters = 0; + + // First, if we are VM_SLEEP, check for memory, try some pressure, + // and if that doesn't work, force entry into the loop below. + + boolean_t loop_once = FALSE; + + if ((vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) == 0 && + ! vmem_canalloc_atomic(bvmp, size)) { + if (spl_vmem_xnu_useful_bytes_free() < (MAX(size,16ULL*1024ULL*1024ULL))) { + spl_free_set_emergency_pressure(size); + IOSleep(1); + if (! vmem_canalloc_atomic(bvmp, size) && + (spl_vmem_xnu_useful_bytes_free() < (MAX(size,16ULL*1024ULL*1024ULL)))) { + loop_once = TRUE; + } + } + } + + // spin-sleep: if we would need to go to the xnu allocator. + // + // We want to avoid a burst of allocs from bucket_heap's children + // successively hitting a low-memory condition, or alternatively + // each successfully importing memory from xnu when they can share + // a single import. + // + // We also want to take advantage of any memory that becomes available + // in bucket_heap. + // + // If there is more than one thread in this function (~ few percent) + // then the subsequent threads are put into the loop below. They + // can escape the loop if they are [1]non-waiting allocations, or + // [2]if they become the only waiting thread, or + // [3]if the cv_timedwait_hires returns -1 (which represents EWOULDBLOCK + // from msleep() which gets it from _sleep()'s THREAD_TIMED_OUT) + // allocating in the bucket, or [4]if this thread has (rare condition) spent + // a quarter of a second in the loop. 
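+    // In the loop below those escapes map onto: [1] the
+    // VM_NOSLEEP/VM_PANIC/VM_ABORT break, [2] the "waiters > 1UL"
+    // loop condition, [3] cv_timedwait_hires() returning -1 (which
+    // sets bit 0 of "timedout"), and [4] zfs_lbolt() passing
+    // loop_timeout = entry time + 25 ticks (which sets bit 1).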
+ + //if (waiters++ > 1 || loop_once) { + if (atomic_inc_32_nv(&waiters) > 1 || loop_once) { + atomic_inc_64(&spl_vba_loop_entries); + } + + static _Atomic uint32_t max_waiters_seen = 0; + + if (waiters > max_waiters_seen) { + max_waiters_seen = waiters; + dprintf("SPL: %s: max_waiters_seen increased to %u\n", __func__, max_waiters_seen); + } + + // local counters, to be added atomically to global kstat variables + uint64_t local_memory_blocked = 0, local_cv_timeout = 0, local_loop_timeout = 0; + uint64_t local_cv_timeout_blocked = 0, local_loop_timeout_blocked = 0; + uint64_t local_sleep = 0, local_hipriority_blocked = 0; + + const uint64_t loop_ticks = 25; // a tick is 10 msec, so 250 msec + const uint64_t hiprio_loop_ticks = 4; // 40 msec + int crutch = 0; + for (uint64_t entry_time = zfs_lbolt(), loop_timeout = entry_time + loop_ticks, + hiprio_timeout = entry_time + hiprio_loop_ticks, timedout = 0; + waiters > 1UL || loop_once; ) { + loop_once = FALSE; + // non-waiting allocations should proceeed to vmem_alloc() immediately + if (vmflags & (VM_NOSLEEP | VM_PANIC | VM_ABORT)) { + break; + } + if (crutch++ > 25) break; + if (vmem_canalloc_atomic(bvmp, size)) { + // We can probably vmem_alloc(bvmp, size, vmflags). + // At worst case it will give us a NULL and we will + // end up on the vmp's cv_wait. + // + // We can have threads with different bvmp + // taking this exit, and will proceed concurrently. + // + // However, we should protect against a burst of + // callers hitting the same bvmp before the allocation + // results are reflected in vmem_canalloc_atomic(bvmp, ...) + if (local_hipriority_allocator == FALSE && + hipriority_allocators > 0) { + // more high priority allocations are wanted, + // so this thread stays here + local_hipriority_blocked++; + } else if (vba_atomic_lock_bucket(&buckets_busy_allocating, bucket_bit)) { + // we are not being blocked by another allocator + // to the same bucket, or any higher priority allocator + atomic_inc_64(&spl_vba_parent_memory_appeared); + break; + // The vmem_alloc() should return extremely quickly from + // an INSTANTFIT allocation that canalloc predicts will succeed. + } else { + // another thread is trying to use the free memory in the + // bucket_## arena; there might still be free memory there after + // its allocation is completed, and there might be excess in the + // bucket_heap arena, so stick around in this loop. + local_memory_blocked++; + cv_broadcast(&bvmp->vm_cv); + } + } + if (timedout > 0) { + if (local_hipriority_allocator == FALSE && + hipriority_allocators > 0) { + local_hipriority_blocked++; + } else if (vba_atomic_lock_bucket(&buckets_busy_allocating, bucket_bit)) { + if (timedout & 1) + local_cv_timeout++; + if (timedout & 6 || zfs_lbolt() >= loop_timeout) + local_loop_timeout++; + break; + } else { + if (timedout & 1) { + local_cv_timeout_blocked++; + } + if (timedout & 6) { + local_loop_timeout_blocked++; + } else if (zfs_lbolt() > loop_timeout) { + timedout |= 2; + } + // flush the current thread in xat() out of + // xat()'s for() loop and into xat_bail() + cv_broadcast(&bvmp->vm_cv); + } + } + // The bucket is already allocating, or the bucket needs + // more memory to satisfy vmem_allocat(bvmp, size, VM_NOSLEEP), or + // we want to give the bucket some time to acquire more memory. 
+ // + // substitute for the vmp arena's cv_wait in vmem_xalloc() + // (vmp is the bucket_heap AKA spl_heap_arena) + mutex_enter(&calling_arena->vm_lock); + local_sleep++; + if (local_sleep >= 1000ULL) { + atomic_add_64(&spl_vba_sleep, local_sleep - 1ULL); + local_sleep = 1ULL; + atomic_add_64(&spl_vba_cv_timeout_blocked, local_cv_timeout_blocked); + local_cv_timeout_blocked = 0; + atomic_add_64(&spl_vba_loop_timeout_blocked, local_loop_timeout_blocked); + local_loop_timeout_blocked = 0; + atomic_add_64(&spl_vba_hiprio_blocked, local_hipriority_blocked); + local_hipriority_blocked = 0; + if (local_memory_blocked > 1ULL) { + atomic_add_64(&spl_vba_parent_memory_blocked, local_memory_blocked - 1ULL); + local_memory_blocked = 1ULL; + } + } + clock_t wait_time = MSEC2NSEC(30); + if (timedout > 0 || local_memory_blocked > 0) { + wait_time = MSEC2NSEC(1); + } + int ret = (int) cv_timedwait_hires(&calling_arena->vm_cv, &calling_arena->vm_lock, + wait_time, 0, 0); + // We almost certainly have exited because of a signal/broadcast, + // but maybe just timed out. Either way, recheck memory. + mutex_exit(&calling_arena->vm_lock); + if (ret == -1) { + // cv_timedwait_hires timer expired + timedout |= 1; + cv_broadcast(&bvmp->vm_cv); + } else if ((timedout & 2) == 0) { + // we were awakened; check to see if we have been + // in the for loop for a long time + uint64_t n = zfs_lbolt(); + if (n > loop_timeout) { + timedout |= 2; + extern uint64_t real_total_memory; + spl_free_set_emergency_pressure(real_total_memory / 64LL); + // flush the current thread in xat() out of + // xat()'s for() loop and into xat_bail() + cv_broadcast(&bvmp->vm_cv); + } else if (local_hipriority_allocator && n > hiprio_timeout && waiters > 1UL) { + timedout |= 4; + } + } + } + + /* + * Turn on the exclusion bit in buckets_busy_allocating, to + * prevent multiple threads from calling vmem_alloc() on the + * same bucket arena concurrently rather than serially. + * + * This principally reduces the liklihood of asking xnu for + * more memory when other memory is or becomes available. + * + * This exclusion only applies to VM_SLEEP allocations; + * others (VM_PANIC, VM_NOSLEEP, VM_ABORT) will go to + * vmem_alloc() concurrently with any other threads. + * + * Since we aren't doing a test-and-set operation like above, + * we can just use |= and &= below and get correct atomic + * results, instead of using: + * + * __c11_atomic_fetch_or(&buckets_busy_allocating, + * bucket_bit, __ATOMIC_SEQ_CST); + * with the &= down below being written as + * __c11_atomic_fetch_and(&buckets_busy_allocating, + * ~bucket_bit, __ATOMIC_SEQ_CST); + * + * and this makes a difference with no optimization either + * compiling the whole file or with __attribute((optnone)) + * in front of the function decl. In particular, the non- + * optimized version that uses the builtin __c11_atomic_fetch_{and,or} + * preserves the C program order in the machine language output, + * inersting cmpxchgws, while all optimized versions, and the + * non-optimized version using the plainly-written version, reorder + * the "orw regr, memory" and "andw register, memory" (these are atomic + * RMW operations in x86-64 when the memory is naturally aligned) so that + * the strong memory model x86-64 promise that later loads see the + * results of earlier stores. 
+ * + * clang+llvm simply are good at optimizing _Atomics and + * the optimized code differs only in line numbers and + * among all three approaches (as plainly written, using + * the __c11_atomic_fetch_{or,and} with sequential consistency, + * or when compiling with at least -O optimization so an + * atomic_or_16(&buckets_busy_allocating) built with GCC intrinsics + * is actually inlined rather than a function call). + * + */ + + // in case we left the loop by being the only waiter, stop the + // next thread arriving from leaving the for loop because + // vmem_canalloc(bvmp, that_thread's_size) is TRUE. + + //buckets_busy_allocating |= bucket_bit; + InterlockedOr16(&buckets_busy_allocating, bucket_bit); + // update counters + if (local_sleep > 0) + atomic_add_64(&spl_vba_sleep, local_sleep); + if (local_memory_blocked > 0) + atomic_add_64(&spl_vba_parent_memory_blocked, local_memory_blocked); + if (local_cv_timeout > 0) + atomic_add_64(&spl_vba_cv_timeout, local_cv_timeout); + if (local_cv_timeout_blocked > 0) + atomic_add_64(&spl_vba_cv_timeout_blocked, local_cv_timeout_blocked); + if (local_loop_timeout > 0) + atomic_add_64(&spl_vba_loop_timeout, local_loop_timeout); + if (local_loop_timeout_blocked > 0) + atomic_add_64(&spl_vba_loop_timeout_blocked, local_loop_timeout_blocked); + if (local_hipriority_blocked > 0) + atomic_add_64(&spl_vba_hiprio_blocked, local_hipriority_blocked); + + // There is memory in this bucket, or there are no other waiters, + // or we aren't a VM_SLEEP allocation, or we iterated out of the for loop. + // + // vmem_alloc() and vmem_xalloc() do their own mutex serializing + // on bvmp->vm_lock, so we don't have to here. + // + // vmem_alloc may take some time to return (especially for VM_SLEEP + // allocations where we did not take the vm_canalloc(bvmp...) break out + // of the for loop). Therefore, if we didn't enter the for loop at all + // because waiters was 0 when we entered this function, + // subsequent callers will enter the for loop. + + void *m = vmem_alloc(bvmp, size, vmflags); + + // allow another vmem_canalloc() through for this bucket + // by atomically turning off the appropriate bit + + /* + * Except clang+llvm DTRT because of _Atomic, could be written as: + *__c11_atomic_fetch_and(&buckets_busy_allocating, + *~bucket_bit, __ATOMIC_SEQ_CST); + * + * On processors with more relaxed memory models, it might be + * more efficient to do so with release semantics here, and + * in the atomic |= above, with acquire semantics in the bit tests, + * but on the other hand it may be hard to do better than clang+llvm. + */ + + //buckets_busy_allocating &= ~bucket_bit; + InterlockedAnd16(&buckets_busy_allocating, ~bucket_bit); + + if (local_hipriority_allocator) + atomic_dec_32(&hipriority_allocators); + + // if we got an allocation, wake up the arena cv waiters + // to let them try to exit the for(;;) loop above and + // exit the cv_wait() in vmem_xalloc(vmp, ...) 
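+    // Summary of the protocol in this function: count ourselves in
+    // "waiters" and spl_vba_threads[bucket], gate VM_SLEEP callers
+    // behind the per-bucket bit in buckets_busy_allocating, do the
+    // real work in vmem_alloc(bvmp, ...), then clear the bit, drop
+    // the counters, and broadcast so other sleepers re-check memory.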
+ + if (m != NULL) { + cv_broadcast(&calling_arena->vm_cv); + } + + atomic_dec_32(&waiters); + //spl_vba_threads[bucket_number]--; + atomic_dec_32(&spl_vba_threads[bucket_number]); + return (m); +} + +static void +vmem_bucket_free(vmem_t *null_vmp, void *vaddr, uint32_t size) +{ + vmem_t *calling_arena = spl_heap_arena; + + vmem_free(vmem_bucket_arena_by_size(size), vaddr, size); + + // wake up arena waiters to let them try an alloc + cv_broadcast(&calling_arena->vm_cv); +} + +static inline int64_t +vmem_bucket_arena_free(uint16_t bucket) +{ + VERIFY(bucket < VMEM_BUCKETS); + return((int64_t)vmem_size_semi_atomic(vmem_bucket_arena[bucket], VMEM_FREE)); +} + +static inline int64_t +vmem_bucket_arena_used(int bucket) +{ + VERIFY(bucket < VMEM_BUCKETS); + return((int64_t)vmem_size_semi_atomic(vmem_bucket_arena[bucket], VMEM_ALLOC)); +} + + +int64_t +vmem_buckets_size(int typemask) +{ + int64_t total_size = 0; + + for (int i = 0; i < VMEM_BUCKETS; i++) { + int64_t u = vmem_bucket_arena_used(i); + int64_t f = vmem_bucket_arena_free((uint16_t)i); + if (typemask & VMEM_ALLOC) + total_size += u; + if (typemask & VMEM_FREE) + total_size += f; + } + if (total_size < 0) + total_size = 0; + + return((uint32_t) total_size); +} + +static uint64_t +spl_validate_bucket_span_size(uint64_t val) +{ + if (!ISP2(val)) { + TraceEvent(TRACE_WARNING, "SPL: %s: WARNING %llu is not a power of two, not changing.\n", + __func__, val); + return (0); + } + if (val < 128ULL*1024ULL || val > 16ULL*1024ULL*1024ULL) { + TraceEvent(TRACE_WARNING, "SPL: %s: WARNING %llu is out of range [128k - 16M], not changing.\n", + __func__, val); + return (0); + } + return (val); +} + +static inline void +spl_modify_bucket_span_size(int bucket, uint64_t size) +{ + vmem_t *bvmp = vmem_bucket_arena[bucket]; + + mutex_enter(&bvmp->vm_lock); + bvmp->vm_min_import = (uint32_t) size; + mutex_exit(&bvmp->vm_lock); +} + +static inline void +spl_modify_bucket_array() +{ + for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) { + // i = 12, bucket = 0, contains allocs from 8192 to 16383 bytes, + // and should never ask xnu for < 16384 bytes, so as to avoid + // asking xnu for a non-power-of-two size. 
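+        // Worked example: for i == 16 the code below computes
+        // bucket_alloc_minimum_size = 64 KiB and
+        // bucket_parent_alloc_minimum_size = 128 KiB, so with the
+        // small-span tunable at its 256 KiB floor (see vmem_init())
+        // that bucket imports spans of MAX(256 KiB, 128 KiB) = 256 KiB.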
+ const int bucket = i - VMEM_BUCKET_LOWBIT; + const uint32_t bucket_alloc_minimum_size = 1UL << (uint32_t)i; + const uint32_t bucket_parent_alloc_minimum_size = bucket_alloc_minimum_size * 2UL; + + switch(i) { + // see vmem_init() below for details + case 16: + case 17: + spl_modify_bucket_span_size(bucket, + MAX(spl_bucket_tunable_small_span, bucket_parent_alloc_minimum_size)); + break; + default: + spl_modify_bucket_span_size(bucket, + MAX(spl_bucket_tunable_large_span, bucket_parent_alloc_minimum_size)); + break; + } + } +} + +static inline void +spl_dprintf_bucket_span_sizes(void) +{ + // this doesn't have to be super-exact + for (int i = VMEM_BUCKET_LOWBIT; i < VMEM_BUCKET_HIBIT; i++) { + int bnum = i - VMEM_BUCKET_LOWBIT; + vmem_t *bvmp = vmem_bucket_arena[bnum]; + } +} + +static inline void +spl_set_bucket_spans(uint64_t l, uint64_t s) +{ + if (spl_validate_bucket_span_size(l) && + spl_validate_bucket_span_size(s)) { + atomic_swap_64(&spl_bucket_tunable_large_span, l); + atomic_swap_64(&spl_bucket_tunable_small_span, s); + spl_modify_bucket_array(); + } +} + +void +spl_set_bucket_tunable_large_span(uint64_t size) +{ + uint64_t s = 0; + + mutex_enter(&vmem_xnu_alloc_lock); + atomic_swap_64(&s, spl_bucket_tunable_small_span); + spl_set_bucket_spans(size, s); + mutex_exit(&vmem_xnu_alloc_lock); + + spl_dprintf_bucket_span_sizes(); +} + +void +spl_set_bucket_tunable_small_span(uint64_t size) +{ + uint64_t l = 0; + + mutex_enter(&vmem_xnu_alloc_lock); + atomic_swap_64(&l, spl_bucket_tunable_large_span); + spl_set_bucket_spans(l, size); + mutex_exit(&vmem_xnu_alloc_lock); + + spl_dprintf_bucket_span_sizes(); +} + +static void * +spl_vmem_default_alloc(vmem_t *vmp, uint32_t size, int vmflags) +{ + extern void *osif_malloc(uint64_t); + return(osif_malloc(size)); +} + +static void +spl_vmem_default_free(vmem_t *vmp, void *vaddr, uint32_t size) +{ + extern void osif_free(void *, uint64_t); + osif_free(vaddr, size); +} + +vmem_t * +vmem_init(const char *heap_name, + void *heap_start, uint32_t heap_size, uint32_t heap_quantum, + void *(*heap_alloc)(vmem_t *, uint32_t, int), + void (*heap_free)(vmem_t *, void *, uint32_t)) +{ + uint32_t id; + int nseg = VMEM_SEG_INITIAL; + vmem_t *heap; + + // XNU mutexes need initialisation + mutex_init(&vmem_list_lock, "vmem_list_lock", MUTEX_DEFAULT, NULL); + mutex_init(&vmem_segfree_lock, "vmem_segfree_lock", MUTEX_DEFAULT, NULL); + mutex_init(&vmem_sleep_lock, "vmem_sleep_lock", MUTEX_DEFAULT, NULL); + mutex_init(&vmem_nosleep_lock, "vmem_nosleep_lock", MUTEX_DEFAULT, NULL); + mutex_init(&vmem_pushpage_lock, "vmem_pushpage_lock", MUTEX_DEFAULT, NULL); + mutex_init(&vmem_panic_lock, "vmem_panic_lock", MUTEX_DEFAULT, NULL); + + mutex_init(&vmem_xnu_alloc_lock, "vmem_xnu_alloc_lock", MUTEX_DEFAULT, NULL); + + while (--nseg >= 0) + vmem_putseg_global(&vmem_seg0[nseg]); + + /* + * On OSX we ultimately have to use the OS allocator + * as the ource and sink of memory as it is allocated + * and freed. + * + * The spl_root_arena_parent is needed in order to provide a + * base arena with an always-NULL afunc and ffunc in order to + * end the searches done by vmem_[x]alloc and vm_xfree; it + * serves no other purpose; its stats will always be zero. 
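+ * The arenas created below, in vm_id order, are:
+ *   0 spl_default_arena_parent (placeholder), 1 spl_default_arena,
+ *   2-14 the thirteen bucket_## arenas, 15 bucket_heap (spl_heap_arena),
+ *   16 the heap returned to the caller, 17 vmem_metadata,
+ *   18 vmem_seg, 19 vmem_hash, and 20 vmem_vmem.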
+ * + */ + spl_default_arena_parent = vmem_create("spl_default_arena_parent", // id 0 + NULL, 0, heap_quantum, NULL, NULL, NULL, 0, VM_SLEEP); + + // illumos/openzfs has a gigantic pile of memory that it can use for its first arena; + // o3x is not so lucky, so we start with this + + //static char initial_default_block[16ULL*1024ULL*1024ULL] __attribute__((aligned(4096))) = { 0 }; + __declspec(align(PAGE_SIZE)) static char initial_default_block[16ULL * 1024ULL * 1024ULL] = { 0 }; + + // The default arena is very low-bandwidth; it supplies the initial large + // allocation for the heap arena below, and it serves as the parent of the + // vmem_metadata arena. It will typically do only 2 or 3 parent_alloc calls + // (to spl_vmem_default_alloc) in total. + + spl_default_arena = vmem_create("spl_default_arena", // id 1 + initial_default_block, 16ULL*1024ULL*1024ULL, + heap_quantum, spl_vmem_default_alloc, spl_vmem_default_free, + spl_default_arena_parent, 16ULL*1024ULL*1024ULL, VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); + + VERIFY(spl_default_arena != NULL); + + // The bucket arenas satisfy allocations & frees from the bucket heap + // that are dispatched to the bucket whose power-of-two label is the + // smallest allocation that vmem_bucket_allocate will ask for. + // + // The bucket arenas in turn exchange memory with XNU's allocator/freer in + // large spans (~ 1 MiB is stable on all systems but creates bucket fragmentation) + // + // Segregating by size constrains internal fragmentation within the bucket and + // provides kstat.vmem visiblity and span-size policy to be applied to particular + // buckets (notably the sources of most allocations, see the comments below) + // + // For VMEM_BUCKET_HIBIT == 12, + // vmem_bucket_arena[n] holds allocations from 2^[n+11]+1 to 2^[n+12], + // so for [n] = 0, 2049-4096, for [n]=5 65537-131072, for [n]=7 (256k+1)-512k + // + // so "kstat.vmvm.vmem.bucket_1048576" should be read as the bucket arena containing + // allocations 1 MiB and smaller, but larger than 512 kiB. + + // create arenas for the VMEM_BUCKETS, id 2 - id 14 + + extern uint64_t real_total_memory; + VERIFY3U(real_total_memory,>=,1024ULL*1024ULL*1024ULL); + + // adjust minimum bucket span size for memory size + // see comments in the switch below + // large span: 1 MiB and bigger on large-memory (> 32 GiB) systems + // small span: 256 kiB and bigger on large-memory systems + const uint64_t k = 1024ULL; + const uint64_t qm = 256ULL * k; + const uint64_t m = 1024ULL* k; + const uint64_t big = MAX(real_total_memory / (k * 32ULL), m); + const uint64_t small = MAX(real_total_memory / (k * 128ULL), qm); + spl_bucket_tunable_large_span = MIN(big, 16ULL * m); + spl_bucket_tunable_small_span = small; + dprintf("SPL: %s: real_total_memory %llu, large spans %llu, small spans %llu\n", + __func__, real_total_memory, + spl_bucket_tunable_large_span, spl_bucket_tunable_small_span); + char* buf = vmem_alloc(spl_default_arena, VMEM_NAMELEN + 21, VM_SLEEP); + for (int32_t i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) { + size_t minimum_allocsize = 0; + const uint64_t bucket_largest_size = (1ULL << (uint64_t)i); + + (void) snprintf(buf, VMEM_NAMELEN + 20, "%s_%llu", + "bucket", bucket_largest_size); + + switch (i) { + case 15: + case 16: + /* + * With the arrival of abd, the 2^15 (== 32768) and 2^16 + * buckets are by far the most busy, holding respectively + * the qcache spans of kmem_va (the kmem_alloc et al. 
heap) + * and zfs_qcache (notably the source for the abd_chunk arena) + * + * The lifetime of early (i.e., after import and mount) + * allocations can be highly variable, leading + * to persisting fragmentation from the first eviction after + * arc has grown large. This can happen if, for example, + * there substantial import and mounting (and mds/mdworker and + * backupd scanning) activity before a user logs in and starts + * demanding memory in userland (e.g. by firing up a browser or + * mail app). + * + * Crucially, this makes it difficult to give back memory to xnu + * without holding the ARC size down for long periods of time. + * + * We can mitigate this by exchanging smaller + * amounts of memory with xnu for these buckets. + * There are two downsides: xnu's memory + * freelist will be prone to greater + * fragmentation, which will affect all + * allocation and free activity using xnu's + * allocator including kexts other than our; and + * we are likely to have more waits in the throttled + * alloc function, as more threads are likely to require + * slab importing into the kmem layer and fewer threads + * can be satisfied by a small allocation vs a large one. + * + * The import sizes are sysadmin-tunable by setting + * kstat.spl.misc.spl_misc.spl_tunable_small_span + * to a power-of-two number of bytes in zsysctl.conf + * should a sysadmin prefer non-early allocations to + * be larger or smaller depending on system performance + * and workload. + * + * However, a zfs booting system must use the defaults + * here for the earliest allocations, therefore they. + * should be only large enough to protect system performance + * if the sysadmin never changes the tunable span sizes. + */ + minimum_allocsize = MAX(spl_bucket_tunable_small_span, + bucket_largest_size * 4); + break; + default: + /* + * These buckets are all relatively low bandwidth and + * with relatively uniform lifespans for most allocations + * (borrowed arc buffers dominate). They should be large + * enough that they do not pester xnu. + */ + minimum_allocsize = MAX(spl_bucket_tunable_large_span, + bucket_largest_size * 4); + break; + } + const int bucket_number = i - VMEM_BUCKET_LOWBIT; + vmem_t *b = vmem_create(buf, NULL, 0, heap_quantum, + xnu_alloc_throttled, xnu_free_throttled, spl_default_arena_parent, + minimum_allocsize, VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE | VMC_TIMEFREE); + VERIFY(b != NULL); + b->vm_min_import = minimum_allocsize; + b->vm_source = b; + vmem_bucket_arena[bucket_number] = b; + vmem_bucket_id_to_bucket_number[b->vm_id] = bucket_number; + } + vmem_free(spl_default_arena, buf, VMEM_NAMELEN + 21); + // spl_heap_arena, the bucket heap, is the primary interface to the vmem system + + // all arenas not rooted to vmem_metadata will be rooted to spl_heap arena. + + spl_heap_arena = vmem_create("bucket_heap", // id 15 + NULL, 0, heap_quantum, + vmem_bucket_alloc, vmem_bucket_free, spl_default_arena_parent, 0, + VM_SLEEP | VMC_TIMEFREE | VMC_OLDFIRST); + + VERIFY(spl_heap_arena != NULL); + + // add a fixed-sized allocation to spl_heap_arena; this reduces the + // need to talk to the bucket arenas by a substantial margin + // (kstat.vmem.vmem.bucket_heap.{alloc+free} is much greater than + // kstat.vmem.vmem.bucket_heap.parent_{alloc+free}, and improves with + // increasing initial fixed allocation size. 
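+    // The reservation below scales with physical memory: 128 MiB by
+    // default, 256 MiB at 4 GiB of RAM or more, 512 MiB at 8 GiB or
+    // more, and 1 GiB at 16 GiB or more.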
+ + const uint32_t mib = 1024ULL * 1024ULL; + const uint32_t gib = 1024ULL * mib; + uint32_t resv_size = 128ULL * mib; + extern uint64_t real_total_memory; + + if (real_total_memory >= 4ULL * gib) + resv_size = 256ULL * mib; + if (real_total_memory >= 8ULL * gib) + resv_size = 512ULL * mib; + if (real_total_memory >= 16ULL * gib) + resv_size = gib; + + dprintf("SPL: %s adding fixed allocation of %llu to the bucket_heap\n", + __func__, (uint64_t)resv_size); + + spl_heap_arena_initial_alloc = vmem_add(spl_heap_arena, + vmem_alloc(spl_default_arena, resv_size, VM_SLEEP), + resv_size, VM_SLEEP); + + VERIFY(spl_heap_arena_initial_alloc != NULL); + + spl_heap_arena_initial_alloc_size = resv_size; + + // kstat.vmem.vmem.heap : kmem_cache_alloc() and similar calls + // to handle in-memory datastructures other than arc and zio buffers. + + heap = vmem_create(heap_name, // id 16 + NULL, 0, heap_quantum, + vmem_alloc, vmem_free, spl_heap_arena, 0, + VM_SLEEP); + + VERIFY(heap != NULL); + + // Root all the low bandwidth metadata arenas to the default arena. + // The vmem_metadata allocations will all be 32 kiB or larger, + // and the total allocation will generally cap off around 24 MiB. + + vmem_metadata_arena = vmem_create("vmem_metadata", // id 17 + NULL, 0, heap_quantum, vmem_alloc, vmem_free, spl_default_arena, + 8 * PAGESIZE, VM_SLEEP | VMC_POPULATOR | VMC_NO_QCACHE); + + VERIFY(vmem_metadata_arena != NULL); + + vmem_seg_arena = vmem_create("vmem_seg", // id 18 + NULL, 0, heap_quantum, + vmem_alloc, vmem_free, vmem_metadata_arena, 0, + VM_SLEEP | VMC_POPULATOR); + + VERIFY(vmem_seg_arena != NULL); + + vmem_hash_arena = vmem_create("vmem_hash", // id 19 + NULL, 0, 8, + vmem_alloc, vmem_free, vmem_metadata_arena, 0, + VM_SLEEP); + + VERIFY(vmem_hash_arena != NULL); + + vmem_vmem_arena = vmem_create("vmem_vmem", // id 20 + vmem0, sizeof (vmem0), 1, + vmem_alloc, vmem_free, vmem_metadata_arena, 0, + VM_SLEEP); + + VERIFY(vmem_vmem_arena != NULL); + + // 21 (0-based) vmem_create before this line. - macroized NUMBER_OF_ARENAS_IN_VMEM_INIT + for (id = 0; id < vmem_id; id++) { + (void) vmem_xalloc(vmem_vmem_arena, sizeof (vmem_t), + 1, 0, 0, &vmem0[id], &vmem0[id + 1], + VM_NOSLEEP | VM_BESTFIT | VM_PANIC); + } + + dprintf("SPL: starting vmem_update() thread\n"); + vmem_update(NULL); + + return (heap); +} + +struct free_slab { + vmem_t *vmp; + uint32_t slabsize; + void *slab; + list_node_t next; +}; +static list_t freelist; + +static void vmem_fini_freelist(void *vmp, void *start, uint32_t size) +{ + struct free_slab *fs; + + MALLOC(fs, struct free_slab *, sizeof(struct free_slab), M_TEMP, M_WAITOK); + fs->vmp = vmp; + fs->slabsize = size; + fs->slab = start; + list_link_init(&fs->next); + list_insert_tail(&freelist, fs); +} + + +void vmem_free_span_list() +{ + int total = 0; + int total_count = 0; + struct free_slab* fs; + int release = 1; + + while ((fs = list_head(&freelist))) { + total_count++; + total += fs->slabsize; + list_remove(&freelist, fs); + for (int id = 0; id < VMEM_INITIAL; id++) { + if (&vmem0[id] == fs->slab) { + release = 0; + break; + } + } + if (release) + fs->vmp->vm_source_free(fs->vmp, fs->slab, fs->slabsize); + release = 1; + FREE(fs, M_TEMP); + } +} + +static void vmem_fini_void(void *vmp, void *start, uint32_t size) +{ + return; +} + +void +vmem_fini(vmem_t *heap) +{ + struct free_slab *fs; + uint64_t total; + +// bsd_untimeout(vmem_update, NULL); + + dprintf("SPL: %s: stopped vmem_update. 
Creating list and walking arenas.\n", __func__); + + /* Create a list of slabs to free by walking the list of allocs */ + list_create(&freelist, sizeof (struct free_slab), + offsetof(struct free_slab, next)); + + /* Walk to list of allocations */ + + // walking with VMEM_REENTRANT causes segment consolidation and freeing of spans + // the freelist contains a list of segments that are still allocated + // at the time of the walk; unfortunately the lists cannot be exact without + // complex multiple passes, locking, and a more complex vmem_fini_freelist(). + // + // Walking withoutu VMEM_REENTRANT can produce a nearly-exact list of unfreed + // spans, which Illumos would then free directly after the list is complete. + // + // Unfortunately in O3X, that lack of exactness can lead to a panic + // caused by attempting to free to xnu memory that we already freed to xnu. + // Fortunately, we can get a sense of what would have been destroyed + // after the (non-reentrant) walking, and we dprintf that at the end of this function. + + // Walk all still-alive arenas from leaves to the root + + vmem_walk(heap, VMEM_ALLOC | VMEM_REENTRANT, vmem_fini_void, heap); + + vmem_walk(heap, VMEM_ALLOC, vmem_fini_freelist, heap); + vmem_free_span_list(); + dprintf("\nSPL: %s destroying heap\n", __func__); + vmem_destroy(heap); // PARENT: spl_heap_arena + + dprintf("SPL: %s: walking spl_heap_arena, aka bucket_heap (pass 1)\n", __func__); + + vmem_walk(spl_heap_arena, VMEM_ALLOC | VMEM_REENTRANT, vmem_fini_void, spl_heap_arena); + + dprintf("SPL: %s: calling vmem_xfree(spl_default_arena, ptr, %llu);\n", + __func__, (uint64_t)spl_heap_arena_initial_alloc_size); + + // forcibly remove the initial alloc from spl_heap_arena arena, whether + // or not it is empty. below this point, any activity on spl_default_arena + // other than a non-reentrant(!) walk and a destroy is unsafe (UAF or MAF). + + // However, all the children of spl_heap_arena should now be destroyed. + + vmem_xfree(spl_default_arena, spl_heap_arena_initial_alloc, + spl_heap_arena_initial_alloc_size); + + dprintf("SPL: %s: walking spl_heap_arena, aka bucket_heap (pass 2)\n", __func__); + + vmem_walk(spl_heap_arena, VMEM_ALLOC, vmem_fini_freelist, spl_heap_arena); + vmem_free_span_list(); + + dprintf("SPL: %s: walking bucket arenas...\n", __func__); + + for (int i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) { + const int bucket = i - VMEM_BUCKET_LOWBIT; + vmem_walk(vmem_bucket_arena[bucket], VMEM_ALLOC | VMEM_REENTRANT, + vmem_fini_void, vmem_bucket_arena[bucket]); + + vmem_walk(vmem_bucket_arena[bucket], VMEM_ALLOC, + vmem_fini_freelist, vmem_bucket_arena[bucket]); + } + vmem_free_span_list(); + + dprintf("SPL: %s destroying spl_bucket_arenas...", __func__); + for (int32_t i = VMEM_BUCKET_LOWBIT; i <= VMEM_BUCKET_HIBIT; i++) { + vmem_t* vmpt = vmem_bucket_arena[i - VMEM_BUCKET_LOWBIT]; + dprintf(" %llu", (1ULL << i)); + vmem_destroy(vmpt); // parent: spl_default_arena_parent + } + dprintf("\n"); + + dprintf("SPL: %s: walking vmem metadata-related arenas...\n", __func__); + + vmem_walk(vmem_vmem_arena, VMEM_ALLOC | VMEM_REENTRANT, + vmem_fini_void, vmem_vmem_arena); + + vmem_walk(vmem_vmem_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_vmem_arena); + + vmem_free_span_list(); + + // We should not do VMEM_REENTRANT on vmem_seg_arena or vmem_hash_arena or below + // to avoid causing work in vmem_seg_arena and vmem_hash_arena. 
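+    // The same two-step pattern repeats for the remaining arenas:
+    // vmem_walk(..., VMEM_ALLOC, vmem_fini_freelist, ...) collects the
+    // still-allocated spans onto the local freelist, and
+    // vmem_free_span_list() returns each span to its arena's
+    // vm_source_free(), skipping the statically allocated vmem0[] slots.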
+ + vmem_walk(vmem_seg_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_seg_arena); + + vmem_free_span_list(); + + vmem_walk(vmem_hash_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_hash_arena); + vmem_free_span_list(); + + vmem_walk(vmem_metadata_arena, VMEM_ALLOC, + vmem_fini_freelist, vmem_metadata_arena); + + vmem_free_span_list(); + dprintf("SPL: %s walking the root arena (spl_default_arena)...\n", __func__); + + vmem_walk(spl_default_arena, VMEM_ALLOC, + vmem_fini_freelist, spl_default_arena); + + vmem_free_span_list(); + + dprintf("SPL: %s destroying bucket heap\n", __func__); + vmem_destroy(spl_heap_arena); // PARENT: spl_default_arena_parent (but depends on buckets) + + + // destroying the vmem_vmem arena and any arena afterwards + // requires the use of vmem_destroy_internal(), which does + // not talk to vmem_vmem_arena like vmem_destroy() does. + //dprintf("SPL: %s destroying vmem_vmem_arena\n", __func__); + //vmem_destroy_internal(vmem_vmem_arena); // parent: vmem_metadata_arena + + // destroying the seg arena means we must no longer + // talk to vmem_populate() + dprintf("SPL: %s destroying vmem_seg_arena\n", __func__); + vmem_destroy(vmem_seg_arena); + + // vmem_hash_arena may be freed-to in vmem_destroy_internal() + // so it should be just before the vmem_metadata_arena. + dprintf("SPL: %s destroying vmem_hash_arena\n", __func__); + vmem_destroy(vmem_hash_arena); // parent: vmem_metadata_arena + vmem_hash_arena = NULL; + + // XXX: if we panic on unload below here due to destroyed mutex, vmem_init() + // will need some reworking (e.g. have vmem_metadata_arena talk directly + // to xnu), or alternatively a vmem_destroy_internal_internal() + // function that does not touch vmem_hash_arena will need writing. + + dprintf("SPL: %s destroying vmem_metadata_arena\n", __func__); + vmem_destroy(vmem_metadata_arena); // parent: spl_default_arena + + dprintf("\nSPL: %s destroying spl_default_arena\n", __func__); + vmem_destroy(spl_default_arena); // parent: spl_default_arena_parent + dprintf("\nSPL: %s destroying spl_default_arena_parant\n", __func__); + vmem_destroy(spl_default_arena_parent); + + dprintf("SPL: %s destroying vmem_vmem_arena\n", __func__); + vmem_destroy_internal(vmem_vmem_arena); + + dprintf("SPL: arenas removed, now try destroying mutexes... "); + + dprintf("vmem_xnu_alloc_lock "); + mutex_destroy(&vmem_xnu_alloc_lock); + dprintf("vmem_panic_lock "); + mutex_destroy(&vmem_panic_lock); + dprintf("vmem_pushpage_lock "); + mutex_destroy(&vmem_pushpage_lock); + dprintf("vmem_nosleep_lock "); + mutex_destroy(&vmem_nosleep_lock); + dprintf("vmem_sleep_lock "); + mutex_destroy(&vmem_sleep_lock); + dprintf("vmem_segfree_lock "); + mutex_destroy(&vmem_segfree_lock); + dprintf("vmem_list_lock "); + mutex_destroy(&vmem_list_lock); + + dprintf("\nSPL: %s: walking list of live slabs at time of call to %s\n", + __func__, __func__); + + // annoyingly, some of these should be returned to xnu, but + // we have no idea which have already been freed to xnu, and + // freeing a second time results in a panic. 
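+    // Accordingly, the loop below only tallies the surviving spans and
+    // frees the bookkeeping nodes; the segkmem_free() call remains
+    // commented out, the spans are left with xnu, and the total is
+    // reported in the "WOULD HAVE released" message.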
+ + /* Now release the list of allocs to built above */ + total = 0; + uint64_t total_count = 0; + while((fs = list_head(&freelist))) { + total_count++; + total+=fs->slabsize; + list_remove(&freelist, fs); + //extern void segkmem_free(vmem_t *, void *, uint32_t); + //segkmem_free(fs->vmp, fs->slab, fs->slabsize); + FREE(fs, M_TEMP); + } + dprintf("SPL: WOULD HAVE released %llu bytes (%llu spans) from arenas\n", + total, total_count); + list_destroy(&freelist); + dprintf("SPL: %s: Brief delay for readability...\n", __func__); + delay(hz); + dprintf("SPL: %s: done!\n", __func__); +} + +/* + * return TRUE if inuse is much smaller than imported + */ +static inline boolean_t +bucket_fragmented(const uint16_t bn, const uint64_t now) +{ + + // early during uptime, just let buckets grow. + + if (now < 600 * hz) + return (FALSE); + + // if there has been no pressure in the past five minutes, + // then we will just let the bucket grow. + + const uint64_t timeout = 5ULL * 60ULL * hz; + + if (spl_free_last_pressure_wrapper() + timeout < now) + return (FALSE); + + const vmem_t *vmp = vmem_bucket_arena[bn]; + + const int64_t imported = (int64_t)vmp->vm_kstat.vk_mem_import.value.ui64; + const int64_t inuse = (int64_t)vmp->vm_kstat.vk_mem_inuse.value.ui64; + const int64_t tiny = 64LL*1024LL*1024LL; + const int64_t small = tiny * 2LL; // 128 M + const int64_t medium = small * 2LL; // 256 + const int64_t large = medium * 2LL; // 512 + const int64_t huge = large * 2LL; // 1 G + const int64_t super_huge = huge * 2LL; // 2 + + const int64_t amount_free = imported - inuse; + + if (amount_free <= tiny || imported <= small) + return (FALSE); + + const int64_t percent_free = (amount_free * 100LL) / imported; + + if (percent_free > 75LL) { + return (TRUE); + } else if (imported <= medium) { + return (percent_free >= 50); + } else if (imported <= large) { + return (percent_free >= 33); + } else if (imported <= huge) { + return (percent_free >= 25); + } else if (imported <= super_huge) { + return (percent_free >= 15); + } else { + return (percent_free >= 10); + } +} + +/* + * return TRUE if the bucket for size is fragmented + * */ +static inline boolean_t +spl_arc_no_grow_impl(const uint16_t b, const uint32_t size, const boolean_t buf_is_metadata, + kmem_cache_t **kc) +{ + + static _Atomic uint8_t frag_suppression_counter[VMEM_BUCKETS] = { 0 }; + + const uint64_t now = zfs_lbolt(); + + const boolean_t fragmented = bucket_fragmented(b, now); + + if (fragmented) { + if (size < 32768) { + // Don't suppress small qcached blocks when the + // qcache size (bucket_262144) is fragmented, + // since they will push everything else towards + // the tails of ARC lists without eating up a large + // amount of space themselves. 
+ return (FALSE); + } + const uint32_t b_bit = (uint32_t)1 << (uint32_t)b; + //spl_arc_no_grow_bits |= b_bit; + InterlockedOr64(&spl_arc_no_grow_bits, b_bit); + + const uint32_t sup_at_least_every = MIN(b_bit, 255); + const uint32_t sup_at_most_every = MAX(b_bit, 16); + const uint32_t sup_every = MIN(sup_at_least_every,sup_at_most_every); + if (frag_suppression_counter[b] >= sup_every) { + frag_suppression_counter[b] = 0; + return (TRUE); + } else { + frag_suppression_counter[b]++; + return (FALSE); + } + } else { + const uint32_t b_bit = (uint32_t)1 << (uint32_t)b; + //spl_arc_no_grow_bits &= ~b_bit; + InterlockedAnd64(&spl_arc_no_grow_bits, ~b_bit); + } + + extern boolean_t spl_zio_is_suppressed(const uint32_t, const uint64_t, const boolean_t, + kmem_cache_t **); + + return (spl_zio_is_suppressed(size, now, buf_is_metadata, kc)); +} + +static inline uint16_t +vmem_bucket_number_arc_no_grow(const uint32_t size) +{ + // qcaching on arc + if (size < 128*1024) + return(vmem_bucket_number(262144)); + else + return(vmem_bucket_number(size)); +} + +boolean_t +spl_arc_no_grow(uint32_t size, boolean_t buf_is_metadata, kmem_cache_t **zp) +{ + const uint16_t b = vmem_bucket_number_arc_no_grow(size); + + const boolean_t rv = spl_arc_no_grow_impl(b, size, buf_is_metadata, zp); + + if (rv) { + atomic_inc_64(&spl_arc_no_grow_count); + } + + return((boolean_t)rv); +} diff --git a/module/os/windows/spl/spl-vnode.c b/module/os/windows/spl/spl-vnode.c new file mode 100644 index 000000000000..ea33269002a0 --- /dev/null +++ b/module/os/windows/spl/spl-vnode.c @@ -0,0 +1,1818 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#include +#include +//#include +#include +#include +//#include + +#include + +#include +#include + +#ifdef DEBUG_IOCOUNT +#include +#endif + +#include + +//#define FIND_MAF + + +/* Counter for unique vnode ID */ +static uint64_t vnode_vid_counter = 6; /* ZFSCTL_INO_SHARES + 1; */ + +/* Total number of active vnodes */ +static uint64_t vnode_active = 0; + +/* The kmem cache for vnodes */ +static kmem_cache_t *vnode_cache = NULL; + +/* List of all vnodes */ +static kmutex_t vnode_all_list_lock; +static list_t vnode_all_list; + +/* list of all getf/releasef active */ +static kmutex_t spl_getf_lock; +static list_t spl_getf_list; + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +/* + * In a real VFS the filesystem would register the callbacks for + * VNOP_ACTIVE and VNOP_RECLAIM - but here we just call them direct + */ +//extern int zfs_zinactive(struct vnode *), void *, void*); +extern int zfs_vnop_reclaim(struct vnode *); + +int vnode_recycle_int(vnode_t *vp, int flags); + + +int +vn_open(char *pnamep, enum uio_seg seg, int filemode, int createmode, + struct vnode **vpp, enum create crwhy, mode_t umask) +{ + vfs_context_t *vctx; + int fmode; + int error=0; + + fmode = filemode; + if (crwhy) + fmode |= O_CREAT; + // TODO I think this should be 'fmode' instead of 'filemode' + // vctx = vfs_context_create((vfs_context_t)0); + //error = vnode_open(pnamep, filemode, createmode, 0, vpp, vctx); + //(void) vfs_context_rele(vctx); + //printf("vn_open '%s' -> %d (vp %p)\n", pnamep, error, *vpp); + return (error); +} + +int +vn_openat(char *pnamep, enum uio_seg seg, int filemode, int createmode, + struct vnode **vpp, enum create crwhy, + mode_t umask, struct vnode *startvp) +{ + char *path; + int pathlen = MAXPATHLEN; + int error=0; + + path = (char *)kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + //error = vn_getpath(startvp, path, &pathlen); + if (error == 0) { + // strlcat(path, pnamep, MAXPATHLEN); + // error = vn_open(path, seg, filemode, createmode, vpp, crwhy, + // umask); + } + + kmem_free(path, MAXPATHLEN); + return (error); +} + +extern errno_t vnode_rename(const char *, const char *, int, vfs_context_t *); + +errno_t +vnode_rename(const char *from, const char *to, int flags, vfs_context_t *vctx) +{ + /* + * We need proper KPI changes to be able to safely update + * the zpool.cache file. For now, we return EPERM. 
+ */ + return (EPERM); +} + +int +vn_rename(char *from, char *to, enum uio_seg seg) +{ + vfs_context_t *vctx; + int error=0; + + //vctx = vfs_context_create((vfs_context_t)0); + + //error = vnode_rename(from, to, 0, vctx); + + //(void) vfs_context_rele(vctx); + + return (error); +} + +extern errno_t vnode_remove(const char *, int, enum vtype, vfs_context_t *); + +errno_t +vnode_remove(const char *name, int flag, enum vtype type, vfs_context_t *vctx) +{ + /* + * Now that zed ZFS Event Daemon can handle the rename of zpool.cache + * we will silence this limitation, and look in zed.d/config.sync.sh + */ + /* + IOLog("vnode_remove: \"%s\"\n", name); + IOLog("zfs: vnode_remove not yet supported\n"); + */ + return (EPERM); +} + + +int +vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) +{ + vfs_context_t *vctx; + enum vtype type; + int error=0; + + //type = dirflag == RMDIRECTORY ? VDIR : VREG; + + //vctx = vfs_context_create((vfs_context_t)0); + + //error = vnode_remove(fnamep, 0, type, vctx); + + //(void) vfs_context_rele(vctx); + + return (error); +} + +int zfs_vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, ssize_t len, + offset_t offset, enum uio_seg seg, int ioflag, rlim64_t ulimit, + cred_t *cr, ssize_t *residp) +{ + uio_t *auio; + int spacetype; + int error=0; + vfs_context_t *vctx; + + //spacetype = UIO_SEG_IS_USER_SPACE(seg) ? UIO_USERSPACE32 : UIO_SYSSPACE; + + //vctx = vfs_context_create((vfs_context_t)0); + //auio = uio_create(1, 0, spacetype, rw); + //uio_reset(auio, offset, spacetype, rw); + //uio_addiov(auio, (uint64_t)(uintptr_t)base, len); + + if (rw == UIO_READ) { + // error = VNOP_READ(vp, auio, ioflag, vctx); + } else { + // error = VNOP_WRITE(vp, auio, ioflag, vctx); + } + + if (residp) { + // *residp = uio_resid(auio); + } else { + // if (uio_resid(auio) && error == 0) + error = EIO; + } + +// uio_free(auio); + // vfs_context_rele(vctx); + + return (error); +} + +int kernel_ioctl(PDEVICE_OBJECT DeviceObject, long cmd, void *inbuf, uint32_t inlen, + void *outbuf, uint32_t outlen) +{ + NTSTATUS status; + PFILE_OBJECT FileObject; + + dprintf("%s: trying to send kernel ioctl %x\n", __func__, cmd); + + IO_STATUS_BLOCK IoStatusBlock; + KEVENT Event; + PIRP Irp; + NTSTATUS Status; + ULONG Remainder; + PAGED_CODE(); + + /* Build the information IRP */ + KeInitializeEvent(&Event, SynchronizationEvent, FALSE); + Irp = IoBuildDeviceIoControlRequest(cmd, + DeviceObject, + inbuf, + inlen, + outbuf, + outlen, + FALSE, + &Event, + &IoStatusBlock); + if (!Irp) return STATUS_NO_MEMORY; + + /* Override verification */ + IoGetNextIrpStackLocation(Irp)->Flags |= SL_OVERRIDE_VERIFY_VOLUME; + + /* Do the request */ + Status = IoCallDriver(DeviceObject, Irp); + if (Status == STATUS_PENDING) { + /* Wait for completion */ + KeWaitForSingleObject(&Event, + Executive, + KernelMode, + FALSE, + NULL); + Status = IoStatusBlock.Status; + } + + return Status; +} + +/* Linux TRIM API */ +int blk_queue_discard(PDEVICE_OBJECT dev) +{ + STORAGE_PROPERTY_QUERY spqTrim; + spqTrim.PropertyId = (STORAGE_PROPERTY_ID)StorageDeviceTrimProperty; + spqTrim.QueryType = PropertyStandardQuery; + + DWORD bytesReturned = 0; + DEVICE_TRIM_DESCRIPTOR dtd = { 0 }; + + if (kernel_ioctl(dev, IOCTL_STORAGE_QUERY_PROPERTY, + &spqTrim, sizeof(spqTrim), &dtd, sizeof(dtd)) == 0) { + return dtd.TrimEnabled; + } + return 0; // No trim +} + +int blk_queue_discard_secure(PDEVICE_OBJECT dev) +{ + return 0; // No secure trim +} + +int blk_queue_nonrot(PDEVICE_OBJECT dev) +{ + STORAGE_PROPERTY_QUERY spqSeekP; + 
spqSeekP.PropertyId = (STORAGE_PROPERTY_ID)StorageDeviceSeekPenaltyProperty; + spqSeekP.QueryType = PropertyStandardQuery; + DWORD bytesReturned = 0; + DEVICE_SEEK_PENALTY_DESCRIPTOR dspd = { 0 }; + if (kernel_ioctl(dev, IOCTL_STORAGE_QUERY_PROPERTY, + &spqSeekP, sizeof(spqSeekP), &dspd, sizeof(dspd)) == 0) { + return !dspd.IncursSeekPenalty; + } + return 0; // Not SSD; +} + +int blkdev_issue_discard_bytes(PDEVICE_OBJECT dev, uint64_t offset, uint64_t size, uint32_t flags) +{ + int Status = 0; + struct setAttrAndRange { + DEVICE_MANAGE_DATA_SET_ATTRIBUTES dmdsa; + DEVICE_DATA_SET_RANGE range; + } set; + + set.dmdsa.Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES); + set.dmdsa.Action = DeviceDsmAction_Trim; + set.dmdsa.Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED; + set.dmdsa.ParameterBlockOffset = 0; + set.dmdsa.ParameterBlockLength = 0; + set.dmdsa.DataSetRangesOffset = FIELD_OFFSET(struct setAttrAndRange, range); + set.dmdsa.DataSetRangesLength = 1 * sizeof(DEVICE_DATA_SET_RANGE); + + set.range.LengthInBytes = size; + set.range.StartingOffset = offset; + + Status = kernel_ioctl(dev, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, + &set, sizeof(set), NULL, 0); + + if (Status == 0) { + return 0; // TRIM OK + } + + // Linux returncodes are negative + return -Status; +} + + +int +VOP_SPACE(HANDLE h, int cmd, struct flock *fl, int flags, offset_t off, + cred_t *cr, void *ctx) +{ + if (cmd == F_FREESP) { + NTSTATUS Status; + DWORD ret; + FILE_ZERO_DATA_INFORMATION fzdi; + fzdi.FileOffset.QuadPart = fl->l_start; + fzdi.BeyondFinalZero.QuadPart = fl->l_start + fl->l_len; + + Status = ZwFsControlFile( + h, + NULL, + NULL, + NULL, + NULL, + FSCTL_SET_ZERO_DATA, + &fzdi, sizeof(fzdi), + NULL, + 0 + ); + + return (Status); + } + + return (STATUS_NOT_SUPPORTED); +} + +int +VOP_CLOSE(struct vnode *vp, int flag, int count, offset_t off, void *cr, void *k) +{ + // vfs_context_t vctx; + int error=0; + + //vctx = vfs_context_create((vfs_context_t)0); + //error = vnode_close(vp, flag & FWRITE, vctx); + //(void) vfs_context_rele(vctx); + return (error); +} + +int +VOP_FSYNC(struct vnode *vp, int flags, void* unused, void *uused2) +{ +// vfs_context_t vctx; + int error=0; + + //vctx = vfs_context_create((vfs_context_t)0); + //error = VNOP_FSYNC(vp, (flags == FSYNC), vctx); + //(void) vfs_context_rele(vctx); + return (error); +} + +int VOP_GETATTR(struct vnode *vp, vattr_t *vap, int flags, void *x3, void *x4) +{ +// vfs_context_t vctx; + int error=0; + + //vap->va_size = 134217728; + //return 0; + + // panic("take this"); + //printf("VOP_GETATTR(%p, %p, %d)\n", vp, vap, flags); + //vctx = vfs_context_create((vfs_context_t)0); + //error= vnode_getattr(vp, vap, vctx); + //(void) vfs_context_rele(vctx); + return error; +} + +#if 1 +errno_t VNOP_LOOKUP(struct vnode *, struct vnode **, struct componentname *, vfs_context_t *); + +errno_t VOP_LOOKUP(struct vnode *vp, struct vnode **vpp, struct componentname *cn, vfs_context_t *ct) +{ + //return VNOP_LOOKUP(vp,vpp,cn,ct); + return ENOTSUP; +} +#endif +#if 0 +extern errno_t VNOP_MKDIR (struct vnode *, struct vnode **, + struct componentname *, struct vnode_attr *, + vfs_context_t); +errno_t VOP_MKDIR(struct vnode *vp, struct vnode **vpp, + struct componentname *cn, struct vnode_attr *vattr, + vfs_context_t ct) +{ + return VNOP_MKDIR(vp, vpp, cn, vattr, ct); +} + +extern errno_t VNOP_REMOVE (struct vnode *, struct vnode *, + struct componentname *, int, vfs_context_t); +errno_t VOP_REMOVE (struct vnode *vp, struct vnode *dp, + struct componentname *cn, int flags, + 
vfs_context_t ct) +{ + return VNOP_REMOVE(vp, dp, cn, flags, ct); +} + + +extern errno_t VNOP_SYMLINK (struct vnode *, struct vnode **, + struct componentname *, struct vnode_attr *, + char *, vfs_context_t); +errno_t VOP_SYMLINK (struct vnode *vp, struct vnode **vpp, + struct componentname *cn, struct vnode_attr *attr, + char *name, vfs_context_t ct) +{ + return VNOP_SYMLINK(vp, vpp, cn, attr, name, ct); +} +#endif + + +#undef VFS_ROOT + +extern int VFS_ROOT(mount_t *, struct vnode **, vfs_context_t); +int spl_vfs_root(mount_t *mount, struct vnode **vp) +{ + // return VFS_ROOT(mount, vp, vfs_context_current() ); + *vp = NULL; + return -1; +} + + + +void vfs_mountedfrom(struct mount *vfsp, char *osname) +{ +// (void) copystr(osname, vfs_statfs(vfsp)->f_mntfromname, MNAMELEN - 1, 0); +} + + +/* + * DNLC Name Cache Support + */ +struct vnode * +dnlc_lookup(struct vnode *dvp, char *name) +{ + struct componentname cn; + struct vnode *vp = NULL; + + //return DNLC_NO_VNODE; + bzero(&cn, sizeof (cn)); + //cn.cn_nameiop = LOOKUP; + //cn.cn_flags = ISLASTCN; + //cn.cn_nameptr = (char *)name; + //cn.cn_namelen = strlen(name); + + switch(0/*cache_lookup(dvp, &vp, &cn)*/) { + case -1: + break; + case ENOENT: + vp = DNLC_NO_VNODE; + break; + default: + vp = NULL; + } + return (vp); +} + +int dnlc_purge_vfsp(struct mount *mp, int flags) +{ + // cache_purgevfs(mp); + return 0; +} + +void dnlc_remove(struct vnode *vp, char *name) +{ + // cache_purge(vp); + return; +} + + +/* + * + * + */ +void dnlc_update(struct vnode *vp, char *name, struct vnode *tp) +{ + +#if 0 + // If tp is NULL, it is a negative-cache entry + struct componentname cn; + + // OSX panics if you give empty(non-NULL) name + if (!name || !*name || !strlen(name)) return; + + bzero(&cn, sizeof(cn)); + cn.cn_nameiop = CREATE; + cn.cn_flags = ISLASTCN; + cn.cn_nameptr = (char *)name; + cn.cn_namelen = strlen(name); + + cache_enter(vp, tp == DNLC_NO_VNODE ? NULL : tp, &cn); +#endif + return; +} + +static int vnode_fileobject_compare(const void *arg1, const void *arg2) +{ + const vnode_fileobjects_t *node1 = arg1; + const vnode_fileobjects_t *node2 = arg2; + if (node1->fileobject > node2->fileobject) + return 1; + if (node1->fileobject < node2->fileobject) + return -1; + return 0; +} + +static int +zfs_vnode_cache_constructor(void *buf, void *arg, int kmflags) +{ + vnode_t *vp = buf; + + // So the Windows structs have to be zerod, even though we call + // their setup functions. 
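+    // This constructor is run by the zfs_vnode_cache kmem cache
+    // (created in spl_vnode_init()) to put a vnode_t into its initial
+    // state; zfs_vnode_cache_destructor() below releases the same
+    // resources, locks and AVL tree.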
+ memset(vp, 0, sizeof(*vp)); + + mutex_init(&vp->v_mutex, NULL, MUTEX_DEFAULT, NULL); + avl_create(&vp->v_fileobjects, vnode_fileobject_compare, + sizeof(vnode_fileobjects_t), offsetof(vnode_fileobjects_t, avlnode)); + + ExInitializeResourceLite(&vp->resource); + ExInitializeResourceLite(&vp->pageio_resource); + ExInitializeFastMutex(&vp->AdvancedFcbHeaderMutex); + + return 0; +} + +static void +zfs_vnode_cache_destructor(void *buf, void *arg) +{ + vnode_t *vp = buf; + + //ExDeleteFastMutex(&vp->AdvancedFcbHeaderMutex); + ExDeleteResourceLite(&vp->pageio_resource); + ExDeleteResourceLite(&vp->resource); + + avl_destroy(&vp->v_fileobjects); + mutex_destroy(&vp->v_mutex); + +} + +int spl_vnode_init(void) +{ + mutex_init(&spl_getf_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&spl_getf_list, sizeof(struct spl_fileproc), + offsetof(struct spl_fileproc, f_next)); + mutex_init(&vnode_all_list_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&vnode_all_list, sizeof(struct vnode), + offsetof(struct vnode, v_list)); + + vnode_cache = kmem_cache_create("zfs_vnode_cache", + sizeof(vnode_t), 0, + zfs_vnode_cache_constructor, + zfs_vnode_cache_destructor, NULL, NULL, + NULL, 0); + + return 0; +} + +void spl_vnode_fini(void) +{ + mutex_destroy(&vnode_all_list_lock); + list_destroy(&vnode_all_list); + mutex_destroy(&spl_getf_lock); + list_destroy(&spl_getf_list); + + if (vnode_cache) + kmem_cache_destroy(vnode_cache); + vnode_cache = NULL; +} + +#include +struct fileproc; + +extern int fp_drop(struct proc *p, int fd, struct fileproc *fp, int locked); +extern int fp_drop_written(struct proc *p, int fd, struct fileproc *fp, + int locked); +extern int fp_lookup(struct proc *p, int fd, struct fileproc **resultfp, int locked); +extern int fo_read(struct fileproc *fp, struct uio *uio, int flags, + vfs_context_t ctx); +extern int fo_write(struct fileproc *fp, struct uio *uio, int flags, + vfs_context_t ctx); +extern int file_vnode_withvid(int, struct vnode **, uint64_t *); +extern int file_drop(int); + +#if ZFS_LEOPARD_ONLY +#define file_vnode_withvid(a, b, c) file_vnode(a, b) +#endif + + +/* + * getf(int fd) - hold a lock on a file descriptor, to be released by calling + * releasef(). On OSX we will also look up the vnode of the fd for calls + * to spl_vn_rdwr(). + */ +void *getf(uint64_t fd) +{ + struct spl_fileproc *sfp = NULL; + HANDLE h; + +#if 1 + struct fileproc *fp = NULL; + struct vnode *vp; + uint64_t vid; + + /* + * We keep the "fp" pointer as well, both for unlocking in releasef() and + * used in vn_rdwr(). + */ + + sfp = kmem_alloc(sizeof(*sfp), KM_SLEEP); + if (!sfp) return NULL; + + // if (fp_lookup(current_proc(), fd, &fp, 0/*!locked*/)) { + // kmem_free(sfp, sizeof(*sfp)); + // return (NULL); + // } + + /* + * The f_vnode ptr is used to point back to the "sfp" node itself, as it is + * the only information passed to vn_rdwr. 
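+ * On Windows the fd is really a HANDLE; ObReferenceObjectByHandle() below
+ * takes a reference on the underlying object, which releasef() later drops
+ * with ObDereferenceObject().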
+ */ + if (ObReferenceObjectByHandle((HANDLE)fd, 0, 0, KernelMode, &fp, 0) != STATUS_SUCCESS) { + dprintf("%s: failed to get fd %d fp 0x\n", __func__, fd); + } + + sfp->f_vnode = sfp; + + sfp->f_fd = fd; + sfp->f_offset = 0; + sfp->f_proc = current_proc(); + sfp->f_fp = (void *)fp; + sfp->f_file = (uint64_t) fp; + + mutex_enter(&spl_getf_lock); + list_insert_tail(&spl_getf_list, sfp); + mutex_exit(&spl_getf_lock); + + //printf("SPL: new getf(%d) ret %p fp is %p so vnode set to %p\n", + // fd, sfp, fp, sfp->f_vnode); +#endif + return sfp; +} + +struct vnode *getf_vnode(void *fp) +{ + struct vnode *vp = NULL; +#if 0 + struct spl_fileproc *sfp = (struct spl_fileproc *) fp; + uint32_t vid; + + if (!file_vnode_withvid(sfp->f_fd, &vp, &vid)) { + file_drop(sfp->f_fd); + } +#endif + return vp; +} + +void releasef(uint64_t fd) +{ + +#if 1 + struct spl_fileproc *fp = NULL; + struct proc *p = NULL; + + //printf("SPL: releasef(%d)\n", fd); + + p = (void *)current_proc(); + mutex_enter(&spl_getf_lock); + for (fp = list_head(&spl_getf_list); fp != NULL; + fp = list_next(&spl_getf_list, fp)) { + if ((fp->f_proc == p) && fp->f_fd == fd) break; + } + mutex_exit(&spl_getf_lock); + if (!fp) return; // Not found + + //printf("SPL: releasing %p\n", fp); + + // Release the hold from getf(). +// if (fp->f_writes) +// fp_drop_written(p, fd, fp->f_fp, 0/*!locked*/); +// else +// fp_drop(p, fd, fp->f_fp, 0/*!locked*/); + if (fp->f_fp) + ObDereferenceObject(fp->f_fp); + + // Remove node from the list + mutex_enter(&spl_getf_lock); + list_remove(&spl_getf_list, fp); + mutex_exit(&spl_getf_lock); + + // Free the node + kmem_free(fp, sizeof(*fp)); +#endif +} + + + +/* + * Our version of vn_rdwr, here "vp" is not actually a vnode, but a ptr + * to the node allocated in getf(). We use the "fp" part of the node to + * be able to issue IO. + * You must call getf() before calling spl_vn_rdwr(). + */ +int spl_vn_rdwr(enum uio_rw rw, + struct vnode *vp, + caddr_t base, + ssize_t len, + offset_t offset, + enum uio_seg seg, + int ioflag, + rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */ + cred_t *cr, + ssize_t *residp) +{ + struct spl_fileproc *sfp = (struct spl_fileproc*)vp; + uio_t *auio; + int spacetype; + int error=0; + vfs_context_t *vctx; + + //spacetype = UIO_SEG_IS_USER_SPACE(seg) ? 
UIO_USERSPACE32 : UIO_SYSSPACE; + + //vctx = vfs_context_create((vfs_context_t)0); + //auio = uio_create(1, 0, spacetype, rw); + ///uio_reset(auio, offset, spacetype, rw); + + //uio_addiov(auio, (uint64_t)(uintptr_t)base, len); + //LARGE_INTEGER Offset; + //Offset.QuadPart = offset; + IO_STATUS_BLOCK iob; + LARGE_INTEGER off; + + off.QuadPart = offset; + + if (rw == UIO_READ) { + error = ZwReadFile((HANDLE)sfp->f_fd, NULL, NULL, NULL, &iob, base, (ULONG)len, &off, NULL); + // error = fo_read(sfp->f_fp, auio, ioflag, vctx); + } else { + // error = fo_write(sfp->f_fp, auio, ioflag, vctx); + error = ZwWriteFile((HANDLE)sfp->f_fd, NULL, NULL, NULL, &iob, base, (ULONG)len, &off, NULL); + sfp->f_writes = 1; + } + + if (residp) { + *residp = len - iob.Information; + } else { + if ((iob.Information < len) && error == 0) + error = EIO; + } + + //uio_free(auio); + //vfs_context_rele(vctx); + + return (error); +} + +void spl_rele_async(void *arg) +{ + struct vnode *vp = (struct vnode *)arg; +#ifdef DEBUG_IOCOUNT + if (vp) { + znode_t *zp = VTOZ(vp); + if (zp) dprintf("%s: Dec iocount from %u for '%s' \n", __func__, + &vp->v_iocount, + zp->z_name_cache); + } +#endif + if (vp) VN_RELE(vp); +} + +void vn_rele_async(struct vnode *vp, void *taskq) +{ +#ifdef DEBUG_IOCOUNT + if (vp) { + znode_t *zp = VTOZ(vp); + if (zp) dprintf("%s: Dec iocount in future, now %u for '%s' \n", __func__, + vp->v_iocount, + zp->z_name_cache); + } +#endif + VERIFY(taskq_dispatch((taskq_t *)taskq, + (task_func_t *)spl_rele_async, vp, TQ_SLEEP) != 0); +} + + + +vfs_context_t *spl_vfs_context_kernel(void) +{ +// return vfs_context_kernel(); + return NULL; +} + +#undef build_path +extern int build_path(struct vnode *vp, char *buff, int buflen, int *outlen, + int flags, vfs_context_t *ctx); + +int spl_build_path(struct vnode *vp, char *buff, int buflen, int *outlen, + int flags, vfs_context_t *ctx) +{ + //return build_path(vp, buff, buflen, outlen, flags, ctx); + return 0; +} + +/* + * vnode_notify was moved from KERNEL_PRIVATE to KERNEL in 10.11, but to be + * backward compatible, we keep the wrapper for now. + */ +extern int vnode_notify(struct vnode *, uint32_t, struct vnode_attr*); +int spl_vnode_notify(struct vnode *vp, uint32_t type, struct vnode_attr *vap) +{ + //return vnode_notify(vp, type, vap); + return 0; +} + +extern int vfs_get_notify_attributes(struct vnode_attr *vap); +int spl_vfs_get_notify_attributes(struct vnode_attr *vap) +{ + //return vfs_get_notify_attributes(vap); + return 0; +} + +/* Root directory vnode for the system a.k.a. '/' */ +/* Must use vfs_rootvnode() to acquire a reference, and + * vnode_put() to release it + */ + +/* + * From early boot (mountroot) we can not call vfs_rootvnode() + * or it will panic. So the default here is to return NULL until + * root has been mounted. XNU will call vfs_root() once that is + * done, so we use that to inform us that root is mounted. In nonboot, + * vfs_start is called early from kextload (zfs_osx.cpp). 
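+ * On Windows this is stubbed: getrootdir() returns NULL until
+ * spl_vfs_start() clears spl_skip_getrootdir below.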
+ */ +static int spl_skip_getrootdir = 1; + +struct vnode * +getrootdir(void) +{ + struct vnode *rvnode = NULL; + if (spl_skip_getrootdir) return NULL; + +// rvnode = vfs_rootvnode(); +// if (rvnode) +// vnode_put(rvnode); + return rvnode; +} + +void spl_vfs_start() +{ + spl_skip_getrootdir = 0; +} + + +int vnode_vfsisrdonly(vnode_t *vp) +{ + return 0; +} + +uint64_t vnode_vid(vnode_t *vp) +{ + return vp->v_id; +} + +int vnode_isreg(vnode_t *vp) +{ + return vp->v_type == VREG; +} + +int vnode_isdir(vnode_t *vp) +{ + return vp->v_type == VDIR; +} + +void *vnode_fsnode(struct vnode *dvp) +{ + return dvp->v_data; +} + +enum vtype vnode_vtype(vnode_t *vp) +{ + return vp->v_type; +} + +int vnode_isblk(vnode_t *vp) +{ + return vp->v_type == VBLK; +} + +int vnode_ischr(vnode_t *vp) +{ + return vp->v_type == VCHR; +} + +int vnode_isswap(vnode_t *vp) +{ + return 0; +} + +int vnode_isfifo(vnode_t *vp) +{ + return 0; +} + +int vnode_islnk(vnode_t *vp) +{ + return 0; +} + +mount_t *vnode_mountedhere(vnode_t *vp) +{ + return NULL; +} + +void ubc_setsize(struct vnode *vp, uint64_t size) +{ +} + +int vnode_isinuse(vnode_t *vp, uint64_t refcnt) +{ + if (((vp->v_usecount /*+ vp->v_iocount*/) > refcnt)) // xnu uses usecount +kusecount, not iocount + return 1; + return 0; +} + +int vnode_isidle(vnode_t *vp) +{ + if ((vp->v_usecount == 0) && (vp->v_iocount <= 1)) + return 1; + return 0; +} + +#ifdef DEBUG_IOCOUNT +int vnode_getwithref(vnode_t *vp, char *file, int line) +#else +int vnode_getwithref(vnode_t *vp) +#endif +{ + KIRQL OldIrql; + int error = 0; +#ifdef FIND_MAF + ASSERT(!(vp->v_flags & 0x8000)); +#endif + + mutex_enter(&vp->v_mutex); + if ((vp->v_flags & VNODE_DEAD)) { + error = ENOENT; +// } else if (vnode_deleted(vp)) { +// error = ENOENT; + } else { +#ifdef DEBUG_IOCOUNT + if (vp) { + znode_t *zp = VTOZ(vp); + if (zp) dprintf("%s: Inc iocount now %u for '%s' (%s:%d) thread %p \n", __func__, + atomic_inc_32_nv(&vp->v_iocount), + zp->z_name_cache, + file, line, current_thread()); + } +#else + atomic_inc_32(&vp->v_iocount); +#endif + } + + mutex_exit(&vp->v_mutex); + return error; +} + +#ifdef DEBUG_IOCOUNT +int vnode_debug_getwithvid(vnode_t *vp, uint64_t id, char *file, int line) +#else +int vnode_getwithvid(vnode_t *vp, uint64_t id) +#endif +{ + KIRQL OldIrql; + int error = 0; + +#ifdef FIND_MAF + ASSERT(!(vp->v_flags & 0x8000)); +#endif + + mutex_enter(&vp->v_mutex); + if ((vp->v_flags & VNODE_DEAD)) { + error = ENOENT; + } else if (id != vp->v_id) { + error = ENOENT; +// } else if (vnode_deleted(vp)) { +// error = ENOENT; + } else { +#ifdef DEBUG_IOCOUNT + if (vp) { + znode_t *zp = VTOZ(vp); + if (zp) dprintf("%s: Inc iocount now %u for '%s' (%s:%d) thread %p\n", __func__, + atomic_inc_32_nv(&vp->v_iocount), + zp->z_name_cache, file, line, current_thread()); + } +#else + atomic_inc_32(&vp->v_iocount); +#endif + } + + mutex_exit(&vp->v_mutex); + return error; +} + +extern void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct); + +#ifdef DEBUG_IOCOUNT +int vnode_put(vnode_t *vp, char *file, int line) +#else +int vnode_put(vnode_t *vp) +#endif +{ + KIRQL OldIrql; + int calldrain = 0; + ASSERT(!(vp->v_flags & VNODE_DEAD)); + ASSERT(vp->v_iocount > 0); + ASSERT((vp->v_flags & ~VNODE_VALIDBITS) == 0); +#ifdef DEBUG_IOCOUNT + if (vp) { + znode_t *zp = VTOZ(vp); + if (zp) dprintf("%s: Dec iocount now %u for '%s' (%s:%d) thread %p \n", __func__, + atomic_dec_32_nv(&vp->v_iocount), + zp->z_name_cache, file, line, current_thread()); + } +#else + atomic_dec_32(&vp->v_iocount); +#endif + // Now 
idle? + mutex_enter(&vp->v_mutex); + + if (vp->v_iocount == 0) { + + calldrain = 1; + + if (vp->v_flags & VNODE_NEEDINACTIVE) { + vp->v_flags &= ~VNODE_NEEDINACTIVE; + mutex_exit(&vp->v_mutex); + zfs_inactive(vp, NULL, NULL); + mutex_enter(&vp->v_mutex); + } + } + + vp->v_flags &= ~VNODE_NEEDINACTIVE; + +#if 0 + // Re-test for idle, as we may have dropped lock for inactive + if ((vp->v_usecount == 0) && (vp->v_iocount == 0)) { + // Was it marked TERM, but we were waiting for last ref to leave. + if ((vp->v_flags & VNODE_MARKTERM)) { + //vnode_recycle_int(vp, VNODE_LOCKED); //OldIrql is lost! + KeReleaseSpinLock(&vp->v_spinlock, OldIrql); + vnode_recycle_int(vp, 0); //OldIrql is lost! + return 0; + } + } +#endif + mutex_exit(&vp->v_mutex); + + // Temporarily - should perhaps be own thread? + if (calldrain) + vnode_drain_delayclose(0); + + return 0; +} + +int vnode_recycle_int(vnode_t *vp, int flags) +{ + KIRQL OldIrql; + ASSERT((vp->v_flags & VNODE_DEAD) == 0); + + // Mark it for recycle, if we are not ROOT. + if (!(vp->v_flags&VNODE_MARKROOT)) { + if (vp->v_flags & VNODE_MARKTERM) + dprintf("already marked\n"); + vp->v_flags |= VNODE_MARKTERM; // Mark it terminating + dprintf("%s: marking %p VNODE_MARKTERM\n", __func__, vp); + } + + // Already locked calling in... + if (!(flags & VNODELOCKED)) { + mutex_enter(&vp->v_mutex); + } + + // Doublecheck CcMgr is gone (should be if avl is empty) + // If it hasn't quite let go yet, let the node linger on deadlist. + if (vp->SectionObjectPointers.DataSectionObject != NULL || + vp->SectionObjectPointers.ImageSectionObject != NULL || + vp->SectionObjectPointers.SharedCacheMap != NULL) { + dprintf("%s: %p still has CcMgr, lingering on dead list.\n", __func__, vp); + mutex_exit(&vp->v_mutex); + return -1; + } + + // We will only reclaim idle nodes, and not mountpoints(ROOT) + if ((flags & FORCECLOSE) || + + ((vp->v_usecount == 0) && + (vp->v_iocount <= 1) && + avl_is_empty(&vp->v_fileobjects) && + ((vp->v_flags&VNODE_MARKROOT) == 0))) { + + ASSERT3P(vp->SectionObjectPointers.DataSectionObject, == , NULL); + ASSERT3P(vp->SectionObjectPointers.ImageSectionObject, == , NULL); + ASSERT3P(vp->SectionObjectPointers.SharedCacheMap, == , NULL); + + vp->v_flags |= VNODE_DEAD; // Mark it dead + // Since we might get swapped out (noticably FsRtlTeardownPerStreamContexts) + // we hold a look until the very end. + vp->v_iocount = 1; + + mutex_exit(&vp->v_mutex); + + FsRtlTeardownPerStreamContexts(&vp->FileHeader); + FsRtlUninitializeFileLock(&vp->lock); + + // Call sync? If vnode_write + //zfs_fsync(vp, 0, NULL, NULL); + + // Call inactive? + if (vp->v_flags & VNODE_NEEDINACTIVE) { + vp->v_flags &= ~VNODE_NEEDINACTIVE; + zfs_inactive(vp, NULL, NULL); + } + + + // Tell FS to release node. + if (zfs_vnop_reclaim(vp)) + panic("vnode_recycle: cannot reclaim\n"); // My fav panic from OSX + + KIRQL OldIrql; + mutex_enter(&vp->v_mutex); + ASSERT(avl_is_empty(&vp->v_fileobjects)); + // We are all done with it. + vp->v_iocount = 0; + mutex_exit(&vp->v_mutex); + +#ifdef FIND_MAF + vp->v_flags |= 0x8000; +#endif + + /* + * Windows has a habit of copying FsContext (vp) without our knowledge and attempt + * To call fsDispatcher. We notice in vnode_getwithref(), which calls mutex_enter + * so we can not free the vp right here like we want to, or that would be a MAF. + * So we let it linger and age, there is no great way to know for sure that it + * has finished trying. 
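+ * Aged-out DEAD vnodes are eventually freed by vnode_drain_delayclose().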
+ */ + dprintf("vp %p left on DEAD list\n", vp); + vp->v_age = gethrtime(); + + return 0; + } + + mutex_exit(&vp->v_mutex); + + return -1; +} + + +int vnode_recycle(vnode_t *vp) +{ + if (vp->v_flags & VNODE_FLUSHING) + return -1; + return vnode_recycle_int(vp, 0); +} + +void vnode_create(mount_t *mp, void *v_data, int type, int flags, struct vnode **vpp) +{ + struct vnode *vp; + // cache_alloc does not zero the struct, we need to + // make sure that those things that need clearing is + // done here. + vp = kmem_cache_alloc(vnode_cache, KM_SLEEP); + *vpp = vp; + vp->v_flags = 0; + vp->v_mount = mp; + vp->v_data = v_data; + vp->v_type = type; + vp->v_id = atomic_inc_64_nv(&(vnode_vid_counter)); + vp->v_iocount = 1; + vp->v_usecount = 0; + vp->v_unlink = 0; + atomic_inc_64(&vnode_active); + + list_link_init(&vp->v_list); + ASSERT(vnode_fileobject_empty(vp, 1)); // lying about locked is ok. + + if (flags & VNODE_MARKROOT) + vp->v_flags |= VNODE_MARKROOT; + + + // Initialise the Windows specific data. + memset(&vp->SectionObjectPointers, 0, sizeof(vp->SectionObjectPointers)); + + FsRtlSetupAdvancedHeader(&vp->FileHeader, &vp->AdvancedFcbHeaderMutex); + + FsRtlInitializeFileLock(&vp->lock, NULL, NULL); + vp->FileHeader.Resource = &vp->resource; + vp->FileHeader.PagingIoResource = &vp->pageio_resource; + + // Add only to list once we have finished initialising. + mutex_enter(&vnode_all_list_lock); + list_insert_tail(&vnode_all_list, vp); + mutex_exit(&vnode_all_list_lock); +} + +int vnode_isvroot(vnode_t *vp) +{ + return (vp->v_flags & VNODE_MARKROOT); +} + +mount_t *vnode_mount(vnode_t *vp) +{ + return NULL; +} + +void vnode_clearfsnode(vnode_t *vp) +{ + vp->v_data = NULL; +} + +void *vnode_sectionpointer(vnode_t *vp) +{ + return &vp->SectionObjectPointers; +} + +int +vnode_ref(vnode_t *vp) +{ + ASSERT(vp->v_iocount > 0); + ASSERT(!(vp->v_flags & VNODE_DEAD)); + atomic_inc_32(&vp->v_usecount); + return 0; +} + +void +vnode_rele(vnode_t *vp) +{ + KIRQL OldIrql; + + ASSERT(!(vp->v_flags & VNODE_DEAD)); + ASSERT(vp->v_iocount > 0); + ASSERT(vp->v_usecount > 0); + atomic_dec_32(&vp->v_usecount); + + // Grab lock and inspect + mutex_enter(&vp->v_mutex); + + // If we were the last usecount, but vp is still + // busy, we set NEEDINACTIVE + if (vp->v_usecount > 0 || vp->v_iocount > 0) { + vp->v_flags |= VNODE_NEEDINACTIVE; + } else { + // We are idle, call inactive, grab a hold + // so we can call inactive unlocked + vp->v_flags &= ~VNODE_NEEDINACTIVE; + mutex_exit(&vp->v_mutex); + atomic_inc_32(&vp->v_iocount); + + zfs_inactive(vp, NULL, NULL); +#ifdef DEBUG_VERBOSE + if (vp) { + znode_t *zp = VTOZ(vp); + if (zp) dprintf("%s: Inc iocount to %u for %s \n", __func__, vp->v_iocount, zp->z_name_cache); + } +#endif + atomic_dec_32(&vp->v_iocount); + // Re-check we are still free, and recycle (markterm) was called + // we can reclaim now + mutex_enter(&vp->v_mutex); + if ((vp->v_iocount == 0) && (vp->v_usecount == 0) && + ((vp->v_flags & (VNODE_MARKTERM)))) { + mutex_exit(&vp->v_mutex); + vnode_recycle_int(vp, 0); + return; + } + } + + mutex_exit(&vp->v_mutex); +} + +/* + * Periodically walk through list and release vnodes that are now idle. + * Set force=1 to perform check now. + * Will return number of vnodes with delete set, but not yet reclaimed. 
+ */ +int vnode_drain_delayclose(int force) +{ + struct vnode *vp, *next = NULL; + int ret = 0; + int candidate = 0; + static hrtime_t last = 0; + const hrtime_t interval = SEC2NSEC(2); + const hrtime_t curtime = gethrtime(); + + mutex_enter(&vnode_all_list_lock); + // This should probably be its own thread, but for now, run once every 2s + if (!force && curtime - last < interval) { + mutex_exit(&vnode_all_list_lock); + return 0; + } + last = curtime; + + dprintf("%s: scanning\n", __func__); + + for (vp = list_head(&vnode_all_list); + vp; + vp = next) { + + next = list_next(&vnode_all_list, vp); + + // Make sure everything about the vp has been released. + vnode_lock(vp); + + // If we see a deleted node awaiting recycle, signal return code + if ((vp->v_flags & VNODE_MARKTERM)) + candidate = 1; + else + candidate = 0; + + if ((vp->v_flags & VNODE_MARKTERM) && + !(vp->v_flags & VNODE_DEAD) && + (vp->v_iocount == 0) && + (vp->v_usecount == 0) && + vnode_fileobject_empty(vp, /* locked */ 1) && + !vnode_isvroot(vp) && + (vp->SectionObjectPointers.ImageSectionObject == NULL) && + (vp->SectionObjectPointers.DataSectionObject == NULL)) { + // We are ready to let go + dprintf("%s: drain %p\n", __func__, vp); + + // Pass VNODELOCKED as we hold vp, recycle will unlock. + // We have to give up all_list due to recycle -> reclaim -> rmnode -> purgedir -> zget -> vnode_create + mutex_exit(&vnode_all_list_lock); + if (vnode_recycle_int(vp, VNODELOCKED) == 0) + candidate = 0; // If recycle was ok, this isnt a node we wait for + mutex_enter(&vnode_all_list_lock); + + // If successful, vp is freed. Do not use vp from here: + + } else if ((vp->v_flags & VNODE_DEAD) && + (vp->v_age != 0) && + (curtime - vp->v_age > SEC2NSEC(5))) { + // Arbitrary time! fixme? It would be nice to know when Windows really wont try this vp again. + // fastfat seems to clear up the cache of the parent directory, perhaps this is the missing + // bit. It is non-trivial to get parent from here though. + + //dprintf("age is %llu %d\n", (curtime - vp->v_age), NSEC2SEC(curtime - vp->v_age)); + + // Finally free vp. + list_remove(&vnode_all_list, vp); + vnode_unlock(vp); + dprintf("%s: freeing DEAD vp %p\n", __func__, vp); + + kmem_cache_free(vnode_cache, vp); // Holding all_list_lock, that OK? 
+ atomic_dec_64(&vnode_active); + + } else { + vnode_unlock(vp); + } + + if (candidate) ret++; + } + mutex_exit(&vnode_all_list_lock); + + return ret; +} + +int mount_count_nodes(struct mount *mp, int flags) +{ + int count = 0; + struct vnode *rvp; + + mutex_enter(&vnode_all_list_lock); + for (rvp = list_head(&vnode_all_list); + rvp; + rvp = list_next(&vnode_all_list, rvp)) { + if (rvp->v_mount != mp) + continue; + if ((flags&SKIPROOT) && vnode_isvroot(rvp)) + continue; + count++; + } + mutex_exit(&vnode_all_list_lock); + return count; +} + +int vflush(struct mount *mp, struct vnode *skipvp, int flags) +{ + // Iterate the vnode list and call reclaim + // flags: + // SKIPROOT : dont release root nodes (mountpoints) + // SKIPSYSTEM : dont release vnodes marked as system + // FORCECLOSE : release everything, force unmount + + // if mp is NULL, we are reclaiming nodes, until threshold + int isbusy = 0; + int reclaims = 0; + vnode_fileobjects_t *node; + struct vnode *rvp; + + dprintf("vflush start\n"); + +repeat: + mutex_enter(&vnode_all_list_lock); + while (1) { + for (rvp = list_head(&vnode_all_list); + rvp; + rvp = list_next(&vnode_all_list, rvp)) { + + // skip vnodes not belonging to this mount + if (mp && rvp->v_mount != mp) + continue; + + // If we aren't FORCE and asked to SKIPROOT, and node + // is MARKROOT, then go to next. + if (!(flags & FORCECLOSE)) + if ((flags & SKIPROOT)) + if (rvp->v_flags & VNODE_MARKROOT) + continue; + + // We are to remove this node, even if ROOT - unmark it. + mutex_exit(&vnode_all_list_lock); + + // Release the AVL tree + KIRQL OldIrql; + + // Attempt to flush out any caches; + mutex_enter(&rvp->v_mutex); + // Make sure we don't call vnode_cacheflush() again + // from IRP_MJ_CLOSE. + rvp->v_flags |= VNODE_FLUSHING; + + while ((node = avl_first(&rvp->v_fileobjects)) != NULL) { + FILE_OBJECT *fileobject = node->fileobject; + + avl_remove(&rvp->v_fileobjects, node); + + // Because the CC* calls can re-enter ZFS, we need to + // release the lock, and because we release the lock the + // while has to start from the top each time. We release + // the node at end of this while. + + // Try to lock fileobject before we use it. + if (NT_SUCCESS(ObReferenceObjectByPointer( + fileobject, // fixme, keep this in dvd + 0, + *IoFileObjectType, + KernelMode))) { + + mutex_exit(&rvp->v_mutex); + vnode_flushcache(rvp, fileobject, TRUE); + + ObDereferenceObject(fileobject); + + mutex_enter(&rvp->v_mutex); + } // if ObReferenceObjectByPointer + + + // Grab mutex for the while() above. + kmem_free(node, sizeof(*node)); + + } // while + + // vnode_recycle_int() will call mutex_exit(&rvp->v_mutex); + // re-check flags, due to releasing locks + isbusy = 1; + if (!(rvp->v_flags & VNODE_DEAD)) + isbusy = vnode_recycle_int(rvp, (flags & FORCECLOSE) | VNODELOCKED); + else + mutex_exit(&rvp->v_mutex); + + mutex_enter(&vnode_all_list_lock); + + if (!isbusy) { + reclaims++; + break; // must restart loop if unlinked node + } + } + + // If the end of the list was reached, stop entirely + if (!rvp) break; + } + + mutex_exit(&vnode_all_list_lock); + + if (mp == NULL && reclaims > 0) { + dprintf("%s: %llu reclaims processed.\n", __func__, reclaims); + } + + + kpreempt(KPREEMPT_SYNC); + + // Check if all nodes have gone, or we are waiting for CcMgr + // not counting the MARKROOT vnode for the mount. So if empty list, + // or it is exactly one node with MARKROOT, then we are done. + // Unless FORCECLOSE, then root as well shall be gone. 
+ + // Ok, we need to count nodes that match this mount, not "all" + // nodes, possibly belonging to other mounts. + + if (mount_count_nodes(mp, (flags & FORCECLOSE) ? 0 : SKIPROOT) > 0) { + dprintf("%s: waiting for vnode flush1.\n", __func__); + // Is there a better wakeup we can do here? + delay(hz >> 1); + vnode_drain_delayclose(1); + goto repeat; + } + + dprintf("vflush end\n"); + + return 0; +} + +/* + * Set the Windows SecurityPolicy + */ +void vnode_setsecurity(vnode_t *vp, void *sd) +{ + vp->security_descriptor = sd; +} +void *vnode_security(vnode_t *vp) +{ + return vp->security_descriptor; +} + +extern CACHE_MANAGER_CALLBACKS CacheManagerCallbacks; + +void vnode_couplefileobject(vnode_t *vp, FILE_OBJECT *fileobject, uint64_t size) +{ + if (fileobject) { + + fileobject->FsContext = vp; + + // Make sure it is pointing to the right vp. + if (fileobject->SectionObjectPointer != vnode_sectionpointer(vp)) { + fileobject->SectionObjectPointer = vnode_sectionpointer(vp); + } + + // If this fo's CcMgr hasn't been initialised, do so now + // this ties each fileobject to CcMgr, it is not about + // the vp itself. CcInit will be called many times on a vp, + // once for each fileobject. + dprintf("%s: vp %p fo %p\n", __func__, vp, fileobject); + + // Add this fileobject to the list of known ones. + vnode_fileobject_add(vp, fileobject); + + if (vnode_isvroot(vp)) return; + + vnode_pager_setsize(vp, size); + vnode_setsizechange(vp, 0); // We are updating now, clear sizechange + + CcInitializeCacheMap(fileobject, + (PCC_FILE_SIZES)&vp->FileHeader.AllocationSize, + FALSE, + &CacheManagerCallbacks, vp); + dprintf("return init\n"); + } +} + +// Attempt to boot CcMgr out of the fileobject, return +// true if we could +int vnode_flushcache(vnode_t *vp, FILE_OBJECT *fileobject, boolean_t hard) +{ + CACHE_UNINITIALIZE_EVENT UninitializeCompleteEvent; + NTSTATUS WaitStatus; + LARGE_INTEGER Zero = { 0,0 }; + int ret = 0; + + if (vp == NULL) + return 1; + + if (fileobject == NULL) + return 1; + + // Have CcMgr already released it? + if (fileobject->SectionObjectPointer == NULL) + return 1; + + // Because CcUninitializeCacheMap() can call MJ_CLOSE immediately, and we + // don't want to free anything in *that* call, take a usecount++ here, that + // way we skip the vnode_isinuse() test + atomic_inc_32(&vp->v_usecount); + + if (fileobject->SectionObjectPointer->ImageSectionObject) { + if (hard) + (VOID)MmForceSectionClosed(fileobject->SectionObjectPointer, TRUE); + else + (VOID)MmFlushImageSection(fileobject->SectionObjectPointer, MmFlushForWrite); + } + + // DataSection next + if (fileobject->SectionObjectPointer->DataSectionObject) { + IO_STATUS_BLOCK iosb; + CcFlushCache(fileobject->SectionObjectPointer, NULL, 0, &iosb); + ExAcquireResourceExclusiveLite(vp->FileHeader.PagingIoResource, TRUE); + ExReleaseResourceLite(vp->FileHeader.PagingIoResource); + } + + CcPurgeCacheSection(fileobject->SectionObjectPointer, NULL, 0, FALSE /*hard*/); + + KeInitializeEvent(&UninitializeCompleteEvent.Event, + SynchronizationEvent, + FALSE); + + // Try to release cache + dprintf("calling CcUninit: fo %p\n", fileobject); + int temp = CcUninitializeCacheMap(fileobject, + hard ? &Zero : NULL, + NULL); + dprintf("complete CcUninit\n"); + + // Remove usecount lock held above. 
+ atomic_dec_32(&vp->v_usecount); + + // Unable to fully release CcMgr + dprintf("%s: ret %d : vp %p fo %p\n", __func__, ret, + vp, fileobject); + + return ret; +} + + +void vnode_decouplefileobject(vnode_t *vp, FILE_OBJECT *fileobject) +{ + if (fileobject && fileobject->FsContext) { + dprintf("%s: fo %p -X-> %p\n", __func__, fileobject, vp); + + // If we are flushing, we do nothing here. + if (vp->v_flags & VNODE_FLUSHING) return; + + if (vnode_flushcache(vp, fileobject, FALSE)) + fileobject->FsContext = NULL; + } +} + +void vnode_setsizechange(vnode_t *vp, int set) +{ + if (set) + vp->v_flags |= VNODE_SIZECHANGE; + else + vp->v_flags &= ~VNODE_SIZECHANGE; +} + +int vnode_sizechange(vnode_t *vp) +{ + return (vp->v_flags & VNODE_SIZECHANGE); +} + +int vnode_isrecycled(vnode_t *vp) +{ + return (vp->v_flags&(VNODE_MARKTERM | VNODE_DEAD)); +} + +void vnode_lock(vnode_t *vp) +{ + mutex_enter(&vp->v_mutex); +} + +//int vnode_trylock(vnode_t *vp); + +void vnode_unlock(vnode_t *vp) +{ + mutex_exit(&vp->v_mutex); +} + + +/* + * Add a FileObject to the list of FO in the vnode. + * Return 1 if we actually added it + * Return 0 if it was already in the list. + */ +int vnode_fileobject_add(vnode_t *vp, void *fo) +{ + vnode_fileobjects_t *node; + avl_index_t idx; + KIRQL OldIrql; + mutex_enter(&vp->v_mutex); + // Early out to avoid memory alloc + vnode_fileobjects_t search; + search.fileobject = fo; + if (avl_find(&vp->v_fileobjects, &search, &idx) != NULL) { + mutex_exit(&vp->v_mutex); + return 0; + } + mutex_exit(&vp->v_mutex); + + node = kmem_alloc(sizeof(*node), KM_SLEEP); + node->fileobject = fo; + + mutex_enter(&vp->v_mutex); + if (avl_find(&vp->v_fileobjects, node, &idx) == NULL) { + avl_insert(&vp->v_fileobjects, node, idx); + mutex_exit(&vp->v_mutex); + return 1; + } else { + mutex_exit(&vp->v_mutex); + kmem_free(node, sizeof(*node)); + return 0; + } + // not reached. + mutex_exit(&vp->v_mutex); + return 0; +} + +/* + * Remove a FileObject from the list of FO in the vnode. + * Return 1 if we actually removed it + * Return 0 if it was not in the list. + */ +int vnode_fileobject_remove(vnode_t *vp, void *fo) +{ + vnode_fileobjects_t search, *node; + KIRQL OldIrql; + mutex_enter(&vp->v_mutex); + search.fileobject = fo; + node = avl_find(&vp->v_fileobjects, &search, NULL); + if (node == NULL) { + mutex_exit(&vp->v_mutex); + + return 0; + } + avl_remove(&vp->v_fileobjects, node); + mutex_exit(&vp->v_mutex); + kmem_free(node, sizeof(*node)); + + return 1; +} + +/* + * Check and make sure the list of FileObjects is empty + */ +int vnode_fileobject_empty(vnode_t *vp, int locked) +{ + KIRQL OldIrql; + + if (!locked) + mutex_enter(&vp->v_mutex); + boolean_t ret = avl_is_empty(&vp->v_fileobjects); + if (!locked) + mutex_exit(&vp->v_mutex); + + return ret; +} + +// Get cached EA size, returns 1 is it is cached, 0 if not. +int vnode_easize(struct vnode *vp, uint64_t *size) +{ + if (vp->v_flags & VNODE_EASIZE) { + *size = vp->v_easize; + return 1; + } + return 0; +} + +void vnode_set_easize(struct vnode *vp, uint64_t size) +{ + vp->v_easize = size; + vp->v_flags |= VNODE_EASIZE; +} + +void vnode_clear_easize(struct vnode *vp) +{ + vp->v_flags &= ~VNODE_EASIZE; +} + +#ifdef DEBUG_IOCOUNT +void vnode_check_iocount(void) +{ + /* Iterate all vnodes, checking that iocount is zero. 
*/ + struct vnode *rvp; + mutex_enter(&vnode_all_list_lock); + for (rvp = list_head(&vnode_all_list); + rvp; + rvp = list_next(&vnode_all_list, rvp)) { + ASSERT0(rvp->v_iocount); + } + mutex_exit(&vnode_all_list_lock); +} +#endif + + + diff --git a/module/os/windows/spl/spl-windows.c b/module/os/windows/spl/spl-windows.c new file mode 100644 index 000000000000..0e73bd62653f --- /dev/null +++ b/module/os/windows/spl/spl-windows.c @@ -0,0 +1,647 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2018 Jorgen Lundman + * + */ + +#include +#include + +#include +//#include +//#include + +#include +#include +#include +//#include +#include +#include + +//#define MACH_KERNEL_PRIVATE + +//#include + +#define DEBUG 1 // for backtrace debugging info + +struct utsname utsname = { { 0 } }; + +//extern struct machine_info machine_info; + +unsigned int max_ncpus = 0; +uint64_t total_memory = 0; +uint64_t real_total_memory = 0; + +volatile unsigned int vm_page_free_wanted = 0; +volatile unsigned int vm_page_free_min = 512; +volatile unsigned int vm_page_free_count = 5000; +volatile unsigned int vm_page_speculative_count = 5500; + +uint64_t spl_GetPhysMem(void); + +#include +//#include +/* protect against: + * /System/Library/Frameworks/Kernel.framework/Headers/mach/task.h:197: error: conflicting types for ‘spl_thread_create’ + * ../../include/sys/thread.h:72: error: previous declaration of ‘spl_thread_create’ was here + */ +#define _task_user_ +//#include + +#include + +// Size in bytes of the memory allocated in seg_kmem +extern uint64_t segkmem_total_mem_allocated; +#define MAXHOSTNAMELEN 64 +extern char hostname[MAXHOSTNAMELEN]; + +uint32_t spl_hostid = 0; + +/* + * Solaris delay is in ticks (hz) and Windows in 100 nanosecs + * 1 HZ is 10 milliseconds, 10000000 nanoseconds. 
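+ * For example, one tick is 10 ms = 10,000,000 ns = 100,000 units of 100 ns,
+ * hence the multiplication by 100000 below; the negative interval asks
+ * KeDelayExecutionThread() for a relative (not absolute) delay.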
+ */ +void +windows_delay(int ticks) +{ + LARGE_INTEGER interval; + // * 10000000 / 100 + interval.QuadPart = -((uint64_t)ticks) * 100000ULL; + KeDelayExecutionThread(KernelMode, FALSE, &interval); +} + +uint32_t zone_get_hostid(void *zone) +{ + return spl_hostid; +} + +const char *spl_panicstr(void) +{ + return ""; +} + +int spl_system_inshutdown(void) +{ + return 0; +} + +void +hrt2ts(hrtime_t hrt, timespec_t *tsp) +{ + tsp->tv_sec = (time_t)(hrt / NANOSEC); + tsp->tv_nsec = (hrt % NANOSEC); +} + +// If we want to implement this on Windows, we could probably use +// https://stackoverflow.com/questions/590160/how-to-log-stack-frames-with-windows-x64 +// which calls RtlCaptureStackBackTrace(); +int +getpcstack(uintptr_t *pcstack, int pcstack_limit) +{ + return RtlCaptureStackBackTrace(1, pcstack_limit, (PVOID *)pcstack, NULL); +} + +/* + * fnv_32a_str - perform a 32 bit Fowler/Noll/Vo FNV-1a hash on a string + * + * input: + * str - string to hash + * hval - previous hash value or 0 if first call + * + * returns: + * 32 bit hash as a static hash type + * + * NOTE: To use the recommended 32 bit FNV-1a hash, use FNV1_32A_INIT as the + * hval arg on the first call to either fnv_32a_buf() or fnv_32a_str(). + */ +uint32_t +fnv_32a_str(const char *str, uint32_t hval) +{ + unsigned char *s = (unsigned char *)str; /* unsigned string */ + + /* + * FNV-1a hash each octet in the buffer + */ + while (*s) { + + /* xor the bottom with the current octet */ + hval ^= (uint32_t)*s++; + + /* multiply by the 32 bit FNV magic prime mod 2^32 */ +#if defined(NO_FNV_GCC_OPTIMIZATION) + hval *= FNV_32_PRIME; +#else + hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24); +#endif + } + + /* return our new hash value */ + return hval; +} + +/* + * fnv_32a_buf - perform a 32 bit Fowler/Noll/Vo FNV-1a hash on a buffer + * + * input: + *buf- start of buffer to hash + *len- length of buffer in octets + *hval- previous hash value or 0 if first call + * + * returns: + *32 bit hash as a static hash type + * + * NOTE: To use the recommended 32 bit FNV-1a hash, use FNV1_32A_INIT as the + * hval arg on the first call to either fnv_32a_buf() or fnv_32a_str(). 
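+ * For reference, FNV1_32A_INIT is the 32-bit FNV-1a offset basis 0x811c9dc5.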
+ */ +uint32_t +fnv_32a_buf(void *buf, size_t len, uint32_t hval) +{ + unsigned char *bp = (unsigned char *)buf;/* start of buffer */ + unsigned char *be = bp + len;/* beyond end of buffer */ + + /* + * FNV-1a hash each octet in the buffer + */ + while (bp < be) { + + /* xor the bottom with the current octet */ + hval ^= (uint32_t)*bp++; + + /* multiply by the 32 bit FNV magic prime mod 2^32 */ +#if defined(NO_FNV_GCC_OPTIMIZATION) + hval *= FNV_32_PRIME; +#else + hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24); +#endif + } + + /* return our new hash value */ + return hval; +} + +/* + * Function to free a MDL chain + */ +void UnlockAndFreeMdl(PMDL Mdl) +{ + PMDL currentMdl, nextMdl; + + for (currentMdl = Mdl; currentMdl != NULL; currentMdl = nextMdl) + { + nextMdl = currentMdl->Next; + if (currentMdl->MdlFlags & MDL_PAGES_LOCKED) + { + MmUnlockPages(currentMdl); + } + IoFreeMdl(currentMdl); + } +} + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + int error = 0; + PMDL mdl = NULL; + PCHAR buffer = NULL; + + if (from == NULL || + to == NULL || + len == 0) + return 0; + + /* Fake ioctl() issued by kernel, so we just need to bcopy */ + if (flags & FKIOCTL) { + if (flags & FCOPYSTR) + strlcpy(to, from, len); + else + bcopy(from, to, len); + return 0; + } + + //ret = copyin((user_addr_t)from, (void *)to, len); + // Lets try reading from the input nvlist + dprintf("SPL: trying windows copyin: %p:%d\n", from, len); + + try { + ProbeForRead((void *)from, len, sizeof(UCHAR)); + } + except(EXCEPTION_EXECUTE_HANDLER) + { + error = GetExceptionCode(); + } + if (error) { + TraceEvent(TRACE_ERROR, "SPL: Exception while accessing inBuf 0X%08X\n", error); + goto out; + } + + mdl = IoAllocateMdl((void *)from, len, FALSE, FALSE, NULL); + if (!mdl) { + error = STATUS_INSUFFICIENT_RESOURCES; + goto out; + } + + try { + MmProbeAndLockPages(mdl, UserMode, IoReadAccess); + } + except(EXCEPTION_EXECUTE_HANDLER) + { + error = GetExceptionCode(); + } + if (error) { + TraceEvent(TRACE_ERROR, "SPL: Exception while locking inBuf 0X%08X\n", error); + goto out; + } + + buffer = MmGetSystemAddressForMdlSafe(mdl, NormalPagePriority | MdlMappingNoExecute); + + if (!buffer) { + error = STATUS_INSUFFICIENT_RESOURCES; + } else { + // Success, copy over the data. 
+ if (flags & FCOPYSTR) + strlcpy(to, buffer, len); + else + bcopy(buffer, to, len); + } + + dprintf("SPL: copyin return %d (%d bytes)\n", error, len); + +out: + if (mdl) { + UnlockAndFreeMdl(mdl); + } + + return error; +} + + +int +ddi_copyout(const void *from, void *to, size_t len, int flags) +{ + int error = 0; + PMDL mdl = NULL; + PCHAR buffer = NULL; + + if (from == NULL || + to == NULL || + len == 0) + return 0; + + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + bcopy(from, to, len); + return 0; + } + + //dprintf("SPL: trying windows copyout: %p:%d\n", to, len); + + mdl = IoAllocateMdl(to, len, FALSE, FALSE, NULL); + if (!mdl) { + error = STATUS_INSUFFICIENT_RESOURCES; + TraceEvent(TRACE_ERROR, "SPL: copyout failed to allocate mdl\n"); + goto out; + } + + try { + MmProbeAndLockPages(mdl, UserMode, IoWriteAccess); + } + except(EXCEPTION_EXECUTE_HANDLER) + { + error = GetExceptionCode(); + } + if (error != 0) { + TraceEvent(TRACE_ERROR, "SPL: Exception while locking outBuf 0X%08X\n", + error); + goto out; + } + + buffer = MmGetSystemAddressForMdlSafe(mdl, NormalPagePriority | MdlMappingNoExecute); + + if (!buffer) { + error = STATUS_INSUFFICIENT_RESOURCES; + goto out; + } else { + // Success, copy over the data. + bcopy(from, buffer, len); + } + //dprintf("SPL: copyout return %d (%d bytes)\n", error, len); +out: + if (mdl) { + UnlockAndFreeMdl(mdl); + } + + return error; +} + +int +ddi_copysetup(void *to, size_t len, void **out_buffer, PMDL *out_mdl) +{ + int error = 0; + PMDL mdl = NULL; + PCHAR buffer = NULL; + + if (to == NULL || + out_buffer == NULL || + out_mdl == NULL || + len == 0) + return 0; + + //dprintf("SPL: trying windows copyout_ex: %p:%d\n", to, len); + + // Do we have to call both? Or is calling ProbeForWrite enough? + try { + ProbeForRead(to, len, sizeof(UCHAR)); + } + except(EXCEPTION_EXECUTE_HANDLER) + { + error = GetExceptionCode(); + } + if (error) { + TraceEvent(TRACE_ERROR, "SPL: Exception while accessing inBuf 0X%08X\n", error); + goto out; + } + + try { + ProbeForWrite(to, len, sizeof(UCHAR)); + } + except(EXCEPTION_EXECUTE_HANDLER) + { + error = GetExceptionCode(); + } + if (error) { + TraceEvent(TRACE_ERROR, "SPL: Exception while accessing inBuf 0X%08X\n", error); + goto out; + } + + mdl = IoAllocateMdl(to, len, FALSE, FALSE, NULL); + if (!mdl) { + error = STATUS_INSUFFICIENT_RESOURCES; + TraceEvent(TRACE_ERROR, "SPL: copyout failed to allocate mdl\n"); + goto out; + } + + try { + MmProbeAndLockPages(mdl, UserMode, IoWriteAccess); + } + except(EXCEPTION_EXECUTE_HANDLER) + { + error = GetExceptionCode(); + } + if (error != 0) { + TraceEvent(TRACE_ERROR, "SPL: Exception while locking outBuf 0X%08X\n", + error); + goto out; + } + + buffer = MmGetSystemAddressForMdlSafe(mdl, NormalPagePriority | MdlMappingNoExecute); + + if (!buffer) { + error = STATUS_INSUFFICIENT_RESOURCES; + goto out; + } + + *out_buffer = buffer; + *out_mdl = mdl; + return 0; + +out: + if (mdl) { + UnlockAndFreeMdl(mdl); + } + + return error; +} + + +/* Technically, this call does not exist in IllumOS, but we use it for + * consistency. 
+ */ +int ddi_copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done) +{ + int ret = 0; + + ret = ddi_copyin((const void *)uaddr, kaddr, len, FCOPYSTR); + if ((ret == STATUS_SUCCESS) && done) { + *done = strlen(kaddr) + 1; // copyinstr includes the NULL byte + } + return ret; +} + + + + +int spl_start (void) +{ + //max_ncpus = processor_avail_count; + int ncpus; + size_t len = sizeof(ncpus); + + dprintf("SPL: start\n"); + max_ncpus = KeQueryActiveProcessorCountEx(ALL_PROCESSOR_GROUPS); + if (!max_ncpus) max_ncpus = 1; + dprintf("SPL: total ncpu %d\n", max_ncpus); + + // Not sure how to get physical RAM size in a Windows Driver + // So until then, pull some numbers out of the aether. Next + // we could let users pass in a value, somehow... + total_memory = spl_GetPhysMem(); + + // Set 2GB as code above doesnt work + if (!total_memory) + total_memory = 2ULL * 1024ULL * 1024ULL * 1024ULL; + + dprintf("SPL: memsize %llu (before adjustment)\n", total_memory); + /* + * Setting the total memory to physmem * 80% here, since kmem is + * not in charge of all memory and we need to leave some room for + * the OS X allocator. We internally add pressure if we step over it + */ + real_total_memory = total_memory; + total_memory = total_memory * 50ULL / 100ULL; // smd: experiment with 50%, 8GiB + physmem = total_memory / PAGE_SIZE; + + // We need to set these to some non-zero values + // so we don't think there is permanent memory + // pressure. + vm_page_free_count = (unsigned int)(physmem/2ULL); + vm_page_speculative_count = vm_page_free_count; + + /* + * For some reason, (CTLFLAG_KERN is not set) looking up hostname + * returns 1. So we set it to uuid just to give it *something*. + * As it happens, ZFS sets the nodename on init. + */ + //len = sizeof(utsname.nodename); + //sysctlbyname("kern.uuid", &utsname.nodename, &len, NULL, 0); + + //len = sizeof(utsname.release); + //sysctlbyname("kern.osrelease", &utsname.release, &len, NULL, 0); + + //len = sizeof(utsname.version); + //sysctlbyname("kern.version", &utsname.version, &len, NULL, 0); + + //strlcpy(utsname.nodename, hostname, sizeof(utsname.nodename)); + strlcpy(utsname.nodename, "Windows", sizeof(utsname.nodename)); + spl_mutex_subsystem_init(); + //DbgBreakPoint(); + spl_kmem_init(total_memory); + + spl_tsd_init(); + spl_rwlock_init(); + spl_taskq_init(); + + spl_vnode_init(); + spl_kmem_thread_init(); + spl_kmem_mp_init(); + + IOLog("SPL: Loaded module v%s-%s%s, " + "(ncpu %d, memsize %llu, pages %llu)\n", + SPL_META_VERSION, SPL_META_RELEASE, SPL_DEBUG_STR, + max_ncpus, total_memory, physmem); + return STATUS_SUCCESS; +} + +extern uint64_t zfs_threads; + +int spl_stop (void) +{ + spl_kmem_thread_fini(); + spl_vnode_fini(); + spl_taskq_fini(); + spl_rwlock_fini(); + spl_tsd_fini(); + spl_kmem_fini(); + // XXX: we run into a bunch of problems with this kstat_fini stuff, as it calls vmem_fini a second time + // after spl_kmem_fini()->kernelheap_fini()->vmem_fini(heap_arena) got called + // and therefore destroys global structures twice + // so skip that for the moment + // + // spl_kstat_fini(); + spl_mutex_subsystem_fini(); + IOLog("SPL: Unloaded module v%s-%s " + "(os_mem_alloc: %llu)\n", + SPL_META_VERSION, SPL_META_RELEASE, + segkmem_total_mem_allocated); + while (zfs_threads >= 1) { + IOLog("SPL: active threads %d\n", zfs_threads); + delay(hz << 2); + } + return STATUS_SUCCESS; +} + + + + +#define UNICODE + +#pragma pack(push, 4) +typedef struct { + UCHAR Type; + UCHAR ShareDisposition; + USHORT Flags; + ULONGLONG Start; + ULONG 
Length; +} MEMORY, *PMEMORY; +#pragma pack(pop) + +/* TimoVJL */ +LONGLONG GetMemResources(char *pData) +{ + LONGLONG llMem = 0; + char *pPtr; + uint32_t *pDW; + pDW = (uint32_t *)pData; + if (*pDW != 1) return 0; + DWORD nCnt = *(uint32_t *)(pData + 0x10); // Count + pPtr = pData + 0x14; + DWORD nRLen = 0; + if (*(pData + 0x14) == *(pData + 0x24)) nRLen = 16; + if (*(pData + 0x14) == *(pData + 0x28)) nRLen = 20; + PMEMORY pMem; + for (DWORD nIdx = 0; nRLen && nIdx < nCnt; nIdx++) { + pMem = (PMEMORY)(pPtr + nRLen * nIdx); + if (pMem->Type == 3) llMem += pMem->Length; + if (pMem->Type == 7 && pMem->Flags == 0x200) llMem += ((LONGLONG)pMem->Length) << 8; + pMem += nRLen; + } + return llMem; +} + +NTSTATUS +spl_query_memsize( + IN PWSTR ValueName, + IN ULONG ValueType, + IN PVOID ValueData, + IN ULONG ValueLength, + IN PVOID Context, + IN PVOID EntryContext +) +{ + + dprintf("%s: '%S' type 0x%x len 0x%x\n", __func__, + ValueName, ValueType, ValueLength); + + if ((ValueType == REG_RESOURCE_LIST) && + (_wcsicmp(L".Translated", ValueName) == 0)) { + uint64_t *value; + value = EntryContext; + if (value) + *value = GetMemResources(ValueData); + dprintf("%s: memsize is %llu\n", __func__, value ? *value : 0); + } + + return STATUS_SUCCESS; +} + + +uint64_t spl_GetPhysMem(void) +{ + uint64_t memory; + NTSTATUS status; + static RTL_QUERY_REGISTRY_TABLE query[2] = + { + { + .Flags = RTL_QUERY_REGISTRY_REQUIRED + /*| RTL_QUERY_REGISTRY_DIRECT*/ + | RTL_QUERY_REGISTRY_NOEXPAND + | RTL_QUERY_REGISTRY_TYPECHECK, + .QueryRoutine = spl_query_memsize, + } + }; + + query[0].EntryContext = &memory; + status = RtlQueryRegistryValues( + RTL_REGISTRY_ABSOLUTE, + L"\\REGISTRY\\MACHINE\\HARDWARE\\RESOURCEMAP\\System Resources\\Physical Memory", + query, NULL, NULL); + + if (status != STATUS_SUCCESS) { + TraceEvent(TRACE_ERROR, "%s: size query failed: 0x%x\n", __func__, status); + return 0ULL; + } + + return memory; +} + + diff --git a/module/os/windows/spl/spl-xdr.c b/module/os/windows/spl/spl-xdr.c new file mode 100644 index 000000000000..1317826fec86 --- /dev/null +++ b/module/os/windows/spl/spl-xdr.c @@ -0,0 +1,523 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * + * Copyright (C) 2017 Jorgen Lundman + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +/* + * SPL's XDR mem implementation. + * + * This is used by libnvpair to serialize/deserialize the name-value pair data + * structures into byte arrays in a well-defined and portable manner. + * + * These data structures are used by the DMU/ZFS to flexibly manipulate various + * information in memory and later serialize it/deserialize it to disk. 
+ * Examples of usages include the pool configuration, lists of pool and dataset + * properties, etc. + * + * Reference documentation for the XDR representation and XDR operations can be + * found in RFC 1832 and xdr(3), respectively. + * + * === Implementation shortcomings === + * + * It is assumed that the following C types have the following sizes: + * + * char/unsigned char: 1 byte + * short/unsigned short: 2 bytes + * int/unsigned int: 4 bytes + * longlong_t/u_longlong_t: 8 bytes + * + * The C standard allows these types to be larger (and in the case of ints, + * shorter), so if that is the case on some compiler/architecture, the build + * will fail (on purpose). + * + * If someone wants to fix the code to work properly on such environments, then: + * + * 1) Preconditions should be added to xdrmem_enc functions to make sure the + * caller doesn't pass arguments which exceed the expected range. + * 2) Functions which take signed integers should be changed to properly do + * sign extension. + * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger + * problems than this implementation. + * + * It is also assumed that: + * + * 1) Chars have 8 bits. + * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned + * memcpy, memset and memcmp. + * 3) Arrays passed to xdr_array() are packed and the compiler/architecture + * supports element-sized-aligned memory accesses. + * 4) Negative integers are natively stored in two's complement binary + * representation. + * + * No checks are done for the 4 assumptions above, though. + * + * === Caller expectations === + * + * Existing documentation does not describe the semantics of XDR operations very + * well. Therefore, some assumptions about failure semantics will be made and + * will be described below: + * + * 1) If any encoding operation fails (e.g., due to lack of buffer space), the + * the stream should be considered valid only up to the encoding operation + * previous to the one that first failed. However, the stream size as returned + * by xdr_control() cannot be considered to be strictly correct (it may be + * bigger). + * + * Putting it another way, if there is an encoding failure it's undefined + * whether anything is added to the stream in that operation and therefore + * neither xdr_control() nor future encoding operations on the same stream can + * be relied upon to produce correct results. + * + * 2) If a decoding operation fails, it's undefined whether anything will be + * decoded into passed buffers/pointers during that operation, or what the + * values on those buffers will look like. + * + * Future decoding operations on the same stream will also have similar + * undefined behavior. + * + * 3) When the first decoding operation fails it is OK to trust the results of + * previous decoding operations on the same stream, as long as the caller + * expects a failure to be possible (e.g. due to end-of-stream). + * + * However, this is highly discouraged because the caller should know the + * stream size and should be coded to expect any decoding failure to be data + * corruption due to hardware, accidental or even malicious causes, which should + * be handled gracefully in all cases. + * + * In very rare situations where there are strong reasons to believe the data + * can be trusted to be valid and non-tampered with, then the caller may assume + * a decoding failure to be a bug (e.g. due to mismatched data types) and may + * fail non-gracefully. 
+ * + * 4) Non-zero padding bytes will cause the decoding operation to fail. + * + * 5) Zero bytes on string types will also cause the decoding operation to fail. + * + * 6) It is assumed that either the pointer to the stream buffer given by the + * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int + * memory accesses. + * + * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap. + * + * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user + * space or MMIO space), the computer may explode. + */ + +static struct xdr_ops xdrmem_encode_ops; +static struct xdr_ops xdrmem_decode_ops; + +void +xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op) +{ + switch (op) { + case XDR_ENCODE: + xdrs->x_ops = &xdrmem_encode_ops; + break; + case XDR_DECODE: + xdrs->x_ops = &xdrmem_decode_ops; + break; + default: + TraceEvent(TRACE_ERROR, "SPL: Invalid op value: %d\n", op); + xdrs->x_ops = NULL; /* Let the caller know we failed */ + return; + } + + xdrs->x_op = op; + xdrs->x_addr = addr; + xdrs->x_addr_end = addr + size; + + if (xdrs->x_addr_end < xdrs->x_addr) { + TraceEvent(TRACE_ERROR, "SPL: Overflow while creating xdrmem: %p, %u\n", addr, size); + xdrs->x_ops = NULL; + } +} +EXPORT_SYMBOL(xdrmem_create); + +static bool_t +xdrmem_control(XDR *xdrs, int req, void *info) +{ + struct xdr_bytesrec *rec = (struct xdr_bytesrec *) info; + + if (req != XDR_GET_BYTES_AVAIL) { + TraceEvent(TRACE_ERROR, "SPL: Called with unknown request: %d\n", req); + return FALSE; + } + + rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */ + rec->xc_num_avail = (size_t)(xdrs->x_addr_end - xdrs->x_addr); + + return TRUE; +} + +static bool_t +xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + uint_t size = P2ROUNDUP(cnt, 4); + uint_t pad; + + if (size < cnt) + return FALSE; /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return FALSE; + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return FALSE; + + memcpy(xdrs->x_addr, cp, cnt); + + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + memset(xdrs->x_addr, 0, pad); + xdrs->x_addr += pad; + } + + return TRUE; +} + +static bool_t +xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + static uint32_t zero = 0; + uint_t size = P2ROUNDUP(cnt, 4); + uint_t pad; + + if (size < cnt) + return FALSE; /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return FALSE; + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return FALSE; + + memcpy(cp, xdrs->x_addr, cnt); + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + /* An inverted memchr() would be useful here... 
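+ * pad is at most 3 here, so comparing against a zeroed uint32_t is
+ * sufficient to verify that all padding bytes are zero.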
*/ + if (memcmp(&zero, xdrs->x_addr, pad) != 0) + return FALSE; + + xdrs->x_addr += pad; + } + + return TRUE; +} + +static bool_t +xdrmem_enc_uint32(XDR *xdrs, uint32_t val) +{ + if (xdrs->x_addr + sizeof(uint32_t) > xdrs->x_addr_end) + return FALSE; + + *((uint32_t *) xdrs->x_addr) = BE_32(val); + + xdrs->x_addr += sizeof(uint32_t); + + return TRUE; +} + +static bool_t +xdrmem_dec_uint32(XDR *xdrs, uint32_t *val) +{ + if (xdrs->x_addr + sizeof(uint32_t) > xdrs->x_addr_end) + return FALSE; + + *val = BE_32(*((uint32_t *) xdrs->x_addr)); + + xdrs->x_addr += sizeof(uint32_t); + + return TRUE; +} + +static bool_t +xdrmem_enc_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + //BUILD_BUG_ON(sizeof(char) != 1); + val = *((unsigned char *) cp); + + return xdrmem_enc_uint32(xdrs, val); +} + +static bool_t +xdrmem_dec_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + //BUILD_BUG_ON(sizeof(char) != 1); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return FALSE; + + /* + * If any of the 3 other bytes are non-zero then val will be greater + * than 0xff and we fail because according to the RFC, this block does + * not have a char encoded in it. + */ + if (val > 0xff) + return FALSE; + + *((unsigned char *) cp) = (uint8_t)val; + + return TRUE; +} + +static bool_t +xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp) +{ + //BUILD_BUG_ON(sizeof(unsigned short) != 2); + + return xdrmem_enc_uint32(xdrs, *usp); +} + +static bool_t +xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp) +{ + uint32_t val; + + //BUILD_BUG_ON(sizeof(unsigned short) != 2); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return FALSE; + + /* + * Short ints are not in the RFC, but we assume similar logic as in + * xdrmem_dec_char(). + */ + if (val > 0xffff) + return FALSE; + + *usp = (uint16_t)val; + + return TRUE; +} + +static bool_t +xdrmem_enc_uint(XDR *xdrs, unsigned *up) +{ + //BUILD_BUG_ON(sizeof(unsigned) != 4); + + return xdrmem_enc_uint32(xdrs, *up); +} + +static bool_t +xdrmem_dec_uint(XDR *xdrs, unsigned *up) +{ + //BUILD_BUG_ON(sizeof(unsigned) != 4); + + return xdrmem_dec_uint32(xdrs, (uint32_t *) up); +} + +static bool_t +xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + //BUILD_BUG_ON(sizeof(u_longlong_t) != 8); + + if (!xdrmem_enc_uint32(xdrs, *ullp >> 32)) + return FALSE; + + return xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff); +} + +static bool_t +xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + uint32_t low, high; + + //BUILD_BUG_ON(sizeof(u_longlong_t) != 8); + + if (!xdrmem_dec_uint32(xdrs, &high)) + return FALSE; + if (!xdrmem_dec_uint32(xdrs, &low)) + return FALSE; + + *ullp = ((u_longlong_t) high << 32) | low; + + return TRUE; +} + +static bool_t +xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i; + caddr_t addr = *arrp; + + if (*sizep > maxsize || *sizep > UINT_MAX / elsize) + return FALSE; + + if (!xdrmem_enc_uint(xdrs, sizep)) + return FALSE; + + for (i = 0; i < *sizep; i++) { + if (!elproc(xdrs, addr)) + return FALSE; + addr += elsize; + } + + return TRUE; +} + +static bool_t +xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i, size; + bool_t alloc = FALSE; + caddr_t addr; + + if (!xdrmem_dec_uint(xdrs, sizep)) + return FALSE; + + size = *sizep; + + if (size > maxsize || size > UINT_MAX / elsize) + return FALSE; + + /* + * The Solaris man page says: "If *arrp is NULL when decoding, + * xdr_array() allocates memory and *arrp points to 
it". + */ + if (*arrp == NULL) { + //BUILD_BUG_ON(sizeof(uint_t) > sizeof(size_t)); + + *arrp = kmem_alloc(size * elsize, KM_NOSLEEP); + if (*arrp == NULL) + return FALSE; + + alloc = TRUE; + } + + addr = *arrp; + + for (i = 0; i < size; i++) { + if (!elproc(xdrs, addr)) { + if (alloc) + kmem_free(*arrp, size * elsize); + return FALSE; + } + addr += elsize; + } + + return TRUE; +} + +static bool_t +xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + size_t slen = strlen(*sp); + uint_t len; + + if (slen > maxsize) + return FALSE; + + len = slen; + + if (!xdrmem_enc_uint(xdrs, &len)) + return FALSE; + + return xdrmem_enc_bytes(xdrs, *sp, len); +} + +static bool_t +xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + uint_t size; + bool_t alloc = FALSE; + + if (!xdrmem_dec_uint(xdrs, &size)) + return FALSE; + + if (size > maxsize || size > UINT_MAX - 1) + return FALSE; + + /* + * Solaris man page: "If *sp is NULL when decoding, xdr_string() + * allocates memory and *sp points to it". + */ + if (*sp == NULL) { + //BUILD_BUG_ON(sizeof(uint_t) > sizeof(size_t)); + + *sp = kmem_alloc(size + 1, KM_NOSLEEP); + if (*sp == NULL) + return FALSE; + + alloc = TRUE; + } + + if (!xdrmem_dec_bytes(xdrs, *sp, size)) + goto fail; + + if (kmemchr(*sp, 0, size) != NULL) + goto fail; + + (*sp)[size] = '\0'; + + return TRUE; + +fail: + if (alloc) + kmem_free(*sp, size + 1); + + return FALSE; +} + +static struct xdr_ops xdrmem_encode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_enc_char, + .xdr_u_short = xdrmem_enc_ushort, + .xdr_u_int = xdrmem_enc_uint, + .xdr_u_longlong_t = xdrmem_enc_ulonglong, + .xdr_opaque = xdrmem_enc_bytes, + .xdr_string = xdr_enc_string, + .xdr_array = xdr_enc_array +}; + +static struct xdr_ops xdrmem_decode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_dec_char, + .xdr_u_short = xdrmem_dec_ushort, + .xdr_u_int = xdrmem_dec_uint, + .xdr_u_longlong_t = xdrmem_dec_ulonglong, + .xdr_opaque = xdrmem_dec_bytes, + .xdr_string = xdr_dec_string, + .xdr_array = xdr_dec_array +}; diff --git a/module/os/windows/spl/spl-zlib.c b/module/os/windows/spl/spl-zlib.c new file mode 100644 index 000000000000..a758198f425b --- /dev/null +++ b/module/os/windows/spl/spl-zlib.c @@ -0,0 +1,199 @@ +/*****************************************************************************\ + * + * zlib.h -- interface of the 'zlib' general purpose compression library + * version 1.2.5, April 19th, 2010 + * + * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ * + * Jean-loup Gailly + * Mark Adler +\*****************************************************************************/ + + +#include +#include +#include +#include + +#ifdef DEBUG_SUBSYSTEM +#undef DEBUG_SUBSYSTEM +#endif + +#define DEBUG_SUBSYSTEM SS_ZLIB + +static spl_kmem_cache_t *zlib_workspace_cache; + +/* + * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc + * and vfree for every call. Using a kmem_cache also has the advantage + * of improving the odds that the memory used will be local to this cpu. + * To further improve things it might be wise to create a dedicated per-cpu + * workspace for use. This would take some additional care because we then + * must disable preemption around the critical section, and verify that + * zlib_deflate* and zlib_inflate* never internally call schedule(). + */ +static void * +zlib_workspace_alloc(int flags) +{ + return kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS)); +} + +static void +zlib_workspace_free(void *workspace) +{ + kmem_cache_free(zlib_workspace_cache, workspace); +} + +/* + * Compresses the source buffer into the destination buffer. The level + * parameter has the same meaning as in deflateInit. sourceLen is the byte + * length of the source buffer. Upon entry, destLen is the total size of the + * destination buffer, which must be at least 0.1% larger than sourceLen plus + * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. + * + * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + * memory, Z_BUF_ERROR if there was not enough room in the output buffer, + * Z_STREAM_ERROR if the level parameter is invalid. + */ +int +z_compress_level(void *dest, size_t *destLen, const void *source, + size_t sourceLen, int level) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return Z_BUF_ERROR; + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return Z_MEM_ERROR; + + err = zlib_deflateInit(&stream, level); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return err; + } + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_deflateEnd(&stream); + zlib_workspace_free(stream.workspace); + return err == Z_OK ? Z_BUF_ERROR : err; + } + *destLen = stream.total_out; + + err = zlib_deflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return err; +} +EXPORT_SYMBOL(z_compress_level); + +/* + * Decompresses the source buffer into the destination buffer. sourceLen is + * the byte length of the source buffer. Upon entry, destLen is the total + * size of the destination buffer, which must be large enough to hold the + * entire uncompressed data. (The size of the uncompressed data must have + * been saved previously by the compressor and transmitted to the decompressor + * by some mechanism outside the scope of this compression library.) + * Upon exit, destLen is the actual size of the uncompressed buffer. + * This function can be used to decompress a whole file at once if the + * input file is mmap'ed. + * + * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + * enough memory, Z_BUF_ERROR if there was not enough room in the output + * buffer, or Z_DATA_ERROR if the input data was corrupted.
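The two wrappers documented above mirror zlib's compress2()/uncompress() while drawing their workspace from the kmem cache. A minimal round-trip sketch, illustrative only and not part of the patch; buffer names are hypothetical and the declarations above are assumed to be in scope:

/*
 * Illustrative sketch only, not part of the patch: round-trip a small
 * buffer through z_compress_level() and z_uncompress().
 */
static int
zlib_roundtrip_example(void)
{
	static const char msg[] = "hello, spl-zlib workspace cache";
	char packed[128], unpacked[128];
	size_t packed_len = sizeof (packed);
	size_t unpacked_len = sizeof (unpacked);
	int err;

	/* destLen is updated to the compressed size on success */
	err = z_compress_level(packed, &packed_len, msg, sizeof (msg), 6);
	if (err != Z_OK)
		return (err);

	/* destLen is updated to the uncompressed size on success */
	err = z_uncompress(unpacked, &unpacked_len, packed, packed_len);
	if (err != Z_OK)
		return (err);

	return (unpacked_len == sizeof (msg) ? 0 : Z_DATA_ERROR);
}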
+ */ +int +z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return Z_BUF_ERROR; + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return Z_MEM_ERROR; + + err = zlib_inflateInit(&stream); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return err; + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + if (err == Z_NEED_DICT || + (err == Z_BUF_ERROR && stream.avail_in == 0)) + return Z_DATA_ERROR; + + return err; + } + *destLen = stream.total_out; + + err = zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return err; +} +EXPORT_SYMBOL(z_uncompress); + +int +spl_zlib_init(void) +{ + int size; + SENTRY; + + size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + + zlib_workspace_cache = kmem_cache_create( + "spl_zlib_workspace_cache", + size, 0, NULL, NULL, NULL, NULL, NULL, + KMC_VMEM | KMC_NOEMERGENCY); + if (!zlib_workspace_cache) + SRETURN(1); + + SRETURN(0); +} + +void +spl_zlib_fini(void) +{ + SENTRY; + kmem_cache_destroy(zlib_workspace_cache); + zlib_workspace_cache = NULL; + SEXIT; +} diff --git a/module/os/windows/zfs/CMakeLists.txt b/module/os/windows/zfs/CMakeLists.txt new file mode 100644 index 000000000000..03abf2698828 --- /dev/null +++ b/module/os/windows/zfs/CMakeLists.txt @@ -0,0 +1,129 @@ +wdk_add_library(zfskern + KMDF 1.9 + abd.c + aggsum.c + arc.c + blkptr.c + bplist.c + bpobj.c + bptree.c + bqueue.c + cityhash.c + dbuf.c + dbuf_stats.c + ddt.c + ddt_zap.c + dmu.c + dmu_diff.c + dmu_object.c + dmu_objset.c + dmu_recv.c + dmu_send.c + dmu_traverse.c + dmu_tx.c + dmu_zfetch.c + dnode.c + dnode_sync.c + dsl_bookmark.c + dsl_crypt.c + dsl_dataset.c + dsl_deadlist.c + dsl_deleg.c + dsl_destroy.c + dsl_dir.c + dsl_pool.c + dsl_proc.c + dsl_prop.c + dsl_scan.c + dsl_synctask.c + dsl_userhold.c + edonr_zfs.c + fm.c + gzip.c + hkdf.c + lz4.c + lzjb.c + metaslab.c + mmp.c + multilist.c + range_tree.c + refcount.c + rrwlock.c + sa.c + sha256.c + skein_zfs.c + spa.c + space_map.c + space_reftree.c + spa_boot.c + spa_checkpoint.c + spa_config.c + spa_errlog.c + spa_history.c + spa_misc.c + spa_stats.c + txg.c + uberblock.c + unique.c + vdev.c + vdev_cache.c + vdev_disk.c + vdev_file.c + vdev_indirect.c + vdev_indirect_births.c + vdev_indirect_mapping.c + vdev_initialize.c + vdev_label.c + vdev_mirror.c + vdev_missing.c + vdev_queue.c + vdev_raidz.c + vdev_removal.c + vdev_root.c + vdev_trim.c + zap.c + zap_leaf.c + zap_micro.c + zcp.c + zcp_get.c + zcp_global.c + zcp_iter.c + zcp_synctask.c + zfeature.c + zfeature_common.c + zfs_acl.c + zfs_byteswap.c + zfs_debug.c + zfs_dir.c + zfs_fm.c + zfs_fuid.c + zfs_ioctl.c + zfs_kstat_windows.c + zfs_log.c + zfs_onexit.c + zfs_replay.c + zfs_rlock.c + zfs_sa.c + zfs_vfsops.c + zfs_vnops.c + zfs_vnops_windows.c + zfs_vnops_windows_lib.c + zfs_vnops_windows_mount.c + zfs_windows.c + zfs_windows_zvol.c + zfs_windows_zvol_scsi.c + zfs_windows_zvol_wmi.c + zfs_znode.c + zil.c + zio.c + zio_checksum.c + zio_compress.c + zio_crypt.c + zio_inject.c + zle.c + zrlock.c + zthr.c + zvol.c +) + +target_link_libraries(zfskern PRIVATE splkern icpkern) \ No newline at end of file From 
40074cae18c664eaf0fa2ecb2b327d95779d2765 Mon Sep 17 00:00:00 2001 From: Jorgen Lundman Date: Fri, 31 Jul 2020 10:48:35 +0900 Subject: [PATCH 002/231] Windows: ZFS source file place holders --- include/os/windows/zfs/sys/spa.h | 1192 --- include/os/windows/zfs/sys/zfs_windows.h | 161 + include/os/windows/zfs/zfs_config.h | 95 - module/os/windows/zfs/abd_os.c | 0 module/os/windows/zfs/arc_os.c | 0 module/os/windows/zfs/policy.c | 0 module/os/windows/zfs/qat.c | 0 module/os/windows/zfs/qat_compress.c | 0 module/os/windows/zfs/qat_crypt.c | 0 module/os/windows/zfs/spa_misc_os.c | 0 module/os/windows/zfs/spa_stats.c | 0 module/os/windows/zfs/trace.c | 0 module/os/windows/zfs/vdev_disk.c | 810 ++ module/os/windows/zfs/vdev_file.c | 601 ++ module/os/windows/zfs/zfs_acl.c | 2997 +++++++ module/os/windows/zfs/zfs_ctldir.c | 2126 +++++ module/os/windows/zfs/zfs_debug.c | 137 + module/os/windows/zfs/zfs_dir.c | 1289 +++ module/os/windows/zfs/zfs_file_os.c | 0 module/os/windows/zfs/zfs_fuid_os.c | 0 module/os/windows/zfs/zfs_ioctl_os.c | 0 module/os/windows/zfs/zfs_kstat_windows.c | 630 ++ module/os/windows/zfs/zfs_vfsops.c | 4054 +++++++++ module/os/windows/zfs/zfs_vnops.c | 7681 +++++++++++++++++ module/os/windows/zfs/zfs_vnops_windows.c | 4913 +++++++++++ module/os/windows/zfs/zfs_vnops_windows_lib.c | 3751 ++++++++ .../os/windows/zfs/zfs_vnops_windows_mount.c | 1369 +++ module/os/windows/zfs/zfs_znode.c | 2635 ++++++ module/os/windows/zfs/zio_crypt.c | 0 module/os/windows/zfs/zvol_os.c | 0 30 files changed, 33154 insertions(+), 1287 deletions(-) delete mode 100644 include/os/windows/zfs/sys/spa.h create mode 100644 include/os/windows/zfs/sys/zfs_windows.h delete mode 100644 include/os/windows/zfs/zfs_config.h create mode 100644 module/os/windows/zfs/abd_os.c create mode 100644 module/os/windows/zfs/arc_os.c create mode 100644 module/os/windows/zfs/policy.c create mode 100644 module/os/windows/zfs/qat.c create mode 100644 module/os/windows/zfs/qat_compress.c create mode 100644 module/os/windows/zfs/qat_crypt.c create mode 100644 module/os/windows/zfs/spa_misc_os.c create mode 100644 module/os/windows/zfs/spa_stats.c create mode 100644 module/os/windows/zfs/trace.c create mode 100644 module/os/windows/zfs/vdev_disk.c create mode 100644 module/os/windows/zfs/vdev_file.c create mode 100644 module/os/windows/zfs/zfs_acl.c create mode 100644 module/os/windows/zfs/zfs_ctldir.c create mode 100644 module/os/windows/zfs/zfs_debug.c create mode 100644 module/os/windows/zfs/zfs_dir.c create mode 100644 module/os/windows/zfs/zfs_file_os.c create mode 100644 module/os/windows/zfs/zfs_fuid_os.c create mode 100644 module/os/windows/zfs/zfs_ioctl_os.c create mode 100644 module/os/windows/zfs/zfs_kstat_windows.c create mode 100644 module/os/windows/zfs/zfs_vfsops.c create mode 100644 module/os/windows/zfs/zfs_vnops.c create mode 100644 module/os/windows/zfs/zfs_vnops_windows.c create mode 100644 module/os/windows/zfs/zfs_vnops_windows_lib.c create mode 100644 module/os/windows/zfs/zfs_vnops_windows_mount.c create mode 100644 module/os/windows/zfs/zfs_znode.c create mode 100644 module/os/windows/zfs/zio_crypt.c create mode 100644 module/os/windows/zfs/zvol_os.c diff --git a/include/os/windows/zfs/sys/spa.h b/include/os/windows/zfs/sys/spa.h deleted file mode 100644 index 7738112c5882..000000000000 --- a/include/os/windows/zfs/sys/spa.h +++ /dev/null @@ -1,1192 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the 
"License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright 2013 Saso Kiselkov. All rights reserved. - * Copyright (c) 2014 Integros [integros.com] - * Copyright 2017 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. - * Copyright (c) 2017, Intel Corporation. - */ - -#ifndef _SYS_SPA_H -#define _SYS_SPA_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Forward references that lots of things need. - */ -typedef struct spa spa_t; -typedef struct vdev vdev_t; -typedef struct metaslab metaslab_t; -typedef struct metaslab_group metaslab_group_t; -typedef struct metaslab_class metaslab_class_t; -typedef struct zio zio_t; -typedef struct zilog zilog_t; -typedef struct spa_aux_vdev spa_aux_vdev_t; -typedef struct ddt ddt_t; -typedef struct ddt_entry ddt_entry_t; -typedef struct zbookmark_phys zbookmark_phys_t; - -struct dsl_pool; -struct dsl_dataset; -struct dsl_crypto_params; - -/* - * General-purpose 32-bit and 64-bit bitfield encodings. - */ -#define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) -#define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) -#define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) -#define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) - -#define BF32_GET(x, low, len) BF32_DECODE(x, low, len) -#define BF64_GET(x, low, len) BF64_DECODE(x, low, len) - -#define BF32_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1U << (len)); \ - ASSERT3U(low + len, <=, 32); \ - (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ -_NOTE(CONSTCOND) } while (0) - -#define BF64_SET(x, low, len, val) do { \ - ASSERT3U(val, <, 1ULL << (len)); \ - ASSERT3U(low + len, <=, 64); \ - ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ -_NOTE(CONSTCOND) } while (0) - -#define BF32_GET_SB(x, low, len, shift, bias) \ - ((BF32_GET(x, low, len) + (bias)) << (shift)) -#define BF64_GET_SB(x, low, len, shift, bias) \ - ((BF64_GET(x, low, len) + (bias)) << (shift)) - -#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) -#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ - ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ - ASSERT3S((val) >> (shift), >=, bias); \ - BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ -_NOTE(CONSTCOND) } while (0) - -/* - * We currently support block sizes from 512 bytes to 16MB. 
- * The benefits of larger blocks, and thus larger IO, need to be weighed - * against the cost of COWing a giant block to modify one byte, and the - * large latency of reading or writing a large block. - * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. - * - * Note that although the LSIZE field of the blkptr_t can store sizes up - * to 32MB, the dnode's dn_datablkszsec can only store sizes up to - * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. - */ -#define SPA_MINBLOCKSHIFT 9 -#define SPA_OLD_MAXBLOCKSHIFT 17 -#define SPA_MAXBLOCKSHIFT 24 -#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) -#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) -#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) - -/* - * Size of block to hold the configuration data (a packed nvlist) - */ -#define SPA_CONFIG_BLOCKSIZE (1ULL << 14) - -/* - * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. - * The ASIZE encoding should be at least 64 times larger (6 more bits) - * to support up to 4-way RAID-Z mirror mode with worst-case gang block - * overhead, three DVAs per bp, plus one more bit in case we do anything - * else that expands the ASIZE. - */ -#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ -#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ -#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ - -#define SPA_COMPRESSBITS 7 -#define SPA_VDEVBITS 24 - -/* - * All SPA data is represented by 128-bit data virtual addresses (DVAs). - * The members of the dva_t should be considered opaque outside the SPA. - */ -typedef struct dva { - uint64_t dva_word[2]; -} dva_t; - -/* - * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. - */ -typedef struct zio_cksum { - uint64_t zc_word[4]; -} zio_cksum_t; - -/* - * Some checksums/hashes need a 256-bit initialization salt. This salt is kept - * secret and is suitable for use in MAC algorithms as the key. - */ -typedef struct zio_cksum_salt { - uint8_t zcs_bytes[32]; -} zio_cksum_salt_t; - -/* - * Each block is described by its DVAs, time of birth, checksum, etc. 
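All of the packed fields described here (LSIZE, PSIZE, ASIZE, vdev, offset and so on) are read and written with the BF32_/BF64_ helpers defined earlier in this header. They are plain shift-and-mask operations; the sketch below is not part of the header and simply expands BF64_GET()/BF64_SET() into equivalent arithmetic, assuming the usual P2PHASE(x, align) == ((x) & ((align) - 1)) definition:

/*
 * Illustration only: equivalent arithmetic for the BF64_* helpers.
 * BF64_GET(x, low, len) extracts a len-bit field starting at bit low;
 * BF64_SET(x, low, len, val) rewrites that field, leaving other bits alone.
 */
static uint64_t
bf64_get_equiv(uint64_t x, int low, int len)
{
	return ((x >> low) & ((1ULL << len) - 1));
}

static void
bf64_set_equiv(uint64_t *x, int low, int len, uint64_t val)
{
	uint64_t mask = ((1ULL << len) - 1) << low;

	*x = (*x & ~mask) | ((val << low) & mask);
}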
- * The word-by-word, bit-by-bit layout of the blkptr is as follows: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | pad | vdev1 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 1 |G| offset1 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | pad | vdev2 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 3 |G| offset2 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | pad | vdev3 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 5 |G| offset3 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 8 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 9 | physical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | logical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * b | fill count | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * c | checksum[0] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * d | checksum[1] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * e | checksum[2] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * f | checksum[3] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * Legend: - * - * vdev virtual device ID - * offset offset into virtual device - * LSIZE logical size - * PSIZE physical size (after compression) - * ASIZE allocated size (including RAID-Z parity and gang block headers) - * GRID RAID-Z layout information (reserved for future use) - * cksum checksum function - * comp compression function - * G gang block indicator - * B byteorder (endianness) - * D dedup - * X encryption - * E blkptr_t contains embedded data (see below) - * lvl level of indirection - * type DMU object type - * phys birth txg when dva[0] was written; zero if same as logical birth txg - * note that typically all the dva's would be written in this - * txg, but they could be different if they were moved by - * device removal. - * log. birth transaction group in which the block was logically born - * fill count number of non-zero blocks under this bp - * checksum[4] 256-bit checksum of the data this bp describes - */ - -/* - * The blkptr_t's of encrypted blocks also need to store the encryption - * parameters so that the block can be decrypted. 
This layout is as follows: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | vdev1 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 1 |G| offset1 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | GRID | ASIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 3 |G| offset2 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | salt | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 5 | IV1 | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 8 | padding | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 9 | physical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | logical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * b | IV2 | fill count | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * c | checksum[0] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * d | checksum[1] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * e | MAC[0] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * f | MAC[1] | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * Legend: - * - * salt Salt for generating encryption keys - * IV1 First 64 bits of encryption IV - * X Block requires encryption handling (set to 1) - * E blkptr_t contains embedded data (set to 0, see below) - * fill count number of non-zero blocks under this bp (truncated to 32 bits) - * IV2 Last 32 bits of encryption IV - * checksum[2] 128-bit checksum of the data this bp describes - * MAC[2] 128-bit message authentication code for this data - * - * The X bit being set indicates that this block is one of 3 types. If this is - * a level 0 block with an encrypted object type, the block is encrypted - * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted - * object type, this block is authenticated with an HMAC (see - * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC - * words to store a checksum-of-MACs from the level below (see - * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED() - * refers to both encrypted and authenticated blocks and BP_USES_CRYPT() - * refers to any of these 3 kinds of blocks. - * - * The additional encryption parameters are the salt, IV, and MAC which are - * explained in greater detail in the block comment at the top of zio_crypt.c. - * The MAC occupies half of the checksum space since it serves a very similar - * purpose: to prevent data corruption on disk. The only functional difference - * is that the checksum is used to detect on-disk corruption whether or not the - * encryption key is loaded and the MAC provides additional protection against - * malicious disk tampering. We use the 3rd DVA to store the salt and first - * 64 bits of the IV. As a result encrypted blocks can only have 2 copies - * maximum instead of the normal 3. The last 32 bits of the IV are stored in - * the upper bits of what is usually the fill count. 
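In other words, for an encrypted bp the 64-bit blk_fill word carries two 32-bit values at once, which is what the BP_SET_FILL()/BP_SET_IV2() macros further down implement. A small sketch of that packing, illustration only and not part of the header:

/*
 * Illustration only: for encrypted bps the fill count occupies the low
 * 32 bits of blk_fill and IV2 occupies the high 32 bits.
 */
static uint64_t
pack_fill_iv2_example(uint32_t fill, uint32_t iv2)
{
	uint64_t blk_fill = 0;

	blk_fill |= (uint64_t)fill;		/* BF64_SET(blk_fill, 0, 32, fill) */
	blk_fill |= (uint64_t)iv2 << 32;	/* BF64_SET(blk_fill, 32, 32, iv2) */

	return (blk_fill);
}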
Note that only blocks at - * level 0 or -2 are ever encrypted, which allows us to guarantee that these - * 32 bits are not trampled over by other code (see zio_crypt.c for details). - * The salt and IV are not used for authenticated bps or bps with an indirect - * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits - * for the fill count. - */ - -/* - * "Embedded" blkptr_t's don't actually point to a block, instead they - * have a data payload embedded in the blkptr_t itself. See the comment - * in blkptr.c for more details. - * - * The blkptr_t is laid out as follows: - * - * 64 56 48 40 32 24 16 8 0 - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | payload | - * 1 | payload | - * 2 | payload | - * 3 | payload | - * 4 | payload | - * 5 | payload | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 7 | payload | - * 8 | payload | - * 9 | payload | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | logical birth txg | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * b | payload | - * c | payload | - * d | payload | - * e | payload | - * f | payload | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * - * Legend: - * - * payload contains the embedded data - * B (byteorder) byteorder (endianness) - * D (dedup) padding (set to zero) - * X encryption (set to zero; see above) - * E (embedded) set to one - * lvl indirection level - * type DMU object type - * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) - * comp compression function of payload - * PSIZE size of payload after compression, in bytes - * LSIZE logical size of payload, in bytes - * note that 25 bits is enough to store the largest - * "normal" BP's LSIZE (2^16 * 2^9) in bytes - * log. birth transaction group in which the block was logically born - * - * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded - * bp's they are stored in units of SPA_MINBLOCKSHIFT. - * Generally, the generic BP_GET_*() macros can be used on embedded BP's. - * The B, D, X, lvl, type, and comp fields are stored the same as with normal - * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must - * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before - * other macros, as they assert that they are only used on BP's of the correct - * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use - * the payload space for encryption parameters (see the comment above on - * how encryption parameters are stored). 
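To make the units difference above concrete, the sketch below (not part of the header) shows how the same logical-size value is encoded in each case, following the shift-and-bias arithmetic of BP_SET_LSIZE() and BPE_SET_LSIZE() defined below:

/*
 * Illustration only: LSIZE encoding for a normal vs. an embedded bp.
 */
static void
lsize_encoding_example(void)
{
	/*
	 * Normal bp, 128K logical block: stored in 512-byte units
	 * (SPA_MINBLOCKSHIFT) with a bias of 1, so the low 16 bits of
	 * blk_prop hold (0x20000 >> 9) - 1 = 0xff.
	 */
	uint64_t normal_lsize_field = (0x20000ULL >> 9) - 1;

	/*
	 * Embedded bp, 100-byte payload: BPE_SET_LSIZE() stores bytes
	 * directly (no shift), still biased by 1, in the low 25 bits.
	 */
	uint64_t embedded_lsize_field = 100 - 1;

	(void) normal_lsize_field;
	(void) embedded_lsize_field;
}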
- */ - -#define BPE_GET_ETYPE(bp) \ - (ASSERT(BP_IS_EMBEDDED(bp)), \ - BF64_GET((bp)->blk_prop, 40, 8)) -#define BPE_SET_ETYPE(bp, t) do { \ - ASSERT(BP_IS_EMBEDDED(bp)); \ - BF64_SET((bp)->blk_prop, 40, 8, t); \ -_NOTE(CONSTCOND) } while (0) - -#define BPE_GET_LSIZE(bp) \ - (ASSERT(BP_IS_EMBEDDED(bp)), \ - BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) -#define BPE_SET_LSIZE(bp, x) do { \ - ASSERT(BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BPE_GET_PSIZE(bp) \ - (ASSERT(BP_IS_EMBEDDED(bp)), \ - BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) -#define BPE_SET_PSIZE(bp, x) do { \ - ASSERT(BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -typedef enum bp_embedded_type { - BP_EMBEDDED_TYPE_DATA, - BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */ - NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED -} bp_embedded_type_t; - -#define BPE_NUM_WORDS 14 -#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) -#define BPE_IS_PAYLOADWORD(bp, wp) \ - ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) - -#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ -#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ -#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ - -/* - * A block is a hole when it has either 1) never been written to, or - * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads - * without physically allocating disk space. Holes are represented in the - * blkptr_t structure by zeroed blk_dva. Correct checking for holes is - * done through the BP_IS_HOLE macro. For holes, the logical size, level, - * DMU object type, and birth times are all also stored for holes that - * were written to at some point (i.e. were punched after having been filled). - */ -typedef struct blkptr { - dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ - uint64_t blk_prop; /* size, compression, type, etc */ - uint64_t blk_pad[2]; /* Extra space for the future */ - uint64_t blk_phys_birth; /* txg when block was allocated */ - uint64_t blk_birth; /* transaction group at birth */ - uint64_t blk_fill; /* fill count */ - zio_cksum_t blk_cksum; /* 256-bit checksum */ -} blkptr_t; - -/* - * Macros to get and set fields in a bp or DVA. - */ -#define DVA_GET_ASIZE(dva) \ - BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) -#define DVA_SET_ASIZE(dva, x) \ - BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ - SPA_MINBLOCKSHIFT, 0, x) - -#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) -#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) - -#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) -#define DVA_SET_VDEV(dva, x) \ - BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) - -#define DVA_GET_OFFSET(dva) \ - BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) -#define DVA_SET_OFFSET(dva, x) \ - BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) - -#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) -#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) - -#define BP_GET_LSIZE(bp) \ - (BP_IS_EMBEDDED(bp) ? \ - (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? 
BPE_GET_LSIZE(bp) : 0): \ - BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) -#define BP_SET_LSIZE(bp, x) do { \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, \ - 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BP_GET_PSIZE(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) -#define BP_SET_PSIZE(bp, x) do { \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, \ - 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BP_GET_COMPRESS(bp) \ - BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) -#define BP_SET_COMPRESS(bp, x) \ - BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) - -#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) -#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) - -#define BP_GET_CHECKSUM(bp) \ - (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ - BF64_GET((bp)->blk_prop, 40, 8)) -#define BP_SET_CHECKSUM(bp, x) do { \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - BF64_SET((bp)->blk_prop, 40, 8, x); \ -_NOTE(CONSTCOND) } while (0) - -#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) -#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) - -#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) -#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) - -/* encrypted, authenticated, and MAC cksum bps use the same bit */ -#define BP_USES_CRYPT(bp) BF64_GET((bp)->blk_prop, 61, 1) -#define BP_SET_CRYPT(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) - -#define BP_IS_ENCRYPTED(bp) \ - (BP_USES_CRYPT(bp) && \ - BP_GET_LEVEL(bp) <= 0 && \ - DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) - -#define BP_IS_AUTHENTICATED(bp) \ - (BP_USES_CRYPT(bp) && \ - BP_GET_LEVEL(bp) <= 0 && \ - !DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) - -#define BP_HAS_INDIRECT_MAC_CKSUM(bp) \ - (BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0) - -#define BP_IS_PROTECTED(bp) \ - (BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp)) - -#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) -#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) - -#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) -#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) - -#define BP_PHYSICAL_BIRTH(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) - -#define BP_SET_BIRTH(bp, logical, physical) \ -{ \ - ASSERT(!BP_IS_EMBEDDED(bp)); \ - (bp)->blk_birth = (logical); \ - (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ -} - -#define BP_GET_FILL(bp) \ - ((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \ - ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)) - -#define BP_SET_FILL(bp, fill) \ -{ \ - if (BP_IS_ENCRYPTED(bp)) \ - BF64_SET((bp)->blk_fill, 0, 32, fill); \ - else \ - (bp)->blk_fill = fill; \ -} - -#define BP_GET_IV2(bp) \ - (ASSERT(BP_IS_ENCRYPTED(bp)), \ - BF64_GET((bp)->blk_fill, 32, 32)) -#define BP_SET_IV2(bp, iv2) \ -{ \ - ASSERT(BP_IS_ENCRYPTED(bp)); \ - BF64_SET((bp)->blk_fill, 32, 32, iv2); \ -} - -#define BP_IS_METADATA(bp) \ - (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) - -#define BP_GET_ASIZE(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ - DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - (DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) - -#define BP_GET_UCSIZE(bp) \ - (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) - -#define BP_GET_NDVAS(bp) \ - (BP_IS_EMBEDDED(bp) ? 
0 : \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - (!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) - -#define BP_COUNT_GANG(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ - DVA_GET_GANG(&(bp)->blk_dva[1]) + \ - (DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))) - -#define DVA_EQUAL(dva1, dva2) \ - ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ - (dva1)->dva_word[0] == (dva2)->dva_word[0]) - -#define BP_EQUAL(bp1, bp2) \ - (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ - (bp1)->blk_birth == (bp2)->blk_birth && \ - DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ - DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ - DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) - -#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ - (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ - ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ - ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ - ((zc1).zc_word[3] - (zc2).zc_word[3]))) - -#define ZIO_CHECKSUM_MAC_EQUAL(zc1, zc2) \ - (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ - ((zc1).zc_word[1] - (zc2).zc_word[1]))) - -#define ZIO_CHECKSUM_IS_ZERO(zc) \ - (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \ - (zc)->zc_word[2] | (zc)->zc_word[3])) - -#define ZIO_CHECKSUM_BSWAP(zcp) \ -{ \ - (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \ - (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \ - (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \ - (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \ -} - -#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) - -#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ -{ \ - (zcp)->zc_word[0] = w0; \ - (zcp)->zc_word[1] = w1; \ - (zcp)->zc_word[2] = w2; \ - (zcp)->zc_word[3] = w3; \ -} - -#define MAX_DATA_MAC_LEN 16 -#define MAX_DATA_IV_LEN 12 - -#define ZIO_SET_MAC(bp, mac) \ - bcopy((mac), &(bp)->blk_cksum.zc_word[2], MAX_DATA_MAC_LEN); - -#define ZIO_SET_IV(bp, iv) \ - bcopy((iv), (bp)->blk_iv, MAX_DATA_IV_LEN); - -#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) -#define BP_IS_GANG(bp) \ - (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) -#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ - (dva)->dva_word[1] == 0ULL) -#define BP_IS_HOLE(bp) \ - (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) - -/* BP_IS_RAIDZ(bp) assumes no block compression */ -#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ - BP_GET_PSIZE(bp)) - -#define BP_ZERO(bp) \ -{ \ - (bp)->blk_dva[0].dva_word[0] = 0; \ - (bp)->blk_dva[0].dva_word[1] = 0; \ - (bp)->blk_dva[1].dva_word[0] = 0; \ - (bp)->blk_dva[1].dva_word[1] = 0; \ - (bp)->blk_dva[2].dva_word[0] = 0; \ - (bp)->blk_dva[2].dva_word[1] = 0; \ - (bp)->blk_prop = 0; \ - (bp)->blk_pad[0] = 0; \ - (bp)->blk_pad[1] = 0; \ - (bp)->blk_phys_birth = 0; \ - (bp)->blk_birth = 0; \ - (bp)->blk_fill = 0; \ - ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ -} - -#ifdef _BIG_ENDIAN -#define ZFS_HOST_BYTEORDER (0ULL) -#else -#define ZFS_HOST_BYTEORDER (1ULL) -#endif - -#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) - -#define BP_SPRINTF_LEN 400 - -/* - * This macro allows code sharing between zfs, libzpool, and mdb. - * 'func' is either snprintf() or mdb_snprintf(). - * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
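Kernel callers normally reach this macro through snprintf_blkptr(), declared later in this header, with a BP_SPRINTF_LEN-sized buffer; that is also how the dprintf_bp() macro at the bottom of the file uses it. A usage sketch, not part of the header, with cmn_err() chosen purely for illustration:

/*
 * Sketch only: format a bp into a BP_SPRINTF_LEN buffer and log it.
 */
static void
log_blkptr_example(const blkptr_t *bp)
{
	char *buf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);

	snprintf_blkptr(buf, BP_SPRINTF_LEN, bp);
	cmn_err(CE_NOTE, "blkptr: %s", buf);
	kmem_free(buf, BP_SPRINTF_LEN);
}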
- */ -#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, crypt_type, \ - compress) \ -{ \ - static const char *copyname[] = \ - { "zero", "single", "double", "triple" }; \ - int len = 0; \ - int copies = 0; \ - int d; \ - \ - if (bp == NULL) { \ - len += func(buf + len, size - len, ""); \ - } else if (BP_IS_HOLE(bp)) { \ - len += func(buf + len, size - len, \ - "HOLE [L%llu %s] " \ - "size=%llxL birth=%lluL", \ - (u_longlong_t)BP_GET_LEVEL(bp), \ - type, \ - (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ - } else if (BP_IS_EMBEDDED(bp)) { \ - len = func(buf + len, size - len, \ - "EMBEDDED [L%llu %s] et=%u %s " \ - "size=%llxL/%llxP birth=%lluL", \ - (u_longlong_t)BP_GET_LEVEL(bp), \ - type, \ - (int)BPE_GET_ETYPE(bp), \ - compress, \ - (u_longlong_t)BPE_GET_LSIZE(bp), \ - (u_longlong_t)BPE_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ - } else { \ - for (d = 0; d < BP_GET_NDVAS(bp); d++) { \ - const dva_t *dva = &bp->blk_dva[d]; \ - if (DVA_IS_VALID(dva)) \ - copies++; \ - len += func(buf + len, size - len, \ - "DVA[%d]=<%llu:%llx:%llx>%c", d, \ - (u_longlong_t)DVA_GET_VDEV(dva), \ - (u_longlong_t)DVA_GET_OFFSET(dva), \ - (u_longlong_t)DVA_GET_ASIZE(dva), \ - ws); \ - } \ - if (BP_IS_ENCRYPTED(bp)) { \ - len += func(buf + len, size - len, \ - "salt=%llx iv=%llx:%llx%c", \ - (u_longlong_t)bp->blk_dva[2].dva_word[0], \ - (u_longlong_t)bp->blk_dva[2].dva_word[1], \ - (u_longlong_t)BP_GET_IV2(bp), \ - ws); \ - } \ - if (BP_IS_GANG(bp) && \ - DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ - DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ - copies--; \ - len += func(buf + len, size - len, \ - "[L%llu %s] %s %s %s %s %s %s %s%c" \ - "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ - "cksum=%llx:%llx:%llx:%llx", \ - (u_longlong_t)BP_GET_LEVEL(bp), \ - type, \ - checksum, \ - compress, \ - crypt_type, \ - BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ - BP_IS_GANG(bp) ? "gang" : "contiguous", \ - BP_GET_DEDUP(bp) ? "dedup" : "unique", \ - copyname[copies], \ - ws, \ - (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)BP_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth, \ - (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ - (u_longlong_t)BP_GET_FILL(bp), \ - ws, \ - (u_longlong_t)bp->blk_cksum.zc_word[0], \ - (u_longlong_t)bp->blk_cksum.zc_word[1], \ - (u_longlong_t)bp->blk_cksum.zc_word[2], \ - (u_longlong_t)bp->blk_cksum.zc_word[3]); \ - } \ - ASSERT(len < size); \ -} - -#include - -#define BP_GET_BUFC_TYPE(bp) \ - (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) - -typedef enum spa_import_type { - SPA_IMPORT_EXISTING, - SPA_IMPORT_ASSEMBLE -} spa_import_type_t; - -// Hold module busy to stop unregister until all exported. -extern uint64_t zfs_module_busy; - -/* - * Send TRIM commands in-line during normal pool operation while deleting. - * OFF: no - * ON: yes - */ -typedef enum { - SPA_AUTOTRIM_OFF = 0, /* default */ - SPA_AUTOTRIM_ON -} spa_autotrim_t; - -/* - * Reason TRIM command was issued, used internally for accounting purposes. 
- */ -typedef enum trim_type { - TRIM_TYPE_MANUAL = 0, - TRIM_TYPE_AUTO = 1, -} trim_type_t; - -/* state manipulation functions */ -extern int spa_open(const char *pool, spa_t **, void *tag); -extern int spa_open_rewind(const char *pool, spa_t **, void *tag, - nvlist_t *policy, nvlist_t **config); -extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, - size_t buflen); -extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - nvlist_t *zplprops, struct dsl_crypto_params *dcp); -extern int spa_import_rootpool(char *devpath, char *devid); -extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, - uint64_t flags); -extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); -extern int spa_destroy(char *pool); -extern int spa_checkpoint(const char *pool); -extern int spa_checkpoint_discard(const char *pool); -extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, - boolean_t hardforce); -extern int spa_reset(char *pool); -extern void spa_async_request(spa_t *spa, int flag); -extern void spa_async_unrequest(spa_t *spa, int flag); -extern void spa_async_suspend(spa_t *spa); -extern void spa_async_resume(spa_t *spa); -extern spa_t *spa_inject_addref(char *pool); -extern void spa_inject_delref(spa_t *spa); -extern void spa_scan_stat_init(spa_t *spa); -extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); - -#define SPA_ASYNC_CONFIG_UPDATE 0x01 -#define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 -#define SPA_ASYNC_RESILVER_DONE 0x08 -#define SPA_ASYNC_RESILVER 0x10 -#define SPA_ASYNC_AUTOEXPAND 0x20 -#define SPA_ASYNC_REMOVE_DONE 0x40 -#define SPA_ASYNC_REMOVE_STOP 0x80 -#define SPA_ASYNC_INITIALIZE_RESTART 0x100 -#define SPA_ASYNC_TRIM_RESTART 0x200 -#define SPA_ASYNC_AUTOTRIM_RESTART 0x400 - -/* - * Controls the behavior of spa_vdev_remove(). 
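The state-manipulation prototypes above follow the usual ZFS open/close-with-tag discipline: a caller that opens a pool with spa_open() later releases it with spa_close() under the same tag. A minimal sketch, not part of the header, with error handling trimmed:

/*
 * Sketch only: pair spa_open() with spa_close() under the same tag.
 */
static int
inspect_pool_example(const char *pool)
{
	static char tag;	/* any stable pointer works as a tag */
	spa_t *spa = NULL;
	int err;

	err = spa_open(pool, &spa, &tag);
	if (err != 0)
		return (err);

	/* ... e.g. read spa_name(spa), spa_guid(spa), spa_version(spa) ... */

	spa_close(spa, &tag);
	return (0);
}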
- */ -#define SPA_REMOVE_UNSPARE 0x01 -#define SPA_REMOVE_DONE 0x02 - -/* device manipulation */ -extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); -extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, - int replacing); -extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, - int replace_done); -extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); -extern boolean_t spa_vdev_remove_active(spa_t *spa); -extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t guid, - nvlist_t *vdev_errlist); -extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, - uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist); -extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); -extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); -extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, - nvlist_t *props, boolean_t exp); - -/* spare state (which is global across all pools) */ -extern void spa_spare_add(vdev_t *vd); -extern void spa_spare_remove(vdev_t *vd); -extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); -extern void spa_spare_activate(vdev_t *vd); - -/* L2ARC state (which is global across all pools) */ -extern void spa_l2cache_add(vdev_t *vd); -extern void spa_l2cache_remove(vdev_t *vd); -extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); -extern void spa_l2cache_activate(vdev_t *vd); -extern void spa_l2cache_drop(spa_t *spa); - -/* scanning */ -extern int spa_scan(spa_t *spa, pool_scan_func_t func); -extern int spa_scan_stop(spa_t *spa); -extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); - -/* spa syncing */ -extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ -extern void spa_sync_allpools(void); - -extern int zfs_sync_pass_deferred_free; - -/* spa namespace global mutex */ -extern kmutex_t spa_namespace_lock; - -/* - * SPA configuration functions in spa_config.c - */ - -#define SPA_CONFIG_UPDATE_POOL 0 -#define SPA_CONFIG_UPDATE_VDEVS 1 - -extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); -extern void spa_config_load(void); -extern nvlist_t *spa_all_configs(uint64_t *); -extern void spa_config_set(spa_t *spa, nvlist_t *config); -extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, - int getstats); -extern void spa_config_update(spa_t *spa, int what); - -/* - * Miscellaneous SPA routines in spa_misc.c - */ - -/* Namespace manipulation */ -extern spa_t *spa_lookup(const char *name); -extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); -extern void spa_remove(spa_t *spa); -extern spa_t *spa_next(spa_t *prev); - -/* Refcount functions */ -extern void spa_open_ref(spa_t *spa, void *tag); -extern void spa_close(spa_t *spa, void *tag); -extern void spa_async_close(spa_t *spa, void *tag); -extern boolean_t spa_refcount_zero(spa_t *spa); - -#define SCL_NONE 0x00 -#define SCL_CONFIG 0x01 -#define SCL_STATE 0x02 -#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ -#define SCL_ALLOC 0x08 -#define SCL_ZIO 0x10 -#define SCL_FREE 0x20 -#define SCL_VDEV 0x40 -#define SCL_LOCKS 7 -#define SCL_ALL ((1 << SCL_LOCKS) - 1) -#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) - -/* Historical pool statistics */ -typedef struct spa_stats_history { - kmutex_t lock; - uint64_t count; - uint64_t size; - kstat_t *kstat; - void *_private; - list_t list; -} spa_stats_history_t; - -typedef struct spa_stats { - 
spa_stats_history_t read_history; - spa_stats_history_t txg_history; - spa_stats_history_t tx_assign_histogram; - spa_stats_history_t io_history; - spa_stats_history_t mmp_history; - spa_stats_history_t iostats; -} spa_stats_t; - -typedef enum txg_state { - TXG_STATE_BIRTH = 0, - TXG_STATE_OPEN = 1, - TXG_STATE_QUIESCED = 2, - TXG_STATE_WAIT_FOR_SYNC = 3, - TXG_STATE_SYNCED = 4, - TXG_STATE_COMMITTED = 5, -} txg_state_t; - -/* Assorted pool IO kstats */ -typedef struct spa_iostats { - kstat_named_t trim_extents_written; - kstat_named_t trim_bytes_written; - kstat_named_t trim_extents_skipped; - kstat_named_t trim_bytes_skipped; - kstat_named_t trim_extents_failed; - kstat_named_t trim_bytes_failed; - kstat_named_t autotrim_extents_written; - kstat_named_t autotrim_bytes_written; - kstat_named_t autotrim_extents_skipped; - kstat_named_t autotrim_bytes_skipped; - kstat_named_t autotrim_extents_failed; - kstat_named_t autotrim_bytes_failed; -} spa_iostats_t; - -extern void spa_stats_init(spa_t *spa); -extern void spa_stats_destroy(spa_t *spa); -extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, - uint32_t aflags); -extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); -extern int spa_txg_history_set(spa_t *spa, uint64_t txg, - txg_state_t completed_state, hrtime_t completed_time); -extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, - uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty); -extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); -extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id); -extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, - hrtime_t duration); -extern void *spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, - uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, - int error); -extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, - uint64_t extents_written, uint64_t bytes_written, - uint64_t extents_skipped, uint64_t bytes_skipped, - uint64_t extents_failed, uint64_t bytes_failed); - -/* Pool configuration locks */ -extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); -extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); -extern void spa_config_exit(spa_t *spa, int locks, void *tag); -extern int spa_config_held(spa_t *spa, int locks, krw_t rw); - -/* Pool vdev add/remove lock */ -extern uint64_t spa_vdev_enter(spa_t *spa); -extern uint64_t spa_vdev_config_enter(spa_t *spa); -extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, - int error, char *tag); -extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); - -/* Pool vdev state change lock */ -extern void spa_vdev_state_enter(spa_t *spa, int oplock); -extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); - -/* Log state */ -typedef enum spa_log_state { - SPA_LOG_UNKNOWN = 0, /* unknown log state */ - SPA_LOG_MISSING, /* missing log(s) */ - SPA_LOG_CLEAR, /* clear the log(s) */ - SPA_LOG_GOOD, /* log(s) are good */ -} spa_log_state_t; - -extern spa_log_state_t spa_get_log_state(spa_t *spa); -extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); -extern int spa_reset_logs(spa_t *spa); - -/* Log claim callback */ -extern void spa_claim_notify(zio_t *zio); -extern void spa_deadman(void *); - -/* Accessor functions */ -extern boolean_t spa_shutting_down(spa_t *spa); -extern struct dsl_pool *spa_get_dsl(spa_t *spa); -extern boolean_t 
spa_is_initializing(spa_t *spa); -extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); -extern blkptr_t *spa_get_rootblkptr(spa_t *spa); -extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); -extern void spa_altroot(spa_t *, char *, size_t); -extern int spa_sync_pass(spa_t *spa); -extern char *spa_name(spa_t *spa); -extern uint64_t spa_guid(spa_t *spa); -extern uint64_t spa_load_guid(spa_t *spa); -extern uint64_t spa_last_synced_txg(spa_t *spa); -extern uint64_t spa_first_txg(spa_t *spa); -extern uint64_t spa_syncing_txg(spa_t *spa); -extern uint64_t spa_final_dirty_txg(spa_t *spa); -extern uint64_t spa_version(spa_t *spa); -extern pool_state_t spa_state(spa_t *spa); -extern spa_load_state_t spa_load_state(spa_t *spa); -extern uint64_t spa_freeze_txg(spa_t *spa); -extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); -extern uint64_t spa_get_dspace(spa_t *spa); -extern uint64_t spa_get_checkpoint_space(spa_t *spa); -extern uint64_t spa_get_slop_space(spa_t *spa); -extern void spa_update_dspace(spa_t *spa); -extern uint64_t spa_version(spa_t *spa); -extern boolean_t spa_deflate(spa_t *spa); -extern metaslab_class_t *spa_normal_class(spa_t *spa); -extern metaslab_class_t *spa_log_class(spa_t *spa); -extern metaslab_class_t *spa_special_class(spa_t *spa); -extern metaslab_class_t *spa_dedup_class(spa_t *spa); -extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, - dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); - -extern void spa_evicting_os_register(spa_t *, objset_t *os); -extern void spa_evicting_os_deregister(spa_t *, objset_t *os); -extern void spa_evicting_os_wait(spa_t *spa); -extern int spa_max_replication(spa_t *spa); -extern int spa_prev_software_version(spa_t *spa); -extern uint8_t spa_get_failmode(spa_t *spa); -extern boolean_t spa_suspended(spa_t *spa); -extern uint64_t spa_bootfs(spa_t *spa); -extern uint64_t spa_delegation(spa_t *spa); -extern objset_t *spa_meta_objset(spa_t *spa); -extern uint64_t spa_deadman_synctime(spa_t *spa); -extern spa_autotrim_t spa_get_autotrim(spa_t *spa); - -/* Miscellaneous support routines */ -extern void spa_load_failed(spa_t *spa, const char *fmt, ...); -extern void spa_load_note(spa_t *spa, const char *fmt, ...); -extern void spa_activate_mos_feature(spa_t *spa, const char *feature, - dmu_tx_t *tx); -extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); -extern int spa_rename(const char *oldname, const char *newname); -extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); -extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); -extern char *spa_strdup(const char *); -extern void spa_strfree(char *); -extern uint64_t spa_get_random(uint64_t range); -extern uint64_t spa_generate_guid(spa_t *spa); -extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); -extern void spa_freeze(spa_t *spa); -extern int spa_change_guid(spa_t *spa); -extern void spa_upgrade(spa_t *spa, uint64_t version); -extern void spa_evict_all(void); -extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, - boolean_t l2cache); -extern boolean_t spa_has_spare(spa_t *, uint64_t guid); -extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); -extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); -extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); -extern boolean_t spa_has_slogs(spa_t *spa); -extern boolean_t spa_is_root(spa_t *spa); -extern boolean_t spa_writeable(spa_t *spa); -extern boolean_t 
spa_has_pending_synctask(spa_t *spa); -extern int spa_maxblocksize(spa_t *spa); -extern int spa_minashift(spa_t *spa); -extern int spa_maxdnodesize(spa_t *spa); -extern boolean_t spa_has_checkpoint(spa_t *spa); -extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); -extern boolean_t spa_suspend_async_destroy(spa_t *spa); -extern uint64_t spa_min_claim_txg(spa_t *spa); -extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); -extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, - const blkptr_t *bp); -typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, - void *arg); -extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, - spa_remap_cb_t callback, void *arg); -extern uint64_t spa_get_last_removal_txg(spa_t *spa); -extern boolean_t spa_trust_config(spa_t *spa); -extern uint64_t spa_missing_tvds_allowed(spa_t *spa); -extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); -extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); -extern boolean_t spa_multihost(spa_t *spa); -extern unsigned long spa_get_hostid(void); -extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); - -extern int spa_mode(spa_t *spa); -extern uint64_t zfs_strtonum(const char *str, char **nptr); - -extern char *spa_his_ievent_table[]; - -extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); -extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, - char *his_buf); -extern int spa_history_log(spa_t *spa, const char *his_buf); -extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); -extern void spa_history_log_version(spa_t *spa, const char *operation); -extern void spa_history_log_internal(spa_t *spa, const char *operation, - dmu_tx_t *tx, const char *fmt, ...); -extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, - dmu_tx_t *tx, const char *fmt, ...); -extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, - dmu_tx_t *tx, const char *fmt, ...); - -/* error handling */ -struct zbookmark_phys; -extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); -extern void zfs_ereport_post(const char *_class, spa_t *spa, vdev_t *vd, - const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, - uint64_t length); -extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, - const char *name, nvlist_t *aux); -extern void zfs_post_remove(spa_t *spa, vdev_t *vd); -extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); -extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); -extern uint64_t spa_get_errlog_size(spa_t *spa); -extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); -extern void spa_errlog_rotate(spa_t *spa); -extern void spa_errlog_drain(spa_t *spa); -extern void spa_errlog_sync(spa_t *spa, uint64_t txg); -extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); - -/* vdev cache */ -extern void vdev_cache_stat_init(void); -extern void vdev_cache_stat_fini(void); - -/* Initialization and termination */ -extern void spa_init(int flags); -extern void spa_fini(void); -extern void spa_boot_init(void); - -/* properties */ -extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); -extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); -extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); -extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); - -/* asynchronous event notification */ -extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t 
*hist_nvl, - const char *name); -extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, - const char *name); -extern void spa_event_post(sysevent_t *ev); -extern void spa_event_discard(sysevent_t *ev); - -#ifdef ZFS_DEBUG -#define dprintf_bp(bp, fmt, ...) do { \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ - char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ - dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ - kmem_free(__blkbuf, BP_SPRINTF_LEN); \ - } \ -_NOTE(CONSTCOND) } while (0) -#else -#define dprintf_bp(bp, fmt, ...) -#endif - -extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SPA_H */ diff --git a/include/os/windows/zfs/sys/zfs_windows.h b/include/os/windows/zfs/sys/zfs_windows.h new file mode 100644 index 000000000000..3e99c226fad8 --- /dev/null +++ b/include/os/windows/zfs/sys/zfs_windows.h @@ -0,0 +1,161 @@ +/* +* CDDL HEADER START +* +* The contents of this file are subject to the terms of the +* Common Development and Distribution License (the "License"). +* You may not use this file except in compliance with the License. +* +* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +* or http://www.opensolaris.org/os/licensing. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* When distributing Covered Code, include this CDDL HEADER in each +* file and include the License file at usr/src/OPENSOLARIS.LICENSE. +* If applicable, add the following below this CDDL HEADER, with the +* fields enclosed by brackets "[]" replaced with your own identifying +* information: Portions Copyright [yyyy] [name of copyright owner] +* +* CDDL HEADER END +*/ +/* +* Copyright (c) 2017 Jorgen Lundman +*/ + +#ifndef SYS_WINDOWS_H_INCLUDED +#define SYS_WINDOWS_H_INCLUDED + + +#include + +extern PDEVICE_OBJECT ioctlDeviceObject; +extern PDEVICE_OBJECT fsDiskDeviceObject; + +#define ZFS_SERIAL (ULONG)'wZFS' +#define VOLUME_LABEL L"ZFS" +DECLARE_GLOBAL_CONST_UNICODE_STRING(ZFSVolumeName, VOLUME_LABEL); + + + +// We have to remember "query directory" related items, like index and +// search pattern. This is attached in IRP_MJ_CREATE to fscontext2 +#define ZFS_DIRLIST_MAGIC 0x6582feac +struct zfs_dirlist { + uint32_t magic; // Identifier + uint32_t dir_eof; // Directory listing completed? 
+ uint64_t uio_offset; // Directory list offset + uint64_t ea_index; // EA list offset + uint32_t deleteonclose; // Marked for deletion + uint32_t ContainsWildCards; // searchname has wildcards + UNICODE_STRING searchname; // Search pattern +}; + +typedef struct zfs_dirlist zfs_dirlist_t; + +extern CACHE_MANAGER_CALLBACKS CacheManagerCallbacks; + +extern NTSTATUS dev_ioctl(PDEVICE_OBJECT DeviceObject, ULONG ControlCode, PVOID InputBuffer, ULONG InputBufferSize, + PVOID OutputBuffer, ULONG OutputBufferSize, BOOLEAN Override, IO_STATUS_BLOCK* iosb); + +extern int zfs_windows_mount(zfs_cmd_t *zc); +extern int zfs_windows_unmount(zfs_cmd_t *zc); +extern NTSTATUS zfsdev_ioctl(PDEVICE_OBJECT DeviceObject, PIRP Irp, int flag); +extern void zfs_windows_vnops_callback(PDEVICE_OBJECT deviceObject); +extern void zfs_send_notify(zfsvfs_t *zfsvfs, char *name, int, ULONG FilterMatch, ULONG Action); +extern void zfs_send_notify_stream(zfsvfs_t *, char *, int, ULONG, ULONG, char *stream); +extern void zfs_set_security(struct vnode *vp, struct vnode *dvp); +extern uint64_t zfs_sid2uid(SID *sid); + +BOOLEAN vattr_apply_lx_ea(vattr_t *vap, PFILE_FULL_EA_INFORMATION ea); +NTSTATUS vnode_apply_eas(struct vnode *vp, PFILE_FULL_EA_INFORMATION eas, ULONG eaLength, PULONG pEaErrorOffset); + +extern NTSTATUS zfsdev_open(dev_t dev, PIRP Irp); +extern NTSTATUS zfsdev_release(dev_t dev, PIRP Irp); + +extern int zfs_vnop_recycle(znode_t *zp, int force); +extern uint64_t zfs_blksz(znode_t *zp); + +extern int zfs_vnop_mount(PDEVICE_OBJECT DiskDevice, PIRP Irp, PIO_STACK_LOCATION IrpSp); + +extern int zfs_build_path(znode_t *start_zp, znode_t *start_parent, char **fullpath, uint32_t *returnsize, uint32_t *start_zp_offset); + +extern int xattr_protected(char *name); +extern int xattr_stream(char *name); +extern uint64_t xattr_getsize(struct vnode *vp); +extern char *major2str(int major, int minor); +extern char *common_status_str(NTSTATUS Status); +extern char *create_options(ULONG options); +extern char *create_reply(NTSTATUS, ULONG reply); + +/* zfs_vnop_windows_lib.h */ +extern int AsciiStringToUnicodeString(char *in, PUNICODE_STRING out); +extern void FreeUnicodeString(PUNICODE_STRING s); +extern int zfs_vfs_uuid_gen(const char *osname, uuid_t uuid); +extern int zfs_vfs_uuid_unparse(uuid_t uuid, char *dst); +extern int pn_alloc(pathname_t *p); +extern int pn_free(pathname_t *p); +extern int zfs_vnop_ioctl_fullfsync(struct vnode *, vfs_context_t *, zfsvfs_t *); +extern int zfs_setwinflags(znode_t *zp, uint32_t winflags); +extern uint32_t zfs_getwinflags(znode_t *zp); +extern NTSTATUS zfs_setunlink(FILE_OBJECT *fo, vnode_t *dvp); +extern int zfs_find_dvp_vp(zfsvfs_t *, char *, int finalpartmaynotexist, + int finalpartmustnotexist, char **lastname, struct vnode **dvpp, + struct vnode **vpp, int flags); + +/* IRP_MJ_SET_INFORMATION helpers */ +extern NTSTATUS file_disposition_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS file_disposition_information_ex(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS file_endoffile_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS file_link_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS file_rename_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); + +/* IRP_MJ_GET_INFORMATION helpers */ +extern NTSTATUS file_basic_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_BASIC_INFORMATION *); +extern NTSTATUS file_standard_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + 
FILE_STANDARD_INFORMATION *); +extern NTSTATUS file_position_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_POSITION_INFORMATION *); +extern NTSTATUS file_ea_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_EA_INFORMATION *); +extern NTSTATUS file_network_open_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_NETWORK_OPEN_INFORMATION *); +extern NTSTATUS file_standard_link_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_STANDARD_LINK_INFORMATION *); +extern NTSTATUS file_id_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_ID_INFORMATION *); +extern NTSTATUS file_case_sensitive_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_CASE_SENSITIVE_INFORMATION *); +extern NTSTATUS file_stat_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_STAT_INFORMATION *); +extern NTSTATUS file_stat_lx_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_STAT_LX_INFORMATION *); +extern NTSTATUS file_name_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_NAME_INFORMATION *, PULONG usedspace, int normalize); +extern NTSTATUS file_remote_protocol_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_REMOTE_PROTOCOL_INFORMATION *); +extern NTSTATUS file_stream_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_STREAM_INFORMATION *, PULONG usedspace); +extern NTSTATUS file_attribute_tag_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_ATTRIBUTE_TAG_INFORMATION *tag); +extern NTSTATUS file_internal_information(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION, + FILE_INTERNAL_INFORMATION *infernal); + +/* IRP_MJ_DEVICE_CONTROL helpers */ +extern NTSTATUS QueryCapabilities(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_query_device_name(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_disk_get_drive_geometry(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_disk_get_drive_geometry_ex(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_disk_get_partition_info(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_disk_get_partition_info_ex(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_disk_get_length_info(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_volume_is_io_capable(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_storage_get_hotplug_info(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_storage_query_property(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_query_unique_id(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_mountdev_query_suggested_link_name(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_mountdev_query_stable_guid(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); +extern NTSTATUS ioctl_query_stable_guid(PDEVICE_OBJECT, PIRP, PIO_STACK_LOCATION); + + +#endif diff --git a/include/os/windows/zfs/zfs_config.h b/include/os/windows/zfs/zfs_config.h deleted file mode 100644 index 1593f5cfe334..000000000000 --- a/include/os/windows/zfs/zfs_config.h +++ /dev/null @@ -1,95 +0,0 @@ -/* zfs_config.h. Generated from zfs_config.h.in by configure. */ -/* zfs_config.h.in. Generated from configure.ac by autoheader. */ - -/* Define to 1 to enabled dmu tx validation */ -/* #undef DEBUG_DMU_TX */ - -/* Path where the Filesystems bundle is installed. */ -#define FILESYSTEMS_PREFIX "/Library/Filesystems" - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_DLFCN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_INTTYPES_H 1 - -/* Define if you have libblkid */ -/* #undef HAVE_LIBBLKID */ - -/* Define if you have libuuid */ -#define HAVE_LIBUUID 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MEMORY_H 1 - -/* Define to 1 if you have the `mlockall' function. */ -#define HAVE_MLOCKALL 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRING_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* Define if you have zlib */ -#define HAVE_ZLIB 1 - -/* Path where the kernel module is installed. */ -#define KERNEL_MODPREFIX "/Library/Extensions" - -/* Define to the sub-directory where libtool stores uninstalled libraries. */ -#define LT_OBJDIR ".libs/" - -/* Define to a directory where mount(2) will look for mount_zfs. */ -#define MOUNTEXECDIR "${exec_prefix}/sbin" - -/* Define ZFS_BOOT to enable kext load at boot */ -#define ZFS_BOOT 1 - -/* zfs debugging enabled */ -/* #undef ZFS_DEBUG */ - -/* Define the project author. */ -#define ZFS_META_AUTHOR "OpenZFS on OS X" - -/* Define the project release date. */ -/* #undef ZFS_META_DATA */ - -/* Define the project license. */ -#define ZFS_META_LICENSE "CDDL" - -/* Define the libtool library 'age' version information. */ -/* #undef ZFS_META_LT_AGE */ - -/* Define the libtool library 'current' version information. */ -/* #undef ZFS_META_LT_CURRENT */ - -/* Define the libtool library 'revision' version information. */ -/* #undef ZFS_META_LT_REVISION */ - -/* Define the project name. */ -#define ZFS_META_NAME "zfs" - -/* Define the project release. */ -#define ZFS_META_RELEASE "1" - -/* Define the project version. */ -#define ZFS_META_VERSION "0.2.4" - -/* Define the project alias string. 
*/ -#define ZFS_META_ALIAS "zfs-" ZFS_META_VERSION "-" ZFS_META_RELEASE diff --git a/module/os/windows/zfs/abd_os.c b/module/os/windows/zfs/abd_os.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/arc_os.c b/module/os/windows/zfs/arc_os.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/policy.c b/module/os/windows/zfs/policy.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/qat.c b/module/os/windows/zfs/qat.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/qat_compress.c b/module/os/windows/zfs/qat_compress.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/qat_crypt.c b/module/os/windows/zfs/qat_crypt.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/spa_misc_os.c b/module/os/windows/zfs/spa_misc_os.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/spa_stats.c b/module/os/windows/zfs/spa_stats.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/trace.c b/module/os/windows/zfs/trace.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/vdev_disk.c b/module/os/windows/zfs/vdev_disk.c new file mode 100644 index 000000000000..5f59d808633f --- /dev/null +++ b/module/os/windows/zfs/vdev_disk.c @@ -0,0 +1,810 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + /* + * Copyright (c) 2017 Jorgen Lundman + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +/* + * Virtual device vector for disks. + */ + + +#undef dprintf +#define dprintf + +wchar_t zfs_vdev_protection_filter[64] = { L"\0" }; + +static void vdev_disk_close(vdev_t *); + +extern void UnlockAndFreeMdl(PMDL); + +static void +vdev_disk_alloc(vdev_t *vd) +{ + vdev_disk_t *dvd; + + dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + +} + +static void +vdev_disk_free(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + if (dvd == NULL) + return; + + kmem_free(dvd, sizeof (vdev_disk_t)); + vd->vdev_tsd = NULL; +} + +static void disk_exclusive(DEVICE_OBJECT *device, boolean_t excl) +{ + SET_DISK_ATTRIBUTES diskAttrs = { 0 }; + DWORD requiredSize; + DWORD returnedSize; + + // Set disk attributes. + diskAttrs.Version = sizeof(diskAttrs); + diskAttrs.AttributesMask = DISK_ATTRIBUTE_OFFLINE | DISK_ATTRIBUTE_READ_ONLY; + diskAttrs.Attributes = excl ? 
DISK_ATTRIBUTE_OFFLINE | DISK_ATTRIBUTE_READ_ONLY : 0; + diskAttrs.Persist = FALSE; + + if (kernel_ioctl(device, IOCTL_DISK_SET_DISK_ATTRIBUTES, + &diskAttrs, sizeof(diskAttrs), NULL, 0) != 0) { + dprintf("IOCTL_DISK_SET_DISK_ATTRIBUTES"); + return; + } + + // Tell the system that the disk was changed. + if (kernel_ioctl(device, IOCTL_DISK_UPDATE_PROPERTIES, NULL, 0, NULL, 0) != 0) + dprintf("IOCTL_DISK_UPDATE_PROPERTIES"); + +} + + +/* + * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when + * even a fallback to DKIOCGMEDIAINFO fails. + */ +#ifdef DEBUG +#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) +#else +#define VDEV_DEBUG(...) /* Nothing... */ +#endif + +static int +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + spa_t *spa = vd->vdev_spa; + vdev_disk_t *dvd = vd->vdev_tsd; + int error = EINVAL; + uint64_t capacity = 0, blksz = 0, pbsize = 0; + int isssd; + char *vdev_path = NULL; + + PAGED_CODE(); + + dprintf("%s: open of '%s' (physpath '%s')\n", __func__, vd->vdev_path, vd->vdev_physpath ? vd->vdev_physpath : ""); + /* + * We must have a pathname, and it must be absolute. + * It can also start with # for partition encoded paths + */ + if (vd->vdev_path == NULL || (vd->vdev_path[0] != '/' && vd->vdev_path[0] != '#')) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (dvd != NULL) { + if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) { + /* + * If we are opening a device in its offline notify + * context, the LDI handle was just closed. Clean + * up the LDI event callbacks and free vd->vdev_tsd. + */ + vdev_disk_free(vd); + } + else { + ASSERT(vd->vdev_reopening); + goto skip_open; + } + } + + /* + * Create vd->vdev_tsd. + */ + vdev_disk_alloc(vd); + dvd = vd->vdev_tsd; + + /* + * If we have not yet opened the device, try to open it by the + * specified path. + */ + NTSTATUS ntstatus; + uint8_t *FileName = NULL; + uint32_t FileLength; + + // Use vd->vdev_physpath first, if set, otherwise + // usual vd->vdev_path + vdev_path = vd->vdev_path; + if (vd->vdev_physpath) + vdev_path = vd->vdev_physpath; + + /* Check for partition encoded paths */ + if (vdev_path[0] == '#') { + uint8_t *end; + end = &vdev_path[0]; + while (end && end[0] == '#') end++; + ddi_strtoull(end, &end, 10, &vd->vdev_win_offset); + while (end && end[0] == '#') end++; + ddi_strtoull(end, &end, 10, &vd->vdev_win_length); + while (end && end[0] == '#') end++; + + FileName = end; + + } + else { + + FileName = vd->vdev_path; + + } + + // Apparently in Userland it is "\\?\" but in + // kernel has to be "\??\" - is there not a name that works in both? 
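Illustration only, not part of the patch: a hypothetical partition-encoded vdev path showing what the "#" parsing above produces; the concrete device name and byte values are invented for the example.

/*
 * "#1048576#10737418240#\\?\PhysicalDrive2" would parse as
 *
 *     vd->vdev_win_offset = 1048576        (partition start, in bytes)
 *     vd->vdev_win_length = 10737418240    (partition length, in bytes)
 *     FileName            = "\\?\PhysicalDrive2"
 *
 * vdev_win_offset is later added to every zio offset and vdev_win_length
 * is reported as the vdev's psize. The check below then only rewrites the
 * user-mode "\\?\" prefix into the kernel object-manager form "\??\".
 */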
+ if (!strncmp("\\\\?\\", FileName, 4)) { + FileName[1] = '?'; + } + + dprintf("%s: opening '%s'\n", __func__, FileName); + + ANSI_STRING AnsiFilespec; + UNICODE_STRING UnicodeFilespec; + OBJECT_ATTRIBUTES ObjectAttributes; + + SHORT UnicodeName[PATH_MAX]; + CHAR AnsiName[PATH_MAX]; + USHORT NameLength = 0; + + memset(UnicodeName, 0, sizeof(SHORT) * PATH_MAX); + memset(AnsiName, 0, sizeof(UCHAR) * PATH_MAX); + + NameLength = strlen(FileName); + ASSERT(NameLength < PATH_MAX); + + memmove(AnsiName, FileName, NameLength); + + AnsiFilespec.MaximumLength = AnsiFilespec.Length = NameLength; + AnsiFilespec.Buffer = AnsiName; + + UnicodeFilespec.MaximumLength = PATH_MAX * 2; + UnicodeFilespec.Length = 0; + UnicodeFilespec.Buffer = (PWSTR)UnicodeName; + + RtlAnsiStringToUnicodeString(&UnicodeFilespec, &AnsiFilespec, FALSE); + + ObjectAttributes.Length = sizeof(OBJECT_ATTRIBUTES); + ObjectAttributes.RootDirectory = NULL; + ObjectAttributes.Attributes = /*OBJ_CASE_INSENSITIVE |*/ OBJ_KERNEL_HANDLE; + ObjectAttributes.ObjectName = &UnicodeFilespec; + ObjectAttributes.SecurityDescriptor = NULL; + ObjectAttributes.SecurityQualityOfService = NULL; + IO_STATUS_BLOCK iostatus; + + ntstatus = ZwCreateFile(&dvd->vd_lh, + spa_mode(spa) == FREAD ? GENERIC_READ | SYNCHRONIZE : GENERIC_READ | GENERIC_WRITE | SYNCHRONIZE, + &ObjectAttributes, + &iostatus, + 0, + FILE_ATTRIBUTE_NORMAL, + /* FILE_SHARE_WRITE | */ FILE_SHARE_READ, + FILE_OPEN, + FILE_SYNCHRONOUS_IO_NONALERT | (spa_mode(spa) == FREAD ? 0 : FILE_NO_INTERMEDIATE_BUFFERING), + NULL, + 0); + + if (ntstatus == STATUS_SUCCESS) { + error = 0; + } else { + error = EINVAL; // GetLastError(); + dvd->vd_lh = NULL; + } + + /* + * If we succeeded in opening the device, but 'vdev_wholedisk' + * is not yet set, then this must be a slice. + */ + if (error == 0 && vd->vdev_wholedisk == -1ULL) + vd->vdev_wholedisk = 0; + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + // Since we will use DeviceObject and FileObject to do ioctl and IO + // we grab them now and lock them in place. + // Convert HANDLE to FileObject + PFILE_OBJECT FileObject; + PDEVICE_OBJECT DeviceObject; + NTSTATUS status; + + // This adds a reference to FileObject + status = ObReferenceObjectByHandle( + dvd->vd_lh, // fixme, keep this in dvd + 0, + *IoFileObjectType, + KernelMode, + &FileObject, + NULL + ); + if (status != STATUS_SUCCESS) { + ZwClose(dvd->vd_lh); + dvd->vd_lh = NULL; + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return EIO; + } + + + // Convert FileObject to DeviceObject + PDEVICE_OBJECT pTopDevice = IoGetRelatedDeviceObject(FileObject); + PDEVICE_OBJECT pSendToDevice = pTopDevice; // default + + /* + Move to the top of the device stack or until we find the protection filter driver. + We need to stay under that driver so we can still access the disk + after protecting it. + The custom protection filter is optional: if none set we stay under the default "partmgr" driver; + otherwise we will stay under the first one found. + By default the disk gets minimal protection being set offline and read only through "partmgr". + A custom filter driver can provide enhanced protection for the vdev disk. 
+ */ + UNICODE_STRING customFilterName; + UNICODE_STRING defaultFilterName; + RtlInitUnicodeString(&customFilterName, zfs_vdev_protection_filter); + RtlInitUnicodeString(&defaultFilterName, L"\\Driver\\partmgr"); // default + + DeviceObject = FileObject->DeviceObject; // bottom of stack + while (DeviceObject) { + if ((zfs_vdev_protection_filter[0] != L'\0' ? !RtlCompareUnicodeString(&DeviceObject->DriverObject->DriverName, &customFilterName, TRUE) : FALSE) || + !RtlCompareUnicodeString(&DeviceObject->DriverObject->DriverName, &defaultFilterName, TRUE)) { + dprintf("%s: disk %s : vdev protection filter set to %S\n", __func__, + FileName, DeviceObject->DriverObject->DriverName.Buffer); + break; + } + pSendToDevice = DeviceObject; + DeviceObject = DeviceObject->AttachedDevice; + } + DeviceObject = pSendToDevice; + + // Grab a reference to DeviceObject + ObReferenceObject(DeviceObject); + + dvd->vd_FileObject = FileObject; + dvd->vd_DeviceObject = DeviceObject; + + // Make disk readonly and offline, so that users can't partition/format it. + disk_exclusive(pTopDevice, TRUE); + +skip_open: + + /* + * Determine the actual size of the device. + */ + if (vd->vdev_win_length != 0) { + *psize = vd->vdev_win_length; + } else { + DISK_GEOMETRY_EX geometry_ex; + DWORD len; + error = kernel_ioctl(dvd->vd_DeviceObject, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, + NULL, 0, + &geometry_ex, sizeof(geometry_ex)); + if (error == 0) + capacity = geometry_ex.DiskSize.QuadPart; + } + /* + * Determine the device's minimum transfer size. + * If the ioctl isn't supported, assume DEV_BSIZE. + */ + // fill in capacity, blksz, pbsize + STORAGE_PROPERTY_QUERY storageQuery; + memset(&storageQuery, 0, sizeof(STORAGE_PROPERTY_QUERY)); + storageQuery.PropertyId = StorageAccessAlignmentProperty; + storageQuery.QueryType = PropertyStandardQuery; + + STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR diskAlignment = { 0 }; + memset(&diskAlignment, 0, sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)); + DWORD outsize; + + error = kernel_ioctl(dvd->vd_DeviceObject, IOCTL_STORAGE_QUERY_PROPERTY, + &storageQuery, sizeof(STORAGE_PROPERTY_QUERY), + &diskAlignment, sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)); + + if (error == 0) { + blksz = diskAlignment.BytesPerLogicalSector; + pbsize = diskAlignment.BytesPerPhysicalSector; + if (!blksz) blksz = DEV_BSIZE; + if (!pbsize) pbsize = DEV_BSIZE; + } else { + blksz = pbsize = DEV_BSIZE; + } + + // Set psize to the size of the partition. For now, assume virtual + // since ioctls do not seem to work. + if (vd->vdev_win_length != 0) + *psize = vd->vdev_win_length; + + // Set max_psize to the biggest it can be, expanding.. + *max_psize = *psize; + + + if (!blksz) blksz = DEV_BSIZE; + if (!pbsize) pbsize = DEV_BSIZE; + + *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; + dprintf("%s: picked ashift %llu for device\n", __func__, *ashift); + + /* + * Clear the nowritecache bit, so that on a vdev_reopen() we will + * try again. + */ + vd->vdev_nowritecache = B_FALSE; + + /* Set when device reports it supports TRIM. */ + vd->vdev_has_trim = !!blk_queue_discard(dvd->vd_DeviceObject); + + /* Set when device reports it supports secure TRIM. 
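Worked example, for illustration only (the sector sizes are assumed, not taken from the patch): if the storage alignment query above reports BytesPerLogicalSector = 512 and BytesPerPhysicalSector = 4096, the ashift computation above gives

	*ashift = highbit64(MAX(4096, SPA_MINBLOCKSIZE)) - 1
	        = highbit64(4096) - 1
	        = 13 - 1
	        = 12		/* 4 KiB minimum allocation */

while a plain 512-byte-sector device falls back to highbit64(512) - 1 = 9.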
*/ + vd->vdev_has_securetrim = !!blk_queue_discard_secure(dvd->vd_DeviceObject); + + /* Inform the ZIO pipeline that we are non-rotational */ + /* Best choice seems to be either TRIM, or SeekPenalty */ + vd->vdev_nonrot = vd->vdev_has_trim || blk_queue_nonrot(dvd->vd_DeviceObject); + + dprintf("%s: nonrot %d, trim %d, securetrim %d\n", __func__, + vd->vdev_nonrot, vd->vdev_has_trim, vd->vdev_has_securetrim); + + return (0); +} + + +static void +vdev_disk_close(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + if (vd->vdev_reopening || dvd == NULL) + return; + + vd->vdev_delayed_close = B_FALSE; + /* + * If we closed the LDI handle due to an offline notify from LDI, + * don't free vd->vdev_tsd or unregister the callbacks here; + * the offline finalize callback or a reopen will take care of it. + */ + if (dvd->vd_ldi_offline) + return; + + if (dvd->vd_lh != NULL) { + dprintf("%s: \n", __func__); + + // Undo disk readonly and offline. + disk_exclusive(IoGetRelatedDeviceObject(dvd->vd_FileObject), FALSE); + + // Release our holds + ObDereferenceObject(dvd->vd_FileObject); + ObDereferenceObject(dvd->vd_DeviceObject); + // Close file + ZwClose(dvd->vd_lh); + } + + dvd->vd_lh = NULL; + dvd->vd_FileObject = NULL; + dvd->vd_DeviceObject = NULL; + + vdev_disk_free(vd); +} + +int +vdev_disk_physio(vdev_t *vd, caddr_t data, + size_t size, uint64_t offset, int flags, boolean_t isdump) +{ + vdev_disk_t *dvd = vd->vdev_tsd; + + //dprintf("%s: \n", __func__); + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (dvd == NULL || (dvd->vd_ldi_offline)) + return (EIO); + + ASSERT(vd->vdev_ops == &vdev_disk_ops); + + return EIO; +} + +static void +vdev_disk_ioctl_free(zio_t *zio) +{ + kmem_free(zio->io_vsd, sizeof (struct dk_callback)); +} + +static const zio_vsd_ops_t vdev_disk_vsd_ops = { + vdev_disk_ioctl_free, + zio_vsd_default_cksum_report +}; + +static void +vdev_disk_ioctl_done(void *zio_arg, int error) +{ + zio_t *zio = zio_arg; + + zio->io_error = error; + + zio_interrupt(zio); +} + +struct vdev_disk_callback_struct { + zio_t *zio; + PIRP irp; + void *b_addr; + char work_item[0]; +}; +typedef struct vdev_disk_callback_struct vd_callback_t; + +static void +vdev_disk_io_start_done(void *param) +{ + vd_callback_t *vb = (vd_callback_t *)param; + + ASSERT(vb != NULL); + + NTSTATUS status = vb->irp->IoStatus.Status; + zio_t *zio = vb->zio; + zio->io_error = (!NT_SUCCESS(status) ? EIO : 0); + + // Return abd buf + if (zio->io_type == ZIO_TYPE_READ) { + VERIFY3S(zio->io_abd->abd_size, >= , zio->io_size); + abd_return_buf_copy_off(zio->io_abd, vb->b_addr, + 0, zio->io_size, zio->io_abd->abd_size); + } else { + VERIFY3S(zio->io_abd->abd_size, >= , zio->io_size); + abd_return_buf_off(zio->io_abd, vb->b_addr, + 0, zio->io_size, zio->io_abd->abd_size); + } + + UnlockAndFreeMdl(vb->irp->MdlAddress); + IoFreeIrp(vb->irp); + kmem_free(vb, sizeof(vd_callback_t) + IoSizeofWorkItem()); + vb = NULL; + zio_delay_interrupt(zio); +} + +static VOID +DiskIoWkRtn( + __in PVOID pDummy, // Not used. + __in PVOID pWkParms // Parm list pointer. +) +{ + vd_callback_t *vb = (vd_callback_t *)pWkParms; + + UNREFERENCED_PARAMETER(pDummy); + IoUninitializeWorkItem((PIO_WORKITEM)vb->work_item); + vdev_disk_io_start_done(vb); +} + +/* +* IO has finished callback, in Windows this is called as a different +* IRQ level, so we can practically do nothing here. 
(Can't call mutex +* locking, like from kmem_free()) +*/ + +IO_COMPLETION_ROUTINE vdev_disk_io_intrxxx; + +static NTSTATUS +vdev_disk_io_intrxxx(PDEVICE_OBJECT DeviceObject, PIRP irp, PVOID Context) +{ + vd_callback_t *vb = (vd_callback_t *)Context; + + ASSERT(vb != NULL); + + vdev_disk_t *dvd = vb->zio->io_vd->vdev_tsd; + + /* If IRQL is below DIPATCH_LEVEL then there is no issue in calling + * vdev_disk_io_start_done() directly; otherwise queue a new Work Item + */ + if (KeGetCurrentIrql() < DISPATCH_LEVEL) + vdev_disk_io_start_done(vb); + else { + IoInitializeWorkItem(dvd->vd_DeviceObject, (PIO_WORKITEM)vb->work_item); + IoQueueWorkItem((PIO_WORKITEM)vb->work_item, DiskIoWkRtn, DelayedWorkQueue, vb); + } + return STATUS_MORE_PROCESSING_REQUIRED; +} + +static void +vdev_disk_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_disk_t *dvd = vd->vdev_tsd; + struct dk_callback *dkc; + buf_t *bp; + unsigned long trim_flags = 0; + int flags, error = 0; + + //dprintf("%s: type 0x%x offset 0x%llx len 0x%llx \n", __func__, zio->io_type, zio->io_offset, zio->io_size); + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (dvd == NULL || (dvd->vd_ldi_offline)) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); + zio->io_vsd_ops = &vdev_disk_vsd_ops; + + dkc->dkc_callback = vdev_disk_ioctl_done; +// dkc->dkc_flag = FLUSH_VOLATILE; + dkc->dkc_cookie = zio; + +// error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, +// (uintptr_t)dkc, FKIOCTL, kcred, NULL); + + if (error == 0) { + /* + * The ioctl will be done asychronously, + * and will call vdev_disk_ioctl_done() + * upon completion. 
+ */ + zio_execute(zio); // until we have ioctl + return; + } + + zio->io_error = error; + + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + } /* io_cmd */ + + zio_execute(zio); + return; + + case ZIO_TYPE_WRITE: + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) + flags = B_WRITE; + else + flags = B_WRITE | B_ASYNC; + break; + + case ZIO_TYPE_READ: + if (zio->io_priority == ZIO_PRIORITY_SYNC_READ) + flags = B_READ; + else + flags = B_READ | B_ASYNC; + break; + + case ZIO_TYPE_TRIM: +#if defined(BLKDEV_DISCARD_SECURE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + trim_flags |= BLKDEV_DISCARD_SECURE; +#endif + zio->io_error = -blkdev_issue_discard_bytes(dvd->vd_DeviceObject, + zio->io_offset, zio->io_size, trim_flags); + zio_interrupt(zio); + return; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); + return; + } /* io_type */ + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + + /* Stop OSX from also caching our data */ + flags |= B_NOCACHE | B_PASSIVE; // Windowsify me + + zio->io_target_timestamp = zio_handle_io_delay(zio); + + ASSERT(zio->io_size != 0); + + PIRP irp = NULL; + PIO_STACK_LOCATION irpStack = NULL; + IO_STATUS_BLOCK IoStatusBlock = { 0 }; + LARGE_INTEGER offset; + + offset.QuadPart = zio->io_offset + vd->vdev_win_offset; + + /* Preallocate space for IoWorkItem, required for vdev_disk_io_start_done callback */ + vd_callback_t *vb = (vd_callback_t *)kmem_alloc(sizeof(vd_callback_t) + IoSizeofWorkItem(), KM_SLEEP); + + vb->zio = zio; + + if (zio->io_type == ZIO_TYPE_READ) { + ASSERT3S(zio->io_abd->abd_size, >= , zio->io_size); + vb->b_addr = + abd_borrow_buf(zio->io_abd, zio->io_abd->abd_size); + } else { + vb->b_addr = + abd_borrow_buf_copy(zio->io_abd, zio->io_abd->abd_size); + } + + if (flags & B_READ) { + irp = IoBuildAsynchronousFsdRequest(IRP_MJ_READ, + dvd->vd_DeviceObject, + vb->b_addr, + (ULONG)zio->io_size, + &offset, + &IoStatusBlock); + } else { + irp = IoBuildAsynchronousFsdRequest(IRP_MJ_WRITE, + dvd->vd_DeviceObject, + vb->b_addr, + (ULONG)zio->io_size, + &offset, + &IoStatusBlock); + } + + if (!irp) { + kmem_free(vb, sizeof(vd_callback_t) + IoSizeofWorkItem()); + zio->io_error = EIO; + zio_interrupt(zio); + return; + } + + vb->irp = irp; + + irpStack = IoGetNextIrpStackLocation(irp); + + irpStack->Flags |= SL_OVERRIDE_VERIFY_VOLUME; // SetFlag(IoStackLocation->Flags, SL_OVERRIDE_VERIFY_VOLUME); + //SetFlag(ReadIrp->Flags, IRP_NOCACHE); + irpStack->FileObject = dvd->vd_FileObject; + + IoSetCompletionRoutine(irp, + vdev_disk_io_intrxxx, + vb, // "Context" in vdev_disk_io_intr() + TRUE, // On Success + TRUE, // On Error + TRUE);// On Cancel + + IoCallDriver(dvd->vd_DeviceObject, irp); + + return; +} + +static void +vdev_disk_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. Otherwise, probe the device and + * make sure it's still accessible. + */ + if (zio->io_error == EIO && !vd->vdev_remove_wanted) { + vdev_disk_t *dvd = vd->vdev_tsd; +// int state = DKIO_NONE; + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } +} + +static void +vdev_disk_hold(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* We must have a pathname, and it must be absolute. 
*/ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') + return; + + /* + * Only prefetch path and devid info if the device has + * never been opened. + */ + if (vd->vdev_tsd != NULL) + return; + + /* XXX: Implement me as a vnode lookup for the device */ + vd->vdev_name_vp = NULL; + vd->vdev_devid_vp = NULL; +} + +static void +vdev_disk_rele(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* XXX: Implement me as a vnode rele for the device */ +} + +vdev_ops_t vdev_disk_ops = { + vdev_disk_open, + vdev_disk_close, + vdev_default_asize, + vdev_disk_io_start, + vdev_disk_io_done, + NULL, + NULL, + vdev_disk_hold, + vdev_disk_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * Given the root disk device devid or pathname, read the label from + * the device, and construct a configuration nvlist. + */ +int +vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) +{ + return -1; +} diff --git a/module/os/windows/zfs/vdev_file.c b/module/os/windows/zfs/vdev_file.c new file mode 100644 index 000000000000..a7bcb07e268a --- /dev/null +++ b/module/os/windows/zfs/vdev_file.c @@ -0,0 +1,601 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for files. + */ + +static taskq_t *vdev_file_taskq; + +extern void UnlockAndFreeMdl(PMDL); + +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +#ifdef _KERNEL +extern int VOP_GETATTR(struct vnode *vp, vattr_t *vap, int flags, void *x3, void *x4); +#endif + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ +#if _KERNEL + static vattr_t vattr; + vdev_file_t *vf; +#endif + int error = 0; + + dprintf("vdev_file_open %p\n", vd->vdev_tsd); + /* Rotational optimizations only make sense on block devices */ + vd->vdev_nonrot = B_TRUE; + + /* + * Allow TRIM on file based vdevs. This may not always be supported, + * since it depends on your kernel version and underlying filesystem + * type but it is always safe to attempt. + */ + vd->vdev_has_trim = B_TRUE; + + /* + * Disable secure TRIM on file based vdevs. There is no way to + * request this behavior from the underlying filesystem. + */ + vd->vdev_has_securetrim = B_FALSE; + + /* + * We must have a pathname, and it must be absolute. 
+ */ + if (vd->vdev_path == NULL || (vd->vdev_path[0] != '/' && + vd->vdev_path[0] != '\\')) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ +#ifdef _KERNEL + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); +#endif + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && ( + vd->vdev_path[0] == '/' || vd->vdev_path[0] == '\\')); + + /* + vn_openat(char *pnamep, + enum uio_seg seg, + int filemode, + int createmode, + struct vnode **vpp, + enum create crwhy, + mode_t umask, + struct vnode *startvp) + extern int vn_openat(char *pnamep, enum uio_seg seg, int filemode, + int createmode, struct vnode **vpp, enum create crwhy, + mode_t umask, struct vnode *startvp); + */ + uint8_t *FileName = NULL; + FileName = vd->vdev_path; + + if (!strncmp("\\\\?\\", FileName, 4)) { + FileName[1] = '?'; + } + + dprintf("%s: opening '%s'\n", __func__, FileName); + +#ifdef _KERNEL + + ANSI_STRING AnsiFilespec; + UNICODE_STRING UnicodeFilespec; + OBJECT_ATTRIBUTES ObjectAttributes; + + SHORT UnicodeName[PATH_MAX]; + CHAR AnsiName[PATH_MAX]; + USHORT NameLength = 0; + NTSTATUS ntstatus; + + memset(UnicodeName, 0, sizeof(SHORT) * PATH_MAX); + memset(AnsiName, 0, sizeof(UCHAR) * PATH_MAX); + + NameLength = strlen(FileName); + ASSERT(NameLength < PATH_MAX); + + memmove(AnsiName, FileName, NameLength); + + AnsiFilespec.MaximumLength = AnsiFilespec.Length = NameLength; + AnsiFilespec.Buffer = AnsiName; + + UnicodeFilespec.MaximumLength = PATH_MAX * 2; + UnicodeFilespec.Length = 0; + UnicodeFilespec.Buffer = (PWSTR)UnicodeName; + + RtlAnsiStringToUnicodeString(&UnicodeFilespec, &AnsiFilespec, FALSE); + + ObjectAttributes.Length = sizeof(OBJECT_ATTRIBUTES); + ObjectAttributes.RootDirectory = NULL; + ObjectAttributes.Attributes = /*OBJ_CASE_INSENSITIVE |*/ OBJ_KERNEL_HANDLE; + ObjectAttributes.ObjectName = &UnicodeFilespec; + ObjectAttributes.SecurityDescriptor = NULL; + ObjectAttributes.SecurityQualityOfService = NULL; + IO_STATUS_BLOCK iostatus; + + ntstatus = ZwCreateFile(&vf->vf_handle, + spa_mode(vd->vdev_spa) == FREAD ? GENERIC_READ | SYNCHRONIZE : GENERIC_READ | GENERIC_WRITE | SYNCHRONIZE, + &ObjectAttributes, + &iostatus, + 0, + FILE_ATTRIBUTE_NORMAL, + /* FILE_SHARE_WRITE | */ FILE_SHARE_READ, + FILE_OPEN, + FILE_SYNCHRONOUS_IO_NONALERT | (spa_mode(vd->vdev_spa) == FREAD ? 0 : FILE_NO_INTERMEDIATE_BUFFERING), + NULL, + 0); + + if (ntstatus == STATUS_SUCCESS) { + error = 0; + } else { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + goto failed; + } + + + /* + * Make sure it's a regular file. + */ + FILE_STANDARD_INFORMATION info; + IO_STATUS_BLOCK iob; + + if ((ZwQueryInformationFile( + vf->vf_handle, + &iob, + &info, + sizeof(info), + FileStandardInformation) != STATUS_SUCCESS) || + (info.Directory != FALSE)) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + ZwClose(vf->vf_handle); + error = ENOENT; + goto failed; + } + + // Since we will use DeviceObject and FileObject to do ioctl and IO + // we grab them now and lock them in place. 
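The open path below marks the backing file sparse (FSCTL_SET_SPARSE, "so TRIM might work"), and the TRIM case in vdev_file_io_start() further down goes through a VOP_SPACE(F_FREESP, ...) shim whose implementation is not in this hunk. For illustration only, a sketch of the Windows primitive such a shim would typically use (ntifs.h kernel context assumed; on a sparse file, zeroing a range releases its allocation):

static NTSTATUS
example_punch_hole(HANDLE h, uint64_t off, uint64_t len)
{
	FILE_ZERO_DATA_INFORMATION fzdi;
	IO_STATUS_BLOCK iosb;

	fzdi.FileOffset.QuadPart = off;
	fzdi.BeyondFinalZero.QuadPart = off + len;

	/* On a sparse file this deallocates the on-disk range. */
	return (ZwFsControlFile(h, NULL, NULL, NULL, &iosb,
	    FSCTL_SET_ZERO_DATA, &fzdi, sizeof (fzdi), NULL, 0));
}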
+ // Convert HANDLE to FileObject + PFILE_OBJECT FileObject; + PDEVICE_OBJECT DeviceObject; + NTSTATUS status; + + // This adds a reference to FileObject + status = ObReferenceObjectByHandle( + vf->vf_handle, + 0, + *IoFileObjectType, + KernelMode, + &FileObject, + NULL + ); + if (status != STATUS_SUCCESS) { + ZwClose(vf->vf_handle); + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + error = EIO; + goto failed; + } + + // Convert FileObject to DeviceObject + DeviceObject = IoGetRelatedDeviceObject(FileObject); + + // Grab a reference to DeviceObject + ObReferenceObject(DeviceObject); + + vf->vf_FileObject = FileObject; + vf->vf_DeviceObject = DeviceObject; + + // Change it to SPARSE, so TRIM might work + status = ZwFsControlFile( + vf->vf_handle, + NULL, + NULL, + NULL, + NULL, + FSCTL_SET_SPARSE, + NULL, + 0, + NULL, + 0 + ); + dprintf("%s: set Sparse 0x%x.\n", __func__, status); + +#endif + +#if _KERNEL +skip_open: + /* + * Determine the physical size of the file. + */ + //vattr.va_mask = AT_SIZE; + //vn_lock(vf->vf_vnode, LK_SHARED | LK_RETRY); + //error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); + //VN_UNLOCK(vf->vf_vnode); +#endif + +#ifdef _KERNEL + *max_psize = *psize = info.EndOfFile.QuadPart; +#else + /* userland's vn_open() will get the device size for us, so we can + * just look it up - there is argument for a userland VOP_GETATTR to make + * this function cleaner. */ +// *max_psize = *psize = vp->v_size; +#endif + *ashift = SPA_MINBLOCKSHIFT; + + return (0); + +failed: +#ifdef _KERNEL + if (vf) { + if (vf->vf_handle != NULL) { + vf->vf_handle = NULL; + } + + kmem_free(vf, sizeof(vdev_file_t)); + vd->vdev_tsd = NULL; + } +#endif + return error; +} + +static void +vdev_file_close(vdev_t *vd) +{ +#ifdef _KERNEL + vdev_file_t *vf = vd->vdev_tsd; + + if (vd->vdev_reopening || vf == NULL) + return; + + if (vf->vf_handle != NULL) { + + // Release our holds + ObDereferenceObject(vf->vf_FileObject); + ObDereferenceObject(vf->vf_DeviceObject); + + ZwClose(vf->vf_handle); + } + + vf->vf_FileObject = NULL; + vf->vf_DeviceObject = NULL; + vf->vf_handle = NULL; + vd->vdev_delayed_close = B_FALSE; + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +#endif +} + +#ifdef _KERNEL +struct vdev_file_callback_struct { + zio_t *zio; + PIRP irp; + void *b_data; + char work_item[0]; +}; +typedef struct vdev_file_callback_struct vf_callback_t; + +static void +vdev_file_io_start_done(void *param) +{ + vf_callback_t *vb = (vf_callback_t *)param; + + ASSERT(vb != NULL); + + NTSTATUS status = vb->irp->IoStatus.Status; + zio_t *zio = vb->zio; + zio->io_error = (!NT_SUCCESS(status) ? EIO : 0); + + // Return abd buf + if (zio->io_type == ZIO_TYPE_READ) { + VERIFY3S(zio->io_abd->abd_size, >= , zio->io_size); + abd_return_buf_copy_off(zio->io_abd, vb->b_data, + 0, zio->io_size, zio->io_abd->abd_size); + } else { + VERIFY3S(zio->io_abd->abd_size, >= , zio->io_size); + abd_return_buf_off(zio->io_abd, vb->b_data, + 0, zio->io_size, zio->io_abd->abd_size); + } + + UnlockAndFreeMdl(vb->irp->MdlAddress); + IoFreeIrp(vb->irp); + kmem_free(vb, sizeof(vf_callback_t) + IoSizeofWorkItem()); + vb = NULL; + zio_delay_interrupt(zio); +} + +static VOID +FileIoWkRtn( + __in PVOID pDummy, // Not used. + __in PVOID pWkParms // Parm list pointer. 
+) +{ + vf_callback_t *vb = (vf_callback_t *)pWkParms; + + UNREFERENCED_PARAMETER(pDummy); + IoUninitializeWorkItem((PIO_WORKITEM)vb->work_item); + vdev_file_io_start_done(vb); +} + +static NTSTATUS +vdev_file_io_intrxxx(PDEVICE_OBJECT DeviceObject, PIRP irp, PVOID Context) +{ + vf_callback_t *vb = (vf_callback_t *)Context; + + ASSERT(vb != NULL); + + /* If IRQL is below DIPATCH_LEVEL then there is no issue in calling + * vdev_file_io_start_done() directly; otherwise queue a new Work Item + */ + if (KeGetCurrentIrql() < DISPATCH_LEVEL) + vdev_file_io_start_done(vb); + else { + vdev_file_t *vf = vb->zio->io_vd->vdev_tsd; + IoInitializeWorkItem(vf->vf_DeviceObject, (PIO_WORKITEM)vb->work_item); + IoQueueWorkItem((PIO_WORKITEM)vb->work_item, FileIoWkRtn, DelayedWorkQueue, vb); + } + + return STATUS_MORE_PROCESSING_REQUIRED; +} +#endif + +/* + * count the number of mismatches of zio->io_size and zio->io_abd->abd_size below + */ +_Atomic uint64_t zfs_vdev_file_size_mismatch_cnt = 0; + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + ssize_t resid = 0; + + + if (zio->io_type == ZIO_TYPE_IOCTL) { + + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: +#if 0 + if (!vnode_getwithvid(vf->vf_vnode, vf->vf_vid)) { + zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, + kcred, NULL); + vnode_put(vf->vf_vnode); + } +#endif + break; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_interrupt(zio); + return; + + } else if (zio->io_type == ZIO_TYPE_TRIM) { +#ifdef _KERNEL + struct flock flck; + vdev_file_t *vf = vd->vdev_tsd; + + ASSERT3U(zio->io_size, != , 0); + bzero(&flck, sizeof(flck)); + flck.l_type = F_FREESP; + flck.l_start = zio->io_offset; + flck.l_len = zio->io_size; + flck.l_whence = 0; + + zio->io_error = VOP_SPACE(vf->vf_handle, F_FREESP, &flck, + 0, 0, kcred, NULL); + +#endif + zio_execute(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + + + ASSERT(zio->io_size != 0); + +#ifdef _KERNEL + vdev_file_t *vf = vd->vdev_tsd; + + PIRP irp = NULL; + PIO_STACK_LOCATION irpStack = NULL; + IO_STATUS_BLOCK IoStatusBlock = { 0 }; + LARGE_INTEGER offset; + + offset.QuadPart = zio->io_offset + vd->vdev_win_offset; + + /* Preallocate space for IoWorkItem, required for vdev_file_io_start_done callback */ + vf_callback_t *vb = (vf_callback_t *)kmem_alloc(sizeof(vf_callback_t) + IoSizeofWorkItem(), KM_SLEEP); + + vb->zio = zio; + +#ifdef DEBUG + if (zio->io_abd->abd_size != zio->io_size) { + zfs_vdev_file_size_mismatch_cnt++; + // this dprintf can be very noisy + dprintf("ZFS: %s: trimming zio->io_abd from 0x%x to 0x%llx\n", + __func__, zio->io_abd->abd_size, zio->io_size); + } +#endif + + if (zio->io_type == ZIO_TYPE_READ) { + ASSERT3S(zio->io_abd->abd_size, >= , zio->io_size); + vb->b_data = + abd_borrow_buf(zio->io_abd, zio->io_abd->abd_size); + } else { + ASSERT3S(zio->io_abd->abd_size, >= , zio->io_size); + vb->b_data = + abd_borrow_buf_copy(zio->io_abd, zio->io_abd->abd_size); + } + + if (zio->io_type == ZIO_TYPE_READ) { + irp = IoBuildAsynchronousFsdRequest(IRP_MJ_READ, + vf->vf_DeviceObject, + vb->b_data, + (ULONG)zio->io_size, + &offset, + &IoStatusBlock); + } else { + irp = IoBuildAsynchronousFsdRequest(IRP_MJ_WRITE, + vf->vf_DeviceObject, + vb->b_data, + (ULONG)zio->io_size, + &offset, + &IoStatusBlock); + } + + if (!irp) { + kmem_free(vb, 
sizeof(vf_callback_t) + IoSizeofWorkItem()); + zio->io_error = EIO; + zio_interrupt(zio); + return; + } + + irpStack = IoGetNextIrpStackLocation(irp); + + irpStack->Flags |= SL_OVERRIDE_VERIFY_VOLUME; // SetFlag(IoStackLocation->Flags, SL_OVERRIDE_VERIFY_VOLUME); + //SetFlag(ReadIrp->Flags, IRP_NOCACHE); + irpStack->FileObject = vf->vf_FileObject; + + IoSetCompletionRoutine(irp, + vdev_file_io_intrxxx, + vb, // "Context" in vdev_file_io_intr() + TRUE, // On Success + TRUE, // On Error + TRUE);// On Cancel + + IoCallDriver(vf->vf_DeviceObject, irp); +#endif + + return; +} + + +/* ARGSUSED */ +static void +vdev_file_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_file_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_FILE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("vdev_file_taskq", 100, minclsyspri, + max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); + + VERIFY(vdev_file_taskq); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + +/* + * From userland we access disks just like files. + */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/module/os/windows/zfs/zfs_acl.c b/module/os/windows/zfs/zfs_acl.c new file mode 100644 index 000000000000..4ade9099e746 --- /dev/null +++ b/module/os/windows/zfs/zfs_acl.c @@ -0,0 +1,2997 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + 
zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +/* + * The following three functions are provided for compatibility with + * older ZPL version in order to determine if the file use to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. 
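Illustration only, not part of the patch: how the per-version ops tables defined above are consumed.

/*
 * Callers never touch the ACE layout directly; they go through the
 * per-version ops table, e.g.
 *
 *     mask = aclp->z_ops->ace_mask_get(acep);
 *     type = aclp->z_ops->ace_type_get(acep);
 *     who  = aclp->z_ops->ace_who_get(acep);
 *
 * zfs_acl_alloc() below selects zfs_acl_fuid_ops or zfs_acl_v0_ops from
 * the ACL version, and zfs_acl_next_ace() reads every ACE through these
 * selectors, so the same walker handles both the old v0 and the FUID
 * on-disk layouts.
 */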
+ */ +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + int error; + + if (zp->z_is_sa) + return (0); + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * after upgrade the SA_ZPL_ZNODE_ACL should have been + * removed + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (0); + } +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs. + */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. 
+ */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} + +zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = &zfs_acl_fuid_ops; + else + aclp->z_ops = &zfs_acl_v0_ops; + return (aclp); +} + +zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (S_ISDIR(obj_mode) && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) + return (B_FALSE); + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + ASSERT(aclp); + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if 
(aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops->ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops->ace_flags_get(acep); + *type = aclp->z_ops->ace_type_get(acep); + *access_mask = aclp->z_ops->ace_mask_get(acep); + *who = aclp->z_ops->ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +#if 0 // unused function +static zfs_acl_node_t * +zfs_acl_curr_node(zfs_acl_t *aclp) +{ + ASSERT(aclp->z_curr_node); + return (aclp->z_curr_node); +} +#endif + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. + */ +int +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? 
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops->ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +#if 0 // unused function +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? + ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} +#endif + +static int +zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). 
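+ * The scratch array holds one zfs_oldace_t per existing entry, and the
+ * replacement ACL node is sized for the worst case of one
+ * zfs_object_ace_t per entry, so the converted ACL is guaranteed to
+ * fit in a single node.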
+ * + * We only convert an ACL once, so this won't happen + * everytime. + */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type))) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = &zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, zp->z_mode, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops->ace_mask_set(acep, access_mask); + aclp->z_ops->ace_type_set(acep, access_type); + aclp->z_ops->ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops->ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL. 
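+ *
+ * As a worked example (hypothetical, trivial ACL only): owner@ allowing
+ * read_data/write_data, group@ allowing read_data and everyone@
+ * allowing read_data, with no DENY entries, computes to mode 0644.
+ * Note that on this port the _WIN32 conditionals below keep explicit
+ * user:/group: entries from feeding back into the mode, matching the
+ * HFS-style behaviour described in the comment inside the loop.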
+ */ +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; + + + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type))) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over any inherit_only ACEs + */ + if (iflags & ACE_INHERIT_ONLY_ACE) + continue; + + + /* + * Apple has unusual expectations to emulate hfs in that the mode is not + * updated: + * -rw-r--r-- 1 root wheel 0 Nov 12 12:39 file.txt + * chmod +a "root allow execute" file.txt + * ZFS: -rwxr--r--+ 1 root wheel 0 Nov 12 12:39 file.txt + * HFS: -rw-r--r--+ 1 root wheel 0 Nov 12 12:39 file.txt + * 0: user:root allow execute + */ + if (entry_type == ACE_OWNER +#ifndef _WIN32 + || (entry_type == 0 && who == fuid) +#endif + ) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP +#ifndef _WIN32 + || (entry_type == ACE_IDENTIFIER_GROUP && who == fgid) +#endif + ) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; + } + } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. 
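+ *
+ * In effect this drives the ZFS_NO_EXECS_DENIED hint: for a trivial
+ * 0644 ACL nothing ever grants execute, so the flag is cleared and the
+ * fast path in zfs_fastaccesschk_execute() later in this file cannot
+ * short-circuit, while a plain 0755 trivial ACL leaves the flag set.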
+ */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + *pflags &= ~ZFS_NO_EXECS_DENIED; + else + *pflags |= ZFS_NO_EXECS_DENIED; + + return (mode); +} + +/* + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. + */ +static int +zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, + boolean_t will_modify) +{ + zfs_acl_t *aclp; + int aclsize = 0; + int acl_count = 0; + zfs_acl_node_t *aclnode; + zfs_acl_phys_t znode_acl; + int version; + int error; + boolean_t drop_lock = B_FALSE; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + + /* + * close race where znode could be upgrade while trying to + * read the znode attributes. + * + * But this could only happen if the file isn't already an SA + * znode + */ + if (!zp->z_is_sa && !have_lock) { + mutex_enter(&zp->z_lock); + drop_lock = B_TRUE; + } + version = zfs_znode_acl_version(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; + } + + aclp = zfs_acl_alloc(version); + + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + + if (error != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + goto done; + } + + list_insert_head(&aclp->z_acl, aclnode); + + *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; +done: + if (drop_lock) + mutex_exit(&zp->z_lock); + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int +zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACLTYPE_POSIXACL) + return (0); + + ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error == 0 && aclp->z_acl_count > 0) + zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, zp->z_uid, zp->z_gid); + + /* + * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL + * nor a DACL_ACES SA in which case ENOENT is returned from + * zfs_acl_node_read() when the SA can't be located. + * Allow chown/chgrp to succeed in these cases rather than + * returning an error that makes no sense in the context of + * the caller. + */ + if (error == ENOENT) + return (0); + + return (error); +} + +/* + * common code for setting ACLs. 
+ * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) +{ + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + zfs_acl_phys_t acl_phys; + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); + + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + /* + * Upgrade needed? + */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp, cr); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. + */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_OLD_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. 
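+ * That is, for a version 0 ACL the on-disk z_acl_size field carries
+ * the ACE count and z_acl_count carries the byte size;
+ * zfs_acl_znode_info() above reads the count back out of z_acl_size
+ * and recomputes the byte size with ZFS_ACL_SIZE().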
+ */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; + } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); + } + + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; + +// zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); +} + +static void +zfs_acl_chmod(umode_t umode, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) +{ + void *acep = NULL; + uint64_t who; + int new_count, new_bytes; + int ace_size; + int entry_type; + uint16_t iflags, type; + uint32_t access_mask; + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops->ace_abstract_size(); + void *zacep; + boolean_t isdir; + trivial_acl_t masks; + + new_count = new_bytes = 0; + + isdir = S_ISDIR(umode); + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + entry_type = (iflags & ACE_TYPE_FLAGS); + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. + */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or @everyone + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; + } + + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + } else { + /* + * Limit permissions granted by ACEs to be no greater + * than permissions of the requested group mode. + * Applies when the "aclmode" property is set to + * "groupmask". 
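+ *
+ * Illustration (hypothetical entry): with a requested mode of 0750 the
+ * group mask contains no write bits, so an ALLOW entry granting
+ * read_data|write_data to some user loses write_data here, while DENY
+ * and inherit-only entries pass through untouched.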
+ */ + if ((type == ALLOW) && trim) + access_mask &= masks.group; + } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops->ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error = 0; + + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(zp->z_mode, mode, B_TRUE, + (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Should ACE be inherited? + */ +static int +zfs_ace_can_use(umode_t umode, uint16_t acep_flags) +{ + int iflags = (acep_flags & 0xf); + + if (S_ISDIR(umode) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!S_ISDIR((umode) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(zfsvfs_t *zfsvfs, zfs_acl_t *paclp, + uint64_t umode, boolean_t *need_chmod) +{ + void *pacep = NULL; + void *acep; + zfs_acl_node_t *aclnode; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2 = NULL; + size_t data1sz, data2sz = 0; + uint_t aclinherit; + boolean_t isdir = S_ISDIR(umode); + boolean_t islnk = S_ISLNK(umode); + boolean_t isreg = S_ISREG(umode); + + *need_chmod = B_TRUE; + + aclp = zfs_acl_alloc(paclp->z_version); + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || islnk) + return (aclp); + + while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type))) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(umode, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. 
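+ * (Roughly: with aclinherit=passthrough, a directory whose owner@,
+ * group@ and everyone@ entries carry file_inherit hands those entries
+ * straight to a new file, so the caller skips re-applying the create
+ * mode through zfs_acl_chmod().)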
+ */ + if ((aclinherit == ZFS_ACL_PASSTHROUGH || + aclinherit == ZFS_ACL_PASSTHROUGH_X) && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == OWNING_GROUP)) && + (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((umode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + + /* + * Strip write_acl and write_owner from permissions + * when inheriting an ACE + */ + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; + } + + ace_size = aclp->z_ops->ace_size(pacep); + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops->ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops->ace_flags_get(acep); + + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + continue; + } + + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; + + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } else { + newflags &= ~ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + * Also, create FUIDs for owner and group. + */ +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) +{ + int error; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_acl_t *paclp = NULL; + gid_t gid; + boolean_t need_chmod = B_TRUE; + boolean_t trim = B_FALSE; + boolean_t inherited = B_FALSE; + + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); + + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); + /* + * Determine uid and gid. 
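+ *
+ * Sketch of the common path (not specific to this port): a file
+ * created in a directory with the set-GID bit set takes the
+ * directory's group (the dzp->z_gid branch below); otherwise the group
+ * comes from the credential, or from vap->va_gid when AT_GID is set
+ * and the caller is a member of that group or has the privilege to
+ * assign it.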
+ */ + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + } else { + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; + if (vap->va_mask & AT_GID) { + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + if (acl_ids->z_fgid != dzp->z_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + acl_ids->z_fgid = 0; + } + if (acl_ids->z_fgid == 0) { + if (dzp->z_mode & S_ISGID) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, + cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = zfs_fuid_idx_domain( + &zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, + FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } + } else { + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); +#ifdef __FreeBSD__ + gid = acl_ids->z_fgid = dzp->z_gid; +#else + gid = crgetgid(cr); +#endif + } + } + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set in on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; + } else { + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); + mutex_enter(&dzp->z_lock); + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; + } else { + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + mutex_exit(&dzp->z_lock); + mutex_exit(&dzp->z_acl_lock); + + if (need_chmod) { + if (vap->va_type == VDIR) + acl_ids->z_aclp->z_hints |= + ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, + trim, acl_ids->z_aclp); + } + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + + return (0); +} + +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} + +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, 
zfs_acl_ids_t *acl_ids) +{ + return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || + zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); +} + +/* + * Retrieve a file's ACL + */ +int +zfs_getacl(znode_t *zp, struct kauth_acl **aclpp, boolean_t skipaclcheck, + cred_t *cr) +{ + zfs_acl_t *aclp; + kauth_acl_t *k_acl; + uint32_t ace_flags = 0; + kauth_ace_rights_t *rights = 0; + guid_t *guidp; + uint64_t who; + uint32_t access_mask; + uint16_t flags; + uint16_t type; + int i; + int error; + void *zacep = NULL; + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_TRUE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } +// if ((k_acl = kauth_acl_alloc(aclp->z_acl_count)) == NULL) { +// mutex_exit(&zp->z_acl_lock); +// *aclpp = (kauth_acl_t *) KAUTH_FILESEC_NONE; +// return (ENOMEM); +// } + + dprintf("acl_count %d\n",aclp->z_acl_count); + +// k_acl->acl_entrycount = aclp->z_acl_count; +// k_acl->acl_flags = 0; +#if 0 + *aclpp = k_acl; + + /* + * Translate Open Solaris ACEs to Mac OS X ACLs + */ + i = 0; + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &flags, &type))) { + rights = 0; + ace_flags = 0; + +// guidp = &k_acl->acl_ace[i].ace_applicable; + + if (flags & ACE_OWNER) { +#if HIDE_TRIVIAL_ACL + continue; +#endif + who = -1; +// nfsacl_set_wellknown(KAUTH_WKG_OWNER, guidp); + } else if ((flags & OWNING_GROUP) == OWNING_GROUP) { +#if HIDE_TRIVIAL_ACL + continue; +#endif + who = -1; + // nfsacl_set_wellknown(KAUTH_WKG_GROUP, guidp); + } else if (flags & ACE_EVERYONE) { +#if HIDE_TRIVIAL_ACL + continue; +#endif + who = -1; + // nfsacl_set_wellknown(KAUTH_WKG_EVERYBODY, guidp); + /* Try to get a guid from our uid */ + } else { + + dprintf("ZFS: trying to map uid %d flags %x type %x\n", who, flags, + type); + + if (flags & OWNING_GROUP) { +// if (kauth_cred_gid2guid(who, guidp) == 0) { +// dprintf("ZFS: appears to be a group\n"); +// } +// } else if (kauth_cred_uid2guid(who, guidp) == 0) { +// dprintf("ZFS: appears to be a user\n"); + } else { + dprintf("ZFS: Unable to map\n"); + bzero(guidp, sizeof (guid_t)); + } + } + + //access_mask = aclp->z_acl[i].a_access_mask; + if (access_mask & ACE_READ_DATA) + rights |= KAUTH_VNODE_READ_DATA; + if (access_mask & ACE_WRITE_DATA) + rights |= KAUTH_VNODE_WRITE_DATA; + if (access_mask & ACE_APPEND_DATA) + rights |= KAUTH_VNODE_APPEND_DATA; + if (access_mask & ACE_READ_NAMED_ATTRS) + rights |= KAUTH_VNODE_READ_EXTATTRIBUTES; + if (access_mask & ACE_WRITE_NAMED_ATTRS) + rights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; + if (access_mask & ACE_EXECUTE) + rights |= KAUTH_VNODE_EXECUTE; + if (access_mask & ACE_DELETE_CHILD) + rights |= KAUTH_VNODE_DELETE_CHILD; + if (access_mask & ACE_READ_ATTRIBUTES) + rights |= KAUTH_VNODE_READ_ATTRIBUTES; + if (access_mask & ACE_WRITE_ATTRIBUTES) + rights |= KAUTH_VNODE_WRITE_ATTRIBUTES; + if (access_mask & ACE_DELETE) + rights |= KAUTH_VNODE_DELETE; + if (access_mask & ACE_READ_ACL) + rights |= KAUTH_VNODE_READ_SECURITY; + if (access_mask & ACE_WRITE_ACL) + rights |= KAUTH_VNODE_WRITE_SECURITY; + if (access_mask & ACE_WRITE_OWNER) + rights |= KAUTH_VNODE_TAKE_OWNERSHIP; + if (access_mask & ACE_SYNCHRONIZE) + rights |= KAUTH_VNODE_SYNCHRONIZE; + k_acl->acl_ace[i].ace_rights = rights; + + //flags = aclp->z_acl[i].a_flags; + if (flags & ACE_FILE_INHERIT_ACE) + ace_flags |= KAUTH_ACE_FILE_INHERIT; + if (flags & ACE_DIRECTORY_INHERIT_ACE) + ace_flags |= KAUTH_ACE_DIRECTORY_INHERIT; + if (flags & ACE_NO_PROPAGATE_INHERIT_ACE) + ace_flags |= 
KAUTH_ACE_LIMIT_INHERIT; + if (flags & ACE_INHERIT_ONLY_ACE) + ace_flags |= KAUTH_ACE_ONLY_INHERIT; + + //type = aclp->z_acl[i].a_type; + switch(type) { + case ACE_ACCESS_ALLOWED_ACE_TYPE: + ace_flags |= KAUTH_ACE_PERMIT; + break; + case ACE_ACCESS_DENIED_ACE_TYPE: + ace_flags |= KAUTH_ACE_DENY; + break; + case ACE_SYSTEM_AUDIT_ACE_TYPE: + ace_flags |= KAUTH_ACE_AUDIT; + break; + case ACE_SYSTEM_ALARM_ACE_TYPE: + ace_flags |= KAUTH_ACE_ALARM; + break; + } + k_acl->acl_ace[i].ace_flags = ace_flags; + i++; + } + k_acl->acl_entrycount = i; +#endif + + mutex_exit(&zp->z_acl_lock); + + zfs_acl_free(aclp); + + return (0); +} + +int +zfs_addacl_trivial(znode_t *zp, ace_t *aces, int *nentries, int seen_type) +{ + zfs_acl_t *aclp; + uint64_t who; + uint32_t access_mask; + uint16_t flags; + uint16_t type; + int i; + int error; + void *zacep = NULL; + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_TRUE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + dprintf("ondisk acl_count %d\n",aclp->z_acl_count); + + // Start at the end + i = *nentries; + + /* + * Translate Open Solaris ACEs to Mac OS X ACLs + */ + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &flags, &type))) { + + if (flags & ACE_OWNER) { + if (seen_type & ACE_OWNER) continue; + seen_type |= ACE_OWNER; + who = -1; + } else if ((flags & OWNING_GROUP) == OWNING_GROUP) { + if (seen_type & ACE_GROUP) continue; + seen_type |= ACE_GROUP; + who = -1; + } else if (flags & ACE_EVERYONE) { + if (seen_type & ACE_EVERYONE) continue; + seen_type |= ACE_EVERYONE; + who = -1; + /* Try to get a guid from our uid */ + } else { + + // Only deal with the trivials + continue; + + } + + aces[i].a_who = who; + aces[i].a_access_mask = access_mask; + aces[i].a_flags = flags; + aces[i].a_type = type; + + dprintf("zfs: adding entry %d for type %x sizeof %d\n", i, type, + sizeof(aces[i])); + i++; + } + + *nentries=i; + mutex_exit(&zp->z_acl_lock); + + zfs_acl_free(aclp); + + return (0); +} + + +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode, + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (SET_ERROR(EINVAL)); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size, fuidp, cr)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + 
return (0); +} + + + +/* + * Set a file's ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + //ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; + uint64_t acl_obj; + + // Anyone remember why we commented this out? + //if (mask == 0) + // return (ENOSYS); + + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); + + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, vnode_vtype(ZTOV(zp)), vsecp, cr, &fuidp, + &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. + */ + //if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + // aclp->z_hints |= + // (zp->z_pflags & V4_ACL_WIDE_FLAGS); + //} +top: + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_NOWAIT); + if (error) { + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_acl_free(aclp); + return (error); + } + + error = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); + zp->z_acl_cached = aclp; + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + + +/* + * Check accesses of interest (AoI) against attributes of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. + */ +static int +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) +{ + if ((v4_mode & WRITE_MASK) && + (vfs_isrdonly(zp->z_zfsvfs->z_vfs)) && + (!IS_DEVVP(ZTOV(zp)) || + (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { + return (SET_ERROR(EROFS)); + } + + /* + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common(). + */ + if ((v4_mode & WRITE_MASK_DATA) && + (zp->z_pflags & ZFS_IMMUTABLE)) { + return (EPERM); + } +#ifdef sun + if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && + (zp->z_pflags & ZFS_NOUNLINK)) { + return (EPERM); +#else + /* + * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK + * (sunlnk) is set. We just don't allow directory removal, which is + * handled in zfs_zaccess_delete(). 
+ */ + if ((v4_mode & ACE_DELETE) && + (zp->z_pflags & ZFS_NOUNLINK)) { + return (SET_ERROR(EPERM)); + } +#endif + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_pflags & ZFS_AV_QUARANTINED))) { + return (SET_ERROR(EACCES)); + } + + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. Returns: + * 0 if all AoI granted + * EACCES if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. + */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + ASSERT(zp->z_acl_cached); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + uint32_t mask_matched; + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (vnode_isdir(ZTOV(zp)) && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; + + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != IDMAP_WK_CREATOR_OWNER_UID && + uid == newid) + checkit = B_TRUE; + break; + } else { + mutex_exit(&zp->z_acl_lock); + return (SET_ERROR(EIO)); + } + } + + if (checkit) { + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, 
mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } + } + *working_mode &= ~mask_matched; + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (SET_ERROR(EACCES)); + } else if (*working_mode) { + return (-1); + } + + return (0); +} + +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. + */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + /* + * Note: ZFS_READONLY represents the "DOS R/O" attribute. + * When that flag is set, we should behave as if write access + * were not granted by anything in the ACL. In particular: + * We _must_ allow writes after opening the file r/w, then + * setting the DOS R/O attribute, and writing some more. + * (Similar to how you can write after fchmod(fd, 0444).) + * + * Therefore ZFS_READONLY is ignored in the dataset check + * above, and checked here as if part of the ACL check. + * Also note: DOS R/O is ignored for directories. 
+ */ + if ((v4_mode & WRITE_MASK_DATA) && + !vnode_isdir(ZTOV(zp)) && + (zp->z_pflags & ZFS_READONLY)) { + return (SET_ERROR(EPERM)); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (SET_ERROR(EACCES)); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t uid = crgetuid(cr); + int error; + + if (zdp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + is_attr = ((zdp->z_pflags & ZFS_XATTR) && + (vnode_isdir(ZTOV(zdp)))); + if (is_attr) + goto slow; + + + mutex_enter(&zdp->z_acl_lock); + + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + + if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + + if (uid == zdp->z_uid) { + owner = B_TRUE; + if (zdp->z_mode & S_IXUSR) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (groupmember(zdp->z_gid, cr)) { + groupmbr = B_TRUE; + if (zdp->z_mode & S_IXGRP) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_mode & S_IXOTH) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + } + + mutex_exit(&zdp->z_acl_lock); + +slow: + DTRACE_PROBE(zfs__fastpath__execute__access__miss); + ZFS_ENTER(zdp->z_zfsvfs); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + ZFS_EXIT(zdp->z_zfsvfs); + return (error); +} + +/* + * Determine whether Access should be granted/denied. + * + * The least priv subsytem is always consulted as a basic privilege + * can define any form of access. + */ +int +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +{ + uint32_t working_mode; + int error; + int is_attr; + boolean_t check_privs; + znode_t *xzp = NULL; + znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; + + is_attr = ((zp->z_pflags & ZFS_XATTR) && (vnode_isdir(ZTOV(zp)))); + +#ifdef _WIN32 + /* + * In FreeBSD, we don't care about permissions of individual ADS. + * Note that not checking them is not just an optimization - without + * this shortcut, EA operations may bogusly fail with EACCES. + */ + if (zp->z_pflags & ZFS_XATTR) + return (0); +#else + /* + * If attribute then validate against base file + */ + if (is_attr) { + uint64_t parent; + + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zp->z_zfsvfs), &parent, + sizeof (parent))) != 0) + return (error); + + /* + * Cache the lookup on the parent file znode as + * zp->z_xattr_parent and hold a reference. This + * effectively pins the parent in memory until all + * child xattr znodes have been destroyed and + * release their references in zfs_inode_destroy(). 
+ */ + error = zfs_zget(zp->z_zfsvfs, parent, &check_zp); + if (error) + return (error); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + if (zp->z_xattr_parent == NULL) + zp->z_xattr_parent = check_zp; + rw_exit(&zp->z_xattr_lock); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } +#endif + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + /* + * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC + * in needed_bits. Map the bits mapped by working_mode (currently + * missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. + */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VWRITE; + if (working_mode & ACE_EXECUTE) + needed_bits |= VEXEC; + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits)); + } + + if (error && !check_privs) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + mode_t checkmode = 0; + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if (working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner, + needed_bits & ~checkmode, needed_bits); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(ZTOV(check_zp), cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. 
If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = SET_ERROR(EACCES); + } + } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits); + } + + + if (is_attr) + VN_RELE(ZTOV(xzp)); + + return (error); +} + +/* + * Translate traditional unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +{ + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); +} + +/* + * Access function for secpolicy_vnode_setattr + */ +int +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); +} + +/* See zfs_zaccess_delete() */ +uint64_t zfs_write_implies_delete_child = 1; + +/* + * Determine whether delete access should be granted. + * + * The following chart outlines how we handle delete permissions which is + * how recent versions of windows (Windows 2008) handles it. The efficiency + * comes from not having to check the parent ACL where the object itself grants + * delete: + * + * ------------------------------------------------------- + * | Parent Dir | Target Object Permissions | + * | permissions | | + * ------------------------------------------------------- + * | | ACL Allows | ACL Denies| Delete | + * | | Delete | Delete | unspecified| + * ------------------------------------------------------- + * | ACL Allows | Permit | Deny * | Permit | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL specifies | | | | + * | only allow | Permit | Deny * | Permit | + * | write and | | | | + * | execute | | | | + * ------------------------------------------------------- + * | ACL denies | | | | + * | write and | Permit | Deny | Deny | + * | execute | | | | + * ------------------------------------------------------- + * ^ + * | + * Re. execute permission on the directory: if that's missing, + * the vnode lookup of the target will fail before we get here. + * + * Re [*] in the table above: NFSv4 would normally Permit delete for + * these two cells of the matrix. + * See acl.h for notes on which ACE_... flags should be checked for which + * operations. Specifically, the NFSv4 committee recommendation is in + * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs + * should take precedence ahead of ALLOW ACEs. + * + * This implementation always consults the target object's ACL first. + * If a DENY ACE is present on the target object that specifies ACE_DELETE, + * delete access is denied. If an ALLOW ACE with ACE_DELETE is present on + * the target object, access is allowed. If and only if no entries with + * ACE_DELETE are present in the object's ACL, check the container's ACL + * for entries with ACE_DELETE_CHILD. + * + * A summary of the logic implemented from the table above is as follows: + * + * First check for DENY ACEs that apply. + * If either target or container has a deny, EACCES. + * + * Delete access can then be summarized as follows: + * 1: The object to be deleted grants ACE_DELETE, or + * 2: The containing directory grants ACE_DELETE_CHILD. + * In a Windows system, that would be the end of the story. + * In this system, (2) has some complications... 
+ * 2a: "sticky" bit on a directory adds restrictions, and + * 2b: existing ACEs from previous versions of ZFS may + * not carry ACE_DELETE_CHILD where they should, so we + * also allow delete when ACE_WRITE_DATA is granted. + * + * Note: 2b is technically a work-around for a prior bug, + * which hopefully can go away some day. For those who + * no longer need the work around, and for testing, this + * work-around is made conditional via the tunable: + * zfs_write_implies_delete_child + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + uint32_t wanted_dirperms; + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; + int dzp_error, zp_error; + boolean_t dzpcheck_privs; + boolean_t zpcheck_privs; + + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (SET_ERROR(EPERM)); + + /* + * Case 1: + * If target object grants ACE_DELETE then we are done. This is + * indicated by a return value of 0. For this case we don't worry + * about the sticky bit because sticky only applies to the parent + * directory and this is the child access result. + * + * If we encounter a DENY ACE here, we're also done (EACCES). + * Note that if we hit a DENY ACE here (on the target) it should + * take precedence over a DENY ACE on the container, so that when + * we have more complete auditing support we will be able to + * report an access failure against the specific target. + * (This is part of why we're checking the target first.) + */ + zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr); + if (zp_error == EACCES) { + /* We hit a DENY ACE. */ + if (!zpcheck_privs) + return (SET_ERROR(zp_error)); + + return (secpolicy_vnode_remove(ZTOV(zp), cr)); + } + if (zp_error == 0) + return (0); + + /* + * Case 2: + * If the containing directory grants ACE_DELETE_CHILD, + * or we're in backward compatibility mode and the + * containing directory has ACE_WRITE_DATA, allow. + * Case 2b is handled with wanted_dirperms. + */ + wanted_dirperms = ACE_DELETE_CHILD; + if (zfs_write_implies_delete_child) + wanted_dirperms |= ACE_WRITE_DATA; + dzp_error = zfs_zaccess_common(dzp, wanted_dirperms, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + if (dzp_error == EACCES) { + /* We hit a DENY ACE. */ + if (!dzpcheck_privs) + return (SET_ERROR(dzp_error)); + return (secpolicy_vnode_remove(ZTOV(zp), cr)); + } + + /* + * Cases 2a, 2b (continued) + * + * Note: dzp_working_mode now contains any permissions + * that were NOT granted. Therefore, if any of the + * wanted_dirperms WERE granted, we will have: + * dzp_working_mode != wanted_dirperms + * We're really asking if ANY of those permissions + * were granted, and if so, grant delete access. + */ + if (dzp_working_mode != wanted_dirperms) + dzp_error = 0; + + /* + * dzp_error is 0 if the container granted us permissions to "modify". + * If we do not have permission via one or more ACEs, our current + * privileges may still permit us to modify the container. + * + * dzpcheck_privs is false when i.e. the FS is read-only. + * Otherwise, do privilege checks for the container. + */ + if (dzp_error != 0 && dzpcheck_privs) { + uid_t owner; + /* + * The secpolicy call needs the requested access and + * the current access mode of the container, but it + * only knows about Unix-style modes (VEXEC, VWRITE), + * so this must condense the fine-grained ACE bits into + * Unix modes. 
+ * + * The VEXEC flag is easy, because we know that has + * always been checked before we get here (during the + * lookup of the target vnode). The container has not + * granted us permissions to "modify", so we do not set + * the VWRITE flag in the current access mode. + */ + owner = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, + ZFS_OWNER); + dzp_error = secpolicy_vnode_access2(cr, ZTOV(dzp), + owner, VEXEC, VWRITE|VEXEC); + } + if (dzp_error != 0) { + /* + * Note: We may have dzp_error = -1 here (from + * zfs_zacess_common). Don't return that. + */ + return (SET_ERROR(EACCES)); + } + + /* + * At this point, we know that the directory permissions allow + * us to modify, but we still need to check for the additional + * restrictions that apply when the "sticky bit" is set. + * + * Yes, zfs_sticky_remove_access() also checks this bit, but + * checking it here and skipping the call below is nice when + * you're watching all of this with dtrace. + */ + if ((dzp->z_mode & S_ISVTX) == 0) + return (0); + /* + * zfs_sticky_remove_access will succeed if: + * 1. The sticky bit is absent. + * 2. We pass the sticky bit restrictions. + * 3. We have privileges that always allow file removal. + */ + return (zfs_sticky_remove_access(dzp, zp, cr)); +} + +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + if (szp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + add_perm = (vnode_isdir(ZTOV(szp))) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are combination of delete permission + + * add file/subdir permission. + * + * BSD operating systems also require write permission + * on the directory being moved from one parent directory + * to another. + */ + if (vnode_isdir(ZTOV(szp)) && ZTOV(sdzp) != ZTOV(tdzp)) { + if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr))) + return (error); + } + + /* + * first make sure we do the delete portion. + * + * If that succeeds then check for add_file/add_subdir permissions + */ + + if ((error = zfs_zaccess_delete(sdzp, szp, cr))) + return (error); + + /* + * If we have a tzp, see if we can delete it? + */ + if (tzp) { + if ((error = zfs_zaccess_delete(tdzp, tzp, cr))) + return (error); + } + + /* + * Now check for add permissions + */ + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + + return (error); +} diff --git a/module/os/windows/zfs/zfs_ctldir.c b/module/os/windows/zfs/zfs_ctldir.c new file mode 100644 index 000000000000..4d87a734f24a --- /dev/null +++ b/module/os/windows/zfs/zfs_ctldir.c @@ -0,0 +1,2126 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2011 Pawel Jakub Dawidek .
+ * All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ * Rohan Puri
+ * Brian Behlendorf
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ *
+ * Rewritten for OSX (based on FreeBSD) by:
+ * Jorgen Lundman
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future. The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ *     ctldir ------> snapshotdir -------> snapshot
+ *                                             |
+ *                                             |
+ *                                             V
+ *                                         mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by a user-mode helper which invokes
+ * the mount procedure. Unmounts are handled by allowing the mount
+ * point to expire so the kernel may automatically unmount it.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
+ * zfsvfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
+ * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
+ * However, vnodes within these mounted file systems have their v_vfsp
+ * fields set to the head filesystem to make NFS happy (see
+ * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
+ * so that it cannot be freed until all snapshots have been unmounted.
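+ *
+ * A rough sketch of the flow on this port, based on the code below (not a
+ * normative description): a lookup under '.zfs/snapshot' creates a covered
+ * GFS vnode via zfsctl_snapshot_mknode(), and the snapshot is then mounted
+ * on top of that vnode by the userland helper, since the kernel side does
+ * not perform the mount itself here.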
+ */ + +#include +#include +#include +#include +#include +#include +#include +//#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zfs_namecheck.h" + +//#define dprintf printf + +/* + * OSX FreeBSD + * ------- --------- + * Short: iocount usecount + * Long: usecount holdcount + * incr: vnode_get vget + * decr: vnode_put vrele / vput (vput unlocks and vrele) + * + */ + + +//typedef struct vnodeopv_entry_desc vop_vector; +#define vop_vector vnodeopv_entry_desc + +typedef struct zfsctl_node { + gfs_dir_t zc_gfs_private; + uint64_t zc_id; + timestruc_t zc_cmtime; /* ctime and mtime, always the same */ +} zfsctl_node_t; + +typedef struct zfsctl_snapdir { + zfsctl_node_t sd_node; + kmutex_t sd_lock; + avl_tree_t sd_snaps; +} zfsctl_snapdir_t; + +typedef struct { + char *se_name; + struct vnode *se_root; + avl_node_t se_node; +} zfs_snapentry_t; + +static int +snapentry_compare(const void *a, const void *b) +{ + const zfs_snapentry_t *sa = a; + const zfs_snapentry_t *sb = b; + int ret = strcmp(sa->se_name, sb->se_name); + + if (ret < 0) + return (-1); + else if (ret > 0) + return (1); + else + return (0); +} + +<<<<<<< HEAD:ZFSin/zfs/module/zfs/zfs_ctldir.c +#ifdef sun +vnodeops_t *zfsctl_ops_root; +vnodeops_t *zfsctl_ops_snapdir; +vnodeops_t *zfsctl_ops_snapshot; +vnodeops_t *zfsctl_ops_shares; +vnodeops_t *zfsctl_ops_shares_dir; + +static const fs_operation_def_t zfsctl_tops_root[]; +static const fs_operation_def_t zfsctl_tops_snapdir[]; +static const fs_operation_def_t zfsctl_tops_snapshot[]; +static const fs_operation_def_t zfsctl_tops_shares[]; +#endif /* !sun */ +#ifdef __FreeBSD__ +static struct vop_vector zfsctl_ops_root; +static struct vop_vector zfsctl_ops_snapdir; +static struct vop_vector zfsctl_ops_snapshot; +static struct vop_vector zfsctl_ops_shares; +static struct vop_vector zfsctl_ops_shares_dir; +#endif /* !sun */ +#ifdef _WIN32 +======= +#ifdef __APPLE__ +>>>>>>> b3b9184f... Snapshot mounts fail on second mount:module/zfs/zfs_ctldir.c +struct vnodeopv_desc zfsctl_ops_root; +struct vnodeopv_desc zfsctl_ops_snapdir; +struct vnodeopv_desc zfsctl_ops_snapshot; +#endif + +static struct vnode *zfsctl_mknode_snapdir(struct vnode *); +//static struct vnode *zfsctl_mknode_shares(struct vnode *); +static struct vnode *zfsctl_snapshot_mknode(struct vnode *, uint64_t objset); +static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); + + +/* + * Root directory elements. We only have two entries + * snapshot and shares. + */ +static gfs_dirent_t zfsctl_root_entries[] = { + { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, +#ifndef _WIN32 + { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE }, +#endif + { NULL } +}; + +/* include . and .. in the calculation */ +#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \ + sizeof (gfs_dirent_t)) + 1) + +int (**zfsctl_ops_root_dvnodeops) (void *); +int (**zfsctl_ops_snapdir_dvnodeops) (void *); +int (**zfsctl_ops_snapshot_dvnodeops) (void *); + +#define LK_EXCLUSIVE 0 + +int +traverse(struct vnode **cvpp, int lktype) +{ + struct vnode *cvp; + struct vnode *tvp; + vfs_t *vfsp; + int error; + int loop = 0; + + dprintf("+traverse\n"); + + cvp = *cvpp; + tvp = NULL; + + /* + * If this vnode is mounted on, then we transparently indirect + * to the vnode which is the root of the mounted file system. + * Before we do this we must check that an unmount is not in + * progress on this vnode. + */ + + for (;;) { + /* + * Reached the end of the mount chain? 
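+		 * (vnode_mountedhere() returns NULL once nothing is mounted
+		 * on cvp, which is what terminates this loop.)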
+ */ + vfsp = vnode_mountedhere(cvp); + if (vfsp == NULL) + break; + error = vfs_busy(vfsp, 0); + /* + * tvp is NULL for *cvpp vnode, which we can't unlock. + */ + VN_RELE(cvp); + + if (error) + return (error); + + /* + * The read lock must be held across the call to VFS_ROOT() to + * prevent a concurrent unmount from destroying the vfs. + */ + error = VFS_ROOT(vfsp, lktype, &tvp); + vfs_unbusy(vfsp); + if (error != 0) + return (error); + + cvp = tvp; + + if (loop++>5) { + dprintf("loop detected, abort\n"); + break; + } + + } + + dprintf("-traverse\n"); + *cvpp = cvp; + return (0); +} + + + + +/* + * Initialize the various GFS pieces we'll need to create and manipulate .zfs + * directories. This is called from the ZFS init routine, and initializes the + * vnode ops vectors that we'll be using. + */ +void +zfsctl_init(void) +{ +#ifdef sun + VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0); +#endif +} + +void +zfsctl_fini(void) +{ +#ifdef sun + /* + * Remove vfsctl vnode ops + */ + if (zfsctl_ops_root) + vn_freevnodeops(zfsctl_ops_root); + if (zfsctl_ops_snapdir) + vn_freevnodeops(zfsctl_ops_snapdir); + if (zfsctl_ops_snapshot) + vn_freevnodeops(zfsctl_ops_snapshot); + if (zfsctl_ops_shares) + vn_freevnodeops(zfsctl_ops_shares); + if (zfsctl_ops_shares_dir) + vn_freevnodeops(zfsctl_ops_shares_dir); + + zfsctl_ops_root = NULL; + zfsctl_ops_snapdir = NULL; + zfsctl_ops_snapshot = NULL; + zfsctl_ops_shares = NULL; + zfsctl_ops_shares_dir = NULL; +#endif /* sun */ +} + +boolean_t +zfsctl_is_node(struct vnode *vp) +{ + if (vnode_tag(vp) == VT_OTHER) + return B_TRUE; + return B_FALSE; +} + +/* + * Return the inode number associated with the 'snapshot' or + * 'shares' directory. + */ +/* ARGSUSED */ +static ino64_t +zfsctl_root_inode_cb(struct vnode *vp, int index) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp)); + + ASSERT(index <= 2); + + if (index == 0) + return (ZFSCTL_INO_SNAPDIR); + + return (zfsvfs->z_shares_dir); +} + +/* + * Create the '.zfs' directory. This directory is cached as part of the VFS + * structure. This results in a hold on the vfs_t. The code in zfs_umount() + * therefore checks against a vfs_count of 2 instead of 1. This reference + * is removed when the ctldir is destroyed in the unmount. + */ +void +zfsctl_create(zfsvfs_t *zfsvfs) +{ + struct vnode *vp = NULL; //, *rvp = NULL; + zfsctl_node_t *zcp; +// uint64_t crtime[2]; + + + ASSERT(zfsvfs->z_ctldir == NULL); + + dprintf("zfsctl_create\n"); + + /* + * This creates a vnode with VROOT set, this is so that unmount's + * vflush() (called before our vfs_unmount) will pass (and not block + * waiting for the usercount ref to be released). We then release the + * VROOT vnode in zfsctl_destroy, and release the usercount ref. + */ + + vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs, + zfsctl_ops_root_dvnodeops, + ZFSCTL_INO_ROOT, zfsctl_root_entries, + zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL); + + zcp = vnode_fsnode(vp); + zcp->zc_id = ZFSCTL_INO_ROOT; + +#ifndef __APPLE__ + VERIFY(VFS_ROOT(zfsvfs->z_vfs, 0, &rvp) == 0); + VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + &crtime, sizeof (crtime))); + ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime); + VN_RELE(rvp); +#endif + +#ifdef __LINUX__ + /* + * We're only faking the fact that we have a root of a filesystem for + * the sake of the GFS interfaces. Undo the flag manipulation it did + * for us. 
+ */ + vp->v_vflag &= ~VV_ROOT; +#endif + /* In OSX we mark the node VSYSTEM instead */ + + zfsvfs->z_ctldir = vp; + + /* + * Explicitely hold a usecount (not iocount) reference here, so that + * .zfs is hold until unmount is called + */ + vnode_ref(zfsvfs->z_ctldir); // Hold an usecount ref + + VN_RELE(zfsvfs->z_ctldir); // release iocount ref(vnode_get/vnode_create) +} + + +/* + * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. + * There might still be more references if we were force unmounted, but only + * new zfs_inactive() calls can occur and they don't reference .zfs + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + struct vnode *vp; + + dprintf("zfsctl: releasing rootvp %p\n", zfsvfs->z_ctldir); + vp = zfsvfs->z_ctldir; + zfsvfs->z_ctldir = NULL; + if (vp && !vnode_getwithref(vp)) { + + /* + * Finally release the vnode_ref held in zfsctl_create() + */ + vnode_rele(vp); + vnode_recycle(vp); + VN_RELE(vp); + } + +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +struct vnode * +zfsctl_root(znode_t *zp) +{ + ASSERT(zfs_has_ctldir(zp)); + dprintf("zfsctl_root hold\n"); + VN_HOLD(zp->z_zfsvfs->z_ctldir); + return (zp->z_zfsvfs->z_ctldir); +} + +/* + * Common open routine. Disallow any write access. + */ +/* ARGSUSED */ +static int +zfsctl_common_open(struct vnop_open_args *ap) +{ + int flags = ap->a_mode; + + dprintf("zfsctl_open: %p on %p\n", + ap->a_vp, vnode_mountedhere(ap->a_vp)); + + if (flags & FWRITE) + return (EACCES); + + return (0); +} + +/* + * Common close routine. Nothing to do here. + */ +/* ARGSUSED */ +static int +zfsctl_common_close(struct vnop_close_args *ap) +{ + return (0); +} + + + +/* + * Common access routine. Disallow writes. + */ +/* ARGSUSED */ +static int +zfsctl_common_access(ap) + struct vnop_access_args /* { + struct vnode *a_vp; + accmode_t a_accmode; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + int accmode = ap->a_action; + dprintf("zfsctl_access\n"); + +#ifdef TODO + if (flags & V_ACE_MASK) { + if (accmode & ACE_ALL_WRITE_PERMS) + return (EACCES); + } else { +#endif + if (accmode & VWRITE) + return (EACCES); +#ifdef TODO + } +#endif + + return (0); +} + +/* + * Common getattr function. Fill in basic information. + */ +static void +zfsctl_common_getattr(struct vnode *vp, vattr_t *vap) +{ + timestruc_t now; + + dprintf("zfsctl: +getattr: %p\n", + vp); + +#ifdef _WIN32 + VATTR_SET_SUPPORTED(vap, va_mode); + VATTR_SET_SUPPORTED(vap, va_type); + VATTR_SET_SUPPORTED(vap, va_uid); + VATTR_SET_SUPPORTED(vap, va_gid); + VATTR_SET_SUPPORTED(vap, va_data_size); + VATTR_SET_SUPPORTED(vap, va_total_size); + VATTR_SET_SUPPORTED(vap, va_data_alloc); + VATTR_SET_SUPPORTED(vap, va_total_alloc); + VATTR_SET_SUPPORTED(vap, va_access_time); + VATTR_SET_SUPPORTED(vap, va_dirlinkcount); + VATTR_SET_SUPPORTED(vap, va_flags); +#endif + + vap->va_dirlinkcount = 1; //directory hard links. + vap->va_nlink = 3; + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_rdev = 0; + /* + * We are a purely virtual object, so we have no + * blocksize or allocated blocks. + */ + // vap->va_blksize = 0; + vap->va_data_alloc = 512; + vap->va_total_alloc = 512; + vap->va_data_size = 2; // . + .. 
+	vap->va_total_size = 2;
+	vap->va_nblocks = 0;
+	//vap->va_seq = 0;
+	vap->va_gen = 0;
+
+	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+	    S_IROTH | S_IXOTH;
+	vap->va_type = VDIR;
+
+	if (VATTR_IS_ACTIVE(vap, va_nchildren) && vnode_isdir(vp)) {
+		VATTR_RETURN(vap, va_nchildren, vap->va_nlink - 2);
+	}
+	vap->va_iosize = 512;
+
+	/*
+	 * We live in the now (for atime).
+	 */
+	gethrestime(&now);
+	vap->va_atime = now;
+	/* FreeBSD: Reset chflags(2) flags. */
+	vap->va_flags = 0;
+
+	dprintf("zfsctl: -getattr\n");
+}
+
+#ifndef _WIN32
+/*ARGSUSED*/
+static int
+zfsctl_common_fid(ap)
+	struct vnop_fid_args /* {
+		struct vnode *a_vp;
+		struct fid *a_fid;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	fid_t *fidp = (void *)ap->a_fid;
+	zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp));
+	zfsctl_node_t *zcp = vnode_fsnode(vp);
+	uint64_t object = zcp->zc_id;
+	zfid_short_t *zfid;
+	int i;
+
+	ZFS_ENTER(zfsvfs);
+
+	fidp->fid_len = SHORT_FID_LEN;
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = SHORT_FID_LEN;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* .zfs znodes always have a generation number of 0 */
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = 0;
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+#endif
+
+/*ARGSUSED*/
+#ifndef _WIN32
+static int
+zfsctl_shares_fid(ap)
+	struct vop_fid_args /* {
+		struct vnode *a_vp;
+		struct fid *a_fid;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	fid_t *fidp = (void *)ap->a_fid;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	znode_t *dzp;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+
+	if (zfsvfs->z_shares_dir == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (ENOTSUP);
+	}
+
+	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+		error = VOP_FID(ZTOV(dzp), fidp);
+		VN_RELE(ZTOV(dzp));
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+#endif
+
+/*
+ * Gets the full dataset name that corresponds to the given snapshot name
+ * Example:
+ *	zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
+ */
+static int
+zfsctl_common_reclaim(ap)
+	struct vnop_reclaim_args /* {
+		struct vnode *a_vp;
+		struct thread *a_td;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	gfs_file_t *fp = vnode_fsnode(vp);
+
+	dprintf("zfsctl: +reclaim vp %p mountedon %p\n", vp,
+	    vnode_mountedhere(vp));
+
+	/*
+	 * Destroy the vm object and flush associated pages.
+	 */
+#ifdef _WIN32
+	/*
+	 * It would appear that Darwin does not guarantee that vnop_inactive is
+	 * always called, but reclaim is used instead. All release happens in here
+	 * and inactive callbacks are mostly empty.
+	 */
+	if (fp) {
+
+		if (fp->gfs_type == GFS_DIR)
+			gfs_dir_inactive(vp);
+		else
+			gfs_file_inactive(vp);
+
+		kmem_free(fp, fp->gfs_size);
+
+	}
+
+	vnode_removefsref(vp); /* ADDREF from vnode_create */
+	vnode_clearfsnode(vp); /* vp->v_data = NULL */
+
+#endif
+
+	dprintf("zfsctl: -reclaim vp %p\n", vp);
+	return (0);
+}
+
+#define	ZFSCTL_INO_SNAP(id)	(id)
+
+/*
+ * Get root directory attributes.
+ */ +/* ARGSUSED */ +static int +zfsctl_root_getattr(ap) + struct vnop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp)); + zfsctl_node_t *zcp = vnode_fsnode(vp); + + ZFS_ENTER(zfsvfs); +#ifdef _WIN32 + VATTR_SET_SUPPORTED(vap, va_modify_time); + VATTR_SET_SUPPORTED(vap, va_create_time); + VATTR_SET_SUPPORTED(vap, va_fsid); + VATTR_SET_SUPPORTED(vap, va_fileid); // SPL: va_nodeid + VATTR_CLEAR_SUPPORTED(vap, va_acl); +#endif + // CALL statvfs to get FSID here + vap->va_fsid = vfs_statfs(vnode_mount(vp))->f_fsid.val[0]; + vap->va_nodeid = ZFSCTL_INO_ROOT; + vap->va_nlink = vap->va_size = NROOT_ENTRIES; + vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; + vap->va_ctime = vap->va_ctime; + + if (VATTR_IS_ACTIVE(vap, va_name) && vap->va_name) { + (void)strlcpy(vap->va_name, ".zfs", MAXPATHLEN); + VATTR_SET_SUPPORTED(vap, va_name); + } + + zfsctl_common_getattr(vp, vap); + + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* + * Special case the handling of "..". + */ +/* ARGSUSED */ +int +zfsctl_root_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, pathname_t *pnp, + int flags, struct vnode *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(dvp)); + int err; + + dprintf("zfsctl_root_lookup dvp %p\n", dvp); + + if (!zfsvfs) return EINVAL; + + /* + * No extended attributes allowed under .zfs + */ +#ifndef _WIN32 + if (flags & LOOKUP_XATTR) + return (EINVAL); +#endif + + ZFS_ENTER(zfsvfs); + + if (strcmp(nm, "..") == 0) { + err = VFS_ROOT(vnode_mount(dvp), LK_EXCLUSIVE, vpp); + } else { + err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp); + } + + ZFS_EXIT(zfsvfs); + + return (err); +} + + + + +/* + * Special case the handling of "..". + */ +/* ARGSUSED */ +int +zfsctl_freebsd_root_lookup(ap) + struct vnop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *dvp = ap->a_dvp; + struct vnode **vpp = ap->a_vpp; + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + int flags = ap->a_cnp->cn_flags; + int nameiop = ap->a_cnp->cn_nameiop; + char nm[NAME_MAX + 1]; + int err; + + dprintf("zfsctl: +freebsd_root_lookup: nameiop %d\n", nameiop); + + + if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE)) { + dprintf("failed\n"); + return (EOPNOTSUPP); + } + + ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); + strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); + + err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL); + +#ifdef __FreeBSD__ + if (err == 0 && (nm[0] != '.' 
|| nm[1] != '\0')) + vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); +#endif + + return (err); +} + + +#ifdef _WIN32 +#define VOPFUNC int (*)(void *) +#include +/* Directory vnode operations template */ +//int (**zfsctl_ops_root_dvnodeops) (void *); +static struct vnodeopv_entry_desc zfsctl_ops_root_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_open_desc, (VOPFUNC)zfsctl_common_open}, + {&vnop_close_desc, (VOPFUNC)zfsctl_common_close}, + //{&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_getattr_desc, (VOPFUNC)zfsctl_root_getattr}, + {&vnop_access_desc, (VOPFUNC)zfsctl_common_access}, + {&vnop_readdir_desc, (VOPFUNC)gfs_vop_readdir}, + //{&vnop_readdirattr_desc, (VOPFUNC)zfs_vnop_readdirattr}, + //{&vnop_lookup_desc, (VOPFUNC)zfsctl_root_lookup}, + {&vnop_lookup_desc, (VOPFUNC)zfsctl_freebsd_root_lookup}, + {&vnop_inactive_desc, (VOPFUNC)gfs_vop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfsctl_common_reclaim}, + + { &vnop_revoke_desc, (VOPFUNC)err_revoke }, /* revoke */ + { &vnop_fsync_desc, (VOPFUNC)nop_fsync }, /* fsync */ + + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfsctl_ops_root = +{ &zfsctl_ops_root_dvnodeops, zfsctl_ops_root_template }; + +#endif + + + +static int +zfsctl_snapshot_zname(struct vnode *vp, const char *name, int len, char *zname) +{ + objset_t *os = ((zfsvfs_t *)(vfs_fsprivate(vnode_mount(vp))))->z_os; + + if (zfs_component_namecheck(name, NULL, NULL) != 0) + return (EILSEQ); + dmu_objset_name(os, zname); + if (strlen(zname) + 1 + strlen(name) >= len) + return (ENAMETOOLONG); + (void) strlcat(zname, "@", len); + (void) strlcat(zname, name, len); + return (0); +} + +static int +zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) +{ + struct vnode *svp = sep->se_root; + int error; + struct vnop_inactive_args iap; + + ASSERT(vn_ismntpt(svp)); + + /* this will be dropped by dounmount() */ + if ((error = vn_vfswlock(svp)) != 0) + return (error); + + /* + * We can't use VN_RELE(), as that will try to invoke + * zfsctl_snapdir_inactive(), which would cause us to destroy + * the sd_lock mutex held by our caller. + */ + //ASSERT(svp->v_count == 1); + iap.a_vp = svp; + gfs_vop_inactive(&iap); + + dprintf("zfsctldir: Releasing '%s'\n", sep->se_name); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + sep->se_name = NULL; + kmem_free(sep, sizeof (zfs_snapentry_t)); + sep = NULL; + + return (0); +} + + +/* + * This creates a snapshot under '.zfs/snapshot'. 
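+ * (On this port the operation is effectively disabled: the function below
+ * returns ENOTSUP up front and the original implementation is kept under
+ * "#if 0" for reference only.)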
+ */ +/* ARGSUSED */ +static int +zfsctl_snapdir_mkdir(struct vnode *dvp, char *dirname, vattr_t *vap, struct vnode **vpp, + cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp) +{ + return ENOTSUP; +#if 0 + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(dvp)); + char name[MAXNAMELEN]; + int err, error; + //static enum symfollow follow = NO_FOLLOW; + static enum uio_seg seg = UIO_SYSSPACE; + + if (snapshot_namecheck(dirname, NULL, NULL) != 0) + + zfs_sb_t *zsb = ITOZSB(dip); + char *dsname; + int error; + + dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + + if (zfs_component_namecheck(dirname, NULL, NULL) != 0) { + error = SET_ERROR(EILSEQ); + goto out; + } + + dmu_objset_name(zfsvfs->z_os, name); + + *vpp = NULL; + + err = zfs_secpolicy_snapshot_perms(name, cr); + if (err) + return (err); + + if (err == 0) { + // err = dmu_objset_snapshot(name, dirname, NULL, NULL, + // B_FALSE, B_FALSE, -1); + if (err) + return (err); + err = zfsctl_snapdir_lookup(dvp, dirname, vpp, + 0, cr, NULL, NULL); + } +out: + return (err); +#endif +} + +static int +zfsctl_freebsd_snapdir_mkdir(ap) + struct vnop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + +// ASSERT(ap->a_cnp->cn_flags & SAVENAME); + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + + return (zfsctl_snapdir_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, NULL, + ap->a_vpp, cr, NULL, 0, NULL)); +} + +static int +zfsctl_snapdir_readdir_cb(struct vnode *vp, void *dp, int *eofp, + offset_t *offp, offset_t *nextp, void *data, int flags); + +/* + * Lookup entry point for the 'snapshot' directory. Try to open the + * snapshot if it exist, creating the pseudo filesystem vnode as necessary. + * Perform a mount of the associated dataset on top of the vnode. + */ +/* ARGSUSED */ +#if !defined (__OPTIMIZE__) +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif +int +zfsctl_snapdir_lookup(ap) + struct vnop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *dvp = ap->a_dvp; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + char nm[NAME_MAX + 1]; + zfsctl_snapdir_t *sdp = vnode_fsnode(dvp); + objset_t *snap; + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + char real[ZFS_MAX_DATASET_NAME_LEN]; + char *mountpoint; + zfs_snapentry_t *sep, search; + size_t mountpoint_len; + avl_index_t where; + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(dvp)); + int err; + int flags = 0; + + /* + * No extended attributes allowed under .zfs + */ +#ifndef _WIN32 + if (flags & LOOKUP_XATTR) + return (EINVAL); +#endif + + if (!sdp) return ENOENT; + + ASSERT(ap->a_cnp->cn_namelen < sizeof(nm)); + strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); + + dprintf("zfsctl_snapdir_lookup '%s'\n", nm); + + ASSERT(vnode_isdir(dvp)); + + if (!strcmp(nm, ".autodiskmounted")) return EINVAL; + + + + *vpp = NULL; + + /* + * If we get a recursive call, that means we got called + * from the domount() code while it was trying to look up the + * spec (which looks like a local path for zfs). We need to + * add some flag to domount() to tell it not to do this lookup. 
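+	 * Until such a flag exists, holding sd_lock is treated as the
+	 * recursion marker, which is why the lookup below bails out early
+	 * with ENOENT.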
+ */ + if (MUTEX_HELD(&sdp->sd_lock)) + return (ENOENT); + + ZFS_ENTER(zfsvfs); + + // Returns if LOCK is held, otherwise we do not hold vpp + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (flags & FIGNORECASE) { + boolean_t conflict = B_FALSE; + + err = dmu_snapshot_realname(zfsvfs->z_os, nm, real, + MAXNAMELEN, &conflict); + if (err == 0) { + strlcpy(nm, real, sizeof(nm)); + } else if (err != ENOTSUP) { + ZFS_EXIT(zfsvfs); + return (err); + } +#if 0 + if (realpnp) + (void) strlcpy(realpnp->pn_buf, nm, + realpnp->pn_bufsize); + if (conflict && direntflags) + *direntflags = ED_CASE_CONFLICT; +#endif + } + + dprintf("looking for name '%s'\n", nm); + + mutex_enter(&sdp->sd_lock); + search.se_name = (char *)nm; + if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { + *vpp = sep->se_root; + VN_HOLD(*vpp); + err = traverse(vpp, LK_EXCLUSIVE | LK_RETRY); + + if (err) { + VN_RELE(*vpp); + *vpp = NULL; + dprintf("vnrele\n"); + } else if (*vpp == sep->se_root) { + /* + * The snapshot was unmounted behind our backs, + * try to remount it. + */ + VERIFY(zfsctl_snapshot_zname(dvp, nm, ZFS_MAX_DATASET_NAME_LEN, snapname) == 0); + dprintf("goto domount\n"); + goto domount; + } else { + /* + * VROOT was set during the traverse call. We need + * to clear it since we're pretending to be part + * of our parent's vfs. + */ + //(*vpp)->v_flag &= ~VROOT; + } + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + return (err); + } + + /* + * The requested snapshot is not currently mounted, look it up. + */ + err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname); + if (err) { + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + /* + * handle "ls *" or "?" in a graceful manner, + * forcing EILSEQ to ENOENT. + * Since shell ultimately passes "*" or "?" as name to lookup + */ + return (err == EILSEQ ? ENOENT : err); + } + if (dmu_objset_hold(snapname, FTAG, &snap) != 0) { + mutex_exit(&sdp->sd_lock); + /* Translate errors and add SAVENAME when needed. */ + if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) { + err = EJUSTRETURN; + //cnp->cn_flags |= SAVENAME; + } else { + err = ENOENT; + } + ZFS_EXIT(zfsvfs); + return (err); + } + + sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); + sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); + (void) strlcpy(sep->se_name, nm, strlen(nm) + 1); + dprintf("must not exist, Calling snapshot_mknode for '%s'\n", snapname); + VN_RELE(*vpp); + *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); + avl_insert(&sdp->sd_snaps, sep, where); + + dmu_objset_rele(snap, FTAG); +domount: + + // vfs_statfs(vfsp)->f_mntfromname + mountpoint_len = strlen(vfs_statfs(vnode_mount(dvp))->f_mntonname) + + strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(nm) + 1; + mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); + (void) snprintf(mountpoint, mountpoint_len, + "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", + vfs_statfs(vnode_mount(dvp))->f_mntonname, nm); + +#ifdef __FreeBSD__ + err = mount_snapshot(curthread, vpp, "zfs", mountpoint, snapname, 0); +#endif + +#ifdef _WIN32 + + dprintf("Would call mount here on '%s' for '%s': mountedhere %p\n", + mountpoint, snapname, vnode_mountedhere(*vpp)); + +#ifdef _KERNEL + + /* In upstream ZFS, mount_snapshot takes the current vp in vpp, + * allocates a new mount, and creates a new mvp for it. Then + * calls vput(vp) to release the current vnode. This lookup + * functions then returns the lock held for mvp (in vpp). 
+ * + * In OSX, we do not get a new vnode, since we are not calling mount + * so we need to return 'vp' with a reference. We release one of the + * references here, and return the other. + */ + + // VN_RELE(*vpp); + + /* + * The world isn't ready for this yet + zfs_ereport_snapshot_post(FM_EREPORT_ZFS_SNAPSHOT_MOUNT, + dmu_objset_spa(zfsvfs->z_os), snapname); + */ + +#endif // KERNEL +#endif // APPLE + + kmem_free(mountpoint, mountpoint_len); + if (err == 0) { + /* + * Fix up the root vnode mounted on .zfs/snapshot/. + * + * This is where we lie about our v_vfsp in order to + * make .zfs/snapshot/ accessible over NFS + * without requiring manual mounts of . + */ +#ifndef _WIN32 + ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); + VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; +#endif + + } + mutex_exit(&sdp->sd_lock); + ZFS_EXIT(zfsvfs); + if (err != 0) { + dprintf("The illumos rele here %p\n", *vpp); + VN_RELE(*vpp); + *vpp = NULL; + } + + /* + * Gross hack for now, fix meeeee + */ + dprintf("Lookup complete: %d %p\n", err, err==0?*vpp:NULL); + return (err); +} + +/* ARGSUSED */ +<<<<<<< HEAD:ZFSin/zfs/module/zfs/zfs_ctldir.c +#ifndef _WIN32 +int +zfsctl_shares_lookup(ap) + struct vnop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *dvp = ap->a_dvp; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(dvp)); + char nm[NAME_MAX + 1]; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + ASSERT(cnp->cn_namelen < sizeof(nm)); + strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1); + + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) + error = VOP_LOOKUP(ZTOV(dzp), vpp, cnp); + + VN_RELE(ZTOV(dzp)); + ZFS_EXIT(zfsvfs); + + return (error); +} +#endif + +/* ARGSUSED */ +======= +>>>>>>> b3b9184f... 
Snapshot mounts fail on second mount:module/zfs/zfs_ctldir.c +static int +zfsctl_snapdir_readdir_cb(struct vnode *vp, void *dp, int *eofp, + offset_t *offp, offset_t *nextp, void *data, int flags) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp)); + char snapname[MAXNAMELEN]; + uint64_t id, cookie; + boolean_t case_conflict; + int error; + dirent64_t *odp; + + ZFS_ENTER(zfsvfs); + + cookie = *offp; + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, + sizeof (snapname), snapname, &id, &cookie, &case_conflict); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error) { + ZFS_EXIT(zfsvfs); + if (error == ENOENT) { + *eofp = 1; + return (0); + } + return (error); + } + + odp=dp; + (void) strlcpy(odp->d_name, snapname, ZFS_MAX_DATASET_NAME_LEN); + odp->d_ino = ZFSCTL_INO_SNAP(id); + + *nextp = cookie; + + ZFS_EXIT(zfsvfs); + + return (0); +} + +<<<<<<< HEAD:ZFSin/zfs/module/zfs/zfs_ctldir.c +#ifndef _WIN32 +/* ARGSUSED */ +static int +zfsctl_shares_readdir(ap) + struct vnop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *a_ncookies; + u_long **a_cookies; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + uio_t *uiop = ap->a_uio; + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + int *eofp = ap->a_eofflag; + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp)); + znode_t *dzp; + int error = 0; + ulong *cookies; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } +#if 0 + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + vn_lock(ZTOV(dzp), LK_SHARED | LK_RETRY); + error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ap->a_numdirent, &cookies); + VN_RELE(ZTOV(dzp)); + } else { + *eofp = 1; + error = ENOENT; + } +#endif + + ZFS_EXIT(zfsvfs); + return (error); +} +#endif +======= +>>>>>>> b3b9184f... Snapshot mounts fail on second mount:module/zfs/zfs_ctldir.c + +/* + * pvp is the '.zfs' directory (zfsctl_node_t). + * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). + * + * This function is the callback to create a GFS vnode for '.zfs/snapshot' + * when a lookup is performed on .zfs for "snapshot". 
+ */ +struct vnode * +zfsctl_mknode_snapdir(struct vnode *pvp) +{ + struct vnode *vp; + zfsctl_snapdir_t *sdp; + + dprintf("+mknode_snapdir\n"); + + vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, vnode_mount(pvp), + zfsctl_ops_snapdir_dvnodeops, NULL, NULL, MAXNAMELEN, + zfsctl_snapdir_readdir_cb, NULL, 0); + sdp = vnode_fsnode(vp); + sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; + sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)vnode_fsnode(pvp))->zc_cmtime; + mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&sdp->sd_snaps, snapentry_compare, + sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); + +#ifndef _WIN32 + VOP_UNLOCK(vp, 0); +#endif + + dprintf("-mknode_snapdir: %p\n", vp); + return (vp); +} + +<<<<<<< HEAD:ZFSin/zfs/module/zfs/zfs_ctldir.c +#ifndef _WIN32 +struct vnode * +zfsctl_mknode_shares(struct vnode *pvp) +{ + struct vnode *vp; + zfsctl_node_t *sdp; + + vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp, + &zfsctl_ops_shares, NULL, NULL, MAXNAMELEN, + NULL, NULL); + sdp = vnode_fsnode(vp); + sdp->zc_cmtime = ((zfsctl_node_t *)vnode_fsnode(pvp))->zc_cmtime; + VOP_UNLOCK(vp, 0); + return (vp); + +} +#endif + +#ifndef _WIN32 +/* ARGSUSED */ +static int +zfsctl_shares_getattr(ap) + struct vnop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp)); + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + vn_lock(ZTOV(dzp), LK_SHARED | LK_RETRY); + error = VOP_GETATTR(ZTOV(dzp), vap, cr); + VN_RELE(ZTOV(dzp)); + } + ZFS_EXIT(zfsvfs); + return (error); + + +} +#endif + +======= +>>>>>>> b3b9184f... Snapshot mounts fail on second mount:module/zfs/zfs_ctldir.c +/* ARGSUSED */ +static int +zfsctl_snapdir_getattr(ap) + struct vnop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp)); + zfsctl_snapdir_t *sdp = vnode_fsnode(vp); + + dprintf("zfsctl: +snapdir_getattr: %p: (v_data %p)\n", vp, sdp); + + if (!sdp) return ENOENT; + + ZFS_ENTER(zfsvfs); + zfsctl_common_getattr(vp, vap); + vap->va_nodeid = gfs_file_inode(vp); + vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; + vap->va_ctime = vap->va_mtime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); +#ifdef _WIN32 + VATTR_SET_SUPPORTED(vap, va_modify_time); + VATTR_SET_SUPPORTED(vap, va_create_time); + VATTR_SET_SUPPORTED(vap, va_nlink); + VATTR_SET_SUPPORTED(vap, va_fileid); + VATTR_CLEAR_SUPPORTED(vap, va_acl); +#endif + ZFS_EXIT(zfsvfs); + dprintf("zfsctl: -snapdir_getattr\n"); + + return (0); +} + +/* ARGSUSED */ +static int +zfsctl_snapdir_reclaim(ap) + struct vnop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + zfsctl_snapdir_t *sdp = vnode_fsnode(vp); + zfs_snapentry_t *sep; + + vnode_removefsref(vp); + + if (!sdp) return 0; + + /* + * On forced unmount we have to free snapshots from here. 
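+	 * (The sd_snaps AVL tree is drained and every cached zfs_snapentry_t
+	 * is freed here before the snapdir node itself is torn down below.)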
+ */ + mutex_enter(&sdp->sd_lock); + while ((sep = avl_first(&sdp->sd_snaps)) != NULL) { + dprintf("Removing (reclaim) snap '%s'\n", sep->se_name); + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + kmem_free(sep, sizeof (zfs_snapentry_t)); + } + mutex_exit(&sdp->sd_lock); + gfs_dir_inactive(vp); + ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); + mutex_destroy(&sdp->sd_lock); + avl_destroy(&sdp->sd_snaps); + kmem_free(sdp, sizeof (zfsctl_snapdir_t)); + + vnode_clearfsnode(vp); + + return (0); +} + +<<<<<<< HEAD:ZFSin/zfs/module/zfs/zfs_ctldir.c +#ifdef sun +static const fs_operation_def_t zfsctl_tops_snapdir[] = { + { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, + { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, + { VOPNAME_IOCTL, { .error = fs_inval } }, + { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } }, + { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, + { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } }, + { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } }, + { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } }, + { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } }, + { VOPNAME_SEEK, { .vop_seek = fs_seek } }, + { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } }, + { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, + { NULL } +}; + +static const fs_operation_def_t zfsctl_tops_shares[] = { + { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, + { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, + { VOPNAME_IOCTL, { .error = fs_inval } }, + { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } }, + { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, + { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } }, + { VOPNAME_SEEK, { .vop_seek = fs_seek } }, + { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, + { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } }, + { NULL } +}; +#endif /* !sun */ + +#ifdef __FreeBSD__ +static struct vop_vector zfsctl_ops_snapdir = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_ioctl = VOP_EINVAL, + .vop_getattr = zfsctl_snapdir_getattr, + .vop_access = zfsctl_common_access, + .vop_mkdir = zfsctl_freebsd_snapdir_mkdir, + .vop_readdir = gfs_vop_readdir, + .vop_lookup = zfsctl_snapdir_lookup, + .vop_inactive = zfsctl_snapdir_inactive, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, +}; + +static struct vop_vector zfsctl_ops_shares = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_ioctl = VOP_EINVAL, + .vop_getattr = zfsctl_shares_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_shares_readdir, + .vop_lookup = zfsctl_shares_lookup, + .vop_inactive = gfs_vop_inactive, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_shares_fid, +}; +#endif /* FreeBSD */ + +#ifdef _WIN32 +======= +#ifdef __APPLE__ +>>>>>>> b3b9184f... 
Snapshot mounts fail on second mount:module/zfs/zfs_ctldir.c + +static struct vnodeopv_entry_desc zfsctl_ops_snapdir_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_open_desc, (VOPFUNC)zfsctl_common_open}, + {&vnop_close_desc, (VOPFUNC)zfsctl_common_close}, + //{&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_getattr_desc, (VOPFUNC)zfsctl_snapdir_getattr}, + {&vnop_access_desc, (VOPFUNC)zfsctl_common_access}, + {&vnop_mkdir_desc, (VOPFUNC)zfsctl_freebsd_snapdir_mkdir}, + {&vnop_readdir_desc, (VOPFUNC)gfs_vop_readdir}, + //{&vnop_readdirattr_desc, (VOPFUNC)zfs_vnop_readdirattr}, + {&vnop_lookup_desc, (VOPFUNC)zfsctl_snapdir_lookup}, + {&vnop_reclaim_desc, (VOPFUNC)zfsctl_snapdir_reclaim}, + // {&vnop_reclaim_desc, (VOPFUNC)zfsctl_common_reclaim}, + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfsctl_ops_snapdir = +{ &zfsctl_ops_snapdir_dvnodeops, zfsctl_ops_snapdir_template }; + +<<<<<<< HEAD:ZFSin/zfs/module/zfs/zfs_ctldir.c +#ifndef _WIN32 +int (**zfsctl_ops_shares_dvnodeops) (void *); +static struct vnodeopv_entry_desc zfsctl_ops_shares_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_open_desc, (VOPFUNC)zfsctl_common_open}, + {&vnop_close_desc, (VOPFUNC)zfsctl_common_close}, + //{&vnop_ioctl_desc, (VOPFUNC)zfs_vnop_ioctl}, + {&vnop_getattr_desc, (VOPFUNC)zfsctl_shares_getattr}, + {&vnop_access_desc, (VOPFUNC)zfsctl_common_access}, + {&vnop_readdir_desc, (VOPFUNC)zfsctl_shares_readdir}, + //{&vnop_readdirattr_desc, (VOPFUNC)zfs_vnop_readdirattr}, + {&vnop_lookup_desc, (VOPFUNC)zfsctl_shares_lookup}, + {&vnop_inactive_desc, (VOPFUNC)gfs_vop_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfsctl_common_reclaim}, + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfsctl_ops_shares = +{ &zfsctl_ops_shares_dvnodeops, zfsctl_ops_shares_template }; +#endif + +======= +>>>>>>> b3b9184f... Snapshot mounts fail on second mount:module/zfs/zfs_ctldir.c +#endif + +/* + * pvp is the GFS vnode '.zfs/snapshot'. + * + * This creates a GFS node under '.zfs/snapshot' representing each + * snapshot. This newly created GFS node is what we mount snapshot + * vfs_t's ontop of. 
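+ * (On this port the covering mount itself is driven from userland; the
+ * node created here only provides the mount point to be covered.)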
+ */ +static struct vnode * +zfsctl_snapshot_mknode(struct vnode *pvp, uint64_t objset) +{ + struct vnode *vp = NULL; + zfsctl_node_t *zcp; +#if 1 + dprintf("+snapshot_mknode\n"); + vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, vnode_mount(pvp), + zfsctl_ops_snapshot_dvnodeops, NULL, NULL, MAXNAMELEN, NULL, NULL, 0); + zcp = vnode_fsnode(vp); + zcp->zc_id = objset; + dprintf("-snapshot_mknode\n"); +#endif + return (vp); +} + +static int +zfsctl_snapshot_inactive(ap) + struct vnop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + struct vnop_inactive_args iap; + zfsctl_snapdir_t *sdp; + zfs_snapentry_t *sep, *next; + int locked; + struct vnode *dvp; + + dprintf("zfsctl_snapshot_inactive: %p mountedhere %p\n",vp, + vnode_mountedhere(vp)); + + if (vnode_isinuse(vp,1)) + goto end; + + VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0); + sdp = vnode_fsnode(dvp); + + if (!(locked = MUTEX_HELD(&sdp->sd_lock))) + mutex_enter(&sdp->sd_lock); + + ASSERT(!vn_ismntpt(vp)); + + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + next = AVL_NEXT(&sdp->sd_snaps, sep); + + if (sep->se_root == vp) { + dprintf("Removing (inactive) snap '%s'\n", sep->se_name); + avl_remove(&sdp->sd_snaps, sep); + kmem_free(sep->se_name, strlen(sep->se_name) + 1); + kmem_free(sep, sizeof (zfs_snapentry_t)); + /* + * After releasing the snapshot/$name entry, we need to + * recycle the vnode, as we will always create a new one + * in zfsctl_snapdir_lookup() - we do not keep a reference + * to it once the AVL node is removed. + */ + vnode_recycle(vp); + break; + } + sep = next; + } + ASSERT(sep != NULL); + + if (!locked) + mutex_exit(&sdp->sd_lock); + VN_RELE(dvp); + +end: + /* + * Dispose of the vnode for the snapshot mount point. + * This is safe to do because once this entry has been removed + * from the AVL tree, it can't be found again, so cannot become + * "active". If we lookup the same name again we will end up + * creating a new vnode. + */ + iap.a_vp = vp; + return (gfs_vop_inactive(&iap)); +} + +<<<<<<< HEAD:ZFSin/zfs/module/zfs/zfs_ctldir.c +#if 0 // unused function +static int +zfsctl_traverse_begin(struct vnode **vpp, int lktype) +{ + VN_HOLD(*vpp); + /* Snapshot should be already mounted, but just in case. */ + if (vnode_mount(*vpp) == NULL) + return (ENOENT); + return (traverse(vpp, lktype)); +} + +static void +zfsctl_traverse_end(struct vnode *vp, int err) +{ + VN_RELE(vp); +} + +static int +zfsctl_snapshot_getattr(ap) + struct vnop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + int err; + + dprintf("zfsctl: XXX +snapshot_getattr\n"); + err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY); + if (err == 0) + err = VOP_GETATTR(vp, ap->a_vap, 0, NULL, NULL); + zfsctl_traverse_end(vp, err); + dprintf("zfsctl: XXX -snapshot_getattr\n"); + return (err); +} +#endif + +#ifdef _WIN32 +======= +#ifdef __APPLE__ +>>>>>>> b3b9184f... Snapshot mounts fail on second mount:module/zfs/zfs_ctldir.c +/* + * This call is pretty much identical to snapdir_getattr, but we have + * separated them to avoid any vnop_snapshot calling vnop_snapdir and the + * confusion that can come from that. 
+ */ + +/* ARGSUSED */ +static int +zfsctl_snapshot_getattr(ap) + struct vnop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp)); + zfsctl_snapdir_t *sdp = vnode_fsnode(vp); + + dprintf("zfsctl: +snapshot_getattr: %p: (v_data %p)\n", vp, sdp); + + if (!sdp) return ENOENT; + + ZFS_ENTER(zfsvfs); + zfsctl_common_getattr(vp, vap); + vap->va_nodeid = gfs_file_inode(vp); + vap->va_nlink = vap->va_size = 2; + vap->va_ctime = vap->va_mtime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); +#ifdef _WIN32 + VATTR_SET_SUPPORTED(vap, va_modify_time); + VATTR_SET_SUPPORTED(vap, va_create_time); + VATTR_SET_SUPPORTED(vap, va_nlink); + VATTR_SET_SUPPORTED(vap, va_fileid); + VATTR_CLEAR_SUPPORTED(vap, va_acl); +#endif + ZFS_EXIT(zfsvfs); + dprintf("zfsctl: -snapshot_getattr\n"); + + return (0); +} + +#endif + +<<<<<<< HEAD:ZFSin/zfs/module/zfs/zfs_ctldir.c + + +#ifndef _WIN32 +static int +zfsctl_snapshot_fid(ap) + struct vnop_fid_args /* { + struct vnode *a_vp; + struct fid *a_fid; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + int err; + + err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY); + if (err == 0) + err = VOP_VPTOFH(vp, (void *)ap->a_fid); + zfsctl_traverse_end(vp, err); + return (err); +} +#endif + +======= +>>>>>>> b3b9184f... Snapshot mounts fail on second mount:module/zfs/zfs_ctldir.c +#if 1 // unused function +static int +zfsctl_snapshot_lookup(ap) + struct vnop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *dvp = ap->a_dvp; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + cred_t *cr = (cred_t *)vfs_context_ucred((ap)->a_context); + + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(dvp)); + int error; + + if (cnp->cn_namelen != 2 || cnp->cn_nameptr[0] != '.' || + cnp->cn_nameptr[1] != '.') { + return (ENOENT); + } + + ASSERT(vnode_isdir(dvp)); + ASSERT(zfsvfs->z_ctldir != NULL); + + dprintf("zfsctl_snapshot_lookup 'snapshot' name '%s'\n", + cnp->cn_nameptr); + + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", vpp, + NULL, 0, NULL, cr, NULL, NULL, NULL); + + return (error); +} +#endif + +<<<<<<< HEAD:ZFSin/zfs/module/zfs/zfs_ctldir.c +#ifndef _WIN32 +static int +zfsctl_snapshot_vptocnp(struct vnop_vptocnp_args *ap) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(ap->a_vp)); + struct vnode *dvp, *vp; + zfsctl_snapdir_t *sdp; + zfs_snapentry_t *sep; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, + NULL, 0, NULL, kcred, NULL, NULL, NULL); + if (error != 0) + return (error); + sdp = vnode_fsnode(dvp); + + mutex_enter(&sdp->sd_lock); + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + vp = sep->se_root; + if (vp == ap->a_vp) + break; + sep = AVL_NEXT(&sdp->sd_snaps, sep); + } + if (sep == NULL) { + mutex_exit(&sdp->sd_lock); + error = ENOENT; + } else { + size_t len; + + len = strlen(sep->se_name); + *ap->a_buflen -= len; + bcopy(sep->se_name, ap->a_buf + *ap->a_buflen, len); + mutex_exit(&sdp->sd_lock); + vref(dvp); + *ap->a_vpp = dvp; + } + VN_RELE(dvp); + + return (error); +} +#endif + +/* + * These VP's should never see the light of day. They should always + * be covered. 
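+ * (On this port they can briefly be reached before the userland helper
+ * finishes mounting the snapshot, which is why the Windows template
+ * further below defines enough vnodeops for the userland mount call to
+ * succeed.)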
+ */ +#ifndef _WIN32 +static struct vop_vector zfsctl_ops_snapshot = { + .vop_default = &default_vnodeops, + .vop_inactive = zfsctl_snapshot_inactive, + .vop_lookup = zfsctl_snapshot_lookup, + .vop_reclaim = zfsctl_common_reclaim, + .vop_getattr = zfsctl_snapshot_getattr, + .vop_fid = zfsctl_snapshot_fid, + .vop_vptocnp = zfsctl_snapshot_vptocnp, +}; +#endif + +#ifdef _WIN32 +static struct vnodeopv_entry_desc zfsctl_ops_snapshot_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_inactive_desc, (VOPFUNC)zfsctl_snapshot_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfsctl_common_reclaim}, + + /* + * In normal ZFS, the ".zfs/snashot/snap", the "snap" is immediately + * mounted over, so these vnodeops are not used. But in OSX, since we + * are unable to mount from the kernel, we need to define enough vnodeops + * such that userland mount call will succeed. + */ + {&vnop_getattr_desc, (VOPFUNC)zfsctl_snapshot_getattr}, + {&vnop_revoke_desc, (VOPFUNC)err_revoke }, + {&vnop_fsync_desc, (VOPFUNC)nop_fsync }, + + {&vnop_lookup_desc, (VOPFUNC)zfsctl_snapshot_lookup}, + + //{&vnop_readdir_desc, (VOPFUNC)gfs_vop_readdir}, + {&vnop_readdir_desc, (VOPFUNC)nop_readdir}, + + // {&vnop_open_desc, (VOPFUNC)zfsctl_common_openX}, + //{&vnop_close_desc, (VOPFUNC)zfsctl_common_closeX}, + //{&vnop_open_desc, (VOPFUNC)nop_open}, + //{&vnop_close_desc, (VOPFUNC)nop_close}, + + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfsctl_ops_snapshot = +{ &zfsctl_ops_snapshot_dvnodeops, zfsctl_ops_snapshot_template }; +#endif + + + +======= +>>>>>>> b3b9184f... Snapshot mounts fail on second mount:module/zfs/zfs_ctldir.c +int +zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + struct vnode *dvp, *vp; + zfsctl_snapdir_t *sdp; + zfsctl_node_t *zcp; + zfs_snapentry_t *sep; + int error; + dprintf("zfsctl_lookup_objset\n"); + + ASSERT(zfsvfs->z_ctldir != NULL); + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, + NULL, 0, NULL, kcred, NULL, NULL, NULL); + if (error != 0) + return (error); + sdp = vnode_fsnode(dvp); + + mutex_enter(&sdp->sd_lock); + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + vp = sep->se_root; + zcp = vnode_fsnode(vp); + if (zcp->zc_id == objsetid) + break; + + sep = AVL_NEXT(&sdp->sd_snaps, sep); + } + + if (sep != NULL) { + VN_HOLD(vp); + /* + * Return the mounted root rather than the covered mount point. + * Takes the GFS vnode at .zfs/snapshot/ + * and returns the ZFS vnode mounted on top of the GFS node. + * This ZFS vnode is the root of the vfs for objset 'objsetid'. + */ + error = traverse(&vp, LK_SHARED | LK_RETRY); + if (error == 0) { + if (vp == sep->se_root) + error = EINVAL; + else + *zfsvfsp = VTOZ(vp)->z_zfsvfs; + } + mutex_exit(&sdp->sd_lock); + VN_RELE(vp); + } else { + error = EINVAL; + mutex_exit(&sdp->sd_lock); + } + + VN_RELE(dvp); + + return (error); +} + +/* + * Unmount any snapshots for the given filesystem. This is called from + * zfs_umount() - if we have a ctldir, then go through and unmount all the + * snapshots. 
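+ * The loop below looks up ".zfs/snapshot", takes sd_lock, and calls
+ * zfsctl_unmount_snap() on every entry that is still a mount point;
+ * entries that are no longer mounted are left for
+ * zfsctl_snapdir_inactive() to clean up.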
+ */ +int +zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + struct vnode *dvp; + zfsctl_snapdir_t *sdp; + zfs_snapentry_t *sep, *next; + int error; + + dprintf("unmount_snapshots\n"); + + ASSERT(zfsvfs->z_ctldir != NULL); + error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, + NULL, 0, NULL, cr, NULL, NULL, NULL); + if (error != 0) + return (error); + + sdp = vnode_fsnode(dvp); + if (!sdp) return 0; + + mutex_enter(&sdp->sd_lock); + + sep = avl_first(&sdp->sd_snaps); + while (sep != NULL) { + next = AVL_NEXT(&sdp->sd_snaps, sep); + + /* + * If this snapshot is not mounted, then it must + * have just been unmounted by somebody else, and + * will be cleaned up by zfsctl_snapdir_inactive(). + */ + if (vn_ismntpt(sep->se_root)) { + error = zfsctl_unmount_snap(sep, fflags, cr); + if (error) { + avl_index_t where; + + /* + * Before reinserting snapshot to the tree, + * check if it was actually removed. For example + * when snapshot mount point is busy, we will + * have an error here, but there will be no need + * to reinsert snapshot. + */ + if (avl_find(&sdp->sd_snaps, sep, &where) == NULL) + avl_insert(&sdp->sd_snaps, sep, where); + break; + } + } + sep = next; + } + + mutex_exit(&sdp->sd_lock); + + VN_RELE(dvp); + + dprintf("umount_snapshot err %d\n", error); + return (error); +} + + + +/* + * Covered VNOPs, from before the snapshot is mounted + */ +/* ARGSUSED */ +static int +zfsctl_covered_open(struct vnop_open_args *ap) +{ + int flags = ap->a_mode; + + dprintf("%s: %p on %p\n", __func__, + ap->a_vp, vnode_mountedhere(ap->a_vp)); + + if (flags & FWRITE) + return (EACCES); + + return (0); +} + +/* ARGSUSED */ +static int +zfsctl_covered_close(struct vnop_close_args *ap) +{ + return (0); +} + +static int +zfsctl_covered_readdir(struct vnop_readdir_args *ap) +#if 0 + struct vnop_readdir_args { + struct vnode a_vp; + struct uio *a_uio; + int a_flags; + int *a_eofflag; + int *a_numdirent; + vfs_context_t a_context; + }; +#endif +{ + + dprintf("%s: %p\n", __func__, ap->a_vp); + + if (*ap->a_numdirent == 0) + *ap->a_numdirent = 2; /* . and .. */ + + return 0; +} + + +#ifdef __APPLE__ +static struct vnodeopv_entry_desc zfsctl_ops_snapshot_template[] = { + {&vnop_default_desc, (VOPFUNC)vn_default_error }, + {&vnop_inactive_desc, (VOPFUNC)zfsctl_snapshot_inactive}, + {&vnop_reclaim_desc, (VOPFUNC)zfsctl_common_reclaim}, + + /* + * In normal ZFS, the ".zfs/snashot/snap", the "snap" is immediately + * mounted over, so these vnodeops are not used. But in OSX, since we + * are unable to mount from the kernel, we need to define enough vnodeops + * such that userland mount call will succeed. 
+ */ + {&vnop_getattr_desc, (VOPFUNC)zfsctl_snapshot_getattr}, + {&vnop_revoke_desc, (VOPFUNC)err_revoke }, + {&vnop_fsync_desc, (VOPFUNC)nop_fsync }, + + {&vnop_lookup_desc, (VOPFUNC)zfsctl_snapshot_lookup}, + + //{&vnop_readdir_desc, (VOPFUNC)gfs_vop_readdir}, + {&vnop_readdir_desc, (VOPFUNC)zfsctl_covered_readdir}, + + {&vnop_open_desc, (VOPFUNC)zfsctl_covered_open}, + {&vnop_close_desc, (VOPFUNC)zfsctl_covered_close}, + //{&vnop_open_desc, (VOPFUNC)nop_open}, + //{&vnop_close_desc, (VOPFUNC)nop_close}, + + {NULL, (VOPFUNC)NULL } +}; +struct vnodeopv_desc zfsctl_ops_snapshot = +{ &zfsctl_ops_snapshot_dvnodeops, zfsctl_ops_snapshot_template }; +#endif diff --git a/module/os/windows/zfs/zfs_debug.c b/module/os/windows/zfs/zfs_debug.c new file mode 100644 index 000000000000..368eae70c0d3 --- /dev/null +++ b/module/os/windows/zfs/zfs_debug.c @@ -0,0 +1,137 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +#include + +#if !defined(_KERNEL) || !defined(__linux__) +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ +#endif + +void +zfs_panic_recover(const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); + va_end(adx); +} + +/* + * Debug logging is enabled by default for production kernel builds. + * The overhead for this is negligible and the logs can be valuable when + * debugging. For non-production user space builds all debugging except + * logging is enabled since performance is no longer a concern. + */ +void +zfs_dbgmsg_init(void) +{ +#if !defined(_KERNEL) || !defined(__linux__) + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); +#endif +} + +void +zfs_dbgmsg_fini(void) +{ +#if !defined(_KERNEL) || !defined(__linux__) + zfs_dbgmsg_t *zdm; + + while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { + int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_destroy(&zfs_dbgmsgs_lock); + ASSERT0(zfs_dbgmsg_size); +#endif +} + +#if !defined(_KERNEL) || !defined(__linux__) +/* + * Print these messages by running: + * echo ::zfs_dbgmsg | mdb -k + * + * Monitor these messages by running: + * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * + * When used with libzpool, monitor with: + * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' + */ +void +zfs_dbgmsg(const char *fmt, ...) 
+{ + int size; + va_list adx; + zfs_dbgmsg_t *zdm; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx); + va_end(adx); + + /* + * There is one byte of string in sizeof (zfs_dbgmsg_t), used + * for the terminating null. + */ + zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP); + zdm->zdm_timestamp = gethrestime_sec(); + + va_start(adx, fmt); + (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx); + va_end(adx); + + DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); + dprintf("%s: %s\n", __func__, zdm->zdm_msg); + + mutex_enter(&zfs_dbgmsgs_lock); + list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size; + while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) { + zdm = list_remove_head(&zfs_dbgmsgs); + size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_exit(&zfs_dbgmsgs_lock); +} + +void +zfs_dbgmsg_print(const char *tag) +{ + zfs_dbgmsg_t *zdm; + + dprintf("ZFS_DBGMSG(%s):\n", tag); + mutex_enter(&zfs_dbgmsgs_lock); + for (zdm = list_head(&zfs_dbgmsgs); zdm; + zdm = list_next(&zfs_dbgmsgs, zdm)) + dprintf("%s\n", zdm->zdm_msg); + mutex_exit(&zfs_dbgmsgs_lock); +} +#endif diff --git a/module/os/windows/zfs/zfs_dir.c b/module/os/windows/zfs/zfs_dir.c new file mode 100644 index 000000000000..2a6160f08e72 --- /dev/null +++ b/module/os/windows/zfs/zfs_dir.c @@ -0,0 +1,1289 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include +//#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt, + boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) +{ + int error; + + if (zfsvfs->z_norm) { + boolean_t conflict = B_FALSE; + size_t bufsz = 0; + char *buf = NULL; + + if (rpnp) { + buf = rpnp->pn_buf; + bufsz = rpnp->pn_bufsize; + } + + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, buf, bufsz, &conflict); + if (!error && deflags) + *deflags = conflict ? 
ED_CASE_CONFLICT : 0; + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + *zoid = ZFS_DIRENT_OBJ(*zoid); + + if (error == ENOENT && update) + dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); + + return (error); +} + +/* + * Lock a directory entry. A dirlock on protects that name + * in dzp's directory zap object. As long as you hold a dirlock, you can + * assume two things: (1) dzp cannot be reaped, and (2) no other thread + * can change the zap entry for (i.e. link or unlink) this name. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZSHARED: allow concurrent access with other ZSHARED callers. + * ZXATTR: we want dzp's xattr directory + * ZCILOOK: On a mixed sensitivity file system, + * this lookup should be case-insensitive. + * ZCIEXACT: On a purely case-insensitive file system, + * this lookup should be case-sensitive. + * ZRENAMING: we are locking for renaming, force narrow locks + * ZHAVELOCK: Don't grab the z_name_lock for this call. The + * current thread already holds it. + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * dlpp - pointer to the dirlock for this entry (NULL on error) + * direntflags - (case-insensitive lookup only) + * flags if multiple case-sensitive matches exist in directory + * realpnp - (case-insensitive lookup only) + * actual name matched within the directory + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + * NOTE: For case-insensitive file systems we take wide locks (see below), + * but return znode pointers to a single match. + */ +int +zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, + int flag, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_dirlock_t *dl; + boolean_t update; + matchtype_t mt = 0; + uint64_t zoid; + vnode_t *vp = NULL; + int error = 0; + int cmpflags; + + *zpp = NULL; + *dlpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if ((name[0] == '.' && + ((name[1] == '\0') || (name[1] == '.' && name[2] == '\0'))) || + (zfs_has_ctldir(dzp) && (strcmp(name, ZFS_CTLDIR_NAME) == 0))) + return (SET_ERROR(EEXIST)); + + /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect what vnodes can be cached in the DNLC, how we + * perform zap lookups, and the "width" of our dirlocks. + * + * A normal dirlock locks a single name. Note that with + * normalization a name can be composed multiple ways, but + * when normalized, these names all compare equal. A wide + * dirlock locks multiple names. We need these when the file + * system is supporting mixed-mode access. It is sometimes + * necessary to lock all case permutations of file name at + * once so that simultaneous case-insensitive/case-sensitive + * behaves as rationally as possible. + */ + + /* + * When matching we may need to normalize & change case according to + * FS settings. + * + * Note that a normalized match is necessary for a case insensitive + * filesystem when the lookup request is not exact because normalization + * can fold case independent of normalizing code point sequences. + * + * See the table above zfs_dropname(). 
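+ *
+ * For example (cf. that table): on a case-insensitive dataset a plain
+ * lookup resolves to MT_NORMALIZE, and ZCIEXACT adds MT_MATCH_CASE; on a
+ * mixed-sensitivity dataset the default is MT_NORMALIZE | MT_MATCH_CASE,
+ * and ZCILOOK drops MT_MATCH_CASE.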
+ */ + if (zfsvfs->z_norm != 0) { + mt = MT_NORMALIZE; + + /* + * Determine if the match needs to honor the case specified in + * lookup, and if so keep track of that so that during + * normalization we don't fold case. + */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE && + (flag & ZCIEXACT)) || + (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) { + mt |= MT_MATCH_CASE; + } + } + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * Maybe can add TO-UPPERed version of name to dnlc in ci-only + * case for performance improvement? + */ + update = !zfsvfs->z_norm || + (zfsvfs->z_case == ZFS_CASE_MIXED && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); + + /* + * ZRENAMING indicates we are in a situation where we should + * take narrow locks regardless of the file system's + * preferences for normalizing and case folding. This will + * prevent us deadlocking trying to grab the same wide lock + * twice if the two names happen to be case-insensitive + * matches. + */ + if (flag & ZRENAMING) + cmpflags = 0; + else + cmpflags = zfsvfs->z_norm; + + /* + * Wait until there are no locks on this name. + * + * Don't grab the the lock if it is already held. However, cannot + * have both ZSHARED and ZHAVELOCK together. + */ + ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); + if (!(flag & ZHAVELOCK)) + rw_enter(&dzp->z_name_lock, RW_READER); + + mutex_enter(&dzp->z_lock); + for (;;) { + if (dzp->z_unlinked) { + mutex_exit(&dzp->z_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); + return (SET_ERROR(ENOENT)); + } + for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { + if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, + U8_UNICODE_LATEST, &error) == 0) || error != 0) + break; + } + if (error != 0) { + mutex_exit(&dzp->z_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); + return (SET_ERROR(ENOENT)); + } + if (dl == NULL) { + size_t namesize; + + /* + * Allocate a new dirlock and add it to the list. + */ + namesize = strlen(name) + 1; + dl = kmem_alloc(sizeof (zfs_dirlock_t) + namesize, + KM_SLEEP); + cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); + dl->dl_name = (char *)(dl + 1); + //bcopy(name, dl->dl_name, namesize); + strlcpy(dl->dl_name, name, namesize); + dl->dl_sharecnt = 0; + dl->dl_namelock = 0; + dl->dl_namesize = namesize; + dl->dl_dzp = dzp; + dl->dl_next = dzp->z_dirlocks; + dzp->z_dirlocks = dl; + break; + } + if ((flag & ZSHARED) && dl->dl_sharecnt != 0) + break; + cv_wait(&dl->dl_cv, &dzp->z_lock); + } + + /* + * If the z_name_lock was NOT held for this dirlock record it. + */ + if (flag & ZHAVELOCK) + dl->dl_namelock = 1; + + if (flag & ZSHARED) + dl->dl_sharecnt++; + + mutex_exit(&dzp->z_lock); + + /* + * We have a dirlock on the name. (Note that it is the dirlock, + * not the dzp's z_lock, that protects the name in the zap object.) + * See if there's an object by this name; if so, put a hold on it. + */ + if (flag & ZXATTR) { + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? 
SET_ERROR(ENOENT) : 0); + } else { + if (update) + vp = dnlc_lookup(ZTOV(dzp), name); + if (vp == DNLC_NO_VNODE) { + VN_RELE(vp); + error = SET_ERROR(ENOENT); + } else if (vp) { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + VN_RELE(vp); + return (SET_ERROR(EEXIST)); + } + *dlpp = dl; + *zpp = VTOZ(vp); + return (0); + } else { + error = zfs_match_find(zfsvfs, dzp, name, mt, + update, direntflags, realpnp, &zoid); + } + } + if (error) { + if (error != ENOENT || (flag & ZEXISTS)) { + zfs_dirent_unlock(dl); + return (error); + } + } else { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + return (SET_ERROR(EEXIST)); + } +#ifdef APPLE_SA_RECOVER + zfsvfs->z_recover_parent = dzp->z_id; +#endif /* APPLE_SA_RECOVER */ + error = zfs_zget(zfsvfs, zoid, zpp); +#ifdef APPLE_SA_RECOVER + zfsvfs->z_recover_parent = 0; +#endif /* APPLE_SA_RECOVER */ + + if (error) { + zfs_dirent_unlock(dl); + return (error); + } + if (!(flag & ZXATTR) && update) + dnlc_update(ZTOV(dzp), name, ZTOV(*zpp)); + } + + *dlpp = dl; + + return (0); +} + +/* + * Unlock this directory entry and wake anyone who was waiting for it. + */ +void +zfs_dirent_unlock(zfs_dirlock_t *dl) +{ + znode_t *dzp = dl->dl_dzp; + zfs_dirlock_t **prev_dl, *cur_dl; + + mutex_enter(&dzp->z_lock); + + if (!dl->dl_namelock) + rw_exit(&dzp->z_name_lock); + + if (dl->dl_sharecnt > 1) { + dl->dl_sharecnt--; + mutex_exit(&dzp->z_lock); + return; + } + prev_dl = &dzp->z_dirlocks; + while ((cur_dl = *prev_dl) != dl) + prev_dl = &cur_dl->dl_next; + *prev_dl = dl->dl_next; + cv_broadcast(&dl->dl_cv); + mutex_exit(&dzp->z_lock); + + cv_destroy(&dl->dl_cv); + kmem_free(dl, sizeof (*dl) + dl->dl_namesize); +} + +/* + * Look up an entry in a directory. + * + * NOTE: '.' and '..' are handled as special cases because + * no directory entries are actually stored for them. If this is + * the root of a filesystem, then '.zfs' is also treated as a + * special pseudo-directory. + */ +int +zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags, + int *deflg, pathname_t *rpnp) +{ + zfs_dirlock_t *dl; + znode_t *zp; + int error = 0; + uint64_t parent; + int unlinked; + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + mutex_enter(&dzp->z_lock); + unlinked = dzp->z_unlinked; + mutex_exit(&dzp->z_lock); + if (unlinked) + return (ENOENT); + + *vpp = ZTOV(dzp); + VN_HOLD(*vpp); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + /* + * If we are a snapshot mounted under .zfs, return + * the vp for the snapshot directory. 
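+ * (That case is detected below by the SA parent pointing back at dzp
+ * itself while zfsvfs->z_parent differs from zfsvfs, i.e. dzp is the
+ * root of a snapshot filesystem mounted under its parent's .zfs.)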
+ */ + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { +// error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, +// "snapshot", vpp, NULL, 0, NULL, kcred, +// NULL, NULL, NULL); + return (error); + } + + mutex_enter(&dzp->z_lock); + unlinked = dzp->z_unlinked; + mutex_exit(&dzp->z_lock); + if (unlinked) + return (ENOENT); + + rw_enter(&dzp->z_parent_lock, RW_READER); + error = zfs_zget(zfsvfs, parent, &zp); + if (error == 0) + *vpp = ZTOV(zp); + rw_exit(&dzp->z_parent_lock); + } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { + // *vpp = zfsctl_root(dzp); + } else { + int zf; + + zf = ZEXISTS | ZSHARED; + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); + if (error == 0) { + *vpp = ZTOV(zp); + zfs_dirent_unlock(dl); + dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + } + rpnp = NULL; + } + + if ((flags & FIGNORECASE) && rpnp && !error) + (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); + + return (error); +} + +/* + * unlinked Set (formerly known as the "delete queue") Error Handling + * + * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * unlinked set, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the nlink list is using a fat zap (ie. has more than + * 2000 entries), we *may* not pre-read a block that's needed. + * Therefore it is remotely possible for some of the assertions + * regarding the unlinked set below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ +void +zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + ASSERT(zp->z_unlinked); + ASSERT(zp->z_links == 0); + + if (( err = zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)) + != 0) { + zfs_panic_recover("zfs: zfs_unlinked_add(id %llu) failed to add to unlinked list: %d\n", + zp->z_id, + err); + } +} + +/* + * Clean up any znodes that had no links when we either crashed or + * (force) umounted the file system. + */ +static void +zfs_unlinked_drain_task(void *arg) +{ + zfsvfs_t *zfsvfs = arg; + zap_cursor_t zc; + zap_attribute_t zap; + dmu_object_info_t doi; + znode_t *zp; + int error; + + /* + * Iterate over the contents of the unlinked set. + */ + for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); + zap_cursor_retrieve(&zc, &zap) == 0 && + /* Only checking for a shutdown request, so no locking reqd. */ + zfsvfs->z_drain_state == ZFS_DRAIN_RUNNING; + zap_cursor_advance(&zc)) { + + /* + * See what kind of object we have in list + */ + + error = dmu_object_info(zfsvfs->z_os, + zap.za_first_integer, &doi); + if (error != 0) + continue; + + ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || + (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); + /* + * We need to re-mark these list entries for deletion, + * so we pull them back into core and set zp->z_unlinked. + */ + error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); + + /* + * We may pick up znodes that are already marked for deletion. + * This could happen during the purge of an extended attribute + * directory. 
All we need to do is skip over them, since they + * are already in the system marked z_unlinked. + */ + if (error != 0) + continue; + + zp->z_unlinked = B_TRUE; + + /* + * iput() is Linux's equivalent to illumos' VN_RELE(). It will + * decrement the inode's ref count and may cause the inode to be + * synchronously freed. We interrupt freeing of this inode, by + * checking the return value of dmu_objset_zfs_unmounting() in + * dmu_free_long_range(), when an unmount is requested. + */ + VN_RELE(ZTOV(zp)); + + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + } + zap_cursor_fini(&zc); + + mutex_enter(&zfsvfs->z_drain_lock); + zfsvfs->z_drain_state = ZFS_DRAIN_SHUTDOWN; + cv_broadcast(&zfsvfs->z_drain_cv); + mutex_exit(&zfsvfs->z_drain_lock); +} + +/* + * Sets z_draining then tries to dispatch async unlinked drain. + * If that fails executes synchronous unlinked drain. + */ +void +zfs_unlinked_drain(zfsvfs_t *zfsvfs) +{ + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + + mutex_enter(&zfsvfs->z_drain_lock); + ASSERT(zfsvfs->z_drain_state == ZFS_DRAIN_SHUTDOWN); + zfsvfs->z_drain_state = ZFS_DRAIN_RUNNING; + mutex_exit(&zfsvfs->z_drain_lock); + + if (taskq_dispatch( + dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)), + zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP) == 0) { + zfs_dbgmsg("async zfs_unlinked_drain dispatch failed"); + zfs_unlinked_drain_task(zfsvfs); + } +} + +/* + * Wait for the unlinked drain taskq task to stop. This will interrupt the + * unlinked set processing if it is in progress. + */ +void +zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) +{ + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + + mutex_enter(&zfsvfs->z_drain_lock); + while (zfsvfs->z_drain_state != ZFS_DRAIN_SHUTDOWN) { + zfsvfs->z_drain_state = ZFS_DRAIN_SHUTDOWN_REQ; + cv_wait(&zfsvfs->z_drain_cv, &zfsvfs->z_drain_lock); + } + mutex_exit(&zfsvfs->z_drain_lock); +} + + + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. + */ +extern unsigned int rwlock_detect_problem; +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_dirlock_t dl; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget_ext(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp, + /*ZGET_FLAG_WITHOUT_VNODE*/0); + if (error) { +#ifdef _WIN32 + if (error == EIO) { + dprintf("ZFS: Detected problem with item %llu\n", + dzp->z_id); + } +#endif + skipped += 1; + continue; + } + +#ifdef _WIN32 + ASSERT(S_ISREG(xzp->z_mode) || + S_ISLNK(xzp->z_mode)); +#else + ASSERT((ZTOV(xzp)->v_type == VREG) || + (ZTOV(xzp)->v_type == VLNK)); +#endif + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? 
*/ + zfs_sa_upgrade_txholds(tx, xzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + //VN_RELE(ZTOV(xzp)); // async +#ifdef _WON32 + if (ZTOV(xzp) == NULL) { + zfs_zinactive(xzp); + } else { + VN_RELE_ASYNC(ZTOV(xzp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + } +#else + VN_RELE_ASYNC(ZTOV(xzp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); +#endif + + skipped += 1; + continue; + } + bzero(&dl, sizeof (dl)); + dl.dl_dzp = dzp; + dl.dl_name = zap.za_name; + + error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + if (error) + skipped += 1; + dmu_tx_commit(tx); + +#ifdef _WIN32 + if (ZTOV(xzp) == NULL) { + zfs_zinactive(xzp); + } else { + VN_RELE_ASYNC(ZTOV(xzp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + } +#else + VN_RELE_ASYNC(ZTOV(xzp), dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); +#endif + } + zap_cursor_fini(&zc); + if (error != ENOENT) + skipped += 1; + +#ifdef _WIN32 + if (error == EIO) { + dprintf("ZFS: purgedir detected corruption. dropping %llu\n", + dzp->z_id); + return 0; // Remove this dir anyway + } +#endif + return (skipped); +} + + +/* + * This function is either called directly from reclaim, or in a delayed + * manner, so the value of zp->z_vnode may be NULL. + */ +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + znode_t *xzp = NULL; + dmu_tx_t *tx; + uint64_t acl_obj; + uint64_t xattr_obj; + int error; + + ASSERT(zp->z_links == 0); + + /* + * If this is an attribute directory, purge its contents. + */ + + if ((IFTOVT((mode_t)zp->z_mode) == VDIR) && + (zp->z_pflags & ZFS_XATTR)) { + + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + } + + /* + * Free up all the data in the file. + */ + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space or we were interrupted by unmount. + * Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + /* Can't release zp before vp, so tell VFS to release */ + zfs_znode_free(zp); + return; + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. + */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget_ext(zfsvfs, xattr_obj, &xzp, + ZGET_FLAG_WITHOUT_VNODE); + ASSERT(error == 0); + } + + acl_obj = zfs_external_acl(zp); + + /* + * Set up the final transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xzp) { + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). 
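+ * The object is not lost for good: it keeps its entry in z_unlinkedobj,
+ * so the zfs_unlinked_drain() pass will zget it, mark it z_unlinked
+ * again and retry the removal.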
+ */ + dmu_tx_abort(tx); + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + goto out; + } + + if (xzp) { + ASSERT(error == 0); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ + xzp->z_links = 0; /* no more links to it */ + VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &xzp->z_links, sizeof (xzp->z_links), tx)); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + } + + /* Remove this znode from the unlinked set */ + VERIFY3U(0, ==, + zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); +out: + if (xzp) { + /* Only release object if we are the only user */ + if (ZTOV(xzp) == NULL) + zfs_zinactive(xzp); + else + VN_RELE_ASYNC(ZTOV(xzp), dsl_pool_vnrele_taskq( + dmu_objset_pool(zfsvfs->z_os))); + } +} + +static uint64_t +zfs_dirent(znode_t *zp, uint64_t mode) +{ + uint64_t de = zp->z_id; + + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT(mode) << 60; + return (de); +} + +/* + * Link zp into dl. Can only fail if zp has been unlinked. + */ +int +zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +{ + znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + uint64_t value; +#ifdef _WIN32 + /* OS X - don't access the vnode here since it might not be attached. */ + int zp_is_dir = S_ISDIR(zp->z_mode); +#else + int zp_is_dir = (vnode_isdir(vp)); +#endif + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; +#ifdef _WIN32 + uint64_t addtime[2]; +#endif + + mutex_enter(&zp->z_lock); + + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + mutex_exit(&zp->z_lock); + return (SET_ERROR(ENOENT)); + } + zp->z_links++; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime, B_TRUE); + } + + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + mutex_exit(&zp->z_lock); + + mutex_enter(&dzp->z_lock); + dzp->z_size++; + dzp->z_links += zp_is_dir; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + mutex_exit(&dzp->z_lock); + + value = zfs_dirent(zp, zp->z_mode); + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, + 8, 1, &value, tx); + ASSERT(error == 0); + + dnlc_update(ZTOV(dzp), dl->dl_name, vp); + + return (0); +} + +/* + * The match type in the code for this function should conform to: + * + * ------------------------------------------------------------------------ + * fs type 
| z_norm | lookup type | match type + * ---------|-------------|-------------|---------------------------------- + * CS !norm | 0 | 0 | 0 (exact) + * CS norm | formX | 0 | MT_NORMALIZE + * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE + * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE + * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | ZCILOOK | MT_NORMALIZE + * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE + * + * Abbreviations: + * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed + * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) + * formX = unicode normalization form set on fs creation + */ +static int +zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, + int flag) +{ + int error; + + if (zp->z_zfsvfs->z_norm) { + matchtype_t mt = MT_NORMALIZE; + + if ((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE && + (flag & ZCIEXACT)) || + (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED && + !(flag & ZCILOOK))) { + mt |= MT_MATCH_CASE; + } + + error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id, + dl->dl_name, mt, tx); + } else { + error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, + tx); + } + + return (error); +} + +/* + * Unlink zp from dl, and mark zp for deletion if this was the last link. + * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. + */ +int +zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, + boolean_t *unlinkedp) +{ + znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); +#ifdef _WIN32 + /* OS X - don't access the vnode here since it might not be attached. */ + int zp_is_dir = S_ISDIR(zp->z_mode); +#else + int zp_is_dir = vnode_isdir((vp)); +#endif + boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + if (ZTOV(dzp)) + dnlc_remove(ZTOV(dzp), dl->dl_name); + + if (!(flag & ZRENAMING)) { + + if (vp) { + if (vn_vfswlock(vp)) /* prevent new mounts on zp */ + return ((EBUSY)); + + if (vn_ismntpt(vp)) { /* don't remove mount point */ + vn_vfsunlock(vp); + return ((EBUSY)); + } + } + + mutex_enter(&zp->z_lock); + + if (zp_is_dir && !zfs_dirempty(zp)) { + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + return (SET_ERROR(ENOTEMPTY)); + } + + /* + * If we get here, we are going to try to remove the object. + * First try removing the name from the directory; if that + * fails, return the error. 
+ */ + error = zfs_dropname(dl, zp, dzp, tx, flag); + if (error != 0) { + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + return (error); + } + + if (zp->z_links <= zp_is_dir) { + zfs_panic_recover("zfs: link count on vnode %p objID %llu is %u, " + "should be at least %u", zp->z_vnode, + zp->z_id, + (int)zp->z_links, + zp_is_dir + 1); + zp->z_links = zp_is_dir + 1; + } + if (--zp->z_links == zp_is_dir) { + zp->z_unlinked = B_TRUE; + zp->z_links = 0; + unlinked = B_TRUE; + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, + B_TRUE); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + count = 0; + ASSERT(error == 0); + mutex_exit(&zp->z_lock); + vn_vfsunlock(vp); + } else { + error = zfs_dropname(dl, zp, dzp, tx, flag); + if (error != 0) + return (error); + } + + mutex_enter(&dzp->z_lock); + dzp->z_size--; /* one dirent removed */ + dzp->z_links -= zp_is_dir; /* ".." link from zp */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + mutex_exit(&dzp->z_lock); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Indicate whether the directory is empty. Works with or without z_lock + * held, but can only be consider a hint in the latter case. Returns true + * if only "." and ".." remain and there's no work in progress. 
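+ * (A directory's z_size starts at 2 to account for "." and "..", so
+ * z_size == 2 means no real entries remain; z_dirlocks == 0 means no
+ * zfs_dirent_lock() caller currently holds a name in this directory.)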
+ */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + return (dzp->z_size == 2 && dzp->z_dirlocks == 0); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + dmu_tx_t *tx; + int error; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; +#ifdef DEBUG + uint64_t parent; +#endif + + *xvpp = NULL; + + /* + * In FreeBSD, access checking for creating an EA is being done + * in zfs_setextattr(), + */ +#ifndef __FreeBSD__ + if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) + return (error); +#endif + + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + return (SET_ERROR(EDQUOT)); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + return (error); + } + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + +#ifdef DEBUG + error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + ASSERT(error == 0 && parent == zp->z_id); +#endif + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); + + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); +#ifdef _WIN32 + /* + * Obtain and attach the vnode after committing the transaction + */ + zfs_znode_getvnode(xzp, zp, zfsvfs); +#endif + *xvpp = ZTOV(xzp); + + return (0); +} + +/* + * Return a znode for the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * flags - flags from the VOP_LOOKUP call + * + * OUT: xzpp - pointer to extended attribute znode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + zfs_dirlock_t *dl; + vattr_t va; + int error; +top: + error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); + if (error) + return (error); + + if (xzp != NULL) { + *xvpp = ZTOV(xzp); + zfs_dirent_unlock(dl); + return (0); + } + + + if (!(flags & CREATE_XATTR_DIR)) { + zfs_dirent_unlock(dl); + return (SET_ERROR(ENOENT)); + } + + if (vfs_isrdonly(zfsvfs->z_vfs)) { + zfs_dirent_unlock(dl); + return (SET_ERROR(EROFS)); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. 
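+ *
+ * Note that the directory created below is 0777 with the sticky bit set:
+ * the mode itself is deliberately permissive because access is gated by
+ * the base file as described above, and S_ISVTX brings the usual
+ * sticky-directory removal rules into play (cf. zfs_sticky_remove_access()
+ * below).
+ *
+ * Illustrative caller (a sketch only, error handling omitted):
+ *
+ *	vnode_t *xdvp;
+ *	if (zfs_get_xattrdir(zp, &xdvp, cr, CREATE_XATTR_DIR) == 0) {
+ *		... look up or create named attributes under xdvp ...
+ *		VN_RELE(xdvp);
+ *	}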
+ */ + va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; + va.va_type = VDIR; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); + + error = zfs_make_xattrdir(zp, &va, xvpp, cr); + zfs_dirent_unlock(dl); + + if (error == ERESTART) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + if (error == 0) + VOP_UNLOCK(*xvpp, 0); + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. + * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * the entry is a plain file and you have write access, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + + if (zdp->z_zfsvfs->z_replay) + return (0); + + if ((zdp->z_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || + (vnode_isreg(ZTOV(zp)) && + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) + return (0); + else + return (secpolicy_vnode_remove(ZTOV(zp), cr)); +} diff --git a/module/os/windows/zfs/zfs_file_os.c b/module/os/windows/zfs/zfs_file_os.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/zfs_fuid_os.c b/module/os/windows/zfs/zfs_fuid_os.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/zfs_ioctl_os.c b/module/os/windows/zfs/zfs_ioctl_os.c new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/module/os/windows/zfs/zfs_kstat_windows.c b/module/os/windows/zfs/zfs_kstat_windows.c new file mode 100644 index 000000000000..cdbc04ef81ee --- /dev/null +++ b/module/os/windows/zfs/zfs_kstat_windows.c @@ -0,0 +1,630 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2017 Jorgen Lundman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +/* + * In Solaris the tunable are set via /etc/system. Until we have a load + * time configuration, we add them to writable kstat tunables. 
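+ *
+ * Each entry in the osx_kstat table below is mirrored against a module
+ * global in osx_kstat_update(): a KSTAT_WRITE copies the kstat value
+ * into the global, e.g.
+ *
+ *	l2arc_write_max = ks->l2arc_write_max.value.ui64;
+ *
+ * and a KSTAT_READ copies the current globals back out, so the tunables
+ * can be inspected and changed at runtime through the kstat interface.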
+ * + * This table is more or less populated from IllumOS mdb zfs_params sources + * https://github.com/illumos/illumos-gate/blob/master/usr/src/cmd/mdb/common/modules/zfs/zfs.c#L336-L392 + * + */ + + + +osx_kstat_t osx_kstat = { + { "spa_version", KSTAT_DATA_UINT64 }, + { "zpl_version", KSTAT_DATA_UINT64 }, + + { "active_vnodes", KSTAT_DATA_UINT64 }, + { "vnop_debug", KSTAT_DATA_UINT64 }, + { "reclaim_nodes", KSTAT_DATA_UINT64 }, + { "ignore_negatives", KSTAT_DATA_UINT64 }, + { "ignore_positives", KSTAT_DATA_UINT64 }, + { "create_negatives", KSTAT_DATA_UINT64 }, + { "force_formd_normalized", KSTAT_DATA_UINT64 }, + { "skip_unlinked_drain", KSTAT_DATA_UINT64 }, + { "use_system_sync", KSTAT_DATA_UINT64 }, + + { "zfs_arc_max", KSTAT_DATA_UINT64 }, + { "zfs_arc_min", KSTAT_DATA_UINT64 }, + { "zfs_arc_meta_limit", KSTAT_DATA_UINT64 }, + { "zfs_arc_meta_min", KSTAT_DATA_UINT64 }, + { "zfs_arc_grow_retry", KSTAT_DATA_UINT64 }, + { "zfs_arc_shrink_shift", KSTAT_DATA_UINT64 }, + { "zfs_arc_p_min_shift", KSTAT_DATA_UINT64 }, + { "zfs_arc_average_blocksize", KSTAT_DATA_UINT64 }, + + { "l2arc_write_max", KSTAT_DATA_UINT64 }, + { "l2arc_write_boost", KSTAT_DATA_UINT64 }, + { "l2arc_headroom", KSTAT_DATA_UINT64 }, + { "l2arc_headroom_boost", KSTAT_DATA_UINT64 }, + { "l2arc_max_block_size", KSTAT_DATA_UINT64 }, + { "l2arc_feed_secs", KSTAT_DATA_UINT64 }, + { "l2arc_feed_min_ms", KSTAT_DATA_UINT64 }, + + { "max_active", KSTAT_DATA_UINT64 }, + { "sync_read_min_active", KSTAT_DATA_UINT64 }, + { "sync_read_max_active", KSTAT_DATA_UINT64 }, + { "sync_write_min_active", KSTAT_DATA_UINT64 }, + { "sync_write_max_active", KSTAT_DATA_UINT64 }, + { "async_read_min_active", KSTAT_DATA_UINT64 }, + { "async_read_max_active", KSTAT_DATA_UINT64 }, + { "async_write_min_active", KSTAT_DATA_UINT64 }, + { "async_write_max_active", KSTAT_DATA_UINT64 }, + { "scrub_min_active", KSTAT_DATA_UINT64 }, + { "scrub_max_active", KSTAT_DATA_UINT64 }, + { "async_write_min_dirty_pct", KSTAT_DATA_INT64 }, + { "async_write_max_dirty_pct", KSTAT_DATA_INT64 }, + { "aggregation_limit", KSTAT_DATA_INT64 }, + { "read_gap_limit", KSTAT_DATA_INT64 }, + { "write_gap_limit", KSTAT_DATA_INT64 }, + + {"arc_reduce_dnlc_percent", KSTAT_DATA_INT64 }, + {"arc_lotsfree_percent", KSTAT_DATA_INT64 }, + {"zfs_dirty_data_max", KSTAT_DATA_INT64 }, + {"zfs_dirty_data_sync", KSTAT_DATA_INT64 }, + {"zfs_delay_max_ns", KSTAT_DATA_INT64 }, + {"zfs_delay_min_dirty_percent", KSTAT_DATA_INT64 }, + {"zfs_delay_scale", KSTAT_DATA_INT64 }, + {"spa_asize_inflation", KSTAT_DATA_INT64 }, + {"zfs_mdcomp_disable", KSTAT_DATA_INT64 }, + {"zfs_prefetch_disable", KSTAT_DATA_INT64 }, + {"zfetch_max_streams", KSTAT_DATA_INT64 }, + {"zfetch_min_sec_reap", KSTAT_DATA_INT64 }, + {"zfetch_array_rd_sz", KSTAT_DATA_INT64 }, + {"zfs_default_bs", KSTAT_DATA_INT64 }, + {"zfs_default_ibs", KSTAT_DATA_INT64 }, + {"metaslab_aliquot", KSTAT_DATA_INT64 }, + {"spa_max_replication_override",KSTAT_DATA_INT64 }, + {"spa_mode_global", KSTAT_DATA_INT64 }, + {"zfs_flags", KSTAT_DATA_INT64 }, + {"zfs_txg_timeout", KSTAT_DATA_INT64 }, + {"zfs_vdev_cache_max", KSTAT_DATA_INT64 }, + {"zfs_vdev_cache_size", KSTAT_DATA_INT64 }, + {"zfs_vdev_cache_bshift", KSTAT_DATA_INT64 }, + {"vdev_mirror_shift", KSTAT_DATA_INT64 }, + {"zfs_scrub_limit", KSTAT_DATA_INT64 }, + {"zfs_no_scrub_io", KSTAT_DATA_INT64 }, + {"zfs_no_scrub_prefetch", KSTAT_DATA_INT64 }, + {"fzap_default_block_shift", KSTAT_DATA_INT64 }, + {"zfs_immediate_write_sz", KSTAT_DATA_INT64 }, + {"zfs_read_chunk_size", KSTAT_DATA_INT64 }, + 
{"zfs_nocacheflush", KSTAT_DATA_INT64 }, + {"zil_replay_disable", KSTAT_DATA_INT64 }, + {"metaslab_df_alloc_threshold", KSTAT_DATA_INT64 }, + {"metaslab_df_free_pct", KSTAT_DATA_INT64 }, + {"zio_injection_enabled", KSTAT_DATA_INT64 }, + {"zvol_immediate_write_sz", KSTAT_DATA_INT64 }, + + { "l2arc_noprefetch", KSTAT_DATA_INT64 }, + { "l2arc_feed_again", KSTAT_DATA_INT64 }, + { "l2arc_norw", KSTAT_DATA_INT64 }, + + {"zfs_recover", KSTAT_DATA_INT64 }, + + {"zfs_free_bpobj_enabled", KSTAT_DATA_INT64 }, + + {"zfs_send_corrupt_data", KSTAT_DATA_UINT64 }, + {"zfs_send_queue_length", KSTAT_DATA_UINT64 }, + {"zfs_recv_queue_length", KSTAT_DATA_UINT64 }, + + {"zvol_inhibit_dev", KSTAT_DATA_UINT64 }, + {"zfs_send_set_freerecords_bit",KSTAT_DATA_UINT64 }, + + {"zfs_write_implies_delete_child",KSTAT_DATA_UINT64 }, + {"zfs_send_holes_without_brth_tme",KSTAT_DATA_UINT64 }, + + {"dbuf_cache_max_bytes", KSTAT_DATA_UINT64 }, + + {"zfs_vdev_queue_depth_pct", KSTAT_DATA_UINT64 }, + {"zio_dva_throttle_enabled", KSTAT_DATA_UINT64 }, + + {"zfs_vdev_file_size_mismatch_cnt",KSTAT_DATA_UINT64 }, + + {"zfs_lua_max_instrlimit", KSTAT_DATA_UINT64 }, + {"zfs_lua_max_memlimit", KSTAT_DATA_UINT64 }, + + {"zfs_trim_extent_bytes_max", KSTAT_DATA_UINT64 }, + {"zfs_trim_extent_bytes_min", KSTAT_DATA_UINT64 }, + {"zfs_trim_metaslab_skip", KSTAT_DATA_UINT64 }, + {"zfs_trim_txg_batch", KSTAT_DATA_UINT64 }, + {"zfs_trim_queue_limit", KSTAT_DATA_UINT64 }, + + {"hostid", KSTAT_DATA_UINT32 }, + {"send_unmodified_spill_blocks", KSTAT_DATA_UINT64 }, + {"special_class_metadata_rsrv_pct", KSTAT_DATA_UINT64 }, + + { "zfs_disable_wincache", KSTAT_DATA_UINT64 }, + { "zfs_disable_removablemedia", KSTAT_DATA_UINT64 }, + { "zfs_vdev_initialize_value", KSTAT_DATA_UINT64 }, +}; + + + + +static kstat_t *osx_kstat_ksp; + +#if !defined (__OPTIMIZE__) +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif + +static int osx_kstat_update(kstat_t *ksp, int rw) +{ + osx_kstat_t *ks = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + + /* win32 */ + + debug_vnop_osx_printf = ks->win32_debug.value.ui64; + extern void saveBuffer(void); + if (ks->win32_debug.value.ui64 == 1337) + saveBuffer(); + if (ks->win32_debug.value.ui64 == 9119) + panic("ZFS: User requested panic\n"); + zfs_vnop_ignore_negatives = ks->win32_ignore_negatives.value.ui64; + zfs_vnop_ignore_positives = ks->win32_ignore_positives.value.ui64; + zfs_vnop_create_negatives = ks->win32_create_negatives.value.ui64; + zfs_vnop_force_formd_normalized_output = ks->win32_force_formd_normalized.value.ui64; + zfs_vnop_skip_unlinked_drain = ks->win32_skip_unlinked_drain.value.ui64; + zfs_vfs_sync_paranoia = ks->win32_use_system_sync.value.ui64; + + /* ARC */ + arc_kstat_update(ksp, rw); + arc_kstat_update_osx(ksp, rw); + + /* L2ARC */ + l2arc_write_max = ks->l2arc_write_max.value.ui64; + l2arc_write_boost = ks->l2arc_write_boost.value.ui64; + l2arc_headroom = ks->l2arc_headroom.value.ui64; + l2arc_headroom_boost = ks->l2arc_headroom_boost.value.ui64; + l2arc_max_block_size = ks->l2arc_max_block_size.value.ui64; + l2arc_feed_secs = ks->l2arc_feed_secs.value.ui64; + l2arc_feed_min_ms = ks->l2arc_feed_min_ms.value.ui64; + + l2arc_noprefetch = ks->l2arc_noprefetch.value.i64; + l2arc_feed_again = ks->l2arc_feed_again.value.i64; + l2arc_norw = ks->l2arc_norw.value.i64; + + /* vdev_queue */ + + zfs_vdev_max_active = + ks->zfs_vdev_max_active.value.ui64; + zfs_vdev_sync_read_min_active = + ks->zfs_vdev_sync_read_min_active.value.ui64; + zfs_vdev_sync_read_max_active = + 
ks->zfs_vdev_sync_read_max_active.value.ui64; + zfs_vdev_sync_write_min_active = + ks->zfs_vdev_sync_write_min_active.value.ui64; + zfs_vdev_sync_write_max_active = + ks->zfs_vdev_sync_write_max_active.value.ui64; + zfs_vdev_async_read_min_active = + ks->zfs_vdev_async_read_min_active.value.ui64; + zfs_vdev_async_read_max_active = + ks->zfs_vdev_async_read_max_active.value.ui64; + zfs_vdev_async_write_min_active = + ks->zfs_vdev_async_write_min_active.value.ui64; + zfs_vdev_async_write_max_active = + ks->zfs_vdev_async_write_max_active.value.ui64; + zfs_vdev_scrub_min_active = + ks->zfs_vdev_scrub_min_active.value.ui64; + zfs_vdev_scrub_max_active = + ks->zfs_vdev_scrub_max_active.value.ui64; + zfs_vdev_async_write_active_min_dirty_percent = + ks->zfs_vdev_async_write_active_min_dirty_percent.value.i64; + zfs_vdev_async_write_active_max_dirty_percent = + ks->zfs_vdev_async_write_active_max_dirty_percent.value.i64; + zfs_vdev_aggregation_limit = + ks->zfs_vdev_aggregation_limit.value.i64; + zfs_vdev_read_gap_limit = + ks->zfs_vdev_read_gap_limit.value.i64; + zfs_vdev_write_gap_limit = + ks->zfs_vdev_write_gap_limit.value.i64; + + arc_reduce_dnlc_percent = + ks->arc_reduce_dnlc_percent.value.i64; + arc_lotsfree_percent = + ks->arc_lotsfree_percent.value.i64; + zfs_dirty_data_max = + ks->zfs_dirty_data_max.value.i64; + zfs_dirty_data_sync = + ks->zfs_dirty_data_sync.value.i64; + zfs_delay_max_ns = + ks->zfs_delay_max_ns.value.i64; + zfs_delay_min_dirty_percent = + ks->zfs_delay_min_dirty_percent.value.i64; + zfs_delay_scale = + ks->zfs_delay_scale.value.i64; + spa_asize_inflation = + ks->spa_asize_inflation.value.i64; + zfs_mdcomp_disable = + ks->zfs_mdcomp_disable.value.i64; + zfs_prefetch_disable = + ks->zfs_prefetch_disable.value.i64; + zfetch_max_streams = + ks->zfetch_max_streams.value.i64; + zfetch_min_sec_reap = + ks->zfetch_min_sec_reap.value.i64; + zfetch_array_rd_sz = + ks->zfetch_array_rd_sz.value.i64; + zfs_default_bs = + ks->zfs_default_bs.value.i64; + zfs_default_ibs = + ks->zfs_default_ibs.value.i64; + metaslab_aliquot = + ks->metaslab_aliquot.value.i64; + spa_max_replication_override = + ks->spa_max_replication_override.value.i64; + spa_mode_global = + ks->spa_mode_global.value.i64; + zfs_flags = + ks->zfs_flags.value.i64; + zfs_txg_timeout = + ks->zfs_txg_timeout.value.i64; + zfs_vdev_cache_max = + ks->zfs_vdev_cache_max.value.i64; + zfs_vdev_cache_size = + ks->zfs_vdev_cache_size.value.i64; + zfs_no_scrub_io = + ks->zfs_no_scrub_io.value.i64; + zfs_no_scrub_prefetch = + ks->zfs_no_scrub_prefetch.value.i64; + fzap_default_block_shift = + ks->fzap_default_block_shift.value.i64; + zfs_immediate_write_sz = + ks->zfs_immediate_write_sz.value.i64; + zfs_read_chunk_size = + ks->zfs_read_chunk_size.value.i64; + zfs_nocacheflush = + ks->zfs_nocacheflush.value.i64; + zil_replay_disable = + ks->zil_replay_disable.value.i64; + metaslab_df_alloc_threshold = + ks->metaslab_df_alloc_threshold.value.i64; + metaslab_df_free_pct = + ks->metaslab_df_free_pct.value.i64; + zio_injection_enabled = + ks->zio_injection_enabled.value.i64; + zvol_immediate_write_sz = + ks->zvol_immediate_write_sz.value.i64; + + zfs_recover = + ks->zfs_recover.value.i64; + + zfs_free_bpobj_enabled = + ks->zfs_free_bpobj_enabled.value.i64; + + zfs_send_corrupt_data = + ks->zfs_send_corrupt_data.value.ui64; + zfs_send_queue_length = + ks->zfs_send_queue_length.value.ui64; + zfs_recv_queue_length = + ks->zfs_recv_queue_length.value.ui64; + + zvol_inhibit_dev = + ks->zvol_inhibit_dev.value.ui64; + 
zfs_send_set_freerecords_bit = + ks->zfs_send_set_freerecords_bit.value.ui64; + + zfs_write_implies_delete_child = + ks->zfs_write_implies_delete_child.value.ui64; + send_holes_without_birth_time = + ks->zfs_send_holes_without_birth_time.value.ui64; + + dbuf_cache_max_bytes = + ks->dbuf_cache_max_bytes.value.ui64; + + zfs_vdev_queue_depth_pct = + ks->zfs_vdev_queue_depth_pct.value.ui64; + + zio_dva_throttle_enabled = + (boolean_t) ks->zio_dva_throttle_enabled.value.ui64; + + zfs_lua_max_instrlimit = + ks->zfs_lua_max_instrlimit.value.ui64; + zfs_lua_max_memlimit = + ks->zfs_lua_max_memlimit.value.ui64; + + zfs_trim_extent_bytes_max = + ks->zfs_trim_extent_bytes_max.value.ui64; + zfs_trim_extent_bytes_min = + ks->zfs_trim_extent_bytes_min.value.ui64; + zfs_trim_metaslab_skip = + ks->zfs_trim_metaslab_skip.value.ui64; + zfs_trim_txg_batch = + ks->zfs_trim_txg_batch.value.ui64; + zfs_trim_queue_limit = + ks->zfs_trim_queue_limit.value.ui64; + + spl_hostid = ks->win32_hw_hostid.value.ui32; + zfs_send_unmodified_spill_blocks = + ks->zfs_send_unmodified_spill_blocks.value.ui64; + zfs_special_class_metadata_reserve_pct = + ks->zfs_special_class_metadata_reserve_pct.value.ui64; + + zfs_disable_wincache = + ks->zfs_disable_wincache.value.ui64; + zfs_disable_removablemedia = + ks->zfs_disable_removablemedia.value.ui64; + zfs_initialize_value = + ks->zfs_vdev_initialize_value.value.ui64; + } else { + + /* kstat READ */ + ks->spa_version.value.ui64 = SPA_VERSION; + ks->zpl_version.value.ui64 = ZPL_VERSION; + + /* win32 */ + ks->win32_active_vnodes.value.ui64 = vnop_num_vnodes; + ks->win32_reclaim_nodes.value.ui64 = vnop_num_reclaims; + ks->win32_debug.value.ui64 = debug_vnop_osx_printf; + ks->win32_ignore_negatives.value.ui64 = zfs_vnop_ignore_negatives; + ks->win32_ignore_positives.value.ui64 = zfs_vnop_ignore_positives; + ks->win32_create_negatives.value.ui64 = zfs_vnop_create_negatives; + ks->win32_force_formd_normalized.value.ui64 = zfs_vnop_force_formd_normalized_output; + ks->win32_skip_unlinked_drain.value.ui64 = zfs_vnop_skip_unlinked_drain; + ks->win32_use_system_sync.value.ui64 = zfs_vfs_sync_paranoia; + + /* ARC */ + arc_kstat_update(ksp, rw); + arc_kstat_update_osx(ksp, rw); + + /* L2ARC */ + ks->l2arc_write_max.value.ui64 = l2arc_write_max; + ks->l2arc_write_boost.value.ui64 = l2arc_write_boost; + ks->l2arc_headroom.value.ui64 = l2arc_headroom; + ks->l2arc_headroom_boost.value.ui64 = l2arc_headroom_boost; + ks->l2arc_max_block_size.value.ui64 = l2arc_max_block_size; + ks->l2arc_feed_secs.value.ui64 = l2arc_feed_secs; + ks->l2arc_feed_min_ms.value.ui64 = l2arc_feed_min_ms; + + ks->l2arc_noprefetch.value.i64 = l2arc_noprefetch; + ks->l2arc_feed_again.value.i64 = l2arc_feed_again; + ks->l2arc_norw.value.i64 = l2arc_norw; + + /* vdev_queue */ + ks->zfs_vdev_max_active.value.ui64 = + zfs_vdev_max_active ; + ks->zfs_vdev_sync_read_min_active.value.ui64 = + zfs_vdev_sync_read_min_active ; + ks->zfs_vdev_sync_read_max_active.value.ui64 = + zfs_vdev_sync_read_max_active ; + ks->zfs_vdev_sync_write_min_active.value.ui64 = + zfs_vdev_sync_write_min_active ; + ks->zfs_vdev_sync_write_max_active.value.ui64 = + zfs_vdev_sync_write_max_active ; + ks->zfs_vdev_async_read_min_active.value.ui64 = + zfs_vdev_async_read_min_active ; + ks->zfs_vdev_async_read_max_active.value.ui64 = + zfs_vdev_async_read_max_active ; + ks->zfs_vdev_async_write_min_active.value.ui64 = + zfs_vdev_async_write_min_active ; + ks->zfs_vdev_async_write_max_active.value.ui64 = + zfs_vdev_async_write_max_active ; + 
ks->zfs_vdev_scrub_min_active.value.ui64 = + zfs_vdev_scrub_min_active ; + ks->zfs_vdev_scrub_max_active.value.ui64 = + zfs_vdev_scrub_max_active ; + ks->zfs_vdev_async_write_active_min_dirty_percent.value.i64 = + zfs_vdev_async_write_active_min_dirty_percent ; + ks->zfs_vdev_async_write_active_max_dirty_percent.value.i64 = + zfs_vdev_async_write_active_max_dirty_percent ; + ks->zfs_vdev_aggregation_limit.value.i64 = + zfs_vdev_aggregation_limit ; + ks->zfs_vdev_read_gap_limit.value.i64 = + zfs_vdev_read_gap_limit ; + ks->zfs_vdev_write_gap_limit.value.i64 = + zfs_vdev_write_gap_limit; + + ks->arc_reduce_dnlc_percent.value.i64 = + arc_reduce_dnlc_percent; + ks->arc_lotsfree_percent.value.i64 = + arc_lotsfree_percent; + ks->zfs_dirty_data_max.value.i64 = + zfs_dirty_data_max; + ks->zfs_dirty_data_sync.value.i64 = + zfs_dirty_data_sync; + ks->zfs_delay_max_ns.value.i64 = + zfs_delay_max_ns; + ks->zfs_delay_min_dirty_percent.value.i64 = + zfs_delay_min_dirty_percent; + ks->zfs_delay_scale.value.i64 = + zfs_delay_scale; + ks->spa_asize_inflation.value.i64 = + spa_asize_inflation; + ks->zfs_mdcomp_disable.value.i64 = + zfs_mdcomp_disable; + ks->zfs_prefetch_disable.value.i64 = + zfs_prefetch_disable; + ks->zfetch_max_streams.value.i64 = + zfetch_max_streams; + ks->zfetch_min_sec_reap.value.i64 = + zfetch_min_sec_reap; + ks->zfetch_array_rd_sz.value.i64 = + zfetch_array_rd_sz; + ks->zfs_default_bs.value.i64 = + zfs_default_bs; + ks->zfs_default_ibs.value.i64 = + zfs_default_ibs; + ks->metaslab_aliquot.value.i64 = + metaslab_aliquot; + ks->spa_max_replication_override.value.i64 = + spa_max_replication_override; + ks->spa_mode_global.value.i64 = + spa_mode_global; + ks->zfs_flags.value.i64 = + zfs_flags; + ks->zfs_txg_timeout.value.i64 = + zfs_txg_timeout; + ks->zfs_vdev_cache_max.value.i64 = + zfs_vdev_cache_max; + ks->zfs_vdev_cache_size.value.i64 = + zfs_vdev_cache_size; + ks->zfs_no_scrub_io.value.i64 = + zfs_no_scrub_io; + ks->zfs_no_scrub_prefetch.value.i64 = + zfs_no_scrub_prefetch; + ks->fzap_default_block_shift.value.i64 = + fzap_default_block_shift; + ks->zfs_immediate_write_sz.value.i64 = + zfs_immediate_write_sz; + ks->zfs_read_chunk_size.value.i64 = + zfs_read_chunk_size; + ks->zfs_nocacheflush.value.i64 = + zfs_nocacheflush; + ks->zil_replay_disable.value.i64 = + zil_replay_disable; + ks->metaslab_df_alloc_threshold.value.i64 = + metaslab_df_alloc_threshold; + ks->metaslab_df_free_pct.value.i64 = + metaslab_df_free_pct; + ks->zio_injection_enabled.value.i64 = + zio_injection_enabled; + ks->zvol_immediate_write_sz.value.i64 = + zvol_immediate_write_sz; + + ks->zfs_recover.value.i64 = + zfs_recover; + + ks->zfs_free_bpobj_enabled.value.i64 = + zfs_free_bpobj_enabled; + + ks->zfs_send_corrupt_data.value.ui64 = + zfs_send_corrupt_data; + ks->zfs_send_queue_length.value.ui64 = + zfs_send_queue_length; + ks->zfs_recv_queue_length.value.ui64 = + zfs_recv_queue_length; + + ks->zvol_inhibit_dev.value.ui64 = + zvol_inhibit_dev; + ks->zfs_send_set_freerecords_bit.value.ui64 = + zfs_send_set_freerecords_bit; + + ks->zfs_write_implies_delete_child.value.ui64 = + zfs_write_implies_delete_child; + ks->zfs_send_holes_without_birth_time.value.ui64 = + send_holes_without_birth_time; + + ks->dbuf_cache_max_bytes.value.ui64 = dbuf_cache_max_bytes; + + ks->zfs_vdev_queue_depth_pct.value.ui64 = zfs_vdev_queue_depth_pct; + ks->zio_dva_throttle_enabled.value.ui64 = (uint64_t) zio_dva_throttle_enabled; + + ks->zfs_vdev_file_size_mismatch_cnt.value.ui64 = zfs_vdev_file_size_mismatch_cnt; + + 
ks->zfs_lua_max_instrlimit.value.ui64 = zfs_lua_max_instrlimit; + ks->zfs_lua_max_memlimit.value.ui64 = zfs_lua_max_memlimit; + + ks->zfs_trim_extent_bytes_max.value.ui64 = + zfs_trim_extent_bytes_max; + ks->zfs_trim_extent_bytes_min.value.ui64 = + zfs_trim_extent_bytes_min; + ks->zfs_trim_metaslab_skip.value.ui64 = + zfs_trim_metaslab_skip; + ks->zfs_trim_txg_batch.value.ui64 = + zfs_trim_txg_batch; + ks->zfs_trim_queue_limit.value.ui64 = + zfs_trim_queue_limit; + + ks->win32_hw_hostid.value.ui32 = spl_hostid; + ks->zfs_send_unmodified_spill_blocks.value.ui64 = + zfs_send_unmodified_spill_blocks; + ks->zfs_special_class_metadata_reserve_pct.value.ui64 = + zfs_special_class_metadata_reserve_pct; + + ks->zfs_disable_wincache.value.ui64 = + zfs_disable_wincache; + ks->zfs_disable_removablemedia.value.ui64 = + zfs_disable_removablemedia; + ks->zfs_vdev_initialize_value.value.ui64 = + zfs_initialize_value; + } + + return 0; +} + +int kstat_osx_init(PUNICODE_STRING RegistryPath) +{ + int error = 0; + + osx_kstat_ksp = kstat_create("zfs", 0, "tunable", "win32", + KSTAT_TYPE_NAMED, sizeof (osx_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); + + if (osx_kstat_ksp != NULL) { + osx_kstat_ksp->ks_data = &osx_kstat; + osx_kstat_ksp->ks_update = osx_kstat_update; + kstat_install(osx_kstat_ksp); + + // We don't hold the ksp here, only call at init, so there are + // no other threads. + KSTAT_ENTER(osx_kstat_ksp); + error = KSTAT_UPDATE(osx_kstat_ksp, KSTAT_READ); + if (error != 0) goto out; + + // Returns number of changed, zero means nothing to do. + error = spl_kstat_registry(RegistryPath, osx_kstat_ksp); + if (error == 0) goto out; + + error = KSTAT_UPDATE(osx_kstat_ksp, KSTAT_WRITE); + + out: + KSTAT_EXIT(osx_kstat_ksp); + } + + return 0; +} + +void kstat_osx_fini(void) +{ + if (osx_kstat_ksp != NULL) { + kstat_delete(osx_kstat_ksp); + osx_kstat_ksp = NULL; + } +} diff --git a/module/os/windows/zfs/zfs_vfsops.c b/module/os/windows/zfs/zfs_vfsops.c new file mode 100644 index 000000000000..ba4ea1d4b65d --- /dev/null +++ b/module/os/windows/zfs/zfs_vfsops.c @@ -0,0 +1,4054 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
+ */ + +/* Portions Copyright 2010 Robert Milkowski */ +/* Portions Copyright 2013 Jorgen Lundman */ + +#include + +#ifndef _WIN32 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fs/fs_subr.h" +#include +#endif /* !_WIN32 */ + +#include + +#ifndef _WIN32 +#include +#include +#include +#endif /* !_WIN32 */ + +#include + +#include +#include + +#ifndef _WIN32 +#include +#include +#endif /* !_WIN32 */ + +#include + +#include +#include +#ifndef _WIN32 +#include +#include +#include +#include +#include +#include +#include +#endif /* !_WIN32 */ + +#include + +#ifndef _WIN32 +#include +#include +#include +#include +#endif /* !_WIN32 */ + +#include + +#ifndef _WIN32 +#include +#endif /* !_WIN32 */ + +#ifdef __LINUX__ +#include +#endif /* __LINUX__ */ + +#include "zfs_comutil.h" + +#ifdef _WIN32 +#include +#include +#include +#endif /* _WIN32 */ + +//#define dprintf kprintf +//#define dprintf printf + +#ifdef _WIN32 +unsigned int zfs_vnop_skip_unlinked_drain = 0; + +//int zfs_module_start(kmod_info_t *ki, void *data); +//int zfs_module_stop(kmod_info_t *ki, void *data); +extern int getzfsvfs(const char *dsname, zfsvfs_t **zfvp); + + +/* + * AVL tree of hardlink entries, which we need to map for Finder. The va_linkid + * needs to be unique for each hardlink target, as well as, return the znode + * in vget(va_linkid). Unfortunately, the va_linkid is 32bit (lost in the + * syscall translation to userland struct). We sort the AVL tree by + * -> directory id + * -> z_id + * -> name + * + */ +static int hardlinks_compare(const void *arg1, const void *arg2) +{ + const hardlinks_t *node1 = arg1; + const hardlinks_t *node2 = arg2; + int value; + if (node1->hl_parent > node2->hl_parent) + return 1; + if (node1->hl_parent < node2->hl_parent) + return -1; + if (node1->hl_fileid > node2->hl_fileid) + return 1; + if (node1->hl_fileid < node2->hl_fileid) + return -1; + + value = strncmp(node1->hl_name, node2->hl_name, PATH_MAX); + if (value < 0) return -1; + if (value > 0) return 1; + return 0; +} + +/* + * Lookup same information from linkid, to get at parentid, objid and name + */ +static int hardlinks_compare_linkid(const void *arg1, const void *arg2) +{ + const hardlinks_t *node1 = arg1; + const hardlinks_t *node2 = arg2; + if (node1->hl_linkid > node2->hl_linkid) + return 1; + if (node1->hl_linkid < node2->hl_linkid) + return -1; + return 0; +} + + + + +/* + * Mac OS X needs a file system modify time + * + * We use the mtime of the "com.apple.system.mtime" + * extended attribute, which is associated with the + * file system root directory. This attribute has + * no associated data. + */ +#define ZFS_MTIME_XATTR "com.apple.system.mtime" + +extern int zfs_obtain_xattr(znode_t *, const char *, mode_t, cred_t *, vnode_t **, int); + + +/* + * We need to keep a count of active fs's. + * This is necessary to prevent our kext + * from being unloaded after a umount -f + */ +uint32_t zfs_active_fs_count = 0; + +extern void zfs_ioctl_init(void); +extern void zfs_ioctl_fini(void); + +#endif + +static int +zfsvfs_parse_option(char *option, char *value, vfs_t *vfsp) +{ + if (!option || !*option) return 0; + dprintf("parse '%s' '%s'\n", option?option:"", + value?value:""); + if (!strcasecmp(option, "readonly")) { + if (value && *value && + !strcasecmp(value, "off")) + vfs_clearflags(vfsp, (uint64_t)MNT_RDONLY); + else + vfs_setflags(vfsp, (uint64_t)MNT_RDONLY); + } + return 0; +} + +/* + * Parse the raw mntopts and return a vfs_t describing the options. 
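+ * The string is a comma-separated list of "name=value" pairs; leading
+ * spaces before each name and value are skipped. Each pair is handed to
+ * zfsvfs_parse_option(), which at present only acts on "readonly"
+ * ("off" clears MNT_RDONLY, any other value sets it).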
+ */ +static int +zfsvfs_parse_options(char *mntopts, vfs_t *vfsp) +{ + int error = 0; + + if (mntopts != NULL) { + char *p, *t, *v; + char *keep; + + int len = strlen(mntopts) + 1; + keep = kmem_alloc(len, KM_SLEEP); + t = keep; + memcpy(t, mntopts, len); + + while(1) { + while (t && *t == ' ') t++; + + p = strpbrk(t, ","); + if (p) *p = 0; + + // find "=" + v = strpbrk(t, "="); + if (v) { + *v = 0; + v++; + while (*v == ' ') v++; + } + error = zfsvfs_parse_option(t, v, vfsp); + if (error) break; + if (!p) break; + t = &p[1]; + } + kmem_free(keep, len); + } + + return (error); +} + + +/* The OS sync ignored by default, as ZFS handles internal periodic + * syncs. (As per illumos) Unfortunately, we can not tell the difference + * of when users run "sync" by hand. Sync is called on umount though. + */ +uint64_t zfs_vfs_sync_paranoia = 0; + +int +zfs_vfs_sync(struct mount *vfsp, int waitfor, vfs_context_t *context) +{ + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (spl_panicstr()) + return (0); + + /* Check if sysctl setting wants sync - and we are not unmounting */ + if (zfs_vfs_sync_paranoia == 0 && + !vfs_isunmount(vfsp)) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. + */ +#if 1 + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + dsl_pool_t *dp; + + ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (spl_system_inshutdown() && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + + ZFS_EXIT(zfsvfs); + +#endif + } else { +#if 1 + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); +#endif + } + + return (0); + +} + + + +#ifndef _WIN32 +static int +zfs_create_unique_device(dev_t *dev) +{ + major_t new_major; + + do { + ASSERT3U(zfs_minor, <=, MAXMIN32); + minor_t start = zfs_minor; + do { + mutex_enter(&zfs_dev_mtx); + if (zfs_minor >= MAXMIN32) { + /* + * If we're still using the real major + * keep out of /dev/zfs and /dev/zvol minor + * number space. If we're using a getudev()'ed + * major number, we can use all of its minors. + */ + if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) + zfs_minor = ZFS_MIN_MINOR; + else + zfs_minor = 0; + } else { + zfs_minor++; + } + *dev = makedevice(zfs_major, zfs_minor); + mutex_exit(&zfs_dev_mtx); + } while (vfs_devismounted(*dev) && zfs_minor != start); + if (zfs_minor == start) { + /* + * We are using all ~262,000 minor numbers for the + * current major number. Create a new major number. 
+ */ + if ((new_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "zfs_mount: Can't get unique major " + "device number."); + return (-1); + } + mutex_enter(&zfs_dev_mtx); + zfs_major = new_major; + zfs_minor = 0; + + mutex_exit(&zfs_dev_mtx); + } else { + break; + } + /* CONSTANTCONDITION */ + } while (1); + + return (0); +} +#endif /* !__FreeBSD__ */ + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == B_TRUE) { + zfsvfs->z_atime = B_TRUE; + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOATIME); + } else { + zfsvfs->z_atime = B_FALSE; + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOATIME); + } +} + +#ifdef LINUX +static void +relatime_changed_cb(void *arg, uint64_t newval) +{ + ((zfs_sb_t *)arg)->z_relatime = newval; +} +#endif + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + /* + * Apple does have MNT_NOUSERXATTR mount option, but unfortunately the VFS + * layer returns EACCESS if xattr access is attempted. Finder etc, will + * do so, even if filesystem capabilities is set without xattr, rendering + * the mount option useless. We no longer set it, and handle xattrs being + * disabled internally. + */ + + if (newval == ZFS_XATTR_OFF) { + zfsvfs->z_xattr = B_FALSE; + //vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOUSERXATTR); + } else { + zfsvfs->z_xattr = B_TRUE; + //vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOUSERXATTR); + + if (newval == ZFS_XATTR_SA) + zfsvfs->z_xattr_sa = B_TRUE; + else + zfsvfs->z_xattr_sa = B_FALSE; + } +} + +#if 0 // unused function +static void +acltype_changed_cb(void *arg, uint64_t newval) +{ +#ifdef LINUX + switch (newval) { + case ZFS_ACLTYPE_OFF: + zsb->z_acl_type = ZFS_ACLTYPE_OFF; + zsb->z_sb->s_flags &= ~MS_POSIXACL; + break; + case ZFS_ACLTYPE_POSIXACL: +#ifdef CONFIG_FS_POSIX_ACL + zsb->z_acl_type = ZFS_ACLTYPE_POSIXACL; + zsb->z_sb->s_flags |= MS_POSIXACL; +#else + zsb->z_acl_type = ZFS_ACLTYPE_OFF; + zsb->z_sb->s_flags &= ~MS_POSIXACL; +#endif /* CONFIG_FS_POSIX_ACL */ + break; + default: + break; + } +#endif +} +#endif + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); + + zfsvfs->z_max_blksz = newval; + //zfsvfs->z_vfs->mnt_stat.f_iosize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_TRUE) { + /* XXX locking on vfs_flag? */ + + // We need to release the mtime_vp when readonly, as it will not + // call VNOP_SYNC in RDONLY. + +#if 0 + if (zfsvfs->z_mtime_vp) { + vnode_rele(zfsvfs->z_mtime_vp); + vnode_recycle(zfsvfs->z_mtime_vp); + zfsvfs->z_mtime_vp = NULL; + } +#endif + // Flush any writes + //vflush(mp, NULLVP, SKIPSYSTEM); + + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_RDONLY); + zfsvfs->z_rdonly = 1; + } else { + // FIXME, we don't re-open mtime_vp here. 
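+		// Re-enable writes now that the readonly property is off.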
+ vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_RDONLY); + zfsvfs->z_rdonly = 0; + } +} + +static void +devices_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NODEV); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NODEV); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOSUID); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOSUID); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_NOEXEC); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_NOEXEC); + } +} + +/* + * The nbmand mount option can be changed at mount time. + * We can't allow it to be toggled on live file systems or incorrect + * behavior may be seen from cifs clients + * + * This property isn't registered via dsl_prop_register(), but this callback + * will be called when a file system is first mounted + */ +#if 0 // unused function +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ +#if 0 + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); + } else { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); + } +#endif +} +#endif + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + zfsvfs->z_show_ctldir = newval; + dnlc_purge_vfsp(zfsvfs->z_vfs, 0); +} + +static void +vscan_changed_cb(void *arg, uint64_t newval) +{ + //zfsvfs_t *zfsvfs = arg; + + //zfsvfs->z_vscan = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_inherit = newval; +} + +#ifdef _WIN32 +static void +finderbrowse_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_DONTBROWSE); + } else { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_DONTBROWSE); + } +} +static void +ignoreowner_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == B_FALSE) { + vfs_clearflags(zfsvfs->z_vfs, (uint64_t)MNT_IGNORE_OWNERSHIP); + } else { + vfs_setflags(zfsvfs->z_vfs, (uint64_t)MNT_IGNORE_OWNERSHIP); + } +} + +static void +mimic_hfs_changed_cb(void *arg, uint64_t newval) +{ + // FIXME - what do we do in here? 
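+	// Current behavior: report the filesystem type as "hfs" instead of
+	// "zfs" in vfsstatfs when the mimic property is enabled.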
+ zfsvfs_t *zfsvfs = arg; + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(zfsvfs->z_vfs); + + if(newval == 0) { + strlcpy(vfsstatfs->f_fstypename, "zfs", MFSTYPENAMELEN); + } else { + strlcpy(vfsstatfs->f_fstypename, "hfs", MFSTYPENAMELEN); + } +} + +#endif + +static int +zfs_register_callbacks(struct mount *vfsp) +{ + struct dsl_dataset *ds = NULL; + + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + boolean_t readonly = B_FALSE; + boolean_t do_readonly = B_FALSE; + boolean_t setuid = B_FALSE; + boolean_t do_setuid = B_FALSE; + boolean_t exec = B_FALSE; + boolean_t do_exec = B_FALSE; + boolean_t devices = B_FALSE; + boolean_t do_devices = B_FALSE; + boolean_t xattr = B_FALSE; + boolean_t do_xattr = B_FALSE; + boolean_t atime = B_FALSE; + boolean_t do_atime = B_FALSE; + boolean_t finderbrowse = B_FALSE; + boolean_t do_finderbrowse = B_FALSE; + boolean_t ignoreowner = B_FALSE; + boolean_t do_ignoreowner = B_FALSE; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfs_fsprivate(vfsp); + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * This function can be called for a snapshot when we update snapshot's + * mount point, which isn't really supported. + */ + if (dmu_objset_is_snapshot(os)) + return (EOPNOTSUPP); + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. + */ +#define vfs_optionisset(X, Y, Z) (vfs_flags(X)&(Y)) + + if (vfs_optionisset(vfsp, MNT_RDONLY, NULL) || + !spa_writeable(dmu_objset_spa(os))) { + readonly = B_TRUE; + do_readonly = B_TRUE; +#ifndef _WIN32 + /* Apple has no option to pass RW to mount, ie + * zfs set readonly=on D ; zfs mount -o rw D + */ + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; +#endif + } + if (vfs_optionisset(vfsp, MNT_NODEV, NULL)) { + devices = B_FALSE; + do_devices = B_TRUE; +#ifndef _WIN32 + } else { + devices = B_TRUE; + do_devices = B_TRUE; +#endif + } + /* xnu SETUID, not IllumOS SUID */ + if (vfs_optionisset(vfsp, MNT_NOSUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; +#ifndef _WIN32 + } else { + setuid = B_TRUE; + do_setuid = B_TRUE; +#endif + } + if (vfs_optionisset(vfsp, MNT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; +#ifndef _WIN32 + } else { + exec = B_TRUE; + do_exec = B_TRUE; +#endif + } + if (vfs_optionisset(vfsp, MNT_NOUSERXATTR, NULL)) { + xattr = B_FALSE; + do_xattr = B_TRUE; +#ifndef _WIN32 + } else { + xattr = B_TRUE; + do_xattr = B_TRUE; +#endif + } + if (vfs_optionisset(vfsp, MNT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; +#ifndef _WIN32 + } else { + atime = B_TRUE; + do_atime = B_TRUE; +#endif + } + if (vfs_optionisset(vfsp, MNT_DONTBROWSE, NULL)) { + finderbrowse = B_FALSE; + do_finderbrowse = B_TRUE; +#ifndef _WIN32 + } else { + finderbrowse = B_TRUE; + do_finderbrowse = B_TRUE; +#endif + } + if (vfs_optionisset(vfsp, MNT_IGNORE_OWNERSHIP, NULL)) { + ignoreowner = B_TRUE; + do_ignoreowner = B_TRUE; +#ifndef _WIN32 + } else { + ignoreowner = B_FALSE; + do_ignoreowner = B_TRUE; +#endif + } + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. 
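+	 * In this port only the __LINUX__ build reads the property below;
+	 * the Windows build currently leaves nbmand alone.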
+ */ +#ifdef __LINUX__ + uint64_t nbmand = 0; + + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { + nbmand = B_FALSE; + } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { + nbmand = B_TRUE; + } else { + char osname[ZFS_MAX_DATASET_NAME_LEN]; + + dmu_objset_name(os, osname); + if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, + NULL)) { + return (error); + } + } +#endif + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + error = dsl_prop_register(ds, + + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); + // This appears to be PROP_PRIVATE, investigate if we want this + // ZOL calls this ACLTYPE + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); +#ifdef _APPLE_ + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_APPLE_BROWSE), finderbrowse_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_APPLE_IGNOREOWNER), ignoreowner_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_APPLE_MIMIC_HFS), mimic_hfs_changed_cb, zfsvfs); +#endif + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_devices) + devices_changed_cb(zfsvfs, devices); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); +#ifdef _WIN32 + if (do_finderbrowse) + finderbrowse_changed_cb(zfsvfs, finderbrowse); + if (do_ignoreowner) + ignoreowner_changed_cb(zfsvfs, ignoreowner); +#endif +#ifndef _WIN32 + + nbmand_changed_cb(zfsvfs, nbmand); +#endif + + return (0); + +unregister: + dsl_prop_unregister_all(ds, zfsvfs); + return (error); +} + +static int +zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, + uint64_t *userp, uint64_t *groupp) +{ + //int error = 0; + + /* + * Is it a valid type of object to track? 
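+	 * Only DMU_OT_ZNODE and DMU_OT_SA bonus buffers carry the
+	 * owner/group identifiers needed for space accounting.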
+ */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (SET_ERROR(ENOENT)); + + /* + * If we have a NULL data pointer + * then assume the id's aren't changing and + * return EEXIST to the dmu to let it know to + * use the same ids + */ + if (data == NULL) + return (SET_ERROR(EEXIST)); + + if (bonustype == DMU_OT_ZNODE) { + znode_phys_t *znp = data; + *userp = znp->zp_uid; + *groupp = znp->zp_gid; + } else { +#if 1 + int hdrsize; + sa_hdr_phys_t *sap = data; + sa_hdr_phys_t sa = *sap; + boolean_t swap = B_FALSE; + + ASSERT(bonustype == DMU_OT_SA); + + if (sa.sa_magic == 0) { + /* + * This should only happen for newly created + * files that haven't had the znode data filled + * in yet. + */ + *userp = 0; + *groupp = 0; + return (0); + } + if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { + sa.sa_magic = SA_MAGIC; + sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); + swap = B_TRUE; + } else { + if (sa.sa_magic != SA_MAGIC) { + dprintf("ZFS: sa.sa_magic %x is not SA_MAGIC\n", + sa.sa_magic); + return -1; + } + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); + } + + hdrsize = sa_hdrsize(&sa); + VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_GID_OFFSET)); + if (swap) { + *userp = BSWAP_64(*userp); + *groupp = BSWAP_64(*groupp); + } +#endif + } + return (0); +} + +static void +fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, + char *domainbuf, int buflen, uid_t *ridp) +{ + uint64_t fuid; + const char *domain; + + fuid = zfs_strtonum(fuidstr, NULL); + + domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); + if (domain) + (void) strlcpy(domainbuf, domain, buflen); + else + domainbuf[0] = '\0'; + *ridp = FUID_RID(fuid); +} + +static uint64_t +zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) +{ + switch (type) { + case ZFS_PROP_USERUSED: + return (DMU_USERUSED_OBJECT); + case ZFS_PROP_GROUPUSED: + return (DMU_GROUPUSED_OBJECT); + case ZFS_PROP_USERQUOTA: + return (zfsvfs->z_userquota_obj); + case ZFS_PROP_GROUPQUOTA: + return (zfsvfs->z_groupquota_obj); + default: + return (SET_ERROR(ENOTSUP)); + break; + } + return (0); +} + +int +zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +{ + int error; + zap_cursor_t zc; + zap_attribute_t za; + zfs_useracct_t *buf = vbuf; + uint64_t obj; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) { + *bufsizep = 0; + return (0); + } + + for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > + *bufsizep) + break; + + fuidstr_to_sid(zfsvfs, za.za_name, + buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); + + buf->zu_space = za.za_first_integer; + buf++; + } + if (error == ENOENT) + error = 0; + + ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); + *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; + *cookiep = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + return (error); +} + +/* + * buf must be big enough (eg, 32 bytes) + */ +static int +id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, boolean_t addok) +{ + uint64_t fuid; + int domainid = 0; + + if (domain && domain[0]) { + domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); + if 
(domainid == -1)
+			return (SET_ERROR(ENOENT));
+	}
+	fuid = FUID_ENCODE(domainid, rid);
+	(void) snprintf(buf, 32, "%llx", (longlong_t)fuid);
+	return (0);
+}
+
+int
+zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+    const char *domain, uint64_t rid, uint64_t *valp)
+{
+	char buf[32];
+	int err;
+	uint64_t obj;
+
+	*valp = 0;
+
+	if (!dmu_objset_userspace_present(zfsvfs->z_os))
+		return (SET_ERROR(ENOTSUP));
+
+	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+	if (obj == 0)
+		return (0);
+
+	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
+	if (err)
+		return (err);
+
+	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
+	if (err == ENOENT)
+		err = 0;
+	return (err);
+}
+
+int
+zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+    const char *domain, uint64_t rid, uint64_t quota)
+{
+	char buf[32];
+	int err;
+	dmu_tx_t *tx;
+	uint64_t *objp;
+	boolean_t fuid_dirtied;
+
+	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
+		return (SET_ERROR(EINVAL));
+
+	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
+		return (SET_ERROR(ENOTSUP));
+
+	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
+	    &zfsvfs->z_groupquota_obj;
+
+	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
+	if (err)
+		return (err);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
+	if (*objp == 0) {
+		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+		    zfs_userquota_prop_prefixes[type]);
+	}
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	mutex_enter(&zfsvfs->z_lock);
+	if (*objp == 0) {
+		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
+		    DMU_OT_NONE, 0, tx);
+		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
+	}
+	mutex_exit(&zfsvfs->z_lock);
+
+	if (quota == 0) {
+		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
+		if (err == ENOENT)
+			err = 0;
+	} else {
+		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
+	}
+	ASSERT(err == 0);
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+	dmu_tx_commit(tx);
+	return (err);
+}
+
+boolean_t
+zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
+{
+	char buf[32];
+	uint64_t used, quota, usedobj, quotaobj;
+	int err;
+
+	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+	if (quotaobj == 0 || zfsvfs->z_replay)
+		return (B_FALSE);
+
+	(void) snprintf(buf, sizeof(buf), "%llx", (longlong_t)fuid);
+	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+	if (err != 0)
+		return (B_FALSE);
+
+	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
+	if (err != 0)
+		return (B_FALSE);
+	return (used >= quota);
+}
+
+boolean_t
+zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
+{
+	uint64_t fuid;
+	uint64_t quotaobj;
+
+	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+	fuid = isgroup ? zp->z_gid : zp->z_uid;
+
+	if (quotaobj == 0 || zfsvfs->z_replay)
+		return (B_FALSE);
+
+	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
+}
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
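+ * Cached state includes the ZPL version, the normalization, case and
+ * ACL properties, the SA attribute table, and the root, unlinked-set,
+ * quota, FUID and shares object numbers.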
+ */ +static int +zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + uint64_t val; + + zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + /* Volume status "all ok" */ + zfsvfs->z_notification_conditions = 0; + zfsvfs->z_freespace_notify_warninglimit = 0; + zfsvfs->z_freespace_notify_dangerlimit = 0; + zfsvfs->z_freespace_notify_desiredlevel = 0; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error != 0) + return (error); + if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + dprintf("Can't mount a version %lld file system " + "on a version %lld pool\n. Pool must be upgraded to mount " + "this file system.\n", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + return (SET_ERROR(ENOTSUP)); + } + error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); + if (error != 0) + return (error); + zfsvfs->z_norm = (int)val; + + error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); + if (error != 0) + return (error); + zfsvfs->z_utf8 = (val != 0); + + error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); + if (error != 0) + return (error); + zfsvfs->z_case = (uint_t)val; + + error = zfs_get_zplprop(os, ZFS_PROP_ACLMODE, &val); + if (error != 0) + return (error); + zfsvfs->z_acl_mode = (uint_t)val; +#if __APPLE__ + zfs_get_zplprop(os, ZFS_PROP_APPLE_LASTUNMOUNT, &val); + zfsvfs->z_last_unmount_time = val; +#endif + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); + if ((error == 0) && (val == ZFS_XATTR_SA)) + zfsvfs->z_xattr_sa = B_TRUE; + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error != 0) + return (error); + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + return (0); +} + +int +zfsvfs_create(const 
char *osname, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + /* + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. + */ + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, B_TRUE, + zfsvfs, &os); + if (error != 0) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + error = zfsvfs_create_impl(zfvp, zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + return (error); +} + +int +zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); +#ifdef _WIN32 + rw_init(&zfsvfs->z_hardlinks_lock, NULL, RW_DEFAULT, NULL); + avl_create(&zfsvfs->z_hardlinks, hardlinks_compare, + sizeof (hardlinks_t), offsetof(hardlinks_t, hl_node)); + avl_create(&zfsvfs->z_hardlinks_linkid, hardlinks_compare_linkid, + sizeof (hardlinks_t), offsetof(hardlinks_t, hl_node_linkid)); + zfsvfs->z_rdonly = 0; +#endif + for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + mutex_init(&zfsvfs->z_drain_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&zfsvfs->z_drain_cv, NULL, CV_DEFAULT, NULL); + + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + *zfvp = zfsvfs; + return (0); +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + boolean_t readonly = vfs_isrdonly(zfsvfs->z_vfs); + + /* + * Check for a bad on-disk format version now since we + * lied about owning the dataset readonly before. + */ + if (!readonly && + dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) + return (SET_ERROR(EROFS)); + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + + if (readonly != 0) + readonly_changed_cb(zfsvfs, B_FALSE); + else + if (!zfs_vnop_skip_unlinked_drain) + zfs_unlinked_drain(zfsvfs); + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. 
+ * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } + } + + /* restore readonly bit */ + if (readonly != 0) + readonly_changed_cb(zfsvfs, B_TRUE); + } + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + return (0); +} + +extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + +void +zfsvfs_free(zfsvfs_t *zfsvfs) +{ + int i; + dprintf("+zfsvfs_free\n"); + /* + * This is a barrier to prevent the filesystem from going away in + * zfs_znode_move() until we can safely ensure that the filesystem is + * not unmounted. We consider the filesystem valid before the barrier + * and invalid after the barrier. + */ + //rw_enter(&zfsvfs_lock, RW_READER); + //rw_exit(&zfsvfs_lock); + + zfs_fuid_destroy(zfsvfs); + + cv_destroy(&zfsvfs->z_drain_cv); + mutex_destroy(&zfsvfs->z_drain_lock); + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_lock); + list_destroy(&zfsvfs->z_all_znodes); + rrm_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); +#ifdef _WIN32 + dprintf("ZFS: Unloading hardlink AVLtree: %lu\n", + avl_numnodes(&zfsvfs->z_hardlinks)); + void *cookie = NULL; + hardlinks_t *hardlink; + rw_destroy(&zfsvfs->z_hardlinks_lock); + while((hardlink = avl_destroy_nodes(&zfsvfs->z_hardlinks_linkid, &cookie))) { + } + cookie = NULL; + while((hardlink = avl_destroy_nodes(&zfsvfs->z_hardlinks, &cookie))) { + kmem_free(hardlink, sizeof(*hardlink)); + } + avl_destroy(&zfsvfs->z_hardlinks); + avl_destroy(&zfsvfs->z_hardlinks_linkid); +#endif + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + dprintf("-zfsvfs_free\n"); +} + +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_vfs) { +#if 0 + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + 
vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } +#endif + } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); +} + +static int +zfs_domount(struct mount *vfsp, dev_t mount_dev, char *osname, char *options, + vfs_context_t *ctx) +{ +dprintf("%s\n", __func__); + int error = 0; + zfsvfs_t *zfsvfs; +#ifndef _WIN32 + uint64_t recordsize, fsid_guid; + vnode_t *vp; +#else + uint64_t mimic_hfs = 0; +// struct timeval tv; +#endif + + ASSERT(vfsp); + ASSERT(osname); + + error = zfsvfs_create(osname, &zfsvfs); + if (error) + return (error); + zfsvfs->z_vfs = vfsp; + + error = zfsvfs_parse_options(options, zfsvfs->z_vfs); + if (error) + goto out; + +#ifdef illumos + /* Initialize the generic filesystem structure. */ + vfsp->vfs_bcount = 0; + vfsp->vfs_data = NULL; + + if (zfs_create_unique_device(&mount_dev) == -1) { + error = ENODEV; + goto out; + } + ASSERT(vfs_devismounted(mount_dev) == 0); +#endif + + +#ifdef _WIN32 + zfsvfs->z_rdev = mount_dev; + + /* HFS sets this prior to mounting */ + //vfs_setflags(vfsp, (uint64_t)((unsigned int)MNT_DOVOLFS)); + /* Advisory locking should be handled at the VFS layer */ + //vfs_setlocklocal(vfsp); + + /* + * Record the mount time (for Spotlight) + */ + //microtime(&tv); + //zfsvfs->z_mount_time = tv.tv_sec; + + vfs_setfsprivate(vfsp, zfsvfs); +#else + if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, + NULL)) + goto out; + zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; + zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; + + vfsp->vfs_data = zfsvfs; + vfsp->mnt_flag |= MNT_LOCAL; + vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; + vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; + vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; +#endif + + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + +#ifdef __APPLE__ + error = dsl_prop_get_integer(osname, "com.apple.mimic_hfs", &mimic_hfs, NULL); + if (zfsvfs->z_rdev) { + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(vfsp); + vfsstatfs->f_fsid.val[0] = zfsvfs->z_rdev; + vfsstatfs->f_fsid.val[1] = vfs_typenum(vfsp); + } else { + // Otherwise, ask VFS to give us a random unique one. + vfs_getnewfsid(vfsp); + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(vfsp); + zfsvfs->z_rdev = vfsstatfs->f_fsid.val[0]; + } + + /* If we are readonly (ie, waiting for rootmount) we need to reply + * honestly, so launchd runs fsck_zfs and mount_zfs + */ + if(mimic_hfs) { + struct vfsstatfs *vfsstatfs; + vfsstatfs = vfs_statfs(vfsp); + strlcpy(vfsstatfs->f_fstypename, "hfs", MFSTYPENAMELEN); + } + +#endif + + /* + * Set features for file system. 
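+	 * FUID and SA use are derived from the ZPL version; the VFSFT_*
+	 * feature flags below are currently compiled out.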
+ */ + zfs_set_fuid_feature(zfsvfs); +#if 0 + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); + } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); +#endif + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t pval; + char fsname[MAXNAMELEN]; + zfsvfs_t *fs_zfsvfs; + + dmu_fsname(osname, fsname); + error = getzfsvfs(fsname, &fs_zfsvfs); + if (error == 0) { + if (fs_zfsvfs->z_unmounted) + error = SET_ERROR(EINVAL); + vfs_unbusy(fs_zfsvfs->z_vfs); + } + if (error) { + dprintf("file system '%s' is unmounted : error %d\n", + fsname, + error); + goto out; + } + + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if ((error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))) + goto out; + xattr_changed_cb(zfsvfs, pval); + zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + } else { + if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) + goto out; + } + +#ifdef _WIN32 + vfs_setflags(vfsp, (uint64_t)((unsigned int)MNT_JOURNALED)); + + if ((vfs_flags(vfsp) & MNT_ROOTFS) != 0) { + /* Root FS */ + vfs_clearflags(vfsp, + (uint64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS)); + vfs_clearflags(vfsp, + (uint64_t)((unsigned int)MNT_IGNORE_OWNERSHIP)); + //} else { + //vfs_setflags(vfsp, + // (uint64_t)((unsigned int)MNT_AUTOMOUNTED)); + } + + vfs_mountedfrom(vfsp, osname); +#else + /* Grab extra reference. */ + VERIFY(VFS_ROOT(vfsp, LK_EXCLUSIVE, &vp) == 0); + VOP_UNLOCK(vp, 0); +#endif + +#if 0 // Want .zfs or not + if (!zfsvfs->z_issnap) { + zfsctl_create(zfsvfs); + } +#endif +out: + if (error) { + vfs_setfsprivate(vfsp, NULL); + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); + zfsvfs_free(zfsvfs); + } else { + atomic_inc_32(&zfs_active_fs_count); + } + + return (error); +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + + /* + * Unregister properties. + */ + if (!dmu_objset_is_snapshot(os)) { + dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); + } +} + +#ifdef SECLABEL +/* + * Convert a decimal digit string to a uint64_t integer. + */ +static int +str_to_uint64(char *str, uint64_t *objnum) +{ + uint64_t num = 0; + + while (*str) { + if (*str < '0' || *str > '9') + return (EINVAL); + + num = num*10 + *str++ - '0'; + } + + *objnum = num; + return (0); +} + +/* + * The boot path passed from the boot loader is in the form of + * "rootpool-name/root-filesystem-object-number'. Convert this + * string to a dataset name: "rootpool-name/root-filesystem-name". 
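+ * For example, a boot path such as "tank/85" is resolved through
+ * dsl_dsobj_to_dsname(); a path with no '/' or with a non-numeric
+ * tail is returned unchanged.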
+ */ +static int +zfs_parse_bootfs(char *bpath, char *outpath) +{ + char *slashp; + uint64_t objnum; + int error; + + if (*bpath == 0 || *bpath == '/') + return (EINVAL); + + (void) strcpy(outpath, bpath); + + slashp = strchr(bpath, '/'); + + /* if no '/', just return the pool name */ + if (slashp == NULL) { + return (0); + } + + /* if not a number, just return the root dataset name */ + if (str_to_uint64(slashp+1, &objnum)) { + return (0); + } + + *slashp = '\0'; + error = dsl_dsobj_to_dsname(bpath, objnum, outpath); + *slashp = '/'; + + return (error); +} + +/* + * Check that the hex label string is appropriate for the dataset being + * mounted into the global_zone proper. + * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. + */ +int +zfs_check_global_label(const char *dsname, const char *hexsl) +{ + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (SET_ERROR(EACCES)); + return (rdonly ? 0 : EACCES); + } + return (SET_ERROR(EACCES)); +} + +/* + * zfs_mount_label_policy: + * Determine whether the mount is allowed according to MAC check. + * by comparing (where appropriate) label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one. + * + * Returns: + * 0 : access allowed + * >0 : error code, such as EACCES + */ +static int +zfs_mount_label_policy(vfs_t *vfsp, char *osname) +{ + int error, retv; + zone_t *mntzone = NULL; + ts_label_t *mnt_tsl; + bslabel_t *mnt_sl; + bslabel_t ds_sl; + char ds_hexsl[MAXNAMELEN]; + + retv = EACCES; /* assume the worst */ + + /* + * Start by getting the dataset label if it exists. + */ + error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (EACCES); + + /* + * If labeling is NOT enabled, then disallow the mount of datasets + * which have a non-default label already. No other label checks + * are needed. + */ + if (!is_system_labeled()) { + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + return (EACCES); + } + + /* + * Get the label of the mountpoint. If mounting into the global + * zone (i.e. mountpoint is not within an active zone and the + * zoned property is off), the label must be default or + * admin_low/admin_high only; no other checks are needed. + */ + mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); + if (mntzone->zone_id == GLOBAL_ZONEID) { + uint64_t zoned; + + zone_rele(mntzone); + + if (dsl_prop_get_integer(osname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (EACCES); + if (!zoned) + return (zfs_check_global_label(osname, ds_hexsl)); + else + /* + * This is the case of a zone dataset being mounted + * initially, before the zone has been fully created; + * allow this mount into global zone. + */ + return (0); + } + + mnt_tsl = mntzone->zone_slabel; + ASSERT(mnt_tsl != NULL); + label_hold(mnt_tsl); + mnt_sl = label2bslabel(mnt_tsl); + + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { + /* + * The dataset doesn't have a real label, so fabricate one. 
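+		 * The fabricated label is the mountpoint's label, converted
+		 * to a string and stored in the dataset's mlslabel property.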
+ */ + char *str = NULL; + + if (l_to_str_internal(mnt_sl, &str) == 0 && + dsl_prop_set_string(osname, + zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, str) == 0) + retv = 0; + if (str != NULL) + kmem_free(str, strlen(str) + 1); + } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { + /* + * Now compare labels to complete the MAC check. If the + * labels are equal then allow access. If the mountpoint + * label dominates the dataset label, allow readonly access. + * Otherwise, access is denied. + */ + if (blequal(mnt_sl, &ds_sl)) + retv = 0; + else if (bldominates(mnt_sl, &ds_sl)) { + vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); + retv = 0; + } + } + + label_rele(mnt_tsl); + zone_rele(mntzone); + return (retv); +} +#endif /* SECLABEL */ + +#ifdef ZFS_BOOT +/* Used by mountroot */ +#define ZFS_BOOT_MOUNT_DEV 1 +int zfs_boot_get_path(char *, int); +#endif + +/* + * zfs_vfs_mountroot + * Given a device vnode created by vfs_mountroot bdevvp, + * and with the root pool already imported, root mount the + * dataset specified in the pool's bootfs property. + * + * Inputs: + * mp: VFS mount struct + * devvp: device vnode, currently only used to retrieve the + * dev_t for the fsid. Could vnode_get, vnode_ref, vnode_put, + * with matching get/rele/put in zfs_vfs_umount, but this is + * already done by XNU as well. + * ctx: VFS context, unused. + * + * Return: + * 0 on success, positive int on failure. + */ +int +zfs_vfs_mountroot(struct mount *mp, struct vnode *devvp, vfs_context_t *ctx) +{ + int error = EINVAL; +#if 0 + /* + static int zfsrootdone = 0; + */ + zfsvfs_t *zfsvfs = NULL; + //znode_t *zp = NULL; + spa_t *spa = 0; + char *zfs_bootfs = 0; + char *path = 0; + dev_t dev = 0; + //int len = MAXPATHLEN; + +printf("ZFS: %s\n", __func__); + ASSERT(mp); + ASSERT(devvp); + ASSERT(ctx); + if (!mp || !devvp | !ctx) { + cmn_err(CE_NOTE, "%s: missing one of mp %p devvp %p" + " or ctx %p", __func__, mp, devvp, ctx); + return (EINVAL); + } + + /* Look up bootfs variable from pool here */ + zfs_bootfs = kmem_alloc(MAXPATHLEN, KM_SLEEP); + if (!zfs_bootfs) { + cmn_err(CE_NOTE, "%s: bootfs alloc failed", + __func__); + return (ENOMEM); + } + +#ifdef ZFS_BOOT_MOUNT_DEV + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + if (!path) { + cmn_err(CE_NOTE, "%s: path alloc failed", + __func__); + kmem_free(zfs_bootfs, MAXPATHLEN); + return (ENOMEM); + } +#endif + + mutex_enter(&spa_namespace_lock); + spa = spa_next(NULL); + if (!spa) { + mutex_exit(&spa_namespace_lock); + cmn_err(CE_NOTE, "%s: no pool available", + __func__); + goto out; + } + + error = dsl_dsobj_to_dsname(spa_name(spa), + spa_bootfs(spa), zfs_bootfs); + if (error != 0) { + mutex_exit(&spa_namespace_lock); + cmn_err(CE_NOTE, "%s: bootfs to name error %d", + __func__, error); + goto out; + } + mutex_exit(&spa_namespace_lock); + +#ifdef ZFS_BOOT_MOUNT_DEV + /* XXX Could also do IOKit lookup from dev_t to diskN */ + error = zfs_boot_get_path(path, MAXPATHLEN); + if (error != 0) { + cmn_err(CE_NOTE, "get_path: error %d", error); + goto out; + } +#endif + + /* + * By setting the dev_t value in the mount vfsp, + * mount_zfs will be called with the /dev/diskN + * proxy, but we can leave the dataset name in + * the mountedfrom field + */ + dev = vnode_specrdev(devvp); + + printf("Setting readonly\n"); + + if ((error = zfs_domount(mp, dev, zfs_bootfs, NULL, ctx)) != 0) { + //cmn_err(CE_NOTE, "zfs_domount: error %d", error); + printf("zfs_domount: error %d", error); + /* Only drop the usecount if mount fails */ + //vnode_rele(devvp); + goto out; + } + // 
vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_AUTOMOUNTED)); + +#ifdef ZFS_BOOT_MOUNT_DEV + // override the mount from field + if (strlen(path) > 0) { + vfs_mountedfrom(mp, path); + } else { + printf("%s couldn't set vfs_mountedfrom\n", __func__); + //goto out; + } +#endif + + zfsvfs = (zfsvfs_t *)vfs_fsprivate(mp); + ASSERT(zfsvfs); + if (!zfsvfs) { + cmn_err(CE_NOTE, "missing zfsvfs"); + goto out; + } + + /* Set this mount to read-only */ + zfsvfs->z_rdonly = 1; + + /* + * Due to XNU mount flags, readonly gets set off for a short + * while, which means mimic will kick in if enabled. But we need + * to reply with true "zfs" until root has been remounted RW, so + * that launchd tries to run mount_zfs instead of mount_hfs + */ + mimic_hfs_changed_cb(zfsvfs, B_FALSE); + + /* + * Leave rootvp held. The root file system is never unmounted. + * + * XXX APPLE + * xnu will in fact call vfs_unmount on the root filesystem + * during shutdown/reboot. + */ + +out: + + if (path) { + kmem_free(path, MAXPATHLEN); + } + if (zfs_bootfs) { + kmem_free(zfs_bootfs, MAXPATHLEN); + } + +#endif + return (error); +} + +#ifdef __LINUX__ +static int +getpoolname(const char *osname, char *poolname) +{ + char *p; + + p = strchr(osname, '/'); + if (p == NULL) { + if (strlen(osname) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strlcpy(poolname, osname, MAXNAMELEN); + } else { + if (p - osname >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(poolname, osname, p - osname); + poolname[p - osname] = '\0'; + } + return (0); +} +#endif + +/*ARGSUSED*/ +int +zfs_vfs_mount(struct mount *vfsp, vnode_t *mvp /*devvp*/, + user_addr_t data, vfs_context_t *context) +{ + int error = 0; + cred_t *cr = NULL;//(cred_t *)vfs_context_ucred(context); + char *osname = NULL; + char *options = NULL; + uint64_t flags = vfs_flags(vfsp); + int canwrite; + int rdonly = 0; + int mflag = 0; + +#ifdef _WIN32 + struct zfs_mount_args *mnt_args = (struct zfs_mount_args *)data; + size_t osnamelen = 0; + uint32_t cmdflags = 0; + +dprintf("%s\n", __func__); + cmdflags = (uint32_t) vfs_flags(vfsp) & MNT_CMDFLAGS; + rdonly = vfs_isrdonly(vfsp); +dprintf("%s cmdflags %u rdonly %d\n", __func__, cmdflags, rdonly); + + /* + * Get the objset name (the "special" mount argument). + */ + if (data) { + + // Allocate string area + osname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + strlcpy(osname, mnt_args->fspec, MAXPATHLEN); + + } + + if (mnt_args->struct_size == sizeof(*mnt_args)) { + + mflag = mnt_args->mflag; + + if (mnt_args->optlen) { + options = kmem_alloc(mnt_args->optlen, KM_SLEEP); + strlcpy(options, mnt_args->optptr, mnt_args->optlen); + } + //dprintf("vfs_mount: fspec '%s' : mflag %04llx : optptr %p : optlen %d :" + dprintf("%s: fspec '%s' : mflag %04x : optptr %p : optlen %d :" + " options %s\n", __func__, + osname, + mnt_args->mflag, + mnt_args->optptr, + mnt_args->optlen, + options); + } + +// (void) dnlc_purge_vfsp(vfsp, 0); + + if (mflag & MS_RDONLY) { + dprintf("%s: adding MNT_RDONLY\n", __func__); + flags |= MNT_RDONLY; + } + + if (mflag & MS_OVERLAY) { + dprintf("%s: adding MNT_UNION\n", __func__); + flags |= MNT_UNION; + } + + if (mflag & MS_FORCE) { + dprintf("%s: adding MNT_FORCE\n", __func__); + flags |= MNT_FORCE; + } + + if (mflag & MS_REMOUNT) { + dprintf("%s: adding MNT_UPDATE on MS_REMOUNT\n", __func__); + flags |= MNT_UPDATE; + } + + vfs_setflags(vfsp, (uint64_t)flags); + +#endif + + + /* + * If full-owner-access is enabled and delegated administration is + * turned on, we must set nosuid. 
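+	 * That is the zfs_super_owner case kept under #if 0 just below:
+	 * when access is only granted through a delegated "mount"
+	 * permission (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr)),
+	 * secpolicy_fs_mount_clearopts(cr, vfsp) is what would apply the
+	 * nosuid behaviour described above.  It is left disabled on this
+	 * port for now.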
+ */ +#if 0 + if (zfs_super_owner && + dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { + secpolicy_fs_mount_clearopts(cr, vfsp); + } +#endif + + /* + * Check for mount privilege? + * + * If we don't have privilege then see if + * we have local permission to allow it + */ + error = secpolicy_fs_mount(cr, mvp, vfsp); + if (error) { + if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) { + cmn_err(CE_NOTE, "%s: mount perm error", + __func__); + goto out; + } + +#if 0 + if (!(vfsp->vfs_flag & MS_REMOUNT)) { + vattr_t vattr; + + /* + * Make sure user is the owner of the mount point + * or has sufficient privileges. + */ + + vattr.va_mask = AT_UID; + + vn_lock(mvp, LK_SHARED | LK_RETRY); + if (VOP_GETATTR(mvp, &vattr, cr)) { + VOP_UNLOCK(mvp, 0); + goto out; + } + + if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { + VOP_UNLOCK(mvp, 0); + goto out; + } + VOP_UNLOCK(mvp, 0); + } +#endif + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. + */ + if (!INGLOBALZONE(curthread) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = EPERM; + goto out; + } + +#ifdef SECLABEL + error = zfs_mount_label_policy(vfsp, osname); + if (error) + goto out; +#endif + +#ifndef _WIN32 + vfsp->vfs_flag |= MNT_NFS4ACLS; + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. + */ + if (vfsp->vfs_flag & MS_REMOUNT) { + /* refresh mount options */ + zfs_unregister_callbacks(vfsp->vfs_data); + error = zfs_register_callbacks(vfsp); + goto out; + } + + /* Initial root mount: try hard to import the requested root pool. */ + if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && + (vfsp->vfs_flag & MNT_UPDATE) == 0) { + char pname[MAXNAMELEN]; + + error = getpoolname(osname, pname); + if (error == 0) + error = spa_import_rootpool(pname); + if (error) + goto out; + } +#else + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. 
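+	 * On this port "refresh" boils down to re-registering the property
+	 * callbacks and flipping the readonly state when it changed,
+	 * roughly:
+	 *
+	 *	zfsvfs->z_rdonly = 1;
+	 *	readonly_changed_cb(zfsvfs, B_TRUE);
+	 *	zfs_unregister_callbacks(zfsvfs);
+	 *	error = zfs_register_callbacks(vfsp);
+	 *
+	 * Only the rw -> ro direction is handled below; the ro -> rw
+	 * upgrade path is still stubbed out.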
+ */ + if (cmdflags & MNT_UPDATE) { + + if (cmdflags & MNT_RELOAD) { + dprintf("%s: reload after fsck\n", __func__); + error = 0; + goto out; + } + + /* refresh mount options */ + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + ASSERT(zfsvfs); + + if (zfsvfs->z_rdonly == 0 && (flags & MNT_RDONLY || + vfs_isrdonly(vfsp))) { + /* downgrade */ + dprintf("%s: downgrade requested\n", __func__); + zfsvfs->z_rdonly = 1; + readonly_changed_cb(zfsvfs, B_TRUE); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + if (error) { + dprintf("%s: remount returned %d", + __func__, error); + } + } + + //if (zfsvfs->z_rdonly != 0 && vfs_iswriteupgrade(vfsp)) { + if (0 /*vfs_iswriteupgrade(vfsp)*/) { + /* upgrade */ + dprintf("%s: upgrade requested\n", __func__); + zfsvfs->z_rdonly = 0; + readonly_changed_cb(zfsvfs, B_FALSE); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + if (error) { + dprintf("%s: remount returned %d", + __func__, error); + } + } + +/* + printf("%s: would do remount\n", __func__); + + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + + if (error) { + //cmn_err(CE_NOTE, "%s: remount returned %d", + printf("%s: remount returned %d", + __func__, error); + } +*/ + goto out; + } + + if (vfs_fsprivate(vfsp) != NULL) { + dprintf("already mounted\n"); + error = 0; + goto out; + } + +//dprintf("%s: calling zfs_domount\n", __func__); +#endif + + error = zfs_domount(vfsp, 0, osname, options, context); + + if (error) { + //cmn_err(CE_NOTE, "%s: zfs_domount returned %d\n", + dprintf("%s: zfs_domount returned %d\n", + __func__, error); + error = 0; + goto out; + } + +#ifdef sun + /* + * Add an extra VFS_HOLD on our parent vfs so that it can't + * disappear due to a forced unmount. + */ + if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) + VFS_HOLD(mvp->v_vfsp); +#endif /* sun */ + +out: +#ifdef _WIN32 + //dprintf("%s out: %d\n", __func__, error); + if (error == 0) { +#if 0 + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + ASSERT(zfsvfs); +#endif + + //dprintf("%s: setting vfs flags\n", __func__); + /* Indicate to VFS that we support ACLs. */ +// vfs_setextendedsecurity(vfsp); + } + + if (error) + dprintf("zfs_vfs_mount: error %d\n", error); + + if (osname) + kmem_free(osname, MAXPATHLEN); + + if (options) + kmem_free(options, mnt_args->optlen); +#endif + + return (error); +} + + +int +zfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, vfs_context_t *context) +{ +#if 0 + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + uint64_t refdbytes, availbytes, usedobjs, availobjs; + uint64_t log_blksize; + uint64_t log_blkcnt; + + dprintf("vfs_getattr\n"); + + ZFS_ENTER(zfsvfs); + + /* Finder will show the old/incorrect size, we can force a sync of the pool + * to make it correct, but that has side effects which are undesirable. + */ + /* txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); */ + + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + + VFSATTR_RETURN(fsap, f_objcount, usedobjs); + VFSATTR_RETURN(fsap, f_maxobjcount, 0x7fffffffffffffff); + /* + * Carbon depends on f_filecount and f_dircount so + * make up some values based on total objects. + */ + VFSATTR_RETURN(fsap, f_filecount, usedobjs - (usedobjs / 4)); + VFSATTR_RETURN(fsap, f_dircount, usedobjs / 4); + + /* + * Model after HFS in working out if we should use the legacy size + * 512, or go to 4096. 
Note that XNU only likes those two + * blocksizes, so we don't use the ZFS recordsize + */ + log_blkcnt = (u_int64_t)((refdbytes + availbytes) >> SPA_MINBLOCKSHIFT); + log_blksize = (log_blkcnt > 0x000000007fffffff) ? + 4096 : + (1 << SPA_MINBLOCKSHIFT); + + /* + * The underlying storage pool actually uses multiple block sizes. + * We report the fragsize as the smallest block size we support, + * and we report our blocksize as the filesystem's maximum blocksize. + */ + VFSATTR_RETURN(fsap, f_bsize, log_blksize); + VFSATTR_RETURN(fsap, f_iosize, zfsvfs->z_max_blksz); + + /* + * The following report "total" blocks of various kinds in the + * file system, but reported in terms of f_frsize - the + * "fragment" size. + */ + VFSATTR_RETURN(fsap, f_blocks, + (u_int64_t)((refdbytes + availbytes) / log_blksize)); + VFSATTR_RETURN(fsap, f_bfree, (u_int64_t)(availbytes / log_blksize)); + VFSATTR_RETURN(fsap, f_bavail, fsap->f_bfree); /* no root reservation */ + VFSATTR_RETURN(fsap, f_bused, fsap->f_blocks - fsap->f_bfree); + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of object available + * and the number of blocks (each object will take at least a block). + */ + VFSATTR_RETURN(fsap, f_ffree, (u_int64_t)MIN(availobjs, fsap->f_bfree)); + VFSATTR_RETURN(fsap, f_files, fsap->f_ffree + usedobjs); + + if (VFSATTR_IS_ACTIVE(fsap, f_fsid)) { + fsap->f_fsid.val[0] = zfsvfs->z_rdev; + fsap->f_fsid.val[1] = vfs_typenum(mp); + VFSATTR_SET_SUPPORTED(fsap, f_fsid); + } + if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) { + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_HARDLINKS | // ZFS + VOL_CAP_FMT_SPARSE_FILES | // ZFS + VOL_CAP_FMT_2TB_FILESIZE | // ZFS + VOL_CAP_FMT_JOURNAL | VOL_CAP_FMT_JOURNAL_ACTIVE | // ZFS + VOL_CAP_FMT_SYMBOLICLINKS | // msdos.. + + // ZFS has root times just fine + /*VOL_CAP_FMT_NO_ROOT_TIMES |*/ + + // Ask XNU to remember zero-runs, instead of writing + // zeros to it. + VOL_CAP_FMT_ZERO_RUNS | + + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_PATH_FROM_ID | + VOL_CAP_FMT_64BIT_OBJECT_IDS | + VOL_CAP_FMT_HIDDEN_FILES ; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_ATTRLIST | // ZFS + VOL_CAP_INT_NFSEXPORT | // ZFS + VOL_CAP_INT_EXTENDED_SECURITY | // ZFS +#if NAMEDSTREAMS + VOL_CAP_INT_NAMEDSTREAMS | // ZFS +#endif + VOL_CAP_INT_EXTENDED_ATTR | // ZFS + VOL_CAP_INT_VOL_RENAME | // msdos.. + VOL_CAP_INT_ADVLOCK | + + // ZFS does not yet have exchangedata (it's in a branch) + /* VOL_CAP_INT_EXCHANGEDATA| */ + + // ZFS does not yet have copyfile + /* VOL_CAP_INT_COPYFILE| */ + + // ZFS does not yet have allocate + /*VOL_CAP_INT_ALLOCATE|*/ + + VOL_CAP_INT_FLOCK ; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED1] = 0; + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED2] = 0; + + /* This is the list of valid capabilities at time of + * compile. 
The valid list should have them all defined + * and the "capability" list above should enable only + * those we have implemented + */ + fsap->f_capabilities.valid[VOL_CAPABILITIES_FORMAT] = + VOL_CAP_FMT_PERSISTENTOBJECTIDS | + VOL_CAP_FMT_SYMBOLICLINKS | + VOL_CAP_FMT_HARDLINKS | + VOL_CAP_FMT_JOURNAL | + VOL_CAP_FMT_JOURNAL_ACTIVE | + VOL_CAP_FMT_NO_ROOT_TIMES | + VOL_CAP_FMT_SPARSE_FILES | + VOL_CAP_FMT_ZERO_RUNS | + VOL_CAP_FMT_CASE_SENSITIVE | + VOL_CAP_FMT_CASE_PRESERVING | + VOL_CAP_FMT_FAST_STATFS | + VOL_CAP_FMT_2TB_FILESIZE | + VOL_CAP_FMT_OPENDENYMODES | + VOL_CAP_FMT_PATH_FROM_ID | + VOL_CAP_FMT_64BIT_OBJECT_IDS | + VOL_CAP_FMT_NO_VOLUME_SIZES | + VOL_CAP_FMT_DECMPFS_COMPRESSION | + VOL_CAP_FMT_HIDDEN_FILES ; + fsap->f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] = + VOL_CAP_INT_SEARCHFS | + VOL_CAP_INT_ATTRLIST | + VOL_CAP_INT_NFSEXPORT | + VOL_CAP_INT_READDIRATTR | + VOL_CAP_INT_EXCHANGEDATA | + VOL_CAP_INT_COPYFILE | + VOL_CAP_INT_ALLOCATE | + VOL_CAP_INT_VOL_RENAME | + VOL_CAP_INT_ADVLOCK | + VOL_CAP_INT_FLOCK | + VOL_CAP_INT_EXTENDED_ATTR | + VOL_CAP_INT_USERACCESS | +#if NAMEDSTREAMS + VOL_CAP_INT_NAMEDSTREAMS | +#endif + + VOL_CAP_INT_MANLOCK ; + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED1] = 0; + fsap->f_capabilities.valid[VOL_CAPABILITIES_RESERVED2] = 0; + + /* Check if we are case-sensitive */ + if (zfsvfs->z_case == ZFS_CASE_SENSITIVE) + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] + |= VOL_CAP_FMT_CASE_SENSITIVE; + + /* Check if xattr is enabled */ + if (zfsvfs->z_xattr == B_TRUE) { + fsap->f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] + |= VOL_CAP_INT_EXTENDED_ATTR; + } + + VFSATTR_SET_SUPPORTED(fsap, f_capabilities); + } + if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { + + fsap->f_attributes.validattr.commonattr = + ATTR_CMN_NAME | + ATTR_CMN_DEVID | + ATTR_CMN_FSID | + ATTR_CMN_OBJTYPE | + ATTR_CMN_OBJTAG | + ATTR_CMN_OBJID | + ATTR_CMN_OBJPERMANENTID | + ATTR_CMN_PAROBJID | + /* ATTR_CMN_SCRIPT | */ + ATTR_CMN_CRTIME | + ATTR_CMN_MODTIME | + ATTR_CMN_CHGTIME | + ATTR_CMN_ACCTIME | + /* ATTR_CMN_BKUPTIME | */ + ATTR_CMN_FNDRINFO | + ATTR_CMN_OWNERID | + ATTR_CMN_GRPID | + ATTR_CMN_ACCESSMASK | + ATTR_CMN_FLAGS | + ATTR_CMN_USERACCESS | + ATTR_CMN_EXTENDED_SECURITY | + ATTR_CMN_UUID | + ATTR_CMN_GRPUUID | + 0; + fsap->f_attributes.validattr.volattr = + ATTR_VOL_FSTYPE | + ATTR_VOL_SIGNATURE | + ATTR_VOL_SIZE | + ATTR_VOL_SPACEFREE | + ATTR_VOL_SPACEAVAIL | + ATTR_VOL_MINALLOCATION | + ATTR_VOL_ALLOCATIONCLUMP | + ATTR_VOL_IOBLOCKSIZE | + ATTR_VOL_OBJCOUNT | + ATTR_VOL_FILECOUNT | + ATTR_VOL_DIRCOUNT | + ATTR_VOL_MAXOBJCOUNT | + ATTR_VOL_MOUNTPOINT | + ATTR_VOL_NAME | + ATTR_VOL_MOUNTFLAGS | + ATTR_VOL_MOUNTEDDEVICE | + /* ATTR_VOL_ENCODINGSUSED */ + ATTR_VOL_CAPABILITIES | + ATTR_VOL_ATTRIBUTES; + fsap->f_attributes.validattr.dirattr = + ATTR_DIR_LINKCOUNT | + ATTR_DIR_ENTRYCOUNT | + ATTR_DIR_MOUNTSTATUS; + fsap->f_attributes.validattr.fileattr = + ATTR_FILE_LINKCOUNT | + ATTR_FILE_TOTALSIZE | + ATTR_FILE_ALLOCSIZE | + /* ATTR_FILE_IOBLOCKSIZE */ + ATTR_FILE_DEVTYPE | + /* ATTR_FILE_FORKCOUNT */ + /* ATTR_FILE_FORKLIST */ + ATTR_FILE_DATALENGTH | + ATTR_FILE_DATAALLOCSIZE | + ATTR_FILE_RSRCLENGTH | + ATTR_FILE_RSRCALLOCSIZE; + fsap->f_attributes.validattr.forkattr = 0; + fsap->f_attributes.nativeattr.commonattr = + ATTR_CMN_NAME | + ATTR_CMN_DEVID | + ATTR_CMN_FSID | + ATTR_CMN_OBJTYPE | + ATTR_CMN_OBJTAG | + ATTR_CMN_OBJID | + ATTR_CMN_OBJPERMANENTID | + ATTR_CMN_PAROBJID | + /* ATTR_CMN_SCRIPT | */ + ATTR_CMN_CRTIME | + 
ATTR_CMN_MODTIME | + /* ATTR_CMN_CHGTIME | */ /* Supported but not native */ + ATTR_CMN_ACCTIME | + /* ATTR_CMN_BKUPTIME | */ + /* ATTR_CMN_FNDRINFO | */ + ATTR_CMN_OWNERID | /* Supported but not native */ + ATTR_CMN_GRPID | /* Supported but not native */ + ATTR_CMN_ACCESSMASK | /* Supported but not native */ + ATTR_CMN_FLAGS | + ATTR_CMN_USERACCESS | + ATTR_CMN_EXTENDED_SECURITY | + ATTR_CMN_UUID | + ATTR_CMN_GRPUUID | + 0; + fsap->f_attributes.nativeattr.volattr = + ATTR_VOL_FSTYPE | + ATTR_VOL_SIGNATURE | + ATTR_VOL_SIZE | + ATTR_VOL_SPACEFREE | + ATTR_VOL_SPACEAVAIL | + ATTR_VOL_MINALLOCATION | + ATTR_VOL_ALLOCATIONCLUMP | + ATTR_VOL_IOBLOCKSIZE | + ATTR_VOL_OBJCOUNT | + ATTR_VOL_FILECOUNT | + ATTR_VOL_DIRCOUNT | + ATTR_VOL_MAXOBJCOUNT | + ATTR_VOL_MOUNTPOINT | + ATTR_VOL_NAME | + ATTR_VOL_MOUNTFLAGS | + ATTR_VOL_MOUNTEDDEVICE | + /* ATTR_VOL_ENCODINGSUSED */ + ATTR_VOL_CAPABILITIES | + ATTR_VOL_ATTRIBUTES; + fsap->f_attributes.nativeattr.dirattr = 0; + fsap->f_attributes.nativeattr.fileattr = + /* ATTR_FILE_LINKCOUNT | */ /* Supported but not native */ + ATTR_FILE_TOTALSIZE | + ATTR_FILE_ALLOCSIZE | + /* ATTR_FILE_IOBLOCKSIZE */ + ATTR_FILE_DEVTYPE | + /* ATTR_FILE_FORKCOUNT */ + /* ATTR_FILE_FORKLIST */ + ATTR_FILE_DATALENGTH | + ATTR_FILE_DATAALLOCSIZE | + ATTR_FILE_RSRCLENGTH | + ATTR_FILE_RSRCALLOCSIZE; + fsap->f_attributes.nativeattr.forkattr = 0; + + VFSATTR_SET_SUPPORTED(fsap, f_attributes); + } + if (VFSATTR_IS_ACTIVE(fsap, f_create_time)) { + char osname[MAXNAMELEN]; + uint64_t value; + + // Get dataset name + dmu_objset_name(zfsvfs->z_os, osname); + dsl_prop_get_integer(osname, "CREATION", + &value, NULL); + fsap->f_create_time.tv_sec = value; + fsap->f_create_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_create_time); + } + if (VFSATTR_IS_ACTIVE(fsap, f_modify_time)) { + timestruc_t now; + uint64_t mtime[2]; + + gethrestime(&now); + ZFS_TIME_ENCODE(&now, mtime); + //fsap->f_modify_time = mtime; + ZFS_TIME_DECODE(&fsap->f_modify_time, mtime); + + VFSATTR_SET_SUPPORTED(fsap, f_modify_time); + } + /* + * For Carbon compatibility, pretend to support this legacy/unused + * attribute + */ + if (VFSATTR_IS_ACTIVE(fsap, f_backup_time)) { + fsap->f_backup_time.tv_sec = 0; + fsap->f_backup_time.tv_nsec = 0; + VFSATTR_SET_SUPPORTED(fsap, f_backup_time); + } + + if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) { + char osname[MAXNAMELEN], *slash; + dmu_objset_name(zfsvfs->z_os, osname); + + slash = strrchr(osname, '/'); + if (slash) { + /* Advance past last slash */ + slash += 1; + } else { + /* Copy whole osname (pool root) */ + slash = osname; + } + strlcpy(fsap->f_vol_name, slash, MAXPATHLEN); + +#if 0 + /* + * Finder volume name is set to the basename of the mountpoint path, + * unless the mountpoint path is "/" or NULL, in which case we use + * the f_mntfromname, such as "MyPool/mydataset" + */ + /* Get last path component of mnt 'on' */ + char *volname = strrchr(vfs_statfs(zfsvfs->z_vfs)->f_mntonname, '/'); + if (volname && (*(&volname[1]) != '\0')) { + strlcpy(fsap->f_vol_name, &volname[1], MAXPATHLEN); + } else { + /* Get last path component of mnt 'from' */ + volname = strrchr(vfs_statfs(zfsvfs->z_vfs)->f_mntfromname, '/'); + if (volname && (*(&volname[1]) != '\0')) { + strlcpy(fsap->f_vol_name, &volname[1], MAXPATHLEN); + } else { + strlcpy(fsap->f_vol_name, + vfs_statfs(zfsvfs->z_vfs)->f_mntfromname, + MAXPATHLEN); + } + } +#endif + + VFSATTR_SET_SUPPORTED(fsap, f_vol_name); + dprintf("vfs_getattr: volume name '%s'\n", fsap->f_vol_name); + } +/* + if (!zfsvfs->z_issnap) { + 
VFSATTR_RETURN(fsap, f_fssubtype, 0); + } else { + VFSATTR_RETURN(fsap, f_fssubtype, 2); + } +*/ + + /* If we are mimicing, we need to let userland know we are really ZFS */ + VFSATTR_RETURN(fsap, f_fssubtype, MNTTYPE_ZFS_SUBTYPE); + + /* According to joshade over at + * https://github.com/joshado/liberate-applefileserver/blob/master/liberate.m + * the following values need to be returned for it to be considered + * by Apple's AFS. + */ + VFSATTR_RETURN(fsap, f_signature, 18475); /* */ + VFSATTR_RETURN(fsap, f_carbon_fsid, 0); + // Make up a UUID here, based on the name + if (VFSATTR_IS_ACTIVE(fsap, f_uuid)) { + + char osname[MAXNAMELEN]; + int error; + +#if 0 + uint64_t pguid = 0; + uint64_t guid = 0; + spa_t *spa = 0; + + if (zfsvfs->z_os == NULL || + (guid = dmu_objset_fsid_guid(zfsvfs->z_os)) == 0ULL || + (spa = dmu_objset_spa(zfsvfs->z_os)) == NULL || + (pguid = spa_guid(spa)) == 0ULL) { + dprintf("%s couldn't get pguid or guid %llu %llu\n", + __func__, pguid, guid); + } + + if (guid != 0ULL && pguid != 0ULL) { + /* + * Seeding with the pool guid would + * also avoid clashes across pools + */ + /* Print 16 hex chars (8b guid), plus null char */ + /* snprintf puts null char */ + snprintf(osname, 33, "%016llx%016llx", pguid, guid); + osname[32] = '\0'; /* just in case */ + dprintf("%s: using pguid+guid [%s]\n", __func__, osname); + } else { + /* XXX */ +#endif + // Get dataset name + dmu_objset_name(zfsvfs->z_os, osname); + dprintf("%s: osname [%s]\n", __func__, osname); +#if 0 + /* XXX */ + } +#endif + if ((error = zfs_vfs_uuid_gen(osname, + fsap->f_uuid)) != 0) { + dprintf("%s uuid_gen error %d\n", __func__, error); + } else { + /* return f_uuid in fsap */ + VFSATTR_SET_SUPPORTED(fsap, f_uuid); + } + } + + uint64_t missing = 0; + missing = (fsap->f_active ^ (fsap->f_active & fsap->f_supported)); + if ( missing != 0) { + dprintf("vfs_getattr:: asked %08llx replied %08llx missing %08llx\n", + fsap->f_active, fsap->f_supported, + missing); + } + + ZFS_EXIT(zfsvfs); +#endif + return (0); +} + +int +zfs_vnode_lock(vnode_t *vp, int flags) +{ + int error; + + ASSERT(vp != NULL); + + error = vn_lock(vp, flags); + return (error); +} + + +#if !defined(HAVE_SPLIT_SHRINKER_CALLBACK) && !defined(HAVE_SHRINK) && \ + defined(HAVE_D_PRUNE_ALIASES) +/* + * Linux kernels older than 3.1 do not support a per-filesystem shrinker. + * To accommodate this we must improvise and manually walk the list of znodes + * attempting to prune dentries in order to be able to drop the inodes. + * + * To avoid scanning the same znodes multiple times they are always rotated + * to the end of the z_all_znodes list. New znodes are inserted at the + * end of the list so we're always scanning the oldest znodes first. 
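+ * The rotation itself is just (see the loop below):
+ *
+ *	zp = list_head(&zsb->z_all_znodes);
+ *	list_remove(&zsb->z_all_znodes, zp);
+ *	list_insert_tail(&zsb->z_all_znodes, zp);
+ *
+ * so each pass pushes already-visited znodes behind the ones that have
+ * not been scanned yet.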
+ */ +static int +zfs_sb_prune_aliases(zfs_sb_t *zsb, unsigned long nr_to_scan) +{ + znode_t **zp_array, *zp; + int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); + int objects = 0; + int i = 0, j = 0; + + zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); + + mutex_enter(&zsb->z_znodes_lock); + while ((zp = list_head(&zsb->z_all_znodes)) != NULL) { + + if ((i++ > nr_to_scan) || (j >= max_array)) + break; + + ASSERT(list_link_active(&zp->z_link_node)); + list_remove(&zsb->z_all_znodes, zp); + list_insert_tail(&zsb->z_all_znodes, zp); + + /* Skip active znodes and .zfs entries */ + if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) + continue; + + if (igrab(ZTOI(zp)) == NULL) + continue; + + zp_array[j] = zp; + j++; + } + mutex_exit(&zsb->z_znodes_lock); + + for (i = 0; i < j; i++) { + zp = zp_array[i]; + + ASSERT3P(zp, !=, NULL); + d_prune_aliases(ZTOI(zp)); + + if (atomic_read(&ZTOI(zp)->i_count) == 1) + objects++; + + iput(ZTOI(zp)); + } + + kmem_free(zp_array, max_array * sizeof (znode_t *)); + + return (objects); +} +#endif /* HAVE_D_PRUNE_ALIASES */ +/* + * The ARC has requested that the filesystem drop entries from the dentry + * and inode caches. This can occur when the ARC needs to free meta data + * blocks but can't because they are all pinned by entries in these caches. + */ + +/* Get vnode for the root object of this mount */ +int +zfs_vfs_root(struct mount *mp, vnode_t **vpp, vfs_context_t *context) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + znode_t *rootzp; + int error; + + if (!zfsvfs) return EIO; + + ZFS_ENTER_NOERROR(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); + + ZFS_EXIT(zfsvfs); + +#if 0 + if (error == 0) { + error = zfs_vnode_lock(*vpp, 0); + if (error == 0) + (*vpp)->v_vflag |= VV_ROOT; + } +#endif + if (error != 0) + *vpp = NULL; + + /* zfs_vfs_mountroot() can be called first, and we need to stop + * getrootdir() from being called until root is mounted. XNU calls + * vfs_start() once that is done, then VFS_ROOT(&rootvnode) to set + * the rootvnode, so we inform SPL that getrootdir() is ready to + * be called. We only need to call this one, worth optimising? + */ + spl_vfs_start(); + + return (error); +} + +/* + * Teardown the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held. + */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + /* + * We have experienced deadlocks with dmu_recv_end happening between + * suspend_fs() and resume_fs(). Clearly something is not quite ready + * so we will wait for pools to be synced first. + * This is considered a temporary solution until we can work out + * the full issue. + */ + + zfs_unlinked_drain_stop_wait(zfsvfs); + + /* + * If someone has not already unmounted this file system, + * drain the iput_taskq to ensure all active references to the + * zfs_sb_t have been handled only then can it be safely destroyed. + */ + if (zfsvfs->z_os) + taskq_wait(dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. 
+ */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); +#ifdef FREEBSD_NAMECACHE + cache_purgevfs(zfsvfs->z_parent->z_vfs); +#endif + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + return (SET_ERROR(EIO)); + } + /* + * At this point there are no VFS ops active, and any new VFS ops + * will fail with EIO since we have z_teardown_lock for writer (only + * relevant for forced unmount). + * + * Release all holds on dbufs. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) + if (zp->z_sa_hdl) { + /* ASSERT(ZTOV(zp)->v_count >= 0); */ + zfs_znode_dmu_fini(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * If we are unmounting, set the unmounted flag and let new VFS ops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other VFS ops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && + !(vfs_isrdonly(zfsvfs->z_vfs))) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + dmu_objset_evict_dbufs(zfsvfs->z_os); + + dprintf("-teardown\n"); + return (0); +} + + +int +zfs_vfs_unmount(struct mount *mp, int mntflags, vfs_context_t *context) +{ + dprintf("%s\n", __func__); + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + + //kthread_t *td = (kthread_t *)curthread; + objset_t *os; +#ifndef _WIN32 + cred_t *cr = (cred_t *)vfs_context_ucred(context); +#endif + int ret; + + dprintf("+unmount\n"); + + zfs_unlinked_drain_stop_wait(zfsvfs); + + /* + * We might skip the sync called in the unmount path, since + * zfs_vfs_sync() is generally ignoring xnu's calls, and alas, + * mount_isforce() is set AFTER that sync call, so we can not + * detect unmount is inflight. But why not just sync now, it + * is safe. Optionally, sync if (mount_isforce()); + */ + spa_sync_allpools(); + +#ifndef _WIN32 + /*XXX NOEL: delegation admin stuffs, add back if we use delg. admin */ + ret = secpolicy_fs_unmount(cr, zfsvfs->z_vfs); + if (ret) { + if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), + ZFS_DELEG_PERM_MOUNT, cr)) + return (ret); + } +#endif + /* + * We purge the parent filesystem's vfsp as the parent filesystem + * and all of its snapshots have their vnode's v_vfsp set to the + * parent's filesystem's vfsp. Note, 'z_parent' is self + * referential for non-snapshots. 
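+	 * Once the purge is done, the rest of the unmount proceeds roughly
+	 * in this order (forced unmounts additionally set z_unmounted under
+	 * z_teardown_lock and vflush() with FORCECLOSE):
+	 *
+	 *	vflush(mp, NULLVP, SKIPSYSTEM);		flush idle vnodes
+	 *	zfsvfs_teardown(zfsvfs, B_TRUE);	close ZIL, fini znodes
+	 *	dmu_objset_disown(os, B_TRUE, zfsvfs);	give back the objset
+	 *	zfs_freevfs(zfsvfs->z_vfs);		free the zfsvfs_t
+	 *
+	 * so anything that can still refuse the unmount has to happen
+	 * before the teardown step.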
+ */ + (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + */ + dprintf("z_ctldir check: %p\n",zfsvfs->z_ctldir ); + if (zfsvfs->z_ctldir != NULL) { + +#ifndef _WIN32 + // We can not unmount from kernel space, and there is a known + // race causing panic + if ((ret = zfsctl_umount_snapshots(zfsvfs->z_vfs, 0 /*fflag*/, cr)) != 0) + return (ret); +#endif + dprintf("vflush 1\n"); + ret = vflush(zfsvfs->z_vfs, zfsvfs->z_ctldir, (mntflags & MNT_FORCE) ? FORCECLOSE : 0|SKIPSYSTEM); + //ret = vflush(zfsvfs->z_vfs, NULLVP, 0); + //ASSERT(ret == EBUSY); + if (!(mntflags & MNT_FORCE)) { + if (vnode_isinuse(zfsvfs->z_ctldir, 1)) { + dprintf("zfsctl vp still in use %p\n", zfsvfs->z_ctldir); + return (EBUSY); + } + //ASSERT(zfsvfs->z_ctldir->v_count == 1); + } + dprintf("z_ctldir destroy\n"); +#ifndef _WIN32 + zfsctl_destroy(zfsvfs); +#endif + ASSERT(zfsvfs->z_ctldir == NULL); + } + +#if 0 + // If we are ourselves a snapshot + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + struct vnode *vp; + printf("We are unmounting a snapshot\n"); + vp = vfs_vnodecovered(zfsvfs->z_vfs); + if (vp) { + struct vnop_inactive_args ap; + ap.a_vp = vp; + printf(".. telling gfs layer\n"); + gfs_dir_inactive(&ap); + printf("..and put\n"); + vnode_put(vp); + } + } +#endif + + ret = vflush(mp, NULLVP, SKIPSYSTEM); + + if (mntflags & MNT_FORCE) { + /* + * Mark file system as unmounted before calling + * vflush(FORCECLOSE). This way we ensure no future vnops + * will be called and risk operating on DOOMED vnodes. + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfsvfs->z_unmounted = B_TRUE; + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * Flush all the files. + */ + ret = vflush(mp, NULLVP, (mntflags & MNT_FORCE) ? FORCECLOSE|SKIPSYSTEM : SKIPSYSTEM); + + if ((ret != 0) && !(mntflags & MNT_FORCE)) { + if (!zfsvfs->z_issnap) { +#ifndef _WIN32 + zfsctl_create(zfsvfs); +#endif + //ASSERT(zfsvfs->z_ctldir != NULL); + } + return (ret); + } + +#ifdef __APPLE__ + if (!vfs_isrdonly(zfsvfs->z_vfs) && + spa_writeable(dmu_objset_spa(zfsvfs->z_os)) && + !(mntflags & MNT_FORCE)) { + /* Update the last-unmount time for Spotlight's next mount */ + char osname[MAXNAMELEN]; + timestruc_t now; + dmu_tx_t *tx; + int error; + uint64_t value; + + dmu_objset_name(zfsvfs->z_os, osname); + dprintf("ZFS: '%s' Updating spotlight LASTUNMOUNT property\n", + osname); + + gethrestime(&now); + zfsvfs->z_last_unmount_time = now.tv_sec; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + value = zfsvfs->z_last_unmount_time; + error = zap_update(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_prop_to_name(ZFS_PROP_APPLE_LASTUNMOUNT), + 8, 1, + &value, tx); + dmu_tx_commit(tx); + } + dprintf("ZFS: '%s' set lastunmount to 0x%lx (%d)\n", + osname, zfsvfs->z_last_unmount_time, error); + } + +#endif + +#ifdef sun + if (!(fflag & MS_FORCE)) { + /* + * Check the number of active vnodes in the file system. + * Our count is maintained in the vfs structure, but the + * number is off by 1 to indicate a hold on the vfs + * structure itself. + * + * The '.zfs' directory maintains a reference of its + * own, and any active references underneath are + * reflected in the vnode count. 
+ */ + if (zfsvfs->z_ctldir == NULL) { + if (vfsp->vfs_count > 1) + return (EBUSY); + } else { + if (vfsp->vfs_count > 2 || + zfsvfs->z_ctldir->v_count > 1) + return (EBUSY); + } + } +#endif + + /* + * Last chance to dump unreferenced system files. + */ + (void) vflush(mp, NULLVP, FORCECLOSE); + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + dprintf("OS %p\n", os); + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os_user_ptr_lock); + dprintf("mutex\n"); + dmu_objset_set_user(os, NULL); + dprintf("set\n"); + mutex_exit(&os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dprintf("disown\n"); + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + + dprintf("OS released\n"); + + /* + * We can now safely destroy the '.zfs' directory node. + */ +#ifndef _WIN32 + if (zfsvfs->z_ctldir != NULL) + zfsctl_destroy(zfsvfs); +#endif +#if 0 + if (zfsvfs->z_issnap) { + vnode_t *svp = vfsp->mnt_vnodecovered; + + if (svp->v_count >= 2) + VN_RELE(svp); + } +#endif + + dprintf("freevfs\n"); + zfs_freevfs(zfsvfs->z_vfs); + + dprintf("-unmount\n"); + + return (0); +} + +static int +zfs_vget_internal(zfsvfs_t *zfsvfs, ino64_t ino, vnode_t **vpp) +{ + znode_t *zp; + int err = 0; +#if 0 + dprintf("vget get %llu\n", ino); + /* + * zfs_zget() can't operate on virtual entries like .zfs/ or + * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. + * This will make NFS to switch to LOOKUP instead of using VGET. + */ + if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || + (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) + return (EOPNOTSUPP); + + + /* + * Check to see if we expect to find this in the hardlink avl tree of + * hashes. Use the MSB set high as indicator. + */ + hardlinks_t *findnode = NULL; + if ((1ULL<<31) & ino) { + hardlinks_t *searchnode; + avl_index_t loc; + + searchnode = kmem_alloc(sizeof(hardlinks_t), KM_SLEEP); + + dprintf("ZFS: vget looking for (%llx,%llu)\n", ino, ino); + + searchnode->hl_linkid = ino; + + rw_enter(&zfsvfs->z_hardlinks_lock, RW_READER); + findnode = avl_find(&zfsvfs->z_hardlinks_linkid, searchnode, &loc); + rw_exit(&zfsvfs->z_hardlinks_lock); + + kmem_free(searchnode, sizeof(hardlinks_t)); + + if (findnode) { + dprintf("ZFS: vget found (%llu, %llu, %u): '%s'\n", + findnode->hl_parent, + findnode->hl_fileid, findnode->hl_linkid, + findnode->hl_name); + // Lookup the actual zp instead + ino = findnode->hl_fileid; + } // findnode + } // MSB set + + + /* We can not be locked during zget. */ + if (!ino) { + dprintf("%s: setting ino from %lld to 2\n", __func__, ino); + ino = 2; + } + + err = zfs_zget(zfsvfs, ino, &zp); + + if (err) { + dprintf("zget failed %d\n", err); + return err; + } + + /* Don't expose EA objects! */ + if (zp->z_pflags & ZFS_XATTR) { + err = ENOENT; + goto out; + } + if (zp->z_unlinked) { + err = EINVAL; + goto out; + } + + *vpp = ZTOV(zp); + + err = zfs_vnode_lock(*vpp, 0/*flags*/); + + /* + * Spotlight requires that vap->va_name() is set when returning + * from vfs_vget, so that vfs_getrealpath() can succeed in returning + * a path to mds. 
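+	 * The name is recovered below in this order: the dataset name for
+	 * the root znode, the hardlink AVL entry (hl_name/hl_parent) when
+	 * the id had the high bit set, the cached z_name_cache, and as a
+	 * last resort a zap_value_search() of the parent directory.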
+ */ + char *name = kmem_alloc(MAXPATHLEN + 2, KM_SLEEP); + + /* Root can't lookup in ZAP */ + if (zp->z_id == zfsvfs->z_root) { + + dmu_objset_name(zfsvfs->z_os, name); + dprintf("vget: set root '%s'\n", name); + vnode_update_identity(*vpp, NULL, name, + strlen(name), 0, + VNODE_UPDATE_NAME); + + } else { + uint64_t parent; + + // if its a hardlink cache + if (findnode) { + + dprintf("vget: updating vnode to '%s' and parent %llu\n", + findnode->hl_name, findnode->hl_parent); + + vnode_update_identity(*vpp, + NULL, + findnode->hl_name, + strlen(findnode->hl_name), + 0, + VNODE_UPDATE_NAME|VNODE_UPDATE_PARENT); + mutex_enter(&zp->z_lock); + strlcpy(zp->z_name_cache, findnode->hl_name, PATH_MAX); + zp->z_finder_parentid = findnode->hl_parent; + mutex_exit(&zp->z_lock); + + + // If we already have the name, cached in zfs_vnop_lookup + } else if (zp->z_name_cache[0]) { + dprintf("vget: cached name '%s'\n", zp->z_name_cache); + vnode_update_identity(*vpp, NULL, zp->z_name_cache, + strlen(zp->z_name_cache), 0, + VNODE_UPDATE_NAME); + + /* If needed, if findnode is set, we can update the parentid too */ + + } else { + + /* Lookup name from ID, grab parent */ + VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)) == 0); + + if (zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), name) == 0) { + + dprintf("vget: set name '%s'\n", name); + vnode_update_identity(*vpp, NULL, name, + strlen(name), 0, + VNODE_UPDATE_NAME); + } else { + dprintf("vget: unable to get name for %llu\n", zp->z_id); + } // !zap_search + } + } // rootid + + kmem_free(name, MAXPATHLEN + 2); + + out: + /* + * We do not release the vp here in vget, if we do, we panic with io_count + * != 1 + * + * VN_RELE(ZTOV(zp)); + */ + if (err != 0) { + VN_RELE(ZTOV(zp)); + *vpp = NULL; + } + + dprintf("vget return %d\n", err); +#endif + return (err); +} + +#ifdef _WIN32 +/* + * Get a vnode from a file id (ignoring the generation) + * + * Use by NFS Server (readdirplus) and VFS (build_path) + */ +int +zfs_vfs_vget(struct mount *mp, ino64_t ino, vnode_t **vpp, vfs_context_t *context) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + int error; + + ZFS_ENTER(zfsvfs); + + /* + * On Mac OS X we always export the root directory id as 2. + * So we don't expect to see the real root directory id + * from zfs_vfs_vget KPI (unless of course the real id was + * already 2). + */ + if (ino == 2) ino = zfsvfs->z_root; + + if ((ino == zfsvfs->z_root) && (zfsvfs->z_root != 2)) { + error = VFS_ROOT(mp, 0, vpp); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_vget_internal(zfsvfs, ino, vpp); + + ZFS_EXIT(zfsvfs); + return (error); +} +#endif /* _WIN32 */ + + +#ifndef _WIN32 +static int +zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * If this is regular file system vfsp is the same as + * zfsvfs->z_parent->z_vfs, but if it is snapshot, + * zfsvfs->z_parent->z_vfs represents parent file system + * which we have to use here, because only this file system + * has mnt_export configured. 
+ */ + return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, + credanonp, numsecflavors, secflavors)); +} + +CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); +CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); + +#endif + +#ifdef _WIN32 + +int +zfs_vfs_setattr( struct mount *mp, struct vfs_attr *fsap, vfs_context_t *context) +{ + // 10a286 bits has an implementation of this + return (ENOTSUP); +} + +/* + * NFS Server File Handle File ID + */ +typedef struct zfs_zfid { + uint8_t zf_object[8]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[8]; /* gen[i] = gen >> (8 * i) */ +} zfs_zfid_t; +#endif /* _WIN32 */ + +#ifdef _WIN32 +/* + * File handle to vnode pointer + */ +int +zfs_vfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, + vnode_t **vpp, vfs_context_t *context) +{ +dprintf("%s\n", __func__); + zfsvfs_t *zfsvfs = vfs_fsprivate(mp); + zfs_zfid_t *zfid = (zfs_zfid_t *)fhp; + znode_t *zp; + uint64_t obj_num = 0; + uint64_t fid_gen = 0; + uint64_t zp_gen; + int i; + int error; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + if (fhlen < sizeof (zfs_zfid_t)) { + error = EINVAL; + goto out; + } + + /* + * Grab the object and gen numbers in an endian neutral manner + */ + for (i = 0; i < sizeof (zfid->zf_object); i++) + obj_num |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + + if ((error = zfs_zget(zfsvfs, obj_num, &zp))) { + goto out; + } + + zp_gen = zp->z_gen; + if (zp_gen == 0) + zp_gen = 1; + + if (zp->z_unlinked || zp_gen != fid_gen) { + VN_RELE(ZTOV(zp)); + error = EINVAL; + goto out; + } + *vpp = ZTOV(zp); +out: + ZFS_EXIT(zfsvfs); + return (error); +} +#endif //_WIN32 + +#ifdef _WIN32 +/* + * Vnode pointer to File handle + * + * XXX Do we want to check the DSL sharenfs property? + */ +int +zfs_vfs_vptofh(vnode_t *vp, int *fhlenp, unsigned char *fhp, vfs_context_t *context) +{ +dprintf("%s\n", __func__); +#if 0 +zfsvfs_t *zfsvfs = vfs_fsprivate(vnode_mount(vp)); + zfs_zfid_t *zfid = (zfs_zfid_t *)fhp; + znode_t *zp = VTOZ(vp); + uint64_t obj_num; + uint64_t zp_gen; + int i; + //int error; + + if (*fhlenp < sizeof (zfs_zfid_t)) { + return (EOVERFLOW); + } + + ZFS_ENTER(zfsvfs); + + obj_num = zp->z_id; + zp_gen = zp->z_gen; + if (zp_gen == 0) + zp_gen = 1; + + /* + * Store the object and gen numbers in an endian neutral manner + */ + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(obj_num >> (8 * i)); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(zp_gen >> (8 * i)); + + *fhlenp = sizeof (zfs_zfid_t); + + ZFS_EXIT(zfsvfs); +#endif + return (0); +} +#endif /* _WIN32 */ + + +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + return (0); +} + +/* + * Reopen zfsvfs_t::z_os and release VOPs. 
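+ * The expected caller (rollback or receive) pairs this with
+ * zfs_suspend_fs() above, along the lines of:
+ *
+ *	if (zfs_suspend_fs(zfsvfs) == 0) {
+ *		... rewind or receive into the dataset ...
+ *		error = zfs_resume_fs(zfsvfs, ds);
+ *	}
+ *
+ * where ds is the (still long-held) dataset we kept ownership of across
+ * the swap.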
+ */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + int err, err2; + znode_t *zp; + + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just update the objset_t, as the one we + * had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + VERIFY0(dmu_objset_from_ds(ds, &os)); + + err = zfsvfs_init(zfsvfs, os); + if (err != 0) + goto bail; + + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + zfs_set_fuid_feature(zfsvfs); + //zfsvfs->z_rollback_time = jiffies; + + /* + * Attempt to re-establish all the active inodes with their + * dbufs. If a zfs_rezget() fails, then we unhash the inode + * and mark it stale. This prevents a collision if a new + * inode/object is created which must use the same inode + * number. The stale inode will be be released when the + * VFS prunes the dentry holding the remaining references + * on the stale inode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + err2 = zfs_rezget(zp); + if (err2) { + //remove_inode_hash(ZTOI(zp)); + zp->z_is_stale = B_TRUE; + } + } + mutex_exit(&zfsvfs->z_znodes_lock); + + if (!vfs_isrdonly(zfsvfs->z_vfs) && !zfsvfs->z_unmounted) { + /* + * zfs_suspend_fs() could have interrupted freeing + * of dnodes. We need to restart this freeing so + * that we don't "leak" the space. + */ + zfs_unlinked_drain(zfsvfs); + } + +bail: + /* release the VFS ops */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err) { + /* + * Since we couldn't setup the sa framework, try to force + * unmount this file system. + */ +#ifndef _WIN32 + if (zfsvfs->z_os) + (void) zfs_umount(zfsvfs->z_sb); +#endif + } + return (err); +} + + +void +zfs_freevfs(struct mount *vfsp) +{ + zfsvfs_t *zfsvfs = vfs_fsprivate(vfsp); + + dprintf("+freevfs\n"); + +#ifdef sun + /* + * If this is a snapshot, we have an extra VFS_HOLD on our parent + * from zfs_mount(). Release it here. If we came through + * zfs_mountroot() instead, we didn't grab an extra hold, so + * skip the VFS_RELE for rootvfs. + */ + if (zfsvfs->z_issnap && (vfsp != rootvfs)) + VFS_RELE(zfsvfs->z_parent->z_vfs); +#endif /* sun */ + + vfs_setfsprivate(vfsp, NULL); + + zfsvfs_free(zfsvfs); + + atomic_dec_32(&zfs_active_fs_count); + dprintf("-freevfs\n"); +} + +#ifdef __i386__ +static int desiredvnodes_backup; +#endif + +static void +zfs_vnodes_adjust(void) +{ + // What is this? +#ifdef __i386XXX__ + int newdesiredvnodes; + + desiredvnodes_backup = desiredvnodes; + + /* + * We calculate newdesiredvnodes the same way it is done in + * vntblinit(). If it is equal to desiredvnodes, it means that + * it wasn't tuned by the administrator and we can tune it down. + */ + newdesiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * + vm_kmem_size / (5 * (sizeof(struct vm_object) + + sizeof(struct vnode)))); + if (newdesiredvnodes == desiredvnodes) + desiredvnodes = (3 * newdesiredvnodes) / 4; +#endif +} + +static void +zfs_vnodes_adjust_back(void) +{ + +#ifdef __i386XXX__ + desiredvnodes = desiredvnodes_backup; +#endif +} + +void +zfs_init(void) +{ + + dprintf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); + + /* + * Initialize .zfs directory structures + */ +// zfsctl_init(); + + /* + * Initialize znode cache, vnode ops, etc... + */ + zfs_znode_init(); + + /* + * Reduce number of vnodes. 
Originally number of vnodes is calculated + * with UFS inode in mind. We reduce it here, because it's too big for + * ZFS/i386. + */ + zfs_vnodes_adjust(); + + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); +} + +void +zfs_fini(void) +{ +// zfsctl_fini(); + zfs_znode_fini(); + zfs_vnodes_adjust_back(); +} + +int +zfs_busy(void) +{ + return (zfs_active_fs_count != 0); +} + +int +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) +{ + int error; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (SET_ERROR(EINVAL)); + + if (newvers < zfsvfs->z_version) + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error==0); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal(dmu_objset_spa(os), "upgrade", tx, + "oldver=%llu newver=%llu dataset = %llu", zfsvfs->z_version, newvers, + dmu_objset_id(os)); + + dmu_tx_commit(tx); + + zfsvfs->z_version = newvers; + os->os_version = newvers; + + zfs_set_fuid_feature(zfsvfs); + + return (0); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. 
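+	 * A typical call, e.g. while setting up a zfsvfs, looks like:
+	 *
+	 *	uint64_t zplver;
+	 *	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplver);
+	 *
+	 * and for the properties handled below it returns 0 with a sane
+	 * default (ZPL_VERSION, ZFS_CASE_SENSITIVE, ...) when nothing has
+	 * been stored in the master node yet.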
+ */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLMODE: + *value = ZFS_ACLTYPE_OFF; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + +/* + * Return true if the coresponding vfs's unmounted flag is set. + * Otherwise return false. + * If this function returns true we know VFS unmount has been initiated. + */ +boolean_t +zfs_get_vfs_flag_unmounted(objset_t *os) +{ + zfsvfs_t *zfvp; + boolean_t unmounted = B_FALSE; + + ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp != NULL && zfvp->z_vfs != NULL && + (vfs_isunmount(zfvp->z_vfs))) + unmounted = B_TRUE; + mutex_exit(&os->os_user_ptr_lock); + + return (unmounted); +} + +#ifdef _KERNEL +void +zfsvfs_update_fromname(const char *oldname, const char *newname) +{ +#if 0 + char tmpbuf[MAXPATHLEN]; + struct mount *mp; + char *fromname; + size_t oldlen; + + oldlen = strlen(oldname); + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + fromname = mp->mnt_stat.f_mntfromname; + if (strcmp(fromname, oldname) == 0) { + (void)strlcpy(fromname, newname, + sizeof(mp->mnt_stat.f_mntfromname)); + continue; + } + if (strncmp(fromname, oldname, oldlen) == 0 && + (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { + (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", + newname, fromname + oldlen); + (void)strlcpy(fromname, tmpbuf, + sizeof(mp->mnt_stat.f_mntfromname)); + continue; + } + } + mtx_unlock(&mountlist_mtx); +#endif +} + +#endif diff --git a/module/os/windows/zfs/zfs_vnops.c b/module/os/windows/zfs/zfs_vnops.c new file mode 100644 index 000000000000..a4e46a72f3dd --- /dev/null +++ b/module/os/windows/zfs/zfs_vnops.c @@ -0,0 +1,7681 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ +/* Portions Copyright 2013, 2017 Jorgen Lundman */ + +#include + +#include +#include +#include +#include +#include +#include +#include +//#include +#include +#include +#include +#include +#include +#include +#include +//#include +//#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include +//#include +//#include +#include +//#include +//#include +//#include +#include +#include +#include +#include + +//#define dprintf printf + +int zfs_vnop_force_formd_normalized_output = 0; /* disabled by default */ + + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) VN_RELE() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zsb->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. 
On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * if (error == ERESTART) { + * waited = B_TRUE; + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * zil_commit(zilog, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + + +/* ARGSUSED */ +int +zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(*vpp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Honor ZFS_APPENDONLY file attribute */ + if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & FAPPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + +#if 0 + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (fs_vscan(*vpp, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return ((EACCES)); + } + } +#endif + + /* Keep a count of the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +int +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + /* + * Clean up any locks held by this process on the vp. 
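+	 * (cleanlocks()/cleanshares() are only compiled on platforms that
+	 *  have them; the per-open bookkeeping this port does keep is the
+	 *  synchronous-open counter, which mirrors zfs_open():
+	 *
+	 *	open:   if (flag & (FSYNC | FDSYNC))
+	 *			atomic_inc_32(&zp->z_sync_cnt);
+	 *	close:  if ((flag & (FSYNC | FDSYNC)) && (count == 1))
+	 *			atomic_dec_32(&zp->z_sync_cnt);
+	 *
+	 *  so the count only drops on the last close of the file.)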
+ */ +#ifndef _WIN32 + cleanlocks(vp, ddi_get_pid(), 0); + cleanshares(vp, ddi_get_pid()); +#endif + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if ((flag & (FSYNC | FDSYNC)) && (count == 1)) + atomic_dec_32(&zp->z_sync_cnt); + +#if 0 + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) + VERIFY(fs_vscan(vp, cr, 1) == 0); +#endif + + ZFS_EXIT(zfsvfs); + return (0); +} + +#if defined(SEEK_HOLE) && defined(SEEK_DATA) +/* + * Lseek support for finding holes (cmd == SEEK_HOLE) and + * data (cmd == SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey_common(struct vnode *vp, int cmd, loff_t *off) +{ + znode_t *zp = VTOZ(vp); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_size; + if (noff >= file_sz) { + return (SET_ERROR(ENXIO)); + } + + if (cmd == SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); + + if (error == ESRCH) + return (SET_ERROR(ENXIO)); + + /* + * We could find a hole that begins after the logical end-of-file, + * because dmu_offset_next() only works on whole blocks. If the + * EOF falls mid-block, then indicate that the "virtual hole" + * at the end of the file begins at the logical EOF, rather than + * at the end of the last block. + */ + if (noff > file_sz) { + ASSERT(hole); + noff = file_sz; + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +int +zfs_holey(struct vnode *vp, int cmd, loff_t *off) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_holey_common(vp, cmd, off); + + ZFS_EXIT(zfsvfs); + return (error); +} + +#endif /* SEEK_HOLE && SEEK_DATA */ + +#if defined(_KERNEL) +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + */ +static void +update_pages(vnode_t *vp, int64_t nbytes, struct uio *uio, + dmu_tx_t *tx) +{ +#if 0 + znode_t *zp = VTOZ(vp); + //zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int len = nbytes; + int error = 0; + vm_offset_t vaddr = 0; + upl_t upl; + upl_page_info_t *pl = NULL; + off_t upl_start; + int upl_size; + int upl_page; + off_t off; + + upl_start = uio_offset(uio); + off = upl_start & (PAGE_SIZE - 1); + upl_start &= ~PAGE_MASK; + upl_size = (off + nbytes + (PAGE_SIZE - 1)) & ~PAGE_MASK; + + dprintf("update_pages %llu - %llu (adjusted %llu - %llu): off %llu\n", + uio_offset(uio), nbytes, upl_start, upl_size, off); + /* + * Create a UPL for the current range and map its + * page list into the kernel virtual address space. 
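+	 *
+	 * (This body is the macOS UPL implementation, kept under #if 0 for
+	 * reference until a Windows page-cache equivalent exists.)  In
+	 * sketch form, the loop below does, for every page in the range:
+	 *
+	 *	if (page is resident in the cache)
+	 *		copy the written bytes into the page too, so a
+	 *		concurrent mmap() view stays coherent
+	 *	else
+	 *		nothing to do; the caller already wrote the DMU copy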
+ */ + error = ubc_create_upl(vp, upl_start, upl_size, &upl, &pl, + UPL_FILE_IO | UPL_SET_LITE); + if ((error != KERN_SUCCESS) || !upl) { + printf("ZFS: update_pages failed to ubc_create_upl: %d\n", error); + return; + } + + if (ubc_upl_map(upl, &vaddr) != KERN_SUCCESS) { + printf("ZFS: update_pages failed to ubc_upl_map: %d\n", error); + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); + return; + } + + for (upl_page = 0; len > 0; ++upl_page) { + uint64_t bytes = MIN(PAGESIZE - off, len); + //uint64_t woff = uio_offset(uio); + /* + * We don't want a new page to "appear" in the middle of + * the file update (because it may not get the write + * update data), so we grab a lock to block + * zfs_getpage(). + */ + rw_enter(&zp->z_map_lock, RW_WRITER); + if (pl && upl_valid_page(pl, upl_page)) { + rw_exit(&zp->z_map_lock); + uio_setrw(uio, UIO_WRITE); + error = uiomove((caddr_t)vaddr + off, bytes, UIO_WRITE, uio); + if (error == 0) { + + /* + dmu_write(zfsvfs->z_os, zp->z_id, + woff, bytes, (caddr_t)vaddr + off, tx); + */ + /* + * We don't need a ubc_upl_commit_range() + * here since the dmu_write() effectively + * pushed this page to disk. + */ + } else { + /* + * page is now in an unknown state so dump it. + */ + ubc_upl_abort_range(upl, upl_start, PAGESIZE, + UPL_ABORT_DUMP_PAGES); + } + } else { // !upl_valid_page + /* + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, + uio, bytes, tx); + */ + rw_exit(&zp->z_map_lock); + } + + vaddr += PAGE_SIZE; + upl_start += PAGE_SIZE; + len -= bytes; + off = 0; + if (error) + break; + } + + /* + * Unmap the page list and free the UPL. + */ + (void) ubc_upl_unmap(upl); + /* + * We want to abort here since due to dmu_write() + * we effectively didn't dirty any pages. + */ + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); +#endif +} +#endif + +/* + * Read with UIO_NOCOPY flag means that sendfile(2) requests + * ZFS to populate a range of page cache pages with data. + * + * NOTE: this function could be optimized to pre-allocate + * all pages in advance, drain VPO_BUSY on all of them, + * map them into contiguous KVA region and populate them + * in one single dmu_read() call. 
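+ *
+ * A rough sketch of that optimization (hypothetical, FreeBSD-style API;
+ * not implemented here):
+ *
+ *	grab and busy every page covering [start, start + nbytes)
+ *	va = map those pages into one contiguous KVA window
+ *	error = dmu_read(os, zp->z_id, start, nbytes, va,
+ *	    DMU_READ_PREFETCH);
+ *	unmap, mark the pages valid and unbusy them
+ *
+ * On Windows the helper is compiled out entirely (#ifndef _WIN32 below),
+ * as there is no sendfile()/UIO_NOCOPY caller to serve.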
+ */ +#ifndef _WIN32 +static int +mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + struct sf_buf *sf; + vm_object_t obj; + page_t *pp; + int64_t start; + caddr_t va; + int len = nbytes; + int off; + int error = 0; + + ASSERT(uio->uio_segflg == UIO_NOCOPY); + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); + + zfs_vmobject_wlock(obj); + for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { + int bytes = MIN(PAGESIZE, len); + + pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY | + VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY); + if (pp->valid == 0) { + vm_page_io_start(pp); + zfs_vmobject_wunlock(obj); + va = zfs_map_page(pp, &sf); + error = dmu_read(os, zp->z_id, start, bytes, va, + DMU_READ_PREFETCH); + if (bytes != PAGESIZE && error == 0) + bzero(va + bytes, PAGESIZE - bytes); + zfs_unmap_page(sf); + zfs_vmobject_wlock(obj); + vm_page_io_finish(pp); + vm_page_lock(pp); + if (error) { + vm_page_free(pp); + } else { + pp->valid = VM_PAGE_BITS_ALL; + vm_page_activate(pp); + } + vm_page_unlock(pp); + } + if (error) + break; + uio->uio_resid -= bytes; + uio->uio_offset += bytes; + len -= bytes; + } + zfs_vmobject_wunlock(obj); + return (error); +} +#endif + + +static int +mappedread(vnode_t *vp, ssize_t nbytes, struct uio *uio) +{ + int error = 0; +#if 0 + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + int len = nbytes; + vm_offset_t vaddr = 0; + upl_t upl; + upl_page_info_t *pl = NULL; + off_t upl_start; + int upl_size; + int upl_page; + off_t off; + + + upl_start = uio_offset(uio); + off = upl_start & PAGE_MASK; + upl_start &= ~PAGE_MASK; + upl_size = (off + nbytes + (PAGE_SIZE - 1)) & ~PAGE_MASK; + + dprintf("zfs_mappedread: %llu - %d (adj %llu - %llu)\n", + uio_offset(uio), nbytes, + upl_start, upl_size); + + /* + * Create a UPL for the current range and map its + * page list into the kernel virtual address space. + */ + error = ubc_create_upl(vp, upl_start, upl_size, &upl, &pl, + UPL_FILE_IO | UPL_SET_LITE); + if ((error != KERN_SUCCESS) || !upl) { + printf("ZFS: mappedread failed to ubc_create_upl: %d\n", error); + return EIO; + } + + if (ubc_upl_map(upl, &vaddr) != KERN_SUCCESS) { + printf("ZFS: mappedread failed to ubc_upl_map: %d\n", error); + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); + return ENOMEM; + } + + for (upl_page = 0; len > 0; ++upl_page) { + uint64_t bytes = MIN(PAGE_SIZE - off, len); + if (pl && upl_valid_page(pl, upl_page)) { + uio_setrw(uio, UIO_READ); + + dprintf("uiomove to addy %p (%llu) for %llu bytes\n", vaddr+off, + off, bytes); + + error = uiomove((caddr_t)vaddr + off, bytes, UIO_READ, uio); + } else { + dprintf("dmu_read to addy %llu for %llu bytes\n", + uio_offset(uio), bytes); + error = dmu_read_uio(os, zp->z_id, uio, bytes); + } + + vaddr += PAGE_SIZE; + len -= bytes; + off = 0; + if (error) + break; + } + + /* + * Unmap the page list and free the UPL. + */ + (void) ubc_upl_unmap(upl); + (void) ubc_upl_abort(upl, UPL_ABORT_FREE_ON_EMPTY); +#endif + return (error); +} + +offset_t zfs_read_chunk_size = MAX_UPL_TRANSFER * PAGE_SIZE; /* Tunable */ + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: vp - vnode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - SYNC flags; used to provide FRSYNC semantics. + * cr - credentials of caller. 
+ * ct - caller context + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 on success, error code on failure. + * + * Side Effects: + * vp - atime updated if byte count > 0 + */ +/* ARGSUSED */ +int +zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + ssize_t n, nbytes; + int error = 0; +#ifndef _WIN32 + xuio_t *xuio = NULL; +#endif + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + os = zfsvfs->z_os; + + if (zp->z_pflags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + /* + * Validate file offset + */ + if (uio_offset(uio) < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Fasttrack empty reads + */ + if (uio_resid(uio) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * Note: In Mac OS X, mandatory lock checking occurs up in VFS layer. + * Check for mandatory locks + */ +#ifndef _WIN32 + if (MANDMODE(zp->z_mode)) { + if (error = chklock(vp, FREAD, + uio_offset(uio), uio_resid(uio), + uio->uio_fmode, ct)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EAGAIN)); + } + } +#endif + + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + */ + if (zfsvfs->z_log && + (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zfsvfs->z_log, zp->z_id); + + /* + * Lock the range against changes. + */ + locked_range_t *lr = rangelock_enter(&zp->z_rangelock, + uio_offset(uio), uio_resid(uio), RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. + */ + if (uio_offset(uio) >= zp->z_size) { + error = 0; + goto out; + } + + ASSERT(uio_offset(uio) < zp->z_size); + n = MIN(uio_resid(uio), zp->z_size - uio_offset(uio)); + +#ifdef sun + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { + int nblk; + int blksz = zp->z_blksz; + uint64_t offset = uio_offset(uio); + + xuio = (xuio_t *)uio; + if ((ISP2(blksz))) { + nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, + blksz)) / blksz; + } else { + ASSERT(offset + n <= blksz); + nblk = 1; + } + (void) dmu_xuio_init(xuio, nblk); + + if (vn_has_cached_data(vp)) { + /* + * For simplicity, we always allocate a full buffer + * even if we only expect to read a portion of a block. + */ + while (--nblk >= 0) { + (void) dmu_xuio_add(xuio, + dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz), 0, blksz); + } + } + } +#endif /* sun */ + + while (n > 0) { + nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio_offset(uio), zfs_read_chunk_size)); + +#ifdef __FreeBSD__ + if (uio->uio_segflg == UIO_NOCOPY) + error = mappedread_sf(vp, nbytes, uio); + else +#endif /* __FreeBSD__ */ + if (vn_has_cached_data(vp)) + error = mappedread(vp, nbytes, uio); + else + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes); + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + + n -= nbytes; + } +out: + rangelock_exit(lr); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + if (error) dprintf("zfs_read returning error %d\n", error); + return (error); +} + +/* + * Write the bytes to a file. + * + * IN: vp - vnode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - FAPPEND flag set if in append mode. + * cr - credentials of caller. 
+ * ct - caller context (NFS/CIFS fem monitor only) + * + * OUT: uio - updated offset and range. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime|mtime updated if byte count > 0 + */ + +/* ARGSUSED */ +int +zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + rlim64_t limit = MAXOFFSET_T; + ssize_t start_resid = uio_resid(uio); + ssize_t tx_bytes; + uint64_t end_size; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog; + offset_t woff; + ssize_t n, nbytes; + int max_blksz = zfsvfs->z_max_blksz; + int error = 0; + arc_buf_t *abuf; + const iovec_t *aiov = NULL; + xuio_t *xuio = NULL; + int i_iov = 0; + //int iovcnt = uio_iovcnt(uio); + iovec_t *iovp = (iovec_t *)uio_curriovbase(uio); + int write_eof; + int count = 0; + sa_bulk_attr_t bulk[4]; + uint64_t mtime[2], ctime[2]; + struct uio *uio_copy = NULL; + /* + * Fasttrack empty write + */ + n = start_resid; + if (n == 0) + return (0); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our + * callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (vfs_flags(zfsvfs->z_vfs) & MNT_RDONLY) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common() + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio_offset(uio) < zp->z_size))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + zilog = zfsvfs->z_log; + + /* + * Validate file offset + */ + woff = ioflag & FAPPEND ? zp->z_size : uio_offset(uio); + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + +#ifndef _WIN32 + /* + * Check for mandatory locks before calling zfs_range_lock() + * in order to prevent a deadlock with locks set via fcntl(). + */ + if (MANDMODE((mode_t)zp->z_mode) && + (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EAGAIN)); + } +#endif + +#ifdef sun + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ +#ifdef HAVE_UIO_ZEROCOPY + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else +#endif + zfs_prefault_write(MIN(n, max_blksz), uio); +#endif /* sun */ + + /* + * If in append mode, set the io offset pointer to eof. + */ + locked_range_t *lr; + if (ioflag & FAPPEND) { + /* + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. + */ + lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + woff = lr->lr_offset; + if (lr->lr_length == UINT64_MAX) { + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. 
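+			 *
+			 * Because the whole file is locked, re-reading
+			 * zp->z_size below yields a stable append offset;
+			 * no other writer can extend the file until this
+			 * range lock is dropped, which is what gives two
+			 * racing O_APPEND writers distinct offsets.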
+ */ + woff = zp->z_size; + } + uio_setoffset(uio, woff); + } else { + /* + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. + */ + lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); + } + +#ifndef _WIN32 + if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { + rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } +#endif + + if (woff >= limit) { + rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return ((EFBIG)); + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* Will this write extend the file length? */ + write_eof = (woff + n > zp->z_size); + + end_size = MAX(zp->z_size, woff + n); + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + + while (n > 0) { + abuf = NULL; + woff = uio_offset(uio); + + if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || + zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + error = SET_ERROR(EDQUOT); + break; + } + + if (xuio && abuf == NULL) { + dprintf(" xuio \n"); +#if 0 //fixme + ASSERT(i_iov < iovcnt); +#endif + aiov = &iovp[i_iov]; + abuf = dmu_xuio_arcbuf(xuio, i_iov); + dmu_xuio_clear(xuio, i_iov); + DTRACE_PROBE3(zfs_cp_write, int, i_iov, + iovec_t *, aiov, arc_buf_t *, abuf); + ASSERT((aiov->iov_base == abuf->b_data) || + ((char *)aiov->iov_base - (char *)abuf->b_data + + aiov->iov_len == arc_buf_size(abuf))); + i_iov++; + } else if (abuf == NULL && n >= max_blksz && + woff >= zp->z_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. "Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ + uint64_t cbytes; + + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + //dprintf(" uiocopy before %llu\n", uio_offset(uio)); + if ((error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes))) { + dmu_return_arcbuf(abuf); + break; + } + //dprintf(" uiocopy after %llu cbytes %llu\n", + // uio_offset(uio), cbytes); + ASSERT(cbytes == max_blksz); + } + + /* + * Start a transaction. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); + break; + } + + /* + * If rangelock_enter() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since rangelock_reduce() will + * shrink down lr_length to the appropriate size. + */ + if (lr->lr_length == UINT64_MAX) { + uint64_t new_blksz; + if (zp->z_blksz > max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. 
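+				 *
+				 * Worked example (illustrative): if
+				 * "recordsize" was lowered to 128K after
+				 * this file already had a single 160K
+				 * block, z_blksz (160K) is not a power of
+				 * two, highbit64(160K) is 18, so the block
+				 * may grow to MIN(end_size, 1ULL << 18),
+				 * i.e. at most 256K.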
+ */ + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, + 1ULL << highbit64(zp->z_blksz)); + } else { + new_blksz = MIN(end_size, max_blksz); + } + + dprintf("growing buffer to %llu\n", new_blksz); + zfs_grow_blocksize(zp, new_blksz, tx); + rangelock_reduce(lr, woff, n); + } + + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? + */ + nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + + if (woff + nbytes > zp->z_size) + vnode_pager_setsize(vp, woff + nbytes); + + if (abuf == NULL) { + + if ( vn_has_cached_data(vp) ) + uio_copy = uio_duplicate(uio); + + tx_bytes = uio_resid(uio); + + error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes, tx); + tx_bytes -= uio_resid(uio); + + } else { + tx_bytes = nbytes; + ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); + /* + * If this is not a full block write, but we are + * extending the file past EOF and this data starts + * block-aligned, use assign_arcbuf(). Otherwise, + * write via dmu_write(). + */ + if (tx_bytes < max_blksz && (!write_eof || + aiov->iov_base != abuf->b_data)) { + ASSERT(xuio); + dmu_write(zfsvfs->z_os, zp->z_id, woff, + aiov->iov_len, aiov->iov_base, tx); + dmu_return_arcbuf(abuf); + xuio_stat_wbuf_copied(); + } else { + ASSERT(xuio || tx_bytes == max_blksz); + error = dmu_assign_arcbuf_by_dbuf( + sa_get_db(zp->z_sa_hdl), woff, abuf, tx); + if (error != 0) { + dmu_return_arcbuf(abuf); + dmu_tx_commit(tx); + break; + } + } + ASSERT(tx_bytes <= uio_resid(uio)); + uioskip(uio, tx_bytes); + } + + if (tx_bytes && vn_has_cached_data(vp)) { +#ifdef _WIN32 + if (uio_copy) { + dprintf("Updatepage copy call %llu vs %llu (tx_bytes %llu) numvecs %d\n", + woff, uio_offset(uio_copy), tx_bytes, uio_iovcnt(uio_copy)); + update_pages(vp, tx_bytes, uio_copy, tx); + uio_free(uio_copy); + uio_copy = NULL; + } else { + dprintf("XXXXUpdatepage call %llu vs %llu (tx_bytes %llu) numvecs %d\n", + woff, uio_offset(uio), tx_bytes, uio_iovcnt(uio)); + update_pages(vp, tx_bytes, uio, tx); + } +#else + update_pages(vp, woff, tx_bytes, zfsvfs->z_os, + zp->z_id, 0, tx); +#endif + } + + /* + * If we made no progress, we're done. If we made even + * partial progress, update the znode and ZIL accordingly. + */ + if (tx_bytes == 0) { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + dmu_tx_commit(tx); + ASSERT(error != 0); + break; + } + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the excute bits is set. + * + * It would be nice to to this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(vp, cr, + (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + } + mutex_exit(&zp->z_acl_lock); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. 
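+		 *
+		 * Example (illustrative): two writers holding disjoint range
+		 * locks can both extend the file; whichever updates second
+		 * must not move z_size backwards.  The compare-and-swap loop
+		 * below therefore only ever advances z_size and harmlessly
+		 * loses the race to a larger value.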
+ */ + while ((end_size = zp->z_size) < uio_offset(uio)) { + (void) atomic_cas_64(&zp->z_size, end_size, + uio_offset(uio)); + ASSERT(error == 0); + } + + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. + */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, + NULL, NULL); + dmu_tx_commit(tx); + + if (error != 0) + break; + ASSERT(tx_bytes == nbytes); + n -= nbytes; + +#ifdef sun + if (!xuio && n > 0) + uio_prefaultpages(MIN(n, max_blksz), uio); +#endif /* sun */ +#ifdef _WIN32 + if (!xuio && n > 0) + zfs_prefault_write(MIN(n, max_blksz), uio); +#endif /* sun */ + + + } + + dprintf("zfs_write done remainder %llu\n", n); + + rangelock_exit(lr); + + /* + * If we're in replay mode, or we made no progress, return error. + * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_replay || uio_resid(uio) == start_resid) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ioflag & (FSYNC | FDSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); + + ZFS_EXIT(zfsvfs); + return (0); +} + +void +zfs_get_done(zgd_t *zgd, int error) +{ +#ifndef __APPLE__ + znode_t *zp = zgd->zgd_private; + objset_t *os = zp->z_zfsvfs->z_os; +#endif + + ASSERT(zgd->zgd_lr != NULL); + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + rangelock_exit(zgd->zgd_lr); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + /* + * We only need to release the vnode if zget took the path to call + * vnode_get() with already existing vnodes. If zget (would) call to + * allocate new vnode, we don't (ZGET_FLAG_WITHOUT_VNODE), and it is + * attached after zfs_get_data() is finished (and immediately released). + */ + VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); + if (error == 0 && zgd->zgd_bp) + zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); + + kmem_free(zgd, sizeof (zgd_t)); +} + +#ifdef DEBUG +static int zil_fault_io = 0; +#endif + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, + zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + dmu_buf_t *db; + zgd_t *zgd; + int error = 0; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + /* + * Nothing to do if the file has been removed + * This zget is moved into zil.c + */ + if (zfs_zget(zfsvfs, object, &zp) != 0) + return (SET_ERROR(ENOENT)); + + if (zp->z_unlinked) { + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(os))); + return (SET_ERROR(ENOENT)); + } + + zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_lwb = lwb; + zgd->zgd_private = zp; + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. 
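+	 *
+	 * The choice itself is made earlier, in zfs_log_write(), based on
+	 * the write size and the dataset's logbias setting; by the time we
+	 * get here, buf != NULL simply means "copy the data into the log
+	 * record now", and buf == NULL means "dmu_sync() the block and log
+	 * a pointer to it".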
+ */ + if (buf != NULL) { /* immediate write */ + zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + /* test for truncation needs to be done while range locked */ + if (offset >= zp->z_size) { + error = SET_ERROR(ENOENT); + } else { + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + } + ASSERT(error == 0 || error == ENOENT); + } else { /* indirect write */ + /* + * Have to lock the whole block to ensure when it's + * written out and its checksum is being calculated + * that no one can change the data. We need to re-check + * blocksize after we get the lock in case it's changed! + */ + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; + offset -= blkoff; + zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + rangelock_exit(zgd->zgd_lr); + } + /* test for truncation needs to be done while range locked */ + if (lr->lr_offset >= zp->z_size) + error = SET_ERROR(ENOENT); +#ifdef DEBUG + if (zil_fault_io) { + error = SET_ERROR(EIO); + zil_fault_io = 0; + } +#endif + if (error == 0) + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + + if (error == 0) { + blkptr_t *bp = &lr->lr_blkptr; + + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) { + return (0); + } + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + /* + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP. + */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; + } + } + } + + zfs_get_done(zgd, error); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * If vnode is for a device return a specfs vnode instead. + */ +static int +specvp_check(vnode_t **vpp, cred_t *cr) +{ + int error = 0; + + if (IS_DEVVP(*vpp)) { +#ifndef _WIN32 + struct vnode *svp; + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = (ENOSYS); + *vpp = svp; +#endif + } + return (error); +} + + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held vnode reference for it. + * + * IN: dvp - vnode of directory to search. + * nm - name of entry to lookup. + * pnp - full pathname to lookup [UNUSED]. + * flags - LOOKUP_XATTR set if looking for an attribute. + * rdir - root directory vnode [UNUSED]. + * cr - credentials of caller. + * ct - caller context + * direntflags - directory lookup flags + * realpnp - returned pathname. + * + * OUT: vpp - vnode of located entry, NULL if not found. + * + * RETURN: 0 on success, error code on failure. 
+ * + * Timestamps: + * NA + */ +/* ARGSUSED */ +int +zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, + int nameiop, cred_t *cr, int flags) +{ + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error = 0; + int *direntflags = NULL; + void *realpnp = NULL; + +#ifndef _WIN32 + /* fast path */ + if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { + + if (dvp->v_type != VDIR) { + return (SET_ERROR(ENOTDIR)); + } else if (zdp->z_sa_hdl == NULL) { + return (SET_ERROR(EIO)); + } + + if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (!error) { + *vpp = dvp; + VN_HOLD(*vpp); + return (0); + } + return (error); + } else if (!zdp->z_zfsvfs->z_norm && + (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) { + + vnode_t *tvp = dnlc_lookup(dvp, nm); + + if (tvp) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (error) { + VN_RELE(tvp); + return (error); + } + if (tvp == DNLC_NO_VNODE) { + VN_RELE(tvp); + return (SET_ERROR(ENOENT)); + } else { + *vpp = tvp; + return (specvp_check(vpp, cr)); + } + } + } + } +#endif + DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + + *vpp = NULL; + + +#ifndef _WIN32 + if (flags & LOOKUP_XATTR) { +#ifdef TODO + /* + * If the xattr property is off, refuse the lookup request. + */ + if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return ((EINVAL)); + } +#endif + + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Do we have permission to get into attribute directory? + */ + + if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, + B_FALSE, cr)) { + VN_RELE(*vpp); + *vpp = NULL; + } + + ZFS_EXIT(zfsvfs); + return (error); + } +#endif + + + if (!vnode_isdir(dvp)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTDIR)); + } + + /* + * Check accessibility of directory. + */ + + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); + if (error == 0) + error = specvp_check(vpp, cr); + + /* Translate errors and add SAVENAME when needed. */ + if (cnp->cn_flags & ISLASTCN) { + switch (nameiop) { + case CREATE: + case RENAME: + if (error == ENOENT) { + error = EJUSTRETURN; + //cnp->cn_flags |= SAVENAME; + break; + } + /* FALLTHROUGH */ + case VN_DELETE: + if (error == 0) + ;//cnp->cn_flags |= SAVENAME; + break; + } + } + if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) { + int ltype = 0; + +#ifndef _WIN32 + if (cnp->cn_flags & ISDOTDOT) { + ltype = VOP_ISLOCKED(dvp); + VOP_UNLOCK(dvp, 0); + } +#endif + ZFS_EXIT(zfsvfs); + error = zfs_vnode_lock(*vpp, 0/*cnp->cn_lkflags*/); + if (cnp->cn_flags & ISDOTDOT) + vn_lock(dvp, ltype | LK_RETRY); + if (error != 0) { + VN_RELE(*vpp); + *vpp = NULL; + return (error); + } + } else { + ZFS_EXIT(zfsvfs); + } + +#if defined (FREEBSD_NAMECACHE) + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + cache_enter(dvp, *vpp, cnp); + /* + * Insert name into cache if appropriate. 
+ */ + if (error == 0 && (cnp->cn_flags & MAKEENTRY)) { + if (!(cnp->cn_flags & ISLASTCN) || + (nameiop != VN_DELETE && nameiop != RENAME)) { + cache_enter(dvp, *vpp, cnp); + } + } +#endif + + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the vp of the created or trunc'd file. + * + * IN: dvp - vnode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - file flag. + * ct - caller context + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created or trunc'd entry. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated if new entry created + * vp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +int +zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, + vnode_t **vpp, cred_t *cr) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + objset_t *os; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; + void *vsecp = NULL; + int flag = 0; + boolean_t waited = B_FALSE; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(dvp, vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } +top: + *vpp = NULL; + + if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) + vap->va_mode &= ~S_ISVTX; + + if (*name == '\0') { + /* + * Null component name refers to the directory itself. + */ + VN_HOLD(dvp); + zp = dzp; + dl = NULL; + error = 0; + } else { + /* possible VN_HOLD(zp) */ + int zflg = 0; + + if (flag & FIGNORECASE) + zflg |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL); + if (error) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + if (strcmp(name, "..") == 0) + error = SET_ERROR(EISDIR); + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if (zp == NULL) { + uint64_t txtype; + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. 
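+		 *
+		 * For example, attempting to create a subdirectory or a
+		 * symlink inside a file's extended attribute directory
+		 * fails with EINVAL below.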
+ */ + + if ((dzp->z_pflags & ZFS_XATTR) && + (vap->va_type != VREG)) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EINVAL); + goto out; + } + + if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + have_acl = B_TRUE; + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, + (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + (void) zfs_link_create(dl, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + if (flag & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(zp, dzp, zfsvfs); + + } else { + int aflags = (flag & FAPPEND) ? V_APPEND : 0; + + if (have_acl) + zfs_acl_ids_free(&acl_ids); + have_acl = B_FALSE; + + /* + * A directory entry already exists for this name. + */ + /* + * Can't truncate an existing file if in exclusive mode. + */ + if (excl) { + error = SET_ERROR(EEXIST); + goto out; + } + /* + * Can't open a directory for writing. + */ + if ((vnode_isdir(ZTOV(zp))) && (mode & S_IWRITE)) { + error = SET_ERROR(EISDIR); + goto out; + } + /* + * Verify requested access to file. + */ + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { + goto out; + } + + mutex_enter(&dzp->z_lock); + dzp->z_seq++; + mutex_exit(&dzp->z_lock); + + /* + * Truncate regular files if requested. + */ + if ((vnode_isreg(ZTOV(zp))) && + (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { + /* we can't hold any locks when calling zfs_freesp() */ + zfs_dirent_unlock(dl); + dl = NULL; + error = zfs_freesp(zp, 0, 0, mode, TRUE); + if (error == 0) { + vnevent_create(ZTOV(zp), ct); + } + } + } +out: + if (dl) + zfs_dirent_unlock(dl); + + if (error) { + if (zp) + VN_RELE(ZTOV(zp)); + } else { + *vpp = ZTOV(zp); + error = specvp_check(vpp, cr); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dvp - vnode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. 
+ * ct - caller context + * flags - case flags + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime + * vp - ctime (if nlink > 0) + */ + +uint64_t null_xattr = 0; + +/*ARGSUSED*/ +int +zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, + int flags) +{ + znode_t *zp, *dzp = VTOZ(dvp); + znode_t *xzp; + vnode_t *vp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t acl_obj, xattr_obj; + uint64_t xattr_obj_unlinked = 0; + uint64_t obj = 0; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + boolean_t may_delete_now = FALSE, delete_now = FALSE; + boolean_t unlinked, toobig = FALSE; + uint64_t txtype; + pathname_t *realnmp = NULL; + pathname_t realnm; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) { + zflg |= ZCILOOK; + pn_alloc(&realnm); + realnmp = &realnm; + } + +top: + xattr_obj = 0; + xzp = NULL; + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + // This calls grabs vp->v_iocount++ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, realnmp))) { + if (realnmp) + pn_free(realnmp); + ZFS_EXIT(zfsvfs); + return (error); + } + + vp = ZTOV(zp); + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (vnode_isdir(vp)) { + error = SET_ERROR(EPERM); + goto out; + } + + vnevent_remove(vp, dvp, name, ct); + + if (realnmp) + dnlc_remove(dvp, realnmp->pn_buf); + else + dnlc_remove(dvp, name); + /* + * On Mac OSX, we lose the option of having this optimization because + * the VFS layer holds the last reference on the vnode whereas in + * Solaris this code holds the last ref. Hence, it's sketchy + * business(not to mention hackish) to start deleting the znode + * and clearing out the vnode when the VFS still has a reference + * open on it, even though it's dropping it shortly. + */ +#ifdef _WIN32 + may_delete_now = !vnode_isinuse(vp, 0) && !vn_has_cached_data(vp); +#else + VI_LOCK(vp); + may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp); + VI_UNLOCK(vp); +#endif + +#ifdef LINUX + mutex_enter(&zp->z_lock); + may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped); + mutex_exit(&zp->z_lock); +#endif + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the vnode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + obj = zp->z_id; + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + if (may_delete_now) { + toobig = + zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; +#ifdef _WIN32 + /* Currently we have no real vnop_inactive support, so everything + * has to be directly deleted, even large files. + */ + toobig = 0; +#endif + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); + } + + /* are there any extended attributes? 
*/ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT(error==0); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + mutex_enter(&zp->z_lock); + if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + mutex_exit(&zp->z_lock); + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of + * space, unless object removal will be delayed indefinitely + * (due to active holds on the vnode due to the file being open). + */ + if (may_delete_now) + dmu_tx_mark_netfree(tx); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + VN_RELE(vp); + if (xzp) + VN_RELE(ZTOV(xzp)); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + if (realnmp) + pn_free(realnmp); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. + */ + error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + /* + * Hold z_lock so that we can make sure that the ACL obj + * hasn't changed. Could have been deleted due to + * zfs_sa_upgrade(). + */ + mutex_enter(&zp->z_lock); +#ifndef _WIN32 + VI_LOCK(vp); +#endif + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); + delete_now = may_delete_now && !toobig && + !vnode_isinuse(vp,0) && !vn_has_cached_data(vp) && + xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == + acl_obj; +#ifndef _WIN32 + VI_UNLOCK(vp); +#endif + } + + dprintf("vnop_remove: may_delete_now is %d, delete_now %d. iocount %u\n", + may_delete_now, delete_now, vp->v_iocount); + + if (delete_now) { + if (xattr_obj_unlinked) { + ASSERT3U(xzp->z_links, ==, 2); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = 1; + xzp->z_links = 0; + error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &xzp->z_links, sizeof (xzp->z_links), tx); + ASSERT3U(error, ==, 0); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + + if (zp->z_is_sa) + error = sa_remove(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), tx); + else + error = sa_update(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), &null_xattr, + sizeof (uint64_t), tx); + ASSERT(error==0); + } + +#ifndef _WIN32 + VI_LOCK(vp); + vp->v_count--; + ASSERT0(vp->v_count); + VI_UNLOCK(vp); +#endif + mutex_exit(&zp->z_lock); + vnode_pager_setsize(vp, 0); + + /* + * Call recycle which will call vnop_reclaim directly if it can + * so tell reclaim to not do anything with this node, so we can + * release it directly. If recycle/reclaim didn't work out, defer + * it by placing it on the unlinked list. 
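+		 *
+		 * z_fastpath (set just below) is the hint the reclaim path
+		 * checks for: the znode has already been deleted here, so
+		 * reclaim should only tear down the vnode glue and skip the
+		 * usual zfs_zinactive() work.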
+ */ + + zp->z_fastpath = B_TRUE; + + zfs_znode_delete(zp, tx); + vp->v_data = NULL; + vp = NULL; + zp = NULL; + + } else if (unlinked) { + mutex_exit(&zp->z_lock); + zfs_unlinked_add(zp, tx); + } + + txtype = TX_REMOVE; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj); + + dmu_tx_commit(tx); +out: + if (realnmp) + pn_free(realnmp); + + zfs_dirent_unlock(dl); + + if (xzp) { + VN_RELE(ZTOV(xzp)); + vnode_recycle(ZTOV(xzp)); + } + if (!delete_now) { + VN_RELE(vp); + } + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dvp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dvp - vnode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * ct - caller context + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created directory. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dvp - ctime|mtime updated + * vp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +int +zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, + caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + zfs_dirlock_t *dl; + uint64_t txtype; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t waited = B_FALSE; + + ASSERT(vap->va_type == VDIR); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return ((EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(dvp, (vattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + vsecp, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. 
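+	 *
+	 * Concrete example: "mkdir -p" treats EEXIST as success, but a
+	 * spurious EACCES for an already-existing directory would make it
+	 * fail outright.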
+ */ +top: + *vpp = NULL; + + if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, + NULL, NULL))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + /* + * Now put new name in parent dir. + */ + (void) zfs_link_create(dl, zp, tx, ZNEW); + + *vpp = ZTOV(zp); + + txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(zp, dzp, zfsvfs); + *vpp = ZTOV(zp); + + zfs_dirent_unlock(dl); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dvp - vnode of directory to remove from. + * name - name of directory to be removed. + * cwd - vnode of current working directory. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, + caller_context_t *ct, int flags) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp; + vnode_t *vp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; +top: + zp = NULL; + + /* + * Attempt to lock directory; fail if entry doesn't exist. 
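+	 *
+	 * On success zfs_dirent_lock() also returns with a hold on the
+	 * child's vnode, so every exit path below must drop it again via
+	 * VN_RELE() or vnode_recycle().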
+ */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + vp = ZTOV(zp); + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + if (!vnode_isdir(vp)) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + if (vp == cwd) { + error = SET_ERROR(EINVAL); + goto out; + } + + vnevent_rmdir(vp, dvp, name, ct); + + /* + * Grab a lock on the directory to make sure that noone is + * trying to add (or lookup) entries while we are removing it. + */ + rw_enter(&zp->z_name_lock, RW_WRITER); + + /* + * Grab a lock on the parent pointer to make sure we play well + * with the treewalk and directory rename code. + */ + rw_enter(&zp->z_parent_lock, RW_WRITER); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); + zfs_dirent_unlock(dl); + VN_RELE(vp); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + +#if defined (FREEBSD_NAMECACHE) + cache_purge(dvp); +#endif + + error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); + } + + dmu_tx_commit(tx); + + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); +#if defined (FREEBSD_NAMECACHE) + cache_purge(vp); +#endif +out: + zfs_dirent_unlock(dl); + + if (error == 0) { + dprintf("%s: releasing vp %p\n", __func__, vp); + if (vnode_recycle(vp) != 0) + VN_RELE(vp); + } else { + VN_RELE(vp); + } + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read as many directory entries as will fit into the provided + * buffer from the given directory cursor position (specified in + * the uio structure. + * + * IN: vp - vnode of directory to read. + * uio - structure supplying read location, range info, + * and return buffer. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * OUT: uio - updated offset and range, buffer filled. + * eofp - set to true if end-of-file detected. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* + * uio points to a buffer to be filled with struct FILE_FULL_DIR_INFORMATION + * where the NextEntryOffset has value of next structure, or 0 when last. + * FileNameLength holds the length of the FileName to follow, then + * it has (variable) FileName immediately after the struct. 
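+ *
+ * Illustrative layout of two entries in the output buffer (exact offsets
+ * depend on the name lengths):
+ *
+ *	entry 0:  NextEntryOffset = P2ROUNDUP(
+ *	              FIELD_OFFSET(FILE_FULL_DIR_INFORMATION, FileName) +
+ *	              FileNameLength, 8)
+ *	          FileName follows the fixed fields: UTF-16, FileNameLength
+ *	          is in bytes, and the name is not NUL terminated
+ *	entry 1:  NextEntryOffset = 0         (marks the final entry)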
+ * If another FILE_FULL_DIR_INFORMATION struct is to follow, it has to be aligned to 8 bytes +*/ +/* ARGSUSED */ +int +zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, zfs_dirlist_t *zccb, int flags, int dirlisttype, int *a_numdirent) +{ + int error = 0; + + znode_t *zp = VTOZ(vp); + znode_t *tzp; + iovec_t *iovp; + FILE_FULL_DIR_INFORMATION *eodp = NULL; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + caddr_t outbuf; + size_t bufsize; + zap_cursor_t zc; + zap_attribute_t zap; + uint_t bytes_wanted; + uint64_t offset; /* must be unsigned; checks for < 1 */ + uint64_t parent; + int local_eof; + int outcount; + uint8_t prefetch; + boolean_t check_sysattrs; + uint8_t type; + int numdirent = 0; + char *bufptr; + void *nameptr = NULL; + ULONG namelenholder = 0; + uint32_t *eofp = &zccb->dir_eof; + int last_alignment = 0; + int skip_this_entry; + int structsize = 0; + int flag_index_specified = flags & SL_INDEX_SPECIFIED ? 1 : 0; + int flag_restart_scan = flags & SL_RESTART_SCAN ? 1 : 0; + int flag_return_single_entry= flags & SL_RETURN_SINGLE_ENTRY ? 1 : 0; + FILE_DIRECTORY_INFORMATION *fdi; + + dprintf("+zfs_readdir: Index %d, Restart %d, Single %d\n", + flag_index_specified, flag_restart_scan, flag_return_single_entry); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If we are not given an eof variable, + * use a local one. + */ + if (eofp == NULL) + eofp = &local_eof; + + /* + * Check for valid iov_len. + */ + if (uio_curriovlen(uio) <= 0) { + ZFS_EXIT(zfsvfs); + return ((EINVAL)); + } + + /* + * Quit if directory has been removed (posix) + */ + if ((*eofp = zp->z_unlinked) != 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + // Make sure the dirlist type is a valid one + switch (dirlisttype) { + case FileFullDirectoryInformation: + case FileIdBothDirectoryInformation: + case FileBothDirectoryInformation: + case FileDirectoryInformation: + case FileNamesInformation: + case FileIdFullDirectoryInformation: + break; + default: + dprintf("%s: ** Directory type %d not handled!\n", __func__, dirlisttype); + ZFS_EXIT(zfsvfs); + return ((EINVAL)); + } + + error = 0; + os = zfsvfs->z_os; + offset = uio_offset(uio); + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Get space to change directory entries into fs independent format. + */ + bytes_wanted = uio_curriovlen(uio); + bufsize = (size_t)bytes_wanted; + outbuf = kmem_zalloc(bufsize, KM_SLEEP); // ZERO? + bufptr = (char *)outbuf; + + /* + * If this VFS supports the system attribute view interface; and + * we're looking at an extended attribute directory; and we care + * about normalization conflicts on this vfs; then we must check + * for normalization conflicts with the sysattr name space. 
+ */ +#ifdef TODO + check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && + (flags & V_RDDIR_ENTFLAGS); +#else + check_sysattrs = 0; +#endif + + /* + * Transform to file-system independent format + */ + //zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + + outcount = 0; + while (outcount < bytes_wanted) { + ino64_t objnum; + ushort_t reclen, rawsize; + uint64_t *next = NULL; + size_t namelen; + int force_formd_normalized_output; + size_t nfdlen; + + skip_this_entry = 0; + + + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void)strlcpy(zap.za_name, ".", MAXNAMELEN); + zap.za_normalization_conflict = 0; + objnum = (zp->z_id == zfsvfs->z_root) ? 2 : zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void)strlcpy(zap.za_name, "..", MAXNAMELEN); + zap.za_normalization_conflict = 0; + objnum = (parent == zfsvfs->z_root) ? 2 : parent; + objnum = (zp->z_id == zfsvfs->z_root) ? 1 : objnum; + type = DT_DIR; +#if 1 + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void)strlcpy(zap.za_name, ZFS_CTLDIR_NAME, MAXNAMELEN); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; +#endif + } else { + + /* + * Grab next entry. + */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = SET_ERROR(ENXIO); + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + /* + * MacOS X can extract the object type here such as: + * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); + */ + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + + if (check_sysattrs && !zap.za_normalization_conflict) { +#ifdef TODO + zap.za_normalization_conflict = + xattr_sysattr_casechk(zap.za_name); +#else + panic("%s:%u: TODO", __func__, __LINE__); +#endif + } + } + + /* + * Check if name will fit. + * + * Note: non-ascii names may expand (up to 3x) when converted to NFD + */ + namelen = strlen(zap.za_name); + + /* sysctl to force formD normalization of vnop output */ + if (zfs_vnop_force_formd_normalized_output && + !is_ascii_str(zap.za_name)) + force_formd_normalized_output = 1; + else + force_formd_normalized_output = 0; + + if (force_formd_normalized_output) + namelen = MIN(MAXNAMLEN, namelen * 3); + + + /* + * Do magic filename conversion for Windows here + */ + + error = RtlUTF8ToUnicodeN(NULL, 0, &namelenholder, zap.za_name, namelen); + + // Did they provide a search pattern + if (zccb->searchname.Buffer && zccb->searchname.Length) { + UNICODE_STRING thisname; + WCHAR tmpname[PATH_MAX]; + ULONG tmpnamelen; + // We need to convert name to a tmp buffer here, as the output + // buffer might not have enough room to hold the whole name, and + // we need the whole name to do search match. + error = RtlUTF8ToUnicodeN(tmpname, PATH_MAX, &tmpnamelen, zap.za_name, namelen); + //dprintf("%s: '%.*S' -> '%s'\n", __func__, + // tmpnamelen / sizeof(WCHAR), tmpname, zap.za_name); + + + thisname.Buffer = tmpname; + thisname.Length = thisname.MaximumLength = tmpnamelen; + // wildcard? 
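+			/*
+			 * Matching sketch (illustrative only): with a search
+			 * pattern such as L"*.txt", ContainsWildCards is set and
+			 * FsRtlIsNameInExpression() decides whether an entry like
+			 * L"notes.txt" is returned; a plain pattern such as
+			 * L"notes.txt" takes the exact-match FsRtlAreNamesEqual()
+			 * path instead. Both comparisons ignore case unless the
+			 * dataset is ZFS_CASE_SENSITIVE.
+			 */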
+ if (zccb->ContainsWildCards) { + if (!FsRtlIsNameInExpression(&zccb->searchname, + &thisname, + !(zfsvfs->z_case == ZFS_CASE_SENSITIVE), + NULL)) + skip_this_entry = 1; + } else { + if (!FsRtlAreNamesEqual(&thisname, + &zccb->searchname, + !(zfsvfs->z_case == ZFS_CASE_SENSITIVE), + NULL)) + skip_this_entry = 1; + } +#if 0 + dprintf("comparing names '%.*S' == '%.*S' skip %d\n", + thisname.Length / sizeof(WCHAR), thisname.Buffer, + zccb->searchname.Length / sizeof(WCHAR), zccb->searchname.Buffer, + skip_this_entry); +#endif + } + + + if (!skip_this_entry) { + // Windows combines vnop_readdir and vnop_getattr, so we need to lookup + // a bunch of values, we try to do that as lightweight as possible. + znode_t dummy = { 0 }; // For "." and ".." + int get_zp = ENOENT; + + tzp = &dummy; + + // If "." use zp, if ".." use dzp, neither needs releasing. Otherwise, call zget. + if (offset == 0 || offset == 1) + tzp = zp; + else + get_zp = zfs_zget_ext(zfsvfs, + offset == 1 ? parent : objnum, &tzp, // objnum is adjusted above +#if 1 + ZGET_FLAG_UNLINKED); +#else + ZGET_FLAG_UNLINKED | ZGET_FLAG_WITHOUT_VNODE ); +#endif + + // If we failed to get the node (someone else might have deleted it), but we + // need to return the name still, so it can be removed. + if (get_zp != 0 && tzp == NULL) + skip_this_entry = 1; + + // Is it worth warning about failing stat here? + if (!skip_this_entry) { + + // We need to fill in more fields. + sa_bulk_attr_t bulk[3]; + int count = 0; + uint64_t mtime[2]; + uint64_t ctime[2]; + uint64_t crtime[2]; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + sa_bulk_lookup(tzp->z_sa_hdl, bulk, count); + // Is it worth warning about failed lookup here? + + structsize = 0; + + switch (dirlisttype) { + + case FileFullDirectoryInformation: + structsize = FIELD_OFFSET(FILE_FULL_DIR_INFORMATION, FileName[0]); + if (outcount + structsize + namelenholder > bufsize) break; + + eodp = (FILE_FULL_DIR_INFORMATION *)bufptr; + eodp->FileIndex = offset; + eodp->AllocationSize.QuadPart = S_ISDIR(tzp->z_mode) ? 0 : P2ROUNDUP(tzp->z_size, zfs_blksz(tzp)); // File size in block alignment + eodp->EndOfFile.QuadPart = S_ISDIR(tzp->z_mode) ? 0 : tzp->z_size; // File size in bytes + TIME_UNIX_TO_WINDOWS(mtime, eodp->LastWriteTime.QuadPart); + TIME_UNIX_TO_WINDOWS(ctime, eodp->ChangeTime.QuadPart); + TIME_UNIX_TO_WINDOWS(crtime, eodp->CreationTime.QuadPart); + TIME_UNIX_TO_WINDOWS(tzp->z_atime, eodp->LastAccessTime.QuadPart); + eodp->EaSize = tzp->z_pflags & ZFS_REPARSEPOINT ? 0xa0000003 : xattr_getsize(ZTOV(tzp)); // Magic code to change dir icon to link + eodp->FileAttributes = zfs_getwinflags(tzp); + nameptr = eodp->FileName; + eodp->FileNameLength = namelenholder; + + break; + + case FileIdBothDirectoryInformation: + structsize = FIELD_OFFSET(FILE_ID_BOTH_DIR_INFORMATION, FileName[0]); + if (outcount + structsize + namelenholder > bufsize) break; + + eodp = (FILE_FULL_DIR_INFORMATION *)bufptr; + FILE_ID_BOTH_DIR_INFORMATION *fibdi = (FILE_ID_BOTH_DIR_INFORMATION *)bufptr; + fibdi->AllocationSize.QuadPart = S_ISDIR(tzp->z_mode) ? 0 : P2ROUNDUP(tzp->z_size, zfs_blksz(tzp)); + fibdi->EndOfFile.QuadPart = S_ISDIR(tzp->z_mode) ? 
0 : tzp->z_size; + TIME_UNIX_TO_WINDOWS(mtime, fibdi->LastWriteTime.QuadPart); + TIME_UNIX_TO_WINDOWS(ctime, fibdi->ChangeTime.QuadPart); + TIME_UNIX_TO_WINDOWS(crtime, fibdi->CreationTime.QuadPart); + TIME_UNIX_TO_WINDOWS(tzp->z_atime, fibdi->LastAccessTime.QuadPart); + fibdi->EaSize = tzp->z_pflags & ZFS_REPARSEPOINT ? 0xa0000003 : xattr_getsize(ZTOV(tzp)); + fibdi->FileAttributes = zfs_getwinflags(tzp); + fibdi->FileId.QuadPart = objnum; + fibdi->FileIndex = offset; + fibdi->ShortNameLength = 0; + nameptr = fibdi->FileName; + fibdi->FileNameLength = namelenholder; + + break; + + case FileBothDirectoryInformation: + structsize = FIELD_OFFSET(FILE_BOTH_DIR_INFORMATION, FileName[0]); + if (outcount + structsize + namelenholder > bufsize) break; + + eodp = (FILE_FULL_DIR_INFORMATION *)bufptr; + FILE_BOTH_DIR_INFORMATION *fbdi = (FILE_BOTH_DIR_INFORMATION *)bufptr; + fbdi->AllocationSize.QuadPart = S_ISDIR(tzp->z_mode) ? 0 : P2ROUNDUP(tzp->z_size, zfs_blksz(tzp)); + fbdi->EndOfFile.QuadPart = S_ISDIR(tzp->z_mode) ? 0 : tzp->z_size; + TIME_UNIX_TO_WINDOWS(mtime, fbdi->LastWriteTime.QuadPart); + TIME_UNIX_TO_WINDOWS(ctime, fbdi->ChangeTime.QuadPart); + TIME_UNIX_TO_WINDOWS(crtime, fbdi->CreationTime.QuadPart); + TIME_UNIX_TO_WINDOWS(tzp->z_atime, fbdi->LastAccessTime.QuadPart); + fbdi->EaSize = tzp->z_pflags & ZFS_REPARSEPOINT ? 0xa0000003 : xattr_getsize(ZTOV(tzp)); + fbdi->FileAttributes = zfs_getwinflags(tzp); + fbdi->FileIndex = offset; + fbdi->ShortNameLength = 0; + nameptr = fbdi->FileName; + fbdi->FileNameLength = namelenholder; + + break; + + case FileDirectoryInformation: + structsize = FIELD_OFFSET(FILE_DIRECTORY_INFORMATION, FileName[0]); + if (outcount + structsize + namelenholder > bufsize) break; + eodp = (FILE_FULL_DIR_INFORMATION *)bufptr; + //FILE_DIRECTORY_INFORMATION *fdi = (FILE_DIRECTORY_INFORMATION *)bufptr; + fdi = (FILE_DIRECTORY_INFORMATION *)bufptr; + fdi->AllocationSize.QuadPart = S_ISDIR(tzp->z_mode) ? 0 : P2ROUNDUP(tzp->z_size, zfs_blksz(tzp)); + fdi->EndOfFile.QuadPart = S_ISDIR(tzp->z_mode) ? 0 : tzp->z_size; + TIME_UNIX_TO_WINDOWS(mtime, fdi->LastWriteTime.QuadPart); + TIME_UNIX_TO_WINDOWS(ctime, fdi->ChangeTime.QuadPart); + TIME_UNIX_TO_WINDOWS(crtime, fdi->CreationTime.QuadPart); + TIME_UNIX_TO_WINDOWS(tzp->z_atime, fdi->LastAccessTime.QuadPart); + fdi->FileAttributes = zfs_getwinflags(tzp); + //dtype == DT_DIR ? FILE_ATTRIBUTE_DIRECTORY : + // tzp->z_pflags&ZFS_REPARSEPOINT ? FILE_ATTRIBUTE_REPARSE_POINT : FILE_ATTRIBUTE_NORMAL; + fdi->FileIndex = offset; + nameptr = fdi->FileName; + fdi->FileNameLength = namelenholder; + + break; + + case FileNamesInformation: + structsize = FIELD_OFFSET(FILE_NAMES_INFORMATION, FileName[0]); + if (outcount + structsize + namelenholder > bufsize) break; + eodp = (FILE_FULL_DIR_INFORMATION *)bufptr; + FILE_NAMES_INFORMATION *fni = (FILE_NAMES_INFORMATION *)bufptr; + fni = (FILE_NAMES_INFORMATION *)bufptr; + fni->FileIndex = offset; + nameptr = fni->FileName; + fni->FileNameLength = namelenholder; + break; + + case FileIdFullDirectoryInformation: + structsize = FIELD_OFFSET(FILE_ID_FULL_DIR_INFORMATION, FileName[0]); + if (outcount + structsize + namelenholder > bufsize) break; + + eodp = (FILE_FULL_DIR_INFORMATION *)bufptr; + FILE_ID_FULL_DIR_INFORMATION *fifdi = (FILE_ID_FULL_DIR_INFORMATION *)bufptr; + fifdi->FileIndex = offset; + fifdi->AllocationSize.QuadPart = S_ISDIR(tzp->z_mode) ? 0 : P2ROUNDUP(tzp->z_size, zfs_blksz(tzp)); // File size in block alignment + fifdi->EndOfFile.QuadPart = S_ISDIR(tzp->z_mode) ? 
0 : tzp->z_size; // File size in bytes + TIME_UNIX_TO_WINDOWS(mtime, fifdi->LastWriteTime.QuadPart); + TIME_UNIX_TO_WINDOWS(ctime, fifdi->ChangeTime.QuadPart); + TIME_UNIX_TO_WINDOWS(crtime, fifdi->CreationTime.QuadPart); + TIME_UNIX_TO_WINDOWS(tzp->z_atime, fifdi->LastAccessTime.QuadPart); + fifdi->EaSize = tzp->z_pflags & ZFS_REPARSEPOINT ? 0xa0000003 : xattr_getsize(ZTOV(tzp)); // Magic code to change dir icon to link + fifdi->FileAttributes = zfs_getwinflags(tzp); + fifdi->FileId.QuadPart = zp->z_id; + nameptr = fifdi->FileName; + fifdi->FileNameLength = namelenholder; + } + + // Release the zp +#if 1 + if (get_zp == 0 && tzp != NULL) { + VN_RELE(ZTOV(tzp)); + } +#else + if (get_zp == 0) { + if (ZTOV(tzp) == NULL) { + zfs_zinactive(tzp); + } + else { + VN_RELE(ZTOV(tzp)); + } + } +#endif + + // If know we can't fit struct, just leave + if (outcount + structsize + namelenholder > bufsize) break; + + rawsize = structsize + namelenholder; + reclen = DIRENT_RECLEN(rawsize); + + /* + * Will this entry fit in the buffer? This time with alignment + */ + if (outcount + reclen > bufsize) { + + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = (EINVAL); + goto update; + } + break; + } + // If it is going to fit, compute alignment, in case + // this dir entry is the last one, we don't align last one. + last_alignment = reclen - rawsize; + + + // Convert the filename over, or as much as we can fit + ULONG namelenholder2 = 0; + error = RtlUTF8ToUnicodeN(nameptr, namelenholder, &namelenholder2, zap.za_name, namelen); + ASSERT(namelenholder == namelenholder2); +#if 0 + dprintf("%s: '%.*S' -> '%s' (namelen %d bytes: structsize %d)\n", __func__, + namelenholder / sizeof(WCHAR), nameptr, zap.za_name, namelenholder, structsize); +#endif + + // If we aren't to skip, advance all pointers + eodp->NextEntryOffset = reclen; + + outcount += reclen; + bufptr += reclen; + numdirent++; + } // !skip_this_entry + } // while + + ASSERT(outcount <= bufsize); + + /* Prefetch znode */ + if (prefetch) + dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + + if (!skip_this_entry && flag_return_single_entry) break; + } + + // The last eodp should have Next offset of 0 + // This assumes NextEntryOffset is the FIRST entry in all structs + if (eodp) eodp->NextEntryOffset = 0; + + // The outcout += reclen; above unfortunately adds the possibly + // aligned (to 8 bytes) length. But the last entry should not + // be rounded-up. + if ((outcount > last_alignment) && + (last_alignment > 0)) { + outcount -= last_alignment; + } + + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + + if ((error = uiomove(outbuf, (long)outcount, UIO_READ, uio))) { + /* + * Reset the pointer. 
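+		 * That is, if the copy-out failed, report back the offset the
+		 * uio actually reached rather than the zap cursor position we
+		 * serialized above, so no entries are skipped on a retry.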
+ */ + offset = uio_offset(uio); + } + +update: + zap_cursor_fini(&zc); + if (outbuf) { + kmem_free(outbuf, bufsize); + } + + if (error == ENOENT) + error = 0; + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + uio_setoffset(uio, offset); + if (a_numdirent) + *a_numdirent = numdirent; + ZFS_EXIT(zfsvfs); + + dprintf("-zfs_readdir: num %d\n", numdirent); + + return (error); +} + +ulong_t zfs_fsync_sync_cnt = 4; + +int +zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + if (vn_has_cached_data(vp) /*&& !(syncflag & FNODSYNC)*/ && + vnode_isreg(vp) && !vnode_isswap(vp)) { +// cluster_push(vp, /* waitdata ? IO_SYNC : */ 0); + } + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED && + !vnode_isrecycled(vp)) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + } + //tsd_set(zfs_fsyncer_key, NULL); + return (0); +} + + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * If AT_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) + * cr - credentials of caller. + * ct - caller context + * + * OUT: vap - attribute values. + * + * RETURN: 0 (always succeeds) + */ +/* ARGSUSED */ +int +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; +#ifndef _WIN32 + uint32_t blksize; + u_longlong_t nblocks; +#endif + uint64_t links; + uint64_t mtime[2], ctime[2], crtime[2], rdev; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = /*(flags & ATTR_NOACLCHECK) ? B_TRUE :*/ B_FALSE; + sa_bulk_attr_t bulk[4]; + int count = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + if (vnode_isblk(vp) || vnode_ischr(vp)) + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { + if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + + mutex_enter(&zp->z_lock); + vap->va_type = IFTOVT(zp->z_mode); + vap->va_mode = zp->z_mode & ~S_IFMT; +#ifndef _WIN32 +#ifdef sun + vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; +#else + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; +#endif +#endif + vap->va_nodeid = zp->z_id; + if (vnode_isvroot((vp)) && zfs_show_ctldir(zp)) + links = zp->z_links + 1; + else + links = zp->z_links; + vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! 
*/ + vap->va_size = zp->z_size; +#ifdef sun + vap->va_rdev = vp->v_rdev; +#else +// if (vnode_isblk(vp) || vnode_ischr(vp)) +// vap->va_rdev = zfs_cmpldev(rdev); +#endif + //vap->va_seq = zp->z_seq; + vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ + + /* + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. + */ + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((zp->z_pflags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((zp->z_pflags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((zp->z_pflags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((zp->z_pflags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((zp->z_pflags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((zp->z_pflags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((zp->z_pflags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((zp->z_pflags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vnode_isreg(vp)) { + zfs_sa_get_scanstamp(zp, xvap); + } + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + times, sizeof (times)); + ZFS_TIME_DECODE(&xoap->xoa_createtime, times); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + xoap->xoa_generation = zp->z_gen; + XVA_SET_RTN(xvap, XAT_GEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } + } + + ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); + ZFS_TIME_DECODE(&vap->va_crtime, crtime); + + mutex_exit(&zp->z_lock); + +#ifdef _WIN32 + + /* If we are told to ignore owners, we scribble over the uid and gid here + * unless root. 
+ */ +// if (((unsigned int)vfs_flags(zfsvfs->z_vfs)) & MNT_IGNORE_OWNERSHIP) { +// if (kauth_cred_getuid(cr) != 0) { +// vap->va_uid = UNKNOWNUID; +// vap->va_gid = UNKNOWNGID; +// } +// } + +#else + uint64_t blksize, nblocks; + + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); + vap->va_blksize = blksize; + vap->va_bytes = nblocks << 9; /* nblocks * 512 */ + + if (zp->z_blksz == 0) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + vap->va_blksize = zfsvfs->z_max_blksz; + } +#endif + + ZFS_EXIT(zfsvfs); + return (0); +} + +#ifdef LINUX +/* + * Get the basic file attributes and place them in the provided kstat + * structure. The inode is assumed to be the authoritative source + * for most of the attributes. However, the znode currently has the + * authoritative atime, blksize, and block count. + * + * IN: ip - inode of file. + * + * OUT: sp - kstat values. + * + * RETURN: 0 (always succeeds) + */ +/* ARGSUSED */ +int +zfs_getattr_fast(struct inode *ip, struct kstat *sp) +{ + znode_t *zp = ITOZ(ip); + zfs_sb_t *zsb = ITOZSB(ip); + uint32_t blksize; + u_longlong_t nblocks; + + ZFS_ENTER(zsb); + ZFS_VERIFY_ZP(zp); + + mutex_enter(&zp->z_lock); + + generic_fillattr(ip, sp); + ZFS_TIME_DECODE(&sp->atime, zp->z_atime); + + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); + sp->blksize = blksize; + sp->blocks = nblocks; + + if (unlikely(zp->z_blksz == 0)) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + sp->blksize = zsb->z_max_blksz; + } +} +#endif + + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: vp - vnode of file to be modified. + * vap - new attribute values. + * If AT_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * ct - caller context + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - ctime updated, mtime updated if size changed. + */ +/* ARGSUSED */ +int +zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + int err=0, err2; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + xvattr_t *tmpxvattr; + uint_t mask = vap->va_mask; + uint_t saved_mask = 0; + uint64_t saved_mode; + int trim_mask = 0; + uint64_t new_mode; + uint64_t new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2], crtime[2]; + znode_t *attrzp; + int need_policy = FALSE; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp; + boolean_t skipaclchk = /*(flags & ATTR_NOACLCHECK) ? 
B_TRUE :*/ B_FALSE; + boolean_t fuid_dirtied = B_FALSE; +#define _NUM_BULK 10 + sa_bulk_attr_t *bulk, *xattr_bulk; + int count = 0, xattr_count = 0; + vsecattr_t vsecattr; + int seen_type = 0; + int aclbsize; /* size of acl list in bytes */ + ace_t *aaclp; + struct kauth_acl *kauth; + + if (mask == 0) + return (0); + + if (mask & AT_NOSET) + return ((EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + dprintf("+setattr: zp %p, vp %p\n", zp, vp); + + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & AT_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (mask & AT_SIZE && vnode_isdir(vp)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (mask & AT_SIZE && !vnode_isreg(vp) && !vnode_isfifo(vp)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * If this is an xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + + tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); + xva_init(tmpxvattr); + + bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * _NUM_BULK, KM_SLEEP); + xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * _NUM_BULK, KM_SLEEP); + + /* + * Immutable files can only alter immutable bit and atime + */ +#ifndef _WIN32 + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || + ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + err = SET_ERROR(EPERM); + goto out3; + } +#else + //chflags uchg sends AT_MODE on OS X, so allow AT_MODE to be in the mask. + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME)) || + ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + err = SET_ERROR(EPERM); + goto out3; + } +#endif + + /* + * Note: ZFS_READONLY is handled in zfs_zaccess_common. + */ + + /* + * Verify timestamps doesn't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. + */ + + /* + * This test now hinders NFS from working as expected. Most like the + * 32bit timestamp issues have already been fixed. + */ +#if 0 + if (mask & (AT_ATIME | AT_MTIME)) { + if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { + err = SET_ERROR(EOVERFLOW); + goto out3; + } + } +#endif + +top: + attrzp = NULL; + aclp = NULL; + + /* Can this be moved to before the top label? */ + if (vfs_isrdonly(zfsvfs->z_vfs)) { + err = SET_ERROR(EROFS); + goto out3; + } + + /* + * First validate permissions + */ + + if (mask & AT_SIZE) { + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); + if (err) + goto out3; + + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? 
*/ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) + goto out3; + } + + if (mask & (AT_ATIME|AT_MTIME) || + ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + } + + if (mask & (AT_UID|AT_GID)) { + int idmask = (mask & (AT_UID|AT_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & AT_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & AT_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both AT_UID and AT_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || + ((idmask == AT_UID) && take_owner) || + ((idmask == AT_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + secpolicy_setid_clear(vap, vp, cr); + trim_mask = (mask & (AT_UID|AT_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + mutex_enter(&zp->z_lock); + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & AT_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
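+		 *
+		 * For example, a request to set XAT_APPENDONLY to the value it
+		 * already has is cleared from xvap (and remembered in tmpxvattr)
+		 * so that it does not force a policy check; the request bit is
+		 * put back from tmpxvattr near the end of this function.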
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((!vnode_isreg(vp) && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + mutex_exit(&zp->z_lock); + err = SET_ERROR(EPERM); + goto out3; + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + mutex_exit(&zp->z_lock); + + if (mask & AT_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. + */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + if (trim_mask & AT_MODE) { + /* + * Save the mode, as secpolicy_vnode_setattr() + * will overwrite it with ova.va_mode. + */ + saved_mode = vap->va_mode; + } + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) + goto out3; + + if (trim_mask) { + vap->va_mask |= saved_mask; + if (trim_mask & AT_MODE) { + /* + * Recover the mode after + * secpolicy_vnode_setattr(). 
+ */ + vap->va_mode = saved_mode; + } + } + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (AT_UID | AT_GID))) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); + if (err) + goto out2; + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != zp->z_uid && + zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { + if (attrzp) + VN_RELE(ZTOV(attrzp)); + err = (EDQUOT); + goto out2; + } + } + + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != zp->z_gid && + zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { + if (attrzp) + VN_RELE(ZTOV(attrzp)); + err = (EDQUOT); + goto out2; + } + } + } + tx = dmu_tx_create(zfsvfs->z_os); + + /* + * ACLs are currently not working, there appears to be two implementations + * here, one is old MacZFS "zfs_setacl" and the other is ZFS (FBSD?) + * with zfs_external_acl(). + */ + + if (mask & AT_ACL) { +#if 0 + if ((vap->va_acl != (kauth_acl_t) KAUTH_FILESEC_NONE) && + (vap->va_acl->acl_entrycount > 0) && + (vap->va_acl->acl_entrycount != KAUTH_FILESEC_NOACL)) { + + vsecattr.vsa_mask = VSA_ACE; + kauth = vap->va_acl; + +#if HIDE_TRIVIAL_ACL + // We might have to add 3 trivial acls, depending on + // what was handed to us. + aclbsize = ( 3 + kauth->acl_entrycount ) * sizeof(ace_t); + dprintf("Given %d ACLs, adding 3\n", kauth->acl_entrycount); +#else + aclbsize = kauth->acl_entrycount * sizeof(ace_t); + dprintf("Given %d ACLs\n", kauth->acl_entrycount); +#endif + + vsecattr.vsa_aclentp = kmem_zalloc(aclbsize, KM_SLEEP); + aaclp = vsecattr.vsa_aclentp; + vsecattr.vsa_aclentsz = aclbsize; + +#if HIDE_TRIVIAL_ACL + // Add in the trivials, keep "seen_type" as a bit pattern of + // which trivials we have seen + seen_type = 0; + + dprintf("aces_from_acl %d entries\n", kauth->acl_entrycount); + aces_from_acl(vsecattr.vsa_aclentp, + &vsecattr.vsa_aclcnt, kauth, &seen_type); + + // Add in trivials at end, based on the "seen_type". + zfs_addacl_trivial(zp, vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, + seen_type); + dprintf("together at last: %d\n", vsecattr.vsa_aclcnt); +#else + // aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, kauth); +#endif + +// err = zfs_setacl(zp, &vsecattr, B_TRUE, cr); +// kmem_free(aaclp, aclbsize); + + } else { + + seen_type = 0; + vsecattr.vsa_mask = VSA_ACE; + vsecattr.vsa_aclcnt = 0; + aclbsize = ( 3 ) * sizeof(ace_t); + vsecattr.vsa_aclentp = kmem_zalloc(aclbsize, KM_SLEEP); + aaclp = vsecattr.vsa_aclentp; + vsecattr.vsa_aclentsz = aclbsize; + // Clearing, we need to pass in the trivials only + zfs_addacl_trivial(zp, vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, + seen_type); + + if ((err = zfs_setacl(zp, &vsecattr, B_TRUE, cr))) + dprintf("setattr: setacl failed: %d\n", err); + + kmem_free(aaclp, aclbsize); + + } // blank ACL? 
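+		/*
+		 * Rough shape of the disabled path above, kept as a reminder:
+		 * kauth ACEs are converted with aces_from_acl(), up to three
+		 * trivial entries are appended by zfs_addacl_trivial() based on
+		 * seen_type, and the combined list is handed to zfs_setacl();
+		 * an empty caller ACL degenerates to just the trivial entries.
+		 */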
+#endif // 0 + } // ACL + + + if (mask & AT_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + + if(!(mask & AT_ACL)) { + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + } else { + new_mode = pmode; + } + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { + err = (EPERM); + goto out; + } + + if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) + goto out; + + mutex_enter(&zp->z_lock); + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? + */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + mutex_exit(&zp->z_lock); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if ((mask & AT_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&attrzp->z_acl_lock); + mutex_enter(&attrzp->z_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + } + + if (mask & (AT_UID|AT_GID)) { + + if (mask & AT_UID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = new_uid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_uid = new_uid; + } + } + + if (mask & AT_GID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + zp->z_gid = new_gid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = new_gid; + } + } + if (!(mask & AT_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + + /* + * When importing ZEVO volumes, and 'chown' is used, we end up calling + * SA_LOOKUP with 'sa_addr' == NULL. 
Unsure why this happens, for + * now, we shall stick a plaster over this open-fracture + */ + if (err == 2) { + dprintf("setattr: triggered SA_LOOKUP == NULL problem\n"); + err = 0; + } + + } + + if (mask & AT_MODE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; + /* + * Mode change needs to trigger corresponding update to trivial ACLs. + * ACL change already does this, and another call to zfs_aclset_common + * would overwrite our explicit ACL changes. + */ + if(!(mask & AT_ACL)) { + ASSERT3U((uintptr_t)aclp, !=, 0); + err = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT(err==0); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = aclp; + aclp = NULL; + } + } + + if (mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + } + + if (mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + } + + if (mask & AT_CRTIME) { + ZFS_TIME_ENCODE(&vap->va_crtime, crtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, + crtime, sizeof (crtime)); + } + + /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ + if (mask & AT_SIZE && !(mask & AT_MTIME)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); + } else if (mask != 0) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, + B_TRUE); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(attrzp, STATE_CHANGED, + mtime, ctime, B_TRUE); + } + } + +#ifdef _WIN32 + // * you are not allowed to change "change time" in POSIX, But windows allows it (ifstest too) + if (mask & AT_CTIME) { + ZFS_TIME_ENCODE(&vap->va_ctime, ctime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof(ctime)); + } +#endif + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & AT_XVATTR)) { + + /* + * restore trimmed off masks + * so that return masks can be set for caller. 
+ */ + + if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + +/* + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(vp->v_type == VREG); +*/ + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + mutex_exit(&zp->z_lock); + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&attrzp->z_acl_lock); + mutex_exit(&attrzp->z_lock); + } +out: + + if (err == 0 && attrzp) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (attrzp) + VN_RELE(ZTOV(attrzp)); + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + if (err == ERESTART) + goto top; + } else { + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } + +out2: + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + +out3: + dprintf("-setattr: zp %p size %llu\n", zp, zp->z_size); + + kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * _NUM_BULK); + kmem_free(bulk, sizeof (sa_bulk_attr_t) * _NUM_BULK); + kmem_free(tmpxvattr, sizeof (xvattr_t)); +#undef _NUM_BULK + + ZFS_EXIT(zfsvfs); + return (err); +} + +typedef struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + VN_RELE(ZTOV(zl->zl_znode)); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = zp->z_zfsvfs->z_root; + uint64_t oidp = zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. + */ + do { + if (!rw_tryenter(rwlp, rw)) { + /* + * Another thread is renaming in this path. + * Note that if we are a WRITER, we don't have any + * parent_locks held yet. 
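+			 * The z_id comparison below is the deadlock-avoidance
+			 * heuristic: a reader that would block on a znode whose
+			 * object id is greater than szp's drops everything it
+			 * holds and restarts from the top instead of waiting, so
+			 * two concurrent walks cannot wait on each other forever.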
+ */ + if (rw == RW_READER && zp->z_id > szp->z_id) { + /* + * Drop our locks and restart + */ + zfs_rename_unlock(&zl); + *zlpp = NULL; + zp = tdzp; + oidp = zp->z_id; + rwlp = &szp->z_parent_lock; + rw = RW_WRITER; + continue; + } else { + /* + * Wait for other thread to drop its locks + */ + rw_enter(rwlp, rw); + } + } + + zl = kmem_alloc(sizeof (*zl), KM_SLEEP); + zl->zl_rwlock = rwlp; + zl->zl_znode = NULL; + zl->zl_next = *zlpp; + *zlpp = zl; + + if (oidp == szp->z_id) /* We're a descendant of szp */ + return (SET_ERROR(EINVAL)); + + if (oidp == rootid) /* We've hit the top */ + return (0); + + if (rw == RW_READER) { /* i.e. not the first pass */ + int error = zfs_zget(zp->z_zfsvfs, oidp, &zp); + if (error) + return (error); + zl->zl_znode = zp; + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs), + &oidp, sizeof (oidp)); + rwlp = &zp->z_parent_lock; + rw = RW_READER; + + } while (zp->z_id != sdzp->z_id); + + return (0); +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdvp - Source directory containing the "old entry". + * snm - Old entry name. + * tdvp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * sdvp,tdvp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, + caller_context_t *ct, int flags) +{ + znode_t *tdzp, *szp, *tzp; + znode_t *sdzp = VTOZ(sdvp); + zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; + zilog_t *zilog; +#ifndef _WIN32 + vnode_t *realvp; +#endif + uint64_t addtime[2]; + zfs_dirlock_t *sdl, *tdl; + dmu_tx_t *tx; + zfs_zlock_t *zl; + int cmp, serr, terr; + int error = 0; + int zflg = 0; + boolean_t waited = B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(sdzp); + zilog = zfsvfs->z_log; + +#ifndef _WIN32 + /* + * Make sure we have the real vp for the target directory. + */ + if (VOP_REALVP(tdvp, &realvp, ct) == 0) + tdvp = realvp; + + if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } +#endif + + tdzp = VTOZ(tdvp); + ZFS_VERIFY_ZP(tdzp); + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + +top: + szp = NULL; + tzp = NULL; + zl = NULL; + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Lock source and target directory entries. To prevent deadlock, + * a lock ordering must be defined. We lock the directory with + * the smallest object id first, or if it's a tie, the one with + * the lexically first name. + */ + if (sdzp->z_id < tdzp->z_id) { + cmp = -1; + } else if (sdzp->z_id > tdzp->z_id) { + cmp = 1; + } else { + /* + * First compare the two name arguments without + * considering any case folding. 
+ */ + int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); + + cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); + ASSERT(error == 0 || !zfsvfs->z_utf8); + if (cmp == 0) { + /* + * POSIX: "If the old argument and the new argument + * both refer to links to the same existing file, + * the rename() function shall return successfully + * and perform no other action." + */ + ZFS_EXIT(zfsvfs); + return (0); + } + /* + * If the file system is case-folding, then we may + * have some more checking to do. A case-folding file + * system is either supporting mixed case sensitivity + * access or is completely case-insensitive. Note + * that the file system is always case preserving. + * + * In mixed sensitivity mode case sensitive behavior + * is the default. FIGNORECASE must be used to + * explicitly request case insensitive behavior. + * + * If the source and target names provided differ only + * by case (e.g., a request to rename 'tim' to 'Tim'), + * we will treat this as a special case in the + * case-insensitive mode: as long as the source name + * is an exact match, we will allow this to proceed as + * a name-change request. + */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + (zfsvfs->z_case == ZFS_CASE_MIXED && + flags & FIGNORECASE)) && + u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, + &error) == 0) { + /* + * case preserving rename request, require exact + * name matches + */ + zflg |= ZCIEXACT; + zflg &= ~ZCILOOK; + } + } + + /* + * If the source and destination directories are the same, we should + * grab the z_name_lock of that directory only once. + */ + if (sdzp == tdzp) { + zflg |= ZHAVELOCK; + rw_enter(&sdzp->z_name_lock, RW_READER); + } + + if (cmp < 0) { + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, + ZEXISTS | zflg, NULL, NULL); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); + } else { + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, zflg, NULL, NULL); + serr = zfs_dirent_lock(&sdl, + sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, + NULL, NULL); + } + + if (serr) { + /* + * Source entry invalid or not there. + */ + if (!terr) { + zfs_dirent_unlock(tdl); + if (tzp) + VN_RELE(ZTOV(tzp)); + } + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + /* + * FreeBSD: In OpenSolaris they only check if rename source is + * ".." here, because "." is handled in their lookup. This is + * not the case for FreeBSD, so we check for "." explicitly. + */ + if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0) + serr = (EINVAL); + ZFS_EXIT(zfsvfs); + return (serr); + } + if (terr) { + zfs_dirent_unlock(sdl); + VN_RELE(ZTOV(szp)); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (strcmp(tnm, "..") == 0) + terr = (EINVAL); + ZFS_EXIT(zfsvfs); + return (terr); + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + goto out; + + if (vnode_isdir(ZTOV(szp))) { + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) + goto out; + } + + /* + * Does target exist? + */ + if (tzp) { + /* + * Source and target must be the same type. 
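+		 * (A directory may only be renamed over a directory and a
+		 * non-directory only over a non-directory; the mismatches are
+		 * rejected below with ENOTDIR and EISDIR respectively.)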
+ */ + if (vnode_isdir(ZTOV(szp))) { + if (!vnode_isdir(ZTOV(tzp))) { + error = SET_ERROR(ENOTDIR); + goto out; + } + } else { + if (vnode_isdir(ZTOV(tzp))) { + error = SET_ERROR(EISDIR); + goto out; + } + } + /* + * POSIX dictates that when the source and target + * entries refer to the same file object, rename + * must do nothing and exit without error. + */ + if (szp->z_id == tzp->z_id) { + error = 0; + goto out; + } + } + + vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); + if (tzp) + vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); + + /* + * notify the target directory if it is not the same + * as source directory. + */ + if (tdvp != sdvp) { + vnevent_rename_dest_dir(tdvp, ct); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + if (zl != NULL) + zfs_rename_unlock(&zl); + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + VN_RELE(ZTOV(szp)); + if (tzp) + VN_RELE(ZTOV(tzp)); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + + if (error == 0) { + error = zfs_link_create(tdl, szp, tx, ZRENAMING); + if (error == 0) { + szp->z_pflags |= ZFS_AV_MODIFIED; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT(error==0); + +#ifdef __APPLE__ + /* If we moved an entry into a different directory (sdzp != tdzp) + * then we also need to update ADDEDTIME (ADDTIME) property for + * FinderInfo. We are already inside error == 0 conditional + */ + if ((sdzp != tdzp) && + zfsvfs->z_use_sa == B_TRUE) { + timestruc_t now; + gethrestime(&now); + ZFS_TIME_ENCODE(&now, addtime); + error = sa_update(szp->z_sa_hdl, SA_ZPL_ADDTIME(zfsvfs), + (void *)&addtime, sizeof (addtime), tx); + dprintf("ZFS: Updating ADDEDTIME on zp/vp %p/%p: %llu\n", + szp, ZTOV(szp), addtime[0]); + } +#endif + + + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME | + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + + /* + * Update path information for the target vnode + */ + vn_renamepath(tdvp, ZTOV(szp), tnm, + strlen(tnm)); + +#ifdef __APPLE__ + /* Update cached name - for vget, and access without + * calling vnop_lookup first - it is easier to clear + * it out and let getattr look it up if needed. + */ + if (tzp) { + mutex_enter(&tzp->z_lock); + tzp->z_name_cache[0] = 0; + mutex_exit(&tzp->z_lock); + } + if (szp) { + mutex_enter(&szp->z_lock); + szp->z_name_cache[0] = 0; + mutex_exit(&szp->z_lock); + } +#endif + + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. 
Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. + */ + VERIFY3U(zfs_link_destroy(tdl, szp, tx, + ZRENAMING, NULL), ==, 0); + } + } + + +#if defined (FREEBSD_NAMECACHE) + if (error == 0) { + cache_purge(sdvp); + cache_purge(tdvp); + cache_purge(ZTOV(szp)); + if (tzp) + cache_purge(ZTOV(tzp)); + } +#endif + } + + dmu_tx_commit(tx); +out: + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + + VN_RELE(ZTOV(szp)); + if (tzp) + VN_RELE(ZTOV(tzp)); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. + * + * IN: dvp - Directory to contain new symbolic link. + * link - Name for new symlink entry. + * vap - Attributes of new entry. + * target - Target path of new symlink. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, + cred_t *cr) +{ + znode_t *zp, *dzp = VTOZ(dvp); + zfs_dirlock_t *dl; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t len = strlen(link); + int error; + int zflg = ZNEW; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t txtype = TX_SYMLINK; + int flags = 0; + boolean_t waited = B_FALSE; + + ASSERT(vap->va_type == VLNK); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + + if (len > MAXPATHLEN) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENAMETOOLONG)); + } + + if ((error = zfs_acl_ids_create(dzp, 0, + vap, cr, NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } +top: + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + if (error) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE + len); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create a new object for the symlink. + * for version 4 ZPL datsets the symlink will be an SA attribute + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), + link, len, tx); + else + zfs_sa_symlink(zp, link, len, tx); + mutex_exit(&zp->z_lock); + + zp->z_size = len; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx); + /* + * Insert the new object into the directory. + */ + (void) zfs_link_create(dl, zp, tx, ZNEW); + + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + *vpp = ZTOV(zp); + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + /* + * OS X - attach the vnode _after_ committing the transaction + */ + zfs_znode_getvnode(zp, dzp, zfsvfs); + *vpp = ZTOV(zp); + + zfs_dirent_unlock(dl); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Return, in the buffer contained in the provided uio structure, + * the symbolic path referred to by vp. + * + * IN: vp - vnode of symbolic link. + * uoip - structure to contain the link path. + * cr - credentials of caller. + * ct - caller context + * + * OUT: uio - structure to contain the link path. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +int +zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_lookup_uio(zp->z_sa_hdl, + SA_ZPL_SYMLINK(zfsvfs), uio); + else + error = zfs_sa_readlink(zp, uio); + mutex_exit(&zp->z_lock); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Insert a new entry into directory tdvp referencing svp. + * + * IN: tdvp - Directory to contain new entry. + * svp - vnode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * ct - caller context + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * tdvp - ctime|mtime updated + * svp - ctime updated + */ +/* ARGSUSED */ +int +zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, + caller_context_t *ct, int flags) +{ + znode_t *dzp = VTOZ(tdvp); + znode_t *tzp, *szp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; +#ifndef _WIN32 + vnode_t *realvp; +#endif + int error; + int zf = ZNEW; + uint64_t parent; + uid_t owner; + boolean_t waited = B_FALSE; + + ASSERT(vnode_isdir(tdvp)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + +#ifdef _WIN32 + if (vnode_mount(svp) != vnode_mount(tdvp)) { + ZFS_EXIT(zfsvfs); + return (EXDEV); + } +#else + + if (VOP_REALVP(svp, &realvp, ct) == 0) + svp = realvp; + +#endif + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. 
+	 */
+	if (vnode_isdir(svp)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+#ifndef _WIN32
+	if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+#endif
+
+	szp = VTOZ(svp);
+	ZFS_VERIFY_ZP(szp);
+
+	/* Prevent links to .zfs/shares files */
+
+	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	if (parent == zfsvfs->z_shares_dir) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(name,
+	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+	if (flags & FIGNORECASE)
+		zf |= ZCILOOK;
+
+	/*
+	 * We do not support links between attributes and non-attributes
+	 * because of the potential security risk of creating links
+	 * into "normal" file space in order to circumvent restrictions
+	 * imposed in attribute space.
+	 */
+	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+
+	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+top:
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	zfs_sa_upgrade_txholds(tx, szp);
+	zfs_sa_upgrade_txholds(tx, dzp);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+	if (error) {
+		zfs_dirent_unlock(dl);
+		if (error == ERESTART) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	error = zfs_link_create(dl, szp, tx, 0);
+
+	if (error == 0) {
+		uint64_t txtype = TX_LINK;
+		if (flags & FIGNORECASE)
+			txtype |= TX_CI;
+		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
+	}
+
+	dmu_tx_commit(tx);
+
+	zfs_dirent_unlock(dl);
+
+	if (error == 0) {
+		vnevent_link(svp, ct);
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+#ifdef sun
+/*
+ * zfs_null_putapage() is used when the file system has been force
+ * unmounted. It just drops the pages.
+ */
+/* ARGSUSED */
+static int
+zfs_null_putapage(vnode_t *vp, page_t **pp, u_offset_t *offp,
+    size_t *lenp, int flags, cred_t *cr)
+{
+	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
+	return (0);
+}
+
+/*
+ * Push a page out to disk, klustering if possible.
+ *
+ * IN:	vp	- file to push page to.
+ *	pp	- page to push.
+ *	flags	- additional flags.
+ *	cr	- credentials of caller.
+ *
+ * OUT:	offp	- start of range pushed.
+ *	lenp	- len of range pushed.
+ *
+ * RETURN:	0 if success
+ *		error code if failure
+ *
+ * NOTE: callers must have locked the page to be pushed. On
+ * exit, the page (and all other pages in the kluster) must be
+ * unlocked.
+ */
+/* ARGSUSED */
+static int
+zfs_putapage(vnode_t *vp, page_t **pp, u_offset_t *offp,
+    size_t *lenp, int flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	dmu_tx_t *tx;
+	u_offset_t off, koff;
+	size_t len, klen;
+	int err;
+
+	off = pp->p_offset;
+	len = PAGESIZE;
+	/*
+	 * If our blocksize is bigger than the page size, try to kluster
+	 * multiple pages so that we write a full block (thus avoiding
+	 * a read-modify-write).
+	 */
+	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
+		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
+		ASSERT(koff <= zp->z_size);
+		if (koff + klen > zp->z_size)
+			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
+		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
+	}
+	ASSERT3U(btop(len), ==, btopr(len));
+
+	/*
+	 * The ordering here is critical and must adhere to the following
+	 * rules in order to avoid deadlocking in either zfs_read() or
+	 * zfs_free_range() due to a lock inversion.
+	 *
+	 * 1) The page must be unlocked prior to acquiring the range lock.
+	 *    This is critical because zfs_read() calls find_lock_page()
+	 *    which may block on the page lock while holding the range lock.
+	 *
+	 * 2) Before setting or clearing write back on a page the range lock
+	 *    must be held in order to prevent a lock inversion with the
+	 *    zfs_free_range() function.
+	 *
+	 * This presents a problem because upon entering this function the
+	 * page lock is already held. To safely acquire the range lock the
+	 * page lock must be dropped. This creates a window where another
+	 * process could truncate, invalidate, dirty, or write out the page.
+	 *
+	 * Therefore, after successfully reacquiring the range and page locks
+	 * the current page state is checked. In the common case everything
+	 * will be as is expected and it can be written out. However, if
+	 * the page state has changed it must be handled accordingly.
+	 */
+	mapping = pp->mapping;
+	redirty_page_for_writepage(wbc, pp);
+	unlock_page(pp);
+
+	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
+	    pgoff, pglen, RL_WRITER);
+	lock_page(pp);
+
+	/* Page mapping changed or it was no longer dirty, we're done */
+	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
+		unlock_page(pp);
+		rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* Another process started the write; block if required */
+	if (PageWriteback(pp)) {
+		unlock_page(pp);
+		rangelock_exit(lr);
+
+		if (wbc->sync_mode != WB_SYNC_NONE)
+			wait_on_page_writeback(pp);
+
+		ZFS_EXIT(zsb);
+		return (0);
+	}
+
+	/* Clear the dirty flag while the required locks are held */
+	if (!clear_page_dirty_for_io(pp)) {
+		unlock_page(pp);
+		rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/*
+	 * Counterpart for redirty_page_for_writepage() above. This page
+	 * was in fact not skipped and should not be counted as if it were.
+	 */
+	wbc->pages_skipped--;
+	set_page_writeback(pp);
+	unlock_page(pp);
+
+	tx = dmu_tx_create(zsb->z_os);
+	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+
+	err = dmu_tx_assign(tx, TXG_NOWAIT);
+	if (err != 0) {
+		if (err == ERESTART) {
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
+		dmu_tx_abort(tx);
+		goto out;
+	}
+
+	if (zp->z_blksz <= PAGESIZE) {
+		caddr_t va = zfs_map_page(pp, S_READ);
+		ASSERT3U(len, <=, PAGESIZE);
+		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
+		zfs_unmap_page(pp, va);
+	} else {
+		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
+	}
+
+	if (err == 0) {
+		uint64_t mtime[2], ctime[2];
+		sa_bulk_attr_t bulk[3];
+		int count = 0;
+
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+		    &mtime, 16);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    &ctime, 16);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+		    &zp->z_pflags, 8);
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+		    B_TRUE);
+		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		ASSERT0(err);
+		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
+	}
+	dmu_tx_commit(tx);
+
+out:
+	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
+	if (offp)
+		*offp = off;
+	if (lenp)
+		*lenp = len;
+	rangelock_exit(lr);
+
+	if (wbc->sync_mode != WB_SYNC_NONE) {
+		/*
+		 * Note that this is rarely called under writepages(), because
+		 * writepages() normally handles the entire commit for
+		 * performance reasons.
+		 */
+		if (zsb->z_log != NULL)
+			zil_commit(zsb->z_log, zp->z_id);
+	}
+
+	return (err);
+}
+
+/*
+ * Copy the portion of the file indicated from pages into the file.
+ * The pages are stored in a page list attached to the file's vnode.
+ *
+ * IN:	vp	- vnode of file to push page data to.
+ *	off	- position in file to put data.
+ *	len	- amount of data to write.
+ *	flags	- flags to control the operation.
+ *	cr	- credentials of caller.
+ *	ct	- caller context.
+ *
+ * RETURN:	0 if success
+ *		error code if failure
+ *
+ * Timestamps:
+ *	vp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	page_t *pp;
+	size_t io_len;
+	u_offset_t io_off;
+	uint_t blksz;
+	rl_t *rl;
+	int error = 0;
+
+	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+		return (0);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/*
+	 * Align this request to the file block size in case we kluster.
+	 * XXX - this can result in pretty aggressive locking, which can
+	 * impact simultaneous read/write access. One option might be
+	 * to break up long requests (len == 0) into block-by-block
+	 * operations to get narrower locking.
+	 */
+	blksz = zp->z_blksz;
+	if (ISP2(blksz))
+		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
+	else
+		io_off = 0;
+	if (len > 0 && ISP2(blksz))
+		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
+	else
+		io_len = 0;
+
+	if (io_len == 0) {
+		/*
+		 * Search the entire vp list for pages >= io_off.
+		 */
+		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
+		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
+		goto out;
+	}
+	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
+
+	if (off > zp->z_size) {
+		/* past end of file */
+		zfs_range_unlock(rl);
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
+
+	for (off = io_off; io_off < off + len; io_off += io_len) {
+		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
+			pp = page_lookup(vp, io_off,
+			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
+		} else {
+			pp = page_lookup_nowait(vp, io_off,
+			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
+		}
+
+		if (pp != NULL && pvn_getdirty(pp, flags)) {
+			int err;
+
+			/*
+			 * Found a dirty page to push
+			 */
+			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
+			if (err)
+				error = err;
+		} else {
+			io_len = PAGESIZE;
+		}
+	}
+out:
+	zfs_range_unlock(rl);
+	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zfsvfs->z_log, zp->z_id);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+#endif /* sun */
+
+/*ARGSUSED*/
+void
+zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+	if (zp->z_sa_hdl == NULL) {
+		/*
+		 * The fs has been unmounted, or we did a
+		 * suspend/resume and this file no longer exists.
+		 */
+
+
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		return;
+	}
+
+	mutex_enter(&zp->z_lock);
+	if (zp->z_unlinked) {
+		/*
+		 * Fast path to recycle a vnode of a removed file.
+		 */
+		mutex_exit(&zp->z_lock);
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		return;
+	}
+	mutex_exit(&zp->z_lock);
+
+	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, zp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+		} else {
+			mutex_enter(&zp->z_lock);
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
+			zp->z_atime_dirty = 0;
+			mutex_exit(&zp->z_lock);
+			dmu_tx_commit(tx);
+		}
+	}
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+}
+
+#ifdef sun
+/*
+ * Bounds-check the seek operation.
+ *
+ * IN:	vp	- vnode seeking within
+ *	ooff	- old file offset
+ *	noffp	- pointer to new file offset
+ *	ct	- caller context
+ *
+ * RETURN:	0 if success
+ *		EINVAL if new offset invalid
+ */
+/* ARGSUSED */
+static int
+zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
+    caller_context_t *ct)
+{
+	if (vp->v_type == VDIR)
+		return (0);
+	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+}
+
+/*
+ * Pre-filter the generic locking function to trap attempts to place
+ * a mandatory lock on a memory mapped file.
+ */
+static int
+zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
+    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/*
+	 * We are following the UFS semantics with respect to mapcnt
+	 * here: If we see that the file is mapped already, then we will
+	 * return an error, but we don't worry about races between this
+	 * function and zfs_map().
+	 */
+	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
+		ZFS_EXIT(zfsvfs);
+		return ((EAGAIN));
+	}
+	ZFS_EXIT(zfsvfs);
+	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
+}
+
+/*
+ * If we can't find a page in the cache, we will create a new page
+ * and fill it with file data. For efficiency, we may try to fill
+ * multiple pages at once (klustering) to fill up the supplied page
+ * list. Note that the pages to be filled are held with an exclusive
+ * lock to prevent access by other threads while they are being filled.
+ */
+static int
+zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
+    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
+{
+	znode_t *zp = VTOZ(vp);
+	page_t *pp, *cur_pp;
+	objset_t *os = zp->z_zfsvfs->z_os;
+	u_offset_t io_off, total;
+	size_t io_len;
+	int err;
+
+	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
+		/*
+		 * We only have a single page, don't bother klustering
+		 */
+		io_off = off;
+		io_len = PAGESIZE;
+		pp = page_create_va(vp, io_off, io_len,
+		    PG_EXCL | PG_WAIT, seg, addr);
+	} else {
+		/*
+		 * Try to find enough pages to fill the page list
+		 */
+		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
+		    &io_len, off, plsz, 0);
+	}
+	if (pp == NULL) {
+		/*
+		 * The page already exists, nothing to do here.
+		 */
+		*pl = NULL;
+		return (0);
+	}
+
+	/*
+	 * Fill the pages in the kluster.
+	 */
+	cur_pp = pp;
+	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+		caddr_t va;
+
+		ASSERT3U(io_off, ==, cur_pp->p_offset);
+		va = zfs_map_page(cur_pp, S_WRITE);
+		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
+		    DMU_READ_PREFETCH);
+		zfs_unmap_page(cur_pp, va);
+		if (err) {
+			/* On error, toss the entire kluster */
+			pvn_read_done(pp, B_ERROR);
+			/* convert checksum errors into IO errors */
+			if (err == ECKSUM)
+				err = SET_ERROR(EIO);
+			return (err);
+		}
+		cur_pp = cur_pp->p_next;
+	}
+
+	/*
+	 * Fill in the page list array from the kluster starting
+	 * from the desired offset `off'.
+	 * NOTE: the page list will always be null terminated.
+	 */
+	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
+	ASSERT(pl == NULL || (*pl)->p_offset == off);
+
+	return (0);
+}
+
+/*
+ * Return pointers to the pages for the file region [off, off + len]
+ * in the pl array. If plsz is greater than len, this function may
+ * also return page pointers from after the specified region
+ * (i.e. the region [off, off + plsz]). These additional pages are
+ * only returned if they are already in the cache, or were created as
+ * part of a klustered read.
+ *
+ * IN:	vp	- vnode of file to get data from.
+ *	off	- position in file to get data from.
+ *	len	- amount of data to retrieve.
+ *	plsz	- length of provided page list.
+ *	seg	- segment to obtain pages for.
+ *	addr	- virtual address of fault.
+ *	rw	- mode of created pages.
+ *	cr	- credentials of caller.
+ *	ct	- caller context.
+ *
+ * OUT:	protp	- protection mode of created pages.
+ *	pl	- list of pages created.
+ *
+ * RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
+    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	page_t **pl0 = pl;
+	int err = 0;
+
+	/* we do our own caching, faultahead is unnecessary */
+	if (pl == NULL)
+		return (0);
+	else if (len > plsz)
+		len = plsz;
+	else
+		len = P2ROUNDUP(len, PAGESIZE);
+	ASSERT(plsz >= len);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (protp)
+		*protp = PROT_ALL;
+
+	/*
+	 * Loop through the requested range [off, off + len) looking
+	 * for pages. If we don't find a page, we will need to create
+	 * a new page and fill it with data from the file.
+	 */
+	while (len > 0) {
+		if (*pl = page_lookup(vp, off, SE_SHARED))
+			*(pl+1) = NULL;
+		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
+			goto out;
+		while (*pl) {
+			ASSERT3U((*pl)->p_offset, ==, off);
+			off += PAGESIZE;
+			addr += PAGESIZE;
+			if (len > 0) {
+				ASSERT3U(len, >=, PAGESIZE);
+				len -= PAGESIZE;
+			}
+			ASSERT3U(plsz, >=, PAGESIZE);
+			plsz -= PAGESIZE;
+			pl++;
+		}
+	}
+
+	/*
+	 * Fill out the page array with any pages already in the cache.
+	 */
+	while (plsz > 0 &&
+	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
+		off += PAGESIZE;
+		plsz -= PAGESIZE;
+	}
+out:
+	if (err) {
+		/*
+		 * Release any pages we have previously locked.
+		 */
+		while (pl > pl0)
+			page_unlock(*--pl);
+	} else {
+		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	}
+
+	*pl = NULL;
+
+	ZFS_EXIT(zfsvfs);
+	return (err);
+}
+
+/*
+ * Request a memory map for a section of a file. This code interacts
+ * with common code and the VM system as follows:
+ *
+ *	common code calls mmap(), which ends up in smmap_common()
+ *
+ *	this calls VOP_MAP(), which takes you into (say) zfs
+ *
+ *	zfs_map() calls as_map(), passing segvn_create() as the callback
+ *
+ *	segvn_create() creates the new segment and calls VOP_ADDMAP()
+ *
+ *	zfs_addmap() updates z_mapcnt
+ */
+/*ARGSUSED*/
+/* Apple version is in zfs_vnops_osx.c */
+#ifdef __FreeBSD__
+static int
+zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	segvn_crargs_t vn_a;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/*
+	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
+	 */
+
+	if ((prot & PROT_WRITE) && (zp->z_pflags &
+	    (ZFS_IMMUTABLE | ZFS_APPENDONLY))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if ((prot & (PROT_READ | PROT_EXEC)) &&
+	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EACCES));
+	}
+
+	if (vp->v_flag & VNOMAP) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENOSYS));
+	}
+
+	if (off < 0 || len > MAXOFFSET_T - off) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENXIO));
+	}
+
+	if (vp->v_type != VREG) {
+		ZFS_EXIT(zfsvfs);
+		return ((ENODEV));
+	}
+
+	/*
+	 * If file is locked, disallow mapping.
+	 */
+	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
+		ZFS_EXIT(zfsvfs);
+		return ((EAGAIN));
+	}
+
+	as_rangelock(as);
+	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
+	if (error != 0) {
+		as_rangeunlock(as);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	vn_a.vp = vp;
+	vn_a.offset = (u_offset_t)off;
+	vn_a.type = flags & MAP_TYPE;
+	vn_a.prot = prot;
+	vn_a.maxprot = maxprot;
+	vn_a.cred = cr;
+	vn_a.amp = NULL;
+	vn_a.flags = flags & ~MAP_TYPE;
+	vn_a.szc = 0;
+	vn_a.lgrp_mem_policy_flags = 0;
+
+	error = as_map(as, *addrp, len, segvn_create, &vn_a);
+
+	as_rangeunlock(as);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+#endif
+
+/* ARGSUSED */
+static int
+zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+    caller_context_t *ct)
+{
+	uint64_t pages = btopr(len);
+
+	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
+	return (0);
+}
+
+/*
+ * The reason we push dirty pages as part of zfs_delmap() is so that we get a
+ * more accurate mtime for the associated file. Since we don't have a way of
+ * detecting when the data was actually modified, we have to resort to
+ * heuristics. If an explicit msync() is done, then we mark the mtime when the
+ * last page is pushed. The problem occurs when the msync() call is omitted,
+ * which is by far the most common case:
+ *
+ *	open()
+ *	mmap()
+ *	<modify memory>
+ *	munmap()
+ *	close()
+ *