From e4710f61d0b988aaf19e514554d9dadfd74c9db6 Mon Sep 17 00:00:00 2001 From: David Chisnall Date: Mon, 25 Sep 2023 12:03:14 +0100 Subject: [PATCH] Enable hardware stack zeroing. This replaces the software stack zeroing path with offload to a hardware engine, when available. The hardware zeroing pipeline is controlled by the ZTOP SCR (currently assumed to be number 27). This zeroes from the top of the capability in ZTOP to the bottom. We assume that short-lived cross-compartment calls or returns from deeply nested calls will result in ranges where each is a subset of another, so we try to coalesce. If we're about to zero a region that is a superset of the current zeroing region, we skip it, otherwise we wait for the previous zeroing region to finish. This is a fairly simple approach but was chosen because adding a lot of instructions in this path can rapidly offset the benefits. 0.3% speedup on Sonata. 2% speedup on the Ibex SAFE simulator. The difference here is largely due to the UART at 115,200 b/s being *very* slow in comparison to the CPU and so the test suite is mostly waiting to write debug messages. The simulator UART can write one character per cycle. The allocator portion of the test suite (which does a lot of work per message) is around 1% faster. The test suite crashes on the Arty A7, which may be a bug in the A7 version but may be a bug in this code and so we shouldn't merge it yet. --- sdk/boards/ibex-arty-a7-100.json | 3 +- sdk/boards/ibex-safe-simulator.json | 1 + sdk/boards/sonata.json | 1 + sdk/core/loader/boot.cc | 2 +- sdk/core/switcher/entry.S | 88 +++++++++++++++++++--- sdk/core/switcher/trusted-stack-assembly.h | 21 ++++-- sdk/core/switcher/tstack.h | 45 ++++++----- sdk/xmake.lua | 10 ++- 8 files changed, 133 insertions(+), 38 deletions(-) diff --git a/sdk/boards/ibex-arty-a7-100.json b/sdk/boards/ibex-arty-a7-100.json index db541725..5993b92e 100644 --- a/sdk/boards/ibex-arty-a7-100.json +++ b/sdk/boards/ibex-arty-a7-100.json @@ -72,5 +72,6 @@ "timer_hz" : 33000000, "tickrate_hz" : 100, "revoker" : "hardware", - "stack_high_water_mark" : true + "stack_high_water_mark" : true, + "fast_stack_zeroing" : true } diff --git a/sdk/boards/ibex-safe-simulator.json b/sdk/boards/ibex-safe-simulator.json index 03315983..8dc330fd 100644 --- a/sdk/boards/ibex-safe-simulator.json +++ b/sdk/boards/ibex-safe-simulator.json @@ -47,6 +47,7 @@ "tickrate_hz" : 10, "revoker" : "hardware", "stack_high_water_mark" : true, + "fast_stack_zeroing" : true, "simulation": true, "simulator" : "${sdk}/../scripts/run-ibex-safe-sim.sh" } diff --git a/sdk/boards/sonata.json b/sdk/boards/sonata.json index 9fdccd6f..ec1eedd3 100644 --- a/sdk/boards/sonata.json +++ b/sdk/boards/sonata.json @@ -51,6 +51,7 @@ "tickrate_hz" : 100, "revoker" : "software", "stack_high_water_mark" : true, + "fast_stack_zeroing" : true, "simulator" : "${sdk}/../scripts/run-sonata.sh", "simulation": false } diff --git a/sdk/core/loader/boot.cc b/sdk/core/loader/boot.cc index 16a5ee65..02d1ed21 100644 --- a/sdk/core/loader/boot.cc +++ b/sdk/core/loader/boot.cc @@ -873,7 +873,7 @@ namespace threadTStack->mstatus = (priv::MSTATUS_MPIE | (priv::MSTATUS_PRV_M << priv::MSTATUS_MPP_SHIFT)); -#ifdef CONFIG_MSHWM +#ifdef CHERIOT_HAS_MSHWM threadTStack->mshwm = stack.top(); threadTStack->mshwmb = stack.base(); #endif diff --git a/sdk/core/switcher/entry.S b/sdk/core/switcher/entry.S index 8836d088..7e568579 100644 --- a/sdk/core/switcher/entry.S +++ b/sdk/core/switcher/entry.S @@ -19,6 +19,13 @@ */ #define CSR_MSHWMB 0xbc2 +/** + * The special capability register for the fast zeroing control. + * Writing a capability here starts zeroing from the address of this register + * down to the base. + */ +#define SCR_ZTOP 0x1b + #define MAX_FAULTS_PER_COMPARTMENT_CALL 1024 #define SPILL_SLOT_cs0 0 @@ -129,6 +136,46 @@ switcher_scheduler_entry_csp: * and integer register. All three registers are clobbered. */ .macro zero_stack base top scratch +#ifdef CHERIOT_HAS_ZTOP + // Derive the capability to the range that should be zeroed. + // This will be stored in base, leaving top as a second scratch register to use + csub \top, c\top, c\base + // Inexact bounds here may round up, but will still be within the bounds of + // the stack. We must ensure that we zero the entire used region of the + // stack, zeroing slightly more is fine, in one direction. Data on the stack that we must + // not corrupt is *above* the range that we are zeroing. The hardware + // zeroing zeroes from the capability's address, downwards, and the address + // is set to the top of the to-zero range. The inexact bounds may give us + // a base that is lower or a top that is higher. Imprecision at the base + // can only zero already-zeroed parts of the stack. + csetbounds c\base, c\base, \top + // The base register now contains the range to zero, top is available + cincoffset c\base, c\base, \top + + cspecialr c\scratch, SCR_ZTOP + + // If the current zeroing range is a subset of the range that we're about + // to zero, don't bother waiting for it to finish, just zero the larger + // range. + ctestsubset \top, c\base, c\scratch + bnez \top, 1f + + // If ztop is untagged, no zeroing is happening and we can just start + // zeroing the new range. + cgettag \top, c\scratch + beqz \top, 1f + + // If ztop is tagged, we're zeroing downwards. Store a zero at the base of + // the to-zero region. This will block until the zeroing is complete. + cgetbase \top, c\scratch + csetaddr c\top, c\scratch, \top + csb zero, 0(c\top) +1: + // Set the new value + cspecialw SCR_ZTOP, c\base + +#else + addi \scratch, \top, -32 addi \top, \top, -16 bgt \base, \scratch, 1f @@ -146,6 +193,8 @@ switcher_scheduler_entry_csp: csc cnull, 0(c\base) csc cnull, 8(c\base) 2: + +#endif .endm /** @@ -215,7 +264,7 @@ __Z26compartment_switcher_entryz: sub s1, s0, s1 csetboundsexact ct2, csp, s1 csetaddr csp, ct2, s0 -#ifdef CONFIG_MSHWM +#ifdef CHERIOT_HAS_MSHWM // Read the stack high water mark (which is 16-byte aligned) csrr gp, CSR_MSHWM // Skip zeroing if high water mark >= stack pointer @@ -227,7 +276,7 @@ __Z26compartment_switcher_entryz: #endif zero_stack t2, s0, gp after_zero: -#ifdef CONFIG_MSHWM +#ifdef CHERIOT_HAS_MSHWM // store new stack top as stack high water mark csrw CSR_MSHWM, sp #endif @@ -382,11 +431,20 @@ exception_entry_asm: csc ct0, TrustedStack_offset_mepcc(csp) csrr t1, mstatus csw t1, TrustedStack_offset_mstatus(csp) -#ifdef CONFIG_MSHWM +#ifdef CHERIOT_HAS_MSHWM csrr t1, CSR_MSHWM csw t1, TrustedStack_offset_mshwm(csp) csrr t1, CSR_MSHWMB csw t1, TrustedStack_offset_mshwmb(csp) +# ifdef CHERIOT_HAS_ZTOP + // Stop zeroing and capture the current zeroing value. Note: cspecialr is + // encoded as cspecialrw and so we can't just use cnull as the source. We + // probably could use the current ct1 value, since it's guaranteed to be + // untagged at this point in the code. + cmove ct1, cnull + cspecialrw ct1, SCR_ZTOP, ct1 + csc ct1, TrustedStack_offset_ztop(csp) +# endif #endif csrr t1, mcause csw t1, TrustedStack_offset_mcause(csp) @@ -474,7 +532,7 @@ exception_entry_asm: .Linstall_context: clw x1, TrustedStack_offset_mstatus(csp) csrw mstatus, x1 -#ifdef CONFIG_MSHWM +#ifdef CHERIOT_HAS_MSHWM clw x1, TrustedStack_offset_mshwm(csp) csrw CSR_MSHWM, x1 clw x1, TrustedStack_offset_mshwmb(csp) @@ -484,7 +542,19 @@ exception_entry_asm: csb zero, TrustedStack_offset_inForcedUnwind(csp) // c2 is csp, which will be loaded last and will overwrite the trusted // stack pointer with the thread's stack pointer. +#ifdef CHERIOT_HAS_ZTOP + // If we have ZTOP, each of these loads will take an extra cycle while the + // zeroing is running. We know that they aren't part of the stack (if the + // trusted stack and the stack overlap, everything is broken) so restart + // the zeroing as late as possible. We are reloading via csp, so it must + // be restored last. + reloadRegisters cgp, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15 + clc c1, TrustedStack_offset_ztop(csp) + cspecialw SCR_ZTOP, c1 + reloadRegisters c1, csp +#else reloadRegisters c1, cgp, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, csp +#endif mret // If we detect an invalid entry and there is no error handler installed, we want @@ -699,7 +769,7 @@ exception_entry_asm: // Load the trusted stack pointer to ct1 cspecialr ct1, mtdc -#ifdef CONFIG_MSHWM +#ifdef CHERIOT_HAS_MSHWM // Update the spilled copy of the stack high water mark to ensure that we // will clear all of the stack used by the error handler and the spilled // context. @@ -742,7 +812,7 @@ exception_entry_asm: j .Linstall_context .Lhandle_injected_error: -#ifdef CONFIG_MSHWM +#ifdef CHERIOT_HAS_MSHWM clw x1, TrustedStack_offset_mshwm(csp) csrw CSR_MSHWM, x1 clw x1, TrustedStack_offset_mshwmb(csp) @@ -813,8 +883,8 @@ exception_entry_asm: clc cgp, SPILL_SLOT_cgp(csp) cincoffset csp, csp, SPILL_SLOT_SIZE #ifndef CONFIG_NO_SWITCHER_SAFETY -#ifdef CONFIG_MSHWM - // read the stack high water mark, which is 16-byte aligned +#ifdef CHERIOT_HAS_MSHWM + // Read the stack high water mark, which is 16-byte aligned // we will use this as base address for stack clearing // note that it cannot be greater than stack top as we // we set it to stack top when we pushed to trusted stack frame @@ -825,7 +895,7 @@ exception_entry_asm: cgetaddr t1, csp csetaddr ct2, csp, tp zero_stack t2, t1, tp -#ifdef CONFIG_MSHWM +#ifdef CHERIOT_HAS_MSHWM csrw CSR_MSHWM, sp #endif #endif // CONFIG_NO_SWITCHER_SAFETY diff --git a/sdk/core/switcher/trusted-stack-assembly.h b/sdk/core/switcher/trusted-stack-assembly.h index f3215483..bf95d09a 100644 --- a/sdk/core/switcher/trusted-stack-assembly.h +++ b/sdk/core/switcher/trusted-stack-assembly.h @@ -20,14 +20,21 @@ EXPORT_ASSEMBLY_OFFSET(TrustedStack, c13, 13 * 8) EXPORT_ASSEMBLY_OFFSET(TrustedStack, c14, 14 * 8) EXPORT_ASSEMBLY_OFFSET(TrustedStack, c15, 15 * 8) EXPORT_ASSEMBLY_OFFSET(TrustedStack, hazardPointers, 16 * 8) -EXPORT_ASSEMBLY_OFFSET(TrustedStack, mstatus, 17 * 8) -EXPORT_ASSEMBLY_OFFSET(TrustedStack, mcause, (17 * 8) + 4) -#ifdef CONFIG_MSHWM -EXPORT_ASSEMBLY_OFFSET(TrustedStack, mshwm, 18 * 8) -EXPORT_ASSEMBLY_OFFSET(TrustedStack, mshwmb, (18 * 8) + 4) - +#ifdef CHERIOT_HAS_ZTOP +EXPORT_ASSEMBLY_OFFSET(TrustedStack, ztop, 17 * 8) +# define TSTACK_ZTOP_WORDS 1 +#else +# define TSTACK_ZTOP_WORDS 0 +#endif +EXPORT_ASSEMBLY_OFFSET(TrustedStack, + mstatus, + (17 + TSTACK_ZTOP_WORDS) * 8) +EXPORT_ASSEMBLY_OFFSET(TrustedStack, mcause, TrustedStack_offset_mstatus + 4) +#ifdef CHERIOT_HAS_MSHWM +EXPORT_ASSEMBLY_OFFSET(TrustedStack, mshwm, (18 + TSTACK_ZTOP_WORDS) * 8) +EXPORT_ASSEMBLY_OFFSET(TrustedStack, mshwmb, TrustedStack_offset_mshwm + 4) // Size of everything up to this point -# define TSTACK_REGFRAME_SZ (19 * 8) +# define TSTACK_REGFRAME_SZ ((19 + TSTACK_ZTOP_WORDS) * 8) // frameoffset, inForcedUnwind and padding # define TSTACK_HEADER_SZ 16 #else diff --git a/sdk/core/switcher/tstack.h b/sdk/core/switcher/tstack.h index 08752492..cf7a66af 100644 --- a/sdk/core/switcher/tstack.h +++ b/sdk/core/switcher/tstack.h @@ -28,29 +28,36 @@ struct TrustedStackFrame uint16_t errorHandlerCount; }; +#if defined(CHERIOT_HAS_ZTOP) && !defined(CHERIOT_HAS_MSHWM) +# error Platforms with ZTOP must have MSHWM +#endif + template struct TrustedStackGeneric { - void *mepcc; - void *c1; - void *csp; - void *cgp; - void *c4; - void *c5; - void *c6; - void *c7; - void *c8; - void *c9; - void *c10; - void *c11; - void *c12; - void *c13; - void *c14; - void *c15; - void *hazardPointers; + void *mepcc; + void *c1; + void *csp; + void *cgp; + void *c4; + void *c5; + void *c6; + void *c7; + void *c8; + void *c9; + void *c10; + void *c11; + void *c12; + void *c13; + void *c14; + void *c15; + void *hazardPointers; +#ifdef CHERIOT_HAS_ZTOP + void *ztop; +#endif size_t mstatus; size_t mcause; -#ifdef CONFIG_MSHWM +#ifdef CHERIOT_HAS_MSHWM uint32_t mshwm; uint32_t mshwmb; #endif @@ -66,7 +73,7 @@ struct TrustedStackGeneric uint8_t inForcedUnwind; // Padding up to multiple of 16-bytes. uint8_t padding[ -#ifdef CONFIG_MSHWM +#ifdef CHERIOT_HAS_MSHWM 11 #else 3 diff --git a/sdk/xmake.lua b/sdk/xmake.lua index 96b03cdd..11e4b21b 100644 --- a/sdk/xmake.lua +++ b/sdk/xmake.lua @@ -375,8 +375,16 @@ rule("firmware") local loader = target:deps()['cheriot.loader']; + if board.fast_stack_zeroing and not board.stack_high_water_mark then + error("Fast stack zeroing requires the stack high-water mark") + end if board.stack_high_water_mark then - add_defines("CONFIG_MSHWM") + add_defines("CHERIOT_HAS_MSHWM") + if (board.fast_stack_zeroing) then + add_defines("CHERIOT_HAS_ZTOP") + -- If we have ztop, we need space to spill and reload it. + loader:set('loader_trusted_stack_size', loader:get('loader_trusted_stack_size') + 8) + end else -- If we don't have the stack high watermark, the trusted stack is smaller. loader:set('loader_trusted_stack_size', 176)