From 5248e1d4e1ee950d1703ec57141257b02446dd67 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Mon, 2 Dec 2024 09:32:15 +0000
Subject: [PATCH] [AArch64] Fix frame-pointer offset with hazard padding
 (#118091)

The `-aarch64-stack-hazard-size=` option disables register pairing (as
the hazard padding may mean the offset is too large for STP/LDP). This
broke setting the frame-pointer offset, as the code to find the frame
record looked for a (FP, LR) register pair.

This patch resolves this by looking for FP, LR as two unpaired
registers when hazard padding is enabled.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  19 ++-
 .../CodeGen/AArch64/stack-hazard-windows.ll   | 118 +++++++++++++
 llvm/test/CodeGen/AArch64/stack-hazard.ll     | 157 +++++++++++++++---
 3 files changed, 265 insertions(+), 29 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-hazard-windows.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index d6673969aa3056..2d5ae1c1d14dc8 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3167,11 +3167,24 @@ static void computeCalleeSaveRegisterPairs(
             (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
            "Offset out of bounds for LDP/STP immediate");
 
+    auto isFrameRecord = [&] {
+      if (RPI.isPaired())
+        return IsWindows ? RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR
+                         : RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP;
+      // Otherwise, look for the frame record as two unpaired registers. This is
+      // needed for -aarch64-stack-hazard-size=, which disables register
+      // pairing (as the padding may be too large for the LDP/STP offset). Note:
+      // On Windows, this check works out as current reg == FP, next reg == LR,
+      // and on other platforms current reg == FP, previous reg == LR. This
+      // works out as the correct pre-increment or post-increment offsets
+      // respectively.
+      return i > 0 && RPI.Reg1 == AArch64::FP &&
+             CSI[i - 1].getReg() == AArch64::LR;
+    };
+
     // Save the offset to frame record so that the FP register can point to the
     // innermost frame record (spilled FP and LR registers).
-    if (NeedsFrameRecord &&
-        ((!IsWindows && RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
-         (IsWindows && RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR)))
+    if (NeedsFrameRecord && isFrameRecord())
       AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
 
     RegPairs.push_back(RPI);
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll b/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll
new file mode 100644
index 00000000000000..2a034fe5e5290c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64-windows-pc-msvc -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK0
+; RUN: llc < %s -mtriple=aarch64-windows-pc-msvc -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK64
+; RUN: llc < %s -mtriple=aarch64-windows-pc-msvc -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK1024
+
+define i32 @fpr_csr_stackobj(double %x) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK0-LABEL: fpr_csr_stackobj:
+; CHECK0:       .seh_proc fpr_csr_stackobj
+; CHECK0-NEXT:  // %bb.0: // %entry
+; CHECK0-NEXT:    str x23, [sp, #-48]! // 8-byte Folded Spill
+; CHECK0-NEXT:    .seh_save_reg_x x23, 48
+; CHECK0-NEXT:    stp x29, x30, [sp, #8] // 16-byte Folded Spill
+; CHECK0-NEXT:    .seh_save_fplr 8
+; CHECK0-NEXT:    stp d9, d10, [sp, #24] // 16-byte Folded Spill
+; CHECK0-NEXT:    .seh_save_fregp d9, 24
+; CHECK0-NEXT:    add x29, sp, #8
+; CHECK0-NEXT:    .seh_add_fp 8
+; CHECK0-NEXT:    .seh_endprologue
+; CHECK0-NEXT:    mov w0, wzr
+; CHECK0-NEXT:    //APP
+; CHECK0-NEXT:    //NO_APP
+; CHECK0-NEXT:    str d0, [x29, #32]
+; CHECK0-NEXT:    .seh_startepilogue
+; CHECK0-NEXT:    ldp d9, d10, [sp, #24] // 16-byte Folded Reload
+; CHECK0-NEXT:    .seh_save_fregp d9, 24
+; CHECK0-NEXT:    ldp x29, x30, [sp, #8] // 16-byte Folded Reload
+; CHECK0-NEXT:    .seh_save_fplr 8
+; CHECK0-NEXT:    ldr x23, [sp], #48 // 8-byte Folded Reload
+; CHECK0-NEXT:    .seh_save_reg_x x23, 48
+; CHECK0-NEXT:    .seh_endepilogue
+; CHECK0-NEXT:    ret
+; CHECK0-NEXT:    .seh_endfunclet
+; CHECK0-NEXT:    .seh_endproc
+;
+; CHECK64-LABEL: fpr_csr_stackobj:
+; CHECK64:       .seh_proc fpr_csr_stackobj
+; CHECK64-NEXT:  // %bb.0: // %entry
+; CHECK64-NEXT:    sub sp, sp, #192
+; CHECK64-NEXT:    .seh_stackalloc 192
+; CHECK64-NEXT:    str x23, [sp, #80] // 8-byte Folded Spill
+; CHECK64-NEXT:    .seh_save_reg x23, 80
+; CHECK64-NEXT:    str x29, [sp, #88] // 8-byte Folded Spill
+; CHECK64-NEXT:    .seh_save_reg x29, 88
+; CHECK64-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK64-NEXT:    .seh_save_reg x30, 96
+; CHECK64-NEXT:    str d9, [sp, #168] // 8-byte Folded Spill
+; CHECK64-NEXT:    .seh_save_freg d9, 168
+; CHECK64-NEXT:    str d10, [sp, #176] // 8-byte Folded Spill
+; CHECK64-NEXT:    .seh_save_freg d10, 176
+; CHECK64-NEXT:    add x29, sp, #88
+; CHECK64-NEXT:    .seh_add_fp 88
+; CHECK64-NEXT:    .seh_endprologue
+; CHECK64-NEXT:    mov w0, wzr
+; CHECK64-NEXT:    //APP
+; CHECK64-NEXT:    //NO_APP
+; CHECK64-NEXT:    stur d0, [x29, #-16]
+; CHECK64-NEXT:    .seh_startepilogue
+; CHECK64-NEXT:    ldr d10, [sp, #176] // 8-byte Folded Reload
+; CHECK64-NEXT:    .seh_save_freg d10, 176
+; CHECK64-NEXT:    ldr d9, [sp, #168] // 8-byte Folded Reload
+; CHECK64-NEXT:    .seh_save_freg d9, 168
+; CHECK64-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK64-NEXT:    .seh_save_reg x30, 96
+; CHECK64-NEXT:    ldr x29, [sp, #88] // 8-byte Folded Reload
+; CHECK64-NEXT:    .seh_save_reg x29, 88
+; CHECK64-NEXT:    ldr x23, [sp, #80] // 8-byte Folded Reload
+; CHECK64-NEXT:    .seh_save_reg x23, 80
+; CHECK64-NEXT:    add sp, sp, #192
+; CHECK64-NEXT:    .seh_stackalloc 192
+; CHECK64-NEXT:    .seh_endepilogue
+; CHECK64-NEXT:    ret
+; CHECK64-NEXT:    .seh_endfunclet
+; CHECK64-NEXT:    .seh_endproc
+;
+; CHECK1024-LABEL: fpr_csr_stackobj:
+; CHECK1024:       .seh_proc fpr_csr_stackobj
+; CHECK1024-NEXT:  // %bb.0: // %entry
+; CHECK1024-NEXT:    sub sp, sp, #1072
+; CHECK1024-NEXT:    str x23, [sp] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x29, [sp, #8] // 8-byte Folded Spill
+; CHECK1024-NEXT:    .seh_save_reg x29, 8
+; CHECK1024-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-NEXT:    .seh_save_reg x30, 16
+; CHECK1024-NEXT:    str d9, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NEXT:    .seh_save_freg d9, 1048
+; CHECK1024-NEXT:    str d10, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NEXT:    .seh_save_freg d10, 1056
+; CHECK1024-NEXT:    add x29, sp, #8
+; CHECK1024-NEXT:    .seh_add_fp 8
+; CHECK1024-NEXT:    .seh_endprologue
+; CHECK1024-NEXT:    sub sp, sp, #1040
+; CHECK1024-NEXT:    mov w0, wzr
+; CHECK1024-NEXT:    //APP
+; CHECK1024-NEXT:    //NO_APP
+; CHECK1024-NEXT:    stur d0, [x29, #-16]
+; CHECK1024-NEXT:    .seh_startepilogue
+; CHECK1024-NEXT:    add sp, sp, #1040
+; CHECK1024-NEXT:    .seh_stackalloc 1040
+; CHECK1024-NEXT:    ldr d10, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT:    .seh_save_freg d10, 1056
+; CHECK1024-NEXT:    ldr d9, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    .seh_save_freg d9, 1048
+; CHECK1024-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK1024-NEXT:    .seh_save_reg x30, 16
+; CHECK1024-NEXT:    ldr x29, [sp, #8] // 8-byte Folded Reload
+; CHECK1024-NEXT:    .seh_save_reg x29, 8
+; CHECK1024-NEXT:    ldr x23, [sp] // 8-byte Folded Reload
+; CHECK1024-NEXT:    add sp, sp, #1072
+; CHECK1024-NEXT:    .seh_endepilogue
+; CHECK1024-NEXT:    ret
+; CHECK1024-NEXT:    .seh_endfunclet
+; CHECK1024-NEXT:    .seh_endproc
+entry:
+  %a = alloca double
+  tail call void asm sideeffect "", "~{x23},~{d9},~{d10}"()
+  store double %x, ptr %a
+  ret i32 0
+}
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 50a2e41f45756d..a4c2b30566a951 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -337,19 +337,18 @@ define i32 @csr_d8_allocd_framepointer(double %d) "aarch64_pstate_sm_compatible"
 ; CHECK64-LABEL: csr_d8_allocd_framepointer:
 ; CHECK64:       // %bb.0: // %entry
 ; CHECK64-NEXT:    sub sp, sp, #176
-; CHECK64-NEXT:    str d8, [sp, #80] // 8-byte Folded Spill
+; CHECK64-NEXT:    stp d0, d8, [sp, #72] // 8-byte Folded Spill
 ; CHECK64-NEXT:    stp x29, x30, [sp, #152] // 16-byte Folded Spill
-; CHECK64-NEXT:    add x29, sp, #80
-; CHECK64-NEXT:    .cfi_def_cfa w29, 96
+; CHECK64-NEXT:    add x29, sp, #152
+; CHECK64-NEXT:    .cfi_def_cfa w29, 24
 ; CHECK64-NEXT:    .cfi_offset w30, -16
 ; CHECK64-NEXT:    .cfi_offset w29, -24
 ; CHECK64-NEXT:    .cfi_offset b8, -96
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
-; CHECK64-NEXT:    stur d0, [x29, #-8]
 ; CHECK64-NEXT:    ldr x29, [sp, #152] // 8-byte Folded Reload
-; CHECK64-NEXT:    ldr d8, [sp, #80] // 8-byte Folded Reload
 ; CHECK64-NEXT:    mov w0, wzr
+; CHECK64-NEXT:    ldr d8, [sp, #80] // 8-byte Folded Reload
 ; CHECK64-NEXT:    add sp, sp, #176
 ; CHECK64-NEXT:    ret
 ;
@@ -358,17 +357,17 @@ define i32 @csr_d8_allocd_framepointer(double %d) "aarch64_pstate_sm_compatible"
 ; CHECK1024-NEXT:    sub sp, sp, #1056
 ; CHECK1024-NEXT:    str d8, [sp] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x29, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT:    mov x29, sp
+; CHECK1024-NEXT:    add x29, sp, #1032
 ; CHECK1024-NEXT:    str x30, [sp, #1040] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    sub sp, sp, #1040
-; CHECK1024-NEXT:    .cfi_def_cfa w29, 1056
+; CHECK1024-NEXT:    .cfi_def_cfa w29, 24
 ; CHECK1024-NEXT:    .cfi_offset w30, -16
 ; CHECK1024-NEXT:    .cfi_offset w29, -24
 ; CHECK1024-NEXT:    .cfi_offset b8, -1056
 ; CHECK1024-NEXT:    mov w0, wzr
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
-; CHECK1024-NEXT:    stur d0, [x29, #-8]
+; CHECK1024-NEXT:    str d0, [sp, #1032]
 ; CHECK1024-NEXT:    add sp, sp, #1040
 ; CHECK1024-NEXT:    ldr x30, [sp, #1040] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x29, [sp, #1032] // 8-byte Folded Reload
@@ -2893,8 +2892,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK64-NEXT:    stp x29, x30, [sp, #128] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp x9, x20, [sp, #144] // 16-byte Folded Spill
 ; CHECK64-NEXT:    str x19, [sp, #160] // 8-byte Folded Spill
-; CHECK64-NEXT:    mov x29, sp
-; CHECK64-NEXT:    .cfi_def_cfa w29, 176
+; CHECK64-NEXT:    add x29, sp, #128
+; CHECK64-NEXT:    .cfi_def_cfa w29, 48
 ; CHECK64-NEXT:    .cfi_offset w19, -16
 ; CHECK64-NEXT:    .cfi_offset w20, -24
 ; CHECK64-NEXT:    .cfi_offset w30, -40
@@ -2913,11 +2912,11 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
"aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK64-NEXT: mov w20, w0 ; CHECK64-NEXT: msub x9, x8, x8, x9 ; CHECK64-NEXT: mov sp, x9 -; CHECK64-NEXT: stur x9, [x29, #-80] -; CHECK64-NEXT: sub x9, x29, #80 -; CHECK64-NEXT: sturh wzr, [x29, #-70] -; CHECK64-NEXT: stur wzr, [x29, #-68] -; CHECK64-NEXT: sturh w8, [x29, #-72] +; CHECK64-NEXT: stur x9, [x29, #-208] +; CHECK64-NEXT: sub x9, x29, #208 +; CHECK64-NEXT: sturh wzr, [x29, #-198] +; CHECK64-NEXT: stur wzr, [x29, #-196] +; CHECK64-NEXT: sturh w8, [x29, #-200] ; CHECK64-NEXT: msr TPIDR2_EL0, x9 ; CHECK64-NEXT: .cfi_offset vg, -32 ; CHECK64-NEXT: smstop sm @@ -2926,14 +2925,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK64-NEXT: .cfi_restore vg ; CHECK64-NEXT: smstart za ; CHECK64-NEXT: mrs x8, TPIDR2_EL0 -; CHECK64-NEXT: sub x0, x29, #80 +; CHECK64-NEXT: sub x0, x29, #208 ; CHECK64-NEXT: cbnz x8, .LBB33_2 ; CHECK64-NEXT: // %bb.1: // %entry ; CHECK64-NEXT: bl __arm_tpidr2_restore ; CHECK64-NEXT: .LBB33_2: // %entry ; CHECK64-NEXT: mov w0, w20 ; CHECK64-NEXT: msr TPIDR2_EL0, xzr -; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: sub sp, x29, #128 ; CHECK64-NEXT: .cfi_def_cfa wsp, 176 ; CHECK64-NEXT: ldp x20, x19, [sp, #152] // 16-byte Folded Reload ; CHECK64-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload @@ -2972,8 +2971,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK1024-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill ; CHECK1024-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill ; CHECK1024-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill -; CHECK1024-NEXT: mov x29, sp -; CHECK1024-NEXT: .cfi_def_cfa w29, 1136 +; CHECK1024-NEXT: add x29, sp, #1088 +; CHECK1024-NEXT: .cfi_def_cfa w29, 48 ; CHECK1024-NEXT: .cfi_offset w19, -8 ; CHECK1024-NEXT: .cfi_offset w20, -16 ; CHECK1024-NEXT: .cfi_offset w28, -24 @@ -2993,14 +2992,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK1024-NEXT: mov w20, w0 ; CHECK1024-NEXT: msub x9, x8, x8, x9 ; CHECK1024-NEXT: mov sp, x9 -; CHECK1024-NEXT: sub x10, x29, #784 +; CHECK1024-NEXT: sub x10, x29, #1872 ; CHECK1024-NEXT: stur x9, [x10, #-256] -; CHECK1024-NEXT: sub x9, x29, #774 -; CHECK1024-NEXT: sub x10, x29, #772 +; CHECK1024-NEXT: sub x9, x29, #1862 +; CHECK1024-NEXT: sub x10, x29, #1860 ; CHECK1024-NEXT: sturh wzr, [x9, #-256] -; CHECK1024-NEXT: sub x9, x29, #1040 +; CHECK1024-NEXT: sub x9, x29, #2128 ; CHECK1024-NEXT: stur wzr, [x10, #-256] -; CHECK1024-NEXT: sub x10, x29, #776 +; CHECK1024-NEXT: sub x10, x29, #1864 ; CHECK1024-NEXT: sturh w8, [x10, #-256] ; CHECK1024-NEXT: msr TPIDR2_EL0, x9 ; CHECK1024-NEXT: .cfi_offset vg, -32 @@ -3010,14 +3009,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ ; CHECK1024-NEXT: .cfi_restore vg ; CHECK1024-NEXT: smstart za ; CHECK1024-NEXT: mrs x8, TPIDR2_EL0 -; CHECK1024-NEXT: sub x0, x29, #1040 +; CHECK1024-NEXT: sub x0, x29, #2128 ; CHECK1024-NEXT: cbnz x8, .LBB33_2 ; CHECK1024-NEXT: // %bb.1: // %entry ; CHECK1024-NEXT: bl __arm_tpidr2_restore ; CHECK1024-NEXT: .LBB33_2: // %entry ; CHECK1024-NEXT: mov w0, w20 ; CHECK1024-NEXT: msr TPIDR2_EL0, xzr -; CHECK1024-NEXT: mov sp, x29 +; CHECK1024-NEXT: sub sp, x29, #1088 ; CHECK1024-NEXT: .cfi_def_cfa wsp, 1136 ; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK1024-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload @@ -3049,3 +3048,109 @@ entry: ret i32 %x } declare void @other() + +declare void @bar(ptr noundef) 
"aarch64_pstate_sm_compatible" + +define i32 @sve_stack_object_and_vla(double %d, i64 %sz) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK0-LABEL: sve_stack_object_and_vla: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK0-NEXT: stp x28, x19, [sp, #16] // 16-byte Folded Spill +; CHECK0-NEXT: mov x29, sp +; CHECK0-NEXT: addvl sp, sp, #-1 +; CHECK0-NEXT: mov x19, sp +; CHECK0-NEXT: .cfi_def_cfa w29, 32 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w28, -16 +; CHECK0-NEXT: .cfi_offset w30, -24 +; CHECK0-NEXT: .cfi_offset w29, -32 +; CHECK0-NEXT: lsl x9, x0, #2 +; CHECK0-NEXT: mov x8, sp +; CHECK0-NEXT: add x9, x9, #15 +; CHECK0-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK0-NEXT: sub x0, x8, x9 +; CHECK0-NEXT: mov sp, x0 +; CHECK0-NEXT: mov z0.s, #0 // =0x0 +; CHECK0-NEXT: ptrue p0.s +; CHECK0-NEXT: st1w { z0.s }, p0, [x29, #-1, mul vl] +; CHECK0-NEXT: bl bar +; CHECK0-NEXT: mov w0, wzr +; CHECK0-NEXT: mov sp, x29 +; CHECK0-NEXT: ldp x28, x19, [sp, #16] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: sve_stack_object_and_vla: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: sub sp, sp, #96 +; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: stp x28, x19, [sp, #80] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-1 +; CHECK64-NEXT: mov x19, sp +; CHECK64-NEXT: .cfi_def_cfa w29, 32 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w28, -16 +; CHECK64-NEXT: .cfi_offset w30, -24 +; CHECK64-NEXT: .cfi_offset w29, -32 +; CHECK64-NEXT: lsl x9, x0, #2 +; CHECK64-NEXT: mov x8, sp +; CHECK64-NEXT: add x9, x9, #15 +; CHECK64-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK64-NEXT: sub x0, x8, x9 +; CHECK64-NEXT: mov sp, x0 +; CHECK64-NEXT: mov z0.s, #0 // =0x0 +; CHECK64-NEXT: ptrue p0.s +; CHECK64-NEXT: sub x8, x29, #64 +; CHECK64-NEXT: st1w { z0.s }, p0, [x8, #-1, mul vl] +; CHECK64-NEXT: bl bar +; CHECK64-NEXT: mov w0, wzr +; CHECK64-NEXT: sub sp, x29, #64 +; CHECK64-NEXT: ldp x28, x19, [sp, #80] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK64-NEXT: add sp, sp, #96 +; CHECK64-NEXT: ret +; +; CHECK1024-LABEL: sve_stack_object_and_vla: +; CHECK1024: // %bb.0: // %entry +; CHECK1024-NEXT: sub sp, sp, #1056 +; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NEXT: add x29, sp, #1024 +; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NEXT: str x19, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NEXT: sub sp, sp, #1024 +; CHECK1024-NEXT: addvl sp, sp, #-1 +; CHECK1024-NEXT: mov x19, sp +; CHECK1024-NEXT: .cfi_def_cfa w29, 32 +; CHECK1024-NEXT: .cfi_offset w19, -8 +; CHECK1024-NEXT: .cfi_offset w28, -16 +; CHECK1024-NEXT: .cfi_offset w30, -24 +; CHECK1024-NEXT: .cfi_offset w29, -32 +; CHECK1024-NEXT: lsl x9, x0, #2 +; CHECK1024-NEXT: mov x8, sp +; CHECK1024-NEXT: add x9, x9, #15 +; CHECK1024-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK1024-NEXT: sub x0, x8, x9 +; CHECK1024-NEXT: mov sp, x0 +; CHECK1024-NEXT: mov z0.s, #0 // =0x0 +; CHECK1024-NEXT: ptrue p0.s +; CHECK1024-NEXT: sub x8, x29, #1024 +; CHECK1024-NEXT: st1w { z0.s }, p0, [x8, #-1, mul vl] +; CHECK1024-NEXT: bl bar +; CHECK1024-NEXT: mov w0, wzr +; CHECK1024-NEXT: sub sp, x29, #1024 +; 
+; CHECK1024-NEXT:    ldr x19, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x28, [sp, #1040] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT:    add sp, sp, #1056
+; CHECK1024-NEXT:    ret
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %b = alloca i32, i64 %sz, align 4
+  store <vscale x 4 x i32> zeroinitializer, ptr %a
+  call void @bar(ptr noundef nonnull %b)
+  ret i32 0
+}
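
Note (illustrative aside, not part of the patch): the sketch below models the frame-record check as a small standalone C++ program, which may help when reasoning about the paired vs. unpaired cases. The Reg enum, RegPairInfo struct, and the plain vector standing in for the callee-saved-info (CSI) list are simplified stand-ins invented for this sketch; only the shape of the predicate follows the isFrameRecord lambda added above, where LLVM's real version consults CalleeSavedInfo entries inside computeCalleeSaveRegisterPairs.

// Standalone sketch (not LLVM code): simplified stand-ins to exercise the
// frame-record predicate added by this patch.
#include <cassert>
#include <cstddef>
#include <vector>

enum Reg { FP, LR, X19, X23, D8 };

struct RegPairInfo {
  Reg Reg1;
  Reg Reg2; // Ignored when the spill is unpaired.
  bool Paired;
  bool isPaired() const { return Paired; }
};

// Mirrors the lambda above: either a paired (FP, LR) frame record (operand
// order differs between Windows and other targets), or, when hazard padding
// has disabled pairing, an unpaired FP whose neighbour at index i - 1 in the
// callee-saved list is LR.
static bool isFrameRecord(const RegPairInfo &RPI, const std::vector<Reg> &CSI,
                          std::size_t i, bool IsWindows) {
  if (RPI.isPaired())
    return IsWindows ? RPI.Reg1 == FP && RPI.Reg2 == LR
                     : RPI.Reg1 == LR && RPI.Reg2 == FP;
  return i > 0 && RPI.Reg1 == FP && CSI[i - 1] == LR;
}

int main() {
  // Hazard-padding case: FP and LR spilled as two unpaired registers, with
  // LR at index i - 1, is still recognised as the frame record.
  std::vector<Reg> CSI = {D8, LR, FP, X19};
  assert(isFrameRecord({FP, FP, /*Paired=*/false}, CSI, /*i=*/2,
                       /*IsWindows=*/false));
  // An unpaired FP with no adjacent LR is not a frame record.
  std::vector<Reg> NoRecord = {D8, X19, FP, X23};
  assert(!isFrameRecord({FP, FP, /*Paired=*/false}, NoRecord, /*i=*/2,
                        /*IsWindows=*/false));
  return 0;
}

This is the shape visible in the CHECK1024 prologues above: with hazard padding, FP and LR are saved by two separate str instructions rather than one stp, and the second return statement of the predicate is what lets setCalleeSaveBaseToFrameRecordOffset still fire for that layout.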