diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index f13c162776a9b1..73184ddb081ae2 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1107,6 +1107,81 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }
 
+// FORM_STRIDED_TUPLE nodes are created to improve register allocation where
+// a consecutive multi-vector tuple is constructed from the same indices of
+// multiple strided loads. This may still result in unnecessary copies between
+// the loads and the tuple. Here we try to return a hint to assign the
+// contiguous ZPRMulReg starting at the same register as the first operand of
+// the pseudo, which should be a subregister of the first strided load.
+//
+// For example, if the first strided load has been assigned $z16_z20_z24_z28
+// and the operands of the pseudo are each accessing subregister zsub2, we
+// should look through Order to find a contiguous register which
+// begins with $z24 (i.e. $z24_z25_z26_z27).
+//
+bool AArch64RegisterInfo::getRegAllocationHints(
+    Register VirtReg, ArrayRef<MCPhysReg> Order,
+    SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
+    const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
+  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool DefaultHints =
+      TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
+
+  unsigned RegID = MRI.getRegClass(VirtReg)->getID();
+  if (RegID != AArch64::ZPR2Mul2RegClassID &&
+      RegID != AArch64::ZPR4Mul4RegClassID)
+    return DefaultHints;
+
+  for (MachineInstr &MI : MRI.def_instructions(VirtReg)) {
+    if (MI.getOpcode() != AArch64::FORM_STRIDED_TUPLE_X2_PSEUDO &&
+        MI.getOpcode() != AArch64::FORM_STRIDED_TUPLE_X4_PSEUDO)
+      continue;
+
+    // Look up the physical register mapped to the first load of the pseudo.
+    Register FirstLoadVirtReg = MI.getOperand(1).getReg();
+    if (!VRM->hasPhys(FirstLoadVirtReg))
+      continue;
+
+    unsigned SubRegIdx = 0;
+    MCRegister FirstLoadPhysReg = VRM->getPhys(FirstLoadVirtReg);
+
+    // The subreg number is used to access the correct unit of the
+    // strided register found in the map above.
+    switch (MI.getOperand(1).getSubReg()) {
+    case AArch64::zsub0:
+      break;
+    case AArch64::zsub1:
+      SubRegIdx = 1;
+      break;
+    case AArch64::zsub2:
+      SubRegIdx = 2;
+      break;
+    case AArch64::zsub3:
+      SubRegIdx = 3;
+      break;
+    default:
+      continue;
+    }
+
+    SmallVector<MCRegUnit> RegUnits;
+    for (MCRegUnit Unit : TRI->regunits(FirstLoadPhysReg))
+      RegUnits.push_back(Unit);
+
+    // Find the contiguous ZPRMul register which starts with the
+    // same register unit as the strided register and add to Hints.
+    Register StartReg = RegUnits[SubRegIdx];
+    for (unsigned I = 0; I < Order.size(); ++I) {
+      Register Reg = *TRI->regunits(Order[I]).begin();
+      if (Reg == StartReg)
+        Hints.push_back(Order[I]);
+    }
+  }
+
+  return DefaultHints;
+}
+
 unsigned AArch64RegisterInfo::getLocalAddressRegister(
     const MachineFunction &MF) const {
   const auto &MFI = MF.getFrameInfo();
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 5c8a5e029584fc..11da624af4881b 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -134,6 +134,11 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                                MachineFunction &MF) const override;
 
+  bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
+                             SmallVectorImpl<MCPhysReg> &Hints,
+                             const MachineFunction &MF, const VirtRegMap *VRM,
+                             const LiveRegMatrix *Matrix) const override;
+
   unsigned getLocalAddressRegister(const MachineFunction &MF) const;
 
   bool regNeedsCFI(unsigned Reg, unsigned &RegToUseForCFI) const;
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
index eddff238ace031..ef569e480ea3d6 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll
@@ -590,12 +590,8 @@ define void @udot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: mov w8, wzr
 ; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
 ; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -622,26 +618,10 @@ define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: add x10, x9, x1
 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -752,12 +732,8 @@ define void @usdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: mov w8, wzr
 ; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
 ; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -784,26 +760,10 @@ define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: add x10, x9, x1
 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -916,12 +876,8 @@ define void @sdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: mov w8, wzr
 ; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
 ; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -948,26 +904,10 @@ define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: add x10, x9, x1
 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -1080,12 +1020,8 @@ define void @sudot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: mov w8, wzr
 ; CHECK-NEXT: ld1b { z16.b, z24.b }, pn8/z, [x0]
 ; CHECK-NEXT: ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -1112,26 +1048,10 @@ define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: add x10, x9, x1
 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
index dec2dfb6f687ca..49106e12378bea 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll
@@ -82,12 +82,8 @@ define void @svdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: mov w8, wzr
 ; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
 ; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -114,26 +110,10 @@ define void @svdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: add x10, x9, x1
 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: svdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -218,12 +198,8 @@ define void @uvdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: mov w8, wzr
 ; CHECK-NEXT: ld1h { z16.h, z24.h }, pn8/z, [x0]
 ; CHECK-NEXT: ld1h { z17.h, z25.h }, pn8/z, [x9]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z0.h, z1.h }, z0.h[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z16.h, z17.h }, z0.h[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx2], { z24.h, z25.h }, z0.h[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -250,26 +226,10 @@ define void @uvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: add x10, x9, x1
 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: uvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -331,26 +291,10 @@ define void @suvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: add x10, x9, x1
 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: suvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
@@ -412,26 +356,10 @@ define void @usvdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
 ; CHECK-NEXT: add x10, x9, x1
 ; CHECK-NEXT: ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
 ; CHECK-NEXT: ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
-; CHECK-NEXT: mov z0.d, z16.d
-; CHECK-NEXT: mov z1.d, z17.d
-; CHECK-NEXT: mov z2.d, z18.d
-; CHECK-NEXT: mov z3.d, z19.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z20.d
-; CHECK-NEXT: mov z1.d, z21.d
-; CHECK-NEXT: mov z2.d, z22.d
-; CHECK-NEXT: mov z3.d, z23.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z24.d
-; CHECK-NEXT: mov z1.d, z25.d
-; CHECK-NEXT: mov z2.d, z26.d
-; CHECK-NEXT: mov z3.d, z27.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
-; CHECK-NEXT: mov z0.d, z28.d
-; CHECK-NEXT: mov z1.d, z29.d
-; CHECK-NEXT: mov z2.d, z30.d
-; CHECK-NEXT: mov z3.d, z31.d
-; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
+; CHECK-NEXT: usvdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()