diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 0a2db3450e579..66a4b539d9da5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -27884,6 +27884,65 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 //                           X86 Optimization Hooks
 //===----------------------------------------------------------------------===//
 
+bool
+X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
+                                                const APInt &Demanded,
+                                                TargetLoweringOpt &TLO) const {
+  // Only optimize Ands to prevent shrinking a constant that could be
+  // matched by movzx.
+  if (Op.getOpcode() != ISD::AND)
+    return false;
+
+  EVT VT = Op.getValueType();
+
+  // Ignore vectors.
+  if (VT.isVector())
+    return false;
+
+  unsigned Size = VT.getSizeInBits();
+
+  // Make sure the RHS really is a constant.
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!C)
+    return false;
+
+  const APInt &Mask = C->getAPIntValue();
+
+  // Clear all non-demanded bits initially.
+  APInt ShrunkMask = Mask & Demanded;
+
+  // Find the width of the shrunk mask.
+  unsigned Width = ShrunkMask.getActiveBits();
+
+  // If the mask is all 0s there's nothing to do here.
+  if (Width == 0)
+    return false;
+
+  // Find the next power of 2 width, rounding up to a byte.
+  Width = PowerOf2Ceil(std::max(Width, 8U));
+  // Truncate the width to size to handle illegal types.
+  Width = std::min(Width, Size);
+
+  // Calculate a possible zero extend mask for this constant.
+  APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
+
+  // If we aren't changing the mask, just return true to keep it and prevent
+  // the caller from optimizing.
+  if (ZeroExtendMask == Mask)
+    return true;
+
+  // Make sure the bits in the ZeroExtendMask are also set in the original
+  // mask.
+  // TODO: We should be able to set bits that aren't demanded too.
+  if (!ZeroExtendMask.isSubsetOf(Mask))
+    return false;
+
+  // Replace the constant with the zero extend mask.
+  SDLoc DL(Op);
+  SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
+  SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+  return TLO.CombineTo(Op, NewOp);
+}
+
 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                       KnownBits &Known,
                                                       const APInt &DemandedElts,
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 462c59ade1790..7c6d7ba1124ca 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -835,6 +835,9 @@ namespace llvm {
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
 
+    bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+                                      TargetLoweringOpt &TLO) const override;
+
     /// Determine which of the bits specified in Mask are known to be either
     /// zero or one and return them in the KnownZero/KnownOne bitsets.
     void computeKnownBitsForTargetNode(const SDValue Op,
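For readers following along, here is a minimal standalone sketch of the decision logic above, using `uint64_t` in place of `APInt` and the C++20 `<bit>` helpers in place of `PowerOf2Ceil`/`getActiveBits`. The name `pickZeroExtendMask` is hypothetical, not LLVM API. It returns the mask that should survive: the original mask (kept unshrunk so isel can still match movzx), a widened zero-extend mask, or `std::nullopt` when the hook should defer to the generic constant shrinking.

```cpp
#include <algorithm>
#include <bit>
#include <cstdint>
#include <optional>

// Sketch of targetShrinkDemandedConstant's mask selection (hypothetical
// helper, not LLVM API). Size is the bit width of the AND's result type.
std::optional<uint64_t> pickZeroExtendMask(uint64_t Mask, uint64_t Demanded,
                                           unsigned Size) {
  // Clear all non-demanded bits first, as the hook does.
  uint64_t ShrunkMask = Mask & Demanded;
  if (ShrunkMask == 0)
    return std::nullopt; // all-zero mask: nothing for this hook to do

  // Active width of the shrunk mask, rounded up to a power of two and to
  // at least a byte, then clamped to the type size for illegal types.
  unsigned Width = 64u - static_cast<unsigned>(std::countl_zero(ShrunkMask));
  Width = std::bit_ceil(std::max(Width, 8u));
  Width = std::min(Width, Size);

  // Candidate mask with the Width low bits set: exactly a zero extend.
  uint64_t ZeroExtendMask = Width >= 64 ? ~0ull : (1ull << Width) - 1;

  // Unchanged mask: keep it and stop the caller from shrinking it.
  if (ZeroExtendMask == Mask)
    return Mask;

  // Widening must not set a bit the original mask cleared.
  if ((ZeroExtendMask & ~Mask) != 0)
    return std::nullopt;

  return ZeroExtendMask;
}
```

For example, in test1 of zext-demanded.ll below, the promoted shift demands bits 1..15 of an AND with 0xffff: the shrunk mask 0xfffe has an active width of 16, the candidate 0xffff equals the original mask, and the hook keeps it, so isel selects movzwl where the generic shrinking used to produce `andl $65534`.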
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 06600a4ef2869..0582e041488f5 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -1514,6 +1514,10 @@ def : Pat<(i8 (trunc GR16:$src)),
           (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
       Requires<[In64BitMode]>;
 
+def immff00_ffff : ImmLeaf<i32, [{
+  return Imm >= 0xff00 && Imm <= 0xffff;
+}]>;
+
 // h-register tricks
 def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
           (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
@@ -1534,7 +1538,7 @@ def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
           (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
 def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
           (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
-def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+def : Pat<(srl (and_su GR32:$src, immff00_ffff), (i8 8)),
           (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
 
 // h-register tricks.
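A quick aside on why the relaxed immediate range is safe (a throwaway check, not part of the patch): every mask in [0xff00, 0xffff] has bits 15..8 all set, and the srl by 8 discards bits 7..0, so `(x & mask) >> 8` is always the zero-extended high byte that the MOVZX32_NOREXrr8-of-sub_8bit_hi pattern produces. This matters because the new hook can widen a 0xff00 mask to 0xffff, which the old literal-0xff00 pattern would no longer match.

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint32_t Mask = 0xff00; Mask <= 0xffff; ++Mask)
    for (uint32_t X : {0u, 0x12u, 0xabcdu, 0x12345678u, 0xffffffffu})
      // Both sides compute "zero-extend bits 15..8 of X".
      assert(((X & Mask) >> 8) == ((X >> 8) & 0xffu));
  return 0;
}
```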
diff --git a/test/CodeGen/X86/3addr-or.ll b/test/CodeGen/X86/3addr-or.ll
index a31c4f2cf602c..acae1d1c96baa 100644
--- a/test/CodeGen/X86/3addr-or.ll
+++ b/test/CodeGen/X86/3addr-or.ll
@@ -14,16 +14,18 @@ define i32 @test1(i32 %x) nounwind ssp {
   ret i32 %t1
 }
 
+; This test no longer requires or to be converted to 3 addr form because we
+; are able to use a zero extend instead of an 'and', which gives the register
+; allocator more freedom.
 define i64 @test2(i8 %A, i8 %B) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def %esi killed %esi def %rsi
 ; CHECK-NEXT:    # kill: def %edi killed %edi def %rdi
 ; CHECK-NEXT:    shll $4, %edi
 ; CHECK-NEXT:    andl $48, %edi
-; CHECK-NEXT:    andl $240, %esi
-; CHECK-NEXT:    shrq $4, %rsi
-; CHECK-NEXT:    leaq (%rsi,%rdi), %rax
+; CHECK-NEXT:    movzbl %sil, %eax
+; CHECK-NEXT:    shrq $4, %rax
+; CHECK-NEXT:    orq %rdi, %rax
 ; CHECK-NEXT:    retq
   %C = zext i8 %A to i64
   %D = shl i64 %C, 4
diff --git a/test/CodeGen/X86/popcnt.ll b/test/CodeGen/X86/popcnt.ll
index 95a835e651dcd..478ec1bcec86d 100644
--- a/test/CodeGen/X86/popcnt.ll
+++ b/test/CodeGen/X86/popcnt.ll
@@ -71,7 +71,6 @@ define i16 @cnt16(i16 %x) nounwind readnone {
 ; X32-NEXT:    andl $13107, %eax # imm = 0x3333
 ; X32-NEXT:    addl %ecx, %eax
 ; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    andl $-16, %ecx
 ; X32-NEXT:    shrl $4, %ecx
 ; X32-NEXT:    addl %eax, %ecx
 ; X32-NEXT:    andl $3855, %ecx # imm = 0xF0F
@@ -94,7 +93,6 @@ define i16 @cnt16(i16 %x) nounwind readnone {
 ; X64-NEXT:    andl $13107, %edi # imm = 0x3333
 ; X64-NEXT:    addl %eax, %edi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl $-16, %eax
 ; X64-NEXT:    shrl $4, %eax
 ; X64-NEXT:    addl %edi, %eax
 ; X64-NEXT:    andl $3855, %eax # imm = 0xF0F
diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll
index 1bb6ea6c59218..5c7d4317c0c38 100644
--- a/test/CodeGen/X86/pr21792.ll
+++ b/test/CodeGen/X86/pr21792.ll
@@ -12,19 +12,18 @@ define void @func(<4 x float> %vx) {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    pextrq $1, %xmm0, %rdx
-; CHECK-NEXT:    movq %rdx, %rcx
-; CHECK-NEXT:    shrq $32, %rcx
-; CHECK-NEXT:    movq %xmm0, %rax
-; CHECK-NEXT:    movq %rax, %r9
+; CHECK-NEXT:    pextrq $1, %xmm0, %rax
+; CHECK-NEXT:    movzwl %ax, %ecx
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movq %xmm0, %rdx
+; CHECK-NEXT:    movzwl %dx, %r8d
+; CHECK-NEXT:    movq %rdx, %r9
 ; CHECK-NEXT:    shrq $32, %r9
-; CHECK-NEXT:    andl $2032, %eax # imm = 0x7F0
-; CHECK-NEXT:    leaq stuff(%rax), %rdi
+; CHECK-NEXT:    leaq stuff(%r8), %rdi
 ; CHECK-NEXT:    leaq stuff(%r9), %rsi
-; CHECK-NEXT:    andl $2032, %edx # imm = 0x7F0
-; CHECK-NEXT:    leaq stuff(%rdx), %rdx
-; CHECK-NEXT:    leaq stuff(%rcx), %rcx
-; CHECK-NEXT:    leaq stuff+8(%rax), %r8
+; CHECK-NEXT:    leaq stuff(%rcx), %rdx
+; CHECK-NEXT:    leaq stuff(%rax), %rcx
+; CHECK-NEXT:    leaq stuff+8(%r8), %r8
 ; CHECK-NEXT:    leaq stuff+8(%r9), %r9
 ; CHECK-NEXT:    callq toto
 ; CHECK-NEXT:    popq %rax
diff --git a/test/CodeGen/X86/zext-demanded.ll b/test/CodeGen/X86/zext-demanded.ll
index e142c6d822143..b2a2252dcc9d6 100644
--- a/test/CodeGen/X86/zext-demanded.ll
+++ b/test/CodeGen/X86/zext-demanded.ll
@@ -5,25 +5,22 @@
 ; demanded bits shortcomings.
 
 ; The backend will insert a zext to promote the shift to i32.
-; TODO: we should be able to use movzx here.
 define i16 @test1(i16 %x) {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $65534, %edi # imm = 0xFFFE
-; CHECK-NEXT:    shrl %edi
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    # kill: def %ax killed %ax killed %eax
 ; CHECK-NEXT:    retq
   %y = lshr i16 %x, 1
   ret i16 %y
 }
 
-; TODO: we should be able to use movzx here.
 define i32 @test2(i32 %x) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $65534, %edi # imm = 0xFFFE
-; CHECK-NEXT:    shrl %edi
-; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movzwl %di, %eax
+; CHECK-NEXT:    shrl %eax
 ; CHECK-NEXT:    retq
   %y = and i32 %x, 65535
   %z = lshr i32 %y, 1
   ret i32 %z
 }
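The popcnt.ll change can be sanity-checked the same way. Dropping `andl $-16` ahead of `shrl $4` is sound because a logical right shift already discards the bits that the mask cleared. The snippet below asserts that identity and, under the assumption that it faithfully mirrors the expanded cnt16 bit-trick (the final reduction step is written with a shift rather than whatever exact sequence the backend emits), that the shortened sequence still counts bits correctly for every i16 value.

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t V = 0; V <= 0xffff; ++V) {
    // The deleted instruction: masking off the low 4 bits before a
    // logical right shift by 4 never changes the result.
    assert(((V & ~15u) >> 4) == (V >> 4));

    // The cnt16 expansion without the redundant mask: pairwise bit sums
    // at widths 2, 4, and 8, matching the 0x5555/0x3333/0xF0F constants
    // in the checked assembly.
    uint32_t T = V - ((V >> 1) & 0x5555);
    T = (T & 0x3333) + ((T >> 2) & 0x3333);
    T = ((T >> 4) + T) & 0x0f0f;
    T = (T + (T >> 8)) & 0xff;
    assert(T == static_cast<uint32_t>(std::popcount(V)));
  }
  return 0;
}
```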