Don't over-optimize the abi layout #93405

Urgau · 2022-01-28T01:39:17Z

This PR removes the aggregate "optimization" applied to small (<= ptr-size * 2) arguments and return values, because it causes several problems:

It blocks auto-vectorization (especially with floats); see "Basic vectorization performance regression from 1.48.0 onwards" #85265 (comment).
This adds unnecessary steps for packing (in the function) and unpacking (at the caller) the return arguments.

Specifically, the behavior before this PR was to aggregate every type (struct, array) that was passed or returned and was small enough (<= ptr-size * 2) into a single integer, no matter the underlying type. This means, for example, that [f32; 3] (which is a very common representation for a vector in 3 dimensions) would be represented as a single i96 in the LLVM IR, which would need to be packed to be returned and then unpacked at the caller to be used. #91447 (comment)

I expect this change to produce both compile-time improvements and regressions: the improvements will probably be due to the removal of the packing and unpacking machinery, and the regressions will most certainly be caused by the extra optimizations, like auto-vectorization, that LLVM will be able to perform. Overall, I think this change is worthwhile, as it fixes some regressions and improves the generated assembly.

Examples (Rust)

/// Example aggregate made of three `f32` fields (all-float case).
pub struct Stats
{
    x: f32,
    y: f32,
    z: f32
}

/// Returns a new `Stats` whose fields are the field-wise sums of `a` and `b`.
///
/// The operands are taken by reference; only the return value is a small
/// aggregate returned by value.
pub fn sum(a: &Stats, b: &Stats) -> Stats
{
    Stats {
        x: a.x + b.x,
        y: a.y + b.y,
        z: a.z + b.z
    }
}

/// Example aggregate mixing two `u32` fields with one `f32` field.
pub struct Stats2
{
    x: u32,
    y: u32,
    z: f32
}

/// Returns a new `Stats2` whose fields are the field-wise sums of `a` and `b`
/// (integer adds for `x`/`y`, float add for `z`).
pub fn sum2(a: &Stats2, b: &Stats2) -> Stats2
{
    Stats2 {
        x: a.x + b.x,
        y: a.y + b.y,
        z: a.z + b.z
    }
}

/// Example aggregate with one `u64`, one `u32` and one `f32` field.
pub struct Stats3
{
    x: u64,
    y: u32,
    z: f32
}

/// Returns a new `Stats3` whose fields are the field-wise sums of `a` and `b`.
pub fn sum3(a: &Stats3, b: &Stats3) -> Stats3
{
    Stats3 {
        x: a.x + b.x,
        y: a.y + b.y,
        z: a.z + b.z
    }
}

/// Example aggregate made of three `u16` fields (small all-integer case).
pub struct Stats4
{
    x: u16,
    y: u16,
    z: u16
}

/// Returns a new `Stats4` whose fields are the field-wise sums of `a` and `b`.
pub fn sum4(a: &Stats4, b: &Stats4) -> Stats4
{
    Stats4 {
        x: a.x + b.x,
        y: a.y + b.y,
        z: a.z + b.z
    }
}

/// Example aggregate made of nine `u16` fields.
// NOTE(review): presumably chosen to exceed the `ptr-size * 2` threshold
// discussed in the PR text on a 64-bit target — confirm.
pub struct Stats5
{
    x: u16,
    y: u16,
    z: u16,
    a: u16,
    b: u16,
    c: u16,
    d: u16,
    e: u16,
    f: u16,
}

/// Returns a new `Stats5` whose nine fields are the field-wise sums of `a`
/// and `b`.
pub fn sum5(a: &Stats5, b: &Stats5) -> Stats5
{
    Stats5 {
        x: a.x + b.x,
        y: a.y + b.y,
        z: a.z + b.z,
        a: a.a + b.a,
        b: a.b + b.b,
        c: a.c + b.c,
        d: a.d + b.d,
        e: a.e + b.e,
        f: a.f + b.f,
    }
}

/// Example aggregate with heterogeneous field widths: `u64`, `bool`, `u8`
/// and `u32`.
pub struct Uuu {
    a: u64,
    b: bool,
    c: u8,
    d: u32,
}

/// Returns a new `Uuu` combining `a` and `b`.
///
/// The numeric fields `a`, `c` and `d` are summed field-wise; the `bool`
/// field `b` is copied from the first argument only.
pub fn sum6(a: &Uuu, b: &Uuu) -> Uuu {
    Uuu {
        a: a.a + b.a,
        // Not combined with `b.b`: the flag is taken from `a` as-is.
        b: a.b,
        c: a.c + b.c,
        d: a.d + b.d,
    }
}

/// Element-wise sum of two `[f32; 4]` arrays passed by value, written as an
/// explicit array literal with one addition per element.
pub fn case_1(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [
        a[0] + b[0],
        a[1] + b[1],
        a[2] + b[2],
        a[3] + b[3],
    ]
}

/// Element-wise sum of two `[f32; 4]` arrays passed by value, written as an
/// indexed loop that fills a zero-initialised result array.
///
/// Computes the same values as `case_1`; only the source form differs.
pub fn case_2(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    let mut c = [0.0; 4];
    for i in 0..4 {
        c[i] = a[i] + b[i];
    }
    c
}

/// Returns a copy of the two-byte array behind `a`, produced with `clone()`.
///
/// The comments inside the body look like LLVM FileCheck directives: they
/// expect no per-byte loads, extensions or shifts, and a single 16-bit load
/// feeding a 16-bit return.
// NOTE(review): those expectations match the "Current nightly" listing in
// this page (one `word ptr` load), not the "This PR" listing (two byte
// loads) — confirm which version of the test this snippet was taken from.
#[no_mangle]
pub fn array_clone(a: &[u8; 2]) -> [u8; 2] {
    // CHECK-NOT: getelementptr
    // CHECK-NOT: load i8
    // CHECK-NOT: zext
    // CHECK-NOT: shl
    // CHECK: load i16
    // CHECK-NEXT: ret i16
    a.clone()
}

/// Returns a copy of the sixteen-byte array behind `a`, produced with
/// `clone()`. Exported unmangled via `#[no_mangle]`.
#[no_mangle]
pub fn array_clone_big(a: &[u8; 16]) -> [u8; 16] {
    a.clone()
}

/// Returns `true` when the two `[u16; 8]` arrays, both passed by value,
/// compare equal with `==`.
#[no_mangle]
pub fn array_eq_value(a: [u16; 8], b: [u16; 8]) -> bool {
    a == b
}

/// Returns `true` when all four bytes of `data` are zero.
///
/// The array is first viewed as a slice (`&data[..]`) and then compared
/// against the literal `[0; 4]`.
#[no_mangle]
pub fn is_zero_slice_short(data: &[u8; 4]) -> bool {
    &data[..] == [0; 4]
}

/// Example aggregate made of four `u8` fields; used by
/// `small_struct_alignment`.
pub struct Bytes {
  a: u8,
  b: u8,
  c: u8,
  d: u8,
}

// CHECK-LABEL: small_array_alignment
/// Overwrites the `[i8; 4]` behind `x` with `y`, which is passed by value.
#[no_mangle]
pub fn small_array_alignment(x: &mut [i8; 4], y: [i8; 4]) {
    *x = y;
}

// CHECK-LABEL: small_struct_alignment
/// Overwrites the `Bytes` behind `x` with `y`, which is passed by value.
#[no_mangle]
pub fn small_struct_alignment(x: &mut Bytes, y: Bytes) {
    *x = y;
}

Examples (ASM - Current nightly)

	.text
	.intel_syntax noprefix
	.file	"a.a146b597-cgu.0"
	.section	.text._ZN1a3sum17hf9af3d6e76074e96E,"ax",@progbits
	.globl	_ZN1a3sum17hf9af3d6e76074e96E
	.p2align	4, 0x90
	.type	_ZN1a3sum17hf9af3d6e76074e96E,@function
_ZN1a3sum17hf9af3d6e76074e96E:
	.cfi_startproc
	vmovss	xmm0, dword ptr [rdi]
	vaddss	xmm0, xmm0, dword ptr [rsi]
	vmovsd	xmm1, qword ptr [rdi + 4]
	vmovsd	xmm2, qword ptr [rsi + 4]
	vaddps	xmm1, xmm1, xmm2
	vmovd	eax, xmm0
	vextractps	edx, xmm1, 1
	vextractps	ecx, xmm1, 0
	shl	rcx, 32
	or	rax, rcx
	ret
.Lfunc_end0:
	.size	_ZN1a3sum17hf9af3d6e76074e96E, .Lfunc_end0-_ZN1a3sum17hf9af3d6e76074e96E
	.cfi_endproc

	.section	.text._ZN1a4sum217h24c2e94f4c556471E,"ax",@progbits
	.globl	_ZN1a4sum217h24c2e94f4c556471E
	.p2align	4, 0x90
	.type	_ZN1a4sum217h24c2e94f4c556471E,@function
_ZN1a4sum217h24c2e94f4c556471E:
	.cfi_startproc
	mov	eax, dword ptr [rsi]
	mov	ecx, dword ptr [rsi + 4]
	add	eax, dword ptr [rdi]
	add	ecx, dword ptr [rdi + 4]
	vmovss	xmm0, dword ptr [rdi + 8]
	vaddss	xmm0, xmm0, dword ptr [rsi + 8]
	vmovd	edx, xmm0
	shl	rcx, 32
	or	rax, rcx
	ret
.Lfunc_end1:
	.size	_ZN1a4sum217h24c2e94f4c556471E, .Lfunc_end1-_ZN1a4sum217h24c2e94f4c556471E
	.cfi_endproc

	.section	.text._ZN1a4sum317hd5bc84ce8141d548E,"ax",@progbits
	.globl	_ZN1a4sum317hd5bc84ce8141d548E
	.p2align	4, 0x90
	.type	_ZN1a4sum317hd5bc84ce8141d548E,@function
_ZN1a4sum317hd5bc84ce8141d548E:
	.cfi_startproc
	mov	rax, qword ptr [rsi]
	add	rax, qword ptr [rdi]
	mov	edx, dword ptr [rsi + 8]
	add	edx, dword ptr [rdi + 8]
	vmovss	xmm0, dword ptr [rdi + 12]
	vaddss	xmm0, xmm0, dword ptr [rsi + 12]
	vmovd	ecx, xmm0
	shl	rcx, 32
	or	rdx, rcx
	ret
.Lfunc_end2:
	.size	_ZN1a4sum317hd5bc84ce8141d548E, .Lfunc_end2-_ZN1a4sum317hd5bc84ce8141d548E
	.cfi_endproc

	.section	.text._ZN1a4sum417h9691e4d029ee251dE,"ax",@progbits
	.globl	_ZN1a4sum417h9691e4d029ee251dE
	.p2align	4, 0x90
	.type	_ZN1a4sum417h9691e4d029ee251dE,@function
_ZN1a4sum417h9691e4d029ee251dE:
	.cfi_startproc
	movzx	eax, word ptr [rsi]
	add	ax, word ptr [rdi]
	vmovd	xmm0, dword ptr [rdi + 2]
	vmovd	xmm1, dword ptr [rsi + 2]
	vpaddw	xmm0, xmm1, xmm0
	vpmovzxwq	xmm0, xmm0
	vpsllq	xmm1, xmm0, 16
	vpxor	xmm2, xmm2, xmm2
	vpunpckhdq	xmm0, xmm2, xmm0
	vpor	xmm0, xmm0, xmm1
	vmovq	rcx, xmm0
	movzx	eax, ax
	or	rax, rcx
	ret
.Lfunc_end3:
	.size	_ZN1a4sum417h9691e4d029ee251dE, .Lfunc_end3-_ZN1a4sum417h9691e4d029ee251dE
	.cfi_endproc

	.section	.text._ZN1a4sum517hb405ed71ffe589ceE,"ax",@progbits
	.globl	_ZN1a4sum517hb405ed71ffe589ceE
	.p2align	4, 0x90
	.type	_ZN1a4sum517hb405ed71ffe589ceE,@function
_ZN1a4sum517hb405ed71ffe589ceE:
	.cfi_startproc
	mov	rax, rdi
	vmovdqu	xmm0, xmmword ptr [rdx]
	vpaddw	xmm0, xmm0, xmmword ptr [rsi]
	movzx	ecx, word ptr [rdx + 16]
	add	cx, word ptr [rsi + 16]
	vmovdqu	xmmword ptr [rdi], xmm0
	mov	word ptr [rdi + 16], cx
	ret
.Lfunc_end4:
	.size	_ZN1a4sum517hb405ed71ffe589ceE, .Lfunc_end4-_ZN1a4sum517hb405ed71ffe589ceE
	.cfi_endproc

	.section	.text._ZN1a4sum617hc70e45116b4c4cb5E,"ax",@progbits
	.globl	_ZN1a4sum617hc70e45116b4c4cb5E
	.p2align	4, 0x90
	.type	_ZN1a4sum617hc70e45116b4c4cb5E,@function
_ZN1a4sum617hc70e45116b4c4cb5E:
	.cfi_startproc
	mov	rax, qword ptr [rsi]
	add	rax, qword ptr [rdi]
	movzx	ecx, byte ptr [rdi + 12]
	mov	r8b, byte ptr [rsi + 13]
	add	r8b, byte ptr [rdi + 13]
	mov	edx, dword ptr [rsi + 8]
	add	edx, dword ptr [rdi + 8]
	movzx	esi, r8b
	shl	rsi, 40
	shl	rcx, 32
	or	rcx, rsi
	or	rdx, rcx
	ret
.Lfunc_end5:
	.size	_ZN1a4sum617hc70e45116b4c4cb5E, .Lfunc_end5-_ZN1a4sum617hc70e45116b4c4cb5E
	.cfi_endproc

	.section	.text._ZN1a6case_117hafdcab5a8df01687E,"ax",@progbits
	.globl	_ZN1a6case_117hafdcab5a8df01687E
	.p2align	4, 0x90
	.type	_ZN1a6case_117hafdcab5a8df01687E,@function
_ZN1a6case_117hafdcab5a8df01687E:
	.cfi_startproc
	mov	rax, rsi
	shld	rax, rdi, 32
	vmovd	xmm0, esi
	shr	rsi, 32
	vmovq	xmm1, rax
	vmovq	xmm2, rsi
	vpunpckldq	xmm1, xmm2, xmm1
	vmovd	xmm2, ecx
	vaddss	xmm0, xmm0, xmm2
	vmovd	xmm2, edx
	shrd	rdx, rcx, 32
	shr	rcx, 32
	vmovq	xmm3, rdx
	vmovq	xmm4, rcx
	vpunpckldq	xmm3, xmm4, xmm3
	vmovd	xmm4, edi
	vaddps	xmm1, xmm1, xmm3
	vaddps	xmm2, xmm2, xmm4
	vextractps	eax, xmm2, 0
	vmovd	ecx, xmm0
	vextractps	edx, xmm1, 0
	vextractps	esi, xmm1, 1
	shl	rsi, 32
	shl	rdx, 32
	or	rdx, rcx
	or	rax, rsi
	ret
.Lfunc_end6:
	.size	_ZN1a6case_117hafdcab5a8df01687E, .Lfunc_end6-_ZN1a6case_117hafdcab5a8df01687E
	.cfi_endproc

	.section	.text._ZN1a6case_217h872d9cdc533ae429E,"ax",@progbits
	.globl	_ZN1a6case_217h872d9cdc533ae429E
	.p2align	4, 0x90
	.type	_ZN1a6case_217h872d9cdc533ae429E,@function
_ZN1a6case_217h872d9cdc533ae429E:
	.cfi_startproc
	mov	rax, rsi
	shld	rax, rdi, 32
	vmovd	xmm0, esi
	shr	rsi, 32
	vmovq	xmm1, rax
	vmovq	xmm2, rsi
	vpunpckldq	xmm1, xmm2, xmm1
	vmovd	xmm2, ecx
	vaddss	xmm0, xmm0, xmm2
	vmovd	xmm2, edx
	shrd	rdx, rcx, 32
	shr	rcx, 32
	vmovq	xmm3, rdx
	vmovq	xmm4, rcx
	vpunpckldq	xmm3, xmm4, xmm3
	vmovd	xmm4, edi
	vaddps	xmm1, xmm1, xmm3
	vaddps	xmm2, xmm2, xmm4
	vextractps	eax, xmm2, 0
	vmovd	ecx, xmm0
	vextractps	edx, xmm1, 0
	vextractps	esi, xmm1, 1
	shl	rsi, 32
	shl	rdx, 32
	or	rdx, rcx
	or	rax, rsi
	ret
.Lfunc_end7:
	.size	_ZN1a6case_217h872d9cdc533ae429E, .Lfunc_end7-_ZN1a6case_217h872d9cdc533ae429E
	.cfi_endproc

	.section	.text.array_clone,"ax",@progbits
	.globl	array_clone
	.p2align	4, 0x90
	.type	array_clone,@function
array_clone:
	.cfi_startproc
	movzx	eax, word ptr [rdi]
	ret
.Lfunc_end8:
	.size	array_clone, .Lfunc_end8-array_clone
	.cfi_endproc

	.section	.text.array_clone_big,"ax",@progbits
	.globl	array_clone_big
	.p2align	4, 0x90
	.type	array_clone_big,@function
array_clone_big:
	.cfi_startproc
	mov	rax, qword ptr [rdi]
	mov	rdx, qword ptr [rdi + 8]
	ret
.Lfunc_end9:
	.size	array_clone_big, .Lfunc_end9-array_clone_big
	.cfi_endproc

	.section	.text.array_eq_value,"ax",@progbits
	.globl	array_eq_value
	.p2align	4, 0x90
	.type	array_eq_value,@function
array_eq_value:
	.cfi_startproc
	xor	rsi, rcx
	xor	rdi, rdx
	or	rdi, rsi
	sete	al
	ret
.Lfunc_end10:
	.size	array_eq_value, .Lfunc_end10-array_eq_value
	.cfi_endproc

	.section	.text.is_zero_slice_short,"ax",@progbits
	.globl	is_zero_slice_short
	.p2align	4, 0x90
	.type	is_zero_slice_short,@function
is_zero_slice_short:
	.cfi_startproc
	cmp	dword ptr [rdi], 0
	sete	al
	ret
.Lfunc_end11:
	.size	is_zero_slice_short, .Lfunc_end11-is_zero_slice_short
	.cfi_endproc

	.section	.text.small_array_alignment,"ax",@progbits
	.globl	small_array_alignment
	.p2align	4, 0x90
	.type	small_array_alignment,@function
small_array_alignment:
	.cfi_startproc
	mov	dword ptr [rdi], esi
	ret
.Lfunc_end12:
	.size	small_array_alignment, .Lfunc_end12-small_array_alignment
	.cfi_endproc

	.type	.Lalloc41,@object
	.section	.rodata.cst4,"aM",@progbits,4
.Lalloc41:
	.zero	4
	.size	.Lalloc41, 4

	.globl	small_struct_alignment
	.type	small_struct_alignment,@function
.set small_struct_alignment, small_array_alignment
	.section	".note.GNU-stack","",@progbits

Examples (ASM - This PR)

	.text
	.intel_syntax noprefix
	.file	"a.a9ba02be-cgu.0"
	.section	.text._ZN1a3sum17h8c043182f8b901bbE,"ax",@progbits
	.globl	_ZN1a3sum17h8c043182f8b901bbE
	.p2align	4, 0x90
	.type	_ZN1a3sum17h8c043182f8b901bbE,@function
_ZN1a3sum17h8c043182f8b901bbE:
	.cfi_startproc
	sub	rsp, 4
	.cfi_def_cfa_offset 12
	vmovss	xmm0, dword ptr [rdi]
	vaddss	xmm0, xmm0, dword ptr [rsi]
	vmovss	xmm1, dword ptr [rdi + 4]
	vaddss	xmm1, xmm1, dword ptr [rsi + 4]
	vmovss	xmm2, dword ptr [rdi + 8]
	vaddss	xmm2, xmm2, dword ptr [rsi + 8]
	vmovss	dword ptr [rsp], xmm2
	fld	dword ptr [rsp]
	add	rsp, 4
	.cfi_def_cfa_offset 8
	ret
.Lfunc_end0:
	.size	_ZN1a3sum17h8c043182f8b901bbE, .Lfunc_end0-_ZN1a3sum17h8c043182f8b901bbE
	.cfi_endproc

	.section	.text._ZN1a4sum217he622ac4d5666af14E,"ax",@progbits
	.globl	_ZN1a4sum217he622ac4d5666af14E
	.p2align	4, 0x90
	.type	_ZN1a4sum217he622ac4d5666af14E,@function
_ZN1a4sum217he622ac4d5666af14E:
	.cfi_startproc
	mov	eax, dword ptr [rsi]
	mov	edx, dword ptr [rsi + 4]
	add	eax, dword ptr [rdi]
	add	edx, dword ptr [rdi + 4]
	vmovss	xmm0, dword ptr [rdi + 8]
	vaddss	xmm0, xmm0, dword ptr [rsi + 8]
	ret
.Lfunc_end1:
	.size	_ZN1a4sum217he622ac4d5666af14E, .Lfunc_end1-_ZN1a4sum217he622ac4d5666af14E
	.cfi_endproc

	.section	.text._ZN1a4sum317h04cf9869dc54377cE,"ax",@progbits
	.globl	_ZN1a4sum317h04cf9869dc54377cE
	.p2align	4, 0x90
	.type	_ZN1a4sum317h04cf9869dc54377cE,@function
_ZN1a4sum317h04cf9869dc54377cE:
	.cfi_startproc
	mov	r8, qword ptr [rdx]
	add	r8, qword ptr [rsi]
	mov	ecx, dword ptr [rdx + 8]
	add	ecx, dword ptr [rsi + 8]
	vmovss	xmm0, dword ptr [rsi + 12]
	vaddss	xmm0, xmm0, dword ptr [rdx + 12]
	mov	rax, rdi
	mov	qword ptr [rdi], r8
	mov	dword ptr [rdi + 8], ecx
	vmovss	dword ptr [rdi + 12], xmm0
	ret
.Lfunc_end2:
	.size	_ZN1a4sum317h04cf9869dc54377cE, .Lfunc_end2-_ZN1a4sum317h04cf9869dc54377cE
	.cfi_endproc

	.section	.text._ZN1a4sum417h67d9515994751a4fE,"ax",@progbits
	.globl	_ZN1a4sum417h67d9515994751a4fE
	.p2align	4, 0x90
	.type	_ZN1a4sum417h67d9515994751a4fE,@function
_ZN1a4sum417h67d9515994751a4fE:
	.cfi_startproc
	movzx	eax, word ptr [rsi]
	movzx	edx, word ptr [rsi + 2]
	add	ax, word ptr [rdi]
	add	dx, word ptr [rdi + 2]
	movzx	ecx, word ptr [rsi + 4]
	add	cx, word ptr [rdi + 4]
	ret
.Lfunc_end3:
	.size	_ZN1a4sum417h67d9515994751a4fE, .Lfunc_end3-_ZN1a4sum417h67d9515994751a4fE
	.cfi_endproc

	.section	.text._ZN1a4sum517hc3b00dfa19b7572aE,"ax",@progbits
	.globl	_ZN1a4sum517hc3b00dfa19b7572aE
	.p2align	4, 0x90
	.type	_ZN1a4sum517hc3b00dfa19b7572aE,@function
_ZN1a4sum517hc3b00dfa19b7572aE:
	.cfi_startproc
	mov	rax, rdi
	vmovdqu	xmm0, xmmword ptr [rdx]
	vpaddw	xmm0, xmm0, xmmword ptr [rsi]
	movzx	ecx, word ptr [rdx + 16]
	add	cx, word ptr [rsi + 16]
	vmovdqu	xmmword ptr [rdi], xmm0
	mov	word ptr [rdi + 16], cx
	ret
.Lfunc_end4:
	.size	_ZN1a4sum517hc3b00dfa19b7572aE, .Lfunc_end4-_ZN1a4sum517hc3b00dfa19b7572aE
	.cfi_endproc

	.section	.text._ZN1a4sum617h761310a682ddc851E,"ax",@progbits
	.globl	_ZN1a4sum617h761310a682ddc851E
	.p2align	4, 0x90
	.type	_ZN1a4sum617h761310a682ddc851E,@function
_ZN1a4sum617h761310a682ddc851E:
	.cfi_startproc
	mov	r8, qword ptr [rdx]
	add	r8, qword ptr [rsi]
	mov	rax, rdi
	mov	cl, byte ptr [rdx + 13]
	add	cl, byte ptr [rsi + 13]
	mov	dil, byte ptr [rsi + 12]
	mov	edx, dword ptr [rdx + 8]
	add	edx, dword ptr [rsi + 8]
	mov	qword ptr [rax], r8
	mov	byte ptr [rax + 12], dil
	mov	byte ptr [rax + 13], cl
	mov	dword ptr [rax + 8], edx
	ret
.Lfunc_end5:
	.size	_ZN1a4sum617h761310a682ddc851E, .Lfunc_end5-_ZN1a4sum617h761310a682ddc851E
	.cfi_endproc

	.section	.text._ZN1a6case_117ha530dd71df48c3daE,"ax",@progbits
	.globl	_ZN1a6case_117ha530dd71df48c3daE
	.p2align	4, 0x90
	.type	_ZN1a6case_117ha530dd71df48c3daE,@function
_ZN1a6case_117ha530dd71df48c3daE:
	.cfi_startproc
	mov	rax, rdi
	vmovups	xmm0, xmmword ptr [rsi]
	vaddps	xmm0, xmm0, xmmword ptr [rdx]
	vmovups	xmmword ptr [rdi], xmm0
	ret
.Lfunc_end6:
	.size	_ZN1a6case_117ha530dd71df48c3daE, .Lfunc_end6-_ZN1a6case_117ha530dd71df48c3daE
	.cfi_endproc

	.section	.text._ZN1a6case_217hb59f9b1a3cb611cbE,"ax",@progbits
	.globl	_ZN1a6case_217hb59f9b1a3cb611cbE
	.p2align	4, 0x90
	.type	_ZN1a6case_217hb59f9b1a3cb611cbE,@function
_ZN1a6case_217hb59f9b1a3cb611cbE:
	.cfi_startproc
	mov	rax, rdi
	vmovups	xmm0, xmmword ptr [rsi]
	vaddps	xmm0, xmm0, xmmword ptr [rdx]
	vmovups	xmmword ptr [rdi], xmm0
	ret
.Lfunc_end7:
	.size	_ZN1a6case_217hb59f9b1a3cb611cbE, .Lfunc_end7-_ZN1a6case_217hb59f9b1a3cb611cbE
	.cfi_endproc

	.section	.text.array_clone,"ax",@progbits
	.globl	array_clone
	.p2align	4, 0x90
	.type	array_clone,@function
array_clone:
	.cfi_startproc
	mov	al, byte ptr [rdi]
	mov	dl, byte ptr [rdi + 1]
	ret
.Lfunc_end8:
	.size	array_clone, .Lfunc_end8-array_clone
	.cfi_endproc

	.section	.text.array_clone_big,"ax",@progbits
	.globl	array_clone_big
	.p2align	4, 0x90
	.type	array_clone_big,@function
array_clone_big:
	.cfi_startproc
	mov	rax, rdi
	vmovups	xmm0, xmmword ptr [rsi]
	vmovups	xmmword ptr [rdi], xmm0
	ret
.Lfunc_end9:
	.size	array_clone_big, .Lfunc_end9-array_clone_big
	.cfi_endproc

	.section	.text.array_eq_value,"ax",@progbits
	.globl	array_eq_value
	.p2align	4, 0x90
	.type	array_eq_value,@function
array_eq_value:
	.cfi_startproc
	vmovdqu	xmm0, xmmword ptr [rdi]
	vpxor	xmm0, xmm0, xmmword ptr [rsi]
	vptest	xmm0, xmm0
	sete	al
	ret
.Lfunc_end10:
	.size	array_eq_value, .Lfunc_end10-array_eq_value
	.cfi_endproc

	.section	.text.is_zero_slice_short,"ax",@progbits
	.globl	is_zero_slice_short
	.p2align	4, 0x90
	.type	is_zero_slice_short,@function
is_zero_slice_short:
	.cfi_startproc
	cmp	dword ptr [rdi], 0
	sete	al
	ret
.Lfunc_end11:
	.size	is_zero_slice_short, .Lfunc_end11-is_zero_slice_short
	.cfi_endproc

	.section	.text.small_array_alignment,"ax",@progbits
	.globl	small_array_alignment
	.p2align	4, 0x90
	.type	small_array_alignment,@function
small_array_alignment:
	.cfi_startproc
	mov	byte ptr [rdi], sil
	mov	byte ptr [rdi + 1], dl
	mov	byte ptr [rdi + 2], cl
	mov	byte ptr [rdi + 3], r8b
	ret
.Lfunc_end12:
	.size	small_array_alignment, .Lfunc_end12-small_array_alignment
	.cfi_endproc

	.section	.text.small_struct_alignment,"ax",@progbits
	.globl	small_struct_alignment
	.p2align	4, 0x90
	.type	small_struct_alignment,@function
small_struct_alignment:
	.cfi_startproc
	mov	byte ptr [rdi], sil
	mov	byte ptr [rdi + 1], dl
	mov	byte ptr [rdi + 2], cl
	mov	byte ptr [rdi + 3], r8b
	ret
.Lfunc_end13:
	.size	small_struct_alignment, .Lfunc_end13-small_struct_alignment
	.cfi_endproc

	.type	.Lalloc41,@object
	.section	.rodata.cst4,"aM",@progbits,4
.Lalloc41:
	.zero	4
	.size	.Lalloc41, 4

	.section	".note.GNU-stack","",@progbits

Reverts #85828 (no longer needed, and it blocks auto-vectorization if left in place)
Fixes #85265
Fixes #91447

cc @scottmcm

rust-highfive · 2022-01-28T01:39:20Z

Some changes occurred to the CTFE / Miri engine

cc @rust-lang/miri

rust-highfive · 2022-01-28T01:39:22Z

r? @davidtwco

(rust-highfive has picked a reviewer for you, use r? to override)

bors · 2022-01-29T19:45:21Z

☔ The latest upstream changes (presumably #93457) made this pull request unmergeable. Please resolve the merge conflicts.

This reverts commit 2456495.

davidtwco · 2022-01-31T03:53:03Z

I'll trigger a perf run so that we can see the performance impact of this. Implementation looks reasonable to me, but I'm not sure I'm the right person to decide to land this.

@bors try @rust-timer queue

r? rust-lang/compiler

rust-timer · 2022-01-31T03:53:05Z

Awaiting bors try build completion.

@rustbot label: +S-waiting-on-perf

bors · 2022-01-31T03:53:11Z

⌛ Trying commit 803e19c6ebc647d4e600967c255fccea838bce9f with merge 237000707ffe286d58aa8f08d523cf9c4e9928f3...

bors · 2022-01-31T05:12:22Z

☀️ Try build successful - checks-actions
Build commit: 237000707ffe286d58aa8f08d523cf9c4e9928f3 (237000707ffe286d58aa8f08d523cf9c4e9928f3)

rust-timer · 2022-01-31T05:12:23Z

Queued 237000707ffe286d58aa8f08d523cf9c4e9928f3 with parent e58e7b1, future comparison URL.

rust-timer · 2022-01-31T12:22:56Z

Finished benchmarking commit (237000707ffe286d58aa8f08d523cf9c4e9928f3): comparison url.

Summary: This benchmark run shows 26 relevant improvements 🎉 but 390 relevant regressions 😿 to instruction counts.

Average relevant regression: 2.0%
Average relevant improvement: -2.5%
Largest improvement in instruction counts: -7.1% on full builds of deeply-nested-async check
Largest regression in instruction counts: 20.3% on incr-patched: u8 3072 builds of issue-46449 debug

If you disagree with this performance assessment, please file an issue in rust-lang/rustc-perf.

Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR led to changes in compiler perf.

Next Steps: If you can justify the regressions found in this try perf run, please indicate this with @rustbot label: +perf-regression-triaged along with sufficient written justification. If you cannot justify the regressions please fix the regressions and do another perf run. If the next run shows neutral or positive results, the label will be automatically removed.

@bors rollup=never
@rustbot label: +S-waiting-on-review -S-waiting-on-perf +perf-regression

the8472 · 2022-02-01T11:19:23Z

@bors try @rust-timer queue

rust-timer · 2022-02-01T11:19:25Z

Awaiting bors try build completion.

@rustbot label: +S-waiting-on-perf

bors · 2022-02-01T11:19:41Z

⌛ Trying commit 646350f with merge c3b1e20af9859a1275613895f5bfe5485b0d51f9...

bors · 2022-02-01T12:44:07Z

☀️ Try build successful - checks-actions
Build commit: c3b1e20af9859a1275613895f5bfe5485b0d51f9 (c3b1e20af9859a1275613895f5bfe5485b0d51f9)

rust-timer · 2022-02-01T12:44:09Z

Queued c3b1e20af9859a1275613895f5bfe5485b0d51f9 with parent 93e8201, future comparison URL.

rust-timer · 2022-02-01T17:24:14Z

Finished benchmarking commit (c3b1e20af9859a1275613895f5bfe5485b0d51f9): comparison url.

Summary: This benchmark run shows 9 relevant improvements 🎉 but 504 relevant regressions 😿 to instruction counts.

Average relevant regression: 3.7%
Average relevant improvement: -5.3%
Largest improvement in instruction counts: -7.3% on full builds of deeply-nested-async check
Largest regression in instruction counts: 25.7% on incr-patched: u8 3072 builds of issue-46449 debug

If you disagree with this performance assessment, please file an issue in rust-lang/rustc-perf.

Benchmarking this pull request likely means that it is perf-sensitive, so we're automatically marking it as not fit for rolling up. While you can manually mark this PR as fit for rollup, we strongly recommend not doing so since this PR led to changes in compiler perf.

Next Steps: If you can justify the regressions found in this try perf run, please indicate this with @rustbot label: +perf-regression-triaged along with sufficient written justification. If you cannot justify the regressions please fix the regressions and do another perf run. If the next run shows neutral or positive results, the label will be automatically removed.

@bors rollup=never
@rustbot label: +S-waiting-on-review -S-waiting-on-perf +perf-regression

Urgau · 2022-02-01T17:44:33Z

These perf regressions are way more than I anticipated. I don't think my changes are worth these massive regressions.
So I will close this PR for now. I have some other ideas to help mitigate these regressions but they will require changing the vast majority of the PR so I prefer to open a new PR for that.

rustbot added the T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. label Jan 28, 2022

rust-highfive assigned davidtwco Jan 28, 2022

rust-highfive added the S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. label Jan 28, 2022

This comment has been minimized.

Sign in to view

Urgau added 3 commits January 30, 2022 21:41

Do not over-optimize the abi layout

3aeb61b

Revert "Stop generating allocas+memcmp for simple array equality"

f97aded

This reverts commit 2456495.

Fix and improve codegen tests

780d37c

Urgau force-pushed the optimize-abi branch from b8b295a to 803e19c Compare January 30, 2022 20:59

Miksel12 mentioned this pull request Jan 30, 2022

Rust should pass vectors by vector register #93490

Closed

rust-highfive unassigned davidtwco Jan 31, 2022

rust-highfive assigned jackh726 Jan 31, 2022

rustbot added the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Jan 31, 2022

rustbot added perf-regression Performance regression. and removed S-waiting-on-perf Status: Waiting on a perf run to be completed. labels Jan 31, 2022

Cleanup of raw_eq in codegen cranelift, gcc and tests

7600c02

Urgau force-pushed the optimize-abi branch from 30018a2 to 7600c02 Compare January 31, 2022 15:54

Also put argument with ptr-size*2 on the stack

646350f

rustbot added the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Feb 1, 2022

rustbot removed the S-waiting-on-perf Status: Waiting on a perf run to be completed. label Feb 1, 2022

Urgau closed this Feb 1, 2022

Urgau mentioned this pull request Feb 2, 2022

Don't aggregate homogeneous floats in the Rust ABI #93564

Closed

Urgau deleted the optimize-abi branch May 5, 2023 16:47

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Don't over-optimize the abi layout #93405

Don't over-optimize the abi layout #93405

Urgau commented Jan 28, 2022 •

edited

Loading

rust-highfive commented Jan 28, 2022

rust-highfive commented Jan 28, 2022

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

bors commented Jan 29, 2022

davidtwco commented Jan 31, 2022

rust-timer commented Jan 31, 2022

bors commented Jan 31, 2022

bors commented Jan 31, 2022

rust-timer commented Jan 31, 2022

rust-timer commented Jan 31, 2022

the8472 commented Feb 1, 2022

rust-timer commented Feb 1, 2022

bors commented Feb 1, 2022

bors commented Feb 1, 2022

rust-timer commented Feb 1, 2022

rust-timer commented Feb 1, 2022

Urgau commented Feb 1, 2022

Don't over-optimize the abi layout #93405

Don't over-optimize the abi layout #93405

Conversation

Urgau commented Jan 28, 2022 • edited Loading

rust-highfive commented Jan 28, 2022

rust-highfive commented Jan 28, 2022

This comment has been minimized.

This comment has been minimized.

This comment has been minimized.

bors commented Jan 29, 2022

davidtwco commented Jan 31, 2022

rust-timer commented Jan 31, 2022

bors commented Jan 31, 2022

bors commented Jan 31, 2022

rust-timer commented Jan 31, 2022

rust-timer commented Jan 31, 2022

the8472 commented Feb 1, 2022

rust-timer commented Feb 1, 2022

bors commented Feb 1, 2022

bors commented Feb 1, 2022

rust-timer commented Feb 1, 2022

rust-timer commented Feb 1, 2022

Urgau commented Feb 1, 2022

Urgau commented Jan 28, 2022 •

edited

Loading