Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unnecessary memcpy when using array initialization shorthand #56882

Closed
isegal opened this issue Dec 16, 2018 · 1 comment
Closed

Unnecessary memcpy when using array initialization shorthand #56882

isegal opened this issue Dec 16, 2018 · 1 comment

Comments

@isegal
Copy link

isegal commented Dec 16, 2018

There is a performance regression with shorthand array initialization (`[value; N]`) that causes a temporary stack allocation and an extra copy. It appears to have been introduced between Rust 1.11.0 and 1.12.0 and is still present in every release up to the current beta.

(All examples henceforth are with -C opt-level=3)

Example:

/// Reproduction type: wraps a single array of 128 `u32` values
/// (128 * 4 bytes = 512 bytes).
pub struct BigTest {
    arr: [u32; 128]
}

impl BigTest {
    /// Builds a `BigTest` whose array holds the value 123 in all 128 slots.
    ///
    /// The array is written with the `[value; N]` repeat ("shorthand") form,
    /// which is the form this report is about.
    pub fn new() -> BigTest {
        BigTest {
            // Repeat expression: 128 copies of 123.
            arr: [123; 128],
        }
    }
}

/// Public entry point that returns the result of `BigTest::new()` unchanged.
pub fn test() -> BigTest {
    BigTest::new()
}

It appears that in this case a temporary is allocated on the stack, initialized, and then copied to the destination with `memcpy`.
This could cause performance issues when initializing large arrays.

# 16-byte constant: four 32-bit values of 123, loaded into xmm0 below.
.LCPI0_0:
        .long   123
        .long   123
        .long   123
        .long   123
# BigTest::new as compiled from the `[123; 128]` shorthand form.
# It fills a 512-byte stack buffer and then memcpy's it to the pointer
# received in rdi (presumably the caller-provided return slot).
example::BigTest::new:
        push    rbx
        # Reserve 512 bytes of stack for the temporary array.
        sub     rsp, 512
        # Keep the incoming rdi so it can be returned in rax after the call.
        mov     rbx, rdi
        movaps  xmm0, xmmword ptr [rip + .LCPI0_0]
        # 32 stores of 16 bytes each fill the stack temporary (offsets 0..496).
        movaps  xmmword ptr [rsp], xmm0
        movaps  xmmword ptr [rsp + 16], xmm0
        movaps  xmmword ptr [rsp + 32], xmm0
        movaps  xmmword ptr [rsp + 48], xmm0
        movaps  xmmword ptr [rsp + 64], xmm0
        movaps  xmmword ptr [rsp + 80], xmm0
        movaps  xmmword ptr [rsp + 96], xmm0
        movaps  xmmword ptr [rsp + 112], xmm0
        movaps  xmmword ptr [rsp + 128], xmm0
        movaps  xmmword ptr [rsp + 144], xmm0
        movaps  xmmword ptr [rsp + 160], xmm0
        movaps  xmmword ptr [rsp + 176], xmm0
        movaps  xmmword ptr [rsp + 192], xmm0
        movaps  xmmword ptr [rsp + 208], xmm0
        movaps  xmmword ptr [rsp + 224], xmm0
        movaps  xmmword ptr [rsp + 240], xmm0
        movaps  xmmword ptr [rsp + 256], xmm0
        movaps  xmmword ptr [rsp + 272], xmm0
        movaps  xmmword ptr [rsp + 288], xmm0
        movaps  xmmword ptr [rsp + 304], xmm0
        movaps  xmmword ptr [rsp + 320], xmm0
        movaps  xmmword ptr [rsp + 336], xmm0
        movaps  xmmword ptr [rsp + 352], xmm0
        movaps  xmmword ptr [rsp + 368], xmm0
        movaps  xmmword ptr [rsp + 384], xmm0
        movaps  xmmword ptr [rsp + 400], xmm0
        movaps  xmmword ptr [rsp + 416], xmm0
        movaps  xmmword ptr [rsp + 432], xmm0
        movaps  xmmword ptr [rsp + 448], xmm0
        movaps  xmmword ptr [rsp + 464], xmm0
        movaps  xmmword ptr [rsp + 480], xmm0
        movaps  xmmword ptr [rsp + 496], xmm0
        # memcpy arguments: rdi is still the incoming pointer, rsi is the
        # stack temporary, edx is the length (512). This is the extra copy.
        mov     rsi, rsp
        mov     edx, 512
        call    qword ptr [rip + memcpy@GOTPCREL]
        mov     rax, rbx
        add     rsp, 512
        pop     rbx
        ret

# test() for the shorthand build: calls BigTest::new with rdi passed through
# unchanged and returns that same pointer in rax.
example::test:
        push    rbx
        mov     rbx, rdi
        call    qword ptr [rip + example::BigTest::new@GOTPCREL]
        mov     rax, rbx
        pop     rbx
        ret

Without shorthand, there is no temporary allocation:

/// Reproduction type for the comparison case: wraps a single array of
/// 128 `u32` values (128 * 4 bytes = 512 bytes).
pub struct BigTest {

    arr: [u32; 128]
}

impl BigTest {
    /// Builds a `BigTest` whose array holds the value 123 in all 128 slots.
    ///
    /// Comparison case: every element is spelled out in an explicit array
    /// literal instead of the `[value; N]` shorthand.
    pub fn new() -> BigTest {
        BigTest {
    
            // 8 rows of 16 elements = 128 elements, all equal to 123.
            arr: [
            123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,
            123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,
            123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,
            123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,
            123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,
            123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,
            123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,
            123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,123,
            ], 
        }
    }
}

/// Public entry point that returns the result of `BigTest::new()` unchanged.
pub fn test() -> BigTest {
    BigTest::new()
}
# 16-byte constant: four 32-bit values of 123, loaded into xmm0 below.
.LCPI0_0:
        .long   123
        .long   123
        .long   123
        .long   123
# BigTest::new as compiled from the explicit 128-element literal.
# No stack space is reserved and memcpy is not called: the 32 stores go
# straight through the pointer received in rdi.
example::BigTest::new:
        # The incoming pointer is also the return value.
        mov     rax, rdi
        movaps  xmm0, xmmword ptr [rip + .LCPI0_0]
        # 32 stores of 16 bytes each cover offsets 0..496 (512 bytes total).
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi + 16], xmm0
        movups  xmmword ptr [rdi + 32], xmm0
        movups  xmmword ptr [rdi + 48], xmm0
        movups  xmmword ptr [rdi + 64], xmm0
        movups  xmmword ptr [rdi + 80], xmm0
        movups  xmmword ptr [rdi + 96], xmm0
        movups  xmmword ptr [rdi + 112], xmm0
        movups  xmmword ptr [rdi + 128], xmm0
        movups  xmmword ptr [rdi + 144], xmm0
        movups  xmmword ptr [rdi + 160], xmm0
        movups  xmmword ptr [rdi + 176], xmm0
        movups  xmmword ptr [rdi + 192], xmm0
        movups  xmmword ptr [rdi + 208], xmm0
        movups  xmmword ptr [rdi + 224], xmm0
        movups  xmmword ptr [rdi + 240], xmm0
        movups  xmmword ptr [rdi + 256], xmm0
        movups  xmmword ptr [rdi + 272], xmm0
        movups  xmmword ptr [rdi + 288], xmm0
        movups  xmmword ptr [rdi + 304], xmm0
        movups  xmmword ptr [rdi + 320], xmm0
        movups  xmmword ptr [rdi + 336], xmm0
        movups  xmmword ptr [rdi + 352], xmm0
        movups  xmmword ptr [rdi + 368], xmm0
        movups  xmmword ptr [rdi + 384], xmm0
        movups  xmmword ptr [rdi + 400], xmm0
        movups  xmmword ptr [rdi + 416], xmm0
        movups  xmmword ptr [rdi + 432], xmm0
        movups  xmmword ptr [rdi + 448], xmm0
        movups  xmmword ptr [rdi + 464], xmm0
        movups  xmmword ptr [rdi + 480], xmm0
        movups  xmmword ptr [rdi + 496], xmm0
        ret

# test() for the explicit-literal build: calls BigTest::new with rdi passed
# through unchanged and returns that same pointer in rax.
example::test:
        push    rbx
        mov     rbx, rdi
        call    qword ptr [rip + example::BigTest::new@GOTPCREL]
        mov     rax, rbx
        pop     rbx
        ret

With Rust 1.11.0, there is no extra allocation when using the shorthand:

# 16-byte constant: four 32-bit values of 123, loaded into xmm0 below.
.LCPI0_0:
        .long   123
        .long   123
        .long   123
        .long   123
# BigTest::new from the Rust 1.11.0 build of the shorthand form.
# Apart from the rbp frame setup, it stores directly through rdi:
# no stack buffer is reserved and memcpy is not called.
example::BigTest::new:
        push    rbp
        mov     rbp, rsp
        movaps  xmm0, xmmword ptr [rip + .LCPI0_0]
        # 32 stores of 16 bytes each cover offsets 0..496 (512 bytes total).
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi + 16], xmm0
        movups  xmmword ptr [rdi + 32], xmm0
        movups  xmmword ptr [rdi + 48], xmm0
        movups  xmmword ptr [rdi + 64], xmm0
        movups  xmmword ptr [rdi + 80], xmm0
        movups  xmmword ptr [rdi + 96], xmm0
        movups  xmmword ptr [rdi + 112], xmm0
        movups  xmmword ptr [rdi + 128], xmm0
        movups  xmmword ptr [rdi + 144], xmm0
        movups  xmmword ptr [rdi + 160], xmm0
        movups  xmmword ptr [rdi + 176], xmm0
        movups  xmmword ptr [rdi + 192], xmm0
        movups  xmmword ptr [rdi + 208], xmm0
        movups  xmmword ptr [rdi + 224], xmm0
        movups  xmmword ptr [rdi + 240], xmm0
        movups  xmmword ptr [rdi + 256], xmm0
        movups  xmmword ptr [rdi + 272], xmm0
        movups  xmmword ptr [rdi + 288], xmm0
        movups  xmmword ptr [rdi + 304], xmm0
        movups  xmmword ptr [rdi + 320], xmm0
        movups  xmmword ptr [rdi + 336], xmm0
        movups  xmmword ptr [rdi + 352], xmm0
        movups  xmmword ptr [rdi + 368], xmm0
        movups  xmmword ptr [rdi + 384], xmm0
        movups  xmmword ptr [rdi + 400], xmm0
        movups  xmmword ptr [rdi + 416], xmm0
        movups  xmmword ptr [rdi + 432], xmm0
        movups  xmmword ptr [rdi + 448], xmm0
        movups  xmmword ptr [rdi + 464], xmm0
        movups  xmmword ptr [rdi + 480], xmm0
        movups  xmmword ptr [rdi + 496], xmm0
        # The incoming pointer is returned in rax.
        mov     rax, rdi
        pop     rbp
        ret

# 16-byte constant: four 32-bit values of 123, loaded into xmm0 below.
.LCPI1_0:
        .long   123
        .long   123
        .long   123
        .long   123
# test() from the Rust 1.11.0 build. There is no call to BigTest::new here:
# the same 32 direct stores through rdi are emitted inline.
example::test:
        push    rbp
        mov     rbp, rsp
        movaps  xmm0, xmmword ptr [rip + .LCPI1_0]
        # 32 stores of 16 bytes each cover offsets 0..496 (512 bytes total).
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi + 16], xmm0
        movups  xmmword ptr [rdi + 32], xmm0
        movups  xmmword ptr [rdi + 48], xmm0
        movups  xmmword ptr [rdi + 64], xmm0
        movups  xmmword ptr [rdi + 80], xmm0
        movups  xmmword ptr [rdi + 96], xmm0
        movups  xmmword ptr [rdi + 112], xmm0
        movups  xmmword ptr [rdi + 128], xmm0
        movups  xmmword ptr [rdi + 144], xmm0
        movups  xmmword ptr [rdi + 160], xmm0
        movups  xmmword ptr [rdi + 176], xmm0
        movups  xmmword ptr [rdi + 192], xmm0
        movups  xmmword ptr [rdi + 208], xmm0
        movups  xmmword ptr [rdi + 224], xmm0
        movups  xmmword ptr [rdi + 240], xmm0
        movups  xmmword ptr [rdi + 256], xmm0
        movups  xmmword ptr [rdi + 272], xmm0
        movups  xmmword ptr [rdi + 288], xmm0
        movups  xmmword ptr [rdi + 304], xmm0
        movups  xmmword ptr [rdi + 320], xmm0
        movups  xmmword ptr [rdi + 336], xmm0
        movups  xmmword ptr [rdi + 352], xmm0
        movups  xmmword ptr [rdi + 368], xmm0
        movups  xmmword ptr [rdi + 384], xmm0
        movups  xmmword ptr [rdi + 400], xmm0
        movups  xmmword ptr [rdi + 416], xmm0
        movups  xmmword ptr [rdi + 432], xmm0
        movups  xmmword ptr [rdi + 448], xmm0
        movups  xmmword ptr [rdi + 464], xmm0
        movups  xmmword ptr [rdi + 480], xmm0
        movups  xmmword ptr [rdi + 496], xmm0
        # The incoming pointer is returned in rax.
        mov     rax, rdi
        pop     rbp
        ret

Source: https://rust.godbolt.org/z/isxu3Y

@isegal isegal changed the title Unneccessary memcpy when using array initialization shorthand Unnecessary memcpy when using array initialization shorthand Dec 16, 2018
@nikic
Copy link
Contributor

nikic commented Dec 16, 2018

Duplicate of #56333. The issue is that rustc generates many redundant memcpys and LLVM is not always able to optimize them away.

@nikic nikic closed this as completed Dec 16, 2018
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants