Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Box::new(expr) first puts expr on the stack, then copies. #50047

Closed
glandium opened this issue Apr 18, 2018 · 7 comments
Closed

Box::new(expr) first puts expr on the stack, then copies. #50047

glandium opened this issue Apr 18, 2018 · 7 comments
Labels
A-box Area: Our favorite opsem complication A-MIR Area: Mid-level IR (MIR) - https://blog.rust-lang.org/2016/04/19/MIR.html C-enhancement Category: An issue proposing an enhancement or a PR with one. I-slow Issue: Problems and improvements with respect to performance of generated code. WG-llvm Working group: LLVM backend code generation

Comments

@glandium
Copy link
Contributor

Consider the following code:

pub fn foo() -> Box<[u8; 4096]> {
    Box::new([0; 4096])
}

(made it big because it's kind of simpler to see the memset and memcpy calls in the resulting asm)

It generates the following assembly:

example::foo:
  push rbx
  mov eax, 4096
  call __rust_probestack
  sub rsp, rax
  mov rdi, rsp
  xor esi, esi
  mov edx, 4096
  call memset@PLT
  mov edi, 4096
  mov esi, 1
  call __rust_alloc@PLT
  mov rbx, rax
  test rbx, rbx
  je .LBB1_1
  mov rsi, rsp
  mov edx, 4096
  mov rdi, rbx
  call memcpy@PLT
  mov rax, rbx
  add rsp, 4096
  pop rbx
  ret
.LBB1_1:
  call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
  ud2

which does a memset, alloc, memcpy dance.

I was accepting this as a fact of life, but today, I was looking at a random old version of rustc on godbolt, and it turns out before 1.12, the memset, alloc, memcpy dance wasn't happening:

example::foo:
  push rbx
  mov edi, 4096
  mov esi, 1
  call __rust_allocate@PLT
  mov rbx, rax
  test rbx, rbx
  je .LBB0_2
  xor esi, esi
  mov edx, 4096
  mov rdi, rbx
  call memset@PLT
  mov rax, rbx
  pop rbx
  ret
.LBB0_2:
  call alloc::oom::oom@PLT

https://godbolt.org/g/J3cy5E

The llvm ir back then looks like the following:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define noalias dereferenceable(4096) [4096 x i8]* @example::foo() unnamed_addr #0 {
entry-block:
  %0 = tail call i8* @__rust_allocate(i64 4096, i64 1) #1, !noalias !0
  %1 = icmp eq i8* %0, null
  br i1 %1, label %then-block-57-.i.i, label %"_ZN5alloc5boxed30_$LT$impl$u20$Box$LT$T$GT$$GT$3new17ha7ffa7dfb1e725d2E.exit"

then-block-57-.i.i: ; preds = %entry-block
  tail call void @alloc::oom::oom(), !noalias !0
  unreachable

"_ZN5alloc5boxed30_$LT$impl$u20$Box$LT$T$GT$$GT$3new17ha7ffa7dfb1e725d2E.exit": ; preds = %entry-block
  %2 = bitcast i8* %0 to [4096 x i8]*
  call void @llvm.memset.p0i8.i64(i8* nonnull %0, i8 0, i64 4096, i32 1, i1 false)
  ret [4096 x i8]* %2
}

declare noalias i8* @__rust_allocate(i64, i64) unnamed_addr #1

declare void @alloc::oom::oom() unnamed_addr #2

declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3

attributes #0 = { uwtable }
attributes #1 = { nounwind }
attributes #2 = { cold noinline noreturn }
attributes #3 = { argmemonly nounwind }

!0 = !{!1}
!1 = distinct !{!1, !2, !"alloc::boxed::<impl Box<T>>::new: %x"}
!2 = distinct !{!2, !"alloc::boxed::<impl Box<T>>::new"}

while on nightly, it looks like:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define internal fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() unnamed_addr #0 {
  tail call void @__rust_oom()
  unreachable
}

define noalias align 1 dereferenceable(4096) [4096 x i8]* @example::foo() unnamed_addr #1 {
  %_1 = alloca [4096 x i8], align 1
  %_1.0.sroa_idx2 = getelementptr inbounds [4096 x i8], [4096 x i8]* %_1, i64 0, i64 0
  call void @llvm.lifetime.start.p0i8(i64 4096, i8* nonnull %_1.0.sroa_idx2)
  call void @llvm.memset.p0i8.i64(i8* nonnull %_1.0.sroa_idx2, i8 0, i64 4096, i32 1, i1 false)
  %0 = tail call i8* @__rust_alloc(i64 4096, i64 1) #5, !noalias !0
  %1 = icmp eq i8* %0, null
  br i1 %1, label %bb7.i.i, label %"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17hbb8214c4d412a6d3E.exit"

bb7.i.i: ; preds = %start
  tail call fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() #5, !noalias !0
  unreachable

"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17hbb8214c4d412a6d3E.exit": ; preds = %start
  %2 = bitcast i8* %0 to [4096 x i8]*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %0, i8* nonnull %_1.0.sroa_idx2, i64 4096, i32 1, i1 false) #5
  call void @llvm.lifetime.end.p0i8(i64 4096, i8* nonnull %_1.0.sroa_idx2)
  ret [4096 x i8]* %2
}

declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #2

declare void @__rust_oom() unnamed_addr #3

declare noalias i8* @__rust_alloc(i64, i64) unnamed_addr #4

declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #2

declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2

declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2

attributes #0 = { inlinehint noreturn nounwind uwtable "probe-stack"="__rust_probestack" }
attributes #1 = { nounwind uwtable "probe-stack"="__rust_probestack" }
attributes #2 = { argmemonly nounwind }
attributes #3 = { cold noreturn nounwind "probe-stack"="__rust_probestack" }
attributes #4 = { nounwind "probe-stack"="__rust_probestack" }
attributes #5 = { nounwind }

!0 = !{!1}
!1 = distinct !{!1, !2, !"<alloc::boxed::Box<T>>::new: %x"}
!2 = distinct !{!2, !"<alloc::boxed::Box<T>>::new"}
@nox nox assigned nox and unassigned nox Apr 18, 2018
@rcoh
Copy link
Contributor

rcoh commented Apr 18, 2018

1.12.0 was the release that added MIR: "rustc translates code to LLVM IR via its own "middle" IR (MIR)", which seems like the most likely cause from that release.

@kennytm kennytm added I-slow Issue: Problems and improvements with respect to performance of generated code. C-enhancement Category: An issue proposing an enhancement or a PR with one. A-MIR Area: Mid-level IR (MIR) - https://blog.rust-lang.org/2016/04/19/MIR.html WG-llvm Working group: LLVM backend code generation labels Apr 18, 2018
@glandium
Copy link
Contributor Author

The corresponding MIR:

const foo::{{initializer}}: usize ={
  let mut _0: usize; // return place

  bb0: { 
  _0 = const 4096usize; // bb0[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:1:26: 1:30
  return; // bb0[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:1:26: 1:30
  }
}

const foo::{{initializer}}: usize ={
  let mut _0: usize; // return place

  bb0: { 
  _0 = const 4096usize; // bb0[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:18: 2:22
  return; // bb0[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:18: 2:22
  }
}

fn foo() -> std::boxed::Box<[u8; 4096]>{
  let mut _0: std::boxed::Box<[u8; 4096]>; // return place
  let mut _1: [u8; 4096];

  bb0: { 
  StorageLive(_1); // bb0[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:14: 2:23
  _1 = [const 0u8; 4096]; // bb0[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:14: 2:23
  _0 = const <std::boxed::Box<T>>::new(move _1) -> bb1; // bb0[2]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:5: 2:24
  }

  bb1: { 
  StorageDead(_1); // bb1[0]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:2:23: 2:24
  return; // bb1[1]: scope 0 at /tmp/compiler-explorer-compiler118318-63-13s7ff5.cpzhg/example.rs:3:2: 3:2
  }
}

@oli-obk
Copy link
Contributor

oli-obk commented Apr 19, 2018

Can we add a trick to HAIR that treats Box::new calls just like the box syntax?

@glandium
Copy link
Contributor Author

Related:

pub fn bar(buf: [u8; 4096]) -> Box<[u8; 4096]> {
    Box::new(buf)
}

copies buf to the local stack before copying it into the box:

example::bar:
  push rbx
  mov eax, 4096
  call __rust_probestack
  sub rsp, rax
  mov rax, rdi
  mov rdi, rsp
  mov edx, 4096
  mov rsi, rax
  call memcpy@PLT
  mov edi, 4096
  mov esi, 1
  call __rust_alloc@PLT
  mov rbx, rax
  test rbx, rbx
  je .LBB2_1
  mov rsi, rsp
  mov edx, 4096
  mov rdi, rbx
  call memcpy@PLT
  mov rax, rbx
  add rsp, 4096
  pop rbx
  ret
.LBB2_1:
  call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
  ud2

@glandium
Copy link
Contributor Author

Interestingly, it doesn't happen when the object is small enough:

pub fn foo() -> Box<[u8; 8]> {
    Box::new([0; 8])
}

pub fn bar() -> Box<[u8; 9]> {
    Box::new([0; 9])
}
example::foo:
  push rax
  mov edi, 8
  mov esi, 1
  call __rust_alloc@PLT
  test rax, rax
  je .LBB1_1
  mov qword ptr [rax], 0
  pop rcx
  ret
.LBB1_1:
  call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
  ud2

example::bar:
  sub rsp, 24
  mov byte ptr [rsp + 16], 0
  mov qword ptr [rsp + 8], 0
  mov edi, 9
  mov esi, 1
  call __rust_alloc@PLT
  test rax, rax
  je .LBB2_1
  mov cl, byte ptr [rsp + 16]
  mov byte ptr [rax + 8], cl
  mov rcx, qword ptr [rsp + 8]
  mov qword ptr [rax], rcx
  add rsp, 24
  ret
.LBB2_1:
  call <alloc::alloc::Global as core::alloc::GlobalAlloc>::oom
  ud2

The MIR in both cases looks similar, but the LLVM-IR differs:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define internal fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() unnamed_addr #0 {
  tail call void @__rust_oom()
  unreachable
}

define noalias align 1 dereferenceable(8) [8 x i8]* @example::foo() unnamed_addr #1 {
  %0 = tail call i8* @__rust_alloc(i64 8, i64 1) #5
  %1 = icmp eq i8* %0, null
  br i1 %1, label %bb7.i.i, label %"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17h4e4edfda70298278E.exit"

bb7.i.i: ; preds = %start
  tail call fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() #5
  unreachable

"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17h4e4edfda70298278E.exit": ; preds = %start
  %2 = bitcast i8* %0 to [8 x i8]*
  %_3.sroa.0.0..sroa_cast.i = bitcast i8* %0 to i64*
  store i64 0, i64* %_3.sroa.0.0..sroa_cast.i, align 1
  ret [8 x i8]* %2
}

define noalias align 1 dereferenceable(9) [9 x i8]* @example::bar() unnamed_addr #1 {
  %_1 = alloca [9 x i8], align 1
  %_1.0.sroa_idx2 = getelementptr inbounds [9 x i8], [9 x i8]* %_1, i64 0, i64 0
  call void @llvm.lifetime.start.p0i8(i64 9, i8* nonnull %_1.0.sroa_idx2)
  call void @llvm.memset.p0i8.i64(i8* nonnull %_1.0.sroa_idx2, i8 0, i64 9, i32 1, i1 false)
  %0 = tail call i8* @__rust_alloc(i64 9, i64 1) #5, !noalias !0
  %1 = icmp eq i8* %0, null
  br i1 %1, label %bb7.i.i, label %"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17ha687450047947beaE.exit"

bb7.i.i: ; preds = %start
  tail call fastcc void @"<alloc::alloc::Global as core::alloc::GlobalAlloc>::oom"() #5, !noalias !0
  unreachable

"_ZN35_$LT$alloc..boxed..Box$LT$T$GT$$GT$3new17ha687450047947beaE.exit": ; preds = %start
  %2 = bitcast i8* %0 to [9 x i8]*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %0, i8* nonnull %_1.0.sroa_idx2, i64 9, i32 1, i1 false) #5
  call void @llvm.lifetime.end.p0i8(i64 9, i8* nonnull %_1.0.sroa_idx2)
  ret [9 x i8]* %2
}

declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #2

declare void @__rust_oom() unnamed_addr #3

declare noalias i8* @__rust_alloc(i64, i64) unnamed_addr #4

declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #2

declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2

declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2

attributes #0 = { inlinehint noreturn nounwind uwtable "probe-stack"="__rust_probestack" }
attributes #1 = { nounwind uwtable "probe-stack"="__rust_probestack" }
attributes #2 = { argmemonly nounwind }
attributes #3 = { cold noreturn nounwind "probe-stack"="__rust_probestack" }
attributes #4 = { nounwind "probe-stack"="__rust_probestack" }
attributes #5 = { nounwind }

!0 = !{!1}
!1 = distinct !{!1, !2, !"<alloc::boxed::Box<T>>::new: %x"}
!2 = distinct !{!2, !"<alloc::boxed::Box<T>>::new"}

@arthurprs
Copy link
Contributor

Related to (or a dup of?) #41160

@Mark-Simulacrum
Copy link
Member

Yeah, this seems like the same issue as #41160; closing.

@workingjubilee workingjubilee added the A-box Area: Our favorite opsem complication label Oct 1, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
A-box Area: Our favorite opsem complication A-MIR Area: Mid-level IR (MIR) - https://blog.rust-lang.org/2016/04/19/MIR.html C-enhancement Category: An issue proposing an enhancement or a PR with one. I-slow Issue: Problems and improvements with respect to performance of generated code. WG-llvm Working group: LLVM backend code generation
Projects
None yet
Development

No branches or pull requests

8 participants