setindex(::Tuple) performance regression #46049

Closed · maleadt opened this issue Jul 15, 2022 · 0 comments · Fixed by #46050
Labels: performance (Must go faster), regression (Regression in behavior compared to a previous version)

maleadt commented Jul 15, 2022

The following code optimizes away on 1.7, but contains a dynamic call on 1.8:

# a simple immutable array type backed by stack memory
#
# similar to StaticArrays, but immutable to prevent optimization bugs (JuliaLang/julia#41800)

struct LocalArray{L,T}
    data::NTuple{L,T}

    LocalArray{L,T}(::UndefInitializer) where {L,T} = new{L,T}()
    LocalArray{L,T}(x::NTuple{L,T}) where {L,T} = new{L,T}(x)
end

@inline function setindex(v::LocalArray{L,T}, val::T, i::Int) where {L,T}
    new_data = Base.setindex(v.data, val, i)
    LocalArray{L,T}(new_data)
end


function kernel()
    v = LocalArray{16,Int}(undef)
    setindex(v, 0, 1)
    return
end

code_llvm(kernel, Tuple{}) on 1.7:
define void @julia_kernel_112() #0 {
top:
  ret void
}

vs. on 1.8:

;  @ /home/tim/Julia/pkg/CUDA/wip.jl:20 within `kernel`
define void @julia_kernel_190() #0 {
top:
  %0 = alloca {}*, align 8
  %gcframe2 = alloca [3 x {}*], align 16
  %gcframe2.sub = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe2, i64 0, i64 0
  %1 = bitcast [3 x {}*]* %gcframe2 to i8*
  call void @llvm.memset.p0i8.i32(i8* noundef nonnull align 16 dereferenceable(24) %1, i8 0, i32 24, i1 false)
  %2 = alloca { i64, i64, [16 x i64] }, align 8
  %thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #2
  %ppgcstack_i8 = getelementptr i8, i8* %thread_ptr, i64 -8
  %ppgcstack = bitcast i8* %ppgcstack_i8 to {}****
  %pgcstack = load {}***, {}**** %ppgcstack, align 8
;  @ /home/tim/Julia/pkg/CUDA/wip.jl:22 within `kernel`
; ┌ @ /home/tim/Julia/pkg/CUDA/wip.jl:13 within `setindex` @ tuple.jl:55
; │┌ @ tuple.jl:58 within `_setindex`
    %3 = bitcast [3 x {}*]* %gcframe2 to i64*
    store i64 4, i64* %3, align 16
    %4 = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe2, i64 0, i64 1
    %5 = bitcast {}** %4 to {}***
    %6 = load {}**, {}*** %pgcstack, align 8
    store {}** %6, {}*** %5, align 8
    %7 = bitcast {}*** %pgcstack to {}***
    store {}** %gcframe2.sub, {}*** %7, align 8
    %8 = getelementptr inbounds { i64, i64, [16 x i64] }, { i64, i64, [16 x i64] }* %2, i64 0, i32 2, i64 2
    %9 = bitcast i64* %8 to <4 x i64>*
    store <4 x i64> <i64 139704043836304, i64 139704201753424, i64 7808250546924, i64 13692355740775>, <4 x i64>* %9, align 8
    %10 = getelementptr inbounds { i64, i64, [16 x i64] }, { i64, i64, [16 x i64] }* %2, i64 0, i32 2, i64 6
    %11 = bitcast i64* %10 to <4 x i64>*
    store <4 x i64> <i64 0, i64 -1047008294339651328, i64 17, i64 -1047008294339651328>, <4 x i64>* %11, align 8
    %12 = getelementptr inbounds { i64, i64, [16 x i64] }, { i64, i64, [16 x i64] }* %2, i64 0, i32 2, i64 10
    %13 = bitcast i64* %12 to <4 x i64>*
    store <4 x i64> <i64 139704043895616, i64 140732546470088, i64 4, i64 140732546470080>, <4 x i64>* %13, align 8
    %14 = getelementptr inbounds { i64, i64, [16 x i64] }, { i64, i64, [16 x i64] }* %2, i64 0, i32 2, i64 14
    %15 = bitcast i64* %14 to <2 x i64>*
    store <2 x i64> <i64 139704161878032, i64 0>, <2 x i64>* %15, align 8
; ││ @ tuple.jl:60 within `_setindex`
    %16 = bitcast { i64, i64, [16 x i64] }* %2 to <4 x i64>*
    store <4 x i64> <i64 0, i64 1, i64 5937600496, i64 139704201753104>, <4 x i64>* %16, align 8
; ││┌ @ ntuple.jl:19 within `ntuple`
     %17 = call nonnull {}* @j__ntuple_192({ i64, i64, [16 x i64] }* nocapture readonly %2, i64 signext 16) #0
; │└└
; │ @ /home/tim/Julia/pkg/CUDA/wip.jl:14 within `setindex`
   %18 = bitcast {}* %17 to i64*
   %19 = getelementptr inbounds i64, i64* %18, i64 -1
   %20 = load atomic i64, i64* %19 unordered, align 8
   %21 = and i64 %20, -16
   %22 = inttoptr i64 %21 to {}*
   %.not = icmp eq {}* %22, inttoptr (i64 139704201753184 to {}*)
   br i1 %.not, label %L13, label %L10

L10:                                              ; preds = %top
   %23 = getelementptr inbounds [3 x {}*], [3 x {}*]* %gcframe2, i64 0, i64 2
   store {}* %17, {}** %23, align 16
   store {}* %17, {}** %0, align 8
   %24 = call nonnull {}* @ijl_apply_generic({}* inttoptr (i64 139704201753104 to {}*), {}** nonnull %0, i32 1)
   br label %L13

L13:                                              ; preds = %L10, %top
   %25 = load {}*, {}** %4, align 8
   %26 = bitcast {}*** %pgcstack to {}**
   store {}* %25, {}** %26, align 8
; └
;  @ /home/tim/Julia/pkg/CUDA/wip.jl:23 within `kernel`
  ret void
}
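
The 1.8 IR above sets up a GC frame and conditionally falls back to ijl_apply_generic, i.e. the Base.setindex call is no longer fully inlined and constant-folded. A minimal sketch of how one might detect this programmatically (has_dynamic_call is a hypothetical helper, not part of this report):

using InteractiveUtils

# Hypothetical helper: capture the optimized LLVM IR for a call signature and
# look for a dynamic dispatch through (i)jl_apply_generic.
function has_dynamic_call(f, types)
    io = IOBuffer()
    code_llvm(io, f, types)
    return occursin("apply_generic", String(take!(io)))
end

has_dynamic_call(kernel, Tuple{})  # false on 1.7, true on 1.8 before the fix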

Bisected to #40635. Originally encountered in GPU code, JuliaGPU/GemmKernels.jl#99.
PR incoming.
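
A further-reduced probe (an assumption about where the regression lives, not something verified here) is to bypass the LocalArray wrapper entirely and check whether Base.setindex on a plain 16-element tuple still folds away:

using InteractiveUtils

# Assumption: the regression is in Base.setindex / ntuple itself, independent of
# the LocalArray wrapper used above.
bare_kernel() = (Base.setindex(ntuple(_ -> 0, Val(16)), 0, 1); nothing)
code_llvm(bare_kernel, Tuple{})  # expected to reduce to a bare `ret void` when fully optimized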

maleadt added the performance (Must go faster) and regression (Regression in behavior compared to a previous version) labels on Jul 15, 2022
maleadt changed the title from "ntuple regression" to "setindex(::Tuple) performance regression" on Jul 15, 2022