Skip to content

Commit

Permalink
ci: run 1.10 tests only on Lux and LuxLib
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Nov 5, 2024
1 parent 8c42e61 commit 900c21c
Show file tree
Hide file tree
Showing 14 changed files with 6 additions and 37 deletions.
1 change: 1 addition & 0 deletions .buildkite/benchmarks.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
steps:
- group: ":racehorse: Benchmarks"
if: build.message !~ /\[skip benchmarks\]/ && build.message !~ /\[skip ci\]/ && !build.pull_request.draft
steps:
- label: "CPU: Run Benchmarks with {{matrix.threads}} thread(s)"
matrix:
Expand Down
8 changes: 5 additions & 3 deletions .buildkite/testing.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
steps:
- group: ":julia: (Lux) CUDA GPU"
steps:
- label: ":julia: Julia {{matrix.julia}} + CUDA GPU"
- label: ":julia: Julia {{matrix.julia}} + CUDA GPU + {{matrix.group}}"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.julia}}"
Expand Down Expand Up @@ -45,13 +45,16 @@ steps:
include(joinpath(dir, "../test/runtests.jl"))'
env:
BACKEND_GROUP: "CUDA"
LUX_TEST_GROUP: "{{matrix.group}}"
if: build.message !~ /\[skip tests\]/ && build.message !~ /\[skip ci\]/
timeout_in_minutes: 120
matrix:
setup:
julia:
- "1.10"
- "1"
group:
- "!reactant"
- "reactant"

- group: ":julia: (Lux) AMD GPU"
steps:
Expand Down Expand Up @@ -105,7 +108,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1"

env:
Expand Down
1 change: 0 additions & 1 deletion .buildkite/testing_luxcuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1"

env:
Expand Down
2 changes: 0 additions & 2 deletions .buildkite/testing_luxlib.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1"

- group: ":julia: (LuxLib) AMD GPU"
Expand Down Expand Up @@ -95,7 +94,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1"

env:
Expand Down
1 change: 0 additions & 1 deletion .buildkite/testing_luxtestutils.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1"

env:
Expand Down
4 changes: 0 additions & 4 deletions .buildkite/testing_mldatadevices.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1"
group:
- "CUDA"
Expand Down Expand Up @@ -60,7 +59,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1"

- group: ":julia: (MLDataDevices) Metal GPU"
Expand Down Expand Up @@ -91,7 +89,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1"

- group: ":julia: (MLDataDevices) oneAPI GPU"
Expand Down Expand Up @@ -122,7 +119,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1"

env:
Expand Down
1 change: 0 additions & 1 deletion .buildkite/testing_weightinitializers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ steps:
matrix:
setup:
julia:
- "1.10"
- "1"

- group: ":julia: (WeightInitializers) AMD GPU"
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/CI_LuxCUDA.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ jobs:
fail-fast: false
matrix:
version:
- "1.10"
- "1"
steps:
- uses: actions/checkout@v4
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/CI_LuxCore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ jobs:
fail-fast: false
matrix:
version:
- "1.10"
- "1"
os:
- ubuntu-latest
Expand Down
15 changes: 0 additions & 15 deletions .github/workflows/CI_LuxLib.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,21 +41,6 @@ jobs:
loopvec:
- "true"
include:
- version: "1.10"
os: ubuntu-latest
test_group: "dense"
blas_backend: "blis"
loopvec: "true"
- version: "1.10"
os: ubuntu-latest
test_group: "dense"
blas_backend: "mkl"
loopvec: "true"
- version: "1.10"
os: macos-latest
test_group: "dense"
blas_backend: "appleaccelerate"
loopvec: "true"
- version: "1.10"
os: ubuntu-latest
test_group: "dense"
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/CI_LuxTestUtils.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ jobs:
fail-fast: false
matrix:
version:
- "1.10"
- "1"
os:
- ubuntu-latest
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/CI_MLDataDevices.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ jobs:
fail-fast: false
matrix:
version:
- "1.10"
- "1"
os:
- ubuntu-latest
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/CI_WeightInitializers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ jobs:
fail-fast: false
matrix:
version:
- "1.10"
- "1"
os:
- ubuntu-latest
Expand Down
5 changes: 0 additions & 5 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,6 @@ if BACKEND_GROUP == "all" || BACKEND_GROUP == "cuda"
@info sprint(CUDA.versioninfo)
end

if BACKEND_GROUP == "all" || BACKEND_GROUP == "amdgpu"
using AMDGPU
@info sprint(AMDGPU.versioninfo)
end

using Lux

@testset "Load Tests" begin
Expand Down

1 comment on commit 900c21c

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 900c21c Previous: 8b87c2b Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4270.5 ns 4709 ns 0.91
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4000 ns 4792 ns 0.83
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5875 ns 5166 ns 1.14
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4895.5 ns 4416 ns 1.11
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 59833 ns 60862 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10375 ns 10416 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9958 ns 9875 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10792 ns 11417 ns 0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10125 ns 10542 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 422438 ns 426730.5 ns 0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1083 ns 1000 ns 1.08
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1000 ns 1333 ns 0.75
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1417 ns 1291 ns 1.10
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1125 ns 1395.5 ns 0.81
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 18109 ns 18565 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4166 ns 4209 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4125 ns 4042 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4187.5 ns 4167 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4042 ns 4000 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 109209 ns 111556 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57645.5 ns 56375 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47000 ns 46916 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 38125 ns 46167 ns 0.83
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82084 ns 80959 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37455 ns 37697 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1973687 ns 2046500 ns 0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2089416 ns 2089354 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2085625 ns 2048708.5 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1985813 ns 1993834 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 195917 ns 199690 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 146416.5 ns 147104.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 147020.5 ns 144104.5 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 145667 ns 148584 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 145604.5 ns 144583.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166391 ns 165605 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1129209 ns 1131291 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1126375 ns 1119584 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1147667 ns 1111791.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1104209 ns 1118209 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 521058.5 ns 531488 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3416.5 ns 3500 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3333 ns 3709 ns 0.90
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6333 ns 5520.5 ns 1.15
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3250 ns 3375 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 66594 ns 71213 ns 0.94
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8792 ns 9084 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9291 ns 9625 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9250 ns 10167 ns 0.91
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9292 ns 8584 ns 1.08
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 493812 ns 497375 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 14750 ns 15458.5 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15458 ns 15250 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19167 ns 19146 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16437.5 ns 14604 ns 1.13
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 53833 ns 55040 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215416.5 ns 213833 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 213208.5 ns 213292 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214271 ns 215292 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 227104 ns 217500 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 271460 ns 277020.5 ns 0.98
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 542 ns 542 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 625 ns 583 ns 1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 792 ns 709 ns 1.12
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 583 ns 625 ns 0.93
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17470 ns 17919 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1750 ns 1542 ns 1.13
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1417 ns 1458 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1709 ns 1916 ns 0.89
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1645.5 ns 1375 ns 1.20
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 101826.5 ns 104816 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7250 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5916 ns 5875 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5292 ns 5916 ns 0.89
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10000 ns 9875 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23857.5 ns 24078 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 226895.5 ns 229750 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 230375 ns 228583 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 231584 ns 230292 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 258625 ns 213917 ns 1.21
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 167659 ns 172648 ns 0.97
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3875 ns 3875 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3875 ns 3833 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3916 ns 3875 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3833 ns 3875 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23468 ns 23922 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16750 ns 16458 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 17042 ns 16583 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17000 ns 16958 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16625 ns 16750 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 160597 ns 166168.5 ns 0.97
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 572166 ns 579542 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 575000 ns 576458 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 587458 ns 578750 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 578334 ns 574667 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113397 ns 113828 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1421708 ns 1424688 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1420125 ns 1421083 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1430083 ns 1423208.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1413292 ns 1419500 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 209669.5 ns 215564 ns 0.97
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1074458 ns 1071229.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 958250.5 ns 961417 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1334396 ns 1343000 ns 0.99
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1310875 ns 1300000.5 ns 1.01
lenet(28, 28, 1, 64)/forward/GPU/CUDA 269120.5 ns 277770.5 ns 0.97
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5769437 ns 5955916 ns 0.97
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4470625 ns 4519500 ns 0.99
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4941021 ns 4916354.5 ns 1.01
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5552042 ns 5726333 ns 0.97
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1066489 ns 1105672 ns 0.96
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 542 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 583 ns 0.93
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23585 ns 24042 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2083 ns 2084 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2167 ns 2084 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2250 ns 2208 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2125 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 169900 ns 173326.5 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4084 ns 4000 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6250 ns 4584 ns 1.36
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7209 ns 7083 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6125 ns 4125 ns 1.48
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 64199 ns 65959 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11083 ns 11084 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11625 ns 11000 ns 1.06
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12000 ns 12292 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10917 ns 10791 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 446167.5 ns 456125.5 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6042 ns 7000 ns 0.86
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7042 ns 6458 ns 1.09
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8833 ns 8500 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7250 ns 6292 ns 1.15
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 51074.5 ns 54186 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17292 ns 16708 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 18334 ns 17875 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18083 ns 18750 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17229.5 ns 16875 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 299895.5 ns 308312 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 32630 ns 33294 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8458 ns 8708 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9041 ns 9208 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9166 ns 9458 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8459 ns 8292 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 158907 ns 162415.5 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64625 ns 64625 ns 1
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64250 ns 64667 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 65000 ns 64666 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64667 ns 64625 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111460 ns 112234 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 289667 ns 284395.5 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 279750 ns 286937.5 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 289625 ns 285291 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 281250 ns 277917 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 184453.5 ns 188885.5 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3347125 ns 3237000 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 3015520.5 ns 3046417 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 2792979 ns 3014917 ns 0.93
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 4064520.5 ns 3953541.5 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 588037 ns 577323 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7500166 ns 7569937.5 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7470229.5 ns 7460791.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7393937.5 ns 7457666.5 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8209000 ns 8209666 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1331630 ns 1380365.5 ns 0.96
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 19529541 ns 18994750 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 19142959 ns 19146458 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 19022708 ns 19185583 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 15703750 ns 15773833 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23617083 ns 24040875 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33598208 ns 33769833 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 41100666 ns 37025062.5 ns 1.11
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 35022333 ns 34849833 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1855178.5 ns 1855448 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 189352250 ns 192176500 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 163568208 ns 165400792 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 158452896 ns 153088459 ns 1.04
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 438607167 ns 439540208 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13925600.5 ns 13926820 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 287704167 ns 292222499.5 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 337952937.5 ns 338088333 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 291466708 ns 298393250 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 395696000 ns 394164437.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 21334 ns 23395.5 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24375 ns 23000 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 25771 ns 26479.5 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 23584 ns 22271 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 95861 ns 96215.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103625 ns 103541.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 103708 ns 104375 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 104625 ns 105000 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103479.5 ns 106291 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 510517.5 ns 499410 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5750 ns 7125 ns 0.81
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7208 ns 6542 ns 1.10
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7666.5 ns 7916 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7166 ns 5875 ns 1.22
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 68604 ns 67753 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14708 ns 15250 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15916 ns 15500 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16666 ns 16666 ns 1
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14667 ns 14667 ns 1
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 483804.5 ns 471687 ns 1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 2876500 ns 3030208.5 ns 0.95
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2063833 ns 2057020.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2288208 ns 2271375 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4870416 ns 4518521 ns 1.08
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 587700 ns 585712 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23421375 ns 23780833 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 17990750 ns 17907042 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18312792 ns 16907896 ns 1.08
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35646292 ns 34889792 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3104605 ns 3222471 ns 0.96
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33240625 ns 33703875 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 27662417 ns 27577959 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27837459 ns 27463958 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41788833 ns 41773187 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 72083 ns 73687.5 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 78729 ns 73292 ns 1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 75729.5 ns 83417 ns 0.91
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72459 ns 74667 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 100762.5 ns 101830 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 204458 ns 318542 ns 0.64
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219041 ns 216770.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 320458 ns 219750 ns 1.46
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 205312.5 ns 297396 ns 0.69
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 541454.5 ns 550055 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11333 ns 11937.5 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12416 ns 11958 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13834 ns 13395.5 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 13125 ns 11584 ns 1.13
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 69856.5 ns 71500 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26520.5 ns 26666 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 27458 ns 26875 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 28291 ns 27792 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26500 ns 26500 ns 1
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 473341 ns 478647.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11833 ns 12458 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12750 ns 12750 ns 1
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14333 ns 14042 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 13375 ns 12042 ns 1.11
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 51587 ns 54279 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26375 ns 25792 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26583 ns 25791 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26666 ns 26584 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26417 ns 25833.5 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 302777.5 ns 307846.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 178666.5 ns 180187.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 180292 ns 179750 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 184416.5 ns 183375 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 179709 ns 179041 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 55677 ns 57080 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 591146.5 ns 584708.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 588583 ns 587833 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 593062 ns 595750 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 582708.5 ns 587000 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 285027 ns 286439 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5667 ns 6541.5 ns 0.87
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7167 ns 6708 ns 1.07
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7895.5 ns 7500 ns 1.05
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7291 ns 5750 ns 1.27
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 69657.5 ns 70275 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14167 ns 13937.5 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14958 ns 14708 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15854.5 ns 15583 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14583 ns 13500 ns 1.08
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 460443 ns 465284 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1194208.5 ns 1198000 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1216792 ns 1218958 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1262604 ns 1268562.5 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1318166.5 ns 1315416 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301559 ns 302635 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4098416 ns 4311792 ns 0.95
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4352937.5 ns 4360354 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4631875 ns 4524583 ns 1.02
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 4436562.5 ns 4481833 ns 0.99
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1042661.5 ns 1039337 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1750 ns 1833 ns 0.95
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1833 ns 1834 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1875 ns 1833 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23523 ns 23819 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4792 ns 4834 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4875 ns 4875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4916 ns 5000 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 187370 ns 189325 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5500 ns 6250 ns 0.88
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6334 ns 6084 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8604 ns 8291 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7292 ns 5750 ns 1.27
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 54466 ns 56699 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10958 ns 11125 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11792 ns 12083 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11708.5 ns 11875 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11166 ns 11125 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 330839 ns 333470 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 333 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22873.5 ns 23140 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2708 ns 2834 ns 0.96
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2959 ns 2709 ns 1.09
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3042 ns 3042 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2750 ns 2750 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 157537.5 ns 160474 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10750 ns 11833 ns 0.91
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 13708 ns 12500 ns 1.10
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 14958 ns 15000 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 14583 ns 11667 ns 1.25
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 55574.5 ns 57479 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25209 ns 24667 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 25250 ns 25000 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25375 ns 25583 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24979.5 ns 25125 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 292656 ns 294701.5 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4208 ns 4167 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4125 ns 4208 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4167 ns 4208 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4167 ns 4125 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24774 ns 25243 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16333 ns 15959 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16125 ns 16167 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16125 ns 16500 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16084 ns 16125 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 195031.5 ns 196657.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5708 ns 5667 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5750 ns 5708 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5750 ns 5709 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5709 ns 5708 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 33326 ns 34103 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 21125 ns 20375 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20875 ns 21166 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21583 ns 21500 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21500 ns 21083 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 175195.5 ns 178406.5 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 415708 ns 380541 ns 1.09
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 376667 ns 375333 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 471499.5 ns 487875 ns 0.97
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 523500 ns 532687 ns 0.98
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66680.5 ns 67192 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 924750.5 ns 993167 ns 0.93
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 849291 ns 884334 ns 0.96
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1217521 ns 1238562.5 ns 0.98
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 1302292 ns 1412624.5 ns 0.92
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 189339 ns 189581 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 79792 ns 86875 ns 0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 82667 ns 80583 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 84208 ns 85875 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82833 ns 80791.5 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193132 ns 192886.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1917625.5 ns 1924208 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1915292 ns 1916917 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1940917 ns 1920541 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1896541 ns 1907750 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 395963 ns 398152 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21798 ns 22307 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1791 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1875 ns 1792 ns 1.05
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1833 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 167505 ns 170162 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5834 ns 6792 ns 0.86
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 7500 ns 7458.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9958 ns 9604.5 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6875 ns 6458.5 ns 1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 58244.5 ns 60140 ns 0.97
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9375 ns 8875 ns 1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9333 ns 9208 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9354.5 ns 9250 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9625 ns 9208 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 302935 ns 308605.5 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 119443416.5 ns 156095333.5 ns 0.77
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 173896250 ns 174294250 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 155811625 ns 147908167 ns 1.05
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 108054541 ns 105395375 ns 1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5469386 ns 5479498 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 616746166.5 ns 674867041 ns 0.91
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 555745625 ns 555334333 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 468855125 ns 454020333.5 ns 1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 760571396 ns 758003104 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34956216 ns 34951781 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 648663875 ns 701059834 ns 0.93
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 664591146 ns 666716125.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 601178041.5 ns 580121499.5 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 746069334 ns 741952792 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59458 ns 57708 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47083 ns 47333 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39166 ns 47250 ns 0.83
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83208 ns 83959 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37582 ns 37806 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1926708 ns 1934958.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1983042 ns 1972000 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1986937.5 ns 1976374.5 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1850250 ns 1886667 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 173017.5 ns 174540 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 265187.5 ns 274833.5 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 267959 ns 267625 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 276771 ns 288750 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 266917 ns 275791.5 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 128834.5 ns 127747 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 604083 ns 588791.5 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 692833.5 ns 676334 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 705709 ns 669375.5 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 590291.5 ns 637708 ns 0.93
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 683429 ns 705367 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2195333 ns 2201812.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2225625 ns 2173417 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2230583 ns 2204166 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2183333 ns 2175854 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 133325.5 ns 133869 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5480833 ns 5561000 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5508958 ns 5485083 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5585895.5 ns 5500791 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5490125 ns 5486667 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 766206 ns 758600 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 646750 ns 650375 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 660250 ns 639375 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 642917 ns 639250 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 647375 ns 645541 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 47306 ns 46906 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1828875 ns 1797375 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1721042 ns 1723000 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1665209 ns 1729417 ns 0.96
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2097000 ns 2102375 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 223896.5 ns 224012.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58667 ns 57125 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47750 ns 46792 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 38958 ns 46792 ns 0.83
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82750 ns 83625 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 29191 ns 28934 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2029083.5 ns 2042125 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2091166 ns 2085750 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2107249.5 ns 2086104 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1994854.5 ns 1992187.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 190986 ns 192769 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13371291 ns 13486000 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12436583.5 ns 12454854 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12675625 ns 12584062 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15146959 ns 15166646 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 517535.5 ns 516981.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47259416 ns 47757417 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41746209 ns 41920875 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 41384750 ns 41057895.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58440500 ns 58660917 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3203835 ns 3200471 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 73984667 ns 74173979 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 91223791.5 ns 68296125 ns 1.34
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90609938 ns 90853250 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 77234000 ns 76369500 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59000 ns 57542 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47417 ns 47333 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 38917 ns 47208 ns 0.82
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81125 ns 83542 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 47741 ns 47283 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1911646 ns 1917416.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1970541 ns 1969750 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1976417 ns 1977666 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1882083 ns 1891062.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 195868.5 ns 191945 ns 1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 333 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 32615 ns 32084 ns 1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6500 ns 6166 ns 1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6375 ns 6417 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6750 ns 6959 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6375 ns 6334 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 176818 ns 173427.5 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32102 ns 31620 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2625 ns 2667 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2875 ns 2792 ns 1.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2916 ns 2959 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2625 ns 2625 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 164236.5 ns 161588.5 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 286096229 ns 322222750 ns 0.89
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 339570541 ns 341161875 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 321242167 ns 313409520.5 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 271493208 ns 272857666 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7111512 ns 7106282 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 987492667 ns 1057275812.5 ns 0.93
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 939040416 ns 937359791 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 868433209 ns 852420750 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1162204042 ns 1161160000 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34040446 ns 34076180 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1310851000.5 ns 1357441042 ns 0.97
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1685402625 ns 1321006541.5 ns 1.28
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1648347125 ns 1604272875 ns 1.03
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1310788750 ns 1302899708.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1412625 ns 1417312.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1412041.5 ns 1438625 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1424625 ns 1422375 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1408334 ns 1404187.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 128501 ns 127360 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5028875 ns 5059667 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5030104 ns 5032458 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5062042 ns 5024750 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5014021 ns 5017709 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 597004.5 ns 498493.5 ns 1.20
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 168008834 ns 172134417 ns 0.98
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 130299417 ns 132190854 ns 0.99
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 148283479 ns 125671875 ns 1.18
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 161948354 ns 162159562.5 ns 1.00
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 5052268 ns 4881912.5 ns 1.03
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 662817209 ns 676531000 ns 0.98
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 492884417 ns 642244500 ns 0.77
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 507367709 ns 502997666 ns 1.01
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 678320708 ns 678617458 ns 1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 17294527 ns 17408311 ns 0.99
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 8884604 ns 9098854 ns 0.98
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8801959 ns 8775166.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 8221541.5 ns 7856833.5 ns 1.05
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 10127167 ns 10166000 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1611762 ns 1591045 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 36027125 ns 37558563 ns 0.96
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 36933063 ns 37073459 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 34547750 ns 33526542 ns 1.03
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 38824854 ns 38790125 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6452267 ns 6476971 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47375 ns 47333 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47250 ns 47333 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47542 ns 47625 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47333 ns 47125 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 19020 ns 19085 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50312.5 ns 50333 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50500 ns 52875 ns 0.96
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50958.5 ns 53083 ns 0.96
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50333 ns 50250 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 226580 ns 184149.5 ns 1.23
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6542 ns 7458 ns 0.88
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7187.5 ns 7333 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9083 ns 8667 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8625 ns 6708 ns 1.29
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 117383.5 ns 84192.5 ns 1.39
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9625 ns 9917 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10208 ns 9917 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10333.5 ns 11041 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10209 ns 9917 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 723908.5 ns 493810 ns 1.47
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6083 ns 7542 ns 0.81
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 8250 ns 7667 ns 1.08
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 9417 ns 9667 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 8375 ns 5417 ns 1.55
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 157024.5 ns 91440.5 ns 1.72
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13292 ns 12625 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13792 ns 13833 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13708 ns 14000 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12834 ns 12666 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 618769 ns 454481 ns 1.36
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1042 ns 1000 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1042 ns 1083 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1042 ns 1083 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 1000 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 32863 ns 32617 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7875 ns 7708 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8000 ns 8145.5 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8208 ns 8500 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8250 ns 8125 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 246953.5 ns 196206.5 ns 1.26
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 25062.5 ns 23083 ns 1.09
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23291.5 ns 23375 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23542 ns 23583 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23250 ns 23750 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18661 ns 18627 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52625 ns 52500 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52833 ns 52875 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52875 ns 53417 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52333 ns 52166 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 364018 ns 249106 ns 1.46
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1403750 ns 1448167 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1451354 ns 1405000 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1407542 ns 1405874.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1406458 ns 1403917 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 196760 ns 195637 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5023250 ns 5038167 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5018687.5 ns 5020646 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5042125 ns 5017458 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5001750 ns 5008375 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 766930 ns 558064 ns 1.37
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3048708 ns 3065354.5 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2082646 ns 2082084 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2300125 ns 2285291 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4855000 ns 4897375 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 583278 ns 583035 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24263250 ns 24715854 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18905459 ns 18870292 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 19193375 ns 18758208 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 36575416 ns 36783917 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3216229 ns 3184571 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 34013563 ns 34426125 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28342229 ns 28319896 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28436750 ns 28022958.5 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 43339875 ns 41761166.5 ns 1.04
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 144288959 ns 144957333 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 142279583 ns 142855500 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 126469000.5 ns 124763354 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 168866000 ns 173311167 ns 0.97
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22582893 ns 22559600 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 1275599313 ns 956543708 ns 1.33
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1058487228.5 ns 1622781604 ns 0.65
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 712851209 ns 1236835833 ns 0.58
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 668538250 ns 673901750 ns 0.99
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 119108875 ns 118606884 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 83125 ns 74208 ns 1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 76208 ns 74834 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78125 ns 86875 ns 0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72729 ns 73041.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 365097 ns 204598.5 ns 1.78
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 189959 ns 278208.5 ns 0.68
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 287792 ns 202666.5 ns 1.42
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 268875 ns 288416 ns 0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 189583.5 ns 287917 ns 0.66
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1559670.5 ns 1117217.5 ns 1.40
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35476167 ns 36148959 ns 0.98
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 35447729.5 ns 35295854 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32304459 ns 32189834 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40935146 ns 40944021 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5843273 ns 5845476 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 147875542 ns 151293125 ns 0.98
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 152751312.5 ns 152622708.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 139824437 ns 134152417 ns 1.04
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 287719375 ns 287902584 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34882914 ns 34882228 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 120880395.5 ns 155688000 ns 0.78
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174358791 ns 174601250 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 155429791 ns 147696687.5 ns 1.05
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 106966959 ns 106151041.5 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5456342 ns 5471843 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 470623375 ns 518343938 ns 0.91
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 466918000 ns 467330167 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 456589562.5 ns 438511083.5 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 742113834 ns 738327500 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 32255425 ns 32271735 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 706243291.5 ns 689829417 ns 1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 652697541.5 ns 655962042 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 591007625 ns 572893458 ns 1.03
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 851805375 ns 850499333 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1320583.5 ns 1204208 ns 1.10
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 965875 ns 909228.5 ns 1.06
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 736687.5 ns 975604.5 ns 0.76
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 1944666.5 ns 2068166 ns 0.94
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 564187.5 ns 573967.5 ns 0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2971708.5 ns 2921979 ns 1.02
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2620334 ns 2595937 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2535604 ns 2601958 ns 0.97
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3604083.5 ns 3701291 ns 0.97
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1878347.5 ns 1629819 ns 1.15
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 6649958 ns 6735042 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 6493042 ns 6496187.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 6437479.5 ns 6432833.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 4435750 ns 4458667 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7375 ns 7208 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6208 ns 6084 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5375 ns 6125 ns 0.88
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9916 ns 10000 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25400 ns 25112 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213645.5 ns 214479.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221833 ns 219625 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221250 ns 221583 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 205875 ns 206125 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 293719.5 ns 247799 ns 1.19
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 301604437.5 ns 312548750 ns 0.96
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 221356625 ns 223228250 ns 0.99
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 223278083.5 ns 196993083 ns 1.13
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 312163250 ns 310829208 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7672763 ns 7675013 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1078062604.5 ns 1097849625.5 ns 0.98
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 896268771 ns 906889750 ns 0.99
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 880668729 ns 868243875 ns 1.01
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1161143188 ns 1161595250 ns 1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26517571 ns 26504585 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5500 ns 5250 ns 1.05
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5750 ns 6520.5 ns 0.88
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9437.5 ns 7375 ns 1.28
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5875 ns 5125 ns 1.15
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 201555 ns 155225.5 ns 1.30
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 6917 ns 1.08
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7458 ns 7541 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7750 ns 7584 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7041.5 ns 7250 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 699933.5 ns 614403 ns 1.14
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 542 ns 0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 583 ns 584 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 458 ns 1.09
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 23724.5 ns 24324 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9208 ns 9209 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9625 ns 9333 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9604.5 ns 9709 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9042 ns 9083 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 234828.5 ns 214987 ns 1.09
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 351500 ns 352000 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 350896 ns 351167 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 354624.5 ns 352000 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 351708 ns 351667 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 20984 ns 21526 ns 0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 775417 ns 822667 ns 0.94
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 824916 ns 803791 ns 1.03
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 830958 ns 774000 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 823958 ns 819209 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 306663 ns 271931 ns 1.13
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 338083 ns 315625 ns 1.07
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 341500 ns 334062.5 ns 1.02
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 443667 ns 448958 ns 0.99
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 325667 ns 335542 ns 0.97
batchedmm(16, Bsize=32)/forward/GPU/CUDA 17821 ns 18135.5 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 696042 ns 693229 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 739416.5 ns 737125 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1042874.5 ns 1034583 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 692645.5 ns 697563 ns 0.99
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 273141.5 ns 240714.5 ns 1.13
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 358458.5 ns 329166 ns 1.09
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 349125 ns 345354 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 431291.5 ns 424875 ns 1.02
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 370875 ns 374166 ns 0.99
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22357.5 ns 22796 ns 0.98
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 756625 ns 753187.5 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 744208.5 ns 751083 ns 0.99
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1073250 ns 1069042 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 818125.5 ns 824250 ns 0.99
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 221398.5 ns 214489 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3459 ns 3458 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3541 ns 3500 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3792 ns 3875 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3291 ns 3292 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 17956 ns 18145 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4208 ns 4417 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4208 ns 4208 ns 1
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4416 ns 4333 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4125 ns 4209 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 275839.5 ns 237972.5 ns 1.16
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3792 ns 6417 ns 0.59
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3375 ns 4042 ns 0.83
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6750 ns 6542 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6625 ns 3375 ns 1.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 205448.5 ns 174590 ns 1.18
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8334 ns 8209 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8459 ns 8250 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8500 ns 8708 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8541 ns 8709 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1183984 ns 1063088 ns 1.11
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 202625 ns 203375 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 210416 ns 209625 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209292 ns 210958 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200000 ns 200833 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34588 ns 34926 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 603792 ns 601916 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 670625 ns 633750 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 630958 ns 622208.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 631187.5 ns 586000 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 352652 ns 307649.5 ns 1.15
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 967521 ns 966417 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 927063 ns 932833 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 964437.5 ns 945958.5 ns 1.02
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 1281853.5 ns 1291166 ns 0.99
batchedmm(128, Bsize=128)/forward/GPU/CUDA 207244 ns 208387 ns 0.99
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4451771 ns 4606250 ns 0.97
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4482750 ns 4489917 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4474208 ns 4299708 ns 1.04
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 6201166 ns 6229250 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 945549 ns 933347.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3604.5 ns 3875 ns 0.93
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3167 ns 3833 ns 0.83
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6792 ns 6167 ns 1.10
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3167 ns 2917 ns 1.09
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 233201 ns 191984.5 ns 1.21
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7666 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7375 ns 7125 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7291 ns 7667 ns 0.95
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7083 ns 7208 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 1014881 ns 941897 ns 1.08
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1602833.5 ns 1602667 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1187916 ns 1171416 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1364062 ns 1364375 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2343729.5 ns 2512583 ns 0.93
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 212955.5 ns 215456.5 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12334792 ns 12345833 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9602042 ns 9563708.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9404958 ns 9248333 ns 1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 17966833 ns 18039541.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1949853 ns 1941766 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17347084 ns 17410875 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14365000 ns 14343875 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14512666 ns 14290187.5 ns 1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21005479.5 ns 21033375 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 89791 ns 93146 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 91729.5 ns 89750 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 94291 ns 92375 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 117416.5 ns 104667 ns 1.12
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 126285 ns 126306.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2023917 ns 2057146 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2013416.5 ns 2030833 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2058875 ns 2027062.5 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2027875 ns 2024458 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1031286 ns 951168 ns 1.08
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 346791.5 ns 327771 ns 1.06
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 343583.5 ns 344667 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 412250 ns 393729 ns 1.05
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 306166 ns 312667 ns 0.98
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16010 ns 16220 ns 0.99
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 702291 ns 703375.5 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 728979.5 ns 721271 ns 1.01
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 1025458 ns 1023666.5 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 639875 ns 653917 ns 0.98
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 193209 ns 187186 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7292 ns 7083 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6083 ns 6125 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5334 ns 5833 ns 0.91
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10000 ns 9916 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33620 ns 34409 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220479.5 ns 214083 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 231958 ns 222333.5 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 232041 ns 221187.5 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 220500 ns 206125 ns 1.07
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 311751 ns 301322.5 ns 1.03
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3708 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3708 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3709 ns 3667 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3667 ns 3625 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22440 ns 23004 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14500 ns 14250 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14417 ns 14333 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14167 ns 14500 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14291 ns 14416 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 468658 ns 460312.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 95166 ns 92937.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 138021 ns 133375 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 99167 ns 96583.5 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 122458 ns 136958 ns 0.89
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125691 ns 125681 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1931875 ns 1754208.5 ns 1.10
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1954979 ns 1922334 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1946854 ns 1933417 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1923729.5 ns 1927416.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 940251.5 ns 955943 ns 0.98
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 880500 ns 857708 ns 1.03
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 815125 ns 817583 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1172292 ns 1222291.5 ns 0.96
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 960167 ns 963416 ns 1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA 270704 ns 275885 ns 0.98
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2803000 ns 2826354 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2526833 ns 2472708.5 ns 1.02
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3361333 ns 3311750 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3405875 ns 3417042 ns 1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1569154 ns 1599363 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 15146 ns 15667 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18000 ns 15541 ns 1.16
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21666 ns 18791 ns 1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18125 ns 15042 ns 1.20
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 141811.5 ns 143363 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 217083 ns 221562 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229375 ns 257625 ns 0.89
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 257396 ns 216167 ns 1.19
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215833 ns 253521 ns 0.85
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 635765.5 ns 648580 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 219750 ns 221958 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 221500 ns 222584 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 226021 ns 222875 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 223937.5 ns 219542 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 270450 ns 287448 ns 0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 509917 ns 560521 ns 0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 557729 ns 506729 ns 1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 549792 ns 497875 ns 1.10
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 555791 ns 524917 ns 1.06
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1308245 ns 1378195 ns 0.95
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 333479 ns 312208.5 ns 1.07
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 335541.5 ns 334917 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 437333 ns 355354.5 ns 1.23
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 319417 ns 323229.5 ns 0.99
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16583 ns 16853 ns 0.98
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 715333 ns 710916 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 730292 ns 725333.5 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 1025458.5 ns 1020291 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 655792 ns 666458 ns 0.98
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 193313 ns 196645 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17625 ns 18292 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17625 ns 17250 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20437.5 ns 20250 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18000 ns 16687 ns 1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 144711.5 ns 147801.5 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 216667 ns 219292 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 224083 ns 219437.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 226625 ns 213646 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 223417 ns 222104.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 903796 ns 1001312.5 ns 0.90
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4625 ns 6458 ns 0.72
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6750 ns 4792 ns 1.41
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7438 ns 7250 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6625 ns 4458 ns 1.49
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 174159.5 ns 238642 ns 0.73
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10437.5 ns 10792 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10750 ns 10375 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10770.5 ns 11375 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10833 ns 10333 ns 1.05
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 1024421 ns 1064757 ns 0.96
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3646 ns 6042 ns 0.60
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3334 ns 3792 ns 0.88
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5625 ns 4750 ns 1.18
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3500 ns 3209 ns 1.09
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 231660 ns 236410 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7708 ns 7250 ns 1.06
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7792 ns 7333 ns 1.06
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7625 ns 8042 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7167 ns 7584 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1037611.5 ns 1074231 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23838833 ns 24130479 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33990646 ns 38799500 ns 0.88
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 41585708 ns 37733750 ns 1.10
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34896229 ns 34918167 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1839186 ns 1843476 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 184662833 ns 186803646 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 159634000 ns 159613166 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 151746084 ns 146295625 ns 1.04
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 415075875 ns 412659125 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16506413 ns 16523543 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 427351833 ns 436777542 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 251624521 ns 253178667 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 233926312.5 ns 232826083.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 484091542 ns 484428667 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 181666 ns 183792 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 183416.5 ns 182000 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 186125 ns 185584 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 183834 ns 182354.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 173529.5 ns 220958.5 ns 0.79
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 587541 ns 593000 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 600458 ns 587187 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 632375 ns 588166 ns 1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 631354 ns 632000 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1005977 ns 1068694.5 ns 0.94
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3816041.5 ns 3862583.5 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3637833 ns 3623187 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3539646 ns 3513333 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 5351396 ns 5351459 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 554127 ns 534395 ns 1.04
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17372333 ns 17921270.5 ns 0.97
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17218458.5 ns 17168125 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16979478.5 ns 16586271 ns 1.02
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 22177625 ns 22125084 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2616933 ns 2619299 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 542 ns 625 ns 0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 459 ns 459 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32036 ns 32390 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9667 ns 9417 ns 1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9750 ns 8875 ns 1.10
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10125 ns 10125 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9291 ns 9125 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 260858 ns 265134.5 ns 0.98
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 506491042 ns 505787208 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 428949104 ns 430827229 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 474815000 ns 432173291.5 ns 1.10
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 671461979 ns 584857000 ns 1.15
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12484614.5 ns 12384263 ns 1.01
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 2043435104.5 ns 2073799791.5 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1631358667 ns 1628408167 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1546812271 ns 1495535812 ns 1.03
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2216473375.5 ns 2213815333 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49204869.5 ns 49261027.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1642542 ns 1644542 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1194625 ns 1184062.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1380791 ns 1367187.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2487084 ns 2468292 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215546 ns 217369 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12711687.5 ns 12780979.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9927625 ns 9943666 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9788604.5 ns 9649896 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18464437.5 ns 18379437 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1995889.5 ns 2035807.5 ns 0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17669166.5 ns 17754833 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14709437.5 ns 14655042 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14807645.5 ns 14543333 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21465708 ns 21358459 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26291 ns 26583 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26167 ns 26208 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23873 ns 23360 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66917 ns 66834 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 67333 ns 67542 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67083 ns 67083 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66833 ns 66875 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 382426 ns 392635.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203834 ns 203542 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209542 ns 209584 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209584 ns 209708 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199584 ns 199875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26132 ns 25945.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 613833.5 ns 608625 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 636667 ns 632958.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 671166.5 ns 622333 ns 1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 628229.5 ns 584541.5 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 308600 ns 349189 ns 0.88
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 671687.5 ns 653500 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 645937.5 ns 670875 ns 0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 644791.5 ns 547042 ns 1.18
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 676334 ns 675666.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131667 ns 131441 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2241875 ns 2289416 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2192250 ns 2233958 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2297042 ns 2245708 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2246249.5 ns 2234188 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1114838 ns 1153968 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 16791 ns 17583 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17500 ns 17000 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20958 ns 21083.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16770.5 ns 17479 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 143001 ns 142918 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 230375 ns 226645.5 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 231791.5 ns 230417 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 266208 ns 220688 ns 1.21
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 260728.5 ns 218917 ns 1.19
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 959584 ns 981199 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 541 ns 0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 458 ns 1.09
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23163 ns 23217 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9604.5 ns 10041.5 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10292 ns 10125 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10625 ns 10417 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9584 ns 9250 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 255611 ns 255034 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5416.5 ns 5916 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5750 ns 6229.5 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9458 ns 8563 ns 1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5708 ns 5500 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 219432 ns 222902 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7833 ns 7250 ns 1.08
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7750 ns 7709 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7709 ns 7750 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7000 ns 6958.5 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 764584 ns 767625.5 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1959 ns 2291 ns 0.86
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2083 ns 2250 ns 0.93
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2417 ns 2333 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2208 ns 2333 ns 0.95
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 17893 ns 17725 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6875 ns 6542 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6542 ns 6667 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6583 ns 6958 ns 0.95
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6291 ns 6583 ns 0.96
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 320459 ns 317996.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 747709 ns 748750 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 749833 ns 747083 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 754999.5 ns 747042 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 749375 ns 749125 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21357 ns 21402 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 774854 ns 790729 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 792687.5 ns 790333.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 817042 ns 773125 ns 1.06
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 811166 ns 775458.5 ns 1.05
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 295013.5 ns 291072 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7334 ns 7209 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 6042 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5208.5 ns 6083 ns 0.86
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10166 ns 10125 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33519 ns 32814 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 219666 ns 220166 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 268125 ns 240583 ns 1.11
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 252000.5 ns 228583 ns 1.10
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213562 ns 255708 ns 0.84
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 354278 ns 355564.5 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10875 ns 12541 ns 0.87
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11833 ns 10500 ns 1.13
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12770.5 ns 13167 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12000 ns 10125 ns 1.19
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 238132.5 ns 239405.5 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24708 ns 24791.5 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24584 ns 24375 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25292 ns 25541 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24500 ns 24812.5 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1094067.5 ns 1085912 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 106709834 ns 108107292 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 116906583.5 ns 117455666.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 127036729 ns 120529584 ns 1.05
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117807000 ns 117307042 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2657653 ns 2652543 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 392558792 ns 395929750 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 365774917 ns 367066041 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 431860937.5 ns 354756333 ns 1.22
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 483379250 ns 484413208 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15196086 ns 15198392 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 758564875.5 ns 767591687.5 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 761412666 ns 579795958 ns 1.31
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 748747542 ns 743372729 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 765232583 ns 765609167 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6625 ns 7458.5 ns 0.89
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7334 ns 7479.5 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 9041.5 ns 8916 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8250 ns 6708 ns 1.23
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 231038.5 ns 232243 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14625 ns 13917 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14750 ns 14125 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14292 ns 15166 ns 0.94
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14542 ns 14458 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1043294.5 ns 1035695.5 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5875 ns 9042 ns 0.65
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7959 ns 6833 ns 1.16
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 9167 ns 9750 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6333 ns 5500 ns 1.15
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 228571 ns 227355.5 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12791 ns 12625 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13167 ns 12959 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13375 ns 12917 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12333 ns 12292 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 779066.5 ns 753887 ns 1.03
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 347625 ns 327750 ns 1.06
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 342625 ns 342666.5 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 416812 ns 398083 ns 1.05
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 307083 ns 317437.5 ns 0.97
batchedmm(2, Bsize=128)/forward/GPU/CUDA 17023 ns 16593 ns 1.03
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 710208.5 ns 702854.5 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 732125 ns 720833 ns 1.02
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 1032542 ns 1025771 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 653979.5 ns 661750 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 200196.5 ns 196204.5 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 334 ns 292 ns 1.14
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 333 ns 291 ns 1.14
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 23569 ns 23062 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6375 ns 6333 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6584 ns 6584 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6834 ns 6792 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6042 ns 6250 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 241926 ns 236488 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5708 ns 5833 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5834 ns 5792 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5875 ns 5875 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5708 ns 5667 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 24556.5 ns 24282 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21562.5 ns 21687 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 22000 ns 21584 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21709 ns 21750 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21167 ns 21063 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 265433.5 ns 260349.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 144917 ns 172458 ns 0.84
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 191292 ns 185292 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 149333 ns 148917 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 149250 ns 186625 ns 0.80
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167659 ns 166632 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1319292 ns 1351354.5 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1331416 ns 1310042 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1362958 ns 1312208 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1326125 ns 1317292 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1343729.5 ns 1279433 ns 1.05
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22250 ns 24291 ns 0.92
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 23791 ns 22125 ns 1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 25875 ns 25958 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 23666.5 ns 21916 ns 1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 286115 ns 277859 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 146125 ns 127896 ns 1.14
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 118500 ns 174583 ns 0.68
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 129833 ns 118667 ns 1.09
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 175792 ns 135125 ns 1.30
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1461317 ns 1390180 ns 1.05
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 291 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23352 ns 22950 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6334 ns 6416.5 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6459 ns 6625 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6709 ns 6834 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6125 ns 6250 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 258095.5 ns 253555 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4625 ns 6000 ns 0.77
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4125 ns 4167 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7625 ns 7375 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4895.5 ns 4666 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 256357.5 ns 241371.5 ns 1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9959 ns 10166 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10125 ns 10042 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10333 ns 10625 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10333 ns 10333 ns 1
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1358318.5 ns 1304285.5 ns 1.04
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1584 ns 1584 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1583 ns 1583 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23389 ns 22830 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5667 ns 5708 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5875 ns 5667 ns 1.04
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6000 ns 6042 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5625 ns 5583 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 275350.5 ns 270940 ns 1.02
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6780125 ns 6820479 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6371125 ns 6334041.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6531396 ns 6486416.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7625875 ns 7665459 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 214804 ns 213607.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24015354 ns 24142500 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21285667 ns 21253833 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21085125 ns 20999479 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29769250 ns 29726209 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2112477.5 ns 2083084.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 37264541.5 ns 37375166.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 45538167 ns 33959583 ns 1.34
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45665125 ns 45667583 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 38235958 ns 37873562.5 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6208 ns 6979.5 ns 0.89
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5958.5 ns 6667 ns 0.89
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8750 ns 8104.5 ns 1.08
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7500 ns 5479.5 ns 1.37
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 236550 ns 228629.5 ns 1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8750 ns 8375 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8375 ns 8375 ns 1
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8500 ns 8584 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8958 ns 8125 ns 1.10
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1063848.5 ns 1060872.5 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1554084 ns 1527229 ns 1.02
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1262375 ns 1259812.5 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1631958.5 ns 1616208 ns 1.01
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2152375 ns 2147979 ns 1.00
lenet(28, 28, 1, 128)/forward/GPU/CUDA 277465 ns 271439 ns 1.02
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7881667 ns 7973083.5 ns 0.99
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6612667 ns 6586020.5 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7276167 ns 7034625 ns 1.03
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10468062.5 ns 10461334 ns 1.00
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1876576 ns 1861989 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 346375 ns 318167 ns 1.09
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 348937.5 ns 341959 ns 1.02
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 423416.5 ns 408000 ns 1.04
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 336687 ns 345291 ns 0.98
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46390 ns 46596 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 735208 ns 734812.5 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 782458 ns 781000 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1081666.5 ns 1068667 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 758458.5 ns 746084 ns 1.02
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 311011.5 ns 299516.5 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397375 ns 397708 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288250 ns 288000 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 212583 ns 288125 ns 0.74
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 754104.5 ns 752083 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 44494 ns 44143 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 675959 ns 633750 ns 1.07
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 532333 ns 531000 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 474000 ns 530834 ns 0.89
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 973417 ns 973250 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 189847 ns 188258.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 599375 ns 667374.5 ns 0.90
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 650333 ns 643458.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 660375 ns 545833 ns 1.21
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 655833.5 ns 678833.5 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132321 ns 131695 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2469395.5 ns 2403188 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2363959 ns 2439250 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2519875.5 ns 2454541 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2465916 ns 2454542 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1345989 ns 1200754 ns 1.12
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 345583 ns 325000 ns 1.06
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 342834 ns 340500 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 416375 ns 394250 ns 1.06
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 306979.5 ns 314000 ns 0.98
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16330 ns 15982 ns 1.02
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 703104 ns 702813 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 729708 ns 719125 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 1026937.5 ns 1024146 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 645959 ns 651667 ns 0.99
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 199885.5 ns 196545 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1460542 ns 1458417 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1500583 ns 1503167 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1491791 ns 1499542 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1441917 ns 1439209 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 41671 ns 40255 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5133500 ns 5142459 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5293250 ns 5295000.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5309521 ns 5017687.5 ns 1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4977042 ns 4991625 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 197710 ns 197920.5 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3709 ns 3667 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3666 ns 3667 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33362 ns 33701 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15125 ns 14917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15500 ns 15333 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15125 ns 15375 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15083 ns 15125 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 381216.5 ns 380032 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71375 ns 71375 ns 1
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71208 ns 71292 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71583 ns 71250 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 71208 ns 71250 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113946.5 ns 113118 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 319833 ns 322292 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 319208 ns 321459 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 327125 ns 327292 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 318375 ns 318334 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 195156 ns 196182.5 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 959 ns 1000 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1042 ns 1083 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1000 ns 958 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 23764 ns 23902 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8084 ns 7959 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8542 ns 8083 ns 1.06
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8416 ns 8541 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7833.5 ns 8125 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 263039 ns 263222.5 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 472416 ns 451021 ns 1.05
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 468125 ns 470667 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 549250 ns 556978.5 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 550333 ns 567333 ns 0.97
batchedmm(128, Bsize=32)/forward/GPU/CUDA 128804.5 ns 129930 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1375292 ns 1413124.5 ns 0.97
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1372208 ns 1374375 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1633459 ns 1599125 ns 1.02
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 1580500 ns 1589500 ns 0.99
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 274739 ns 275820 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 416 ns 416 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 291 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31574 ns 31985 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6458 ns 6375 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6875 ns 6375 ns 1.08
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6708 ns 6833 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6000 ns 6291 ns 0.95
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 261869 ns 265480 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1727625 ns 1723041.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1783958 ns 1770375 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1730916 ns 1726791 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1729333 ns 1769792 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168455 ns 169107.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4352625 ns 4370833 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4372937.5 ns 4358458 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4412458 ns 4355958 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4358042 ns 4350000 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1234725 ns 1170977 ns 1.05
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6709 ns 6625 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6584 ns 6750 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7417 ns 7041 ns 1.05
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6542 ns 9000 ns 0.73
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 19619.5 ns 21354 ns 0.92
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 51083 ns 33104.5 ns 1.54
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 35625 ns 51458 ns 0.69
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 49875 ns 33083 ns 1.51
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 70208 ns 51042 ns 1.38
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 211156 ns 211403.5 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 354291 ns 332479 ns 1.07
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 347584 ns 345500 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 432708 ns 420625 ns 1.03
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 319521.5 ns 326208 ns 0.98
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18053 ns 18610.5 ns 0.97
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 719104 ns 719166 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 735979 ns 732604 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 1039063 ns 1029625 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 672750 ns 679354 ns 0.99
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 343671.5 ns 345590 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75417 ns 75167 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75333 ns 75125 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75708 ns 75292 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 74709 ns 74875 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46983 ns 47792 ns 0.98
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 324417 ns 334542 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 327000 ns 340667 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 334917 ns 326000 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 324083 ns 326708 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 207721.5 ns 213631.5 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1486334 ns 1484750 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1527500 ns 1530208 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1519000 ns 1526875 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1466541 ns 1463833 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 51914 ns 52711 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5119333.5 ns 5145375.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5300396 ns 5286834 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5303708 ns 4997792 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4989375 ns 4998437.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 201413 ns 207150 ns 0.97
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28167 ns 28209 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28166 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28333 ns 28292 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28208 ns 28209 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24393 ns 24880 ns 0.98
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66542 ns 66375 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66292 ns 66584 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66542 ns 66542 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66584 ns 66541 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 530998 ns 537867.5 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1493250 ns 1339125 ns 1.12
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1120167 ns 1143854 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 947625 ns 1056979.5 ns 0.90
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2256500 ns 2227833 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 570331 ns 577124.5 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3075542 ns 3019562 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2732479 ns 2730250 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2643125 ns 2578250 ns 1.03
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3814770.5 ns 3815792 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2010818 ns 2002712 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 8738917 ns 8920709 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 8777854.5 ns 8781875 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 8781417 ns 8792854 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 6360687.5 ns 6367541.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 81146 ns 84000 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81708.5 ns 82083 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 83708 ns 84583 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 87687.5 ns 80791.5 ns 1.09
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192383.5 ns 192031 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2016791.5 ns 2015625 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2012708 ns 2019458.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2041312 ns 1745917 ns 1.17
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2015208 ns 2013895.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 798885.5 ns 797860.5 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.