diff --git a/Project.toml b/Project.toml
index d32d76ffd..dbbe4c427 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "AMDGPU"
 uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 authors = ["Julian P Samaroo ", "Valentin Churavy ", "Anton Smirnov "]
-version = "1.1.3"
+version = "1.1.4"

 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
diff --git a/docs/make.jl b/docs/make.jl
index db6b78cab..0735739f1 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -27,10 +27,8 @@ function main()
         "Exceptions" => "exceptions.md",
         "Profiling" => "profiling.md",
         "Memory" => "memory.md",
+        "Caching Memory Allocator" => "caching_allocator.md",
         "Host-Call" => "hostcall.md",
-        "Intrinsics" => [
-            "Execution Control" => "execution_control.md",
-        ],
         "Printing" => "printing.md",
         "Logging" => "logging.md",
         "API Reference" => "api.md"
diff --git a/docs/src/assets/gc-vram-breakdown.png b/docs/src/assets/gc-vram-breakdown.png
new file mode 100644
index 000000000..a1ccd6698
Binary files /dev/null and b/docs/src/assets/gc-vram-breakdown.png differ
diff --git a/docs/src/assets/with-caching-allocator.png b/docs/src/assets/with-caching-allocator.png
new file mode 100644
index 000000000..5b6ecd034
Binary files /dev/null and b/docs/src/assets/with-caching-allocator.png differ
diff --git a/docs/src/assets/without-caching-allocator.png b/docs/src/assets/without-caching-allocator.png
new file mode 100644
index 000000000..15d1ca890
Binary files /dev/null and b/docs/src/assets/without-caching-allocator.png differ
diff --git a/docs/src/caching_allocator.md b/docs/src/caching_allocator.md
new file mode 100644
index 000000000..6181caeee
--- /dev/null
+++ b/docs/src/caching_allocator.md
@@ -0,0 +1,76 @@
+# Caching Memory Allocator
+
+Julia uses garbage collection (GC) for automatic memory management.
+However, the GC does not know about other memory spaces:
+it sees no difference between a 1 KiB GPU allocation and a 1 GiB one,
+and does not free them in time.
+
+This leads to situations where all of the GPU memory is used,
+even though your algorithm only requires a fraction of it.
+
+The current mechanism for dealing with OOM (out-of-memory) errors during
+allocations is to manually trigger the GC and retry the allocation,
+doing this in several rounds, each more aggressive than the previous one.
+
+However, manually triggering the GC is very expensive, since it requires
+scanning all Julia objects, not just ROCArrays, so the actual memory freeing
+takes only a fraction of the GC time:
+![](./assets/gc-vram-breakdown.png)
+
+In the image above, the red region is a call to the GC and the green region
+is where GPU memory is actually being freed.
+
+---
+
+To help with memory management, we can use a caching memory allocator.
+It is useful in scenarios where we execute the same function multiple times
+with the same memory allocation pattern.
+One such example is training DL models: given the model and its parameters,
+we compute the loss and the gradients w.r.t. it, then perform an in-place
+parameter update.
+In this case, every iteration performs the same operations and memory
+allocations, and with the caching allocator we can efficiently re-use that
+memory without returning it to the OS.
+
+## Example
+
+We have a for-loop where each iteration requires 2 GiB of VRAM.
+We create a caching allocator with the name `:loop` and pass it a function to
+execute.
+The first iteration will allocate, but subsequent ones won't.
+
+```julia
+using AMDGPU
+
+function main()
+    n = 1024^2 * 256
+    for i in 1:1000
+        AMDGPU.with_caching_allocator(:loop, n) do n
+            sin.(AMDGPU.rand(Float32, n)) # 2 GiB allocation
+            return
+        end
+    end
+end
+```
+
+The reason for marking a region of code in which to re-use the memory,
+rather than extending the cache to the whole program, is that we cannot
+rely on the GC to tell us when the memory is no longer used (it is too slow
+for that), so we delimit such a region manually.
+
+You can free all memory held by an allocator by invalidating it using its name
+with [`AMDGPU.invalidate_caching_allocator!`](@ref).
+And if you want some region of code within [`AMDGPU.with_caching_allocator`](@ref)
+to execute without relying on the cache, use [`AMDGPU.with_no_caching`](@ref).
+
+||Without Caching Allocator|With Caching Allocator|
+|:---:|:---:|:---:|
+|VRAM Usage|![](./assets/without-caching-allocator.png)|![](./assets/with-caching-allocator.png)|
+|Execution time (seconds)|`12.865149`|`0.020943`|
+
+## API
+
+```@docs
+AMDGPU.with_caching_allocator
+AMDGPU.with_no_caching
+AMDGPU.invalidate_caching_allocator!
+```
diff --git a/docs/src/execution_control.md b/docs/src/execution_control.md
deleted file mode 100644
index 4fb4681d7..000000000
--- a/docs/src/execution_control.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# Execution Control and Intrinsics
-
-GPU execution is similar to CPU execution in some ways, although there are many
-differences. AMD GPUs have Compute Units (CUs), which can be thought of like
-CPU cores. Those CUs have (on pre-Navi architectures) 64 "shader processors",
-which are essentially the same as CPU SIMD lanes. The lanes in a CU operate in
-lockstep just like CPU SIMD lanes, and have execution masks and various kinds
-of SIMD instructions available. CUs execute wavefronts, which are pieces of
-work split off from a single kernel launch. A single CU can run one out of many
-wavefronts (one is chosen by the CU scheduler each cycle), which allows for
-very efficient parallel and concurrent execution on the device. Each wavefront
-runs independently of the other wavefronts, only stopping to synchronize with
-other wavefronts or terminate when specified by the program.
-
-We can control wavefront execution through a variety of intrinsics provided by
-ROCm. For example, the `endpgm()` intrinsic stops the current wavefront's
-execution, and is also automatically inserted by the compiler at the end of
-each kernel (except in certain unique cases).
-
-`signal_completion(x)` signals the "kernel doorbell" with the value `x`, which
-is the signal checked by the CPU `wait` call to determine when the kernel has
-completed. This doorbell is set to `0` automatically by GPU hardware once the
-kernel is complete.
-
-`sendmsg(x,y=0)` and `sendmsghalt(x,y=0)` can be used to signal special
-conditions to the scheduler/hardware, such as making requests to stop wavefront
-generation, or halt all running wavefronts. Check the ISA manual for details!
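As a quick illustration of how the documented entry points fit together (this is editorial commentary, not part of the patch), here is a hedged usage sketch combining `with_caching_allocator`, `with_no_caching`, and `invalidate_caching_allocator!`. The allocator name `:train`, the helper `step`, and the sizes are illustrative assumptions only:

```julia
using AMDGPU

function step(n)
    x = AMDGPU.rand(Float32, n)  # first call allocates; later calls hit the cache
    y = sin.(x)                  # same (eltype, dims) → same cache bucket
    # Results that must outlive the cached region should be allocated
    # outside the cache via `with_no_caching`.
    AMDGPU.with_no_caching() do
        copy(y)
    end
end

function run(; iters = 100, n = 1024^2)
    out = nothing
    for i in 1:iters
        out = AMDGPU.with_caching_allocator(:train, n) do n
            step(n)
        end
    end
    # Once the workload is done, release everything the cache still holds.
    AMDGPU.invalidate_caching_allocator!(:train)
    return out
end
```

Note how the do-block returns a no-cache `copy` rather than a cached array: after `with_caching_allocator` returns, everything allocated under the cache is marked free for re-use, so cached arrays must not escape the region.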
diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl
index 3c46f1070..15f8531de 100644
--- a/src/AMDGPU.jl
+++ b/src/AMDGPU.jl
@@ -114,7 +114,7 @@ include("tls.jl")
 include("highlevel.jl")
 include("reflection.jl")
 include("array.jl")
-include("memory_record.jl")
+include("caching_allocator.jl")
 include("conversions.jl")
 include("broadcast.jl")
 include("exception_handler.jl")
diff --git a/src/array.jl b/src/array.jl
index b045e3006..793be1689 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -7,11 +7,24 @@ mutable struct ROCArray{T, N, B} <: AbstractGPUArray{T, N}
         ::UndefInitializer, dims::Dims{N},
     ) where {T, N, B <: Mem.AbstractAMDBuffer}
         @assert isbitstype(T) "ROCArray only supports bits types"
-        data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
-        x = new{T, N, B}(data, dims, 0)
-        x = finalizer(unsafe_free!, x)
-        RECORD_MEMORY[] && record!(x)
-        return x
+
+        alloc_name = cache_alloc_name()
+        # Do not use the caching allocator if it is not set or
+        # if the buffer is not device memory.
+        x = if !(B <: Mem.HIPBuffer) || alloc_name == :none
+            data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
+            x = new{T, N, B}(data, dims, 0)
+        else
+            alloc = cache_allocator!(alloc_name)
+            tmp = alloc!(alloc, B, T, dims)
+            if tmp ≡ nothing
+                data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
+                tmp = new{T, N, B}(data, dims, 0)
+                add_busy!(alloc, tmp)
+            end
+            tmp::ROCArray{T, N, B}
+        end
+        return finalizer(unsafe_free!, x)
     end

     function ROCArray{T, N}(
diff --git a/src/caching_allocator.jl b/src/caching_allocator.jl
new file mode 100644
index 000000000..03880f443
--- /dev/null
+++ b/src/caching_allocator.jl
@@ -0,0 +1,151 @@
+# NOTE: EXPERIMENTAL API.
+
+struct CacheAllocator
+    lock::ReentrantLock
+    busy::Dict{UInt64, Vector{ROCArray}} # hash((T, dims)) => ROCArray[]
+    free::Dict{UInt64, Vector{ROCArray}}
+end
+
+CacheAllocator() = CacheAllocator(
+    ReentrantLock(),
+    Dict{UInt64, Vector{ROCArray}}(),
+    Dict{UInt64, Vector{ROCArray}}(),
+)
+
+const CACHE_ALLOCS::LockedObject{Dict{Symbol, CacheAllocator}} =
+    LockedObject(Dict{Symbol, CacheAllocator}())
+
+function cache_allocator!(cache_name::Symbol)
+    allocs = CACHE_ALLOCS.payload
+    alloc = get(allocs, cache_name, nothing)
+    alloc ≡ nothing || return alloc
+
+    return Base.@lock CACHE_ALLOCS.lock begin
+        allocs[cache_name] = CacheAllocator()
+    end
+end
+
+function get_free_pool(alloc::CacheAllocator, uid)
+    free_pool = get(alloc.free, uid, nothing)
+    if free_pool ≡ nothing
+        free_pool = Base.@lock alloc.lock alloc.free[uid] = ROCArray[]
+    end
+    return free_pool
+end
+
+function get_busy_pool(alloc::CacheAllocator, uid)
+    busy_pool = get(alloc.busy, uid, nothing)
+    if busy_pool ≡ nothing
+        busy_pool = Base.@lock alloc.lock alloc.busy[uid] = ROCArray[]
+    end
+    return busy_pool
+end
+
+function alloc!(
+    alloc::CacheAllocator, ::Type{Mem.HIPBuffer}, ::Type{T}, dims::Dims{N},
+)::Maybe{ROCArray{T, N, Mem.HIPBuffer}} where {T, N}
+    uid = hash((T, dims))
+    free_pool = get_free_pool(alloc, uid)
+    isempty(free_pool) && return nothing
+
+    # @info "Cache hit"
+    busy_pool = get_busy_pool(alloc, uid)
+    x = pop!(free_pool)
+    # Array was manually freed via `unsafe_free!`.
+    x.buf.freed && return nothing
+
+    push!(busy_pool, x)
+    return x
+end
+
+# Mark array `x` as busy; used during cache misses to register new allocations.
+function add_busy!(alloc::CacheAllocator, x::ROCArray{T}) where T
+    uid = hash((T, size(x)))
+    busy_pool = get_busy_pool(alloc, uid)
+    Base.@lock alloc.lock push!(busy_pool, x)
+    return
+end
+
+function free_busy!(alloc::CacheAllocator)
+    for uid in keys(alloc.busy)
+        free_pool = get_free_pool(alloc, uid)
+        busy_pool = get_busy_pool(alloc, uid)
+        isempty(busy_pool) && continue
+
+        Base.@lock alloc.lock begin
+            append!(free_pool, busy_pool)
+            empty!(busy_pool)
+        end
+    end
+end
+
+# Public API.
+
+"""
+    with_caching_allocator(f, alloc_name::Symbol, args...)
+
+Execute function `f` with arguments `args...` using
+the caching allocator given by its name `alloc_name`.
+
+All GPU memory allocations will attempt to hit this cache
+before doing an actual allocation (in case of a cache miss).
+After executing `f`, all "busy" memory within the allocator is marked as free,
+so it can be re-used on the next call.
+
+# Returns
+
+Result of the `f` function.
+"""
+function with_caching_allocator(f, alloc_name::Symbol, args...)
+    alloc = cache_allocator!(alloc_name)
+    # Enable usage of the cache allocator during allocations.
+    cache_alloc_name!(alloc_name)
+    res = f(args...)
+    # Mark all allocations during `f` as free to re-use and disable the allocator.
+    free_busy!(alloc)
+    cache_alloc_name!(:none)
+    return res
+end
+
+"""
+    with_no_caching(f)
+
+Execute function `f`, but avoid hitting any caching allocator.
+This is useful to call from within [`with_caching_allocator`](@ref),
+so that the memory is independent from it.
+
+# Returns
+
+Result of the `f` function.
+"""
+function with_no_caching(f)
+    alloc_name = cache_alloc_name()
+    cache_alloc_name!(:none)
+    res = f()
+    cache_alloc_name!(alloc_name)
+    return res
+end
+
+"""
+    invalidate_caching_allocator!(alloc_name::Symbol)
+
+Free all memory held by the caching allocator given by its name `alloc_name`.
+"""
+function invalidate_caching_allocator!(alloc_name::Symbol)
+    alloc = cache_allocator!(alloc_name)
+    alloc ≡ nothing && return
+
+    Base.@lock alloc.lock begin
+        for (_, pool) in alloc.free
+            map(AMDGPU.unsafe_free!, pool)
+        end
+        # TODO: if other threads use the same allocator, signal somehow that it is invalidated.
+        # TODO: error if the pool is in use, i.e. `busy` is non-empty?
+        for (_, pool) in alloc.busy
+            map(AMDGPU.unsafe_free!, pool)
+        end
+        empty!(alloc.busy)
+        empty!(alloc.free)
+    end
+    return
+end
diff --git a/src/memory_record.jl b/src/memory_record.jl
deleted file mode 100644
index 85b01c6d8..000000000
--- a/src/memory_record.jl
+++ /dev/null
@@ -1,48 +0,0 @@
-# NOTE: EXPERIMENTAL API.
-
-const MemoryRecords = LockedObject(Dict{UInt64, ROCArray}())
-
-const RECORD_MEMORY::Ref{Bool} = Ref(false)
-
-function record_memory!(rec::Bool; free::Bool = true, sync::Bool = false)
-    RECORD_MEMORY[] = rec
-    if !rec
-        free && free_records!(; sync)
-    end
-    return
-end
-
-record_memory() = RECORD_MEMORY[]
-
-function record!(x)
-    Base.lock(records -> records[_hash(x)] = x, MemoryRecords)
-    return
-end
-
-function free_records!(; sync::Bool = false)
-    Base.lock(MemoryRecords) do records
-        for (k, x) in records
-            unsafe_free!(x)
-        end
-        empty!(records)
-    end
-    sync && AMDGPU.synchronize()
-    return
-end
-
-function remove_record!(x)
-    record_memory() || return
-
-    k = _hash(x)
-    Base.lock(MemoryRecords) do records
-        if k in records.keys
-            pop!(records, k)
-        end
-    end
-    return
-end
-
-_hash(x::ROCArray) =
-    Base.hash(x.buf.rc.obj.mem.ptr,
-        Base.hash(x.offset,
-            Base.hash(x.dims)))
diff --git a/src/tls.jl b/src/tls.jl
index 0fedd9e6c..05444810e 100644
--- a/src/tls.jl
+++ b/src/tls.jl
@@ -2,14 +2,16 @@ mutable struct TaskLocalState
     device::HIPDevice
     context::HIPContext
     streams::Vector{Union{HIPStream,Nothing}}
+    cache_alloc_name::Symbol
 end

 function TaskLocalState(
     dev::HIPDevice = something(HIP.DEFAULT_DEVICE[], HIPDevice(1)),
     ctx::HIPContext = HIPContext(dev),
+    cache_alloc_name::Symbol = :none,
 )
     streams = Union{Nothing, HIPStream}[nothing for _ in 1:HIP.ndevices()]
-    TaskLocalState(dev, ctx, streams)
+    TaskLocalState(dev, ctx, streams, cache_alloc_name)
 end

 function Base.getproperty(state::TaskLocalState, field::Symbol)
@@ -26,6 +28,17 @@ task_local_state()::Union{Nothing, TaskLocalState} =
 task_local_state!(args...)::TaskLocalState =
     get!(() -> TaskLocalState(args...), task_local_storage(), :AMDGPU)

+Base.copy(state::TaskLocalState) = TaskLocalState(
+    state.device, state.context, copy(state.streams), state.cache_alloc_name)
+
+function Base.show(io::IO, state::TaskLocalState)
+    println(io, "TaskLocalState:")
+    println(io, "  Device: $(state.device)")
+    println(io, "  HIP Context: $(state.context)")
+    println(io, "  HIP Stream: $(state.stream)")
+    println(io, "  Cache Allocator: $(state.cache_alloc_name)")
+end
+
 """
     device()::HIPDevice

@@ -179,15 +192,10 @@ function priority!(f::Function, p::Symbol)
     end
 end

-Base.copy(state::TaskLocalState) = TaskLocalState(
-    state.device, state.context, copy(state.streams))
+cache_alloc_name()::Symbol = task_local_state!().cache_alloc_name

-function Base.show(io::IO, state::TaskLocalState)
-    println(io, "TaskLocalState:")
-    println(io, "  Device: $(state.device)")
-    println(io, "  HIP Context: $(state.context)")
-    println(io, "  HIP Stream: $(state.stream)")
-end
+cache_alloc_name!(name::Symbol)::Symbol =
+    task_local_state!().cache_alloc_name = name

 @inline function prepare_state(state = task_local_state!())
     hip_ctx = Ref{HIP.hipContext_t}()
@@ -196,13 +204,3 @@ end
     HIP.context!(state.context)
     return
 end
-
-function synchronize_rocm_tasks(ex)
-    quote
-        try
-            $(ex)
-        finally
-            $task_local_state() ≢ nothing && $device_synchronize()
-        end
-    end
-end
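One consequence of the `tls.jl` change above: since `cache_alloc_name` is stored in `TaskLocalState`, *activating* a cache is per-task state, while the caches themselves live in the global `CACHE_ALLOCS` dictionary keyed by name. A hedged sketch of that property, not from the patch; the names `:a`/`:b`, sizes, and iteration counts are made up for illustration:

```julia
using AMDGPU

# Each task carries its own `cache_alloc_name` in its TaskLocalState,
# so the two loops below populate two independent, globally named caches
# without toggling each other's allocator on or off.
ta = Threads.@spawn for _ in 1:10
    AMDGPU.with_caching_allocator(:a, 1024) do n
        sum(AMDGPU.rand(Float32, n)) # scalar result; safe to let the buffer be re-used
    end
end
tb = Threads.@spawn for _ in 1:10
    AMDGPU.with_caching_allocator(:b, 2048) do n
        sum(AMDGPU.rand(Float32, n))
    end
end
wait(ta); wait(tb)

# Release whatever both caches still hold.
AMDGPU.invalidate_caching_allocator!(:a)
AMDGPU.invalidate_caching_allocator!(:b)
```

Sharing one cache *name* across tasks is a different story: as the TODOs in `invalidate_caching_allocator!` note, concurrent invalidation of a cache another task is using is not yet signalled, so per-task names are the safer pattern for now.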