diff --git a/Project.toml b/Project.toml
index d32d76ffd..dbbe4c427 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "AMDGPU"
 uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 authors = ["Julian P Samaroo ", "Valentin Churavy ", "Anton Smirnov "]
-version = "1.1.3"
+version = "1.1.4"

 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
diff --git a/docs/make.jl b/docs/make.jl
index db6b78cab..0735739f1 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -27,10 +27,8 @@ function main()
         "Exceptions" => "exceptions.md",
         "Profiling" => "profiling.md",
         "Memory" => "memory.md",
+        "Caching Memory Allocator" => "caching_allocator.md",
         "Host-Call" => "hostcall.md",
-        "Intrinsics" => [
-            "Execution Control" => "execution_control.md",
-        ],
         "Printing" => "printing.md",
         "Logging" => "logging.md",
         "API Reference" => "api.md"
diff --git a/docs/src/assets/gc-vram-breakdown.png b/docs/src/assets/gc-vram-breakdown.png
new file mode 100644
index 000000000..a1ccd6698
Binary files /dev/null and b/docs/src/assets/gc-vram-breakdown.png differ
diff --git a/docs/src/assets/with-caching-allocator.png b/docs/src/assets/with-caching-allocator.png
new file mode 100644
index 000000000..5b6ecd034
Binary files /dev/null and b/docs/src/assets/with-caching-allocator.png differ
diff --git a/docs/src/assets/without-caching-allocator.png b/docs/src/assets/without-caching-allocator.png
new file mode 100644
index 000000000..15d1ca890
Binary files /dev/null and b/docs/src/assets/without-caching-allocator.png differ
diff --git a/docs/src/caching_allocator.md b/docs/src/caching_allocator.md
new file mode 100644
index 000000000..6181caeee
--- /dev/null
+++ b/docs/src/caching_allocator.md
@@ -0,0 +1,76 @@
+# Caching Memory Allocator
+
+Julia uses garbage collection (GC) for automatic memory management.
+However, the GC does not know about other memory spaces:
+it sees no difference between a 1 KiB GPU allocation and a 1 GiB one,
+and does not free them in time.
+
+This leads to situations where all of the GPU memory is used,
+even though your algorithm only requires a fraction of it.
+
+The current mechanism for dealing with OOM (out-of-memory) errors during
+allocations is to manually trigger the GC and retry the allocation,
+doing this in several rounds, each more aggressive than the previous one.
+
+However, manually triggering the GC is very expensive, since it requires
+scanning all Julia objects, not just ROCArrays, so the actual memory freeing
+takes only a fraction of the GC time:
+![](./assets/gc-vram-breakdown.png)
+
+In the image above, the red region is a call to the GC and the green region
+is where GPU memory is actually being freed.
+
+---
+
+To help with memory management, we can use a caching memory allocator.
+It is useful in scenarios where we execute the same function multiple times
+with the same memory allocation pattern.
+One such example is training DL models: given the model and its parameters,
+we compute the loss and the gradients w.r.t. it, then perform an in-place
+parameter update.
+In this case, every iteration performs the same operations and memory
+allocations, and with the caching allocator we can efficiently re-use that
+memory without returning it to the OS.
+
+## Example
+
+We have a for-loop where each iteration requires 2 GiB of VRAM.
+We create a caching allocator with the name `:loop` and pass it a function to
+execute.
+The first iteration will allocate, but subsequent ones won't.
+
+```julia
+using AMDGPU
+
+function main()
+    n = 1024^2 * 256
+    for i in 1:1000
+        AMDGPU.with_caching_allocator(:loop, n) do n
+            sin.(AMDGPU.rand(Float32, n)) # 2 GiB allocation
+            return
+        end
+    end
+end
+```
+
+The reason for marking a region of code in which to re-use the memory,
+rather than extending the cache to the whole program, is that we cannot
+rely on the GC to tell us when the memory is no longer used (it is too slow
+for that), so we delimit such a region manually.
+
+You can free all memory held by an allocator by invalidating it using its name
+with [`AMDGPU.invalidate_caching_allocator!`](@ref).
+And if you want some region of code within [`AMDGPU.with_caching_allocator`](@ref)
+to execute without relying on the cache, use [`AMDGPU.with_no_caching`](@ref).
+
+||Without Caching Allocator|With Caching Allocator|
+|:---:|:---:|:---:|
+|VRAM Usage|![](./assets/without-caching-allocator.png)|![](./assets/with-caching-allocator.png)|
+|Execution time (seconds)|`12.865149`|`0.020943`|
+
+## API
+
+```@docs
+AMDGPU.with_caching_allocator
+AMDGPU.with_no_caching
+AMDGPU.invalidate_caching_allocator!
+```
diff --git a/docs/src/execution_control.md b/docs/src/execution_control.md
deleted file mode 100644
index 4fb4681d7..000000000
--- a/docs/src/execution_control.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# Execution Control and Intrinsics
-
-GPU execution is similar to CPU execution in some ways, although there are many
-differences. AMD GPUs have Compute Units (CUs), which can be thought of like
-CPU cores. Those CUs have (on pre-Navi architectures) 64 "shader processors",
-which are essentially the same as CPU SIMD lanes. The lanes in a CU operate in
-lockstep just like CPU SIMD lanes, and have execution masks and various kinds
-of SIMD instructions available. CUs execute wavefronts, which are pieces of
-work split off from a single kernel launch. A single CU can run one out of many
-wavefronts (one is chosen by the CU scheduler each cycle), which allows for
-very efficient parallel and concurrent execution on the device. Each wavefront
-runs independently of the other wavefronts, only stopping to synchronize with
-other wavefronts or terminate when specified by the program.
-
-We can control wavefront execution through a variety of intrinsics provided by
-ROCm. For example, the `endpgm()` intrinsic stops the current wavefront's
-execution, and is also automatically inserted by the compiler at the end of
-each kernel (except in certain unique cases).
-
-`signal_completion(x)` signals the "kernel doorbell" with the value `x`, which
-is the signal checked by the CPU `wait` call to determine when the kernel has
-completed. This doorbell is set to `0` automatically by GPU hardware once the
-kernel is complete.
-
-`sendmsg(x,y=0)` and `sendmsghalt(x,y=0)` can be used to signal special
-conditions to the scheduler/hardware, such as making requests to stop wavefront
-generation, or halt all running wavefronts. Check the ISA manual for details!
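As a quick illustration of how the documented entry points fit together (this is editorial commentary, not part of the patch), here is a hedged usage sketch combining `with_caching_allocator`, `with_no_caching`, and `invalidate_caching_allocator!`. The allocator name `:train`, the helper `step`, and the sizes are illustrative assumptions only:

```julia
using AMDGPU

function step(n)
    x = AMDGPU.rand(Float32, n)  # first call allocates; later calls hit the cache
    y = sin.(x)                  # same (eltype, dims) → same cache bucket
    # Results that must outlive the cached region should be allocated
    # outside the cache via `with_no_caching`.
    AMDGPU.with_no_caching() do
        copy(y)
    end
end

function run(; iters = 100, n = 1024^2)
    out = nothing
    for i in 1:iters
        out = AMDGPU.with_caching_allocator(:train, n) do n
            step(n)
        end
    end
    # Once the workload is done, release everything the cache still holds.
    AMDGPU.invalidate_caching_allocator!(:train)
    return out
end
```

Note how the do-block returns a no-cache `copy` rather than a cached array: after `with_caching_allocator` returns, everything allocated under the cache is marked free for re-use, so cached arrays must not escape the region.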
diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl
index 3c46f1070..15f8531de 100644
--- a/src/AMDGPU.jl
+++ b/src/AMDGPU.jl
@@ -114,7 +114,7 @@ include("tls.jl")
 include("highlevel.jl")
 include("reflection.jl")
 include("array.jl")
-include("memory_record.jl")
+include("caching_allocator.jl")
 include("conversions.jl")
 include("broadcast.jl")
 include("exception_handler.jl")
diff --git a/src/array.jl b/src/array.jl
index b045e3006..793be1689 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -7,11 +7,24 @@ mutable struct ROCArray{T, N, B} <: AbstractGPUArray{T, N}
         ::UndefInitializer, dims::Dims{N},
     ) where {T, N, B <: Mem.AbstractAMDBuffer}
         @assert isbitstype(T) "ROCArray only supports bits types"
-        data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
-        x = new{T, N, B}(data, dims, 0)
-        x = finalizer(unsafe_free!, x)
-        RECORD_MEMORY[] && record!(x)
-        return x
+
+        alloc_name = cache_alloc_name()
+        # Do not use the caching allocator if it is not set or
+        # if the buffer is not device memory.
+        x = if !(B <: Mem.HIPBuffer) || alloc_name == :none
+            data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
+            x = new{T, N, B}(data, dims, 0)
+        else
+            alloc = cache_allocator!(alloc_name)
+            tmp = alloc!(alloc, B, T, dims)
+            if tmp ≡ nothing
+                data = DataRef(pool_free, pool_alloc(B, prod(dims) * sizeof(T)))
+                tmp = new{T, N, B}(data, dims, 0)
+                add_busy!(alloc, tmp)
+            end
+            tmp::ROCArray{T, N, B}
+        end
+        return finalizer(unsafe_free!, x)
     end

     function ROCArray{T, N}(
diff --git a/src/caching_allocator.jl b/src/caching_allocator.jl
new file mode 100644
index 000000000..03880f443
--- /dev/null
+++ b/src/caching_allocator.jl
@@ -0,0 +1,151 @@
+# NOTE: EXPERIMENTAL API.
+
+struct CacheAllocator
+    lock::ReentrantLock
+    busy::Dict{UInt64, Vector{ROCArray}} # hash((T, dims)) => ROCArray[]
+    free::Dict{UInt64, Vector{ROCArray}}
+end
+
+CacheAllocator() = CacheAllocator(
+    ReentrantLock(),
+    Dict{UInt64, Vector{ROCArray}}(),
+    Dict{UInt64, Vector{ROCArray}}(),
+)
+
+const CACHE_ALLOCS::LockedObject{Dict{Symbol, CacheAllocator}} =
+    LockedObject(Dict{Symbol, CacheAllocator}())
+
+function cache_allocator!(cache_name::Symbol)
+    allocs = CACHE_ALLOCS.payload
+    alloc = get(allocs, cache_name, nothing)
+    alloc ≡ nothing || return alloc
+
+    return Base.@lock CACHE_ALLOCS.lock begin
+        allocs[cache_name] = CacheAllocator()
+    end
+end
+
+function get_free_pool(alloc::CacheAllocator, uid)
+    free_pool = get(alloc.free, uid, nothing)
+    if free_pool ≡ nothing
+        free_pool = Base.@lock alloc.lock alloc.free[uid] = ROCArray[]
+    end
+    return free_pool
+end
+
+function get_busy_pool(alloc::CacheAllocator, uid)
+    busy_pool = get(alloc.busy, uid, nothing)
+    if busy_pool ≡ nothing
+        busy_pool = Base.@lock alloc.lock alloc.busy[uid] = ROCArray[]
+    end
+    return busy_pool
+end
+
+function alloc!(
+    alloc::CacheAllocator, ::Type{Mem.HIPBuffer}, ::Type{T}, dims::Dims{N},
+)::Maybe{ROCArray{T, N, Mem.HIPBuffer}} where {T, N}
+    uid = hash((T, dims))
+    free_pool = get_free_pool(alloc, uid)
+    isempty(free_pool) && return nothing
+
+    # @info "Cache hit"
+    busy_pool = get_busy_pool(alloc, uid)
+    x = pop!(free_pool)
+    # Array was manually freed via `unsafe_free!`.
+    x.buf.freed && return nothing
+
+    push!(busy_pool, x)
+    return x
+end
+
+# Mark array `x` as busy; used during cache misses to register new allocations.
+function add_busy!(alloc::CacheAllocator, x::ROCArray{T}) where T
+    uid = hash((T, size(x)))
+    busy_pool = get_busy_pool(alloc, uid)
+    Base.@lock alloc.lock push!(busy_pool, x)
+    return
+end
+
+function free_busy!(alloc::CacheAllocator)
+    for uid in keys(alloc.busy)
+        free_pool = get_free_pool(alloc, uid)
+        busy_pool = get_busy_pool(alloc, uid)
+        isempty(busy_pool) && continue
+
+        Base.@lock alloc.lock begin
+            append!(free_pool, busy_pool)
+            empty!(busy_pool)
+        end
+    end
+end
+
+# Public API.
+
+"""
+    with_caching_allocator(f, alloc_name::Symbol, args...)
+
+Execute function `f` with arguments `args...` using
+the caching allocator given by its name `alloc_name`.
+
+All GPU memory allocations will attempt to hit this cache
+before doing an actual allocation (in case of a cache miss).
+After executing `f`, all "busy" memory within the allocator is marked as free,
+so it can be re-used on the next call.
+
+# Returns
+
+Result of the `f` function.
+"""
+function with_caching_allocator(f, alloc_name::Symbol, args...)
+    alloc = cache_allocator!(alloc_name)
+    # Enable usage of the cache allocator during allocations.
+    cache_alloc_name!(alloc_name)
+    res = f(args...)
+    # Mark all allocations during `f` as free to re-use and disable the allocator.
+    free_busy!(alloc)
+    cache_alloc_name!(:none)
+    return res
+end
+
+"""
+    with_no_caching(f)
+
+Execute function `f`, but avoid hitting any caching allocator.
+This is useful to call from within [`with_caching_allocator`](@ref),
+so that the memory is independent from it.
+
+# Returns
+
+Result of the `f` function.
+"""
+function with_no_caching(f)
+    alloc_name = cache_alloc_name()
+    cache_alloc_name!(:none)
+    res = f()
+    cache_alloc_name!(alloc_name)
+    return res
+end
+
+"""
+    invalidate_caching_allocator!(alloc_name::Symbol)
+
+Free all memory held by the caching allocator given by its name `alloc_name`.
+"""
+function invalidate_caching_allocator!(alloc_name::Symbol)
+    alloc = cache_allocator!(alloc_name)
+    alloc ≡ nothing && return
+
+    Base.@lock alloc.lock begin
+        for (_, pool) in alloc.free
+            map(AMDGPU.unsafe_free!, pool)
+        end
+        # TODO: if other threads use the same allocator, signal somehow that it is invalidated.
+        # TODO: error if the pool is in use, i.e. `busy` is non-empty?
+        for (_, pool) in alloc.busy
+            map(AMDGPU.unsafe_free!, pool)
+        end
+        empty!(alloc.busy)
+        empty!(alloc.free)
+    end
+    return
+end
diff --git a/src/memory_record.jl b/src/memory_record.jl
deleted file mode 100644
index 85b01c6d8..000000000
--- a/src/memory_record.jl
+++ /dev/null
@@ -1,48 +0,0 @@
-# NOTE: EXPERIMENTAL API.
-
-const MemoryRecords = LockedObject(Dict{UInt64, ROCArray}())
-
-const RECORD_MEMORY::Ref{Bool} = Ref(false)
-
-function record_memory!(rec::Bool; free::Bool = true, sync::Bool = false)
-    RECORD_MEMORY[] = rec
-    if !rec
-        free && free_records!(; sync)
-    end
-    return
-end
-
-record_memory() = RECORD_MEMORY[]
-
-function record!(x)
-    Base.lock(records -> records[_hash(x)] = x, MemoryRecords)
-    return
-end
-
-function free_records!(; sync::Bool = false)
-    Base.lock(MemoryRecords) do records
-        for (k, x) in records
-            unsafe_free!(x)
-        end
-        empty!(records)
-    end
-    sync && AMDGPU.synchronize()
-    return
-end
-
-function remove_record!(x)
-    record_memory() || return
-
-    k = _hash(x)
-    Base.lock(MemoryRecords) do records
-        if k in records.keys
-            pop!(records, k)
-        end
-    end
-    return
-end
-
-_hash(x::ROCArray) =
-    Base.hash(x.buf.rc.obj.mem.ptr,
-        Base.hash(x.offset,
-            Base.hash(x.dims)))
diff --git a/src/tls.jl b/src/tls.jl
index 0fedd9e6c..05444810e 100644
--- a/src/tls.jl
+++ b/src/tls.jl
@@ -2,14 +2,16 @@ mutable struct TaskLocalState
     device::HIPDevice
     context::HIPContext
     streams::Vector{Union{HIPStream,Nothing}}
+    cache_alloc_name::Symbol
 end

 function TaskLocalState(
     dev::HIPDevice = something(HIP.DEFAULT_DEVICE[], HIPDevice(1)),
     ctx::HIPContext = HIPContext(dev),
+    cache_alloc_name::Symbol = :none,
 )
     streams = Union{Nothing, HIPStream}[nothing for _ in 1:HIP.ndevices()]
-    TaskLocalState(dev, ctx, streams)
+    TaskLocalState(dev, ctx, streams, cache_alloc_name)
 end

 function Base.getproperty(state::TaskLocalState, field::Symbol)
@@ -26,6 +28,17 @@ task_local_state()::Union{Nothing, TaskLocalState} =
 task_local_state!(args...)::TaskLocalState =
     get!(() -> TaskLocalState(args...), task_local_storage(), :AMDGPU)

+Base.copy(state::TaskLocalState) = TaskLocalState(
+    state.device, state.context, copy(state.streams), state.cache_alloc_name)
+
+function Base.show(io::IO, state::TaskLocalState)
+    println(io, "TaskLocalState:")
+    println(io, "  Device: $(state.device)")
+    println(io, "  HIP Context: $(state.context)")
+    println(io, "  HIP Stream: $(state.stream)")
+    println(io, "  Cache Allocator: $(state.cache_alloc_name)")
+end
+
 """
     device()::HIPDevice

@@ -179,15 +192,10 @@ function priority!(f::Function, p::Symbol)
     end
 end

-Base.copy(state::TaskLocalState) = TaskLocalState(
-    state.device, state.context, copy(state.streams))
+cache_alloc_name()::Symbol = task_local_state!().cache_alloc_name

-function Base.show(io::IO, state::TaskLocalState)
-    println(io, "TaskLocalState:")
-    println(io, "  Device: $(state.device)")
-    println(io, "  HIP Context: $(state.context)")
-    println(io, "  HIP Stream: $(state.stream)")
-end
+cache_alloc_name!(name::Symbol)::Symbol =
+    task_local_state!().cache_alloc_name = name

 @inline function prepare_state(state = task_local_state!())
     hip_ctx = Ref{HIP.hipContext_t}()
@@ -196,13 +204,3 @@ end
     HIP.context!(state.context)
     return
 end
-
-function synchronize_rocm_tasks(ex)
-    quote
-        try
-            $(ex)
-        finally
-            $task_local_state() ≢ nothing && $device_synchronize()
-        end
-    end
-end
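One consequence of the `tls.jl` change above: since `cache_alloc_name` is stored in `TaskLocalState`, *activating* a cache is per-task state, while the caches themselves live in the global `CACHE_ALLOCS` dictionary keyed by name. A hedged sketch of that property, not from the patch; the names `:a`/`:b`, sizes, and iteration counts are made up for illustration:

```julia
using AMDGPU

# Each task carries its own `cache_alloc_name` in its TaskLocalState,
# so the two loops below populate two independent, globally named caches
# without toggling each other's allocator on or off.
ta = Threads.@spawn for _ in 1:10
    AMDGPU.with_caching_allocator(:a, 1024) do n
        sum(AMDGPU.rand(Float32, n)) # scalar result; safe to let the buffer be re-used
    end
end
tb = Threads.@spawn for _ in 1:10
    AMDGPU.with_caching_allocator(:b, 2048) do n
        sum(AMDGPU.rand(Float32, n))
    end
end
wait(ta); wait(tb)

# Release whatever both caches still hold.
AMDGPU.invalidate_caching_allocator!(:a)
AMDGPU.invalidate_caching_allocator!(:b)
```

Sharing one cache *name* across tasks is a different story: as the TODOs in `invalidate_caching_allocator!` note, concurrent invalidation of a cache another task is using is not yet signalled, so per-task names are the safer pattern for now.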