diff --git a/Manifest.toml b/Manifest.toml index f77b08635..3a43bda5a 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -89,9 +89,9 @@ uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization", "Statistics"] -git-tree-sha1 = "df5b8569904c5c10e84c640984cfff054b18c086" +git-tree-sha1 = "ececbf05f8904c92814bdbd0aafd5540b0bf2e9a" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" -version = "6.4.1" +version = "7.0.1" [[GPUCompiler]] deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] @@ -101,9 +101,9 @@ version = "0.12.5" [[HIP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "ROCmCompilerSupport_jll", "ROCmDeviceLibs_jll", "ROCmOpenCLRuntime_jll", "hsa_rocr_jll"] -git-tree-sha1 = "0a7a9fb9cde9cd2225d9b6dc32c744785514e9b8" +git-tree-sha1 = "5097d8f7b6842156ab0928371b3d03fefd8decab" uuid = "2696aab5-0948-5276-aa9a-2a86a37016b8" -version = "4.0.0+0" +version = "4.0.0+1" [[InteractiveUtils]] deps = ["Markdown"] diff --git a/Project.toml b/Project.toml index b9959bc80..d183de555 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AMDGPU" uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" authors = ["Julian P Samaroo "] -version = "0.2.10" +version = "0.2.11" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" @@ -28,7 +28,7 @@ AbstractFFTs = "0.5, 1.0" Adapt = "3.0" BinaryProvider = "0.5" CEnum = "0.2, 0.3, 0.4" -GPUArrays = "6" +GPUArrays = "6, 7" GPUCompiler = "0.12" HIP_jll = "4" LLVM = "4" diff --git a/deps/build.jl b/deps/build.jl index 9ba68d0a3..85342295e 100644 --- a/deps/build.jl +++ b/deps/build.jl @@ -213,8 +213,11 @@ function main() if use_artifacts config[:device_libs_path] = ROCmDeviceLibs_jll.bitcode_path config[:device_libs_configured] = true + config[:device_libs_downloaded] = false else include("download_device_libs.jl") + config[:device_libs_configured] = true + config[:device_libs_downloaded] = true end ### Find external HIP-based libraries diff --git a/deps/loaddeps.jl b/deps/loaddeps.jl index af1d091b4..a351a466c 100644 --- a/deps/loaddeps.jl +++ b/deps/loaddeps.jl @@ -35,11 +35,13 @@ if !hip_configured const librocrand = nothing const libmiopen = nothing end -if !device_libs_configured +if device_libs_configured && device_libs_downloaded # Fallback to download device_libs_deps = joinpath(@__DIR__, "deps.jl") isfile(device_libs_deps) && include(device_libs_deps) const device_libs_path = joinpath(@__DIR__, "usr", "lib") +elseif !device_libs_configured + const device_libs_path = "" end const configured = hsa_configured diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl index 160d5eb98..45cc407f6 100644 --- a/src/AMDGPU.jl +++ b/src/AMDGPU.jl @@ -21,7 +21,7 @@ export get_agents, profile, get_first_isa_string, launch! export get_default_agent, get_default_queue export ROCArray, ROCVector, ROCMatrix, ROCVecOrMat -export roc, roczeros, rocones, rocfill +export roc ### HSA Runtime ### @@ -77,21 +77,25 @@ include("broadcast.jl") include("mapreduce.jl") #include("gpuarray_interface.jl") -allowscalar(x::Bool) = nothing +allowscalar(x::Bool) = GPUArrays.allowscalar(x) ### Initialization and Shutdown ### const HSA_REFCOUNT = Threads.Atomic{UInt}(0) function hsaref!() + #= if Threads.atomic_add!(HSA_REFCOUNT, UInt(1)) > typemax(UInt)-10 Core.println("HSA_REFCOUNT OVERFLOW!") exit(1) end + =# end function hsaunref!() + #= if Threads.atomic_sub!(HSA_REFCOUNT, UInt(1)) == 1 HSA.shut_down() end + =# end # Load binary dependencies @@ -126,6 +130,9 @@ check_library("MIOpen", libmiopen) # we need to load it after rocRAND.jl include(joinpath(@__DIR__, "random.jl")) +# Utilities +include("utils.jl") + function __init__() if hsa_configured # Make sure we load the library found by the last `] build` @@ -137,6 +144,7 @@ function __init__() status = HSA.init() if status == HSA.STATUS_SUCCESS hsaref!() + HSA_REFCOUNT[] = 1 # Register shutdown hook atexit() do hsaunref!() diff --git a/src/array.jl b/src/array.jl index b32d7d797..66a596a37 100644 --- a/src/array.jl +++ b/src/array.jl @@ -362,3 +362,10 @@ roc(xs) = adapt(Float32Adaptor(), xs) Base.unsafe_convert(::Type{Ptr{T}}, x::ROCArray{T}) where T = Base.unsafe_convert(Ptr{T}, x.buf) + +# some nice utilities + +ones(dims...) = ones(Float32, dims...) +ones(T::Type, dims...) = fill!(ROCArray{T}(undef, dims...), one(T)) +zeros(dims...) = zeros(Float32, dims...) +zeros(T::Type, dims...) = fill!(ROCArray{T}(undef, dims...), zero(T)) diff --git a/src/random.jl b/src/random.jl index cd8acbd54..f6f87a6ee 100644 --- a/src/random.jl +++ b/src/random.jl @@ -76,9 +76,11 @@ randn(T::Type, dim1::Integer, dims::Integer...; kwargs...) = # untyped out-of-place rand(dim1::Integer, dims::Integer...) = - Random.rand(rocrand_rng(), Dims((dim1, dims...))) + #Random.rand(rocrand_rng(), Dims((dim1, dims...))) + Random.rand!(ROCArray{Float32}(undef, dim1, dims...)) randn(dim1::Integer, dims::Integer...; kwargs...) = - Random.randn(rocrand_rng(), Dims((dim1, dims...)); kwargs...) + #Random.randn(rocrand_rng(), Dims((dim1, dims...)); kwargs...) + Random.randn!(ROCArray{Float32}(undef, dim1, dims...)) # rand_logn, rand_poisson const rand_logn = librocrand !== nothing ? rocRAND.rand_logn : (x...;kwargs...) -> error("Not supported without rocRAND.") diff --git a/src/signal.jl b/src/signal.jl index 32bed945b..aa48af58a 100644 --- a/src/signal.jl +++ b/src/signal.jl @@ -27,7 +27,7 @@ the minimum latency for the software waiter; lower values can decrease latency at the cost of increased polling load. `timeout`, if not `nothing`, sets the timeout for the signal, after which the call will error. """ -function Base.wait(signal::HSASignal; soft=true, minlat=0.001, timeout=nothing) +function Base.wait(signal::HSASignal; soft=true, minlat=0.000001, timeout=nothing) if soft start_time = time_ns() while true @@ -35,14 +35,18 @@ function Base.wait(signal::HSASignal; soft=true, minlat=0.001, timeout=nothing) if value < 1 return end - if timeout !== nothing - now_time = time_ns() - diff_time = (now_time - start_time) / 10^9 - if diff_time > timeout - error("Timeout while waiting on signal") - end + now_time = time_ns() + diff_time = (now_time - start_time) / 10^9 + if timeout !== nothing && diff_time > timeout + error("Timeout while waiting on signal") + end + if minlat < 0.001 && diff_time < 10^3 + # Use Libc.systemsleep for higher precision in the microsecond range + Libc.systemsleep(minlat) + yield() + else + sleep(minlat) end - sleep(minlat) end else # Wait on the dispatch completion signal until the kernel is finished diff --git a/src/utils.jl b/src/utils.jl new file mode 100644 index 000000000..95f6e0067 --- /dev/null +++ b/src/utils.jl @@ -0,0 +1,29 @@ +function versioninfo(io::IO=stdout) + println("HSA Runtime ($(hsa_configured ? "ready" : "MISSING"))") + if hsa_configured + println("- Version: $(libhsaruntime_version)") + println("- Initialized: $(repr(HSA_REFCOUNT[] > 0))") + end + println("HIP Runtime ($(hip_configured ? "ready" : "MISSING"))") + if hip_configured + # TODO: println("- Version: $(libhip_version)") + end + println("ROCm-Device-Libs ($(device_libs_configured ? "ready" : "MISSING"))") + if device_libs_configured + # TODO: println("- Version: $(device_libs_version)") + println("- Downloaded: $(repr(device_libs_downloaded))") + end + println("rocBLAS ($(librocblas !== nothing ? "ready" : "MISSING"))") + println("rocFFT ($(librocfft !== nothing ? "ready" : "MISSING"))") + println("rocRAND ($(librocrand !== nothing ? "ready" : "MISSING"))") + println("rocSPARSE ($(librocsparse !== nothing ? "ready" : "MISSING"))") + println("rocALUTION ($(librocalution !== nothing ? "ready" : "MISSING"))") + println("MIOpen ($(libmiopen !== nothing ? "ready" : "MISSING"))") + + if hsa_configured && HSA_REFCOUNT[] > 0 + println("HSA Agents ($(length(agents()))):") + for agent in agents() + println("- ", repr(agent)) + end + end +end