From e571300fae2e11e47eae65ea36649fd326e1835f Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Mon, 16 Oct 2023 21:10:57 -0400 Subject: [PATCH] [NDTensors][NDTensorsCUDAExt] Improve performance of CUDA backend (#1194) --- .../ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl | 3 +- NDTensors/ext/NDTensorsCUDAExt/adapt.jl | 2 +- NDTensors/ext/NDTensorsCUDAExt/imports.jl | 2 +- NDTensors/ext/NDTensorsCUDAExt/iscu.jl | 1 + NDTensors/src/NDTensors.jl | 6 + NDTensors/src/abstractarray/fill.jl | 10 +- NDTensors/src/abstractarray/mul.jl | 20 ++ NDTensors/src/abstractarray/permutedims.jl | 37 ++++ NDTensors/src/abstractarray/similar.jl | 5 + .../abstractarray/tensoralgebra/contract.jl | 176 +++++++++++++++++ NDTensors/src/array/mul.jl | 4 + NDTensors/src/array/permutedims.jl | 15 ++ NDTensors/src/arraytensor/array.jl | 4 +- NDTensors/src/dense/dense.jl | 1 - NDTensors/src/dense/densetensor.jl | 9 +- NDTensors/src/dense/tensoralgebra/contract.jl | 180 +----------------- NDTensors/src/imports.jl | 2 - NDTensors/src/linearalgebra/linearalgebra.jl | 44 ++++- NDTensors/src/tensor/permutedims.jl | 1 + NDTensors/src/tensor/set_types.jl | 3 + NDTensors/test/blocksparse.jl | 2 +- NDTensors/test/linearalgebra.jl | 4 - src/ITensors.jl | 1 + src/broadcast.jl | 49 +++-- src/imports.jl | 4 +- src/itensor.jl | 7 +- src/mps/dmrg.jl | 10 +- src/set_types.jl | 1 + src/tensor_operations/permutations.jl | 4 +- test/ITensorNetworkMaps/utils/utils.jl | 2 +- test/base/test_itensor_scalar_contract.jl | 8 +- 31 files changed, 387 insertions(+), 230 deletions(-) create mode 100644 NDTensors/ext/NDTensorsCUDAExt/iscu.jl create mode 100644 NDTensors/src/abstractarray/mul.jl create mode 100644 NDTensors/src/abstractarray/permutedims.jl create mode 100644 NDTensors/src/abstractarray/tensoralgebra/contract.jl create mode 100644 NDTensors/src/array/mul.jl create mode 100644 NDTensors/src/array/permutedims.jl create mode 100644 NDTensors/src/tensor/permutedims.jl create mode 100644 src/set_types.jl diff --git a/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl b/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl index 41f3e6ea57..369c398d98 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl @@ -4,7 +4,7 @@ using NDTensors using NDTensors.SetParameters using Adapt using Functors -using LinearAlgebra: BlasFloat +using LinearAlgebra if isdefined(Base, :get_extension) using CUDA @@ -18,6 +18,7 @@ end include("imports.jl") include("set_types.jl") +include("iscu.jl") include("adapt.jl") include("linearalgebra.jl") end diff --git a/NDTensors/ext/NDTensorsCUDAExt/adapt.jl b/NDTensors/ext/NDTensorsCUDAExt/adapt.jl index 718901da4c..165b5405ce 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/adapt.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/adapt.jl @@ -16,7 +16,7 @@ buffertype(::NDTensorCuArrayAdaptor{B}) where {B} = B function Adapt.adapt_storage(adaptor::NDTensorCuArrayAdaptor, xs::AbstractArray) ElT = eltype(xs) BufT = buffertype(adaptor) - return isbits(xs) ? xs : CuArray{ElT,1,BufT}(xs) + return isbits(xs) ? 
xs : adapt(CuArray{ElT,1,BufT}, xs) end function NDTensors.adapt_storagetype( diff --git a/NDTensors/ext/NDTensorsCUDAExt/imports.jl b/NDTensors/ext/NDTensorsCUDAExt/imports.jl index d8736ae0a2..96b8383c74 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/imports.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/imports.jl @@ -1,6 +1,6 @@ import NDTensors: cu, set_ndims, set_eltype, set_eltype_if_unspecified, similartype import NDTensors: - ContractionProperties, _contract!, GemmBackend, auto_select_backend, _gemm! + ContractionProperties, _contract!, GemmBackend, auto_select_backend, _gemm!, iscu import NDTensors.SetParameters: nparameters, get_parameter, set_parameter, default_parameter import .CUDA: CuArrayAdaptor diff --git a/NDTensors/ext/NDTensorsCUDAExt/iscu.jl b/NDTensors/ext/NDTensorsCUDAExt/iscu.jl new file mode 100644 index 0000000000..5c3fc95a25 --- /dev/null +++ b/NDTensors/ext/NDTensorsCUDAExt/iscu.jl @@ -0,0 +1 @@ +iscu(::Type{<:CuArray}) = true diff --git a/NDTensors/src/NDTensors.jl b/NDTensors/src/NDTensors.jl index 8445b925cb..641253dec0 100644 --- a/NDTensors/src/NDTensors.jl +++ b/NDTensors/src/NDTensors.jl @@ -50,8 +50,12 @@ include("abstractarray/set_types.jl") include("abstractarray/to_shape.jl") include("abstractarray/similar.jl") include("abstractarray/ndims.jl") +include("abstractarray/permutedims.jl") include("abstractarray/fill.jl") +include("abstractarray/mul.jl") include("array/set_types.jl") +include("array/permutedims.jl") +include("array/mul.jl") include("tupletools.jl") include("emptynumber.jl") include("nodata.jl") @@ -63,9 +67,11 @@ include("tensor/tensor.jl") include("dims.jl") include("tensor/set_types.jl") include("tensor/similar.jl") +include("tensor/permutedims.jl") include("adapt.jl") include("tensoralgebra/generic_tensor_operations.jl") include("tensoralgebra/contraction_logic.jl") +include("abstractarray/tensoralgebra/contract.jl") ##################################### # DenseTensor and DiagTensor diff --git a/NDTensors/src/abstractarray/fill.jl b/NDTensors/src/abstractarray/fill.jl index 85e8377d3a..146459a4ce 100644 --- a/NDTensors/src/abstractarray/fill.jl +++ b/NDTensors/src/abstractarray/fill.jl @@ -1,13 +1,11 @@ -function generic_randn(arraytype::Type{<:AbstractArray}, dim::Integer=0) +function generic_randn( + arraytype::Type{<:AbstractArray}, dim::Integer=0; rng=Random.default_rng() +) arraytype_specified = set_unspecified_parameters( leaf_parenttype(arraytype), DefaultParameters() ) data = similar(arraytype_specified, dim) - ElT = eltype(data) - for i in 1:length(data) - data[i] = randn(ElT) - end - return data + return randn!(rng, data) end function generic_zeros(arraytype::Type{<:AbstractArray}, dim::Integer=0) diff --git a/NDTensors/src/abstractarray/mul.jl b/NDTensors/src/abstractarray/mul.jl new file mode 100644 index 0000000000..954ecca3e1 --- /dev/null +++ b/NDTensors/src/abstractarray/mul.jl @@ -0,0 +1,20 @@ +function mul!!(CM::AbstractArray, AM::AbstractArray, BM::AbstractArray, α, β) + return mul!!( + leaf_parenttype(CM), CM, leaf_parenttype(AM), AM, leaf_parenttype(BM), BM, α, β + ) + return CM +end + +function mul!!( + ::Type{<:AbstractArray}, + CM, + ::Type{<:AbstractArray}, + AM, + ::Type{<:AbstractArray}, + BM, + α, + β, +) + mul!(CM, AM, BM, α, β) + return CM +end diff --git a/NDTensors/src/abstractarray/permutedims.jl b/NDTensors/src/abstractarray/permutedims.jl new file mode 100644 index 0000000000..f23db4ea73 --- /dev/null +++ b/NDTensors/src/abstractarray/permutedims.jl @@ -0,0 +1,37 @@ +## NOTICE!!: Here we are not importing 
Base.permutedims or Base.permutedims! but +## are writing our own implementation. This allows us to +# NDTensors.permutedims +function permutedims(M::AbstractArray, perm) + return permutedims(leaf_parenttype(M), M, perm) +end + +# NDTensors.permutedims +function permutedims(::Type{<:AbstractArray}, M, perm) + return Base.permutedims(M, perm) +end + +# NDTensors.permutedims! +function permutedims!(Mdest::AbstractArray, M::AbstractArray, perm) + return permutedims!(leaf_parenttype(Mdest), Mdest, leaf_parenttype(M), M, perm) +end + +# NDTensors.permutedims! +function permutedims!(::Type{<:AbstractArray}, Mdest, ::Type{<:AbstractArray}, M, perm) + return Base.permutedims!(Mdest, M, perm) +end + +function permutedims!!(B::AbstractArray, A::AbstractArray, perm, f) + return permutedims!!(leaf_parenttype(B), B, leaf_parenttype(A), A, perm, f) +end + +function permutedims!!( + Bleaftype::Type{<:AbstractArray}, B, Aleaftype::Type{<:AbstractArray}, A, perm, f +) + permutedims!(Bleaftype, B, Aleaftype, A, perm, f) + return B +end + +function permutedims!(::Type{<:AbstractArray}, B, ::Type{<:AbstractArray}, A, perm, f) + B .= f.(B, Base.permutedims(A, perm)) + return B +end diff --git a/NDTensors/src/abstractarray/similar.jl b/NDTensors/src/abstractarray/similar.jl index 7e8313cd1c..873e22e44e 100644 --- a/NDTensors/src/abstractarray/similar.jl +++ b/NDTensors/src/abstractarray/similar.jl @@ -56,6 +56,11 @@ function similar(arraytype::Type{<:AbstractArray}, dims::Tuple) return similartype(arraytype, shape)(undef, NDTensors.to_shape(arraytype, shape)) end +# For when there are CUArray specific issues inline +iscu(A::AbstractArray) = iscu(typeof(A)) +function iscu(A::Type{<:AbstractArray}) + return (leaf_parenttype(A) == A ? false : iscu(leaf_parenttype(A))) +end # This function actually allocates the data. # Catches conversions of dimensions specified by ranges # dimensions specified by integers with `Base.to_shape`. 
diff --git a/NDTensors/src/abstractarray/tensoralgebra/contract.jl b/NDTensors/src/abstractarray/tensoralgebra/contract.jl new file mode 100644 index 0000000000..5db9bf5370 --- /dev/null +++ b/NDTensors/src/abstractarray/tensoralgebra/contract.jl @@ -0,0 +1,176 @@ +using LinearAlgebra: BlasFloat +export backend_auto, backend_blas, backend_generic + +@eval struct GemmBackend{T} + (f::Type{<:GemmBackend})() = $(Expr(:new, :f)) +end +GemmBackend(s) = GemmBackend{Symbol(s)}() +macro GemmBackend_str(s) + return :(GemmBackend{$(Expr(:quote, Symbol(s)))}) +end + +const gemm_backend = Ref(:Auto) +function backend_auto() + return gemm_backend[] = :Auto +end +function backend_blas() + return gemm_backend[] = :BLAS +end +function backend_generic() + return gemm_backend[] = :Generic +end + +@inline function auto_select_backend( + ::Type{<:StridedVecOrMat{<:BlasFloat}}, + ::Type{<:StridedVecOrMat{<:BlasFloat}}, + ::Type{<:StridedVecOrMat{<:BlasFloat}}, +) + return GemmBackend(:BLAS) +end + +@inline function auto_select_backend( + ::Type{<:AbstractVecOrMat}, ::Type{<:AbstractVecOrMat}, ::Type{<:AbstractVecOrMat} +) + return GemmBackend(:Generic) +end + +function _gemm!( + tA, tB, alpha, A::TA, B::TB, beta, C::TC +) where {TA<:AbstractVecOrMat,TB<:AbstractVecOrMat,TC<:AbstractVecOrMat} + if gemm_backend[] == :Auto + _gemm!(auto_select_backend(TA, TB, TC), tA, tB, alpha, A, B, beta, C) + else + _gemm!(GemmBackend(gemm_backend[]), tA, tB, alpha, A, B, beta, C) + end +end + +# BLAS matmul +function _gemm!( + ::GemmBackend{:BLAS}, + tA, + tB, + alpha, + A::AbstractVecOrMat, + B::AbstractVecOrMat, + beta, + C::AbstractVecOrMat, +) + #@timeit_debug timer "BLAS.gemm!" begin + return BLAS.gemm!(tA, tB, alpha, A, B, beta, C) + #end # @timeit +end + +# generic matmul +function _gemm!( + ::GemmBackend{:Generic}, + tA, + tB, + alpha::AT, + A::AbstractVecOrMat, + B::AbstractVecOrMat, + beta::BT, + C::AbstractVecOrMat, +) where {AT,BT} + mul!(C, tA == 'T' ? transpose(A) : A, tB == 'T' ? 
transpose(B) : B, alpha, beta) + return C +end + +# Non-trivial permutation +function _contract_scalar_perm!( + Rᵃ::AbstractArray{ElR}, Tᵃ::AbstractArray, perm, α, β=zero(ElR) +) where {ElR} + if iszero(β) + if iszero(α) + fill!(Rᵃ, 0) + else + Rᵃ = permutedims!!(Rᵃ, Tᵃ, perm, (r, t) -> α * t) + end + elseif isone(β) + if iszero(α) + # Rᵃ .= Rᵃ + # No-op + else + Rᵃ = permutedims!!(Rᵃ, Tᵃ, perm, (r, t) -> r + α * t) + end + else + if iszero(α) + # Rᵃ .= β .* Rᵃ + LinearAlgebra.scal!(length(Rᵃ), β, Rᵃ, 1) + else + Rᵃ .= α .* permutedims(Tᵃ, perm) .+ β .* Rᵃ + end + end + return Rᵃ +end + +function _contract!( + CT::AbstractArray{El,NC}, + AT::AbstractArray{El,NA}, + BT::AbstractArray{El,NB}, + props::ContractionProperties, + α::Number=one(El), + β::Number=zero(El), +) where {El,NC,NA,NB} + tA = 'N' + if props.permuteA + #@timeit_debug timer "_contract!: permutedims A" begin + Ap = permutedims(AT, props.PA) + #end # @timeit + AM = transpose(reshape(Ap, (props.dmid, props.dleft))) + else + #A doesn't have to be permuted + if Atrans(props) + AM = transpose(reshape(AT, (props.dmid, props.dleft))) + else + AM = reshape(AT, (props.dleft, props.dmid)) + end + end + + tB = 'N' + if props.permuteB + #@timeit_debug timer "_contract!: permutedims B" begin + Bp = permutedims(BT, props.PB) + #end # @timeit + BM = reshape(Bp, (props.dmid, props.dright)) + else + if Btrans(props) + BM = transpose(reshape(BT, (props.dright, props.dmid))) + else + BM = reshape(BT, (props.dmid, props.dright)) + end + end + + # TODO: this logic may be wrong + if props.permuteC + # if we are computing C = α * A B + β * C + # we need to make sure C is permuted to the same + # ordering as A B which is the inverse of props.PC + if β ≠ 0 + CM = reshape(permutedims(CT, invperm(props.PC)), (props.dleft, props.dright)) + else + # Need to copy here since we will be permuting + # into C later + CM = reshape(copy(CT), (props.dleft, props.dright)) + end + else + if Ctrans(props) + CM = transpose(reshape(CT, (props.dright, props.dleft))) + else + CM = reshape(CT, (props.dleft, props.dright)) + end + end + + #tC = similar(CM) + #_gemm!(tA, tB, El(α), AM, BM, El(β), CM) + CM = mul!!(CM, AM, BM, El(α), El(β)) + + if props.permuteC + Cr = reshape(CM, props.newCrange) + # TODO: use invperm(pC) here? + #@timeit_debug timer "_contract!: permutedims C" begin + CT .= permutedims(Cr, props.PC) + #end # @timeit + end + + return CT +end diff --git a/NDTensors/src/array/mul.jl b/NDTensors/src/array/mul.jl new file mode 100644 index 0000000000..54104ce502 --- /dev/null +++ b/NDTensors/src/array/mul.jl @@ -0,0 +1,4 @@ +function mul!!(::Type{<:Array}, CM, ::Type{<:Array}, AM, ::Type{<:Array}, BM, α, β) + @strided mul!(CM, AM, BM, α, β) + return CM +end diff --git a/NDTensors/src/array/permutedims.jl b/NDTensors/src/array/permutedims.jl new file mode 100644 index 0000000000..7a033201fb --- /dev/null +++ b/NDTensors/src/array/permutedims.jl @@ -0,0 +1,15 @@ +# NDTensors.permutedims +function permutedims(::Type{<:Array}, M, perm) + return @strided Base.permutedims(M, perm) +end + +# NDTensors.permutedims! 
+function permutedims!(::Type{<:Array}, Mdest, ::Type{<:Array}, M, perm) + @strided Mdest .= Base.permutedims(M, perm) + return Mdest +end + +function permutedims!(::Type{<:Array}, B, ::Type{<:Array}, A, perm, f) + @strided B .= f.(B, Base.permutedims(A, perm)) + return B +end diff --git a/NDTensors/src/arraytensor/array.jl b/NDTensors/src/arraytensor/array.jl index 40e89c5181..997204394d 100644 --- a/NDTensors/src/arraytensor/array.jl +++ b/NDTensors/src/arraytensor/array.jl @@ -61,6 +61,8 @@ end function permutedims!( output_array::MatrixOrArrayStorage, array::MatrixOrArrayStorage, perm, f::Function ) - @strided output_array .= f.(output_array, permutedims(array, perm)) + output_array = permutedims!!( + leaf_parenttype(output_array), output_array, leaf_parenttype(array), array, perm, f + ) return output_array end diff --git a/NDTensors/src/dense/dense.jl b/NDTensors/src/dense/dense.jl index 12e9b05882..96afd695b5 100644 --- a/NDTensors/src/dense/dense.jl +++ b/NDTensors/src/dense/dense.jl @@ -1,7 +1,6 @@ # # Dense storage # -using LinearAlgebra: BlasFloat struct Dense{ElT,DataT<:AbstractArray} <: TensorStorage{ElT} data::DataT diff --git a/NDTensors/src/dense/densetensor.jl b/NDTensors/src/dense/densetensor.jl index 8a019e1f3a..ed773d6f0e 100644 --- a/NDTensors/src/dense/densetensor.jl +++ b/NDTensors/src/dense/densetensor.jl @@ -80,6 +80,10 @@ end # Single index # +@propagate_inbounds function getindex(T::DenseTensor{<:Number}) + return (iscu(T) ? NDTensors.cpu(data(T))[] : data(T)[]) +end + @propagate_inbounds function getindex(T::DenseTensor{<:Number}, I::Integer...) Base.@_inline_meta return getindex(data(T), Base._sub2ind(T, I...)) @@ -195,7 +199,7 @@ function permutedims!( ) where {N,StoreT<:StridedArray} RA = array(R) TA = array(T) - @strided RA .= permutedims(TA, perm) + permutedims!(RA, TA, perm) return R end @@ -243,8 +247,7 @@ function permutedims!( end RA = array(R) TA = array(T) - @strided RA .= f.(RA, permutedims(TA, perm)) - return R + return permutedims!!(RA, TA, perm, f) end """ diff --git a/NDTensors/src/dense/tensoralgebra/contract.jl b/NDTensors/src/dense/tensoralgebra/contract.jl index b5faa91616..e1d99ce2eb 100644 --- a/NDTensors/src/dense/tensoralgebra/contract.jl +++ b/NDTensors/src/dense/tensoralgebra/contract.jl @@ -1,79 +1,3 @@ -export backend_auto, backend_blas, backend_generic - -@eval struct GemmBackend{T} - (f::Type{<:GemmBackend})() = $(Expr(:new, :f)) -end -GemmBackend(s) = GemmBackend{Symbol(s)}() -macro GemmBackend_str(s) - return :(GemmBackend{$(Expr(:quote, Symbol(s)))}) -end - -const gemm_backend = Ref(:Auto) -function backend_auto() - return gemm_backend[] = :Auto -end -function backend_blas() - return gemm_backend[] = :BLAS -end -function backend_generic() - return gemm_backend[] = :Generic -end - -@inline function auto_select_backend( - ::Type{<:StridedVecOrMat{<:BlasFloat}}, - ::Type{<:StridedVecOrMat{<:BlasFloat}}, - ::Type{<:StridedVecOrMat{<:BlasFloat}}, -) - return GemmBackend(:BLAS) -end - -@inline function auto_select_backend( - ::Type{<:AbstractVecOrMat}, ::Type{<:AbstractVecOrMat}, ::Type{<:AbstractVecOrMat} -) - return GemmBackend(:Generic) -end - -function _gemm!( - tA, tB, alpha, A::TA, B::TB, beta, C::TC -) where {TA<:AbstractVecOrMat,TB<:AbstractVecOrMat,TC<:AbstractVecOrMat} - if gemm_backend[] == :Auto - _gemm!(auto_select_backend(TA, TB, TC), tA, tB, alpha, A, B, beta, C) - else - _gemm!(GemmBackend(gemm_backend[]), tA, tB, alpha, A, B, beta, C) - end -end - -# BLAS matmul -function _gemm!( - ::GemmBackend{:BLAS}, - tA, - tB, - 
alpha, - A::AbstractVecOrMat, - B::AbstractVecOrMat, - beta, - C::AbstractVecOrMat, -) - #@timeit_debug timer "BLAS.gemm!" begin - return BLAS.gemm!(tA, tB, alpha, A, B, beta, C) - #end # @timeit -end - -# generic matmul -function _gemm!( - ::GemmBackend{:Generic}, - tA, - tB, - alpha::AT, - A::AbstractVecOrMat, - B::AbstractVecOrMat, - beta::BT, - C::AbstractVecOrMat, -) where {AT,BT} - mul!(C, tA == 'T' ? transpose(A) : A, tB == 'T' ? transpose(B) : B, alpha, beta) - return C -end - function contraction_output(tensor1::DenseTensor, tensor2::DenseTensor, indsR) tensortypeR = contraction_output_type(typeof(tensor1), typeof(tensor2), indsR) return NDTensors.similar(tensortypeR, indsR) @@ -167,34 +91,6 @@ function _contract_scalar_noperm!( return R end -# Non-trivial permutation -function _contract_scalar_perm!( - Rᵃ::AbstractArray{ElR}, Tᵃ::AbstractArray, perm, α, β=zero(ElR) -) where {ElR} - if iszero(β) - if iszero(α) - fill!(Rᵃ, 0) - else - @strided Rᵃ .= α .* permutedims(Tᵃ, perm) - end - elseif isone(β) - if iszero(α) - # Rᵃ .= Rᵃ - # No-op - else - @strided Rᵃ .= α .* permutedims(Tᵃ, perm) .+ Rᵃ - end - else - if iszero(α) - # Rᵃ .= β .* Rᵃ - LinearAlgebra.scal!(length(Rᵃ), β, Rᵃ, 1) - else - Rᵃ .= α .* permutedims(Tᵃ, perm) .+ β .* Rᵃ - end - end - return Rᵃ -end - function _contract_scalar_maybe_perm!( ::Order{N}, R::DenseTensor{ElR,NR}, labelsR, T::DenseTensor, labelsT, α, β=zero(ElR) ) where {ElR,NR,N} @@ -233,9 +129,9 @@ function _contract_scalar_maybe_perm!( β=zero(ElR), ) where {ElR,NR} if nnz(T₁) == 1 - _contract_scalar_maybe_perm!(R, labelsR, T₂, labelsT₂, α * T₁[1], β) + _contract_scalar_maybe_perm!(R, labelsR, T₂, labelsT₂, α * T₁[], β) elseif nnz(T₂) == 1 - _contract_scalar_maybe_perm!(R, labelsR, T₁, labelsT₁, α * T₂[1], β) + _contract_scalar_maybe_perm!(R, labelsR, T₁, labelsT₁, α * T₂[], β) else error("In _contract_scalar_perm!, one tensor must be a scalar") end @@ -333,75 +229,3 @@ function _contract!( return _contract!(C, A, B, props, α, β) end - -function _contract!( - CT::AbstractArray{El,NC}, - AT::AbstractArray{El,NA}, - BT::AbstractArray{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - tA = 'N' - if props.permuteA - #@timeit_debug timer "_contract!: permutedims A" begin - @strided Ap = permutedims(AT, props.PA) - #end # @timeit - AM = transpose(reshape(Ap, (props.dmid, props.dleft))) - else - #A doesn't have to be permuted - if Atrans(props) - AM = transpose(reshape(AT, (props.dmid, props.dleft))) - else - AM = reshape(AT, (props.dleft, props.dmid)) - end - end - - tB = 'N' - if props.permuteB - #@timeit_debug timer "_contract!: permutedims B" begin - @strided Bp = permutedims(BT, props.PB) - #end # @timeit - BM = reshape(Bp, (props.dmid, props.dright)) - else - if Btrans(props) - BM = transpose(reshape(BT, (props.dright, props.dmid))) - else - BM = reshape(BT, (props.dmid, props.dright)) - end - end - - # TODO: this logic may be wrong - if props.permuteC - # if we are computing C = α * A B + β * C - # we need to make sure C is permuted to the same - # ordering as A B which is the inverse of props.PC - if β ≠ 0 - CM = reshape(permutedims(CT, invperm(props.PC)), (props.dleft, props.dright)) - else - # Need to copy here since we will be permuting - # into C later - CM = reshape(copy(CT), (props.dleft, props.dright)) - end - else - if Ctrans(props) - CM = transpose(reshape(CT, (props.dright, props.dleft))) - else - CM = reshape(CT, (props.dleft, props.dright)) - end - end - - #tC = similar(CM) - #_gemm!(tA, 
tB, El(α), AM, BM, El(β), CM) - @strided mul!(CM, AM, BM, El(α), El(β)) - - if props.permuteC - Cr = reshape(CM, props.newCrange) - # TODO: use invperm(pC) here? - #@timeit_debug timer "_contract!: permutedims C" begin - @strided CT .= permutedims(Cr, props.PC) - #end # @timeit - end - - return CT -end diff --git a/NDTensors/src/imports.jl b/NDTensors/src/imports.jl index 13b466fca4..4b58569f21 100644 --- a/NDTensors/src/imports.jl +++ b/NDTensors/src/imports.jl @@ -31,8 +31,6 @@ import Base: iterate, length, map, - permutedims, - permutedims!, print, promote_rule, randn, diff --git a/NDTensors/src/linearalgebra/linearalgebra.jl b/NDTensors/src/linearalgebra/linearalgebra.jl index b681cd4a9e..e2c8133eb9 100644 --- a/NDTensors/src/linearalgebra/linearalgebra.jl +++ b/NDTensors/src/linearalgebra/linearalgebra.jl @@ -218,7 +218,9 @@ function LinearAlgebra.eigen( use_relative_cutoff::Bool = get(kwargs, :use_relative_cutoff, use_relative_cutoff) matrixT = matrix(T) - if any(!isfinite, matrixT) + ## TODO Here I am calling parent to ensure that the correct `any` function + ## is envoked for non-cpu matrices + if any(!isfinite, parent(matrixT)) throw( ArgumentError( "Trying to perform the eigendecomposition of a matrix containing NaNs or Infs" @@ -234,9 +236,11 @@ function LinearAlgebra.eigen( VM = VM[:, p] if truncate + cpu_dm = NDTensors.cpu(DM) truncerr, _ = truncate!( - DM; mindim, maxdim, cutoff, use_absolute_cutoff, use_relative_cutoff, kwargs... + cpu_dm; mindim, maxdim, cutoff, use_absolute_cutoff, use_relative_cutoff, kwargs... ) + DM = adapt(typeof(DM), cpu_dm) dD = length(DM) if dD < size(VM, 2) VM = VM[:, 1:dD] @@ -400,6 +404,8 @@ function qx(qx::Function, T::DenseTensor{<:Any,2}; kwargs...) Qinds = IndsT((ind(T, 1), q)) Xinds = IndsT((q, ind(T, 2))) QM = convert(typeof(XM), QM) + ## Here I convert QM twice because of an issue in CUDA where convert does not take QM to be a UnifiedBuffer array + QM = convert(typeof(XM), QM) Q = tensor(Dense(vec(QM)), Qinds) #Q was strided X = tensor(Dense(vec(XM)), Xinds) return Q, X @@ -418,9 +424,15 @@ non-negative. Such a QR decomposition of a matrix is unique. Returns a tuple (Q,R). """ function qr_positive(M::AbstractMatrix) + iscuda = iscu(M) + if iscuda + cutype = leaf_parenttype(M) + M = NDTensors.cpu(M) + end sparseQ, R = qr(M) Q = convert(typeof(R), sparseQ) nc = size(Q, 2) + ## TODO issue here for GPU because tying to access indices for c in 1:nc if R[c, c] != 0.0 #sign(0.0)==0.0 so we don't want to zero out a column of Q. sign_Rc = sign(R[c, c]) @@ -430,6 +442,10 @@ function qr_positive(M::AbstractMatrix) end end end + if iscuda + Q = adapt(cutype, Q) + R = adapt(cutype, R) + end return (Q, R) end @@ -442,6 +458,11 @@ non-negative. Such a QL decomposition of a matrix is unique. Returns a tuple (Q,L). """ function ql_positive(M::AbstractMatrix) + iscuda = iscu(M) + if iscuda + cutype = leaf_parenttype(M) + M = NDTensors.cpu(M) + end sparseQ, L = ql(M) Q = convert(typeof(L), sparseQ) nr, nc = size(L) @@ -455,6 +476,10 @@ function ql_positive(M::AbstractMatrix) end end end + if iscuda + Q = adapt(cutype, Q) + L = adapt(cutype, L) + end return (Q, L) end @@ -467,13 +492,26 @@ function ql(A::AbstractMatrix; kwargs...) T = eltype(A) AA = similar(A, LinearAlgebra._qreltype(T), size(A)) copyto!(AA, A) - return ql!(AA; kwargs...) + iscuda = iscu(AA) + if iscuda + cutype = leaf_parenttype(AA) + AA = NDTensors.cpu(AA) + end + Q, L = ql!(AA; kwargs...) 
+ if iscuda + Q = adapt(cutype, Q) + L = adapt(cutype, L) + end + return (Q, L) end # # This is where the low level call to lapack actually occurs. Most of the work is # about unpacking Q and L from the A matrix. # function ql!(A::StridedMatrix{<:LAPACK.BlasFloat}) + if iscu(A) + throw("Error: ql is not implemented in CUDA.jl") + end tau = Base.similar(A, min(size(A)...)) x = LAPACK.geqlf!(A, tau) #save L from the lower portion of A, before orgql! mangles it! diff --git a/NDTensors/src/tensor/permutedims.jl b/NDTensors/src/tensor/permutedims.jl new file mode 100644 index 0000000000..f79e229504 --- /dev/null +++ b/NDTensors/src/tensor/permutedims.jl @@ -0,0 +1 @@ +Base.permutedims(A::Tensor, perm) = NDTensors.permutedims(A, perm) diff --git a/NDTensors/src/tensor/set_types.jl b/NDTensors/src/tensor/set_types.jl index 77f7130d6b..eb1fc548d0 100644 --- a/NDTensors/src/tensor/set_types.jl +++ b/NDTensors/src/tensor/set_types.jl @@ -23,3 +23,6 @@ end function set_indstype(tensortype::Type{<:Tensor}, inds::Tuple) return Tensor{eltype(tensortype),length(inds),storagetype(tensortype),typeof(inds)} end + +parenttype(tensortype::Type{<:Tensor}) = storagetype(tensortype) +parenttype(storagetype::Type{<:TensorStorage}) = datatype(storagetype) diff --git a/NDTensors/test/blocksparse.jl b/NDTensors/test/blocksparse.jl index 6a541f15a1..8f52f51f14 100644 --- a/NDTensors/test/blocksparse.jl +++ b/NDTensors/test/blocksparse.jl @@ -162,7 +162,7 @@ end @test nnzblocks(A) == nnzblocks(B) @test nnz(A) == nnz(B) - Ap = permutedims(A, (3, 2, 1)) + Ap = NDTensors.permutedims(A, (3, 2, 1)) for (bAp, bB) in zip(eachnzblock(Ap), eachnzblock(B)) blockAp = blockview(Ap, bAp) diff --git a/NDTensors/test/linearalgebra.jl b/NDTensors/test/linearalgebra.jl index a643837e2d..1ffda564ab 100644 --- a/NDTensors/test/linearalgebra.jl +++ b/NDTensors/test/linearalgebra.jl @@ -51,10 +51,6 @@ devs = devices_list(copy(ARGS)) A[i, :] = A[1, :] end end - if qx == ql && dev != NDTensors.cpu - @test_broken qx(A; positive=positive) - continue - end Q, X = qx(A; positive=positive) #X is R or L. @test A ≈ Q * X atol = eps @test array(Q)' * array(Q) ≈ Id atol = eps diff --git a/src/ITensors.jl b/src/ITensors.jl index c1effe5ebe..d571e436bf 100644 --- a/src/ITensors.jl +++ b/src/ITensors.jl @@ -138,6 +138,7 @@ include("broadcast.jl") include("tensor_operations/matrix_decomposition.jl") include("iterativesolvers.jl") include("adapt.jl") +include("set_types.jl") ##################################### # Experimental ITensor Functions diff --git a/src/broadcast.jl b/src/broadcast.jl index af3082ac1a..bbb4570790 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -134,7 +134,6 @@ find_type(::Type{T}, ::Tuple{}) where {T} = nothing # function Base.copyto!(T::ITensor, bc::Broadcasted) - @show typeof(bc) return error( "The broadcasting operation you are attempting is not yet implemented for ITensors, please raise an issue if you would like it to be supported.", ) @@ -171,7 +170,11 @@ function Base.copyto!( ) α = find_type(Number, bc.args) A = find_type(ITensor, bc.args) - map!((t, a) -> bc.f(a, α), T, T, A) + ## GPU compilers can have a problem when map is + ## Given bc.f. 
map seems to make a closure with a + ## relatively complicated signature + f = bc.f + map!((t, a) -> f(a, α), T, T, A) return T end @@ -183,10 +186,14 @@ function Base.copyto!( R::ITensor, bc::Broadcasted{ITensorStyle,<:Any,typeof(/),<:Tuple{<:ITensor,<:ITensor}} ) T1, T2 = bc.args + f = bc.f if R === T1 - map!((t1, t2) -> bc.f(t1, t2), R, T1, T2) + map!((t1, t2) -> f(t1, t2), R, T1, T2) + ## I tried this and it is numberically wrong + #map!(f, R, T1, T2) elseif R === T2 - map!((t1, t2) -> bc.f(t2, t1), R, T2, T1) + map!((t1, t2) -> f(t2, t1), R, T2, T1) + #map!(f, R, T2, T1) else error("When dividing two ITensors in-place, one must be the same as the output ITensor") end @@ -223,7 +230,8 @@ function Base.copyto!( ) α = find_type(Number, bc.args) A = find_type(ITensor, bc.args) - map!((t, a) -> bc.f(α, a), T, T, A) + f = bc.f + map!((t, a) -> f(α, a), T, T, A) return T end @@ -249,7 +257,8 @@ function Base.copyto!( powf = find_type(Base.RefValue{<:Function}, bc.args).x @assert !isnothing(powf) T = find_type(ITensor, bc.args) - map!((r, t) -> bc.f(^, t, α), R, R, T) + f = bc.f + map!((r, t) -> f(^, t, α), R, R, T) return R end @@ -276,18 +285,18 @@ function Base.copyto!( return T end -# -# B .+= A -# - function fmap(bc::Broadcasted{ITensorStyle,<:Any,typeof(+),<:Tuple{Vararg{ITensor}}}) - return (r, t) -> bc.f(r, t) + return + end function fmap(bc::Broadcasted{ITensorStyle,<:Any,typeof(-),<:Tuple{Vararg{ITensor}}}) - return (r, t) -> bc.f(r, t) + return - end +# +# B .+= A +# + function Base.copyto!( T::ITensor, bc::Broadcasted{ITensorStyle,<:Any,typeof(+),<:Tuple{Vararg{ITensor}}} ) @@ -343,6 +352,7 @@ function Base.copyto!( ) C = find_type(ITensor, bc.args) bc_bc = find_type(Broadcasted, bc.args) + if T === C A = find_type(ITensor, bc_bc.args) α = find_type(Number, bc_bc.args) @@ -350,11 +360,14 @@ function Base.copyto!( # Check if it is the case .^(::Int) γ = find_type(Base.RefValue{<:Val}, bc_bc.args) powf = find_type(Base.RefValue{<:Function}, bc_bc.args) + ## Putting fmap in the map call still doesn't actually grab the function and causes GPU to fail so just realize the function slightly earlier here + f1 = bc.f + f2 = bc_bc.f if !isnothing(α) && !isnothing(A) - map!((r, t) -> bc.f(r, bc_bc.f(t, α)), T, T, A) + map!((r, t) -> f1(r, f2(t, α)), T, T, A) elseif !isnothing(γ) && !isnothing(A) && !isnothing(powf) - map!((r, t) -> bc.f(r, bc_bc.f(powf[], t, γ[])), T, T, A) + map!((r, t) -> f1(r, f2(powf[], t, γ[])), T, T, A) else # In-place contraction: # C .+= α .* A .* B @@ -365,7 +378,7 @@ function Base.copyto!( else A, B = bc_bc_bc.args end - mul!(T, A, B, β, bc.f(1)) + mul!(T, A, B, β, f1(1)) end else error("When adding two ITensors in-place, one must be the same as the output ITensor") @@ -378,6 +391,7 @@ end # C .= β .* C .+ α .* A .* B # +## TODO this code doesn't actually get called function Base.copyto!( T::ITensor, bc::Broadcasted{ITensorOpScalarStyle,<:Any,typeof(+),<:Tuple{Vararg{Broadcasted}}}, @@ -414,6 +428,7 @@ end # For A .+= α # +## TODO this code fails because of scalar indexing function Base.copyto!( T::ITensor, bc::Broadcasted{ @@ -483,8 +498,9 @@ function Base.copyto!( ) R̃ = find_type(ITensor, bc.args) bc2 = find_type(Broadcasted, bc.args) + f = bc2.f if R === R̃ - map!((r, t) -> r + bc2.f(t), R, R, bc2.args[1]) + map!((r, t) -> r + f(t), R, R, bc2.args[1]) else error("In C .= B .+ f.(A), C and B must be the same ITensor") end @@ -495,6 +511,7 @@ end # For B .= f.(B) + g.(A) # +## TODO check to see if this code is being called as expected function Base.copyto!( 
R::ITensor, bc::Broadcasted{ITensorStyle,<:Any,typeof(+),<:Tuple{Vararg{Broadcasted}}} ) diff --git a/src/imports.jl b/src/imports.jl index 1df7f62a18..02e5830eb5 100644 --- a/src/imports.jl +++ b/src/imports.jl @@ -51,7 +51,6 @@ import Base: map, map!, ndims, - permutedims, print, promote_rule, push!, @@ -127,6 +126,8 @@ using ITensors.NDTensors: enable_auto_fermion, fill!!, randn!!, + permutedims, + permutedims!, set_eltype, single_precision, timer, @@ -158,6 +159,7 @@ import ITensors.NDTensors: inds, insertblock!, insert_diag_blocks!, + leaf_parenttype, matrix, maxdim, mindim, diff --git a/src/itensor.jl b/src/itensor.jl index c26786e5e2..a56c219aad 100644 --- a/src/itensor.jl +++ b/src/itensor.jl @@ -843,7 +843,7 @@ size(A::ITensor, d::Int) = size(tensor(A), d) _isemptyscalar(A::ITensor) = _isemptyscalar(tensor(A)) _isemptyscalar(A::Tensor) = ndims(A) == 0 && isemptystorage(A) && eltype(A) === EmptyNumber - +NDTensors.iscu(A::ITensor) = NDTensors.iscu(tensor(A)) """ dir(A::ITensor, i::Index) @@ -1140,8 +1140,9 @@ A[i => 1, i' => 2] # 2.0, same as: A[i' => 2, i => 1] @propagate_inbounds (getindex(T::ITensor, ivs::Vararg{Any,N})::Any) where {N} = _getindex(tensor(T), ivs...) +## Allowing one to get the first ITensor element if its an order 0 tensor or an order 1 tensor with a dimension of 1. Also convert GPU back to CPU @propagate_inbounds function getindex(T::ITensor)::Any - if order(T) != 0 + if order(T) != 0 && dim(T) != 1 throw( DimensionMismatch( "In scalar(T) or T[], ITensor T is not a scalar (it has indices $(inds(T)))." @@ -1892,7 +1893,7 @@ diag(T::ITensor) = diag(tensor(T)) mul!(C::ITensor, A::ITensor, B::ITensor, args...)::ITensor = contract!(C, A, B, args...) -dot(A::ITensor, B::ITensor) = (dag(A) * B)[] +dot(A::ITensor, B::ITensor) = NDTensors.cpu(dag(A) * B)[] inner(y::ITensor, A::ITensor, x::ITensor) = (dag(y) * A * x)[] inner(y::ITensor, x::ITensor) = (dag(y) * x)[] diff --git a/src/mps/dmrg.jl b/src/mps/dmrg.jl index a6b78c6f8c..f81d12b288 100644 --- a/src/mps/dmrg.jl +++ b/src/mps/dmrg.jl @@ -278,7 +278,15 @@ function dmrg(PH, psi0::MPS, sweeps::Sweeps; kwargs...) end energy = vals[1] - phi::ITensor = vecs[1] + ## Right now there is a conversion problem in CUDA.jl where `UnifiedMemory` Arrays are being converted + ## into `DeviceMemory`. This conversion line is here temporarily to fix that problem when it arises + ## Adapt is only called when using CUDA backend. CPU will work as implemented previously. + phi::ITensor = if NDTensors.iscu(phi) && NDTensors.iscu(vecs[1]) + adapt(set_eltype(leaf_parenttype(phi), eltype(vecs[1])), vecs[1]) + else + vecs[1] + end + #phi::ITensor = vecs[1] ortho = ha == 1 ? "left" : "right" diff --git a/src/set_types.jl b/src/set_types.jl new file mode 100644 index 0000000000..8d866d7c27 --- /dev/null +++ b/src/set_types.jl @@ -0,0 +1 @@ +leaf_parenttype(T::ITensor) = leaf_parenttype(tensor(T)) diff --git a/src/tensor_operations/permutations.jl b/src/tensor_operations/permutations.jl index 5f7bb174e0..b85e4f5345 100644 --- a/src/tensor_operations/permutations.jl +++ b/src/tensor_operations/permutations.jl @@ -55,12 +55,12 @@ function permute(T::ITensor, new_inds...; kwargs...) end # TODO: move to NDTensors -function permutedims(::AllowAlias, T::Tensor, perm) +function NDTensors.permutedims(::AllowAlias, T::Tensor, perm) return NDTensors.is_trivial_permutation(perm) ? 
T : permutedims(NeverAlias(), T, perm) end # TODO: move to NDTensors, define `permutedims` in terms of `NeverAlias` -function permutedims(::NeverAlias, T::Tensor, perm) +function NDTensors.permutedims(::NeverAlias, T::Tensor, perm) return permutedims(T, perm) end diff --git a/test/ITensorNetworkMaps/utils/utils.jl b/test/ITensorNetworkMaps/utils/utils.jl index 02275aee89..7b4254c4a5 100644 --- a/test/ITensorNetworkMaps/utils/utils.jl +++ b/test/ITensorNetworkMaps/utils/utils.jl @@ -33,7 +33,7 @@ function ITensors.prime(::typeof(linkinds), tn::InfTN) return tn_p end -interleave(x::Vector, y::Vector) = permutedims([x y])[:] +interleave(x::Vector, y::Vector) = Base.permutedims([x y])[:] function ITensors.linkind(tn::InfTN, e) return commonind(tn[e[1]], tn[e[2]]) diff --git a/test/base/test_itensor_scalar_contract.jl b/test/base/test_itensor_scalar_contract.jl index c322139bba..63e311e59e 100644 --- a/test/base/test_itensor_scalar_contract.jl +++ b/test/base/test_itensor_scalar_contract.jl @@ -65,14 +65,14 @@ end R .= A .* B @test !any(isnan, R) - @test array(R) ≈ permutedims(array(A), (2, 1, 3)) * array(B)[] + @test array(R) ≈ NDTensors.permutedims(array(A), (2, 1, 3)) * array(B)[] R .= NaN @test any(isnan, R) R .= B .* A @test !any(isnan, R) - @test array(R) ≈ permutedims(array(A), (2, 1, 3)) * array(B)[] + @test array(R) ≈ NDTensors.permutedims(array(A), (2, 1, 3)) * array(B)[] end @testset "General contraction, permutation" for ElA in BlasFloats, ElB in BlasFloats @@ -89,7 +89,7 @@ end R .= A .* B @test !any(isnan, R) @test reshape(array(R), 6, 2) ≈ - reshape(permutedims(array(A), (2, 1, 3)), 6, 2) * array(B) + reshape(NDTensors.permutedims(array(A), (2, 1, 3)), 6, 2) * array(B) R .= NaN @test any(isnan, R) @@ -97,6 +97,6 @@ end R .= B .* A @test !any(isnan, R) @test reshape(array(R), 6, 2) ≈ - reshape(permutedims(array(A), (2, 1, 3)), 6, 2) * array(B) + reshape(NDTensors.permutedims(array(A), (2, 1, 3)), 6, 2) * array(B) end end
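
For reference, the recurring pattern this patch introduces is double dispatch on the arrays' leaf parent type: operations like mul!!, permutedims!! and contraction unwrap their arguments with leaf_parenttype and then select a backend-specific method, so plain Array keeps the Strided.jl fast path while CuArray (through the NDTensorsCUDAExt fallback) uses its native kernels and avoids scalar indexing. The following is a minimal, self-contained sketch of that idea only; the names leaf_parenttype and mul!! mirror the patch, but the simplified definitions and the example at the end are illustrative and not part of the NDTensors source.

    using LinearAlgebra: mul!

    # Simplified stand-ins for the NDTensors helpers touched in this patch.
    parenttype(T::Type{<:AbstractArray}) = T   # array wrappers would override this
    leaf_parenttype(A::AbstractArray) = leaf_parenttype(typeof(A))
    leaf_parenttype(T::Type{<:AbstractArray}) =
      parenttype(T) === T ? T : leaf_parenttype(parenttype(T))

    # Step 1: unwrap every argument to its leaf parent type.
    mul!!(C::AbstractArray, A::AbstractArray, B::AbstractArray, α, β) =
      mul!!(leaf_parenttype(C), C, leaf_parenttype(A), A, leaf_parenttype(B), B, α, β)

    # Step 2: dispatch on those types. This generic fallback (which a CuArray
    # would hit) is a plain mul!; the Array method in the patch additionally
    # wraps the call in Strided.@strided.
    function mul!!(::Type{<:AbstractArray}, C, ::Type{<:AbstractArray}, A,
                   ::Type{<:AbstractArray}, B, α, β)
      mul!(C, A, B, α, β)
      return C
    end

    # Example usage with these illustrative definitions:
    C = zeros(2, 2); A = rand(2, 2); B = rand(2, 2)
    mul!!(C, A, B, 1.0, 0.0)   # C ≈ A * B, regardless of backend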