[NDTensorsCUDAExt] Fix bug in slicing Adjoint CuMatrix (#1236)
kmp5VT authored Nov 8, 2023
1 parent 3b1fa88 commit 8ba0e17
Showing 18 changed files with 166 additions and 35 deletions.
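For context, a minimal sketch of the bug being fixed, assuming CUDA.jl and the `expose` wrapper from `NDTensors.Unwrap` (the array names here are illustrative, not from the commit):

```julia
using CUDA
using NDTensors.Unwrap: expose

CUDA.allowscalar(false)

A = CUDA.rand(Float64, 3, 4)

# Slicing an Adjoint-wrapped CuMatrix through expose previously forwarded
# the indices to the parent unchanged, giving wrong shapes / scalar indexing.
# With this commit the slice is taken on the parent with swapped indices
# and then re-adjointed, staying on the GPU:
B = expose(A')[1:2, 1:3]
@assert B ≈ (A[1:3, 1:2])'
```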
3 changes: 2 additions & 1 deletion NDTensors/ext/NDTensorsCUDAExt/adapt.jl
@@ -16,7 +16,8 @@ buffertype(::NDTensorCuArrayAdaptor{B}) where {B} = B
function Adapt.adapt_storage(adaptor::NDTensorCuArrayAdaptor, xs::AbstractArray)
ElT = eltype(xs)
BufT = buffertype(adaptor)
- return isbits(xs) ? xs : adapt(CuArray{ElT,1,BufT}, xs)
+ N = ndims(xs)
+ return isbits(xs) ? xs : adapt(CuArray{ElT,N,BufT}, xs)
end

function NDTensors.adapt_storagetype(
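The adaptor change above matters because the old method hard-coded a rank-1 `CuArray`, discarding the rank of matrix inputs. A hedged sketch of the same pattern with plain Adapt.jl:

```julia
using Adapt, CUDA

x = rand(Float64, 5, 2)

# Hard-coding the rank, as the old code effectively did, cannot represent
# a 5×2 matrix as a CuArray{Float64,1}.
# Preserving ndims(x) round-trips the shape correctly:
y = adapt(CuArray{Float64,ndims(x)}, x)
@assert y isa CuMatrix{Float64} && size(y) == (5, 2)
```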
8 changes: 3 additions & 5 deletions NDTensors/ext/NDTensorsCUDAExt/indexing.jl
@@ -7,14 +7,12 @@ function setindex!(E::Exposed{<:CuArray}, x::Number)
return unexpose(E)
end

- function Base.getindex(E::Exposed{<:CuArray,<:Adjoint}, I...)
- Ap = parent(E)
- return expose(Ap)[I...]
+ function Base.getindex(E::Exposed{<:CuArray,<:Adjoint}, i, j)
+ return (expose(parent(E))[j, i])'
end

function Base.copy(E::Exposed{<:CuArray,<:Base.ReshapedArray})
- Ap = parent(E)
- return copy(expose(Ap))
+ return reshape(copy(expose(parent(E))), size(unexpose(E)))
end

Base.any(f, E::Exposed{<:CuArray,<:NDTensors.Tensor}) = any(f, data(unexpose(E)))
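A short usage sketch of the two rewritten methods, assuming the `Exposed` machinery dispatches as in the hunk above: the adjoint `getindex` slices the parent with `[j, i]` and re-adjoints, and the `ReshapedArray` `copy` copies the parent before re-applying the reshape.

```julia
using CUDA
using NDTensors.Unwrap: expose, unexpose

CUDA.allowscalar(false)

A = CUDA.rand(Float64, 4, 3)

# getindex on an exposed Adjoint: the slice runs on the parent CuArray.
sub = expose(A')[1:2, 1:4]    # equivalent to (A[1:4, 1:2])'

# copy on an exposed ReshapedArray: copy the parent, then reshape to
# size(unexpose(E)). Reshaping a wrapped CuArray (here an Adjoint)
# produces a Base.ReshapedArray, which exercises this method.
R = reshape(A', 12)
Rc = copy(expose(R))
@assert size(Rc) == (12,)
```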
3 changes: 3 additions & 0 deletions NDTensors/ext/examples/NDTensorCUDA.jl
@@ -58,6 +58,9 @@ function main()
f(A, B, C, D) = (A * B * C * D)[]

#Use Zygote to take the gradient of the four tensors on GPU
+ # Currently this code fails with CUDA.allowscalar(false)
+ # because `outer` calls the `_gemm!` function, which falls
+ # back to a generic implementation
grad = gradient(f, cA, cB, cC, cD)
@test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1])
@test (cB * cC * cD) ≈ grad[1]
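The failure mode noted in the added comment is CUDA.jl's scalar-indexing guard: generic (non-BLAS) fallbacks like the one `_gemm!` hits read GPU memory one element at a time. A small illustration of that guard:

```julia
using CUDA

v = CUDA.rand(Float64, 4)
CUDA.allowscalar(false)
# v[1] now throws "Scalar indexing is disallowed";
# generic fallback kernels fail the same way.
x = CUDA.@allowscalar v[1]  # explicit, scoped opt-in
```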
2 changes: 0 additions & 2 deletions NDTensors/src/Unwrap/src/Unwrap.jl
@@ -20,6 +20,4 @@ include("functions/permutedims.jl")
export IsWrappedArray,
is_wrapped_array, parenttype, unwrap_type, expose, Exposed, unexpose, cpu

- ## TODO write exposed based functions in the NDTensors Extensions when necessary
-
end
90 changes: 90 additions & 0 deletions NDTensors/src/Unwrap/test/runtests.jl
@@ -0,0 +1,90 @@
using Test
using NDTensors.Unwrap
using NDTensors
using LinearAlgebra

include("../../../test/device_list.jl")
@testset "Testing Unwrap" for dev in devices_list(ARGS)
v = dev(Vector{Float64}(undef, 10))
vt = transpose(v)
va = v'

E = expose(v)
Et = expose(vt)
Ea = expose(va)
v_type = typeof(v)
e_type = eltype(v)
@test typeof(E) == Exposed{v_type,v_type}
@test typeof(Et) == Exposed{v_type,LinearAlgebra.Transpose{e_type,v_type}}
@test typeof(Ea) == Exposed{v_type,LinearAlgebra.Adjoint{e_type,v_type}}

@test parent(E) == v
@test parent(Et) == v
@test parent(Ea) == v
@test transpose(E) == vt
@test cpu(E) == v
@test cpu(Et) == vt

m = reshape(v, (5, 2))
mt = transpose(m)
ma = m'
E = expose(m)
Et = expose(mt)
Ea = expose(ma)

m_type = typeof(m)
@test typeof(E) == Exposed{m_type,m_type}
@test typeof(Et) == Exposed{m_type,LinearAlgebra.Transpose{e_type,m_type}}
@test typeof(Ea) == Exposed{m_type,LinearAlgebra.Adjoint{e_type,m_type}}

o = dev(Vector{Float32})(undef, 1)
expose(o)[] = 2
@test expose(o)[] == 2

fill!(m, 0)
@test any(!Base.isinf, expose(m))

mp = copy(Ea)
@test mp == ma
fill!(ma, 2.0)
copyto!(expose(mp), expose(ma))
@test mp == ma

q, r = qr(expose(mp))
@test q * r ≈ mp

q, r = Unwrap.qr_positive(expose(mp))
@test q * r ≈ mp

square = dev(rand(Float64, (10, 10)))
square = (square + transpose(square)) ./ 2.0
## CUDA only supports Hermitian or Symmetric eigendecompositions,
## so I symmetrize square and wrap it in Symmetric here
l, U = eigen(expose(Symmetric(square)))
@test square * U ≈ U * Diagonal(l)

U, S, V, = svd(expose(mp))
@test U * Diagonal(S) * V' ≈ mp

cm = dev(fill!(Matrix{Float64}(undef, (2, 2)), 0.0))
mul!(expose(cm), expose(mp), expose(mp'), 1.0, 0.0)
@test cm ≈ mp * mp'

@test permutedims(expose(mp), (2, 1)) == transpose(mp)
fill!(mt, 3.0)
permutedims!(expose(m), expose(mt), (2, 1))
@test norm(m) == sqrt(3^2 * 10)
@test size(m) == (5, 2)
permutedims!(expose(m), expose(mt), (2, 1), +)
@test size(m) == (5, 2)
@test norm(m) == sqrt(6^2 * 10)

m = reshape(m, (5, 2, 1))
mt = fill!(similar(m), 3.0)
m = permutedims(expose(m), (2, 1, 3))
@test size(m) == (2, 5, 1)
permutedims!(expose(m), expose(mt), (2, 1, 3))
@test norm(m) == sqrt(3^2 * 10)
permutedims!(expose(m), expose(mt), (2, 1, 3), -)
@test norm(m) == 0
end
3 changes: 3 additions & 0 deletions NDTensors/src/dense/tensoralgebra/outer.jl
@@ -21,9 +21,12 @@ function outer!(
v1 = data(T1)
v2 = data(T2)
RM = reshape(R, length(v1), length(v2))
+ ## A potential fix is to call reshape on the array
+ #RM = reshape(array(R), length(v1), length(v2))
#RM .= v1 .* transpose(v2)
#mul!(RM, v1, transpose(v2))
_gemm!('N', 'T', one(ElR), v1, v2, zero(ElR), RM)
+ #mul!!(RM, v1, transpose(v2), one(ElR), zero(ElR))
return R
end

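A hedged sketch (not the committed code) of the `mul!`-based alternative suggested in the comments above: the outer product written as a 5-argument `mul!`, so BLAS or GPU kernels apply instead of the generic `_gemm!` path. `outer_via_mul!` is an illustrative name, not an NDTensors function.

```julia
using LinearAlgebra

# Compute R = v1 * transpose(v2) in place, as a rank-1 update.
function outer_via_mul!(R::AbstractMatrix, v1::AbstractVector, v2::AbstractVector)
  mul!(R, v1, transpose(v2), one(eltype(R)), zero(eltype(R)))
  return R
end

v1, v2 = rand(3), rand(4)
R = Matrix{Float64}(undef, 3, 4)
@assert outer_via_mul!(R, v1, v2) ≈ v1 * transpose(v2)
```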
4 changes: 3 additions & 1 deletion NDTensors/src/linearalgebra/linearalgebra.jl
@@ -157,7 +157,7 @@ function svd(
spec = Spectrum(P, truncerr)
dS = length(P)
if dS < length(MS)
- MU = MU[:, 1:dS]
+ MU = expose(MU)[:, 1:dS]
# Fails on some GPU backends like Metal.
# resize!(MS, dS)
MS = MS[1:dS]
@@ -187,6 +187,7 @@ function eigen(
matrixT = matrix(T)
## TODO Here I am calling parent to ensure that the correct `any` function
## is invoked for non-cpu matrices
+ ## TODO use expose here
if any(!isfinite, parent(matrixT))
throw(
ArgumentError(
@@ -195,6 +196,7 @@
)
end

+ ### What do we do if DM is full of NaN or Inf?
DM, VM = eigen(expose(matrixT))

# Sort by largest to smallest eigenvalues
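The `MU = expose(MU)[:, 1:dS]` change routes the SVD truncation slice through the wrapper-aware `getindex`, so it also works when `MU` is a wrapped GPU matrix. A sketch of the pattern (plain CPU arrays shown; `expose` is assumed to fall back to ordinary indexing for them):

```julia
using NDTensors.Unwrap: expose

MU = rand(6, 6)   # stand-in for the U factor of an SVD
dS = 4            # number of singular values kept after truncation

# Keep only the first dS columns; going through expose lets wrapper
# and GPU types dispatch to their specialized getindex methods.
MU = expose(MU)[:, 1:dS]
@assert size(MU) == (6, 4)
```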
16 changes: 16 additions & 0 deletions NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl
@@ -0,0 +1,16 @@
## This is getting closer; still working on it. No need to review.
## Failing for CUDA, mostly in eigen (I believe there is some noise in the
## eigen decomposition with CUBLAS, giving slightly different answers than BLAS)
module TestITensorDMRG
using Test
using ITensors
using NDTensors
using Random

reference_energies = Dict([
(4, -1.6160254037844384), (8, -3.374932598687889), (10, -4.258035207282885)
])

default_rtol(elt::Type) = 10^(0.75 * log10(eps(real(elt))))
include("dmrg.jl")
end
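The `default_rtol` helper above scales the test tolerance with the element type's precision; approximate values for the types used below (a quick sanity check, not part of the commit):

```julia
default_rtol(elt::Type) = 10^(0.75 * log10(eps(real(elt))))

default_rtol(Float64)     # ≈ 1.8e-12
default_rtol(Float32)     # ≈ 6.4e-6
default_rtol(ComplexF32)  # same as Float32, since real(ComplexF32) == Float32
```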
24 changes: 24 additions & 0 deletions NDTensors/test/ITensors/TestITensorDMRG/dmrg.jl
@@ -0,0 +1,24 @@
function test_dmrg(elt, N::Integer, dev::Function)
sites = siteinds("S=1/2", N)

os = OpSum()
for j in 1:(N - 1)
os += "Sz", j, "Sz", j + 1
os += 0.5, "S+", j, "S-", j + 1
os += 0.5, "S-", j, "S+", j + 1
end

Random.seed!(1234)
psi0 = dev(randomMPS(Float64, sites; linkdims=4))
H = dev(MPO(elt, os, sites))

nsweeps = 3
cutoff = [1e-3, 1e-13]
noise = [1e-12, 0]
## When running these with nsweeps = 100 and no maxdim,
## none of the problems exceed a maxlinkdim of 32
maxdim = 32

energy, psi = dmrg(H, psi0; nsweeps, cutoff, maxdim, noise, outputlevel=0)
@test energy ≈ reference_energies[N] rtol = default_rtol(elt)
end
14 changes: 14 additions & 0 deletions NDTensors/test/ITensors/TestITensorDMRG/runtests.jl
@@ -0,0 +1,14 @@
using Test
using NDTensors
include("TestITensorDMRG.jl")

include("../../device_list.jl")

@testset "Testing DMRG different backends" begin
for dev in devices_list(ARGS),
N in [4, 10],
elt in (Float32, ComplexF32, Float64, ComplexF64)

TestITensorDMRG.test_dmrg(elt, N, dev)
end
end
1 change: 1 addition & 0 deletions NDTensors/test/ITensors/runtests.jl
@@ -0,0 +1 @@
include("TestITensorDMRG/runtests.jl")
1 change: 1 addition & 0 deletions NDTensors/test/Project.toml
@@ -6,6 +6,7 @@ ITensors = "9136182c-28ba-11e9-034c-db9fb085ebd5"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
TBLIS = "48530278-0828-4a49-9772-0f3830dfa1e9"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
4 changes: 4 additions & 0 deletions NDTensors/test/Unwrap.jl
@@ -0,0 +1,4 @@
using Test
using NDTensors

include(joinpath(pkgdir(NDTensors), "src", "Unwrap", "test", "runtests.jl"))
6 changes: 0 additions & 6 deletions NDTensors/test/blocksparse.jl
@@ -1,12 +1,6 @@
using NDTensors
using LinearAlgebra
using Test
if "cuda" in ARGS || "all" in ARGS
using CUDA
end
if "metal" in ARGS || "all" in ARGS
using Metal
end

@testset "BlockSparseTensor basic functionality" begin
C = nothing
7 changes: 0 additions & 7 deletions NDTensors/test/combiner.jl
@@ -5,13 +5,6 @@ using Test
# Testing generic block indices
using ITensors: QN, Index

if "cuda" in ARGS || "all" in ARGS
using CUDA
end
if "metal" in ARGS || "all" in ARGS
using Metal
end

@testset "CombinerTensor basic functionality" begin
include("device_list.jl")
devs = devices_list(copy(ARGS))
6 changes: 0 additions & 6 deletions NDTensors/test/dense.jl
@@ -1,11 +1,5 @@
using NDTensors
using Test
if "cuda" in ARGS || "all" in ARGS
using CUDA
end
if "metal" in ARGS || "all" in ARGS
using Metal
end

@testset "Dense Tensors" begin
include("device_list.jl")
7 changes: 0 additions & 7 deletions NDTensors/test/linearalgebra.jl
@@ -2,13 +2,6 @@ using NDTensors
using LinearAlgebra
using Test

if "cuda" in ARGS || "all" in ARGS
using CUDA
end
if "metal" in ARGS || "all" in ARGS
using Metal
end

@testset "random_orthog" begin
n, m = 10, 4
O1 = random_orthog(n, m)
2 changes: 2 additions & 0 deletions NDTensors/test/runtests.jl
@@ -25,6 +25,7 @@ end
"SmallVectors.jl",
"SortedSets.jl",
"TagSets.jl",
"Unwrap.jl",
"linearalgebra.jl",
"dense.jl",
"blocksparse.jl",
@@ -34,6 +35,7 @@
"emptystorage.jl",
"combiner.jl",
"arraytensor/arraytensor.jl",
"ITensors/runtests.jl",
]
println("Running $filename")
include(filename)
