[NDTensorsCUDAExt] Remove and test for scalar indexing #1245

Merged · 62 commits · Nov 15, 2023
Changes from 50 commits

Commits
7576c3b
When calling `mul!` with a `Transpose(CuArray)` generic CPU mul is ca…
kmp5VT Nov 9, 2023
efcde0c
format
kmp5VT Nov 9, 2023
ebb1713
Replace undef with rand
kmp5VT Nov 10, 2023
077527b
Implement Adjoint and Transpose expose mul! functions for cuda and Metal
kmp5VT Nov 10, 2023
65ea45a
Add transpose and adjoint tests to Unwrap
kmp5VT Nov 10, 2023
0b6b07e
Merge branch 'main' into kmp5/debug/scalar_indexing
kmp5VT Nov 10, 2023
3ad5551
Changes per Matt's comments
kmp5VT Nov 10, 2023
256276f
Merge branch 'kmp5/debug/scalar_indexing' of github.com:kmp5VT/ITenso…
kmp5VT Nov 10, 2023
dd31b94
Fix typo and mark mul! tests as broken to keep record in case they a…
kmp5VT Nov 10, 2023
d7893e9
add svd to using LinearAlgebra
kmp5VT Nov 12, 2023
b8e732b
Add GPUArraysCore to NDTensors
kmp5VT Nov 13, 2023
a567d28
Start removing scalar and update combiner test
kmp5VT Nov 13, 2023
5d103cf
Move using GPUArraysCore to devicelist.jl
kmp5VT Nov 13, 2023
77411a9
Allowscalar in dense and reorder to put allowscalar together in code …
kmp5VT Nov 13, 2023
8f4505a
Use GPUArraysCore. because things are failing without it
kmp5VT Nov 13, 2023
21aa74e
only run `test_broken` if not using CPU
kmp5VT Nov 13, 2023
0deb8c0
Add a few definitions in NDTensorsCUDAExt
kmp5VT Nov 13, 2023
686e3d6
Add allowscalar to blocksparse tests
kmp5VT Nov 13, 2023
1c54aa1
Add some fixes for scalar indexing block dims
kmp5VT Nov 13, 2023
7fa60f6
No need to expose here since they are Tensors not arrays
kmp5VT Nov 13, 2023
9667482
Add allowscalar to unwrap test
kmp5VT Nov 13, 2023
1897e2e
Fix linearalgebra with allowscalar
kmp5VT Nov 13, 2023
6eec341
format
kmp5VT Nov 13, 2023
c40a76f
Add allowscalar, there's an issue in Zygote
kmp5VT Nov 13, 2023
53fac06
return unexposed(dest)
kmp5VT Nov 13, 2023
a55745b
Remove unnecessary GPUArraysCore and add @allowscalar
kmp5VT Nov 13, 2023
6f7c9b0
remove extra parenthesis
kmp5VT Nov 13, 2023
f3f5e18
Remove unnecessary code
kmp5VT Nov 13, 2023
6afb6db
Remove using GPUArraysCore from top level test
kmp5VT Nov 13, 2023
2c01b12
need `Base.setindex!`, was missing the `Base` portion
kmp5VT Nov 13, 2023
23256a9
Remove CPU check and fix expose setindex call
kmp5VT Nov 13, 2023
6f2b593
Remove CUDA.allowscalar()
kmp5VT Nov 13, 2023
24da742
Update the Dense function to be no-copy
kmp5VT Nov 13, 2023
3ee4f70
Allowscalar checks and silence inner with an H
kmp5VT Nov 13, 2023
afd718c
format
kmp5VT Nov 13, 2023
b61d17c
Use array over Array
kmp5VT Nov 13, 2023
3b87a47
Move GPUArraysCore: @allowscalar to top of files which use it
kmp5VT Nov 13, 2023
a74dc1c
Fix CuArray scalar index issue in copyto! in the same way as metal
kmp5VT Nov 13, 2023
657ef3e
Revert and remove H
kmp5VT Nov 13, 2023
df4dde9
Todo message
kmp5VT Nov 13, 2023
0299eda
format
kmp5VT Nov 13, 2023
1412eb6
Move Base.copy version to `copyto.jl`
kmp5VT Nov 14, 2023
b1ace97
Revert changes to Dense constructor
kmp5VT Nov 14, 2023
76a5c8f
Add a mul! function to address CUDA problem
kmp5VT Nov 14, 2023
44696d3
Copy parent then transpose in mul! call
kmp5VT Nov 14, 2023
3956a58
format
kmp5VT Nov 14, 2023
3aaabec
Merge branch 'main' into kmp5/debug/scalar_indexing
kmp5VT Nov 14, 2023
9d020ce
Add mul!! test to `diag.jl` to test issue in `CUDA.jl`
kmp5VT Nov 14, 2023
b1ec109
format
kmp5VT Nov 14, 2023
da6aca0
Add tests for complicated wrapper (breaks in CUDA.jl)
kmp5VT Nov 15, 2023
78d4d06
Consistently annotate loops as @allowscalar
kmp5VT Nov 15, 2023
b40f227
test mul! instead of mul!!
kmp5VT Nov 15, 2023
ae3fb94
format
kmp5VT Nov 15, 2023
180da60
remove outdated todo comment
kmp5VT Nov 15, 2023
2e05392
change .+ to +, comment out code and add todo
kmp5VT Nov 15, 2023
7e99a57
Don't comment out testset
kmp5VT Nov 15, 2023
4d330e5
add mul!! test as a second verification
kmp5VT Nov 15, 2023
5c019bc
Update NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl
kmp5VT Nov 15, 2023
c76a6bb
Update NDTensors/ext/NDTensorsCUDAExt/mul.jl
kmp5VT Nov 15, 2023
2650f60
Update NDTensors/ext/NDTensorsCUDAExt/copyto.jl
kmp5VT Nov 15, 2023
85aee99
Alphabetize and add compat
kmp5VT Nov 15, 2023
21aa398
Merge branch 'kmp5/debug/scalar_indexing' of github.com:kmp5VT/ITenso…
kmp5VT Nov 15, 2023
1 change: 1 addition & 0 deletions NDTensors/Project.toml
@@ -23,6 +23,7 @@ Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
GPUArraysCore="46192b85-c4d5-4398-a991-12ede77f4527"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
7 changes: 6 additions & 1 deletion NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl
@@ -5,16 +5,21 @@ using NDTensors.SetParameters
using NDTensors.Unwrap
using Adapt
using Functors
using LinearAlgebra
using LinearAlgebra: LinearAlgebra, Adjoint, Transpose, mul!, svd
using CUDA
using CUDA.CUBLAS
using CUDA.CUSOLVER

## TODO: I added copyto and permutedims, which match the functions in
## NDTensorsMetalExt, because I found similar issues in CUDA
include("imports.jl")
include("default_kwargs.jl")
include("copyto.jl")
include("set_types.jl")
include("iscu.jl")
include("adapt.jl")
include("indexing.jl")
include("linearalgebra.jl")
include("mul.jl")
include("permutedims.jl")
end
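For context, a minimal sketch (not the NDTensors implementation) of why these files dispatch on `Exposed{<:CuArray,...}` from `NDTensors.Unwrap` rather than on `CuArray` directly: lazy wrappers such as `Transpose` and `Base.ReshapedArray` hide the device array from dispatch, so a wrapper-aware type is needed to route these cases to GPU-friendly methods. The `MyExposed`/`myexpose` names below are illustrative stand-ins, and the snippet assumes CUDA.jl with a working GPU.

using CUDA
using LinearAlgebra

# Toy stand-in for the Exposed/expose pattern from NDTensors.Unwrap.
struct MyExposed{Unwrapped,Object}
  object::Object
end
myexpose(x) = MyExposed{typeof(parent(x)),typeof(x)}(x)

A = CUDA.rand(Float32, 3, 3)
At = transpose(A)

At isa CuArray                        # false: the Transpose wrapper hides the CuArray
myexpose(At) isa MyExposed{<:CuArray} # true: dispatch can now see the device type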
28 changes: 28 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/copyto.jl
@@ -0,0 +1,28 @@
## It looks like CuArray suffers from the same issues as MtlArray.
## To fix this SubArray copyto! problem I copied the same code from MetalExt.
## This means we can probably write a generic implementation for GPUArrays.
function Base.copy(src::Exposed{<:CuArray,<:Base.ReshapedArray})
return reshape(copy(parent(src)), size(unexpose(src)))
end

function Base.copy(
src::Exposed{
<:CuArray,<:SubArray{<:Any,<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}}
},
)
return copy(@view copy(expose(parent(src)))[parentindices(unexpose(src))...])
end

# Catches a bug in `copyto!` in CUDA backend.
function Base.copyto!(dest::Exposed{<:CuArray}, src::Exposed{<:CuArray,<:SubArray})
copyto!(dest, expose(copy(src)))
return unexpose(dest)
end

# Catches a bug in `copyto!` in CUDA backend.
function Base.copyto!(
dest::Exposed{<:CuArray}, src::Exposed{<:CuArray,<:Base.ReshapedArray}
)
copyto!(dest, expose(parent(src)))
return unexpose(dest)
end
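A hedged usage sketch of the case these methods cover, mirroring the Unwrap tests added later in this PR (assumes CUDA.jl and a working GPU):

using CUDA
using NDTensors
using NDTensors.Unwrap   # provides `expose`

a = CUDA.rand(Float32, 4, 4)
x = reshape(a', 16)           # ReshapedArray wrapping an Adjoint of a CuArray
y = CUDA.zeros(Float32, 16)

# Routed to the ReshapedArray method above, which copies from `parent(x)`
# instead of iterating the wrapper element by element.
copyto!(expose(y), expose(x))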
2 changes: 0 additions & 2 deletions NDTensors/ext/NDTensorsCUDAExt/imports.jl
@@ -2,5 +2,3 @@ import NDTensors: cu, set_ndims, set_eltype, set_eltype_if_unspecified, similart
import NDTensors:
ContractionProperties, _contract!, GemmBackend, auto_select_backend, _gemm!, iscu
import NDTensors.SetParameters: nparameters, get_parameter, set_parameter, default_parameter

import .CUDA: CuArrayAdaptor
6 changes: 1 addition & 5 deletions NDTensors/ext/NDTensorsCUDAExt/indexing.jl
@@ -2,7 +2,7 @@ function Base.getindex(E::Exposed{<:CuArray})
return CUDA.@allowscalar unexpose(E)[]
end

function setindex!(E::Exposed{<:CuArray}, x::Number)
function Base.setindex!(E::Exposed{<:CuArray}, x::Number)
CUDA.@allowscalar unexpose(E)[] = x
return unexpose(E)
end
@@ -11,10 +11,6 @@ function Base.getindex(E::Exposed{<:CuArray,<:Adjoint}, i, j)
return (expose(parent(E))[j, i])'
end

function Base.copy(E::Exposed{<:CuArray,<:Base.ReshapedArray})
return reshape(copy(expose(parent(E))), size(unexpose(E)))
end

Base.any(f, E::Exposed{<:CuArray,<:NDTensors.Tensor}) = any(f, data(unexpose(E)))

function Base.print_array(io::IO, E::Exposed{<:CuArray})
45 changes: 45 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/mul.jl
@@ -0,0 +1,45 @@
# This was calling generic matrix multiplication.
# TODO: Raise an issue with `CUDA.jl`.
function LinearAlgebra.mul!(
CM::Exposed{<:CuArray,<:LinearAlgebra.Transpose},
AM::Exposed{<:CuArray},
BM::Exposed{<:CuArray},
α,
β,
)
mul!(transpose(CM), transpose(BM), transpose(AM), α, β)
return unexpose(CM)
end

# This was calling generic matrix multiplication.
# TODO: Raise an issue with `CUDA.jl`.
function LinearAlgebra.mul!(
CM::Exposed{<:CuArray,<:LinearAlgebra.Adjoint},
AM::Exposed{<:CuArray},
BM::Exposed{<:CuArray},
α,
β,
)
mul!(CM', BM', AM', α, β)
return unexpose(CM)
end

## TODO: I wasn't sure of the best route to take here; if there is a better route than
## copy, please let me know!
## Fix an issue in CUDA.jl where it cannot recognize Transpose{Reshape{Adjoint{CuArray}}}
## as a CuArray and calls generic matmul
function LinearAlgebra.mul!(
CM::Exposed{<:CuArray},
AM::Exposed{<:CuArray},
BM::Exposed{
<:CuArray,
<:LinearAlgebra.Transpose{
<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:LinearAlgebra.Adjoint}
},
},
α,
β,
)
mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β)
return unexpose(CM)
end
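A hedged usage sketch of the transpose-destination workaround, mirroring the Unwrap tests added below (assumes CUDA.jl and a working GPU):

using CUDA
using LinearAlgebra: mul!
using NDTensors
using NDTensors.Unwrap   # provides `expose`

A = CUDA.rand(Float32, 3, 2)
B = CUDA.rand(Float32, 3, 4)
C = CUDA.rand(Float32, 4, 2)

# Plain `mul!(transpose(C), transpose(A), B, true, false)` can dispatch to the
# generic (scalar-indexing) matmul; the method above rewrites the call as
# `mul!(C, transpose(B), A, ...)` so it stays on the CUBLAS path.
mul!(expose(transpose(C)), expose(transpose(A)), expose(B), true, false)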
7 changes: 7 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/permutedims.jl
@@ -0,0 +1,7 @@
function Base.permutedims!(
Edest::Exposed{<:CuArray,<:Base.ReshapedArray}, Esrc::Exposed{<:CuArray}, perm
)
Aperm = permutedims(Esrc, perm)
copyto!(expose(parent(Edest)), expose(Aperm))
return unexpose(Edest)
end
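A hedged sketch of the case this method handles, again mirroring the Unwrap tests (assumes CUDA.jl and a working GPU):

using CUDA
using NDTensors
using NDTensors.Unwrap   # provides `expose`

x = CUDA.rand(Float32, 4, 4)
y = Base.ReshapedArray(CUDA.rand(Float32, 16), (4, 4), ())

# The method above permutes the source into a temporary and copies that into
# `parent(y)`, avoiding element-by-element writes through the reshape wrapper.
permutedims!(expose(y), expose(x), (2, 1))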
6 changes: 4 additions & 2 deletions NDTensors/ext/NDTensorsMetalExt/copyto.jl
@@ -12,12 +12,14 @@ end

# Catches a bug in `copyto!` in Metal backend.
function Base.copyto!(dest::Exposed{<:MtlArray}, src::Exposed{<:MtlArray,<:SubArray})
return copyto!(dest, expose(copy(src)))
copyto!(dest, expose(copy(src)))
return unexpose(dest)
end

# Catches a bug in `copyto!` in Metal backend.
function Base.copyto!(
dest::Exposed{<:MtlArray}, src::Exposed{<:MtlArray,<:Base.ReshapedArray}
)
return copyto!(dest, expose(parent(src)))
copyto!(dest, expose(parent(src)))
return unexpose(dest)
end
9 changes: 9 additions & 0 deletions NDTensors/ext/NDTensorsMetalExt/mul.jl
@@ -10,3 +10,12 @@ function LinearAlgebra.mul!(
mul!(transpose(CM), transpose(BM), transpose(AM), α, β)
return unexpose(CM)
end

# This was calling generic matrix multiplication.
# TODO: Raise an issue with `Metal.jl`.
function LinearAlgebra.mul!(
CM::Exposed{<:MtlArray,<:Adjoint}, AM::Exposed{<:MtlArray}, BM::Exposed{<:MtlArray}, α, β
)
mul!(CM', BM', AM', α, β)
return unexpose(CM)
end
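As with the CUDA version, a hedged usage sketch of the new adjoint method (assumes Metal.jl on Apple silicon; shapes mirror the Unwrap tests):

using Metal
using LinearAlgebra: mul!
using NDTensors
using NDTensors.Unwrap   # provides `expose`

A = MtlArray(randn(Float32, 3, 2))
B = MtlArray(randn(Float32, 3, 4))
C = MtlArray(randn(Float32, 4, 2))

# Routed to the adjoint method above, which rewrites C' = A' * B as
# mul!(C, B', A, ...) so the destination is a plain MtlArray.
mul!(expose(C'), expose(A'), expose(B), true, false)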
6 changes: 3 additions & 3 deletions NDTensors/ext/examples/NDTensorCUDA.jl
@@ -61,9 +61,9 @@ function main()
# Currently this code fails with CUDA.allowscalar(false)
# because `outer` calls the `_gemm!` function, which calls a
# generic implementation
grad = gradient(f, cA, cB, cC, cD)
@test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1])
@test (cB * cC * cD) ≈ grad[1]
@allowscalar grad = gradient(f, cA, cB, cC, cD)
@allowscalar @test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1])
@allowscalar @test (cB * cC * cD) ≈ grad[1]
# Create a tuple of indices
decomp = (
dim(NDTensors.ind(grad[1], 1)),
1 change: 1 addition & 0 deletions NDTensors/src/NDTensors.jl
@@ -17,6 +17,7 @@ using SplitApplyCombine
using Strided
using TimerOutputs
using TupleTools
using GPUArraysCore

include("SetParameters/src/SetParameters.jl")
using .SetParameters
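For reference, a minimal sketch of the scalar-indexing guard this new dependency provides (assumes CUDA.jl and a working GPU): `GPUArraysCore.@allowscalar` explicitly permits element-wise access that is otherwise an error, or at best very slow, on device arrays.

using CUDA
using GPUArraysCore: @allowscalar

CUDA.allowscalar(false)           # make accidental scalar indexing an error
x = CUDA.rand(Float32, 4)

# x[1]                            # would throw a scalar-indexing error here
first_element = @allowscalar x[1] # explicitly allowed; copies one element to the CPU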
2 changes: 2 additions & 0 deletions NDTensors/src/Unwrap/src/functions/abstractarray.jl
@@ -2,6 +2,8 @@ parent(E::Exposed) = parent(unexpose(E))

transpose(E::Exposed) = transpose(unexpose(E))

adjoint(E::Exposed) = adjoint(unexpose(E))

cpu(E::Exposed) = cpu(unexpose(E))

getindex(E::Exposed) = unexpose(E)[]
1 change: 1 addition & 0 deletions NDTensors/src/Unwrap/src/import.jl
@@ -1,4 +1,5 @@
import Base:
adjoint,
permutedims,
permutedims!,
copy,
57 changes: 51 additions & 6 deletions NDTensors/src/Unwrap/test/runtests.jl
@@ -2,6 +2,7 @@ using Test
using NDTensors.Unwrap
using NDTensors
using LinearAlgebra
using GPUArraysCore: @allowscalar

include("../../../test/device_list.jl")
@testset "Testing Unwrap $dev, $elt" for dev in devices_list(ARGS),
@@ -24,8 +25,8 @@ include("../../../test/device_list.jl")
@test parent(Et) == v
@test parent(Ea) == v
@test transpose(E) == vt
@test cpu(E) == v
@test cpu(Et) == vt
@test cpu(E) == cpu(v)
@test cpu(Et) == cpu(vt)

m = reshape(v, (5, 2))
mt = transpose(m)
@@ -125,17 +126,61 @@
y = dev(randn(elt, 16))
x = reshape(dev(randn(elt, 4, 4))', 16)
copyto!(expose(y), expose(x))
@test y == x
@test copy(x) == x
@allowscalar begin
@test y == x
@test copy(x) == x
end

y = dev(randn(elt, 8))
x = @view reshape(dev(randn(elt, 8, 8))', 64)[1:8]
copyto!(expose(y), expose(x))
@test y == x
@test copy(x) == x
@allowscalar begin
@test y == x
@test copy(x) == x
end

y = Base.ReshapedArray(dev(randn(elt, 16)), (4, 4), ())
x = dev(randn(elt, 4, 4))
permutedims!(expose(y), expose(x), (2, 1))
@test NDTensors.cpu(y) == transpose(NDTensors.cpu(x))

##########################################
### Testing an issue with CUDA & Metal transpose/adjoint mul
A = dev(randn(elt, (3, 2)))
B = dev(randn(elt, (3, 4)))
C = dev(randn(elt, (4, 2)))
Cp = copy(C)

## This fails with scalar indexing
if dev != NDTensors.cpu
@test_broken mul!(transpose(C), transpose(A), B, true, false)
end
mul!(C, transpose(B), A, true, false)
mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false)
@test C ≈ Cp
Cp = zero(C)
## Try calling mul!! with transposes to verify that code works
Cpt = NDTensors.mul!!(transpose(Cp), transpose(A), B, true, false)
@test transpose(Cpt) ≈ C

Cp = zero(C)
## This fails with scalar indexing
if dev != NDTensors.cpu
@test_broken mul!(C', A', B, true, false)
end
mul!(C, B', A, true, false)
mul!(expose(Cp'), expose(A'), expose(B), true, false)
@test C ≈ Cp
Cp = zero(C)
Cpt = NDTensors.mul!!(Cp', A', B, true, false)
@test Cpt' ≈ C

##################################
### Add test for transpose(reshape(adjoint)) failure in CUDA
A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2))))
B = dev(randn(elt, 2, 2))
C = dev(zeros(elt, (2, 12)))
NDTensors.mul!!(C, B, A, true, false)
Cp = B * copy(A)
@test @allowscalar C ≈ Cp
end
8 changes: 6 additions & 2 deletions NDTensors/src/blocksparse/blocksparsetensor.jl
@@ -335,11 +335,15 @@ view(T::BlockSparseTensor, b::Block) = blockview(T, b)
# convert to Dense
function dense(T::TensorT) where {TensorT<:BlockSparseTensor}
R = zeros(dense(TensorT), inds(T))
## Here this failed with scalar indexing (R[blockindices] = blockview).
## We can fix this by using copyto! on the underlying arrays.
r = array(R)
for block in keys(blockoffsets(T))
# TODO: make sure this assignment is efficient
R[blockindices(T, block)] = blockview(T, block)
rview = @view r[blockindices(T, block)]
copyto!(expose(rview), expose(array(blockview(T, block))))
end
return R
return tensor(Dense(r), inds(T))
end

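A hedged sketch of the pattern the `dense` fix uses: copy each block into a view of the dense buffer with `copyto!(expose(...), expose(...))` rather than assigning through tensor-level `setindex!`, which is where the scalar indexing arose. The shapes and index ranges below are illustrative, not the actual `blockindices` (assumes CUDA.jl and a working GPU):

using CUDA
using NDTensors
using NDTensors.Unwrap   # provides `expose`

r = CUDA.zeros(Float32, 6, 6)          # dense buffer being filled
block = CUDA.rand(Float32, 2, 3)       # one block of data

rview = @view r[1:2, 1:3]              # region corresponding to this block
copyto!(expose(rview), expose(block))  # block-wise copy, no per-element writes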
2 changes: 1 addition & 1 deletion NDTensors/src/blocksparse/linearalgebra.jl
@@ -200,7 +200,7 @@ function svd(
if (sV * sVP) == -1
Vb *= -1
end
copyto!(expose(blockview(V, blockV)), expose(Vb))
copyto!(blockview(V, blockV), Vb)
end
return U, S, V, Spectrum(d, truncerr)
end
1 change: 1 addition & 0 deletions NDTensors/test/Project.toml
@@ -11,6 +11,7 @@ SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
TBLIS = "48530278-0828-4a49-9772-0f3830dfa1e9"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
GPUArraysCore="46192b85-c4d5-4398-a991-12ede77f4527"

[extras]
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"