diff --git a/NDTensors/Project.toml b/NDTensors/Project.toml index a28621d734..208364fe04 100644 --- a/NDTensors/Project.toml +++ b/NDTensors/Project.toml @@ -11,6 +11,7 @@ Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4" FLoops = "cc61a311-1640-44b5-9fba-1b764f453329" Folds = "41a02a25-b8f0-4f67-bc48-60067656b558" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +GPUArraysCore="46192b85-c4d5-4398-a991-12ede77f4527" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -44,6 +45,7 @@ Dictionaries = "0.3.5" FLoops = "0.2.1" Folds = "0.2.8" Functors = "0.2, 0.3, 0.4" +GPUArraysCore = "0.1" HDF5 = "0.14, 0.15, 0.16, 0.17" InlineStrings = "1" LinearAlgebra = "1.6" diff --git a/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl b/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl index f9e8813152..cd2a83827f 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl @@ -5,16 +5,19 @@ using NDTensors.SetParameters using NDTensors.Unwrap using Adapt using Functors -using LinearAlgebra +using LinearAlgebra: LinearAlgebra, Adjoint, Transpose, mul!, svd using CUDA using CUDA.CUBLAS using CUDA.CUSOLVER include("imports.jl") include("default_kwargs.jl") +include("copyto.jl") include("set_types.jl") include("iscu.jl") include("adapt.jl") include("indexing.jl") include("linearalgebra.jl") +include("mul.jl") +include("permutedims.jl") end diff --git a/NDTensors/ext/NDTensorsCUDAExt/copyto.jl b/NDTensors/ext/NDTensorsCUDAExt/copyto.jl new file mode 100644 index 0000000000..73867192c4 --- /dev/null +++ b/NDTensors/ext/NDTensorsCUDAExt/copyto.jl @@ -0,0 +1,26 @@ +# Same definition as `MtlArray`. +function Base.copy(src::Exposed{<:CuArray,<:Base.ReshapedArray}) + return reshape(copy(parent(src)), size(unexpose(src))) +end + +function Base.copy( + src::Exposed{ + <:CuArray,<:SubArray{<:Any,<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}} + }, +) + return copy(@view copy(expose(parent(src)))[parentindices(unexpose(src))...]) +end + +# Catches a bug in `copyto!` in CUDA backend. +function Base.copyto!(dest::Exposed{<:CuArray}, src::Exposed{<:CuArray,<:SubArray}) + copyto!(dest, expose(copy(src))) + return unexpose(dest) +end + +# Catches a bug in `copyto!` in CUDA backend. 
+function Base.copyto!( + dest::Exposed{<:CuArray}, src::Exposed{<:CuArray,<:Base.ReshapedArray} +) + copyto!(dest, expose(parent(src))) + return unexpose(dest) +end diff --git a/NDTensors/ext/NDTensorsCUDAExt/imports.jl b/NDTensors/ext/NDTensorsCUDAExt/imports.jl index 96b8383c74..a9e49db657 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/imports.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/imports.jl @@ -2,5 +2,3 @@ import NDTensors: cu, set_ndims, set_eltype, set_eltype_if_unspecified, similart import NDTensors: ContractionProperties, _contract!, GemmBackend, auto_select_backend, _gemm!, iscu import NDTensors.SetParameters: nparameters, get_parameter, set_parameter, default_parameter - -import .CUDA: CuArrayAdaptor diff --git a/NDTensors/ext/NDTensorsCUDAExt/indexing.jl b/NDTensors/ext/NDTensorsCUDAExt/indexing.jl index 2f0f6790e0..263e2442a6 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/indexing.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/indexing.jl @@ -2,7 +2,7 @@ function Base.getindex(E::Exposed{<:CuArray}) return CUDA.@allowscalar unexpose(E)[] end -function setindex!(E::Exposed{<:CuArray}, x::Number) +function Base.setindex!(E::Exposed{<:CuArray}, x::Number) CUDA.@allowscalar unexpose(E)[] = x return unexpose(E) end @@ -11,10 +11,6 @@ function Base.getindex(E::Exposed{<:CuArray,<:Adjoint}, i, j) return (expose(parent(E))[j, i])' end -function Base.copy(E::Exposed{<:CuArray,<:Base.ReshapedArray}) - return reshape(copy(expose(parent(E))), size(unexpose(E))) -end - Base.any(f, E::Exposed{<:CuArray,<:NDTensors.Tensor}) = any(f, data(unexpose(E))) function Base.print_array(io::IO, E::Exposed{<:CuArray}) diff --git a/NDTensors/ext/NDTensorsCUDAExt/mul.jl b/NDTensors/ext/NDTensorsCUDAExt/mul.jl new file mode 100644 index 0000000000..b3751b5fc1 --- /dev/null +++ b/NDTensors/ext/NDTensorsCUDAExt/mul.jl @@ -0,0 +1,43 @@ +# This was calling generic matrix multiplication. +# TODO: Raise an issue with `CUDA.jl`. +function LinearAlgebra.mul!( + CM::Exposed{<:CuArray,<:LinearAlgebra.Transpose}, + AM::Exposed{<:CuArray}, + BM::Exposed{<:CuArray}, + α, + β, +) + mul!(transpose(CM), transpose(BM), transpose(AM), α, β) + return unexpose(CM) +end + +# This was calling generic matrix multiplication. +# TODO: Raise an issue with `CUDA.jl`. 
+function LinearAlgebra.mul!( + CM::Exposed{<:CuArray,<:LinearAlgebra.Adjoint}, + AM::Exposed{<:CuArray}, + BM::Exposed{<:CuArray}, + α, + β, +) + mul!(CM', BM', AM', α, β) + return unexpose(CM) +end + +## Fix issue in CUDA.jl where it cannot distinguish Transpose{Reshape{Adjoint{CuArray}}} +## as a CuArray and calls generic matmul +function LinearAlgebra.mul!( + CM::Exposed{<:CuArray}, + AM::Exposed{<:CuArray}, + BM::Exposed{ + <:CuArray, + <:LinearAlgebra.Transpose{ + <:Any,<:Base.ReshapedArray{<:Any,<:Any,<:LinearAlgebra.Adjoint} + }, + }, + α, + β, +) + mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β) + return unexpose(CM) +end diff --git a/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl b/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl new file mode 100644 index 0000000000..2dba8286bb --- /dev/null +++ b/NDTensors/ext/NDTensorsCUDAExt/permutedims.jl @@ -0,0 +1,7 @@ +function Base.permutedims!( + Edest::Exposed{<:CuArray,<:Base.ReshapedArray}, Esrc::Exposed{<:CuArray}, perm +) + Aperm = permutedims(Esrc, perm) + copyto!(expose(parent(Edest)), expose(Aperm)) + return unexpose(Edest) +end diff --git a/NDTensors/ext/NDTensorsMetalExt/copyto.jl b/NDTensors/ext/NDTensorsMetalExt/copyto.jl index ba111fb099..c32b5f2f01 100644 --- a/NDTensors/ext/NDTensorsMetalExt/copyto.jl +++ b/NDTensors/ext/NDTensorsMetalExt/copyto.jl @@ -12,12 +12,14 @@ end # Catches a bug in `copyto!` in Metal backend. function Base.copyto!(dest::Exposed{<:MtlArray}, src::Exposed{<:MtlArray,<:SubArray}) - return copyto!(dest, expose(copy(src))) + copyto!(dest, expose(copy(src))) + return unexpose(dest) end # Catches a bug in `copyto!` in Metal backend. function Base.copyto!( dest::Exposed{<:MtlArray}, src::Exposed{<:MtlArray,<:Base.ReshapedArray} ) - return copyto!(dest, expose(parent(src))) + copyto!(dest, expose(parent(src))) + return unexpose(dest) end diff --git a/NDTensors/ext/NDTensorsMetalExt/mul.jl b/NDTensors/ext/NDTensorsMetalExt/mul.jl index f5268fdd7f..7c388a0fc8 100644 --- a/NDTensors/ext/NDTensorsMetalExt/mul.jl +++ b/NDTensors/ext/NDTensorsMetalExt/mul.jl @@ -10,3 +10,12 @@ function LinearAlgebra.mul!( mul!(transpose(CM), transpose(BM), transpose(AM), α, β) return unexpose(CM) end + +# This was calling generic matrix multiplication. +# TODO: Raise an issue with `Metal.jl`. +function LinearAlgebra.mul!( + CM::Exposed{<:MtlArray,<:Adjoint}, AM::Exposed{<:MtlArray}, BM::Exposed{<:MtlArray}, α, β +) + mul!(CM', BM', AM', α, β) + return unexpose(CM) +end diff --git a/NDTensors/ext/examples/NDTensorCUDA.jl b/NDTensors/ext/examples/NDTensorCUDA.jl index ca4ffbe23e..6a9efe749f 100644 --- a/NDTensors/ext/examples/NDTensorCUDA.jl +++ b/NDTensors/ext/examples/NDTensorCUDA.jl @@ -61,9 +61,9 @@ function main() #Currently this code fails with CUDA.allowscalar(false) # Because of outer calling the _gemm! 
function which calls a # generic implementation - grad = gradient(f, cA, cB, cC, cD) - @test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1]) - @test (cB * cC * cD) ≈ grad[1] + @allowscalar grad = gradient(f, cA, cB, cC, cD) + @allowscalar @test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1]) + @allowscalar @test (cB * cC * cD) ≈ grad[1] # Create a tuple of indices decomp = ( dim(NDTensors.ind(grad[1], 1)), diff --git a/NDTensors/src/NDTensors.jl b/NDTensors/src/NDTensors.jl index aee947e2aa..2dd3735997 100644 --- a/NDTensors/src/NDTensors.jl +++ b/NDTensors/src/NDTensors.jl @@ -6,6 +6,7 @@ using Compat using Dictionaries using FLoops using Folds +using GPUArraysCore using InlineStrings using Random using LinearAlgebra diff --git a/NDTensors/src/Unwrap/src/functions/abstractarray.jl b/NDTensors/src/Unwrap/src/functions/abstractarray.jl index db98228d98..2ef6833807 100644 --- a/NDTensors/src/Unwrap/src/functions/abstractarray.jl +++ b/NDTensors/src/Unwrap/src/functions/abstractarray.jl @@ -2,6 +2,8 @@ parent(E::Exposed) = parent(unexpose(E)) transpose(E::Exposed) = transpose(unexpose(E)) +adjoint(E::Exposed) = adjoint(unexpose(E)) + cpu(E::Exposed) = cpu(unexpose(E)) getindex(E::Exposed) = unexpose(E)[] diff --git a/NDTensors/src/Unwrap/src/import.jl b/NDTensors/src/Unwrap/src/import.jl index 742cda3b2e..66c1235995 100644 --- a/NDTensors/src/Unwrap/src/import.jl +++ b/NDTensors/src/Unwrap/src/import.jl @@ -1,4 +1,5 @@ import Base: + adjoint, permutedims, permutedims!, copy, diff --git a/NDTensors/src/Unwrap/test/runtests.jl b/NDTensors/src/Unwrap/test/runtests.jl index 127f85cfae..68cbb74e18 100644 --- a/NDTensors/src/Unwrap/test/runtests.jl +++ b/NDTensors/src/Unwrap/test/runtests.jl @@ -2,6 +2,7 @@ using Test using NDTensors.Unwrap using NDTensors using LinearAlgebra +using GPUArraysCore: @allowscalar include("../../../test/device_list.jl") @testset "Testing Unwrap $dev, $elt" for dev in devices_list(ARGS), @@ -24,8 +25,8 @@ include("../../../test/device_list.jl") @test parent(Et) == v @test parent(Ea) == v @test transpose(E) == vt - @test cpu(E) == v - @test cpu(Et) == vt + @test cpu(E) == cpu(v) + @test cpu(Et) == cpu(vt) m = reshape(v, (5, 2)) mt = transpose(m) @@ -125,17 +126,68 @@ include("../../../test/device_list.jl") y = dev(randn(elt, 16)) x = reshape(dev(randn(elt, 4, 4))', 16) copyto!(expose(y), expose(x)) - @test y == x - @test copy(x) == x + @allowscalar begin + @test y == x + @test copy(x) == x + end y = dev(randn(elt, 8)) x = @view reshape(dev(randn(elt, 8, 8))', 64)[1:8] copyto!(expose(y), expose(x)) - @test y == x - @test copy(x) == x + @allowscalar begin + @test y == x + @test copy(x) == x + end y = Base.ReshapedArray(dev(randn(elt, 16)), (4, 4), ()) x = dev(randn(elt, 4, 4)) permutedims!(expose(y), expose(x), (2, 1)) @test NDTensors.cpu(y) == transpose(NDTensors.cpu(x)) + + ########################################## + ### Testing an issue with CUDA&Metal transpose/adjoint mul + A = dev(randn(elt, (3, 2))) + B = dev(randn(elt, (3, 4))) + C = dev(randn(elt, (4, 2))) + Cp = copy(C) + + ## This fails with scalar indexing + if dev != NDTensors.cpu + @test_broken mul!(transpose(C), transpose(A), B, true, false) + end + mul!(C, transpose(B), A, true, false) + mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false) + @test C ≈ Cp + Cp = zero(C) + ## Try calling mul!! 
with transposes to verify that code works + Cpt = NDTensors.mul!!(transpose(Cp), transpose(A), B, true, false) + @test transpose(Cpt) ≈ C + + Cp = zero(C) + ## This fails with scalar indexing + if dev != NDTensors.cpu + @test_broken mul!(C', A', B, true, false) + end + mul!(C, B', A, true, false) + mul!(expose(Cp'), expose(A'), expose(B), true, false) + @test C ≈ Cp + Cp = zero(C) + Cpt = NDTensors.mul!!(Cp', A', B, true, false) + @test Cpt' ≈ C + + ################################## + ### Add test for transpose(reshape(adjoint )) failure in CUDA + + A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2)))) + B = dev(randn(elt, 2, 2)) + C = dev(zeros(elt, 2, 12)) + NDTensors.mul!(expose(C), expose(B), expose(A), true, false) + Cp = NDTensors.cpu(similar(C)) + NDTensors.mul!( + expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false + ) + @test NDTensors.cpu(C) ≈ Cp + NDTensors.zero(C) + NDTensors.mul!!(C, B, A, true, false) + @test NDTensors.cpu(C) ≈ Cp end diff --git a/NDTensors/src/blocksparse/blocksparsetensor.jl b/NDTensors/src/blocksparse/blocksparsetensor.jl index 65f824a48d..f59238f955 100644 --- a/NDTensors/src/blocksparse/blocksparsetensor.jl +++ b/NDTensors/src/blocksparse/blocksparsetensor.jl @@ -335,11 +335,15 @@ view(T::BlockSparseTensor, b::Block) = blockview(T, b) # convert to Dense function dense(T::TensorT) where {TensorT<:BlockSparseTensor} R = zeros(dense(TensorT), inds(T)) + ## Here this failed with scalar indexing (R[blockindices] = blockview) + ## We can fix this by using copyto the arrays + r = array(R) for block in keys(blockoffsets(T)) # TODO: make sure this assignment is efficient - R[blockindices(T, block)] = blockview(T, block) + rview = @view r[blockindices(T, block)] + copyto!(expose(rview), expose(array(blockview(T, block)))) end - return R + return tensor(Dense(r), inds(T)) end # diff --git a/NDTensors/src/blocksparse/linearalgebra.jl b/NDTensors/src/blocksparse/linearalgebra.jl index f3a830ccf7..bc69bb1954 100644 --- a/NDTensors/src/blocksparse/linearalgebra.jl +++ b/NDTensors/src/blocksparse/linearalgebra.jl @@ -200,7 +200,7 @@ function svd( if (sV * sVP) == -1 Vb *= -1 end - copyto!(expose(blockview(V, blockV)), expose(Vb)) + copyto!(blockview(V, blockV), Vb) end return U, S, V, Spectrum(d, truncerr) end diff --git a/NDTensors/test/Project.toml b/NDTensors/test/Project.toml index 824ce0a350..0970083593 100644 --- a/NDTensors/test/Project.toml +++ b/NDTensors/test/Project.toml @@ -2,6 +2,7 @@ BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4" +GPUArraysCore="46192b85-c4d5-4398-a991-12ede77f4527" ITensors = "9136182c-28ba-11e9-034c-db9fb085ebd5" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" diff --git a/NDTensors/test/blocksparse.jl b/NDTensors/test/blocksparse.jl index c8d0224725..e85580ab80 100644 --- a/NDTensors/test/blocksparse.jl +++ b/NDTensors/test/blocksparse.jl @@ -1,6 +1,7 @@ using NDTensors using LinearAlgebra using Test +using GPUArraysCore: @allowscalar @testset "BlockSparseTensor basic functionality" begin C = nothing @@ -45,26 +46,29 @@ using Test @test nnzblocks(A) == 2 @test nnzblocks(B) == 2 - A[1, 5] = 15 - A[2, 5] = 25 - - @test A[1, 1] == 0 - @test A[1, 5] == 15 - @test A[2, 5] == 25 + @allowscalar begin + A[1, 5] = 15 + A[2, 5] = 25 + @test A[1, 1] == 0 + @test A[1, 5] == 15 + @test A[2, 5] == 25 + end D = dense(A) - @test D == A + @allowscalar begin + @test 
D == A - for I in eachindex(A) - @test D[I] == A[I] + for I in eachindex(A) + @test D[I] == A[I] + end end A12 = blockview(A, (1, 2)) @test dims(A12) == (2, 5) - for I in eachindex(A12) + @allowscalar for I in eachindex(A12) @test A12[I] == A[I + CartesianIndex(0, 4)] end @@ -73,7 +77,7 @@ using Test C = A + B - for I in eachindex(C) + @allowscalar for I in eachindex(C) @test C[I] == A[I] + B[I] end @@ -84,7 +88,7 @@ using Test @test nnz(A) == nnz(Ap) @test nnzblocks(A) == nnzblocks(Ap) - for I in eachindex(C) + @allowscalar for I in eachindex(C) @test A[I] == Ap[NDTensors.permute(I, (2, 1))] end @@ -159,7 +163,7 @@ using Test Ap = NDTensors.permutedims(A, (3, 2, 1)) - for (bAp, bB) in zip(eachnzblock(Ap), eachnzblock(B)) + @allowscalar for (bAp, bB) in zip(eachnzblock(Ap), eachnzblock(B)) blockAp = blockview(Ap, bAp) blockB = blockview(B, bB) @test reshape(blockAp, size(blockB)) == blockB @@ -170,7 +174,7 @@ using Test @testset "BlockSparseTensor setindex! add block" begin T = BlockSparseTensor([2, 3], [4, 5]) - for I in eachindex(T) + @allowscalar for I in eachindex(T) @test T[I] == 0.0 end @test nnz(T) == 0 @@ -226,35 +230,45 @@ using Test A = dev(BlockSparseTensor([(2, 1), (1, 2)], [2, 2], [2, 2])) randn!(A) U, S, V = svd(A) - @test isapprox(norm(array(U) * array(S) * array(V)' - array(A)), 0; atol=1e-14) + @test @allowscalar isapprox( + norm(array(U) * array(S) * array(V)' - array(A)), 0; atol=1e-14 + ) end @testset "svd example 2" begin A = dev(BlockSparseTensor([(1, 2), (2, 3)], [2, 2], [3, 2, 3])) randn!(A) U, S, V = svd(A) - @test isapprox(norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-14) + @test @allowscalar isapprox( + norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-14 + ) end @testset "svd example 3" begin A = dev(BlockSparseTensor([(2, 1), (3, 2)], [3, 2, 3], [2, 2])) randn!(A) U, S, V = svd(A) - @test isapprox(norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-14) + @test @allowscalar isapprox( + norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-14 + ) end @testset "svd example 4" begin A = dev(BlockSparseTensor([(2, 1), (3, 2)], [2, 3, 4], [5, 6])) randn!(A) U, S, V = svd(A) - @test isapprox(norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-13) + @test @allowscalar isapprox( + norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-13 + ) end @testset "svd example 5" begin A = dev(BlockSparseTensor([(1, 2), (2, 3)], [5, 6], [2, 3, 4])) randn!(A) U, S, V = svd(A) - @test isapprox(norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-13) + @test @allowscalar isapprox( + norm(array(U) * array(S) * array(V)' - array(A)), 0.0; atol=1e-13 + ) end end diff --git a/NDTensors/test/combiner.jl b/NDTensors/test/combiner.jl index e422e69fa0..52e1cae5b4 100644 --- a/NDTensors/test/combiner.jl +++ b/NDTensors/test/combiner.jl @@ -1,6 +1,7 @@ using NDTensors using LinearAlgebra using Test +using GPUArraysCore: @allowscalar # Testing generic block indices using ITensors: QN, Index @@ -21,13 +22,13 @@ using ITensors: QN, Index output_tensor = contract(input_tensor, (1, -1, -2), combiner_tensor, (2, -1, -2)) @test output_tensor isa DenseTensor @test dims(output_tensor) == output_tensor_inds - for i in 1:length(input_tensor) + @allowscalar for i in 1:length(input_tensor) @test input_tensor[i] == output_tensor[i] end # Test uncombining new_input_tensor = contract(output_tensor, (1, -1), combiner_tensor, (-1, 2, 3)) - @test new_input_tensor == input_tensor + @test NDTensors.cpu(new_input_tensor) == 
NDTensors.cpu(input_tensor) # Catch invalid combining input_tensor_inds = (d,) @@ -61,14 +62,14 @@ using ITensors: QN, Index @test output_tensor isa BlockSparseTensor @test dims(output_tensor) == dims(output_tensor_inds) output_tensor = permutedims(output_tensor, (2, 1)) - for i in 1:length(input_tensor) + @allowscalar for i in 1:length(input_tensor) @test input_tensor[i] == output_tensor[i] end # Test uncombining. Broken for inds that are not `Index`. new_input_tensor = contract(output_tensor, (1, -1), combiner_tensor, (-1, 2, 3)) new_input_tensor = permutedims(new_input_tensor, (3, 1, 2)) - @test new_input_tensor == input_tensor + @test NDTensors.cpu(new_input_tensor) == NDTensors.cpu(input_tensor) # Catch invalid combining invalid_input_tensor_inds = (k,) diff --git a/NDTensors/test/dense.jl b/NDTensors/test/dense.jl index eb6678c689..e716c2b4f0 100644 --- a/NDTensors/test/dense.jl +++ b/NDTensors/test/dense.jl @@ -1,5 +1,6 @@ using NDTensors using Test +using GPUArraysCore: @allowscalar @testset "Dense Tensors" begin include("device_list.jl") @@ -10,11 +11,11 @@ using Test # Testing with GPU and CPU backends @testset "DenseTensor basic functionality" begin A = dev(Tensor(elt, (3, 4))) - for I in eachindex(A) + @allowscalar for I in eachindex(A) @test A[I] == 0 end - @test A[2, 1] isa elt + @test @allowscalar A[2, 1] isa elt @test dims(A[1:2, 1]) == (2,) @test dims(A[1:2, 2]) == (2,) @test dims(A[2:3, 2]) == (2,) @@ -25,49 +26,58 @@ using Test randn!(A) - for I in eachindex(A) - @test A[I] != 0 - end - - for I in eachindex(A) - @test A[I] != 0 - end - @test ndims(A) == 2 @test dims(A) == (3, 4) @test inds(A) == (3, 4) - A[1, 1] = 11 + Aview = A[2:3, 2:3] + @test dims(Aview) == (2, 2) - @test A[1, 1] == 11 + B = dev(Tensor(undef, (3, 4))) + randn!(B) + C = copy(A) + C = permutedims!!(C, B, (1, 2), +) - Aview = A[2:3, 2:3] + Ap = permutedims(A, (2, 1)) + @allowscalar begin + for I in eachindex(A) + @test A[I] != 0 + end - @test dims(Aview) == (2, 2) - @test A[2, 2] == Aview[1, 1] + for I in eachindex(A) + @test A[I] != 0 + end - @test A * 2.0 == 2.0 * A + ## TODO Currently this fails with scalar indexing on CUDA + ## Because A + B calls + ## +(A::DenseTensor{Float64, 2, Tuple{Int64, Int64}, Dense{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}, B::DenseTensor{Float64, 2, Tuple{Int64, Int64}, Dense{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}) + ## @ Base ./arraymath.jl:8 + #C = A + B - Asim = similar(data(A), 10) - @test eltype(Asim) == elt - @test length(Asim) == 10 + for I in eachindex(C) + @test C[I] == A[I] + B[I] + end - B = dev(Tensor(undef, (3, 4))) - randn!(B) + for I in eachindex(A) + @test A[I] == Ap[NDTensors.permute(I, (2, 1))] + end - C = A + B + A[1, 1] = 11 + @test A[1, 1] == 11 - for I in eachindex(C) - @test C[I] == A[I] + B[I] + @test A[2, 2] == Aview[1, 1] end - Ap = permutedims(A, (2, 1)) + ## Right now this treats the `Tensor` type as an abstract Array + ## And calls getindex instead of CUDA.==. 
Can fix by converting to CPU or + ## Just looking at the data + @test data(A * 2.0) == data(2.0 * A) - for I in eachindex(A) - @test A[I] == Ap[NDTensors.permute(I, (2, 1))] - end + Asim = similar(data(A), 10) + @test eltype(Asim) == elt + @test length(Asim) == 10 - t = Tensor(complex(elt), (100, 100)) + t = dev(Tensor(complex(elt), (100, 100))) randn!(t) @test conj(data(store(t))) == data(store(conj(t))) @test typeof(conj(t)) <: DenseTensor @@ -75,33 +85,33 @@ using Test @test Dense(complex(elt)) == Dense{complex(elt)}() @test Dense(complex(elt)) == complex(Dense(elt)) - D = Tensor(complex(elt), (100, 100)) + D = dev(Tensor(complex(elt), (100, 100))) @test eltype(D) == complex(elt) @test ndims(D) == 2 @test dim(D) == 100^2 - E = Tensor(complex(elt), undef, (100, 100)) + E = dev(Tensor(complex(elt), undef, (100, 100))) @test eltype(E) == complex(elt) @test ndims(E) == 2 @test dim(E) == 100^2 - F = Tensor(elt, (100, 100)) + F = dev(Tensor(elt, (100, 100))) @test eltype(F) == elt @test ndims(F) == 2 @test dim(F) == 100^2 - G = Tensor(elt, undef, (100, 100)) + G = dev(Tensor(elt, undef, (100, 100))) @test eltype(G) == elt @test ndims(G) == 2 @test dim(G) == 100^2 - H = Tensor(complex(elt), undef, (100, 100)) + H = dev(Tensor(complex(elt), undef, (100, 100))) @test eltype(H) == complex(elt) @test ndims(H) == 2 @test dim(H) == 100^2 - I_arr = rand(elt, 10, 10, 10) - I = Tensor(I_arr, (10, 10, 10)) + I_arr = dev(rand(elt, 10, 10, 10)) + I = dev(Tensor(I_arr, (10, 10, 10))) @test eltype(I) == elt @test dim(I) == 1000 @test Array(I) == I_arr @@ -115,13 +125,13 @@ using Test T = dev(randomTensor(elt, (2, 2))) @test dims(T) == (2, 2) @test eltype(T) == elt - @test T[1, 1] ≉ 0 + @test @allowscalar T[1, 1] ≉ 0 @test norm(T) ≉ 0 Tc = dev(randomTensor(complex(elt), (2, 2))) @test dims(Tc) == (2, 2) @test eltype(Tc) == complex(elt) - @test Tc[1, 1] ≉ 0 + @test @allowscalar Tc[1, 1] ≉ 0 @test norm(Tc) ≉ 0 end @@ -133,7 +143,7 @@ using Test iT = imag(T) cT = conj(T) - for n1 in 1:d1, n2 in 1:d2, n3 in 1:d3 + @allowscalar for n1 in 1:d1, n2 in 1:d2, n3 in 1:d3 @test rT[n1, n2, n3] ≈ real(T[n1, n2, n3]) @test iT[n1, n2, n3] ≈ imag(T[n1, n2, n3]) @test cT[n1, n2, n3] ≈ conj(T[n1, n2, n3]) @@ -153,8 +163,10 @@ using Test @test dims(T) == (2, 3, 4) @test ndims(T) == 3 @test inds(T) == (MyInd(2), MyInd(3), MyInd(4)) - T[2, 1, 2] = 1.21 - @test T[2, 1, 2] == elt(1.21) + @allowscalar begin + T[2, 1, 2] = 1.21 + @test T[2, 1, 2] == elt(1.21) + end @test norm(T) == elt(1.21) T = dev(randomTensor(complex(elt), (MyInd(4), MyInd(3)))) @@ -200,23 +212,23 @@ using Test @testset "No permutation" begin R = dev(Tensor(complex(elt), (2, 2, 1))) fill!(R, NaN) - @test any(isnan, R) + @test @allowscalar any(isnan, R) T1 = dev(randomTensor((2, 2, 1))) T2 = dev(randomTensor(complex(elt), (1, 1))) NDTensors.contract!(R, (1, 2, 3), T1, (1, 2, -1), T2, (-1, 1)) - @test !any(isnan, R) - @test convert(Array, R) ≈ convert(Array, T1) * T2[1] + @test @allowscalar !any(isnan, R) + @test convert(Array, R) ≈ convert(Array, T1) * T2[] end @testset "Permutation" begin R = dev(Tensor(complex(elt), (2, 2, 1))) fill!(R, NaN) - @test any(isnan, R) + @test @allowscalar any(isnan, R) T1 = dev(randomTensor((2, 2, 1))) T2 = dev(randomTensor(complex(elt), (1, 1))) NDTensors.contract!(R, (2, 1, 3), T1, (1, 2, -1), T2, (-1, 1)) - @test !any(isnan, R) - @test convert(Array, R) ≈ permutedims(convert(Array, T1), (2, 1, 3)) * T2[1] + @test @allowscalar !any(isnan, R) + @test convert(Array, R) ≈ permutedims(convert(Array, T1), (2, 1, 3)) * T2[] end end end 
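The test updates above rely on two idioms that recur throughout this change: scalar indexing into device arrays is wrapped in `GPUArraysCore.@allowscalar`, and operations that need the extension overloads (`mul!`, `copyto!`, `permutedims!`) are routed through `expose`. A minimal sketch of both idioms, not part of the patch, using illustrative 4x4 sizes and the CPU device function (any entry of `devices_list`, e.g. `NDTensors.cu`, is used the same way):

    using NDTensors
    using NDTensors.Unwrap
    using GPUArraysCore: @allowscalar
    using LinearAlgebra: mul!

    dev = NDTensors.cpu          # illustrative; swap in a GPU device function
    A = dev(randn(4, 4))
    B = dev(randn(4, 4))
    C = dev(zeros(4, 4))

    # Wrapping the arrays in `expose` lets dispatch reach the Adjoint/Transpose
    # `mul!` methods defined in the CUDA/Metal extensions rather than falling
    # back to generic matrix multiplication.
    mul!(expose(C'), expose(A'), expose(B), true, false)

    # Reading a single element is a scalar operation on GPU arrays, so it is
    # wrapped explicitly.
    @allowscalar C[1, 1]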
diff --git a/NDTensors/test/device_list.jl b/NDTensors/test/device_list.jl index 6433e170b3..021a20a75b 100644 --- a/NDTensors/test/device_list.jl +++ b/NDTensors/test/device_list.jl @@ -12,7 +12,6 @@ function devices_list(test_args) end if "cuda" in test_args || "all" in test_args - CUDA.allowscalar() if CUDA.functional() push!(devs, NDTensors.cu) else diff --git a/NDTensors/test/diag.jl b/NDTensors/test/diag.jl index a712afbc70..168049a9ef 100644 --- a/NDTensors/test/diag.jl +++ b/NDTensors/test/diag.jl @@ -1,5 +1,6 @@ using NDTensors using Test +using GPUArraysCore: @allowscalar @testset "DiagTensor basic functionality" begin include("device_list.jl") @@ -18,7 +19,7 @@ using Test d = rand(real(elt), 10) D = dev(Diag{elt}(d)) @test eltype(D) == elt - @test dev(Array(dense(D))) == convert.(elt, d) + @test @allowscalar dev(Array(dense(D))) == convert.(elt, d) simD = similar(D) @test length(simD) == length(D) @test eltype(simD) == eltype(D) @@ -32,15 +33,26 @@ using Test d = 3 vr = rand(elt, d) D = dev(tensor(Diag(vr), (d, d))) - @test Array(D) == NDTensors.LinearAlgebra.diagm(0 => vr) - @test matrix(D) == NDTensors.LinearAlgebra.diagm(0 => vr) - @test permutedims(D, (2, 1)) == D + Da = Array(D) + Dm = Matrix(D) + @allowscalar begin + @test Da == NDTensors.LinearAlgebra.diagm(0 => vr) + @test Da == NDTensors.LinearAlgebra.diagm(0 => vr) + + ## TODO Currently this permutedims requires scalar indexing on GPU. + Da = permutedims(D, (2, 1)) + @test Da == D + end # Regression test for https://github.com/ITensor/ITensors.jl/issues/1199 S = dev(tensor(Diag(randn(elt, 2)), (2, 2))) + ## This was creating a `Dense{ReshapedArray{Adjoint{Matrix}}}` which, in mul!, was + ## becoming a Transpose{ReshapedArray{Adjoint{Matrix}}} which was causing issues on + ## dispatching GPU mul! V = dev(tensor(Dense(randn(elt, 12, 2)'), (3, 4, 2))) - @test contract(S, (2, -1), V, (3, 4, -1)) ≈ - contract(dense(S), (2, -1), copy(V), (3, 4, -1)) + S1 = contract(S, (2, -1), V, (3, 4, -1)) + S2 = contract(dense(S), (2, -1), copy(V), (3, 4, -1)) + @test @allowscalar S1 ≈ S2 end end @testset "DiagTensor contractions" begin diff --git a/NDTensors/test/linearalgebra.jl b/NDTensors/test/linearalgebra.jl index 7d0a1c1250..11e94dd362 100644 --- a/NDTensors/test/linearalgebra.jl +++ b/NDTensors/test/linearalgebra.jl @@ -1,6 +1,7 @@ using NDTensors using LinearAlgebra using Test +using GPUArraysCore: @allowscalar @testset "random_orthog" begin n, m = 10, 4 @@ -44,16 +45,17 @@ devs = devices_list(copy(ARGS)) # A = dev(randomTensor(elt, (n, m))) # We want to test 0.0 on the diagonal. We need to make all rows equal to gaurantee this with numerical roundoff. - if singular + @allowscalar if singular for i in 2:n A[i, :] = A[1, :] end end Q, X = qx(A; positive=positive) #X is R or L. - @test A ≈ Q * X atol = eps - @test array(Q)' * array(Q) ≈ Id atol = eps - @test array(Q) * array(Q)' ≈ Id atol = eps - if positive + Ap = Q * X + @test NDTensors.cpu(A) ≈ NDTensors.cpu(Ap) atol = eps + @test NDTensors.cpu(array(Q)' * array(Q)) ≈ Id atol = eps + @test NDTensors.cpu(array(Q) * array(Q)') ≈ Id atol = eps + @allowscalar if positive nr, nc = size(X) dr = qx == ql ? Base.max(0, nc - nr) : 0 diagX = diag(X[:, (1 + dr):end]) #location of diag(L) is shifted dr columns over the right. @@ -65,15 +67,16 @@ devs = devices_list(copy(ARGS)) # A = dev(randomTensor(elt, (m, n))) #Tall array # We want to test 0.0 on the diagonal. We need make all rows equal to gaurantee this with numerical roundoff. 
- if singular + @allowscalar if singular for i in 2:m A[i, :] = A[1, :] end end Q, X = qx(A; positive=positive) - @test A ≈ Q * X atol = eps - @test array(Q)' * array(Q) ≈ Id atol = eps - if positive + Ap = Q * X + @test NDTensors.cpu(A) ≈ NDTensors.cpu(Ap) atol = eps + @test NDTensors.cpu(array(Q)' * array(Q)) ≈ Id atol = eps + @allowscalar if positive nr, nc = size(X) dr = qx == ql ? Base.max(0, nc - nr) : 0 diagX = diag(X[:, (1 + dr):end]) #location of diag(L) is shifted dr columns over the right.
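The qr/ql tests above follow the comparison idiom used elsewhere in this change: reconstructions are compared on CPU copies, and any per-element checks run under `@allowscalar`. A short sketch of that idiom, not part of the patch, with a hypothetical 6x4 tensor and `qr` standing in for the `qx` factorization in the loop above:

    using NDTensors
    using LinearAlgebra
    using Test
    using GPUArraysCore: @allowscalar

    dev = NDTensors.cpu          # or NDTensors.cu when CUDA is functional
    A = dev(randomTensor(Float64, (6, 4)))

    Q, R = qr(A; positive=true)

    # Compare on the CPU; elementwise `≈` between device-backed tensors can
    # otherwise fall back to scalar indexing.
    @test NDTensors.cpu(A) ≈ NDTensors.cpu(Q * R)

    # Reading individual entries (here the first diagonal element of R) is a
    # scalar operation, so it is wrapped explicitly.
    @allowscalar @test real(R[1, 1]) > 0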