From e571300fae2e11e47eae65ea36649fd326e1835f Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Mon, 16 Oct 2023 21:10:57 -0400 Subject: [PATCH] [NDTensors][NDTensorsCUDAExt] Improve performance of CUDA backend (#1194) --- .../ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl | 3 +- NDTensors/ext/NDTensorsCUDAExt/adapt.jl | 2 +- NDTensors/ext/NDTensorsCUDAExt/imports.jl | 2 +- NDTensors/ext/NDTensorsCUDAExt/iscu.jl | 1 + NDTensors/src/NDTensors.jl | 6 + NDTensors/src/abstractarray/fill.jl | 10 +- NDTensors/src/abstractarray/mul.jl | 20 ++ NDTensors/src/abstractarray/permutedims.jl | 37 ++++ NDTensors/src/abstractarray/similar.jl | 5 + .../abstractarray/tensoralgebra/contract.jl | 176 +++++++++++++++++ NDTensors/src/array/mul.jl | 4 + NDTensors/src/array/permutedims.jl | 15 ++ NDTensors/src/arraytensor/array.jl | 4 +- NDTensors/src/dense/dense.jl | 1 - NDTensors/src/dense/densetensor.jl | 9 +- NDTensors/src/dense/tensoralgebra/contract.jl | 180 +----------------- NDTensors/src/imports.jl | 2 - NDTensors/src/linearalgebra/linearalgebra.jl | 44 ++++- NDTensors/src/tensor/permutedims.jl | 1 + NDTensors/src/tensor/set_types.jl | 3 + NDTensors/test/blocksparse.jl | 2 +- NDTensors/test/linearalgebra.jl | 4 - src/ITensors.jl | 1 + src/broadcast.jl | 49 +++-- src/imports.jl | 4 +- src/itensor.jl | 7 +- src/mps/dmrg.jl | 10 +- src/set_types.jl | 1 + src/tensor_operations/permutations.jl | 4 +- test/ITensorNetworkMaps/utils/utils.jl | 2 +- test/base/test_itensor_scalar_contract.jl | 8 +- 31 files changed, 387 insertions(+), 230 deletions(-) create mode 100644 NDTensors/ext/NDTensorsCUDAExt/iscu.jl create mode 100644 NDTensors/src/abstractarray/mul.jl create mode 100644 NDTensors/src/abstractarray/permutedims.jl create mode 100644 NDTensors/src/abstractarray/tensoralgebra/contract.jl create mode 100644 NDTensors/src/array/mul.jl create mode 100644 NDTensors/src/array/permutedims.jl create mode 100644 NDTensors/src/tensor/permutedims.jl create mode 100644 src/set_types.jl diff --git a/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl b/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl index 41f3e6ea57..369c398d98 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl @@ -4,7 +4,7 @@ using NDTensors using NDTensors.SetParameters using Adapt using Functors -using LinearAlgebra: BlasFloat +using LinearAlgebra if isdefined(Base, :get_extension) using CUDA @@ -18,6 +18,7 @@ end include("imports.jl") include("set_types.jl") +include("iscu.jl") include("adapt.jl") include("linearalgebra.jl") end diff --git a/NDTensors/ext/NDTensorsCUDAExt/adapt.jl b/NDTensors/ext/NDTensorsCUDAExt/adapt.jl index 718901da4c..165b5405ce 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/adapt.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/adapt.jl @@ -16,7 +16,7 @@ buffertype(::NDTensorCuArrayAdaptor{B}) where {B} = B function Adapt.adapt_storage(adaptor::NDTensorCuArrayAdaptor, xs::AbstractArray) ElT = eltype(xs) BufT = buffertype(adaptor) - return isbits(xs) ? xs : CuArray{ElT,1,BufT}(xs) + return isbits(xs) ? 
xs : adapt(CuArray{ElT,1,BufT}, xs) end function NDTensors.adapt_storagetype( diff --git a/NDTensors/ext/NDTensorsCUDAExt/imports.jl b/NDTensors/ext/NDTensorsCUDAExt/imports.jl index d8736ae0a2..96b8383c74 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/imports.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/imports.jl @@ -1,6 +1,6 @@ import NDTensors: cu, set_ndims, set_eltype, set_eltype_if_unspecified, similartype import NDTensors: - ContractionProperties, _contract!, GemmBackend, auto_select_backend, _gemm! + ContractionProperties, _contract!, GemmBackend, auto_select_backend, _gemm!, iscu import NDTensors.SetParameters: nparameters, get_parameter, set_parameter, default_parameter import .CUDA: CuArrayAdaptor diff --git a/NDTensors/ext/NDTensorsCUDAExt/iscu.jl b/NDTensors/ext/NDTensorsCUDAExt/iscu.jl new file mode 100644 index 0000000000..5c3fc95a25 --- /dev/null +++ b/NDTensors/ext/NDTensorsCUDAExt/iscu.jl @@ -0,0 +1 @@ +iscu(::Type{<:CuArray}) = true diff --git a/NDTensors/src/NDTensors.jl b/NDTensors/src/NDTensors.jl index 8445b925cb..641253dec0 100644 --- a/NDTensors/src/NDTensors.jl +++ b/NDTensors/src/NDTensors.jl @@ -50,8 +50,12 @@ include("abstractarray/set_types.jl") include("abstractarray/to_shape.jl") include("abstractarray/similar.jl") include("abstractarray/ndims.jl") +include("abstractarray/permutedims.jl") include("abstractarray/fill.jl") +include("abstractarray/mul.jl") include("array/set_types.jl") +include("array/permutedims.jl") +include("array/mul.jl") include("tupletools.jl") include("emptynumber.jl") include("nodata.jl") @@ -63,9 +67,11 @@ include("tensor/tensor.jl") include("dims.jl") include("tensor/set_types.jl") include("tensor/similar.jl") +include("tensor/permutedims.jl") include("adapt.jl") include("tensoralgebra/generic_tensor_operations.jl") include("tensoralgebra/contraction_logic.jl") +include("abstractarray/tensoralgebra/contract.jl") ##################################### # DenseTensor and DiagTensor diff --git a/NDTensors/src/abstractarray/fill.jl b/NDTensors/src/abstractarray/fill.jl index 85e8377d3a..146459a4ce 100644 --- a/NDTensors/src/abstractarray/fill.jl +++ b/NDTensors/src/abstractarray/fill.jl @@ -1,13 +1,11 @@ -function generic_randn(arraytype::Type{<:AbstractArray}, dim::Integer=0) +function generic_randn( + arraytype::Type{<:AbstractArray}, dim::Integer=0; rng=Random.default_rng() +) arraytype_specified = set_unspecified_parameters( leaf_parenttype(arraytype), DefaultParameters() ) data = similar(arraytype_specified, dim) - ElT = eltype(data) - for i in 1:length(data) - data[i] = randn(ElT) - end - return data + return randn!(rng, data) end function generic_zeros(arraytype::Type{<:AbstractArray}, dim::Integer=0) diff --git a/NDTensors/src/abstractarray/mul.jl b/NDTensors/src/abstractarray/mul.jl new file mode 100644 index 0000000000..954ecca3e1 --- /dev/null +++ b/NDTensors/src/abstractarray/mul.jl @@ -0,0 +1,20 @@ +function mul!!(CM::AbstractArray, AM::AbstractArray, BM::AbstractArray, α, β) + return mul!!( + leaf_parenttype(CM), CM, leaf_parenttype(AM), AM, leaf_parenttype(BM), BM, α, β + ) + return CM +end + +function mul!!( + ::Type{<:AbstractArray}, + CM, + ::Type{<:AbstractArray}, + AM, + ::Type{<:AbstractArray}, + BM, + α, + β, +) + mul!(CM, AM, BM, α, β) + return CM +end diff --git a/NDTensors/src/abstractarray/permutedims.jl b/NDTensors/src/abstractarray/permutedims.jl new file mode 100644 index 0000000000..f23db4ea73 --- /dev/null +++ b/NDTensors/src/abstractarray/permutedims.jl @@ -0,0 +1,37 @@ +## NOTICE!!: Here we are not importing 
Base.permutedims or Base.permutedims! but +## are writing our own implementation. This allows us to +# NDTensors.permutedims +function permutedims(M::AbstractArray, perm) + return permutedims(leaf_parenttype(M), M, perm) +end + +# NDTensors.permutedims +function permutedims(::Type{<:AbstractArray}, M, perm) + return Base.permutedims(M, perm) +end + +# NDTensors.permutedims! +function permutedims!(Mdest::AbstractArray, M::AbstractArray, perm) + return permutedims!(leaf_parenttype(Mdest), Mdest, leaf_parenttype(M), M, perm) +end + +# NDTensors.permutedims! +function permutedims!(::Type{<:AbstractArray}, Mdest, ::Type{<:AbstractArray}, M, perm) + return Base.permutedims!(Mdest, M, perm) +end + +function permutedims!!(B::AbstractArray, A::AbstractArray, perm, f) + return permutedims!!(leaf_parenttype(B), B, leaf_parenttype(A), A, perm, f) +end + +function permutedims!!( + Bleaftype::Type{<:AbstractArray}, B, Aleaftype::Type{<:AbstractArray}, A, perm, f +) + permutedims!(Bleaftype, B, Aleaftype, A, perm, f) + return B +end + +function permutedims!(::Type{<:AbstractArray}, B, ::Type{<:AbstractArray}, A, perm, f) + B .= f.(B, Base.permutedims(A, perm)) + return B +end diff --git a/NDTensors/src/abstractarray/similar.jl b/NDTensors/src/abstractarray/similar.jl index 7e8313cd1c..873e22e44e 100644 --- a/NDTensors/src/abstractarray/similar.jl +++ b/NDTensors/src/abstractarray/similar.jl @@ -56,6 +56,11 @@ function similar(arraytype::Type{<:AbstractArray}, dims::Tuple) return similartype(arraytype, shape)(undef, NDTensors.to_shape(arraytype, shape)) end +# For when there are CUArray specific issues inline +iscu(A::AbstractArray) = iscu(typeof(A)) +function iscu(A::Type{<:AbstractArray}) + return (leaf_parenttype(A) == A ? false : iscu(leaf_parenttype(A))) +end # This function actually allocates the data. # Catches conversions of dimensions specified by ranges # dimensions specified by integers with `Base.to_shape`. 
diff --git a/NDTensors/src/abstractarray/tensoralgebra/contract.jl b/NDTensors/src/abstractarray/tensoralgebra/contract.jl new file mode 100644 index 0000000000..5db9bf5370 --- /dev/null +++ b/NDTensors/src/abstractarray/tensoralgebra/contract.jl @@ -0,0 +1,176 @@ +using LinearAlgebra: BlasFloat +export backend_auto, backend_blas, backend_generic + +@eval struct GemmBackend{T} + (f::Type{<:GemmBackend})() = $(Expr(:new, :f)) +end +GemmBackend(s) = GemmBackend{Symbol(s)}() +macro GemmBackend_str(s) + return :(GemmBackend{$(Expr(:quote, Symbol(s)))}) +end + +const gemm_backend = Ref(:Auto) +function backend_auto() + return gemm_backend[] = :Auto +end +function backend_blas() + return gemm_backend[] = :BLAS +end +function backend_generic() + return gemm_backend[] = :Generic +end + +@inline function auto_select_backend( + ::Type{<:StridedVecOrMat{<:BlasFloat}}, + ::Type{<:StridedVecOrMat{<:BlasFloat}}, + ::Type{<:StridedVecOrMat{<:BlasFloat}}, +) + return GemmBackend(:BLAS) +end + +@inline function auto_select_backend( + ::Type{<:AbstractVecOrMat}, ::Type{<:AbstractVecOrMat}, ::Type{<:AbstractVecOrMat} +) + return GemmBackend(:Generic) +end + +function _gemm!( + tA, tB, alpha, A::TA, B::TB, beta, C::TC +) where {TA<:AbstractVecOrMat,TB<:AbstractVecOrMat,TC<:AbstractVecOrMat} + if gemm_backend[] == :Auto + _gemm!(auto_select_backend(TA, TB, TC), tA, tB, alpha, A, B, beta, C) + else + _gemm!(GemmBackend(gemm_backend[]), tA, tB, alpha, A, B, beta, C) + end +end + +# BLAS matmul +function _gemm!( + ::GemmBackend{:BLAS}, + tA, + tB, + alpha, + A::AbstractVecOrMat, + B::AbstractVecOrMat, + beta, + C::AbstractVecOrMat, +) + #@timeit_debug timer "BLAS.gemm!" begin + return BLAS.gemm!(tA, tB, alpha, A, B, beta, C) + #end # @timeit +end + +# generic matmul +function _gemm!( + ::GemmBackend{:Generic}, + tA, + tB, + alpha::AT, + A::AbstractVecOrMat, + B::AbstractVecOrMat, + beta::BT, + C::AbstractVecOrMat, +) where {AT,BT} + mul!(C, tA == 'T' ? transpose(A) : A, tB == 'T' ? 
transpose(B) : B, alpha, beta) + return C +end + +# Non-trivial permutation +function _contract_scalar_perm!( + Rᵃ::AbstractArray{ElR}, Tᵃ::AbstractArray, perm, α, β=zero(ElR) +) where {ElR} + if iszero(β) + if iszero(α) + fill!(Rᵃ, 0) + else + Rᵃ = permutedims!!(Rᵃ, Tᵃ, perm, (r, t) -> α * t) + end + elseif isone(β) + if iszero(α) + # Rᵃ .= Rᵃ + # No-op + else + Rᵃ = permutedims!!(Rᵃ, Tᵃ, perm, (r, t) -> r + α * t) + end + else + if iszero(α) + # Rᵃ .= β .* Rᵃ + LinearAlgebra.scal!(length(Rᵃ), β, Rᵃ, 1) + else + Rᵃ .= α .* permutedims(Tᵃ, perm) .+ β .* Rᵃ + end + end + return Rᵃ +end + +function _contract!( + CT::AbstractArray{El,NC}, + AT::AbstractArray{El,NA}, + BT::AbstractArray{El,NB}, + props::ContractionProperties, + α::Number=one(El), + β::Number=zero(El), +) where {El,NC,NA,NB} + tA = 'N' + if props.permuteA + #@timeit_debug timer "_contract!: permutedims A" begin + Ap = permutedims(AT, props.PA) + #end # @timeit + AM = transpose(reshape(Ap, (props.dmid, props.dleft))) + else + #A doesn't have to be permuted + if Atrans(props) + AM = transpose(reshape(AT, (props.dmid, props.dleft))) + else + AM = reshape(AT, (props.dleft, props.dmid)) + end + end + + tB = 'N' + if props.permuteB + #@timeit_debug timer "_contract!: permutedims B" begin + Bp = permutedims(BT, props.PB) + #end # @timeit + BM = reshape(Bp, (props.dmid, props.dright)) + else + if Btrans(props) + BM = transpose(reshape(BT, (props.dright, props.dmid))) + else + BM = reshape(BT, (props.dmid, props.dright)) + end + end + + # TODO: this logic may be wrong + if props.permuteC + # if we are computing C = α * A B + β * C + # we need to make sure C is permuted to the same + # ordering as A B which is the inverse of props.PC + if β ≠ 0 + CM = reshape(permutedims(CT, invperm(props.PC)), (props.dleft, props.dright)) + else + # Need to copy here since we will be permuting + # into C later + CM = reshape(copy(CT), (props.dleft, props.dright)) + end + else + if Ctrans(props) + CM = transpose(reshape(CT, (props.dright, props.dleft))) + else + CM = reshape(CT, (props.dleft, props.dright)) + end + end + + #tC = similar(CM) + #_gemm!(tA, tB, El(α), AM, BM, El(β), CM) + CM = mul!!(CM, AM, BM, El(α), El(β)) + + if props.permuteC + Cr = reshape(CM, props.newCrange) + # TODO: use invperm(pC) here? + #@timeit_debug timer "_contract!: permutedims C" begin + CT .= permutedims(Cr, props.PC) + #end # @timeit + end + + return CT +end diff --git a/NDTensors/src/array/mul.jl b/NDTensors/src/array/mul.jl new file mode 100644 index 0000000000..54104ce502 --- /dev/null +++ b/NDTensors/src/array/mul.jl @@ -0,0 +1,4 @@ +function mul!!(::Type{<:Array}, CM, ::Type{<:Array}, AM, ::Type{<:Array}, BM, α, β) + @strided mul!(CM, AM, BM, α, β) + return CM +end diff --git a/NDTensors/src/array/permutedims.jl b/NDTensors/src/array/permutedims.jl new file mode 100644 index 0000000000..7a033201fb --- /dev/null +++ b/NDTensors/src/array/permutedims.jl @@ -0,0 +1,15 @@ +# NDTensors.permutedims +function permutedims(::Type{<:Array}, M, perm) + return @strided Base.permutedims(M, perm) +end + +# NDTensors.permutedims! 
+function permutedims!(::Type{<:Array}, Mdest, ::Type{<:Array}, M, perm) + @strided Mdest .= Base.permutedims(M, perm) + return Mdest +end + +function permutedims!(::Type{<:Array}, B, ::Type{<:Array}, A, perm, f) + @strided B .= f.(B, Base.permutedims(A, perm)) + return B +end diff --git a/NDTensors/src/arraytensor/array.jl b/NDTensors/src/arraytensor/array.jl index 40e89c5181..997204394d 100644 --- a/NDTensors/src/arraytensor/array.jl +++ b/NDTensors/src/arraytensor/array.jl @@ -61,6 +61,8 @@ end function permutedims!( output_array::MatrixOrArrayStorage, array::MatrixOrArrayStorage, perm, f::Function ) - @strided output_array .= f.(output_array, permutedims(array, perm)) + output_array = permutedims!!( + leaf_parenttype(output_array), output_array, leaf_parenttype(array), array, perm, f + ) return output_array end diff --git a/NDTensors/src/dense/dense.jl b/NDTensors/src/dense/dense.jl index 12e9b05882..96afd695b5 100644 --- a/NDTensors/src/dense/dense.jl +++ b/NDTensors/src/dense/dense.jl @@ -1,7 +1,6 @@ # # Dense storage # -using LinearAlgebra: BlasFloat struct Dense{ElT,DataT<:AbstractArray} <: TensorStorage{ElT} data::DataT diff --git a/NDTensors/src/dense/densetensor.jl b/NDTensors/src/dense/densetensor.jl index 8a019e1f3a..ed773d6f0e 100644 --- a/NDTensors/src/dense/densetensor.jl +++ b/NDTensors/src/dense/densetensor.jl @@ -80,6 +80,10 @@ end # Single index # +@propagate_inbounds function getindex(T::DenseTensor{<:Number}) + return (iscu(T) ? NDTensors.cpu(data(T))[] : data(T)[]) +end + @propagate_inbounds function getindex(T::DenseTensor{<:Number}, I::Integer...) Base.@_inline_meta return getindex(data(T), Base._sub2ind(T, I...)) @@ -195,7 +199,7 @@ function permutedims!( ) where {N,StoreT<:StridedArray} RA = array(R) TA = array(T) - @strided RA .= permutedims(TA, perm) + permutedims!(RA, TA, perm) return R end @@ -243,8 +247,7 @@ function permutedims!( end RA = array(R) TA = array(T) - @strided RA .= f.(RA, permutedims(TA, perm)) - return R + return permutedims!!(RA, TA, perm, f) end """ diff --git a/NDTensors/src/dense/tensoralgebra/contract.jl b/NDTensors/src/dense/tensoralgebra/contract.jl index b5faa91616..e1d99ce2eb 100644 --- a/NDTensors/src/dense/tensoralgebra/contract.jl +++ b/NDTensors/src/dense/tensoralgebra/contract.jl @@ -1,79 +1,3 @@ -export backend_auto, backend_blas, backend_generic - -@eval struct GemmBackend{T} - (f::Type{<:GemmBackend})() = $(Expr(:new, :f)) -end -GemmBackend(s) = GemmBackend{Symbol(s)}() -macro GemmBackend_str(s) - return :(GemmBackend{$(Expr(:quote, Symbol(s)))}) -end - -const gemm_backend = Ref(:Auto) -function backend_auto() - return gemm_backend[] = :Auto -end -function backend_blas() - return gemm_backend[] = :BLAS -end -function backend_generic() - return gemm_backend[] = :Generic -end - -@inline function auto_select_backend( - ::Type{<:StridedVecOrMat{<:BlasFloat}}, - ::Type{<:StridedVecOrMat{<:BlasFloat}}, - ::Type{<:StridedVecOrMat{<:BlasFloat}}, -) - return GemmBackend(:BLAS) -end - -@inline function auto_select_backend( - ::Type{<:AbstractVecOrMat}, ::Type{<:AbstractVecOrMat}, ::Type{<:AbstractVecOrMat} -) - return GemmBackend(:Generic) -end - -function _gemm!( - tA, tB, alpha, A::TA, B::TB, beta, C::TC -) where {TA<:AbstractVecOrMat,TB<:AbstractVecOrMat,TC<:AbstractVecOrMat} - if gemm_backend[] == :Auto - _gemm!(auto_select_backend(TA, TB, TC), tA, tB, alpha, A, B, beta, C) - else - _gemm!(GemmBackend(gemm_backend[]), tA, tB, alpha, A, B, beta, C) - end -end - -# BLAS matmul -function _gemm!( - ::GemmBackend{:BLAS}, - tA, - tB, - 
alpha, - A::AbstractVecOrMat, - B::AbstractVecOrMat, - beta, - C::AbstractVecOrMat, -) - #@timeit_debug timer "BLAS.gemm!" begin - return BLAS.gemm!(tA, tB, alpha, A, B, beta, C) - #end # @timeit -end - -# generic matmul -function _gemm!( - ::GemmBackend{:Generic}, - tA, - tB, - alpha::AT, - A::AbstractVecOrMat, - B::AbstractVecOrMat, - beta::BT, - C::AbstractVecOrMat, -) where {AT,BT} - mul!(C, tA == 'T' ? transpose(A) : A, tB == 'T' ? transpose(B) : B, alpha, beta) - return C -end - function contraction_output(tensor1::DenseTensor, tensor2::DenseTensor, indsR) tensortypeR = contraction_output_type(typeof(tensor1), typeof(tensor2), indsR) return NDTensors.similar(tensortypeR, indsR) @@ -167,34 +91,6 @@ function _contract_scalar_noperm!( return R end -# Non-trivial permutation -function _contract_scalar_perm!( - Rᵃ::AbstractArray{ElR}, Tᵃ::AbstractArray, perm, α, β=zero(ElR) -) where {ElR} - if iszero(β) - if iszero(α) - fill!(Rᵃ, 0) - else - @strided Rᵃ .= α .* permutedims(Tᵃ, perm) - end - elseif isone(β) - if iszero(α) - # Rᵃ .= Rᵃ - # No-op - else - @strided Rᵃ .= α .* permutedims(Tᵃ, perm) .+ Rᵃ - end - else - if iszero(α) - # Rᵃ .= β .* Rᵃ - LinearAlgebra.scal!(length(Rᵃ), β, Rᵃ, 1) - else - Rᵃ .= α .* permutedims(Tᵃ, perm) .+ β .* Rᵃ - end - end - return Rᵃ -end - function _contract_scalar_maybe_perm!( ::Order{N}, R::DenseTensor{ElR,NR}, labelsR, T::DenseTensor, labelsT, α, β=zero(ElR) ) where {ElR,NR,N} @@ -233,9 +129,9 @@ function _contract_scalar_maybe_perm!( β=zero(ElR), ) where {ElR,NR} if nnz(T₁) == 1 - _contract_scalar_maybe_perm!(R, labelsR, T₂, labelsT₂, α * T₁[1], β) + _contract_scalar_maybe_perm!(R, labelsR, T₂, labelsT₂, α * T₁[], β) elseif nnz(T₂) == 1 - _contract_scalar_maybe_perm!(R, labelsR, T₁, labelsT₁, α * T₂[1], β) + _contract_scalar_maybe_perm!(R, labelsR, T₁, labelsT₁, α * T₂[], β) else error("In _contract_scalar_perm!, one tensor must be a scalar") end @@ -333,75 +229,3 @@ function _contract!( return _contract!(C, A, B, props, α, β) end - -function _contract!( - CT::AbstractArray{El,NC}, - AT::AbstractArray{El,NA}, - BT::AbstractArray{El,NB}, - props::ContractionProperties, - α::Number=one(El), - β::Number=zero(El), -) where {El,NC,NA,NB} - tA = 'N' - if props.permuteA - #@timeit_debug timer "_contract!: permutedims A" begin - @strided Ap = permutedims(AT, props.PA) - #end # @timeit - AM = transpose(reshape(Ap, (props.dmid, props.dleft))) - else - #A doesn't have to be permuted - if Atrans(props) - AM = transpose(reshape(AT, (props.dmid, props.dleft))) - else - AM = reshape(AT, (props.dleft, props.dmid)) - end - end - - tB = 'N' - if props.permuteB - #@timeit_debug timer "_contract!: permutedims B" begin - @strided Bp = permutedims(BT, props.PB) - #end # @timeit - BM = reshape(Bp, (props.dmid, props.dright)) - else - if Btrans(props) - BM = transpose(reshape(BT, (props.dright, props.dmid))) - else - BM = reshape(BT, (props.dmid, props.dright)) - end - end - - # TODO: this logic may be wrong - if props.permuteC - # if we are computing C = α * A B + β * C - # we need to make sure C is permuted to the same - # ordering as A B which is the inverse of props.PC - if β ≠ 0 - CM = reshape(permutedims(CT, invperm(props.PC)), (props.dleft, props.dright)) - else - # Need to copy here since we will be permuting - # into C later - CM = reshape(copy(CT), (props.dleft, props.dright)) - end - else - if Ctrans(props) - CM = transpose(reshape(CT, (props.dright, props.dleft))) - else - CM = reshape(CT, (props.dleft, props.dright)) - end - end - - #tC = similar(CM) - #_gemm!(tA, 
tB, El(α), AM, BM, El(β), CM) - @strided mul!(CM, AM, BM, El(α), El(β)) - - if props.permuteC - Cr = reshape(CM, props.newCrange) - # TODO: use invperm(pC) here? - #@timeit_debug timer "_contract!: permutedims C" begin - @strided CT .= permutedims(Cr, props.PC) - #end # @timeit - end - - return CT -end diff --git a/NDTensors/src/imports.jl b/NDTensors/src/imports.jl index 13b466fca4..4b58569f21 100644 --- a/NDTensors/src/imports.jl +++ b/NDTensors/src/imports.jl @@ -31,8 +31,6 @@ import Base: iterate, length, map, - permutedims, - permutedims!, print, promote_rule, randn, diff --git a/NDTensors/src/linearalgebra/linearalgebra.jl b/NDTensors/src/linearalgebra/linearalgebra.jl index b681cd4a9e..e2c8133eb9 100644 --- a/NDTensors/src/linearalgebra/linearalgebra.jl +++ b/NDTensors/src/linearalgebra/linearalgebra.jl @@ -218,7 +218,9 @@ function LinearAlgebra.eigen( use_relative_cutoff::Bool = get(kwargs, :use_relative_cutoff, use_relative_cutoff) matrixT = matrix(T) - if any(!isfinite, matrixT) + ## TODO Here I am calling parent to ensure that the correct `any` function + ## is envoked for non-cpu matrices + if any(!isfinite, parent(matrixT)) throw( ArgumentError( "Trying to perform the eigendecomposition of a matrix containing NaNs or Infs" @@ -234,9 +236,11 @@ function LinearAlgebra.eigen( VM = VM[:, p] if truncate + cpu_dm = NDTensors.cpu(DM) truncerr, _ = truncate!( - DM; mindim, maxdim, cutoff, use_absolute_cutoff, use_relative_cutoff, kwargs... + cpu_dm; mindim, maxdim, cutoff, use_absolute_cutoff, use_relative_cutoff, kwargs... ) + DM = adapt(typeof(DM), cpu_dm) dD = length(DM) if dD < size(VM, 2) VM = VM[:, 1:dD] @@ -400,6 +404,8 @@ function qx(qx::Function, T::DenseTensor{<:Any,2}; kwargs...) Qinds = IndsT((ind(T, 1), q)) Xinds = IndsT((q, ind(T, 2))) QM = convert(typeof(XM), QM) + ## Here I convert QM twice because of an issue in CUDA where convert does not take QM to be a UnifiedBuffer array + QM = convert(typeof(XM), QM) Q = tensor(Dense(vec(QM)), Qinds) #Q was strided X = tensor(Dense(vec(XM)), Xinds) return Q, X @@ -418,9 +424,15 @@ non-negative. Such a QR decomposition of a matrix is unique. Returns a tuple (Q,R). """ function qr_positive(M::AbstractMatrix) + iscuda = iscu(M) + if iscuda + cutype = leaf_parenttype(M) + M = NDTensors.cpu(M) + end sparseQ, R = qr(M) Q = convert(typeof(R), sparseQ) nc = size(Q, 2) + ## TODO issue here for GPU because tying to access indices for c in 1:nc if R[c, c] != 0.0 #sign(0.0)==0.0 so we don't want to zero out a column of Q. sign_Rc = sign(R[c, c]) @@ -430,6 +442,10 @@ function qr_positive(M::AbstractMatrix) end end end + if iscuda + Q = adapt(cutype, Q) + R = adapt(cutype, R) + end return (Q, R) end @@ -442,6 +458,11 @@ non-negative. Such a QL decomposition of a matrix is unique. Returns a tuple (Q,L). """ function ql_positive(M::AbstractMatrix) + iscuda = iscu(M) + if iscuda + cutype = leaf_parenttype(M) + M = NDTensors.cpu(M) + end sparseQ, L = ql(M) Q = convert(typeof(L), sparseQ) nr, nc = size(L) @@ -455,6 +476,10 @@ function ql_positive(M::AbstractMatrix) end end end + if iscuda + Q = adapt(cutype, Q) + L = adapt(cutype, L) + end return (Q, L) end @@ -467,13 +492,26 @@ function ql(A::AbstractMatrix; kwargs...) T = eltype(A) AA = similar(A, LinearAlgebra._qreltype(T), size(A)) copyto!(AA, A) - return ql!(AA; kwargs...) + iscuda = iscu(AA) + if iscuda + cutype = leaf_parenttype(AA) + AA = NDTensors.cpu(AA) + end + Q, L = ql!(AA; kwargs...) 
+ if iscuda + Q = adapt(cutype, Q) + L = adapt(cutype, L) + end + return (Q, L) end # # This is where the low level call to lapack actually occurs. Most of the work is # about unpacking Q and L from the A matrix. # function ql!(A::StridedMatrix{<:LAPACK.BlasFloat}) + if iscu(A) + throw("Error: ql is not implemented in CUDA.jl") + end tau = Base.similar(A, min(size(A)...)) x = LAPACK.geqlf!(A, tau) #save L from the lower portion of A, before orgql! mangles it! diff --git a/NDTensors/src/tensor/permutedims.jl b/NDTensors/src/tensor/permutedims.jl new file mode 100644 index 0000000000..f79e229504 --- /dev/null +++ b/NDTensors/src/tensor/permutedims.jl @@ -0,0 +1 @@ +Base.permutedims(A::Tensor, perm) = NDTensors.permutedims(A, perm) diff --git a/NDTensors/src/tensor/set_types.jl b/NDTensors/src/tensor/set_types.jl index 77f7130d6b..eb1fc548d0 100644 --- a/NDTensors/src/tensor/set_types.jl +++ b/NDTensors/src/tensor/set_types.jl @@ -23,3 +23,6 @@ end function set_indstype(tensortype::Type{<:Tensor}, inds::Tuple) return Tensor{eltype(tensortype),length(inds),storagetype(tensortype),typeof(inds)} end + +parenttype(tensortype::Type{<:Tensor}) = storagetype(tensortype) +parenttype(storagetype::Type{<:TensorStorage}) = datatype(storagetype) diff --git a/NDTensors/test/blocksparse.jl b/NDTensors/test/blocksparse.jl index 6a541f15a1..8f52f51f14 100644 --- a/NDTensors/test/blocksparse.jl +++ b/NDTensors/test/blocksparse.jl @@ -162,7 +162,7 @@ end @test nnzblocks(A) == nnzblocks(B) @test nnz(A) == nnz(B) - Ap = permutedims(A, (3, 2, 1)) + Ap = NDTensors.permutedims(A, (3, 2, 1)) for (bAp, bB) in zip(eachnzblock(Ap), eachnzblock(B)) blockAp = blockview(Ap, bAp) diff --git a/NDTensors/test/linearalgebra.jl b/NDTensors/test/linearalgebra.jl index a643837e2d..1ffda564ab 100644 --- a/NDTensors/test/linearalgebra.jl +++ b/NDTensors/test/linearalgebra.jl @@ -51,10 +51,6 @@ devs = devices_list(copy(ARGS)) A[i, :] = A[1, :] end end - if qx == ql && dev != NDTensors.cpu - @test_broken qx(A; positive=positive) - continue - end Q, X = qx(A; positive=positive) #X is R or L. @test A ≈ Q * X atol = eps @test array(Q)' * array(Q) ≈ Id atol = eps diff --git a/src/ITensors.jl b/src/ITensors.jl index c1effe5ebe..d571e436bf 100644 --- a/src/ITensors.jl +++ b/src/ITensors.jl @@ -138,6 +138,7 @@ include("broadcast.jl") include("tensor_operations/matrix_decomposition.jl") include("iterativesolvers.jl") include("adapt.jl") +include("set_types.jl") ##################################### # Experimental ITensor Functions diff --git a/src/broadcast.jl b/src/broadcast.jl index af3082ac1a..bbb4570790 100644 --- a/src/broadcast.jl +++ b/src/broadcast.jl @@ -134,7 +134,6 @@ find_type(::Type{T}, ::Tuple{}) where {T} = nothing # function Base.copyto!(T::ITensor, bc::Broadcasted) - @show typeof(bc) return error( "The broadcasting operation you are attempting is not yet implemented for ITensors, please raise an issue if you would like it to be supported.", ) @@ -171,7 +170,11 @@ function Base.copyto!( ) α = find_type(Number, bc.args) A = find_type(ITensor, bc.args) - map!((t, a) -> bc.f(a, α), T, T, A) + ## GPU compilers can have a problem when map is + ## Given bc.f. 
map seems to make a closure with a + ## relatively complicated signature + f = bc.f + map!((t, a) -> f(a, α), T, T, A) return T end @@ -183,10 +186,14 @@ function Base.copyto!( R::ITensor, bc::Broadcasted{ITensorStyle,<:Any,typeof(/),<:Tuple{<:ITensor,<:ITensor}} ) T1, T2 = bc.args + f = bc.f if R === T1 - map!((t1, t2) -> bc.f(t1, t2), R, T1, T2) + map!((t1, t2) -> f(t1, t2), R, T1, T2) + ## I tried this and it is numberically wrong + #map!(f, R, T1, T2) elseif R === T2 - map!((t1, t2) -> bc.f(t2, t1), R, T2, T1) + map!((t1, t2) -> f(t2, t1), R, T2, T1) + #map!(f, R, T2, T1) else error("When dividing two ITensors in-place, one must be the same as the output ITensor") end @@ -223,7 +230,8 @@ function Base.copyto!( ) α = find_type(Number, bc.args) A = find_type(ITensor, bc.args) - map!((t, a) -> bc.f(α, a), T, T, A) + f = bc.f + map!((t, a) -> f(α, a), T, T, A) return T end @@ -249,7 +257,8 @@ function Base.copyto!( powf = find_type(Base.RefValue{<:Function}, bc.args).x @assert !isnothing(powf) T = find_type(ITensor, bc.args) - map!((r, t) -> bc.f(^, t, α), R, R, T) + f = bc.f + map!((r, t) -> f(^, t, α), R, R, T) return R end @@ -276,18 +285,18 @@ function Base.copyto!( return T end -# -# B .+= A -# - function fmap(bc::Broadcasted{ITensorStyle,<:Any,typeof(+),<:Tuple{Vararg{ITensor}}}) - return (r, t) -> bc.f(r, t) + return + end function fmap(bc::Broadcasted{ITensorStyle,<:Any,typeof(-),<:Tuple{Vararg{ITensor}}}) - return (r, t) -> bc.f(r, t) + return - end +# +# B .+= A +# + function Base.copyto!( T::ITensor, bc::Broadcasted{ITensorStyle,<:Any,typeof(+),<:Tuple{Vararg{ITensor}}} ) @@ -343,6 +352,7 @@ function Base.copyto!( ) C = find_type(ITensor, bc.args) bc_bc = find_type(Broadcasted, bc.args) + if T === C A = find_type(ITensor, bc_bc.args) α = find_type(Number, bc_bc.args) @@ -350,11 +360,14 @@ function Base.copyto!( # Check if it is the case .^(::Int) γ = find_type(Base.RefValue{<:Val}, bc_bc.args) powf = find_type(Base.RefValue{<:Function}, bc_bc.args) + ## Putting fmap in the map call still doesn't actually grab the function and causes GPU to fail so just realize the function slightly earlier here + f1 = bc.f + f2 = bc_bc.f if !isnothing(α) && !isnothing(A) - map!((r, t) -> bc.f(r, bc_bc.f(t, α)), T, T, A) + map!((r, t) -> f1(r, f2(t, α)), T, T, A) elseif !isnothing(γ) && !isnothing(A) && !isnothing(powf) - map!((r, t) -> bc.f(r, bc_bc.f(powf[], t, γ[])), T, T, A) + map!((r, t) -> f1(r, f2(powf[], t, γ[])), T, T, A) else # In-place contraction: # C .+= α .* A .* B @@ -365,7 +378,7 @@ function Base.copyto!( else A, B = bc_bc_bc.args end - mul!(T, A, B, β, bc.f(1)) + mul!(T, A, B, β, f1(1)) end else error("When adding two ITensors in-place, one must be the same as the output ITensor") @@ -378,6 +391,7 @@ end # C .= β .* C .+ α .* A .* B # +## TODO this code doesn't actually get called function Base.copyto!( T::ITensor, bc::Broadcasted{ITensorOpScalarStyle,<:Any,typeof(+),<:Tuple{Vararg{Broadcasted}}}, @@ -414,6 +428,7 @@ end # For A .+= α # +## TODO this code fails because of scalar indexing function Base.copyto!( T::ITensor, bc::Broadcasted{ @@ -483,8 +498,9 @@ function Base.copyto!( ) R̃ = find_type(ITensor, bc.args) bc2 = find_type(Broadcasted, bc.args) + f = bc2.f if R === R̃ - map!((r, t) -> r + bc2.f(t), R, R, bc2.args[1]) + map!((r, t) -> r + f(t), R, R, bc2.args[1]) else error("In C .= B .+ f.(A), C and B must be the same ITensor") end @@ -495,6 +511,7 @@ end # For B .= f.(B) + g.(A) # +## TODO check to see if this code is being called as expected function Base.copyto!( 
R::ITensor, bc::Broadcasted{ITensorStyle,<:Any,typeof(+),<:Tuple{Vararg{Broadcasted}}} ) diff --git a/src/imports.jl b/src/imports.jl index 1df7f62a18..02e5830eb5 100644 --- a/src/imports.jl +++ b/src/imports.jl @@ -51,7 +51,6 @@ import Base: map, map!, ndims, - permutedims, print, promote_rule, push!, @@ -127,6 +126,8 @@ using ITensors.NDTensors: enable_auto_fermion, fill!!, randn!!, + permutedims, + permutedims!, set_eltype, single_precision, timer, @@ -158,6 +159,7 @@ import ITensors.NDTensors: inds, insertblock!, insert_diag_blocks!, + leaf_parenttype, matrix, maxdim, mindim, diff --git a/src/itensor.jl b/src/itensor.jl index c26786e5e2..a56c219aad 100644 --- a/src/itensor.jl +++ b/src/itensor.jl @@ -843,7 +843,7 @@ size(A::ITensor, d::Int) = size(tensor(A), d) _isemptyscalar(A::ITensor) = _isemptyscalar(tensor(A)) _isemptyscalar(A::Tensor) = ndims(A) == 0 && isemptystorage(A) && eltype(A) === EmptyNumber - +NDTensors.iscu(A::ITensor) = NDTensors.iscu(tensor(A)) """ dir(A::ITensor, i::Index) @@ -1140,8 +1140,9 @@ A[i => 1, i' => 2] # 2.0, same as: A[i' => 2, i => 1] @propagate_inbounds (getindex(T::ITensor, ivs::Vararg{Any,N})::Any) where {N} = _getindex(tensor(T), ivs...) +## Allowing one to get the first ITensor element if its an order 0 tensor or an order 1 tensor with a dimension of 1. Also convert GPU back to CPU @propagate_inbounds function getindex(T::ITensor)::Any - if order(T) != 0 + if order(T) != 0 && dim(T) != 1 throw( DimensionMismatch( "In scalar(T) or T[], ITensor T is not a scalar (it has indices $(inds(T)))." @@ -1892,7 +1893,7 @@ diag(T::ITensor) = diag(tensor(T)) mul!(C::ITensor, A::ITensor, B::ITensor, args...)::ITensor = contract!(C, A, B, args...) -dot(A::ITensor, B::ITensor) = (dag(A) * B)[] +dot(A::ITensor, B::ITensor) = NDTensors.cpu(dag(A) * B)[] inner(y::ITensor, A::ITensor, x::ITensor) = (dag(y) * A * x)[] inner(y::ITensor, x::ITensor) = (dag(y) * x)[] diff --git a/src/mps/dmrg.jl b/src/mps/dmrg.jl index a6b78c6f8c..f81d12b288 100644 --- a/src/mps/dmrg.jl +++ b/src/mps/dmrg.jl @@ -278,7 +278,15 @@ function dmrg(PH, psi0::MPS, sweeps::Sweeps; kwargs...) end energy = vals[1] - phi::ITensor = vecs[1] + ## Right now there is a conversion problem in CUDA.jl where `UnifiedMemory` Arrays are being converted + ## into `DeviceMemory`. This conversion line is here temporarily to fix that problem when it arises + ## Adapt is only called when using CUDA backend. CPU will work as implemented previously. + phi::ITensor = if NDTensors.iscu(phi) && NDTensors.iscu(vecs[1]) + adapt(set_eltype(leaf_parenttype(phi), eltype(vecs[1])), vecs[1]) + else + vecs[1] + end + #phi::ITensor = vecs[1] ortho = ha == 1 ? "left" : "right" diff --git a/src/set_types.jl b/src/set_types.jl new file mode 100644 index 0000000000..8d866d7c27 --- /dev/null +++ b/src/set_types.jl @@ -0,0 +1 @@ +leaf_parenttype(T::ITensor) = leaf_parenttype(tensor(T)) diff --git a/src/tensor_operations/permutations.jl b/src/tensor_operations/permutations.jl index 5f7bb174e0..b85e4f5345 100644 --- a/src/tensor_operations/permutations.jl +++ b/src/tensor_operations/permutations.jl @@ -55,12 +55,12 @@ function permute(T::ITensor, new_inds...; kwargs...) end # TODO: move to NDTensors -function permutedims(::AllowAlias, T::Tensor, perm) +function NDTensors.permutedims(::AllowAlias, T::Tensor, perm) return NDTensors.is_trivial_permutation(perm) ? 
T : permutedims(NeverAlias(), T, perm) end # TODO: move to NDTensors, define `permutedims` in terms of `NeverAlias` -function permutedims(::NeverAlias, T::Tensor, perm) +function NDTensors.permutedims(::NeverAlias, T::Tensor, perm) return permutedims(T, perm) end diff --git a/test/ITensorNetworkMaps/utils/utils.jl b/test/ITensorNetworkMaps/utils/utils.jl index 02275aee89..7b4254c4a5 100644 --- a/test/ITensorNetworkMaps/utils/utils.jl +++ b/test/ITensorNetworkMaps/utils/utils.jl @@ -33,7 +33,7 @@ function ITensors.prime(::typeof(linkinds), tn::InfTN) return tn_p end -interleave(x::Vector, y::Vector) = permutedims([x y])[:] +interleave(x::Vector, y::Vector) = Base.permutedims([x y])[:] function ITensors.linkind(tn::InfTN, e) return commonind(tn[e[1]], tn[e[2]]) diff --git a/test/base/test_itensor_scalar_contract.jl b/test/base/test_itensor_scalar_contract.jl index c322139bba..63e311e59e 100644 --- a/test/base/test_itensor_scalar_contract.jl +++ b/test/base/test_itensor_scalar_contract.jl @@ -65,14 +65,14 @@ end R .= A .* B @test !any(isnan, R) - @test array(R) ≈ permutedims(array(A), (2, 1, 3)) * array(B)[] + @test array(R) ≈ NDTensors.permutedims(array(A), (2, 1, 3)) * array(B)[] R .= NaN @test any(isnan, R) R .= B .* A @test !any(isnan, R) - @test array(R) ≈ permutedims(array(A), (2, 1, 3)) * array(B)[] + @test array(R) ≈ NDTensors.permutedims(array(A), (2, 1, 3)) * array(B)[] end @testset "General contraction, permutation" for ElA in BlasFloats, ElB in BlasFloats @@ -89,7 +89,7 @@ end R .= A .* B @test !any(isnan, R) @test reshape(array(R), 6, 2) ≈ - reshape(permutedims(array(A), (2, 1, 3)), 6, 2) * array(B) + reshape(NDTensors.permutedims(array(A), (2, 1, 3)), 6, 2) * array(B) R .= NaN @test any(isnan, R) @@ -97,6 +97,6 @@ end R .= B .* A @test !any(isnan, R) @test reshape(array(R), 6, 2) ≈ - reshape(permutedims(array(A), (2, 1, 3)), 6, 2) * array(B) + reshape(NDTensors.permutedims(array(A), (2, 1, 3)), 6, 2) * array(B) end end
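
For reference, the recurring pattern this patch introduces is double dispatch on the arrays' leaf parent type: operations like mul!!, permutedims!! and contraction unwrap their arguments with leaf_parenttype and then select a backend-specific method, so plain Array keeps the Strided.jl fast path while CuArray (through the NDTensorsCUDAExt fallback) uses its native kernels and avoids scalar indexing. The following is a minimal, self-contained sketch of that idea only; the names leaf_parenttype and mul!! mirror the patch, but the simplified definitions and the example at the end are illustrative and not part of the NDTensors source.

    using LinearAlgebra: mul!

    # Simplified stand-ins for the NDTensors helpers touched in this patch.
    parenttype(T::Type{<:AbstractArray}) = T   # array wrappers would override this
    leaf_parenttype(A::AbstractArray) = leaf_parenttype(typeof(A))
    leaf_parenttype(T::Type{<:AbstractArray}) =
      parenttype(T) === T ? T : leaf_parenttype(parenttype(T))

    # Step 1: unwrap every argument to its leaf parent type.
    mul!!(C::AbstractArray, A::AbstractArray, B::AbstractArray, α, β) =
      mul!!(leaf_parenttype(C), C, leaf_parenttype(A), A, leaf_parenttype(B), B, α, β)

    # Step 2: dispatch on those types. This generic fallback (which a CuArray
    # would hit) is a plain mul!; the Array method in the patch additionally
    # wraps the call in Strided.@strided.
    function mul!!(::Type{<:AbstractArray}, C, ::Type{<:AbstractArray}, A,
                   ::Type{<:AbstractArray}, B, α, β)
      mul!(C, A, B, α, β)
      return C
    end

    # Example usage with these illustrative definitions:
    C = zeros(2, 2); A = rand(2, 2); B = rand(2, 2)
    mul!!(C, A, B, 1.0, 0.0)   # C ≈ A * B, regardless of backend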