
[NDTensors] Circumvent scalar indexing to improve GPU performance #1216

Closed · wants to merge 17 commits
4 changes: 2 additions & 2 deletions NDTensors/src/blocksparse/linearalgebra.jl
@@ -75,7 +75,7 @@ function LinearAlgebra.svd(T::BlockSparseMatrix{ElT}; kwargs...) where {ElT}

dropblocks = Int[]
if truncate
truncerr, docut = truncate!(d; kwargs...)
truncerr, docut = truncate!!(d; kwargs...)
for n in 1:nnzblocks(T)
blockdim = _truncated_blockdim(
Ss[n], docut; min_blockdim, singular_values=true, truncate
@@ -237,7 +237,7 @@ function LinearAlgebra.eigen(
sort!(d; rev=true, by=abs)

if truncate
truncerr, docut = truncate!(d; kwargs...)
truncerr, docut = truncate!!(d; kwargs...)
for n in 1:nnzblocks(T)
blockdim = _truncated_blockdim(Ds[n], docut)
if blockdim == 0
11 changes: 10 additions & 1 deletion NDTensors/src/dense/dense.jl
@@ -101,7 +101,16 @@ Dense(::Type{ElT}) where {ElT} = Dense{ElT}()
setdata(D::Dense, ndata) = Dense(ndata)
setdata(storagetype::Type{<:Dense}, data) = Dense(data)

copy(D::Dense) = Dense(copy(data(D)))
## Some GPU arrays wrapped in a Base.ReshapedArray can "fail" when copied
## (fail meaning the copy falls back to scalar getindex, which is slow),
## so copy the parent array directly in that case.
## TODO: make a better implementation.
function copy(D::Dense)
d = data(D)
if d isa Base.ReshapedArray
return Dense(copy(parent(d)))
end
return Dense(copy(data(D)))
end

function Base.real(T::Type{<:Dense})
return set_datatype(T, similartype(datatype(T), real(eltype(T))))
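A minimal sketch of the problem this guards against (assumes CUDA.jl is available; the Base.ReshapedArray wrapper is constructed directly for illustration):

    using CUDA
    a = CUDA.rand(4, 4)
    r = Base.ReshapedArray(a, (16,), ())  # wrapped GPU array, not a plain CuArray
    # copy(r) can hit the generic fallback that reads one element at a time
    # (scalar getindex), which is slow and errors under CUDA.allowscalar(false).
    b = copy(parent(r))                   # copying the parent stays on the device
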
13 changes: 8 additions & 5 deletions NDTensors/src/linearalgebra/svd.jl
@@ -1,11 +1,15 @@

## TODO: this inspects individual elements of S, so convert S to CPU when it is
## on GPU. Could write a GPU implementation that just converts S to a CPU Array;
## S is not used again, so there is no need to convert back to GPU.
Comment on lines +2 to +4
Member:

Same suggestion here as above for truncate!!, I think we should use leaf_parenttype dispatch.
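A hedged sketch of what that leaf_parenttype dispatch could look like (the _checkSVDDone helper split is hypothetical, mirroring the truncate!! pattern later in this PR):

    checkSVDDone(S::AbstractArray, thresh::Float64) = checkSVDDone(leaf_parenttype(S), S, thresh)
    # CPU-backed arrays: scan the elements directly.
    checkSVDDone(::Type{<:Array}, S, thresh) = _checkSVDDone(S, thresh)
    # Other (e.g. GPU-backed) arrays: copy to CPU first, since the scan reads scalars.
    checkSVDDone(::Type{<:AbstractArray}, S, thresh) = _checkSVDDone(NDTensors.cpu(S), thresh)
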
function checkSVDDone(S::AbstractArray, thresh::Float64)
N = length(S)
(N <= 1 || thresh < 0.0) && return (true, 1)
S1t = S[1] * thresh
Scpu = NDTensors.cpu(S)
S1t = Scpu[1] * thresh
start = 2
while start <= N
(S[start] < S1t) && break
(Scpu[start] < S1t) && break
start += 1
end
if start >= N
@@ -32,9 +36,8 @@ function svd_recursive(M::AbstractMatrix; thresh::Float64=1E-3, north_pass::Int=
V = M' * U

V, R = qr_positive(V)
for n in 1:Nd
D[n] = R[n, n]
end
n = size(R)[1]
D = diag(R, (n - Nd))
Member:

I'm not sure this is doing what you think it is doing; diag(M, k) selects the kth off-diagonal of the matrix.

I went with:

  D[1:Nd] = diag(R)[1:Nd]

Collaborator (Author):

Ah I see, thanks!
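For reference, a quick illustration of diag's offset argument (standard LinearAlgebra behavior):

    using LinearAlgebra
    R = [1 2 3; 4 5 6; 7 8 9]
    diag(R)     # main diagonal: [1, 5, 9]
    diag(R, 1)  # first superdiagonal: [2, 6]
    diag(R, -1) # first subdiagonal: [4, 8]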


(done, start) = checkSVDDone(D, thresh)

21 changes: 20 additions & 1 deletion NDTensors/src/truncate.jl
@@ -1,4 +1,23 @@
export truncate!
export truncate!!, truncate!

## TODO: truncate! performs elementwise logical operations on the values in P,
## so it is more efficient to move P to a CPU vector, truncate there, and
## convert the result back to GPU.

function truncate!!(P::AbstractVector; kwargs...)
return truncate!(leaf_parenttype(P), P; kwargs...)
end

function truncate!(::Type{<:Array}, P; kwargs...)
return truncate!(P; kwargs...)
end

function truncate!(::Type{<:AbstractArray}, P; kwargs...)
P_cpu = cpu(P)
values = truncate!(P_cpu; kwargs...)
P = adapt(leaf_parenttype(P), P_cpu)
return values
end
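
A hedged usage sketch of the new truncate!! entry point (the example values are illustrative; cutoff is a keyword the existing truncate! below already reads):

    # CPU vector: dispatches straight to the in-place truncate!.
    d = [0.9, 0.08, 0.015, 0.005]  # e.g. spectrum weights sorted in decreasing order
    truncerr, docut = truncate!!(d; cutoff=1e-2)
    # For a GPU vector (e.g. a CuVector), the same call copies the data to CPU,
    # truncates there, and adapts the result back to the original device type.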

function truncate!(P::AbstractVector{ElT}; kwargs...)::Tuple{ElT,ElT} where {ElT}
cutoff::Union{Nothing,ElT} = get(kwargs, :cutoff, zero(ElT))