From d069e4b05dee9fa88872b851bbf3f978f4a7c701 Mon Sep 17 00:00:00 2001 From: Pierre Karpman Date: Tue, 17 Dec 2019 15:50:49 +0100 Subject: [PATCH 1/5] use fast reduce in one more case of non-trivial increments --- fflas-ffpack/fflas/fflas_fadd.h | 2 +- fflas-ffpack/fflas/fflas_fscal.inl | 33 ++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_fadd.h b/fflas-ffpack/fflas/fflas_fadd.h index dc02393be..d1a15cbfc 100644 --- a/fflas-ffpack/fflas/fflas_fadd.h +++ b/fflas-ffpack/fflas/fflas_fadd.h @@ -39,7 +39,7 @@ namespace FFLAS { struct support_simd_add : public std::true_type {} ; template<> struct support_simd_add : public std::true_type {} ; -#ifdef SIMD_INT +#ifdef SIMD_INT // why? PK - 2019/12 template<> struct support_simd_add : public std::true_type {} ; template<> diff --git a/fflas-ffpack/fflas/fflas_fscal.inl b/fflas-ffpack/fflas/fflas_fscal.inl index 314d9ea1c..9f5a9eb2e 100644 --- a/fflas-ffpack/fflas/fflas_fscal.inl +++ b/fflas-ffpack/fflas/fflas_fscal.inl @@ -103,7 +103,6 @@ namespace FFLAS { namespace vectorised { namespace unswitch { } } - template inline typename std::enable_if::value, void>::type scalp(const Field &F, typename Field::Element_ptr T, const typename Field::Element alpha, typename Field::ConstElement_ptr U, const size_t n, const size_t & incX, HelperMod &H) @@ -116,6 +115,19 @@ namespace FFLAS { namespace vectorised { namespace unswitch { } } + template + inline typename std::enable_if::value, void>::type + scalp(const Field &F, typename Field::Element_ptr T, const typename Field::Element alpha, typename Field::ConstElement_ptr U, const size_t n, + const size_t & incX, const size_t & incY, HelperMod &H) + { + typename Field::ConstElement_ptr Xi = U; + typename Field::Element_ptr Yi = T; + for (; Xi < U+n*incX ; Xi+=incX,Yi+=incY) + { + *Yi = reduce(alpha*(*Xi),H); + } + } + } // unswitch } // vectorised } // FFLAS @@ -141,6 +153,16 @@ namespace FFLAS { namespace vectorised { unswitch::scalp(F,T,alpha,U,n,incX,H); } + template + inline typename std::enable_if::value, void>::type + scalp(const Field &F, typename Field::Element_ptr T, const typename Field::Element alpha, typename Field::ConstElement_ptr U, const size_t n, const size_t & incX, const size_t & incY) + { + HelperMod H(F); + + unswitch::scalp(F,T,alpha,U,n,incX,incY,H); + } + + } // vectorised } // FFLAS @@ -187,11 +209,10 @@ namespace FFLAS { namespace details { if(incX == 1 && incY==1) { vectorised::scalp(F,Y,a,X,N); } - else { - typename Field::ConstElement_ptr Xi = X ; - typename Field::Element_ptr Yi = Y ; - for (; Xi < X+N*incX; Xi+=incX,Yi+=incY ) - F.mul(*Yi, *Xi , a); + else + { + // no copy for now + vectorised::scalp(F,Y,a,X,N,incX,incY); } } From 24c162fcaad98fb9257201ed3e48c6d5a7457f11 Mon Sep 17 00:00:00 2001 From: Pierre Karpman Date: Tue, 17 Dec 2019 15:51:37 +0100 Subject: [PATCH 2/5] vectorised faxpy based on fscal --- fflas-ffpack/fflas/fflas_faxpy.inl | 185 ++++++++++++++++++++++++++++- 1 file changed, 180 insertions(+), 5 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_faxpy.inl b/fflas-ffpack/fflas/fflas_faxpy.inl index aab866585..b37fedb36 100644 --- a/fflas-ffpack/fflas/fflas_faxpy.inl +++ b/fflas-ffpack/fflas/fflas_faxpy.inl @@ -2,6 +2,7 @@ * Copyright (C) 2005 Clement Pernet * * Written by Clement Pernet + * Pierre Karpman * * * ========LICENCE======== @@ -27,18 +28,169 @@ #ifndef __FFLASFFPACK_faxpy_INL #define __FFLASFFPACK_faxpy_INL +/* most of this is a shameless copy from fscal... */ +namespace FFLAS { namespace vectorised { namespace unswitch { -namespace FFLAS { + // TODO axpy not p (also do this for fscal)? +#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS - template + // TODO check if returning in register is better?? + template inline void - faxpy( const Field& F, const size_t N, + VEC_FMA(typename SimdT::vect_t & A, typename SimdT::vect_t & X, typename SimdT::vect_t & Y, HelperModSimd & H) + { + Y = SimdT::fmadd(Y,A,X); + VEC_MOD(Y, H); + } + + template + inline typename std::enable_if::value, void>::type + axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, HelperMod &G) + { + typedef typename Field::Element Element; + using simd = Simd; + using vect_t = typename simd::vect_t; + HelperModSimd H(F,G); + vect_t A, U, V; + A = simd::set1(a); + + size_t i = 0; + + if (n < simd::vect_size) + { + for (; i < n ; i++) + { + Y[i] = reduce(a*X[i]+Y[i], H); + } + return; + } + + long st = long(Y) % simd::alignment; + if (st) + { // the array Y is not aligned (process few elements until (Y+i) is aligned) + for (size_t j = static_cast(st) ; j < simd::alignment ; j+=sizeof(Element), i++) + { + Y[i] = reduce(a*X[i]+Y[i], H); + } + } + FFLASFFPACK_check((long(Y+i) % simd::alignment == 0)); + if (long(Y+i)%simd::alignment==0) + { + // perform the loop using SIMD + for (;i <= n - simd::vect_size ; i += simd::vect_size) + { + U = simd::loadu(X+i); // could be transformed into a load at the cost of an additional top check. Worth it? + V = simd::load(Y+i); + VEC_FMA(A, U, V, H); + simd::store(Y+i,V); + } + } + // perform the last elt from T without SIMD + for (; i < n ; i++) + { + Y[i] = reduce(a*X[i]+Y[i], H); + } + } + +#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS + + template + inline typename std::enable_if::value && FFLAS::support_fast_mod::value, void>::type + axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, HelperMod &H) + { + size_t i = 0; + + for (; i < n ; i++) + { + Y[i] = reduce(a*X[i] + Y[i], H); + } + } + + + template + inline typename std::enable_if::value, void>::type + axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, const size_t incX, const size_t incY, + HelperMod &H) + { + typename Field::ConstElement_ptr Xi = X; + typename Field::Element_ptr Yi=Y; + for (; Xi < X+n*incX; Xi+=incX, Yi+=incY ) + *Yi = reduce(a*(*Xi) + (*Yi), H); + } + +} // unswitch +} // vectorised +} // FFLAS + +namespace FFLAS { namespace vectorised { + + // simd_mod => fast_mod, so this declaration's one in two + template + inline typename std::enable_if::value, void>::type + axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n) + { + HelperMod H(F); + + unswitch::axpyp(F,a,X,Y,n,H); + } + + template + inline typename std::enable_if::value, void>::type + axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, const size_t incX, const size_t incY) + { + HelperMod H(F); + + unswitch::axpyp(F,a,X,Y,n,incX,incY,H); + } + +} // vectorised +} // FFLAS + +namespace FFLAS { namespace details { + + /*** Entry-points for specialised code ***/ + // simd_mod => fast_mod; + // will default to the best supported option in unswitch + + template + inline typename std::enable_if::value, void>::type + faxpy( const Field & F , const size_t N, const typename Field::Element a, typename Field::ConstElement_ptr X, const size_t incX, - typename Field::Element_ptr Y, const size_t incY ) + typename Field::Element_ptr Y, const size_t incY, + FieldCategories::ModularTag) { + if((incX == 1) && (incY == 1)) + { + vectorised::axpyp(F,a,X,Y,N); + } + else + { + if (N < FFLASFFPACK_COPY_REDUCE) // defined in fflas_freduce.inl + { + vectorised::axpyp(F,a,X,Y,N,incX,incY); + } + else + { + typename Field::Element_ptr Xc = fflas_new (F,N) ; + typename Field::Element_ptr Yc = fflas_new (F,N) ; + fassign(F,N,X,incX,Xc,1); + fassign(F,N,Y,incY,Yc,1); + faxpy(F,N,a,Xc,1,Yc,1,FieldCategories::ModularTag()); + fassign(F,N,Yc,1,Y,incY); + fflas_delete(Xc); + fflas_delete(Yc); + } + } + } + template + inline void + faxpy(const Field& F, const size_t N, const typename Field::Element a, + typename Field::ConstElement_ptr X, const size_t incX, + typename Field::Element_ptr Y, const size_t incY, FC) + { if (F.isZero(a)) return ; @@ -51,11 +203,29 @@ namespace FFLAS { //return fneg(F,N,X,incX,Y,incY); typename Field::ConstElement_ptr Xi = X; - typename Field::Element_ptr Yi=Y; + typename Field::Element_ptr Yi = Y; for (; Xi < X+N*incX; Xi+=incX, Yi+=incY ) F.axpyin( *Yi, a, *Xi ); } +} // details +} // FFLAS + +namespace FFLAS { + + /***************************/ + /* LEVEL 1 */ + /***************************/ + + template + inline void + faxpy(const Field& F, const size_t n, const typename Field::Element a, + typename Field::ConstElement_ptr X, const size_t incX, + typename Field::Element_ptr Y, const size_t incY) + { + details::faxpy(F,n,a,X,incX,Y,incY,typename FieldTraits::category()); + } + template<> inline void faxpy( const Givaro::DoubleDomain& , const size_t N, @@ -84,6 +254,11 @@ namespace FFLAS { cblas_saxpy( (int)N, a, x, (int)incx, y, (int)incy); } + + /***************************/ + /* LEVEL 2 */ + /***************************/ + template inline void faxpy( const Field& F, const size_t m, const size_t n, From 5c977199eb6e8d06543c9b065245e0eef64a8945 Mon Sep 17 00:00:00 2001 From: Pierre Karpman Date: Tue, 17 Dec 2019 17:03:27 +0100 Subject: [PATCH 3/5] done by blas --- fflas-ffpack/fflas/fflas_faxpy.inl | 1 - 1 file changed, 1 deletion(-) diff --git a/fflas-ffpack/fflas/fflas_faxpy.inl b/fflas-ffpack/fflas/fflas_faxpy.inl index b37fedb36..570af633e 100644 --- a/fflas-ffpack/fflas/fflas_faxpy.inl +++ b/fflas-ffpack/fflas/fflas_faxpy.inl @@ -32,7 +32,6 @@ namespace FFLAS { namespace vectorised { namespace unswitch { - // TODO axpy not p (also do this for fscal)? #ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS // TODO check if returning in register is better?? From a24e5c58f3860bcbd9b4debca23e9f9708fa49cd Mon Sep 17 00:00:00 2001 From: Pierre Karpman Date: Thu, 20 Feb 2020 17:57:35 +0100 Subject: [PATCH 4/5] (hastily?) addressing PR comments --- fflas-ffpack/fflas/fflas_faxpy.inl | 40 ++++++++++++++++++----- fflas-ffpack/fflas/fflas_freduce.inl | 47 +++++++++++++++++++++++++--- fflas-ffpack/fflas/fflas_fscal.inl | 45 +++++++++++++++++++++++--- 3 files changed, 116 insertions(+), 16 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_faxpy.inl b/fflas-ffpack/fflas/fflas_faxpy.inl index 570af633e..8bc7e8dd2 100644 --- a/fflas-ffpack/fflas/fflas_faxpy.inl +++ b/fflas-ffpack/fflas/fflas_faxpy.inl @@ -172,14 +172,38 @@ namespace FFLAS { namespace details { } else { - typename Field::Element_ptr Xc = fflas_new (F,N) ; - typename Field::Element_ptr Yc = fflas_new (F,N) ; - fassign(F,N,X,incX,Xc,1); - fassign(F,N,Y,incY,Yc,1); - faxpy(F,N,a,Xc,1,Yc,1,FieldCategories::ModularTag()); - fassign(F,N,Yc,1,Y,incY); - fflas_delete(Xc); - fflas_delete(Yc); + typename Field::Element_ptr Xc; + typename Field::Element_ptr Yc; + if (incX != 1) + { + Xc = fflas_new (F,N); + fassign(F,N,X,incX,Xc,1); + } + else + { + Xc = const_cast(X); // Oh the horror + } + if (incY != 1) + { + Yc = fflas_new (F,N); + fassign(F,N,Y,incY,Yc,1); + } + else + { + Yc = const_cast(Y); + } + + vectorised::axpyp(F,a,Xc,Yc,N); + + if (incY != 1) + { + fassign(F,N,Yc,1,Y,incY); + fflas_delete(Yc); + } + if (incX != 1) + { + fflas_delete(Xc); + } } } } diff --git a/fflas-ffpack/fflas/fflas_freduce.inl b/fflas-ffpack/fflas/fflas_freduce.inl index e17b58b37..2f30546b6 100644 --- a/fflas-ffpack/fflas/fflas_freduce.inl +++ b/fflas-ffpack/fflas/fflas_freduce.inl @@ -544,10 +544,49 @@ namespace FFLAS { namespace details { } else { - typename Field::Element_ptr Xi = A ; - typename Field::ConstElement_ptr Yi = B ; - for (; Xi < A+m*incX; Xi+=incX, Yi += incY ) - F.reduce (*Xi , *Yi); + if (m < FFLASFFPACK_COPY_REDUCE) + { + // TODO also write a ``fast'' modp that handles an incY + typename Field::Element_ptr Xi = A ; + typename Field::ConstElement_ptr Yi = B ; + for (; Xi < A+m*incX; Xi+=incX, Yi += incY ) + F.reduce (*Xi , *Yi); + } + else + { + typename Field::Element_ptr Ac; + typename Field::Element_ptr Bc; + if (incX != 1) + { + Ac = fflas_new (F,m); + fassign(F,m,A,incX,Ac,1); + } + else + { + Ac = const_cast(A); // Oh the horror + } + if (incY != 1) + { + Bc = fflas_new (F,m); + fassign(F,m,B,incY,Bc,1); + } + else + { + Bc = const_cast(B); + } + + vectorised::modp(F,Bc,m,Ac); + + if (incY != 1) + { + fassign(F,m,Bc,1,B,incY); + fflas_delete(Bc); + } + if (incX != 1) + { + fflas_delete(Ac); + } + } } } diff --git a/fflas-ffpack/fflas/fflas_fscal.inl b/fflas-ffpack/fflas/fflas_fscal.inl index 9f5a9eb2e..66f60d205 100644 --- a/fflas-ffpack/fflas/fflas_fscal.inl +++ b/fflas-ffpack/fflas/fflas_fscal.inl @@ -184,7 +184,7 @@ namespace FFLAS { namespace details { } else { - if (N < FFLASFFPACK_COPY_REDUCE) + if (N < FFLASFFPACK_COPY_REDUCE) // defined in fflas_freduce.inl { vectorised::scalp(F,X,a,X,N,incX); } @@ -192,7 +192,7 @@ namespace FFLAS { namespace details { { typename Field::Element_ptr Xc = fflas_new (F,N) ; fassign (F,N,X,incX,Xc,1); - fscalin (F,N,a,Xc,1,FieldCategories::ModularTag()); + vectorised::scalp(F,Xc,a,Xc,N); fassign (F,N,Xc,1,X,incX); fflas_delete (Xc); } @@ -211,8 +211,45 @@ namespace FFLAS { namespace details { } else { - // no copy for now - vectorised::scalp(F,Y,a,X,N,incX,incY); + if (N < FFLASFFPACK_COPY_REDUCE) + { + vectorised::scalp(F,Y,a,X,N,incX,incY); + } + else + { + typename Field::Element_ptr Xc; + typename Field::Element_ptr Yc; + if (incX != 1) + { + Xc = fflas_new (F,N); + fassign(F,N,X,incX,Xc,1); + } + else + { + Xc = const_cast(X); // Oh the horror + } + if (incY != 1) + { + Yc = fflas_new (F,N); + fassign(F,N,Y,incY,Yc,1); + } + else + { + Yc = const_cast(Y); + } + + vectorised::scalp(F,Yc,a,Xc,N); + + if (incY != 1) + { + fassign(F,N,Yc,1,Y,incY); + fflas_delete(Yc); + } + if (incX != 1) + { + fflas_delete(Xc); + } + } } } From 56e5e3499894fbefd603a8a1678b6b6de888a60e Mon Sep 17 00:00:00 2001 From: Pierre Karpman Date: Mon, 24 Feb 2020 14:46:11 +0100 Subject: [PATCH 5/5] const bugfix? --- fflas-ffpack/fflas/fflas_faxpy.inl | 2 +- fflas-ffpack/fflas/fflas_freduce.inl | 16 ++++++++-------- fflas-ffpack/fflas/fflas_fscal.inl | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fflas-ffpack/fflas/fflas_faxpy.inl b/fflas-ffpack/fflas/fflas_faxpy.inl index 8bc7e8dd2..ee4c6a84d 100644 --- a/fflas-ffpack/fflas/fflas_faxpy.inl +++ b/fflas-ffpack/fflas/fflas_faxpy.inl @@ -190,7 +190,7 @@ namespace FFLAS { namespace details { } else { - Yc = const_cast(Y); + Yc = Y; } vectorised::axpyp(F,a,Xc,Yc,N); diff --git a/fflas-ffpack/fflas/fflas_freduce.inl b/fflas-ffpack/fflas/fflas_freduce.inl index 2f30546b6..09460a5ed 100644 --- a/fflas-ffpack/fflas/fflas_freduce.inl +++ b/fflas-ffpack/fflas/fflas_freduce.inl @@ -524,7 +524,7 @@ namespace FFLAS { namespace details { { typename Field::Element_ptr Ac = fflas_new (F,m) ; fassign (F,m,A,incX,Ac,1); - freduce (F,m,Ac,1,FieldCategories::ModularTag()); + vectorised::modp(F,Ac,m,Ac); fassign (F,m,Ac,1,A,incX); fflas_delete (Ac); } @@ -563,7 +563,7 @@ namespace FFLAS { namespace details { } else { - Ac = const_cast(A); // Oh the horror + Ac = A; } if (incY != 1) { @@ -572,20 +572,20 @@ namespace FFLAS { namespace details { } else { - Bc = const_cast(B); + Bc = const_cast(B); // Oh the horror } vectorised::modp(F,Bc,m,Ac); - if (incY != 1) - { - fassign(F,m,Bc,1,B,incY); - fflas_delete(Bc); - } if (incX != 1) { + fassign(F,m,Ac,1,A,incX); fflas_delete(Ac); } + if (incY != 1) + { + fflas_delete(Bc); + } } } } diff --git a/fflas-ffpack/fflas/fflas_fscal.inl b/fflas-ffpack/fflas/fflas_fscal.inl index 66f60d205..ef1d35f1a 100644 --- a/fflas-ffpack/fflas/fflas_fscal.inl +++ b/fflas-ffpack/fflas/fflas_fscal.inl @@ -235,7 +235,7 @@ namespace FFLAS { namespace details { } else { - Yc = const_cast(Y); + Yc = Y; } vectorised::scalp(F,Yc,a,Xc,N);