Skip to content

Commit

Permalink
Vfmadd (#289)
Browse files Browse the repository at this point in the history
* use fast reduce in one more case of non-trivial increments

* vectorised faxpy based on fscal

* done by blas

* (hastily?) addressing PR comments

* const bugfix?
  • Loading branch information
P1K authored Sep 16, 2021
1 parent f4d3d81 commit 83550b2
Show file tree
Hide file tree
Showing 4 changed files with 314 additions and 19 deletions.
2 changes: 1 addition & 1 deletion fflas-ffpack/fflas/fflas_fadd.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ namespace FFLAS {
struct support_simd_add<float> : public std::true_type {} ;
template<>
struct support_simd_add<double> : public std::true_type {} ;
#ifdef SIMD_INT
#ifdef SIMD_INT // why? PK - 2019/12
template<>
struct support_simd_add<int32_t> : public std::true_type {} ;
template<>
Expand Down
208 changes: 203 additions & 5 deletions fflas-ffpack/fflas/fflas_faxpy.inl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* Copyright (C) 2005 Clement Pernet
*
* Written by Clement Pernet <[email protected]>
* Pierre Karpman <[email protected]>
*
*
* ========LICENCE========
Expand All @@ -27,18 +28,192 @@
#ifndef __FFLASFFPACK_faxpy_INL
#define __FFLASFFPACK_faxpy_INL

/* most of this is a shameless copy from fscal... */

namespace FFLAS { namespace vectorised { namespace unswitch {

namespace FFLAS {
#ifdef __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS

template<class Field>
// TODO check if returning in register is better??
template<class Field, class SimdT>
inline void
faxpy( const Field& F, const size_t N,
VEC_FMA(typename SimdT::vect_t & A, typename SimdT::vect_t & X, typename SimdT::vect_t & Y, HelperModSimd<Field,SimdT> & H)
{
Y = SimdT::fmadd(Y,A,X);
VEC_MOD(Y, H);
}

template<class Field>
inline typename std::enable_if<FFLAS::support_simd_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, HelperMod<Field> &G)
{
typedef typename Field::Element Element;
using simd = Simd<Element>;
using vect_t = typename simd::vect_t;
HelperModSimd<Field,simd> H(F,G);
vect_t A, U, V;
A = simd::set1(a);

size_t i = 0;

if (n < simd::vect_size)
{
for (; i < n ; i++)
{
Y[i] = reduce(a*X[i]+Y[i], H);
}
return;
}

long st = long(Y) % simd::alignment;
if (st)
{ // the array Y is not aligned (process few elements until (Y+i) is aligned)
for (size_t j = static_cast<size_t>(st) ; j < simd::alignment ; j+=sizeof(Element), i++)
{
Y[i] = reduce(a*X[i]+Y[i], H);
}
}
FFLASFFPACK_check((long(Y+i) % simd::alignment == 0));
if (long(Y+i)%simd::alignment==0)
{
// perform the loop using SIMD
for (;i <= n - simd::vect_size ; i += simd::vect_size)
{
U = simd::loadu(X+i); // could be transformed into a load at the cost of an additional top check. Worth it?
V = simd::load(Y+i);
VEC_FMA(A, U, V, H);
simd::store(Y+i,V);
}
}
// perform the last elt from T without SIMD
for (; i < n ; i++)
{
Y[i] = reduce(a*X[i]+Y[i], H);
}
}

#endif // __FFLASFFPACK_HAVE_SSE4_1_INSTRUCTIONS

template<class Field>
inline typename std::enable_if<!FFLAS::support_simd_mod<typename Field::Element>::value && FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, HelperMod<Field> &H)
{
size_t i = 0;

for (; i < n ; i++)
{
Y[i] = reduce(a*X[i] + Y[i], H);
}
}


template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, const size_t incX, const size_t incY,
HelperMod<Field> &H)
{
typename Field::ConstElement_ptr Xi = X;
typename Field::Element_ptr Yi=Y;
for (; Xi < X+n*incX; Xi+=incX, Yi+=incY )
*Yi = reduce(a*(*Xi) + (*Yi), H);
}

} // unswitch
} // vectorised
} // FFLAS

namespace FFLAS { namespace vectorised {

// simd_mod => fast_mod, so this declaration's one in two
template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n)
{
HelperMod<Field> H(F);

unswitch::axpyp(F,a,X,Y,n,H);
}

template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
axpyp(const Field &F, const typename Field::Element a, typename Field::ConstElement_ptr X, typename Field::Element_ptr Y, const size_t n, const size_t incX, const size_t incY)
{
HelperMod<Field> H(F);

unswitch::axpyp(F,a,X,Y,n,incX,incY,H);
}

} // vectorised
} // FFLAS

namespace FFLAS { namespace details {

/*** Entry-points for specialised code ***/
// simd_mod => fast_mod;
// will default to the best supported option in unswitch

template<class Field>
inline typename std::enable_if<FFLAS::support_fast_mod<typename Field::Element>::value, void>::type
faxpy( const Field & F , const size_t N,
const typename Field::Element a,
typename Field::ConstElement_ptr X, const size_t incX,
typename Field::Element_ptr Y, const size_t incY )
typename Field::Element_ptr Y, const size_t incY,
FieldCategories::ModularTag)
{
if((incX == 1) && (incY == 1))
{
vectorised::axpyp(F,a,X,Y,N);
}
else
{
if (N < FFLASFFPACK_COPY_REDUCE) // defined in fflas_freduce.inl
{
vectorised::axpyp(F,a,X,Y,N,incX,incY);
}
else
{
typename Field::Element_ptr Xc;
typename Field::Element_ptr Yc;
if (incX != 1)
{
Xc = fflas_new (F,N);
fassign(F,N,X,incX,Xc,1);
}
else
{
Xc = const_cast<typename Field::Element_ptr>(X); // Oh the horror
}
if (incY != 1)
{
Yc = fflas_new (F,N);
fassign(F,N,Y,incY,Yc,1);
}
else
{
Yc = Y;
}

vectorised::axpyp(F,a,Xc,Yc,N);

if (incY != 1)
{
fassign(F,N,Yc,1,Y,incY);
fflas_delete(Yc);
}
if (incX != 1)
{
fflas_delete(Xc);
}
}
}
}

template<class Field, class FC>
inline void
faxpy(const Field& F, const size_t N, const typename Field::Element a,
typename Field::ConstElement_ptr X, const size_t incX,
typename Field::Element_ptr Y, const size_t incY, FC)
{
if (F.isZero(a))
return ;

Expand All @@ -51,11 +226,29 @@ namespace FFLAS {
//return fneg(F,N,X,incX,Y,incY);

typename Field::ConstElement_ptr Xi = X;
typename Field::Element_ptr Yi=Y;
typename Field::Element_ptr Yi = Y;
for (; Xi < X+N*incX; Xi+=incX, Yi+=incY )
F.axpyin( *Yi, a, *Xi );
}

} // details
} // FFLAS

namespace FFLAS {

/***************************/
/* LEVEL 1 */
/***************************/

template<class Field>
inline void
faxpy(const Field& F, const size_t n, const typename Field::Element a,
typename Field::ConstElement_ptr X, const size_t incX,
typename Field::Element_ptr Y, const size_t incY)
{
details::faxpy(F,n,a,X,incX,Y,incY,typename FieldTraits<Field>::category());
}

template<>
inline void
faxpy( const Givaro::DoubleDomain& , const size_t N,
Expand Down Expand Up @@ -84,6 +277,11 @@ namespace FFLAS {
cblas_saxpy( (int)N, a, x, (int)incx, y, (int)incy);
}


/***************************/
/* LEVEL 2 */
/***************************/

template<class Field>
inline void
faxpy( const Field& F, const size_t m, const size_t n,
Expand Down
49 changes: 44 additions & 5 deletions fflas-ffpack/fflas/fflas_freduce.inl
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,7 @@ namespace FFLAS { namespace details {
{
typename Field::Element_ptr Ac = fflas_new (F,m) ;
fassign (F,m,A,incX,Ac,1);
freduce (F,m,Ac,1,FieldCategories::ModularTag());
vectorised::modp(F,Ac,m,Ac);
fassign (F,m,Ac,1,A,incX);
fflas_delete (Ac);
}
Expand All @@ -544,10 +544,49 @@ namespace FFLAS { namespace details {
}
else
{
typename Field::Element_ptr Xi = A ;
typename Field::ConstElement_ptr Yi = B ;
for (; Xi < A+m*incX; Xi+=incX, Yi += incY )
F.reduce (*Xi , *Yi);
if (m < FFLASFFPACK_COPY_REDUCE)
{
// TODO also write a ``fast'' modp that handles an incY
typename Field::Element_ptr Xi = A ;
typename Field::ConstElement_ptr Yi = B ;
for (; Xi < A+m*incX; Xi+=incX, Yi += incY )
F.reduce (*Xi , *Yi);
}
else
{
typename Field::Element_ptr Ac;
typename Field::Element_ptr Bc;
if (incX != 1)
{
Ac = fflas_new (F,m);
fassign(F,m,A,incX,Ac,1);
}
else
{
Ac = A;
}
if (incY != 1)
{
Bc = fflas_new (F,m);
fassign(F,m,B,incY,Bc,1);
}
else
{
Bc = const_cast<typename Field::Element_ptr>(B); // Oh the horror
}

vectorised::modp(F,Bc,m,Ac);

if (incX != 1)
{
fassign(F,m,Ac,1,A,incX);
fflas_delete(Ac);
}
if (incY != 1)
{
fflas_delete(Bc);
}
}
}
}

Expand Down
Loading

0 comments on commit 83550b2

Please sign in to comment.