From cd1a7e7346d9f039c206e1097a2ac63bf10c7b0a Mon Sep 17 00:00:00 2001
From: Daniel <daniel@poradnik-webmastera.com>
Date: Tue, 8 Jan 2019 23:12:48 +0100
Subject: [PATCH 1/2] Optimized app for SSE2, SSE4.1, AVX, AVX2 and AVX512

---
 alglib-3.14.0/src/linalg.cpp                  | 407 ++++++++++++++----
 boinc_app/Makefile                            |  86 ++--
 boinc_app/all.cpp                             |   6 +
 boinc_app/cambala_boinc_app.cpp               | 119 ++++-
 .../{ => test}/ac_modes_R7km_dtimes1.txt      |   0
 boinc_app/test/cambala_depths_out.ref         |   1 +
 boinc_app/test/cambala_out.ref                |  26 ++
 boinc_app/{ => test}/in                       |   0
 boinc_app/test/out.ref                        |   1 +
 boinc_app/test/test.sh                        |  11 +
 10 files changed, 553 insertions(+), 104 deletions(-)
 create mode 100644 boinc_app/all.cpp
 rename boinc_app/{ => test}/ac_modes_R7km_dtimes1.txt (100%)
 create mode 100644 boinc_app/test/cambala_depths_out.ref
 create mode 100644 boinc_app/test/cambala_out.ref
 rename boinc_app/{ => test}/in (100%)
 create mode 100644 boinc_app/test/out.ref
 create mode 100755 boinc_app/test/test.sh

diff --git a/alglib-3.14.0/src/linalg.cpp b/alglib-3.14.0/src/linalg.cpp
index b6f67c2..cf530a9 100644
--- a/alglib-3.14.0/src/linalg.cpp
+++ b/alglib-3.14.0/src/linalg.cpp
@@ -23,6 +23,10 @@ A copy of the GNU General Public License is available at
 #include "stdafx.h"
 #include "linalg.h"
 
+#ifdef __SSE2__
+#include <immintrin.h>
+#endif
+
 // disable some irrelevant warnings
 #if (AE_COMPILER==AE_MSVC) && !defined(AE_ALL_WARNINGS)
 #pragma warning(disable:4100)
@@ -45365,6 +45369,98 @@ static void evd_internaldlaebz(ae_int_t ijob,
             c->ptr.p_double[ji] = 0.5*(ab->ptr.pp_double[ji][1]+ab->ptr.pp_double[ji][2]);
         }
     }
+	
+    auto postproc = [&](ae_int_t ji, ae_int_t itmp1, double tmp1)->bool // per-interval update shared by all loops; ji: interval index, itmp1: count computed for midpoint tmp1
+    { // returns true when *info was set to mmax+1 and the caller must return at once, false otherwise
+        if( ijob<=2 )
+        {
+            
+            /*
+             * IJOB=2: Choose all intervals containing eigenvalues.
+             *
+             * Ensure that N(w) is monotone by clamping it to [nab[ji][1], nab[ji][2]]
+             */
+            itmp1 = ae_minint(nab->ptr.pp_int[ji][2], ae_maxint(nab->ptr.pp_int[ji][1], itmp1, _state), _state);
+            
+            /*
+             * Update the Queue -- add intervals if both halves
+             * contain eigenvalues.
+             */
+            if( itmp1==nab->ptr.pp_int[ji][2] )
+            {
+                
+                /*
+                 * No eigenvalue in the upper interval:
+                 * just use the lower interval.
+                 */
+                ab->ptr.pp_double[ji][2] = tmp1;
+            }
+            else
+            {
+                if( itmp1==nab->ptr.pp_int[ji][1] )
+                {
+                    
+                    /*
+                     * No eigenvalue in the lower interval:
+                     * just use the upper interval.
+                     */
+                    ab->ptr.pp_double[ji][1] = tmp1;
+                }
+                else
+                {
+                    if( klnew<mmax )
+                    {
+                        
+                        /*
+                         * Eigenvalue in both intervals -- add upper to queue (stored at the new index klnew).
+                         */
+                        klnew = klnew+1;
+                        ab->ptr.pp_double[klnew][2] = ab->ptr.pp_double[ji][2];
+                        nab->ptr.pp_int[klnew][2] = nab->ptr.pp_int[ji][2];
+                        ab->ptr.pp_double[klnew][1] = tmp1;
+                        nab->ptr.pp_int[klnew][1] = itmp1;
+                        ab->ptr.pp_double[ji][2] = tmp1;
+                        nab->ptr.pp_int[ji][2] = itmp1;
+                    }
+                    else
+                    {
+                        *info = mmax+1; // no room left for another interval
+                        return true;
+                    }
+                }
+            }
+        }
+        else
+        {
+            
+            /*
+             * IJOB=3: Binary search.  Keep only the interval
+             * containing  w  s.t. N(w) = NVAL
+             */
+            if( itmp1<=nval->ptr.p_int[ji] )
+            {
+                ab->ptr.pp_double[ji][1] = tmp1;
+                nab->ptr.pp_int[ji][1] = itmp1;
+            }
+            if( itmp1>=nval->ptr.p_int[ji] )
+            {
+                ab->ptr.pp_double[ji][2] = tmp1;
+                nab->ptr.pp_int[ji][2] = itmp1;
+            }
+        }
+        
+        return false;
+    };
+	
+    typedef ae_int_t ae_int_v2t __attribute__((vector_size(16)));
+    typedef ae_int_t ae_int_v4t __attribute__((vector_size(32)));
+    typedef ae_int_t ae_int_v8t __attribute__((vector_size(64)));
+    
+#ifdef __x86_64__
+    static_assert(sizeof(ae_int_t) == 8, "ae_int_t must be 64 bit");
+#else // !__x86_64__
+    static_assert(sizeof(ae_int_t) == 4, "ae_int_t must be 32 bit");
+#endif // !__x86_64__
     
     /*
      * Iteration loop
@@ -45379,7 +45475,238 @@ static void evd_internaldlaebz(ae_int_t ijob,
          * Serial Version of the loop
          */
         klnew = kl;
-        for(ji=kf; ji<=kl; ji++)
+        ji=kf;
+        
+#ifdef __AVX512F__
+        for(; ji<=kl-7; ji+=8)
+        {
+            
+            /*
+             * Compute N(w), the number of eigenvalues less than w, for the 8 midpoints c[ji..ji+7] at once
+             */
+            __m512d tmp1 = _mm512_loadu_pd(&c->ptr.p_double[ji]);
+            __m512d tmp2 = _mm512_set1_pd(d->ptr.p_double[1])-tmp1;
+
+            // cmp: one bit per lane, set where tmp2 <= pivmin (_CMP_LE_OS = ordered, signaling less-or-equal)
+            __mmask8 cmp = _mm512_cmp_pd_mask(tmp2, _mm512_set1_pd(pivmin), _CMP_LE_OS);
+            __m512i itmp1 = _mm512_maskz_set1_epi64(cmp, 1);
+
+            __m512d tmp2min = _mm512_min_pd(tmp2, _mm512_set1_pd(-pivmin));
+            tmp2 = _mm512_mask_blend_pd(cmp, tmp2, tmp2min);
+            
+            for(j=2; j<=n; j++)
+            {
+                tmp2 = d->ptr.p_double[j]-e2->ptr.p_double[j-1]/tmp2-tmp1;
+
+                cmp = _mm512_cmp_pd_mask(tmp2, _mm512_set1_pd(pivmin), _CMP_LE_OS);
+
+                itmp1 = _mm512_mask_add_epi64(itmp1, cmp, itmp1, _mm512_set1_epi64(1));
+
+                tmp2min = _mm512_min_pd(tmp2, _mm512_set1_pd(-pivmin));
+                tmp2 = _mm512_mask_blend_pd(cmp, tmp2, tmp2min);
+            }
+            // Counters are 64-bit lanes; scale the index so a 32-bit ae_int_t reads the low half of each lane.
+            ae_int_v8t itmp1v = (ae_int_v8t)itmp1;
+            for (int idx = 0; idx < 8; ++idx)
+                if (postproc(ji + idx, itmp1v[idx * (int)(sizeof(long long) / sizeof(ae_int_t))], tmp1[idx]))
+                    return;
+        }
+
+        if(ji<=kl-3)
+        {
+            
+            /*
+             * Compute N(w), the number of eigenvalues less than w, for the 4 midpoints c[ji..ji+3] at once
+             */
+            __m256d tmp1 = _mm256_loadu_pd(&c->ptr.p_double[ji]);
+            __m256d tmp2 = _mm256_set1_pd(d->ptr.p_double[1])-tmp1;
+
+            // cmp: one bit per lane, set where tmp2 <= pivmin (_CMP_LE_OS = ordered, signaling less-or-equal)
+            __mmask8 cmp = _mm256_cmp_pd_mask(tmp2, _mm256_set1_pd(pivmin), _CMP_LE_OS);
+            __m256i itmp1 = _mm256_maskz_set1_epi64(cmp, 1);
+
+            __m256d tmp2min = _mm256_min_pd(tmp2, _mm256_set1_pd(-pivmin));
+            tmp2 = _mm256_mask_blend_pd(cmp, tmp2, tmp2min);
+            
+            for(j=2; j<=n; j++)
+            {
+                tmp2 = d->ptr.p_double[j]-e2->ptr.p_double[j-1]/tmp2-tmp1;
+
+                cmp = _mm256_cmp_pd_mask(tmp2, _mm256_set1_pd(pivmin), _CMP_LE_OS);
+
+                itmp1 = _mm256_mask_add_epi64(itmp1, cmp, itmp1, _mm256_set1_epi64x(1));
+
+                tmp2min = _mm256_min_pd(tmp2, _mm256_set1_pd(-pivmin));
+                tmp2 = _mm256_mask_blend_pd(cmp, tmp2, tmp2min);
+            }
+            // Counters are 64-bit lanes; scale the index so a 32-bit ae_int_t reads the low half of each lane.
+            ae_int_v4t itmp1v = (ae_int_v4t)itmp1;
+            for (int idx = 0; idx < 4; ++idx)
+                if (postproc(ji + idx, itmp1v[idx * (int)(sizeof(long long) / sizeof(ae_int_t))], tmp1[idx]))
+                    return;
+            
+            ji+=4;
+        }
+
+        if(ji<=kl-1)
+        {
+            
+            /*
+             * Compute N(w), the number of eigenvalues less than w, for the 2 midpoints c[ji..ji+1] at once
+             */
+            __m128d tmp1 = _mm_loadu_pd(&c->ptr.p_double[ji]);
+            __m128d tmp2 = _mm_set1_pd(d->ptr.p_double[1])-tmp1;
+
+            // cmp: one bit per lane, set where tmp2 <= pivmin (_CMP_LE_OS = ordered, signaling less-or-equal)
+            __mmask8 cmp = _mm_cmp_pd_mask(tmp2, _mm_set1_pd(pivmin), _CMP_LE_OS);
+            __m128i itmp1 = _mm_maskz_set1_epi64(cmp, 1);
+
+            __m128d tmp2min = _mm_min_pd(tmp2, _mm_set1_pd(-pivmin));
+            tmp2 = _mm_mask_blend_pd(cmp, tmp2, tmp2min);
+            
+            for(j=2; j<=n; j++)
+            {
+                tmp2 = d->ptr.p_double[j]-e2->ptr.p_double[j-1]/tmp2-tmp1;
+
+                cmp = _mm_cmp_pd_mask(tmp2, _mm_set1_pd(pivmin), _CMP_LE_OS);
+
+                itmp1 = _mm_mask_add_epi64(itmp1, cmp, itmp1, _mm_set1_epi64x(1));
+
+                tmp2min = _mm_min_pd(tmp2, _mm_set1_pd(-pivmin));
+                tmp2 = _mm_mask_blend_pd(cmp, tmp2, tmp2min);
+            }
+            // Counters are 64-bit lanes; scale the index so a 32-bit ae_int_t reads the low half of each lane.
+            ae_int_v2t itmp1v = (ae_int_v2t)itmp1;
+            for (int idx = 0; idx < 2; ++idx)
+                if (postproc(ji + idx, itmp1v[idx * (int)(sizeof(long long) / sizeof(ae_int_t))], tmp1[idx]))
+                    return;
+            
+            ji+=2;
+        }
+#else // !__AVX512F__
+
+#ifdef __AVX__
+        for(; ji<=kl-3; ji+=4)
+        {
+            /*
+             * Compute N(w), the number of eigenvalues less than w, for the 4 midpoints c[ji..ji+3] at once
+             */
+            __m256d tmp1 = _mm256_loadu_pd(&c->ptr.p_double[ji]);
+            __m256d tmp2 = _mm256_set1_pd(d->ptr.p_double[1])-tmp1;
+            
+            //__m256d tmp_cmp = _mm256_cmp_pd(tmp2, _mm256_set1_pd(pivmin), _CMP_LE_OS);
+            __m256d tmp_cmp = tmp2 <= _mm256_set1_pd(pivmin);
+#ifdef __AVX2__
+            __m256i itmp1 = _mm256_and_si256(_mm256_castpd_si256(tmp_cmp), _mm256_set1_epi64x(1));
+#else // !__AVX2__
+            __m256d tmp_cmp_and = _mm256_and_pd(tmp_cmp, _mm256_castsi256_pd(_mm256_set1_epi64x(1)));
+            __m128i itmp1_1 = _mm256_castsi256_si128(_mm256_castpd_si256(tmp_cmp_and));
+            __m128i itmp1_2 = _mm256_extractf128_si256(_mm256_castpd_si256(tmp_cmp_and), 1);
+#endif // !__AVX2__
+            __m256d tmp_min = _mm256_min_pd(tmp2,_mm256_set1_pd(-pivmin));
+            tmp2 = _mm256_blendv_pd(tmp2, tmp_min, tmp_cmp);
+            
+            for(j=2; j<=n; j++)
+            {
+                tmp2 = _mm256_set1_pd(d->ptr.p_double[j])-_mm256_set1_pd(e2->ptr.p_double[j-1])/tmp2-tmp1;
+                
+                //tmp_cmp = _mm256_cmp_pd(tmp2, _mm256_set1_pd(pivmin), _CMP_LE_OS);
+                tmp_cmp = tmp2 <= _mm256_set1_pd(pivmin);
+#ifdef __AVX2__
+                itmp1 = _mm256_add_epi64(itmp1, _mm256_and_si256(_mm256_castpd_si256(tmp_cmp), _mm256_set1_epi64x(1)));
+#else // !__AVX2__
+                tmp_cmp_and = _mm256_and_pd(tmp_cmp, _mm256_castsi256_pd(_mm256_set1_epi64x(1)));
+                itmp1_1 = _mm_add_epi64(itmp1_1, _mm256_castsi256_si128(_mm256_castpd_si256(tmp_cmp_and)));
+                itmp1_2 = _mm_add_epi64(itmp1_2, _mm256_extractf128_si256(_mm256_castpd_si256(tmp_cmp_and), 1));
+#endif // !__AVX2__
+                tmp_min = _mm256_min_pd(tmp2,_mm256_set1_pd(-pivmin));
+                tmp2 = _mm256_blendv_pd(tmp2, tmp_min, tmp_cmp);
+            }
+#ifdef __AVX2__
+            ae_int_v4t itmp1v = (ae_int_v4t)itmp1; // counters are 64-bit lanes; index is scaled so a 32-bit ae_int_t reads each lane's low half
+            for (int idx = 0; idx < 4; ++idx)
+                if (postproc(ji + idx, itmp1v[idx * (int)(sizeof(long long) / sizeof(ae_int_t))], tmp1[idx]))
+                    return;
+#else // !__AVX2__
+            ae_int_v2t itmp1v = (ae_int_v2t)itmp1_1; // lanes 0-1; counters are 64-bit, index scaled for a 32-bit ae_int_t
+            for (int idx = 0; idx < 2; ++idx)
+                if (postproc(ji + idx, itmp1v[idx * (int)(sizeof(long long) / sizeof(ae_int_t))], tmp1[idx]))
+                    return;
+            itmp1v = (ae_int_v2t)itmp1_2; // lanes 2-3
+            for (int idx = 0; idx < 2; ++idx)
+                if (postproc(ji + idx + 2, itmp1v[idx * (int)(sizeof(long long) / sizeof(ae_int_t))], tmp1[idx + 2]))
+                    return;
+#endif // !__AVX2__
+        }
+#endif // __AVX__
+        
+#ifdef __SSE2__
+#ifdef __AVX__
+        if(ji<=kl-1)
+#else // !__AVX__
+        for(; ji<=kl-1; ji+=2)
+#endif // !__AVX__
+        {
+            /*
+             * Compute N(w), the number of eigenvalues less than w
+             */
+            __m128d tmp1 = _mm_loadu_pd(&c->ptr.p_double[ji]);
+            __m128d tmp2 = _mm_set1_pd(d->ptr.p_double[1])-tmp1;
+            
+            __m128d tmp_cmp = _mm_cmple_pd(tmp2, _mm_set1_pd(pivmin));
+            __m128i itmp1 = _mm_and_si128(_mm_castpd_si128(tmp_cmp), _mm_set1_epi64x(1));
+            __m128d tmp_min = _mm_min_pd(tmp2,_mm_set1_pd(-pivmin));
+#ifdef __SSE4_1__
+            tmp2 = _mm_blendv_pd(tmp2, tmp_min, tmp_cmp);
+#else // !__SSE4_1__
+            tmp2 = _mm_or_pd(
+                _mm_andnot_pd(tmp_cmp, tmp2),
+                _mm_and_pd(tmp_cmp, tmp_min)
+            );
+#endif // !__SSE4_1__
+            
+            for(j=2; j<=n; j++)
+            {
+                tmp2 = _mm_set1_pd(d->ptr.p_double[j])-_mm_set1_pd(e2->ptr.p_double[j-1])/tmp2-tmp1;
+                
+                tmp_cmp = _mm_cmple_pd(tmp2, _mm_set1_pd(pivmin));
+                itmp1 = _mm_add_epi64(itmp1, _mm_and_si128(_mm_castpd_si128(tmp_cmp), _mm_set1_epi64x(1)));
+                tmp_min = _mm_min_pd(tmp2,_mm_set1_pd(-pivmin));
+#ifdef __SSE4_1__
+                tmp2 = _mm_blendv_pd(tmp2, tmp_min, tmp_cmp);
+#else // !__SSE4_1__
+                tmp2 = _mm_or_pd(
+                    _mm_andnot_pd(tmp_cmp, tmp2),
+                    _mm_and_pd(tmp_cmp, tmp_min)
+                );
+#endif // !__SSE4_1__
+            }
+            
+            ae_int_v2t itmp1v = (ae_int_v2t)itmp1;
+            for (int idx = 0; idx < 2; ++idx)
+            {
+#ifdef __x86_64__
+                if (postproc(ji + idx, itmp1v[idx], tmp1[idx]))
+                    return;
+#else // !__x86_64__
+                if (postproc(ji + idx, itmp1v[idx*2], tmp1[idx]))
+                    return;
+#endif // !__x86_64__
+            }
+#ifdef __AVX__
+            ji+=2;
+#endif // __AVX__
+        }
+#endif // __SSE2__
+
+#endif // !__AVX512F__
+        
+        // Scalar loop
+#ifdef __SSE2__
+        if(ji<=kl)
+#else // !__SSE2__
+        for(; ji<=kl; ji++)
+#endif // !__SSE2__
         {
             
             /*
@@ -45419,82 +45746,8 @@ static void evd_internaldlaebz(ae_int_t ijob,
                     tmp2 = ae_minreal(tmp2, -pivmin, _state);
                 }
             }
-            if( ijob<=2 )
-            {
-                
-                /*
-                 * IJOB=2: Choose all intervals containing eigenvalues.
-                 *
-                 * Insure that N(w) is monotone
-                 */
-                itmp1 = ae_minint(nab->ptr.pp_int[ji][2], ae_maxint(nab->ptr.pp_int[ji][1], itmp1, _state), _state);
-                
-                /*
-                 * Update the Queue -- add intervals if both halves
-                 * contain eigenvalues.
-                 */
-                if( itmp1==nab->ptr.pp_int[ji][2] )
-                {
-                    
-                    /*
-                     * No eigenvalue in the upper interval:
-                     * just use the lower interval.
-                     */
-                    ab->ptr.pp_double[ji][2] = tmp1;
-                }
-                else
-                {
-                    if( itmp1==nab->ptr.pp_int[ji][1] )
-                    {
-                        
-                        /*
-                         * No eigenvalue in the lower interval:
-                         * just use the upper interval.
-                         */
-                        ab->ptr.pp_double[ji][1] = tmp1;
-                    }
-                    else
-                    {
-                        if( klnew<mmax )
-                        {
-                            
-                            /*
-                             * Eigenvalue in both intervals -- add upper to queue.
-                             */
-                            klnew = klnew+1;
-                            ab->ptr.pp_double[klnew][2] = ab->ptr.pp_double[ji][2];
-                            nab->ptr.pp_int[klnew][2] = nab->ptr.pp_int[ji][2];
-                            ab->ptr.pp_double[klnew][1] = tmp1;
-                            nab->ptr.pp_int[klnew][1] = itmp1;
-                            ab->ptr.pp_double[ji][2] = tmp1;
-                            nab->ptr.pp_int[ji][2] = itmp1;
-                        }
-                        else
-                        {
-                            *info = mmax+1;
-                            return;
-                        }
-                    }
-                }
-            }
-            else
-            {
-                
-                /*
-                 * IJOB=3: Binary search.  Keep only the interval
-                 * containing  w  s.t. N(w) = NVAL
-                 */
-                if( itmp1<=nval->ptr.p_int[ji] )
-                {
-                    ab->ptr.pp_double[ji][1] = tmp1;
-                    nab->ptr.pp_int[ji][1] = itmp1;
-                }
-                if( itmp1>=nval->ptr.p_int[ji] )
-                {
-                    ab->ptr.pp_double[ji][2] = tmp1;
-                    nab->ptr.pp_int[ji][2] = itmp1;
-                }
-            }
+            if (postproc(ji, itmp1, tmp1))
+                return;
         }
         kl = klnew;
         
diff --git a/boinc_app/Makefile b/boinc_app/Makefile
index 619c9ce..5f77c85 100644
--- a/boinc_app/Makefile
+++ b/boinc_app/Makefile
@@ -1,12 +1,11 @@
 CAMBALA = ../cambala
-ALGLIB = ../../alglib
+ALGLIB = ../alglib-3.14.0/src
 BOINC_DIR = ../../boinc
 BOINC_API_DIR = $(BOINC_DIR)/api
 BOINC_LIB_DIR = $(BOINC_DIR)/lib
 BOINC_ZIP_DIR = $(BOINC_DIR)/zip
 FREETYPE_DIR = /usr/include/freetype2
 
-CPP = g++
 CPPFLAGS = -O3 \
     -std=c++0x -static -Wall -W -Wshadow -Wpointer-arith -Wcast-qual -Wcast-align -Wwrite-strings -fno-common \
     -DAPP_GRAPHICS -D __STDC_LIMIT_MACROS -D __STDC_FORMAT_MACROS -D NDEBUG \
@@ -20,42 +19,77 @@ CPPFLAGS = -O3 \
     -L /usr/X11R6/lib \
     -L.
 
+ifeq ($(MinGW32),1)
+$(info ===== Compiling MinGW 32-bit app version =====)
+CPP = i686-w64-mingw32-g++
+else
+ifeq ($(MinGW64),1)
+$(info ===== Compiling MinGW 64-bit app version =====)
+CPP = x86_64-w64-mingw32-g++
+else
+ifeq ($(M32),1)
+$(info ===== Compiling 32-bit app version =====)
+CPP = g++ -m32
+else
+CPP = g++
+endif
+endif
+endif
+
+ifeq ($(SSE2),1)
+$(info ===== Compiling SSE2 app version =====)
+CPPFLAGS += -msse2
+else
+ifeq ($(SSE41),1)
+$(info ===== Compiling SSE4.1 app version =====)
+CPPFLAGS += -msse4.1
+else
+ifeq ($(AVX),1)
+$(info ===== Compiling AVX app version =====)
+CPPFLAGS += -mavx -mtune=sandybridge
+else
+ifeq ($(AVX2),1)
+$(info ===== Compiling AVX2 app version =====)
+CPPFLAGS += -mavx2 -mfma -mtune=haswell
+else
+ifeq ($(AVX512),1)
+$(info ===== Compiling AVX512 app version =====)
+CPPFLAGS += -march=skylake-avx512
+ifeq ($(MinGW64),1)
+# MinGW needs workaround for "invalid register for .seh_savexmm" bug
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782
+CPPFLAGS += \
+	-ffixed-xmm16 -ffixed-xmm17 -ffixed-xmm18 -ffixed-xmm19 -ffixed-xmm20 -ffixed-xmm21 -ffixed-xmm22 -ffixed-xmm23 \
+	-ffixed-xmm24 -ffixed-xmm25 -ffixed-xmm26 -ffixed-xmm27 -ffixed-xmm28 -ffixed-xmm29 -ffixed-xmm30 -ffixed-xmm31
+endif
+endif
+endif
+endif
+endif
+endif
+
+ifeq ($(NOWARN),1)
+CPPFLAGS += -w
+endif
+
 release: cambala_boinc_app
 
 libstdc++.a:
-	ln -s `g++ -print-file-name=libstdc++.a`
-
-ttfont.cpp:
-	ln -s ../../api/ttfont.cpp .
+	ln -s `${CPP} -print-file-name=libstdc++.a`
 
 clean: distclean
 
 distclean:
-	/bin/rm -f $(PROGS) *.o libstdc++.a cambala_boinc_app
+	/bin/rm -f $(PROGS) *.o libstdc++.a cambala_boinc_app cambala_boinc_app.exe
 
-cambala_boinc_app: cambala_boinc_app.o alglibinternal.o alglibmisc.o ap.o linalg.o specialfunctions.o sequential.o \
+cambala_boinc_app: cambala_boinc_app.o all.o \
 	libstdc++.a $(BOINC_API_DIR)/libboinc_api.a $(BOINC_LIB_DIR)/libboinc.a
-	$(CPP) $(CPPFLAGS) cambala_boinc_app.o alglibinternal.o alglibmisc.o ap.o linalg.o specialfunctions.o sequential.o -o cambala_boinc_app \
+	$(CPP) $(CPPFLAGS) cambala_boinc_app.o all.o -o cambala_boinc_app \
 	libstdc++.a -pthread $(BOINC_API_DIR)/libboinc_api.a \
 	$(BOINC_LIB_DIR)/libboinc.a
 	
-specialfunctions.o: $(ALGLIB)/specialfunctions.cpp 
-	${CPP} ${CPPFLAGS} $(ALGLIB)/specialfunctions.cpp -c
-
-linalg.o: $(ALGLIB)/linalg.cpp 
-	${CPP} ${CPPFLAGS} $(ALGLIB)/linalg.cpp -c
-
-ap.o: $(ALGLIB)/ap.cpp 
-	${CPP} ${CPPFLAGS} $(ALGLIB)/ap.cpp -c
-
-alglibmisc.o: $(ALGLIB)/alglibmisc.cpp 
-	${CPP} ${CPPFLAGS} $(ALGLIB)/alglibmisc.cpp -c
-
-alglibinternal.o: $(ALGLIB)/alglibinternal.cpp 
-	${CPP} ${CPPFLAGS} $(ALGLIB)/alglibinternal.cpp -c
-
-sequential.o: $(CAMBALA)/sequential.cpp
-	${CPP} ${CPPFLAGS} $(CAMBALA)/sequential.cpp -c
+all.o: all.cpp $(ALGLIB)/specialfunctions.cpp $(ALGLIB)/linalg.cpp $(ALGLIB)/ap.cpp $(ALGLIB)/alglibmisc.cpp $(ALGLIB)/alglibinternal.cpp $(CAMBALA)/sequential.cpp
+	${CPP} ${CPPFLAGS} all.cpp -c
 
 cambala_boinc_app.o: cambala_boinc_app.cpp
 	${CPP} ${CPPFLAGS} cambala_boinc_app.cpp -c
diff --git a/boinc_app/all.cpp b/boinc_app/all.cpp
new file mode 100644
index 0000000..c5e13ff
--- /dev/null
+++ b/boinc_app/all.cpp
@@ -0,0 +1,6 @@
+#include <ap.cpp>
+#include <alglibinternal.cpp>
+#include <specialfunctions.cpp>
+#include <alglibmisc.cpp>
+#include <linalg.cpp>
+#include <sequential.cpp>
diff --git a/boinc_app/cambala_boinc_app.cpp b/boinc_app/cambala_boinc_app.cpp
index fc970ca..90fcb81 100644
--- a/boinc_app/cambala_boinc_app.cpp
+++ b/boinc_app/cambala_boinc_app.cpp
@@ -32,10 +32,125 @@
 #include "utils.h"
 #include "point.h"
 
+#if defined(__i386__) || defined (__x86_64__)
+#include <cpuid.h>
+#endif
+
 #define CHECKPOINT_FILE "chpt"
 #define INPUT_FILENAME "in"
 #define OUTPUT_FILENAME "out"
 
+__attribute__((noreturn)) // ends in exit(1) unless boinc_finish(1) terminates the process first
+void PrintFatalError(const char* str) // str is printed verbatim after "Error: "; no newline is appended
+{
+	// print error to unredirected stderr first
+	fprintf(stderr, "Error: %s", str);
+	
+	// now try to send it back to server
+	int retval = boinc_init();
+    if (0 == retval)
+	{
+		fprintf(stderr, "Error: %s", str); // printed again on purpose; presumably boinc_init() has redirected stderr - TODO confirm
+		boinc_finish(1);
+    }
+
+	exit(1);
+}
+
+void VerifyCpu() // calls PrintFatalError() unless the CPU and OS support every instruction set this binary was compiled for
+{
+#if (defined(__i386__) || defined (__x86_64__)) && defined(__SSE2__)
+	unsigned int a, b, c, d; // CPUID leaf 1 output registers (EAX, EBX, ECX, EDX)
+
+	if (!__get_cpuid(1, &a, &b, &c, &d))
+	{
+		PrintFatalError("CPUID instruction is not supported by your CPU!\n");
+	}
+
+	if (0 == (d & bit_SSE2))
+	{
+		PrintFatalError("SSE2 instructions are not supported by your CPU!\n");
+	}
+
+#ifdef __SSE4_1__
+	if (0 == (c & bit_SSE4_1))
+	{
+		PrintFatalError("SSE4.1 instructions are not supported by your CPU!\n");
+	}
+#endif
+
+#ifdef __AVX__
+	if (0 == (c & bit_AVX))
+	{
+		PrintFatalError("AVX instructions are not supported by your CPU!\n");
+	}
+
+	// AVX also needs OS support, check for it
+	if (0 == (c & bit_OSXSAVE))
+	{
+		PrintFatalError("OSXSAVE instructions are not supported by your CPU!\n");
+	}
+
+	unsigned int eax, edx;
+	unsigned int ecx = 0; // _XCR_XFEATURE_ENABLED_MASK
+	__asm__ ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx)); // reads the extended control register selected by ecx into edx:eax
+	if (0x6 != (eax & 0x6)) // XSTATE_SSE | XSTATE_YMM
+	{
+		PrintFatalError("AVX instructions are not supported by your OS!\n");
+	}
+#endif
+
+#ifdef __AVX2__
+	if (__get_cpuid_max(0, 0) < 7)
+	{
+		PrintFatalError("CPUID level 7 is not supported by your CPU!\n");
+	}
+
+	unsigned int a2, b2, c2, d2; // CPUID leaf 7, subleaf 0 output registers
+	__cpuid_count(7, 0, a2, b2, c2, d2);
+
+	if (0 == (b2 & bit_AVX2))
+	{
+		PrintFatalError("AVX2 instructions are not supported by your CPU!\n");
+	}
+#endif
+
+#ifdef __FMA__
+	// Some AMD CPU(s) support FMA but no AVX2. FMA does not provide a significant
+	// boost for this app, so it should be enabled on AVX2 CPUs only.
+#ifndef __AVX2__
+#error AVX2 is not enabled!
+#endif
+	if (0 == (c & bit_FMA))
+	{
+		PrintFatalError("FMA instructions are not supported by your CPU!\n");
+	}
+#endif
+
+#ifdef __AVX512F__
+	// AVX512 consists of a few subsets. Skylake-AVX512 target supports F, BW, DQ, VL and CD.
+	// We need the data loaded during the AVX and AVX2 checks (eax, b2), make sure we have it
+#if !defined(__AVX__) || !defined(__AVX2__)
+#error AVX or AVX2 is not enabled!
+#endif
+
+	const unsigned int avx512bits = bit_AVX512F | bit_AVX512DQ | bit_AVX512CD | bit_AVX512BW | bit_AVX512VL;
+
+	if (avx512bits != (b2 & avx512bits))
+	{
+		PrintFatalError("AVX512 instructions are not supported by your CPU!\n");
+	}
+
+	// AVX512 also needs OS support, check for it
+	if (0xe6 != (eax & 0xe6)) // XSTATE_SSE | XSTATE_YMM | XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM
+	{
+		PrintFatalError("AVX512 instructions are not supported by your OS!\n");
+	}
+#endif
+
+#endif
+}
+
 using namespace std;
 char buf[256];
 
@@ -46,6 +161,8 @@ int do_checkpoint( const long long &total_points,
 
 int main(int argc, char **argv)
 {
+	VerifyCpu();
+
 	int retval = boinc_init();
     if ( retval ) {
         fprintf(stderr, "%s APP: boinc_init() returned %d\n",
@@ -53,7 +170,7 @@ int main(int argc, char **argv)
         );
         exit( retval );
     }
-	
+
 	search_space_point cur_record_point;
 	cur_record_point.residual = START_HUGE_VALUE;
 
diff --git a/boinc_app/ac_modes_R7km_dtimes1.txt b/boinc_app/test/ac_modes_R7km_dtimes1.txt
similarity index 100%
rename from boinc_app/ac_modes_R7km_dtimes1.txt
rename to boinc_app/test/ac_modes_R7km_dtimes1.txt
diff --git a/boinc_app/test/cambala_depths_out.ref b/boinc_app/test/cambala_depths_out.ref
new file mode 100644
index 0000000..746d57f
--- /dev/null
+++ b/boinc_app/test/cambala_depths_out.ref
@@ -0,0 +1 @@
+23 33 50 300 
diff --git a/boinc_app/test/cambala_out.ref b/boinc_app/test/cambala_out.ref
new file mode 100644
index 0000000..9893165
--- /dev/null
+++ b/boinc_app/test/cambala_out.ref
@@ -0,0 +1,26 @@
+Input parameters :
+launch_type bruteforce
+object_function_type uniform
+ppm 2
+init_iterated_local_search_runs 10
+cw1_init_arr :
+1506 1490 1460 
+cw2_init_arr :
+1506 1490 1480 
+ncpl_init_arr :
+1 1 11 
+nR 1
+R1 7000
+R2 7000
+ntau 1
+tau1 0
+tau2 0
+nrhob 1
+rhob1 1.7
+rhob2 1.7
+ncb 1
+cb1 1700
+cb2 1700
+dtimes_file ac_modes_R7km_dtimes1.txt
+spmag_file no
+launch_type bruteforce
diff --git a/boinc_app/in b/boinc_app/test/in
similarity index 100%
rename from boinc_app/in
rename to boinc_app/test/in
diff --git a/boinc_app/test/out.ref b/boinc_app/test/out.ref
new file mode 100644
index 0000000..7f94ec5
--- /dev/null
+++ b/boinc_app/test/out.ref
@@ -0,0 +1 @@
+0.003945 1700 1.7 7000 0 1506 1490 1462 23 33 50 300 
\ No newline at end of file
diff --git a/boinc_app/test/test.sh b/boinc_app/test/test.sh
new file mode 100755
index 0000000..08c1ec4
--- /dev/null
+++ b/boinc_app/test/test.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+# Run from this directory: clears old outputs, times the app, then compares 'out' with out.ref.
+rm -f boinc_finish_called cambala_depths_out cambala_out chpt out stderr.txt
+
+time ../cambala_boinc_app
+
+echo
+
+#diff -qs cambala_depths_out cambala_depths_out.ref
+#diff -qs cambala_out cambala_out.ref
+diff -qs out out.ref

From 0810bed595d85b2b7103670078aa4d007648a536 Mon Sep 17 00:00:00 2001
From: Daniel <daniel@poradnik-webmastera.com>
Date: Sun, 13 Jan 2019 16:48:45 +0100
Subject: [PATCH 2/2] Simplified makefile, silenced some warnings from alglib

---
 boinc_app/Makefile | 24 ++++++------------------
 boinc_app/all.cpp  |  7 +++++++
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/boinc_app/Makefile b/boinc_app/Makefile
index 5f77c85..c310737 100644
--- a/boinc_app/Makefile
+++ b/boinc_app/Makefile
@@ -22,37 +22,29 @@ CPPFLAGS = -O3 \
 ifeq ($(MinGW32),1)
 $(info ===== Compiling MinGW 32-bit app version =====)
 CPP = i686-w64-mingw32-g++
-else
-ifeq ($(MinGW64),1)
+else ifeq ($(MinGW64),1)
 $(info ===== Compiling MinGW 64-bit app version =====)
 CPP = x86_64-w64-mingw32-g++
-else
-ifeq ($(M32),1)
+else ifeq ($(M32),1)
 $(info ===== Compiling 32-bit app version =====)
 CPP = g++ -m32
 else
 CPP = g++
 endif
-endif
-endif
 
 ifeq ($(SSE2),1)
 $(info ===== Compiling SSE2 app version =====)
 CPPFLAGS += -msse2
-else
-ifeq ($(SSE41),1)
+else ifeq ($(SSE41),1)
 $(info ===== Compiling SSE4.1 app version =====)
 CPPFLAGS += -msse4.1
-else
-ifeq ($(AVX),1)
+else ifeq ($(AVX),1)
 $(info ===== Compiling AVX app version =====)
 CPPFLAGS += -mavx -mtune=sandybridge
-else
-ifeq ($(AVX2),1)
+else ifeq ($(AVX2),1)
 $(info ===== Compiling AVX2 app version =====)
 CPPFLAGS += -mavx2 -mfma -mtune=haswell
-else
-ifeq ($(AVX512),1)
+else ifeq ($(AVX512),1)
 $(info ===== Compiling AVX512 app version =====)
 CPPFLAGS += -march=skylake-avx512
 ifeq ($(MinGW64),1)
@@ -63,10 +55,6 @@ CPPFLAGS += \
 	-ffixed-xmm24 -ffixed-xmm25 -ffixed-xmm26 -ffixed-xmm27 -ffixed-xmm28 -ffixed-xmm29 -ffixed-xmm30 -ffixed-xmm31
 endif
 endif
-endif
-endif
-endif
-endif
 
 ifeq ($(NOWARN),1)
 CPPFLAGS += -w
diff --git a/boinc_app/all.cpp b/boinc_app/all.cpp
index c5e13ff..5bbfcdb 100644
--- a/boinc_app/all.cpp
+++ b/boinc_app/all.cpp
@@ -1,6 +1,13 @@
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+
 #include <ap.cpp>
 #include <alglibinternal.cpp>
 #include <specialfunctions.cpp>
 #include <alglibmisc.cpp>
 #include <linalg.cpp>
+
+#pragma GCC diagnostic pop
+
 #include <sequential.cpp>