From cd1a7e7346d9f039c206e1097a2ac63bf10c7b0a Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 8 Jan 2019 23:12:48 +0100 Subject: [PATCH 1/2] Optimized app for SSE2, SSE4.1, AVX, AVX2 and AVX512 --- alglib-3.14.0/src/linalg.cpp | 407 ++++++++++++++---- boinc_app/Makefile | 86 ++-- boinc_app/all.cpp | 6 + boinc_app/cambala_boinc_app.cpp | 119 ++++- .../{ => test}/ac_modes_R7km_dtimes1.txt | 0 boinc_app/test/cambala_depths_out.ref | 1 + boinc_app/test/cambala_out.ref | 26 ++ boinc_app/{ => test}/in | 0 boinc_app/test/out.ref | 1 + boinc_app/test/test.sh | 11 + 10 files changed, 553 insertions(+), 104 deletions(-) create mode 100644 boinc_app/all.cpp rename boinc_app/{ => test}/ac_modes_R7km_dtimes1.txt (100%) create mode 100644 boinc_app/test/cambala_depths_out.ref create mode 100644 boinc_app/test/cambala_out.ref rename boinc_app/{ => test}/in (100%) create mode 100644 boinc_app/test/out.ref create mode 100755 boinc_app/test/test.sh diff --git a/alglib-3.14.0/src/linalg.cpp b/alglib-3.14.0/src/linalg.cpp index b6f67c2..cf530a9 100644 --- a/alglib-3.14.0/src/linalg.cpp +++ b/alglib-3.14.0/src/linalg.cpp @@ -23,6 +23,10 @@ A copy of the GNU General Public License is available at #include "stdafx.h" #include "linalg.h" +#ifdef __SSE2__ +#include +#endif + // disable some irrelevant warnings #if (AE_COMPILER==AE_MSVC) && !defined(AE_ALL_WARNINGS) #pragma warning(disable:4100) @@ -45365,6 +45369,98 @@ static void evd_internaldlaebz(ae_int_t ijob, c->ptr.p_double[ji] = 0.5*(ab->ptr.pp_double[ji][1]+ab->ptr.pp_double[ji][2]); } } + + auto postproc = [&](ae_int_t ji, ae_int_t itmp1, double tmp1)->bool + { + if( ijob<=2 ) + { + + /* + * IJOB=2: Choose all intervals containing eigenvalues. + * + * Insure that N(w) is monotone + */ + itmp1 = ae_minint(nab->ptr.pp_int[ji][2], ae_maxint(nab->ptr.pp_int[ji][1], itmp1, _state), _state); + + /* + * Update the Queue -- add intervals if both halves + * contain eigenvalues. + */ + if( itmp1==nab->ptr.pp_int[ji][2] ) + { + + /* + * No eigenvalue in the upper interval: + * just use the lower interval. + */ + ab->ptr.pp_double[ji][2] = tmp1; + } + else + { + if( itmp1==nab->ptr.pp_int[ji][1] ) + { + + /* + * No eigenvalue in the lower interval: + * just use the upper interval. + */ + ab->ptr.pp_double[ji][1] = tmp1; + } + else + { + if( klnewptr.pp_double[klnew][2] = ab->ptr.pp_double[ji][2]; + nab->ptr.pp_int[klnew][2] = nab->ptr.pp_int[ji][2]; + ab->ptr.pp_double[klnew][1] = tmp1; + nab->ptr.pp_int[klnew][1] = itmp1; + ab->ptr.pp_double[ji][2] = tmp1; + nab->ptr.pp_int[ji][2] = itmp1; + } + else + { + *info = mmax+1; + return true; + } + } + } + } + else + { + + /* + * IJOB=3: Binary search. Keep only the interval + * containing w s.t. N(w) = NVAL + */ + if( itmp1<=nval->ptr.p_int[ji] ) + { + ab->ptr.pp_double[ji][1] = tmp1; + nab->ptr.pp_int[ji][1] = itmp1; + } + if( itmp1>=nval->ptr.p_int[ji] ) + { + ab->ptr.pp_double[ji][2] = tmp1; + nab->ptr.pp_int[ji][2] = itmp1; + } + } + + return false; + }; + + typedef ae_int_t ae_int_v2t __attribute__((vector_size(16))); + typedef ae_int_t ae_int_v4t __attribute__((vector_size(32))); + typedef ae_int_t ae_int_v8t __attribute__((vector_size(64))); + +#ifdef __x86_64__ + static_assert(sizeof(ae_int_t) == 8, "ae_int_t must be 64 bit"); +#else // !__x86_64__ + static_assert(sizeof(ae_int_t) == 4, "ae_int_t must be 32 bit"); +#endif // !__x86_64__ /* * Iteration loop @@ -45379,7 +45475,238 @@ static void evd_internaldlaebz(ae_int_t ijob, * Serial Version of the loop */ klnew = kl; - for(ji=kf; ji<=kl; ji++) + ji=kf; + +#ifdef __AVX512F__ + for(; ji<=kl-7; ji+=8) + { + + /* + * Compute N(w), the number of eigenvalues less than w + */ + __m512d tmp1 = _mm512_loadu_pd(&c->ptr.p_double[ji]); + __m512d tmp2 = _mm512_set1_pd(d->ptr.p_double[1])-tmp1; + + //__mmask8 cmp = _mm512_cmple_pd_mask(tmp2, _mm512_set1_pd(pivmin)); + __mmask8 cmp = _mm512_cmp_pd_mask(tmp2, _mm512_set1_pd(pivmin), _CMP_LE_OS); + __m512i itmp1 = _mm512_maskz_set1_epi64(cmp, 1); + + __m512d tmp2min = _mm512_min_pd(tmp2, _mm512_set1_pd(-pivmin)); + tmp2 = _mm512_mask_blend_pd(cmp, tmp2, tmp2min); + + for(j=2; j<=n; j++) + { + tmp2 = d->ptr.p_double[j]-e2->ptr.p_double[j-1]/tmp2-tmp1; + + cmp = _mm512_cmp_pd_mask(tmp2, _mm512_set1_pd(pivmin), _CMP_LE_OS); + + itmp1 = _mm512_mask_add_epi64(itmp1, cmp, itmp1, _mm512_set1_epi64(1)); + + tmp2min = _mm512_min_pd(tmp2, _mm512_set1_pd(-pivmin)); + tmp2 = _mm512_mask_blend_pd(cmp, tmp2, tmp2min); + } + + ae_int_v8t itmp1v = (ae_int_v8t)itmp1; + for (int idx = 0; idx < 8; ++idx) + if (postproc(ji + idx, itmp1v[idx], tmp1[idx])) + return; + } + + if(ji<=kl-3) + { + + /* + * Compute N(w), the number of eigenvalues less than w + */ + __m256d tmp1 = _mm256_loadu_pd(&c->ptr.p_double[ji]); + __m256d tmp2 = _mm256_set1_pd(d->ptr.p_double[1])-tmp1; + + //__mmask8 cmp = _mm512_cmple_pd_mask(tmp2, _mm512_set1_pd(pivmin)); + __mmask8 cmp = _mm256_cmp_pd_mask(tmp2, _mm256_set1_pd(pivmin), _CMP_LE_OS); + __m256i itmp1 = _mm256_maskz_set1_epi64(cmp, 1); + + __m256d tmp2min = _mm256_min_pd(tmp2, _mm256_set1_pd(-pivmin)); + tmp2 = _mm256_mask_blend_pd(cmp, tmp2, tmp2min); + + for(j=2; j<=n; j++) + { + tmp2 = d->ptr.p_double[j]-e2->ptr.p_double[j-1]/tmp2-tmp1; + + cmp = _mm256_cmp_pd_mask(tmp2, _mm256_set1_pd(pivmin), _CMP_LE_OS); + + itmp1 = _mm256_mask_add_epi64(itmp1, cmp, itmp1, _mm256_set1_epi64x(1)); + + tmp2min = _mm256_min_pd(tmp2, _mm256_set1_pd(-pivmin)); + tmp2 = _mm256_mask_blend_pd(cmp, tmp2, tmp2min); + } + + ae_int_v4t itmp1v = (ae_int_v4t)itmp1; + for (int idx = 0; idx < 4; ++idx) + if (postproc(ji + idx, itmp1v[idx], tmp1[idx])) + return; + + ji+=4; + } + + if(ji<=kl-1) + { + + /* + * Compute N(w), the number of eigenvalues less than w + */ + __m128d tmp1 = _mm_loadu_pd(&c->ptr.p_double[ji]); + __m128d tmp2 = _mm_set1_pd(d->ptr.p_double[1])-tmp1; + + //__mmask8 cmp = _mm512_cmple_pd_mask(tmp2, _mm512_set1_pd(pivmin)); + __mmask8 cmp = _mm_cmp_pd_mask(tmp2, _mm_set1_pd(pivmin), _CMP_LE_OS); + __m128i itmp1 = _mm_maskz_set1_epi64(cmp, 1); + + __m128d tmp2min = _mm_min_pd(tmp2, _mm_set1_pd(-pivmin)); + tmp2 = _mm_mask_blend_pd(cmp, tmp2, tmp2min); + + for(j=2; j<=n; j++) + { + tmp2 = d->ptr.p_double[j]-e2->ptr.p_double[j-1]/tmp2-tmp1; + + cmp = _mm_cmp_pd_mask(tmp2, _mm_set1_pd(pivmin), _CMP_LE_OS); + + itmp1 = _mm_mask_add_epi64(itmp1, cmp, itmp1, _mm_set1_epi64x(1)); + + tmp2min = _mm_min_pd(tmp2, _mm_set1_pd(-pivmin)); + tmp2 = _mm_mask_blend_pd(cmp, tmp2, tmp2min); + } + + ae_int_v2t itmp1v = (ae_int_v2t)itmp1; + for (int idx = 0; idx < 2; ++idx) + if (postproc(ji + idx, itmp1v[idx], tmp1[idx])) + return; + + ji+=2; + } +#else // !__AVX512F__ + +#ifdef __AVX__ + for(; ji<=kl-3; ji+=4) + { + /* + * Compute N(w), the number of eigenvalues less than w + */ + __m256d tmp1 = _mm256_loadu_pd(&c->ptr.p_double[ji]); + __m256d tmp2 = _mm256_set1_pd(d->ptr.p_double[1])-tmp1; + + //__m256d tmp_cmp = _mm256_cmp_pd(tmp2, _mm256_set1_pd(pivmin), _CMP_LE_OS); + __m256d tmp_cmp = tmp2 <= _mm256_set1_pd(pivmin); +#ifdef __AVX2__ + __m256i itmp1 = _mm256_and_si256(_mm256_castpd_si256(tmp_cmp), _mm256_set1_epi64x(1)); +#else // !__AVX2__ + __m256d tmp_cmp_and = _mm256_and_pd(tmp_cmp, _mm256_castsi256_pd(_mm256_set1_epi64x(1))); + __m128i itmp1_1 = _mm256_castsi256_si128(_mm256_castpd_si256(tmp_cmp_and)); + __m128i itmp1_2 = _mm256_extractf128_si256(_mm256_castpd_si256(tmp_cmp_and), 1); +#endif // !__AVX2__ + __m256d tmp_min = _mm256_min_pd(tmp2,_mm256_set1_pd(-pivmin)); + tmp2 = _mm256_blendv_pd(tmp2, tmp_min, tmp_cmp); + + for(j=2; j<=n; j++) + { + tmp2 = _mm256_set1_pd(d->ptr.p_double[j])-_mm256_set1_pd(e2->ptr.p_double[j-1])/tmp2-tmp1; + + //tmp_cmp = _mm256_cmp_pd(tmp2, _mm256_set1_pd(pivmin), _CMP_LE_OS); + tmp_cmp = tmp2 <= _mm256_set1_pd(pivmin); +#ifdef __AVX2__ + itmp1 = _mm256_add_epi64(itmp1, _mm256_and_si256(_mm256_castpd_si256(tmp_cmp), _mm256_set1_epi64x(1))); +#else // !__AVX2__ + tmp_cmp_and = _mm256_and_pd(tmp_cmp, _mm256_castsi256_pd(_mm256_set1_epi64x(1))); + itmp1_1 = _mm_add_epi64(itmp1_1, _mm256_castsi256_si128(_mm256_castpd_si256(tmp_cmp_and))); + itmp1_2 = _mm_add_epi64(itmp1_2, _mm256_extractf128_si256(_mm256_castpd_si256(tmp_cmp_and), 1)); +#endif // !__AVX2__ + tmp_min = _mm256_min_pd(tmp2,_mm256_set1_pd(-pivmin)); + tmp2 = _mm256_blendv_pd(tmp2, tmp_min, tmp_cmp); + } +#ifdef __AVX2__ + ae_int_v4t itmp1v = (ae_int_v4t)itmp1; + for (int idx = 0; idx < 4; ++idx) + if (postproc(ji + idx, itmp1v[idx], tmp1[idx])) + return; +#else // !__AVX2__ + ae_int_v2t itmp1v = (ae_int_v2t)itmp1_1; + for (int idx = 0; idx < 2; ++idx) + if (postproc(ji + idx, itmp1v[idx], tmp1[idx])) + return; + itmp1v = (ae_int_v2t)itmp1_2; + for (int idx = 0; idx < 2; ++idx) + if (postproc(ji + idx + 2, itmp1v[idx], tmp1[idx + 2])) + return; +#endif // !__AVX2__ + } +#endif // __AVX__ + +#ifdef __SSE2__ +#ifdef __AVX__ + if(ji<=kl-1) +#else // !__AVX__ + for(; ji<=kl-1; ji+=2) +#endif // !__AVX__ + { + /* + * Compute N(w), the number of eigenvalues less than w + */ + __m128d tmp1 = _mm_loadu_pd(&c->ptr.p_double[ji]); + __m128d tmp2 = _mm_set1_pd(d->ptr.p_double[1])-tmp1; + + __m128d tmp_cmp = _mm_cmple_pd(tmp2, _mm_set1_pd(pivmin)); + __m128i itmp1 = _mm_and_si128(_mm_castpd_si128(tmp_cmp), _mm_set1_epi64x(1)); + __m128d tmp_min = _mm_min_pd(tmp2,_mm_set1_pd(-pivmin)); +#ifdef __SSE4_1__ + tmp2 = _mm_blendv_pd(tmp2, tmp_min, tmp_cmp); +#else // !__SSE4_1__ + tmp2 = _mm_or_pd( + _mm_andnot_pd(tmp_cmp, tmp2), + _mm_and_pd(tmp_cmp, tmp_min) + ); +#endif // !__SSE4_1__ + + for(j=2; j<=n; j++) + { + tmp2 = _mm_set1_pd(d->ptr.p_double[j])-_mm_set1_pd(e2->ptr.p_double[j-1])/tmp2-tmp1; + + tmp_cmp = _mm_cmple_pd(tmp2, _mm_set1_pd(pivmin)); + itmp1 = _mm_add_epi64(itmp1, _mm_and_si128(_mm_castpd_si128(tmp_cmp), _mm_set1_epi64x(1))); + tmp_min = _mm_min_pd(tmp2,_mm_set1_pd(-pivmin)); +#ifdef __SSE4_1__ + tmp2 = _mm_blendv_pd(tmp2, tmp_min, tmp_cmp); +#else // !__SSE4_1__ + tmp2 = _mm_or_pd( + _mm_andnot_pd(tmp_cmp, tmp2), + _mm_and_pd(tmp_cmp, tmp_min) + ); +#endif // !__SSE4_1__ + } + + ae_int_v2t itmp1v = (ae_int_v2t)itmp1; + for (int idx = 0; idx < 2; ++idx) + { +#ifdef __x86_64__ + if (postproc(ji + idx, itmp1v[idx], tmp1[idx])) + return; +#else // !__x86_64__ + if (postproc(ji + idx, itmp1v[idx*2], tmp1[idx])) + return; +#endif // !__x86_64__ + } +#ifdef __AVX__ + ji+=2; +#endif // __AVX__ + } +#endif // __SSE2__ + +#endif // !__AVX512F__ + + // Scalar loop +#ifdef __SSE2__ + if(ji<=kl) +#else // !__SSE2__ + for(; ji<=kl; ji++) +#endif // !__SSE2__ { /* @@ -45419,82 +45746,8 @@ static void evd_internaldlaebz(ae_int_t ijob, tmp2 = ae_minreal(tmp2, -pivmin, _state); } } - if( ijob<=2 ) - { - - /* - * IJOB=2: Choose all intervals containing eigenvalues. - * - * Insure that N(w) is monotone - */ - itmp1 = ae_minint(nab->ptr.pp_int[ji][2], ae_maxint(nab->ptr.pp_int[ji][1], itmp1, _state), _state); - - /* - * Update the Queue -- add intervals if both halves - * contain eigenvalues. - */ - if( itmp1==nab->ptr.pp_int[ji][2] ) - { - - /* - * No eigenvalue in the upper interval: - * just use the lower interval. - */ - ab->ptr.pp_double[ji][2] = tmp1; - } - else - { - if( itmp1==nab->ptr.pp_int[ji][1] ) - { - - /* - * No eigenvalue in the lower interval: - * just use the upper interval. - */ - ab->ptr.pp_double[ji][1] = tmp1; - } - else - { - if( klnewptr.pp_double[klnew][2] = ab->ptr.pp_double[ji][2]; - nab->ptr.pp_int[klnew][2] = nab->ptr.pp_int[ji][2]; - ab->ptr.pp_double[klnew][1] = tmp1; - nab->ptr.pp_int[klnew][1] = itmp1; - ab->ptr.pp_double[ji][2] = tmp1; - nab->ptr.pp_int[ji][2] = itmp1; - } - else - { - *info = mmax+1; - return; - } - } - } - } - else - { - - /* - * IJOB=3: Binary search. Keep only the interval - * containing w s.t. N(w) = NVAL - */ - if( itmp1<=nval->ptr.p_int[ji] ) - { - ab->ptr.pp_double[ji][1] = tmp1; - nab->ptr.pp_int[ji][1] = itmp1; - } - if( itmp1>=nval->ptr.p_int[ji] ) - { - ab->ptr.pp_double[ji][2] = tmp1; - nab->ptr.pp_int[ji][2] = itmp1; - } - } + if (postproc(ji, itmp1, tmp1)) + return; } kl = klnew; diff --git a/boinc_app/Makefile b/boinc_app/Makefile index 619c9ce..5f77c85 100644 --- a/boinc_app/Makefile +++ b/boinc_app/Makefile @@ -1,12 +1,11 @@ CAMBALA = ../cambala -ALGLIB = ../../alglib +ALGLIB = ../alglib-3.14.0/src BOINC_DIR = ../../boinc BOINC_API_DIR = $(BOINC_DIR)/api BOINC_LIB_DIR = $(BOINC_DIR)/lib BOINC_ZIP_DIR = $(BOINC_DIR)/zip FREETYPE_DIR = /usr/include/freetype2 -CPP = g++ CPPFLAGS = -O3 \ -std=c++0x -static -Wall -W -Wshadow -Wpointer-arith -Wcast-qual -Wcast-align -Wwrite-strings -fno-common \ -DAPP_GRAPHICS -D __STDC_LIMIT_MACROS -D __STDC_FORMAT_MACROS -D NDEBUG \ @@ -20,42 +19,77 @@ CPPFLAGS = -O3 \ -L /usr/X11R6/lib \ -L. +ifeq ($(MinGW32),1) +$(info ===== Compiling MinGW 32-bit app version =====) +CPP = i686-w64-mingw32-g++ +else +ifeq ($(MinGW64),1) +$(info ===== Compiling MinGW 64-bit app version =====) +CPP = x86_64-w64-mingw32-g++ +else +ifeq ($(M32),1) +$(info ===== Compiling 32-bit app version =====) +CPP = g++ -m32 +else +CPP = g++ +endif +endif +endif + +ifeq ($(SSE2),1) +$(info ===== Compiling SSE2 app version =====) +CPPFLAGS += -msse2 +else +ifeq ($(SSE41),1) +$(info ===== Compiling SSE4.1 app version =====) +CPPFLAGS += -msse4.1 +else +ifeq ($(AVX),1) +$(info ===== Compiling AVX app version =====) +CPPFLAGS += -mavx -mtune=sandybridge +else +ifeq ($(AVX2),1) +$(info ===== Compiling AVX2 app version =====) +CPPFLAGS += -mavx2 -mfma -mtune=haswell +else +ifeq ($(AVX512),1) +$(info ===== Compiling AVX512 app version =====) +CPPFLAGS += -march=skylake-avx512 +ifeq ($(MinGW64),1) +# MinGW needs workaround for "invalid register for .seh_savexmm" bug +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782 +CPPFLAGS += \ + -ffixed-xmm16 -ffixed-xmm17 -ffixed-xmm18 -ffixed-xmm19 -ffixed-xmm20 -ffixed-xmm21 -ffixed-xmm22 -ffixed-xmm23 \ + -ffixed-xmm24 -ffixed-xmm25 -ffixed-xmm26 -ffixed-xmm27 -ffixed-xmm28 -ffixed-xmm29 -ffixed-xmm30 -ffixed-xmm31 +endif +endif +endif +endif +endif +endif + +ifeq ($(NOWARN),1) +CPPFLAGS += -w +endif + release: cambala_boinc_app libstdc++.a: - ln -s `g++ -print-file-name=libstdc++.a` - -ttfont.cpp: - ln -s ../../api/ttfont.cpp . + ln -s `${CPP} -print-file-name=libstdc++.a` clean: distclean distclean: - /bin/rm -f $(PROGS) *.o libstdc++.a cambala_boinc_app + /bin/rm -f $(PROGS) *.o libstdc++.a cambala_boinc_app cambala_boinc_app.exe -cambala_boinc_app: cambala_boinc_app.o alglibinternal.o alglibmisc.o ap.o linalg.o specialfunctions.o sequential.o \ +cambala_boinc_app: cambala_boinc_app.o all.o \ libstdc++.a $(BOINC_API_DIR)/libboinc_api.a $(BOINC_LIB_DIR)/libboinc.a - $(CPP) $(CPPFLAGS) cambala_boinc_app.o alglibinternal.o alglibmisc.o ap.o linalg.o specialfunctions.o sequential.o -o cambala_boinc_app \ + $(CPP) $(CPPFLAGS) cambala_boinc_app.o all.o -o cambala_boinc_app \ libstdc++.a -pthread $(BOINC_API_DIR)/libboinc_api.a \ $(BOINC_LIB_DIR)/libboinc.a -specialfunctions.o: $(ALGLIB)/specialfunctions.cpp - ${CPP} ${CPPFLAGS} $(ALGLIB)/specialfunctions.cpp -c - -linalg.o: $(ALGLIB)/linalg.cpp - ${CPP} ${CPPFLAGS} $(ALGLIB)/linalg.cpp -c - -ap.o: $(ALGLIB)/ap.cpp - ${CPP} ${CPPFLAGS} $(ALGLIB)/ap.cpp -c - -alglibmisc.o: $(ALGLIB)/alglibmisc.cpp - ${CPP} ${CPPFLAGS} $(ALGLIB)/alglibmisc.cpp -c - -alglibinternal.o: $(ALGLIB)/alglibinternal.cpp - ${CPP} ${CPPFLAGS} $(ALGLIB)/alglibinternal.cpp -c - -sequential.o: $(CAMBALA)/sequential.cpp - ${CPP} ${CPPFLAGS} $(CAMBALA)/sequential.cpp -c +all.o: all.cpp $(ALGLIB)/specialfunctions.cpp $(ALGLIB)/linalg.cpp $(ALGLIB)/ap.cpp $(ALGLIB)/alglibmisc.cpp $(ALGLIB)/alglibinternal.cpp $(CAMBALA)/sequential.cpp + ${CPP} ${CPPFLAGS} all.cpp -c cambala_boinc_app.o: cambala_boinc_app.cpp ${CPP} ${CPPFLAGS} cambala_boinc_app.cpp -c diff --git a/boinc_app/all.cpp b/boinc_app/all.cpp new file mode 100644 index 0000000..c5e13ff --- /dev/null +++ b/boinc_app/all.cpp @@ -0,0 +1,6 @@ +#include +#include +#include +#include +#include +#include diff --git a/boinc_app/cambala_boinc_app.cpp b/boinc_app/cambala_boinc_app.cpp index fc970ca..90fcb81 100644 --- a/boinc_app/cambala_boinc_app.cpp +++ b/boinc_app/cambala_boinc_app.cpp @@ -32,10 +32,125 @@ #include "utils.h" #include "point.h" +#if defined(__i386__) || defined (__x86_64__) +#include +#endif + #define CHECKPOINT_FILE "chpt" #define INPUT_FILENAME "in" #define OUTPUT_FILENAME "out" +__attribute__((noreturn)) +void PrintFatalError(const char* str) +{ + // print error to unredirected stderr first + fprintf(stderr, "Error: %s", str); + + // now try to send it back to server + int retval = boinc_init(); + if (0 == retval) + { + fprintf(stderr, "Error: %s", str); + boinc_finish(1); + } + + exit(1); +} + +void VerifyCpu() +{ +#if (defined(__i386__) || defined (__x86_64__)) && defined(__SSE2__) + unsigned int a, b, c, d; + + if (!__get_cpuid(1, &a, &b, &c, &d)) + { + PrintFatalError("CPUID instruction is not supported by your CPU!\n"); + } + + if (0 == (d & bit_SSE2)) + { + PrintFatalError("SSE2 instructions are not supported by your CPU!\n"); + } + +#ifdef __SSE4_1__ + if (0 == (c & bit_SSE4_1)) + { + PrintFatalError("SSE4.1 instructions are not supported by your CPU!\n"); + } +#endif + +#ifdef __AVX__ + if (0 == (c & bit_AVX)) + { + PrintFatalError("AVX instructions are not supported by your CPU!\n"); + } + + // AVX also needs OS support, check for it + if (0 == (c & bit_OSXSAVE)) + { + PrintFatalError("OSXSAVE instructions are not supported by your CPU!\n"); + } + + unsigned int eax, edx; + unsigned int ecx = 0; // _XCR_XFEATURE_ENABLED_MASK + __asm__ ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx)); + if (0x6 != (eax & 0x6)) // XSTATE_SSE | XSTATE_YMM + { + PrintFatalError("AVX instructions are not supported by your OS!\n"); + } +#endif + +#ifdef __AVX2__ + if (__get_cpuid_max(0, 0) < 7) + { + PrintFatalError("CPUID level 7 is not supported by your CPU!\n"); + } + + unsigned int a2, b2, c2, d2; + __cpuid_count(7, 0, a2, b2, c2, d2); + + if (0 == (b2 & bit_AVX2)) + { + PrintFatalError("AVX2 instructions are not supported by your CPU!\n"); + } +#endif + +#ifdef __FMA__ + // Some AMD CPUs(s) support FMA but no AVX2. FMA does not provide significat + // boost for this app, so it should be enabled on AVX2 CPUs only. +#ifndef __AVX2__ +#error AVX2 is not enabled! +#endif + if (0 == (c & bit_FMA)) + { + PrintFatalError("FMA instructions are not supported by your CPU!\n"); + } +#endif + +#ifdef __AVX512F__ + // AVX512 consists of few subsets. Skylake-AVX512 target supports F, BW, DQ, VL and CD. + // We need data loaded during AVX and AVX2 checks, make sure we have it +#if !defined(__AVX__) || !defined(__AVX2__) +#error AVX or AVX2 is not enabled! +#endif + + const unsigned int avx512bits = bit_AVX512F | bit_AVX512DQ | bit_AVX512CD | bit_AVX512BW | bit_AVX512VL; + + if (avx512bits != (b2 & avx512bits)) + { + PrintFatalError("AVX512 instructions are not supported by your CPU!\n"); + } + + // AVX512 also needs OS support, check for it + if (0xe6 != (eax & 0xe6)) // XSTATE_SSE | XSTATE_YMM | XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM + { + PrintFatalError("AVX512 instructions are not supported by your OS!\n"); + } +#endif + +#endif +} + using namespace std; char buf[256]; @@ -46,6 +161,8 @@ int do_checkpoint( const long long &total_points, int main(int argc, char **argv) { + VerifyCpu(); + int retval = boinc_init(); if ( retval ) { fprintf(stderr, "%s APP: boinc_init() returned %d\n", @@ -53,7 +170,7 @@ int main(int argc, char **argv) ); exit( retval ); } - + search_space_point cur_record_point; cur_record_point.residual = START_HUGE_VALUE; diff --git a/boinc_app/ac_modes_R7km_dtimes1.txt b/boinc_app/test/ac_modes_R7km_dtimes1.txt similarity index 100% rename from boinc_app/ac_modes_R7km_dtimes1.txt rename to boinc_app/test/ac_modes_R7km_dtimes1.txt diff --git a/boinc_app/test/cambala_depths_out.ref b/boinc_app/test/cambala_depths_out.ref new file mode 100644 index 0000000..746d57f --- /dev/null +++ b/boinc_app/test/cambala_depths_out.ref @@ -0,0 +1 @@ +23 33 50 300 diff --git a/boinc_app/test/cambala_out.ref b/boinc_app/test/cambala_out.ref new file mode 100644 index 0000000..9893165 --- /dev/null +++ b/boinc_app/test/cambala_out.ref @@ -0,0 +1,26 @@ +Input parameters : +launch_type bruteforce +object_function_type uniform +ppm 2 +init_iterated_local_search_runs 10 +cw1_init_arr : +1506 1490 1460 +cw2_init_arr : +1506 1490 1480 +ncpl_init_arr : +1 1 11 +nR 1 +R1 7000 +R2 7000 +ntau 1 +tau1 0 +tau2 0 +nrhob 1 +rhob1 1.7 +rhob2 1.7 +ncb 1 +cb1 1700 +cb2 1700 +dtimes_file ac_modes_R7km_dtimes1.txt +spmag_file no +launch_type bruteforce diff --git a/boinc_app/in b/boinc_app/test/in similarity index 100% rename from boinc_app/in rename to boinc_app/test/in diff --git a/boinc_app/test/out.ref b/boinc_app/test/out.ref new file mode 100644 index 0000000..7f94ec5 --- /dev/null +++ b/boinc_app/test/out.ref @@ -0,0 +1 @@ +0.003945 1700 1.7 7000 0 1506 1490 1462 23 33 50 300 \ No newline at end of file diff --git a/boinc_app/test/test.sh b/boinc_app/test/test.sh new file mode 100755 index 0000000..08c1ec4 --- /dev/null +++ b/boinc_app/test/test.sh @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +rm -f boinc_finish_called cambala_depths_out cambala_out chpt out stderr.txt + +time ../cambala_boinc_app + +echo + +#diff -qs cambala_depths_out cambala_depths_out.ref +#diff -qs cambala_out cambala_out.ref +diff -qs out out.ref From 0810bed595d85b2b7103670078aa4d007648a536 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 13 Jan 2019 16:48:45 +0100 Subject: [PATCH 2/2] Simplified makefile, silenced some warnings from alglib --- boinc_app/Makefile | 24 ++++++------------------ boinc_app/all.cpp | 7 +++++++ 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/boinc_app/Makefile b/boinc_app/Makefile index 5f77c85..c310737 100644 --- a/boinc_app/Makefile +++ b/boinc_app/Makefile @@ -22,37 +22,29 @@ CPPFLAGS = -O3 \ ifeq ($(MinGW32),1) $(info ===== Compiling MinGW 32-bit app version =====) CPP = i686-w64-mingw32-g++ -else -ifeq ($(MinGW64),1) +else ifeq ($(MinGW64),1) $(info ===== Compiling MinGW 64-bit app version =====) CPP = x86_64-w64-mingw32-g++ -else -ifeq ($(M32),1) +else ifeq ($(M32),1) $(info ===== Compiling 32-bit app version =====) CPP = g++ -m32 else CPP = g++ endif -endif -endif ifeq ($(SSE2),1) $(info ===== Compiling SSE2 app version =====) CPPFLAGS += -msse2 -else -ifeq ($(SSE41),1) +else ifeq ($(SSE41),1) $(info ===== Compiling SSE4.1 app version =====) CPPFLAGS += -msse4.1 -else -ifeq ($(AVX),1) +else ifeq ($(AVX),1) $(info ===== Compiling AVX app version =====) CPPFLAGS += -mavx -mtune=sandybridge -else -ifeq ($(AVX2),1) +else ifeq ($(AVX2),1) $(info ===== Compiling AVX2 app version =====) CPPFLAGS += -mavx2 -mfma -mtune=haswell -else -ifeq ($(AVX512),1) +else ifeq ($(AVX512),1) $(info ===== Compiling AVX512 app version =====) CPPFLAGS += -march=skylake-avx512 ifeq ($(MinGW64),1) @@ -63,10 +55,6 @@ CPPFLAGS += \ -ffixed-xmm24 -ffixed-xmm25 -ffixed-xmm26 -ffixed-xmm27 -ffixed-xmm28 -ffixed-xmm29 -ffixed-xmm30 -ffixed-xmm31 endif endif -endif -endif -endif -endif ifeq ($(NOWARN),1) CPPFLAGS += -w diff --git a/boinc_app/all.cpp b/boinc_app/all.cpp index c5e13ff..5bbfcdb 100644 --- a/boinc_app/all.cpp +++ b/boinc_app/all.cpp @@ -1,6 +1,13 @@ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + #include #include #include #include #include + +#pragma GCC diagnostic pop + #include