From bf00c82082da450e26bfe5da2079b17bc143392a Mon Sep 17 00:00:00 2001
From: Alexander Trush
Date: Fri, 12 Jan 2018 03:39:40 +0300
Subject: [PATCH 1/5] SSE2: two options for not using the mask4to1bits array

---
 RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index 1dec526..937a44a 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -89,7 +89,7 @@ MovePairSearch::MovePairSearch()
 // Initialize mask4to1bits lookup table
 void MovePairSearch::InitMask4to1bits()
 {
-#if defined(__SSE2__) && (!defined(__AVX2__) || defined(DISABLE_PEXT))
+#if (!defined(__AVX2__) || defined(DISABLE_PEXT))
   memset(mask4to1bits, 0, sizeof(mask4to1bits));
   mask4to1bits[0x0000] = 0;
   mask4to1bits[0x000f] = 1;
@@ -579,9 +579,15 @@ void MovePairSearch::MoveRows()
       vCol1b = _mm_cmpeq_epi32(vCol1b, _mm_setzero_si128());
       // create mask from vector
       // there are 4 bits per result, so we need to extract every 4th one
-      int mask1 = _mm_movemask_epi8(vCol1a);
-      int mask2 = _mm_movemask_epi8(vCol1b);
-      int mask = mask4to1bits[mask1] | (mask4to1bits[mask2] << 4);
+#if 0
+      __m128i maskAB = _mm_packs_epi32(vCol1a, vCol1b);
+      __m128i maskab = _mm_packs_epi16(maskAB, _mm_setzero_si128());
+      int mask = _mm_movemask_epi8(maskab);
+#else
+      int mask1 = _mm_movemask_ps(_mm_castsi128_ps(vCol1a));
+      int mask2 = _mm_movemask_ps(_mm_castsi128_ps(vCol1b));
+      int mask = mask1 | (mask2 << 4);
+#endif
 
       // add one bit for 0th row, and AND result with rowsUsage
       rowCandidates = (mask << 1) & rowsUsage;

From 6b353493bbf0b54246f1a01522d76497962d9140 Mon Sep 17 00:00:00 2001
From: Alexander Trush
Date: Fri, 12 Jan 2018 05:05:19 +0300
Subject: [PATCH 2/5] SSE2/AVX: Hoist the int32-to-int8 packing ahead of the
 zero test (not necessarily faster, at least on AMD A10-5800)

---
 .../RakeDiagSearch/MovePairSearch.cpp | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index 937a44a..f8171f8 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -574,20 +574,15 @@ void MovePairSearch::MoveRows()
       vCol1a = _mm_or_si128(vCol1a, vCol2a);
       vCol1b = _mm_or_si128(vCol1b, vCol2b);
 
+      // Saturate_Int32_To_Int8()
+      __m128i vColpack = _mm_packs_epi32(vCol1a, vCol1b);
+      vColpack = _mm_packs_epi16(vColpack, _mm_setzero_si128());
+
       // check if result is zero
-      vCol1a = _mm_cmpeq_epi32(vCol1a, _mm_setzero_si128());
-      vCol1b = _mm_cmpeq_epi32(vCol1b, _mm_setzero_si128());
+      __m128i vColzeros = _mm_cmpeq_epi8(vColpack, _mm_setzero_si128());
+
       // create mask from vector
-      // there are 4 bits per result, so we need to extract every 4th one
-#if 0
-      __m128i maskAB = _mm_packs_epi32(vCol1a, vCol1b);
-      __m128i maskab = _mm_packs_epi16(maskAB, _mm_setzero_si128());
-      int mask = _mm_movemask_epi8(maskab);
-#else
-      int mask1 = _mm_movemask_ps(_mm_castsi128_ps(vCol1a));
-      int mask2 = _mm_movemask_ps(_mm_castsi128_ps(vCol1b));
-      int mask = mask1 | (mask2 << 4);
-#endif
+      int mask = _mm_movemask_epi8(vColzeros);
 
       // add one bit for 0th row, and AND result with rowsUsage
       rowCandidates = (mask << 1) & rowsUsage;
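Note (illustrative sketch, not part of the patches): both variants introduced in PATCH 1 and the form kept by PATCH 2 rest on the same observation - each 32-bit "duplicate on a diagonal" word only has to contribute one bit to the row mask, and signed saturation packs a zero word to a zero byte and any non-zero word to a non-zero byte, so the packing and the zero test can be done in either order. A minimal standalone version of the packed form (the helper name and the final & 0xFF are mine; the real code can leave the padding bits alone because the later (mask << 1) & rowsUsage keeps only the meaningful bits):

    #include <emmintrin.h>  // SSE2
    #include <cstdint>

    // Build an 8-bit mask with bit i set when 32-bit word i of a:b is zero.
    // _mm_packs_epi32/_mm_packs_epi16 saturate, so only a zero word packs to a zero byte.
    static inline uint32_t ZeroWordsToBits(__m128i a, __m128i b)
    {
        __m128i packed = _mm_packs_epi32(a, b);                        // 8 x int16
        packed = _mm_packs_epi16(packed, _mm_setzero_si128());         // 8 x int8 + zero padding
        __m128i zeros  = _mm_cmpeq_epi8(packed, _mm_setzero_si128());  // 0xFF where the word was 0
        return (uint32_t)_mm_movemask_epi8(zeros) & 0xFFu;             // one bit per input word
    }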
From 4b8ddb1bc0cd62e35f66cea14fe74f0555e183c5 Mon Sep 17 00:00:00 2001
From: Alexander Troosh
Date: Tue, 16 Jan 2018 03:06:46 -0700
Subject: [PATCH 3/5] Fix build under ARM32

---
 RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp | 2 +-
 RakeDiagSearch/RakeDiagSearch/MovePairSearch.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index f8171f8..96badb0 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -89,7 +89,7 @@ MovePairSearch::MovePairSearch()
 // Initialize mask4to1bits lookup table
 void MovePairSearch::InitMask4to1bits()
 {
-#if (!defined(__AVX2__) || defined(DISABLE_PEXT))
+#if defined(__AVX2__) && defined(DISABLE_PEXT)
   memset(mask4to1bits, 0, sizeof(mask4to1bits));
   mask4to1bits[0x0000] = 0;
   mask4to1bits[0x000f] = 1;
diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
index 91ab4e0..8817fb3 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
@@ -67,7 +67,7 @@ class MovePairSearch
   string moveSearchComponentHeader; // Header preceding the data about the state of the component of rows permutation
 
   static const bool isDebug = false; // Flag of displaying debug information
-#if defined(__SSE2__) && (!defined(__AVX2__) || defined(DISABLE_PEXT))
+#if defined(__AVX2__) && defined(DISABLE_PEXT)
   unsigned char mask4to1bits[0x10000]; // Lookup table to map 4 bit packs returned by movemask to 1 bit
 #endif
 };
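Note (illustrative sketch, not part of the patches): after this fix the 64 KiB mask4to1bits table is declared and initialized only for the AVX2 build with PEXT disabled - the SSE2 path stopped using it in PATCH 2, and ARM builds, which define neither __AVX2__ nor __SSE2__, must not reference the member at all, which is presumably what broke the ARM32 build. The two alternatives the guard chooses between look roughly like this for a 256-bit compare result; these helpers are illustrative only, since the project's actual AVX2 code lies outside the hunks shown here:

    #include <immintrin.h>  // AVX2 + BMI2
    #include <cstdint>

    // _mm256_movemask_epi8() yields 4 identical bits per 32-bit lane; keep one bit per lane.
    static inline uint32_t LanesToBits_pext(__m256i cmp)              // PEXT enabled
    {
        uint32_t m = (uint32_t)_mm256_movemask_epi8(cmp);
        return _pext_u32(m, 0x11111111u);                             // every 4th bit -> 8 bits
    }

    static inline uint32_t LanesToBits_lut(__m256i cmp,
                                           const unsigned char mask4to1bits[0x10000])
    {                                                                 // DISABLE_PEXT fallback
        uint32_t m = (uint32_t)_mm256_movemask_epi8(cmp);
        return (uint32_t)mask4to1bits[m & 0xFFFFu]
             | ((uint32_t)mask4to1bits[m >> 16] << 4);
    }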
From e0d8a8300495a81486710e49094a7ba5a59bfa5b Mon Sep 17 00:00:00 2001
From: Alexander Troosh
Date: Wed, 17 Jan 2018 01:38:57 -0700
Subject: [PATCH 4/5] Reduce size of squareA_MaskT[][]

---
 RakeDiagSearch/RakeDiagSearch/Makefile |  2 +-
 .../RakeDiagSearch/MovePairSearch.cpp  | 50 +++++++++++--------
 .../RakeDiagSearch/MovePairSearch.h    |  2 +-
 3 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/RakeDiagSearch/RakeDiagSearch/Makefile b/RakeDiagSearch/RakeDiagSearch/Makefile
index cd76cea..13da121 100644
--- a/RakeDiagSearch/RakeDiagSearch/Makefile
+++ b/RakeDiagSearch/RakeDiagSearch/Makefile
@@ -4,7 +4,7 @@ BOINC_LIB_DIR = $(BOINC_DIR)/lib
 
 CXX = g++
 
-CXXFLAGS += -O3 -ftree-vectorize -std=c++11 -static-libgcc -static-libstdc++ \
+CXXFLAGS += -O3 -g -ftree-vectorize -std=c++11 -static-libgcc -static-libstdc++ \
   -I$(BOINC_DIR) \
   -I$(BOINC_LIB_DIR) \
   -I$(BOINC_API_DIR) \
diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index 96badb0..a8d4429 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -332,11 +332,17 @@ void MovePairSearch::OnSquareGenerated(Square newSquare)
     {
       squareA[i][j] = newSquare.Matrix[i][j];
       squareA_Mask[i][j] = 1u << newSquare.Matrix[i][j];
+    }
+  }
 #if defined (__SSE2__) || defined(__ARM_NEON)
-      squareA_MaskT[j][i] = squareA_Mask[i][j];
-#endif
+  for (int i = 0; i < Rank - 1; i++)
+  {
+    for (int j = 0; j < Rank; j++)
+    {
+      squareA_MaskT[j][i] = squareA_Mask[i + 1][j];
     }
   }
+#endif
 
   // Start the rows permutation
   MoveRows();
@@ -515,8 +521,8 @@ void MovePairSearch::MoveRows()
       // load bitmasks for columns which will be on diagonals
       // for performance reasons load this as a row from transposed square
       // also excluse 0th element, row 0 has fixed position in square
-      __m256i vCol1 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[currentRowId][1]);
-      __m256i vCol2 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
+      __m256i vCol1 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[currentRowId][0]);
+      __m256i vCol2 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
 
       // AND loaded values with diagnonal masks
       __m256i vDiagMask1 = _mm256_set1_epi32(diagonalValues1);
@@ -555,10 +561,10 @@ void MovePairSearch::MoveRows()
       // load bitmasks for columns which will be on diagonals
       // for performance reasons load this as a row from transposed square
       // also excluse 0th element, row 0 has fixed position in square
-      __m128i vCol1a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][1]);
-      __m128i vCol1b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][5]);
-      __m128i vCol2a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
-      __m128i vCol2b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
+      __m128i vCol1a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][0]);
+      __m128i vCol1b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][4]);
+      __m128i vCol2a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
+      __m128i vCol2b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][4]);
 
       // AND loaded values with diagnonal masks
       __m128i vDiagMask1 = _mm_set1_epi32(diagonalValues1);
@@ -591,10 +597,10 @@ void MovePairSearch::MoveRows()
      // load bitmasks for columns which will be on diagonals
      // for performance reasons load this as a row from transposed square
      // also excluse 0th element, row 0 has fixed position in square
-      uint32x4_t vCol1a = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][1]);
-      uint32x4_t vCol1b = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][5]);
-      uint32x4_t vCol2a = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
-      uint32x4_t vCol2b = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
+      uint32x4_t vCol1a = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][0]);
+      uint32x4_t vCol1b = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][4]);
+      uint32x4_t vCol2a = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
+      uint32x4_t vCol2b = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][4]);
 
       // AND loaded values with diagnonal masks
       uint32x4_t vDiagMask1 = vdupq_n_u32(diagonalValues1);
@@ -624,15 +630,15 @@ void MovePairSearch::MoveRows()
      // load bitmasks for columns which will be on diagonals
      // for performance reasons load this as a row from transposed square
      // also excluse 0th element, row 0 has fixed position in square
-      uint32x2_t vCol1a = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][1]);
-      uint32x2_t vCol1b = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][3]);
-      uint32x2_t vCol1c = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][5]);
-      uint32x2_t vCol1d = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][7]);
-
-      uint32x2_t vCol2a = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
-      uint32x2_t vCol2b = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][3]);
-      uint32x2_t vCol2c = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
-      uint32x2_t vCol2d = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][7]);
+      uint32x2_t vCol1a = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][0]);
+      uint32x2_t vCol1b = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][2]);
+      uint32x2_t vCol1c = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][4]);
+      uint32x2_t vCol1d = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][6]);
+
+      uint32x2_t vCol2a = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
+      uint32x2_t vCol2b = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][2]);
+      uint32x2_t vCol2c = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][4]);
+      uint32x2_t vCol2d = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][6]);
 
       // AND loaded values with diagnonal masks
       uint32x2_t vDiagMask1 = vdup_n_u32(diagonalValues1);
@@ -642,7 +648,7 @@ void MovePairSearch::MoveRows()
       vCol1b = vand_u32(vCol1b, vDiagMask1);
       vCol1c = vand_u32(vCol1c, vDiagMask1);
       vCol1d = vand_u32(vCol1d, vDiagMask1);
-      
+
       vCol2a = vand_u32(vCol2a, vDiagMask2);
       vCol2b = vand_u32(vCol2b, vDiagMask2);
       vCol2c = vand_u32(vCol2c, vDiagMask2);
diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
index 8817fb3..eb49641 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
@@ -43,7 +43,7 @@ class MovePairSearch
   int squareB[Rank][Rank]; // Generated DLS, the rows inside which will be permuted
   int squareA_Mask[Rank][Rank]; // Bitmasks for values in squareA
 #if defined (__SSE2__) || defined(__ARM_NEON)
-  int squareA_MaskT[Rank][Rank]; // Transposed copy of squareA_Mask
+  int squareA_MaskT[Rank][Rank - 1]; // Transposed copy of squareA_Mask
 #endif
   int rowsHistory[Rank]; // Array of the history of rows usage; rowsHistory[number of the row][value] = 0 | 1, where 0 means the row with the number "value" has been used for the row "number of the row" of the generated square; 1 - the row can be used.
   int currentSquareRows[Rank]; // Array listing the current rows used in the square. The number of the used row is at the i-th position
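Note (illustrative sketch, not part of the patches): the layout change is the core of this patch - row 0 of the square never moves during the permutation, so its masks are never loaded, and dropping it from the transposed copy leaves each row of squareA_MaskT with exactly Rank - 1 contiguous entries that the vector code can load starting at index 0 instead of 1. Condensed, with Rank written out as a constant (9 is an assumption here, implied by the single 8-element AVX2 load covering all movable rows):

    constexpr int Rank = 9;            // assumed; Rank - 1 == 8 matches one AVX2 load
    int squareA_Mask [Rank][Rank];     // declarations mirror MovePairSearch.h
    int squareA_MaskT[Rank][Rank - 1]; // transposed copy without the fixed row 0

    void FillTransposedMasks()
    {
        // Row i+1 of squareA_Mask becomes entry i of each squareA_MaskT row, so
        // squareA_MaskT[j] holds the masks of rows 1..Rank-1 of column j contiguously.
        for (int i = 0; i < Rank - 1; i++)
            for (int j = 0; j < Rank; j++)
                squareA_MaskT[j][i] = squareA_Mask[i + 1][j];
    }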
From 346608c6c963e78ba9ed5f0e032b81743ee34319 Mon Sep 17 00:00:00 2001
From: Alexander Troosh
Date: Wed, 17 Jan 2018 03:40:25 -0700
Subject: [PATCH 5/5] Unify the 32-bit and 64-bit code paths for ARM processors

---
 .../RakeDiagSearch/MovePairSearch.cpp | 66 ++-----------------
 1 file changed, 6 insertions(+), 60 deletions(-)

diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index a8d4429..6178344 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -442,15 +442,8 @@ void MovePairSearch::MoveRows()
 #ifdef __ARM_NEON
   // Set the powers of 2
   const uint32_t powersOf2[8] = { 1, 2, 4, 8, 16, 32, 64, 128 };
-#ifdef __aarch64__
   const uint32x4_t vPowersOf2Lo = vld1q_u32(powersOf2);
   const uint32x4_t vPowersOf2Hi = vld1q_u32(powersOf2+4);
-#else
-  const uint32x2_t vPowersOf2_1 = vld1_u32(powersOf2);
-  const uint32x2_t vPowersOf2_2 = vld1_u32(powersOf2+2);
-  const uint32x2_t vPowersOf2_3 = vld1_u32(powersOf2+4);
-  const uint32x2_t vPowersOf2_4 = vld1_u32(powersOf2+6);
-#endif
 #endif
 
   while (1)
@@ -593,7 +586,6 @@ void MovePairSearch::MoveRows()
       // add one bit for 0th row, and AND result with rowsUsage
       rowCandidates = (mask << 1) & rowsUsage;
 #elif defined(__ARM_NEON)
-#ifdef __aarch64__
       // load bitmasks for columns which will be on diagonals
       // for performance reasons load this as a row from transposed square
      // also excluse 0th element, row 0 has fixed position in square
@@ -622,62 +614,16 @@ void MovePairSearch::MoveRows()
 
       // create mask from vector
       uint32x4_t v = vorrq_u32(vandq_u32(vCol1a, vPowersOf2Lo), vandq_u32(vCol1b, vPowersOf2Hi));
+#ifdef __aarch64__
       uint32_t mask = vaddvq_u64(vpaddlq_u32(v));
+#else
+      uint32x2_t s = vmovn_u64(vpaddlq_u32(v));
+      uint32_t mask = s[0] + s[1];
+#endif
 
       // add one bit for 0th row, and AND result with rowsUsage
       rowCandidates = (mask << 1) & rowsUsage;
-#else // !__aarch64__
-      // load bitmasks for columns which will be on diagonals
-      // for performance reasons load this as a row from transposed square
-      // also excluse 0th element, row 0 has fixed position in square
-      uint32x2_t vCol1a = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][0]);
-      uint32x2_t vCol1b = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][2]);
-      uint32x2_t vCol1c = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][4]);
-      uint32x2_t vCol1d = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][6]);
-
-      uint32x2_t vCol2a = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
-      uint32x2_t vCol2b = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][2]);
-      uint32x2_t vCol2c = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][4]);
-      uint32x2_t vCol2d = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][6]);
-
-      // AND loaded values with diagnonal masks
-      uint32x2_t vDiagMask1 = vdup_n_u32(diagonalValues1);
-      uint32x2_t vDiagMask2 = vdup_n_u32(diagonalValues2);
-
-      vCol1a = vand_u32(vCol1a, vDiagMask1);
-      vCol1b = vand_u32(vCol1b, vDiagMask1);
-      vCol1c = vand_u32(vCol1c, vDiagMask1);
-      vCol1d = vand_u32(vCol1d, vDiagMask1);
-
-      vCol2a = vand_u32(vCol2a, vDiagMask2);
-      vCol2b = vand_u32(vCol2b, vDiagMask2);
-      vCol2c = vand_u32(vCol2c, vDiagMask2);
-      vCol2d = vand_u32(vCol2d, vDiagMask2);
-
-      // non-zero means that number is duplicated, zero means that it is unique
-      // OR these values together first
-      vCol1a = vorr_u32(vCol1a, vCol2a);
-      vCol1b = vorr_u32(vCol1b, vCol2b);
-      vCol1c = vorr_u32(vCol1c, vCol2c);
-      vCol1d = vorr_u32(vCol1d, vCol2d);
-
-      // check if result is zero
-      vCol1a = vceq_u32(vCol1a, vdup_n_u32(0));
-      vCol1b = vceq_u32(vCol1b, vdup_n_u32(0));
-      vCol1c = vceq_u32(vCol1c, vdup_n_u32(0));
-      vCol1d = vceq_u32(vCol1d, vdup_n_u32(0));
-
-      // create mask from vector
-      uint32x2_t v = vorr_u32(
-        vorr_u32(vand_u32(vCol1a, vPowersOf2_1), vand_u32(vCol1b, vPowersOf2_2)),
-        vorr_u32(vand_u32(vCol1c, vPowersOf2_3), vand_u32(vCol1d, vPowersOf2_4)));
-      //uint32_t mask = vaddv_u32(v);
-      uint32_t mask = v[0] + v[1];
-
-      // add one bit for 0th row, and AND result with rowsUsage
-      rowCandidates = (mask << 1) & rowsUsage;
-#endif
-#endif // AVX2/SSE2
+#endif // AVX2/SSE2/NEON
     }
   }
 }
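Note (illustrative sketch, not part of the patches): with this final patch both ARM flavours share the 128-bit NEON path - the per-lane zero-test results are ANDed with {1,2,4,8} and {16,32,64,128}, ORed together, and the row mask is simply the horizontal sum of the four remaining lanes (all set bits are distinct, so the sum equals the OR). Only that last reduction still differs, because 32-bit NEON has no across-vector add. An equivalent way to write the reduction (my helper; the patch itself uses vpaddlq_u32 + vaddvq_u64 on AArch64 and vmovn_u64 plus a scalar add on ARM32):

    #include <arm_neon.h>
    #include <cstdint>

    // Each lane of v holds an OR of distinct power-of-two bits (or 0), so the
    // lane sum fits in 8 bits and equals the OR of all lanes.
    static inline uint32_t HorizontalAddU32(uint32x4_t v)
    {
    #ifdef __aarch64__
        return vaddvq_u32(v);                                        // across-vector add
    #else
        uint32x2_t s = vadd_u32(vget_low_u32(v), vget_high_u32(v));  // 4 lanes -> 2
        return vget_lane_u32(vpadd_u32(s, s), 0);                    // 2 lanes -> 1
    #endif
    }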