From bf00c82082da450e26bfe5da2079b17bc143392a Mon Sep 17 00:00:00 2001
From: Alexander Trush
Date: Fri, 12 Jan 2018 03:39:40 +0300
Subject: [PATCH 1/5] SSE2: two options for not using the mask4to1bits array

---
 RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index 1dec526..937a44a 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -89,7 +89,7 @@ MovePairSearch::MovePairSearch()
 // Initialize mask4to1bits lookup table
 void MovePairSearch::InitMask4to1bits()
 {
-#if defined(__SSE2__) && (!defined(__AVX2__) || defined(DISABLE_PEXT))
+#if (!defined(__AVX2__) || defined(DISABLE_PEXT))
   memset(mask4to1bits, 0, sizeof(mask4to1bits));
   mask4to1bits[0x0000] = 0;
   mask4to1bits[0x000f] = 1;
@@ -579,9 +579,15 @@ void MovePairSearch::MoveRows()
       vCol1b = _mm_cmpeq_epi32(vCol1b, _mm_setzero_si128());
       // create mask from vector
       // there are 4 bits per result, so we need to extract every 4th one
-      int mask1 = _mm_movemask_epi8(vCol1a);
-      int mask2 = _mm_movemask_epi8(vCol1b);
-      int mask = mask4to1bits[mask1] | (mask4to1bits[mask2] << 4);
+#if 0
+      __m128i maskAB = _mm_packs_epi32(vCol1a, vCol1b);
+      __m128i maskab = _mm_packs_epi16(maskAB, _mm_setzero_si128());
+      int mask = _mm_movemask_epi8(maskab);
+#else
+      int mask1 = _mm_movemask_ps(_mm_castsi128_ps(vCol1a));
+      int mask2 = _mm_movemask_ps(_mm_castsi128_ps(vCol1b));
+      int mask = mask1 | (mask2 << 4);
+#endif
 
       // add one bit for 0th row, and AND result with rowsUsage
       rowCandidates = (mask << 1) & rowsUsage;

From 6b353493bbf0b54246f1a01522d76497962d9140 Mon Sep 17 00:00:00 2001
From: Alexander Trush
Date: Fri, 12 Jan 2018 05:05:19 +0300
Subject: [PATCH 2/5] SSE2/AVX: Hoist the int32-to-int8 packing ahead of the
 zero test (not necessarily faster, at least on AMD A10-5800)

---
 .../RakeDiagSearch/MovePairSearch.cpp | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index 937a44a..f8171f8 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -574,20 +574,15 @@ void MovePairSearch::MoveRows()
       vCol1a = _mm_or_si128(vCol1a, vCol2a);
       vCol1b = _mm_or_si128(vCol1b, vCol2b);
 
+      // Saturate_Int32_To_Int8()
+      __m128i vColpack = _mm_packs_epi32(vCol1a, vCol1b);
+      vColpack = _mm_packs_epi16(vColpack, _mm_setzero_si128());
+
       // check if result is zero
-      vCol1a = _mm_cmpeq_epi32(vCol1a, _mm_setzero_si128());
-      vCol1b = _mm_cmpeq_epi32(vCol1b, _mm_setzero_si128());
+      __m128i vColzeros = _mm_cmpeq_epi8(vColpack, _mm_setzero_si128());
+
       // create mask from vector
-      // there are 4 bits per result, so we need to extract every 4th one
-#if 0
-      __m128i maskAB = _mm_packs_epi32(vCol1a, vCol1b);
-      __m128i maskab = _mm_packs_epi16(maskAB, _mm_setzero_si128());
-      int mask = _mm_movemask_epi8(maskab);
-#else
-      int mask1 = _mm_movemask_ps(_mm_castsi128_ps(vCol1a));
-      int mask2 = _mm_movemask_ps(_mm_castsi128_ps(vCol1b));
-      int mask = mask1 | (mask2 << 4);
-#endif
+      int mask = _mm_movemask_epi8(vColzeros);
 
       // add one bit for 0th row, and AND result with rowsUsage
       rowCandidates = (mask << 1) & rowsUsage;
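Note (illustrative sketch, not part of the patches): both variants introduced in PATCH 1 and the form kept by PATCH 2 rest on the same observation - each 32-bit "duplicate on a diagonal" word only has to contribute one bit to the row mask, and signed saturation packs a zero word to a zero byte and any non-zero word to a non-zero byte, so the packing and the zero test can be done in either order. A minimal standalone version of the packed form (the helper name and the final & 0xFF are mine; the real code can leave the padding bits alone because the later (mask << 1) & rowsUsage keeps only the meaningful bits):

    #include <emmintrin.h>  // SSE2
    #include <cstdint>

    // Build an 8-bit mask with bit i set when 32-bit word i of a:b is zero.
    // _mm_packs_epi32/_mm_packs_epi16 saturate, so only a zero word packs to a zero byte.
    static inline uint32_t ZeroWordsToBits(__m128i a, __m128i b)
    {
        __m128i packed = _mm_packs_epi32(a, b);                        // 8 x int16
        packed = _mm_packs_epi16(packed, _mm_setzero_si128());         // 8 x int8 + zero padding
        __m128i zeros  = _mm_cmpeq_epi8(packed, _mm_setzero_si128());  // 0xFF where the word was 0
        return (uint32_t)_mm_movemask_epi8(zeros) & 0xFFu;             // one bit per input word
    }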
From 4b8ddb1bc0cd62e35f66cea14fe74f0555e183c5 Mon Sep 17 00:00:00 2001
From: Alexander Troosh
Date: Tue, 16 Jan 2018 03:06:46 -0700
Subject: [PATCH 3/5] Fix build under ARM32

---
 RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp | 2 +-
 RakeDiagSearch/RakeDiagSearch/MovePairSearch.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index f8171f8..96badb0 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -89,7 +89,7 @@ MovePairSearch::MovePairSearch()
 // Initialize mask4to1bits lookup table
 void MovePairSearch::InitMask4to1bits()
 {
-#if (!defined(__AVX2__) || defined(DISABLE_PEXT))
+#if defined(__AVX2__) && defined(DISABLE_PEXT)
   memset(mask4to1bits, 0, sizeof(mask4to1bits));
   mask4to1bits[0x0000] = 0;
   mask4to1bits[0x000f] = 1;
diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
index 91ab4e0..8817fb3 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
@@ -67,7 +67,7 @@ class MovePairSearch
   string moveSearchComponentHeader; // Header preceding the data about the state of the component of rows permutation
 
   static const bool isDebug = false; // Flag of displaying debug information
-#if defined(__SSE2__) && (!defined(__AVX2__) || defined(DISABLE_PEXT))
+#if defined(__AVX2__) && defined(DISABLE_PEXT)
   unsigned char mask4to1bits[0x10000]; // Lookup table to map 4 bit packs returned by movemask to 1 bit
 #endif
 };
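Note (illustrative sketch, not part of the patches): after this fix the 64 KiB mask4to1bits table is declared and initialized only for the AVX2 build with PEXT disabled - the SSE2 path stopped using it in PATCH 2, and ARM builds, which define neither __AVX2__ nor __SSE2__, must not reference the member at all, which is presumably what broke the ARM32 build. The two alternatives the guard chooses between look roughly like this for a 256-bit compare result; these helpers are illustrative only, since the project's actual AVX2 code lies outside the hunks shown here:

    #include <immintrin.h>  // AVX2 + BMI2
    #include <cstdint>

    // _mm256_movemask_epi8() yields 4 identical bits per 32-bit lane; keep one bit per lane.
    static inline uint32_t LanesToBits_pext(__m256i cmp)              // PEXT enabled
    {
        uint32_t m = (uint32_t)_mm256_movemask_epi8(cmp);
        return _pext_u32(m, 0x11111111u);                             // every 4th bit -> 8 bits
    }

    static inline uint32_t LanesToBits_lut(__m256i cmp,
                                           const unsigned char mask4to1bits[0x10000])
    {                                                                 // DISABLE_PEXT fallback
        uint32_t m = (uint32_t)_mm256_movemask_epi8(cmp);
        return (uint32_t)mask4to1bits[m & 0xFFFFu]
             | ((uint32_t)mask4to1bits[m >> 16] << 4);
    }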
From e0d8a8300495a81486710e49094a7ba5a59bfa5b Mon Sep 17 00:00:00 2001
From: Alexander Troosh
Date: Wed, 17 Jan 2018 01:38:57 -0700
Subject: [PATCH 4/5] Reduce size of squareA_MaskT[][]

---
 RakeDiagSearch/RakeDiagSearch/Makefile |  2 +-
 .../RakeDiagSearch/MovePairSearch.cpp  | 50 +++++++++++--------
 .../RakeDiagSearch/MovePairSearch.h    |  2 +-
 3 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/RakeDiagSearch/RakeDiagSearch/Makefile b/RakeDiagSearch/RakeDiagSearch/Makefile
index cd76cea..13da121 100644
--- a/RakeDiagSearch/RakeDiagSearch/Makefile
+++ b/RakeDiagSearch/RakeDiagSearch/Makefile
@@ -4,7 +4,7 @@ BOINC_LIB_DIR = $(BOINC_DIR)/lib
 
 CXX = g++
 
-CXXFLAGS += -O3 -ftree-vectorize -std=c++11 -static-libgcc -static-libstdc++ \
+CXXFLAGS += -O3 -g -ftree-vectorize -std=c++11 -static-libgcc -static-libstdc++ \
   -I$(BOINC_DIR) \
   -I$(BOINC_LIB_DIR) \
   -I$(BOINC_API_DIR) \
diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index 96badb0..a8d4429 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -332,11 +332,17 @@ void MovePairSearch::OnSquareGenerated(Square newSquare)
     {
       squareA[i][j] = newSquare.Matrix[i][j];
       squareA_Mask[i][j] = 1u << newSquare.Matrix[i][j];
+    }
+  }
 #if defined (__SSE2__) || defined(__ARM_NEON)
-      squareA_MaskT[j][i] = squareA_Mask[i][j];
-#endif
+  for (int i = 0; i < Rank - 1; i++)
+  {
+    for (int j = 0; j < Rank; j++)
+    {
+      squareA_MaskT[j][i] = squareA_Mask[i + 1][j];
     }
   }
+#endif
 
   // Start the rows permutation
   MoveRows();
@@ -515,8 +521,8 @@ void MovePairSearch::MoveRows()
       // load bitmasks for columns which will be on diagonals
       // for performance reasons load this as a row from transposed square
       // also excluse 0th element, row 0 has fixed position in square
-      __m256i vCol1 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[currentRowId][1]);
-      __m256i vCol2 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
+      __m256i vCol1 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[currentRowId][0]);
+      __m256i vCol2 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
 
       // AND loaded values with diagnonal masks
       __m256i vDiagMask1 = _mm256_set1_epi32(diagonalValues1);
@@ -555,10 +561,10 @@ void MovePairSearch::MoveRows()
       // load bitmasks for columns which will be on diagonals
       // for performance reasons load this as a row from transposed square
       // also excluse 0th element, row 0 has fixed position in square
-      __m128i vCol1a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][1]);
-      __m128i vCol1b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][5]);
-      __m128i vCol2a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
-      __m128i vCol2b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
+      __m128i vCol1a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][0]);
+      __m128i vCol1b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][4]);
+      __m128i vCol2a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
+      __m128i vCol2b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][4]);
 
       // AND loaded values with diagnonal masks
       __m128i vDiagMask1 = _mm_set1_epi32(diagonalValues1);
@@ -591,10 +597,10 @@ void MovePairSearch::MoveRows()
      // load bitmasks for columns which will be on diagonals
      // for performance reasons load this as a row from transposed square
      // also excluse 0th element, row 0 has fixed position in square
-      uint32x4_t vCol1a = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][1]);
-      uint32x4_t vCol1b = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][5]);
-      uint32x4_t vCol2a = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
-      uint32x4_t vCol2b = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
+      uint32x4_t vCol1a = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][0]);
+      uint32x4_t vCol1b = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][4]);
+      uint32x4_t vCol2a = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
+      uint32x4_t vCol2b = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][4]);
 
       // AND loaded values with diagnonal masks
       uint32x4_t vDiagMask1 = vdupq_n_u32(diagonalValues1);
@@ -624,15 +630,15 @@ void MovePairSearch::MoveRows()
      // load bitmasks for columns which will be on diagonals
      // for performance reasons load this as a row from transposed square
      // also excluse 0th element, row 0 has fixed position in square
-      uint32x2_t vCol1a = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][1]);
-      uint32x2_t vCol1b = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][3]);
-      uint32x2_t vCol1c = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][5]);
-      uint32x2_t vCol1d = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][7]);
-
-      uint32x2_t vCol2a = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
-      uint32x2_t vCol2b = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][3]);
-      uint32x2_t vCol2c = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
-      uint32x2_t vCol2d = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][7]);
+      uint32x2_t vCol1a = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][0]);
+      uint32x2_t vCol1b = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][2]);
+      uint32x2_t vCol1c = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][4]);
+      uint32x2_t vCol1d = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][6]);
+
+      uint32x2_t vCol2a = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
+      uint32x2_t vCol2b = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][2]);
+      uint32x2_t vCol2c = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][4]);
+      uint32x2_t vCol2d = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][6]);
 
       // AND loaded values with diagnonal masks
       uint32x2_t vDiagMask1 = vdup_n_u32(diagonalValues1);
@@ -642,7 +648,7 @@ void MovePairSearch::MoveRows()
       vCol1b = vand_u32(vCol1b, vDiagMask1);
       vCol1c = vand_u32(vCol1c, vDiagMask1);
       vCol1d = vand_u32(vCol1d, vDiagMask1);
-      
+
       vCol2a = vand_u32(vCol2a, vDiagMask2);
       vCol2b = vand_u32(vCol2b, vDiagMask2);
       vCol2c = vand_u32(vCol2c, vDiagMask2);
diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
index 8817fb3..eb49641 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
@@ -43,7 +43,7 @@ class MovePairSearch
   int squareB[Rank][Rank]; // Generated DLS, the rows inside which will be permuted
   int squareA_Mask[Rank][Rank]; // Bitmasks for values in squareA
 #if defined (__SSE2__) || defined(__ARM_NEON)
-  int squareA_MaskT[Rank][Rank]; // Transposed copy of squareA_Mask
+  int squareA_MaskT[Rank][Rank - 1]; // Transposed copy of squareA_Mask
 #endif
   int rowsHistory[Rank]; // Array of the history of rows usage; rowsHistory[number of the row][value] = 0 | 1, where 0 means the row with the number "value" has been used for the row "number of the row" of the generated square; 1 - the row can be used.
   int currentSquareRows[Rank]; // Array listing the current rows used in the square. The number of the used row is at the i-th position
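Note (illustrative sketch, not part of the patches): the layout change is the core of this patch - row 0 of the square never moves during the permutation, so its masks are never loaded, and dropping it from the transposed copy leaves each row of squareA_MaskT with exactly Rank - 1 contiguous entries that the vector code can load starting at index 0 instead of 1. Condensed, with Rank written out as a constant (9 is an assumption here, implied by the single 8-element AVX2 load covering all movable rows):

    constexpr int Rank = 9;            // assumed; Rank - 1 == 8 matches one AVX2 load
    int squareA_Mask [Rank][Rank];     // declarations mirror MovePairSearch.h
    int squareA_MaskT[Rank][Rank - 1]; // transposed copy without the fixed row 0

    void FillTransposedMasks()
    {
        // Row i+1 of squareA_Mask becomes entry i of each squareA_MaskT row, so
        // squareA_MaskT[j] holds the masks of rows 1..Rank-1 of column j contiguously.
        for (int i = 0; i < Rank - 1; i++)
            for (int j = 0; j < Rank; j++)
                squareA_MaskT[j][i] = squareA_Mask[i + 1][j];
    }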
From 346608c6c963e78ba9ed5f0e032b81743ee34319 Mon Sep 17 00:00:00 2001
From: Alexander Troosh
Date: Wed, 17 Jan 2018 03:40:25 -0700
Subject: [PATCH 5/5] Unify the 32-bit and 64-bit code paths for ARM processors

---
 .../RakeDiagSearch/MovePairSearch.cpp | 66 ++-----------------
 1 file changed, 6 insertions(+), 60 deletions(-)

diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index a8d4429..6178344 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -442,15 +442,8 @@ void MovePairSearch::MoveRows()
 #ifdef __ARM_NEON
   // Set the powers of 2
   const uint32_t powersOf2[8] = { 1, 2, 4, 8, 16, 32, 64, 128 };
-#ifdef __aarch64__
   const uint32x4_t vPowersOf2Lo = vld1q_u32(powersOf2);
   const uint32x4_t vPowersOf2Hi = vld1q_u32(powersOf2+4);
-#else
-  const uint32x2_t vPowersOf2_1 = vld1_u32(powersOf2);
-  const uint32x2_t vPowersOf2_2 = vld1_u32(powersOf2+2);
-  const uint32x2_t vPowersOf2_3 = vld1_u32(powersOf2+4);
-  const uint32x2_t vPowersOf2_4 = vld1_u32(powersOf2+6);
-#endif
 #endif
 
   while (1)
@@ -593,7 +586,6 @@ void MovePairSearch::MoveRows()
       // add one bit for 0th row, and AND result with rowsUsage
       rowCandidates = (mask << 1) & rowsUsage;
 #elif defined(__ARM_NEON)
-#ifdef __aarch64__
       // load bitmasks for columns which will be on diagonals
       // for performance reasons load this as a row from transposed square
      // also excluse 0th element, row 0 has fixed position in square
@@ -622,62 +614,16 @@ void MovePairSearch::MoveRows()
 
       // create mask from vector
       uint32x4_t v = vorrq_u32(vandq_u32(vCol1a, vPowersOf2Lo), vandq_u32(vCol1b, vPowersOf2Hi));
+#ifdef __aarch64__
       uint32_t mask = vaddvq_u64(vpaddlq_u32(v));
+#else
+      uint32x2_t s = vmovn_u64(vpaddlq_u32(v));
+      uint32_t mask = s[0] + s[1];
+#endif
 
       // add one bit for 0th row, and AND result with rowsUsage
       rowCandidates = (mask << 1) & rowsUsage;
-#else // !__aarch64__
-      // load bitmasks for columns which will be on diagonals
-      // for performance reasons load this as a row from transposed square
-      // also excluse 0th element, row 0 has fixed position in square
-      uint32x2_t vCol1a = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][0]);
-      uint32x2_t vCol1b = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][2]);
-      uint32x2_t vCol1c = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][4]);
-      uint32x2_t vCol1d = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][6]);
-
-      uint32x2_t vCol2a = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
-      uint32x2_t vCol2b = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][2]);
-      uint32x2_t vCol2c = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][4]);
-      uint32x2_t vCol2d = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][6]);
-
-      // AND loaded values with diagnonal masks
-      uint32x2_t vDiagMask1 = vdup_n_u32(diagonalValues1);
-      uint32x2_t vDiagMask2 = vdup_n_u32(diagonalValues2);
-
-      vCol1a = vand_u32(vCol1a, vDiagMask1);
-      vCol1b = vand_u32(vCol1b, vDiagMask1);
-      vCol1c = vand_u32(vCol1c, vDiagMask1);
-      vCol1d = vand_u32(vCol1d, vDiagMask1);
-
-      vCol2a = vand_u32(vCol2a, vDiagMask2);
-      vCol2b = vand_u32(vCol2b, vDiagMask2);
-      vCol2c = vand_u32(vCol2c, vDiagMask2);
-      vCol2d = vand_u32(vCol2d, vDiagMask2);
-
-      // non-zero means that number is duplicated, zero means that it is unique
-      // OR these values together first
-      vCol1a = vorr_u32(vCol1a, vCol2a);
-      vCol1b = vorr_u32(vCol1b, vCol2b);
-      vCol1c = vorr_u32(vCol1c, vCol2c);
-      vCol1d = vorr_u32(vCol1d, vCol2d);
-
-      // check if result is zero
-      vCol1a = vceq_u32(vCol1a, vdup_n_u32(0));
-      vCol1b = vceq_u32(vCol1b, vdup_n_u32(0));
-      vCol1c = vceq_u32(vCol1c, vdup_n_u32(0));
-      vCol1d = vceq_u32(vCol1d, vdup_n_u32(0));
-
-      // create mask from vector
-      uint32x2_t v = vorr_u32(
-        vorr_u32(vand_u32(vCol1a, vPowersOf2_1), vand_u32(vCol1b, vPowersOf2_2)),
-        vorr_u32(vand_u32(vCol1c, vPowersOf2_3), vand_u32(vCol1d, vPowersOf2_4)));
-      //uint32_t mask = vaddv_u32(v);
-      uint32_t mask = v[0] + v[1];
-
-      // add one bit for 0th row, and AND result with rowsUsage
-      rowCandidates = (mask << 1) & rowsUsage;
-#endif
-#endif // AVX2/SSE2
+#endif // AVX2/SSE2/NEON
     }
   }
 }
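Note (illustrative sketch, not part of the patches): with this final patch both ARM flavours share the 128-bit NEON path - the per-lane zero-test results are ANDed with {1,2,4,8} and {16,32,64,128}, ORed together, and the row mask is simply the horizontal sum of the four remaining lanes (all set bits are distinct, so the sum equals the OR). Only that last reduction still differs, because 32-bit NEON has no across-vector add. An equivalent way to write the reduction (my helper; the patch itself uses vpaddlq_u32 + vaddvq_u64 on AArch64 and vmovn_u64 plus a scalar add on ARM32):

    #include <arm_neon.h>
    #include <cstdint>

    // Each lane of v holds an OR of distinct power-of-two bits (or 0), so the
    // lane sum fits in 8 bits and equals the OR of all lanes.
    static inline uint32_t HorizontalAddU32(uint32x4_t v)
    {
    #ifdef __aarch64__
        return vaddvq_u32(v);                                        // across-vector add
    #else
        uint32x2_t s = vadd_u32(vget_low_u32(v), vget_high_u32(v));  // 4 lanes -> 2
        return vget_lane_u32(vpadd_u32(s, s), 0);                    // 2 lanes -> 1
    #endif
    }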