diff --git a/algorithm.c b/algorithm.c
index 2f543556..ad56d833 100644
--- a/algorithm.c
+++ b/algorithm.c
@@ -1147,6 +1147,7 @@ static cl_int queue_cryptonight_kernel(_clState *clState, dev_blk_ctx *blk, __ma
}
+#define WORKSIZE clState->wsize
static cl_int queue_equihash_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
@@ -1160,43 +1161,43 @@ static cl_int queue_equihash_kernel(_clState *clState, dev_blk_ctx *blk, __maybe
uint32_t dbg[2] = {0};
status |= clEnqueueWriteBuffer(clState->commandQueue, clState->padbuffer8, CL_TRUE, 0, sizeof(dbg), &dbg, 0, NULL, NULL);
- cl_mem buf_ht[2] = {clState->CLbuffer0, clState->buffer1};
cl_mem rowCounters[2] = {clState->buffer2, clState->buffer3};
for (int round = 0; round < PARAM_K; round++) {
- size_t global_ws = NR_ROWS / ROWS_PER_UINT;
+ size_t global_ws = RC_SIZE;
size_t local_ws = 256;
unsigned int num = 0;
cl_kernel *kernel = &clState->extra_kernels[0];
// Now on every round!!!!
- CL_SET_ARG(buf_ht[round % 2]);
+ CL_SET_ARG(clState->index_buf[round]);
CL_SET_ARG(rowCounters[round % 2]);
+ CL_SET_ARG(clState->outputBuffer);
+ CL_SET_ARG(clState->CLbuffer0);
status |= clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL);
- num = 0;
kernel = &clState->extra_kernels[1 + round];
if (!round) {
- CL_SET_ARG(clState->MidstateBuf);
- CL_SET_ARG(buf_ht[round % 2]);
- CL_SET_ARG(rowCounters[round % 2]);
- work_items = threads;
+ worksize = LOCAL_WORK_SIZE_ROUND0;
+ work_items = NR_INPUTS / ROUND0_INPUTS_PER_WORK_ITEM;
}
else {
- CL_SET_ARG(buf_ht[(round - 1) % 2]);
- CL_SET_ARG(buf_ht[round % 2]);
- CL_SET_ARG(rowCounters[(round - 1) % 2]);
- CL_SET_ARG(rowCounters[round % 2]);
- work_items = NR_ROWS;
+ worksize = LOCAL_WORK_SIZE;
+ work_items = NR_ROWS * worksize;
}
- CL_SET_ARG(clState->padbuffer8);
- if (round == PARAM_K - 1)
- CL_SET_ARG(clState->outputBuffer);
- status |= clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, &work_items, &worksize, 0, NULL, NULL);
+ status |= clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[1 + round], 1, NULL, &work_items, &worksize, 0, NULL, NULL);
}
- work_items = NR_ROWS;
+
+ worksize = LOCAL_WORK_SIZE_POTENTIAL_SOLS;
+ work_items = NR_ROWS * worksize;
+ status |= clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[1 + 9], 1, NULL, &work_items, &worksize, 0, NULL, NULL);
+
+ worksize = LOCAL_WORK_SIZE_SOLS;
+ work_items = MAX_POTENTIAL_SOLS * worksize;
status |= clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, NULL, &work_items, &worksize, 0, NULL, NULL);
return status;
}
+#undef WORKSIZE
+
static algorithm_settings_t algos[] = {
// kernels starting from this will have difficulty calculated by using litecoin algorithm
diff --git a/algorithm/equihash.c b/algorithm/equihash.c
index 1f6a6b4b..35a32efd 100644
--- a/algorithm/equihash.c
+++ b/algorithm/equihash.c
@@ -34,7 +34,7 @@ static const uint8_t blake2b_sigma[12][16] = {
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
-
+
static const uint64_t blake2b_IV[8] = {
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
@@ -43,9 +43,9 @@ static const uint64_t blake2b_IV[8] = {
};
static const uint64_t blake2b_h[8] = {
- 0x6a09e667f2bdc93aULL, 0xbb67ae8584caa73bULL,
+ 0x6a09e667f2bdc93aULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
- 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+ 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x48ec89c38820de31ULL, 0x5be0cd10137e21b1ULL
};
@@ -75,8 +75,8 @@ static const uint64_t blake2b_h[8] = {
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
-
-
+
+
#define G_fast(r,i,a,b,c,d) \
a = a + b + (blake2b_sigma[r][2*i] == 1 ? m1 : 0); \
d = rotr64(d ^ a, 32); \
@@ -99,134 +99,133 @@ static const uint64_t blake2b_h[8] = {
void equihash_calc_mid_hash(uint64_t mid_hash[8], uint8_t* header) {
- uint64_t v[16], *m = (uint64_t*) header;
- for (int i = 0; i < 8; i++) {
- v[i] = blake2b_h[i];
- v[i+8] = blake2b_IV[i];
- }
- v[12] ^= 128;
- for (int r = 0; r < 12; r++) {
- ROUND(r)
- }
- for (int i = 0; i < 8; i++)
- mid_hash[i] = blake2b_h[i] ^ v[i] ^ v[i+8];
+ uint64_t v[16], *m = (uint64_t*)header;
+ for (int i = 0; i < 8; i++) {
+ v[i] = blake2b_h[i];
+ v[i + 8] = blake2b_IV[i];
+ }
+ v[12] ^= 128;
+ for (int r = 0; r < 12; r++) {
+ ROUND(r)
+ }
+ for (int i = 0; i < 8; i++)
+ mid_hash[i] = blake2b_h[i] ^ v[i] ^ v[i + 8];
}
void blake2b_hash(uint8_t *hash, uint64_t mid_hash[8], uint32_t bday) {
- uint64_t v[16], tmp[8];
- uint64_t m1 = (uint64_t) bday << 32;
- for (int i = 0; i < 8; i++) {
- v[i] = mid_hash[i];
- v[i+8] = blake2b_IV[i];
- }
- v[12] ^= 140 + sizeof(bday);
- v[14] ^= (int64_t) -1;
- for (int r = 0; r < 12; r++) {
- ROUND_fast(r)
- }
- for (int i = 0; i < 8; i++)
- tmp[i] = mid_hash[i] ^ v[i] ^ v[i+8];
- memcpy(hash, tmp, 50);
+ uint64_t v[16], tmp[8];
+ uint64_t m1 = (uint64_t)bday << 32;
+ for (int i = 0; i < 8; i++) {
+ v[i] = mid_hash[i];
+ v[i + 8] = blake2b_IV[i];
+ }
+ v[12] ^= 140 + sizeof(bday);
+ v[14] ^= (int64_t)-1;
+ for (int r = 0; r < 12; r++) {
+ ROUND_fast(r)
+ }
+ for (int i = 0; i < 8; i++)
+ tmp[i] = mid_hash[i] ^ v[i] ^ v[i + 8];
+ memcpy(hash, tmp, 50);
}
void equihash_calc_hash(uint8_t hash[25], uint64_t mid_hash[8], uint32_t bday) {
- uint8_t tmp[50];
- blake2b_hash(tmp, mid_hash, bday/2);
- memcpy(hash, tmp + (bday & 1 ? 25 : 0), 25);
+ uint8_t tmp[50];
+ blake2b_hash(tmp, mid_hash, bday / 2);
+ memcpy(hash, tmp + (bday & 1 ? 25 : 0), 25);
}
// These two copied from the ref impl, for now.
void ExpandArray(const unsigned char* in, size_t in_len,
- unsigned char* out, size_t out_len,
- size_t bit_len)
+ unsigned char* out, size_t out_len,
+ size_t bit_len)
{
- size_t byte_pad = 0;
- size_t out_width = ((bit_len+7)/8 + byte_pad);
- uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1);
-
- // The acc_bits least-significant bits of acc_value represent a bit sequence
- // in big-endian order.
- size_t acc_bits = 0;
- uint32_t acc_value = 0;
-
- size_t j = 0;
- for (size_t i = 0; i < in_len; i++) {
- acc_value = (acc_value << 8) | in[i];
- acc_bits += 8;
-
- // When we have bit_len or more bits in the accumulator, write the next
- // output element.
- if (acc_bits >= bit_len) {
- acc_bits -= bit_len;
- for (size_t x = 0; x < byte_pad; x++) {
- out[j+x] = 0;
- }
- for (size_t x = byte_pad; x < out_width; x++) {
- out[j+x] = (
- // Big-endian
- acc_value >> (acc_bits+(8*(out_width-x-1)))
- ) & (
- // Apply bit_len_mask across byte boundaries
- (bit_len_mask >> (8*(out_width-x-1))) & 0xFF
- );
- }
- j += out_width;
+ size_t byte_pad = 0;
+ size_t out_width = ((bit_len + 7) / 8 + byte_pad);
+ uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1);
+
+ // The acc_bits least-significant bits of acc_value represent a bit sequence
+ // in big-endian order.
+ size_t acc_bits = 0;
+ uint32_t acc_value = 0;
+
+ size_t j = 0;
+ for (size_t i = 0; i < in_len; i++) {
+ acc_value = (acc_value << 8) | in[i];
+ acc_bits += 8;
+
+ // When we have bit_len or more bits in the accumulator, write the next
+ // output element.
+ if (acc_bits >= bit_len) {
+ acc_bits -= bit_len;
+ for (size_t x = 0; x < byte_pad; x++) {
+ out[j + x] = 0;
+ }
+ for (size_t x = byte_pad; x < out_width; x++) {
+ out[j + x] = (
+ // Big-endian
+ acc_value >> (acc_bits + (8 * (out_width - x - 1)))
+ ) & (
+ // Apply bit_len_mask across byte boundaries
+ (bit_len_mask >> (8 * (out_width - x - 1))) & 0xFF
+ );
+ }
+ j += out_width;
+ }
}
- }
}
void CompressArray(const unsigned char* in, size_t in_len,
- unsigned char* out, size_t out_len,
- size_t bit_len, size_t byte_pad)
+ unsigned char* out, size_t out_len,
+ size_t bit_len, size_t byte_pad)
{
- size_t in_width = ((bit_len+7)/8 + byte_pad);
- uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1);
-
- // The acc_bits least-significant bits of acc_value represent a bit sequence
- // in big-endian order.
- size_t acc_bits = 0;
- uint32_t acc_value = 0;
-
- size_t j = 0;
- for (size_t i = 0; i < out_len; i++) {
- // When we have fewer than 8 bits left in the accumulator, read the next
- // input element.
- if (acc_bits < 8) {
- acc_value = acc_value << bit_len;
- for (size_t x = byte_pad; x < in_width; x++) {
- acc_value = acc_value | (
- (
- // Apply bit_len_mask across byte boundaries
- in[j+x] & ((bit_len_mask >> (8*(in_width-x-1))) & 0xFF)
- ) << (8*(in_width-x-1))
- ); // Big-endian
- }
- j += in_width;
- acc_bits += bit_len;
+ size_t in_width = ((bit_len + 7) / 8 + byte_pad);
+ uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1);
+
+ // The acc_bits least-significant bits of acc_value represent a bit sequence
+ // in big-endian order.
+ size_t acc_bits = 0;
+ uint32_t acc_value = 0;
+
+ size_t j = 0;
+ for (size_t i = 0; i < out_len; i++) {
+ // When we have fewer than 8 bits left in the accumulator, read the next
+ // input element.
+ if (acc_bits < 8) {
+ acc_value = acc_value << bit_len;
+ for (size_t x = byte_pad; x < in_width; x++) {
+ acc_value = acc_value | (
+ (
+ // Apply bit_len_mask across byte boundaries
+ in[j + x] & ((bit_len_mask >> (8 * (in_width - x - 1))) & 0xFF)
+ ) << (8 * (in_width - x - 1))
+ ); // Big-endian
+ }
+ j += in_width;
+ acc_bits += bit_len;
+ }
+
+ acc_bits -= 8;
+ out[i] = (acc_value >> acc_bits) & 0xFF;
}
-
- acc_bits -= 8;
- out[i] = (acc_value >> acc_bits) & 0xFF;
- }
}
static inline void sort_pair(uint32_t *a, uint32_t len)
{
- uint32_t *b = a + len;
- uint32_t tmp, need_sorting = 0;
- for (uint32_t i = 0; i < len; i++) {
- if (need_sorting || a[i] > b[i]) {
- need_sorting = 1;
- tmp = a[i];
- a[i] = b[i];
- b[i] = tmp;
+ uint32_t *b = a + len;
+ uint32_t tmp, need_sorting = 0;
+ for (uint32_t i = 0; i < len; i++) {
+ if (need_sorting || a[i] > b[i]) {
+ need_sorting = 1;
+ tmp = a[i];
+ a[i] = b[i];
+ b[i] = tmp;
+ } else if (a[i] < b[i])
+ break;
}
- else if (a[i] < b[i])
- break;
- }
}
@@ -234,49 +233,49 @@ bool submit_tested_work(struct thr_info *, struct work *);
uint32_t equihash_verify_sol(struct work *work, sols_t *sols, int sol_i)
{
- uint32_t thr_id = work->thr->id;
- uint32_t *inputs = sols->values[sol_i];
- uint32_t seen_len = (1 << (PREFIX + 1)) / 8;
- uint8_t seen[seen_len];
- uint32_t i;
- uint8_t tmp;
- // look for duplicate inputs
- memset(seen, 0, seen_len);
- for (i = 0; i < (1 << PARAM_K); i++) {
-
- if (inputs[i] / 8 >= seen_len) {
- sols->valid[sol_i] = 0;
- return 0;
+ uint32_t thr_id = work->thr->id;
+ uint32_t *inputs = sols->values[sol_i];
+ uint32_t seen_len = (1 << (PREFIX + 1)) / 8;
+ uint8_t seen[(1 << (PREFIX + 1)) / 8];
+ uint32_t i;
+ uint8_t tmp;
+ // look for duplicate inputs
+ memset(seen, 0, seen_len);
+ for (i = 0; i < (1 << PARAM_K); i++) {
+
+ if (inputs[i] / 8 >= seen_len) {
+ sols->valid[sol_i] = 0;
+ return 0;
+ }
+ tmp = seen[inputs[i] / 8];
+ seen[inputs[i] / 8] |= 1 << (inputs[i] & 7);
+ if (tmp == seen[inputs[i] / 8]) {
+ // at least one input value is a duplicate
+ sols->valid[sol_i] = 0;
+ return 0;
+ }
}
- tmp = seen[inputs[i] / 8];
- seen[inputs[i] / 8] |= 1 << (inputs[i] & 7);
- if (tmp == seen[inputs[i] / 8]) {
- // at least one input value is a duplicate
- sols->valid[sol_i] = 0;
- return 0;
+ // the valid flag is already set by the GPU, but set it again because
+ // I plan to change the GPU code to not set it
+ sols->valid[sol_i] = 1;
+ // sort the pairs in place
+ for (uint32_t level = 0; level < PARAM_K; level++) {
+ for (i = 0; i < (1 << PARAM_K); i += (2 << level)) {
+ sort_pair(&inputs[i], 1 << level);
+ }
}
- }
- // the valid flag is already set by the GPU, but set it again because
- // I plan to change the GPU code to not set it
- sols->valid[sol_i] = 1;
- // sort the pairs in place
- for (uint32_t level = 0; level < PARAM_K; level++) {
- for (i = 0; i < (1 << PARAM_K); i += (2 << level)) {
- sort_pair(&inputs[i], 1 << level);
+
+ for (i = 0; i < (1 << PARAM_K); i++)
+ inputs[i] = htobe32(inputs[i]);
+
+ CompressArray((unsigned char*)inputs, 512 * 4, work->equihash_data + 143, 1344, 21, 1);
+
+ gen_hash(work->equihash_data, 1344 + 143, work->hash);
+
+ if (*(uint64_t*)(work->hash + 24) < *(uint64_t*)(work->target + 24)) {
+ submit_tested_work(work->thr, work);
}
- }
-
- for (i = 0; i < (1 << PARAM_K); i++)
- inputs[i] = htobe32(inputs[i]);
-
- CompressArray((unsigned char*) inputs, 512*4, work->equihash_data + 143, 1344, 21, 1);
-
- gen_hash(work->equihash_data, 1344 + 143, work->hash);
-
- if (*(uint64_t*) (work->hash + 24) < *(uint64_t*) (work->target + 24)) {
- submit_tested_work(work->thr, work);
- }
- return 1;
+ return 1;
}
void equihash_regenhash(struct work *work)
diff --git a/algorithm/ethash.c b/algorithm/ethash.c
index b9606d54..c4eb97a1 100644
--- a/algorithm/ethash.c
+++ b/algorithm/ethash.c
@@ -29,7 +29,7 @@ uint32_t EthCalcEpochNumber(uint8_t *SeedHash)
uint8_t TestSeedHash[32] = { 0 };
for(int Epoch = 0; Epoch < 2048; ++Epoch) {
- SHA3_256(TestSeedHash, TestSeedHash, 32);
+ SHA3_256((struct ethash_h256 *)TestSeedHash, TestSeedHash, 32);
if(!memcmp(TestSeedHash, SeedHash, 32)) return(Epoch + 1);
}
@@ -61,6 +61,10 @@ Node CalcDAGItem(const Node *CacheInputNodes, uint32_t NodeCount, uint32_t NodeI
return DAGNode;
}
+#ifdef _MSC_VER
+#define restrict
+#endif
+
// OutHash & MixHash MUST have 32 bytes allocated (at least)
void LightEthash(uint8_t *restrict OutHash, uint8_t *restrict MixHash, const uint8_t *restrict HeaderPoWHash, const Node *Cache, const uint64_t EpochNumber, const uint64_t Nonce)
{
@@ -116,7 +120,7 @@ void ethash_regenhash(struct work *work)
work->Nonce += *((uint32_t *)(work->data + 32));
applog(LOG_DEBUG, "Regenhash: First qword of input: 0x%016llX.", work->Nonce);
cg_rlock(&work->pool->data_lock);
- LightEthash(work->hash, work->mixhash, work->data, work->pool->eth_cache.dag_cache, work->eth_epoch, work->Nonce);
+ LightEthash(work->hash, work->mixhash, work->data, (Node *)work->pool->eth_cache.dag_cache, work->eth_epoch, work->Nonce);
cg_runlock(&work->pool->data_lock);
char *DbgHash = bin2hex(work->hash, 32);
diff --git a/algorithm/yescryptcommon.c b/algorithm/yescryptcommon.c
index cf7067d0..841a7838 100644
--- a/algorithm/yescryptcommon.c
+++ b/algorithm/yescryptcommon.c
@@ -320,6 +320,10 @@ yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p,
buf, sizeof(buf));
}
+#ifdef _MSC_VER
+#define __thread __declspec(thread)
+#endif
+
static int
yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
diff --git a/driver-opencl.c b/driver-opencl.c
index 0250833d..c60a7c47 100644
--- a/driver-opencl.c
+++ b/driver-opencl.c
@@ -1646,6 +1646,16 @@ static void opencl_thread_shutdown(struct thr_info *thr)
clReleaseMemObject(clState->buffer3);
if (clState->padbuffer8)
clReleaseMemObject(clState->padbuffer8);
+ for (i = 0; i < 9; i++)
+ if (clState->index_buf[i])
+ clReleaseMemObject(clState->index_buf[i]);
+ for (i = 0; i < 4; i++)
+ if (clState->BranchBuffer[i])
+ clReleaseMemObject(clState->BranchBuffer[i]);
+ if (clState->Scratchpads)
+ clReleaseMemObject(clState->Scratchpads);
+ if (clState->States)
+ clReleaseMemObject(clState->States);
clReleaseKernel(clState->kernel);
for (i = 0; i < clState->n_extra_kernels; i++)
clReleaseKernel(clState->extra_kernels[i]);
diff --git a/kernel/equihash-param.h b/kernel/equihash-param.h
index ace80692..8969a366 100644
--- a/kernel/equihash-param.h
+++ b/kernel/equihash-param.h
@@ -1,113 +1,404 @@
-#ifndef __OPENCL_VERSION__
-#define uint uint32_t
-#define uchar uint8_t
-#endif
-
-
-#define PARAM_N 200
-#define PARAM_K 9
-#define PREFIX (PARAM_N / (PARAM_K + 1))
-#define NR_INPUTS (1 << PREFIX)
-// Approximate log base 2 of number of elements in hash tables
-#define APX_NR_ELMS_LOG (PREFIX + 1)
-// Number of rows and slots is affected by this. 20 offers the best performance
-// but occasionally misses ~1% of solutions.
-#define NR_ROWS_LOG 18
-
-// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md
-#define OPTIM_SIMPLIFY_ROUND 1
-
-// Number of collision items to track, per thread
-#define THREADS_PER_ROW 8
-#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW))
-
-// Ratio of time of sleeping before rechecking if task is done (0-1)
-#define SLEEP_RECHECK_RATIO 0.60
-// Ratio of time to busy wait for the solution (0-1)
-// The higher value the higher CPU usage with Nvidia
-#define SLEEP_SKIP_RATIO 0.005
-
-// Make hash tables OVERHEAD times larger than necessary to store the average
-// number of elements per row. The ideal value is as small as possible to
-// reduce memory usage, but not too small or else elements are dropped from the
-// hash tables.
-//
-// The actual number of elements per row is closer to the theoretical average
-// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be
-// smaller.
-//
-// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
-// performance as they cause VRAM channel conflicts.
-#if NR_ROWS_LOG == 16
-// #error "NR_ROWS_LOG = 16 is currently broken - do not use"
-#define OVERHEAD 2
-#elif NR_ROWS_LOG == 18
-#define OVERHEAD 3
-#elif NR_ROWS_LOG == 19
-#define OVERHEAD 5
-#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND
-#define OVERHEAD 6
-#elif NR_ROWS_LOG == 20
-#define OVERHEAD 9
-#endif
-
-#define NR_ROWS (1 << NR_ROWS_LOG)
-#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD))
-// Length of 1 element (slot) in byte
-#define SLOT_LEN 32
-// Total size of hash table
-#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
-// Length of Zcash block header, nonce (part of header)
-#define ZCASH_BLOCK_HEADER_LEN 140
-// Offset of nTime in header
-#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
-// Length of nonce
-#define ZCASH_NONCE_LEN 32
-// Length of encoded representation of solution size
-#define ZCASH_SOLSIZE_LEN 3
-// Solution size (1344 = 0x540) represented as a compact integer, in hex
-#define ZCASH_SOLSIZE_HEX "fd4005"
-// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
-#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
-// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
-#define N_ZERO_BYTES 12
-// Number of bytes Zcash needs out of Blake
-#define ZCASH_HASH_LEN 50
-// Number of wavefronts per SIMD for the Blake kernel.
-// Blake is ALU-bound (beside the atomic counter being incremented) so we need
-// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
-// instructions. 10 is the max supported by the hw.
-#define BLAKE_WPS 10
-// Maximum number of solutions reported by kernel to host
-#define MAX_SOLS 10
-// Length of SHA256 target
-#define SHA256_TARGET_LEN (256 / 8)
-
-#if (NR_SLOTS < 16)
-#define BITS_PER_ROW 4
-#define ROWS_PER_UINT 8
-#define ROW_MASK 0x0F
-#else
-#define BITS_PER_ROW 8
-#define ROWS_PER_UINT 4
-#define ROW_MASK 0xFF
-#endif
-
-// Optional features
-#undef ENABLE_DEBUG
-
-/*
-** Return the offset of Xi in bytes from the beginning of the slot.
-*/
-#define xi_offset_for_round(round) (8 + ((round) / 2) * 4)
-
-// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
-#define SOL_SIZE ((1 << PARAM_K) * 4)
-typedef struct sols_s
-{
- uint nr;
- uint likely_invalids;
- uchar valid[MAX_SOLS];
- uint values[MAX_SOLS][(1 << PARAM_K)];
-} sols_t;
-
+// Gateless Gate, a Zcash miner
+// Copyright 2016 zawawa @ bitcointalk.org
+//
+// The initial version of this software was based on:
+// SILENTARMY v5
+// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil
+//
+// This program is free software : you can redistribute it and / or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+
+#ifndef __OPENCL_VERSION__
+#define uint uint32_t
+#define uchar uint8_t
+#endif
+#ifdef cl_amd_fp64
+#define AMD
+#endif
+#if (defined(__Tahiti__) || defined(__Pitcairn__) || defined(__Capeverde__) || defined(__Oland__)) && !defined(AMD_LEGACY)
+#define AMD_LEGACY
+#endif
+#ifdef cl_nv_pragma_unroll
+#define NVIDIA
+#endif
+//#define ENABLE_DEBUG
+
+
+
+//
+// Parameters for Hash Tables
+//
+
+// There are PARAM_K - 1 hash tables, and each hash table has NR_ROWS rows.
+// Each row contains NR_SLOTS slots.
+
+#define NR_ROWS_LOG 12 /* 12, 13, 14, 15, or 16. */
+#define NR_SLOTS 684
+
+#define LDS_COLL_SIZE (NR_SLOTS * 67 / 100)
+
+#define LOCAL_WORK_SIZE WORKSIZE
+#define LOCAL_WORK_SIZE_SOLS WORKSIZE
+#define LOCAL_WORK_SIZE_ROUND0 WORKSIZE
+#define LOCAL_WORK_SIZE_POTENTIAL_SOLS WORKSIZE
+
+#define ROUND0_INPUTS_PER_WORK_ITEM 1
+
+#if defined(AMD)
+#define THREADS_PER_WRITE(round) (((round) <= 5) ? 2 : 1)
+#else
+#define THREADS_PER_WRITE(round) 1
+#endif
+
+#if defined(AMD) && !defined(AMD_LEGACY)
+#define OPTIM_24BYTE_WRITES
+#endif
+#define OPTIM_16BYTE_WRITES
+#if !defined(AMD_LEGACY)
+#define OPTIM_8BYTE_WRITES
+#endif
+
+//#define OPTIM_FAST_INTEGER_DIVISION
+//#define OPTIM_COMPACT_ROW_COUNTERS
+
+#define ADJUSTED_LDS_ARRAY_SIZE(n) (n)
+
+
+
+#define PARAM_N 200
+#define PARAM_K 9
+#define PREFIX (PARAM_N / (PARAM_K + 1))
+#define NR_INPUTS (1 << PREFIX)
+#define NR_ROWS (1 << NR_ROWS_LOG)
+// Length of 1 element (slot) in byte
+#define SLOT_LEN 32
+#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN - 16)
+// Total size of hash table
+#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
+// Length of Zcash block header, nonce (part of header)
+#define ZCASH_BLOCK_HEADER_LEN 140
+// Offset of nTime in header
+#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
+// Length of nonce
+#define ZCASH_NONCE_LEN 32
+// Length of encoded representation of solution size
+#define ZCASH_SOLSIZE_LEN 3
+// Solution size (1344 = 0x540) represented as a compact integer, in hex
+#define ZCASH_SOLSIZE_HEX "fd4005"
+// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
+#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
+// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
+#define N_ZERO_BYTES 12
+// Number of bytes Zcash needs out of Blake
+#define ZCASH_HASH_LEN 50
+// Number of wavefronts per SIMD for the Blake kernel.
+// Blake is ALU-bound (beside the atomic counter being incremented) so we need
+// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
+// instructions. 10 is the max supported by the hw.
+#define BLAKE_WPS 10
+// Maximum number of solutions reported by kernel to host
+#define MAX_SOLS 11
+#define MAX_POTENTIAL_SOLS 4096
+// Length of SHA256 target
+#define SHA256_TARGET_LEN (256 / 8)
+
+#ifdef OPTIM_COMPACT_ROW_COUNTERS
+#define BITS_PER_ROW ((NR_SLOTS < 3) ? 2 : \
+ (NR_SLOTS < 7) ? 3 : \
+ (NR_SLOTS < 15) ? 4 : \
+ (NR_SLOTS < 31) ? 5 : \
+ (NR_SLOTS < 63) ? 6 : \
+ (NR_SLOTS < 255) ? 8 : \
+ (NR_SLOTS < 1023) ? 10 : \
+ 16)
+#else
+#define BITS_PER_ROW ((NR_SLOTS < 3) ? 2 : \
+ (NR_SLOTS < 15) ? 4 : \
+ (NR_SLOTS < 255) ? 8 : \
+ 16)
+#endif
+#define ROWS_PER_UINT (32 / BITS_PER_ROW)
+#define ROW_MASK ((1 << BITS_PER_ROW) - 1)
+
+
+#define RC_SIZE ((NR_ROWS * 4 + ROWS_PER_UINT - 1) / ROWS_PER_UINT)
+
+
+
+// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
+#define SOL_SIZE ((1 << PARAM_K) * 4)
+typedef struct sols_s
+{
+ uint nr;
+ uint likely_invalids;
+ uchar valid[MAX_SOLS];
+ uint values[MAX_SOLS][(1 << PARAM_K)];
+} sols_t;
+
+typedef struct potential_sols_s
+{
+ uint nr;
+ uint values[MAX_POTENTIAL_SOLS][2];
+} potential_sols_t;
+
+#if NR_ROWS_LOG <= 12 && NR_SLOTS <= (1 << 10)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 20) | ((slot1 & 0x3ff) << 10) | (slot0 & 0x3ff))
+#define DECODE_ROW(REF) (REF >> 20)
+#define DECODE_SLOT1(REF) ((REF >> 10) & 0x3ff)
+#define DECODE_SLOT0(REF) (REF & 0x3ff)
+
+#elif NR_ROWS_LOG <= 14 && NR_SLOTS <= (1 << 9)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 18) | ((slot1 & 0x1ff) << 9) | (slot0 & 0x1ff))
+#define DECODE_ROW(REF) (REF >> 18)
+#define DECODE_SLOT1(REF) ((REF >> 9) & 0x1ff)
+#define DECODE_SLOT0(REF) (REF & 0x1ff)
+
+#elif NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
+#define DECODE_ROW(REF) (REF >> 16)
+#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
+#define DECODE_SLOT0(REF) (REF & 0xff)
+
+#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
+#define DECODE_ROW(REF) (REF >> 14)
+#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
+#define DECODE_SLOT0(REF) (REF & 0x7f)
+
+#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
+#define DECODE_ROW(REF) (REF >> 13)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
+#define DECODE_ROW(REF) (REF >> 12)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#else
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+#define NEXT_PRIME_NO(n) \
+ (((n) <= 2) ? 2 : \
+ ((n) <= 3) ? 3 : \
+ ((n) <= 5) ? 5 : \
+ ((n) <= 7) ? 7 : \
+ ((n) <= 11) ? 11 : \
+ ((n) <= 13) ? 13 : \
+ ((n) <= 17) ? 17 : \
+ ((n) <= 19) ? 19 : \
+ ((n) <= 23) ? 23 : \
+ ((n) <= 29) ? 29 : \
+ ((n) <= 31) ? 31 : \
+ ((n) <= 37) ? 37 : \
+ ((n) <= 41) ? 41 : \
+ ((n) <= 43) ? 43 : \
+ ((n) <= 47) ? 47 : \
+ ((n) <= 53) ? 53 : \
+ ((n) <= 59) ? 59 : \
+ ((n) <= 61) ? 61 : \
+ ((n) <= 67) ? 67 : \
+ ((n) <= 71) ? 71 : \
+ ((n) <= 73) ? 73 : \
+ ((n) <= 79) ? 79 : \
+ ((n) <= 83) ? 83 : \
+ ((n) <= 89) ? 89 : \
+ ((n) <= 97) ? 97 : \
+ ((n) <= 101) ? 101 : \
+ ((n) <= 103) ? 103 : \
+ ((n) <= 107) ? 107 : \
+ ((n) <= 109) ? 109 : \
+ ((n) <= 113) ? 113 : \
+ ((n) <= 127) ? 127 : \
+ ((n) <= 131) ? 131 : \
+ ((n) <= 137) ? 137 : \
+ ((n) <= 139) ? 139 : \
+ ((n) <= 149) ? 149 : \
+ ((n) <= 151) ? 151 : \
+ ((n) <= 157) ? 157 : \
+ ((n) <= 163) ? 163 : \
+ ((n) <= 167) ? 167 : \
+ ((n) <= 173) ? 173 : \
+ ((n) <= 179) ? 179 : \
+ ((n) <= 181) ? 181 : \
+ ((n) <= 191) ? 191 : \
+ ((n) <= 193) ? 193 : \
+ ((n) <= 197) ? 197 : \
+ ((n) <= 199) ? 199 : \
+ ((n) <= 211) ? 211 : \
+ ((n) <= 223) ? 223 : \
+ ((n) <= 227) ? 227 : \
+ ((n) <= 229) ? 229 : \
+ ((n) <= 233) ? 233 : \
+ ((n) <= 239) ? 239 : \
+ ((n) <= 241) ? 241 : \
+ ((n) <= 251) ? 251 : \
+ ((n) <= 257) ? 257 : \
+ ((n) <= 263) ? 263 : \
+ ((n) <= 269) ? 269 : \
+ ((n) <= 271) ? 271 : \
+ ((n) <= 277) ? 277 : \
+ ((n) <= 281) ? 281 : \
+ ((n) <= 283) ? 283 : \
+ ((n) <= 293) ? 293 : \
+ ((n) <= 307) ? 307 : \
+ ((n) <= 311) ? 311 : \
+ ((n) <= 313) ? 313 : \
+ ((n) <= 317) ? 317 : \
+ ((n) <= 331) ? 331 : \
+ ((n) <= 337) ? 337 : \
+ ((n) <= 347) ? 347 : \
+ ((n) <= 349) ? 349 : \
+ ((n) <= 353) ? 353 : \
+ ((n) <= 359) ? 359 : \
+ ((n) <= 367) ? 367 : \
+ ((n) <= 373) ? 373 : \
+ ((n) <= 379) ? 379 : \
+ ((n) <= 383) ? 383 : \
+ ((n) <= 389) ? 389 : \
+ ((n) <= 397) ? 397 : \
+ ((n) <= 401) ? 401 : \
+ ((n) <= 409) ? 409 : \
+ ((n) <= 419) ? 419 : \
+ ((n) <= 421) ? 421 : \
+ ((n) <= 431) ? 431 : \
+ ((n) <= 433) ? 433 : \
+ ((n) <= 439) ? 439 : \
+ ((n) <= 443) ? 443 : \
+ ((n) <= 449) ? 449 : \
+ ((n) <= 457) ? 457 : \
+ ((n) <= 461) ? 461 : \
+ ((n) <= 463) ? 463 : \
+ ((n) <= 467) ? 467 : \
+ ((n) <= 479) ? 479 : \
+ ((n) <= 487) ? 487 : \
+ ((n) <= 491) ? 491 : \
+ ((n) <= 499) ? 499 : \
+ ((n) <= 503) ? 503 : \
+ ((n) <= 509) ? 509 : \
+ ((n) <= 521) ? 521 : \
+ ((n) <= 523) ? 523 : \
+ ((n) <= 541) ? 541 : \
+ ((n) <= 547) ? 547 : \
+ ((n) <= 557) ? 557 : \
+ ((n) <= 563) ? 563 : \
+ ((n) <= 569) ? 569 : \
+ ((n) <= 571) ? 571 : \
+ ((n) <= 577) ? 577 : \
+ ((n) <= 587) ? 587 : \
+ ((n) <= 593) ? 593 : \
+ ((n) <= 599) ? 599 : \
+ ((n) <= 601) ? 601 : \
+ ((n) <= 607) ? 607 : \
+ ((n) <= 613) ? 613 : \
+ ((n) <= 617) ? 617 : \
+ ((n) <= 619) ? 619 : \
+ ((n) <= 631) ? 631 : \
+ ((n) <= 641) ? 641 : \
+ ((n) <= 643) ? 643 : \
+ ((n) <= 647) ? 647 : \
+ ((n) <= 653) ? 653 : \
+ ((n) <= 659) ? 659 : \
+ ((n) <= 661) ? 661 : \
+ ((n) <= 673) ? 673 : \
+ ((n) <= 677) ? 677 : \
+ ((n) <= 683) ? 683 : \
+ ((n) <= 691) ? 691 : \
+ ((n) <= 701) ? 701 : \
+ ((n) <= 709) ? 709 : \
+ ((n) <= 719) ? 719 : \
+ ((n) <= 727) ? 727 : \
+ ((n) <= 733) ? 733 : \
+ ((n) <= 739) ? 739 : \
+ ((n) <= 743) ? 743 : \
+ ((n) <= 751) ? 751 : \
+ ((n) <= 757) ? 757 : \
+ ((n) <= 761) ? 761 : \
+ ((n) <= 769) ? 769 : \
+ ((n) <= 773) ? 773 : \
+ ((n) <= 787) ? 787 : \
+ ((n) <= 797) ? 797 : \
+ ((n) <= 809) ? 809 : \
+ ((n) <= 811) ? 811 : \
+ ((n) <= 821) ? 821 : \
+ ((n) <= 823) ? 823 : \
+ ((n) <= 827) ? 827 : \
+ ((n) <= 829) ? 829 : \
+ ((n) <= 839) ? 839 : \
+ ((n) <= 853) ? 853 : \
+ ((n) <= 857) ? 857 : \
+ ((n) <= 859) ? 859 : \
+ ((n) <= 863) ? 863 : \
+ ((n) <= 877) ? 877 : \
+ ((n) <= 881) ? 881 : \
+ ((n) <= 883) ? 883 : \
+ ((n) <= 887) ? 887 : \
+ ((n) <= 907) ? 907 : \
+ ((n) <= 911) ? 911 : \
+ ((n) <= 919) ? 919 : \
+ ((n) <= 929) ? 929 : \
+ ((n) <= 937) ? 937 : \
+ ((n) <= 941) ? 941 : \
+ ((n) <= 947) ? 947 : \
+ ((n) <= 953) ? 953 : \
+ ((n) <= 967) ? 967 : \
+ ((n) <= 971) ? 971 : \
+ ((n) <= 977) ? 977 : \
+ ((n) <= 983) ? 983 : \
+ ((n) <= 991) ? 991 : \
+ ((n) <= 997) ? 997 : \
+ ((n) <= 1009) ? 1009 : \
+ (n))
+
+#define NEXT_POWER_OF_TWO(n) \
+ (((n) <= 2) ? 2 : \
+ ((n) <= 4) ? 4 : \
+ ((n) <= 8) ? 8 : \
+ ((n) <= 16) ? 16 : \
+ ((n) <= 32) ? 32 : \
+ ((n) <= 64) ? 64 : \
+ ((n) <= 128) ? 128 : \
+ ((n) <= 256) ? 256 : \
+ ((n) <= 512) ? 512 : \
+ ((n) <= 1024) ? 1024 : \
+ ((n) <= 2048) ? 2048 : \
+ ((n) <= 4096) ? 4096 : \
+ ((n) <= 8192) ? 8192 : \
+ ((n) <= 16384) ? 16384 : \
+ ((n) <= 32768) ? 32768 : \
+ (n))
+
+#if NR_SLOTS < 255
+#define SLOT_INDEX_TYPE uchar
+#elif NR_SLOTS < 65535
+#define SLOT_INDEX_TYPE ushort
+#else
+#error "Unsupported NR_SLOTS"
+#endif
diff --git a/kernel/equihash.cl b/kernel/equihash.cl
index 460d20a5..73982c1a 100644
--- a/kernel/equihash.cl
+++ b/kernel/equihash.cl
@@ -1,833 +1,894 @@
-#include "equihash-param.h"
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-
-/*
-** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in
-** bytes in parens):
-**
-** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1)
-** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3)
-** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2)
-** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4)
-** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3)
-** round 5, table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5)
-** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4)
-** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6)
-** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5)
-**
-** If the first byte of Xi is 0xAB then:
-** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi
-** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but
-** 'A' is considered redundant padding as it was used to compute the row #
-**
-** - cnt is an atomic counter keeping track of the number of used slots.
-** it is used in the first slot only; subsequent slots replace it with
-** 4 padding bytes
-** - i encodes either the 21-bit input value (round 0) or a reference to two
-** inputs from the previous round
-**
-** Formula for Xi length and pad length above:
-** > for i in range(9):
-** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi
-**
-** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds
-** is the 4 most significant bits of the last byte of Xi.
-*/
-
-__constant ulong blake_iv[] =
-{
- 0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
- 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
- 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
- 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
-};
-
-/*
-** Reset counters in hash table.
-*/
-__kernel
-void kernel_init_ht(__global char *ht, __global uint *rowCounters)
-{
- rowCounters[get_global_id(0)] = 0;
-}
-
-/*
-** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they
-** represent (hex notation, group of 5 hex digits are a group of PREFIX bits):
-** aa aa ab bb bb cc cc cd dd... [round 0]
-** --------------------
-** ...ab bb bb cc cc cd dd... [odd round]
-** --------------
-** ...cc cc cd dd... [next even round]
-** -----
-** Bytes underlined are going to be stored in the slot. Preceding bytes
-** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are
-** used to compute the row number.
-**
-** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter)
-** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble)
-** TODO: update lines below with padding nibbles
-** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter)
-** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter)
-** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter)
-** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter)
-** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter)
-** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter)
-** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter)
-**
-** Return 0 if successfully stored, or 1 if the row overflowed.
-*/
-uint ht_store(uint round, __global char *ht, uint i,
- ulong xi0, ulong xi1, ulong xi2, ulong xi3, __global uint *rowCounters)
-{
- uint row;
- __global char *p;
- uint cnt;
-#if NR_ROWS_LOG == 16
- if (!(round % 2))
- row = (xi0 & 0xffff);
- else
- // if we have in hex: "ab cd ef..." (little endian xi0) then this
- // formula computes the row as 0xdebc. it skips the 'a' nibble as it
- // is part of the PREFIX. The Xi will be stored starting with "ef...";
- // 'e' will be considered padding and 'f' is part of the current PREFIX
- row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
- ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#elif NR_ROWS_LOG == 18
- if (!(round % 2))
- row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6);
- else
- row = ((xi0 & 0xc0000) >> 2) |
- ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
- ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#elif NR_ROWS_LOG == 19
- if (!(round % 2))
- row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5);
- else
- row = ((xi0 & 0xe0000) >> 1) |
- ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
- ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#elif NR_ROWS_LOG == 20
- if (!(round % 2))
- row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4);
- else
- row = ((xi0 & 0xf0000) >> 0) |
- ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
- ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
- xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
- xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
- xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
- p = ht + row * NR_SLOTS * SLOT_LEN;
- uint rowIdx = row/ROWS_PER_UINT;
- uint rowOffset = BITS_PER_ROW*(row%ROWS_PER_UINT);
- uint xcnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset);
- xcnt = (xcnt >> rowOffset) & ROW_MASK;
- cnt = xcnt;
- if (cnt >= NR_SLOTS)
- {
- // avoid overflows
- atomic_sub(rowCounters + rowIdx, 1 << rowOffset);
- return 1;
- }
- p += cnt * SLOT_LEN + xi_offset_for_round(round);
- // store "i" (always 4 bytes before Xi)
- *(__global uint *)(p - 4) = i;
- if (round == 0 || round == 1)
- {
- // store 24 bytes
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- *(__global ulong *)(p + 16) = xi2;
- }
- else if (round == 2)
- {
- // store 20 bytes
- *(__global uint *)(p + 0) = xi0;
- *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32);
- *(__global ulong *)(p + 12) = (xi1 >> 32) | (xi2 << 32);
- }
- else if (round == 3)
- {
- // store 16 bytes
- *(__global uint *)(p + 0) = xi0;
- *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32);
- *(__global uint *)(p + 12) = (xi1 >> 32);
- }
- else if (round == 4)
- {
- // store 16 bytes
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- }
- else if (round == 5)
- {
- // store 12 bytes
- *(__global ulong *)(p + 0) = xi0;
- *(__global uint *)(p + 8) = xi1;
- }
- else if (round == 6 || round == 7)
- {
- // store 8 bytes
- *(__global uint *)(p + 0) = xi0;
- *(__global uint *)(p + 4) = (xi0 >> 32);
- }
- else if (round == 8)
- {
- // store 4 bytes
- *(__global uint *)(p + 0) = xi0;
- }
- return 0;
-}
-
-#define mix(va, vb, vc, vd, x, y) \
- va = (va + vb + x); \
-vd = rotate((vd ^ va), (ulong)64 - 32); \
-vc = (vc + vd); \
-vb = rotate((vb ^ vc), (ulong)64 - 24); \
-va = (va + vb + y); \
-vd = rotate((vd ^ va), (ulong)64 - 16); \
-vc = (vc + vd); \
-vb = rotate((vb ^ vc), (ulong)64 - 63);
-
-/*
-** Execute round 0 (blake).
-**
-** Note: making the work group size less than or equal to the wavefront size
-** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local
-** Memory (LDS) Optimization 2-10" in:
-** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/
-*/
-__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-void kernel_round0(__global ulong *blake_state, __global char *ht,
- __global uint *rowCounters, __global uint *debug)
-{
- uint tid = get_global_id(0);
- ulong v[16];
- uint inputs_per_thread = NR_INPUTS / get_global_size(0);
- uint input = tid * inputs_per_thread;
- uint input_end = (tid + 1) * inputs_per_thread;
- uint dropped = 0;
- while (input < input_end)
- {
- // shift "i" to occupy the high 32 bits of the second ulong word in the
- // message block
- ulong word1 = (ulong)input << 32;
- // init vector v
- v[0] = blake_state[0];
- v[1] = blake_state[1];
- v[2] = blake_state[2];
- v[3] = blake_state[3];
- v[4] = blake_state[4];
- v[5] = blake_state[5];
- v[6] = blake_state[6];
- v[7] = blake_state[7];
- v[8] = blake_iv[0];
- v[9] = blake_iv[1];
- v[10] = blake_iv[2];
- v[11] = blake_iv[3];
- v[12] = blake_iv[4];
- v[13] = blake_iv[5];
- v[14] = blake_iv[6];
- v[15] = blake_iv[7];
- // mix in length of data
- v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */;
- // last block
- v[14] ^= (ulong)-1;
-
- // round 1
- mix(v[0], v[4], v[8], v[12], 0, word1);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 2
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], word1, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 3
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, word1);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 4
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, word1);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 5
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, word1);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 6
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], word1, 0);
- // round 7
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], word1, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 8
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, word1);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 9
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], word1, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 10
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], word1, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 11
- mix(v[0], v[4], v[8], v[12], 0, word1);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 12
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], word1, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
-
- // compress v into the blake state; this produces the 50-byte hash
- // (two Xi values)
- ulong h[7];
- h[0] = blake_state[0] ^ v[0] ^ v[8];
- h[1] = blake_state[1] ^ v[1] ^ v[9];
- h[2] = blake_state[2] ^ v[2] ^ v[10];
- h[3] = blake_state[3] ^ v[3] ^ v[11];
- h[4] = blake_state[4] ^ v[4] ^ v[12];
- h[5] = blake_state[5] ^ v[5] ^ v[13];
- h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
-
- // store the two Xi values in the hash table
-#if ZCASH_HASH_LEN == 50
- dropped += ht_store(0, ht, input * 2,
- h[0],
- h[1],
- h[2],
- h[3], rowCounters);
- dropped += ht_store(0, ht, input * 2 + 1,
- (h[3] >> 8) | (h[4] << (64 - 8)),
- (h[4] >> 8) | (h[5] << (64 - 8)),
- (h[5] >> 8) | (h[6] << (64 - 8)),
- (h[6] >> 8), rowCounters);
-#else
-#error "unsupported ZCASH_HASH_LEN"
-#endif
-
- input++;
- }
-#ifdef ENABLE_DEBUG
- debug[tid * 2] = 0;
- debug[tid * 2 + 1] = dropped;
-#endif
-}
-
-#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
-#define DECODE_ROW(REF) (REF >> 16)
-#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
-#define DECODE_SLOT0(REF) (REF & 0xff)
-
-#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
-#define DECODE_ROW(REF) (REF >> 14)
-#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
-#define DECODE_SLOT0(REF) (REF & 0x7f)
-
-#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
-#define DECODE_ROW(REF) (REF >> 13)
-#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
-#define DECODE_SLOT0(REF) (REF & 0x3f)
-
-#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
-#define DECODE_ROW(REF) (REF >> 12)
-#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
-#define DECODE_SLOT0(REF) (REF & 0x3f)
-
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-
-/*
-** Access a half-aligned long, that is a long aligned on a 4-byte boundary.
-*/
-ulong half_aligned_long(__global ulong *p, uint offset)
-{
- return
- (((ulong)*(__global uint *)((__global char *)p + offset + 0)) << 0) |
- (((ulong)*(__global uint *)((__global char *)p + offset + 4)) << 32);
-}
-
-/*
-** Access a well-aligned int.
-*/
-uint well_aligned_int(__global ulong *_p, uint offset)
-{
- __global char *p = (__global char *)_p;
- return *(__global uint *)(p + offset);
-}
-
-/*
-** XOR a pair of Xi values computed at "round - 1" and store the result in the
-** hash table being built for "round". Note that when building the table for
-** even rounds we need to skip 1 padding byte present in the "round - 1" table
-** (the "0xAB" byte mentioned in the description at the top of this file.) But
-** also note we can't load data directly past this byte because this would
-** cause an unaligned memory access which is undefined per the OpenCL spec.
-**
-** Return 0 if successfully stored, or 1 if the row overflowed.
-*/
-uint xor_and_store(uint round, __global char *ht_dst, uint row,
- uint slot_a, uint slot_b, __global ulong *a, __global ulong *b,
- __global uint *rowCounters)
-{
- ulong xi0, xi1, xi2;
-#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
- // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not
- // storing the byte containing bits from the previous PREFIX block for
- if (round == 1 || round == 2)
- {
- // xor 24 bytes
- xi0 = *(a++) ^ *(b++);
- xi1 = *(a++) ^ *(b++);
- xi2 = *a ^ *b;
- if (round == 2)
- {
- // skip padding byte
- xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
- xi1 = (xi1 >> 8) | (xi2 << (64 - 8));
- xi2 = (xi2 >> 8);
- }
- }
- else if (round == 3)
- {
- // xor 20 bytes
- xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
- xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8);
- xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16);
- }
- else if (round == 4 || round == 5)
- {
- // xor 16 bytes
- xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
- xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8);
- xi2 = 0;
- if (round == 4)
- {
- // skip padding byte
- xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
- xi1 = (xi1 >> 8);
- }
- }
- else if (round == 6)
- {
- // xor 12 bytes
- xi0 = *a++ ^ *b++;
- xi1 = *(__global uint *)a ^ *(__global uint *)b;
- xi2 = 0;
- if (round == 6)
- {
- // skip padding byte
- xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
- xi1 = (xi1 >> 8);
- }
- }
- else if (round == 7 || round == 8)
- {
- // xor 8 bytes
- xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
- xi1 = 0;
- xi2 = 0;
- if (round == 8)
- {
- // skip padding byte
- xi0 = (xi0 >> 8);
- }
- }
- // invalid solutions (which start happenning in round 5) have duplicate
- // inputs and xor to zero, so discard them
- if (!xi0 && !xi1)
- return 0;
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
- return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b),
- xi0, xi1, xi2, 0, rowCounters);
-}
-
-/*
-** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi,
-** store them in ht_dst.
-*/
-void equihash_round(uint round,
- __global char *ht_src,
- __global char *ht_dst,
- __global uint *debug,
- __local uchar *first_words_data,
- __local uint *collisionsData,
- __local uint *collisionsNum,
- __global uint *rowCountersSrc,
- __global uint *rowCountersDst,
- uint threadsPerRow)
-{
- uint globalTid = get_global_id(0) / threadsPerRow;
- uint localTid = get_local_id(0) / threadsPerRow;
- uint localGroupId = get_local_id(0) % threadsPerRow;
- __local uchar *first_words = &first_words_data[NR_SLOTS*localTid];
-
- __global char *p;
- uint cnt;
- uchar mask;
- uint i, j;
- // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to
- // make it even larger
- uint n;
- uint dropped_coll = 0;
- uint dropped_stor = 0;
- __global ulong *a, *b;
- uint xi_offset;
- // read first words of Xi from the previous (round - 1) hash table
- xi_offset = xi_offset_for_round(round - 1);
- // the mask is also computed to read data from the previous round
-#if NR_ROWS_LOG <= 16
- mask = ((!(round % 2)) ? 0x0f : 0xf0);
-#elif NR_ROWS_LOG == 18
- mask = ((!(round % 2)) ? 0x03 : 0x30);
-#elif NR_ROWS_LOG == 19
- mask = ((!(round % 2)) ? 0x01 : 0x10);
-#elif NR_ROWS_LOG == 20
- mask = 0; /* we can vastly simplify the code below */
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-
- for (uint chunk = 0; chunk < threadsPerRow; chunk++) {
- uint tid = globalTid + NR_ROWS/threadsPerRow*chunk;
- uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1);
-// for (uint tid = get_global_id(0)/threadsPerRow; tid < NR_ROWS; tid += get_global_size(0)/threadsPerRow) {
-
- uint rowIdx = tid/ROWS_PER_UINT;
- uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT);
- cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK;
- cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round
-
- *collisionsNum = 0;
- p = (ht_src + tid * NR_SLOTS * SLOT_LEN);
- p += xi_offset;
- p += SLOT_LEN*localGroupId;
- for (i = localGroupId; i < cnt; i += threadsPerRow, p += SLOT_LEN*threadsPerRow)
- first_words[i] = (*(__global uchar *)p) & mask;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (cnt == 0)
- // no elements in row, no collisions
- goto part2;
- // find collisions
- for (i = 0; i < cnt-1; i++)
- {
- uchar data_i = first_words[i];
- uint collision = (localTid << 24) | (i << 12) | (i + 1 + localGroupId);
- for (j = i + 1 + localGroupId; j < cnt; j += threadsPerRow)
- {
- if (data_i == first_words[j])
- {
- uint index = atomic_inc(collisionsNum);
- if (index >= LDS_COLL_SIZE) {
- atomic_dec(collisionsNum);
- goto part2;
- }
- collisionsData[index] = collision;
- }
- collision += threadsPerRow;
- }
- }
-
-part2:
- barrier(CLK_LOCAL_MEM_FENCE);
- uint totalCollisions = *collisionsNum;
- for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0))
- {
- uint collision = collisionsData[index];
- uint collisionThreadId = gid + (collision >> 24);
- uint i = (collision >> 12) & 0xFFF;
- uint j = collision & 0xFFF;
- __global uchar *ptr = ht_src + collisionThreadId * NR_SLOTS * SLOT_LEN +
- xi_offset;
- a = (__global ulong *)(ptr + i * SLOT_LEN);
- b = (__global ulong *)(ptr + j * SLOT_LEN);
- dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j,
- a, b, rowCountersDst);
- }
- }
-
-#ifdef ENABLE_DEBUG
- debug[tid * 2] = dropped_coll;
- debug[tid * 2 + 1] = dropped_stor;
-#endif
-}
-
-/*
-** This defines kernel_round1, kernel_round2, ..., kernel_round7.
-*/
-#define KERNEL_ROUND(N) \
-__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) \
-void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \
- __global uint *rowCountersSrc, __global uint *rowCountersDst, \
- __global uint *debug) \
-{ \
- __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; \
- __local uint collisionsData[LDS_COLL_SIZE]; \
- __local uint collisionsNum; \
- equihash_round(N, ht_src, ht_dst, debug, first_words_data, collisionsData, \
- &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); \
-}
-KERNEL_ROUND(1)
-KERNEL_ROUND(2)
-KERNEL_ROUND(3)
-KERNEL_ROUND(4)
-KERNEL_ROUND(5)
-KERNEL_ROUND(6)
-KERNEL_ROUND(7)
-
-// kernel_round8 takes an extra argument, "sols"
-__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-void kernel_round8(__global char *ht_src, __global char *ht_dst,
- __global uint *rowCountersSrc, __global uint *rowCountersDst,
- __global uint *debug, __global sols_t *sols)
-{
- uint tid = get_global_id(0);
- __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)];
- __local uint collisionsData[LDS_COLL_SIZE];
- __local uint collisionsNum;
- equihash_round(8, ht_src, ht_dst, debug, first_words_data, collisionsData,
- &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW);
- if (!tid)
- sols->nr = sols->likely_invalids = 0;
-}
-
-uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
-{
- return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN +
- slot * SLOT_LEN + xi_offset - 4);
-}
-
-/*
-** Expand references to inputs. Return 1 if so far the solution appears valid,
-** or 0 otherwise (an invalid solution would be a solution with duplicate
-** inputs, which can be detected at the last step: round == 0).
-*/
-uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs,
- uint round)
-{
- __global char *ht = htabs[round % 2];
- uint i = nr_inputs - 1;
- uint j = nr_inputs * 2 - 1;
- uint xi_offset = xi_offset_for_round(round);
- int dup_to_watch = -1;
- do
- {
- ins[j] = expand_ref(ht, xi_offset,
- DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i]));
- ins[j - 1] = expand_ref(ht, xi_offset,
- DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i]));
- if (!round)
- {
- if (dup_to_watch == -1)
- dup_to_watch = ins[j];
- else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch)
- return 0;
- }
- if (!i)
- break ;
- i--;
- j -= 2;
- }
- while (1);
- return 1;
-}
-
-/*
-** Verify if a potential solution is in fact valid.
-*/
-void potential_sol(__global char **htabs, __global sols_t *sols,
- uint ref0, uint ref1)
-{
- uint nr_values;
- uint values_tmp[(1 << PARAM_K)];
- uint sol_i;
- uint i;
- nr_values = 0;
- values_tmp[nr_values++] = ref0;
- values_tmp[nr_values++] = ref1;
- uint round = PARAM_K - 1;
- do
- {
- round--;
- if (!expand_refs(values_tmp, nr_values, htabs, round))
- return ;
- nr_values *= 2;
- }
- while (round > 0);
- // solution appears valid, copy it to sols
- sol_i = atomic_inc(&sols->nr);
- if (sol_i >= MAX_SOLS)
- return ;
- for (i = 0; i < (1 << PARAM_K); i++)
- sols->values[sol_i][i] = values_tmp[i];
- sols->valid[sol_i] = 1;
-}
-
-/*
-** Scan the hash tables to find Equihash solutions.
-*/
-__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols,
- __global uint *rowCountersSrc, __global uint *rowCountersDst)
-{
- __local uint counters[WORKSIZE/THREADS_PER_ROW];
- __local uint refs[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)];
- __local uint data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)];
- __local uint collisionsNum;
- __local ulong collisions[WORKSIZE*4];
-
- uint globalTid = get_global_id(0) / THREADS_PER_ROW;
- uint localTid = get_local_id(0) / THREADS_PER_ROW;
- uint localGroupId = get_local_id(0) % THREADS_PER_ROW;
- __local uint *refsPtr = &refs[NR_SLOTS*localTid];
- __local uint *dataPtr = &data[NR_SLOTS*localTid];
-
- __global char *htabs[2] = { ht0, ht1 };
- __global char *hcounters[2] = { rowCountersSrc, rowCountersDst };
- uint ht_i = (PARAM_K - 1) % 2; // table filled at last round
- uint cnt;
- uint xi_offset = xi_offset_for_round(PARAM_K - 1);
- uint i, j;
- __global char *p;
- uint ref_i, ref_j;
- // it's ok for the collisions array to be so small, as if it fills up
- // the potential solutions are likely invalid (many duplicate inputs)
-// ulong collisions;
-#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
- // in the final hash table, we are looking for a match on both the bits
- // part of the previous PREFIX colliding bits, and the last PREFIX bits.
- uint mask = 0xffffff;
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-
- collisionsNum = 0;
-
- for (uint chunk = 0; chunk < THREADS_PER_ROW; chunk++) {
- uint tid = globalTid + NR_ROWS/THREADS_PER_ROW*chunk;
- p = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN;
- uint rowIdx = tid/ROWS_PER_UINT;
- uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT);
- cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK;
- cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round
- p += xi_offset;
- p += SLOT_LEN*localGroupId;
-
- for (i = get_local_id(0); i < WORKSIZE/THREADS_PER_ROW; i += get_local_size(0))
- counters[i] = 0;
- for (i = localGroupId; i < cnt; i += THREADS_PER_ROW, p += SLOT_LEN*THREADS_PER_ROW) {
- refsPtr[i] = *(__global uint *)(p - 4);
- dataPtr[i] = (*(__global uint *)p) & mask;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- for (i = 0; i < cnt; i++)
- {
- uint a_data = dataPtr[i];
- ref_i = refsPtr[i];
- for (j = i + 1 + localGroupId; j < cnt; j += THREADS_PER_ROW)
- {
- if (a_data == dataPtr[j])
- {
- if (atomic_inc(&counters[localTid]) == 0)
- collisions[atomic_inc(&collisionsNum)] = ((ulong)ref_i << 32) | refsPtr[j];
- goto part2;
- }
- }
- }
-
-part2:
- continue;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- uint totalCollisions = collisionsNum;
- if (get_local_id(0) < totalCollisions) {
- ulong coll = collisions[get_local_id(0)];
- potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff);
- }
-}
+// Gateless Gate, a Zcash miner
+// Copyright 2016 zawawa @ bitcointalk.org
+//
+// The initial version of this software was based on:
+// SILENTARMY v5
+// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil, eXtremal
+//
+// This program is free software : you can redistribute it and / or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "equihash-param.h"
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#ifdef AMD
+#pragma OPENCL EXTENSION cl_amd_vec3 : enable
+#endif
+
+
+
+/////////////////
+// HASH TABLES //
+/////////////////
+
+/*
+** With the new hash tables, each slot has this layout (length in bytes in parens):
+**
+** round 0, table 0: i(4) pad(0) Xi(24) pad(4)
+** round 1, table 1: i(4) pad(3) Xi(20) pad(5)
+** round 2, table 2: i(4) pad(0) Xi(19) pad(9)
+** round 3, table 3: i(4) pad(3) Xi(15) pad(10)
+** round 4, table 4: i(4) pad(0) Xi(14) pad(14)
+** round 5, table 5: i(4) pad(3) Xi(10) pad(15)
+** round 6, table 6: i(4) pad(0) Xi( 9) pad(19)
+** round 7, table 7: i(4) pad(3) Xi( 5) pad(20)
+** round 8, table 8: i(4) pad(0) Xi( 4) pad(24)
+*/
+
+typedef union {
+	struct {
+		uint xi[7]; /* Xi words; only the leading UINTS_IN_XI(round) uints are meaningful (see layout table above) */
+		uint padding;
+	} slot;
+	uint8 ui8; /* overlapping vector views of the same 32 bytes allow wide, coalesced loads/stores */
+	uint4 ui4[2];
+	uint2 ui2[4];
+	uint ui[8];
+#ifdef AMD
+	ulong3 ul3; /* 3-component views, AMD-only (cl_amd_vec3 extension enabled at top of file) */
+	uint3 ui3[2];
+#endif
+} slot_t;
+
+typedef __global slot_t *global_pointer_to_slot_t;
+
+#define UINTS_IN_XI(round) /* number of 32-bit Xi words stored per slot at the given round (matches layout table above) */ (((round) == 0) ? 6 : \
+                           ((round) == 1) ? 6 : \
+                           ((round) == 2) ? 5 : \
+                           ((round) == 3) ? 5 : \
+                           ((round) == 4) ? 4 : \
+                           ((round) == 5) ? 4 : \
+                           ((round) == 6) ? 3 : \
+                           ((round) == 7) ? 2 : \
+                                            1)
+
+
+
+/*
+** OBSOLETE
+** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they
+** represent (hex notation, group of 5 hex digits are a group of PREFIX bits):
+** aa aa ab bb bb cc cc cd dd... [round 0]
+** --------------------
+** ...ab bb bb cc cc cd dd... [odd round]
+** --------------
+** ...cc cc cd dd... [next even round]
+** -----
+** Bytes underlined are going to be stored in the slot. Preceding bytes
+** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are
+** used to compute the row number.
+**
+** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter)
+** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble)
+** TODO: update lines below with padding nibbles
+** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter)
+** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter)
+** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter)
+** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter)
+** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter)
+** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter)
+** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter)
+**
+** Return 0 if successfully stored, or 1 if the row overflowed.
+*/
+
+__global char *get_slot_ptr(__global char *ht, uint round, uint row, uint slot) /* byte address of (row, slot) in this round's hash table */
+{
+	return ht + (row * NR_SLOTS + slot) * ADJUSTED_SLOT_LEN(round);
+}
+
+__global uint *get_xi_ptr(__global char *ht, uint round, uint row, uint slot) /* pointer to the Xi words at the start of a slot */
+{
+	return (__global uint *)get_slot_ptr(ht, round, row, slot);
+}
+
+__global uint *get_ref_ptr(__global char *ht, uint round, uint row, uint slot) /* pointer to the reference word stored immediately after the Xi words */
+{
+	return get_xi_ptr(ht, round, row, slot) + UINTS_IN_XI(round);
+}
+
+void get_row_counters_index(uint *rowIdx, uint *rowOffset, uint row) /* map a row number to its packed-counter word index and bit offset */
+{
+	if (ROWS_PER_UINT == 3) { /* branchless row%3 / row/3: 0xAAAAAAAB is the inverse of 3 mod 2^32, so (row - r) * 0xAAAAAAAB == row / 3 when r == row % 3 */
+		uint r = (0x55555555 * row + (row >> 1) - (row >> 3)) >> 30;
+		*rowIdx = (row - r) * 0xAAAAAAAB;
+		*rowOffset = BITS_PER_ROW * r;
+	} else if (ROWS_PER_UINT == 6) { /* NOTE(review): same reciprocal trick for 6 rows per word -- presumably equivalent to the generic branch below; confirm */
+		uint r = (0x55555555 * row + (row >> 1) - (row >> 3)) >> 29;
+		*rowIdx = (row - r) * 0xAAAAAAAB * 2;
+		*rowOffset = BITS_PER_ROW * r;
+	} else { /* generic (compiler-optimized) fallback */
+		*rowIdx = row / ROWS_PER_UINT;
+		*rowOffset = BITS_PER_ROW * (row % ROWS_PER_UINT);
+	}
+}
+
+uint get_row(uint round, uint xi0) /* derive the hash-table row from the first 32-bit word of Xi */
+{
+	uint row = 0;
+
+	if (NR_ROWS_LOG == 12) {
+		if (!(round % 2))
+			row = (xi0 & 0xfff); /* even rounds: row = low NR_ROWS_LOG bits */
+		else
+			row = ((xi0 & 0x0f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); /* odd rounds: recombine nibbles, skipping the nibble consumed by the PREFIX */
+	} else if (NR_ROWS_LOG == 13) {
+		if (!(round % 2))
+			row = (xi0 & 0x1fff);
+		else
+			row = ((xi0 & 0x1f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24);
+	} else if (NR_ROWS_LOG == 14) {
+		if (!(round % 2))
+			row = (xi0 & 0x3fff);
+		else
+			row = ((xi0 & 0x3f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24);
+	} else if (NR_ROWS_LOG == 15) {
+		if (!(round % 2))
+			row = (xi0 & 0x7fff);
+		else
+			row = ((xi0 & 0x7f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24);
+	} else if (NR_ROWS_LOG == 16) {
+		if (!(round % 2))
+			row = (xi0 & 0xffff);
+		else
+			row = ((xi0 & 0xff0f00) >> 8) | ((xi0 & 0xf0000000) >> 24);
+	}
+
+	return row; /* NOTE(review): silently returns 0 for unsupported NR_ROWS_LOG -- no #error guard here, unlike the old code */
+}
+
+uint get_nr_slots(__global uint *row_counters, uint row_index) /* read a row's slot count from the packed counters, clamped to NR_SLOTS */
+{
+	uint rowIdx, rowOffset, nr_slots;
+	get_row_counters_index(&rowIdx, &rowOffset, row_index);
+	nr_slots = (row_counters[rowIdx] >> rowOffset) & ROW_MASK;
+	nr_slots = min(nr_slots, (uint)NR_SLOTS); // handle possible overflow in last round
+	return nr_slots;
+}
+
+/*
+** Atomically reserve a slot in "row": bump its packed counter field and
+** return the previous slot count.  A return value >= NR_SLOTS means the row
+** is full; the increment is rolled back so the stored counter does not keep
+** growing, and the caller must drop the entry.
+*/
+uint inc_row_counter(__global uint *rowCounters, uint row)
+{
+  uint rowIdx, rowOffset;
+  get_row_counters_index(&rowIdx, &rowOffset, row);
+  uint nr_slots = atomic_add(rowCounters + rowIdx, 1U << rowOffset);
+  nr_slots = (nr_slots >> rowOffset) & ROW_MASK;
+  if (nr_slots >= NR_SLOTS) {
+    // avoid overflows; 1U matches the atomic_add above and avoids signed-
+    // shift UB when rowOffset is the top bit
+    atomic_sub(rowCounters + rowIdx, 1U << rowOffset);
+  }
+  return nr_slots;
+}
+
+
+
+/*
+** Reset counters in a hash table.
+*/
+
+/* Reset the per-row slot counters and the global result counters. */
+__kernel
+void kernel_init_ht(__global char *ht, __global uint *rowCounters, __global sols_t *sols, __global potential_sols_t *potential_sols)
+{
+  uint gid = get_global_id(0);
+  // a single work item clears the solution counters
+  if (gid == 0)
+    sols->nr = sols->likely_invalids = potential_sols->nr = 0;
+  // the first RC_SIZE/4 work items each zero one row-counter word
+  if (gid < RC_SIZE / 4)
+    rowCounters[gid] = 0;
+}
+
+
+
+/////////////
+// ROUND 0 //
+/////////////
+
+// BLAKE2b initialization vector (identical to the SHA-512 IV constants)
+__constant ulong blake_iv[] =
+{
+  0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
+  0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+  0x510e527fade682d1, 0x9b05688c2b3e6c1f,
+  0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
+};
+
+/*
+** One BLAKE2b G mixing step on state words (va, vb, vc, vd) with message
+** words x and y; rotate(v, 64 - n) implements a right-rotation by n bits.
+** NOTE: function-like macro — every argument is evaluated (some twice), so
+** never pass expressions with side effects.
+*/
+#define mix(va, vb, vc, vd, x, y) \
+  va = (va + vb + (x)); \
+vd = rotate((vd ^ va), (ulong)64 - 32); \
+vc = (vc + vd); \
+vb = rotate((vb ^ vc), (ulong)64 - 24); \
+va = (va + vb + (y)); \
+vd = rotate((vd ^ va), (ulong)64 - 16); \
+vc = (vc + vd); \
+vb = rotate((vb ^ vc), (ulong)64 - 63);
+
+/*
+** Execute round 0 (blake).
+**
+** Note: making the work group size less than or equal to the wavefront size
+** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local
+** Memory (LDS) Optimization 2-10" in:
+** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/
+*/
+
+#if ZCASH_HASH_LEN != 50
+#error "unsupported ZCASH_HASH_LEN"
+#endif
+
+/*
+** Round 0: BLAKE2b-hash every input index and store the two resulting Xi
+** values per input into the round-0 hash table.
+**
+** blake_state - BLAKE2b midstate over the fixed part of the block header
+** ht          - destination hash table for round 0
+** rowCounters - packed per-row slot counters for ht
+** debug       - per-thread drop statistics, written only with ENABLE_DEBUG
+*/
+__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_ROUND0, 1, 1)))
+void kernel_round0(__constant ulong *blake_state, __global char *ht,
+    __global uint *rowCounters, __global uint *debug)
+{
+  uint tid = get_global_id(0);
+#if defined(AMD) && !defined(AMD_LEGACY)
+  // NOTE(review): "volatile" looks like a register-pressure workaround for
+  // newer AMD compilers — confirm it is still needed for the target driver
+  volatile ulong v[16];
+  uint xi0, xi1, xi2, xi3, xi4, xi5, xi6;
+  slot_t slot;
+#else
+  ulong v[16];
+  uint xi0, xi1, xi2, xi3, xi4, xi5, xi6;
+  slot_t slot;
+#endif
+  ulong h[7];
+  // each thread hashes several inputs when the grid is smaller than NR_INPUTS
+  uint inputs_per_thread = (NR_INPUTS + get_global_size(0) - 1) / get_global_size(0);
+  uint dropped = 0;
+
+  for (uint chunk = 0; chunk < inputs_per_thread; ++chunk) {
+    uint input = tid + get_global_size(0) * chunk;
+
+    if (input < NR_INPUTS) {
+      // shift "i" to occupy the high 32 bits of the second ulong word in the
+      // message block
+      ulong word1 = (ulong)input << 32;
+      // init vector v: chained state in v[0..7], IV in v[8..15]
+      v[0] = blake_state[0];
+      v[1] = blake_state[1];
+      v[2] = blake_state[2];
+      v[3] = blake_state[3];
+      v[4] = blake_state[4];
+      v[5] = blake_state[5];
+      v[6] = blake_state[6];
+      v[7] = blake_state[7];
+      v[8] = blake_iv[0];
+      v[9] = blake_iv[1];
+      v[10] = blake_iv[2];
+      v[11] = blake_iv[3];
+      v[12] = blake_iv[4];
+      v[13] = blake_iv[5];
+      v[14] = blake_iv[6];
+      v[15] = blake_iv[7];
+      // mix in length of data
+      v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */;
+      // last block
+      v[14] ^= (ulong)-1;
+
+#if defined(AMD) && !defined(AMD_LEGACY)
+#pragma unroll 1
+      for (uint blake_round = 1; blake_round <= 9; ++blake_round) {
+#else
+#pragma unroll 9
+      for (uint blake_round = 1; blake_round <= 9; ++blake_round) {
+#endif
+        // rounds 1-9: word1 is the only non-zero message word, injected where
+        // the BLAKE2b sigma schedule places it in each round
+        mix(v[0], v[4], v[8], v[12], 0, (blake_round == 1) ? word1 : 0);
+        mix(v[1], v[5], v[9], v[13], (blake_round == 7) ? word1 : 0, (blake_round == 4) ? word1 : 0);
+        mix(v[2], v[6], v[10], v[14], 0, (blake_round == 8) ? word1 : 0);
+        mix(v[3], v[7], v[11], v[15], 0, 0);
+        mix(v[0], v[5], v[10], v[15], (blake_round == 2) ? word1 : 0, (blake_round == 5) ? word1 : 0);
+        mix(v[1], v[6], v[11], v[12], 0, 0);
+        mix(v[2], v[7], v[8], v[13], (blake_round == 9) ? word1 : 0, (blake_round == 3) ? word1 : 0);
+        mix(v[3], v[4], v[9], v[14], (blake_round == 6) ? word1 : 0, 0);
+      }
+      // round 10
+      mix(v[0], v[4], v[8], v[12], 0, 0);
+      mix(v[1], v[5], v[9], v[13], 0, 0);
+      mix(v[2], v[6], v[10], v[14], 0, 0);
+      mix(v[3], v[7], v[11], v[15], word1, 0);
+      mix(v[0], v[5], v[10], v[15], 0, 0);
+      mix(v[1], v[6], v[11], v[12], 0, 0);
+      mix(v[2], v[7], v[8], v[13], 0, 0);
+      mix(v[3], v[4], v[9], v[14], 0, 0);
+      // round 11
+      mix(v[0], v[4], v[8], v[12], 0, word1);
+      mix(v[1], v[5], v[9], v[13], 0, 0);
+      mix(v[2], v[6], v[10], v[14], 0, 0);
+      mix(v[3], v[7], v[11], v[15], 0, 0);
+      mix(v[0], v[5], v[10], v[15], 0, 0);
+      mix(v[1], v[6], v[11], v[12], 0, 0);
+      mix(v[2], v[7], v[8], v[13], 0, 0);
+      mix(v[3], v[4], v[9], v[14], 0, 0);
+      // round 12
+      mix(v[0], v[4], v[8], v[12], 0, 0);
+      mix(v[1], v[5], v[9], v[13], 0, 0);
+      mix(v[2], v[6], v[10], v[14], 0, 0);
+      mix(v[3], v[7], v[11], v[15], 0, 0);
+      mix(v[0], v[5], v[10], v[15], word1, 0);
+      mix(v[1], v[6], v[11], v[12], 0, 0);
+      mix(v[2], v[7], v[8], v[13], 0, 0);
+      mix(v[3], v[4], v[9], v[14], 0, 0);
+
+      // compress v into the blake state; this produces the 50-byte hash
+      // (two Xi values)
+      h[0] = blake_state[0] ^ v[0] ^ v[8];
+      h[1] = blake_state[1] ^ v[1] ^ v[9];
+      h[2] = blake_state[2] ^ v[2] ^ v[10];
+      h[3] = blake_state[3] ^ v[3] ^ v[11];
+      h[4] = blake_state[4] ^ v[4] ^ v[12];
+      h[5] = blake_state[5] ^ v[5] ^ v[13];
+      h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
+    }
+
+    if (input < NR_INPUTS) {
+      // store the two Xi values in the hash table
+#pragma unroll 1
+      for (uint index = 0; index < 2; ++index) {
+        if (!index) {
+          // first Xi: the low 25 bytes of the hash
+          xi0 = h[0] & 0xffffffff; xi1 = h[0] >> 32;
+          xi2 = h[1] & 0xffffffff; xi3 = h[1] >> 32;
+          xi4 = h[2] & 0xffffffff; xi5 = h[2] >> 32;
+          xi6 = h[3] & 0xffffffff;
+        } else {
+          // second Xi: the hash shifted right by 25 bytes (one extra byte
+          // handled here, the 24-byte part via the h[3..6] base)
+          xi0 = ((h[3] >> 8) | (h[4] << (64 - 8))) & 0xffffffff; xi1 = ((h[3] >> 8) | (h[4] << (64 - 8))) >> 32;
+          xi2 = ((h[4] >> 8) | (h[5] << (64 - 8))) & 0xffffffff; xi3 = ((h[4] >> 8) | (h[5] << (64 - 8))) >> 32;
+          xi4 = ((h[5] >> 8) | (h[6] << (64 - 8))) & 0xffffffff; xi5 = ((h[5] >> 8) | (h[6] << (64 - 8))) >> 32;
+          xi6 = (h[6] >> 8) & 0xffffffff;
+        }
+
+        uint row = get_row(0, xi0);
+        uint nr_slots = inc_row_counter(rowCounters, row);
+        if (nr_slots >= NR_SLOTS) {
+          // row full: this Xi is dropped
+          ++dropped;
+        } else {
+          // pack Xi shifted right by one byte, then the input reference
+          slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8));
+          slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8));
+          slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8));
+          slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8));
+          slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8));
+          slot.slot.xi[5] = ((xi6 << 24) | (xi5 >> 8));
+          slot.slot.xi[UINTS_IN_XI(0)] = input * 2 + index;
+          __global char *p = get_slot_ptr(ht, 0, row, nr_slots);
+          *(__global uint8 *)p = slot.ui8;
+        }
+      }
+    }
+  }
+
+#ifdef ENABLE_DEBUG
+  debug[tid * 2] = 0;
+  debug[tid * 2 + 1] = dropped;
+#endif
+}
+
+/*
+** XOR a pair of Xi values computed at "round - 1" and store the result in the
+** hash table being built for "round". Note that when building the table for
+** even rounds we need to skip 1 padding byte present in the "round - 1" table
+** (the "0xAB" byte mentioned in the description at the top of this file.) But
+** also note we can't load data directly past this byte because this would
+** cause an unaligned memory access which is undefined per the OpenCL spec.
+**
+** Return 0 if successfully stored, or 1 if the row overflowed.
+*/
+
+/*
+** XOR the Xi of two colliding slots (read from the LDS slot cache) and store
+** the result into the hash table being built for "round".
+**
+** ai / bi point at lane 0 of each slot in the column-major LDS cache: lane j
+** of a slot lives NR_SLOTS uints further on.
+**
+** Return 0 if successfully stored, 1 if the destination row overflowed.
+*/
+uint xor_and_store(uint round, __global char *ht_src, __global char *ht_dst, uint row,
+    uint slot_a, uint slot_b, __local uint *ai, __local uint *bi,
+    __global uint *rowCounters) {
+  uint ret = 0;
+  uint xi0, xi1, xi2, xi3, xi4, xi5;
+
+// fixed: "&&" made this guard unsatisfiable, so invalid NR_ROWS_LOG values
+// were silently accepted
+#if NR_ROWS_LOG < 8 || NR_ROWS_LOG > 20
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+  slot_t slot;
+  __global slot_t *p = 0;
+
+  if (slot_a < NR_SLOTS && slot_b < NR_SLOTS) {
+    // load only the Xi lanes still meaningful at this round, then XOR
+    xi0 = *ai;
+    xi1 = *(ai += NR_SLOTS);
+    if (round <= 7) xi2 = *(ai += NR_SLOTS);
+    if (round <= 6) xi3 = *(ai += NR_SLOTS);
+    if (round <= 4) xi4 = *(ai += NR_SLOTS);
+    if (round <= 2) xi5 = *(ai += NR_SLOTS);
+
+    xi0 ^= *bi;
+    xi1 ^= *(bi += NR_SLOTS);
+    if (round <= 7) xi2 ^= *(bi += NR_SLOTS);
+    if (round <= 6) xi3 ^= *(bi += NR_SLOTS);
+    if (round <= 4) xi4 ^= *(bi += NR_SLOTS);
+    if (round <= 2) xi5 ^= *(bi += NR_SLOTS);
+
+    if (!(round & 0x1)) {
+      // even round: skip the padding byte by dropping the low 24 bits of xi0
+      xi0 = (xi0 >> 24) | (xi1 << (32 - 24));
+
+      // NOTE(review): lanes past UINTS_IN_XI(round) may hold garbage here;
+      // they are overwritten by the reference below or never read back
+      slot.slot.xi[0] = xi1;
+      slot.slot.xi[1] = xi2;
+      slot.slot.xi[2] = xi3;
+      slot.slot.xi[3] = xi4;
+      slot.slot.xi[4] = xi5;
+    } else {
+      // odd round: re-pack Xi shifted right by one byte
+      slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8));
+      if (round <= 7) slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8));
+      if (round <= 6) slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8));
+      if (round <= 5) slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8));
+      if (round <= 3) slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8));
+      if (round <= 1) slot.slot.xi[5] = ((xi5 >> 8));
+    }
+    slot.slot.xi[UINTS_IN_XI(round)] = ENCODE_INPUTS(row, slot_a, slot_b);
+
+    // invalid solutions (which start happening in round 5) have duplicate
+    // inputs and xor to zero, so discard them
+    if (xi0 || xi1) {
+      uint new_row = get_row(round, xi0);
+      uint new_slot_index = inc_row_counter(rowCounters, new_row);
+      if (new_slot_index >= NR_SLOTS) {
+        ret = 1;
+      } else {
+        p = (__global slot_t *)get_slot_ptr(ht_dst, round, new_row, new_slot_index);
+      }
+    }
+  }
+
+  if (p) {
+    // write only as many bytes as this round's slot layout requires
+#ifdef OPTIM_8BYTE_WRITES
+    if (round >= 8)
+      *(__global uint2 *)p = slot.ui2[0];
+    else
+#endif
+#ifdef OPTIM_12BYTE_WRITES
+    if (round >= 7)
+      *(__global uint3 *)p = slot.ui3[0];
+    else
+#endif
+#ifdef OPTIM_16BYTE_WRITES
+    if (round >= 6)
+      *(__global uint4 *)p = slot.ui4[0];
+    else
+#endif
+#ifdef OPTIM_24BYTE_WRITES
+    if (round >= 2)
+      *(__global ulong3 *)p = slot.ul3;
+    else
+#endif
+    *(__global uint8 *)p = slot.ui8;
+  }
+  return ret;
+}
+
+/*
+** Cooperative variant of xor_and_store: THREADS_PER_WRITE(round) threads in
+** a write group compute the same XOR and each stores one uint4 lane of the
+** resulting slot.  The group shares one entry of new_slot_indexes[].
+**
+** Return 0 on success; 1 (lead thread, ENABLE_DEBUG only) if the row
+** overflowed.
+*/
+uint parallel_xor_and_store(uint round, __global char *ht_src, __global char *ht_dst, uint row,
+    uint slot_a, uint slot_b, __local uint *ai, __local uint *bi,
+    __global uint *rowCounters,
+    __local SLOT_INDEX_TYPE *new_slot_indexes) {
+  uint ret = 0;
+  uint xi0, xi1, xi2, xi3, xi4, xi5;
+  uint write_index = get_local_id(0) / THREADS_PER_WRITE(round);
+  uint write_thread_index = get_local_id(0) % THREADS_PER_WRITE(round);
+  //uint write_index = get_local_id(0) % (get_local_size(0) / THREADS_PER_WRITE(round));
+  //uint write_thread_index = get_local_id(0) / (get_local_size(0) / THREADS_PER_WRITE(round));
+
+// fixed: "&&" made this guard unsatisfiable, so invalid NR_ROWS_LOG values
+// were silently accepted
+#if NR_ROWS_LOG < 8 || NR_ROWS_LOG > 20
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+  slot_t slot;
+  uint new_slot_index;
+  uint new_row;
+
+  // the group's lead thread marks the shared slot index as "not reserved"
+  if (!write_thread_index)
+    new_slot_indexes[write_index] = NR_SLOTS;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  if (slot_a < NR_SLOTS && slot_b < NR_SLOTS) {
+    // load only the Xi lanes still meaningful at this round, then XOR
+    // (column-major LDS cache: lane j of a slot is NR_SLOTS uints apart)
+    xi0 = *ai;
+    xi1 = *(ai += NR_SLOTS);
+    if (round <= 7) xi2 = *(ai += NR_SLOTS);
+    if (round <= 6) xi3 = *(ai += NR_SLOTS);
+    if (round <= 4) xi4 = *(ai += NR_SLOTS);
+    if (round <= 2) xi5 = *(ai += NR_SLOTS);
+
+    xi0 ^= *bi;
+    xi1 ^= *(bi += NR_SLOTS);
+    if (round <= 7) xi2 ^= *(bi += NR_SLOTS);
+    if (round <= 6) xi3 ^= *(bi += NR_SLOTS);
+    if (round <= 4) xi4 ^= *(bi += NR_SLOTS);
+    if (round <= 2) xi5 ^= *(bi += NR_SLOTS);
+
+    if (!(round & 0x1)) {
+      // even round: skip the padding byte by dropping the low 24 bits of xi0
+      xi0 = (xi0 >> 24) | (xi1 << (32 - 24));
+
+      slot.slot.xi[0] = xi1;
+      slot.slot.xi[1] = xi2;
+      slot.slot.xi[2] = xi3;
+      slot.slot.xi[3] = xi4;
+      slot.slot.xi[4] = xi5;
+    } else {
+      // odd round: re-pack Xi shifted right by one byte
+      slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8));
+      if (round <= 7) slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8));
+      if (round <= 6) slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8));
+      if (round <= 5) slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8));
+      if (round <= 3) slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8));
+      if (round <= 1) slot.slot.xi[5] = ((xi5 >> 8));
+    }
+    slot.slot.xi[UINTS_IN_XI(round)] = ENCODE_INPUTS(row, slot_a, slot_b);
+    // every thread of the write group computes the same new_row, since they
+    // all process the same collision
+    new_row = get_row(round, xi0);
+
+    // invalid solutions (which start happening in round 5) have duplicate
+    // inputs and xor to zero, so discard them
+    if ((xi0 || xi1) && !write_thread_index) {
+      // fixed: capture the reserved index locally too — new_slot_index was
+      // previously never assigned, so the ENABLE_DEBUG check below read an
+      // uninitialized variable
+      new_slot_index = new_slot_indexes[write_index] = inc_row_counter(rowCounters, new_row);
+#ifdef ENABLE_DEBUG
+      if (new_slot_index >= NR_SLOTS)
+        ret = 1;
+#endif
+    }
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (new_slot_indexes[write_index] < NR_SLOTS) {
+    // each thread of the group writes its own uint4 lane of the slot
+    __global slot_t *p = (__global slot_t *)get_slot_ptr(ht_dst, round, new_row, new_slot_indexes[write_index]);
+    *(((__global uint4 *)p) + write_thread_index) = slot.ui4[write_thread_index];
+  }
+  //barrier(CLK_LOCAL_MEM_FENCE);
+  return ret;
+}
+
+/*
+** Execute one Equihash round: load every slot of one row of ht_src into LDS,
+** bucket the slots into bins by their next collision bits, then XOR each
+** colliding pair and store the result into ht_dst (via xor_and_store or its
+** parallel variant).  Each work group processes exactly one row.
+*/
+
+void equihash_round(uint round,
+    __global char *ht_src,
+    __global char *ht_dst,
+    __global uint *debug,
+    __local uint *slot_cache,
+    __local SLOT_INDEX_TYPE *collision_array_a,
+    __local SLOT_INDEX_TYPE *collision_array_b,
+    __local uint *nr_collisions,
+    __global uint *rowCountersSrc,
+    __global uint *rowCountersDst,
+    __local uint *bin_first_slots,
+    __local SLOT_INDEX_TYPE *bin_next_slots,
+    __local SLOT_INDEX_TYPE *new_slot_indexes)
+{
+  uint i, j;
+#ifdef ENABLE_DEBUG
+  uint dropped_coll = 0;
+  uint dropped_stor = 0;
+#endif
+
+  // the mask is also computed to read data from the previous round
+#define BIN_MASK(round) ((((round) + 1) % 2) ? 0xf000 : 0xf0000)
+#define BIN_MASK_OFFSET(round) ((((round) + 1) % 2) ? 3 * 4 : 4 * 4)
+
+  // second mask component: the bin bits left over after NR_ROWS_LOG row bits
+  // have been consumed (none when NR_ROWS_LOG == 16)
+#define BIN_MASK2(round) ((NR_ROWS_LOG == 12) ? ((((round) + 1) % 2) ? 0x00f0 : 0xf000) : \
+    (NR_ROWS_LOG == 13) ? ((((round) + 1) % 2) ? 0x00e0 : 0xe000) : \
+    (NR_ROWS_LOG == 14) ? ((((round) + 1) % 2) ? 0x00c0 : 0xc000) : \
+    (NR_ROWS_LOG == 15) ? ((((round) + 1) % 2) ? 0x0080 : 0x8000) : \
+    0)
+#define BIN_MASK2_OFFSET(round) ((NR_ROWS_LOG == 12) ? ((((round) + 1) % 2) ? 0 : 8) : \
+    (NR_ROWS_LOG == 13) ? ((((round) + 1) % 2) ? 1 : 9) : \
+    (NR_ROWS_LOG == 14) ? ((((round) + 1) % 2) ? 2 : 10) : \
+    (NR_ROWS_LOG == 15) ? ((((round) + 1) % 2) ? 3 : 11) : \
+    0)
+
+  // a row is subdivided into NR_BINS bins when searching for collisions
+#define NR_BINS_LOG (20 - NR_ROWS_LOG)
+#define NR_BINS (1 << NR_BINS_LOG)
+
+
+
+  uint nr_slots = 0;
+  uint assigned_row_index = get_group_id(0);
+  if (assigned_row_index >= NR_ROWS)
+    return;
+
+  // empty every bin head and slot link
+  for (i = get_local_id(0); i < NR_BINS; i += get_local_size(0))
+    bin_first_slots[i] = NR_SLOTS;
+  for (i = get_local_id(0); i < NR_SLOTS; i += get_local_size(0))
+    bin_next_slots[i] = NR_SLOTS;
+  // thread 0 reads the row's slot count and broadcasts it via *nr_collisions
+  if (get_local_id(0) == 0)
+    *nr_collisions = nr_slots = get_nr_slots(rowCountersSrc, assigned_row_index);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (get_local_id(0))
+    nr_slots = *nr_collisions;
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (uint phase = 0; phase < 1; ++phase) {
+
+    // Perform a radix sort as slots get loaded into LDS.
+    // Make sure all the work items in the work group enter the loop.
+    for (i = get_local_id(0); i < nr_slots; i += get_local_size(0)) {
+      uint slot_index = i;
+      uint slot_cache_index = i;
+#ifdef NVIDIA
+      // load as 64-bit pairs for better coalescing on NVIDIA
+      uint2 slot_data0, slot_data1, slot_data2;
+      if (UINTS_IN_XI(round - 1) >= 1) slot_data0 = *((__global uint2 *)get_slot_ptr(ht_src, round - 1, assigned_row_index, slot_cache_index) + 0);
+      if (UINTS_IN_XI(round - 1) >= 3) slot_data1 = *((__global uint2 *)get_slot_ptr(ht_src, round - 1, assigned_row_index, slot_cache_index) + 1);
+      if (UINTS_IN_XI(round - 1) >= 5) slot_data2 = *((__global uint2 *)get_slot_ptr(ht_src, round - 1, assigned_row_index, slot_cache_index) + 2);
+
+      if (UINTS_IN_XI(round - 1) >= 1) slot_cache[0 * NR_SLOTS + slot_cache_index] = slot_data0.s0;
+      if (UINTS_IN_XI(round - 1) >= 2) slot_cache[1 * NR_SLOTS + slot_cache_index] = slot_data0.s1;
+      if (UINTS_IN_XI(round - 1) >= 3) slot_cache[2 * NR_SLOTS + slot_cache_index] = slot_data1.s0;
+      if (UINTS_IN_XI(round - 1) >= 4) slot_cache[3 * NR_SLOTS + slot_cache_index] = slot_data1.s1;
+      if (UINTS_IN_XI(round - 1) >= 5) slot_cache[4 * NR_SLOTS + slot_cache_index] = slot_data2.s0;
+      if (UINTS_IN_XI(round - 1) >= 6) slot_cache[5 * NR_SLOTS + slot_cache_index] = slot_data2.s1;
+      uint xi0 = slot_data0.s0;
+#else
+      // column-major cache: lane j of slot s lives at j * NR_SLOTS + s
+      for (j = 0; j < UINTS_IN_XI(round - 1); ++j)
+        slot_cache[j * NR_SLOTS + slot_cache_index] = *((__global uint *)get_xi_ptr(ht_src, round - 1, assigned_row_index, slot_index) + j);
+      uint xi0 = slot_cache[0 * NR_SLOTS + slot_cache_index];
+#endif
+      // link this slot into the bin selected by its collision bits
+      uint bin_to_use =
+        ((xi0 & BIN_MASK(round - 1)) >> BIN_MASK_OFFSET(round - 1))
+        | ((xi0 & BIN_MASK2(round - 1)) >> BIN_MASK2_OFFSET(round - 1));
+      bin_next_slots[i] = atomic_xchg(&bin_first_slots[bin_to_use], i);
+    }
+
+    if (!get_local_id(0))
+      *nr_collisions = 0;
+    // round NR_SLOTS up so all threads execute the same number of iterations
+    // (required for the barriers inside the loop)
+    uint max_slot_a_index = NR_SLOTS + (get_local_size(0) - NR_SLOTS % get_local_size(0)) - 1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (uint slot_a_index = get_local_id(0); slot_a_index <= max_slot_a_index; slot_a_index += get_local_size(0)) {
+      // enumerate all pairs (slot_a, slot_b) sharing a bin
+      uint slot_b_index = (slot_a_index < NR_SLOTS) ? bin_next_slots[slot_a_index] : NR_SLOTS;
+      while (slot_b_index < NR_SLOTS) {
+        uint coll_index = atomic_inc(nr_collisions);
+        if (coll_index < LDS_COLL_SIZE) {
+          collision_array_a[coll_index] = slot_a_index;
+          collision_array_b[coll_index] = slot_b_index;
+        } else {
+          // collision list full: drop the pair
+          atomic_dec(nr_collisions);
+#ifdef ENABLE_DEBUG
+          ++dropped_coll;
+#endif
+        }
+        slot_b_index = bin_next_slots[slot_b_index];
+      }
+
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      // drain the collision list, one batch of write groups per iteration
+      uint nr_collisions_copy = *nr_collisions;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      while (nr_collisions_copy > 0) {
+        uint collision, slot_index_a = NR_SLOTS, slot_index_b = NR_SLOTS;
+        __local uint *slot_cache_a, *slot_cache_b;
+        uint write_index = get_local_id(0) / THREADS_PER_WRITE(round);
+        if (write_index < nr_collisions_copy) {
+          slot_index_a = collision_array_a[nr_collisions_copy - 1 - write_index];
+          slot_index_b = collision_array_b[nr_collisions_copy - 1 - write_index];
+          // lane 0 of each slot; further lanes are NR_SLOTS uints apart
+          slot_cache_a = (__local uint *)&slot_cache[slot_index_a];
+          slot_cache_b = (__local uint *)&slot_cache[slot_index_b];
+        }
+        // NOTE(review): slot_cache_a/b stay unset when this thread has no
+        // collision; the callees guard on slot_index_* before dereferencing
+        //barrier(CLK_LOCAL_MEM_FENCE);
+        if (THREADS_PER_WRITE(round) > 1) {
+#ifdef ENABLE_DEBUG
+          //dropped_stor +=
+#endif
+          parallel_xor_and_store(round, ht_src, ht_dst, assigned_row_index, slot_index_a, slot_index_b, slot_cache_a, slot_cache_b, rowCountersDst, new_slot_indexes);
+        } else {
+#ifdef ENABLE_DEBUG
+          dropped_stor +=
+#endif
+          xor_and_store(round, ht_src, ht_dst, assigned_row_index, slot_index_a, slot_index_b, slot_cache_a, slot_cache_b, rowCountersDst);
+        }
+
+        // thread 0 retires the batch just processed
+        if (!get_local_id(0))
+          *nr_collisions -= min(*nr_collisions, (uint)get_local_size(0) / THREADS_PER_WRITE(round));
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+        nr_collisions_copy = *nr_collisions;
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  }
+
+
+
+#ifdef ENABLE_DEBUG
+  debug[get_global_id(0) * 2] = dropped_coll;
+  debug[get_global_id(0) * 2 + 1] = dropped_stor;
+#endif
+}
+
+/*
+** This defines kernel_round1, kernel_round2, ..., kernel_round8.
+*/
+
+// Each KERNEL_ROUND expansion declares the per-work-group LDS scratch
+// buffers (slot cache, collision lists, bin chains, shared slot indexes)
+// sized for round N and forwards to equihash_round().
+#define KERNEL_ROUND(kernel_name, N) \
+__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) \
+void kernel_name(__global char *ht_src, __global char *ht_dst, \
+    __global uint *rowCountersSrc, __global uint *rowCountersDst, \
+    __global uint *debug) \
+{ \
+  __local uint slot_cache[ADJUSTED_LDS_ARRAY_SIZE(UINTS_IN_XI(N - 1) * NR_SLOTS)]; \
+  __local SLOT_INDEX_TYPE collision_array_a[ADJUSTED_LDS_ARRAY_SIZE(LDS_COLL_SIZE)]; \
+  __local SLOT_INDEX_TYPE collision_array_b[ADJUSTED_LDS_ARRAY_SIZE(LDS_COLL_SIZE)]; \
+  __local uint nr_collisions; \
+  __local uint bin_first_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_BINS)]; \
+  __local SLOT_INDEX_TYPE bin_next_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)]; \
+  __local SLOT_INDEX_TYPE new_slot_indexes[ADJUSTED_LDS_ARRAY_SIZE((THREADS_PER_WRITE(N) > 1) ? LOCAL_WORK_SIZE / THREADS_PER_WRITE(N) : 0)]; \
+  equihash_round((N), ht_src, ht_dst, debug, slot_cache, collision_array_a, collision_array_b, \
+    &nr_collisions, rowCountersSrc, rowCountersDst, bin_first_slots, bin_next_slots, new_slot_indexes); \
+}
+
+// instantiate the eight per-round kernels (round 0 is kernel_round0 above)
+KERNEL_ROUND(kernel_round1, 1)
+KERNEL_ROUND(kernel_round2, 2)
+KERNEL_ROUND(kernel_round3, 3)
+KERNEL_ROUND(kernel_round4, 4)
+KERNEL_ROUND(kernel_round5, 5)
+KERNEL_ROUND(kernel_round6, 6)
+KERNEL_ROUND(kernel_round7, 7)
+KERNEL_ROUND(kernel_round8, 8)
+
+
+
+/* Append a candidate (ref0, ref1) reference pair to the potential-solutions
+** list; entries past MAX_POTENTIAL_SOLS are discarded (the counter still
+** advances, readers clamp it). */
+void mark_potential_sol(__global potential_sols_t *potential_sols, uint ref0, uint ref1)
+{
+  uint write_pos = atomic_inc(&potential_sols->nr);
+  if (write_pos < MAX_POTENTIAL_SOLS) {
+    potential_sols->values[write_pos][0] = ref0;
+    potential_sols->values[write_pos][1] = ref1;
+  }
+}
+
+/*
+** Scan the hash tables to find Equihash solutions.
+*/
+
+/*
+** Scan one row of the final (round 8) hash table per work group: bucket its
+** slots by their remaining collision bits, and record every pair whose first
+** Xi word matches as a potential solution (refs of both slots).
+*/
+__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_POTENTIAL_SOLS, 1, 1)))
+void kernel_potential_sols(
+    __global char *ht_src,
+    __global potential_sols_t *potential_sols,
+    __global uint *rowCountersSrc)
+{
+  __local uint refs[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)];
+  __local uint data[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)];
+
+  uint nr_slots;
+  uint i, j;
+  __global char *p;
+  uint ref_i, ref_j;
+  __local uint bin_first_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_BINS)];
+  __local SLOT_INDEX_TYPE bin_next_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)];
+
+  if (!get_global_id(0))
+    potential_sols->nr = 0;
+  barrier(CLK_GLOBAL_MEM_FENCE);
+
+  uint assigned_row_index = (get_global_id(0) / get_local_size(0));
+  if (assigned_row_index >= NR_ROWS)
+    return;
+
+  // empty bins and slot links, then broadcast the row's slot count
+  __local uint nr_slots_shared;
+  for (i = get_local_id(0); i < NR_BINS; i += get_local_size(0))
+    bin_first_slots[i] = NR_SLOTS;
+  for (i = get_local_id(0); i < NR_SLOTS; i += get_local_size(0))
+    bin_next_slots[i] = NR_SLOTS;
+  if (get_local_id(0) == 0)
+    nr_slots_shared = nr_slots = get_nr_slots(rowCountersSrc, assigned_row_index);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (get_local_id(0))
+    nr_slots = nr_slots_shared;
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // in the final hash table, we are looking for a match on both the bits
+  // part of the previous PREFIX colliding bits, and the last PREFIX bits.
+  for (i = get_local_id(0); i < nr_slots; i += get_local_size(0)) {
+    // first 8 bytes of a final slot: Xi word (low) and reference (high)
+    ulong slot_first_8bytes = *(__global ulong *) get_slot_ptr(ht_src, PARAM_K - 1, assigned_row_index, i);
+    uint ref_i = refs[i] = slot_first_8bytes >> 32;
+    uint xi_first_4bytes = data[i] = slot_first_8bytes & 0xffffffff;
+    uint bin_to_use =
+      ((xi_first_4bytes & BIN_MASK(PARAM_K - 1)) >> BIN_MASK_OFFSET(PARAM_K - 1))
+      | ((xi_first_4bytes & BIN_MASK2(PARAM_K - 1)) >> BIN_MASK2_OFFSET(PARAM_K - 1));
+    bin_next_slots[i] = atomic_xchg(&bin_first_slots[bin_to_use], i);
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (i = get_local_id(0); i < nr_slots; i += get_local_size(0)) {
+    uint data_i = data[i];
+    j = bin_next_slots[i];
+    while (j < NR_SLOTS) {
+      if (data_i == data[j]) {
+        mark_potential_sol(potential_sols, refs[i], refs[j]);
+        // NOTE(review): this work item exits after its first match, so
+        // further slots assigned to it go unscanned — confirm this
+        // best-effort behavior is intended
+        return;
+      }
+      j = bin_next_slots[j];
+    }
+  }
+}
+
+
+
+/*
+** Expand each potential solution into its full set of 2^PARAM_K input
+** indices by walking the reference chains back through all hash tables
+** (ht8 down to ht0), reject candidates containing duplicate inputs, and
+** copy the surviving solutions into "sols".  One work group handles one
+** potential solution.
+*/
+__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_SOLS, 1, 1)))
+void kernel_sols(__global char *ht0,
+    __global char *ht1,
+    __global sols_t *sols,
+    __global uint *rowCountersSrc,
+    __global uint *rowCountersDst,
+    __global char *ht2,
+    __global char *ht3,
+    __global char *ht4,
+    __global char *ht5,
+    __global char *ht6,
+    __global char *ht7,
+    __global char *ht8,
+    __global potential_sols_t *potential_sols)
+{
+  // ping-pong buffers: inputs_a holds refs at even depths, inputs_b at odd
+  __local uint inputs_a[ADJUSTED_LDS_ARRAY_SIZE(1 << PARAM_K)], inputs_b[ADJUSTED_LDS_ARRAY_SIZE(1 << (PARAM_K - 1))];
+  __global char *htabs[] = { ht0, ht1, ht2, ht3, ht4, ht5, ht6, ht7, ht8 };
+
+  if ((get_global_id(0) / get_local_size(0)) < potential_sols->nr && (get_global_id(0) / get_local_size(0)) < MAX_POTENTIAL_SOLS) {
+    __local uint dup_counter;
+    if (get_local_id(0) == 0) {
+      dup_counter = 0;
+      // seed with the two round-8 references of this potential solution
+      inputs_a[0] = potential_sols->values[(get_global_id(0) / get_local_size(0))][0];
+      inputs_a[1] = potential_sols->values[(get_global_id(0) / get_local_size(0))][1];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // walk refs back: each entry of round "round" expands into the two slot
+    // refs it encoded, doubling the list every step
+    for (int round = 7; round >= 0; --round) {
+      if (round % 2) {
+        for (uint i = get_local_id(0); i < (1 << (8 - round)); i += get_local_size(0)) {
+          inputs_b[i * 2 + 1] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_a[i]), DECODE_SLOT1(inputs_a[i]));
+          inputs_b[i * 2] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_a[i]), DECODE_SLOT0(inputs_a[i]));
+        }
+      } else {
+        for (uint i = get_local_id(0); i < (1 << (8 - round)); i += get_local_size(0)) {
+          inputs_a[i * 2 + 1] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_b[i]), DECODE_SLOT1(inputs_b[i]));
+          inputs_a[i * 2] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_b[i]), DECODE_SLOT0(inputs_b[i]));
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    //barrier(CLK_LOCAL_MEM_FENCE);
+
+    // duplicate-input check: compare every entry against the last one
+    // NOTE(review): a sentinel-based check, not an exhaustive pairwise scan —
+    // presumably sufficient given how invalid solutions collapse; confirm
+    int dup_to_watch = inputs_a[256 * 2 - 1];
+    for (uint j = 3 + get_local_id(0); j < 256 * 2 - 2; j += get_local_size(0))
+      if (inputs_a[j] == dup_to_watch)
+        atomic_inc(&dup_counter);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // solution appears valid, copy it to sols
+    __local uint sol_i;   // only written (and meaningful) when !dup_counter
+    if (get_local_id(0) == 0 && !dup_counter)
+      sol_i = atomic_inc(&sols->nr);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (sol_i < MAX_SOLS && !dup_counter) {
+      for (uint i = get_local_id(0); i < (1 << PARAM_K); i += get_local_size(0))
+        sols->values[sol_i][i] = inputs_a[i];
+      if (get_local_id(0) == 0)
+        sols->valid[sol_i] = 1;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+}
diff --git a/ocl.c b/ocl.c
index 0cd615fe..620a9bc2 100644
--- a/ocl.c
+++ b/ocl.c
@@ -798,8 +798,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
char *kernel_names[] = {"kernel_init_ht",
"kernel_round0", "kernel_round1", "kernel_round2",
"kernel_round3", "kernel_round4", "kernel_round5",
- "kernel_round6", "kernel_round7", "kernel_round8"};
- clState->n_extra_kernels = 1 + 9;
+ "kernel_round6", "kernel_round7", "kernel_round8",
+ "kernel_potential_sols"};
+ clState->n_extra_kernels = 1 + 9 + 1;
clState->extra_kernels = (cl_kernel *)malloc(sizeof(cl_kernel) * clState->n_extra_kernels);
for (int i = 0; i < clState->n_extra_kernels; i++) {
clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_names[i], &status);
@@ -810,19 +811,22 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
}
char buffer[32];
- clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, HT_SIZE, NULL, &status);
+ clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(potential_sols_t), NULL, &status);
snprintf(buffer, sizeof(buffer), "CLbuffer0");
if (status != CL_SUCCESS)
goto out;
- clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, HT_SIZE, NULL, &status);
- snprintf(buffer, sizeof(buffer), "buffer1");
- if (status != CL_SUCCESS)
- goto out;
- clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, NR_ROWS, NULL, &status);
+ clState->buffer1 = NULL;
+ for (int i = 0; i < 9; i++) {
+ snprintf(buffer, sizeof(buffer), "index_buf[%d]", i);
+ clState->index_buf[i] = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, HT_SIZE, NULL, &status);
+ if (status != CL_SUCCESS)
+ goto out;
+ }
+ clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, RC_SIZE, NULL, &status);
snprintf(buffer, sizeof(buffer), "buffer2");
if (status != CL_SUCCESS)
goto out;
- clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, NR_ROWS, NULL, &status);
+ clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, RC_SIZE, NULL, &status);
snprintf(buffer, sizeof(buffer), "buffer3");
if (status != CL_SUCCESS)
goto out;
@@ -839,13 +843,39 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
if (status != CL_SUCCESS)
goto out;
+ cl_mem rowCounters[] = {clState->buffer2, clState->buffer3};
+ for (int round = 0; round < PARAM_K; round++) {
+ unsigned int num = 0;
+ cl_kernel *kernel = &clState->extra_kernels[1 + round];
+ if (!round) {
+ CL_SET_ARG(clState->MidstateBuf);
+ CL_SET_ARG(clState->index_buf[round]);
+ CL_SET_ARG(rowCounters[round % 2]);
+ }
+ else {
+ CL_SET_ARG(clState->index_buf[round - 1]);
+ CL_SET_ARG(clState->index_buf[round]);
+ CL_SET_ARG(rowCounters[(round - 1) % 2]);
+ CL_SET_ARG(rowCounters[round % 2]);
+ }
+ CL_SET_ARG(clState->padbuffer8);
+ }
unsigned int num = 0;
- cl_kernel *kernel = &clState->kernel;
+ cl_kernel *kernel = &clState->extra_kernels[1 + 9];
+ CL_SET_ARG(clState->index_buf[8]);
CL_SET_ARG(clState->CLbuffer0);
- CL_SET_ARG(clState->buffer1);
+ CL_SET_ARG(rowCounters[0]);
+
+ num = 0;
+ kernel = &clState->kernel;
+ CL_SET_ARG(clState->index_buf[0]);
+ CL_SET_ARG(clState->index_buf[1]);
CL_SET_ARG(clState->outputBuffer);
- CL_SET_ARG(clState->buffer2);
- CL_SET_ARG(clState->buffer3);
+ CL_SET_ARG(rowCounters[0]);
+ CL_SET_ARG(rowCounters[1]);
+ for (int i = 2; i < 9; i++)
+ CL_SET_ARG(clState->index_buf[i]);
+ CL_SET_ARG(clState->CLbuffer0);
if (status != CL_SUCCESS) {
applog(LOG_ERR, "Error %d: Setting Kernel arguments for ALGO_EQUIHASH failed. (clSetKernelArg)", status);