diff --git a/algorithm.c b/algorithm.c index 8f40a59e..2f543556 100644 --- a/algorithm.c +++ b/algorithm.c @@ -1293,10 +1293,10 @@ static algorithm_settings_t algos[] = { { "ethash", ALGO_ETHASH, "", (1ULL << 32), (1ULL << 32), 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x00000000UL, 0, 128, 0, ethash_regenhash, NULL, queue_ethash_kernel, gen_hash, append_ethash_compiler_options }, { "ethash-genoil", ALGO_ETHASH, "", (1ULL << 32), (1ULL << 32), 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x00000000UL, 0, 128, 0, ethash_regenhash, NULL, queue_ethash_kernel, gen_hash, append_ethash_compiler_options }, - { "equihash", ALGO_EQUIHASH, "", 1, (1ULL << 28), (1ULL << 28), 0, 0, 0x20000, 0xFFFF000000000000ULL, 0x00000000UL, 0, 128, 0, equihash_regenhash, NULL, queue_equihash_kernel, gen_hash, append_equihash_compiler_options }, - { "cryptonight", ALGO_CRYPTONIGHT, "", (1ULL << 32), (1ULL << 32), (1ULL << 32), 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, 0, 0, cryptonight_regenhash, NULL, queue_cryptonight_kernel, gen_hash, NULL }, + { "equihash", ALGO_EQUIHASH, "", 1, (1ULL << 28), (1ULL << 28), 0, 0, 0x20000, 0xFFFF000000000000ULL, 0x00000000UL, 0, 128, 0, equihash_regenhash, NULL, queue_equihash_kernel, gen_hash, append_equihash_compiler_options }, + // Terminator (do not remove) { NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL } }; diff --git a/algorithm.h b/algorithm.h index 11f43a24..54e2308e 100644 --- a/algorithm.h +++ b/algorithm.h @@ -109,9 +109,6 @@ void set_algorithm(algorithm_t* algo, const char* name); /* Set to specific N factor. */ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor); -/* Set to specific K factor. 
*/ -void set_algorithm_kfactor(algorithm_t* algo, const uint8_t kfactor); - /* Compare two algorithm parameters */ bool cmp_algorithm(const algorithm_t* algo1, const algorithm_t* algo2); diff --git a/algorithm/extern/blake2-impl.h b/algorithm/extern/blake2-impl.h deleted file mode 100644 index 5ac7a430..00000000 --- a/algorithm/extern/blake2-impl.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - BLAKE2 reference source code package - reference C implementations - - Written in 2012 by Samuel Neves - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along with - this software. If not, see . -*/ -#pragma once -#ifndef __BLAKE2_IMPL_H__ -#define __BLAKE2_IMPL_H__ - -#include - -static inline uint32_t load32( const void *src ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - uint32_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = ( const uint8_t * )src; - uint32_t w = *p++; - w |= ( uint32_t )( *p++ ) << 8; - w |= ( uint32_t )( *p++ ) << 16; - w |= ( uint32_t )( *p++ ) << 24; - return w; -#endif -} - -static inline uint64_t load64( const void *src ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - uint64_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = ( const uint8_t * )src; - uint64_t w = *p++; - w |= ( uint64_t )( *p++ ) << 8; - w |= ( uint64_t )( *p++ ) << 16; - w |= ( uint64_t )( *p++ ) << 24; - w |= ( uint64_t )( *p++ ) << 32; - w |= ( uint64_t )( *p++ ) << 40; - w |= ( uint64_t )( *p++ ) << 48; - w |= ( uint64_t )( *p++ ) << 56; - return w; -#endif -} - -static inline void store32( void *dst, uint32_t w ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = ( uint8_t * )dst; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - 
*p++ = ( uint8_t )w; -#endif -} - -static inline void store64( void *dst, uint64_t w ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = ( uint8_t * )dst; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; -#endif -} - -static inline uint64_t load48( const void *src ) -{ - const uint8_t *p = ( const uint8_t * )src; - uint64_t w = *p++; - w |= ( uint64_t )( *p++ ) << 8; - w |= ( uint64_t )( *p++ ) << 16; - w |= ( uint64_t )( *p++ ) << 24; - w |= ( uint64_t )( *p++ ) << 32; - w |= ( uint64_t )( *p++ ) << 40; - return w; -} - -static inline void store48( void *dst, uint64_t w ) -{ - uint8_t *p = ( uint8_t * )dst; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; -} - -static inline uint32_t rotl32( const uint32_t w, const unsigned c ) -{ - return ( w << c ) | ( w >> ( 32 - c ) ); -} - -static inline uint64_t rotl64( const uint64_t w, const unsigned c ) -{ - return ( w << c ) | ( w >> ( 64 - c ) ); -} - -static inline uint32_t rotr32( const uint32_t w, const unsigned c ) -{ - return ( w >> c ) | ( w << ( 32 - c ) ); -} - -static inline uint64_t rotr64( const uint64_t w, const unsigned c ) -{ - return ( w >> c ) | ( w << ( 64 - c ) ); -} - -/* prevents compiler optimizing out memset() */ -static inline void secure_zero_memory( void *v, size_t n ) -{ - volatile uint8_t *p = ( volatile uint8_t * )v; - while( n-- ) *p++ = 0; -} - -#endif - diff --git a/algorithm/extern/blake2.h b/algorithm/extern/blake2.h deleted file mode 100644 index f8aba833..00000000 --- a/algorithm/extern/blake2.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - BLAKE2 reference source code package - reference C implementations - - Written in 2012 by 
Samuel Neves - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along with - this software. If not, see . -*/ -#pragma once -#ifndef __BLAKE2_H__ -#define __BLAKE2_H__ - -#include -#include - -#if defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#else -#define ALIGN(x) __attribute__((aligned(x))) -#endif - -#if defined(__cplusplus) -extern "C" { -#endif - - enum blake2s_constant - { - BLAKE2S_BLOCKBYTES = 64, - BLAKE2S_OUTBYTES = 32, - BLAKE2S_KEYBYTES = 32, - BLAKE2S_SALTBYTES = 8, - BLAKE2S_PERSONALBYTES = 8 - }; - - enum blake2b_constant - { - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - BLAKE2B_KEYBYTES = 64, - BLAKE2B_SALTBYTES = 16, - BLAKE2B_PERSONALBYTES = 16 - }; - -#pragma pack(push, 1) - typedef struct __blake2s_param - { - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint8_t node_offset[6];// 14 - uint8_t node_depth; // 15 - uint8_t inner_length; // 16 - // uint8_t reserved[0]; - uint8_t salt[BLAKE2S_SALTBYTES]; // 24 - uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 - } blake2s_param; - - ALIGN( 64 ) typedef struct __blake2s_state - { - uint32_t h[8]; - uint32_t t[2]; - uint32_t f[2]; - uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; - size_t buflen; - uint8_t last_node; - } blake2s_state ; - - typedef struct __blake2b_param - { - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint64_t node_offset; // 16 - uint8_t node_depth; // 17 - uint8_t inner_length; // 18 - uint8_t reserved[14]; // 32 - uint8_t salt[BLAKE2B_SALTBYTES]; // 48 - uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64 - } blake2b_param; - - ALIGN( 64 ) typedef struct __blake2b_state 
- { - uint64_t h[8]; - uint64_t t[2]; - uint64_t f[2]; - uint8_t buf[2 * BLAKE2B_BLOCKBYTES]; - size_t buflen; - uint8_t last_node; - } blake2b_state; - - typedef struct __blake2sp_state - { - blake2s_state S[8][1]; - blake2s_state R[1]; - uint8_t buf[8 * BLAKE2S_BLOCKBYTES]; - size_t buflen; - } blake2sp_state; - - typedef struct __blake2bp_state - { - blake2b_state S[4][1]; - blake2b_state R[1]; - uint8_t buf[4 * BLAKE2B_BLOCKBYTES]; - size_t buflen; - } blake2bp_state; -#pragma pack(pop) - - // Streaming API - int blake2s_init( blake2s_state *S, const uint8_t outlen ); - int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); - int blake2s_init_param( blake2s_state *S, const blake2s_param *P ); - int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ); - int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ); - - int blake2b_init( blake2b_state *S, const uint8_t outlen ); - int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); - int blake2b_init_param( blake2b_state *S, const blake2b_param *P ); - int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ); - int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ); - - int blake2sp_init( blake2sp_state *S, const uint8_t outlen ); - int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); - int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen ); - int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen ); - - int blake2bp_init( blake2bp_state *S, const uint8_t outlen ); - int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); - int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen ); - int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen ); - - // Simple API - int blake2s( uint8_t *out, const void 
*in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); - int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); - - int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); - int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); - - static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) - { - return blake2b( out, in, key, outlen, inlen, keylen ); - } - -#if defined(__cplusplus) -} -#endif - -#endif - diff --git a/algorithm/extern/blake2b-ref.c b/algorithm/extern/blake2b-ref.c deleted file mode 100644 index 7064b28d..00000000 --- a/algorithm/extern/blake2b-ref.c +++ /dev/null @@ -1,396 +0,0 @@ -/* - BLAKE2 reference source code package - reference C implementations - - Written in 2012 by Samuel Neves - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along with - this software. If not, see . 
-*/ - -#include -#include -#include - -#include "blake2.h" -#include "blake2-impl.h" - -static const uint64_t blake2b_IV[8] = -{ - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL -}; - -static const uint8_t blake2b_sigma[12][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } -}; - - -static inline int blake2b_set_lastnode( blake2b_state *S ) -{ - S->f[1] = -1; - return 0; -} - -static inline int blake2b_clear_lastnode( blake2b_state *S ) -{ - S->f[1] = 0; - return 0; -} - -/* Some helper functions, not necessarily useful */ -int blake2b_set_lastblock( blake2b_state *S ) -{ - if( S->last_node ) blake2b_set_lastnode( S ); - - S->f[0] = -1; - return 0; -} - -static inline int blake2b_clear_lastblock( blake2b_state *S ) -{ - if( S->last_node ) blake2b_clear_lastnode( S ); - - S->f[0] = 0; - return 0; -} - -static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc ) -{ - S->t[0] += inc; - S->t[1] += ( S->t[0] < inc ); - return 0; -} - - - -// Parameter-related functions -static inline int blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length ) -{ - P->digest_length = digest_length; - return 0; -} - -static 
inline int blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout ) -{ - P->fanout = fanout; - return 0; -} - -static inline int blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth ) -{ - P->depth = depth; - return 0; -} - -static inline int blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length ) -{ - store32( &P->leaf_length, leaf_length ); - return 0; -} - -static inline int blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset ) -{ - store64( &P->node_offset, node_offset ); - return 0; -} - -static inline int blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth ) -{ - P->node_depth = node_depth; - return 0; -} - -static inline int blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length ) -{ - P->inner_length = inner_length; - return 0; -} - -static inline int blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] ) -{ - memcpy( P->salt, salt, BLAKE2B_SALTBYTES ); - return 0; -} - -static inline int blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] ) -{ - memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES ); - return 0; -} - -static inline int blake2b_init0( blake2b_state *S ) -{ - memset( S, 0, sizeof( blake2b_state ) ); - - for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i]; - - return 0; -} - -/* init xors IV with input parameter block */ -int blake2b_init_param( blake2b_state *S, const blake2b_param *P ) -{ - blake2b_init0( S ); - const uint8_t *p = ( const uint8_t * )( P ); - - /* IV XOR ParamBlock */ - for( size_t i = 0; i < 8; ++i ) - S->h[i] ^= load64( p + sizeof( S->h[i] ) * i ); - - return 0; -} - - - -int blake2b_init( blake2b_state *S, const uint8_t outlen ) -{ - blake2b_param P[1]; - - if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; - - P->digest_length = outlen; - P->key_length = 0; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - 
store64( &P->node_offset, 0 ); - P->node_depth = 0; - P->inner_length = 0; - memset( P->reserved, 0, sizeof( P->reserved ) ); - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - return blake2b_init_param( S, P ); -} - - -int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ) -{ - blake2b_param P[1]; - - if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; - - if ( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1; - - P->digest_length = outlen; - P->key_length = keylen; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - store64( &P->node_offset, 0 ); - P->node_depth = 0; - P->inner_length = 0; - memset( P->reserved, 0, sizeof( P->reserved ) ); - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - - if( blake2b_init_param( S, P ) < 0 ) return -1; - - { - uint8_t block[BLAKE2B_BLOCKBYTES]; - memset( block, 0, BLAKE2B_BLOCKBYTES ); - memcpy( block, key, keylen ); - blake2b_update( S, block, BLAKE2B_BLOCKBYTES ); - secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */ - } - return 0; -} - -int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ) -{ - uint64_t m[16]; - uint64_t v[16]; - int i; - - for( i = 0; i < 16; ++i ) - m[i] = load64( block + i * sizeof( m[i] ) ); - - for( i = 0; i < 8; ++i ) - v[i] = S->h[i]; - - v[ 8] = blake2b_IV[0]; - v[ 9] = blake2b_IV[1]; - v[10] = blake2b_IV[2]; - v[11] = blake2b_IV[3]; - v[12] = S->t[0] ^ blake2b_IV[4]; - v[13] = S->t[1] ^ blake2b_IV[5]; - v[14] = S->f[0] ^ blake2b_IV[6]; - v[15] = S->f[1] ^ blake2b_IV[7]; -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2*i+0]]; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2*i+1]]; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ - } while(0) -#define ROUND(r) \ - do { \ - 
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while(0) - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); - - for( i = 0; i < 8; ++i ) - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; - -#undef G -#undef ROUND - return 0; -} - -/* inlen now in bytes */ -int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ) -{ - while( inlen > 0 ) - { - size_t left = S->buflen; - size_t fill = 2 * BLAKE2B_BLOCKBYTES - left; - - if( inlen > fill ) - { - memcpy( S->buf + left, in, fill ); // Fill buffer - S->buflen += fill; - blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); - blake2b_compress( S, S->buf ); // Compress - memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left - S->buflen -= BLAKE2B_BLOCKBYTES; - in += fill; - inlen -= fill; - } - else // inlen <= fill - { - memcpy( S->buf + left, in, inlen ); - S->buflen += inlen; // Be lazy, do not compress - in += inlen; - inlen -= inlen; - } - } - - return 0; -} - -/* Is this correct? 
*/ -int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ) -{ - uint8_t buffer[BLAKE2B_OUTBYTES] = {0}; - - if( outlen > BLAKE2B_OUTBYTES ) - return -1; - - if( S->buflen > BLAKE2B_BLOCKBYTES ) - { - blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); - blake2b_compress( S, S->buf ); - S->buflen -= BLAKE2B_BLOCKBYTES; - memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen ); - } - - blake2b_increment_counter( S, S->buflen ); - blake2b_set_lastblock( S ); - memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ - blake2b_compress( S, S->buf ); - -// printf("S->buflen = %016llX\n", S->buflen); - - for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ - store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); - - memcpy( out, buffer, outlen ); - return 0; -} - -/* inlen, at least, should be uint64_t. Others can be size_t. */ -int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) -{ - blake2b_state S[1]; - - /* Verify parameters */ - if ( NULL == in ) return -1; - - if ( NULL == out ) return -1; - - if( NULL == key ) keylen = 0; - - if( keylen > 0 ) - { - if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1; - } - else - { - if( blake2b_init( S, outlen ) < 0 ) return -1; - } - - blake2b_update( S, ( const uint8_t * )in, inlen ); - blake2b_final( S, out, outlen ); - return 0; -} - -#if defined(BLAKE2B_SELFTEST) -#include -#include "blake2-kat.h" -int main( int argc, char **argv ) -{ - uint8_t key[BLAKE2B_KEYBYTES]; - uint8_t buf[KAT_LENGTH]; - - for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i ) - key[i] = ( uint8_t )i; - - for( size_t i = 0; i < KAT_LENGTH; ++i ) - buf[i] = ( uint8_t )i; - - for( size_t i = 0; i < KAT_LENGTH; ++i ) - { - uint8_t hash[BLAKE2B_OUTBYTES]; - blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES ); - - if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) ) - { - puts( "error" ); - return -1; - } - } 
- - puts( "ok" ); - return 0; -} -#endif - diff --git a/driver-opencl.c b/driver-opencl.c index f1704c0d..599470f1 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -492,12 +492,12 @@ char *set_temp_overheat(char *arg) return "Invalid value passed to set temp overheat"; gpus[device].adl.overtemp = val; - gpus[device++].sysfs_info.OverHeatTemp = val; + gpus[device++].sysfs_info.OverHeatTemp = val; } if (device == 1) { for (i = device; i < MAX_GPUDEVICES; i++) { gpus[i].adl.overtemp = val; - gpus[i].sysfs_info.OverHeatTemp = val; + gpus[i].sysfs_info.OverHeatTemp = val; } } @@ -529,7 +529,7 @@ char *set_temp_target(char *arg) tt = &gpus[device].adl.targettemp; *tt = val; tt = &gpus[device++].sysfs_info.TargetTemp; - *tt = val; + *tt = val; } if (device == 1) { @@ -537,7 +537,7 @@ char *set_temp_target(char *arg) tt = &gpus[i].adl.targettemp; *tt = val; tt = &gpus[i].sysfs_info.TargetTemp; - *tt = val; + *tt = val; } } @@ -1455,7 +1455,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, // increase nonce work->blk.nonce++; if (work->getwork_mode == GETWORK_MODE_STRATUM) - *(uint16_t*)(work->equihash_data + 108 + strlen(work->nonce1) / 2) += 1; + *(uint16_t*)(work->equihash_data + 108 + strlen(work->nonce1) / 2) += 1; else { *(uint64_t*)(work->equihash_data + 108) += 1; @@ -1476,128 +1476,115 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, } // if (algorithm.type == ALGO_ETHASH) read lock gpu->eth_dag.lock has to be released - if(gpu->algorithm.type == ALGO_CRYPTONIGHT) - { - mutex_lock(&work->pool->XMRGlobalNonceLock); - work->blk.nonce = work->pool->XMRGlobalNonce; - work->pool->XMRGlobalNonce += gpu->max_hashes; - mutex_unlock(&work->pool->XMRGlobalNonceLock); + if(gpu->algorithm.type == ALGO_CRYPTONIGHT) { + mutex_lock(&work->pool->XMRGlobalNonceLock); + work->blk.nonce = work->pool->XMRGlobalNonce; + work->pool->XMRGlobalNonce += gpu->max_hashes; + mutex_unlock(&work->pool->XMRGlobalNonceLock); } if 
(clState->goffset) p_global_work_offset = (size_t *)&work->blk.nonce; - if(gpu->algorithm.type == ALGO_CRYPTONIGHT) - { - size_t GlobalThreads = *globalThreads, Nonce[2] = { (size_t)work->blk.nonce, 1}, gthreads[2] = { *globalThreads, 8 }, lthreads[2] = { *localThreads, 8 }; - size_t BranchBufCount[4] = { 0, 0, 0, 0 }; - - for(int i = 0; i < 4; ++i) - { - cl_uint zero = 0; - - status = clEnqueueWriteBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_FALSE, sizeof(cl_uint) * GlobalThreads, sizeof(cl_uint), &zero, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while resetting branch buffer counter %d.\n", status, i); - return(-1); - } - } - - clFinish(clState->commandQueue); - - // Main CN P0 - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 2, Nonce, gthreads, lthreads, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to enqueue kernel 0.", status); - return(-1); - } - - // Main CN P1 - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[0], 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to enqueue kernel 1.", status); - return(-1); - } - - // Main CN P2 - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[1], 2, Nonce, gthreads, lthreads, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to enqueue kernel 2.", status); - return(-1); - } - - // Read BranchBuf counters - - for(int i = 0; i < 4; ++i) - { - status = clEnqueueReadBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_FALSE, sizeof(cl_uint) * GlobalThreads, sizeof(cl_uint), BranchBufCount + i, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to read branch buffer counter %d.", status, i); - return(-1); - } - } - - clFinish(clState->commandQueue); - - 
for(int i = 0; i < 4; ++i) - { - if(BranchBufCount[i]) - { - cl_ulong tmp = BranchBufCount[i]; - - // Threads - status = clSetKernelArg(clState->extra_kernels[i + 2], 4, sizeof(cl_ulong), &tmp); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to set argument 4 for kernel %d.", status, i + 2); - return(-1); - } - - // Make it a multiple of the local worksize (some drivers will otherwise shit a brick) - BranchBufCount[i] += (clState->wsize - (BranchBufCount[i] & (clState->wsize - 1))); - - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i + 2], 1, p_global_work_offset, BranchBufCount + i, localThreads, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to enqueue kernel %d.", status, i + 2); - return(-1); - } - } - } - } - else - { - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, p_global_work_offset, - globalThreads, localThreads, 0, NULL, NULL); - if (unlikely(status != CL_SUCCESS)) { - if (work->pool->algorithm.type == ALGO_ETHASH) - cg_runlock(&gpu->eth_dag.lock); - applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); - return -1; - } - - for (i = 0; i < clState->n_extra_kernels; i++) { - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, - globalThreads, localThreads, 0, NULL, NULL); - if (unlikely(status != CL_SUCCESS)) { - applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. 
(clEnqueueNDRangeKernel)", status); - return -1; - } - } - } - + if (gpu->algorithm.type == ALGO_CRYPTONIGHT) { + size_t GlobalThreads = *globalThreads, Nonce[2] = { (size_t)work->blk.nonce, 1}, gthreads[2] = { *globalThreads, 8 }, lthreads[2] = { *localThreads, 8 }; + size_t BranchBufCount[4] = { 0, 0, 0, 0 }; + + for (int i = 0; i < 4; ++i) { + cl_uint zero = 0; + + status = clEnqueueWriteBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_FALSE, sizeof(cl_uint) * GlobalThreads, sizeof(cl_uint), &zero, 0, NULL, NULL); + + if(status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while resetting branch buffer counter %d.\n", status, i); + return -1; + } + } + + clFinish(clState->commandQueue); + + // Main CN P0 + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 2, Nonce, gthreads, lthreads, 0, NULL, NULL); + + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to enqueue kernel 0.", status); + return -1; + } + + // Main CN P1 + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[0], 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL); + + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to enqueue kernel 1.", status); + return -1; + } + + // Main CN P2 + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[1], 2, Nonce, gthreads, lthreads, 0, NULL, NULL); + + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to enqueue kernel 2.", status); + return -1; + } + + // Read BranchBuf counters + + for (int i = 0; i < 4; ++i) { + status = clEnqueueReadBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_FALSE, sizeof(cl_uint) * GlobalThreads, sizeof(cl_uint), BranchBufCount + i, 0, NULL, NULL); + + if(status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to read branch buffer counter %d.", status, i); + return(-1); + } + } + + clFinish(clState->commandQueue); + + for (int i = 0; i < 4; ++i) 
{ + if(BranchBufCount[i]) { + cl_ulong tmp = BranchBufCount[i]; + + // Threads + status = clSetKernelArg(clState->extra_kernels[i + 2], 4, sizeof(cl_ulong), &tmp); + + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to set argument 4 for kernel %d.", status, i + 2); + return -1; + } + + // Make it a multiple of the local worksize (some drivers will otherwise shit a brick) + BranchBufCount[i] += (clState->wsize - (BranchBufCount[i] & (clState->wsize - 1))); + + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i + 2], 1, p_global_work_offset, BranchBufCount + i, localThreads, 0, NULL, NULL); + + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to enqueue kernel %d.", status, i + 2); + return -1; + } + } + } + } + else { + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, p_global_work_offset, + globalThreads, localThreads, 0, NULL, NULL); + + if (unlikely(status != CL_SUCCESS)) { + if (work->pool->algorithm.type == ALGO_ETHASH) + cg_runlock(&gpu->eth_dag.lock); + applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); + return -1; + } + + for (i = 0; i < clState->n_extra_kernels; i++) { + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, + globalThreads, localThreads, 0, NULL, NULL); + if (unlikely(status != CL_SUCCESS)) { + applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. 
(clEnqueueNDRangeKernel)", status); + return -1; + } + } + } + status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, buffersize, thrdata->res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { @@ -1628,9 +1615,9 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, } applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id); postcalc_hash_async(thr, work, thrdata->res); -// postcalc_hash(thr); -// submit_tested_work(thr, work); -// submit_work_async(work); +// postcalc_hash(thr); +// submit_tested_work(thr, work); +// submit_work_async(work); memset(thrdata->res, 0, buffersize); /* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */ clFinish(clState->commandQueue); @@ -1652,12 +1639,12 @@ static void opencl_thread_shutdown(struct thr_info *thr) clFinish(clState->commandQueue); clReleaseMemObject(clState->outputBuffer); clReleaseMemObject(clState->CLbuffer0); - if (clState->buffer1) - clReleaseMemObject(clState->buffer1); - if (clState->buffer2) - clReleaseMemObject(clState->buffer2); - if (clState->buffer3) - clReleaseMemObject(clState->buffer3); + if (clState->buffer1) + clReleaseMemObject(clState->buffer1); + if (clState->buffer2) + clReleaseMemObject(clState->buffer2); + if (clState->buffer3) + clReleaseMemObject(clState->buffer3); if (clState->padbuffer8) clReleaseMemObject(clState->padbuffer8); clReleaseKernel(clState->kernel); diff --git a/kernel/equihash-param.h b/kernel/equihash-param.h index a40eea69..ace80692 100644 --- a/kernel/equihash-param.h +++ b/kernel/equihash-param.h @@ -1,113 +1,113 @@ -#ifndef __OPENCL_VERSION__ -#define uint uint32_t -#define uchar uint8_t -#endif - - -#define PARAM_N 200 -#define PARAM_K 9 -#define PREFIX (PARAM_N / (PARAM_K + 1)) -#define NR_INPUTS (1 << PREFIX) -// Approximate log base 2 of number of elements in hash tables -#define APX_NR_ELMS_LOG (PREFIX + 1) -// Number of rows and slots is affected by this. 
20 offers the best performance -// but occasionally misses ~1% of solutions. -#define NR_ROWS_LOG 18 - -// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md -#define OPTIM_SIMPLIFY_ROUND 1 - -// Number of collision items to track, per thread -#define THREADS_PER_ROW 8 -#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW)) - -// Ratio of time of sleeping before rechecking if task is done (0-1) -#define SLEEP_RECHECK_RATIO 0.60 -// Ratio of time to busy wait for the solution (0-1) -// The higher value the higher CPU usage with Nvidia -#define SLEEP_SKIP_RATIO 0.005 - -// Make hash tables OVERHEAD times larger than necessary to store the average -// number of elements per row. The ideal value is as small as possible to -// reduce memory usage, but not too small or else elements are dropped from the -// hash tables. -// -// The actual number of elements per row is closer to the theoretical average -// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be -// smaller. -// -// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease -// performance as they cause VRAM channel conflicts. 
-#if NR_ROWS_LOG == 16 -// #error "NR_ROWS_LOG = 16 is currently broken - do not use" -#define OVERHEAD 2 -#elif NR_ROWS_LOG == 18 -#define OVERHEAD 3 -#elif NR_ROWS_LOG == 19 -#define OVERHEAD 5 -#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND -#define OVERHEAD 6 -#elif NR_ROWS_LOG == 20 -#define OVERHEAD 9 -#endif - -#define NR_ROWS (1 << NR_ROWS_LOG) -#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)) -// Length of 1 element (slot) in byte -#define SLOT_LEN 32 -// Total size of hash table -#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) -// Length of Zcash block header, nonce (part of header) -#define ZCASH_BLOCK_HEADER_LEN 140 -// Offset of nTime in header -#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) -// Length of nonce -#define ZCASH_NONCE_LEN 32 -// Length of encoded representation of solution size -#define ZCASH_SOLSIZE_LEN 3 -// Solution size (1344 = 0x540) represented as a compact integer, in hex -#define ZCASH_SOLSIZE_HEX "fd4005" -// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) -#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) -// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization -#define N_ZERO_BYTES 12 -// Number of bytes Zcash needs out of Blake -#define ZCASH_HASH_LEN 50 -// Number of wavefronts per SIMD for the Blake kernel. -// Blake is ALU-bound (beside the atomic counter being incremented) so we need -// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer -// instructions. 10 is the max supported by the hw. 
-#define BLAKE_WPS 10 -// Maximum number of solutions reported by kernel to host -#define MAX_SOLS 10 -// Length of SHA256 target -#define SHA256_TARGET_LEN (256 / 8) - -#if (NR_SLOTS < 16) -#define BITS_PER_ROW 4 -#define ROWS_PER_UINT 8 -#define ROW_MASK 0x0F -#else -#define BITS_PER_ROW 8 -#define ROWS_PER_UINT 4 -#define ROW_MASK 0xFF -#endif - -// Optional features -#undef ENABLE_DEBUG - -/* -** Return the offset of Xi in bytes from the beginning of the slot. -*/ -#define xi_offset_for_round(round) (8 + ((round) / 2) * 4) - -// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values -#define SOL_SIZE ((1 << PARAM_K) * 4) -typedef struct sols_s -{ - uint nr; - uint likely_invalids; - uchar valid[MAX_SOLS]; - uint values[MAX_SOLS][(1 << PARAM_K)]; -} sols_t; - +#ifndef __OPENCL_VERSION__ +#define uint uint32_t +#define uchar uint8_t +#endif + + +#define PARAM_N 200 +#define PARAM_K 9 +#define PREFIX (PARAM_N / (PARAM_K + 1)) +#define NR_INPUTS (1 << PREFIX) +// Approximate log base 2 of number of elements in hash tables +#define APX_NR_ELMS_LOG (PREFIX + 1) +// Number of rows and slots is affected by this. 20 offers the best performance +// but occasionally misses ~1% of solutions. +#define NR_ROWS_LOG 18 + +// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md +#define OPTIM_SIMPLIFY_ROUND 1 + +// Number of collision items to track, per thread +#define THREADS_PER_ROW 8 +#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW)) + +// Ratio of time of sleeping before rechecking if task is done (0-1) +#define SLEEP_RECHECK_RATIO 0.60 +// Ratio of time to busy wait for the solution (0-1) +// The higher value the higher CPU usage with Nvidia +#define SLEEP_SKIP_RATIO 0.005 + +// Make hash tables OVERHEAD times larger than necessary to store the average +// number of elements per row. The ideal value is as small as possible to +// reduce memory usage, but not too small or else elements are dropped from the +// hash tables. 
+// +// The actual number of elements per row is closer to the theoretical average +// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be +// smaller. +// +// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease +// performance as they cause VRAM channel conflicts. +#if NR_ROWS_LOG == 16 +// #error "NR_ROWS_LOG = 16 is currently broken - do not use" +#define OVERHEAD 2 +#elif NR_ROWS_LOG == 18 +#define OVERHEAD 3 +#elif NR_ROWS_LOG == 19 +#define OVERHEAD 5 +#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND +#define OVERHEAD 6 +#elif NR_ROWS_LOG == 20 +#define OVERHEAD 9 +#endif + +#define NR_ROWS (1 << NR_ROWS_LOG) +#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)) +// Length of 1 element (slot) in byte +#define SLOT_LEN 32 +// Total size of hash table +#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) +// Length of Zcash block header, nonce (part of header) +#define ZCASH_BLOCK_HEADER_LEN 140 +// Offset of nTime in header +#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) +// Length of nonce +#define ZCASH_NONCE_LEN 32 +// Length of encoded representation of solution size +#define ZCASH_SOLSIZE_LEN 3 +// Solution size (1344 = 0x540) represented as a compact integer, in hex +#define ZCASH_SOLSIZE_HEX "fd4005" +// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) +#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) +// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization +#define N_ZERO_BYTES 12 +// Number of bytes Zcash needs out of Blake +#define ZCASH_HASH_LEN 50 +// Number of wavefronts per SIMD for the Blake kernel. +// Blake is ALU-bound (beside the atomic counter being incremented) so we need +// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer +// instructions. 10 is the max supported by the hw. 
+#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 10 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 16) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#else +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#endif + +// Optional features +#undef ENABLE_DEBUG + +/* +** Return the offset of Xi in bytes from the beginning of the slot. +*/ +#define xi_offset_for_round(round) (8 + ((round) / 2) * 4) + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + diff --git a/kernel/equihash.cl b/kernel/equihash.cl index 8fedf79e..460d20a5 100644 --- a/kernel/equihash.cl +++ b/kernel/equihash.cl @@ -1,833 +1,833 @@ -#include "equihash-param.h" - -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable - -/* -** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in -** bytes in parens): -** -** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1) -** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3) -** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2) -** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4) -** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3) -** round 5, table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5) -** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4) -** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6) -** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5) -** -** If the first byte of Xi is 0xAB then: -** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi -** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but -** 'A' is considered redundant padding as it was 
used to compute the row # -** -** - cnt is an atomic counter keeping track of the number of used slots. -** it is used in the first slot only; subsequent slots replace it with -** 4 padding bytes -** - i encodes either the 21-bit input value (round 0) or a reference to two -** inputs from the previous round -** -** Formula for Xi length and pad length above: -** > for i in range(9): -** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi -** -** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds -** is the 4 most significant bits of the last byte of Xi. -*/ - -__constant ulong blake_iv[] = -{ - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, -}; - -/* -** Reset counters in hash table. -*/ -__kernel -void kernel_init_ht(__global char *ht, __global uint *rowCounters) -{ - rowCounters[get_global_id(0)] = 0; -} - -/* -** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they -** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): -** aa aa ab bb bb cc cc cd dd... [round 0] -** -------------------- -** ...ab bb bb cc cc cd dd... [odd round] -** -------------- -** ...cc cc cd dd... [next even round] -** ----- -** Bytes underlined are going to be stored in the slot. Preceding bytes -** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are -** used to compute the row number. -** -** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) -** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. 
the colliding PREFIX nibble) -** TODO: update lines below with padding nibbles -** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) -** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) -** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) -** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) -** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) -** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) -** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) -** -** Return 0 if successfully stored, or 1 if the row overflowed. -*/ -uint ht_store(uint round, __global char *ht, uint i, - ulong xi0, ulong xi1, ulong xi2, ulong xi3, __global uint *rowCounters) -{ - uint row; - __global char *p; - uint cnt; -#if NR_ROWS_LOG == 16 - if (!(round % 2)) - row = (xi0 & 0xffff); - else - // if we have in hex: "ab cd ef..." (little endian xi0) then this - // formula computes the row as 0xdebc. it skips the 'a' nibble as it - // is part of the PREFIX. 
The Xi will be stored starting with "ef..."; - // 'e' will be considered padding and 'f' is part of the current PREFIX - row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 18 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6); - else - row = ((xi0 & 0xc0000) >> 2) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 19 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5); - else - row = ((xi0 & 0xe0000) >> 1) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 20 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); - else - row = ((xi0 & 0xf0000) >> 0) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#else -#error "unsupported NR_ROWS_LOG" -#endif - xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); - xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); - xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); - p = ht + row * NR_SLOTS * SLOT_LEN; - uint rowIdx = row/ROWS_PER_UINT; - uint rowOffset = BITS_PER_ROW*(row%ROWS_PER_UINT); - uint xcnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); - xcnt = (xcnt >> rowOffset) & ROW_MASK; - cnt = xcnt; - if (cnt >= NR_SLOTS) - { - // avoid overflows - atomic_sub(rowCounters + rowIdx, 1 << rowOffset); - return 1; - } - p += cnt * SLOT_LEN + xi_offset_for_round(round); - // store "i" (always 4 bytes before Xi) - *(__global uint *)(p - 4) = i; - if (round == 0 || round == 1) - { - // store 24 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global ulong *)(p + 16) = xi2; - } - else if (round == 2) - { - // store 20 bytes - *(__global uint *)(p + 0) = xi0; - *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); - *(__global ulong *)(p + 12) = (xi1 >> 32) | (xi2 << 32); - } - else if (round == 3) - { - // store 
16 bytes - *(__global uint *)(p + 0) = xi0; - *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); - *(__global uint *)(p + 12) = (xi1 >> 32); - } - else if (round == 4) - { - // store 16 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - } - else if (round == 5) - { - // store 12 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global uint *)(p + 8) = xi1; - } - else if (round == 6 || round == 7) - { - // store 8 bytes - *(__global uint *)(p + 0) = xi0; - *(__global uint *)(p + 4) = (xi0 >> 32); - } - else if (round == 8) - { - // store 4 bytes - *(__global uint *)(p + 0) = xi0; - } - return 0; -} - -#define mix(va, vb, vc, vd, x, y) \ - va = (va + vb + x); \ -vd = rotate((vd ^ va), (ulong)64 - 32); \ -vc = (vc + vd); \ -vb = rotate((vb ^ vc), (ulong)64 - 24); \ -va = (va + vb + y); \ -vd = rotate((vd ^ va), (ulong)64 - 16); \ -vc = (vc + vd); \ -vb = rotate((vb ^ vc), (ulong)64 - 63); - -/* -** Execute round 0 (blake). -** -** Note: making the work group size less than or equal to the wavefront size -** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local -** Memory (LDS) Optimization 2-10" in: -** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ -*/ -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void kernel_round0(__global ulong *blake_state, __global char *ht, - __global uint *rowCounters, __global uint *debug) -{ - uint tid = get_global_id(0); - ulong v[16]; - uint inputs_per_thread = NR_INPUTS / get_global_size(0); - uint input = tid * inputs_per_thread; - uint input_end = (tid + 1) * inputs_per_thread; - uint dropped = 0; - while (input < input_end) - { - // shift "i" to occupy the high 32 bits of the second ulong word in the - // message block - ulong word1 = (ulong)input << 32; - // init vector v - v[0] = blake_state[0]; - v[1] = blake_state[1]; - v[2] = blake_state[2]; - v[3] = blake_state[3]; - v[4] = 
blake_state[4]; - v[5] = blake_state[5]; - v[6] = blake_state[6]; - v[7] = blake_state[7]; - v[8] = blake_iv[0]; - v[9] = blake_iv[1]; - v[10] = blake_iv[2]; - v[11] = blake_iv[3]; - v[12] = blake_iv[4]; - v[13] = blake_iv[5]; - v[14] = blake_iv[6]; - v[15] = blake_iv[7]; - // mix in length of data - v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; - // last block - v[14] ^= (ulong)-1; - - // round 1 - mix(v[0], v[4], v[8], v[12], 0, word1); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 2 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], word1, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 3 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, word1); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 4 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, word1); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 5 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, word1); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], 
v[4], v[9], v[14], 0, 0); - // round 6 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], word1, 0); - // round 7 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], word1, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 8 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, word1); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 9 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], word1, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 10 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], word1, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 11 - mix(v[0], v[4], v[8], v[12], 0, word1); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 12 - 
mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], word1, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - - // compress v into the blake state; this produces the 50-byte hash - // (two Xi values) - ulong h[7]; - h[0] = blake_state[0] ^ v[0] ^ v[8]; - h[1] = blake_state[1] ^ v[1] ^ v[9]; - h[2] = blake_state[2] ^ v[2] ^ v[10]; - h[3] = blake_state[3] ^ v[3] ^ v[11]; - h[4] = blake_state[4] ^ v[4] ^ v[12]; - h[5] = blake_state[5] ^ v[5] ^ v[13]; - h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; - - // store the two Xi values in the hash table -#if ZCASH_HASH_LEN == 50 - dropped += ht_store(0, ht, input * 2, - h[0], - h[1], - h[2], - h[3], rowCounters); - dropped += ht_store(0, ht, input * 2 + 1, - (h[3] >> 8) | (h[4] << (64 - 8)), - (h[4] >> 8) | (h[5] << (64 - 8)), - (h[5] >> 8) | (h[6] << (64 - 8)), - (h[6] >> 8), rowCounters); -#else -#error "unsupported ZCASH_HASH_LEN" -#endif - - input++; - } -#ifdef ENABLE_DEBUG - debug[tid * 2] = 0; - debug[tid * 2 + 1] = dropped; -#endif -} - -#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) -#define DECODE_ROW(REF) (REF >> 16) -#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) -#define DECODE_SLOT0(REF) (REF & 0xff) - -#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) -#define DECODE_ROW(REF) (REF >> 14) -#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) -#define DECODE_SLOT0(REF) (REF & 0x7f) - -#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ -#define DECODE_ROW(REF) (REF >> 13) -#define DECODE_SLOT1(REF) ((REF 
>> 6) & 0x3f) -#define DECODE_SLOT0(REF) (REF & 0x3f) - -#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) -#define DECODE_ROW(REF) (REF >> 12) -#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) -#define DECODE_SLOT0(REF) (REF & 0x3f) - -#else -#error "unsupported NR_ROWS_LOG" -#endif - -/* -** Access a half-aligned long, that is a long aligned on a 4-byte boundary. -*/ -ulong half_aligned_long(__global ulong *p, uint offset) -{ - return - (((ulong)*(__global uint *)((__global char *)p + offset + 0)) << 0) | - (((ulong)*(__global uint *)((__global char *)p + offset + 4)) << 32); -} - -/* -** Access a well-aligned int. -*/ -uint well_aligned_int(__global ulong *_p, uint offset) -{ - __global char *p = (__global char *)_p; - return *(__global uint *)(p + offset); -} - -/* -** XOR a pair of Xi values computed at "round - 1" and store the result in the -** hash table being built for "round". Note that when building the table for -** even rounds we need to skip 1 padding byte present in the "round - 1" table -** (the "0xAB" byte mentioned in the description at the top of this file.) But -** also note we can't load data directly past this byte because this would -** cause an unaligned memory access which is undefined per the OpenCL spec. -** -** Return 0 if successfully stored, or 1 if the row overflowed. 
-*/ -uint xor_and_store(uint round, __global char *ht_dst, uint row, - uint slot_a, uint slot_b, __global ulong *a, __global ulong *b, - __global uint *rowCounters) -{ - ulong xi0, xi1, xi2; -#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 - // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not - // storing the byte containing bits from the previous PREFIX block for - if (round == 1 || round == 2) - { - // xor 24 bytes - xi0 = *(a++) ^ *(b++); - xi1 = *(a++) ^ *(b++); - xi2 = *a ^ *b; - if (round == 2) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8) | (xi2 << (64 - 8)); - xi2 = (xi2 >> 8); - } - } - else if (round == 3) - { - // xor 20 bytes - xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); - xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); - xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16); - } - else if (round == 4 || round == 5) - { - // xor 16 bytes - xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); - xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); - xi2 = 0; - if (round == 4) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 6) - { - // xor 12 bytes - xi0 = *a++ ^ *b++; - xi1 = *(__global uint *)a ^ *(__global uint *)b; - xi2 = 0; - if (round == 6) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 7 || round == 8) - { - // xor 8 bytes - xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); - xi1 = 0; - xi2 = 0; - if (round == 8) - { - // skip padding byte - xi0 = (xi0 >> 8); - } - } - // invalid solutions (which start happenning in round 5) have duplicate - // inputs and xor to zero, so discard them - if (!xi0 && !xi1) - return 0; -#else -#error "unsupported NR_ROWS_LOG" -#endif - return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b), - xi0, xi1, xi2, 0, rowCounters); -} - -/* -** Execute one Equihash round. 
Read from ht_src, XOR colliding pairs of Xi, -** store them in ht_dst. -*/ -void equihash_round(uint round, - __global char *ht_src, - __global char *ht_dst, - __global uint *debug, - __local uchar *first_words_data, - __local uint *collisionsData, - __local uint *collisionsNum, - __global uint *rowCountersSrc, - __global uint *rowCountersDst, - uint threadsPerRow) -{ - uint globalTid = get_global_id(0) / threadsPerRow; - uint localTid = get_local_id(0) / threadsPerRow; - uint localGroupId = get_local_id(0) % threadsPerRow; - __local uchar *first_words = &first_words_data[NR_SLOTS*localTid]; - - __global char *p; - uint cnt; - uchar mask; - uint i, j; - // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to - // make it even larger - uint n; - uint dropped_coll = 0; - uint dropped_stor = 0; - __global ulong *a, *b; - uint xi_offset; - // read first words of Xi from the previous (round - 1) hash table - xi_offset = xi_offset_for_round(round - 1); - // the mask is also computed to read data from the previous round -#if NR_ROWS_LOG <= 16 - mask = ((!(round % 2)) ? 0x0f : 0xf0); -#elif NR_ROWS_LOG == 18 - mask = ((!(round % 2)) ? 0x03 : 0x30); -#elif NR_ROWS_LOG == 19 - mask = ((!(round % 2)) ? 0x01 : 0x10); -#elif NR_ROWS_LOG == 20 - mask = 0; /* we can vastly simplify the code below */ -#else -#error "unsupported NR_ROWS_LOG" -#endif - - for (uint chunk = 0; chunk < threadsPerRow; chunk++) { - uint tid = globalTid + NR_ROWS/threadsPerRow*chunk; - uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); -// for (uint tid = get_global_id(0)/threadsPerRow; tid < NR_ROWS; tid += get_global_size(0)/threadsPerRow) { - - uint rowIdx = tid/ROWS_PER_UINT; - uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); - cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; - cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. 
round - - *collisionsNum = 0; - p = (ht_src + tid * NR_SLOTS * SLOT_LEN); - p += xi_offset; - p += SLOT_LEN*localGroupId; - for (i = localGroupId; i < cnt; i += threadsPerRow, p += SLOT_LEN*threadsPerRow) - first_words[i] = (*(__global uchar *)p) & mask; - barrier(CLK_LOCAL_MEM_FENCE); - - if (cnt == 0) - // no elements in row, no collisions - goto part2; - // find collisions - for (i = 0; i < cnt-1; i++) - { - uchar data_i = first_words[i]; - uint collision = (localTid << 24) | (i << 12) | (i + 1 + localGroupId); - for (j = i + 1 + localGroupId; j < cnt; j += threadsPerRow) - { - if (data_i == first_words[j]) - { - uint index = atomic_inc(collisionsNum); - if (index >= LDS_COLL_SIZE) { - atomic_dec(collisionsNum); - goto part2; - } - collisionsData[index] = collision; - } - collision += threadsPerRow; - } - } - -part2: - barrier(CLK_LOCAL_MEM_FENCE); - uint totalCollisions = *collisionsNum; - for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0)) - { - uint collision = collisionsData[index]; - uint collisionThreadId = gid + (collision >> 24); - uint i = (collision >> 12) & 0xFFF; - uint j = collision & 0xFFF; - __global uchar *ptr = ht_src + collisionThreadId * NR_SLOTS * SLOT_LEN + - xi_offset; - a = (__global ulong *)(ptr + i * SLOT_LEN); - b = (__global ulong *)(ptr + j * SLOT_LEN); - dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j, - a, b, rowCountersDst); - } - } - -#ifdef ENABLE_DEBUG - debug[tid * 2] = dropped_coll; - debug[tid * 2 + 1] = dropped_stor; -#endif -} - -/* -** This defines kernel_round1, kernel_round2, ..., kernel_round7. 
-*/ -#define KERNEL_ROUND(N) \ -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) \ -void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ - __global uint *rowCountersSrc, __global uint *rowCountersDst, \ - __global uint *debug) \ -{ \ - __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; \ - __local uint collisionsData[LDS_COLL_SIZE]; \ - __local uint collisionsNum; \ - equihash_round(N, ht_src, ht_dst, debug, first_words_data, collisionsData, \ - &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); \ -} -KERNEL_ROUND(1) -KERNEL_ROUND(2) -KERNEL_ROUND(3) -KERNEL_ROUND(4) -KERNEL_ROUND(5) -KERNEL_ROUND(6) -KERNEL_ROUND(7) - -// kernel_round8 takes an extra argument, "sols" -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void kernel_round8(__global char *ht_src, __global char *ht_dst, - __global uint *rowCountersSrc, __global uint *rowCountersDst, - __global uint *debug, __global sols_t *sols) -{ - uint tid = get_global_id(0); - __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; - __local uint collisionsData[LDS_COLL_SIZE]; - __local uint collisionsNum; - equihash_round(8, ht_src, ht_dst, debug, first_words_data, collisionsData, - &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); - if (!tid) - sols->nr = sols->likely_invalids = 0; -} - -uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) -{ - return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN + - slot * SLOT_LEN + xi_offset - 4); -} - -/* -** Expand references to inputs. Return 1 if so far the solution appears valid, -** or 0 otherwise (an invalid solution would be a solution with duplicate -** inputs, which can be detected at the last step: round == 0). 
-*/ -uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs, - uint round) -{ - __global char *ht = htabs[round % 2]; - uint i = nr_inputs - 1; - uint j = nr_inputs * 2 - 1; - uint xi_offset = xi_offset_for_round(round); - int dup_to_watch = -1; - do - { - ins[j] = expand_ref(ht, xi_offset, - DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); - ins[j - 1] = expand_ref(ht, xi_offset, - DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); - if (!round) - { - if (dup_to_watch == -1) - dup_to_watch = ins[j]; - else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) - return 0; - } - if (!i) - break ; - i--; - j -= 2; - } - while (1); - return 1; -} - -/* -** Verify if a potential solution is in fact valid. -*/ -void potential_sol(__global char **htabs, __global sols_t *sols, - uint ref0, uint ref1) -{ - uint nr_values; - uint values_tmp[(1 << PARAM_K)]; - uint sol_i; - uint i; - nr_values = 0; - values_tmp[nr_values++] = ref0; - values_tmp[nr_values++] = ref1; - uint round = PARAM_K - 1; - do - { - round--; - if (!expand_refs(values_tmp, nr_values, htabs, round)) - return ; - nr_values *= 2; - } - while (round > 0); - // solution appears valid, copy it to sols - sol_i = atomic_inc(&sols->nr); - if (sol_i >= MAX_SOLS) - return ; - for (i = 0; i < (1 << PARAM_K); i++) - sols->values[sol_i][i] = values_tmp[i]; - sols->valid[sol_i] = 1; -} - -/* -** Scan the hash tables to find Equihash solutions. 
-*/ -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols, - __global uint *rowCountersSrc, __global uint *rowCountersDst) -{ - __local uint counters[WORKSIZE/THREADS_PER_ROW]; - __local uint refs[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; - __local uint data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; - __local uint collisionsNum; - __local ulong collisions[WORKSIZE*4]; - - uint globalTid = get_global_id(0) / THREADS_PER_ROW; - uint localTid = get_local_id(0) / THREADS_PER_ROW; - uint localGroupId = get_local_id(0) % THREADS_PER_ROW; - __local uint *refsPtr = &refs[NR_SLOTS*localTid]; - __local uint *dataPtr = &data[NR_SLOTS*localTid]; - - __global char *htabs[2] = { ht0, ht1 }; - __global char *hcounters[2] = { rowCountersSrc, rowCountersDst }; - uint ht_i = (PARAM_K - 1) % 2; // table filled at last round - uint cnt; - uint xi_offset = xi_offset_for_round(PARAM_K - 1); - uint i, j; - __global char *p; - uint ref_i, ref_j; - // it's ok for the collisions array to be so small, as if it fills up - // the potential solutions are likely invalid (many duplicate inputs) -// ulong collisions; -#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 - // in the final hash table, we are looking for a match on both the bits - // part of the previous PREFIX colliding bits, and the last PREFIX bits. 
- uint mask = 0xffffff; -#else -#error "unsupported NR_ROWS_LOG" -#endif - - collisionsNum = 0; - - for (uint chunk = 0; chunk < THREADS_PER_ROW; chunk++) { - uint tid = globalTid + NR_ROWS/THREADS_PER_ROW*chunk; - p = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN; - uint rowIdx = tid/ROWS_PER_UINT; - uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); - cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; - cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round - p += xi_offset; - p += SLOT_LEN*localGroupId; - - for (i = get_local_id(0); i < WORKSIZE/THREADS_PER_ROW; i += get_local_size(0)) - counters[i] = 0; - for (i = localGroupId; i < cnt; i += THREADS_PER_ROW, p += SLOT_LEN*THREADS_PER_ROW) { - refsPtr[i] = *(__global uint *)(p - 4); - dataPtr[i] = (*(__global uint *)p) & mask; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (i = 0; i < cnt; i++) - { - uint a_data = dataPtr[i]; - ref_i = refsPtr[i]; - for (j = i + 1 + localGroupId; j < cnt; j += THREADS_PER_ROW) - { - if (a_data == dataPtr[j]) - { - if (atomic_inc(&counters[localTid]) == 0) - collisions[atomic_inc(&collisionsNum)] = ((ulong)ref_i << 32) | refsPtr[j]; - goto part2; - } - } - } - -part2: - continue; - } - - barrier(CLK_LOCAL_MEM_FENCE); - uint totalCollisions = collisionsNum; - if (get_local_id(0) < totalCollisions) { - ulong coll = collisions[get_local_id(0)]; - potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff); - } -} +#include "equihash-param.h" + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +/* +** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in +** bytes in parens): +** +** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1) +** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3) +** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2) +** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4) +** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3) +** round 5, table 1: cnt(4) i(4) i(4) i(4) 
pad(0.5) Xi(10.5) pad(5) +** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4) +** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6) +** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5) +** +** If the first byte of Xi is 0xAB then: +** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi +** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but +** 'A' is considered redundant padding as it was used to compute the row # +** +** - cnt is an atomic counter keeping track of the number of used slots. +** it is used in the first slot only; subsequent slots replace it with +** 4 padding bytes +** - i encodes either the 21-bit input value (round 0) or a reference to two +** inputs from the previous round +** +** Formula for Xi length and pad length above: +** > for i in range(9): +** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi +** +** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds +** is the 4 most significant bits of the last byte of Xi. +*/ + +__constant ulong blake_iv[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +/* +** Reset counters in hash table. +*/ +__kernel +void kernel_init_ht(__global char *ht, __global uint *rowCounters) +{ + rowCounters[get_global_id(0)] = 0; +} + +/* +** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they +** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): +** aa aa ab bb bb cc cc cd dd... [round 0] +** -------------------- +** ...ab bb bb cc cc cd dd... [odd round] +** -------------- +** ...cc cc cd dd... [next even round] +** ----- +** Bytes underlined are going to be stored in the slot. 
Preceding bytes +** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are +** used to compute the row number. +** +** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) +** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble) +** TODO: update lines below with padding nibbles +** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) +** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) +** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) +** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) +** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) +** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) +** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) +** +** Return 0 if successfully stored, or 1 if the row overflowed. +*/ +uint ht_store(uint round, __global char *ht, uint i, + ulong xi0, ulong xi1, ulong xi2, ulong xi3, __global uint *rowCounters) +{ + uint row; + __global char *p; + uint cnt; +#if NR_ROWS_LOG == 16 + if (!(round % 2)) + row = (xi0 & 0xffff); + else + // if we have in hex: "ab cd ef..." (little endian xi0) then this + // formula computes the row as 0xdebc. it skips the 'a' nibble as it + // is part of the PREFIX. 
The Xi will be stored starting with "ef..."; + // 'e' will be considered padding and 'f' is part of the current PREFIX + row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 18 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6); + else + row = ((xi0 & 0xc0000) >> 2) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 19 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5); + else + row = ((xi0 & 0xe0000) >> 1) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 20 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); + else + row = ((xi0 & 0xf0000) >> 0) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#else +#error "unsupported NR_ROWS_LOG" +#endif + xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); + xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); + xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); + p = ht + row * NR_SLOTS * SLOT_LEN; + uint rowIdx = row/ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(row%ROWS_PER_UINT); + uint xcnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); + xcnt = (xcnt >> rowOffset) & ROW_MASK; + cnt = xcnt; + if (cnt >= NR_SLOTS) + { + // avoid overflows + atomic_sub(rowCounters + rowIdx, 1 << rowOffset); + return 1; + } + p += cnt * SLOT_LEN + xi_offset_for_round(round); + // store "i" (always 4 bytes before Xi) + *(__global uint *)(p - 4) = i; + if (round == 0 || round == 1) + { + // store 24 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global ulong *)(p + 8) = xi1; + *(__global ulong *)(p + 16) = xi2; + } + else if (round == 2) + { + // store 20 bytes + *(__global uint *)(p + 0) = xi0; + *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); + *(__global ulong *)(p + 12) = (xi1 >> 32) | (xi2 << 32); + } + else if (round == 3) + { + // store 
16 bytes + *(__global uint *)(p + 0) = xi0; + *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); + *(__global uint *)(p + 12) = (xi1 >> 32); + } + else if (round == 4) + { + // store 16 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global ulong *)(p + 8) = xi1; + } + else if (round == 5) + { + // store 12 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global uint *)(p + 8) = xi1; + } + else if (round == 6 || round == 7) + { + // store 8 bytes + *(__global uint *)(p + 0) = xi0; + *(__global uint *)(p + 4) = (xi0 >> 32); + } + else if (round == 8) + { + // store 4 bytes + *(__global uint *)(p + 0) = xi0; + } + return 0; +} + +#define mix(va, vb, vc, vd, x, y) \ + va = (va + vb + x); \ +vd = rotate((vd ^ va), (ulong)64 - 32); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 24); \ +va = (va + vb + y); \ +vd = rotate((vd ^ va), (ulong)64 - 16); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 63); + +/* +** Execute round 0 (blake). +** +** Note: making the work group size less than or equal to the wavefront size +** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local +** Memory (LDS) Optimization 2-10" in: +** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ +*/ +__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +void kernel_round0(__global ulong *blake_state, __global char *ht, + __global uint *rowCounters, __global uint *debug) +{ + uint tid = get_global_id(0); + ulong v[16]; + uint inputs_per_thread = NR_INPUTS / get_global_size(0); + uint input = tid * inputs_per_thread; + uint input_end = (tid + 1) * inputs_per_thread; + uint dropped = 0; + while (input < input_end) + { + // shift "i" to occupy the high 32 bits of the second ulong word in the + // message block + ulong word1 = (ulong)input << 32; + // init vector v + v[0] = blake_state[0]; + v[1] = blake_state[1]; + v[2] = blake_state[2]; + v[3] = blake_state[3]; + v[4] = 
blake_state[4]; + v[5] = blake_state[5]; + v[6] = blake_state[6]; + v[7] = blake_state[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4]; + v[13] = blake_iv[5]; + v[14] = blake_iv[6]; + v[15] = blake_iv[7]; + // mix in length of data + v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; + // last block + v[14] ^= (ulong)-1; + + // round 1 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 2 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 3 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, word1); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 4 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, word1); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 5 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, word1); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], 
v[4], v[9], v[14], 0, 0); + // round 6 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], word1, 0); + // round 7 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], word1, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 8 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, word1); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 9 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], word1, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 10 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], word1, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 11 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 12 + 
mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + + // compress v into the blake state; this produces the 50-byte hash + // (two Xi values) + ulong h[7]; + h[0] = blake_state[0] ^ v[0] ^ v[8]; + h[1] = blake_state[1] ^ v[1] ^ v[9]; + h[2] = blake_state[2] ^ v[2] ^ v[10]; + h[3] = blake_state[3] ^ v[3] ^ v[11]; + h[4] = blake_state[4] ^ v[4] ^ v[12]; + h[5] = blake_state[5] ^ v[5] ^ v[13]; + h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; + + // store the two Xi values in the hash table +#if ZCASH_HASH_LEN == 50 + dropped += ht_store(0, ht, input * 2, + h[0], + h[1], + h[2], + h[3], rowCounters); + dropped += ht_store(0, ht, input * 2 + 1, + (h[3] >> 8) | (h[4] << (64 - 8)), + (h[4] >> 8) | (h[5] << (64 - 8)), + (h[5] >> 8) | (h[6] << (64 - 8)), + (h[6] >> 8), rowCounters); +#else +#error "unsupported ZCASH_HASH_LEN" +#endif + + input++; + } +#ifdef ENABLE_DEBUG + debug[tid * 2] = 0; + debug[tid * 2 + 1] = dropped; +#endif +} + +#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF 
>> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +/* +** Access a half-aligned long, that is a long aligned on a 4-byte boundary. +*/ +ulong half_aligned_long(__global ulong *p, uint offset) +{ + return + (((ulong)*(__global uint *)((__global char *)p + offset + 0)) << 0) | + (((ulong)*(__global uint *)((__global char *)p + offset + 4)) << 32); +} + +/* +** Access a well-aligned int. +*/ +uint well_aligned_int(__global ulong *_p, uint offset) +{ + __global char *p = (__global char *)_p; + return *(__global uint *)(p + offset); +} + +/* +** XOR a pair of Xi values computed at "round - 1" and store the result in the +** hash table being built for "round". Note that when building the table for +** even rounds we need to skip 1 padding byte present in the "round - 1" table +** (the "0xAB" byte mentioned in the description at the top of this file.) But +** also note we can't load data directly past this byte because this would +** cause an unaligned memory access which is undefined per the OpenCL spec. +** +** Return 0 if successfully stored, or 1 if the row overflowed. 
+*/ +uint xor_and_store(uint round, __global char *ht_dst, uint row, + uint slot_a, uint slot_b, __global ulong *a, __global ulong *b, + __global uint *rowCounters) +{ + ulong xi0, xi1, xi2; +#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 + // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not + // storing the byte containing bits from the previous PREFIX block for + if (round == 1 || round == 2) + { + // xor 24 bytes + xi0 = *(a++) ^ *(b++); + xi1 = *(a++) ^ *(b++); + xi2 = *a ^ *b; + if (round == 2) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8) | (xi2 << (64 - 8)); + xi2 = (xi2 >> 8); + } + } + else if (round == 3) + { + // xor 20 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); + xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16); + } + else if (round == 4 || round == 5) + { + // xor 16 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); + xi2 = 0; + if (round == 4) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8); + } + } + else if (round == 6) + { + // xor 12 bytes + xi0 = *a++ ^ *b++; + xi1 = *(__global uint *)a ^ *(__global uint *)b; + xi2 = 0; + if (round == 6) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8); + } + } + else if (round == 7 || round == 8) + { + // xor 8 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = 0; + xi2 = 0; + if (round == 8) + { + // skip padding byte + xi0 = (xi0 >> 8); + } + } + // invalid solutions (which start happenning in round 5) have duplicate + // inputs and xor to zero, so discard them + if (!xi0 && !xi1) + return 0; +#else +#error "unsupported NR_ROWS_LOG" +#endif + return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b), + xi0, xi1, xi2, 0, rowCounters); +} + +/* +** Execute one Equihash round. 
Read from ht_src, XOR colliding pairs of Xi, +** store them in ht_dst. +*/ +void equihash_round(uint round, + __global char *ht_src, + __global char *ht_dst, + __global uint *debug, + __local uchar *first_words_data, + __local uint *collisionsData, + __local uint *collisionsNum, + __global uint *rowCountersSrc, + __global uint *rowCountersDst, + uint threadsPerRow) +{ + uint globalTid = get_global_id(0) / threadsPerRow; + uint localTid = get_local_id(0) / threadsPerRow; + uint localGroupId = get_local_id(0) % threadsPerRow; + __local uchar *first_words = &first_words_data[NR_SLOTS*localTid]; + + __global char *p; + uint cnt; + uchar mask; + uint i, j; + // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to + // make it even larger + uint n; + uint dropped_coll = 0; + uint dropped_stor = 0; + __global ulong *a, *b; + uint xi_offset; + // read first words of Xi from the previous (round - 1) hash table + xi_offset = xi_offset_for_round(round - 1); + // the mask is also computed to read data from the previous round +#if NR_ROWS_LOG <= 16 + mask = ((!(round % 2)) ? 0x0f : 0xf0); +#elif NR_ROWS_LOG == 18 + mask = ((!(round % 2)) ? 0x03 : 0x30); +#elif NR_ROWS_LOG == 19 + mask = ((!(round % 2)) ? 0x01 : 0x10); +#elif NR_ROWS_LOG == 20 + mask = 0; /* we can vastly simplify the code below */ +#else +#error "unsupported NR_ROWS_LOG" +#endif + + for (uint chunk = 0; chunk < threadsPerRow; chunk++) { + uint tid = globalTid + NR_ROWS/threadsPerRow*chunk; + uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); +// for (uint tid = get_global_id(0)/threadsPerRow; tid < NR_ROWS; tid += get_global_size(0)/threadsPerRow) { + + uint rowIdx = tid/ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. 
round + + *collisionsNum = 0; + p = (ht_src + tid * NR_SLOTS * SLOT_LEN); + p += xi_offset; + p += SLOT_LEN*localGroupId; + for (i = localGroupId; i < cnt; i += threadsPerRow, p += SLOT_LEN*threadsPerRow) + first_words[i] = (*(__global uchar *)p) & mask; + barrier(CLK_LOCAL_MEM_FENCE); + + if (cnt == 0) + // no elements in row, no collisions + goto part2; + // find collisions + for (i = 0; i < cnt-1; i++) + { + uchar data_i = first_words[i]; + uint collision = (localTid << 24) | (i << 12) | (i + 1 + localGroupId); + for (j = i + 1 + localGroupId; j < cnt; j += threadsPerRow) + { + if (data_i == first_words[j]) + { + uint index = atomic_inc(collisionsNum); + if (index >= LDS_COLL_SIZE) { + atomic_dec(collisionsNum); + goto part2; + } + collisionsData[index] = collision; + } + collision += threadsPerRow; + } + } + +part2: + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = *collisionsNum; + for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0)) + { + uint collision = collisionsData[index]; + uint collisionThreadId = gid + (collision >> 24); + uint i = (collision >> 12) & 0xFFF; + uint j = collision & 0xFFF; + __global uchar *ptr = ht_src + collisionThreadId * NR_SLOTS * SLOT_LEN + + xi_offset; + a = (__global ulong *)(ptr + i * SLOT_LEN); + b = (__global ulong *)(ptr + j * SLOT_LEN); + dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j, + a, b, rowCountersDst); + } + } + +#ifdef ENABLE_DEBUG + debug[tid * 2] = dropped_coll; + debug[tid * 2 + 1] = dropped_stor; +#endif +} + +/* +** This defines kernel_round1, kernel_round2, ..., kernel_round7. 
+*/ +#define KERNEL_ROUND(N) \ +__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) \ +void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ + __global uint *rowCountersSrc, __global uint *rowCountersDst, \ + __global uint *debug) \ +{ \ + __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; \ + __local uint collisionsData[LDS_COLL_SIZE]; \ + __local uint collisionsNum; \ + equihash_round(N, ht_src, ht_dst, debug, first_words_data, collisionsData, \ + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); \ +} +KERNEL_ROUND(1) +KERNEL_ROUND(2) +KERNEL_ROUND(3) +KERNEL_ROUND(4) +KERNEL_ROUND(5) +KERNEL_ROUND(6) +KERNEL_ROUND(7) + +// kernel_round8 takes an extra argument, "sols" +__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +void kernel_round8(__global char *ht_src, __global char *ht_dst, + __global uint *rowCountersSrc, __global uint *rowCountersDst, + __global uint *debug, __global sols_t *sols) +{ + uint tid = get_global_id(0); + __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; + __local uint collisionsData[LDS_COLL_SIZE]; + __local uint collisionsNum; + equihash_round(8, ht_src, ht_dst, debug, first_words_data, collisionsData, + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); + if (!tid) + sols->nr = sols->likely_invalids = 0; +} + +uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) +{ + return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN + + slot * SLOT_LEN + xi_offset - 4); +} + +/* +** Expand references to inputs. Return 1 if so far the solution appears valid, +** or 0 otherwise (an invalid solution would be a solution with duplicate +** inputs, which can be detected at the last step: round == 0). 
+*/ +uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs, + uint round) +{ + __global char *ht = htabs[round % 2]; + uint i = nr_inputs - 1; + uint j = nr_inputs * 2 - 1; + uint xi_offset = xi_offset_for_round(round); + int dup_to_watch = -1; + do + { + ins[j] = expand_ref(ht, xi_offset, + DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); + ins[j - 1] = expand_ref(ht, xi_offset, + DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); + if (!round) + { + if (dup_to_watch == -1) + dup_to_watch = ins[j]; + else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) + return 0; + } + if (!i) + break ; + i--; + j -= 2; + } + while (1); + return 1; +} + +/* +** Verify if a potential solution is in fact valid. +*/ +void potential_sol(__global char **htabs, __global sols_t *sols, + uint ref0, uint ref1) +{ + uint nr_values; + uint values_tmp[(1 << PARAM_K)]; + uint sol_i; + uint i; + nr_values = 0; + values_tmp[nr_values++] = ref0; + values_tmp[nr_values++] = ref1; + uint round = PARAM_K - 1; + do + { + round--; + if (!expand_refs(values_tmp, nr_values, htabs, round)) + return ; + nr_values *= 2; + } + while (round > 0); + // solution appears valid, copy it to sols + sol_i = atomic_inc(&sols->nr); + if (sol_i >= MAX_SOLS) + return ; + for (i = 0; i < (1 << PARAM_K); i++) + sols->values[sol_i][i] = values_tmp[i]; + sols->valid[sol_i] = 1; +} + +/* +** Scan the hash tables to find Equihash solutions. 
+*/ +__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols, + __global uint *rowCountersSrc, __global uint *rowCountersDst) +{ + __local uint counters[WORKSIZE/THREADS_PER_ROW]; + __local uint refs[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; + __local uint data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; + __local uint collisionsNum; + __local ulong collisions[WORKSIZE*4]; + + uint globalTid = get_global_id(0) / THREADS_PER_ROW; + uint localTid = get_local_id(0) / THREADS_PER_ROW; + uint localGroupId = get_local_id(0) % THREADS_PER_ROW; + __local uint *refsPtr = &refs[NR_SLOTS*localTid]; + __local uint *dataPtr = &data[NR_SLOTS*localTid]; + + __global char *htabs[2] = { ht0, ht1 }; + __global char *hcounters[2] = { rowCountersSrc, rowCountersDst }; + uint ht_i = (PARAM_K - 1) % 2; // table filled at last round + uint cnt; + uint xi_offset = xi_offset_for_round(PARAM_K - 1); + uint i, j; + __global char *p; + uint ref_i, ref_j; + // it's ok for the collisions array to be so small, as if it fills up + // the potential solutions are likely invalid (many duplicate inputs) +// ulong collisions; +#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 + // in the final hash table, we are looking for a match on both the bits + // part of the previous PREFIX colliding bits, and the last PREFIX bits. 
+ uint mask = 0xffffff; +#else +#error "unsupported NR_ROWS_LOG" +#endif + + collisionsNum = 0; + + for (uint chunk = 0; chunk < THREADS_PER_ROW; chunk++) { + uint tid = globalTid + NR_ROWS/THREADS_PER_ROW*chunk; + p = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN; + uint rowIdx = tid/ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + p += xi_offset; + p += SLOT_LEN*localGroupId; + + for (i = get_local_id(0); i < WORKSIZE/THREADS_PER_ROW; i += get_local_size(0)) + counters[i] = 0; + for (i = localGroupId; i < cnt; i += THREADS_PER_ROW, p += SLOT_LEN*THREADS_PER_ROW) { + refsPtr[i] = *(__global uint *)(p - 4); + dataPtr[i] = (*(__global uint *)p) & mask; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (i = 0; i < cnt; i++) + { + uint a_data = dataPtr[i]; + ref_i = refsPtr[i]; + for (j = i + 1 + localGroupId; j < cnt; j += THREADS_PER_ROW) + { + if (a_data == dataPtr[j]) + { + if (atomic_inc(&counters[localTid]) == 0) + collisions[atomic_inc(&collisionsNum)] = ((ulong)ref_i << 32) | refsPtr[j]; + goto part2; + } + } + } + +part2: + continue; + } + + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = collisionsNum; + if (get_local_id(0) < totalCollisions) { + ulong coll = collisions[get_local_id(0)]; + potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff); + } +} diff --git a/ocl.c b/ocl.c index 50e3472d..67246f1e 100644 --- a/ocl.c +++ b/ocl.c @@ -959,7 +959,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg * 2 greater >= required amount earlier */ if (bufsize > cgpu->max_alloc) { applog(LOG_WARNING, "Maximum buffer memory device %d supports says %lu", - gpu, (unsigned long)(cgpu->max_alloc)); + gpu, (unsigned long)(cgpu->max_alloc)); applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize); } @@ -967,35 +967,35 @@ _clState *initCl(unsigned int gpu, char *name, size_t 
nameSize, algorithm_t *alg // need additionnal buffers clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); if (status != CL_SUCCESS && !clState->buffer1) { - applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); - return NULL; + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; } clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf2size, NULL, &status); if (status != CL_SUCCESS && !clState->buffer2) { - applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status); - return NULL; + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status); + return NULL; } clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf3size, NULL, &status); if (status != CL_SUCCESS && !clState->buffer3) { - applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer3), decrease TC or increase LG", status); - return NULL; + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer3), decrease TC or increase LG", status); + return NULL; } } else if (algorithm->type == ALGO_LYRA2REV2) { // need additionnal buffers clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); if (status != CL_SUCCESS && !clState->buffer1) { - applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); - return NULL; + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; } } else { clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); // we don't need that much just tired... 
if (status != CL_SUCCESS && !clState->buffer1) { - applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); - return NULL; + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; } } @@ -1009,40 +1009,35 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } } - if(algorithm->type == ALGO_CRYPTONIGHT) - { - size_t GlobalThreads; - readbufsize = 76UL; + if (algorithm->type == ALGO_CRYPTONIGHT) { + size_t GlobalThreads; + readbufsize = 76UL; - set_threads_hashes(1, clState->compute_shaders, &GlobalThreads, 1, &cgpu->intensity, &cgpu->xintensity, &cgpu->rawintensity, &cgpu->algorithm); + set_threads_hashes(1, clState->compute_shaders, &GlobalThreads, 1, &cgpu->intensity, &cgpu->xintensity, &cgpu->rawintensity, &cgpu->algorithm); - for(int i = 0; i < 4; ++i) - { - clState->BranchBuffer[i] = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(cl_uint) * (GlobalThreads + 2), NULL, &status); + for (int i = 0; i < 4; ++i) { + clState->BranchBuffer[i] = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(cl_uint) * (GlobalThreads + 2), NULL, &status); - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d when creating branch buffer %d.\n", status, i); - return NULL; - } - } + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d when creating branch buffer %d.\n", status, i); + return NULL; + } + } - clState->States = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, 200 * GlobalThreads, NULL, &status); + clState->States = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, 200 * GlobalThreads, NULL, &status); - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d when creating Cryptonight state buffer.\n", status); - return NULL; - } + if(status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d when creating Cryptonight state buffer.\n", status); + return NULL; + } - clState->Scratchpads = clCreateBuffer(clState->context, 
CL_MEM_READ_WRITE, (1 << 21) * GlobalThreads, NULL, &status); + clState->Scratchpads = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, (1 << 21) * GlobalThreads, NULL, &status); - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d when creating Cryptonight scratchpads buffer.\n", status); - return NULL; - } - } + if(status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d when creating Cryptonight scratchpads buffer.\n", status); + return NULL; + } + } applog(LOG_DEBUG, "Using read buffer sized %lu", (unsigned long)readbufsize); clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, readbufsize, NULL, &status); diff --git a/sgminer.c b/sgminer.c index 4df90fb3..2d1f048a 100644 --- a/sgminer.c +++ b/sgminer.c @@ -609,6 +609,7 @@ struct pool *add_pool(void) cglock_init(&pool->data_lock); mutex_init(&pool->stratum_lock); cglock_init(&pool->gbt_lock); + mutex_init(&pool->XMRGlobalNonceLock); INIT_LIST_HEAD(&pool->curlring); /* Make sure the pool doesn't think we've been idle since time 0 */ @@ -3110,7 +3111,7 @@ share_result(json_t *val, json_t *res, json_t *err, const struct work *work, cgpu = get_thr_cgpu(work->thr_id); - if (json_is_true(res) || (work->gbt && json_is_null(res))) { + if (json_is_true(res) || (work->gbt && json_is_null(res)) || (pool->algorithm.type == ALGO_CRYPTONIGHT && json_is_null(err))) { mutex_lock(&stats_lock); cgpu->accepted++; total_accepted++; @@ -5556,7 +5557,7 @@ static bool parse_stratum_response(struct pool *pool, char *s) err_val = json_object_get(val, "error"); id_val = json_object_get(val, "id"); - if ((json_is_null(id_val) || !id_val) && pool->algorithm.type != ALGO_ETHASH) { + if ((json_is_null(id_val) || !id_val) && (pool->algorithm.type != ALGO_ETHASH && pool->algorithm.type != ALGO_CRYPTONIGHT)) { char *ss; if (err_val) @@ -5583,6 +5584,7 @@ static bool parse_stratum_response(struct pool *pool, char *s) if (!sshare) { double pool_diff; + bool success = false; /* Since the share is untracked, we can only 
guess at what the * work difficulty is based on the current pool diff. */ @@ -5590,7 +5592,47 @@ static bool parse_stratum_response(struct pool *pool, char *s) pool_diff = pool->swork.diff; cg_runlock(&pool->data_lock); - if (json_is_true(res_val)) { + //for cryptonight, the result contains the "status" object which should = "OK" on accept + if (pool->algorithm.type == ALGO_CRYPTONIGHT) { + json_t *res_id, *res_job; + + //check if the result contains an id... if so then we need to process as first job, not share response + if ((res_id = json_object_get(res_val, "id")) && json_is_string(res_id)) { + cg_wlock(&pool->data_lock); + strcpy(pool->XMRAuthID, json_string_value(res_id)); + cg_wunlock(&pool->data_lock); + + //get the job object and send to parse notify + if ((res_job = json_object_get(res_val, "job"))) { + ret = parse_notify_cn(pool, res_job); + } + + goto out; + } + + if (json_is_null(err_val) && json_is_string(json_object_get(res_val, "status")) && !strcmp(json_string_value(json_object_get(res_val, "status")), "OK")) { + success = true; + } + else { + char *ss; + + if (err_val) { + ss = json_dumps(err_val, JSON_INDENT(3)); + } + else { + ss = strdup("(unknown reason)"); + } + + applog(LOG_INFO, "JSON-RPC response decode failed: %s", ss); + + free(ss); + } + } + else { + success = json_is_true(res_val); + } + + if (success) { applog(LOG_NOTICE, "Accepted untracked stratum share from %s", get_pool_name(pool)); /* We don't know what device this came from so we can't @@ -5730,6 +5772,7 @@ static void wait_lpcurrent(struct pool *pool); static void pool_resus(struct pool *pool); static void gen_stratum_work(struct pool *pool, struct work *work); static void gen_stratum_work_eth(struct pool *pool, struct work *work); +static void gen_stratum_work_cn(struct pool *pool, struct work *work); static void stratum_resumed(struct pool *pool) { if (!pool->stratum_notify) @@ -5852,8 +5895,20 @@ static void *stratum_rthread(void *userdata) /* Generate a single work item to update the current * block database */ pool->swork.clean = false; - 
if(pool->algorithm.type == ALGO_ETHASH) gen_stratum_work_eth(pool, work); - else gen_stratum_work(pool, work); + + switch(pool->algorithm.type) { + case ALGO_ETHASH: + gen_stratum_work_eth(pool, work); + break; + + case ALGO_CRYPTONIGHT: + gen_stratum_work_cn(pool, work); + break; + + default: + gen_stratum_work(pool, work); + } + work->longpoll = true; /* Return value doesn't matter. We're just informing * that we may need to restart. */ @@ -5934,6 +5989,32 @@ static void *stratum_sthread(void *userdata) free(ASCIIMixHash); free(ASCIIPoWHash); } + else if (pool->algorithm.type == ALGO_CRYPTONIGHT) { + sshare = (struct stratum_share *)calloc(sizeof(struct stratum_share), 1); + submitted = false; + char *ASCIIResult; + uint8_t HashResult[32]; + + sshare->sshare_time = time(NULL); + /* This work item is freed in parse_stratum_response */ + sshare->work = work; + + applog(LOG_DEBUG, "stratum_sthread() algorithm = %s", pool->algorithm.name); + + char *ASCIINonce = bin2hex(&work->XMRNonce, 4); + + ASCIIResult = bin2hex(work->hash, 32); + + mutex_lock(&sshare_lock); + /* Give the stratum share a unique id */ + sshare->id = swork_id++; + mutex_unlock(&sshare_lock); + snprintf(s, s_size, "{\"method\": \"submit\", \"params\": {\"id\": \"%s\", \"job_id\": \"%s\", \"nonce\": \"%s\", \"result\": \"%s\"}, \"id\":%d}", pool->XMRAuthID, work->job_id, ASCIINonce, ASCIIResult, sshare->id); + + free(ASCIINonce); + free(ASCIIResult); + } + else if(pool->algorithm.type == ALGO_EQUIHASH) { char *nonce; hash32 = (uint32_t *)work->hash; @@ -6145,10 +6226,18 @@ static bool pool_active(struct pool *pool, bool pinging) if (!init) { bool ret = initiate_stratum(pool) && (!pool->extranonce_subscribe || subscribe_extranonce(pool)) && auth_stratum(pool); - if (ret) + if (ret) { init_stratum_threads(pool); - else + + if (pool->algorithm.type == ALGO_CRYPTONIGHT) { + struct work *work = make_work(); + gen_stratum_work_cn(pool, work); + stage_work(work); + } + } + else { pool_tclear(pool, 
&pool->stratum_init); + } return ret; } return pool->stratum_active; @@ -6502,6 +6591,7 @@ void set_target_neoscrypt(unsigned char *target, double diff, const int thr_id) } } + /* Generates stratum based work based on the most recent notify information * from the pool. This will keep generating work while a pool is down so we use * other means to detect when the pool has died in stratum_thread */ @@ -6537,6 +6627,49 @@ static void gen_stratum_work_eth(struct pool *pool, struct work *work) cgtime(&work->tv_staged); } +/* Generates stratum based work based on the most recent notify information + * from the pool. This will keep generating work while a pool is down so we use + * other means to detect when the pool has died in stratum_thread */ + +static void gen_stratum_work_cn(struct pool *pool, struct work *work) +{ + if(pool->algorithm.type != ALGO_CRYPTONIGHT) + return; + + applog(LOG_DEBUG, "[THR%d] gen_stratum_work_cn() - algorithm = %s", work->thr_id, pool->algorithm.name); + + cg_rlock(&pool->data_lock); + work->job_id = strdup(pool->swork.job_id); + work->XMRTarget = pool->XMRTarget; + //strcpy(work->XMRID, pool->XMRID); + //work->XMRBlockBlob = strdup(pool->XMRBlockBlob); + memcpy(work->XMRBlob, pool->XMRBlob, 76); + memcpy(work->data, work->XMRBlob, 76); + memset(work->target, 0xFF, 32); + work->sdiff = (double)0xffffffff / pool->XMRTarget; + work->work_difficulty = work->sdiff; + work->network_diff = pool->diff1; + cg_runlock(&pool->data_lock); + + work->target[7] = work->XMRTarget; + + local_work++; + work->pool = pool; + work->stratum = true; + work->blk.nonce = 0; + work->id = total_work++; + work->longpoll = false; + work->getwork_mode = GETWORK_MODE_STRATUM; + + work->work_block = work->data[0]; + // Do not allow ntime rolling + work->drv_rolllimit = 0; + + cgtime(&work->tv_staged); + + applog(LOG_DEBUG, "gen_stratum_work_cn() done."); +} + static void gen_stratum_work_equihash(struct pool *pool, struct work *work) { cg_wlock(&pool->data_lock); @@ 
-7524,17 +7657,19 @@ void inc_hw_errors(struct thr_info *thr) static void rebuild_nonce(struct work *work, uint32_t nonce) { uint32_t nonce_pos = 76; - if (work->pool->algorithm.type == ALGO_CRE) nonce_pos = 140; - if(work->pool->algorithm.type == ALGO_ETHASH) - { - uint64_t *work_nonce = (uint64_t *)(work->data + 32); - *work_nonce = (uint64_t)htole32(nonce); + if (work->pool->algorithm.type == ALGO_CRE) + nonce_pos = 140; + else if (work->pool->algorithm.type == ALGO_CRYPTONIGHT) + nonce_pos = 39; + + if (work->pool->algorithm.type == ALGO_ETHASH) { + uint64_t *work_nonce = (uint64_t *)(work->data + 32); + *work_nonce = htole32(nonce); } - else - { - uint32_t *work_nonce = (uint32_t *)(work->data + nonce_pos); + else { + uint32_t *work_nonce = (uint32_t *)(work->data + nonce_pos); - *work_nonce = htole32(nonce); + *work_nonce = htole32(nonce); } work->pool->algorithm.regenhash(work); @@ -7557,6 +7692,9 @@ bool test_nonce(struct work *work, uint32_t nonce) uint64_t target = *(uint64_t*) (work->device_target + 24); return (bswap_64(*(uint64_t*) work->hash) <= target); } + else if (work->pool->algorithm.type == ALGO_CRYPTONIGHT) { + return (((uint32_t *)work->hash)[7] <= work->XMRTarget); + } else { diff1targ = work->pool->algorithm.diff1targ; } @@ -7585,6 +7723,7 @@ static void update_work_stats(struct thr_info *thr, struct work *work) total_diff1 += work->device_diff; thr->cgpu->diff1 += work->device_diff; work->pool->diff1 += work->device_diff; + thr->cgpu->last_device_valid_work = time(NULL); mutex_unlock(&stats_lock); } @@ -7595,14 +7734,15 @@ bool submit_tested_work(struct thr_info *thr, struct work *work) struct work *work_out; update_work_stats(thr, work); - if(work->pool->algorithm.type == ALGO_ETHASH) { + if (work->pool->algorithm.type == ALGO_ETHASH) { uint64_t LETarget = ((uint64_t *)work->target)[3]; - if(bswap_64(((uint64_t *)work->hash)[0]) > LETarget) { -// applog(LOG_INFO, "%s %d: Share above target", thr->cgpu->drv->name, thr->cgpu->device_id); - 
return(false); + if (bswap_64(((uint64_t *)work->hash)[0]) > LETarget) { + return false; } } + else if (work->pool->algorithm.type == ALGO_CRYPTONIGHT) { + } else if (work->pool->algorithm.type == ALGO_EQUIHASH) { applog(LOG_DEBUG, "equihash target: %.16llx", *(uint64_t*) (work->target + 24)); if (*(uint64_t*) (work->hash + 24) > *(uint64_t*) (work->target + 24)) @@ -9642,8 +9782,20 @@ int main(int argc, char *argv[]) goto retry; } } - if(pool->algorithm.type == ALGO_ETHASH) gen_stratum_work_eth(pool, work); - else gen_stratum_work(pool, work); + + switch(pool->algorithm.type) { + case ALGO_ETHASH: + gen_stratum_work_eth(pool, work); + break; + + case ALGO_CRYPTONIGHT: + gen_stratum_work_cn(pool, work); + break; + + default: + gen_stratum_work(pool, work); + } + applog(LOG_DEBUG, "Generated stratum work"); stage_work(work); continue; diff --git a/util.h b/util.h index 70b594c1..2904af2b 100644 --- a/util.h +++ b/util.h @@ -135,6 +135,7 @@ bool stratum_send(struct pool *pool, char *s, ssize_t len); bool sock_full(struct pool *pool); char *recv_line(struct pool *pool); bool parse_method(struct pool *pool, char *s); +bool parse_notify_cn(struct pool *pool, json_t *val); bool extract_sockaddr(char *url, char **sockaddr_url, char **sockaddr_port); bool auth_stratum(struct pool *pool); bool subscribe_extranonce(struct pool *pool);