From 0c85c09e3047f092ca5178cc4801f7bad3606c48 Mon Sep 17 00:00:00 2001 From: Philipp Otterbein Date: Thu, 1 Dec 2016 17:50:29 +0100 Subject: [PATCH] merge XMR --- algorithm.c | 4 +- algorithm.h | 3 - algorithm/extern/blake2-impl.h | 136 --- algorithm/extern/blake2.h | 156 --- algorithm/extern/blake2b-ref.c | 396 -------- driver-opencl.c | 249 +++-- kernel/equihash-param.h | 226 ++--- kernel/equihash.cl | 1666 ++++++++++++++++---------------- ocl.c | 71 +- sgminer.c | 196 +++- util.h | 1 + 11 files changed, 1274 insertions(+), 1830 deletions(-) delete mode 100644 algorithm/extern/blake2-impl.h delete mode 100644 algorithm/extern/blake2.h delete mode 100644 algorithm/extern/blake2b-ref.c diff --git a/algorithm.c b/algorithm.c index 8f40a59e..2f543556 100644 --- a/algorithm.c +++ b/algorithm.c @@ -1293,10 +1293,10 @@ static algorithm_settings_t algos[] = { { "ethash", ALGO_ETHASH, "", (1ULL << 32), (1ULL << 32), 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x00000000UL, 0, 128, 0, ethash_regenhash, NULL, queue_ethash_kernel, gen_hash, append_ethash_compiler_options }, { "ethash-genoil", ALGO_ETHASH, "", (1ULL << 32), (1ULL << 32), 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x00000000UL, 0, 128, 0, ethash_regenhash, NULL, queue_ethash_kernel, gen_hash, append_ethash_compiler_options }, - { "equihash", ALGO_EQUIHASH, "", 1, (1ULL << 28), (1ULL << 28), 0, 0, 0x20000, 0xFFFF000000000000ULL, 0x00000000UL, 0, 128, 0, equihash_regenhash, NULL, queue_equihash_kernel, gen_hash, append_equihash_compiler_options }, - { "cryptonight", ALGO_CRYPTONIGHT, "", (1ULL << 32), (1ULL << 32), (1ULL << 32), 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, 0, 0, cryptonight_regenhash, NULL, queue_cryptonight_kernel, gen_hash, NULL }, + { "equihash", ALGO_EQUIHASH, "", 1, (1ULL << 28), (1ULL << 28), 0, 0, 0x20000, 0xFFFF000000000000ULL, 0x00000000UL, 0, 128, 0, equihash_regenhash, NULL, queue_equihash_kernel, gen_hash, append_equihash_compiler_options }, + // Terminator (do not remove) { NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL } }; diff --git a/algorithm.h b/algorithm.h index 11f43a24..54e2308e 100644 --- a/algorithm.h +++ b/algorithm.h @@ -109,9 +109,6 @@ void set_algorithm(algorithm_t* algo, const char* name); /* Set to specific N factor. */ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor); -/* Set to specific K factor. */ -void set_algorithm_kfactor(algorithm_t* algo, const uint8_t kfactor); - /* Compare two algorithm parameters */ bool cmp_algorithm(const algorithm_t* algo1, const algorithm_t* algo2); diff --git a/algorithm/extern/blake2-impl.h b/algorithm/extern/blake2-impl.h deleted file mode 100644 index 5ac7a430..00000000 --- a/algorithm/extern/blake2-impl.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - BLAKE2 reference source code package - reference C implementations - - Written in 2012 by Samuel Neves - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along with - this software. If not, see . -*/ -#pragma once -#ifndef __BLAKE2_IMPL_H__ -#define __BLAKE2_IMPL_H__ - -#include - -static inline uint32_t load32( const void *src ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - uint32_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = ( const uint8_t * )src; - uint32_t w = *p++; - w |= ( uint32_t )( *p++ ) << 8; - w |= ( uint32_t )( *p++ ) << 16; - w |= ( uint32_t )( *p++ ) << 24; - return w; -#endif -} - -static inline uint64_t load64( const void *src ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - uint64_t w; - memcpy(&w, src, sizeof w); - return w; -#else - const uint8_t *p = ( const uint8_t * )src; - uint64_t w = *p++; - w |= ( uint64_t )( *p++ ) << 8; - w |= ( uint64_t )( *p++ ) << 16; - w |= ( uint64_t )( *p++ ) << 24; - w |= ( uint64_t )( *p++ ) << 32; - w |= ( uint64_t )( *p++ ) << 40; - w |= ( uint64_t )( *p++ ) << 48; - w |= ( uint64_t )( *p++ ) << 56; - return w; -#endif -} - -static inline void store32( void *dst, uint32_t w ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = ( uint8_t * )dst; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; -#endif -} - -static inline void store64( void *dst, uint64_t w ) -{ -#if defined(NATIVE_LITTLE_ENDIAN) - memcpy(dst, &w, sizeof w); -#else - uint8_t *p = ( uint8_t * )dst; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; -#endif -} - -static inline uint64_t load48( const void *src ) -{ - const uint8_t *p = ( const uint8_t * )src; - uint64_t w = *p++; - w |= ( uint64_t )( *p++ ) << 8; - w |= ( uint64_t )( *p++ ) << 16; - w |= ( uint64_t )( *p++ ) << 24; - w |= ( uint64_t )( *p++ ) << 32; - w |= ( uint64_t )( *p++ ) << 40; - return w; -} - -static inline void store48( void *dst, uint64_t w ) -{ - uint8_t *p = ( uint8_t * )dst; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; w >>= 8; - *p++ = ( uint8_t )w; -} - -static inline uint32_t rotl32( const uint32_t w, const unsigned c ) -{ - return ( w << c ) | ( w >> ( 32 - c ) ); -} - -static inline uint64_t rotl64( const uint64_t w, const unsigned c ) -{ - return ( w << c ) | ( w >> ( 64 - c ) ); -} - -static inline uint32_t rotr32( const uint32_t w, const unsigned c ) -{ - return ( w >> c ) | ( w << ( 32 - c ) ); -} - -static inline uint64_t rotr64( const uint64_t w, const unsigned c ) -{ - return ( w >> c ) | ( w << ( 64 - c ) ); -} - -/* prevents compiler optimizing out memset() */ -static inline void secure_zero_memory( void *v, size_t n ) -{ - volatile uint8_t *p = ( volatile uint8_t * )v; - while( n-- ) *p++ = 0; -} - -#endif - diff --git a/algorithm/extern/blake2.h b/algorithm/extern/blake2.h deleted file mode 100644 index f8aba833..00000000 --- a/algorithm/extern/blake2.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - BLAKE2 reference source code package - reference C implementations - - Written in 2012 by Samuel Neves - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along with - this software. If not, see . -*/ -#pragma once -#ifndef __BLAKE2_H__ -#define __BLAKE2_H__ - -#include -#include - -#if defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#else -#define ALIGN(x) __attribute__((aligned(x))) -#endif - -#if defined(__cplusplus) -extern "C" { -#endif - - enum blake2s_constant - { - BLAKE2S_BLOCKBYTES = 64, - BLAKE2S_OUTBYTES = 32, - BLAKE2S_KEYBYTES = 32, - BLAKE2S_SALTBYTES = 8, - BLAKE2S_PERSONALBYTES = 8 - }; - - enum blake2b_constant - { - BLAKE2B_BLOCKBYTES = 128, - BLAKE2B_OUTBYTES = 64, - BLAKE2B_KEYBYTES = 64, - BLAKE2B_SALTBYTES = 16, - BLAKE2B_PERSONALBYTES = 16 - }; - -#pragma pack(push, 1) - typedef struct __blake2s_param - { - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint8_t node_offset[6];// 14 - uint8_t node_depth; // 15 - uint8_t inner_length; // 16 - // uint8_t reserved[0]; - uint8_t salt[BLAKE2S_SALTBYTES]; // 24 - uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 - } blake2s_param; - - ALIGN( 64 ) typedef struct __blake2s_state - { - uint32_t h[8]; - uint32_t t[2]; - uint32_t f[2]; - uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; - size_t buflen; - uint8_t last_node; - } blake2s_state ; - - typedef struct __blake2b_param - { - uint8_t digest_length; // 1 - uint8_t key_length; // 2 - uint8_t fanout; // 3 - uint8_t depth; // 4 - uint32_t leaf_length; // 8 - uint64_t node_offset; // 16 - uint8_t node_depth; // 17 - uint8_t inner_length; // 18 - uint8_t reserved[14]; // 32 - uint8_t salt[BLAKE2B_SALTBYTES]; // 48 - uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64 - } blake2b_param; - - ALIGN( 64 ) typedef struct __blake2b_state - { - uint64_t h[8]; - uint64_t t[2]; - uint64_t f[2]; - uint8_t buf[2 * BLAKE2B_BLOCKBYTES]; - size_t buflen; - uint8_t last_node; - } blake2b_state; - - typedef struct __blake2sp_state - { - blake2s_state S[8][1]; - blake2s_state R[1]; - uint8_t buf[8 * BLAKE2S_BLOCKBYTES]; - size_t buflen; - } blake2sp_state; - - typedef struct __blake2bp_state - { - blake2b_state S[4][1]; - blake2b_state R[1]; - uint8_t buf[4 * BLAKE2B_BLOCKBYTES]; - size_t buflen; - } blake2bp_state; -#pragma pack(pop) - - // Streaming API - int blake2s_init( blake2s_state *S, const uint8_t outlen ); - int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); - int blake2s_init_param( blake2s_state *S, const blake2s_param *P ); - int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ); - int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ); - - int blake2b_init( blake2b_state *S, const uint8_t outlen ); - int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); - int blake2b_init_param( blake2b_state *S, const blake2b_param *P ); - int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ); - int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ); - - int blake2sp_init( blake2sp_state *S, const uint8_t outlen ); - int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); - int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen ); - int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen ); - - int blake2bp_init( blake2bp_state *S, const uint8_t outlen ); - int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); - int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen ); - int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen ); - - // Simple API - int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); - int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); - - int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); - int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); - - static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) - { - return blake2b( out, in, key, outlen, inlen, keylen ); - } - -#if defined(__cplusplus) -} -#endif - -#endif - diff --git a/algorithm/extern/blake2b-ref.c b/algorithm/extern/blake2b-ref.c deleted file mode 100644 index 7064b28d..00000000 --- a/algorithm/extern/blake2b-ref.c +++ /dev/null @@ -1,396 +0,0 @@ -/* - BLAKE2 reference source code package - reference C implementations - - Written in 2012 by Samuel Neves - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along with - this software. If not, see . -*/ - -#include -#include -#include - -#include "blake2.h" -#include "blake2-impl.h" - -static const uint64_t blake2b_IV[8] = -{ - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL -}; - -static const uint8_t blake2b_sigma[12][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } -}; - - -static inline int blake2b_set_lastnode( blake2b_state *S ) -{ - S->f[1] = -1; - return 0; -} - -static inline int blake2b_clear_lastnode( blake2b_state *S ) -{ - S->f[1] = 0; - return 0; -} - -/* Some helper functions, not necessarily useful */ -int blake2b_set_lastblock( blake2b_state *S ) -{ - if( S->last_node ) blake2b_set_lastnode( S ); - - S->f[0] = -1; - return 0; -} - -static inline int blake2b_clear_lastblock( blake2b_state *S ) -{ - if( S->last_node ) blake2b_clear_lastnode( S ); - - S->f[0] = 0; - return 0; -} - -static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc ) -{ - S->t[0] += inc; - S->t[1] += ( S->t[0] < inc ); - return 0; -} - - - -// Parameter-related functions -static inline int blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length ) -{ - P->digest_length = digest_length; - return 0; -} - -static inline int blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout ) -{ - P->fanout = fanout; - return 0; -} - -static inline int blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth ) -{ - P->depth = depth; - return 0; -} - -static inline int blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length ) -{ - store32( &P->leaf_length, leaf_length ); - return 0; -} - -static inline int blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset ) -{ - store64( &P->node_offset, node_offset ); - return 0; -} - -static inline int blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth ) -{ - P->node_depth = node_depth; - return 0; -} - -static inline int blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length ) -{ - P->inner_length = inner_length; - return 0; -} - -static inline int blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] ) -{ - memcpy( P->salt, salt, BLAKE2B_SALTBYTES ); - return 0; -} - -static inline int blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] ) -{ - memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES ); - return 0; -} - -static inline int blake2b_init0( blake2b_state *S ) -{ - memset( S, 0, sizeof( blake2b_state ) ); - - for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i]; - - return 0; -} - -/* init xors IV with input parameter block */ -int blake2b_init_param( blake2b_state *S, const blake2b_param *P ) -{ - blake2b_init0( S ); - const uint8_t *p = ( const uint8_t * )( P ); - - /* IV XOR ParamBlock */ - for( size_t i = 0; i < 8; ++i ) - S->h[i] ^= load64( p + sizeof( S->h[i] ) * i ); - - return 0; -} - - - -int blake2b_init( blake2b_state *S, const uint8_t outlen ) -{ - blake2b_param P[1]; - - if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; - - P->digest_length = outlen; - P->key_length = 0; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - store64( &P->node_offset, 0 ); - P->node_depth = 0; - P->inner_length = 0; - memset( P->reserved, 0, sizeof( P->reserved ) ); - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - return blake2b_init_param( S, P ); -} - - -int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ) -{ - blake2b_param P[1]; - - if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; - - if ( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1; - - P->digest_length = outlen; - P->key_length = keylen; - P->fanout = 1; - P->depth = 1; - store32( &P->leaf_length, 0 ); - store64( &P->node_offset, 0 ); - P->node_depth = 0; - P->inner_length = 0; - memset( P->reserved, 0, sizeof( P->reserved ) ); - memset( P->salt, 0, sizeof( P->salt ) ); - memset( P->personal, 0, sizeof( P->personal ) ); - - if( blake2b_init_param( S, P ) < 0 ) return -1; - - { - uint8_t block[BLAKE2B_BLOCKBYTES]; - memset( block, 0, BLAKE2B_BLOCKBYTES ); - memcpy( block, key, keylen ); - blake2b_update( S, block, BLAKE2B_BLOCKBYTES ); - secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */ - } - return 0; -} - -int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ) -{ - uint64_t m[16]; - uint64_t v[16]; - int i; - - for( i = 0; i < 16; ++i ) - m[i] = load64( block + i * sizeof( m[i] ) ); - - for( i = 0; i < 8; ++i ) - v[i] = S->h[i]; - - v[ 8] = blake2b_IV[0]; - v[ 9] = blake2b_IV[1]; - v[10] = blake2b_IV[2]; - v[11] = blake2b_IV[3]; - v[12] = S->t[0] ^ blake2b_IV[4]; - v[13] = S->t[1] ^ blake2b_IV[5]; - v[14] = S->f[0] ^ blake2b_IV[6]; - v[15] = S->f[1] ^ blake2b_IV[7]; -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2*i+0]]; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2*i+1]]; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ - } while(0) -#define ROUND(r) \ - do { \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while(0) - ROUND( 0 ); - ROUND( 1 ); - ROUND( 2 ); - ROUND( 3 ); - ROUND( 4 ); - ROUND( 5 ); - ROUND( 6 ); - ROUND( 7 ); - ROUND( 8 ); - ROUND( 9 ); - ROUND( 10 ); - ROUND( 11 ); - - for( i = 0; i < 8; ++i ) - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; - -#undef G -#undef ROUND - return 0; -} - -/* inlen now in bytes */ -int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ) -{ - while( inlen > 0 ) - { - size_t left = S->buflen; - size_t fill = 2 * BLAKE2B_BLOCKBYTES - left; - - if( inlen > fill ) - { - memcpy( S->buf + left, in, fill ); // Fill buffer - S->buflen += fill; - blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); - blake2b_compress( S, S->buf ); // Compress - memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left - S->buflen -= BLAKE2B_BLOCKBYTES; - in += fill; - inlen -= fill; - } - else // inlen <= fill - { - memcpy( S->buf + left, in, inlen ); - S->buflen += inlen; // Be lazy, do not compress - in += inlen; - inlen -= inlen; - } - } - - return 0; -} - -/* Is this correct? */ -int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ) -{ - uint8_t buffer[BLAKE2B_OUTBYTES] = {0}; - - if( outlen > BLAKE2B_OUTBYTES ) - return -1; - - if( S->buflen > BLAKE2B_BLOCKBYTES ) - { - blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); - blake2b_compress( S, S->buf ); - S->buflen -= BLAKE2B_BLOCKBYTES; - memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen ); - } - - blake2b_increment_counter( S, S->buflen ); - blake2b_set_lastblock( S ); - memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ - blake2b_compress( S, S->buf ); - -// printf("S->buflen = %016llX\n", S->buflen); - - for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ - store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); - - memcpy( out, buffer, outlen ); - return 0; -} - -/* inlen, at least, should be uint64_t. Others can be size_t. */ -int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) -{ - blake2b_state S[1]; - - /* Verify parameters */ - if ( NULL == in ) return -1; - - if ( NULL == out ) return -1; - - if( NULL == key ) keylen = 0; - - if( keylen > 0 ) - { - if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1; - } - else - { - if( blake2b_init( S, outlen ) < 0 ) return -1; - } - - blake2b_update( S, ( const uint8_t * )in, inlen ); - blake2b_final( S, out, outlen ); - return 0; -} - -#if defined(BLAKE2B_SELFTEST) -#include -#include "blake2-kat.h" -int main( int argc, char **argv ) -{ - uint8_t key[BLAKE2B_KEYBYTES]; - uint8_t buf[KAT_LENGTH]; - - for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i ) - key[i] = ( uint8_t )i; - - for( size_t i = 0; i < KAT_LENGTH; ++i ) - buf[i] = ( uint8_t )i; - - for( size_t i = 0; i < KAT_LENGTH; ++i ) - { - uint8_t hash[BLAKE2B_OUTBYTES]; - blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES ); - - if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) ) - { - puts( "error" ); - return -1; - } - } - - puts( "ok" ); - return 0; -} -#endif - diff --git a/driver-opencl.c b/driver-opencl.c index f1704c0d..599470f1 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -492,12 +492,12 @@ char *set_temp_overheat(char *arg) return "Invalid value passed to set temp overheat"; gpus[device].adl.overtemp = val; - gpus[device++].sysfs_info.OverHeatTemp = val; + gpus[device++].sysfs_info.OverHeatTemp = val; } if (device == 1) { for (i = device; i < MAX_GPUDEVICES; i++) { gpus[i].adl.overtemp = val; - gpus[i].sysfs_info.OverHeatTemp = val; + gpus[i].sysfs_info.OverHeatTemp = val; } } @@ -529,7 +529,7 @@ char *set_temp_target(char *arg) tt = &gpus[device].adl.targettemp; *tt = val; tt = &gpus[device++].sysfs_info.TargetTemp; - *tt = val; + *tt = val; } if (device == 1) { @@ -537,7 +537,7 @@ char *set_temp_target(char *arg) tt = &gpus[i].adl.targettemp; *tt = val; tt = &gpus[i].sysfs_info.TargetTemp; - *tt = val; + *tt = val; } } @@ -1455,7 +1455,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, // increase nonce work->blk.nonce++; if (work->getwork_mode == GETWORK_MODE_STRATUM) - *(uint16_t*)(work->equihash_data + 108 + strlen(work->nonce1) / 2) += 1; + *(uint16_t*)(work->equihash_data + 108 + strlen(work->nonce1) / 2) += 1; else { *(uint64_t*)(work->equihash_data + 108) += 1; @@ -1476,128 +1476,115 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, } // if (algorithm.type == ALGO_ETHASH) read lock gpu->eth_dag.lock has to be released - if(gpu->algorithm.type == ALGO_CRYPTONIGHT) - { - mutex_lock(&work->pool->XMRGlobalNonceLock); - work->blk.nonce = work->pool->XMRGlobalNonce; - work->pool->XMRGlobalNonce += gpu->max_hashes; - mutex_unlock(&work->pool->XMRGlobalNonceLock); + if(gpu->algorithm.type == ALGO_CRYPTONIGHT) { + mutex_lock(&work->pool->XMRGlobalNonceLock); + work->blk.nonce = work->pool->XMRGlobalNonce; + work->pool->XMRGlobalNonce += gpu->max_hashes; + mutex_unlock(&work->pool->XMRGlobalNonceLock); } if (clState->goffset) p_global_work_offset = (size_t *)&work->blk.nonce; - if(gpu->algorithm.type == ALGO_CRYPTONIGHT) - { - size_t GlobalThreads = *globalThreads, Nonce[2] = { (size_t)work->blk.nonce, 1}, gthreads[2] = { *globalThreads, 8 }, lthreads[2] = { *localThreads, 8 }; - size_t BranchBufCount[4] = { 0, 0, 0, 0 }; - - for(int i = 0; i < 4; ++i) - { - cl_uint zero = 0; - - status = clEnqueueWriteBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_FALSE, sizeof(cl_uint) * GlobalThreads, sizeof(cl_uint), &zero, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while resetting branch buffer counter %d.\n", status, i); - return(-1); - } - } - - clFinish(clState->commandQueue); - - // Main CN P0 - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 2, Nonce, gthreads, lthreads, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to enqueue kernel 0.", status); - return(-1); - } - - // Main CN P1 - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[0], 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to enqueue kernel 1.", status); - return(-1); - } - - // Main CN P2 - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[1], 2, Nonce, gthreads, lthreads, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to enqueue kernel 2.", status); - return(-1); - } - - // Read BranchBuf counters - - for(int i = 0; i < 4; ++i) - { - status = clEnqueueReadBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_FALSE, sizeof(cl_uint) * GlobalThreads, sizeof(cl_uint), BranchBufCount + i, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to read branch buffer counter %d.", status, i); - return(-1); - } - } - - clFinish(clState->commandQueue); - - for(int i = 0; i < 4; ++i) - { - if(BranchBufCount[i]) - { - cl_ulong tmp = BranchBufCount[i]; - - // Threads - status = clSetKernelArg(clState->extra_kernels[i + 2], 4, sizeof(cl_ulong), &tmp); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to set argument 4 for kernel %d.", status, i + 2); - return(-1); - } - - // Make it a multiple of the local worksize (some drivers will otherwise shit a brick) - BranchBufCount[i] += (clState->wsize - (BranchBufCount[i] & (clState->wsize - 1))); - - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i + 2], 1, p_global_work_offset, BranchBufCount + i, localThreads, 0, NULL, NULL); - - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d while attempting to enqueue kernel %d.", status, i + 2); - return(-1); - } - } - } - } - else - { - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, p_global_work_offset, - globalThreads, localThreads, 0, NULL, NULL); - if (unlikely(status != CL_SUCCESS)) { - if (work->pool->algorithm.type == ALGO_ETHASH) - cg_runlock(&gpu->eth_dag.lock); - applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); - return -1; - } - - for (i = 0; i < clState->n_extra_kernels; i++) { - status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, - globalThreads, localThreads, 0, NULL, NULL); - if (unlikely(status != CL_SUCCESS)) { - applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); - return -1; - } - } - } - + if (gpu->algorithm.type == ALGO_CRYPTONIGHT) { + size_t GlobalThreads = *globalThreads, Nonce[2] = { (size_t)work->blk.nonce, 1}, gthreads[2] = { *globalThreads, 8 }, lthreads[2] = { *localThreads, 8 }; + size_t BranchBufCount[4] = { 0, 0, 0, 0 }; + + for (int i = 0; i < 4; ++i) { + cl_uint zero = 0; + + status = clEnqueueWriteBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_FALSE, sizeof(cl_uint) * GlobalThreads, sizeof(cl_uint), &zero, 0, NULL, NULL); + + if(status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while resetting branch buffer counter %d.\n", status, i); + return -1; + } + } + + clFinish(clState->commandQueue); + + // Main CN P0 + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 2, Nonce, gthreads, lthreads, 0, NULL, NULL); + + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to enqueue kernel 0.", status); + return -1; + } + + // Main CN P1 + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[0], 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL); + + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to enqueue kernel 1.", status); + return -1; + } + + // Main CN P2 + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[1], 2, Nonce, gthreads, lthreads, 0, NULL, NULL); + + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to enqueue kernel 2.", status); + return -1; + } + + // Read BranchBuf counters + + for (int i = 0; i < 4; ++i) { + status = clEnqueueReadBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_FALSE, sizeof(cl_uint) * GlobalThreads, sizeof(cl_uint), BranchBufCount + i, 0, NULL, NULL); + + if(status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to read branch buffer counter %d.", status, i); + return(-1); + } + } + + clFinish(clState->commandQueue); + + for (int i = 0; i < 4; ++i) { + if(BranchBufCount[i]) { + cl_ulong tmp = BranchBufCount[i]; + + // Threads + status = clSetKernelArg(clState->extra_kernels[i + 2], 4, sizeof(cl_ulong), &tmp); + + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to set argument 4 for kernel %d.", status, i + 2); + return -1; + } + + // Make it a multiple of the local worksize (some drivers will otherwise shit a brick) + BranchBufCount[i] += (clState->wsize - (BranchBufCount[i] & (clState->wsize - 1))); + + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i + 2], 1, p_global_work_offset, BranchBufCount + i, localThreads, 0, NULL, NULL); + + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d while attempting to enqueue kernel %d.", status, i + 2); + return -1; + } + } + } + } + else { + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, p_global_work_offset, + globalThreads, localThreads, 0, NULL, NULL); + + if (unlikely(status != CL_SUCCESS)) { + if (work->pool->algorithm.type == ALGO_ETHASH) + cg_runlock(&gpu->eth_dag.lock); + applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); + return -1; + } + + for (i = 0; i < clState->n_extra_kernels; i++) { + status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, + globalThreads, localThreads, 0, NULL, NULL); + if (unlikely(status != CL_SUCCESS)) { + applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); + return -1; + } + } + } + status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, buffersize, thrdata->res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { @@ -1628,9 +1615,9 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, } applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id); postcalc_hash_async(thr, work, thrdata->res); -// postcalc_hash(thr); -// submit_tested_work(thr, work); -// submit_work_async(work); +// postcalc_hash(thr); +// submit_tested_work(thr, work); +// submit_work_async(work); memset(thrdata->res, 0, buffersize); /* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */ clFinish(clState->commandQueue); @@ -1652,12 +1639,12 @@ static void opencl_thread_shutdown(struct thr_info *thr) clFinish(clState->commandQueue); clReleaseMemObject(clState->outputBuffer); clReleaseMemObject(clState->CLbuffer0); - if (clState->buffer1) - clReleaseMemObject(clState->buffer1); - if (clState->buffer2) - clReleaseMemObject(clState->buffer2); - if (clState->buffer3) - clReleaseMemObject(clState->buffer3); + if (clState->buffer1) + clReleaseMemObject(clState->buffer1); + if (clState->buffer2) + clReleaseMemObject(clState->buffer2); + if (clState->buffer3) + clReleaseMemObject(clState->buffer3); if (clState->padbuffer8) clReleaseMemObject(clState->padbuffer8); clReleaseKernel(clState->kernel); diff --git a/kernel/equihash-param.h b/kernel/equihash-param.h index a40eea69..ace80692 100644 --- a/kernel/equihash-param.h +++ b/kernel/equihash-param.h @@ -1,113 +1,113 @@ -#ifndef __OPENCL_VERSION__ -#define uint uint32_t -#define uchar uint8_t -#endif - - -#define PARAM_N 200 -#define PARAM_K 9 -#define PREFIX (PARAM_N / (PARAM_K + 1)) -#define NR_INPUTS (1 << PREFIX) -// Approximate log base 2 of number of elements in hash tables -#define APX_NR_ELMS_LOG (PREFIX + 1) -// Number of rows and slots is affected by this. 20 offers the best performance -// but occasionally misses ~1% of solutions. -#define NR_ROWS_LOG 18 - -// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md -#define OPTIM_SIMPLIFY_ROUND 1 - -// Number of collision items to track, per thread -#define THREADS_PER_ROW 8 -#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW)) - -// Ratio of time of sleeping before rechecking if task is done (0-1) -#define SLEEP_RECHECK_RATIO 0.60 -// Ratio of time to busy wait for the solution (0-1) -// The higher value the higher CPU usage with Nvidia -#define SLEEP_SKIP_RATIO 0.005 - -// Make hash tables OVERHEAD times larger than necessary to store the average -// number of elements per row. The ideal value is as small as possible to -// reduce memory usage, but not too small or else elements are dropped from the -// hash tables. -// -// The actual number of elements per row is closer to the theoretical average -// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be -// smaller. -// -// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease -// performance as they cause VRAM channel conflicts. -#if NR_ROWS_LOG == 16 -// #error "NR_ROWS_LOG = 16 is currently broken - do not use" -#define OVERHEAD 2 -#elif NR_ROWS_LOG == 18 -#define OVERHEAD 3 -#elif NR_ROWS_LOG == 19 -#define OVERHEAD 5 -#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND -#define OVERHEAD 6 -#elif NR_ROWS_LOG == 20 -#define OVERHEAD 9 -#endif - -#define NR_ROWS (1 << NR_ROWS_LOG) -#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)) -// Length of 1 element (slot) in byte -#define SLOT_LEN 32 -// Total size of hash table -#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) -// Length of Zcash block header, nonce (part of header) -#define ZCASH_BLOCK_HEADER_LEN 140 -// Offset of nTime in header -#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) -// Length of nonce -#define ZCASH_NONCE_LEN 32 -// Length of encoded representation of solution size -#define ZCASH_SOLSIZE_LEN 3 -// Solution size (1344 = 0x540) represented as a compact integer, in hex -#define ZCASH_SOLSIZE_HEX "fd4005" -// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) -#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) -// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization -#define N_ZERO_BYTES 12 -// Number of bytes Zcash needs out of Blake -#define ZCASH_HASH_LEN 50 -// Number of wavefronts per SIMD for the Blake kernel. -// Blake is ALU-bound (beside the atomic counter being incremented) so we need -// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer -// instructions. 10 is the max supported by the hw. -#define BLAKE_WPS 10 -// Maximum number of solutions reported by kernel to host -#define MAX_SOLS 10 -// Length of SHA256 target -#define SHA256_TARGET_LEN (256 / 8) - -#if (NR_SLOTS < 16) -#define BITS_PER_ROW 4 -#define ROWS_PER_UINT 8 -#define ROW_MASK 0x0F -#else -#define BITS_PER_ROW 8 -#define ROWS_PER_UINT 4 -#define ROW_MASK 0xFF -#endif - -// Optional features -#undef ENABLE_DEBUG - -/* -** Return the offset of Xi in bytes from the beginning of the slot. -*/ -#define xi_offset_for_round(round) (8 + ((round) / 2) * 4) - -// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values -#define SOL_SIZE ((1 << PARAM_K) * 4) -typedef struct sols_s -{ - uint nr; - uint likely_invalids; - uchar valid[MAX_SOLS]; - uint values[MAX_SOLS][(1 << PARAM_K)]; -} sols_t; - +#ifndef __OPENCL_VERSION__ +#define uint uint32_t +#define uchar uint8_t +#endif + + +#define PARAM_N 200 +#define PARAM_K 9 +#define PREFIX (PARAM_N / (PARAM_K + 1)) +#define NR_INPUTS (1 << PREFIX) +// Approximate log base 2 of number of elements in hash tables +#define APX_NR_ELMS_LOG (PREFIX + 1) +// Number of rows and slots is affected by this. 20 offers the best performance +// but occasionally misses ~1% of solutions. +#define NR_ROWS_LOG 18 + +// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md +#define OPTIM_SIMPLIFY_ROUND 1 + +// Number of collision items to track, per thread +#define THREADS_PER_ROW 8 +#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW)) + +// Ratio of time of sleeping before rechecking if task is done (0-1) +#define SLEEP_RECHECK_RATIO 0.60 +// Ratio of time to busy wait for the solution (0-1) +// The higher value the higher CPU usage with Nvidia +#define SLEEP_SKIP_RATIO 0.005 + +// Make hash tables OVERHEAD times larger than necessary to store the average +// number of elements per row. The ideal value is as small as possible to +// reduce memory usage, but not too small or else elements are dropped from the +// hash tables. +// +// The actual number of elements per row is closer to the theoretical average +// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be +// smaller. +// +// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease +// performance as they cause VRAM channel conflicts. +#if NR_ROWS_LOG == 16 +// #error "NR_ROWS_LOG = 16 is currently broken - do not use" +#define OVERHEAD 2 +#elif NR_ROWS_LOG == 18 +#define OVERHEAD 3 +#elif NR_ROWS_LOG == 19 +#define OVERHEAD 5 +#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND +#define OVERHEAD 6 +#elif NR_ROWS_LOG == 20 +#define OVERHEAD 9 +#endif + +#define NR_ROWS (1 << NR_ROWS_LOG) +#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)) +// Length of 1 element (slot) in byte +#define SLOT_LEN 32 +// Total size of hash table +#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) +// Length of Zcash block header, nonce (part of header) +#define ZCASH_BLOCK_HEADER_LEN 140 +// Offset of nTime in header +#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) +// Length of nonce +#define ZCASH_NONCE_LEN 32 +// Length of encoded representation of solution size +#define ZCASH_SOLSIZE_LEN 3 +// Solution size (1344 = 0x540) represented as a compact integer, in hex +#define ZCASH_SOLSIZE_HEX "fd4005" +// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) +#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) +// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization +#define N_ZERO_BYTES 12 +// Number of bytes Zcash needs out of Blake +#define ZCASH_HASH_LEN 50 +// Number of wavefronts per SIMD for the Blake kernel. +// Blake is ALU-bound (beside the atomic counter being incremented) so we need +// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer +// instructions. 10 is the max supported by the hw. +#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 10 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#if (NR_SLOTS < 16) +#define BITS_PER_ROW 4 +#define ROWS_PER_UINT 8 +#define ROW_MASK 0x0F +#else +#define BITS_PER_ROW 8 +#define ROWS_PER_UINT 4 +#define ROW_MASK 0xFF +#endif + +// Optional features +#undef ENABLE_DEBUG + +/* +** Return the offset of Xi in bytes from the beginning of the slot. +*/ +#define xi_offset_for_round(round) (8 + ((round) / 2) * 4) + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + diff --git a/kernel/equihash.cl b/kernel/equihash.cl index 8fedf79e..460d20a5 100644 --- a/kernel/equihash.cl +++ b/kernel/equihash.cl @@ -1,833 +1,833 @@ -#include "equihash-param.h" - -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable - -/* -** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in -** bytes in parens): -** -** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1) -** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3) -** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2) -** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4) -** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3) -** round 5, table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5) -** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4) -** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6) -** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5) -** -** If the first byte of Xi is 0xAB then: -** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi -** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but -** 'A' is considered redundant padding as it was used to compute the row # -** -** - cnt is an atomic counter keeping track of the number of used slots. -** it is used in the first slot only; subsequent slots replace it with -** 4 padding bytes -** - i encodes either the 21-bit input value (round 0) or a reference to two -** inputs from the previous round -** -** Formula for Xi length and pad length above: -** > for i in range(9): -** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi -** -** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds -** is the 4 most significant bits of the last byte of Xi. -*/ - -__constant ulong blake_iv[] = -{ - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, -}; - -/* -** Reset counters in hash table. -*/ -__kernel -void kernel_init_ht(__global char *ht, __global uint *rowCounters) -{ - rowCounters[get_global_id(0)] = 0; -} - -/* -** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they -** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): -** aa aa ab bb bb cc cc cd dd... [round 0] -** -------------------- -** ...ab bb bb cc cc cd dd... [odd round] -** -------------- -** ...cc cc cd dd... [next even round] -** ----- -** Bytes underlined are going to be stored in the slot. Preceding bytes -** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are -** used to compute the row number. -** -** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) -** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble) -** TODO: update lines below with padding nibbles -** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) -** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) -** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) -** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) -** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) -** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) -** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) -** -** Return 0 if successfully stored, or 1 if the row overflowed. -*/ -uint ht_store(uint round, __global char *ht, uint i, - ulong xi0, ulong xi1, ulong xi2, ulong xi3, __global uint *rowCounters) -{ - uint row; - __global char *p; - uint cnt; -#if NR_ROWS_LOG == 16 - if (!(round % 2)) - row = (xi0 & 0xffff); - else - // if we have in hex: "ab cd ef..." (little endian xi0) then this - // formula computes the row as 0xdebc. it skips the 'a' nibble as it - // is part of the PREFIX. The Xi will be stored starting with "ef..."; - // 'e' will be considered padding and 'f' is part of the current PREFIX - row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 18 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6); - else - row = ((xi0 & 0xc0000) >> 2) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 19 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5); - else - row = ((xi0 & 0xe0000) >> 1) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 20 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); - else - row = ((xi0 & 0xf0000) >> 0) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#else -#error "unsupported NR_ROWS_LOG" -#endif - xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); - xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); - xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); - p = ht + row * NR_SLOTS * SLOT_LEN; - uint rowIdx = row/ROWS_PER_UINT; - uint rowOffset = BITS_PER_ROW*(row%ROWS_PER_UINT); - uint xcnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); - xcnt = (xcnt >> rowOffset) & ROW_MASK; - cnt = xcnt; - if (cnt >= NR_SLOTS) - { - // avoid overflows - atomic_sub(rowCounters + rowIdx, 1 << rowOffset); - return 1; - } - p += cnt * SLOT_LEN + xi_offset_for_round(round); - // store "i" (always 4 bytes before Xi) - *(__global uint *)(p - 4) = i; - if (round == 0 || round == 1) - { - // store 24 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global ulong *)(p + 16) = xi2; - } - else if (round == 2) - { - // store 20 bytes - *(__global uint *)(p + 0) = xi0; - *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); - *(__global ulong *)(p + 12) = (xi1 >> 32) | (xi2 << 32); - } - else if (round == 3) - { - // store 16 bytes - *(__global uint *)(p + 0) = xi0; - *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); - *(__global uint *)(p + 12) = (xi1 >> 32); - } - else if (round == 4) - { - // store 16 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - } - else if (round == 5) - { - // store 12 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global uint *)(p + 8) = xi1; - } - else if (round == 6 || round == 7) - { - // store 8 bytes - *(__global uint *)(p + 0) = xi0; - *(__global uint *)(p + 4) = (xi0 >> 32); - } - else if (round == 8) - { - // store 4 bytes - *(__global uint *)(p + 0) = xi0; - } - return 0; -} - -#define mix(va, vb, vc, vd, x, y) \ - va = (va + vb + x); \ -vd = rotate((vd ^ va), (ulong)64 - 32); \ -vc = (vc + vd); \ -vb = rotate((vb ^ vc), (ulong)64 - 24); \ -va = (va + vb + y); \ -vd = rotate((vd ^ va), (ulong)64 - 16); \ -vc = (vc + vd); \ -vb = rotate((vb ^ vc), (ulong)64 - 63); - -/* -** Execute round 0 (blake). -** -** Note: making the work group size less than or equal to the wavefront size -** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local -** Memory (LDS) Optimization 2-10" in: -** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ -*/ -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void kernel_round0(__global ulong *blake_state, __global char *ht, - __global uint *rowCounters, __global uint *debug) -{ - uint tid = get_global_id(0); - ulong v[16]; - uint inputs_per_thread = NR_INPUTS / get_global_size(0); - uint input = tid * inputs_per_thread; - uint input_end = (tid + 1) * inputs_per_thread; - uint dropped = 0; - while (input < input_end) - { - // shift "i" to occupy the high 32 bits of the second ulong word in the - // message block - ulong word1 = (ulong)input << 32; - // init vector v - v[0] = blake_state[0]; - v[1] = blake_state[1]; - v[2] = blake_state[2]; - v[3] = blake_state[3]; - v[4] = blake_state[4]; - v[5] = blake_state[5]; - v[6] = blake_state[6]; - v[7] = blake_state[7]; - v[8] = blake_iv[0]; - v[9] = blake_iv[1]; - v[10] = blake_iv[2]; - v[11] = blake_iv[3]; - v[12] = blake_iv[4]; - v[13] = blake_iv[5]; - v[14] = blake_iv[6]; - v[15] = blake_iv[7]; - // mix in length of data - v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; - // last block - v[14] ^= (ulong)-1; - - // round 1 - mix(v[0], v[4], v[8], v[12], 0, word1); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 2 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], word1, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 3 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, word1); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 4 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, word1); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 5 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, word1); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 6 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], word1, 0); - // round 7 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], word1, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 8 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, word1); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 9 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], word1, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 10 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], word1, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 11 - mix(v[0], v[4], v[8], v[12], 0, word1); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 12 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], word1, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - - // compress v into the blake state; this produces the 50-byte hash - // (two Xi values) - ulong h[7]; - h[0] = blake_state[0] ^ v[0] ^ v[8]; - h[1] = blake_state[1] ^ v[1] ^ v[9]; - h[2] = blake_state[2] ^ v[2] ^ v[10]; - h[3] = blake_state[3] ^ v[3] ^ v[11]; - h[4] = blake_state[4] ^ v[4] ^ v[12]; - h[5] = blake_state[5] ^ v[5] ^ v[13]; - h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; - - // store the two Xi values in the hash table -#if ZCASH_HASH_LEN == 50 - dropped += ht_store(0, ht, input * 2, - h[0], - h[1], - h[2], - h[3], rowCounters); - dropped += ht_store(0, ht, input * 2 + 1, - (h[3] >> 8) | (h[4] << (64 - 8)), - (h[4] >> 8) | (h[5] << (64 - 8)), - (h[5] >> 8) | (h[6] << (64 - 8)), - (h[6] >> 8), rowCounters); -#else -#error "unsupported ZCASH_HASH_LEN" -#endif - - input++; - } -#ifdef ENABLE_DEBUG - debug[tid * 2] = 0; - debug[tid * 2 + 1] = dropped; -#endif -} - -#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) -#define DECODE_ROW(REF) (REF >> 16) -#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) -#define DECODE_SLOT0(REF) (REF & 0xff) - -#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) -#define DECODE_ROW(REF) (REF >> 14) -#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) -#define DECODE_SLOT0(REF) (REF & 0x7f) - -#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ -#define DECODE_ROW(REF) (REF >> 13) -#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) -#define DECODE_SLOT0(REF) (REF & 0x3f) - -#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) -#define DECODE_ROW(REF) (REF >> 12) -#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) -#define DECODE_SLOT0(REF) (REF & 0x3f) - -#else -#error "unsupported NR_ROWS_LOG" -#endif - -/* -** Access a half-aligned long, that is a long aligned on a 4-byte boundary. -*/ -ulong half_aligned_long(__global ulong *p, uint offset) -{ - return - (((ulong)*(__global uint *)((__global char *)p + offset + 0)) << 0) | - (((ulong)*(__global uint *)((__global char *)p + offset + 4)) << 32); -} - -/* -** Access a well-aligned int. -*/ -uint well_aligned_int(__global ulong *_p, uint offset) -{ - __global char *p = (__global char *)_p; - return *(__global uint *)(p + offset); -} - -/* -** XOR a pair of Xi values computed at "round - 1" and store the result in the -** hash table being built for "round". Note that when building the table for -** even rounds we need to skip 1 padding byte present in the "round - 1" table -** (the "0xAB" byte mentioned in the description at the top of this file.) But -** also note we can't load data directly past this byte because this would -** cause an unaligned memory access which is undefined per the OpenCL spec. -** -** Return 0 if successfully stored, or 1 if the row overflowed. -*/ -uint xor_and_store(uint round, __global char *ht_dst, uint row, - uint slot_a, uint slot_b, __global ulong *a, __global ulong *b, - __global uint *rowCounters) -{ - ulong xi0, xi1, xi2; -#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 - // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not - // storing the byte containing bits from the previous PREFIX block for - if (round == 1 || round == 2) - { - // xor 24 bytes - xi0 = *(a++) ^ *(b++); - xi1 = *(a++) ^ *(b++); - xi2 = *a ^ *b; - if (round == 2) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8) | (xi2 << (64 - 8)); - xi2 = (xi2 >> 8); - } - } - else if (round == 3) - { - // xor 20 bytes - xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); - xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); - xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16); - } - else if (round == 4 || round == 5) - { - // xor 16 bytes - xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); - xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); - xi2 = 0; - if (round == 4) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 6) - { - // xor 12 bytes - xi0 = *a++ ^ *b++; - xi1 = *(__global uint *)a ^ *(__global uint *)b; - xi2 = 0; - if (round == 6) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 7 || round == 8) - { - // xor 8 bytes - xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); - xi1 = 0; - xi2 = 0; - if (round == 8) - { - // skip padding byte - xi0 = (xi0 >> 8); - } - } - // invalid solutions (which start happenning in round 5) have duplicate - // inputs and xor to zero, so discard them - if (!xi0 && !xi1) - return 0; -#else -#error "unsupported NR_ROWS_LOG" -#endif - return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b), - xi0, xi1, xi2, 0, rowCounters); -} - -/* -** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi, -** store them in ht_dst. -*/ -void equihash_round(uint round, - __global char *ht_src, - __global char *ht_dst, - __global uint *debug, - __local uchar *first_words_data, - __local uint *collisionsData, - __local uint *collisionsNum, - __global uint *rowCountersSrc, - __global uint *rowCountersDst, - uint threadsPerRow) -{ - uint globalTid = get_global_id(0) / threadsPerRow; - uint localTid = get_local_id(0) / threadsPerRow; - uint localGroupId = get_local_id(0) % threadsPerRow; - __local uchar *first_words = &first_words_data[NR_SLOTS*localTid]; - - __global char *p; - uint cnt; - uchar mask; - uint i, j; - // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to - // make it even larger - uint n; - uint dropped_coll = 0; - uint dropped_stor = 0; - __global ulong *a, *b; - uint xi_offset; - // read first words of Xi from the previous (round - 1) hash table - xi_offset = xi_offset_for_round(round - 1); - // the mask is also computed to read data from the previous round -#if NR_ROWS_LOG <= 16 - mask = ((!(round % 2)) ? 0x0f : 0xf0); -#elif NR_ROWS_LOG == 18 - mask = ((!(round % 2)) ? 0x03 : 0x30); -#elif NR_ROWS_LOG == 19 - mask = ((!(round % 2)) ? 0x01 : 0x10); -#elif NR_ROWS_LOG == 20 - mask = 0; /* we can vastly simplify the code below */ -#else -#error "unsupported NR_ROWS_LOG" -#endif - - for (uint chunk = 0; chunk < threadsPerRow; chunk++) { - uint tid = globalTid + NR_ROWS/threadsPerRow*chunk; - uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); -// for (uint tid = get_global_id(0)/threadsPerRow; tid < NR_ROWS; tid += get_global_size(0)/threadsPerRow) { - - uint rowIdx = tid/ROWS_PER_UINT; - uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); - cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; - cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round - - *collisionsNum = 0; - p = (ht_src + tid * NR_SLOTS * SLOT_LEN); - p += xi_offset; - p += SLOT_LEN*localGroupId; - for (i = localGroupId; i < cnt; i += threadsPerRow, p += SLOT_LEN*threadsPerRow) - first_words[i] = (*(__global uchar *)p) & mask; - barrier(CLK_LOCAL_MEM_FENCE); - - if (cnt == 0) - // no elements in row, no collisions - goto part2; - // find collisions - for (i = 0; i < cnt-1; i++) - { - uchar data_i = first_words[i]; - uint collision = (localTid << 24) | (i << 12) | (i + 1 + localGroupId); - for (j = i + 1 + localGroupId; j < cnt; j += threadsPerRow) - { - if (data_i == first_words[j]) - { - uint index = atomic_inc(collisionsNum); - if (index >= LDS_COLL_SIZE) { - atomic_dec(collisionsNum); - goto part2; - } - collisionsData[index] = collision; - } - collision += threadsPerRow; - } - } - -part2: - barrier(CLK_LOCAL_MEM_FENCE); - uint totalCollisions = *collisionsNum; - for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0)) - { - uint collision = collisionsData[index]; - uint collisionThreadId = gid + (collision >> 24); - uint i = (collision >> 12) & 0xFFF; - uint j = collision & 0xFFF; - __global uchar *ptr = ht_src + collisionThreadId * NR_SLOTS * SLOT_LEN + - xi_offset; - a = (__global ulong *)(ptr + i * SLOT_LEN); - b = (__global ulong *)(ptr + j * SLOT_LEN); - dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j, - a, b, rowCountersDst); - } - } - -#ifdef ENABLE_DEBUG - debug[tid * 2] = dropped_coll; - debug[tid * 2 + 1] = dropped_stor; -#endif -} - -/* -** This defines kernel_round1, kernel_round2, ..., kernel_round7. -*/ -#define KERNEL_ROUND(N) \ -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) \ -void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ - __global uint *rowCountersSrc, __global uint *rowCountersDst, \ - __global uint *debug) \ -{ \ - __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; \ - __local uint collisionsData[LDS_COLL_SIZE]; \ - __local uint collisionsNum; \ - equihash_round(N, ht_src, ht_dst, debug, first_words_data, collisionsData, \ - &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); \ -} -KERNEL_ROUND(1) -KERNEL_ROUND(2) -KERNEL_ROUND(3) -KERNEL_ROUND(4) -KERNEL_ROUND(5) -KERNEL_ROUND(6) -KERNEL_ROUND(7) - -// kernel_round8 takes an extra argument, "sols" -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void kernel_round8(__global char *ht_src, __global char *ht_dst, - __global uint *rowCountersSrc, __global uint *rowCountersDst, - __global uint *debug, __global sols_t *sols) -{ - uint tid = get_global_id(0); - __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; - __local uint collisionsData[LDS_COLL_SIZE]; - __local uint collisionsNum; - equihash_round(8, ht_src, ht_dst, debug, first_words_data, collisionsData, - &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); - if (!tid) - sols->nr = sols->likely_invalids = 0; -} - -uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) -{ - return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN + - slot * SLOT_LEN + xi_offset - 4); -} - -/* -** Expand references to inputs. Return 1 if so far the solution appears valid, -** or 0 otherwise (an invalid solution would be a solution with duplicate -** inputs, which can be detected at the last step: round == 0). -*/ -uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs, - uint round) -{ - __global char *ht = htabs[round % 2]; - uint i = nr_inputs - 1; - uint j = nr_inputs * 2 - 1; - uint xi_offset = xi_offset_for_round(round); - int dup_to_watch = -1; - do - { - ins[j] = expand_ref(ht, xi_offset, - DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); - ins[j - 1] = expand_ref(ht, xi_offset, - DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); - if (!round) - { - if (dup_to_watch == -1) - dup_to_watch = ins[j]; - else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) - return 0; - } - if (!i) - break ; - i--; - j -= 2; - } - while (1); - return 1; -} - -/* -** Verify if a potential solution is in fact valid. -*/ -void potential_sol(__global char **htabs, __global sols_t *sols, - uint ref0, uint ref1) -{ - uint nr_values; - uint values_tmp[(1 << PARAM_K)]; - uint sol_i; - uint i; - nr_values = 0; - values_tmp[nr_values++] = ref0; - values_tmp[nr_values++] = ref1; - uint round = PARAM_K - 1; - do - { - round--; - if (!expand_refs(values_tmp, nr_values, htabs, round)) - return ; - nr_values *= 2; - } - while (round > 0); - // solution appears valid, copy it to sols - sol_i = atomic_inc(&sols->nr); - if (sol_i >= MAX_SOLS) - return ; - for (i = 0; i < (1 << PARAM_K); i++) - sols->values[sol_i][i] = values_tmp[i]; - sols->valid[sol_i] = 1; -} - -/* -** Scan the hash tables to find Equihash solutions. -*/ -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols, - __global uint *rowCountersSrc, __global uint *rowCountersDst) -{ - __local uint counters[WORKSIZE/THREADS_PER_ROW]; - __local uint refs[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; - __local uint data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; - __local uint collisionsNum; - __local ulong collisions[WORKSIZE*4]; - - uint globalTid = get_global_id(0) / THREADS_PER_ROW; - uint localTid = get_local_id(0) / THREADS_PER_ROW; - uint localGroupId = get_local_id(0) % THREADS_PER_ROW; - __local uint *refsPtr = &refs[NR_SLOTS*localTid]; - __local uint *dataPtr = &data[NR_SLOTS*localTid]; - - __global char *htabs[2] = { ht0, ht1 }; - __global char *hcounters[2] = { rowCountersSrc, rowCountersDst }; - uint ht_i = (PARAM_K - 1) % 2; // table filled at last round - uint cnt; - uint xi_offset = xi_offset_for_round(PARAM_K - 1); - uint i, j; - __global char *p; - uint ref_i, ref_j; - // it's ok for the collisions array to be so small, as if it fills up - // the potential solutions are likely invalid (many duplicate inputs) -// ulong collisions; -#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 - // in the final hash table, we are looking for a match on both the bits - // part of the previous PREFIX colliding bits, and the last PREFIX bits. - uint mask = 0xffffff; -#else -#error "unsupported NR_ROWS_LOG" -#endif - - collisionsNum = 0; - - for (uint chunk = 0; chunk < THREADS_PER_ROW; chunk++) { - uint tid = globalTid + NR_ROWS/THREADS_PER_ROW*chunk; - p = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN; - uint rowIdx = tid/ROWS_PER_UINT; - uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); - cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; - cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round - p += xi_offset; - p += SLOT_LEN*localGroupId; - - for (i = get_local_id(0); i < WORKSIZE/THREADS_PER_ROW; i += get_local_size(0)) - counters[i] = 0; - for (i = localGroupId; i < cnt; i += THREADS_PER_ROW, p += SLOT_LEN*THREADS_PER_ROW) { - refsPtr[i] = *(__global uint *)(p - 4); - dataPtr[i] = (*(__global uint *)p) & mask; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (i = 0; i < cnt; i++) - { - uint a_data = dataPtr[i]; - ref_i = refsPtr[i]; - for (j = i + 1 + localGroupId; j < cnt; j += THREADS_PER_ROW) - { - if (a_data == dataPtr[j]) - { - if (atomic_inc(&counters[localTid]) == 0) - collisions[atomic_inc(&collisionsNum)] = ((ulong)ref_i << 32) | refsPtr[j]; - goto part2; - } - } - } - -part2: - continue; - } - - barrier(CLK_LOCAL_MEM_FENCE); - uint totalCollisions = collisionsNum; - if (get_local_id(0) < totalCollisions) { - ulong coll = collisions[get_local_id(0)]; - potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff); - } -} +#include "equihash-param.h" + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +/* +** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in +** bytes in parens): +** +** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1) +** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3) +** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2) +** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4) +** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3) +** round 5, table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5) +** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4) +** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6) +** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5) +** +** If the first byte of Xi is 0xAB then: +** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi +** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but +** 'A' is considered redundant padding as it was used to compute the row # +** +** - cnt is an atomic counter keeping track of the number of used slots. +** it is used in the first slot only; subsequent slots replace it with +** 4 padding bytes +** - i encodes either the 21-bit input value (round 0) or a reference to two +** inputs from the previous round +** +** Formula for Xi length and pad length above: +** > for i in range(9): +** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi +** +** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds +** is the 4 most significant bits of the last byte of Xi. +*/ + +__constant ulong blake_iv[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +/* +** Reset counters in hash table. +*/ +__kernel +void kernel_init_ht(__global char *ht, __global uint *rowCounters) +{ + rowCounters[get_global_id(0)] = 0; +} + +/* +** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they +** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): +** aa aa ab bb bb cc cc cd dd... [round 0] +** -------------------- +** ...ab bb bb cc cc cd dd... [odd round] +** -------------- +** ...cc cc cd dd... [next even round] +** ----- +** Bytes underlined are going to be stored in the slot. Preceding bytes +** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are +** used to compute the row number. +** +** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) +** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble) +** TODO: update lines below with padding nibbles +** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) +** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) +** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) +** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) +** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) +** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) +** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) +** +** Return 0 if successfully stored, or 1 if the row overflowed. +*/ +uint ht_store(uint round, __global char *ht, uint i, + ulong xi0, ulong xi1, ulong xi2, ulong xi3, __global uint *rowCounters) +{ + uint row; + __global char *p; + uint cnt; +#if NR_ROWS_LOG == 16 + if (!(round % 2)) + row = (xi0 & 0xffff); + else + // if we have in hex: "ab cd ef..." (little endian xi0) then this + // formula computes the row as 0xdebc. it skips the 'a' nibble as it + // is part of the PREFIX. The Xi will be stored starting with "ef..."; + // 'e' will be considered padding and 'f' is part of the current PREFIX + row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 18 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6); + else + row = ((xi0 & 0xc0000) >> 2) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 19 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5); + else + row = ((xi0 & 0xe0000) >> 1) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#elif NR_ROWS_LOG == 20 + if (!(round % 2)) + row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); + else + row = ((xi0 & 0xf0000) >> 0) | + ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | + ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); +#else +#error "unsupported NR_ROWS_LOG" +#endif + xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); + xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); + xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); + p = ht + row * NR_SLOTS * SLOT_LEN; + uint rowIdx = row/ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(row%ROWS_PER_UINT); + uint xcnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); + xcnt = (xcnt >> rowOffset) & ROW_MASK; + cnt = xcnt; + if (cnt >= NR_SLOTS) + { + // avoid overflows + atomic_sub(rowCounters + rowIdx, 1 << rowOffset); + return 1; + } + p += cnt * SLOT_LEN + xi_offset_for_round(round); + // store "i" (always 4 bytes before Xi) + *(__global uint *)(p - 4) = i; + if (round == 0 || round == 1) + { + // store 24 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global ulong *)(p + 8) = xi1; + *(__global ulong *)(p + 16) = xi2; + } + else if (round == 2) + { + // store 20 bytes + *(__global uint *)(p + 0) = xi0; + *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); + *(__global ulong *)(p + 12) = (xi1 >> 32) | (xi2 << 32); + } + else if (round == 3) + { + // store 16 bytes + *(__global uint *)(p + 0) = xi0; + *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); + *(__global uint *)(p + 12) = (xi1 >> 32); + } + else if (round == 4) + { + // store 16 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global ulong *)(p + 8) = xi1; + } + else if (round == 5) + { + // store 12 bytes + *(__global ulong *)(p + 0) = xi0; + *(__global uint *)(p + 8) = xi1; + } + else if (round == 6 || round == 7) + { + // store 8 bytes + *(__global uint *)(p + 0) = xi0; + *(__global uint *)(p + 4) = (xi0 >> 32); + } + else if (round == 8) + { + // store 4 bytes + *(__global uint *)(p + 0) = xi0; + } + return 0; +} + +#define mix(va, vb, vc, vd, x, y) \ + va = (va + vb + x); \ +vd = rotate((vd ^ va), (ulong)64 - 32); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 24); \ +va = (va + vb + y); \ +vd = rotate((vd ^ va), (ulong)64 - 16); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 63); + +/* +** Execute round 0 (blake). +** +** Note: making the work group size less than or equal to the wavefront size +** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local +** Memory (LDS) Optimization 2-10" in: +** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ +*/ +__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +void kernel_round0(__global ulong *blake_state, __global char *ht, + __global uint *rowCounters, __global uint *debug) +{ + uint tid = get_global_id(0); + ulong v[16]; + uint inputs_per_thread = NR_INPUTS / get_global_size(0); + uint input = tid * inputs_per_thread; + uint input_end = (tid + 1) * inputs_per_thread; + uint dropped = 0; + while (input < input_end) + { + // shift "i" to occupy the high 32 bits of the second ulong word in the + // message block + ulong word1 = (ulong)input << 32; + // init vector v + v[0] = blake_state[0]; + v[1] = blake_state[1]; + v[2] = blake_state[2]; + v[3] = blake_state[3]; + v[4] = blake_state[4]; + v[5] = blake_state[5]; + v[6] = blake_state[6]; + v[7] = blake_state[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4]; + v[13] = blake_iv[5]; + v[14] = blake_iv[6]; + v[15] = blake_iv[7]; + // mix in length of data + v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; + // last block + v[14] ^= (ulong)-1; + + // round 1 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 2 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 3 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, word1); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 4 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, word1); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 5 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, word1); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 6 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], word1, 0); + // round 7 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], word1, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 8 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, word1); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 9 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], word1, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 10 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], word1, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 11 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 12 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + + // compress v into the blake state; this produces the 50-byte hash + // (two Xi values) + ulong h[7]; + h[0] = blake_state[0] ^ v[0] ^ v[8]; + h[1] = blake_state[1] ^ v[1] ^ v[9]; + h[2] = blake_state[2] ^ v[2] ^ v[10]; + h[3] = blake_state[3] ^ v[3] ^ v[11]; + h[4] = blake_state[4] ^ v[4] ^ v[12]; + h[5] = blake_state[5] ^ v[5] ^ v[13]; + h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; + + // store the two Xi values in the hash table +#if ZCASH_HASH_LEN == 50 + dropped += ht_store(0, ht, input * 2, + h[0], + h[1], + h[2], + h[3], rowCounters); + dropped += ht_store(0, ht, input * 2 + 1, + (h[3] >> 8) | (h[4] << (64 - 8)), + (h[4] >> 8) | (h[5] << (64 - 8)), + (h[5] >> 8) | (h[6] << (64 - 8)), + (h[6] >> 8), rowCounters); +#else +#error "unsupported ZCASH_HASH_LEN" +#endif + + input++; + } +#ifdef ENABLE_DEBUG + debug[tid * 2] = 0; + debug[tid * 2 + 1] = dropped; +#endif +} + +#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +/* +** Access a half-aligned long, that is a long aligned on a 4-byte boundary. +*/ +ulong half_aligned_long(__global ulong *p, uint offset) +{ + return + (((ulong)*(__global uint *)((__global char *)p + offset + 0)) << 0) | + (((ulong)*(__global uint *)((__global char *)p + offset + 4)) << 32); +} + +/* +** Access a well-aligned int. +*/ +uint well_aligned_int(__global ulong *_p, uint offset) +{ + __global char *p = (__global char *)_p; + return *(__global uint *)(p + offset); +} + +/* +** XOR a pair of Xi values computed at "round - 1" and store the result in the +** hash table being built for "round". Note that when building the table for +** even rounds we need to skip 1 padding byte present in the "round - 1" table +** (the "0xAB" byte mentioned in the description at the top of this file.) But +** also note we can't load data directly past this byte because this would +** cause an unaligned memory access which is undefined per the OpenCL spec. +** +** Return 0 if successfully stored, or 1 if the row overflowed. +*/ +uint xor_and_store(uint round, __global char *ht_dst, uint row, + uint slot_a, uint slot_b, __global ulong *a, __global ulong *b, + __global uint *rowCounters) +{ + ulong xi0, xi1, xi2; +#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 + // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not + // storing the byte containing bits from the previous PREFIX block for + if (round == 1 || round == 2) + { + // xor 24 bytes + xi0 = *(a++) ^ *(b++); + xi1 = *(a++) ^ *(b++); + xi2 = *a ^ *b; + if (round == 2) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8) | (xi2 << (64 - 8)); + xi2 = (xi2 >> 8); + } + } + else if (round == 3) + { + // xor 20 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); + xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16); + } + else if (round == 4 || round == 5) + { + // xor 16 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); + xi2 = 0; + if (round == 4) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8); + } + } + else if (round == 6) + { + // xor 12 bytes + xi0 = *a++ ^ *b++; + xi1 = *(__global uint *)a ^ *(__global uint *)b; + xi2 = 0; + if (round == 6) + { + // skip padding byte + xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); + xi1 = (xi1 >> 8); + } + } + else if (round == 7 || round == 8) + { + // xor 8 bytes + xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); + xi1 = 0; + xi2 = 0; + if (round == 8) + { + // skip padding byte + xi0 = (xi0 >> 8); + } + } + // invalid solutions (which start happenning in round 5) have duplicate + // inputs and xor to zero, so discard them + if (!xi0 && !xi1) + return 0; +#else +#error "unsupported NR_ROWS_LOG" +#endif + return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b), + xi0, xi1, xi2, 0, rowCounters); +} + +/* +** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi, +** store them in ht_dst. +*/ +void equihash_round(uint round, + __global char *ht_src, + __global char *ht_dst, + __global uint *debug, + __local uchar *first_words_data, + __local uint *collisionsData, + __local uint *collisionsNum, + __global uint *rowCountersSrc, + __global uint *rowCountersDst, + uint threadsPerRow) +{ + uint globalTid = get_global_id(0) / threadsPerRow; + uint localTid = get_local_id(0) / threadsPerRow; + uint localGroupId = get_local_id(0) % threadsPerRow; + __local uchar *first_words = &first_words_data[NR_SLOTS*localTid]; + + __global char *p; + uint cnt; + uchar mask; + uint i, j; + // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to + // make it even larger + uint n; + uint dropped_coll = 0; + uint dropped_stor = 0; + __global ulong *a, *b; + uint xi_offset; + // read first words of Xi from the previous (round - 1) hash table + xi_offset = xi_offset_for_round(round - 1); + // the mask is also computed to read data from the previous round +#if NR_ROWS_LOG <= 16 + mask = ((!(round % 2)) ? 0x0f : 0xf0); +#elif NR_ROWS_LOG == 18 + mask = ((!(round % 2)) ? 0x03 : 0x30); +#elif NR_ROWS_LOG == 19 + mask = ((!(round % 2)) ? 0x01 : 0x10); +#elif NR_ROWS_LOG == 20 + mask = 0; /* we can vastly simplify the code below */ +#else +#error "unsupported NR_ROWS_LOG" +#endif + + for (uint chunk = 0; chunk < threadsPerRow; chunk++) { + uint tid = globalTid + NR_ROWS/threadsPerRow*chunk; + uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); +// for (uint tid = get_global_id(0)/threadsPerRow; tid < NR_ROWS; tid += get_global_size(0)/threadsPerRow) { + + uint rowIdx = tid/ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round + + *collisionsNum = 0; + p = (ht_src + tid * NR_SLOTS * SLOT_LEN); + p += xi_offset; + p += SLOT_LEN*localGroupId; + for (i = localGroupId; i < cnt; i += threadsPerRow, p += SLOT_LEN*threadsPerRow) + first_words[i] = (*(__global uchar *)p) & mask; + barrier(CLK_LOCAL_MEM_FENCE); + + if (cnt == 0) + // no elements in row, no collisions + goto part2; + // find collisions + for (i = 0; i < cnt-1; i++) + { + uchar data_i = first_words[i]; + uint collision = (localTid << 24) | (i << 12) | (i + 1 + localGroupId); + for (j = i + 1 + localGroupId; j < cnt; j += threadsPerRow) + { + if (data_i == first_words[j]) + { + uint index = atomic_inc(collisionsNum); + if (index >= LDS_COLL_SIZE) { + atomic_dec(collisionsNum); + goto part2; + } + collisionsData[index] = collision; + } + collision += threadsPerRow; + } + } + +part2: + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = *collisionsNum; + for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0)) + { + uint collision = collisionsData[index]; + uint collisionThreadId = gid + (collision >> 24); + uint i = (collision >> 12) & 0xFFF; + uint j = collision & 0xFFF; + __global uchar *ptr = ht_src + collisionThreadId * NR_SLOTS * SLOT_LEN + + xi_offset; + a = (__global ulong *)(ptr + i * SLOT_LEN); + b = (__global ulong *)(ptr + j * SLOT_LEN); + dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j, + a, b, rowCountersDst); + } + } + +#ifdef ENABLE_DEBUG + debug[tid * 2] = dropped_coll; + debug[tid * 2 + 1] = dropped_stor; +#endif +} + +/* +** This defines kernel_round1, kernel_round2, ..., kernel_round7. +*/ +#define KERNEL_ROUND(N) \ +__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) \ +void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ + __global uint *rowCountersSrc, __global uint *rowCountersDst, \ + __global uint *debug) \ +{ \ + __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; \ + __local uint collisionsData[LDS_COLL_SIZE]; \ + __local uint collisionsNum; \ + equihash_round(N, ht_src, ht_dst, debug, first_words_data, collisionsData, \ + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); \ +} +KERNEL_ROUND(1) +KERNEL_ROUND(2) +KERNEL_ROUND(3) +KERNEL_ROUND(4) +KERNEL_ROUND(5) +KERNEL_ROUND(6) +KERNEL_ROUND(7) + +// kernel_round8 takes an extra argument, "sols" +__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +void kernel_round8(__global char *ht_src, __global char *ht_dst, + __global uint *rowCountersSrc, __global uint *rowCountersDst, + __global uint *debug, __global sols_t *sols) +{ + uint tid = get_global_id(0); + __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; + __local uint collisionsData[LDS_COLL_SIZE]; + __local uint collisionsNum; + equihash_round(8, ht_src, ht_dst, debug, first_words_data, collisionsData, + &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); + if (!tid) + sols->nr = sols->likely_invalids = 0; +} + +uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) +{ + return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN + + slot * SLOT_LEN + xi_offset - 4); +} + +/* +** Expand references to inputs. Return 1 if so far the solution appears valid, +** or 0 otherwise (an invalid solution would be a solution with duplicate +** inputs, which can be detected at the last step: round == 0). +*/ +uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs, + uint round) +{ + __global char *ht = htabs[round % 2]; + uint i = nr_inputs - 1; + uint j = nr_inputs * 2 - 1; + uint xi_offset = xi_offset_for_round(round); + int dup_to_watch = -1; + do + { + ins[j] = expand_ref(ht, xi_offset, + DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); + ins[j - 1] = expand_ref(ht, xi_offset, + DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); + if (!round) + { + if (dup_to_watch == -1) + dup_to_watch = ins[j]; + else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) + return 0; + } + if (!i) + break ; + i--; + j -= 2; + } + while (1); + return 1; +} + +/* +** Verify if a potential solution is in fact valid. +*/ +void potential_sol(__global char **htabs, __global sols_t *sols, + uint ref0, uint ref1) +{ + uint nr_values; + uint values_tmp[(1 << PARAM_K)]; + uint sol_i; + uint i; + nr_values = 0; + values_tmp[nr_values++] = ref0; + values_tmp[nr_values++] = ref1; + uint round = PARAM_K - 1; + do + { + round--; + if (!expand_refs(values_tmp, nr_values, htabs, round)) + return ; + nr_values *= 2; + } + while (round > 0); + // solution appears valid, copy it to sols + sol_i = atomic_inc(&sols->nr); + if (sol_i >= MAX_SOLS) + return ; + for (i = 0; i < (1 << PARAM_K); i++) + sols->values[sol_i][i] = values_tmp[i]; + sols->valid[sol_i] = 1; +} + +/* +** Scan the hash tables to find Equihash solutions. +*/ +__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols, + __global uint *rowCountersSrc, __global uint *rowCountersDst) +{ + __local uint counters[WORKSIZE/THREADS_PER_ROW]; + __local uint refs[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; + __local uint data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; + __local uint collisionsNum; + __local ulong collisions[WORKSIZE*4]; + + uint globalTid = get_global_id(0) / THREADS_PER_ROW; + uint localTid = get_local_id(0) / THREADS_PER_ROW; + uint localGroupId = get_local_id(0) % THREADS_PER_ROW; + __local uint *refsPtr = &refs[NR_SLOTS*localTid]; + __local uint *dataPtr = &data[NR_SLOTS*localTid]; + + __global char *htabs[2] = { ht0, ht1 }; + __global char *hcounters[2] = { rowCountersSrc, rowCountersDst }; + uint ht_i = (PARAM_K - 1) % 2; // table filled at last round + uint cnt; + uint xi_offset = xi_offset_for_round(PARAM_K - 1); + uint i, j; + __global char *p; + uint ref_i, ref_j; + // it's ok for the collisions array to be so small, as if it fills up + // the potential solutions are likely invalid (many duplicate inputs) +// ulong collisions; +#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 + // in the final hash table, we are looking for a match on both the bits + // part of the previous PREFIX colliding bits, and the last PREFIX bits. + uint mask = 0xffffff; +#else +#error "unsupported NR_ROWS_LOG" +#endif + + collisionsNum = 0; + + for (uint chunk = 0; chunk < THREADS_PER_ROW; chunk++) { + uint tid = globalTid + NR_ROWS/THREADS_PER_ROW*chunk; + p = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN; + uint rowIdx = tid/ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + p += xi_offset; + p += SLOT_LEN*localGroupId; + + for (i = get_local_id(0); i < WORKSIZE/THREADS_PER_ROW; i += get_local_size(0)) + counters[i] = 0; + for (i = localGroupId; i < cnt; i += THREADS_PER_ROW, p += SLOT_LEN*THREADS_PER_ROW) { + refsPtr[i] = *(__global uint *)(p - 4); + dataPtr[i] = (*(__global uint *)p) & mask; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (i = 0; i < cnt; i++) + { + uint a_data = dataPtr[i]; + ref_i = refsPtr[i]; + for (j = i + 1 + localGroupId; j < cnt; j += THREADS_PER_ROW) + { + if (a_data == dataPtr[j]) + { + if (atomic_inc(&counters[localTid]) == 0) + collisions[atomic_inc(&collisionsNum)] = ((ulong)ref_i << 32) | refsPtr[j]; + goto part2; + } + } + } + +part2: + continue; + } + + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = collisionsNum; + if (get_local_id(0) < totalCollisions) { + ulong coll = collisions[get_local_id(0)]; + potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff); + } +} diff --git a/ocl.c b/ocl.c index 50e3472d..67246f1e 100644 --- a/ocl.c +++ b/ocl.c @@ -959,7 +959,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg * 2 greater >= required amount earlier */ if (bufsize > cgpu->max_alloc) { applog(LOG_WARNING, "Maximum buffer memory device %d supports says %lu", - gpu, (unsigned long)(cgpu->max_alloc)); + gpu, (unsigned long)(cgpu->max_alloc)); applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize); } @@ -967,35 +967,35 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg // need additionnal buffers clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); if (status != CL_SUCCESS && !clState->buffer1) { - applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); - return NULL; + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; } clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf2size, NULL, &status); if (status != CL_SUCCESS && !clState->buffer2) { - applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status); - return NULL; + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status); + return NULL; } clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf3size, NULL, &status); if (status != CL_SUCCESS && !clState->buffer3) { - applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer3), decrease TC or increase LG", status); - return NULL; + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer3), decrease TC or increase LG", status); + return NULL; } } else if (algorithm->type == ALGO_LYRA2REV2) { // need additionnal buffers clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); if (status != CL_SUCCESS && !clState->buffer1) { - applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); - return NULL; + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; } } else { clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); // we don't need that much just tired... if (status != CL_SUCCESS && !clState->buffer1) { - applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); - return NULL; + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; } } @@ -1009,40 +1009,35 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } } - if(algorithm->type == ALGO_CRYPTONIGHT) - { - size_t GlobalThreads; - readbufsize = 76UL; + if (algorithm->type == ALGO_CRYPTONIGHT) { + size_t GlobalThreads; + readbufsize = 76UL; - set_threads_hashes(1, clState->compute_shaders, &GlobalThreads, 1, &cgpu->intensity, &cgpu->xintensity, &cgpu->rawintensity, &cgpu->algorithm); + set_threads_hashes(1, clState->compute_shaders, &GlobalThreads, 1, &cgpu->intensity, &cgpu->xintensity, &cgpu->rawintensity, &cgpu->algorithm); - for(int i = 0; i < 4; ++i) - { - clState->BranchBuffer[i] = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(cl_uint) * (GlobalThreads + 2), NULL, &status); + for (int i = 0; i < 4; ++i) { + clState->BranchBuffer[i] = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(cl_uint) * (GlobalThreads + 2), NULL, &status); - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d when creating branch buffer %d.\n", status, i); - return NULL; - } - } + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d when creating branch buffer %d.\n", status, i); + return NULL; + } + } - clState->States = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, 200 * GlobalThreads, NULL, &status); + clState->States = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, 200 * GlobalThreads, NULL, &status); - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d when creating Cryptonight state buffer.\n", status); - return NULL; - } + if(status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d when creating Cryptonight state buffer.\n", status); + return NULL; + } - clState->Scratchpads = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, (1 << 21) * GlobalThreads, NULL, &status); + clState->Scratchpads = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, (1 << 21) * GlobalThreads, NULL, &status); - if(status != CL_SUCCESS) - { - applog(LOG_ERR, "Error %d when creating Cryptonight scratchpads buffer.\n", status); - return NULL; - } - } + if(status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d when creating Cryptonight scratchpads buffer.\n", status); + return NULL; + } + } applog(LOG_DEBUG, "Using read buffer sized %lu", (unsigned long)readbufsize); clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, readbufsize, NULL, &status); diff --git a/sgminer.c b/sgminer.c index 4df90fb3..2d1f048a 100644 --- a/sgminer.c +++ b/sgminer.c @@ -609,6 +609,7 @@ struct pool *add_pool(void) cglock_init(&pool->data_lock); mutex_init(&pool->stratum_lock); cglock_init(&pool->gbt_lock); + mutex_init(&pool->XMRGlobalNonceLock); INIT_LIST_HEAD(&pool->curlring); /* Make sure the pool doesn't think we've been idle since time 0 */ @@ -3110,7 +3111,7 @@ share_result(json_t *val, json_t *res, json_t *err, const struct work *work, cgpu = get_thr_cgpu(work->thr_id); - if (json_is_true(res) || (work->gbt && json_is_null(res))) { + if (json_is_true(res) || (work->gbt && json_is_null(res)) || (pool->algorithm.type == ALGO_CRYPTONIGHT && json_is_null(err))) { mutex_lock(&stats_lock); cgpu->accepted++; total_accepted++; @@ -5556,7 +5557,7 @@ static bool parse_stratum_response(struct pool *pool, char *s) err_val = json_object_get(val, "error"); id_val = json_object_get(val, "id"); - if ((json_is_null(id_val) || !id_val) && pool->algorithm.type != ALGO_ETHASH) { + if ((json_is_null(id_val) || !id_val) && (pool->algorithm.type != ALGO_ETHASH && pool->algorithm.type != ALGO_CRYPTONIGHT)) { char *ss; if (err_val) @@ -5583,6 +5584,7 @@ static bool parse_stratum_response(struct pool *pool, char *s) if (!sshare) { double pool_diff; + bool success = false; /* Since the share is untracked, we can only guess at what the * work difficulty is based on the current pool diff. */ @@ -5590,7 +5592,47 @@ static bool parse_stratum_response(struct pool *pool, char *s) pool_diff = pool->swork.diff; cg_runlock(&pool->data_lock); - if (json_is_true(res_val)) { + //for cryptonight, the result contains the "status" object which should = "OK" on accept + if (pool->algorithm.type == ALGO_CRYPTONIGHT) { + json_t *res_id, *res_job; + + //check if the result contains an id... if so then we need to process as first job, not share response + if ((res_id = json_object_get(res_val, "id"))) { + cg_wlock(&pool->data_lock); + strcpy(pool->XMRAuthID, json_string_value(res_id)); + cg_wunlock(&pool->data_lock); + + //get the job object and send to parse notify + if ((res_job = json_object_get(res_val, "job"))) { + ret = parse_notify_cn(pool, res_job); + } + + goto out; + } + + if (json_is_null(err_val) && !strcmp(json_string_value(json_object_get(res_val, "status")), "OK")) { + success = true; + } + else { + char *ss; + + if (err_val) { + ss = json_dumps(err_val, JSON_INDENT(3)); + } + else { + ss = strdup("(unknown reason)"); + } + + applog(LOG_INFO, "JSON-RPC response decode failed: %s", ss); + + free(ss); + } + } + else { + success = json_is_true(res_val); + } + + if (success) { applog(LOG_NOTICE, "Accepted untracked stratum share from %s", get_pool_name(pool)); /* We don't know what device this came from so we can't @@ -5730,6 +5772,7 @@ static void wait_lpcurrent(struct pool *pool); static void pool_resus(struct pool *pool); static void gen_stratum_work(struct pool *pool, struct work *work); static void gen_stratum_work_eth(struct pool *pool, struct work *work); +static void gen_stratum_work_cn(struct pool *pool, struct work *work); static void stratum_resumed(struct pool *pool) { if (!pool->stratum_notify) @@ -5852,8 +5895,20 @@ static void *stratum_rthread(void *userdata) /* Generate a single work item to update the current * block database */ pool->swork.clean = false; - if(pool->algorithm.type == ALGO_ETHASH) gen_stratum_work_eth(pool, work); - else gen_stratum_work(pool, work); + + switch(pool->algorithm.type) { + case ALGO_ETHASH: + gen_stratum_work_eth(pool, work); + break; + + case ALGO_CRYPTONIGHT: + gen_stratum_work_cn(pool, work); + break; + + default: + gen_stratum_work(pool, work); + } + work->longpoll = true; /* Return value doesn't matter. We're just informing * that we may need to restart. */ @@ -5934,6 +5989,32 @@ static void *stratum_sthread(void *userdata) free(ASCIIMixHash); free(ASCIIPoWHash); } + else if (pool->algorithm.type == ALGO_CRYPTONIGHT) { + sshare = (struct stratum_share *)calloc(sizeof(struct stratum_share), 1); + submitted = false; + char *ASCIIResult; + uint8_t HashResult[32]; + + sshare->sshare_time = time(NULL); + /* This work item is freed in parse_stratum_response */ + sshare->work = work; + + applog(LOG_DEBUG, "stratum_sthread() algorithm = %s", pool->algorithm.name); + + char *ASCIINonce = bin2hex(&work->XMRNonce, 4); + + ASCIIResult = bin2hex(work->hash, 32); + + mutex_lock(&sshare_lock); + /* Give the stratum share a unique id */ + sshare->id = swork_id++; + mutex_unlock(&sshare_lock); + snprintf(s, s_size, "{\"method\": \"submit\", \"params\": {\"id\": \"%s\", \"job_id\": \"%s\", \"nonce\": \"%s\", \"result\": \"%s\"}, \"id\":%d}", pool->XMRAuthID, work->job_id, ASCIINonce, ASCIIResult, sshare->id); + + free(ASCIINonce); + free(ASCIIResult); + } + else if(pool->algorithm.type == ALGO_EQUIHASH) { char *nonce; hash32 = (uint32_t *)work->hash; @@ -6145,10 +6226,18 @@ static bool pool_active(struct pool *pool, bool pinging) if (!init) { bool ret = initiate_stratum(pool) && (!pool->extranonce_subscribe || subscribe_extranonce(pool)) && auth_stratum(pool); - if (ret) + if (ret) { init_stratum_threads(pool); - else + + if (pool->algorithm.type == ALGO_CRYPTONIGHT) { + struct work *work = make_work(); + gen_stratum_work_cn(pool, work); + stage_work(work); + } + } + else { pool_tclear(pool, &pool->stratum_init); + } return ret; } return pool->stratum_active; @@ -6502,6 +6591,7 @@ void set_target_neoscrypt(unsigned char *target, double diff, const int thr_id) } } + /* Generates stratum based work based on the most recent notify information * from the pool. This will keep generating work while a pool is down so we use * other means to detect when the pool has died in stratum_thread */ @@ -6537,6 +6627,49 @@ static void gen_stratum_work_eth(struct pool *pool, struct work *work) cgtime(&work->tv_staged); } +/* Generates stratum based work based on the most recent notify information + * from the pool. This will keep generating work while a pool is down so we use + * other means to detect when the pool has died in stratum_thread */ + +static void gen_stratum_work_cn(struct pool *pool, struct work *work) +{ + if(pool->algorithm.type != ALGO_CRYPTONIGHT) + return; + + applog(LOG_DEBUG, "[THR%d] gen_stratum_work_cn() - algorithm = %s", work->thr_id, pool->algorithm.name); + + cg_rlock(&pool->data_lock); + work->job_id = strdup(pool->swork.job_id); + work->XMRTarget = pool->XMRTarget; + //strcpy(work->XMRID, pool->XMRID); + //work->XMRBlockBlob = strdup(pool->XMRBlockBlob); + memcpy(work->XMRBlob, pool->XMRBlob, 76); + memcpy(work->data, work->XMRBlob, 76); + memset(work->target, 0xFF, 32); + work->sdiff = (double)0xffffffff / pool->XMRTarget; + work->work_difficulty = work->sdiff; + work->network_diff = pool->diff1; + cg_runlock(&pool->data_lock); + + work->target[7] = work->XMRTarget; + + local_work++; + work->pool = pool; + work->stratum = true; + work->blk.nonce = 0; + work->id = total_work++; + work->longpoll = false; + work->getwork_mode = GETWORK_MODE_STRATUM; + + work->work_block = work->data[0]; + // Do not allow ntime rolling + work->drv_rolllimit = 0; + + cgtime(&work->tv_staged); + + applog(LOG_DEBUG, "gen_stratum_work_cn() done."); +} + static void gen_stratum_work_equihash(struct pool *pool, struct work *work) { cg_wlock(&pool->data_lock); @@ -7524,17 +7657,19 @@ void inc_hw_errors(struct thr_info *thr) static void rebuild_nonce(struct work *work, uint32_t nonce) { uint32_t nonce_pos = 76; - if (work->pool->algorithm.type == ALGO_CRE) nonce_pos = 140; - if(work->pool->algorithm.type == ALGO_ETHASH) - { - uint64_t *work_nonce = (uint64_t *)(work->data + 32); - *work_nonce = (uint64_t)htole32(nonce); + if (work->pool->algorithm.type == ALGO_CRE) + nonce_pos = 140; + else if (work->pool->algorithm.type == ALGO_CRYPTONIGHT) + nonce_pos = 39; + + if (work->pool->algorithm.type == ALGO_ETHASH) { + uint64_t *work_nonce = (uint64_t *)(work->data + 32); + *work_nonce = htole32(nonce); } - else - { - uint32_t *work_nonce = (uint32_t *)(work->data + nonce_pos); + else { + uint32_t *work_nonce = (uint32_t *)(work->data + nonce_pos); - *work_nonce = htole32(nonce); + *work_nonce = htole32(nonce); } work->pool->algorithm.regenhash(work); @@ -7557,6 +7692,9 @@ bool test_nonce(struct work *work, uint32_t nonce) uint64_t target = *(uint64_t*) (work->device_target + 24); return (bswap_64(*(uint64_t*) work->hash) <= target); } + else if (work->pool->algorithm.type = ALGO_CRYPTONIGHT) { + return (((uint32_t *)work->hash)[7] <= work->XMRTarget); + } else { diff1targ = work->pool->algorithm.diff1targ; } @@ -7585,6 +7723,7 @@ static void update_work_stats(struct thr_info *thr, struct work *work) total_diff1 += work->device_diff; thr->cgpu->diff1 += work->device_diff; work->pool->diff1 += work->device_diff; + thr->cgpu->last_device_valid_work = time(NULL); mutex_unlock(&stats_lock); } @@ -7595,14 +7734,15 @@ bool submit_tested_work(struct thr_info *thr, struct work *work) struct work *work_out; update_work_stats(thr, work); - if(work->pool->algorithm.type == ALGO_ETHASH) { + if (work->pool->algorithm.type == ALGO_ETHASH) { uint64_t LETarget = ((uint64_t *)work->target)[3]; - if(bswap_64(((uint64_t *)work->hash)[0]) > LETarget) { -// applog(LOG_INFO, "%s %d: Share above target", thr->cgpu->drv->name, thr->cgpu->device_id); - return(false); + if (bswap_64(((uint64_t *)work->hash)[0]) > LETarget) { + return false; } } + else if (work->pool->algorithm.type == ALGO_CRYPTONIGHT) { + } else if (work->pool->algorithm.type == ALGO_EQUIHASH) { applog(LOG_DEBUG, "equihash target: %.16llx", *(uint64_t*) (work->target + 24)); if (*(uint64_t*) (work->hash + 24) > *(uint64_t*) (work->target + 24)) @@ -9642,8 +9782,20 @@ int main(int argc, char *argv[]) goto retry; } } - if(pool->algorithm.type == ALGO_ETHASH) gen_stratum_work_eth(pool, work); - else gen_stratum_work(pool, work); + + switch(pool->algorithm.type) { + case ALGO_ETHASH: + gen_stratum_work_eth(pool, work); + break; + + case ALGO_CRYPTONIGHT: + gen_stratum_work_cn(pool, work); + break; + + default: + gen_stratum_work(pool, work); + } + applog(LOG_DEBUG, "Generated stratum work"); stage_work(work); continue; diff --git a/util.h b/util.h index 70b594c1..2904af2b 100644 --- a/util.h +++ b/util.h @@ -135,6 +135,7 @@ bool stratum_send(struct pool *pool, char *s, ssize_t len); bool sock_full(struct pool *pool); char *recv_line(struct pool *pool); bool parse_method(struct pool *pool, char *s); +bool parse_notify_cn(struct pool *pool, json_t *val); bool extract_sockaddr(char *url, char **sockaddr_url, char **sockaddr_port); bool auth_stratum(struct pool *pool); bool subscribe_extranonce(struct pool *pool);