diff --git a/algorithm/equihash.c b/algorithm/equihash.c
new file mode 100644
index 00000000..bf339739
--- /dev/null
+++ b/algorithm/equihash.c
@@ -0,0 +1,390 @@
+#include <stdint.h>
+
+#include "config.h"
+#include "miner.h"
+#include "algorithm/equihash.h"
+
+#include "algorithm.h"
+
+#define N 200UL
+#define K 9UL
+
+#define COLLISION_BIT_LENGTH (N / (K + 1))
+#define COLLISION_BYTE_LENGTH ((COLLISION_BIT_LENGTH + 7) / 8)
+#define INIT_SIZE (1 << (COLLISION_BIT_LENGTH + 1))
+#define HASH_LENGTH ((K + 1) * COLLISION_BYTE_LENGTH)
+#define INDICES_PER_HASH_OUTPUT (512 / N)
+#define HASH_OUTPUT (INDICES_PER_HASH_OUTPUT * N/8)
+
+#define rotr64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
+
+
+static const uint8_t blake2b_sigma[12][16] = {
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+  { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
+  { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
+  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
+  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
+  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
+  { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
+  { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
+  { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
+  { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+  { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+
+
+static const uint64_t blake2b_IV[8] = {
+  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+
+static const uint64_t blake2b_h[8] = {
+  0x6a09e667f2bdc93aULL, 0xbb67ae8584caa73bULL,
+  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+  0x48ec89c38820de31ULL, 0x5be0cd10137e21b1ULL
+};
+
+
+// block_header_size: 108 byte
+// nonce_size: 
32 byte +// birthday_size: 4 byte +// nonce in high part of m[1] + + +#define G(r,i,a,b,c,d) \ + a = a + b + m[blake2b_sigma[r][2*i]]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r][2*i+1]]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); + +#define ROUND(r) \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); + + +#define G_fast(r,i,a,b,c,d) \ + a = a + b + (blake2b_sigma[r][2*i] == 1 ? m1 : 0); \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + (blake2b_sigma[r][2*i+1] == 1 ? m1 : 0); \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); + +#define ROUND_fast(r) \ + G_fast(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G_fast(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G_fast(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G_fast(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G_fast(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G_fast(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G_fast(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G_fast(r,7,v[ 3],v[ 4],v[ 9],v[14]); + + +void equihash_calc_mid_hash(uint64_t mid_hash[8], uint8_t* header) { + uint64_t v[16], *m = (uint64_t*) header; + for (int i = 0; i < 8; i++) { + v[i] = blake2b_h[i]; + v[i+8] = blake2b_IV[i]; + } + v[12] ^= 128; + for (int r = 0; r < 12; r++) { + ROUND(r) + } + for (int i = 0; i < 8; i++) + mid_hash[i] = blake2b_h[i] ^ v[i] ^ v[i+8]; +} + +void blake2b_hash(uint8_t *hash, uint64_t mid_hash[8], uint32_t bday) { + uint64_t v[16], tmp[8]; + uint64_t m1 = (uint64_t) bday << 32; + for (int i = 0; i < 8; i++) { + v[i] = mid_hash[i]; + v[i+8] = blake2b_IV[i]; + } + v[12] ^= 140 + sizeof(bday); + v[14] ^= (int64_t) -1; + for (int r = 0; r < 12; r++) { + ROUND_fast(r) + } + for (int i = 0; i < 8; i++) + tmp[i] = mid_hash[i] ^ v[i] 
^ v[i+8]; + memcpy(hash, tmp, 50); +} + +void equihash_calc_hash(uint8_t hash[25], uint64_t mid_hash[8], uint32_t bday) { + uint8_t tmp[50]; + blake2b_hash(tmp, mid_hash, bday/2); + memcpy(hash, tmp + (bday & 1 ? 25 : 0), 25); +} + +void equihash_sort_indices(uint32_t* indices) { + for (int i = 0; i < 512; i++) + indices[i] = htobe32(indices[i]); + uint32_t tmp[256]; + for (int len = 1; len <= 256; len *= 2){ + for (int i = 0; i < 512; i += 2*len) { + bool is_before = (memcmp(indices + i, indices + i + len, 4*len) < 0); + if (is_before) + continue; + for (int j = i; j < i + len; j++) { + uint32_t tmp = indices[j + len]; + indices[j + len] = indices[j]; + indices[j] = tmp; + } + } + } +} + + +// These two copied from the ref impl, for now. +void ExpandArray(const unsigned char* in, size_t in_len, + unsigned char* out, size_t out_len, + size_t bit_len) +{ + size_t byte_pad = 0; + size_t out_width = ((bit_len+7)/8 + byte_pad); + uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1); + + // The acc_bits least-significant bits of acc_value represent a bit sequence + // in big-endian order. + size_t acc_bits = 0; + uint32_t acc_value = 0; + + size_t j = 0; + for (size_t i = 0; i < in_len; i++) { + acc_value = (acc_value << 8) | in[i]; + acc_bits += 8; + + // When we have bit_len or more bits in the accumulator, write the next + // output element. 
+ if (acc_bits >= bit_len) { + acc_bits -= bit_len; + for (size_t x = 0; x < byte_pad; x++) { + out[j+x] = 0; + } + for (size_t x = byte_pad; x < out_width; x++) { + out[j+x] = ( + // Big-endian + acc_value >> (acc_bits+(8*(out_width-x-1))) + ) & ( + // Apply bit_len_mask across byte boundaries + (bit_len_mask >> (8*(out_width-x-1))) & 0xFF + ); + } + j += out_width; + } + } +} + + +void CompressArray(const unsigned char* in, size_t in_len, + unsigned char* out, size_t out_len, + size_t bit_len, size_t byte_pad) +{ + size_t in_width = ((bit_len+7)/8 + byte_pad); + uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1); + + // The acc_bits least-significant bits of acc_value represent a bit sequence + // in big-endian order. + size_t acc_bits = 0; + uint32_t acc_value = 0; + + size_t j = 0; + for (size_t i = 0; i < out_len; i++) { + // When we have fewer than 8 bits left in the accumulator, read the next + // input element. + if (acc_bits < 8) { + acc_value = acc_value << bit_len; + for (size_t x = byte_pad; x < in_width; x++) { + acc_value = acc_value | ( + ( + // Apply bit_len_mask across byte boundaries + in[j+x] & ((bit_len_mask >> (8*(in_width-x-1))) & 0xFF) + ) << (8*(in_width-x-1)) + ); // Big-endian + } + j += in_width; + acc_bits += bit_len; + } + + acc_bits -= 8; + out[i] = (acc_value >> acc_bits) & 0xFF; + } +} + + +bool submit_tested_work(struct thr_info *, struct work *); +int equihash_check_solutions(struct work *work, uint32_t* indices, uint64_t *mid_hash) { + int count = 0; + uint8_t hash[10][512][25]; + uint32_t found_idx = work->pool->algorithm.found_idx; + /* + for (int i = 0; i < MIN(indices[found_idx], found_idx / 512); i++) { + for (int j = 0; j < 512; j++) { + equihash_calc_hash(hash[0][j], mid_hash, indices[512*i + j]); + } + for (int depth = 1; depth < 10; depth++) { + for (int j = 0; j < (1 << (9 - depth)); j++) { + for (int u = 0; u < 25; u++) + hash[depth][j][u] = hash[depth-1][2*j][u] ^ hash[depth-1][2*j+1][u]; + + int k = 0; + for (; k 
< (depth + (depth == 9 ? 1 : 0)) * 20 / 8; k++) + if (hash[depth][j][k] != 0) + goto out; + if ((depth * 20 % 8) && (hash[depth][j][k] & 0xf0)) + goto out; + } + } + count++; + equihash_sort_indices(indices + 512*i); + CompressArray((unsigned char*) (indices + 512*i), 512*4, work->equihash_data + 143, 1344, 21, 1); + gen_hash(work->equihash_data, 1344 + 143, work->hash); + if (*(uint64_t*) (work->hash + 24) < *(uint64_t*) (work->target + 24)) + submit_tested_work(work->thr, work); +out: + continue; + } + return count; */ + + for (int i = 0; i < MIN(indices[found_idx], found_idx / 512); i++) { + equihash_sort_indices(indices + 512*i); + CompressArray((unsigned char*) (indices + 512*i), 512*4, work->equihash_data + 143, 1344, 21, 1); + gen_hash(work->equihash_data, 1344 + 143, work->hash); + if (*(uint64_t*) (work->hash + 24) >= *(uint64_t*) (work->target + 24)) + continue; + for (int j = 0; j < 512; j++) { + equihash_calc_hash(hash[0][j], mid_hash, be32toh(indices[512*i + j])); + } + for (int depth = 1; depth < 10; depth++) { + for (int j = 0; j < (1 << (9 - depth)); j++) { + for (int u = 0; u < 25; u++) + hash[depth][j][u] = hash[depth-1][2*j][u] ^ hash[depth-1][2*j+1][u]; + + int k = 0; + for (; k < (depth + (depth == 9 ? 
1 : 0)) * 20 / 8; k++) + if (hash[depth][j][k] != 0) + goto out; + if ((depth * 20 % 8) && (hash[depth][j][k] & 0xf0)) + goto out; + } + } + submit_tested_work(work->thr, work); +out: + continue; + } + return 1; +} + + + +static inline void sort_pair(uint32_t *a, uint32_t len) +{ + uint32_t *b = a + len; + uint32_t tmp, need_sorting = 0; + for (uint32_t i = 0; i < len; i++) { + if (need_sorting || a[i] > b[i]) { + need_sorting = 1; + tmp = a[i]; + a[i] = b[i]; + b[i] = tmp; + } + else if (a[i] < b[i]) + break; + } +} + + +#include "kernel/equihash-param.h" +uint32_t verify_sol(struct work *work, sols_t *sols, int sol_i) +{ + uint32_t thr_id = work->thr->id; + uint32_t *inputs = sols->values[sol_i]; + uint32_t seen_len = (1 << (PREFIX + 1)) / 8; + uint8_t seen[seen_len]; + uint32_t i; + uint8_t tmp; + // look for duplicate inputs + memset(seen, 0, seen_len); + for (i = 0; i < (1 << PARAM_K); i++) { + + if (inputs[i] / 8 >= seen_len) { + sols->valid[sol_i] = 0; + return 0; + } + tmp = seen[inputs[i] / 8]; + seen[inputs[i] / 8] |= 1 << (inputs[i] & 7); + if (tmp == seen[inputs[i] / 8]) { + // at least one input value is a duplicate + sols->valid[sol_i] = 0; + return 0; + } + } + // the valid flag is already set by the GPU, but set it again because + // I plan to change the GPU code to not set it + sols->valid[sol_i] = 1; + // sort the pairs in place + for (uint32_t level = 0; level < PARAM_K; level++) { + for (i = 0; i < (1 << PARAM_K); i += (2 << level)) { + sort_pair(&inputs[i], 1 << level); + } + } + + for (i = 0; i < (1 << PARAM_K); i++) + inputs[i] = htobe32(inputs[i]); + + CompressArray((unsigned char*) inputs, 512*4, work->equihash_data + 143, 1344, 21, 1); + + gen_hash(work->equihash_data, 1344 + 143, work->hash); + + if (work->getwork_mode == GETWORK_MODE_STRATUM) { + +/* char *eqdata = bin2hex(work->equihash_data, 1487); + applog(LOG_DEBUG, "[THR%d] %s: got solution... 
%s", thr_id, __func__, eqdata); + free(eqdata); + char *whash = bin2hex(work->hash, 32); + applog(LOG_DEBUG, "[THR%d] %s: hash: %s", thr_id, __func__, whash); + free(whash); + char *targ = bin2hex(work->target, 32); + applog(LOG_DEBUG, "[THR%d] %s: target: %s", thr_id, __func__, targ); + free(targ); + applog(LOG_DEBUG, "[THR%d] %s: %08lx <= %08lx?", thr_id, __func__, ((uint32_t *)work->hash)[7], ((uint32_t *)work->target)[7]);*/ + if (((uint32_t *)work->hash)[7] <= ((uint32_t *)work->target)[7]) { + // applog(LOG_DEBUG, "[THR%d] %s: valid!", thr_id, __func__); + submit_nonce(work->thr, work, 0); + } + /*else { + applog(LOG_DEBUG, "[THR%d] %s: invalid...", thr_id, __func__); + }*/ + } + else { + if (*(uint64_t*) (work->hash + 24) < *(uint64_t*) (work->target + 24)) { + submit_tested_work(work->thr, work); + } + } + return 1; +} + +void equihash_regenhash(struct work *work) +{ + +} + diff --git a/algorithm/equihash.h b/algorithm/equihash.h new file mode 100644 index 00000000..79ed5505 --- /dev/null +++ b/algorithm/equihash.h @@ -0,0 +1,12 @@ +#ifndef __EQUIHASH_H +#define __EQUIHASH_H + +#include +#include "miner.h" + +void equihash_calc_mid_hash(uint64_t[8], uint8_t*); +int equihash_check_solutions(struct work*, uint32_t*, uint64_t*); +void equihash_regenhash(struct work *work); +int64_t equihash_scanhash(struct thr_info *thr, struct work *work, int64_t *last_nonce, int64_t __maybe_unused max_nonce); + +#endif // __EQUIHASH_H diff --git a/algorithm/extern/blake2-impl.h b/algorithm/extern/blake2-impl.h new file mode 100644 index 00000000..5ac7a430 --- /dev/null +++ b/algorithm/extern/blake2-impl.h @@ -0,0 +1,136 @@ +/* + BLAKE2 reference source code package - reference C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. 
+
+  You should have received a copy of the CC0 Public Domain Dedication along with
+  this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#pragma once
+#ifndef __BLAKE2_IMPL_H__
+#define __BLAKE2_IMPL_H__
+
+#include <stdint.h>
+
+static inline uint32_t load32( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  uint32_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
+#else
+  const uint8_t *p = ( const uint8_t * )src;
+  uint32_t w = *p++;
+  w |= ( uint32_t )( *p++ ) << 8;
+  w |= ( uint32_t )( *p++ ) << 16;
+  w |= ( uint32_t )( *p++ ) << 24;
+  return w;
+#endif
+}
+
+static inline uint64_t load64( const void *src )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  uint64_t w;
+  memcpy(&w, src, sizeof w);
+  return w;
+#else
+  const uint8_t *p = ( const uint8_t * )src;
+  uint64_t w = *p++;
+  w |= ( uint64_t )( *p++ ) << 8;
+  w |= ( uint64_t )( *p++ ) << 16;
+  w |= ( uint64_t )( *p++ ) << 24;
+  w |= ( uint64_t )( *p++ ) << 32;
+  w |= ( uint64_t )( *p++ ) << 40;
+  w |= ( uint64_t )( *p++ ) << 48;
+  w |= ( uint64_t )( *p++ ) << 56;
+  return w;
+#endif
+}
+
+static inline void store32( void *dst, uint32_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  memcpy(dst, &w, sizeof w);
+#else
+  uint8_t *p = ( uint8_t * )dst;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w;
+#endif
+}
+
+static inline void store64( void *dst, uint64_t w )
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+  memcpy(dst, &w, sizeof w);
+#else
+  uint8_t *p = ( uint8_t * )dst;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w; w >>= 8;
+  *p++ = ( uint8_t )w;
+#endif
+}
+
+static inline uint64_t load48( const void *src )
+{
+  const uint8_t *p = ( const uint8_t * )src;
+  uint64_t w = *p++;
+  w |= ( uint64_t )( *p++ ) << 8;
+  w |= ( uint64_t )( *p++ ) << 16;
+  w |= ( uint64_t )( *p++ ) << 24;
+  w |= ( uint64_t )( *p++ ) << 32; 
+ w |= ( uint64_t )( *p++ ) << 40; + return w; +} + +static inline void store48( void *dst, uint64_t w ) +{ + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +} + +static inline uint32_t rotl32( const uint32_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 32 - c ) ); +} + +static inline uint64_t rotl64( const uint64_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 64 - c ) ); +} + +static inline uint32_t rotr32( const uint32_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 32 - c ) ); +} + +static inline uint64_t rotr64( const uint64_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/* prevents compiler optimizing out memset() */ +static inline void secure_zero_memory( void *v, size_t n ) +{ + volatile uint8_t *p = ( volatile uint8_t * )v; + while( n-- ) *p++ = 0; +} + +#endif + diff --git a/algorithm/extern/blake2.h b/algorithm/extern/blake2.h new file mode 100644 index 00000000..f8aba833 --- /dev/null +++ b/algorithm/extern/blake2.h @@ -0,0 +1,156 @@ +/* + BLAKE2 reference source code package - reference C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2_H__ +#define __BLAKE2_H__ + +#include +#include + +#if defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__((aligned(x))) +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + + enum blake2s_constant + { + BLAKE2S_BLOCKBYTES = 64, + BLAKE2S_OUTBYTES = 32, + BLAKE2S_KEYBYTES = 32, + BLAKE2S_SALTBYTES = 8, + BLAKE2S_PERSONALBYTES = 8 + }; + + enum blake2b_constant + { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 + }; + +#pragma pack(push, 1) + typedef struct __blake2s_param + { + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint8_t node_offset[6];// 14 + uint8_t node_depth; // 15 + uint8_t inner_length; // 16 + // uint8_t reserved[0]; + uint8_t salt[BLAKE2S_SALTBYTES]; // 24 + uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 + } blake2s_param; + + ALIGN( 64 ) typedef struct __blake2s_state + { + uint32_t h[8]; + uint32_t t[2]; + uint32_t f[2]; + uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; + size_t buflen; + uint8_t last_node; + } blake2s_state ; + + typedef struct __blake2b_param + { + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint64_t node_offset; // 16 + uint8_t node_depth; // 17 + uint8_t inner_length; // 18 + uint8_t reserved[14]; // 32 + uint8_t salt[BLAKE2B_SALTBYTES]; // 48 + uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64 + } blake2b_param; + + ALIGN( 64 ) typedef struct __blake2b_state + { + uint64_t h[8]; + uint64_t t[2]; + uint64_t f[2]; + uint8_t buf[2 * BLAKE2B_BLOCKBYTES]; + size_t buflen; + uint8_t last_node; + } blake2b_state; + + typedef struct __blake2sp_state + { + blake2s_state S[8][1]; + blake2s_state R[1]; + uint8_t buf[8 * BLAKE2S_BLOCKBYTES]; + size_t buflen; + } blake2sp_state; + + typedef struct 
__blake2bp_state + { + blake2b_state S[4][1]; + blake2b_state R[1]; + uint8_t buf[4 * BLAKE2B_BLOCKBYTES]; + size_t buflen; + } blake2bp_state; +#pragma pack(pop) + + // Streaming API + int blake2s_init( blake2s_state *S, const uint8_t outlen ); + int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int blake2s_init_param( blake2s_state *S, const blake2s_param *P ); + int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ); + int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ); + + int blake2b_init( blake2b_state *S, const uint8_t outlen ); + int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int blake2b_init_param( blake2b_state *S, const blake2b_param *P ); + int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ); + int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ); + + int blake2sp_init( blake2sp_state *S, const uint8_t outlen ); + int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen ); + int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen ); + + int blake2bp_init( blake2bp_state *S, const uint8_t outlen ); + int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen ); + int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen ); + + // Simple API + int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + + int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); 
+ int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + + static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) + { + return blake2b( out, in, key, outlen, inlen, keylen ); + } + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/algorithm/extern/blake2b-ref.c b/algorithm/extern/blake2b-ref.c new file mode 100644 index 00000000..7064b28d --- /dev/null +++ b/algorithm/extern/blake2b-ref.c @@ -0,0 +1,396 @@ +/* + BLAKE2 reference source code package - reference C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ + +#include +#include +#include + +#include "blake2.h" +#include "blake2-impl.h" + +static const uint64_t blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +static const uint8_t blake2b_sigma[12][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +}; + + +static inline int blake2b_set_lastnode( blake2b_state *S ) +{ + S->f[1] = -1; + return 0; +} + +static inline int blake2b_clear_lastnode( blake2b_state *S ) +{ + S->f[1] = 0; + return 0; +} + +/* Some helper functions, not necessarily useful */ +int blake2b_set_lastblock( blake2b_state *S ) +{ + if( S->last_node ) blake2b_set_lastnode( S ); + + S->f[0] = -1; + return 0; +} + +static inline int blake2b_clear_lastblock( blake2b_state *S ) +{ + if( S->last_node ) blake2b_clear_lastnode( S ); + + S->f[0] = 0; + return 0; +} + +static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc ) +{ + S->t[0] += inc; + S->t[1] += ( S->t[0] < inc ); + return 0; +} + + + +// Parameter-related functions +static inline int blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length ) +{ + P->digest_length = digest_length; + return 0; +} + +static 
inline int blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout ) +{ + P->fanout = fanout; + return 0; +} + +static inline int blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth ) +{ + P->depth = depth; + return 0; +} + +static inline int blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length ) +{ + store32( &P->leaf_length, leaf_length ); + return 0; +} + +static inline int blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset ) +{ + store64( &P->node_offset, node_offset ); + return 0; +} + +static inline int blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth ) +{ + P->node_depth = node_depth; + return 0; +} + +static inline int blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length ) +{ + P->inner_length = inner_length; + return 0; +} + +static inline int blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] ) +{ + memcpy( P->salt, salt, BLAKE2B_SALTBYTES ); + return 0; +} + +static inline int blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] ) +{ + memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES ); + return 0; +} + +static inline int blake2b_init0( blake2b_state *S ) +{ + memset( S, 0, sizeof( blake2b_state ) ); + + for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i]; + + return 0; +} + +/* init xors IV with input parameter block */ +int blake2b_init_param( blake2b_state *S, const blake2b_param *P ) +{ + blake2b_init0( S ); + const uint8_t *p = ( const uint8_t * )( P ); + + /* IV XOR ParamBlock */ + for( size_t i = 0; i < 8; ++i ) + S->h[i] ^= load64( p + sizeof( S->h[i] ) * i ); + + return 0; +} + + + +int blake2b_init( blake2b_state *S, const uint8_t outlen ) +{ + blake2b_param P[1]; + + if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; + + P->digest_length = outlen; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + 
store64( &P->node_offset, 0 ); + P->node_depth = 0; + P->inner_length = 0; + memset( P->reserved, 0, sizeof( P->reserved ) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + return blake2b_init_param( S, P ); +} + + +int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ) +{ + blake2b_param P[1]; + + if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1; + + if ( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1; + + P->digest_length = outlen; + P->key_length = keylen; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + store64( &P->node_offset, 0 ); + P->node_depth = 0; + P->inner_length = 0; + memset( P->reserved, 0, sizeof( P->reserved ) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + + if( blake2b_init_param( S, P ) < 0 ) return -1; + + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset( block, 0, BLAKE2B_BLOCKBYTES ); + memcpy( block, key, keylen ); + blake2b_update( S, block, BLAKE2B_BLOCKBYTES ); + secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */ + } + return 0; +} + +int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ) +{ + uint64_t m[16]; + uint64_t v[16]; + int i; + + for( i = 0; i < 16; ++i ) + m[i] = load64( block + i * sizeof( m[i] ) ); + + for( i = 0; i < 8; ++i ) + v[i] = S->h[i]; + + v[ 8] = blake2b_IV[0]; + v[ 9] = blake2b_IV[1]; + v[10] = blake2b_IV[2]; + v[11] = blake2b_IV[3]; + v[12] = S->t[0] ^ blake2b_IV[4]; + v[13] = S->t[1] ^ blake2b_IV[5]; + v[14] = S->f[0] ^ blake2b_IV[6]; + v[15] = S->f[1] ^ blake2b_IV[7]; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2b_sigma[r][2*i+0]]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r][2*i+1]]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while(0) +#define ROUND(r) \ + do { \ + 
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ + } while(0) + ROUND( 0 ); + ROUND( 1 ); + ROUND( 2 ); + ROUND( 3 ); + ROUND( 4 ); + ROUND( 5 ); + ROUND( 6 ); + ROUND( 7 ); + ROUND( 8 ); + ROUND( 9 ); + ROUND( 10 ); + ROUND( 11 ); + + for( i = 0; i < 8; ++i ) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + +#undef G +#undef ROUND + return 0; +} + +/* inlen now in bytes */ +int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ) +{ + while( inlen > 0 ) + { + size_t left = S->buflen; + size_t fill = 2 * BLAKE2B_BLOCKBYTES - left; + + if( inlen > fill ) + { + memcpy( S->buf + left, in, fill ); // Fill buffer + S->buflen += fill; + blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); + blake2b_compress( S, S->buf ); // Compress + memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left + S->buflen -= BLAKE2B_BLOCKBYTES; + in += fill; + inlen -= fill; + } + else // inlen <= fill + { + memcpy( S->buf + left, in, inlen ); + S->buflen += inlen; // Be lazy, do not compress + in += inlen; + inlen -= inlen; + } + } + + return 0; +} + +/* Is this correct? 
*/ +int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ) +{ + uint8_t buffer[BLAKE2B_OUTBYTES] = {0}; + + if( outlen > BLAKE2B_OUTBYTES ) + return -1; + + if( S->buflen > BLAKE2B_BLOCKBYTES ) + { + blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES ); + blake2b_compress( S, S->buf ); + S->buflen -= BLAKE2B_BLOCKBYTES; + memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen ); + } + + blake2b_increment_counter( S, S->buflen ); + blake2b_set_lastblock( S ); + memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */ + blake2b_compress( S, S->buf ); + +// printf("S->buflen = %016llX\n", S->buflen); + + for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ + store64( buffer + sizeof( S->h[i] ) * i, S->h[i] ); + + memcpy( out, buffer, outlen ); + return 0; +} + +/* inlen, at least, should be uint64_t. Others can be size_t. */ +int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) +{ + blake2b_state S[1]; + + /* Verify parameters */ + if ( NULL == in ) return -1; + + if ( NULL == out ) return -1; + + if( NULL == key ) keylen = 0; + + if( keylen > 0 ) + { + if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1; + } + else + { + if( blake2b_init( S, outlen ) < 0 ) return -1; + } + + blake2b_update( S, ( const uint8_t * )in, inlen ); + blake2b_final( S, out, outlen ); + return 0; +} + +#if defined(BLAKE2B_SELFTEST) +#include +#include "blake2-kat.h" +int main( int argc, char **argv ) +{ + uint8_t key[BLAKE2B_KEYBYTES]; + uint8_t buf[KAT_LENGTH]; + + for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i ) + key[i] = ( uint8_t )i; + + for( size_t i = 0; i < KAT_LENGTH; ++i ) + buf[i] = ( uint8_t )i; + + for( size_t i = 0; i < KAT_LENGTH; ++i ) + { + uint8_t hash[BLAKE2B_OUTBYTES]; + blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES ); + + if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) ) + { + puts( "error" ); + return -1; + } + } 
+ + puts( "ok" ); + return 0; +} +#endif + diff --git a/gbt-util.c b/gbt-util.c new file mode 100644 index 00000000..2b2ce17e --- /dev/null +++ b/gbt-util.c @@ -0,0 +1,291 @@ +#include +#include +#include +#include +#include "miner.h" +#include "sph/sph_sha2.h" + + +static const int8_t base58_lookup[] = +{ + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, + -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8,-1,-1,-1,-1,-1,-1, + -1, 9,10,11,12,13,14,15,16,-1,17,18,19,20,21,-1, + 22,23,24,25,26,27,28,29,30,31,32,-1,-1,-1,-1,-1, + -1,33,34,35,36,37,38,39,40,41,42,43,-1,44,45,46, + 47,48,49,50,51,52,53,54,55,56,57,-1,-1,-1,-1,-1, +}; + +/* + * Utility function to decode base58 wallet address + * out should have at least 1/2 the size of base58_input + * input_size must not exceed 200 + */ +static bool base58_decode(uint8_t* out, int* out_size, char* base58_input, int input_size) +{ + if (input_size == 0) + return false; + if (input_size > 200) + return false; + uint32_t base_array[32] = {0}; + uint32_t base_track[32] = {0}; + int base_array_size = 1; + base_array[0] = 0; + base_track[0] = 57; + // calculate exact size of output + for (int i = 0; i < input_size-1; i++) + { + // multiply baseTrack with 58 + for (int b = base_array_size-1; b >= 0; b--) + { + uint64_t mwc = (uint64_t) base_track[b] * 58ULL; + base_track[b] = (uint32_t) (mwc & 0xFFFFFFFFUL); + mwc >>= 32; + if (mwc != 0) + { + // add carry + for (int carry_idx = b + 1; carry_idx < base_array_size; carry_idx++) + { + mwc += (uint64_t) base_track[carry_idx]; + base_track[carry_idx] = (uint32_t) (mwc & 0xFFFFFFFFUL); + mwc >>= 32; + if (mwc == 0) + break; + } + if (mwc) + { + // extend + base_track[base_array_size] = (uint32_t) mwc; + base_array_size++; + } + } + } + } + // get length of output data + int output_size = 0; + uint64_t last = base_track[base_array_size-1]; + if (last & 0xFF000000) + output_size = base_array_size * 4; 
+ else if (last & 0xFF0000) + output_size = base_array_size * 4 - 1; + else if (last & 0xFF00) + output_size = base_array_size * 4 - 2; + else + output_size = base_array_size * 4 - 3; + // convert base + for (int i = 0; i < input_size; i++) + { + if (base58_input[i] >= sizeof(base58_lookup) / sizeof(base58_lookup[0])) + return false; + int8_t digit = base58_lookup[base58_input[i]]; + if (digit == -1) + return false; + // multiply baseArray with 58 + for (int b = base_array_size-1; b >= 0; b--) + { + uint64_t mwc = (uint64_t) base_array[b] * 58ULL; + base_array[b] = (uint32_t) (mwc & 0xFFFFFFFFUL); + mwc >>= 32; + if (mwc != 0) + { + // add carry + for (int carry_idx = b + 1; carry_idx < base_array_size; carry_idx++) + { + mwc += (uint64_t) base_array[carry_idx]; + base_array[carry_idx] = (uint32_t) (mwc & 0xFFFFFFFFUL); + mwc >>= 32; + if (mwc == 0) + break; + } + if (mwc) + { + // extend + base_array[base_array_size] = (uint32_t) mwc; + base_array_size++; + } + } + } + // add base58 digit to baseArray with carry + uint64_t awc = (uint64_t) digit; + for (int b = 0; awc != 0 && b < base_array_size; b++) + { + awc += (uint64_t) base_array[b]; + base_array[b] = (uint32_t) (awc & 0xFFFFFFFFUL); + awc >>= 32; + } + if (awc) + { + // extend + base_array[base_array_size] = (uint32_t) awc; + base_array_size++; + } + } + *out_size = output_size; + // write bytes to about + for (int i = 0; i < output_size; i++) + out[output_size - i - 1] = (uint8_t) (base_array[i>>2] >> 8 * (i & 3)); + return true; +} + +/* + * Converts a wallet address (any coin) to the coin-independent pubKeyHash + */ +bool address_decode(uint8_t* pub_key_hash, char* wallet_address, int offset) +{ + uint8_t wallet_address_raw[256]; + int wallet_address_raw_size; + if (base58_decode(wallet_address_raw, &wallet_address_raw_size, wallet_address, strlen(wallet_address)) == false) + { + applog(LOG_WARNING, "Address %s is not correctly base58 encoded", wallet_address); + return false; + } + // is length valid? 
+ if (wallet_address_raw_size != 24 + offset) + { + applog(LOG_WARNING, "Decoding address %s yields invalid number of bytes", wallet_address); + return false; + } + // validate checksum + uint8_t address_hash[32]; + sph_sha256_context s256c; + sph_sha256_init(&s256c); + sph_sha256(&s256c, wallet_address_raw, wallet_address_raw_size - 4); + sph_sha256_close(&s256c, address_hash); + sph_sha256_init(&s256c); + sph_sha256(&s256c, address_hash, 32); + sph_sha256_close(&s256c, address_hash); + if (*(uint32_t*) (wallet_address_raw + 20 + offset) != *(uint32_t*) address_hash) + { + applog(LOG_WARNING, "Address %s is invalid", wallet_address); + return false; + } + if (pub_key_hash != NULL) + memcpy(pub_key_hash, wallet_address_raw + offset, 20); + return true; +} + + + +int add_var_int(uint8_t* msg, uint64_t var_int) +{ + int size = 0; + if (var_int <= 0xfcU) + { + msg[0] = var_int & 0xff; + } + else if (var_int <= 0xffffU) + { + msg[0] = 0xfd; + size = 2; + } + else if (var_int <= 0xffffffffU) + { + msg[0] = 0xfe; + size = 4; + } + else + { + msg[0] = 0xff; + size = 8; + } + var_int = htole64(var_int); + memcpy(msg + 1, &var_int, size); + return size + 1; +} + +int add_block_height(uint8_t* msg, uint32_t height) +{ + int size = 4; + if (height <= 0x7f) + size = 1; + else if (height <= 0x7fffU) + size = 2; + else if (height <= 0x7fffffU) + size = 3; + height = htole32(height); + if (msg != NULL) + memcpy(msg, &height, size); + return size; +} + +int add_int32(uint8_t* msg, int32_t val) +{ + val = htole32(val); + memcpy(msg, &val, 4); + return 4; +} + +int add_int64(uint8_t* msg, int64_t val) +{ + val = htole64(val); + memcpy(msg, &val, 8); + return 8; +} + + +bool set_coinbasetxn(struct pool *pool, uint32_t height, uint64_t coinbasevalue, uint64_t coinbasefrvalue, const char *coinbasefrscript) { + int offset = 0; + int height_size = add_block_height(NULL, height); + uint8_t raw_address[20]; + if (!address_decode(raw_address, strchr(pool->rpc_user, '.') + 1, 2)) // decode 
zcash address + return false; + pool->coinbase = realloc(pool->coinbase, 512 + pool->n2size); // alloc some extra space + + offset += add_int32(pool->coinbase + offset, 1); // version + offset += add_var_int(pool->coinbase + offset, 1); // number of inputs + memset(pool->coinbase + offset, 0, 32); // transaction id + offset += 32; + offset += add_int32(pool->coinbase + offset, 0xffffffff); // output index + if (height <= 0x10) { + offset += add_var_int(pool->coinbase + offset, 1 + height_size + 4 + pool->n2size); + pool->coinbase[offset++] = 0x50 + height; // return OP_height + } + else { + offset += add_var_int(pool->coinbase + offset, 2 + height_size + 4 + pool->n2size); + offset += add_var_int(pool->coinbase + offset, height_size); + offset += add_block_height(pool->coinbase + offset, height); + } + offset += add_var_int(pool->coinbase + offset, 4 + pool->n2size); + pool->nonce2_offset = offset; + offset += pool->n2size; + offset += add_int32(pool->coinbase + offset, 0x4e614e2f); + offset += add_int32(pool->coinbase + offset, 0xffffffff); // sequence number + + bool has_fr = (coinbasefrvalue != 0); + offset += add_var_int(pool->coinbase + offset, has_fr ? 
2 : 1); // number of outputs + offset += add_int64(pool->coinbase + offset, coinbasevalue); // coinbasevalue + + offset += add_var_int(pool->coinbase + offset, 25); // size of script + pool->coinbase[offset++] = 0x76; // OP_DUP + pool->coinbase[offset++] = 0xa9; // OP_HASH160 + offset += add_var_int(pool->coinbase + offset, 20); // size of pubkeyHash + memcpy(pool->coinbase + offset, raw_address, 20); // pubkeyHash + offset += 20; + pool->coinbase[offset++] = 0x88; // OP_EQUALVERIFY + pool->coinbase[offset++] = 0xac; // OP_CHECKSIG + + /* + char pubkey_str[] = "03556ae4825538153f719ef90a187eafae03ef1884dc09399c8a2de8929c2cd798"; + uint8_t pubkey[33]; + hex2bin(pubkey, pubkey_str, 33); + offset += add_var_int(pool->coinbase + offset, 35); // size of script + offset += add_var_int(pool->coinbase + offset, 33); // size of pubkey + memcpy(pool->coinbase + offset, pubkey, 33); // pubkey + offset += 33; + pool->coinbase[offset++] = 0xac; // OP_CHECKSIG + */ + if (has_fr) { + int len = strlen(coinbasefrscript) / 2; + offset += add_int64(pool->coinbase + offset, coinbasefrvalue); // founders reward + offset += add_var_int(pool->coinbase + offset, len); // size of founders script + hex2bin(pool->coinbase + offset, coinbasefrscript, len); // founders script + offset += len; + } + offset += add_int32(pool->coinbase + offset, 0); // lock time + pool->coinbase_len = offset; + + return true; +} + diff --git a/gbt-util.h b/gbt-util.h new file mode 100644 index 00000000..962884cf --- /dev/null +++ b/gbt-util.h @@ -0,0 +1,8 @@ +#ifndef __GBT_UTIL_H +#define __GBT_UTIL_H + + +int add_var_int(uint8_t*, uint64_t); +bool set_coinbasetxn(struct pool *, uint32_t, uint64_t, uint64_t, const char *); + +#endif diff --git a/kernel/equihash-param.h b/kernel/equihash-param.h new file mode 100644 index 00000000..a40eea69 --- /dev/null +++ b/kernel/equihash-param.h @@ -0,0 +1,113 @@ +#ifndef __OPENCL_VERSION__ +#define uint uint32_t +#define uchar uint8_t +#endif + + +#define PARAM_N 200 
#define PARAM_K 9
// Bits that must collide per round: 200 / (9 + 1) = 20
#define PREFIX (PARAM_N / (PARAM_K + 1))
#define NR_INPUTS (1 << PREFIX)
// Approximate log base 2 of number of elements in hash tables
#define APX_NR_ELMS_LOG (PREFIX + 1)
// Number of rows and slots is affected by this. 20 offers the best performance
// but occasionally misses ~1% of solutions.
#define NR_ROWS_LOG 18

// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md
#define OPTIM_SIMPLIFY_ROUND 1

// Number of collision items to track, per thread
#define THREADS_PER_ROW 8
#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW))

// Ratio of time of sleeping before rechecking if task is done (0-1)
#define SLEEP_RECHECK_RATIO 0.60
// Ratio of time to busy wait for the solution (0-1)
// The higher value the higher CPU usage with Nvidia
#define SLEEP_SKIP_RATIO 0.005

// Make hash tables OVERHEAD times larger than necessary to store the average
// number of elements per row. The ideal value is as small as possible to
// reduce memory usage, but not too small or else elements are dropped from the
// hash tables.
//
// The actual number of elements per row is closer to the theoretical average
// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be
// smaller.
//
// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
// performance as they cause VRAM channel conflicts.
#if NR_ROWS_LOG == 16
// #error "NR_ROWS_LOG = 16 is currently broken - do not use"
#define OVERHEAD 2
#elif NR_ROWS_LOG == 18
#define OVERHEAD 3
#elif NR_ROWS_LOG == 19
#define OVERHEAD 5
#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND
#define OVERHEAD 6
#elif NR_ROWS_LOG == 20
#define OVERHEAD 9
#endif
// NOTE(review): no #else above — an unsupported NR_ROWS_LOG leaves OVERHEAD
// undefined and only fails later when NR_SLOTS is expanded.

#define NR_ROWS (1 << NR_ROWS_LOG)
#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD))
// Length of 1 element (slot) in byte
#define SLOT_LEN 32
// Total size of hash table
#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
// Length of Zcash block header, nonce (part of header)
#define ZCASH_BLOCK_HEADER_LEN 140
// Offset of nTime in header
#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
// Length of nonce
#define ZCASH_NONCE_LEN 32
// Length of encoded representation of solution size
#define ZCASH_SOLSIZE_LEN 3
// Solution size (1344 = 0x540) represented as a compact integer, in hex
#define ZCASH_SOLSIZE_HEX "fd4005"
// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
#define N_ZERO_BYTES 12
// Number of bytes Zcash needs out of Blake
#define ZCASH_HASH_LEN 50
// Number of wavefronts per SIMD for the Blake kernel.
// Blake is ALU-bound (beside the atomic counter being incremented) so we need
// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
// instructions. 10 is the max supported by the hw.
#define BLAKE_WPS 10
// Maximum number of solutions reported by kernel to host
#define MAX_SOLS 10
// Length of SHA256 target
#define SHA256_TARGET_LEN (256 / 8)

// Row counters are packed several-per-uint; the counter width depends on the
// maximum slot count it must be able to represent.
#if (NR_SLOTS < 16)
#define BITS_PER_ROW 4
#define ROWS_PER_UINT 8
#define ROW_MASK 0x0F
#else
#define BITS_PER_ROW 8
#define ROWS_PER_UINT 4
#define ROW_MASK 0xFF
#endif

// Optional features
#undef ENABLE_DEBUG

/*
** Return the offset of Xi in bytes from the beginning of the slot.
*/
#define xi_offset_for_round(round) (8 + ((round) / 2) * 4)

// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
#define SOL_SIZE ((1 << PARAM_K) * 4)
// Solution buffer shared between the kernels and the host.
typedef struct sols_s
{
	uint nr;                               // number of candidate solutions found
	uint likely_invalids;                  // count of discarded likely-invalid candidates
	uchar valid[MAX_SOLS];                 // per-candidate validity flag
	uint values[MAX_SOLS][(1 << PARAM_K)]; // the 512 input indices per solution
} sols_t;

diff --git a/kernel/equihash.cl b/kernel/equihash.cl
new file mode 100644
index 00000000..8fedf79e
--- /dev/null
+++ b/kernel/equihash.cl
@@ -0,0 +1,833 @@
#include "equihash-param.h"

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

/*
** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in
** bytes in parens):
**
** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1)
** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3)
** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2)
** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4)
** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3)
** round 5, table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5)
** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4)
** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6)
** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5)
**
** If the first byte of Xi is 0xAB then:
** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi
** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but
** 'A' is considered redundant padding as it was
** used to compute the row #
**
** - cnt is an atomic counter keeping track of the number of used slots.
** it is used in the first slot only; subsequent slots replace it with
** 4 padding bytes
** - i encodes either the 21-bit input value (round 0) or a reference to two
** inputs from the previous round
**
** Formula for Xi length and pad length above:
** > for i in range(9):
** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi
**
** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds
** is the 4 most significant bits of the last byte of Xi.
*/

// BLAKE2b initialization vector.
__constant ulong blake_iv[] =
{
	0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
	0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
	0x510e527fade682d1, 0x9b05688c2b3e6c1f,
	0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
};

/*
** Reset counters in hash table. One work-item clears one packed counter
** word (each uint holds ROWS_PER_UINT row counters); ht itself is not
** touched here.
*/
__kernel
void kernel_init_ht(__global char *ht, __global uint *rowCounters)
{
	rowCounters[get_global_id(0)] = 0;
}

/*
** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they
** represent (hex notation, group of 5 hex digits are a group of PREFIX bits):
** aa aa ab bb bb cc cc cd dd... [round 0]
** --------------------
** ...ab bb bb cc cc cd dd... [odd round]
** --------------
** ...cc cc cd dd... [next even round]
** -----
** Bytes underlined are going to be stored in the slot. Preceding bytes
** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are
** used to compute the row number.
**
** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter)
** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble)
** TODO: update lines below with padding nibbles
** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter)
** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter)
** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter)
** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter)
** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter)
** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter)
** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter)
**
** Return 0 if successfully stored, or 1 if the row overflowed.
*/
uint ht_store(uint round, __global char *ht, uint i,
	ulong xi0, ulong xi1, ulong xi2, ulong xi3, __global uint *rowCounters)
{
	uint row;
	__global char *p;
	uint cnt;
#if NR_ROWS_LOG == 16
	if (!(round % 2))
		row = (xi0 & 0xffff);
	else
		// if we have in hex: "ab cd ef..." (little endian xi0) then this
		// formula computes the row as 0xdebc. it skips the 'a' nibble as it
		// is part of the PREFIX. The Xi will be stored starting with "ef...";
		// 'e' will be considered padding and 'f' is part of the current PREFIX
		row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
			((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
#elif NR_ROWS_LOG == 18
	if (!(round % 2))
		row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6);
	else
		row = ((xi0 & 0xc0000) >> 2) |
			((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
			((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
#elif NR_ROWS_LOG == 19
	if (!(round % 2))
		row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5);
	else
		row = ((xi0 & 0xe0000) >> 1) |
			((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
			((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
#elif NR_ROWS_LOG == 20
	if (!(round % 2))
		row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4);
	else
		row = ((xi0 & 0xf0000) >> 0) |
			((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
			((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
#else
#error "unsupported NR_ROWS_LOG"
#endif
	// drop the 16 bits just consumed for the row number
	xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
	xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
	xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
	p = ht + row * NR_SLOTS * SLOT_LEN;
	uint rowIdx = row/ROWS_PER_UINT;
	uint rowOffset = BITS_PER_ROW*(row%ROWS_PER_UINT);
	// atomically claim a slot index in this row's packed counter
	uint xcnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset);
	xcnt = (xcnt >> rowOffset) & ROW_MASK;
	cnt = xcnt;
	if (cnt >= NR_SLOTS)
	{
		// avoid overflows
		atomic_sub(rowCounters + rowIdx, 1 << rowOffset);
		return 1;
	}
	p += cnt * SLOT_LEN + xi_offset_for_round(round);
	// store "i" (always 4 bytes before Xi)
	*(__global uint *)(p - 4) = i;
	if (round == 0 || round == 1)
	{
		// store 24 bytes
		*(__global ulong *)(p + 0) = xi0;
		*(__global ulong *)(p + 8) = xi1;
		*(__global ulong *)(p + 16) = xi2;
	}
	else if (round == 2)
	{
		// store 20 bytes
		*(__global uint *)(p + 0) = xi0;
		*(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32);
		*(__global ulong *)(p + 12) = (xi1 >> 32) | (xi2 << 32);
	}
	else if (round == 3)
	{
		// store 16 bytes
		*(__global uint *)(p + 0) = xi0;
		*(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32);
		*(__global uint *)(p + 12) = (xi1 >> 32);
	}
	else if (round == 4)
	{
		// store 16 bytes
		*(__global ulong *)(p + 0) = xi0;
		*(__global ulong *)(p + 8) = xi1;
	}
	else if (round == 5)
	{
		// store 12 bytes
		*(__global ulong *)(p + 0) = xi0;
		*(__global uint *)(p + 8) = xi1;
	}
	else if (round == 6 || round == 7)
	{
		// store 8 bytes
		*(__global uint *)(p + 0) = xi0;
		*(__global uint *)(p + 4) = (xi0 >> 32);
	}
	else if (round == 8)
	{
		// store 4 bytes
		*(__global uint *)(p + 0) = xi0;
	}
	return 0;
}

// One BLAKE2b G mixing step; right-rotations written as rotate() by (64 - n).
#define mix(va, vb, vc, vd, x, y) \
	va = (va + vb + x); \
vd = rotate((vd ^ va), (ulong)64 - 32); \
vc = (vc + vd); \
vb = rotate((vb ^ vc), (ulong)64 - 24); \
va = (va + vb + y); \
vd = rotate((vd ^ va), (ulong)64 - 16); \
vc = (vc + vd); \
vb = rotate((vb ^ vc), (ulong)64 - 63);

/*
** Execute round 0 (blake).
**
** Note: making the work group size less than or equal to the wavefront size
** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local
** Memory (LDS) Optimization 2-10" in:
** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/
*/
__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
void kernel_round0(__global ulong *blake_state, __global char *ht,
	__global uint *rowCounters, __global uint *debug)
{
	uint tid = get_global_id(0);
	ulong v[16];
	uint inputs_per_thread = NR_INPUTS / get_global_size(0);
	uint input = tid * inputs_per_thread;
	uint input_end = (tid + 1) * inputs_per_thread;
	uint dropped = 0;
	while (input < input_end)
	{
		// shift "i" to occupy the high 32 bits of the second ulong word in the
		// message block
		ulong word1 = (ulong)input << 32;
		// init vector v
		v[0] = blake_state[0];
		v[1] = blake_state[1];
		v[2] = blake_state[2];
		v[3] = blake_state[3];
		v[4] =
			blake_state[4];
		v[5] = blake_state[5];
		v[6] = blake_state[6];
		v[7] = blake_state[7];
		v[8] = blake_iv[0];
		v[9] = blake_iv[1];
		v[10] = blake_iv[2];
		v[11] = blake_iv[3];
		v[12] = blake_iv[4];
		v[13] = blake_iv[5];
		v[14] = blake_iv[6];
		v[15] = blake_iv[7];
		// mix in length of data
		v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */;
		// last block
		v[14] ^= (ulong)-1;

		// 12 BLAKE2b rounds; the only nonzero message word is word1 ("i"),
		// injected at the position the BLAKE2b sigma schedule dictates for
		// each round — all other message words are zero.
		// round 1
		mix(v[0], v[4], v[8], v[12], 0, word1);
		mix(v[1], v[5], v[9], v[13], 0, 0);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], 0, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, 0);
		mix(v[3], v[4], v[9], v[14], 0, 0);
		// round 2
		mix(v[0], v[4], v[8], v[12], 0, 0);
		mix(v[1], v[5], v[9], v[13], 0, 0);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], word1, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, 0);
		mix(v[3], v[4], v[9], v[14], 0, 0);
		// round 3
		mix(v[0], v[4], v[8], v[12], 0, 0);
		mix(v[1], v[5], v[9], v[13], 0, 0);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], 0, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, word1);
		mix(v[3], v[4], v[9], v[14], 0, 0);
		// round 4
		mix(v[0], v[4], v[8], v[12], 0, 0);
		mix(v[1], v[5], v[9], v[13], 0, word1);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], 0, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, 0);
		mix(v[3], v[4], v[9], v[14], 0, 0);
		// round 5
		mix(v[0], v[4], v[8], v[12], 0, 0);
		mix(v[1], v[5], v[9], v[13], 0, 0);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], 0, word1);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, 0);
		mix(v[3], v[4], v[9], v[14], 0, 0);
		// round 6
		mix(v[0], v[4], v[8], v[12], 0, 0);
		mix(v[1], v[5], v[9], v[13], 0, 0);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], 0, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, 0);
		mix(v[3], v[4], v[9], v[14], word1, 0);
		// round 7
		mix(v[0], v[4], v[8], v[12], 0, 0);
		mix(v[1], v[5], v[9], v[13], word1, 0);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], 0, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, 0);
		mix(v[3], v[4], v[9], v[14], 0, 0);
		// round 8
		mix(v[0], v[4], v[8], v[12], 0, 0);
		mix(v[1], v[5], v[9], v[13], 0, 0);
		mix(v[2], v[6], v[10], v[14], 0, word1);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], 0, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, 0);
		mix(v[3], v[4], v[9], v[14], 0, 0);
		// round 9
		mix(v[0], v[4], v[8], v[12], 0, 0);
		mix(v[1], v[5], v[9], v[13], 0, 0);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], 0, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], word1, 0);
		mix(v[3], v[4], v[9], v[14], 0, 0);
		// round 10
		mix(v[0], v[4], v[8], v[12], 0, 0);
		mix(v[1], v[5], v[9], v[13], 0, 0);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], word1, 0);
		mix(v[0], v[5], v[10], v[15], 0, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, 0);
		mix(v[3], v[4], v[9], v[14], 0, 0);
		// round 11
		mix(v[0], v[4], v[8], v[12], 0, word1);
		mix(v[1], v[5], v[9], v[13], 0, 0);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], 0, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, 0);
		mix(v[3], v[4], v[9], v[14], 0, 0);
		// round 12
		mix(v[0], v[4], v[8], v[12], 0, 0);
		mix(v[1], v[5], v[9], v[13], 0, 0);
		mix(v[2], v[6], v[10], v[14], 0, 0);
		mix(v[3], v[7], v[11], v[15], 0, 0);
		mix(v[0], v[5], v[10], v[15], word1, 0);
		mix(v[1], v[6], v[11], v[12], 0, 0);
		mix(v[2], v[7], v[8], v[13], 0, 0);
		mix(v[3], v[4], v[9], v[14], 0, 0);

		// compress v into the blake state; this produces the 50-byte hash
		// (two Xi values)
		ulong h[7];
		h[0] = blake_state[0] ^ v[0] ^ v[8];
		h[1] = blake_state[1] ^ v[1] ^ v[9];
		h[2] = blake_state[2] ^ v[2] ^ v[10];
		h[3] = blake_state[3] ^ v[3] ^ v[11];
		h[4] = blake_state[4] ^ v[4] ^ v[12];
		h[5] = blake_state[5] ^ v[5] ^ v[13];
		h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;

		// store the two Xi values in the hash table
#if ZCASH_HASH_LEN == 50
		dropped += ht_store(0, ht, input * 2,
			h[0],
			h[1],
			h[2],
			h[3], rowCounters);
		dropped += ht_store(0, ht, input * 2 + 1,
			(h[3] >> 8) | (h[4] << (64 - 8)),
			(h[4] >> 8) | (h[5] << (64 - 8)),
			(h[5] >> 8) | (h[6] << (64 - 8)),
			(h[6] >> 8), rowCounters);
#else
#error "unsupported ZCASH_HASH_LEN"
#endif

		input++;
	}
#ifdef ENABLE_DEBUG
	debug[tid * 2] = 0;
	debug[tid * 2 + 1] = dropped;
#endif
}

// Pack (row, slot0, slot1) into a single 32-bit reference; the layout depends
// on how many bits the row number needs.
#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)

#define ENCODE_INPUTS(row, slot0, slot1) \
	((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
#define DECODE_ROW(REF) (REF >> 16)
#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
#define DECODE_SLOT0(REF) (REF & 0xff)

#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7)

#define ENCODE_INPUTS(row, slot0, slot1) \
	((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
#define DECODE_ROW(REF) (REF >> 14)
#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
#define DECODE_SLOT0(REF) (REF & 0x7f)

#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)

#define ENCODE_INPUTS(row, slot0, slot1) \
	((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
#define DECODE_ROW(REF) (REF >> 13)
#define DECODE_SLOT1(REF) ((REF \
	>> 6) & 0x3f)
#define DECODE_SLOT0(REF) (REF & 0x3f)

#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)

#define ENCODE_INPUTS(row, slot0, slot1) \
	((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
#define DECODE_ROW(REF) (REF >> 12)
#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
#define DECODE_SLOT0(REF) (REF & 0x3f)

#else
#error "unsupported NR_ROWS_LOG"
#endif

/*
** Access a half-aligned long, that is a long aligned on a 4-byte boundary.
** Assembled from two aligned 4-byte loads to stay within OpenCL's alignment
** rules (a direct 8-byte load at a 4-byte boundary would be undefined).
*/
ulong half_aligned_long(__global ulong *p, uint offset)
{
	return
		(((ulong)*(__global uint *)((__global char *)p + offset + 0)) << 0) |
		(((ulong)*(__global uint *)((__global char *)p + offset + 4)) << 32);
}

/*
** Access a well-aligned int.
*/
uint well_aligned_int(__global ulong *_p, uint offset)
{
	__global char *p = (__global char *)_p;
	return *(__global uint *)(p + offset);
}

/*
** XOR a pair of Xi values computed at "round - 1" and store the result in the
** hash table being built for "round". Note that when building the table for
** even rounds we need to skip 1 padding byte present in the "round - 1" table
** (the "0xAB" byte mentioned in the description at the top of this file.) But
** also note we can't load data directly past this byte because this would
** cause an unaligned memory access which is undefined per the OpenCL spec.
**
** Return 0 if successfully stored, or 1 if the row overflowed.
*/
uint xor_and_store(uint round, __global char *ht_dst, uint row,
	uint slot_a, uint slot_b, __global ulong *a, __global ulong *b,
	__global uint *rowCounters)
{
	ulong xi0, xi1, xi2;
#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
	// Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not
	// storing the byte containing bits from the previous PREFIX block.
	// (NOTE(review): this comment was truncated mid-sentence in the original.)
	if (round == 1 || round == 2)
	{
		// xor 24 bytes
		xi0 = *(a++) ^ *(b++);
		xi1 = *(a++) ^ *(b++);
		xi2 = *a ^ *b;
		if (round == 2)
		{
			// skip padding byte
			xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
			xi1 = (xi1 >> 8) | (xi2 << (64 - 8));
			xi2 = (xi2 >> 8);
		}
	}
	else if (round == 3)
	{
		// xor 20 bytes
		xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
		xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8);
		xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16);
	}
	else if (round == 4 || round == 5)
	{
		// xor 16 bytes
		xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
		xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8);
		xi2 = 0;
		if (round == 4)
		{
			// skip padding byte
			xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
			xi1 = (xi1 >> 8);
		}
	}
	else if (round == 6)
	{
		// xor 12 bytes
		xi0 = *a++ ^ *b++;
		xi1 = *(__global uint *)a ^ *(__global uint *)b;
		xi2 = 0;
		// skip padding byte (FIX: dropped the redundant inner
		// "if (round == 6)" — it was always true inside this branch)
		xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
		xi1 = (xi1 >> 8);
	}
	else if (round == 7 || round == 8)
	{
		// xor 8 bytes
		xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
		xi1 = 0;
		xi2 = 0;
		if (round == 8)
		{
			// skip padding byte
			xi0 = (xi0 >> 8);
		}
	}
	// invalid solutions (which start happenning in round 5) have duplicate
	// inputs and xor to zero, so discard them
	if (!xi0 && !xi1)
		return 0;
#else
#error "unsupported NR_ROWS_LOG"
#endif
	return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b),
		xi0, xi1, xi2, 0, rowCounters);
}

/*
** Execute one Equihash round.
** Read from ht_src, XOR colliding pairs of Xi,
** store them in ht_dst.
*/
void equihash_round(uint round,
	__global char *ht_src,
	__global char *ht_dst,
	__global uint *debug,
	__local uchar *first_words_data,
	__local uint *collisionsData,
	__local uint *collisionsNum,
	__global uint *rowCountersSrc,
	__global uint *rowCountersDst,
	uint threadsPerRow)
{
	uint globalTid = get_global_id(0) / threadsPerRow;
	uint localTid = get_local_id(0) / threadsPerRow;
	uint localGroupId = get_local_id(0) % threadsPerRow;
	// per-sub-group window into the shared first-byte cache
	__local uchar *first_words = &first_words_data[NR_SLOTS*localTid];

	__global char *p;
	uint cnt;
	uchar mask;
	uint i, j;
	// NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to
	// make it even larger
	uint n; // NOTE(review): unused local
	uint dropped_coll = 0;
	uint dropped_stor = 0;
	__global ulong *a, *b;
	uint xi_offset;
	// read first words of Xi from the previous (round - 1) hash table
	xi_offset = xi_offset_for_round(round - 1);
	// the mask is also computed to read data from the previous round
#if NR_ROWS_LOG <= 16
	mask = ((!(round % 2)) ? 0x0f : 0xf0);
#elif NR_ROWS_LOG == 18
	mask = ((!(round % 2)) ? 0x03 : 0x30);
#elif NR_ROWS_LOG == 19
	mask = ((!(round % 2)) ? 0x01 : 0x10);
#elif NR_ROWS_LOG == 20
	mask = 0; /* we can vastly simplify the code below */
#else
#error "unsupported NR_ROWS_LOG"
#endif

	// each sub-group of threadsPerRow work-items cooperates on one row;
	// "chunk" strides the whole table across the launched work-groups
	for (uint chunk = 0; chunk < threadsPerRow; chunk++) {
		uint tid = globalTid + NR_ROWS/threadsPerRow*chunk;
		uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1);
// for (uint tid = get_global_id(0)/threadsPerRow; tid < NR_ROWS; tid += get_global_size(0)/threadsPerRow) {

		uint rowIdx = tid/ROWS_PER_UINT;
		uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT);
		cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK;
		cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round

		*collisionsNum = 0;
		// cache the masked first byte of each slot's Xi in local memory
		p = (ht_src + tid * NR_SLOTS * SLOT_LEN);
		p += xi_offset;
		p += SLOT_LEN*localGroupId;
		for (i = localGroupId; i < cnt; i += threadsPerRow, p += SLOT_LEN*threadsPerRow)
			first_words[i] = (*(__global uchar *)p) & mask;
		barrier(CLK_LOCAL_MEM_FENCE);

		if (cnt == 0)
			// no elements in row, no collisions
			goto part2;
		// find collisions: compare every pair (i, j) of cached first bytes;
		// matches are queued as packed (subRow, i, j) words in local memory
		for (i = 0; i < cnt-1; i++)
		{
			uchar data_i = first_words[i];
			uint collision = (localTid << 24) | (i << 12) | (i + 1 + localGroupId);
			for (j = i + 1 + localGroupId; j < cnt; j += threadsPerRow)
			{
				if (data_i == first_words[j])
				{
					uint index = atomic_inc(collisionsNum);
					if (index >= LDS_COLL_SIZE) {
						atomic_dec(collisionsNum);
						goto part2;
					}
					collisionsData[index] = collision;
				}
				collision += threadsPerRow;
			}
		}

part2:
		barrier(CLK_LOCAL_MEM_FENCE);
		// drain the queued collisions with the whole work-group
		uint totalCollisions = *collisionsNum;
		for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0))
		{
			uint collision = collisionsData[index];
			uint collisionThreadId = gid + (collision >> 24);
			uint i = (collision >> 12) & 0xFFF;
			uint j = collision & 0xFFF;
			__global uchar *ptr = ht_src + collisionThreadId * NR_SLOTS * SLOT_LEN +
				xi_offset;
			a = (__global ulong *)(ptr + i * SLOT_LEN);
			b = (__global ulong *)(ptr + j * SLOT_LEN);
			dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j,
				a, b, rowCountersDst);
		}
	}

#ifdef ENABLE_DEBUG
	// NOTE(review): "tid" is only declared inside the chunk loop above, so
	// this block does not compile if ENABLE_DEBUG is defined; dropped_coll
	// is also never incremented. Needs fixing before enabling debug output.
	debug[tid * 2] = dropped_coll;
	debug[tid * 2 + 1] = dropped_stor;
#endif
}

/*
** This defines kernel_round1, kernel_round2, ..., kernel_round7.
*/
#define KERNEL_ROUND(N) \
__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) \
void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \
	__global uint *rowCountersSrc, __global uint *rowCountersDst, \
	__global uint *debug) \
{ \
	__local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; \
	__local uint collisionsData[LDS_COLL_SIZE]; \
	__local uint collisionsNum; \
	equihash_round(N, ht_src, ht_dst, debug, first_words_data, collisionsData, \
		&collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); \
}
KERNEL_ROUND(1)
KERNEL_ROUND(2)
KERNEL_ROUND(3)
KERNEL_ROUND(4)
KERNEL_ROUND(5)
KERNEL_ROUND(6)
KERNEL_ROUND(7)

// kernel_round8 takes an extra argument, "sols"; it also zeroes the solution
// counters so kernel_sols can start from a clean buffer.
__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
void kernel_round8(__global char *ht_src, __global char *ht_dst,
	__global uint *rowCountersSrc, __global uint *rowCountersDst,
	__global uint *debug, __global sols_t *sols)
{
	uint tid = get_global_id(0);
	__local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)];
	__local uint collisionsData[LDS_COLL_SIZE];
	__local uint collisionsNum;
	equihash_round(8, ht_src, ht_dst, debug, first_words_data, collisionsData,
		&collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW);
	if (!tid)
		sols->nr = sols->likely_invalids = 0;
}

// Fetch the reference ("i" field, stored 4 bytes before Xi) of a slot.
uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
{
	return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN +
		slot * SLOT_LEN + xi_offset - 4);
}

/*
** Expand references to inputs. Return 1 if so far the solution appears valid,
** or 0 otherwise (an invalid solution would be a solution with duplicate
** inputs, which can be detected at the last step: round == 0).
+*/ +uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs, + uint round) +{ + __global char *ht = htabs[round % 2]; + uint i = nr_inputs - 1; + uint j = nr_inputs * 2 - 1; + uint xi_offset = xi_offset_for_round(round); + int dup_to_watch = -1; + do + { + ins[j] = expand_ref(ht, xi_offset, + DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); + ins[j - 1] = expand_ref(ht, xi_offset, + DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); + if (!round) + { + if (dup_to_watch == -1) + dup_to_watch = ins[j]; + else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) + return 0; + } + if (!i) + break ; + i--; + j -= 2; + } + while (1); + return 1; +} + +/* +** Verify if a potential solution is in fact valid. +*/ +void potential_sol(__global char **htabs, __global sols_t *sols, + uint ref0, uint ref1) +{ + uint nr_values; + uint values_tmp[(1 << PARAM_K)]; + uint sol_i; + uint i; + nr_values = 0; + values_tmp[nr_values++] = ref0; + values_tmp[nr_values++] = ref1; + uint round = PARAM_K - 1; + do + { + round--; + if (!expand_refs(values_tmp, nr_values, htabs, round)) + return ; + nr_values *= 2; + } + while (round > 0); + // solution appears valid, copy it to sols + sol_i = atomic_inc(&sols->nr); + if (sol_i >= MAX_SOLS) + return ; + for (i = 0; i < (1 << PARAM_K); i++) + sols->values[sol_i][i] = values_tmp[i]; + sols->valid[sol_i] = 1; +} + +/* +** Scan the hash tables to find Equihash solutions. 
+*/ +__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols, + __global uint *rowCountersSrc, __global uint *rowCountersDst) +{ + __local uint counters[WORKSIZE/THREADS_PER_ROW]; + __local uint refs[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; + __local uint data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; + __local uint collisionsNum; + __local ulong collisions[WORKSIZE*4]; + + uint globalTid = get_global_id(0) / THREADS_PER_ROW; + uint localTid = get_local_id(0) / THREADS_PER_ROW; + uint localGroupId = get_local_id(0) % THREADS_PER_ROW; + __local uint *refsPtr = &refs[NR_SLOTS*localTid]; + __local uint *dataPtr = &data[NR_SLOTS*localTid]; + + __global char *htabs[2] = { ht0, ht1 }; + __global char *hcounters[2] = { rowCountersSrc, rowCountersDst }; + uint ht_i = (PARAM_K - 1) % 2; // table filled at last round + uint cnt; + uint xi_offset = xi_offset_for_round(PARAM_K - 1); + uint i, j; + __global char *p; + uint ref_i, ref_j; + // it's ok for the collisions array to be so small, as if it fills up + // the potential solutions are likely invalid (many duplicate inputs) +// ulong collisions; +#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 + // in the final hash table, we are looking for a match on both the bits + // part of the previous PREFIX colliding bits, and the last PREFIX bits. 
+ uint mask = 0xffffff; +#else +#error "unsupported NR_ROWS_LOG" +#endif + + collisionsNum = 0; + + for (uint chunk = 0; chunk < THREADS_PER_ROW; chunk++) { + uint tid = globalTid + NR_ROWS/THREADS_PER_ROW*chunk; + p = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN; + uint rowIdx = tid/ROWS_PER_UINT; + uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); + cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; + cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round + p += xi_offset; + p += SLOT_LEN*localGroupId; + + for (i = get_local_id(0); i < WORKSIZE/THREADS_PER_ROW; i += get_local_size(0)) + counters[i] = 0; + for (i = localGroupId; i < cnt; i += THREADS_PER_ROW, p += SLOT_LEN*THREADS_PER_ROW) { + refsPtr[i] = *(__global uint *)(p - 4); + dataPtr[i] = (*(__global uint *)p) & mask; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (i = 0; i < cnt; i++) + { + uint a_data = dataPtr[i]; + ref_i = refsPtr[i]; + for (j = i + 1 + localGroupId; j < cnt; j += THREADS_PER_ROW) + { + if (a_data == dataPtr[j]) + { + if (atomic_inc(&counters[localTid]) == 0) + collisions[atomic_inc(&collisionsNum)] = ((ulong)ref_i << 32) | refsPtr[j]; + goto part2; + } + } + } + +part2: + continue; + } + + barrier(CLK_LOCAL_MEM_FENCE); + uint totalCollisions = collisionsNum; + if (get_local_id(0) < totalCollisions) { + ulong coll = collisions[get_local_id(0)]; + potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff); + } +}