diff --git a/algorithm.c b/algorithm.c index 2f543556..ad56d833 100644 --- a/algorithm.c +++ b/algorithm.c @@ -1147,6 +1147,7 @@ static cl_int queue_cryptonight_kernel(_clState *clState, dev_blk_ctx *blk, __ma } +#define WORKSIZE clState->wsize static cl_int queue_equihash_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) { @@ -1160,43 +1161,43 @@ static cl_int queue_equihash_kernel(_clState *clState, dev_blk_ctx *blk, __maybe uint32_t dbg[2] = {0}; status |= clEnqueueWriteBuffer(clState->commandQueue, clState->padbuffer8, CL_TRUE, 0, sizeof(dbg), &dbg, 0, NULL, NULL); - cl_mem buf_ht[2] = {clState->CLbuffer0, clState->buffer1}; cl_mem rowCounters[2] = {clState->buffer2, clState->buffer3}; for (int round = 0; round < PARAM_K; round++) { - size_t global_ws = NR_ROWS / ROWS_PER_UINT; + size_t global_ws = RC_SIZE; size_t local_ws = 256; unsigned int num = 0; cl_kernel *kernel = &clState->extra_kernels[0]; // Now on every round!!!! - CL_SET_ARG(buf_ht[round % 2]); + CL_SET_ARG(clState->index_buf[round]); CL_SET_ARG(rowCounters[round % 2]); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->CLbuffer0); status |= clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL); - num = 0; kernel = &clState->extra_kernels[1 + round]; if (!round) { - CL_SET_ARG(clState->MidstateBuf); - CL_SET_ARG(buf_ht[round % 2]); - CL_SET_ARG(rowCounters[round % 2]); - work_items = threads; + worksize = LOCAL_WORK_SIZE_ROUND0; + work_items = NR_INPUTS / ROUND0_INPUTS_PER_WORK_ITEM; } else { - CL_SET_ARG(buf_ht[(round - 1) % 2]); - CL_SET_ARG(buf_ht[round % 2]); - CL_SET_ARG(rowCounters[(round - 1) % 2]); - CL_SET_ARG(rowCounters[round % 2]); - work_items = NR_ROWS; + worksize = LOCAL_WORK_SIZE; + work_items = NR_ROWS * worksize; } - CL_SET_ARG(clState->padbuffer8); - if (round == PARAM_K - 1) - CL_SET_ARG(clState->outputBuffer); - status |= clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, &work_items, 
&worksize, 0, NULL, NULL); + status |= clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[1 + round], 1, NULL, &work_items, &worksize, 0, NULL, NULL); } - work_items = NR_ROWS; + + worksize = LOCAL_WORK_SIZE_POTENTIAL_SOLS; + work_items = NR_ROWS * worksize; + status |= clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[1 + 9], 1, NULL, &work_items, &worksize, 0, NULL, NULL); + + worksize = LOCAL_WORK_SIZE_SOLS; + work_items = MAX_POTENTIAL_SOLS * worksize; status |= clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, NULL, &work_items, &worksize, 0, NULL, NULL); return status; } +#undef WORKSIZE + static algorithm_settings_t algos[] = { // kernels starting from this will have difficulty calculated by using litecoin algorithm diff --git a/algorithm/equihash.c b/algorithm/equihash.c index 1f6a6b4b..35a32efd 100644 --- a/algorithm/equihash.c +++ b/algorithm/equihash.c @@ -34,7 +34,7 @@ static const uint8_t blake2b_sigma[12][16] = { { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } }; - + static const uint64_t blake2b_IV[8] = { 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, @@ -43,9 +43,9 @@ static const uint64_t blake2b_IV[8] = { }; static const uint64_t blake2b_h[8] = { - 0x6a09e667f2bdc93aULL, 0xbb67ae8584caa73bULL, + 0x6a09e667f2bdc93aULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 0x48ec89c38820de31ULL, 0x5be0cd10137e21b1ULL }; @@ -75,8 +75,8 @@ static const uint64_t blake2b_h[8] = { G(r,5,v[ 1],v[ 6],v[11],v[12]); \ G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ G(r,7,v[ 3],v[ 4],v[ 9],v[14]); - - + + #define G_fast(r,i,a,b,c,d) \ a = a + b + (blake2b_sigma[r][2*i] == 1 ? 
m1 : 0); \ d = rotr64(d ^ a, 32); \ @@ -99,134 +99,133 @@ static const uint64_t blake2b_h[8] = { void equihash_calc_mid_hash(uint64_t mid_hash[8], uint8_t* header) { - uint64_t v[16], *m = (uint64_t*) header; - for (int i = 0; i < 8; i++) { - v[i] = blake2b_h[i]; - v[i+8] = blake2b_IV[i]; - } - v[12] ^= 128; - for (int r = 0; r < 12; r++) { - ROUND(r) - } - for (int i = 0; i < 8; i++) - mid_hash[i] = blake2b_h[i] ^ v[i] ^ v[i+8]; + uint64_t v[16], *m = (uint64_t*)header; + for (int i = 0; i < 8; i++) { + v[i] = blake2b_h[i]; + v[i + 8] = blake2b_IV[i]; + } + v[12] ^= 128; + for (int r = 0; r < 12; r++) { + ROUND(r) + } + for (int i = 0; i < 8; i++) + mid_hash[i] = blake2b_h[i] ^ v[i] ^ v[i + 8]; } void blake2b_hash(uint8_t *hash, uint64_t mid_hash[8], uint32_t bday) { - uint64_t v[16], tmp[8]; - uint64_t m1 = (uint64_t) bday << 32; - for (int i = 0; i < 8; i++) { - v[i] = mid_hash[i]; - v[i+8] = blake2b_IV[i]; - } - v[12] ^= 140 + sizeof(bday); - v[14] ^= (int64_t) -1; - for (int r = 0; r < 12; r++) { - ROUND_fast(r) - } - for (int i = 0; i < 8; i++) - tmp[i] = mid_hash[i] ^ v[i] ^ v[i+8]; - memcpy(hash, tmp, 50); + uint64_t v[16], tmp[8]; + uint64_t m1 = (uint64_t)bday << 32; + for (int i = 0; i < 8; i++) { + v[i] = mid_hash[i]; + v[i + 8] = blake2b_IV[i]; + } + v[12] ^= 140 + sizeof(bday); + v[14] ^= (int64_t)-1; + for (int r = 0; r < 12; r++) { + ROUND_fast(r) + } + for (int i = 0; i < 8; i++) + tmp[i] = mid_hash[i] ^ v[i] ^ v[i + 8]; + memcpy(hash, tmp, 50); } void equihash_calc_hash(uint8_t hash[25], uint64_t mid_hash[8], uint32_t bday) { - uint8_t tmp[50]; - blake2b_hash(tmp, mid_hash, bday/2); - memcpy(hash, tmp + (bday & 1 ? 25 : 0), 25); + uint8_t tmp[50]; + blake2b_hash(tmp, mid_hash, bday / 2); + memcpy(hash, tmp + (bday & 1 ? 25 : 0), 25); } // These two copied from the ref impl, for now. 
void ExpandArray(const unsigned char* in, size_t in_len, - unsigned char* out, size_t out_len, - size_t bit_len) + unsigned char* out, size_t out_len, + size_t bit_len) { - size_t byte_pad = 0; - size_t out_width = ((bit_len+7)/8 + byte_pad); - uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1); - - // The acc_bits least-significant bits of acc_value represent a bit sequence - // in big-endian order. - size_t acc_bits = 0; - uint32_t acc_value = 0; - - size_t j = 0; - for (size_t i = 0; i < in_len; i++) { - acc_value = (acc_value << 8) | in[i]; - acc_bits += 8; - - // When we have bit_len or more bits in the accumulator, write the next - // output element. - if (acc_bits >= bit_len) { - acc_bits -= bit_len; - for (size_t x = 0; x < byte_pad; x++) { - out[j+x] = 0; - } - for (size_t x = byte_pad; x < out_width; x++) { - out[j+x] = ( - // Big-endian - acc_value >> (acc_bits+(8*(out_width-x-1))) - ) & ( - // Apply bit_len_mask across byte boundaries - (bit_len_mask >> (8*(out_width-x-1))) & 0xFF - ); - } - j += out_width; + size_t byte_pad = 0; + size_t out_width = ((bit_len + 7) / 8 + byte_pad); + uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1); + + // The acc_bits least-significant bits of acc_value represent a bit sequence + // in big-endian order. + size_t acc_bits = 0; + uint32_t acc_value = 0; + + size_t j = 0; + for (size_t i = 0; i < in_len; i++) { + acc_value = (acc_value << 8) | in[i]; + acc_bits += 8; + + // When we have bit_len or more bits in the accumulator, write the next + // output element. 
+ if (acc_bits >= bit_len) { + acc_bits -= bit_len; + for (size_t x = 0; x < byte_pad; x++) { + out[j + x] = 0; + } + for (size_t x = byte_pad; x < out_width; x++) { + out[j + x] = ( + // Big-endian + acc_value >> (acc_bits + (8 * (out_width - x - 1))) + ) & ( + // Apply bit_len_mask across byte boundaries + (bit_len_mask >> (8 * (out_width - x - 1))) & 0xFF + ); + } + j += out_width; + } } - } } void CompressArray(const unsigned char* in, size_t in_len, - unsigned char* out, size_t out_len, - size_t bit_len, size_t byte_pad) + unsigned char* out, size_t out_len, + size_t bit_len, size_t byte_pad) { - size_t in_width = ((bit_len+7)/8 + byte_pad); - uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1); - - // The acc_bits least-significant bits of acc_value represent a bit sequence - // in big-endian order. - size_t acc_bits = 0; - uint32_t acc_value = 0; - - size_t j = 0; - for (size_t i = 0; i < out_len; i++) { - // When we have fewer than 8 bits left in the accumulator, read the next - // input element. - if (acc_bits < 8) { - acc_value = acc_value << bit_len; - for (size_t x = byte_pad; x < in_width; x++) { - acc_value = acc_value | ( - ( - // Apply bit_len_mask across byte boundaries - in[j+x] & ((bit_len_mask >> (8*(in_width-x-1))) & 0xFF) - ) << (8*(in_width-x-1)) - ); // Big-endian - } - j += in_width; - acc_bits += bit_len; + size_t in_width = ((bit_len + 7) / 8 + byte_pad); + uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1); + + // The acc_bits least-significant bits of acc_value represent a bit sequence + // in big-endian order. + size_t acc_bits = 0; + uint32_t acc_value = 0; + + size_t j = 0; + for (size_t i = 0; i < out_len; i++) { + // When we have fewer than 8 bits left in the accumulator, read the next + // input element. 
+ if (acc_bits < 8) { + acc_value = acc_value << bit_len; + for (size_t x = byte_pad; x < in_width; x++) { + acc_value = acc_value | ( + ( + // Apply bit_len_mask across byte boundaries + in[j + x] & ((bit_len_mask >> (8 * (in_width - x - 1))) & 0xFF) + ) << (8 * (in_width - x - 1)) + ); // Big-endian + } + j += in_width; + acc_bits += bit_len; + } + + acc_bits -= 8; + out[i] = (acc_value >> acc_bits) & 0xFF; } - - acc_bits -= 8; - out[i] = (acc_value >> acc_bits) & 0xFF; - } } static inline void sort_pair(uint32_t *a, uint32_t len) { - uint32_t *b = a + len; - uint32_t tmp, need_sorting = 0; - for (uint32_t i = 0; i < len; i++) { - if (need_sorting || a[i] > b[i]) { - need_sorting = 1; - tmp = a[i]; - a[i] = b[i]; - b[i] = tmp; + uint32_t *b = a + len; + uint32_t tmp, need_sorting = 0; + for (uint32_t i = 0; i < len; i++) { + if (need_sorting || a[i] > b[i]) { + need_sorting = 1; + tmp = a[i]; + a[i] = b[i]; + b[i] = tmp; + } else if (a[i] < b[i]) + break; } - else if (a[i] < b[i]) - break; - } } @@ -234,49 +233,49 @@ bool submit_tested_work(struct thr_info *, struct work *); uint32_t equihash_verify_sol(struct work *work, sols_t *sols, int sol_i) { - uint32_t thr_id = work->thr->id; - uint32_t *inputs = sols->values[sol_i]; - uint32_t seen_len = (1 << (PREFIX + 1)) / 8; - uint8_t seen[seen_len]; - uint32_t i; - uint8_t tmp; - // look for duplicate inputs - memset(seen, 0, seen_len); - for (i = 0; i < (1 << PARAM_K); i++) { - - if (inputs[i] / 8 >= seen_len) { - sols->valid[sol_i] = 0; - return 0; + uint32_t thr_id = work->thr->id; + uint32_t *inputs = sols->values[sol_i]; + uint32_t seen_len = (1 << (PREFIX + 1)) / 8; + uint8_t seen[(1 << (PREFIX + 1)) / 8]; + uint32_t i; + uint8_t tmp; + // look for duplicate inputs + memset(seen, 0, seen_len); + for (i = 0; i < (1 << PARAM_K); i++) { + + if (inputs[i] / 8 >= seen_len) { + sols->valid[sol_i] = 0; + return 0; + } + tmp = seen[inputs[i] / 8]; + seen[inputs[i] / 8] |= 1 << (inputs[i] & 7); + if (tmp == 
seen[inputs[i] / 8]) { + // at least one input value is a duplicate + sols->valid[sol_i] = 0; + return 0; + } } - tmp = seen[inputs[i] / 8]; - seen[inputs[i] / 8] |= 1 << (inputs[i] & 7); - if (tmp == seen[inputs[i] / 8]) { - // at least one input value is a duplicate - sols->valid[sol_i] = 0; - return 0; + // the valid flag is already set by the GPU, but set it again because + // I plan to change the GPU code to not set it + sols->valid[sol_i] = 1; + // sort the pairs in place + for (uint32_t level = 0; level < PARAM_K; level++) { + for (i = 0; i < (1 << PARAM_K); i += (2 << level)) { + sort_pair(&inputs[i], 1 << level); + } } - } - // the valid flag is already set by the GPU, but set it again because - // I plan to change the GPU code to not set it - sols->valid[sol_i] = 1; - // sort the pairs in place - for (uint32_t level = 0; level < PARAM_K; level++) { - for (i = 0; i < (1 << PARAM_K); i += (2 << level)) { - sort_pair(&inputs[i], 1 << level); + + for (i = 0; i < (1 << PARAM_K); i++) + inputs[i] = htobe32(inputs[i]); + + CompressArray((unsigned char*)inputs, 512 * 4, work->equihash_data + 143, 1344, 21, 1); + + gen_hash(work->equihash_data, 1344 + 143, work->hash); + + if (*(uint64_t*)(work->hash + 24) < *(uint64_t*)(work->target + 24)) { + submit_tested_work(work->thr, work); } - } - - for (i = 0; i < (1 << PARAM_K); i++) - inputs[i] = htobe32(inputs[i]); - - CompressArray((unsigned char*) inputs, 512*4, work->equihash_data + 143, 1344, 21, 1); - - gen_hash(work->equihash_data, 1344 + 143, work->hash); - - if (*(uint64_t*) (work->hash + 24) < *(uint64_t*) (work->target + 24)) { - submit_tested_work(work->thr, work); - } - return 1; + return 1; } void equihash_regenhash(struct work *work) diff --git a/algorithm/ethash.c b/algorithm/ethash.c index b9606d54..c4eb97a1 100644 --- a/algorithm/ethash.c +++ b/algorithm/ethash.c @@ -29,7 +29,7 @@ uint32_t EthCalcEpochNumber(uint8_t *SeedHash) uint8_t TestSeedHash[32] = { 0 }; for(int Epoch = 0; Epoch < 2048; ++Epoch) 
{ - SHA3_256(TestSeedHash, TestSeedHash, 32); + SHA3_256((struct ethash_h256 *)TestSeedHash, TestSeedHash, 32); if(!memcmp(TestSeedHash, SeedHash, 32)) return(Epoch + 1); } @@ -61,6 +61,10 @@ Node CalcDAGItem(const Node *CacheInputNodes, uint32_t NodeCount, uint32_t NodeI return DAGNode; } +#ifdef _MSC_VER +#define restrict +#endif + // OutHash & MixHash MUST have 32 bytes allocated (at least) void LightEthash(uint8_t *restrict OutHash, uint8_t *restrict MixHash, const uint8_t *restrict HeaderPoWHash, const Node *Cache, const uint64_t EpochNumber, const uint64_t Nonce) { @@ -116,7 +120,7 @@ void ethash_regenhash(struct work *work) work->Nonce += *((uint32_t *)(work->data + 32)); applog(LOG_DEBUG, "Regenhash: First qword of input: 0x%016llX.", work->Nonce); cg_rlock(&work->pool->data_lock); - LightEthash(work->hash, work->mixhash, work->data, work->pool->eth_cache.dag_cache, work->eth_epoch, work->Nonce); + LightEthash(work->hash, work->mixhash, work->data, (Node *)work->pool->eth_cache.dag_cache, work->eth_epoch, work->Nonce); cg_runlock(&work->pool->data_lock); char *DbgHash = bin2hex(work->hash, 32); diff --git a/algorithm/yescryptcommon.c b/algorithm/yescryptcommon.c index cf7067d0..841a7838 100644 --- a/algorithm/yescryptcommon.c +++ b/algorithm/yescryptcommon.c @@ -320,6 +320,10 @@ yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p, buf, sizeof(buf)); } +#ifdef _MSC_VER +#define __thread __declspec(thread) +#endif + static int yescrypt_bsty(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p, diff --git a/driver-opencl.c b/driver-opencl.c index 0250833d..c60a7c47 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1646,6 +1646,16 @@ static void opencl_thread_shutdown(struct thr_info *thr) clReleaseMemObject(clState->buffer3); if (clState->padbuffer8) clReleaseMemObject(clState->padbuffer8); + for (i = 0; i < 9; i++) + if (clState->index_buf[i]) + clReleaseMemObject(clState->index_buf[i]); 
+ for (i = 0; i < 4; i++) + if (clState->BranchBuffer[i]) + clReleaseMemObject(clState->BranchBuffer[i]); + if (clState->Scratchpads) + clReleaseMemObject(clState->Scratchpads); + if (clState->States) + clReleaseMemObject(clState->States); clReleaseKernel(clState->kernel); for (i = 0; i < clState->n_extra_kernels; i++) clReleaseKernel(clState->extra_kernels[i]); diff --git a/kernel/equihash-param.h b/kernel/equihash-param.h index ace80692..8969a366 100644 --- a/kernel/equihash-param.h +++ b/kernel/equihash-param.h @@ -1,113 +1,404 @@ -#ifndef __OPENCL_VERSION__ -#define uint uint32_t -#define uchar uint8_t -#endif - - -#define PARAM_N 200 -#define PARAM_K 9 -#define PREFIX (PARAM_N / (PARAM_K + 1)) -#define NR_INPUTS (1 << PREFIX) -// Approximate log base 2 of number of elements in hash tables -#define APX_NR_ELMS_LOG (PREFIX + 1) -// Number of rows and slots is affected by this. 20 offers the best performance -// but occasionally misses ~1% of solutions. -#define NR_ROWS_LOG 18 - -// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md -#define OPTIM_SIMPLIFY_ROUND 1 - -// Number of collision items to track, per thread -#define THREADS_PER_ROW 8 -#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW)) - -// Ratio of time of sleeping before rechecking if task is done (0-1) -#define SLEEP_RECHECK_RATIO 0.60 -// Ratio of time to busy wait for the solution (0-1) -// The higher value the higher CPU usage with Nvidia -#define SLEEP_SKIP_RATIO 0.005 - -// Make hash tables OVERHEAD times larger than necessary to store the average -// number of elements per row. The ideal value is as small as possible to -// reduce memory usage, but not too small or else elements are dropped from the -// hash tables. -// -// The actual number of elements per row is closer to the theoretical average -// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be -// smaller. 
-// -// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease -// performance as they cause VRAM channel conflicts. -#if NR_ROWS_LOG == 16 -// #error "NR_ROWS_LOG = 16 is currently broken - do not use" -#define OVERHEAD 2 -#elif NR_ROWS_LOG == 18 -#define OVERHEAD 3 -#elif NR_ROWS_LOG == 19 -#define OVERHEAD 5 -#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND -#define OVERHEAD 6 -#elif NR_ROWS_LOG == 20 -#define OVERHEAD 9 -#endif - -#define NR_ROWS (1 << NR_ROWS_LOG) -#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD)) -// Length of 1 element (slot) in byte -#define SLOT_LEN 32 -// Total size of hash table -#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) -// Length of Zcash block header, nonce (part of header) -#define ZCASH_BLOCK_HEADER_LEN 140 -// Offset of nTime in header -#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) -// Length of nonce -#define ZCASH_NONCE_LEN 32 -// Length of encoded representation of solution size -#define ZCASH_SOLSIZE_LEN 3 -// Solution size (1344 = 0x540) represented as a compact integer, in hex -#define ZCASH_SOLSIZE_HEX "fd4005" -// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) -#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) -// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization -#define N_ZERO_BYTES 12 -// Number of bytes Zcash needs out of Blake -#define ZCASH_HASH_LEN 50 -// Number of wavefronts per SIMD for the Blake kernel. -// Blake is ALU-bound (beside the atomic counter being incremented) so we need -// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer -// instructions. 10 is the max supported by the hw. 
-#define BLAKE_WPS 10 -// Maximum number of solutions reported by kernel to host -#define MAX_SOLS 10 -// Length of SHA256 target -#define SHA256_TARGET_LEN (256 / 8) - -#if (NR_SLOTS < 16) -#define BITS_PER_ROW 4 -#define ROWS_PER_UINT 8 -#define ROW_MASK 0x0F -#else -#define BITS_PER_ROW 8 -#define ROWS_PER_UINT 4 -#define ROW_MASK 0xFF -#endif - -// Optional features -#undef ENABLE_DEBUG - -/* -** Return the offset of Xi in bytes from the beginning of the slot. -*/ -#define xi_offset_for_round(round) (8 + ((round) / 2) * 4) - -// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values -#define SOL_SIZE ((1 << PARAM_K) * 4) -typedef struct sols_s -{ - uint nr; - uint likely_invalids; - uchar valid[MAX_SOLS]; - uint values[MAX_SOLS][(1 << PARAM_K)]; -} sols_t; - +// Gateless Gate, a Zcash miner +// Copyright 2016 zawawa @ bitcointalk.org +// +// The initial version of this software was based on: +// SILENTARMY v5 +// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil +// +// This program is free software : you can redistribute it and / or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program.If not, see . 
+ + + +#ifndef __OPENCL_VERSION__ +#define uint uint32_t +#define uchar uint8_t +#endif +#ifdef cl_amd_fp64 +#define AMD +#endif +#if (defined(__Tahiti__) || defined(__Pitcairn__) || defined(__Capeverde__) || defined(__Oland__)) && !defined(AMD_LEGACY) +#define AMD_LEGACY +#endif +#ifdef cl_nv_pragma_unroll +#define NVIDIA +#endif +//#define ENABLE_DEBUG + + + +// +// Parameters for Hash Tables +// + +// There are PARAM_K - 1 hash tables, and each hash table has NR_ROWS rows. +// Each row contains NR_SLOTS slots. + +#define NR_ROWS_LOG 12 /* 12, 13, 14, 15, or 16. */ +#define NR_SLOTS 684 + +#define LDS_COLL_SIZE (NR_SLOTS * 67 / 100) + +#define LOCAL_WORK_SIZE WORKSIZE +#define LOCAL_WORK_SIZE_SOLS WORKSIZE +#define LOCAL_WORK_SIZE_ROUND0 WORKSIZE +#define LOCAL_WORK_SIZE_POTENTIAL_SOLS WORKSIZE + +#define ROUND0_INPUTS_PER_WORK_ITEM 1 + +#if defined(AMD) +#define THREADS_PER_WRITE(round) (((round) <= 5) ? 2 : 1) +#else +#define THREADS_PER_WRITE(round) 1 +#endif + +#if defined(AMD) && !defined(AMD_LEGACY) +#define OPTIM_24BYTE_WRITES +#endif +#define OPTIM_16BYTE_WRITES +#if !defined(AMD_LEGACY) +#define OPTIM_8BYTE_WRITES +#endif + +//#define OPTIM_FAST_INTEGER_DIVISION +//#define OPTIM_COMPACT_ROW_COUNTERS + +#define ADJUSTED_LDS_ARRAY_SIZE(n) (n) + + + +#define PARAM_N 200 +#define PARAM_K 9 +#define PREFIX (PARAM_N / (PARAM_K + 1)) +#define NR_INPUTS (1 << PREFIX) +#define NR_ROWS (1 << NR_ROWS_LOG) +// Length of 1 element (slot) in byte +#define SLOT_LEN 32 +#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? 
SLOT_LEN : SLOT_LEN - 16) +// Total size of hash table +#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN) +// Length of Zcash block header, nonce (part of header) +#define ZCASH_BLOCK_HEADER_LEN 140 +// Offset of nTime in header +#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32) +// Length of nonce +#define ZCASH_NONCE_LEN 32 +// Length of encoded representation of solution size +#define ZCASH_SOLSIZE_LEN 3 +// Solution size (1344 = 0x540) represented as a compact integer, in hex +#define ZCASH_SOLSIZE_HEX "fd4005" +// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes) +#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8) +// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization +#define N_ZERO_BYTES 12 +// Number of bytes Zcash needs out of Blake +#define ZCASH_HASH_LEN 50 +// Number of wavefronts per SIMD for the Blake kernel. +// Blake is ALU-bound (beside the atomic counter being incremented) so we need +// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer +// instructions. 10 is the max supported by the hw. +#define BLAKE_WPS 10 +// Maximum number of solutions reported by kernel to host +#define MAX_SOLS 11 +#define MAX_POTENTIAL_SOLS 4096 +// Length of SHA256 target +#define SHA256_TARGET_LEN (256 / 8) + +#ifdef OPTIM_COMPACT_ROW_COUNTERS +#define BITS_PER_ROW ((NR_SLOTS < 3) ? 2 : \ + (NR_SLOTS < 7) ? 3 : \ + (NR_SLOTS < 15) ? 4 : \ + (NR_SLOTS < 31) ? 5 : \ + (NR_SLOTS < 63) ? 6 : \ + (NR_SLOTS < 255) ? 8 : \ + (NR_SLOTS < 1023) ? 10 : \ + 16) +#else +#define BITS_PER_ROW ((NR_SLOTS < 3) ? 2 : \ + (NR_SLOTS < 15) ? 4 : \ + (NR_SLOTS < 255) ? 
8 : \ + 16) +#endif +#define ROWS_PER_UINT (32 / BITS_PER_ROW) +#define ROW_MASK ((1 << BITS_PER_ROW) - 1) + + +#define RC_SIZE ((NR_ROWS * 4 + ROWS_PER_UINT - 1) / ROWS_PER_UINT) + + + +// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values +#define SOL_SIZE ((1 << PARAM_K) * 4) +typedef struct sols_s +{ + uint nr; + uint likely_invalids; + uchar valid[MAX_SOLS]; + uint values[MAX_SOLS][(1 << PARAM_K)]; +} sols_t; + +typedef struct potential_sols_s +{ + uint nr; + uint values[MAX_POTENTIAL_SOLS][2]; +} potential_sols_t; + +#if NR_ROWS_LOG <= 12 && NR_SLOTS <= (1 << 10) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 20) | ((slot1 & 0x3ff) << 10) | (slot0 & 0x3ff)) +#define DECODE_ROW(REF) (REF >> 20) +#define DECODE_SLOT1(REF) ((REF >> 10) & 0x3ff) +#define DECODE_SLOT0(REF) (REF & 0x3ff) + +#elif NR_ROWS_LOG <= 14 && NR_SLOTS <= (1 << 9) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 18) | ((slot1 & 0x1ff) << 9) | (slot0 & 0x1ff)) +#define DECODE_ROW(REF) (REF >> 18) +#define DECODE_SLOT1(REF) ((REF >> 9) & 0x1ff) +#define DECODE_SLOT0(REF) (REF & 0x1ff) + +#elif NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) +#define DECODE_ROW(REF) (REF >> 16) +#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) +#define DECODE_SLOT0(REF) (REF & 0xff) + +#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) +#define DECODE_ROW(REF) (REF >> 14) +#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) +#define DECODE_SLOT0(REF) (REF & 0x7f) + +#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ +#define DECODE_ROW(REF) (REF >> 13) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 
<< 6) + +#define ENCODE_INPUTS(row, slot0, slot1) \ + ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) +#define DECODE_ROW(REF) (REF >> 12) +#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) +#define DECODE_SLOT0(REF) (REF & 0x3f) + +#else +#error "unsupported NR_ROWS_LOG" +#endif + +#define NEXT_PRIME_NO(n) \ + (((n) <= 2) ? 2 : \ + ((n) <= 3) ? 3 : \ + ((n) <= 5) ? 5 : \ + ((n) <= 7) ? 7 : \ + ((n) <= 11) ? 11 : \ + ((n) <= 13) ? 13 : \ + ((n) <= 17) ? 17 : \ + ((n) <= 19) ? 19 : \ + ((n) <= 23) ? 23 : \ + ((n) <= 29) ? 29 : \ + ((n) <= 31) ? 31 : \ + ((n) <= 37) ? 37 : \ + ((n) <= 41) ? 41 : \ + ((n) <= 43) ? 43 : \ + ((n) <= 47) ? 47 : \ + ((n) <= 53) ? 53 : \ + ((n) <= 59) ? 59 : \ + ((n) <= 61) ? 61 : \ + ((n) <= 67) ? 67 : \ + ((n) <= 71) ? 71 : \ + ((n) <= 73) ? 73 : \ + ((n) <= 79) ? 79 : \ + ((n) <= 83) ? 83 : \ + ((n) <= 89) ? 89 : \ + ((n) <= 97) ? 97 : \ + ((n) <= 101) ? 101 : \ + ((n) <= 103) ? 103 : \ + ((n) <= 107) ? 107 : \ + ((n) <= 109) ? 109 : \ + ((n) <= 113) ? 113 : \ + ((n) <= 127) ? 127 : \ + ((n) <= 131) ? 131 : \ + ((n) <= 137) ? 137 : \ + ((n) <= 139) ? 139 : \ + ((n) <= 149) ? 149 : \ + ((n) <= 151) ? 151 : \ + ((n) <= 157) ? 157 : \ + ((n) <= 163) ? 163 : \ + ((n) <= 167) ? 167 : \ + ((n) <= 173) ? 173 : \ + ((n) <= 179) ? 179 : \ + ((n) <= 181) ? 181 : \ + ((n) <= 191) ? 191 : \ + ((n) <= 193) ? 193 : \ + ((n) <= 197) ? 197 : \ + ((n) <= 199) ? 199 : \ + ((n) <= 211) ? 211 : \ + ((n) <= 223) ? 223 : \ + ((n) <= 227) ? 227 : \ + ((n) <= 229) ? 229 : \ + ((n) <= 233) ? 233 : \ + ((n) <= 239) ? 239 : \ + ((n) <= 241) ? 241 : \ + ((n) <= 251) ? 251 : \ + ((n) <= 257) ? 257 : \ + ((n) <= 263) ? 263 : \ + ((n) <= 269) ? 269 : \ + ((n) <= 271) ? 271 : \ + ((n) <= 277) ? 277 : \ + ((n) <= 281) ? 281 : \ + ((n) <= 283) ? 283 : \ + ((n) <= 293) ? 293 : \ + ((n) <= 307) ? 307 : \ + ((n) <= 311) ? 311 : \ + ((n) <= 313) ? 313 : \ + ((n) <= 317) ? 317 : \ + ((n) <= 331) ? 331 : \ + ((n) <= 337) ? 337 : \ + ((n) <= 347) ? 347 : \ + ((n) <= 349) ? 
349 : \ + ((n) <= 353) ? 353 : \ + ((n) <= 359) ? 359 : \ + ((n) <= 367) ? 367 : \ + ((n) <= 373) ? 373 : \ + ((n) <= 379) ? 379 : \ + ((n) <= 383) ? 383 : \ + ((n) <= 389) ? 389 : \ + ((n) <= 397) ? 397 : \ + ((n) <= 401) ? 401 : \ + ((n) <= 409) ? 409 : \ + ((n) <= 419) ? 419 : \ + ((n) <= 421) ? 421 : \ + ((n) <= 431) ? 431 : \ + ((n) <= 433) ? 433 : \ + ((n) <= 439) ? 439 : \ + ((n) <= 443) ? 443 : \ + ((n) <= 449) ? 449 : \ + ((n) <= 457) ? 457 : \ + ((n) <= 461) ? 461 : \ + ((n) <= 463) ? 463 : \ + ((n) <= 467) ? 467 : \ + ((n) <= 479) ? 479 : \ + ((n) <= 487) ? 487 : \ + ((n) <= 491) ? 491 : \ + ((n) <= 499) ? 499 : \ + ((n) <= 503) ? 503 : \ + ((n) <= 509) ? 509 : \ + ((n) <= 521) ? 521 : \ + ((n) <= 523) ? 523 : \ + ((n) <= 541) ? 541 : \ + ((n) <= 547) ? 547 : \ + ((n) <= 557) ? 557 : \ + ((n) <= 563) ? 563 : \ + ((n) <= 569) ? 569 : \ + ((n) <= 571) ? 571 : \ + ((n) <= 577) ? 577 : \ + ((n) <= 587) ? 587 : \ + ((n) <= 593) ? 593 : \ + ((n) <= 599) ? 599 : \ + ((n) <= 601) ? 601 : \ + ((n) <= 607) ? 607 : \ + ((n) <= 613) ? 613 : \ + ((n) <= 617) ? 617 : \ + ((n) <= 619) ? 619 : \ + ((n) <= 631) ? 631 : \ + ((n) <= 641) ? 641 : \ + ((n) <= 643) ? 643 : \ + ((n) <= 647) ? 647 : \ + ((n) <= 653) ? 653 : \ + ((n) <= 659) ? 659 : \ + ((n) <= 661) ? 661 : \ + ((n) <= 673) ? 673 : \ + ((n) <= 677) ? 677 : \ + ((n) <= 683) ? 683 : \ + ((n) <= 691) ? 691 : \ + ((n) <= 701) ? 701 : \ + ((n) <= 709) ? 709 : \ + ((n) <= 719) ? 719 : \ + ((n) <= 727) ? 727 : \ + ((n) <= 733) ? 733 : \ + ((n) <= 739) ? 739 : \ + ((n) <= 743) ? 743 : \ + ((n) <= 751) ? 751 : \ + ((n) <= 757) ? 757 : \ + ((n) <= 761) ? 761 : \ + ((n) <= 769) ? 769 : \ + ((n) <= 773) ? 773 : \ + ((n) <= 787) ? 787 : \ + ((n) <= 797) ? 797 : \ + ((n) <= 809) ? 809 : \ + ((n) <= 811) ? 811 : \ + ((n) <= 821) ? 821 : \ + ((n) <= 823) ? 823 : \ + ((n) <= 827) ? 827 : \ + ((n) <= 829) ? 829 : \ + ((n) <= 839) ? 839 : \ + ((n) <= 853) ? 853 : \ + ((n) <= 857) ? 857 : \ + ((n) <= 859) ? 859 : \ + ((n) <= 863) ? 
863 : \ + ((n) <= 877) ? 877 : \ + ((n) <= 881) ? 881 : \ + ((n) <= 883) ? 883 : \ + ((n) <= 887) ? 887 : \ + ((n) <= 907) ? 907 : \ + ((n) <= 911) ? 911 : \ + ((n) <= 919) ? 919 : \ + ((n) <= 929) ? 929 : \ + ((n) <= 937) ? 937 : \ + ((n) <= 941) ? 941 : \ + ((n) <= 947) ? 947 : \ + ((n) <= 953) ? 953 : \ + ((n) <= 967) ? 967 : \ + ((n) <= 971) ? 971 : \ + ((n) <= 977) ? 977 : \ + ((n) <= 983) ? 983 : \ + ((n) <= 991) ? 991 : \ + ((n) <= 997) ? 997 : \ + ((n) <= 1009) ? 1009 : \ + (n)) + +#define NEXT_POWER_OF_TWO(n) \ + (((n) <= 2) ? 2 : \ + ((n) <= 4) ? 4 : \ + ((n) <= 8) ? 8 : \ + ((n) <= 16) ? 16 : \ + ((n) <= 32) ? 32 : \ + ((n) <= 64) ? 64 : \ + ((n) <= 128) ? 128 : \ + ((n) <= 256) ? 256 : \ + ((n) <= 512) ? 512 : \ + ((n) <= 1024) ? 1024 : \ + ((n) <= 2048) ? 2048 : \ + ((n) <= 4096) ? 4096 : \ + ((n) <= 8192) ? 8192 : \ + ((n) <= 16384) ? 16384 : \ + ((n) <= 32768) ? 32768 : \ + (n)) + +#if NR_SLOTS < 255 +#define SLOT_INDEX_TYPE uchar +#elif NR_SLOTS < 65535 +#define SLOT_INDEX_TYPE ushort +#else +#error "Unsupported NR_SLOTS" +#endif diff --git a/kernel/equihash.cl b/kernel/equihash.cl index 460d20a5..73982c1a 100644 --- a/kernel/equihash.cl +++ b/kernel/equihash.cl @@ -1,833 +1,894 @@ -#include "equihash-param.h" - -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable - -/* -** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in -** bytes in parens): -** -** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1) -** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3) -** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2) -** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4) -** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3) -** round 5, table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5) -** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4) -** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6) -** round 8, table 0: cnt(4) i(4) i(4) i(4) 
i(4) i(4) pad(0) Xi( 3.0) pad(5) -** -** If the first byte of Xi is 0xAB then: -** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi -** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but -** 'A' is considered redundant padding as it was used to compute the row # -** -** - cnt is an atomic counter keeping track of the number of used slots. -** it is used in the first slot only; subsequent slots replace it with -** 4 padding bytes -** - i encodes either the 21-bit input value (round 0) or a reference to two -** inputs from the previous round -** -** Formula for Xi length and pad length above: -** > for i in range(9): -** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi -** -** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds -** is the 4 most significant bits of the last byte of Xi. -*/ - -__constant ulong blake_iv[] = -{ - 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, - 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, - 0x510e527fade682d1, 0x9b05688c2b3e6c1f, - 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, -}; - -/* -** Reset counters in hash table. -*/ -__kernel -void kernel_init_ht(__global char *ht, __global uint *rowCounters) -{ - rowCounters[get_global_id(0)] = 0; -} - -/* -** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they -** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): -** aa aa ab bb bb cc cc cd dd... [round 0] -** -------------------- -** ...ab bb bb cc cc cd dd... [odd round] -** -------------- -** ...cc cc cd dd... [next even round] -** ----- -** Bytes underlined are going to be stored in the slot. Preceding bytes -** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are -** used to compute the row number. -** -** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) -** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. 
the colliding PREFIX nibble) -** TODO: update lines below with padding nibbles -** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) -** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) -** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) -** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) -** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) -** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) -** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) -** -** Return 0 if successfully stored, or 1 if the row overflowed. -*/ -uint ht_store(uint round, __global char *ht, uint i, - ulong xi0, ulong xi1, ulong xi2, ulong xi3, __global uint *rowCounters) -{ - uint row; - __global char *p; - uint cnt; -#if NR_ROWS_LOG == 16 - if (!(round % 2)) - row = (xi0 & 0xffff); - else - // if we have in hex: "ab cd ef..." (little endian xi0) then this - // formula computes the row as 0xdebc. it skips the 'a' nibble as it - // is part of the PREFIX. 
The Xi will be stored starting with "ef..."; - // 'e' will be considered padding and 'f' is part of the current PREFIX - row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 18 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6); - else - row = ((xi0 & 0xc0000) >> 2) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 19 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5); - else - row = ((xi0 & 0xe0000) >> 1) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#elif NR_ROWS_LOG == 20 - if (!(round % 2)) - row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4); - else - row = ((xi0 & 0xf0000) >> 0) | - ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | - ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12); -#else -#error "unsupported NR_ROWS_LOG" -#endif - xi0 = (xi0 >> 16) | (xi1 << (64 - 16)); - xi1 = (xi1 >> 16) | (xi2 << (64 - 16)); - xi2 = (xi2 >> 16) | (xi3 << (64 - 16)); - p = ht + row * NR_SLOTS * SLOT_LEN; - uint rowIdx = row/ROWS_PER_UINT; - uint rowOffset = BITS_PER_ROW*(row%ROWS_PER_UINT); - uint xcnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset); - xcnt = (xcnt >> rowOffset) & ROW_MASK; - cnt = xcnt; - if (cnt >= NR_SLOTS) - { - // avoid overflows - atomic_sub(rowCounters + rowIdx, 1 << rowOffset); - return 1; - } - p += cnt * SLOT_LEN + xi_offset_for_round(round); - // store "i" (always 4 bytes before Xi) - *(__global uint *)(p - 4) = i; - if (round == 0 || round == 1) - { - // store 24 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - *(__global ulong *)(p + 16) = xi2; - } - else if (round == 2) - { - // store 20 bytes - *(__global uint *)(p + 0) = xi0; - *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); - *(__global ulong *)(p + 12) = (xi1 >> 32) | (xi2 << 32); - } - else if (round == 3) - { - // store 
16 bytes - *(__global uint *)(p + 0) = xi0; - *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32); - *(__global uint *)(p + 12) = (xi1 >> 32); - } - else if (round == 4) - { - // store 16 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global ulong *)(p + 8) = xi1; - } - else if (round == 5) - { - // store 12 bytes - *(__global ulong *)(p + 0) = xi0; - *(__global uint *)(p + 8) = xi1; - } - else if (round == 6 || round == 7) - { - // store 8 bytes - *(__global uint *)(p + 0) = xi0; - *(__global uint *)(p + 4) = (xi0 >> 32); - } - else if (round == 8) - { - // store 4 bytes - *(__global uint *)(p + 0) = xi0; - } - return 0; -} - -#define mix(va, vb, vc, vd, x, y) \ - va = (va + vb + x); \ -vd = rotate((vd ^ va), (ulong)64 - 32); \ -vc = (vc + vd); \ -vb = rotate((vb ^ vc), (ulong)64 - 24); \ -va = (va + vb + y); \ -vd = rotate((vd ^ va), (ulong)64 - 16); \ -vc = (vc + vd); \ -vb = rotate((vb ^ vc), (ulong)64 - 63); - -/* -** Execute round 0 (blake). -** -** Note: making the work group size less than or equal to the wavefront size -** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local -** Memory (LDS) Optimization 2-10" in: -** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ -*/ -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void kernel_round0(__global ulong *blake_state, __global char *ht, - __global uint *rowCounters, __global uint *debug) -{ - uint tid = get_global_id(0); - ulong v[16]; - uint inputs_per_thread = NR_INPUTS / get_global_size(0); - uint input = tid * inputs_per_thread; - uint input_end = (tid + 1) * inputs_per_thread; - uint dropped = 0; - while (input < input_end) - { - // shift "i" to occupy the high 32 bits of the second ulong word in the - // message block - ulong word1 = (ulong)input << 32; - // init vector v - v[0] = blake_state[0]; - v[1] = blake_state[1]; - v[2] = blake_state[2]; - v[3] = blake_state[3]; - v[4] = 
blake_state[4]; - v[5] = blake_state[5]; - v[6] = blake_state[6]; - v[7] = blake_state[7]; - v[8] = blake_iv[0]; - v[9] = blake_iv[1]; - v[10] = blake_iv[2]; - v[11] = blake_iv[3]; - v[12] = blake_iv[4]; - v[13] = blake_iv[5]; - v[14] = blake_iv[6]; - v[15] = blake_iv[7]; - // mix in length of data - v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; - // last block - v[14] ^= (ulong)-1; - - // round 1 - mix(v[0], v[4], v[8], v[12], 0, word1); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 2 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], word1, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 3 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, word1); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 4 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, word1); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 5 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, word1); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], 
v[4], v[9], v[14], 0, 0); - // round 6 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], word1, 0); - // round 7 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], word1, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 8 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, word1); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 9 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], word1, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 10 - mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], word1, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 11 - mix(v[0], v[4], v[8], v[12], 0, word1); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], 0, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - // round 12 - 
mix(v[0], v[4], v[8], v[12], 0, 0); - mix(v[1], v[5], v[9], v[13], 0, 0); - mix(v[2], v[6], v[10], v[14], 0, 0); - mix(v[3], v[7], v[11], v[15], 0, 0); - mix(v[0], v[5], v[10], v[15], word1, 0); - mix(v[1], v[6], v[11], v[12], 0, 0); - mix(v[2], v[7], v[8], v[13], 0, 0); - mix(v[3], v[4], v[9], v[14], 0, 0); - - // compress v into the blake state; this produces the 50-byte hash - // (two Xi values) - ulong h[7]; - h[0] = blake_state[0] ^ v[0] ^ v[8]; - h[1] = blake_state[1] ^ v[1] ^ v[9]; - h[2] = blake_state[2] ^ v[2] ^ v[10]; - h[3] = blake_state[3] ^ v[3] ^ v[11]; - h[4] = blake_state[4] ^ v[4] ^ v[12]; - h[5] = blake_state[5] ^ v[5] ^ v[13]; - h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; - - // store the two Xi values in the hash table -#if ZCASH_HASH_LEN == 50 - dropped += ht_store(0, ht, input * 2, - h[0], - h[1], - h[2], - h[3], rowCounters); - dropped += ht_store(0, ht, input * 2 + 1, - (h[3] >> 8) | (h[4] << (64 - 8)), - (h[4] >> 8) | (h[5] << (64 - 8)), - (h[5] >> 8) | (h[6] << (64 - 8)), - (h[6] >> 8), rowCounters); -#else -#error "unsupported ZCASH_HASH_LEN" -#endif - - input++; - } -#ifdef ENABLE_DEBUG - debug[tid * 2] = 0; - debug[tid * 2 + 1] = dropped; -#endif -} - -#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff)) -#define DECODE_ROW(REF) (REF >> 16) -#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff) -#define DECODE_SLOT0(REF) (REF & 0xff) - -#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f)) -#define DECODE_ROW(REF) (REF >> 14) -#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f) -#define DECODE_SLOT0(REF) (REF & 0x7f) - -#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */ -#define DECODE_ROW(REF) (REF >> 13) -#define DECODE_SLOT1(REF) ((REF 
>> 6) & 0x3f) -#define DECODE_SLOT0(REF) (REF & 0x3f) - -#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6) - -#define ENCODE_INPUTS(row, slot0, slot1) \ - ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) -#define DECODE_ROW(REF) (REF >> 12) -#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f) -#define DECODE_SLOT0(REF) (REF & 0x3f) - -#else -#error "unsupported NR_ROWS_LOG" -#endif - -/* -** Access a half-aligned long, that is a long aligned on a 4-byte boundary. -*/ -ulong half_aligned_long(__global ulong *p, uint offset) -{ - return - (((ulong)*(__global uint *)((__global char *)p + offset + 0)) << 0) | - (((ulong)*(__global uint *)((__global char *)p + offset + 4)) << 32); -} - -/* -** Access a well-aligned int. -*/ -uint well_aligned_int(__global ulong *_p, uint offset) -{ - __global char *p = (__global char *)_p; - return *(__global uint *)(p + offset); -} - -/* -** XOR a pair of Xi values computed at "round - 1" and store the result in the -** hash table being built for "round". Note that when building the table for -** even rounds we need to skip 1 padding byte present in the "round - 1" table -** (the "0xAB" byte mentioned in the description at the top of this file.) But -** also note we can't load data directly past this byte because this would -** cause an unaligned memory access which is undefined per the OpenCL spec. -** -** Return 0 if successfully stored, or 1 if the row overflowed. 
-*/ -uint xor_and_store(uint round, __global char *ht_dst, uint row, - uint slot_a, uint slot_b, __global ulong *a, __global ulong *b, - __global uint *rowCounters) -{ - ulong xi0, xi1, xi2; -#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 - // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not - // storing the byte containing bits from the previous PREFIX block for - if (round == 1 || round == 2) - { - // xor 24 bytes - xi0 = *(a++) ^ *(b++); - xi1 = *(a++) ^ *(b++); - xi2 = *a ^ *b; - if (round == 2) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8) | (xi2 << (64 - 8)); - xi2 = (xi2 >> 8); - } - } - else if (round == 3) - { - // xor 20 bytes - xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); - xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); - xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16); - } - else if (round == 4 || round == 5) - { - // xor 16 bytes - xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); - xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8); - xi2 = 0; - if (round == 4) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 6) - { - // xor 12 bytes - xi0 = *a++ ^ *b++; - xi1 = *(__global uint *)a ^ *(__global uint *)b; - xi2 = 0; - if (round == 6) - { - // skip padding byte - xi0 = (xi0 >> 8) | (xi1 << (64 - 8)); - xi1 = (xi1 >> 8); - } - } - else if (round == 7 || round == 8) - { - // xor 8 bytes - xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0); - xi1 = 0; - xi2 = 0; - if (round == 8) - { - // skip padding byte - xi0 = (xi0 >> 8); - } - } - // invalid solutions (which start happenning in round 5) have duplicate - // inputs and xor to zero, so discard them - if (!xi0 && !xi1) - return 0; -#else -#error "unsupported NR_ROWS_LOG" -#endif - return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b), - xi0, xi1, xi2, 0, rowCounters); -} - -/* -** Execute one Equihash round. 
Read from ht_src, XOR colliding pairs of Xi, -** store them in ht_dst. -*/ -void equihash_round(uint round, - __global char *ht_src, - __global char *ht_dst, - __global uint *debug, - __local uchar *first_words_data, - __local uint *collisionsData, - __local uint *collisionsNum, - __global uint *rowCountersSrc, - __global uint *rowCountersDst, - uint threadsPerRow) -{ - uint globalTid = get_global_id(0) / threadsPerRow; - uint localTid = get_local_id(0) / threadsPerRow; - uint localGroupId = get_local_id(0) % threadsPerRow; - __local uchar *first_words = &first_words_data[NR_SLOTS*localTid]; - - __global char *p; - uint cnt; - uchar mask; - uint i, j; - // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to - // make it even larger - uint n; - uint dropped_coll = 0; - uint dropped_stor = 0; - __global ulong *a, *b; - uint xi_offset; - // read first words of Xi from the previous (round - 1) hash table - xi_offset = xi_offset_for_round(round - 1); - // the mask is also computed to read data from the previous round -#if NR_ROWS_LOG <= 16 - mask = ((!(round % 2)) ? 0x0f : 0xf0); -#elif NR_ROWS_LOG == 18 - mask = ((!(round % 2)) ? 0x03 : 0x30); -#elif NR_ROWS_LOG == 19 - mask = ((!(round % 2)) ? 0x01 : 0x10); -#elif NR_ROWS_LOG == 20 - mask = 0; /* we can vastly simplify the code below */ -#else -#error "unsupported NR_ROWS_LOG" -#endif - - for (uint chunk = 0; chunk < threadsPerRow; chunk++) { - uint tid = globalTid + NR_ROWS/threadsPerRow*chunk; - uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1); -// for (uint tid = get_global_id(0)/threadsPerRow; tid < NR_ROWS; tid += get_global_size(0)/threadsPerRow) { - - uint rowIdx = tid/ROWS_PER_UINT; - uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); - cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; - cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. 
round - - *collisionsNum = 0; - p = (ht_src + tid * NR_SLOTS * SLOT_LEN); - p += xi_offset; - p += SLOT_LEN*localGroupId; - for (i = localGroupId; i < cnt; i += threadsPerRow, p += SLOT_LEN*threadsPerRow) - first_words[i] = (*(__global uchar *)p) & mask; - barrier(CLK_LOCAL_MEM_FENCE); - - if (cnt == 0) - // no elements in row, no collisions - goto part2; - // find collisions - for (i = 0; i < cnt-1; i++) - { - uchar data_i = first_words[i]; - uint collision = (localTid << 24) | (i << 12) | (i + 1 + localGroupId); - for (j = i + 1 + localGroupId; j < cnt; j += threadsPerRow) - { - if (data_i == first_words[j]) - { - uint index = atomic_inc(collisionsNum); - if (index >= LDS_COLL_SIZE) { - atomic_dec(collisionsNum); - goto part2; - } - collisionsData[index] = collision; - } - collision += threadsPerRow; - } - } - -part2: - barrier(CLK_LOCAL_MEM_FENCE); - uint totalCollisions = *collisionsNum; - for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0)) - { - uint collision = collisionsData[index]; - uint collisionThreadId = gid + (collision >> 24); - uint i = (collision >> 12) & 0xFFF; - uint j = collision & 0xFFF; - __global uchar *ptr = ht_src + collisionThreadId * NR_SLOTS * SLOT_LEN + - xi_offset; - a = (__global ulong *)(ptr + i * SLOT_LEN); - b = (__global ulong *)(ptr + j * SLOT_LEN); - dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j, - a, b, rowCountersDst); - } - } - -#ifdef ENABLE_DEBUG - debug[tid * 2] = dropped_coll; - debug[tid * 2 + 1] = dropped_stor; -#endif -} - -/* -** This defines kernel_round1, kernel_round2, ..., kernel_round7. 
-*/ -#define KERNEL_ROUND(N) \ -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) \ -void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \ - __global uint *rowCountersSrc, __global uint *rowCountersDst, \ - __global uint *debug) \ -{ \ - __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; \ - __local uint collisionsData[LDS_COLL_SIZE]; \ - __local uint collisionsNum; \ - equihash_round(N, ht_src, ht_dst, debug, first_words_data, collisionsData, \ - &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); \ -} -KERNEL_ROUND(1) -KERNEL_ROUND(2) -KERNEL_ROUND(3) -KERNEL_ROUND(4) -KERNEL_ROUND(5) -KERNEL_ROUND(6) -KERNEL_ROUND(7) - -// kernel_round8 takes an extra argument, "sols" -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void kernel_round8(__global char *ht_src, __global char *ht_dst, - __global uint *rowCountersSrc, __global uint *rowCountersDst, - __global uint *debug, __global sols_t *sols) -{ - uint tid = get_global_id(0); - __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; - __local uint collisionsData[LDS_COLL_SIZE]; - __local uint collisionsNum; - equihash_round(8, ht_src, ht_dst, debug, first_words_data, collisionsData, - &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); - if (!tid) - sols->nr = sols->likely_invalids = 0; -} - -uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot) -{ - return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN + - slot * SLOT_LEN + xi_offset - 4); -} - -/* -** Expand references to inputs. Return 1 if so far the solution appears valid, -** or 0 otherwise (an invalid solution would be a solution with duplicate -** inputs, which can be detected at the last step: round == 0). 
-*/ -uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs, - uint round) -{ - __global char *ht = htabs[round % 2]; - uint i = nr_inputs - 1; - uint j = nr_inputs * 2 - 1; - uint xi_offset = xi_offset_for_round(round); - int dup_to_watch = -1; - do - { - ins[j] = expand_ref(ht, xi_offset, - DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i])); - ins[j - 1] = expand_ref(ht, xi_offset, - DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i])); - if (!round) - { - if (dup_to_watch == -1) - dup_to_watch = ins[j]; - else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch) - return 0; - } - if (!i) - break ; - i--; - j -= 2; - } - while (1); - return 1; -} - -/* -** Verify if a potential solution is in fact valid. -*/ -void potential_sol(__global char **htabs, __global sols_t *sols, - uint ref0, uint ref1) -{ - uint nr_values; - uint values_tmp[(1 << PARAM_K)]; - uint sol_i; - uint i; - nr_values = 0; - values_tmp[nr_values++] = ref0; - values_tmp[nr_values++] = ref1; - uint round = PARAM_K - 1; - do - { - round--; - if (!expand_refs(values_tmp, nr_values, htabs, round)) - return ; - nr_values *= 2; - } - while (round > 0); - // solution appears valid, copy it to sols - sol_i = atomic_inc(&sols->nr); - if (sol_i >= MAX_SOLS) - return ; - for (i = 0; i < (1 << PARAM_K); i++) - sols->values[sol_i][i] = values_tmp[i]; - sols->valid[sol_i] = 1; -} - -/* -** Scan the hash tables to find Equihash solutions. 
-*/ -__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols, - __global uint *rowCountersSrc, __global uint *rowCountersDst) -{ - __local uint counters[WORKSIZE/THREADS_PER_ROW]; - __local uint refs[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; - __local uint data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; - __local uint collisionsNum; - __local ulong collisions[WORKSIZE*4]; - - uint globalTid = get_global_id(0) / THREADS_PER_ROW; - uint localTid = get_local_id(0) / THREADS_PER_ROW; - uint localGroupId = get_local_id(0) % THREADS_PER_ROW; - __local uint *refsPtr = &refs[NR_SLOTS*localTid]; - __local uint *dataPtr = &data[NR_SLOTS*localTid]; - - __global char *htabs[2] = { ht0, ht1 }; - __global char *hcounters[2] = { rowCountersSrc, rowCountersDst }; - uint ht_i = (PARAM_K - 1) % 2; // table filled at last round - uint cnt; - uint xi_offset = xi_offset_for_round(PARAM_K - 1); - uint i, j; - __global char *p; - uint ref_i, ref_j; - // it's ok for the collisions array to be so small, as if it fills up - // the potential solutions are likely invalid (many duplicate inputs) -// ulong collisions; -#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20 - // in the final hash table, we are looking for a match on both the bits - // part of the previous PREFIX colliding bits, and the last PREFIX bits. 
- uint mask = 0xffffff; -#else -#error "unsupported NR_ROWS_LOG" -#endif - - collisionsNum = 0; - - for (uint chunk = 0; chunk < THREADS_PER_ROW; chunk++) { - uint tid = globalTid + NR_ROWS/THREADS_PER_ROW*chunk; - p = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN; - uint rowIdx = tid/ROWS_PER_UINT; - uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT); - cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK; - cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round - p += xi_offset; - p += SLOT_LEN*localGroupId; - - for (i = get_local_id(0); i < WORKSIZE/THREADS_PER_ROW; i += get_local_size(0)) - counters[i] = 0; - for (i = localGroupId; i < cnt; i += THREADS_PER_ROW, p += SLOT_LEN*THREADS_PER_ROW) { - refsPtr[i] = *(__global uint *)(p - 4); - dataPtr[i] = (*(__global uint *)p) & mask; - } - barrier(CLK_LOCAL_MEM_FENCE); - - for (i = 0; i < cnt; i++) - { - uint a_data = dataPtr[i]; - ref_i = refsPtr[i]; - for (j = i + 1 + localGroupId; j < cnt; j += THREADS_PER_ROW) - { - if (a_data == dataPtr[j]) - { - if (atomic_inc(&counters[localTid]) == 0) - collisions[atomic_inc(&collisionsNum)] = ((ulong)ref_i << 32) | refsPtr[j]; - goto part2; - } - } - } - -part2: - continue; - } - - barrier(CLK_LOCAL_MEM_FENCE); - uint totalCollisions = collisionsNum; - if (get_local_id(0) < totalCollisions) { - ulong coll = collisions[get_local_id(0)]; - potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff); - } -} +// Gateless Gate, a Zcash miner +// Copyright 2016 zawawa @ bitcointalk.org +// +// The initial version of this software was based on: +// SILENTARMY v5 +// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil, eXtremal +// +// This program is free software : you can redistribute it and / or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +#include "equihash-param.h" + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#ifdef AMD +#pragma OPENCL EXTENSION cl_amd_vec3 : enable +#endif + + + +///////////////// +// HASH TABLES // +///////////////// + +/* +** With the new hash tables, each slot has this layout (length in bytes in parens): +** +** round 0, table 0: i(4) pad(0) Xi(24) pad(4) +** round 1, table 1: i(4) pad(3) Xi(20) pad(5) +** round 2, table 2: i(4) pad(0) Xi(19) pad(9) +** round 3, table 3: i(4) pad(3) Xi(15) pad(10) +** round 4, table 4: i(4) pad(0) Xi(14) pad(14) +** round 5, table 5: i(4) pad(3) Xi(10) pad(15) +** round 6, table 6: i(4) pad(0) Xi( 9) pad(19) +** round 7, table 7: i(4) pad(3) Xi( 5) pad(20) +** round 8, table 8: i(4) pad(0) Xi( 4) pad(24) +*/ + +typedef union { + struct { + uint xi[7]; + uint padding; + } slot; + uint8 ui8; + uint4 ui4[2]; + uint2 ui2[4]; + uint ui[8]; +#ifdef AMD + ulong3 ul3; + uint3 ui3[2]; +#endif +} slot_t; + +typedef __global slot_t *global_pointer_to_slot_t; + +#define UINTS_IN_XI(round) (((round) == 0) ? 6 : \ + ((round) == 1) ? 6 : \ + ((round) == 2) ? 5 : \ + ((round) == 3) ? 5 : \ + ((round) == 4) ? 4 : \ + ((round) == 5) ? 4 : \ + ((round) == 6) ? 3 : \ + ((round) == 7) ? 2 : \ + 1) + + + +/* +** OBSOLETE +** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they +** represent (hex notation, group of 5 hex digits are a group of PREFIX bits): +** aa aa ab bb bb cc cc cd dd... [round 0] +** -------------------- +** ...ab bb bb cc cc cd dd... [odd round] +** -------------- +** ...cc cc cd dd... 
[next even round] +** ----- +** Bytes underlined are going to be stored in the slot. Preceding bytes +** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are +** used to compute the row number. +** +** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter) +** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble) +** TODO: update lines below with padding nibbles +** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter) +** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter) +** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter) +** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter) +** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter) +** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter) +** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter) +** +** Return 0 if successfully stored, or 1 if the row overflowed. 
+*/ + +__global char *get_slot_ptr(__global char *ht, uint round, uint row, uint slot) +{ + return ht + (row * NR_SLOTS + slot) * ADJUSTED_SLOT_LEN(round); +} + +__global uint *get_xi_ptr(__global char *ht, uint round, uint row, uint slot) +{ + return (__global uint *)get_slot_ptr(ht, round, row, slot); +} + +__global uint *get_ref_ptr(__global char *ht, uint round, uint row, uint slot) +{ + return get_xi_ptr(ht, round, row, slot) + UINTS_IN_XI(round); +} + +void get_row_counters_index(uint *rowIdx, uint *rowOffset, uint row) +{ + if (ROWS_PER_UINT == 3) { + uint r = (0x55555555 * row + (row >> 1) - (row >> 3)) >> 30; + *rowIdx = (row - r) * 0xAAAAAAAB; + *rowOffset = BITS_PER_ROW * r; + } else if (ROWS_PER_UINT == 6) { + uint r = (0x55555555 * row + (row >> 1) - (row >> 3)) >> 29; + *rowIdx = (row - r) * 0xAAAAAAAB * 2; + *rowOffset = BITS_PER_ROW * r; + } else { + *rowIdx = row / ROWS_PER_UINT; + *rowOffset = BITS_PER_ROW * (row % ROWS_PER_UINT); + } +} + +uint get_row(uint round, uint xi0) +{ + uint row = 0; + + if (NR_ROWS_LOG == 12) { + if (!(round % 2)) + row = (xi0 & 0xfff); + else + row = ((xi0 & 0x0f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); + } else if (NR_ROWS_LOG == 13) { + if (!(round % 2)) + row = (xi0 & 0x1fff); + else + row = ((xi0 & 0x1f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); + } else if (NR_ROWS_LOG == 14) { + if (!(round % 2)) + row = (xi0 & 0x3fff); + else + row = ((xi0 & 0x3f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); + } else if (NR_ROWS_LOG == 15) { + if (!(round % 2)) + row = (xi0 & 0x7fff); + else + row = ((xi0 & 0x7f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); + } else if (NR_ROWS_LOG == 16) { + if (!(round % 2)) + row = (xi0 & 0xffff); + else + row = ((xi0 & 0xff0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); + } + + return row; +} + +uint get_nr_slots(__global uint *row_counters, uint row_index) +{ + uint rowIdx, rowOffset, nr_slots; + get_row_counters_index(&rowIdx, &rowOffset, row_index); + nr_slots = (row_counters[rowIdx] >> rowOffset) & 
ROW_MASK; + nr_slots = min(nr_slots, (uint)NR_SLOTS); // handle possible overflow in last round + return nr_slots; +} + +uint inc_row_counter(__global uint *rowCounters, uint row) +{ + uint rowIdx, rowOffset; + get_row_counters_index(&rowIdx, &rowOffset, row); + uint nr_slots = atomic_add(rowCounters + rowIdx, 1U << rowOffset); + nr_slots = (nr_slots >> rowOffset) & ROW_MASK; + if (nr_slots >= NR_SLOTS) { + // avoid overflows + atomic_sub(rowCounters + rowIdx, 1 << rowOffset); + } + return nr_slots; +} + + + +/* +** Reset counters in a hash table. +*/ + +__kernel +void kernel_init_ht(__global char *ht, __global uint *rowCounters, __global sols_t *sols, __global potential_sols_t *potential_sols) +{ + if (!get_global_id(0)) + sols->nr = sols->likely_invalids = potential_sols->nr = 0; + if (get_global_id(0) < RC_SIZE / 4) + rowCounters[get_global_id(0)] = 0; +} + + + +///////////// +// ROUND 0 // +///////////// + +__constant ulong blake_iv[] = +{ + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +#define mix(va, vb, vc, vd, x, y) \ + va = (va + vb + (x)); \ +vd = rotate((vd ^ va), (ulong)64 - 32); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 24); \ +va = (va + vb + (y)); \ +vd = rotate((vd ^ va), (ulong)64 - 16); \ +vc = (vc + vd); \ +vb = rotate((vb ^ vc), (ulong)64 - 63); + +/* +** Execute round 0 (blake). 
+** +** Note: making the work group size less than or equal to the wavefront size +** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local +** Memory (LDS) Optimization 2-10" in: +** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/ +*/ + +#if ZCASH_HASH_LEN != 50 +#error "unsupported ZCASH_HASH_LEN" +#endif + +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_ROUND0, 1, 1))) +void kernel_round0(__constant ulong *blake_state, __global char *ht, + __global uint *rowCounters, __global uint *debug) +{ + uint tid = get_global_id(0); +#if defined(AMD) && !defined(AMD_LEGACY) + volatile ulong v[16]; + uint xi0, xi1, xi2, xi3, xi4, xi5, xi6; + slot_t slot; +#else + ulong v[16]; + uint xi0, xi1, xi2, xi3, xi4, xi5, xi6; + slot_t slot; +#endif + ulong h[7]; + uint inputs_per_thread = (NR_INPUTS + get_global_size(0) - 1) / get_global_size(0); + uint dropped = 0; + + for (uint chunk = 0; chunk < inputs_per_thread; ++chunk) { + uint input = tid + get_global_size(0) * chunk; + + if (input < NR_INPUTS) { + // shift "i" to occupy the high 32 bits of the second ulong word in the + // message block + ulong word1 = (ulong)input << 32; + // init vector v + v[0] = blake_state[0]; + v[1] = blake_state[1]; + v[2] = blake_state[2]; + v[3] = blake_state[3]; + v[4] = blake_state[4]; + v[5] = blake_state[5]; + v[6] = blake_state[6]; + v[7] = blake_state[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4]; + v[13] = blake_iv[5]; + v[14] = blake_iv[6]; + v[15] = blake_iv[7]; + // mix in length of data + v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */; + // last block + v[14] ^= (ulong)-1; + +#if defined(AMD) && !defined(AMD_LEGACY) +#pragma unroll 1 + for (uint blake_round = 1; blake_round <= 9; ++blake_round) { +#else +#pragma unroll 9 + for (uint blake_round = 1; blake_round <= 9; ++blake_round) { +#endif + mix(v[0], 
v[4], v[8], v[12], 0, (blake_round == 1) ? word1 : 0); + mix(v[1], v[5], v[9], v[13], (blake_round == 7) ? word1 : 0, (blake_round == 4) ? word1 : 0); + mix(v[2], v[6], v[10], v[14], 0, (blake_round == 8) ? word1 : 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], (blake_round == 2) ? word1 : 0, (blake_round == 5) ? word1 : 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], (blake_round == 9) ? word1 : 0, (blake_round == 3) ? word1 : 0); + mix(v[3], v[4], v[9], v[14], (blake_round == 6) ? word1 : 0, 0); + } + // round 10 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], word1, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 11 + mix(v[0], v[4], v[8], v[12], 0, word1); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], 0, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + // round 12 + mix(v[0], v[4], v[8], v[12], 0, 0); + mix(v[1], v[5], v[9], v[13], 0, 0); + mix(v[2], v[6], v[10], v[14], 0, 0); + mix(v[3], v[7], v[11], v[15], 0, 0); + mix(v[0], v[5], v[10], v[15], word1, 0); + mix(v[1], v[6], v[11], v[12], 0, 0); + mix(v[2], v[7], v[8], v[13], 0, 0); + mix(v[3], v[4], v[9], v[14], 0, 0); + + // compress v into the blake state; this produces the 50-byte hash + // (two Xi values) + h[0] = blake_state[0] ^ v[0] ^ v[8]; + h[1] = blake_state[1] ^ v[1] ^ v[9]; + h[2] = blake_state[2] ^ v[2] ^ v[10]; + h[3] = blake_state[3] ^ v[3] ^ v[11]; + h[4] = blake_state[4] ^ v[4] ^ v[12]; + h[5] = blake_state[5] ^ v[5] ^ v[13]; + h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff; + } + + if (input < NR_INPUTS) { + // store the two Xi values in the hash table 
+#pragma unroll 1 + for (uint index = 0; index < 2; ++index) { + if (!index) { + xi0 = h[0] & 0xffffffff; xi1 = h[0] >> 32; + xi2 = h[1] & 0xffffffff; xi3 = h[1] >> 32; + xi4 = h[2] & 0xffffffff; xi5 = h[2] >> 32; + xi6 = h[3] & 0xffffffff; + } else { + xi0 = ((h[3] >> 8) | (h[4] << (64 - 8))) & 0xffffffff; xi1 = ((h[3] >> 8) | (h[4] << (64 - 8))) >> 32; + xi2 = ((h[4] >> 8) | (h[5] << (64 - 8))) & 0xffffffff; xi3 = ((h[4] >> 8) | (h[5] << (64 - 8))) >> 32; + xi4 = ((h[5] >> 8) | (h[6] << (64 - 8))) & 0xffffffff; xi5 = ((h[5] >> 8) | (h[6] << (64 - 8))) >> 32; + xi6 = (h[6] >> 8) & 0xffffffff; + } + + uint row = get_row(0, xi0); + uint nr_slots = inc_row_counter(rowCounters, row); + if (nr_slots >= NR_SLOTS) { + ++dropped; + } else { + slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8)); + slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8)); + slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8)); + slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8)); + slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8)); + slot.slot.xi[5] = ((xi6 << 24) | (xi5 >> 8)); + slot.slot.xi[UINTS_IN_XI(0)] = input * 2 + index; + __global char *p = get_slot_ptr(ht, 0, row, nr_slots); + *(__global uint8 *)p = slot.ui8; + } + } + } + } + +#ifdef ENABLE_DEBUG + debug[tid * 2] = 0; + debug[tid * 2 + 1] = dropped; +#endif +} + +/* +** XOR a pair of Xi values computed at "round - 1" and store the result in the +** hash table being built for "round". Note that when building the table for +** even rounds we need to skip 1 padding byte present in the "round - 1" table +** (the "0xAB" byte mentioned in the description at the top of this file.) But +** also note we can't load data directly past this byte because this would +** cause an unaligned memory access which is undefined per the OpenCL spec. +** +** Return 0 if successfully stored, or 1 if the row overflowed. 
+*/
+
+uint xor_and_store(uint round, __global char *ht_src, __global char *ht_dst, uint row,
+    uint slot_a, uint slot_b, __local uint *ai, __local uint *bi,
+    __global uint *rowCounters) {
+    uint ret = 0;
+    uint xi0, xi1, xi2, xi3, xi4, xi5;
+
+#if NR_ROWS_LOG < 8 || NR_ROWS_LOG > 20 /* fix: was &&, which no value satisfies, so the guard never fired */
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+    slot_t slot;
+    __global slot_t *p = 0;
+
+    if (slot_a < NR_SLOTS && slot_b < NR_SLOTS) {
+        xi0 = *ai;
+        xi1 = *(ai += NR_SLOTS);
+        if (round <= 7) xi2 = *(ai += NR_SLOTS);
+        if (round <= 6) xi3 = *(ai += NR_SLOTS);
+        if (round <= 4) xi4 = *(ai += NR_SLOTS);
+        if (round <= 2) xi5 = *(ai += NR_SLOTS);
+
+        xi0 ^= *bi;
+        xi1 ^= *(bi += NR_SLOTS);
+        if (round <= 7) xi2 ^= *(bi += NR_SLOTS);
+        if (round <= 6) xi3 ^= *(bi += NR_SLOTS);
+        if (round <= 4) xi4 ^= *(bi += NR_SLOTS);
+        if (round <= 2) xi5 ^= *(bi += NR_SLOTS);
+
+        if (!(round & 0x1)) {
+            // skip padding bytes
+            xi0 = (xi0 >> 24) | (xi1 << (32 - 24));
+
+            slot.slot.xi[0] = xi1;
+            slot.slot.xi[1] = xi2;
+            slot.slot.xi[2] = xi3;
+            slot.slot.xi[3] = xi4;
+            slot.slot.xi[4] = xi5;
+        } else {
+            slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8));
+            if (round <= 7) slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8));
+            if (round <= 6) slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8));
+            if (round <= 5) slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8));
+            if (round <= 3) slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8));
+            if (round <= 1) slot.slot.xi[5] = ((xi5 >> 8));
+        }
+        slot.slot.xi[UINTS_IN_XI(round)] = ENCODE_INPUTS(row, slot_a, slot_b);
+
+        // invalid solutions (which start happenning in round 5) have duplicate
+        // inputs and xor to zero, so discard them
+        if (xi0 || xi1) {
+            uint new_row = get_row(round, xi0);
+            uint new_slot_index = inc_row_counter(rowCounters, new_row);
+            if (new_slot_index >= NR_SLOTS) {
+                ret = 1;
+            } else {
+                p = (__global slot_t *)get_slot_ptr(ht_dst, round, new_row, new_slot_index);
+            }
+        }
+    }
+
+    if (p) {
+#ifdef OPTIM_8BYTE_WRITES
+        if (round >= 8)
+            *(__global uint2 *)p = slot.ui2[0];
+        else
+#endif
+#ifdef OPTIM_12BYTE_WRITES
+        if (round >= 7)
+            *(__global uint3 *)p = slot.ui3[0];
+        else
+#endif
+#ifdef OPTIM_16BYTE_WRITES
+        if (round >= 6)
+            *(__global uint4 *)p = slot.ui4[0];
+        else
+#endif
+#ifdef OPTIM_24BYTE_WRITES
+        if (round >= 2)
+            *(__global ulong3 *)p = slot.ul3;
+        else
+#endif
+        *(__global uint8 *)p = slot.ui8;
+    }
+    return ret;
+}
+
+uint parallel_xor_and_store(uint round, __global char *ht_src, __global char *ht_dst, uint row,
+    uint slot_a, uint slot_b, __local uint *ai, __local uint *bi,
+    __global uint *rowCounters,
+    __local SLOT_INDEX_TYPE *new_slot_indexes) {
+    uint ret = 0;
+    uint xi0, xi1, xi2, xi3, xi4, xi5;
+    uint write_index = get_local_id(0) / THREADS_PER_WRITE(round);
+    uint write_thread_index = get_local_id(0) % THREADS_PER_WRITE(round);
+    //uint write_index = get_local_id(0) % (get_local_size(0) / THREADS_PER_WRITE(round));
+    //uint write_thread_index = get_local_id(0) / (get_local_size(0) / THREADS_PER_WRITE(round));
+
+#if NR_ROWS_LOG < 8 || NR_ROWS_LOG > 20 /* fix: was &&, making the range check unsatisfiable */
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+    slot_t slot;
+    uint new_slot_index;
+    uint new_row;
+
+    if (!write_thread_index)
+        new_slot_indexes[write_index] = NR_SLOTS;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (slot_a < NR_SLOTS && slot_b < NR_SLOTS) {
+        xi0 = *ai;
+        xi1 = *(ai += NR_SLOTS);
+        if (round <= 7) xi2 = *(ai += NR_SLOTS);
+        if (round <= 6) xi3 = *(ai += NR_SLOTS);
+        if (round <= 4) xi4 = *(ai += NR_SLOTS);
+        if (round <= 2) xi5 = *(ai += NR_SLOTS);
+
+        xi0 ^= *bi;
+        xi1 ^= *(bi += NR_SLOTS);
+        if (round <= 7) xi2 ^= *(bi += NR_SLOTS);
+        if (round <= 6) xi3 ^= *(bi += NR_SLOTS);
+        if (round <= 4) xi4 ^= *(bi += NR_SLOTS);
+        if (round <= 2) xi5 ^= *(bi += NR_SLOTS);
+
+        if (!(round & 0x1)) {
+            // skip padding bytes
+            xi0 = (xi0 >> 24) | (xi1 << (32 - 24));
+
+            slot.slot.xi[0] = xi1;
+            slot.slot.xi[1] = xi2;
+            slot.slot.xi[2] = xi3;
+            slot.slot.xi[3] = xi4;
+            slot.slot.xi[4] = xi5;
+        } else {
+            slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8));
+            if (round <= 7) slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8));
+            if (round <= 6) slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8));
+            if (round <= 5) slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8));
+            if (round <= 3) slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8));
+            if (round <= 1) slot.slot.xi[5] = ((xi5 >> 8));
+        }
+        slot.slot.xi[UINTS_IN_XI(round)] = ENCODE_INPUTS(row, slot_a, slot_b);
+        new_row = get_row(round, xi0);
+
+        // invalid solutions (which start happenning in round 5) have duplicate
+        // inputs and xor to zero, so discard them
+        if ((xi0 || xi1) && !write_thread_index) {
+            new_slot_indexes[write_index] = inc_row_counter(rowCounters, new_row);
+#ifdef ENABLE_DEBUG
+            if (new_slot_indexes[write_index] >= NR_SLOTS) /* fix: was new_slot_index, which is never assigned */
+                ret = 1;
+#endif
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (new_slot_indexes[write_index] < NR_SLOTS) {
+        __global slot_t *p = (__global slot_t *)get_slot_ptr(ht_dst, round, new_row, new_slot_indexes[write_index]);
+        *(((__global uint4 *)p) + write_thread_index) = slot.ui4[write_thread_index];
+    }
+    //barrier(CLK_LOCAL_MEM_FENCE);
+    return ret;
+}
+
+/*
+** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi,
+** store them in ht_dst. Each work group processes only one row at a time.
+*/
+
+void equihash_round(uint round,
+    __global char *ht_src,
+    __global char *ht_dst,
+    __global uint *debug,
+    __local uint *slot_cache,
+    __local SLOT_INDEX_TYPE *collision_array_a,
+    __local SLOT_INDEX_TYPE *collision_array_b,
+    __local uint *nr_collisions,
+    __global uint *rowCountersSrc,
+    __global uint *rowCountersDst,
+    __local uint *bin_first_slots,
+    __local SLOT_INDEX_TYPE *bin_next_slots,
+    __local SLOT_INDEX_TYPE *new_slot_indexes)
+{
+    uint i, j;
+#ifdef ENABLE_DEBUG
+    uint dropped_coll = 0;
+    uint dropped_stor = 0;
+#endif
+
+    // the mask is also computed to read data from the previous round
+#define BIN_MASK(round) ((((round) + 1) % 2) ? 0xf000 : 0xf0000)
+#define BIN_MASK_OFFSET(round) ((((round) + 1) % 2) ? 3 * 4 : 4 * 4)
+
+#define BIN_MASK2(round) ((NR_ROWS_LOG == 12) ? ((((round) + 1) % 2) ? 0x00f0 : 0xf000) : \
+    (NR_ROWS_LOG == 13) ? ((((round) + 1) % 2) ? 0x00e0 : 0xe000) : \
+    (NR_ROWS_LOG == 14) ? ((((round) + 1) % 2) ? 0x00c0 : 0xc000) : \
+    (NR_ROWS_LOG == 15) ? ((((round) + 1) % 2) ? 0x0080 : 0x8000) : \
+    0)
+#define BIN_MASK2_OFFSET(round) ((NR_ROWS_LOG == 12) ? ((((round) + 1) % 2) ? 0 : 8) : \
+    (NR_ROWS_LOG == 13) ? ((((round) + 1) % 2) ? 1 : 9) : \
+    (NR_ROWS_LOG == 14) ? ((((round) + 1) % 2) ? 2 : 10) : \
+    (NR_ROWS_LOG == 15) ? ((((round) + 1) % 2) ? 3 : 11) : \
+    0)
+
+#define NR_BINS_LOG (20 - NR_ROWS_LOG)
+#define NR_BINS (1 << NR_BINS_LOG)
+
+
+
+    uint nr_slots = 0;
+    uint assigned_row_index = get_group_id(0);
+    if (assigned_row_index >= NR_ROWS)
+        return;
+
+    for (i = get_local_id(0); i < NR_BINS; i += get_local_size(0))
+        bin_first_slots[i] = NR_SLOTS;
+    for (i = get_local_id(0); i < NR_SLOTS; i += get_local_size(0))
+        bin_next_slots[i] = NR_SLOTS;
+    if (get_local_id(0) == 0)
+        *nr_collisions = nr_slots = get_nr_slots(rowCountersSrc, assigned_row_index);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (get_local_id(0))
+        nr_slots = *nr_collisions;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (uint phase = 0; phase < 1; ++phase) {
+
+        // Perform a radix sort as slots get loaded into LDS.
+        // Make sure all the work items in the work group enter the loop.
+ for (i = get_local_id(0); i < nr_slots; i += get_local_size(0)) { + uint slot_index = i; + uint slot_cache_index = i; +#ifdef NVIDIA + uint2 slot_data0, slot_data1, slot_data2; + if (UINTS_IN_XI(round - 1) >= 1) slot_data0 = *((__global uint2 *)get_slot_ptr(ht_src, round - 1, assigned_row_index, slot_cache_index) + 0); + if (UINTS_IN_XI(round - 1) >= 3) slot_data1 = *((__global uint2 *)get_slot_ptr(ht_src, round - 1, assigned_row_index, slot_cache_index) + 1); + if (UINTS_IN_XI(round - 1) >= 5) slot_data2 = *((__global uint2 *)get_slot_ptr(ht_src, round - 1, assigned_row_index, slot_cache_index) + 2); + + if (UINTS_IN_XI(round - 1) >= 1) slot_cache[0 * NR_SLOTS + slot_cache_index] = slot_data0.s0; + if (UINTS_IN_XI(round - 1) >= 2) slot_cache[1 * NR_SLOTS + slot_cache_index] = slot_data0.s1; + if (UINTS_IN_XI(round - 1) >= 3) slot_cache[2 * NR_SLOTS + slot_cache_index] = slot_data1.s0; + if (UINTS_IN_XI(round - 1) >= 4) slot_cache[3 * NR_SLOTS + slot_cache_index] = slot_data1.s1; + if (UINTS_IN_XI(round - 1) >= 5) slot_cache[4 * NR_SLOTS + slot_cache_index] = slot_data2.s0; + if (UINTS_IN_XI(round - 1) >= 6) slot_cache[5 * NR_SLOTS + slot_cache_index] = slot_data2.s1; + uint xi0 = slot_data0.s0; +#else + for (j = 0; j < UINTS_IN_XI(round - 1); ++j) + slot_cache[j * NR_SLOTS + slot_cache_index] = *((__global uint *)get_xi_ptr(ht_src, round - 1, assigned_row_index, slot_index) + j); + uint xi0 = slot_cache[0 * NR_SLOTS + slot_cache_index]; +#endif + uint bin_to_use = + ((xi0 & BIN_MASK(round - 1)) >> BIN_MASK_OFFSET(round - 1)) + | ((xi0 & BIN_MASK2(round - 1)) >> BIN_MASK2_OFFSET(round - 1)); + bin_next_slots[i] = atomic_xchg(&bin_first_slots[bin_to_use], i); + } + + if (!get_local_id(0)) + *nr_collisions = 0; + uint max_slot_a_index = NR_SLOTS + (get_local_size(0) - NR_SLOTS % get_local_size(0)) - 1; + barrier(CLK_LOCAL_MEM_FENCE); + for (uint slot_a_index = get_local_id(0); slot_a_index <= max_slot_a_index; slot_a_index += get_local_size(0)) { + uint 
slot_b_index = (slot_a_index < NR_SLOTS) ? bin_next_slots[slot_a_index] : NR_SLOTS; + while (slot_b_index < NR_SLOTS) { + uint coll_index = atomic_inc(nr_collisions); + if (coll_index < LDS_COLL_SIZE) { + collision_array_a[coll_index] = slot_a_index; + collision_array_b[coll_index] = slot_b_index; + } else { + atomic_dec(nr_collisions); +#ifdef ENABLE_DEBUG + ++dropped_coll; +#endif + } + slot_b_index = bin_next_slots[slot_b_index]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint nr_collisions_copy = *nr_collisions; + //barrier(CLK_LOCAL_MEM_FENCE); + while (nr_collisions_copy > 0) { + uint collision, slot_index_a = NR_SLOTS, slot_index_b = NR_SLOTS; + __local uint *slot_cache_a, *slot_cache_b; + uint write_index = get_local_id(0) / THREADS_PER_WRITE(round); + if (write_index < nr_collisions_copy) { + slot_index_a = collision_array_a[nr_collisions_copy - 1 - write_index]; + slot_index_b = collision_array_b[nr_collisions_copy - 1 - write_index]; + slot_cache_a = (__local uint *)&slot_cache[slot_index_a]; + slot_cache_b = (__local uint *)&slot_cache[slot_index_b]; + } + //barrier(CLK_LOCAL_MEM_FENCE); + if (THREADS_PER_WRITE(round) > 1) { +#ifdef ENABLE_DEBUG + //dropped_stor += +#endif + parallel_xor_and_store(round, ht_src, ht_dst, assigned_row_index, slot_index_a, slot_index_b, slot_cache_a, slot_cache_b, rowCountersDst, new_slot_indexes); + } else { +#ifdef ENABLE_DEBUG + dropped_stor += +#endif + xor_and_store(round, ht_src, ht_dst, assigned_row_index, slot_index_a, slot_index_b, slot_cache_a, slot_cache_b, rowCountersDst); + } + + if (!get_local_id(0)) + *nr_collisions -= min(*nr_collisions, (uint)get_local_size(0) / THREADS_PER_WRITE(round)); + + barrier(CLK_LOCAL_MEM_FENCE); + nr_collisions_copy = *nr_collisions; + barrier(CLK_LOCAL_MEM_FENCE); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + } + + + +#ifdef ENABLE_DEBUG + debug[get_global_id(0) * 2] = dropped_coll; + debug[get_global_id(0) * 2 + 1] = dropped_stor; +#endif +} + +/* +** This defines kernel_round1, 
kernel_round2, ..., kernel_round8. +*/ + +#define KERNEL_ROUND(kernel_name, N) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) \ +void kernel_name(__global char *ht_src, __global char *ht_dst, \ + __global uint *rowCountersSrc, __global uint *rowCountersDst, \ + __global uint *debug) \ +{ \ + __local uint slot_cache[ADJUSTED_LDS_ARRAY_SIZE(UINTS_IN_XI(N - 1) * NR_SLOTS)]; \ + __local SLOT_INDEX_TYPE collision_array_a[ADJUSTED_LDS_ARRAY_SIZE(LDS_COLL_SIZE)]; \ + __local SLOT_INDEX_TYPE collision_array_b[ADJUSTED_LDS_ARRAY_SIZE(LDS_COLL_SIZE)]; \ + __local uint nr_collisions; \ + __local uint bin_first_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_BINS)]; \ + __local SLOT_INDEX_TYPE bin_next_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)]; \ + __local SLOT_INDEX_TYPE new_slot_indexes[ADJUSTED_LDS_ARRAY_SIZE((THREADS_PER_WRITE(N) > 1) ? LOCAL_WORK_SIZE / THREADS_PER_WRITE(N) : 0)]; \ + equihash_round((N), ht_src, ht_dst, debug, slot_cache, collision_array_a, collision_array_b, \ + &nr_collisions, rowCountersSrc, rowCountersDst, bin_first_slots, bin_next_slots, new_slot_indexes); \ +} + +KERNEL_ROUND(kernel_round1, 1) +KERNEL_ROUND(kernel_round2, 2) +KERNEL_ROUND(kernel_round3, 3) +KERNEL_ROUND(kernel_round4, 4) +KERNEL_ROUND(kernel_round5, 5) +KERNEL_ROUND(kernel_round6, 6) +KERNEL_ROUND(kernel_round7, 7) +KERNEL_ROUND(kernel_round8, 8) + + + +void mark_potential_sol(__global potential_sols_t *potential_sols, uint ref0, uint ref1) +{ + uint sol_i = atomic_inc(&potential_sols->nr); + if (sol_i >= MAX_POTENTIAL_SOLS) + return; + potential_sols->values[sol_i][0] = ref0; + potential_sols->values[sol_i][1] = ref1; +} + +/* +** Scan the hash tables to find Equihash solutions. 
+*/ + +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_POTENTIAL_SOLS, 1, 1))) +void kernel_potential_sols( + __global char *ht_src, + __global potential_sols_t *potential_sols, + __global uint *rowCountersSrc) +{ + __local uint refs[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)]; + __local uint data[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)]; + + uint nr_slots; + uint i, j; + __global char *p; + uint ref_i, ref_j; + __local uint bin_first_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_BINS)]; + __local SLOT_INDEX_TYPE bin_next_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)]; + + if (!get_global_id(0)) + potential_sols->nr = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + uint assigned_row_index = (get_global_id(0) / get_local_size(0)); + if (assigned_row_index >= NR_ROWS) + return; + + __local uint nr_slots_shared; + for (i = get_local_id(0); i < NR_BINS; i += get_local_size(0)) + bin_first_slots[i] = NR_SLOTS; + for (i = get_local_id(0); i < NR_SLOTS; i += get_local_size(0)) + bin_next_slots[i] = NR_SLOTS; + if (get_local_id(0) == 0) + nr_slots_shared = nr_slots = get_nr_slots(rowCountersSrc, assigned_row_index); + barrier(CLK_LOCAL_MEM_FENCE); + if (get_local_id(0)) + nr_slots = nr_slots_shared; + + barrier(CLK_LOCAL_MEM_FENCE); + + // in the final hash table, we are looking for a match on both the bits + // part of the previous PREFIX colliding bits, and the last PREFIX bits. 
+ for (i = get_local_id(0); i < nr_slots; i += get_local_size(0)) { + ulong slot_first_8bytes = *(__global ulong *) get_slot_ptr(ht_src, PARAM_K - 1, assigned_row_index, i); + uint ref_i = refs[i] = slot_first_8bytes >> 32; + uint xi_first_4bytes = data[i] = slot_first_8bytes & 0xffffffff; + uint bin_to_use = + ((xi_first_4bytes & BIN_MASK(PARAM_K - 1)) >> BIN_MASK_OFFSET(PARAM_K - 1)) + | ((xi_first_4bytes & BIN_MASK2(PARAM_K - 1)) >> BIN_MASK2_OFFSET(PARAM_K - 1)); + bin_next_slots[i] = atomic_xchg(&bin_first_slots[bin_to_use], i); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (i = get_local_id(0); i < nr_slots; i += get_local_size(0)) { + uint data_i = data[i]; + j = bin_next_slots[i]; + while (j < NR_SLOTS) { + if (data_i == data[j]) { + mark_potential_sol(potential_sols, refs[i], refs[j]); + return; + } + j = bin_next_slots[j]; + } + } +} + + + +__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_SOLS, 1, 1))) +void kernel_sols(__global char *ht0, + __global char *ht1, + __global sols_t *sols, + __global uint *rowCountersSrc, + __global uint *rowCountersDst, + __global char *ht2, + __global char *ht3, + __global char *ht4, + __global char *ht5, + __global char *ht6, + __global char *ht7, + __global char *ht8, + __global potential_sols_t *potential_sols) +{ + __local uint inputs_a[ADJUSTED_LDS_ARRAY_SIZE(1 << PARAM_K)], inputs_b[ADJUSTED_LDS_ARRAY_SIZE(1 << (PARAM_K - 1))]; + __global char *htabs[] = { ht0, ht1, ht2, ht3, ht4, ht5, ht6, ht7, ht8 }; + + if ((get_global_id(0) / get_local_size(0)) < potential_sols->nr && (get_global_id(0) / get_local_size(0)) < MAX_POTENTIAL_SOLS) { + __local uint dup_counter; + if (get_local_id(0) == 0) { + dup_counter = 0; + inputs_a[0] = potential_sols->values[(get_global_id(0) / get_local_size(0))][0]; + inputs_a[1] = potential_sols->values[(get_global_id(0) / get_local_size(0))][1]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int round = 7; round >= 0; --round) { + if (round % 2) { + for (uint i = get_local_id(0); i 
< (1 << (8 - round)); i += get_local_size(0)) { + inputs_b[i * 2 + 1] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_a[i]), DECODE_SLOT1(inputs_a[i])); + inputs_b[i * 2] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_a[i]), DECODE_SLOT0(inputs_a[i])); + } + } else { + for (uint i = get_local_id(0); i < (1 << (8 - round)); i += get_local_size(0)) { + inputs_a[i * 2 + 1] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_b[i]), DECODE_SLOT1(inputs_b[i])); + inputs_a[i * 2] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_b[i]), DECODE_SLOT0(inputs_b[i])); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + //barrier(CLK_LOCAL_MEM_FENCE); + + int dup_to_watch = inputs_a[256 * 2 - 1]; + for (uint j = 3 + get_local_id(0); j < 256 * 2 - 2; j += get_local_size(0)) + if (inputs_a[j] == dup_to_watch) + atomic_inc(&dup_counter); + barrier(CLK_LOCAL_MEM_FENCE); + + // solution appears valid, copy it to sols + __local uint sol_i; + if (get_local_id(0) == 0 && !dup_counter) + sol_i = atomic_inc(&sols->nr); + barrier(CLK_LOCAL_MEM_FENCE); + if (sol_i < MAX_SOLS && !dup_counter) { + for (uint i = get_local_id(0); i < (1 << PARAM_K); i += get_local_size(0)) + sols->values[sol_i][i] = inputs_a[i]; + if (get_local_id(0) == 0) + sols->valid[sol_i] = 1; + } + barrier(CLK_LOCAL_MEM_FENCE); + } +} diff --git a/ocl.c b/ocl.c index 0cd615fe..620a9bc2 100644 --- a/ocl.c +++ b/ocl.c @@ -798,8 +798,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg char *kernel_names[] = {"kernel_init_ht", "kernel_round0", "kernel_round1", "kernel_round2", "kernel_round3", "kernel_round4", "kernel_round5", - "kernel_round6", "kernel_round7", "kernel_round8"}; - clState->n_extra_kernels = 1 + 9; + "kernel_round6", "kernel_round7", "kernel_round8", + "kernel_potential_sols"}; + clState->n_extra_kernels = 1 + 9 + 1; clState->extra_kernels = (cl_kernel *)malloc(sizeof(cl_kernel) * clState->n_extra_kernels); for (int i = 0; i < clState->n_extra_kernels; i++) { 
clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_names[i], &status); @@ -810,19 +811,22 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } char buffer[32]; - clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, HT_SIZE, NULL, &status); + clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(potential_sols_t), NULL, &status); snprintf(buffer, sizeof(buffer), "CLbuffer0"); if (status != CL_SUCCESS) goto out; - clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, HT_SIZE, NULL, &status); - snprintf(buffer, sizeof(buffer), "buffer1"); - if (status != CL_SUCCESS) - goto out; - clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, NR_ROWS, NULL, &status); + clState->buffer1 = NULL; + for (int i = 0; i < 9; i++) { + snprintf(buffer, sizeof(buffer), "index_buf[%d]", i); + clState->index_buf[i] = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, HT_SIZE, NULL, &status); + if (status != CL_SUCCESS) + goto out; + } + clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, RC_SIZE, NULL, &status); snprintf(buffer, sizeof(buffer), "buffer2"); if (status != CL_SUCCESS) goto out; - clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, NR_ROWS, NULL, &status); + clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, RC_SIZE, NULL, &status); snprintf(buffer, sizeof(buffer), "buffer3"); if (status != CL_SUCCESS) goto out; @@ -839,13 +843,39 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg if (status != CL_SUCCESS) goto out; + cl_mem rowCounters[] = {clState->buffer2, clState->buffer3}; + for (int round = 0; round < PARAM_K; round++) { + unsigned int num = 0; + cl_kernel *kernel = &clState->extra_kernels[1 + round]; + if (!round) { + CL_SET_ARG(clState->MidstateBuf); + CL_SET_ARG(clState->index_buf[round]); + CL_SET_ARG(rowCounters[round % 2]); + } + else { + 
CL_SET_ARG(clState->index_buf[round - 1]); + CL_SET_ARG(clState->index_buf[round]); + CL_SET_ARG(rowCounters[(round - 1) % 2]); + CL_SET_ARG(rowCounters[round % 2]); + } + CL_SET_ARG(clState->padbuffer8); + } unsigned int num = 0; - cl_kernel *kernel = &clState->kernel; + cl_kernel *kernel = &clState->extra_kernels[1 + 9]; + CL_SET_ARG(clState->index_buf[8]); CL_SET_ARG(clState->CLbuffer0); - CL_SET_ARG(clState->buffer1); + CL_SET_ARG(rowCounters[0]); + + num = 0; + kernel = &clState->kernel; + CL_SET_ARG(clState->index_buf[0]); + CL_SET_ARG(clState->index_buf[1]); CL_SET_ARG(clState->outputBuffer); - CL_SET_ARG(clState->buffer2); - CL_SET_ARG(clState->buffer3); + CL_SET_ARG(rowCounters[0]); + CL_SET_ARG(rowCounters[1]); + for (int i = 2; i < 9; i++) + CL_SET_ARG(clState->index_buf[i]); + CL_SET_ARG(clState->CLbuffer0); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Setting Kernel arguments for ALGO_EQUIHASH failed. (clSetKernelArg)", status);