diff --git a/algorithm.c b/algorithm.c
index 2f543556..ad56d833 100644
--- a/algorithm.c
+++ b/algorithm.c
@@ -1147,6 +1147,7 @@ static cl_int queue_cryptonight_kernel(_clState *clState, dev_blk_ctx *blk, __ma
}
+#define WORKSIZE clState->wsize
static cl_int queue_equihash_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
@@ -1160,43 +1161,43 @@ static cl_int queue_equihash_kernel(_clState *clState, dev_blk_ctx *blk, __maybe
uint32_t dbg[2] = {0};
status |= clEnqueueWriteBuffer(clState->commandQueue, clState->padbuffer8, CL_TRUE, 0, sizeof(dbg), &dbg, 0, NULL, NULL);
- cl_mem buf_ht[2] = {clState->CLbuffer0, clState->buffer1};
cl_mem rowCounters[2] = {clState->buffer2, clState->buffer3};
for (int round = 0; round < PARAM_K; round++) {
- size_t global_ws = NR_ROWS / ROWS_PER_UINT;
+ size_t global_ws = RC_SIZE;
size_t local_ws = 256;
unsigned int num = 0;
cl_kernel *kernel = &clState->extra_kernels[0];
// Now on every round!!!!
- CL_SET_ARG(buf_ht[round % 2]);
+ CL_SET_ARG(clState->index_buf[round]);
CL_SET_ARG(rowCounters[round % 2]);
+ CL_SET_ARG(clState->outputBuffer);
+ CL_SET_ARG(clState->CLbuffer0);
status |= clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL);
- num = 0;
kernel = &clState->extra_kernels[1 + round];
if (!round) {
- CL_SET_ARG(clState->MidstateBuf);
- CL_SET_ARG(buf_ht[round % 2]);
- CL_SET_ARG(rowCounters[round % 2]);
- work_items = threads;
+ worksize = LOCAL_WORK_SIZE_ROUND0;
+ work_items = NR_INPUTS / ROUND0_INPUTS_PER_WORK_ITEM;
}
else {
- CL_SET_ARG(buf_ht[(round - 1) % 2]);
- CL_SET_ARG(buf_ht[round % 2]);
- CL_SET_ARG(rowCounters[(round - 1) % 2]);
- CL_SET_ARG(rowCounters[round % 2]);
- work_items = NR_ROWS;
+ worksize = LOCAL_WORK_SIZE;
+ work_items = NR_ROWS * worksize;
}
- CL_SET_ARG(clState->padbuffer8);
- if (round == PARAM_K - 1)
- CL_SET_ARG(clState->outputBuffer);
- status |= clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, &work_items, &worksize, 0, NULL, NULL);
+ status |= clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[1 + round], 1, NULL, &work_items, &worksize, 0, NULL, NULL);
}
- work_items = NR_ROWS;
+
+ worksize = LOCAL_WORK_SIZE_POTENTIAL_SOLS;
+ work_items = NR_ROWS * worksize;
+ status |= clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[1 + 9], 1, NULL, &work_items, &worksize, 0, NULL, NULL);
+
+ worksize = LOCAL_WORK_SIZE_SOLS;
+ work_items = MAX_POTENTIAL_SOLS * worksize;
status |= clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, NULL, &work_items, &worksize, 0, NULL, NULL);
return status;
}
+#undef WORKSIZE
+
static algorithm_settings_t algos[] = {
// kernels starting from this will have difficulty calculated by using litecoin algorithm
diff --git a/algorithm/equihash.c b/algorithm/equihash.c
index 1f6a6b4b..35a32efd 100644
--- a/algorithm/equihash.c
+++ b/algorithm/equihash.c
@@ -34,7 +34,7 @@ static const uint8_t blake2b_sigma[12][16] = {
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
-
+
static const uint64_t blake2b_IV[8] = {
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
@@ -43,9 +43,9 @@ static const uint64_t blake2b_IV[8] = {
};
static const uint64_t blake2b_h[8] = {
- 0x6a09e667f2bdc93aULL, 0xbb67ae8584caa73bULL,
+ 0x6a09e667f2bdc93aULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
- 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+ 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x48ec89c38820de31ULL, 0x5be0cd10137e21b1ULL
};
@@ -75,8 +75,8 @@ static const uint64_t blake2b_h[8] = {
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
-
-
+
+
#define G_fast(r,i,a,b,c,d) \
a = a + b + (blake2b_sigma[r][2*i] == 1 ? m1 : 0); \
d = rotr64(d ^ a, 32); \
@@ -99,134 +99,133 @@ static const uint64_t blake2b_h[8] = {
void equihash_calc_mid_hash(uint64_t mid_hash[8], uint8_t* header) {
- uint64_t v[16], *m = (uint64_t*) header;
- for (int i = 0; i < 8; i++) {
- v[i] = blake2b_h[i];
- v[i+8] = blake2b_IV[i];
- }
- v[12] ^= 128;
- for (int r = 0; r < 12; r++) {
- ROUND(r)
- }
- for (int i = 0; i < 8; i++)
- mid_hash[i] = blake2b_h[i] ^ v[i] ^ v[i+8];
+ uint64_t v[16], *m = (uint64_t*)header;
+ for (int i = 0; i < 8; i++) {
+ v[i] = blake2b_h[i];
+ v[i + 8] = blake2b_IV[i];
+ }
+ v[12] ^= 128;
+ for (int r = 0; r < 12; r++) {
+ ROUND(r)
+ }
+ for (int i = 0; i < 8; i++)
+ mid_hash[i] = blake2b_h[i] ^ v[i] ^ v[i + 8];
}
void blake2b_hash(uint8_t *hash, uint64_t mid_hash[8], uint32_t bday) {
- uint64_t v[16], tmp[8];
- uint64_t m1 = (uint64_t) bday << 32;
- for (int i = 0; i < 8; i++) {
- v[i] = mid_hash[i];
- v[i+8] = blake2b_IV[i];
- }
- v[12] ^= 140 + sizeof(bday);
- v[14] ^= (int64_t) -1;
- for (int r = 0; r < 12; r++) {
- ROUND_fast(r)
- }
- for (int i = 0; i < 8; i++)
- tmp[i] = mid_hash[i] ^ v[i] ^ v[i+8];
- memcpy(hash, tmp, 50);
+ uint64_t v[16], tmp[8];
+ uint64_t m1 = (uint64_t)bday << 32;
+ for (int i = 0; i < 8; i++) {
+ v[i] = mid_hash[i];
+ v[i + 8] = blake2b_IV[i];
+ }
+ v[12] ^= 140 + sizeof(bday);
+ v[14] ^= (int64_t)-1;
+ for (int r = 0; r < 12; r++) {
+ ROUND_fast(r)
+ }
+ for (int i = 0; i < 8; i++)
+ tmp[i] = mid_hash[i] ^ v[i] ^ v[i + 8];
+ memcpy(hash, tmp, 50);
}
void equihash_calc_hash(uint8_t hash[25], uint64_t mid_hash[8], uint32_t bday) {
- uint8_t tmp[50];
- blake2b_hash(tmp, mid_hash, bday/2);
- memcpy(hash, tmp + (bday & 1 ? 25 : 0), 25);
+ uint8_t tmp[50];
+ blake2b_hash(tmp, mid_hash, bday / 2);
+ memcpy(hash, tmp + (bday & 1 ? 25 : 0), 25);
}
// These two copied from the ref impl, for now.
void ExpandArray(const unsigned char* in, size_t in_len,
- unsigned char* out, size_t out_len,
- size_t bit_len)
+ unsigned char* out, size_t out_len,
+ size_t bit_len)
{
- size_t byte_pad = 0;
- size_t out_width = ((bit_len+7)/8 + byte_pad);
- uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1);
-
- // The acc_bits least-significant bits of acc_value represent a bit sequence
- // in big-endian order.
- size_t acc_bits = 0;
- uint32_t acc_value = 0;
-
- size_t j = 0;
- for (size_t i = 0; i < in_len; i++) {
- acc_value = (acc_value << 8) | in[i];
- acc_bits += 8;
-
- // When we have bit_len or more bits in the accumulator, write the next
- // output element.
- if (acc_bits >= bit_len) {
- acc_bits -= bit_len;
- for (size_t x = 0; x < byte_pad; x++) {
- out[j+x] = 0;
- }
- for (size_t x = byte_pad; x < out_width; x++) {
- out[j+x] = (
- // Big-endian
- acc_value >> (acc_bits+(8*(out_width-x-1)))
- ) & (
- // Apply bit_len_mask across byte boundaries
- (bit_len_mask >> (8*(out_width-x-1))) & 0xFF
- );
- }
- j += out_width;
+ size_t byte_pad = 0;
+ size_t out_width = ((bit_len + 7) / 8 + byte_pad);
+ uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1);
+
+ // The acc_bits least-significant bits of acc_value represent a bit sequence
+ // in big-endian order.
+ size_t acc_bits = 0;
+ uint32_t acc_value = 0;
+
+ size_t j = 0;
+ for (size_t i = 0; i < in_len; i++) {
+ acc_value = (acc_value << 8) | in[i];
+ acc_bits += 8;
+
+ // When we have bit_len or more bits in the accumulator, write the next
+ // output element.
+ if (acc_bits >= bit_len) {
+ acc_bits -= bit_len;
+ for (size_t x = 0; x < byte_pad; x++) {
+ out[j + x] = 0;
+ }
+ for (size_t x = byte_pad; x < out_width; x++) {
+ out[j + x] = (
+ // Big-endian
+ acc_value >> (acc_bits + (8 * (out_width - x - 1)))
+ ) & (
+ // Apply bit_len_mask across byte boundaries
+ (bit_len_mask >> (8 * (out_width - x - 1))) & 0xFF
+ );
+ }
+ j += out_width;
+ }
}
- }
}
void CompressArray(const unsigned char* in, size_t in_len,
- unsigned char* out, size_t out_len,
- size_t bit_len, size_t byte_pad)
+ unsigned char* out, size_t out_len,
+ size_t bit_len, size_t byte_pad)
{
- size_t in_width = ((bit_len+7)/8 + byte_pad);
- uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1);
-
- // The acc_bits least-significant bits of acc_value represent a bit sequence
- // in big-endian order.
- size_t acc_bits = 0;
- uint32_t acc_value = 0;
-
- size_t j = 0;
- for (size_t i = 0; i < out_len; i++) {
- // When we have fewer than 8 bits left in the accumulator, read the next
- // input element.
- if (acc_bits < 8) {
- acc_value = acc_value << bit_len;
- for (size_t x = byte_pad; x < in_width; x++) {
- acc_value = acc_value | (
- (
- // Apply bit_len_mask across byte boundaries
- in[j+x] & ((bit_len_mask >> (8*(in_width-x-1))) & 0xFF)
- ) << (8*(in_width-x-1))
- ); // Big-endian
- }
- j += in_width;
- acc_bits += bit_len;
+ size_t in_width = ((bit_len + 7) / 8 + byte_pad);
+ uint32_t bit_len_mask = (((uint32_t)1 << bit_len) - 1);
+
+ // The acc_bits least-significant bits of acc_value represent a bit sequence
+ // in big-endian order.
+ size_t acc_bits = 0;
+ uint32_t acc_value = 0;
+
+ size_t j = 0;
+ for (size_t i = 0; i < out_len; i++) {
+ // When we have fewer than 8 bits left in the accumulator, read the next
+ // input element.
+ if (acc_bits < 8) {
+ acc_value = acc_value << bit_len;
+ for (size_t x = byte_pad; x < in_width; x++) {
+ acc_value = acc_value | (
+ (
+ // Apply bit_len_mask across byte boundaries
+ in[j + x] & ((bit_len_mask >> (8 * (in_width - x - 1))) & 0xFF)
+ ) << (8 * (in_width - x - 1))
+ ); // Big-endian
+ }
+ j += in_width;
+ acc_bits += bit_len;
+ }
+
+ acc_bits -= 8;
+ out[i] = (acc_value >> acc_bits) & 0xFF;
}
-
- acc_bits -= 8;
- out[i] = (acc_value >> acc_bits) & 0xFF;
- }
}
static inline void sort_pair(uint32_t *a, uint32_t len)
{
- uint32_t *b = a + len;
- uint32_t tmp, need_sorting = 0;
- for (uint32_t i = 0; i < len; i++) {
- if (need_sorting || a[i] > b[i]) {
- need_sorting = 1;
- tmp = a[i];
- a[i] = b[i];
- b[i] = tmp;
+ uint32_t *b = a + len;
+ uint32_t tmp, need_sorting = 0;
+ for (uint32_t i = 0; i < len; i++) {
+ if (need_sorting || a[i] > b[i]) {
+ need_sorting = 1;
+ tmp = a[i];
+ a[i] = b[i];
+ b[i] = tmp;
+ } else if (a[i] < b[i])
+ break;
}
- else if (a[i] < b[i])
- break;
- }
}
@@ -234,49 +233,49 @@ bool submit_tested_work(struct thr_info *, struct work *);
uint32_t equihash_verify_sol(struct work *work, sols_t *sols, int sol_i)
{
- uint32_t thr_id = work->thr->id;
- uint32_t *inputs = sols->values[sol_i];
- uint32_t seen_len = (1 << (PREFIX + 1)) / 8;
- uint8_t seen[seen_len];
- uint32_t i;
- uint8_t tmp;
- // look for duplicate inputs
- memset(seen, 0, seen_len);
- for (i = 0; i < (1 << PARAM_K); i++) {
-
- if (inputs[i] / 8 >= seen_len) {
- sols->valid[sol_i] = 0;
- return 0;
+ uint32_t thr_id = work->thr->id;
+ uint32_t *inputs = sols->values[sol_i];
+ uint32_t seen_len = (1 << (PREFIX + 1)) / 8;
+ uint8_t seen[(1 << (PREFIX + 1)) / 8];
+ uint32_t i;
+ uint8_t tmp;
+ // look for duplicate inputs
+ memset(seen, 0, seen_len);
+ for (i = 0; i < (1 << PARAM_K); i++) {
+
+ if (inputs[i] / 8 >= seen_len) {
+ sols->valid[sol_i] = 0;
+ return 0;
+ }
+ tmp = seen[inputs[i] / 8];
+ seen[inputs[i] / 8] |= 1 << (inputs[i] & 7);
+ if (tmp == seen[inputs[i] / 8]) {
+ // at least one input value is a duplicate
+ sols->valid[sol_i] = 0;
+ return 0;
+ }
}
- tmp = seen[inputs[i] / 8];
- seen[inputs[i] / 8] |= 1 << (inputs[i] & 7);
- if (tmp == seen[inputs[i] / 8]) {
- // at least one input value is a duplicate
- sols->valid[sol_i] = 0;
- return 0;
+ // the valid flag is already set by the GPU, but set it again because
+ // I plan to change the GPU code to not set it
+ sols->valid[sol_i] = 1;
+ // sort the pairs in place
+ for (uint32_t level = 0; level < PARAM_K; level++) {
+ for (i = 0; i < (1 << PARAM_K); i += (2 << level)) {
+ sort_pair(&inputs[i], 1 << level);
+ }
}
- }
- // the valid flag is already set by the GPU, but set it again because
- // I plan to change the GPU code to not set it
- sols->valid[sol_i] = 1;
- // sort the pairs in place
- for (uint32_t level = 0; level < PARAM_K; level++) {
- for (i = 0; i < (1 << PARAM_K); i += (2 << level)) {
- sort_pair(&inputs[i], 1 << level);
+
+ for (i = 0; i < (1 << PARAM_K); i++)
+ inputs[i] = htobe32(inputs[i]);
+
+ CompressArray((unsigned char*)inputs, 512 * 4, work->equihash_data + 143, 1344, 21, 1);
+
+ gen_hash(work->equihash_data, 1344 + 143, work->hash);
+
+ if (*(uint64_t*)(work->hash + 24) < *(uint64_t*)(work->target + 24)) {
+ submit_tested_work(work->thr, work);
}
- }
-
- for (i = 0; i < (1 << PARAM_K); i++)
- inputs[i] = htobe32(inputs[i]);
-
- CompressArray((unsigned char*) inputs, 512*4, work->equihash_data + 143, 1344, 21, 1);
-
- gen_hash(work->equihash_data, 1344 + 143, work->hash);
-
- if (*(uint64_t*) (work->hash + 24) < *(uint64_t*) (work->target + 24)) {
- submit_tested_work(work->thr, work);
- }
- return 1;
+ return 1;
}
void equihash_regenhash(struct work *work)
diff --git a/algorithm/ethash.c b/algorithm/ethash.c
index b9606d54..c4eb97a1 100644
--- a/algorithm/ethash.c
+++ b/algorithm/ethash.c
@@ -29,7 +29,7 @@ uint32_t EthCalcEpochNumber(uint8_t *SeedHash)
uint8_t TestSeedHash[32] = { 0 };
for(int Epoch = 0; Epoch < 2048; ++Epoch) {
- SHA3_256(TestSeedHash, TestSeedHash, 32);
+ SHA3_256((struct ethash_h256 *)TestSeedHash, TestSeedHash, 32);
if(!memcmp(TestSeedHash, SeedHash, 32)) return(Epoch + 1);
}
@@ -61,6 +61,10 @@ Node CalcDAGItem(const Node *CacheInputNodes, uint32_t NodeCount, uint32_t NodeI
return DAGNode;
}
+#ifdef _MSC_VER
+#define restrict
+#endif
+
// OutHash & MixHash MUST have 32 bytes allocated (at least)
void LightEthash(uint8_t *restrict OutHash, uint8_t *restrict MixHash, const uint8_t *restrict HeaderPoWHash, const Node *Cache, const uint64_t EpochNumber, const uint64_t Nonce)
{
@@ -116,7 +120,7 @@ void ethash_regenhash(struct work *work)
work->Nonce += *((uint32_t *)(work->data + 32));
applog(LOG_DEBUG, "Regenhash: First qword of input: 0x%016llX.", work->Nonce);
cg_rlock(&work->pool->data_lock);
- LightEthash(work->hash, work->mixhash, work->data, work->pool->eth_cache.dag_cache, work->eth_epoch, work->Nonce);
+ LightEthash(work->hash, work->mixhash, work->data, (Node *)work->pool->eth_cache.dag_cache, work->eth_epoch, work->Nonce);
cg_runlock(&work->pool->data_lock);
char *DbgHash = bin2hex(work->hash, 32);
diff --git a/algorithm/yescryptcommon.c b/algorithm/yescryptcommon.c
index cf7067d0..841a7838 100644
--- a/algorithm/yescryptcommon.c
+++ b/algorithm/yescryptcommon.c
@@ -320,6 +320,10 @@ yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p,
buf, sizeof(buf));
}
+#ifdef _MSC_VER
+#define __thread __declspec(thread)
+#endif
+
static int
yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
diff --git a/driver-opencl.c b/driver-opencl.c
index 0250833d..c60a7c47 100644
--- a/driver-opencl.c
+++ b/driver-opencl.c
@@ -1646,6 +1646,16 @@ static void opencl_thread_shutdown(struct thr_info *thr)
clReleaseMemObject(clState->buffer3);
if (clState->padbuffer8)
clReleaseMemObject(clState->padbuffer8);
+ for (i = 0; i < 9; i++)
+ if (clState->index_buf[i])
+ clReleaseMemObject(clState->index_buf[i]);
+ for (i = 0; i < 4; i++)
+ if (clState->BranchBuffer[i])
+ clReleaseMemObject(clState->BranchBuffer[i]);
+ if (clState->Scratchpads)
+ clReleaseMemObject(clState->Scratchpads);
+ if (clState->States)
+ clReleaseMemObject(clState->States);
clReleaseKernel(clState->kernel);
for (i = 0; i < clState->n_extra_kernels; i++)
clReleaseKernel(clState->extra_kernels[i]);
diff --git a/kernel/equihash-param.h b/kernel/equihash-param.h
index ace80692..8969a366 100644
--- a/kernel/equihash-param.h
+++ b/kernel/equihash-param.h
@@ -1,113 +1,404 @@
-#ifndef __OPENCL_VERSION__
-#define uint uint32_t
-#define uchar uint8_t
-#endif
-
-
-#define PARAM_N 200
-#define PARAM_K 9
-#define PREFIX (PARAM_N / (PARAM_K + 1))
-#define NR_INPUTS (1 << PREFIX)
-// Approximate log base 2 of number of elements in hash tables
-#define APX_NR_ELMS_LOG (PREFIX + 1)
-// Number of rows and slots is affected by this. 20 offers the best performance
-// but occasionally misses ~1% of solutions.
-#define NR_ROWS_LOG 18
-
-// Setting this to 1 might make SILENTARMY faster, see TROUBLESHOOTING.md
-#define OPTIM_SIMPLIFY_ROUND 1
-
-// Number of collision items to track, per thread
-#define THREADS_PER_ROW 8
-#define LDS_COLL_SIZE (NR_SLOTS * 8 * (64 / THREADS_PER_ROW))
-
-// Ratio of time of sleeping before rechecking if task is done (0-1)
-#define SLEEP_RECHECK_RATIO 0.60
-// Ratio of time to busy wait for the solution (0-1)
-// The higher value the higher CPU usage with Nvidia
-#define SLEEP_SKIP_RATIO 0.005
-
-// Make hash tables OVERHEAD times larger than necessary to store the average
-// number of elements per row. The ideal value is as small as possible to
-// reduce memory usage, but not too small or else elements are dropped from the
-// hash tables.
-//
-// The actual number of elements per row is closer to the theoretical average
-// (less variance) when NR_ROWS_LOG is small. So accordingly OVERHEAD can be
-// smaller.
-//
-// Even (as opposed to odd) values of OVERHEAD sometimes significantly decrease
-// performance as they cause VRAM channel conflicts.
-#if NR_ROWS_LOG == 16
-// #error "NR_ROWS_LOG = 16 is currently broken - do not use"
-#define OVERHEAD 2
-#elif NR_ROWS_LOG == 18
-#define OVERHEAD 3
-#elif NR_ROWS_LOG == 19
-#define OVERHEAD 5
-#elif NR_ROWS_LOG == 20 && OPTIM_SIMPLIFY_ROUND
-#define OVERHEAD 6
-#elif NR_ROWS_LOG == 20
-#define OVERHEAD 9
-#endif
-
-#define NR_ROWS (1 << NR_ROWS_LOG)
-#define NR_SLOTS (((1 << (APX_NR_ELMS_LOG - NR_ROWS_LOG)) * OVERHEAD))
-// Length of 1 element (slot) in byte
-#define SLOT_LEN 32
-// Total size of hash table
-#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
-// Length of Zcash block header, nonce (part of header)
-#define ZCASH_BLOCK_HEADER_LEN 140
-// Offset of nTime in header
-#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
-// Length of nonce
-#define ZCASH_NONCE_LEN 32
-// Length of encoded representation of solution size
-#define ZCASH_SOLSIZE_LEN 3
-// Solution size (1344 = 0x540) represented as a compact integer, in hex
-#define ZCASH_SOLSIZE_HEX "fd4005"
-// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
-#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
-// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
-#define N_ZERO_BYTES 12
-// Number of bytes Zcash needs out of Blake
-#define ZCASH_HASH_LEN 50
-// Number of wavefronts per SIMD for the Blake kernel.
-// Blake is ALU-bound (beside the atomic counter being incremented) so we need
-// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
-// instructions. 10 is the max supported by the hw.
-#define BLAKE_WPS 10
-// Maximum number of solutions reported by kernel to host
-#define MAX_SOLS 10
-// Length of SHA256 target
-#define SHA256_TARGET_LEN (256 / 8)
-
-#if (NR_SLOTS < 16)
-#define BITS_PER_ROW 4
-#define ROWS_PER_UINT 8
-#define ROW_MASK 0x0F
-#else
-#define BITS_PER_ROW 8
-#define ROWS_PER_UINT 4
-#define ROW_MASK 0xFF
-#endif
-
-// Optional features
-#undef ENABLE_DEBUG
-
-/*
-** Return the offset of Xi in bytes from the beginning of the slot.
-*/
-#define xi_offset_for_round(round) (8 + ((round) / 2) * 4)
-
-// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
-#define SOL_SIZE ((1 << PARAM_K) * 4)
-typedef struct sols_s
-{
- uint nr;
- uint likely_invalids;
- uchar valid[MAX_SOLS];
- uint values[MAX_SOLS][(1 << PARAM_K)];
-} sols_t;
-
+// Gateless Gate, a Zcash miner
+// Copyright 2016 zawawa @ bitcointalk.org
+//
+// The initial version of this software was based on:
+// SILENTARMY v5
+// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil
+//
+// This program is free software : you can redistribute it and / or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+
+#ifndef __OPENCL_VERSION__
+#define uint uint32_t
+#define uchar uint8_t
+#endif
+#ifdef cl_amd_fp64
+#define AMD
+#endif
+#if (defined(__Tahiti__) || defined(__Pitcairn__) || defined(__Capeverde__) || defined(__Oland__)) && !defined(AMD_LEGACY)
+#define AMD_LEGACY
+#endif
+#ifdef cl_nv_pragma_unroll
+#define NVIDIA
+#endif
+//#define ENABLE_DEBUG
+
+
+
+//
+// Parameters for Hash Tables
+//
+
+// There are PARAM_K - 1 hash tables, and each hash table has NR_ROWS rows.
+// Each row contains NR_SLOTS slots.
+
+#define NR_ROWS_LOG 12 /* 12, 13, 14, 15, or 16. */
+#define NR_SLOTS 684
+
+#define LDS_COLL_SIZE (NR_SLOTS * 67 / 100)
+
+#define LOCAL_WORK_SIZE WORKSIZE
+#define LOCAL_WORK_SIZE_SOLS WORKSIZE
+#define LOCAL_WORK_SIZE_ROUND0 WORKSIZE
+#define LOCAL_WORK_SIZE_POTENTIAL_SOLS WORKSIZE
+
+#define ROUND0_INPUTS_PER_WORK_ITEM 1
+
+#if defined(AMD)
+#define THREADS_PER_WRITE(round) (((round) <= 5) ? 2 : 1)
+#else
+#define THREADS_PER_WRITE(round) 1
+#endif
+
+#if defined(AMD) && !defined(AMD_LEGACY)
+#define OPTIM_24BYTE_WRITES
+#endif
+#define OPTIM_16BYTE_WRITES
+#if !defined(AMD_LEGACY)
+#define OPTIM_8BYTE_WRITES
+#endif
+
+//#define OPTIM_FAST_INTEGER_DIVISION
+//#define OPTIM_COMPACT_ROW_COUNTERS
+
+#define ADJUSTED_LDS_ARRAY_SIZE(n) (n)
+
+
+
+#define PARAM_N 200
+#define PARAM_K 9
+#define PREFIX (PARAM_N / (PARAM_K + 1))
+#define NR_INPUTS (1 << PREFIX)
+#define NR_ROWS (1 << NR_ROWS_LOG)
+// Length of 1 element (slot) in byte
+#define SLOT_LEN 32
+#define ADJUSTED_SLOT_LEN(round) (((round) <= 5) ? SLOT_LEN : SLOT_LEN - 16)
+// Total size of hash table
+#define HT_SIZE (NR_ROWS * NR_SLOTS * SLOT_LEN)
+// Length of Zcash block header, nonce (part of header)
+#define ZCASH_BLOCK_HEADER_LEN 140
+// Offset of nTime in header
+#define ZCASH_BLOCK_OFFSET_NTIME (4 + 3 * 32)
+// Length of nonce
+#define ZCASH_NONCE_LEN 32
+// Length of encoded representation of solution size
+#define ZCASH_SOLSIZE_LEN 3
+// Solution size (1344 = 0x540) represented as a compact integer, in hex
+#define ZCASH_SOLSIZE_HEX "fd4005"
+// Length of encoded solution (512 * 21 bits / 8 = 1344 bytes)
+#define ZCASH_SOL_LEN ((1 << PARAM_K) * (PREFIX + 1) / 8)
+// Last N_ZERO_BYTES of nonce must be zero due to my BLAKE2B optimization
+#define N_ZERO_BYTES 12
+// Number of bytes Zcash needs out of Blake
+#define ZCASH_HASH_LEN 50
+// Number of wavefronts per SIMD for the Blake kernel.
+// Blake is ALU-bound (beside the atomic counter being incremented) so we need
+// at least 2 wavefronts per SIMD to hide the 2-clock latency of integer
+// instructions. 10 is the max supported by the hw.
+#define BLAKE_WPS 10
+// Maximum number of solutions reported by kernel to host
+#define MAX_SOLS 11
+#define MAX_POTENTIAL_SOLS 4096
+// Length of SHA256 target
+#define SHA256_TARGET_LEN (256 / 8)
+
+#ifdef OPTIM_COMPACT_ROW_COUNTERS
+#define BITS_PER_ROW ((NR_SLOTS < 3) ? 2 : \
+ (NR_SLOTS < 7) ? 3 : \
+ (NR_SLOTS < 15) ? 4 : \
+ (NR_SLOTS < 31) ? 5 : \
+ (NR_SLOTS < 63) ? 6 : \
+ (NR_SLOTS < 255) ? 8 : \
+ (NR_SLOTS < 1023) ? 10 : \
+ 16)
+#else
+#define BITS_PER_ROW ((NR_SLOTS < 3) ? 2 : \
+ (NR_SLOTS < 15) ? 4 : \
+ (NR_SLOTS < 255) ? 8 : \
+ 16)
+#endif
+#define ROWS_PER_UINT (32 / BITS_PER_ROW)
+#define ROW_MASK ((1 << BITS_PER_ROW) - 1)
+
+
+#define RC_SIZE ((NR_ROWS * 4 + ROWS_PER_UINT - 1) / ROWS_PER_UINT)
+
+
+
+// An (uncompressed) solution stores (1 << PARAM_K) 32-bit values
+#define SOL_SIZE ((1 << PARAM_K) * 4)
+typedef struct sols_s
+{
+ uint nr;
+ uint likely_invalids;
+ uchar valid[MAX_SOLS];
+ uint values[MAX_SOLS][(1 << PARAM_K)];
+} sols_t;
+
+typedef struct potential_sols_s
+{
+ uint nr;
+ uint values[MAX_POTENTIAL_SOLS][2];
+} potential_sols_t;
+
+#if NR_ROWS_LOG <= 12 && NR_SLOTS <= (1 << 10)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 20) | ((slot1 & 0x3ff) << 10) | (slot0 & 0x3ff))
+#define DECODE_ROW(REF) (REF >> 20)
+#define DECODE_SLOT1(REF) ((REF >> 10) & 0x3ff)
+#define DECODE_SLOT0(REF) (REF & 0x3ff)
+
+#elif NR_ROWS_LOG <= 14 && NR_SLOTS <= (1 << 9)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 18) | ((slot1 & 0x1ff) << 9) | (slot0 & 0x1ff))
+#define DECODE_ROW(REF) (REF >> 18)
+#define DECODE_SLOT1(REF) ((REF >> 9) & 0x1ff)
+#define DECODE_SLOT0(REF) (REF & 0x1ff)
+
+#elif NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
+#define DECODE_ROW(REF) (REF >> 16)
+#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
+#define DECODE_SLOT0(REF) (REF & 0xff)
+
+#elif NR_ROWS_LOG <= 18 && NR_SLOTS <= (1 << 7)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
+#define DECODE_ROW(REF) (REF >> 14)
+#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
+#define DECODE_SLOT0(REF) (REF & 0x7f)
+
+#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
+#define DECODE_ROW(REF) (REF >> 13)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
+
+#define ENCODE_INPUTS(row, slot0, slot1) \
+ ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
+#define DECODE_ROW(REF) (REF >> 12)
+#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
+#define DECODE_SLOT0(REF) (REF & 0x3f)
+
+#else
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+#define NEXT_PRIME_NO(n) \
+ (((n) <= 2) ? 2 : \
+ ((n) <= 3) ? 3 : \
+ ((n) <= 5) ? 5 : \
+ ((n) <= 7) ? 7 : \
+ ((n) <= 11) ? 11 : \
+ ((n) <= 13) ? 13 : \
+ ((n) <= 17) ? 17 : \
+ ((n) <= 19) ? 19 : \
+ ((n) <= 23) ? 23 : \
+ ((n) <= 29) ? 29 : \
+ ((n) <= 31) ? 31 : \
+ ((n) <= 37) ? 37 : \
+ ((n) <= 41) ? 41 : \
+ ((n) <= 43) ? 43 : \
+ ((n) <= 47) ? 47 : \
+ ((n) <= 53) ? 53 : \
+ ((n) <= 59) ? 59 : \
+ ((n) <= 61) ? 61 : \
+ ((n) <= 67) ? 67 : \
+ ((n) <= 71) ? 71 : \
+ ((n) <= 73) ? 73 : \
+ ((n) <= 79) ? 79 : \
+ ((n) <= 83) ? 83 : \
+ ((n) <= 89) ? 89 : \
+ ((n) <= 97) ? 97 : \
+ ((n) <= 101) ? 101 : \
+ ((n) <= 103) ? 103 : \
+ ((n) <= 107) ? 107 : \
+ ((n) <= 109) ? 109 : \
+ ((n) <= 113) ? 113 : \
+ ((n) <= 127) ? 127 : \
+ ((n) <= 131) ? 131 : \
+ ((n) <= 137) ? 137 : \
+ ((n) <= 139) ? 139 : \
+ ((n) <= 149) ? 149 : \
+ ((n) <= 151) ? 151 : \
+ ((n) <= 157) ? 157 : \
+ ((n) <= 163) ? 163 : \
+ ((n) <= 167) ? 167 : \
+ ((n) <= 173) ? 173 : \
+ ((n) <= 179) ? 179 : \
+ ((n) <= 181) ? 181 : \
+ ((n) <= 191) ? 191 : \
+ ((n) <= 193) ? 193 : \
+ ((n) <= 197) ? 197 : \
+ ((n) <= 199) ? 199 : \
+ ((n) <= 211) ? 211 : \
+ ((n) <= 223) ? 223 : \
+ ((n) <= 227) ? 227 : \
+ ((n) <= 229) ? 229 : \
+ ((n) <= 233) ? 233 : \
+ ((n) <= 239) ? 239 : \
+ ((n) <= 241) ? 241 : \
+ ((n) <= 251) ? 251 : \
+ ((n) <= 257) ? 257 : \
+ ((n) <= 263) ? 263 : \
+ ((n) <= 269) ? 269 : \
+ ((n) <= 271) ? 271 : \
+ ((n) <= 277) ? 277 : \
+ ((n) <= 281) ? 281 : \
+ ((n) <= 283) ? 283 : \
+ ((n) <= 293) ? 293 : \
+ ((n) <= 307) ? 307 : \
+ ((n) <= 311) ? 311 : \
+ ((n) <= 313) ? 313 : \
+ ((n) <= 317) ? 317 : \
+ ((n) <= 331) ? 331 : \
+ ((n) <= 337) ? 337 : \
+ ((n) <= 347) ? 347 : \
+ ((n) <= 349) ? 349 : \
+ ((n) <= 353) ? 353 : \
+ ((n) <= 359) ? 359 : \
+ ((n) <= 367) ? 367 : \
+ ((n) <= 373) ? 373 : \
+ ((n) <= 379) ? 379 : \
+ ((n) <= 383) ? 383 : \
+ ((n) <= 389) ? 389 : \
+ ((n) <= 397) ? 397 : \
+ ((n) <= 401) ? 401 : \
+ ((n) <= 409) ? 409 : \
+ ((n) <= 419) ? 419 : \
+ ((n) <= 421) ? 421 : \
+ ((n) <= 431) ? 431 : \
+ ((n) <= 433) ? 433 : \
+ ((n) <= 439) ? 439 : \
+ ((n) <= 443) ? 443 : \
+ ((n) <= 449) ? 449 : \
+ ((n) <= 457) ? 457 : \
+ ((n) <= 461) ? 461 : \
+ ((n) <= 463) ? 463 : \
+ ((n) <= 467) ? 467 : \
+ ((n) <= 479) ? 479 : \
+ ((n) <= 487) ? 487 : \
+ ((n) <= 491) ? 491 : \
+ ((n) <= 499) ? 499 : \
+ ((n) <= 503) ? 503 : \
+ ((n) <= 509) ? 509 : \
+ ((n) <= 521) ? 521 : \
+ ((n) <= 523) ? 523 : \
+ ((n) <= 541) ? 541 : \
+ ((n) <= 547) ? 547 : \
+ ((n) <= 557) ? 557 : \
+ ((n) <= 563) ? 563 : \
+ ((n) <= 569) ? 569 : \
+ ((n) <= 571) ? 571 : \
+ ((n) <= 577) ? 577 : \
+ ((n) <= 587) ? 587 : \
+ ((n) <= 593) ? 593 : \
+ ((n) <= 599) ? 599 : \
+ ((n) <= 601) ? 601 : \
+ ((n) <= 607) ? 607 : \
+ ((n) <= 613) ? 613 : \
+ ((n) <= 617) ? 617 : \
+ ((n) <= 619) ? 619 : \
+ ((n) <= 631) ? 631 : \
+ ((n) <= 641) ? 641 : \
+ ((n) <= 643) ? 643 : \
+ ((n) <= 647) ? 647 : \
+ ((n) <= 653) ? 653 : \
+ ((n) <= 659) ? 659 : \
+ ((n) <= 661) ? 661 : \
+ ((n) <= 673) ? 673 : \
+ ((n) <= 677) ? 677 : \
+ ((n) <= 683) ? 683 : \
+ ((n) <= 691) ? 691 : \
+ ((n) <= 701) ? 701 : \
+ ((n) <= 709) ? 709 : \
+ ((n) <= 719) ? 719 : \
+ ((n) <= 727) ? 727 : \
+ ((n) <= 733) ? 733 : \
+ ((n) <= 739) ? 739 : \
+ ((n) <= 743) ? 743 : \
+ ((n) <= 751) ? 751 : \
+ ((n) <= 757) ? 757 : \
+ ((n) <= 761) ? 761 : \
+ ((n) <= 769) ? 769 : \
+ ((n) <= 773) ? 773 : \
+ ((n) <= 787) ? 787 : \
+ ((n) <= 797) ? 797 : \
+ ((n) <= 809) ? 809 : \
+ ((n) <= 811) ? 811 : \
+ ((n) <= 821) ? 821 : \
+ ((n) <= 823) ? 823 : \
+ ((n) <= 827) ? 827 : \
+ ((n) <= 829) ? 829 : \
+ ((n) <= 839) ? 839 : \
+ ((n) <= 853) ? 853 : \
+ ((n) <= 857) ? 857 : \
+ ((n) <= 859) ? 859 : \
+ ((n) <= 863) ? 863 : \
+ ((n) <= 877) ? 877 : \
+ ((n) <= 881) ? 881 : \
+ ((n) <= 883) ? 883 : \
+ ((n) <= 887) ? 887 : \
+ ((n) <= 907) ? 907 : \
+ ((n) <= 911) ? 911 : \
+ ((n) <= 919) ? 919 : \
+ ((n) <= 929) ? 929 : \
+ ((n) <= 937) ? 937 : \
+ ((n) <= 941) ? 941 : \
+ ((n) <= 947) ? 947 : \
+ ((n) <= 953) ? 953 : \
+ ((n) <= 967) ? 967 : \
+ ((n) <= 971) ? 971 : \
+ ((n) <= 977) ? 977 : \
+ ((n) <= 983) ? 983 : \
+ ((n) <= 991) ? 991 : \
+ ((n) <= 997) ? 997 : \
+ ((n) <= 1009) ? 1009 : \
+ (n))
+
+#define NEXT_POWER_OF_TWO(n) \
+ (((n) <= 2) ? 2 : \
+ ((n) <= 4) ? 4 : \
+ ((n) <= 8) ? 8 : \
+ ((n) <= 16) ? 16 : \
+ ((n) <= 32) ? 32 : \
+ ((n) <= 64) ? 64 : \
+ ((n) <= 128) ? 128 : \
+ ((n) <= 256) ? 256 : \
+ ((n) <= 512) ? 512 : \
+ ((n) <= 1024) ? 1024 : \
+ ((n) <= 2048) ? 2048 : \
+ ((n) <= 4096) ? 4096 : \
+ ((n) <= 8192) ? 8192 : \
+ ((n) <= 16384) ? 16384 : \
+ ((n) <= 32768) ? 32768 : \
+ (n))
+
+#if NR_SLOTS < 255
+#define SLOT_INDEX_TYPE uchar
+#elif NR_SLOTS < 65535
+#define SLOT_INDEX_TYPE ushort
+#else
+#error "Unsupported NR_SLOTS"
+#endif
diff --git a/kernel/equihash.cl b/kernel/equihash.cl
index 460d20a5..73982c1a 100644
--- a/kernel/equihash.cl
+++ b/kernel/equihash.cl
@@ -1,833 +1,894 @@
-#include "equihash-param.h"
-
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-
-/*
-** Assuming NR_ROWS_LOG == 16, the hash table slots have this layout (length in
-** bytes in parens):
-**
-** round 0, table 0: cnt(4) i(4) pad(0) Xi(23.0) pad(1)
-** round 1, table 1: cnt(4) i(4) pad(0.5) Xi(20.5) pad(3)
-** round 2, table 0: cnt(4) i(4) i(4) pad(0) Xi(18.0) pad(2)
-** round 3, table 1: cnt(4) i(4) i(4) pad(0.5) Xi(15.5) pad(4)
-** round 4, table 0: cnt(4) i(4) i(4) i(4) pad(0) Xi(13.0) pad(3)
-** round 5, table 1: cnt(4) i(4) i(4) i(4) pad(0.5) Xi(10.5) pad(5)
-** round 6, table 0: cnt(4) i(4) i(4) i(4) i(4) pad(0) Xi( 8.0) pad(4)
-** round 7, table 1: cnt(4) i(4) i(4) i(4) i(4) pad(0.5) Xi( 5.5) pad(6)
-** round 8, table 0: cnt(4) i(4) i(4) i(4) i(4) i(4) pad(0) Xi( 3.0) pad(5)
-**
-** If the first byte of Xi is 0xAB then:
-** - on even rounds, 'A' is part of the colliding PREFIX, 'B' is part of Xi
-** - on odd rounds, 'A' and 'B' are both part of the colliding PREFIX, but
-** 'A' is considered redundant padding as it was used to compute the row #
-**
-** - cnt is an atomic counter keeping track of the number of used slots.
-** it is used in the first slot only; subsequent slots replace it with
-** 4 padding bytes
-** - i encodes either the 21-bit input value (round 0) or a reference to two
-** inputs from the previous round
-**
-** Formula for Xi length and pad length above:
-** > for i in range(9):
-** > xi=(200-20*i-NR_ROWS_LOG)/8.; ci=8+4*((i)/2); print xi,32-ci-xi
-**
-** Note that the fractional .5-byte/4-bit padding following Xi for odd rounds
-** is the 4 most significant bits of the last byte of Xi.
-*/
-
-__constant ulong blake_iv[] =
-{
- 0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
- 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
- 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
- 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
-};
-
-/*
-** Reset counters in hash table.
-*/
-__kernel
-void kernel_init_ht(__global char *ht, __global uint *rowCounters)
-{
- rowCounters[get_global_id(0)] = 0;
-}
-
-/*
-** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they
-** represent (hex notation, group of 5 hex digits are a group of PREFIX bits):
-** aa aa ab bb bb cc cc cd dd... [round 0]
-** --------------------
-** ...ab bb bb cc cc cd dd... [odd round]
-** --------------
-** ...cc cc cd dd... [next even round]
-** -----
-** Bytes underlined are going to be stored in the slot. Preceding bytes
-** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are
-** used to compute the row number.
-**
-** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter)
-** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble)
-** TODO: update lines below with padding nibbles
-** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter)
-** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter)
-** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter)
-** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter)
-** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter)
-** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter)
-** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter)
-**
-** Return 0 if successfully stored, or 1 if the row overflowed.
-*/
-uint ht_store(uint round, __global char *ht, uint i,
- ulong xi0, ulong xi1, ulong xi2, ulong xi3, __global uint *rowCounters)
-{
- uint row;
- __global char *p;
- uint cnt;
-#if NR_ROWS_LOG == 16
- if (!(round % 2))
- row = (xi0 & 0xffff);
- else
- // if we have in hex: "ab cd ef..." (little endian xi0) then this
- // formula computes the row as 0xdebc. it skips the 'a' nibble as it
- // is part of the PREFIX. The Xi will be stored starting with "ef...";
- // 'e' will be considered padding and 'f' is part of the current PREFIX
- row = ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
- ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#elif NR_ROWS_LOG == 18
- if (!(round % 2))
- row = (xi0 & 0xffff) | ((xi0 & 0xc00000) >> 6);
- else
- row = ((xi0 & 0xc0000) >> 2) |
- ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
- ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#elif NR_ROWS_LOG == 19
- if (!(round % 2))
- row = (xi0 & 0xffff) | ((xi0 & 0xe00000) >> 5);
- else
- row = ((xi0 & 0xe0000) >> 1) |
- ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
- ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#elif NR_ROWS_LOG == 20
- if (!(round % 2))
- row = (xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4);
- else
- row = ((xi0 & 0xf0000) >> 0) |
- ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) |
- ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12);
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
- xi0 = (xi0 >> 16) | (xi1 << (64 - 16));
- xi1 = (xi1 >> 16) | (xi2 << (64 - 16));
- xi2 = (xi2 >> 16) | (xi3 << (64 - 16));
- p = ht + row * NR_SLOTS * SLOT_LEN;
- uint rowIdx = row/ROWS_PER_UINT;
- uint rowOffset = BITS_PER_ROW*(row%ROWS_PER_UINT);
- uint xcnt = atomic_add(rowCounters + rowIdx, 1 << rowOffset);
- xcnt = (xcnt >> rowOffset) & ROW_MASK;
- cnt = xcnt;
- if (cnt >= NR_SLOTS)
- {
- // avoid overflows
- atomic_sub(rowCounters + rowIdx, 1 << rowOffset);
- return 1;
- }
- p += cnt * SLOT_LEN + xi_offset_for_round(round);
- // store "i" (always 4 bytes before Xi)
- *(__global uint *)(p - 4) = i;
- if (round == 0 || round == 1)
- {
- // store 24 bytes
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- *(__global ulong *)(p + 16) = xi2;
- }
- else if (round == 2)
- {
- // store 20 bytes
- *(__global uint *)(p + 0) = xi0;
- *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32);
- *(__global ulong *)(p + 12) = (xi1 >> 32) | (xi2 << 32);
- }
- else if (round == 3)
- {
- // store 16 bytes
- *(__global uint *)(p + 0) = xi0;
- *(__global ulong *)(p + 4) = (xi0 >> 32) | (xi1 << 32);
- *(__global uint *)(p + 12) = (xi1 >> 32);
- }
- else if (round == 4)
- {
- // store 16 bytes
- *(__global ulong *)(p + 0) = xi0;
- *(__global ulong *)(p + 8) = xi1;
- }
- else if (round == 5)
- {
- // store 12 bytes
- *(__global ulong *)(p + 0) = xi0;
- *(__global uint *)(p + 8) = xi1;
- }
- else if (round == 6 || round == 7)
- {
- // store 8 bytes
- *(__global uint *)(p + 0) = xi0;
- *(__global uint *)(p + 4) = (xi0 >> 32);
- }
- else if (round == 8)
- {
- // store 4 bytes
- *(__global uint *)(p + 0) = xi0;
- }
- return 0;
-}
-
-#define mix(va, vb, vc, vd, x, y) \
- va = (va + vb + x); \
-vd = rotate((vd ^ va), (ulong)64 - 32); \
-vc = (vc + vd); \
-vb = rotate((vb ^ vc), (ulong)64 - 24); \
-va = (va + vb + y); \
-vd = rotate((vd ^ va), (ulong)64 - 16); \
-vc = (vc + vd); \
-vb = rotate((vb ^ vc), (ulong)64 - 63);
-
-/*
-** Execute round 0 (blake).
-**
-** Note: making the work group size less than or equal to the wavefront size
-** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local
-** Memory (LDS) Optimization 2-10" in:
-** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/
-*/
-__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-void kernel_round0(__global ulong *blake_state, __global char *ht,
- __global uint *rowCounters, __global uint *debug)
-{
- uint tid = get_global_id(0);
- ulong v[16];
- uint inputs_per_thread = NR_INPUTS / get_global_size(0);
- uint input = tid * inputs_per_thread;
- uint input_end = (tid + 1) * inputs_per_thread;
- uint dropped = 0;
- while (input < input_end)
- {
- // shift "i" to occupy the high 32 bits of the second ulong word in the
- // message block
- ulong word1 = (ulong)input << 32;
- // init vector v
- v[0] = blake_state[0];
- v[1] = blake_state[1];
- v[2] = blake_state[2];
- v[3] = blake_state[3];
- v[4] = blake_state[4];
- v[5] = blake_state[5];
- v[6] = blake_state[6];
- v[7] = blake_state[7];
- v[8] = blake_iv[0];
- v[9] = blake_iv[1];
- v[10] = blake_iv[2];
- v[11] = blake_iv[3];
- v[12] = blake_iv[4];
- v[13] = blake_iv[5];
- v[14] = blake_iv[6];
- v[15] = blake_iv[7];
- // mix in length of data
- v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */;
- // last block
- v[14] ^= (ulong)-1;
-
- // round 1
- mix(v[0], v[4], v[8], v[12], 0, word1);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 2
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], word1, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 3
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, word1);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 4
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, word1);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 5
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, word1);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 6
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], word1, 0);
- // round 7
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], word1, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 8
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, word1);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 9
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], word1, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 10
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], word1, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 11
- mix(v[0], v[4], v[8], v[12], 0, word1);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], 0, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
- // round 12
- mix(v[0], v[4], v[8], v[12], 0, 0);
- mix(v[1], v[5], v[9], v[13], 0, 0);
- mix(v[2], v[6], v[10], v[14], 0, 0);
- mix(v[3], v[7], v[11], v[15], 0, 0);
- mix(v[0], v[5], v[10], v[15], word1, 0);
- mix(v[1], v[6], v[11], v[12], 0, 0);
- mix(v[2], v[7], v[8], v[13], 0, 0);
- mix(v[3], v[4], v[9], v[14], 0, 0);
-
- // compress v into the blake state; this produces the 50-byte hash
- // (two Xi values)
- ulong h[7];
- h[0] = blake_state[0] ^ v[0] ^ v[8];
- h[1] = blake_state[1] ^ v[1] ^ v[9];
- h[2] = blake_state[2] ^ v[2] ^ v[10];
- h[3] = blake_state[3] ^ v[3] ^ v[11];
- h[4] = blake_state[4] ^ v[4] ^ v[12];
- h[5] = blake_state[5] ^ v[5] ^ v[13];
- h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
-
- // store the two Xi values in the hash table
-#if ZCASH_HASH_LEN == 50
- dropped += ht_store(0, ht, input * 2,
- h[0],
- h[1],
- h[2],
- h[3], rowCounters);
- dropped += ht_store(0, ht, input * 2 + 1,
- (h[3] >> 8) | (h[4] << (64 - 8)),
- (h[4] >> 8) | (h[5] << (64 - 8)),
- (h[5] >> 8) | (h[6] << (64 - 8)),
- (h[6] >> 8), rowCounters);
-#else
-#error "unsupported ZCASH_HASH_LEN"
-#endif
-
- input++;
- }
-#ifdef ENABLE_DEBUG
- debug[tid * 2] = 0;
- debug[tid * 2 + 1] = dropped;
-#endif
-}
-
-#if NR_ROWS_LOG <= 16 && NR_SLOTS <= (1 << 8)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 16) | ((slot1 & 0xff) << 8) | (slot0 & 0xff))
-#define DECODE_ROW(REF) (REF >> 16)
-#define DECODE_SLOT1(REF) ((REF >> 8) & 0xff)
-#define DECODE_SLOT0(REF) (REF & 0xff)
-
-#elif NR_ROWS_LOG == 18 && NR_SLOTS <= (1 << 7)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 14) | ((slot1 & 0x7f) << 7) | (slot0 & 0x7f))
-#define DECODE_ROW(REF) (REF >> 14)
-#define DECODE_SLOT1(REF) ((REF >> 7) & 0x7f)
-#define DECODE_SLOT0(REF) (REF & 0x7f)
-
-#elif NR_ROWS_LOG == 19 && NR_SLOTS <= (1 << 6)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 13) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f)) /* 1 spare bit */
-#define DECODE_ROW(REF) (REF >> 13)
-#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
-#define DECODE_SLOT0(REF) (REF & 0x3f)
-
-#elif NR_ROWS_LOG == 20 && NR_SLOTS <= (1 << 6)
-
-#define ENCODE_INPUTS(row, slot0, slot1) \
- ((row << 12) | ((slot1 & 0x3f) << 6) | (slot0 & 0x3f))
-#define DECODE_ROW(REF) (REF >> 12)
-#define DECODE_SLOT1(REF) ((REF >> 6) & 0x3f)
-#define DECODE_SLOT0(REF) (REF & 0x3f)
-
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-
-/*
-** Access a half-aligned long, that is a long aligned on a 4-byte boundary.
-*/
-ulong half_aligned_long(__global ulong *p, uint offset)
-{
- return
- (((ulong)*(__global uint *)((__global char *)p + offset + 0)) << 0) |
- (((ulong)*(__global uint *)((__global char *)p + offset + 4)) << 32);
-}
-
-/*
-** Access a well-aligned int.
-*/
-uint well_aligned_int(__global ulong *_p, uint offset)
-{
- __global char *p = (__global char *)_p;
- return *(__global uint *)(p + offset);
-}
-
-/*
-** XOR a pair of Xi values computed at "round - 1" and store the result in the
-** hash table being built for "round". Note that when building the table for
-** even rounds we need to skip 1 padding byte present in the "round - 1" table
-** (the "0xAB" byte mentioned in the description at the top of this file.) But
-** also note we can't load data directly past this byte because this would
-** cause an unaligned memory access which is undefined per the OpenCL spec.
-**
-** Return 0 if successfully stored, or 1 if the row overflowed.
-*/
-uint xor_and_store(uint round, __global char *ht_dst, uint row,
- uint slot_a, uint slot_b, __global ulong *a, __global ulong *b,
- __global uint *rowCounters)
-{
- ulong xi0, xi1, xi2;
-#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
- // Note: for NR_ROWS_LOG == 20, for odd rounds, we could optimize by not
- // storing the byte containing bits from the previous PREFIX block for
- if (round == 1 || round == 2)
- {
- // xor 24 bytes
- xi0 = *(a++) ^ *(b++);
- xi1 = *(a++) ^ *(b++);
- xi2 = *a ^ *b;
- if (round == 2)
- {
- // skip padding byte
- xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
- xi1 = (xi1 >> 8) | (xi2 << (64 - 8));
- xi2 = (xi2 >> 8);
- }
- }
- else if (round == 3)
- {
- // xor 20 bytes
- xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
- xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8);
- xi2 = well_aligned_int(a, 16) ^ well_aligned_int(b, 16);
- }
- else if (round == 4 || round == 5)
- {
- // xor 16 bytes
- xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
- xi1 = half_aligned_long(a, 8) ^ half_aligned_long(b, 8);
- xi2 = 0;
- if (round == 4)
- {
- // skip padding byte
- xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
- xi1 = (xi1 >> 8);
- }
- }
- else if (round == 6)
- {
- // xor 12 bytes
- xi0 = *a++ ^ *b++;
- xi1 = *(__global uint *)a ^ *(__global uint *)b;
- xi2 = 0;
- if (round == 6)
- {
- // skip padding byte
- xi0 = (xi0 >> 8) | (xi1 << (64 - 8));
- xi1 = (xi1 >> 8);
- }
- }
- else if (round == 7 || round == 8)
- {
- // xor 8 bytes
- xi0 = half_aligned_long(a, 0) ^ half_aligned_long(b, 0);
- xi1 = 0;
- xi2 = 0;
- if (round == 8)
- {
- // skip padding byte
- xi0 = (xi0 >> 8);
- }
- }
- // invalid solutions (which start happenning in round 5) have duplicate
- // inputs and xor to zero, so discard them
- if (!xi0 && !xi1)
- return 0;
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
- return ht_store(round, ht_dst, ENCODE_INPUTS(row, slot_a, slot_b),
- xi0, xi1, xi2, 0, rowCounters);
-}
-
-/*
-** Execute one Equihash round. Read from ht_src, XOR colliding pairs of Xi,
-** store them in ht_dst.
-*/
-void equihash_round(uint round,
- __global char *ht_src,
- __global char *ht_dst,
- __global uint *debug,
- __local uchar *first_words_data,
- __local uint *collisionsData,
- __local uint *collisionsNum,
- __global uint *rowCountersSrc,
- __global uint *rowCountersDst,
- uint threadsPerRow)
-{
- uint globalTid = get_global_id(0) / threadsPerRow;
- uint localTid = get_local_id(0) / threadsPerRow;
- uint localGroupId = get_local_id(0) % threadsPerRow;
- __local uchar *first_words = &first_words_data[NR_SLOTS*localTid];
-
- __global char *p;
- uint cnt;
- uchar mask;
- uint i, j;
- // NR_SLOTS is already oversized (by a factor of OVERHEAD), but we want to
- // make it even larger
- uint n;
- uint dropped_coll = 0;
- uint dropped_stor = 0;
- __global ulong *a, *b;
- uint xi_offset;
- // read first words of Xi from the previous (round - 1) hash table
- xi_offset = xi_offset_for_round(round - 1);
- // the mask is also computed to read data from the previous round
-#if NR_ROWS_LOG <= 16
- mask = ((!(round % 2)) ? 0x0f : 0xf0);
-#elif NR_ROWS_LOG == 18
- mask = ((!(round % 2)) ? 0x03 : 0x30);
-#elif NR_ROWS_LOG == 19
- mask = ((!(round % 2)) ? 0x01 : 0x10);
-#elif NR_ROWS_LOG == 20
- mask = 0; /* we can vastly simplify the code below */
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-
- for (uint chunk = 0; chunk < threadsPerRow; chunk++) {
- uint tid = globalTid + NR_ROWS/threadsPerRow*chunk;
- uint gid = tid & ~(get_local_size(0) / threadsPerRow - 1);
-// for (uint tid = get_global_id(0)/threadsPerRow; tid < NR_ROWS; tid += get_global_size(0)/threadsPerRow) {
-
- uint rowIdx = tid/ROWS_PER_UINT;
- uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT);
- cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK;
- cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in prev. round
-
- *collisionsNum = 0;
- p = (ht_src + tid * NR_SLOTS * SLOT_LEN);
- p += xi_offset;
- p += SLOT_LEN*localGroupId;
- for (i = localGroupId; i < cnt; i += threadsPerRow, p += SLOT_LEN*threadsPerRow)
- first_words[i] = (*(__global uchar *)p) & mask;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (cnt == 0)
- // no elements in row, no collisions
- goto part2;
- // find collisions
- for (i = 0; i < cnt-1; i++)
- {
- uchar data_i = first_words[i];
- uint collision = (localTid << 24) | (i << 12) | (i + 1 + localGroupId);
- for (j = i + 1 + localGroupId; j < cnt; j += threadsPerRow)
- {
- if (data_i == first_words[j])
- {
- uint index = atomic_inc(collisionsNum);
- if (index >= LDS_COLL_SIZE) {
- atomic_dec(collisionsNum);
- goto part2;
- }
- collisionsData[index] = collision;
- }
- collision += threadsPerRow;
- }
- }
-
-part2:
- barrier(CLK_LOCAL_MEM_FENCE);
- uint totalCollisions = *collisionsNum;
- for (uint index = get_local_id(0); index < totalCollisions; index += get_local_size(0))
- {
- uint collision = collisionsData[index];
- uint collisionThreadId = gid + (collision >> 24);
- uint i = (collision >> 12) & 0xFFF;
- uint j = collision & 0xFFF;
- __global uchar *ptr = ht_src + collisionThreadId * NR_SLOTS * SLOT_LEN +
- xi_offset;
- a = (__global ulong *)(ptr + i * SLOT_LEN);
- b = (__global ulong *)(ptr + j * SLOT_LEN);
- dropped_stor += xor_and_store(round, ht_dst, collisionThreadId, i, j,
- a, b, rowCountersDst);
- }
- }
-
-#ifdef ENABLE_DEBUG
- debug[tid * 2] = dropped_coll;
- debug[tid * 2 + 1] = dropped_stor;
-#endif
-}
-
-/*
-** This defines kernel_round1, kernel_round2, ..., kernel_round7.
-*/
-#define KERNEL_ROUND(N) \
-__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) \
-void kernel_round ## N(__global char *ht_src, __global char *ht_dst, \
- __global uint *rowCountersSrc, __global uint *rowCountersDst, \
- __global uint *debug) \
-{ \
- __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)]; \
- __local uint collisionsData[LDS_COLL_SIZE]; \
- __local uint collisionsNum; \
- equihash_round(N, ht_src, ht_dst, debug, first_words_data, collisionsData, \
- &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW); \
-}
-KERNEL_ROUND(1)
-KERNEL_ROUND(2)
-KERNEL_ROUND(3)
-KERNEL_ROUND(4)
-KERNEL_ROUND(5)
-KERNEL_ROUND(6)
-KERNEL_ROUND(7)
-
-// kernel_round8 takes an extra argument, "sols"
-__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-void kernel_round8(__global char *ht_src, __global char *ht_dst,
- __global uint *rowCountersSrc, __global uint *rowCountersDst,
- __global uint *debug, __global sols_t *sols)
-{
- uint tid = get_global_id(0);
- __local uchar first_words_data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)];
- __local uint collisionsData[LDS_COLL_SIZE];
- __local uint collisionsNum;
- equihash_round(8, ht_src, ht_dst, debug, first_words_data, collisionsData,
- &collisionsNum, rowCountersSrc, rowCountersDst, THREADS_PER_ROW);
- if (!tid)
- sols->nr = sols->likely_invalids = 0;
-}
-
-uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
-{
- return *(__global uint *)(ht + row * NR_SLOTS * SLOT_LEN +
- slot * SLOT_LEN + xi_offset - 4);
-}
-
-/*
-** Expand references to inputs. Return 1 if so far the solution appears valid,
-** or 0 otherwise (an invalid solution would be a solution with duplicate
-** inputs, which can be detected at the last step: round == 0).
-*/
-uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs,
- uint round)
-{
- __global char *ht = htabs[round % 2];
- uint i = nr_inputs - 1;
- uint j = nr_inputs * 2 - 1;
- uint xi_offset = xi_offset_for_round(round);
- int dup_to_watch = -1;
- do
- {
- ins[j] = expand_ref(ht, xi_offset,
- DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i]));
- ins[j - 1] = expand_ref(ht, xi_offset,
- DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i]));
- if (!round)
- {
- if (dup_to_watch == -1)
- dup_to_watch = ins[j];
- else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch)
- return 0;
- }
- if (!i)
- break ;
- i--;
- j -= 2;
- }
- while (1);
- return 1;
-}
-
-/*
-** Verify if a potential solution is in fact valid.
-*/
-void potential_sol(__global char **htabs, __global sols_t *sols,
- uint ref0, uint ref1)
-{
- uint nr_values;
- uint values_tmp[(1 << PARAM_K)];
- uint sol_i;
- uint i;
- nr_values = 0;
- values_tmp[nr_values++] = ref0;
- values_tmp[nr_values++] = ref1;
- uint round = PARAM_K - 1;
- do
- {
- round--;
- if (!expand_refs(values_tmp, nr_values, htabs, round))
- return ;
- nr_values *= 2;
- }
- while (round > 0);
- // solution appears valid, copy it to sols
- sol_i = atomic_inc(&sols->nr);
- if (sol_i >= MAX_SOLS)
- return ;
- for (i = 0; i < (1 << PARAM_K); i++)
- sols->values[sol_i][i] = values_tmp[i];
- sols->valid[sol_i] = 1;
-}
-
-/*
-** Scan the hash tables to find Equihash solutions.
-*/
-__kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-void kernel_sols(__global char *ht0, __global char *ht1, __global sols_t *sols,
- __global uint *rowCountersSrc, __global uint *rowCountersDst)
-{
- __local uint counters[WORKSIZE/THREADS_PER_ROW];
- __local uint refs[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)];
- __local uint data[NR_SLOTS*(WORKSIZE/THREADS_PER_ROW)];
- __local uint collisionsNum;
- __local ulong collisions[WORKSIZE*4];
-
- uint globalTid = get_global_id(0) / THREADS_PER_ROW;
- uint localTid = get_local_id(0) / THREADS_PER_ROW;
- uint localGroupId = get_local_id(0) % THREADS_PER_ROW;
- __local uint *refsPtr = &refs[NR_SLOTS*localTid];
- __local uint *dataPtr = &data[NR_SLOTS*localTid];
-
- __global char *htabs[2] = { ht0, ht1 };
- __global char *hcounters[2] = { rowCountersSrc, rowCountersDst };
- uint ht_i = (PARAM_K - 1) % 2; // table filled at last round
- uint cnt;
- uint xi_offset = xi_offset_for_round(PARAM_K - 1);
- uint i, j;
- __global char *p;
- uint ref_i, ref_j;
- // it's ok for the collisions array to be so small, as if it fills up
- // the potential solutions are likely invalid (many duplicate inputs)
-// ulong collisions;
-#if NR_ROWS_LOG >= 16 && NR_ROWS_LOG <= 20
- // in the final hash table, we are looking for a match on both the bits
- // part of the previous PREFIX colliding bits, and the last PREFIX bits.
- uint mask = 0xffffff;
-#else
-#error "unsupported NR_ROWS_LOG"
-#endif
-
- collisionsNum = 0;
-
- for (uint chunk = 0; chunk < THREADS_PER_ROW; chunk++) {
- uint tid = globalTid + NR_ROWS/THREADS_PER_ROW*chunk;
- p = htabs[ht_i] + tid * NR_SLOTS * SLOT_LEN;
- uint rowIdx = tid/ROWS_PER_UINT;
- uint rowOffset = BITS_PER_ROW*(tid%ROWS_PER_UINT);
- cnt = (rowCountersSrc[rowIdx] >> rowOffset) & ROW_MASK;
- cnt = min(cnt, (uint)NR_SLOTS); // handle possible overflow in last round
- p += xi_offset;
- p += SLOT_LEN*localGroupId;
-
- for (i = get_local_id(0); i < WORKSIZE/THREADS_PER_ROW; i += get_local_size(0))
- counters[i] = 0;
- for (i = localGroupId; i < cnt; i += THREADS_PER_ROW, p += SLOT_LEN*THREADS_PER_ROW) {
- refsPtr[i] = *(__global uint *)(p - 4);
- dataPtr[i] = (*(__global uint *)p) & mask;
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- for (i = 0; i < cnt; i++)
- {
- uint a_data = dataPtr[i];
- ref_i = refsPtr[i];
- for (j = i + 1 + localGroupId; j < cnt; j += THREADS_PER_ROW)
- {
- if (a_data == dataPtr[j])
- {
- if (atomic_inc(&counters[localTid]) == 0)
- collisions[atomic_inc(&collisionsNum)] = ((ulong)ref_i << 32) | refsPtr[j];
- goto part2;
- }
- }
- }
-
-part2:
- continue;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- uint totalCollisions = collisionsNum;
- if (get_local_id(0) < totalCollisions) {
- ulong coll = collisions[get_local_id(0)];
- potential_sol(htabs, sols, coll >> 32, coll & 0xffffffff);
- }
-}
+// Gateless Gate, a Zcash miner
+// Copyright 2016 zawawa @ bitcointalk.org
+//
+// The initial version of this software was based on:
+// SILENTARMY v5
+// The MIT License (MIT) Copyright (c) 2016 Marc Bevand, Genoil, eXtremal
+//
+// This program is free software : you can redistribute it and / or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#include "equihash-param.h"
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#ifdef AMD
+#pragma OPENCL EXTENSION cl_amd_vec3 : enable
+#endif
+
+
+
+/////////////////
+// HASH TABLES //
+/////////////////
+
+/*
+** With the new hash tables, each slot has this layout (length in bytes in parens):
+**
+** round 0, table 0: i(4) pad(0) Xi(24) pad(4)
+** round 1, table 1: i(4) pad(3) Xi(20) pad(5)
+** round 2, table 2: i(4) pad(0) Xi(19) pad(9)
+** round 3, table 3: i(4) pad(3) Xi(15) pad(10)
+** round 4, table 4: i(4) pad(0) Xi(14) pad(14)
+** round 5, table 5: i(4) pad(3) Xi(10) pad(15)
+** round 6, table 6: i(4) pad(0) Xi( 9) pad(19)
+** round 7, table 7: i(4) pad(3) Xi( 5) pad(20)
+** round 8, table 8: i(4) pad(0) Xi( 4) pad(24)
+*/
+
+typedef union {
+	struct {
+		uint xi[7]; /* Xi words; only the leading UINTS_IN_XI(round) uints are meaningful (see layout table above) */
+		uint padding;
+	} slot;
+	uint8 ui8; /* overlapping vector views of the same 32 bytes allow wide, coalesced loads/stores */
+	uint4 ui4[2];
+	uint2 ui2[4];
+	uint ui[8];
+#ifdef AMD
+	ulong3 ul3; /* 3-component views, AMD-only (cl_amd_vec3 extension enabled at top of file) */
+	uint3 ui3[2];
+#endif
+} slot_t;
+
+typedef __global slot_t *global_pointer_to_slot_t;
+
+#define UINTS_IN_XI(round) /* number of 32-bit Xi words stored per slot at the given round (matches layout table above) */ (((round) == 0) ? 6 : \
+                           ((round) == 1) ? 6 : \
+                           ((round) == 2) ? 5 : \
+                           ((round) == 3) ? 5 : \
+                           ((round) == 4) ? 4 : \
+                           ((round) == 5) ? 4 : \
+                           ((round) == 6) ? 3 : \
+                           ((round) == 7) ? 2 : \
+                                            1)
+
+
+
+/*
+** OBSOLETE
+** If xi0,xi1,xi2,xi3 are stored consecutively in little endian then they
+** represent (hex notation, group of 5 hex digits are a group of PREFIX bits):
+** aa aa ab bb bb cc cc cd dd... [round 0]
+** --------------------
+** ...ab bb bb cc cc cd dd... [odd round]
+** --------------
+** ...cc cc cd dd... [next even round]
+** -----
+** Bytes underlined are going to be stored in the slot. Preceding bytes
+** (and possibly part of the underlined bytes, depending on NR_ROWS_LOG) are
+** used to compute the row number.
+**
+** Round 0: xi0,xi1,xi2,xi3 is a 25-byte Xi (xi3: only the low byte matter)
+** Round 1: xi0,xi1,xi2 is a 23-byte Xi (incl. the colliding PREFIX nibble)
+** TODO: update lines below with padding nibbles
+** Round 2: xi0,xi1,xi2 is a 20-byte Xi (xi2: only the low 4 bytes matter)
+** Round 3: xi0,xi1,xi2 is a 17.5-byte Xi (xi2: only the low 1.5 bytes matter)
+** Round 4: xi0,xi1 is a 15-byte Xi (xi1: only the low 7 bytes matter)
+** Round 5: xi0,xi1 is a 12.5-byte Xi (xi1: only the low 4.5 bytes matter)
+** Round 6: xi0,xi1 is a 10-byte Xi (xi1: only the low 2 bytes matter)
+** Round 7: xi0 is a 7.5-byte Xi (xi0: only the low 7.5 bytes matter)
+** Round 8: xi0 is a 5-byte Xi (xi0: only the low 5 bytes matter)
+**
+** Return 0 if successfully stored, or 1 if the row overflowed.
+*/
+
+__global char *get_slot_ptr(__global char *ht, uint round, uint row, uint slot) /* byte address of (row, slot) in this round's hash table */
+{
+	return ht + (row * NR_SLOTS + slot) * ADJUSTED_SLOT_LEN(round);
+}
+
+__global uint *get_xi_ptr(__global char *ht, uint round, uint row, uint slot) /* pointer to the Xi words at the start of a slot */
+{
+	return (__global uint *)get_slot_ptr(ht, round, row, slot);
+}
+
+__global uint *get_ref_ptr(__global char *ht, uint round, uint row, uint slot) /* pointer to the reference word stored immediately after the Xi words */
+{
+	return get_xi_ptr(ht, round, row, slot) + UINTS_IN_XI(round);
+}
+
+void get_row_counters_index(uint *rowIdx, uint *rowOffset, uint row) /* map a row number to its packed-counter word index and bit offset */
+{
+	if (ROWS_PER_UINT == 3) { /* branchless row%3 / row/3: 0xAAAAAAAB is the inverse of 3 mod 2^32, so (row - r) * 0xAAAAAAAB == row / 3 when r == row % 3 */
+		uint r = (0x55555555 * row + (row >> 1) - (row >> 3)) >> 30;
+		*rowIdx = (row - r) * 0xAAAAAAAB;
+		*rowOffset = BITS_PER_ROW * r;
+	} else if (ROWS_PER_UINT == 6) { /* NOTE(review): same reciprocal trick for 6 rows per word -- presumably equivalent to the generic branch below; confirm */
+		uint r = (0x55555555 * row + (row >> 1) - (row >> 3)) >> 29;
+		*rowIdx = (row - r) * 0xAAAAAAAB * 2;
+		*rowOffset = BITS_PER_ROW * r;
+	} else { /* generic (compiler-optimized) fallback */
+		*rowIdx = row / ROWS_PER_UINT;
+		*rowOffset = BITS_PER_ROW * (row % ROWS_PER_UINT);
+	}
+}
+
+uint get_row(uint round, uint xi0) /* derive the hash-table row from the first 32-bit word of Xi */
+{
+	uint row = 0;
+
+	if (NR_ROWS_LOG == 12) {
+		if (!(round % 2))
+			row = (xi0 & 0xfff); /* even rounds: row = low NR_ROWS_LOG bits */
+		else
+			row = ((xi0 & 0x0f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24); /* odd rounds: recombine nibbles, skipping the nibble consumed by the PREFIX */
+	} else if (NR_ROWS_LOG == 13) {
+		if (!(round % 2))
+			row = (xi0 & 0x1fff);
+		else
+			row = ((xi0 & 0x1f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24);
+	} else if (NR_ROWS_LOG == 14) {
+		if (!(round % 2))
+			row = (xi0 & 0x3fff);
+		else
+			row = ((xi0 & 0x3f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24);
+	} else if (NR_ROWS_LOG == 15) {
+		if (!(round % 2))
+			row = (xi0 & 0x7fff);
+		else
+			row = ((xi0 & 0x7f0f00) >> 8) | ((xi0 & 0xf0000000) >> 24);
+	} else if (NR_ROWS_LOG == 16) {
+		if (!(round % 2))
+			row = (xi0 & 0xffff);
+		else
+			row = ((xi0 & 0xff0f00) >> 8) | ((xi0 & 0xf0000000) >> 24);
+	}
+
+	return row; /* NOTE(review): silently returns 0 for unsupported NR_ROWS_LOG -- no #error guard here, unlike the old code */
+}
+
+uint get_nr_slots(__global uint *row_counters, uint row_index) /* read a row's slot count from the packed counters, clamped to NR_SLOTS */
+{
+	uint rowIdx, rowOffset, nr_slots;
+	get_row_counters_index(&rowIdx, &rowOffset, row_index);
+	nr_slots = (row_counters[rowIdx] >> rowOffset) & ROW_MASK;
+	nr_slots = min(nr_slots, (uint)NR_SLOTS); // handle possible overflow in last round
+	return nr_slots;
+}
+
+/*
+** Atomically reserve a slot in "row": bump its packed counter field and
+** return the previous slot count.  A return value >= NR_SLOTS means the row
+** is full; the increment is rolled back so the stored counter does not keep
+** growing, and the caller must drop the entry.
+*/
+uint inc_row_counter(__global uint *rowCounters, uint row)
+{
+  uint rowIdx, rowOffset;
+  get_row_counters_index(&rowIdx, &rowOffset, row);
+  uint nr_slots = atomic_add(rowCounters + rowIdx, 1U << rowOffset);
+  nr_slots = (nr_slots >> rowOffset) & ROW_MASK;
+  if (nr_slots >= NR_SLOTS) {
+    // avoid overflows; 1U matches the atomic_add above and avoids signed-
+    // shift UB when rowOffset is the top bit
+    atomic_sub(rowCounters + rowIdx, 1U << rowOffset);
+  }
+  return nr_slots;
+}
+
+
+
+/*
+** Reset counters in a hash table.
+*/
+
+/* Reset the per-row slot counters and the global result counters. */
+__kernel
+void kernel_init_ht(__global char *ht, __global uint *rowCounters, __global sols_t *sols, __global potential_sols_t *potential_sols)
+{
+  uint gid = get_global_id(0);
+  // a single work item clears the solution counters
+  if (gid == 0)
+    sols->nr = sols->likely_invalids = potential_sols->nr = 0;
+  // the first RC_SIZE/4 work items each zero one row-counter word
+  if (gid < RC_SIZE / 4)
+    rowCounters[gid] = 0;
+}
+
+
+
+/////////////
+// ROUND 0 //
+/////////////
+
+// BLAKE2b initialization vector (identical to the SHA-512 IV constants)
+__constant ulong blake_iv[] =
+{
+  0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
+  0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+  0x510e527fade682d1, 0x9b05688c2b3e6c1f,
+  0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
+};
+
+/*
+** One BLAKE2b G mixing step on state words (va, vb, vc, vd) with message
+** words x and y; rotate(v, 64 - n) implements a right-rotation by n bits.
+** NOTE: function-like macro — every argument is evaluated (some twice), so
+** never pass expressions with side effects.
+*/
+#define mix(va, vb, vc, vd, x, y) \
+  va = (va + vb + (x)); \
+vd = rotate((vd ^ va), (ulong)64 - 32); \
+vc = (vc + vd); \
+vb = rotate((vb ^ vc), (ulong)64 - 24); \
+va = (va + vb + (y)); \
+vd = rotate((vd ^ va), (ulong)64 - 16); \
+vc = (vc + vd); \
+vb = rotate((vb ^ vc), (ulong)64 - 63);
+
+/*
+** Execute round 0 (blake).
+**
+** Note: making the work group size less than or equal to the wavefront size
+** allows the OpenCL compiler to remove the barrier() calls, see "2.2 Local
+** Memory (LDS) Optimization 2-10" in:
+** http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/opencl-optimization-guide/
+*/
+
+#if ZCASH_HASH_LEN != 50
+#error "unsupported ZCASH_HASH_LEN"
+#endif
+
+/*
+** Round 0: BLAKE2b-hash every input index and store the two resulting Xi
+** values per input into the round-0 hash table.
+**
+** blake_state - BLAKE2b midstate over the fixed part of the block header
+** ht          - destination hash table for round 0
+** rowCounters - packed per-row slot counters for ht
+** debug       - per-thread drop statistics, written only with ENABLE_DEBUG
+*/
+__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_ROUND0, 1, 1)))
+void kernel_round0(__constant ulong *blake_state, __global char *ht,
+    __global uint *rowCounters, __global uint *debug)
+{
+  uint tid = get_global_id(0);
+#if defined(AMD) && !defined(AMD_LEGACY)
+  // NOTE(review): "volatile" looks like a register-pressure workaround for
+  // newer AMD compilers — confirm it is still needed for the target driver
+  volatile ulong v[16];
+  uint xi0, xi1, xi2, xi3, xi4, xi5, xi6;
+  slot_t slot;
+#else
+  ulong v[16];
+  uint xi0, xi1, xi2, xi3, xi4, xi5, xi6;
+  slot_t slot;
+#endif
+  ulong h[7];
+  // each thread hashes several inputs when the grid is smaller than NR_INPUTS
+  uint inputs_per_thread = (NR_INPUTS + get_global_size(0) - 1) / get_global_size(0);
+  uint dropped = 0;
+
+  for (uint chunk = 0; chunk < inputs_per_thread; ++chunk) {
+    uint input = tid + get_global_size(0) * chunk;
+
+    if (input < NR_INPUTS) {
+      // shift "i" to occupy the high 32 bits of the second ulong word in the
+      // message block
+      ulong word1 = (ulong)input << 32;
+      // init vector v: chained state in v[0..7], IV in v[8..15]
+      v[0] = blake_state[0];
+      v[1] = blake_state[1];
+      v[2] = blake_state[2];
+      v[3] = blake_state[3];
+      v[4] = blake_state[4];
+      v[5] = blake_state[5];
+      v[6] = blake_state[6];
+      v[7] = blake_state[7];
+      v[8] = blake_iv[0];
+      v[9] = blake_iv[1];
+      v[10] = blake_iv[2];
+      v[11] = blake_iv[3];
+      v[12] = blake_iv[4];
+      v[13] = blake_iv[5];
+      v[14] = blake_iv[6];
+      v[15] = blake_iv[7];
+      // mix in length of data
+      v[12] ^= ZCASH_BLOCK_HEADER_LEN + 4 /* length of "i" */;
+      // last block
+      v[14] ^= (ulong)-1;
+
+#if defined(AMD) && !defined(AMD_LEGACY)
+#pragma unroll 1
+      for (uint blake_round = 1; blake_round <= 9; ++blake_round) {
+#else
+#pragma unroll 9
+      for (uint blake_round = 1; blake_round <= 9; ++blake_round) {
+#endif
+        // rounds 1-9: word1 is the only non-zero message word, injected where
+        // the BLAKE2b sigma schedule places it in each round
+        mix(v[0], v[4], v[8], v[12], 0, (blake_round == 1) ? word1 : 0);
+        mix(v[1], v[5], v[9], v[13], (blake_round == 7) ? word1 : 0, (blake_round == 4) ? word1 : 0);
+        mix(v[2], v[6], v[10], v[14], 0, (blake_round == 8) ? word1 : 0);
+        mix(v[3], v[7], v[11], v[15], 0, 0);
+        mix(v[0], v[5], v[10], v[15], (blake_round == 2) ? word1 : 0, (blake_round == 5) ? word1 : 0);
+        mix(v[1], v[6], v[11], v[12], 0, 0);
+        mix(v[2], v[7], v[8], v[13], (blake_round == 9) ? word1 : 0, (blake_round == 3) ? word1 : 0);
+        mix(v[3], v[4], v[9], v[14], (blake_round == 6) ? word1 : 0, 0);
+      }
+      // round 10
+      mix(v[0], v[4], v[8], v[12], 0, 0);
+      mix(v[1], v[5], v[9], v[13], 0, 0);
+      mix(v[2], v[6], v[10], v[14], 0, 0);
+      mix(v[3], v[7], v[11], v[15], word1, 0);
+      mix(v[0], v[5], v[10], v[15], 0, 0);
+      mix(v[1], v[6], v[11], v[12], 0, 0);
+      mix(v[2], v[7], v[8], v[13], 0, 0);
+      mix(v[3], v[4], v[9], v[14], 0, 0);
+      // round 11
+      mix(v[0], v[4], v[8], v[12], 0, word1);
+      mix(v[1], v[5], v[9], v[13], 0, 0);
+      mix(v[2], v[6], v[10], v[14], 0, 0);
+      mix(v[3], v[7], v[11], v[15], 0, 0);
+      mix(v[0], v[5], v[10], v[15], 0, 0);
+      mix(v[1], v[6], v[11], v[12], 0, 0);
+      mix(v[2], v[7], v[8], v[13], 0, 0);
+      mix(v[3], v[4], v[9], v[14], 0, 0);
+      // round 12
+      mix(v[0], v[4], v[8], v[12], 0, 0);
+      mix(v[1], v[5], v[9], v[13], 0, 0);
+      mix(v[2], v[6], v[10], v[14], 0, 0);
+      mix(v[3], v[7], v[11], v[15], 0, 0);
+      mix(v[0], v[5], v[10], v[15], word1, 0);
+      mix(v[1], v[6], v[11], v[12], 0, 0);
+      mix(v[2], v[7], v[8], v[13], 0, 0);
+      mix(v[3], v[4], v[9], v[14], 0, 0);
+
+      // compress v into the blake state; this produces the 50-byte hash
+      // (two Xi values)
+      h[0] = blake_state[0] ^ v[0] ^ v[8];
+      h[1] = blake_state[1] ^ v[1] ^ v[9];
+      h[2] = blake_state[2] ^ v[2] ^ v[10];
+      h[3] = blake_state[3] ^ v[3] ^ v[11];
+      h[4] = blake_state[4] ^ v[4] ^ v[12];
+      h[5] = blake_state[5] ^ v[5] ^ v[13];
+      h[6] = (blake_state[6] ^ v[6] ^ v[14]) & 0xffff;
+    }
+
+    if (input < NR_INPUTS) {
+      // store the two Xi values in the hash table
+#pragma unroll 1
+      for (uint index = 0; index < 2; ++index) {
+        if (!index) {
+          // first Xi: the low 25 bytes of the hash
+          xi0 = h[0] & 0xffffffff; xi1 = h[0] >> 32;
+          xi2 = h[1] & 0xffffffff; xi3 = h[1] >> 32;
+          xi4 = h[2] & 0xffffffff; xi5 = h[2] >> 32;
+          xi6 = h[3] & 0xffffffff;
+        } else {
+          // second Xi: the hash shifted right by 25 bytes (one extra byte
+          // handled here, the 24-byte part via the h[3..6] base)
+          xi0 = ((h[3] >> 8) | (h[4] << (64 - 8))) & 0xffffffff; xi1 = ((h[3] >> 8) | (h[4] << (64 - 8))) >> 32;
+          xi2 = ((h[4] >> 8) | (h[5] << (64 - 8))) & 0xffffffff; xi3 = ((h[4] >> 8) | (h[5] << (64 - 8))) >> 32;
+          xi4 = ((h[5] >> 8) | (h[6] << (64 - 8))) & 0xffffffff; xi5 = ((h[5] >> 8) | (h[6] << (64 - 8))) >> 32;
+          xi6 = (h[6] >> 8) & 0xffffffff;
+        }
+
+        uint row = get_row(0, xi0);
+        uint nr_slots = inc_row_counter(rowCounters, row);
+        if (nr_slots >= NR_SLOTS) {
+          // row full: this Xi is dropped
+          ++dropped;
+        } else {
+          // pack Xi shifted right by one byte, then the input reference
+          slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8));
+          slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8));
+          slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8));
+          slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8));
+          slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8));
+          slot.slot.xi[5] = ((xi6 << 24) | (xi5 >> 8));
+          slot.slot.xi[UINTS_IN_XI(0)] = input * 2 + index;
+          __global char *p = get_slot_ptr(ht, 0, row, nr_slots);
+          *(__global uint8 *)p = slot.ui8;
+        }
+      }
+    }
+  }
+
+#ifdef ENABLE_DEBUG
+  debug[tid * 2] = 0;
+  debug[tid * 2 + 1] = dropped;
+#endif
+}
+
+/*
+** XOR a pair of Xi values computed at "round - 1" and store the result in the
+** hash table being built for "round". Note that when building the table for
+** even rounds we need to skip 1 padding byte present in the "round - 1" table
+** (the "0xAB" byte mentioned in the description at the top of this file.) But
+** also note we can't load data directly past this byte because this would
+** cause an unaligned memory access which is undefined per the OpenCL spec.
+**
+** Return 0 if successfully stored, or 1 if the row overflowed.
+*/
+
+/*
+** XOR the Xi of two colliding slots (read from the LDS slot cache) and store
+** the result into the hash table being built for "round".
+**
+** ai / bi point at lane 0 of each slot in the column-major LDS cache: lane j
+** of a slot lives NR_SLOTS uints further on.
+**
+** Return 0 if successfully stored, 1 if the destination row overflowed.
+*/
+uint xor_and_store(uint round, __global char *ht_src, __global char *ht_dst, uint row,
+    uint slot_a, uint slot_b, __local uint *ai, __local uint *bi,
+    __global uint *rowCounters) {
+  uint ret = 0;
+  uint xi0, xi1, xi2, xi3, xi4, xi5;
+
+// fixed: "&&" made this guard unsatisfiable, so invalid NR_ROWS_LOG values
+// were silently accepted
+#if NR_ROWS_LOG < 8 || NR_ROWS_LOG > 20
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+  slot_t slot;
+  __global slot_t *p = 0;
+
+  if (slot_a < NR_SLOTS && slot_b < NR_SLOTS) {
+    // load only the Xi lanes still meaningful at this round, then XOR
+    xi0 = *ai;
+    xi1 = *(ai += NR_SLOTS);
+    if (round <= 7) xi2 = *(ai += NR_SLOTS);
+    if (round <= 6) xi3 = *(ai += NR_SLOTS);
+    if (round <= 4) xi4 = *(ai += NR_SLOTS);
+    if (round <= 2) xi5 = *(ai += NR_SLOTS);
+
+    xi0 ^= *bi;
+    xi1 ^= *(bi += NR_SLOTS);
+    if (round <= 7) xi2 ^= *(bi += NR_SLOTS);
+    if (round <= 6) xi3 ^= *(bi += NR_SLOTS);
+    if (round <= 4) xi4 ^= *(bi += NR_SLOTS);
+    if (round <= 2) xi5 ^= *(bi += NR_SLOTS);
+
+    if (!(round & 0x1)) {
+      // even round: skip the padding byte by dropping the low 24 bits of xi0
+      xi0 = (xi0 >> 24) | (xi1 << (32 - 24));
+
+      // NOTE(review): lanes past UINTS_IN_XI(round) may hold garbage here;
+      // they are overwritten by the reference below or never read back
+      slot.slot.xi[0] = xi1;
+      slot.slot.xi[1] = xi2;
+      slot.slot.xi[2] = xi3;
+      slot.slot.xi[3] = xi4;
+      slot.slot.xi[4] = xi5;
+    } else {
+      // odd round: re-pack Xi shifted right by one byte
+      slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8));
+      if (round <= 7) slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8));
+      if (round <= 6) slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8));
+      if (round <= 5) slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8));
+      if (round <= 3) slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8));
+      if (round <= 1) slot.slot.xi[5] = ((xi5 >> 8));
+    }
+    slot.slot.xi[UINTS_IN_XI(round)] = ENCODE_INPUTS(row, slot_a, slot_b);
+
+    // invalid solutions (which start happening in round 5) have duplicate
+    // inputs and xor to zero, so discard them
+    if (xi0 || xi1) {
+      uint new_row = get_row(round, xi0);
+      uint new_slot_index = inc_row_counter(rowCounters, new_row);
+      if (new_slot_index >= NR_SLOTS) {
+        ret = 1;
+      } else {
+        p = (__global slot_t *)get_slot_ptr(ht_dst, round, new_row, new_slot_index);
+      }
+    }
+  }
+
+  if (p) {
+    // write only as many bytes as this round's slot layout requires
+#ifdef OPTIM_8BYTE_WRITES
+    if (round >= 8)
+      *(__global uint2 *)p = slot.ui2[0];
+    else
+#endif
+#ifdef OPTIM_12BYTE_WRITES
+    if (round >= 7)
+      *(__global uint3 *)p = slot.ui3[0];
+    else
+#endif
+#ifdef OPTIM_16BYTE_WRITES
+    if (round >= 6)
+      *(__global uint4 *)p = slot.ui4[0];
+    else
+#endif
+#ifdef OPTIM_24BYTE_WRITES
+    if (round >= 2)
+      *(__global ulong3 *)p = slot.ul3;
+    else
+#endif
+    *(__global uint8 *)p = slot.ui8;
+  }
+  return ret;
+}
+
+/*
+** Cooperative variant of xor_and_store: THREADS_PER_WRITE(round) threads in
+** a write group compute the same XOR and each stores one uint4 lane of the
+** resulting slot.  The group shares one entry of new_slot_indexes[].
+**
+** Return 0 on success; 1 (lead thread, ENABLE_DEBUG only) if the row
+** overflowed.
+*/
+uint parallel_xor_and_store(uint round, __global char *ht_src, __global char *ht_dst, uint row,
+    uint slot_a, uint slot_b, __local uint *ai, __local uint *bi,
+    __global uint *rowCounters,
+    __local SLOT_INDEX_TYPE *new_slot_indexes) {
+  uint ret = 0;
+  uint xi0, xi1, xi2, xi3, xi4, xi5;
+  uint write_index = get_local_id(0) / THREADS_PER_WRITE(round);
+  uint write_thread_index = get_local_id(0) % THREADS_PER_WRITE(round);
+  //uint write_index = get_local_id(0) % (get_local_size(0) / THREADS_PER_WRITE(round));
+  //uint write_thread_index = get_local_id(0) / (get_local_size(0) / THREADS_PER_WRITE(round));
+
+// fixed: "&&" made this guard unsatisfiable, so invalid NR_ROWS_LOG values
+// were silently accepted
+#if NR_ROWS_LOG < 8 || NR_ROWS_LOG > 20
+#error "unsupported NR_ROWS_LOG"
+#endif
+
+  slot_t slot;
+  uint new_slot_index;
+  uint new_row;
+
+  // the group's lead thread marks the shared slot index as "not reserved"
+  if (!write_thread_index)
+    new_slot_indexes[write_index] = NR_SLOTS;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  if (slot_a < NR_SLOTS && slot_b < NR_SLOTS) {
+    // load only the Xi lanes still meaningful at this round, then XOR
+    // (column-major LDS cache: lane j of a slot is NR_SLOTS uints apart)
+    xi0 = *ai;
+    xi1 = *(ai += NR_SLOTS);
+    if (round <= 7) xi2 = *(ai += NR_SLOTS);
+    if (round <= 6) xi3 = *(ai += NR_SLOTS);
+    if (round <= 4) xi4 = *(ai += NR_SLOTS);
+    if (round <= 2) xi5 = *(ai += NR_SLOTS);
+
+    xi0 ^= *bi;
+    xi1 ^= *(bi += NR_SLOTS);
+    if (round <= 7) xi2 ^= *(bi += NR_SLOTS);
+    if (round <= 6) xi3 ^= *(bi += NR_SLOTS);
+    if (round <= 4) xi4 ^= *(bi += NR_SLOTS);
+    if (round <= 2) xi5 ^= *(bi += NR_SLOTS);
+
+    if (!(round & 0x1)) {
+      // even round: skip the padding byte by dropping the low 24 bits of xi0
+      xi0 = (xi0 >> 24) | (xi1 << (32 - 24));
+
+      slot.slot.xi[0] = xi1;
+      slot.slot.xi[1] = xi2;
+      slot.slot.xi[2] = xi3;
+      slot.slot.xi[3] = xi4;
+      slot.slot.xi[4] = xi5;
+    } else {
+      // odd round: re-pack Xi shifted right by one byte
+      slot.slot.xi[0] = ((xi1 << 24) | (xi0 >> 8));
+      if (round <= 7) slot.slot.xi[1] = ((xi2 << 24) | (xi1 >> 8));
+      if (round <= 6) slot.slot.xi[2] = ((xi3 << 24) | (xi2 >> 8));
+      if (round <= 5) slot.slot.xi[3] = ((xi4 << 24) | (xi3 >> 8));
+      if (round <= 3) slot.slot.xi[4] = ((xi5 << 24) | (xi4 >> 8));
+      if (round <= 1) slot.slot.xi[5] = ((xi5 >> 8));
+    }
+    slot.slot.xi[UINTS_IN_XI(round)] = ENCODE_INPUTS(row, slot_a, slot_b);
+    // every thread of the write group computes the same new_row, since they
+    // all process the same collision
+    new_row = get_row(round, xi0);
+
+    // invalid solutions (which start happening in round 5) have duplicate
+    // inputs and xor to zero, so discard them
+    if ((xi0 || xi1) && !write_thread_index) {
+      // fixed: capture the reserved index locally too — new_slot_index was
+      // previously never assigned, so the ENABLE_DEBUG check below read an
+      // uninitialized variable
+      new_slot_index = new_slot_indexes[write_index] = inc_row_counter(rowCounters, new_row);
+#ifdef ENABLE_DEBUG
+      if (new_slot_index >= NR_SLOTS)
+        ret = 1;
+#endif
+    }
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (new_slot_indexes[write_index] < NR_SLOTS) {
+    // each thread of the group writes its own uint4 lane of the slot
+    __global slot_t *p = (__global slot_t *)get_slot_ptr(ht_dst, round, new_row, new_slot_indexes[write_index]);
+    *(((__global uint4 *)p) + write_thread_index) = slot.ui4[write_thread_index];
+  }
+  //barrier(CLK_LOCAL_MEM_FENCE);
+  return ret;
+}
+
+/*
+** Execute one Equihash round: load every slot of one row of ht_src into LDS,
+** bucket the slots into bins by their next collision bits, then XOR each
+** colliding pair and store the result into ht_dst (via xor_and_store or its
+** parallel variant).  Each work group processes exactly one row.
+*/
+
+void equihash_round(uint round,
+    __global char *ht_src,
+    __global char *ht_dst,
+    __global uint *debug,
+    __local uint *slot_cache,
+    __local SLOT_INDEX_TYPE *collision_array_a,
+    __local SLOT_INDEX_TYPE *collision_array_b,
+    __local uint *nr_collisions,
+    __global uint *rowCountersSrc,
+    __global uint *rowCountersDst,
+    __local uint *bin_first_slots,
+    __local SLOT_INDEX_TYPE *bin_next_slots,
+    __local SLOT_INDEX_TYPE *new_slot_indexes)
+{
+  uint i, j;
+#ifdef ENABLE_DEBUG
+  uint dropped_coll = 0;
+  uint dropped_stor = 0;
+#endif
+
+  // the mask is also computed to read data from the previous round
+#define BIN_MASK(round) ((((round) + 1) % 2) ? 0xf000 : 0xf0000)
+#define BIN_MASK_OFFSET(round) ((((round) + 1) % 2) ? 3 * 4 : 4 * 4)
+
+  // second mask component: the bin bits left over after NR_ROWS_LOG row bits
+  // have been consumed (none when NR_ROWS_LOG == 16)
+#define BIN_MASK2(round) ((NR_ROWS_LOG == 12) ? ((((round) + 1) % 2) ? 0x00f0 : 0xf000) : \
+    (NR_ROWS_LOG == 13) ? ((((round) + 1) % 2) ? 0x00e0 : 0xe000) : \
+    (NR_ROWS_LOG == 14) ? ((((round) + 1) % 2) ? 0x00c0 : 0xc000) : \
+    (NR_ROWS_LOG == 15) ? ((((round) + 1) % 2) ? 0x0080 : 0x8000) : \
+    0)
+#define BIN_MASK2_OFFSET(round) ((NR_ROWS_LOG == 12) ? ((((round) + 1) % 2) ? 0 : 8) : \
+    (NR_ROWS_LOG == 13) ? ((((round) + 1) % 2) ? 1 : 9) : \
+    (NR_ROWS_LOG == 14) ? ((((round) + 1) % 2) ? 2 : 10) : \
+    (NR_ROWS_LOG == 15) ? ((((round) + 1) % 2) ? 3 : 11) : \
+    0)
+
+  // a row is subdivided into NR_BINS bins when searching for collisions
+#define NR_BINS_LOG (20 - NR_ROWS_LOG)
+#define NR_BINS (1 << NR_BINS_LOG)
+
+
+
+  uint nr_slots = 0;
+  uint assigned_row_index = get_group_id(0);
+  if (assigned_row_index >= NR_ROWS)
+    return;
+
+  // empty every bin head and slot link
+  for (i = get_local_id(0); i < NR_BINS; i += get_local_size(0))
+    bin_first_slots[i] = NR_SLOTS;
+  for (i = get_local_id(0); i < NR_SLOTS; i += get_local_size(0))
+    bin_next_slots[i] = NR_SLOTS;
+  // thread 0 reads the row's slot count and broadcasts it via *nr_collisions
+  if (get_local_id(0) == 0)
+    *nr_collisions = nr_slots = get_nr_slots(rowCountersSrc, assigned_row_index);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (get_local_id(0))
+    nr_slots = *nr_collisions;
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (uint phase = 0; phase < 1; ++phase) {
+
+    // Perform a radix sort as slots get loaded into LDS.
+    // Make sure all the work items in the work group enter the loop.
+    for (i = get_local_id(0); i < nr_slots; i += get_local_size(0)) {
+      uint slot_index = i;
+      uint slot_cache_index = i;
+#ifdef NVIDIA
+      // load as 64-bit pairs for better coalescing on NVIDIA
+      uint2 slot_data0, slot_data1, slot_data2;
+      if (UINTS_IN_XI(round - 1) >= 1) slot_data0 = *((__global uint2 *)get_slot_ptr(ht_src, round - 1, assigned_row_index, slot_cache_index) + 0);
+      if (UINTS_IN_XI(round - 1) >= 3) slot_data1 = *((__global uint2 *)get_slot_ptr(ht_src, round - 1, assigned_row_index, slot_cache_index) + 1);
+      if (UINTS_IN_XI(round - 1) >= 5) slot_data2 = *((__global uint2 *)get_slot_ptr(ht_src, round - 1, assigned_row_index, slot_cache_index) + 2);
+
+      if (UINTS_IN_XI(round - 1) >= 1) slot_cache[0 * NR_SLOTS + slot_cache_index] = slot_data0.s0;
+      if (UINTS_IN_XI(round - 1) >= 2) slot_cache[1 * NR_SLOTS + slot_cache_index] = slot_data0.s1;
+      if (UINTS_IN_XI(round - 1) >= 3) slot_cache[2 * NR_SLOTS + slot_cache_index] = slot_data1.s0;
+      if (UINTS_IN_XI(round - 1) >= 4) slot_cache[3 * NR_SLOTS + slot_cache_index] = slot_data1.s1;
+      if (UINTS_IN_XI(round - 1) >= 5) slot_cache[4 * NR_SLOTS + slot_cache_index] = slot_data2.s0;
+      if (UINTS_IN_XI(round - 1) >= 6) slot_cache[5 * NR_SLOTS + slot_cache_index] = slot_data2.s1;
+      uint xi0 = slot_data0.s0;
+#else
+      // column-major cache: lane j of slot s lives at j * NR_SLOTS + s
+      for (j = 0; j < UINTS_IN_XI(round - 1); ++j)
+        slot_cache[j * NR_SLOTS + slot_cache_index] = *((__global uint *)get_xi_ptr(ht_src, round - 1, assigned_row_index, slot_index) + j);
+      uint xi0 = slot_cache[0 * NR_SLOTS + slot_cache_index];
+#endif
+      // link this slot into the bin selected by its collision bits
+      uint bin_to_use =
+        ((xi0 & BIN_MASK(round - 1)) >> BIN_MASK_OFFSET(round - 1))
+        | ((xi0 & BIN_MASK2(round - 1)) >> BIN_MASK2_OFFSET(round - 1));
+      bin_next_slots[i] = atomic_xchg(&bin_first_slots[bin_to_use], i);
+    }
+
+    if (!get_local_id(0))
+      *nr_collisions = 0;
+    // round NR_SLOTS up so all threads execute the same number of iterations
+    // (required for the barriers inside the loop)
+    uint max_slot_a_index = NR_SLOTS + (get_local_size(0) - NR_SLOTS % get_local_size(0)) - 1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for (uint slot_a_index = get_local_id(0); slot_a_index <= max_slot_a_index; slot_a_index += get_local_size(0)) {
+      // enumerate all pairs (slot_a, slot_b) sharing a bin
+      uint slot_b_index = (slot_a_index < NR_SLOTS) ? bin_next_slots[slot_a_index] : NR_SLOTS;
+      while (slot_b_index < NR_SLOTS) {
+        uint coll_index = atomic_inc(nr_collisions);
+        if (coll_index < LDS_COLL_SIZE) {
+          collision_array_a[coll_index] = slot_a_index;
+          collision_array_b[coll_index] = slot_b_index;
+        } else {
+          // collision list full: drop the pair
+          atomic_dec(nr_collisions);
+#ifdef ENABLE_DEBUG
+          ++dropped_coll;
+#endif
+        }
+        slot_b_index = bin_next_slots[slot_b_index];
+      }
+
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      // drain the collision list, one batch of write groups per iteration
+      uint nr_collisions_copy = *nr_collisions;
+      //barrier(CLK_LOCAL_MEM_FENCE);
+      while (nr_collisions_copy > 0) {
+        uint collision, slot_index_a = NR_SLOTS, slot_index_b = NR_SLOTS;
+        __local uint *slot_cache_a, *slot_cache_b;
+        uint write_index = get_local_id(0) / THREADS_PER_WRITE(round);
+        if (write_index < nr_collisions_copy) {
+          slot_index_a = collision_array_a[nr_collisions_copy - 1 - write_index];
+          slot_index_b = collision_array_b[nr_collisions_copy - 1 - write_index];
+          // lane 0 of each slot; further lanes are NR_SLOTS uints apart
+          slot_cache_a = (__local uint *)&slot_cache[slot_index_a];
+          slot_cache_b = (__local uint *)&slot_cache[slot_index_b];
+        }
+        // NOTE(review): slot_cache_a/b stay unset when this thread has no
+        // collision; the callees guard on slot_index_* before dereferencing
+        //barrier(CLK_LOCAL_MEM_FENCE);
+        if (THREADS_PER_WRITE(round) > 1) {
+#ifdef ENABLE_DEBUG
+          //dropped_stor +=
+#endif
+          parallel_xor_and_store(round, ht_src, ht_dst, assigned_row_index, slot_index_a, slot_index_b, slot_cache_a, slot_cache_b, rowCountersDst, new_slot_indexes);
+        } else {
+#ifdef ENABLE_DEBUG
+          dropped_stor +=
+#endif
+          xor_and_store(round, ht_src, ht_dst, assigned_row_index, slot_index_a, slot_index_b, slot_cache_a, slot_cache_b, rowCountersDst);
+        }
+
+        // thread 0 retires the batch just processed
+        if (!get_local_id(0))
+          *nr_collisions -= min(*nr_collisions, (uint)get_local_size(0) / THREADS_PER_WRITE(round));
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+        nr_collisions_copy = *nr_collisions;
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+  }
+
+
+
+#ifdef ENABLE_DEBUG
+  debug[get_global_id(0) * 2] = dropped_coll;
+  debug[get_global_id(0) * 2 + 1] = dropped_stor;
+#endif
+}
+
+/*
+** This defines kernel_round1, kernel_round2, ..., kernel_round8.
+*/
+
+// Each KERNEL_ROUND expansion declares the per-work-group LDS scratch
+// buffers (slot cache, collision lists, bin chains, shared slot indexes)
+// sized for round N and forwards to equihash_round().
+#define KERNEL_ROUND(kernel_name, N) \
+__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE, 1, 1))) \
+void kernel_name(__global char *ht_src, __global char *ht_dst, \
+    __global uint *rowCountersSrc, __global uint *rowCountersDst, \
+    __global uint *debug) \
+{ \
+  __local uint slot_cache[ADJUSTED_LDS_ARRAY_SIZE(UINTS_IN_XI(N - 1) * NR_SLOTS)]; \
+  __local SLOT_INDEX_TYPE collision_array_a[ADJUSTED_LDS_ARRAY_SIZE(LDS_COLL_SIZE)]; \
+  __local SLOT_INDEX_TYPE collision_array_b[ADJUSTED_LDS_ARRAY_SIZE(LDS_COLL_SIZE)]; \
+  __local uint nr_collisions; \
+  __local uint bin_first_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_BINS)]; \
+  __local SLOT_INDEX_TYPE bin_next_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)]; \
+  __local SLOT_INDEX_TYPE new_slot_indexes[ADJUSTED_LDS_ARRAY_SIZE((THREADS_PER_WRITE(N) > 1) ? LOCAL_WORK_SIZE / THREADS_PER_WRITE(N) : 0)]; \
+  equihash_round((N), ht_src, ht_dst, debug, slot_cache, collision_array_a, collision_array_b, \
+    &nr_collisions, rowCountersSrc, rowCountersDst, bin_first_slots, bin_next_slots, new_slot_indexes); \
+}
+
+// instantiate the eight per-round kernels (round 0 is kernel_round0 above)
+KERNEL_ROUND(kernel_round1, 1)
+KERNEL_ROUND(kernel_round2, 2)
+KERNEL_ROUND(kernel_round3, 3)
+KERNEL_ROUND(kernel_round4, 4)
+KERNEL_ROUND(kernel_round5, 5)
+KERNEL_ROUND(kernel_round6, 6)
+KERNEL_ROUND(kernel_round7, 7)
+KERNEL_ROUND(kernel_round8, 8)
+
+
+
+/* Append a candidate (ref0, ref1) reference pair to the potential-solutions
+** list; entries past MAX_POTENTIAL_SOLS are discarded (the counter still
+** advances, readers clamp it). */
+void mark_potential_sol(__global potential_sols_t *potential_sols, uint ref0, uint ref1)
+{
+  uint write_pos = atomic_inc(&potential_sols->nr);
+  if (write_pos < MAX_POTENTIAL_SOLS) {
+    potential_sols->values[write_pos][0] = ref0;
+    potential_sols->values[write_pos][1] = ref1;
+  }
+}
+
+/*
+** Scan the hash tables to find Equihash solutions.
+*/
+
+/*
+** Scan one row of the final (round 8) hash table per work group: bucket its
+** slots by their remaining collision bits, and record every pair whose first
+** Xi word matches as a potential solution (refs of both slots).
+*/
+__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_POTENTIAL_SOLS, 1, 1)))
+void kernel_potential_sols(
+    __global char *ht_src,
+    __global potential_sols_t *potential_sols,
+    __global uint *rowCountersSrc)
+{
+  __local uint refs[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)];
+  __local uint data[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)];
+
+  uint nr_slots;
+  uint i, j;
+  __global char *p;
+  uint ref_i, ref_j;
+  __local uint bin_first_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_BINS)];
+  __local SLOT_INDEX_TYPE bin_next_slots[ADJUSTED_LDS_ARRAY_SIZE(NR_SLOTS)];
+
+  if (!get_global_id(0))
+    potential_sols->nr = 0;
+  barrier(CLK_GLOBAL_MEM_FENCE);
+
+  uint assigned_row_index = (get_global_id(0) / get_local_size(0));
+  if (assigned_row_index >= NR_ROWS)
+    return;
+
+  // empty bins and slot links, then broadcast the row's slot count
+  __local uint nr_slots_shared;
+  for (i = get_local_id(0); i < NR_BINS; i += get_local_size(0))
+    bin_first_slots[i] = NR_SLOTS;
+  for (i = get_local_id(0); i < NR_SLOTS; i += get_local_size(0))
+    bin_next_slots[i] = NR_SLOTS;
+  if (get_local_id(0) == 0)
+    nr_slots_shared = nr_slots = get_nr_slots(rowCountersSrc, assigned_row_index);
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (get_local_id(0))
+    nr_slots = nr_slots_shared;
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // in the final hash table, we are looking for a match on both the bits
+  // part of the previous PREFIX colliding bits, and the last PREFIX bits.
+  for (i = get_local_id(0); i < nr_slots; i += get_local_size(0)) {
+    // first 8 bytes of a final slot: Xi word (low) and reference (high)
+    ulong slot_first_8bytes = *(__global ulong *) get_slot_ptr(ht_src, PARAM_K - 1, assigned_row_index, i);
+    uint ref_i = refs[i] = slot_first_8bytes >> 32;
+    uint xi_first_4bytes = data[i] = slot_first_8bytes & 0xffffffff;
+    uint bin_to_use =
+      ((xi_first_4bytes & BIN_MASK(PARAM_K - 1)) >> BIN_MASK_OFFSET(PARAM_K - 1))
+      | ((xi_first_4bytes & BIN_MASK2(PARAM_K - 1)) >> BIN_MASK2_OFFSET(PARAM_K - 1));
+    bin_next_slots[i] = atomic_xchg(&bin_first_slots[bin_to_use], i);
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (i = get_local_id(0); i < nr_slots; i += get_local_size(0)) {
+    uint data_i = data[i];
+    j = bin_next_slots[i];
+    while (j < NR_SLOTS) {
+      if (data_i == data[j]) {
+        mark_potential_sol(potential_sols, refs[i], refs[j]);
+        // NOTE(review): this work item exits after its first match, so
+        // further slots assigned to it go unscanned — confirm this
+        // best-effort behavior is intended
+        return;
+      }
+      j = bin_next_slots[j];
+    }
+  }
+}
+
+
+
+/*
+** Expand each potential solution into its full set of 2^PARAM_K input
+** indices by walking the reference chains back through all hash tables
+** (ht8 down to ht0), reject candidates containing duplicate inputs, and
+** copy the surviving solutions into "sols".  One work group handles one
+** potential solution.
+*/
+__kernel __attribute__((reqd_work_group_size(LOCAL_WORK_SIZE_SOLS, 1, 1)))
+void kernel_sols(__global char *ht0,
+    __global char *ht1,
+    __global sols_t *sols,
+    __global uint *rowCountersSrc,
+    __global uint *rowCountersDst,
+    __global char *ht2,
+    __global char *ht3,
+    __global char *ht4,
+    __global char *ht5,
+    __global char *ht6,
+    __global char *ht7,
+    __global char *ht8,
+    __global potential_sols_t *potential_sols)
+{
+  // ping-pong buffers: inputs_a holds refs at even depths, inputs_b at odd
+  __local uint inputs_a[ADJUSTED_LDS_ARRAY_SIZE(1 << PARAM_K)], inputs_b[ADJUSTED_LDS_ARRAY_SIZE(1 << (PARAM_K - 1))];
+  __global char *htabs[] = { ht0, ht1, ht2, ht3, ht4, ht5, ht6, ht7, ht8 };
+
+  if ((get_global_id(0) / get_local_size(0)) < potential_sols->nr && (get_global_id(0) / get_local_size(0)) < MAX_POTENTIAL_SOLS) {
+    __local uint dup_counter;
+    if (get_local_id(0) == 0) {
+      dup_counter = 0;
+      // seed with the two round-8 references of this potential solution
+      inputs_a[0] = potential_sols->values[(get_global_id(0) / get_local_size(0))][0];
+      inputs_a[1] = potential_sols->values[(get_global_id(0) / get_local_size(0))][1];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // walk refs back: each entry of round "round" expands into the two slot
+    // refs it encoded, doubling the list every step
+    for (int round = 7; round >= 0; --round) {
+      if (round % 2) {
+        for (uint i = get_local_id(0); i < (1 << (8 - round)); i += get_local_size(0)) {
+          inputs_b[i * 2 + 1] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_a[i]), DECODE_SLOT1(inputs_a[i]));
+          inputs_b[i * 2] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_a[i]), DECODE_SLOT0(inputs_a[i]));
+        }
+      } else {
+        for (uint i = get_local_id(0); i < (1 << (8 - round)); i += get_local_size(0)) {
+          inputs_a[i * 2 + 1] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_b[i]), DECODE_SLOT1(inputs_b[i]));
+          inputs_a[i * 2] = *get_ref_ptr(htabs[round], round, DECODE_ROW(inputs_b[i]), DECODE_SLOT0(inputs_b[i]));
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    //barrier(CLK_LOCAL_MEM_FENCE);
+
+    // duplicate-input check: compare every entry against the last one
+    // NOTE(review): a sentinel-based check, not an exhaustive pairwise scan —
+    // presumably sufficient given how invalid solutions collapse; confirm
+    int dup_to_watch = inputs_a[256 * 2 - 1];
+    for (uint j = 3 + get_local_id(0); j < 256 * 2 - 2; j += get_local_size(0))
+      if (inputs_a[j] == dup_to_watch)
+        atomic_inc(&dup_counter);
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // solution appears valid, copy it to sols
+    __local uint sol_i;   // only written (and meaningful) when !dup_counter
+    if (get_local_id(0) == 0 && !dup_counter)
+      sol_i = atomic_inc(&sols->nr);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (sol_i < MAX_SOLS && !dup_counter) {
+      for (uint i = get_local_id(0); i < (1 << PARAM_K); i += get_local_size(0))
+        sols->values[sol_i][i] = inputs_a[i];
+      if (get_local_id(0) == 0)
+        sols->valid[sol_i] = 1;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+}
diff --git a/ocl.c b/ocl.c
index 0cd615fe..620a9bc2 100644
--- a/ocl.c
+++ b/ocl.c
@@ -798,8 +798,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
char *kernel_names[] = {"kernel_init_ht",
"kernel_round0", "kernel_round1", "kernel_round2",
"kernel_round3", "kernel_round4", "kernel_round5",
- "kernel_round6", "kernel_round7", "kernel_round8"};
- clState->n_extra_kernels = 1 + 9;
+ "kernel_round6", "kernel_round7", "kernel_round8",
+ "kernel_potential_sols"};
+ clState->n_extra_kernels = 1 + 9 + 1;
clState->extra_kernels = (cl_kernel *)malloc(sizeof(cl_kernel) * clState->n_extra_kernels);
for (int i = 0; i < clState->n_extra_kernels; i++) {
clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_names[i], &status);
@@ -810,19 +811,22 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
}
char buffer[32];
- clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, HT_SIZE, NULL, &status);
+ clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(potential_sols_t), NULL, &status);
snprintf(buffer, sizeof(buffer), "CLbuffer0");
if (status != CL_SUCCESS)
goto out;
- clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, HT_SIZE, NULL, &status);
- snprintf(buffer, sizeof(buffer), "buffer1");
- if (status != CL_SUCCESS)
- goto out;
- clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, NR_ROWS, NULL, &status);
+ clState->buffer1 = NULL;
+ for (int i = 0; i < 9; i++) {
+ snprintf(buffer, sizeof(buffer), "index_buf[%d]", i);
+ clState->index_buf[i] = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, HT_SIZE, NULL, &status);
+ if (status != CL_SUCCESS)
+ goto out;
+ }
+ clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, RC_SIZE, NULL, &status);
snprintf(buffer, sizeof(buffer), "buffer2");
if (status != CL_SUCCESS)
goto out;
- clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, NR_ROWS, NULL, &status);
+ clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, RC_SIZE, NULL, &status);
snprintf(buffer, sizeof(buffer), "buffer3");
if (status != CL_SUCCESS)
goto out;
@@ -839,13 +843,39 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
if (status != CL_SUCCESS)
goto out;
+ cl_mem rowCounters[] = {clState->buffer2, clState->buffer3};
+ for (int round = 0; round < PARAM_K; round++) {
+ unsigned int num = 0;
+ cl_kernel *kernel = &clState->extra_kernels[1 + round];
+ if (!round) {
+ CL_SET_ARG(clState->MidstateBuf);
+ CL_SET_ARG(clState->index_buf[round]);
+ CL_SET_ARG(rowCounters[round % 2]);
+ }
+ else {
+ CL_SET_ARG(clState->index_buf[round - 1]);
+ CL_SET_ARG(clState->index_buf[round]);
+ CL_SET_ARG(rowCounters[(round - 1) % 2]);
+ CL_SET_ARG(rowCounters[round % 2]);
+ }
+ CL_SET_ARG(clState->padbuffer8);
+ }
unsigned int num = 0;
- cl_kernel *kernel = &clState->kernel;
+ cl_kernel *kernel = &clState->extra_kernels[1 + 9];
+ CL_SET_ARG(clState->index_buf[8]);
CL_SET_ARG(clState->CLbuffer0);
- CL_SET_ARG(clState->buffer1);
+ CL_SET_ARG(rowCounters[0]);
+
+ num = 0;
+ kernel = &clState->kernel;
+ CL_SET_ARG(clState->index_buf[0]);
+ CL_SET_ARG(clState->index_buf[1]);
CL_SET_ARG(clState->outputBuffer);
- CL_SET_ARG(clState->buffer2);
- CL_SET_ARG(clState->buffer3);
+ CL_SET_ARG(rowCounters[0]);
+ CL_SET_ARG(rowCounters[1]);
+ for (int i = 2; i < 9; i++)
+ CL_SET_ARG(clState->index_buf[i]);
+ CL_SET_ARG(clState->CLbuffer0);
if (status != CL_SUCCESS) {
applog(LOG_ERR, "Error %d: Setting Kernel arguments for ALGO_EQUIHASH failed. (clSetKernelArg)", status);