diff --git a/algorithm.c b/algorithm.c index 2a195a65..96c9679e 100644 --- a/algorithm.c +++ b/algorithm.c @@ -1124,6 +1124,8 @@ static cl_int queue_cryptonight_kernel(_clState *clState, dev_blk_ctx *blk, __ma kernel = clState->extra_kernels; CL_SET_ARG(clState->Scratchpads); CL_SET_ARG(clState->States); + if (variant > 0) + CL_SET_ARG(*(cl_uint*)(clState->cldata + 35)); num = 0; CL_NEXTKERNEL_SET_ARG(clState->Scratchpads); diff --git a/algorithm/cryptonight.c b/algorithm/cryptonight.c index 00c188d8..b275b3d4 100644 --- a/algorithm/cryptonight.c +++ b/algorithm/cryptonight.c @@ -14,26 +14,23 @@ #include "algorithm/cn-aes-tbls.h" #define VARIANT1_1(p) \ - do if (Variant > 0) \ - { \ - const uint8_t tmp = ((const uint8_t*)(p))[11]; \ + do if (Variant > 0) { \ + const uint32_t tmp = (p); \ static const uint32_t table = 0x75310; \ - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \ - ((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \ + const uint8_t index = ((tmp >> 26) & 12) | ((tmp >> 23) & 2); \ + (p) ^= ((table >> index) & 0x30) << 24; \ } while(0) #define VARIANT1_2(p) \ - do \ - { \ - ((uint32_t*)(p))[2] ^= nonce; \ + do if (Variant > 0) { \ + (p) ^= tweak1_2; \ } while(0) #define VARIANT1_INIT() \ - if (Variant > 0 && Length < 43) \ - { \ + if (Variant > 0 && Length < 43) { \ quit(1, "Cryptonight variants need at least 43 bytes of data"); \ } \ - const uint32_t nonce = Variant > 0 ? *(uint32_t*)(Input + 39) : 0 + const uint64_t tweak1_2 = Variant > 0 ? *(uint64_t*)(Input + 35) ^ CNCtx.State[24] : 0 static const uint64_t keccakf_rndc[24] = @@ -213,9 +210,9 @@ void cryptonight(uint8_t *Output, uint8_t *Input, uint32_t Length, int Variant) uint64_t text[16], a[2], b[2]; uint32_t ExpandedKey1[64], ExpandedKey2[64]; - VARIANT1_INIT(); - CNKeccak(CNCtx.State, Input, Length); + + VARIANT1_INIT(); for(int i = 0; i < 4; ++i) ((uint64_t *)ExpandedKey1)[i] = CNCtx.State[i]; for(int i = 0; i < 4; ++i) ((uint64_t *)ExpandedKey2)[i] = CNCtx.State[i + 4]; @@ -250,8 +247,8 @@ void cryptonight(uint8_t *Output, uint8_t *Input, uint32_t Length, int Variant) b[0] ^= c[0]; b[1] ^= c[1]; + VARIANT1_1(b[1]); memcpy(CNCtx.Scratchpad + ((a[0] & 0x1FFFF0) >> 3), b, 16); - VARIANT1_1(CNCtx.Scratchpad + ((a[0] & 0x1FFFF0) >> 3)); memcpy(b, CNCtx.Scratchpad + ((c[0] & 0x1FFFF0) >> 3), 16); @@ -260,8 +257,9 @@ void cryptonight(uint8_t *Output, uint8_t *Input, uint32_t Length, int Variant) a[1] += mul128(c[0], b[0], &hi); a[0] += hi; + VARIANT1_2(a[1]); memcpy(CNCtx.Scratchpad + ((c[0] & 0x1FFFF0) >> 3), a, 16); - VARIANT1_2(CNCtx.Scratchpad + ((c[0] & 0x1FFFF0) >> 3)); + VARIANT1_2(a[1]); a[0] ^= b[0]; a[1] ^= b[1]; diff --git a/algorithm/cryptonight.h b/algorithm/cryptonight.h index 239ec6e8..65e84263 100644 --- a/algorithm/cryptonight.h +++ b/algorithm/cryptonight.h @@ -7,7 +7,7 @@ typedef struct _CryptonightCtx uint64_t Scratchpad[1 << 18]; } CryptonightCtx; -inline int monero_variant(struct work *work) { +static inline int monero_variant(struct work *work) { return (work->is_monero && work->data[0] >= 7) ? work->data[0] - 6 : 0; } diff --git a/kernel/cryptonight.cl b/kernel/cryptonight.cl index 8c41ed2f..6b48a7f9 100644 --- a/kernel/cryptonight.cl +++ b/kernel/cryptonight.cl @@ -7,10 +7,12 @@ #include "blake256.cl" #include "groestl256.cl" +#define VARIANT0_PARAMS #define VARIANT0_1(p) #define VARIANT0_2(p) #define VARIANT0_INIT() +#define VARIANT1_PARAMS , const uint tweak1 #define VARIANT1_1(p) \ do \ { \ @@ -20,13 +22,13 @@ } while(0) #define VARIANT1_2(p) \ - do \ - { \ - (p).s2 ^= get_global_id(0); \ - } while(0) - -#define VARIANT1_INIT() + (p) ^= tweak1_2 +#define VARIANT1_INIT() \ + uint2 tweak1_2; \ + tweak1_2.s0 = tweak1; \ + tweak1_2.s1 = get_global_id(0); \ + tweak1_2 ^= as_uint2(states[24]); static const __constant ulong keccakf_rndc[24] = { @@ -263,7 +265,7 @@ __kernel void search(__global ulong *input, uint InputLen, __global uint4 *Scrat keccakf1600_2(State); mem_fence(CLK_GLOBAL_MEM_FENCE); - + #pragma unroll for(int i = 0; i < 25; ++i) states[i] = State[i]; @@ -289,16 +291,16 @@ __kernel void search(__global ulong *input, uint InputLen, __global uint4 *Scrat #define SEARCH1(VAR) \ __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) \ -__kernel void search1_var##VAR(__global uint4 *Scratchpad, __global ulong *states) \ +__kernel void search1_var##VAR(__global uint4 *Scratchpad, __global ulong *states VARIANT##VAR##_PARAMS) \ { \ uint4 a, b; \ __local uint AES0[256], AES1[256], AES2[256], AES3[256]; \ \ - VARIANT##VAR##_INIT(); \ - \ Scratchpad += ((get_global_id(0) - get_global_offset(0))); \ states += (25 * (get_global_id(0) - get_global_offset(0))); \ \ + VARIANT##VAR##_INIT(); \ + \ for(int i = get_local_id(0); i < 256; i += WORKSIZE) \ { \ const uint tmp = AES0_C[i]; \ @@ -325,8 +327,10 @@ __kernel void search1_var##VAR(__global uint4 *Scratchpad, __global ulong *state c = Scratchpad[IDX((as_ulong(a.s01) & 0x1FFFF0) >> 4)]; \ c = AES_Round(AES0, AES1, AES2, AES3, c, a); \ \ - Scratchpad[IDX((as_ulong(a.s01) & 0x1FFFF0) >> 4)] = b_x ^ c; \ - VARIANT##VAR##_1(Scratchpad[IDX((as_ulong(a.s01) & 0x1FFFF0) >> 4)]); \ + b_x ^= c; \ + VARIANT##VAR##_1(b_x); \ + \ + Scratchpad[IDX((as_ulong(a.s01) & 0x1FFFF0) >> 4)] = b_x; \ \ uint4 tmp; \ tmp = Scratchpad[IDX((as_ulong(c.s01) & 0x1FFFF0) >> 4)]; \ @@ -334,8 +338,9 @@ __kernel void search1_var##VAR(__global uint4 *Scratchpad, __global ulong *state a.s23 = as_uint2(as_ulong(a.s23) + as_ulong(c.s01) * as_ulong(tmp.s01)); \ a.s01 = as_uint2(as_ulong(a.s01) + mul_hi(as_ulong(c.s01), as_ulong(tmp.s01))); \ \ + VARIANT##VAR##_2(a.s23); \ Scratchpad[IDX((as_ulong(c.s01) & 0x1FFFF0) >> 4)] = a; \ - VARIANT##VAR##_2(Scratchpad[IDX((as_ulong(c.s01) & 0x1FFFF0) >> 4)]); \ + VARIANT##VAR##_2(a.s23); \ \ a ^= tmp; \ \