Added control over splitting up the kernel launches via command line parameters --bfactor and --bsleep. Also included an optimization courtesy of Wolf0 (https://bitcointalk.org/index.php?topic=671784.msg7641906#msg7641906).
tsiv committed Jul 5, 2014
commit 433f537 (1 parent: 44bc11e)
Showing 5 changed files with 117 additions and 31 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -3,6 +3,18 @@ ccminer-cryptonight
A modification of Christian Buchner's & Christian H.'s
ccminer project by tsiv for Cryptonight mining.

July 5th 2014
-------------

Massive improvement to interactivity on Windows, which should also further help with TDR
issues. The new --bfactor and --bsleep command line parameters give control over execution
of the biggest resource hog of the algorithm: bfactor sets how many parts the main kernel
is split into, and bsleep inserts a short delay between the kernel launches. The defaults
are no splitting and no sleep on Linux, and a split into 64 parts (bfactor 6) with a
100 microsecond sleep between launches on Windows. These defaults seem to work wonders
on my 750 Ti under Windows 7, but once again you may want to tweak them for your
environment.
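
For illustration, a minimal C sketch of how the two settings interact. The partcount and
batchsize arithmetic mirrors the host code added in cuda_cryptonight_core.cu below; the
total loop count here is only a stand-in, the real value comes from the ITER macro.

#include <stdio.h>

int main(void)
{
    int bfactor   = 6;        /* Windows default from this commit            */
    int bsleep_us = 100;      /* Windows default, microseconds per launch    */
    int total     = 1 << 18;  /* stand-in for the ITER / 4 inner iterations  */

    int partcount = 1 << bfactor;       /* bfactor 6 -> 64 kernel launches   */
    int batchsize = total / partcount;  /* iterations covered per launch     */

    for (int part = 0; part < partcount; part++) {
        int start = part * batchsize;
        int end   = start + batchsize;
        if (part < 2 || part == partcount - 1)
            printf("launch %2d covers iterations [%6d, %6d)\n", part, start, end);
    }
    printf("added sleep per phase 2 pass: %d us\n", partcount * bsleep_us);
    return 0;
}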

June 30th 2014
--------------

9 changes: 9 additions & 0 deletions README.txt
@@ -61,6 +61,15 @@ most of their command line interface and options.
value between devices, you can just enter a single
value and it will be used for all devices.
(default: 8x40)
--bfactor=X Enables running the Cryptonight kernel in smaller pieces.
The kernel will be run in 2^X parts according to bfactor,
with a small pause between parts, specified by --bsleep.
This is a per-device setting like the launch config.
(default: 0 (no splitting) on Linux, 6 (64 parts) on Windows)
--bsleep=X Insert a delay of X microseconds between kernel launches.
Use in combination with --bfactor to mitigate the lag
when running on your primary GPU.
This is a per-device setting like the launch config.
-f, --diff Divide difficulty by this factor (std is 1)
-o, --url=URL URL of mining server (default: " DEF_RPC_URL ")
-O, --userpass=U:P username:password pair for mining server
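
Like the launch config, both options accept a comma-separated list with one value per
device; devices beyond the last value given reuse that value. A minimal sketch of the
mapping, mirroring the strtok() loop added to parse_arg() in cpu-miner.c below (the
input string is just an example):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    char arg[] = "6,8";                  /* e.g. --bfactor=6,8                 */
    int device_bfactor[8];
    int i = 0, last = 0;

    for (char *p = strtok(arg, ","); p != NULL && i < 8; p = strtok(NULL, ","))
        device_bfactor[i++] = last = atoi(p);
    while (i < 8)
        device_bfactor[i++] = last;      /* remaining devices reuse last value */

    for (i = 0; i < 8; i++)
        printf("device %d: bfactor %d\n", i, device_bfactor[i]);
    return 0;
}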
58 changes: 58 additions & 0 deletions cpu-miner.c
@@ -179,7 +179,16 @@ uint16_t opt_vote = 9999;
static int num_processors;
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
char *device_name[8]; // CB
int device_bfactor[8];
int device_bsleep[8];
int device_config[8][2];
#ifdef WIN32
static int default_bfactor = 6;
static int default_bsleep = 100;
#else
static int default_bfactor = 0;
static int default_bsleep = 0;
#endif
static char *rpc_url;
static char *rpc_userpass;
static char *rpc_user, *rpc_pass;
@@ -239,6 +248,15 @@ Options:\n\
the remaining devices. If you don't need to vary the\n\
value between devices, you can just enter a single value\n\
and it will be used for all devices. (default: 8x40)\n\
--bfactor=X Enables running the Cryptonight kernel in smaller pieces.\n\
The kernel will be run in 2^X parts according to bfactor,\n\
with a small pause between parts, specified by --bsleep.\n\
This is a per-device setting like the launch config.\n\
(default: 0 (no splitting) on Linux, 6 (64 parts) on Windows)\n\
--bsleep=X Insert a delay of X microseconds between kernel launches.\n\
Use in combination with --bfactor to mitigate the lag\n\
when running on your primary GPU.\n\
This is a per-device setting like the launch config.\n\
-m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
@@ -314,6 +332,8 @@ static struct option const options[] = {
{ "devices", 1, NULL, 'd' },
{ "diff", 1, NULL, 'f' },
{ "launch", 1, NULL, 'l' },
{ "bfactor", 1, NULL, 1008 },
{ "bsleep", 1, NULL, 1009 },
{ 0, 0, 0, 0 }
};

@@ -1732,6 +1752,42 @@ static void parse_arg (int key, char *arg)
}
}
break;
case 1008:
{
p = strtok(arg, ",");
if( p == NULL ) show_usage_and_exit(1);
int last;
i = 0;
while( p != NULL && i < 8 ) {
device_bfactor[i++] = last = atoi(p);
if( last < 0 || last > 10 ) {
applog(LOG_ERR, "Valid range for --bfactor is 0-10");
exit(1);
}
p = strtok(NULL, ",");
}
while (i < 8) {
device_bfactor[i++] = last;
}
}
break;
case 1009:
p = strtok(arg, ",");
if( p == NULL ) show_usage_and_exit(1);
int last;
i = 0;
while( p != NULL && i < 8 ) {
device_bsleep[i++] = last = atoi(p);
if( last < 0 || last > 1000000 ) {
applog(LOG_ERR, "Valid range for --bsleep is 0-1000000");
exit(1);
}
p = strtok(NULL, ",");
}
while (i < 8) {
device_bsleep[i++] = last;
}
break;

case 'V':
show_version_and_exit();
@@ -1862,6 +1918,8 @@ int main(int argc, char *argv[])
for(i = 0; i < 8; i++) {
device_config[i][0] = opt_cn_blocks;
device_config[i][1] = opt_cn_threads;
device_bfactor[i] = default_bfactor;
device_bsleep[i] = default_bsleep;
}

/* parse command line */
1 change: 1 addition & 0 deletions cryptonight.h
@@ -122,6 +122,7 @@ struct cryptonight_gpu_ctx {
uint32_t b[4];
uint32_t key1[40];
uint32_t key2[40];
uint32_t text[32];
};

void hash_permutation(union hash_state *state);
68 changes: 37 additions & 31 deletions cryptonight/cuda_cryptonight_core.cu
@@ -1,6 +1,7 @@
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "cryptonight.h"
@@ -9,31 +10,15 @@
#include <unistd.h>
#endif

#include "cuda_cryptonight_aes.cu"
extern int device_bfactor[8];
extern int device_bsleep[8];

#define hi_dword(x) (x >> 32)
#define lo_dword(x) (x & 0xFFFFFFFF)
#include "cuda_cryptonight_aes.cu"

__device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
{
uint64_t a = hi_dword(multiplier);
uint64_t b = lo_dword(multiplier);
uint64_t c = hi_dword(multiplicand);
uint64_t d = lo_dword(multiplicand);

uint64_t ac = a * c;
uint64_t ad = a * d;
uint64_t bc = b * c;
uint64_t bd = b * d;

uint64_t adbc = ad + bc;
uint64_t adbc_carry = adbc < ad ? 1 : 0;

uint64_t product_lo = bd + (adbc << 32);
uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
*product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;

return product_lo;
*product_hi = __umul64hi(multiplier, multiplicand);
return(multiplier * multiplicand);
}
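
The change above is the Wolf0 optimization from the commit message: the schoolbook
64x64 -> 128-bit multiply is replaced by the hardware path, where __umul64hi() returns
the high 64 bits of the product and a plain 64-bit multiply yields the low 64 bits.
A small self-contained CUDA check of the equivalence; the harness and test values are
illustrative and not part of the commit.

#include <cstdio>
#include <cstdint>

__device__ uint64_t mul128_schoolbook(uint64_t x, uint64_t y, uint64_t *hi)
{
    /* same decomposition as the removed cuda_mul128() body */
    uint64_t a = x >> 32, b = x & 0xFFFFFFFFu;
    uint64_t c = y >> 32, d = y & 0xFFFFFFFFu;
    uint64_t ad = a * d, bc = b * c, bd = b * d;
    uint64_t adbc = ad + bc;
    uint64_t carry = adbc < ad ? 1 : 0;
    uint64_t lo = bd + (adbc << 32);
    *hi = a * c + (adbc >> 32) + (carry << 32) + (lo < bd ? 1 : 0);
    return lo;
}

__global__ void check(uint64_t x, uint64_t y, int *ok)
{
    uint64_t hi_ref, lo_ref = mul128_schoolbook(x, y, &hi_ref);
    uint64_t hi_fast = __umul64hi(x, y);   /* high 64 bits of the product */
    uint64_t lo_fast = x * y;              /* low 64 bits wrap naturally  */
    *ok = (hi_ref == hi_fast) && (lo_ref == lo_fast);
}

int main()
{
    int *ok;
    cudaMallocManaged(&ok, sizeof(int));
    check<<<1, 1>>>(0xDEADBEEFCAFEBABEULL, 0x0123456789ABCDEFULL, ok);
    cudaDeviceSynchronize();
    printf("fast path matches schoolbook: %s\n", *ok ? "yes" : "no");
    cudaFree(ok);
    return 0;
}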

__global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
@@ -50,6 +35,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state,
if (thread < threads)
{
int i, j;
int start = 0, end = MEMORY;
uint8_t *long_state = &d_long_state[MEMORY * thread];
uint32_t *ls32;
struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
@@ -61,7 +47,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state,
for( i = 0; i < 4; i++ )
text[i] = state[i];

for (i = 0; i < MEMORY; i += INIT_SIZE_BYTE) {
for (i = start; i < end; i += INIT_SIZE_BYTE) {

ls32 = (uint32_t *)&long_state[i];

@@ -73,7 +59,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state,
}
}

__global__ void cryptonight_core_gpu_phase2(int threads, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
__global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int partidx, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
{
__shared__ uint32_t sharedMemory[1024];

@@ -86,23 +72,37 @@ __global__ void cryptonight_core_gpu_phase2(int threads, uint8_t *d_long_state,
if (thread < threads)
{
int i, j;
int start = 0, end = ITER / 4;
uint8_t *long_state = &d_long_state[MEMORY * thread];
struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
uint32_t a[4], b[4], c[4];

MEMCPY8(a, ctx->a, 2);
MEMCPY8(b, ctx->b, 2);

for (i = 0; i < ITER / 4; ++i) {
if( partcount > 1 ) {

int batchsize = (ITER / 4) / partcount;
start = partidx * batchsize;
end = start + batchsize;
}

for (i = start; i < end; ++i) {

j = E2I(a) * AES_BLOCK_SIZE;
j = ((uint32_t *)a)[0] & 0x1FFFF0;
cn_aes_single_round(sharedMemory, &long_state[j], c, a);
XOR_BLOCKS_DST(c, b, &long_state[j]);
MUL_SUM_XOR_DST(c, a, &long_state[E2I(c) * AES_BLOCK_SIZE]);
j = E2I(a) * AES_BLOCK_SIZE;
MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
j = ((uint32_t *)a)[0] & 0x1FFFF0;
cn_aes_single_round(sharedMemory, &long_state[j], b, a);
XOR_BLOCKS_DST(b, c, &long_state[j]);
MUL_SUM_XOR_DST(b, a, &long_state[E2I(b) * AES_BLOCK_SIZE]);
MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
}

if( partcount > 1 ) {

MEMCPY8(ctx->a, a, 2);
MEMCPY8(ctx->b, b, 2);
}
}
}
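
Two things change in phase 2 above. First, the loop now runs over [start, end), a
1/partcount slice selected by partidx, and when the kernel is split the running a and b
values are written back to the per-thread context so the next partial launch can resume.
Second, the E2I(x) * AES_BLOCK_SIZE scratchpad indexing is replaced by a single mask on
the first word. A minimal host-side sketch of what that mask does, assuming MEMORY is
the usual 2 MiB Cryptonight scratchpad and AES_BLOCK_SIZE is 16 (values not shown in
this diff):

#include <stdio.h>
#include <stdint.h>

#define MEMORY          (1 << 21)   /* 2 MiB scratchpad (assumed)           */
#define AES_BLOCK_SIZE  16

int main(void)
{
    uint32_t x = 0xDEADBEEF;                 /* first 32-bit word of a/b/c  */
    uint32_t masked  = x & 0x1FFFF0;         /* new indexing in this commit */
    uint32_t wrapped = (x % MEMORY) / AES_BLOCK_SIZE * AES_BLOCK_SIZE;
    printf("masked=0x%06X wrapped=0x%06X equal=%d\n",
           masked, wrapped, masked == wrapped);
    return 0;   /* one AND wraps into the scratchpad and 16-byte aligns it  */
}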
@@ -121,6 +121,7 @@ __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state,
if (thread < threads)
{
int i, j;
int start = 0, end = MEMORY;
uint8_t *long_state = &d_long_state[MEMORY * thread];
uint32_t *ls32;
struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
@@ -132,7 +133,7 @@ __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state,
for( i = 0; i < 4; i++ )
text[i] = state[i];

for (i = 0; i < MEMORY; i += INIT_SIZE_BYTE) {
for (i = start; i < end; i += INIT_SIZE_BYTE) {

ls32 = (uint32_t *)&long_state[i];

Expand All @@ -159,12 +160,17 @@ __host__ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uin
dim3 block8(threads << 3);

size_t shared_size = 1024;
int i, partcount = 1 << device_bfactor[thr_id];

cryptonight_core_gpu_phase1<<<grid, block8, shared_size>>>(blocks*threads, d_long_state, d_ctx);
cudaDeviceSynchronize();
if( partcount > 1 ) usleep(device_bsleep[thr_id]);

cryptonight_core_gpu_phase2<<<grid, block, shared_size>>>(blocks*threads, d_long_state, d_ctx);
cudaDeviceSynchronize();
for( i = 0; i < partcount; i++ ) {
cryptonight_core_gpu_phase2<<<grid, block, shared_size>>>(blocks*threads, partcount, i, d_long_state, d_ctx);
cudaDeviceSynchronize();
if( partcount > 1 ) usleep(device_bsleep[thr_id]);
}

cryptonight_core_gpu_phase3<<<grid, block8, shared_size>>>(blocks*threads, d_long_state, d_ctx);
cudaDeviceSynchronize();
