Skip to content

Commit

Permalink
Fixed high CPU usage due to busy waiting the kernel to finish, added …
Browse files Browse the repository at this point in the history
…checks to initial memory allocations instead of blindly launching the kernel, launch configs can now be specified per device, updated the help text on launch config
  • Loading branch information
tsiv committed Jun 25, 2014
1 parent 1ae75d1 commit dd83f59
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 35 deletions.
98 changes: 80 additions & 18 deletions cpu-miner.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ int cuda_finddevice(char *name);
#endif

extern void cryptonight_hash(void* output, const void* input, size_t len);
void parse_device_config( char *config, int *blocks, int *threads );

#ifdef __linux /* Linux specific policy and affinity management */
#include <sched.h>
Expand Down Expand Up @@ -178,6 +179,7 @@ uint16_t opt_vote = 9999;
static int num_processors;
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
char *device_name[8]; // CB
int device_config[8][2];
static char *rpc_url;
static char *rpc_userpass;
static char *rpc_user, *rpc_pass;
Expand Down Expand Up @@ -241,8 +243,15 @@ Options:\n\
string names of your cards like gtx780ti or gt640#2\n\
(matching 2nd gt640 in the PC)\n\
-f, --diff Divide difficulty by this factor (std is 1) \n\
-k, --launch=CONFIG launch config for the Cryptonight kernel\n\
THREADSxBLOCKS (default: 8x40)\n\
-l, --launch=CONFIG launch config for the Cryptonight kernel.\n\
a comma separated list of values in form of\n\
AxB where A is the number of threads to run in\n\
each thread block and B is the number of thread\n\
blocks to launch. If less values than devices in use\n\
are provided, the last value will be used for\n\
the remaining devices. If you don't need to vary the\n\
value between devices, you can just enter a single value\n\
and it will be used for all devices. (default: 8x40)\n\
-v, --vote=VOTE block reward vote (for HeavyCoin)\n\
-m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\
-o, --url=URL URL of mining server\n\
Expand Down Expand Up @@ -1129,6 +1138,12 @@ static void *miner_thread(void *userdata)
affine_to_cpu(thr_id, thr_id % num_processors);
}

if( opt_algo == ALGO_CRYPTONIGHT ) {

applog(LOG_INFO, "GPU #%d: %s, using %d blocks of %d threads",
device_map[thr_id], device_name[thr_id], device_config[thr_id][0], device_config[thr_id][1]);
}

uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + (jsonrpc_2 ? 39 : 76));

while (1) {
Expand Down Expand Up @@ -1573,6 +1588,41 @@ static void show_usage_and_exit(int status)
exit(status);
}

void parse_device_config( char *config, int *blocks, int *threads )
{
char *p;
int tmp_blocks, tmp_threads;

if( config == NULL ) goto usedefault;


p = strtok(config, "x");
if(!p)
goto usedefault;

tmp_threads = atoi(p);
if( tmp_threads < 4 || tmp_threads > 1024 )
goto usedefault;

p = strtok(NULL, "x");
if(!p)
goto usedefault;

tmp_blocks = atoi(p);
if( tmp_blocks < 1 )
goto usedefault;

*blocks = tmp_blocks;
*threads = tmp_threads;
return;

usedefault:
*blocks = opt_cn_blocks;
*threads = opt_cn_threads;
return;

}

static void parse_arg (int key, char *arg)
{
char *p;
Expand Down Expand Up @@ -1783,19 +1833,27 @@ static void parse_arg (int key, char *arg)
opt_difficulty = d;
break;
case 'l': /* cryptonight launch config */
p = strtok(arg, "x");
if (!p) show_usage_and_exit(1);
opt_cn_threads = atoi(p);
if( opt_cn_threads < 4 || opt_cn_threads > 1024 ) {
applog(LOG_ERR, "Invalid value for threads per block, must be between 4 and 1024");
show_usage_and_exit(1);
}
p = strtok(NULL, "x");
if (!p) show_usage_and_exit(1);
opt_cn_blocks = atoi(p);
if( opt_cn_blocks < 1 ) {
applog(LOG_ERR, "Invalid value for thread blocks, needs to be at least 1");
show_usage_and_exit(1);
{
char *tmp_config[8];
int tmp_blocks = opt_cn_blocks, tmp_threads = opt_cn_threads;
for( i = 0; i < 8; i++ ) tmp_config[i] = NULL;
p = strtok(arg, ",");
if( p == NULL ) show_usage_and_exit(1);
i = 0;
while( p != NULL && i < 8 ) {
tmp_config[i++] = strdup(p);
p = strtok(NULL, ",");
}
while (i < 8) {
tmp_config[i] = strdup(tmp_config[i-1]);
i++;
}

for( i = 0; i < 8; i++ ) {
parse_device_config(tmp_config[i], &tmp_blocks, &tmp_threads);
device_config[i][0] = tmp_blocks;
device_config[i][1] = tmp_threads;
}
}
break;

Expand Down Expand Up @@ -1921,15 +1979,19 @@ int main(int argc, char *argv[])
pthread_mutex_init(&applog_lock, NULL);
num_processors = cuda_num_devices();

/* parse command line */
for(i = 0; i < 8; i++) {
device_config[i][0] = opt_cn_blocks;
device_config[i][1] = opt_cn_threads;
}

/* parse command line */
parse_cmdline(argc, argv);

cuda_devicenames();

if(opt_algo == ALGO_CRYPTONIGHT) {
jsonrpc_2 = true;
applog(LOG_INFO, "Using JSON-RPC 2.0");
applog(LOG_INFO, "Using %d blocks of %d threads for cryptonight kernel", opt_cn_blocks, opt_cn_threads);
}

if (!opt_benchmark && !rpc_url) {
Expand Down Expand Up @@ -1988,7 +2050,7 @@ int main(int argc, char *argv[])
openlog("cpuminer", LOG_PID, LOG_USER);
#endif

work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart));
work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart));
if (!work_restart)
return 1;

Expand Down
26 changes: 19 additions & 7 deletions cryptonight/cryptonight.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
#include "cuda.h"
#include "cuda_runtime.h"

extern "C"
{
Expand All @@ -9,17 +12,16 @@ extern "C"
}

extern int device_map[8];
extern int device_config[8][2];

static uint8_t *d_long_state[8];
static union cn_gpu_hash_state *d_hash_state[8];

extern bool opt_benchmark;
extern int opt_cn_threads;
extern int opt_cn_blocks;

extern void cryptonight_cpu_init(int thr_id, int threads);
extern void cryptonight_cpu_setInput(int thr_id, void *data, void *pTargetIn);
extern void cryptonight_cpu_hash(int thr_id, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state);
extern void cryptonight_cpu_hash(int thr_id, int blocks, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state);

extern "C" void cryptonight_hash(void* output, const void* input, size_t len);

Expand All @@ -30,20 +32,30 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata,
uint32_t *nonceptr = (uint32_t*)(((char*)pdata) + 39);
const uint32_t first_nonce = *nonceptr;
uint32_t nonce = *nonceptr;
int cn_blocks = device_config[thr_id][0], cn_threads = device_config[thr_id][1];

if (opt_benchmark) {
((uint32_t*)ptarget)[7] = 0x0000ff;
pdata[17] = 0;
}
const uint32_t Htarg = ptarget[7];
const int throughput = opt_cn_threads * opt_cn_blocks;
const int throughput = cn_threads * cn_blocks;
const size_t alloc = MEMORY * throughput;

static bool init[8] = { false, false, false, false, false, false, false, false };
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaMalloc(&d_long_state[thr_id], MEMORY * throughput);
cudaMalloc(&d_hash_state[thr_id], sizeof(union cn_gpu_hash_state) * throughput);
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
if( cudaMalloc(&d_long_state[thr_id], alloc) != cudaSuccess ) {
applog(LOG_ERR, "GPU #%d: FATAL: failed to allocate device memory for the long state variable", thr_id);
exit(1);
}
if( cudaMalloc(&d_hash_state[thr_id], sizeof(union cn_gpu_hash_state) * throughput) != cudaSuccess ) {
applog(LOG_ERR, "GPU #%d: FATAL: failed to allocate device memory for the hash state variable", thr_id);
exit(1);
}
cryptonight_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
Expand All @@ -53,7 +65,7 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata,
do {
uint32_t foundNonce = 0xFFFFFFFF;

cryptonight_cpu_hash(thr_id, throughput, nonce, &foundNonce, d_long_state[thr_id], d_hash_state[thr_id]);
cryptonight_cpu_hash(thr_id, cn_blocks, cn_threads, nonce, &foundNonce, d_long_state[thr_id], d_hash_state[thr_id]);

if (foundNonce < 0xffffffff)
{
Expand Down
21 changes: 12 additions & 9 deletions cryptonight/cuda_cryptonight.cu
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "cryptonight.h"

extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#ifndef _WIN32
#include <unistd.h>
#endif

typedef unsigned char BitSequence;
typedef unsigned long long DataLength;

extern int opt_cn_threads;
extern int opt_cn_blocks;

__constant__ uint32_t pTarget[8];
__constant__ uint32_t d_input[19];
extern uint32_t *d_resultNonce[8];
Expand Down Expand Up @@ -250,16 +251,18 @@ __host__ void cryptonight_cpu_init(int thr_id, int threads)
cudaMemcpyToSymbol( d_E8_rc, h_E8_rc, sizeof(h_E8_rc), 0, cudaMemcpyHostToDevice);
}

__host__ void cryptonight_cpu_hash(int thr_id, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state)
__host__ void cryptonight_cpu_hash(int thr_id, int blocks, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state)
{
dim3 grid(opt_cn_blocks);
dim3 block(opt_cn_threads);
dim3 grid(blocks);
dim3 block(threads);

size_t shared_size = 1024;

cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
cryptonight_gpu_hash<<<grid, block, shared_size>>>(threads, startNonce, d_resultNonce[thr_id], d_long_state, d_hash_state);
MyStreamSynchronize(NULL, 0, thr_id);
cryptonight_gpu_hash<<<grid, block, shared_size>>>(blocks*threads, startNonce, d_resultNonce[thr_id], d_long_state, d_hash_state);

cudaDeviceSynchronize();

cudaMemcpy(nonce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
}

4 changes: 3 additions & 1 deletion cryptonight/cuda_cryptonight_aes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,10 @@ static void cn_aes_cpu_init()

__device__ __forceinline__ void cn_aes_gpu_init(uint32_t *sharedMemory)
{
int i, x;
int x;

// TODO: actually spread the shared memory loading between more or less all threads
// instead of just using the first four to do it...
if(threadIdx.x < 4) {

for( x = 0; x < 256; x++ ) {
Expand Down

0 comments on commit dd83f59

Please sign in to comment.