diff --git a/cpu-miner.c b/cpu-miner.c index 439d542..f8b9344 100755 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -63,6 +63,7 @@ int cuda_finddevice(char *name); #endif extern void cryptonight_hash(void* output, const void* input, size_t len); +void parse_device_config( char *config, int *blocks, int *threads ); #ifdef __linux /* Linux specific policy and affinity management */ #include @@ -178,6 +179,7 @@ uint16_t opt_vote = 9999; static int num_processors; int device_map[8] = {0,1,2,3,4,5,6,7}; // CB char *device_name[8]; // CB +int device_config[8][2]; static char *rpc_url; static char *rpc_userpass; static char *rpc_user, *rpc_pass; @@ -241,8 +243,15 @@ Options:\n\ string names of your cards like gtx780ti or gt640#2\n\ (matching 2nd gt640 in the PC)\n\ -f, --diff Divide difficulty by this factor (std is 1) \n\ - -k, --launch=CONFIG launch config for the Cryptonight kernel\n\ - THREADSxBLOCKS (default: 8x40)\n\ + -l, --launch=CONFIG launch config for the Cryptonight kernel.\n\ + a comma separated list of values in form of\n\ + AxB where A is the number of threads to run in\n\ + each thread block and B is the number of thread\n\ + blocks to launch. If less values than devices in use\n\ + are provided, the last value will be used for\n\ + the remaining devices. If you don't need to vary the\n\ + value between devices, you can just enter a single value\n\ + and it will be used for all devices. (default: 8x40)\n\ -v, --vote=VOTE block reward vote (for HeavyCoin)\n\ -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ -o, --url=URL URL of mining server\n\ @@ -1129,6 +1138,12 @@ static void *miner_thread(void *userdata) affine_to_cpu(thr_id, thr_id % num_processors); } + if( opt_algo == ALGO_CRYPTONIGHT ) { + + applog(LOG_INFO, "GPU #%d: %s, using %d blocks of %d threads", + device_map[thr_id], device_name[thr_id], device_config[thr_id][0], device_config[thr_id][1]); + } + uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + (jsonrpc_2 ? 39 : 76)); while (1) { @@ -1573,6 +1588,41 @@ static void show_usage_and_exit(int status) exit(status); } +void parse_device_config( char *config, int *blocks, int *threads ) +{ + char *p; + int tmp_blocks, tmp_threads; + + if( config == NULL ) goto usedefault; + + + p = strtok(config, "x"); + if(!p) + goto usedefault; + + tmp_threads = atoi(p); + if( tmp_threads < 4 || tmp_threads > 1024 ) + goto usedefault; + + p = strtok(NULL, "x"); + if(!p) + goto usedefault; + + tmp_blocks = atoi(p); + if( tmp_blocks < 1 ) + goto usedefault; + + *blocks = tmp_blocks; + *threads = tmp_threads; + return; + +usedefault: + *blocks = opt_cn_blocks; + *threads = opt_cn_threads; + return; + +} + static void parse_arg (int key, char *arg) { char *p; @@ -1783,19 +1833,27 @@ static void parse_arg (int key, char *arg) opt_difficulty = d; break; case 'l': /* cryptonight launch config */ - p = strtok(arg, "x"); - if (!p) show_usage_and_exit(1); - opt_cn_threads = atoi(p); - if( opt_cn_threads < 4 || opt_cn_threads > 1024 ) { - applog(LOG_ERR, "Invalid value for threads per block, must be between 4 and 1024"); - show_usage_and_exit(1); - } - p = strtok(NULL, "x"); - if (!p) show_usage_and_exit(1); - opt_cn_blocks = atoi(p); - if( opt_cn_blocks < 1 ) { - applog(LOG_ERR, "Invalid value for thread blocks, needs to be at least 1"); - show_usage_and_exit(1); + { + char *tmp_config[8]; + int tmp_blocks = opt_cn_blocks, tmp_threads = opt_cn_threads; + for( i = 0; i < 8; i++ ) tmp_config[i] = NULL; + p = strtok(arg, ","); + if( p == NULL ) show_usage_and_exit(1); + i = 0; + while( p != NULL && i < 8 ) { + tmp_config[i++] = strdup(p); + p = strtok(NULL, ","); + } + while (i < 8) { + tmp_config[i] = strdup(tmp_config[i-1]); + i++; + } + + for( i = 0; i < 8; i++ ) { + parse_device_config(tmp_config[i], &tmp_blocks, &tmp_threads); + device_config[i][0] = tmp_blocks; + device_config[i][1] = tmp_threads; + } } break; @@ -1921,7 +1979,12 @@ int main(int argc, char *argv[]) pthread_mutex_init(&applog_lock, NULL); num_processors = cuda_num_devices(); - /* parse command line */ + for(i = 0; i < 8; i++) { + device_config[i][0] = opt_cn_blocks; + device_config[i][1] = opt_cn_threads; + } + + /* parse command line */ parse_cmdline(argc, argv); cuda_devicenames(); @@ -1929,7 +1992,6 @@ int main(int argc, char *argv[]) if(opt_algo == ALGO_CRYPTONIGHT) { jsonrpc_2 = true; applog(LOG_INFO, "Using JSON-RPC 2.0"); - applog(LOG_INFO, "Using %d blocks of %d threads for cryptonight kernel", opt_cn_blocks, opt_cn_threads); } if (!opt_benchmark && !rpc_url) { @@ -1988,7 +2050,7 @@ int main(int argc, char *argv[]) openlog("cpuminer", LOG_PID, LOG_USER); #endif - work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); + work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart)); if (!work_restart) return 1; diff --git a/cryptonight/cryptonight.cu b/cryptonight/cryptonight.cu index d72da40..462aee4 100755 --- a/cryptonight/cryptonight.cu +++ b/cryptonight/cryptonight.cu @@ -1,5 +1,8 @@ #include #include +#include +#include "cuda.h" +#include "cuda_runtime.h" extern "C" { @@ -9,17 +12,16 @@ extern "C" } extern int device_map[8]; +extern int device_config[8][2]; static uint8_t *d_long_state[8]; static union cn_gpu_hash_state *d_hash_state[8]; extern bool opt_benchmark; -extern int opt_cn_threads; -extern int opt_cn_blocks; extern void cryptonight_cpu_init(int thr_id, int threads); extern void cryptonight_cpu_setInput(int thr_id, void *data, void *pTargetIn); -extern void cryptonight_cpu_hash(int thr_id, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state); +extern void cryptonight_cpu_hash(int thr_id, int blocks, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state); extern "C" void cryptonight_hash(void* output, const void* input, size_t len); @@ -30,20 +32,30 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata, uint32_t *nonceptr = (uint32_t*)(((char*)pdata) + 39); const uint32_t first_nonce = *nonceptr; uint32_t nonce = *nonceptr; + int cn_blocks = device_config[thr_id][0], cn_threads = device_config[thr_id][1]; if (opt_benchmark) { ((uint32_t*)ptarget)[7] = 0x0000ff; pdata[17] = 0; } const uint32_t Htarg = ptarget[7]; - const int throughput = opt_cn_threads * opt_cn_blocks; + const int throughput = cn_threads * cn_blocks; + const size_t alloc = MEMORY * throughput; static bool init[8] = { false, false, false, false, false, false, false, false }; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); - cudaMalloc(&d_long_state[thr_id], MEMORY * throughput); - cudaMalloc(&d_hash_state[thr_id], sizeof(union cn_gpu_hash_state) * throughput); + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + if( cudaMalloc(&d_long_state[thr_id], alloc) != cudaSuccess ) { + applog(LOG_ERR, "GPU #%d: FATAL: failed to allocate device memory for the long state variable", thr_id); + exit(1); + } + if( cudaMalloc(&d_hash_state[thr_id], sizeof(union cn_gpu_hash_state) * throughput) != cudaSuccess ) { + applog(LOG_ERR, "GPU #%d: FATAL: failed to allocate device memory for the hash state variable", thr_id); + exit(1); + } cryptonight_cpu_init(thr_id, throughput); init[thr_id] = true; } @@ -53,7 +65,7 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata, do { uint32_t foundNonce = 0xFFFFFFFF; - cryptonight_cpu_hash(thr_id, throughput, nonce, &foundNonce, d_long_state[thr_id], d_hash_state[thr_id]); + cryptonight_cpu_hash(thr_id, cn_blocks, cn_threads, nonce, &foundNonce, d_long_state[thr_id], d_hash_state[thr_id]); if (foundNonce < 0xffffffff) { diff --git a/cryptonight/cuda_cryptonight.cu b/cryptonight/cuda_cryptonight.cu index 0c8dffc..82f4aa7 100755 --- a/cryptonight/cuda_cryptonight.cu +++ b/cryptonight/cuda_cryptonight.cu @@ -1,16 +1,17 @@ #include #include #include +#include "cuda.h" +#include "cuda_runtime.h" #include "cryptonight.h" -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +#ifndef _WIN32 +#include +#endif typedef unsigned char BitSequence; typedef unsigned long long DataLength; -extern int opt_cn_threads; -extern int opt_cn_blocks; - __constant__ uint32_t pTarget[8]; __constant__ uint32_t d_input[19]; extern uint32_t *d_resultNonce[8]; @@ -250,16 +251,18 @@ __host__ void cryptonight_cpu_init(int thr_id, int threads) cudaMemcpyToSymbol( d_E8_rc, h_E8_rc, sizeof(h_E8_rc), 0, cudaMemcpyHostToDevice); } -__host__ void cryptonight_cpu_hash(int thr_id, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state) +__host__ void cryptonight_cpu_hash(int thr_id, int blocks, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state) { - dim3 grid(opt_cn_blocks); - dim3 block(opt_cn_threads); + dim3 grid(blocks); + dim3 block(threads); size_t shared_size = 1024; cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); - cryptonight_gpu_hash<<>>(threads, startNonce, d_resultNonce[thr_id], d_long_state, d_hash_state); - MyStreamSynchronize(NULL, 0, thr_id); + cryptonight_gpu_hash<<>>(blocks*threads, startNonce, d_resultNonce[thr_id], d_long_state, d_hash_state); + + cudaDeviceSynchronize(); + cudaMemcpy(nonce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); } diff --git a/cryptonight/cuda_cryptonight_aes.cu b/cryptonight/cuda_cryptonight_aes.cu index 39b93df..6a3d5cb 100755 --- a/cryptonight/cuda_cryptonight_aes.cu +++ b/cryptonight/cuda_cryptonight_aes.cu @@ -164,8 +164,10 @@ static void cn_aes_cpu_init() __device__ __forceinline__ void cn_aes_gpu_init(uint32_t *sharedMemory) { - int i, x; + int x; + // TODO: actually spread the shared memory loading between more or less all threads + // instead of just using the first four to do it... if(threadIdx.x < 4) { for( x = 0; x < 256; x++ ) {