diff --git a/cpu-miner.c b/cpu-miner.c
index 439d542..f8b9344 100755
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -63,6 +63,7 @@ int cuda_finddevice(char *name);
 #endif
 
 extern void cryptonight_hash(void* output, const void* input, size_t len);
+void parse_device_config( char *config, int *blocks, int *threads );
 
 #ifdef __linux /* Linux specific policy and affinity management */
 #include <sched.h>
@@ -178,6 +179,7 @@ uint16_t opt_vote = 9999;
 static int num_processors;
 int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
 char *device_name[8]; // CB
+int device_config[8][2];
 static char *rpc_url;
 static char *rpc_userpass;
 static char *rpc_user, *rpc_pass;
@@ -241,8 +243,15 @@ Options:\n\
                         string names of your cards like gtx780ti or gt640#2\n\
                         (matching 2nd gt640 in the PC)\n\
   -f, --diff            Divide difficulty by this factor (std is 1) \n\
-  -k, --launch=CONFIG   launch config for the Cryptonight kernel\n\
-                          THREADSxBLOCKS (default: 8x40)\n\
+  -l, --launch=CONFIG   launch config for the Cryptonight kernel.\n\
+                        a comma separated list of values in form of\n\
+                        AxB where A is the number of threads to run in\n\
+                        each thread block and B is the number of thread\n\
+                        blocks to launch. If less values than devices in use\n\
+                        are provided, the last value will be used for\n\
+                        the remaining devices. If you don't need to vary the\n\
+                        value between devices, you can just enter a single value\n\
+                        and it will be used for all devices. (default: 8x40)\n\
   -v, --vote=VOTE       block reward vote (for HeavyCoin)\n\
   -m, --trust-pool      trust the max block reward vote (maxvote) sent by the pool\n\
   -o, --url=URL         URL of mining server\n\
@@ -1129,6 +1138,12 @@ static void *miner_thread(void *userdata)
 		affine_to_cpu(thr_id, thr_id % num_processors);
 	}
 
+    if( opt_algo == ALGO_CRYPTONIGHT ) {
+
+        applog(LOG_INFO, "GPU #%d: %s, using %d blocks of %d threads",
+            device_map[thr_id], device_name[thr_id], device_config[thr_id][0], device_config[thr_id][1]);
+    }
+
     uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + (jsonrpc_2 ? 39 : 76));
 
 	while (1) {
@@ -1573,6 +1588,41 @@ static void show_usage_and_exit(int status)
 	exit(status);
 }
 
+void parse_device_config( char *config, int *blocks, int *threads )
+{
+    char *p;
+    int tmp_blocks, tmp_threads;
+    
+    if( config == NULL ) goto usedefault;
+
+
+    p = strtok(config, "x");
+    if(!p)
+        goto usedefault;
+    
+    tmp_threads = atoi(p);
+    if( tmp_threads < 4 || tmp_threads > 1024 )
+        goto usedefault;
+    
+    p = strtok(NULL, "x");
+    if(!p)
+        goto usedefault;
+
+    tmp_blocks = atoi(p);
+    if( tmp_blocks < 1 )
+        goto usedefault;
+
+    *blocks = tmp_blocks;
+    *threads = tmp_threads;
+    return;
+
+usedefault:
+    *blocks = opt_cn_blocks;
+    *threads = opt_cn_threads;
+    return;
+
+}
+
 static void parse_arg (int key, char *arg)
 {
 	char *p;
@@ -1783,19 +1833,27 @@ static void parse_arg (int key, char *arg)
 		opt_difficulty = d;
 		break;
 	case 'l':			/* cryptonight launch config */
-        p = strtok(arg, "x");
-		if (!p) show_usage_and_exit(1);
-        opt_cn_threads = atoi(p);
-        if( opt_cn_threads < 4 || opt_cn_threads > 1024 ) {
-            applog(LOG_ERR, "Invalid value for threads per block, must be between 4 and 1024");
-            show_usage_and_exit(1);
-        }
-        p = strtok(NULL, "x");
-		if (!p) show_usage_and_exit(1);
-        opt_cn_blocks = atoi(p);
-        if( opt_cn_blocks < 1 ) {
-            applog(LOG_ERR, "Invalid value for thread blocks, needs to be at least 1");
-            show_usage_and_exit(1);
+        {
+            char *tmp_config[8];
+            int tmp_blocks = opt_cn_blocks, tmp_threads = opt_cn_threads;
+            for( i = 0; i < 8; i++ ) tmp_config[i] = NULL;
+            p = strtok(arg, ",");
+            if( p == NULL ) show_usage_and_exit(1);
+            i = 0;
+            while( p != NULL && i < 8 ) {
+                tmp_config[i++] = strdup(p);
+                p = strtok(NULL, ",");
+            }
+            while (i < 8) {
+                tmp_config[i] = strdup(tmp_config[i-1]);
+                i++;
+            }
+
+            for( i = 0; i < 8; i++ ) {
+                parse_device_config(tmp_config[i], &tmp_blocks, &tmp_threads);
+                device_config[i][0] = tmp_blocks;
+                device_config[i][1] = tmp_threads;
+            }
         }
         break;
 
@@ -1921,7 +1979,12 @@ int main(int argc, char *argv[])
 	pthread_mutex_init(&applog_lock, NULL);
 	num_processors = cuda_num_devices();
 
-	/* parse command line */
+	for(i = 0; i < 8; i++) {
+        device_config[i][0] = opt_cn_blocks;
+        device_config[i][1] = opt_cn_threads;
+    }
+
+    /* parse command line */
 	parse_cmdline(argc, argv);
 
 	cuda_devicenames();
@@ -1929,7 +1992,6 @@ int main(int argc, char *argv[])
     if(opt_algo == ALGO_CRYPTONIGHT) {
         jsonrpc_2 = true;
         applog(LOG_INFO, "Using JSON-RPC 2.0");
-        applog(LOG_INFO, "Using %d blocks of %d threads for cryptonight kernel", opt_cn_blocks, opt_cn_threads);
     }
 
 	if (!opt_benchmark && !rpc_url) {
@@ -1988,7 +2050,7 @@ int main(int argc, char *argv[])
 		openlog("cpuminer", LOG_PID, LOG_USER);
 #endif
 
-	work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart));
+    work_restart = (struct work_restart *)calloc(opt_n_threads, sizeof(*work_restart));
 	if (!work_restart)
 		return 1;
 
diff --git a/cryptonight/cryptonight.cu b/cryptonight/cryptonight.cu
index d72da40..462aee4 100755
--- a/cryptonight/cryptonight.cu
+++ b/cryptonight/cryptonight.cu
@@ -1,5 +1,8 @@
 #include <unistd.h>
 #include <stdio.h>
+#include <stdint.h>
+#include "cuda.h"
+#include "cuda_runtime.h"
 
 extern "C"
 {
@@ -9,17 +12,16 @@ extern "C"
 }
 
 extern int device_map[8];
+extern int device_config[8][2];
 
 static uint8_t *d_long_state[8];
 static union cn_gpu_hash_state *d_hash_state[8];
 
 extern bool opt_benchmark;
-extern int opt_cn_threads;
-extern int opt_cn_blocks;
 
 extern void cryptonight_cpu_init(int thr_id, int threads);
 extern void cryptonight_cpu_setInput(int thr_id, void *data, void *pTargetIn);
-extern void cryptonight_cpu_hash(int thr_id, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state);
+extern void cryptonight_cpu_hash(int thr_id, int blocks, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state);
 
 extern "C" void cryptonight_hash(void* output, const void* input, size_t len);
 
@@ -30,20 +32,30 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata,
     uint32_t *nonceptr = (uint32_t*)(((char*)pdata) + 39);
     const uint32_t first_nonce = *nonceptr;
     uint32_t nonce = *nonceptr;
+    int cn_blocks = device_config[thr_id][0], cn_threads = device_config[thr_id][1];
 
 	if (opt_benchmark) {
 		((uint32_t*)ptarget)[7] = 0x0000ff;
         pdata[17] = 0;
     }
 	const uint32_t Htarg = ptarget[7];
-	const int throughput = opt_cn_threads * opt_cn_blocks;
+	const int throughput = cn_threads * cn_blocks;
+    const size_t alloc = MEMORY * throughput;
 
     static bool init[8] = { false, false, false, false, false, false, false, false };
 	if (!init[thr_id])
 	{
         cudaSetDevice(device_map[thr_id]);
-		cudaMalloc(&d_long_state[thr_id], MEMORY * throughput);
-		cudaMalloc(&d_hash_state[thr_id], sizeof(union cn_gpu_hash_state) * throughput);
+        cudaDeviceReset();
+        cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		if( cudaMalloc(&d_long_state[thr_id], alloc) != cudaSuccess ) {
+            applog(LOG_ERR, "GPU #%d: FATAL: failed to allocate device memory for the long state variable", thr_id);
+            exit(1);
+        }
+		if( cudaMalloc(&d_hash_state[thr_id], sizeof(union cn_gpu_hash_state) * throughput) != cudaSuccess ) {
+            applog(LOG_ERR, "GPU #%d: FATAL: failed to allocate device memory for the hash state variable", thr_id);
+            exit(1);
+        }
 		cryptonight_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
@@ -53,7 +65,7 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata,
 	do {
         uint32_t foundNonce = 0xFFFFFFFF;
 
-        cryptonight_cpu_hash(thr_id, throughput, nonce, &foundNonce, d_long_state[thr_id], d_hash_state[thr_id]);
+        cryptonight_cpu_hash(thr_id, cn_blocks, cn_threads, nonce, &foundNonce, d_long_state[thr_id], d_hash_state[thr_id]);
 
         if (foundNonce < 0xffffffff)
 		{
diff --git a/cryptonight/cuda_cryptonight.cu b/cryptonight/cuda_cryptonight.cu
index 0c8dffc..82f4aa7 100755
--- a/cryptonight/cuda_cryptonight.cu
+++ b/cryptonight/cuda_cryptonight.cu
@@ -1,16 +1,17 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
+#include "cuda.h"
+#include "cuda_runtime.h"
 #include "cryptonight.h"
 
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+#ifndef _WIN32
+#include <unistd.h>
+#endif
 
 typedef unsigned char BitSequence;
 typedef unsigned long long DataLength;
 
-extern int opt_cn_threads;
-extern int opt_cn_blocks;
-
 __constant__ uint32_t pTarget[8];
 __constant__ uint32_t d_input[19];
 extern uint32_t *d_resultNonce[8];
@@ -250,16 +251,18 @@ __host__ void cryptonight_cpu_init(int thr_id, int threads)
     cudaMemcpyToSymbol( d_E8_rc, h_E8_rc, sizeof(h_E8_rc), 0, cudaMemcpyHostToDevice);
 }
 
-__host__ void cryptonight_cpu_hash(int thr_id, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state)
+__host__ void cryptonight_cpu_hash(int thr_id, int blocks, int threads, uint32_t startNonce, uint32_t *nonce, uint8_t *d_long_state, union cn_gpu_hash_state *d_hash_state)
 {
-    dim3 grid(opt_cn_blocks);
-    dim3 block(opt_cn_threads);
+    dim3 grid(blocks);
+    dim3 block(threads);
 
     size_t shared_size = 1024;
 
     cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-    cryptonight_gpu_hash<<<grid, block, shared_size>>>(threads, startNonce, d_resultNonce[thr_id], d_long_state, d_hash_state);
-    MyStreamSynchronize(NULL, 0, thr_id);
+    cryptonight_gpu_hash<<<grid, block, shared_size>>>(blocks*threads, startNonce, d_resultNonce[thr_id], d_long_state, d_hash_state);
+
+    cudaDeviceSynchronize();
+
     cudaMemcpy(nonce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
 
diff --git a/cryptonight/cuda_cryptonight_aes.cu b/cryptonight/cuda_cryptonight_aes.cu
index 39b93df..6a3d5cb 100755
--- a/cryptonight/cuda_cryptonight_aes.cu
+++ b/cryptonight/cuda_cryptonight_aes.cu
@@ -164,8 +164,10 @@ static void cn_aes_cpu_init()
 
 __device__ __forceinline__ void cn_aes_gpu_init(uint32_t *sharedMemory)
 {
-	int i, x;
+	int x;
 
+    // TODO: actually spread the shared memory loading between more or less all threads
+    // instead of just using the first four to do it...
     if(threadIdx.x < 4) {
 		
         for( x = 0; x < 256; x++ ) {