Skip to content
Slava Krutelyov edited this page May 23, 2018 · 18 revisions

Welcome to the mkfit-hackathon wiki!

To disable optimization in nvcc: -O0 -Xcicc -O0 -Xptxas -O0

nvprof commands:

nvprof   ./multorture
nvprof  --print-gpu-trace ./multorture
time nvprof --metrics flop_count_sp,flop_sp_efficiency,shared_store_transactions,shared_load_transactions,local_load_transactions,local_store_transactions,gld_transactions,gst_transactions,gld_throughput,gst_throughput,l2_read_transactions,l2_write_transactions,l2_utilization,l1_cache_global_hit_rate,l1_shared_utilization,l2_l1_read_hit_rate ./multorture
nvprof --events warps_launched,local_load --metrics ipc,ipc_instance,inst_executed,inst_fp_32,inst_integer,inst_per_warp ./multorture

Sample code

__global__ void raw_reg_c_mult_loop_kn(const float* const a, const float* const b, 
    float* c, const int N)
{

  int nN = 1000;
  for (int oLoop = 0; oLoop< nN; ++oLoop){
    for (int n = threadIdx.x + blockIdx.x * blockDim.x;
         n < N;
         n += blockDim.x * gridDim.x) {
      
      float a_ar[36];
      float b_ar[36];
      for (int i = 0; i < 36; ++i){
        const int idx = n + N*i;
        a_ar[i] = a[idx];
        b_ar[i] = b[idx];
      }
      for (int i = 0; i < 6; ++i) {
        for (int j = 0; j < 6; ++j) {
          float c_tmp = 0.f;
          for (int k = 0; k < 6; ++k) {
            c_tmp += a_ar[i + 6*k] * b_ar[k + 6*j];
          }
          c[n + N*(i + 6*j)] = c_tmp;
        }
      }
    }
  }//oLoop< nN; ++oLoop){
}

Some extracted numbers with numerology

N 7168 (64)
.. 1032192 bytes in 1 matrix
1.8142ms         1  1.8142ms  1.8142ms  1.8142ms  raw_reg_c_mult_loop_kn(float const *, float const *, float*, int)
428.68ms  1.8094ms            (112 1 1)        (64 1 1)        96        0B        0B         -           -           -           -  Tesla P100-SXM2         1         7  raw_reg_c_mult_loop_kn(float const *, float const *, float*, int) [208]
          1                             flop_count_sp   Floating Point Operations(Single Precision)  3,096,576,000  3096576000  3096576000
.. 1.7 TFLOPS; vs 9.5 specs (same percentage -> OK)
          1                        flop_sp_efficiency                  FLOP Efficiency(Peak Single)      17.72%      17.72%      17.72%
          1                 shared_store_transactions                     Shared Store Transactions           0           0           0
          1                  shared_load_transactions                      Shared Load Transactions           0           0           0
          1                   local_load_transactions                       Local Load Transactions           0           0           0
          1                  local_store_transactions                      Local Store Transactions           0           0           0
          1                          gld_transactions                      Global Load Transactions   129024000   129024000   129024000
.. 129024000 = 1000* 7168* 18 -> 18 loads per 72 floats => 16 usable bytes per load
          1                          gst_transactions                     Global Store Transactions    32256000    32256000    32256000
.. 32256000 = 1000* 7168* 4.5 -> 4.5 stores per 36 floats => 32 usable bytes per store 
          1                            gld_throughput                        Global Load Throughput  2307.3GB/s  2307.3GB/s  2307.3GB/s
.. vs 720.9 GB/s HBM bandwidth ==> looks like caches are actually in use
.. 4.19 GB per call; 32 bytes per load transaction
          1                            gst_throughput                       Global Store Throughput  576.82GB/s  576.82GB/s  576.82GB/s
.. 1.05 GB per call; 32 bytes per store transaction
          1                      l2_read_transactions                          L2 Read Transactions    63539508    63539508    63539508
.. l2_read/gld = 0.49
          1                     l2_write_transactions                         L2 Write Transactions    32256013    32256013    32256013
          1                            l2_utilization                          L2 Cache Utilization    High (7)    High (7)    High (7)
          1                                       ipc                              Executed IPC    1.348572    1.348572    1.348572
          1                             inst_executed                     Instructions Executed   174,497,792   174497792   174497792
          1                                inst_fp_32                   FP Instructions(Single)  1,548,288,000  1548288000  1548288000
          1                              inst_integer                      Integer Instructions  1,928,213,504  1928213504  1928213504
          1                             inst_per_warp                     Instructions per warp  7.7901e+05  7.7901e+05  7.7901e+05
          1                            warps_launched         224         224         224         224
Clone this wiki locally