openucx · ivankochin · Nov 20, 2024 · rakhmets · Nov 21, 2024 · rakhmets
diff --git a/config/ucx.conf b/config/ucx.conf
@@ -3,10 +3,9 @@ CPU model=Grace
 UCX_REG_NONBLOCK_MEM_TYPES=host,cuda-managed
 UCX_IB_ODP_MEM_TYPES=host,cuda-managed
 UCX_IB_MLX5_DEVX_OBJECTS=
-UCX_GDR_COPY_BW=0MBs,get_dedicated:30GBs,put_dedicated:30GBs
-# Real latency is around 30ns, rest is gdrcopy rcache overhead
-# TODO: Add gdrcopy rcache overhead as separate performance graph node
-UCX_GDR_COPY_LAT=200ns
+UCX_GDR_COPY_BW=0MBs,get_dedicated:30GBs,put_dedicated:30GBs
+UCX_GDR_COPY_LAT=30ns
+UCX_GDR_COPY_RCACHE_OVERHEAD=170ns
 UCX_DISTANCE_BW=auto,sys:16500MBs
 UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
 
@@ -27,10 +26,6 @@ UCX_DISTANCE_BW=auto,sys:5100MBs
 [AMD Milan]
 CPU model=Milan
 UCX_DISTANCE_BW=auto,sys:5100MBs
-# Real latencies are around 1.4 and 0.4, rest is gdrcopy rcache overhead
-# TODO: Add gdrcopy rcache overhead as separate performance graph node
-# TODO: Add rcache overhead not only for Milan and GH systems
-UCX_GDR_COPY_LAT=get:1.65e-6,put:0.65e-6
 
 [AMD Genoa]
 CPU model=Genoa

diff --git a/src/uct/cuda/gdr_copy/gdr_copy_iface.c b/src/uct/cuda/gdr_copy/gdr_copy_iface.c
@@ -46,6 +46,10 @@ static ucs_config_field_t uct_gdr_copy_iface_config_table[] = {
          ucs_offsetof(uct_gdr_copy_iface_config_t, put_latency)},
         {NULL})},
 
+    {"RCACHE_OVERHEAD", "250ns",
+     "gdr_copy regions rcache lookup estimated overhead",
+     ucs_offsetof(uct_gdr_copy_iface_config_t, rcache_overhead), UCS_CONFIG_TYPE_TIME},
+
     {NULL}
 };
 
@@ -137,7 +141,7 @@ uct_gdr_copy_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr)
                                               iface->config.put_bw.dedicated);
     iface_attr->bandwidth.shared    = ucs_max(iface->config.get_bw.shared,
                                               iface->config.put_bw.shared);
-    iface_attr->overhead            = UCT_GDR_COPY_IFACE_OVERHEAD;
+    iface_attr->overhead            = 0;
     iface_attr->priority            = 0;
 
     return UCS_OK;
@@ -158,7 +162,8 @@ uct_gdr_copy_estimate_perf(uct_iface_h tl_iface, uct_perf_attr_t *perf_attr)
     }
 
     if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_SEND_PRE_OVERHEAD) {
-        perf_attr->send_pre_overhead = UCT_GDR_COPY_IFACE_OVERHEAD;
+        perf_attr->send_pre_overhead = UCT_GDR_COPY_IFACE_OVERHEAD +
+                                       iface->config.rcache_ovh;
     }
 
     if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_SEND_POST_OVERHEAD) {
@@ -240,6 +245,7 @@ static UCS_CLASS_INIT_FUNC(uct_gdr_copy_iface_t, uct_md_h md, uct_worker_h worke
     self->config.put_bw      = gdr_config->put_bw;
     self->config.get_latency = ucs_linear_func_make(gdr_config->get_latency, 0);
     self->config.put_latency = ucs_linear_func_make(gdr_config->put_latency, 0);
+    self->config.rcache_ovh  = gdr_config->rcache_overhead;
 
     return UCS_OK;
 }

diff --git a/src/uct/cuda/gdr_copy/gdr_copy_iface.h b/src/uct/cuda/gdr_copy/gdr_copy_iface.h
@@ -20,6 +20,7 @@ typedef struct uct_gdr_copy_iface {
         uct_ppn_bandwidth_t   put_bw;
         ucs_linear_func_t     get_latency;
         ucs_linear_func_t     put_latency;
+        double                rcache_ovh;
     } config;
 } uct_gdr_copy_iface_t;
 
@@ -30,6 +31,7 @@ typedef struct uct_gdr_copy_iface_config {
     uct_ppn_bandwidth_t put_bw;
     double              get_latency;
     double              put_latency;
+    double              rcache_overhead;
 } uct_gdr_copy_iface_config_t;
 
 #endif