From bb53af80dcc505fd630c9e8bf397febb6e98e30e Mon Sep 17 00:00:00 2001 From: Ivan Kochin Date: Wed, 20 Nov 2024 13:57:59 +0000 Subject: [PATCH] UCT/GDRCOPY: rcache lookup config variable --- config/ucx.conf | 11 +++-------- src/uct/cuda/gdr_copy/gdr_copy_iface.c | 10 ++++++++-- src/uct/cuda/gdr_copy/gdr_copy_iface.h | 2 ++ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/config/ucx.conf b/config/ucx.conf index 58a91affa99..b3080a21327 100644 --- a/config/ucx.conf +++ b/config/ucx.conf @@ -3,10 +3,9 @@ CPU model=Grace UCX_REG_NONBLOCK_MEM_TYPES=host,cuda-managed UCX_IB_ODP_MEM_TYPES=host,cuda-managed UCX_IB_MLX5_DEVX_OBJECTS= -UCX_GDR_COPY_BW=0MBs,get_dedicated:30GBs,put_dedicated:30GBs -# Real latency is around 30ns, rest is gdrcopy rcache overhead -# TODO: Add gdrcopy rcache overhead as separate performance graph node -UCX_GDR_COPY_LAT=200ns +UCX_GDR_COPY_BW=0MBs,get_dedicated:30GBs,put_dedicated:30GBs +UCX_GDR_COPY_LAT=30ns +UCX_GDR_COPY_RCACHE_OVERHEAD=170ns UCX_DISTANCE_BW=auto,sys:16500MBs UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda @@ -27,10 +26,6 @@ UCX_DISTANCE_BW=auto,sys:5100MBs [AMD Milan] CPU model=Milan UCX_DISTANCE_BW=auto,sys:5100MBs -# Real latencies are around 1.4 and 0.4, rest is gdrcopy rcache overhead -# TODO: Add gdrcopy rcache overhead as separate performance graph node -# TODO: Add rcache overhead not only for Milan and GH systems -UCX_GDR_COPY_LAT=get:1.65e-6,put:0.65e-6 [AMD Genoa] CPU model=Genoa diff --git a/src/uct/cuda/gdr_copy/gdr_copy_iface.c b/src/uct/cuda/gdr_copy/gdr_copy_iface.c index b6634080a5b..69443eab25e 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_iface.c +++ b/src/uct/cuda/gdr_copy/gdr_copy_iface.c @@ -46,6 +46,10 @@ static ucs_config_field_t uct_gdr_copy_iface_config_table[] = { ucs_offsetof(uct_gdr_copy_iface_config_t, put_latency)}, {NULL})}, + {"RCACHE_OVERHEAD", "250ns", + "gdr_copy regions rcache lookup estimated overhead", + ucs_offsetof(uct_gdr_copy_iface_config_t, rcache_overhead), UCS_CONFIG_TYPE_TIME}, + 
{NULL} }; @@ -137,7 +141,7 @@ uct_gdr_copy_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) iface->config.put_bw.dedicated); iface_attr->bandwidth.shared = ucs_max(iface->config.get_bw.shared, iface->config.put_bw.shared); - iface_attr->overhead = UCT_GDR_COPY_IFACE_OVERHEAD; + iface_attr->overhead = 0; iface_attr->priority = 0; return UCS_OK; @@ -158,7 +162,8 @@ uct_gdr_copy_estimate_perf(uct_iface_h tl_iface, uct_perf_attr_t *perf_attr) } if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_SEND_PRE_OVERHEAD) { - perf_attr->send_pre_overhead = UCT_GDR_COPY_IFACE_OVERHEAD; + perf_attr->send_pre_overhead = UCT_GDR_COPY_IFACE_OVERHEAD + + iface->config.rcache_ovh; } if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_SEND_POST_OVERHEAD) { @@ -240,6 +245,7 @@ static UCS_CLASS_INIT_FUNC(uct_gdr_copy_iface_t, uct_md_h md, uct_worker_h worke self->config.put_bw = gdr_config->put_bw; self->config.get_latency = ucs_linear_func_make(gdr_config->get_latency, 0); self->config.put_latency = ucs_linear_func_make(gdr_config->put_latency, 0); + self->config.rcache_ovh = gdr_config->rcache_overhead; return UCS_OK; } diff --git a/src/uct/cuda/gdr_copy/gdr_copy_iface.h b/src/uct/cuda/gdr_copy/gdr_copy_iface.h index fb97302e28b..60623f93286 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_iface.h +++ b/src/uct/cuda/gdr_copy/gdr_copy_iface.h @@ -20,6 +20,7 @@ typedef struct uct_gdr_copy_iface { uct_ppn_bandwidth_t put_bw; ucs_linear_func_t get_latency; ucs_linear_func_t put_latency; + double rcache_ovh; } config; } uct_gdr_copy_iface_t; @@ -30,6 +31,7 @@ typedef struct uct_gdr_copy_iface_config { uct_ppn_bandwidth_t put_bw; double get_latency; double put_latency; + double rcache_overhead; } uct_gdr_copy_iface_config_t; #endif