flame · jeffhammond · May 3, 2022 · May 3, 2022 · May 3, 2022 · May 3, 2022
diff --git a/frame/include/blis.h b/frame/include/blis.h
@@ -84,6 +84,7 @@ extern "C" {
 
 #include "bli_thread.h"
 #include "bli_pthread.h"
+#include "bli_affinity.h"
 
 
 // -- Constant definitions --

diff --git a/frame/thread/bli_affinity.c b/frame/thread/bli_affinity.c
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022 NVIDIA
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// _GNU_SOURCE must be defined before any header is included so that
+// <sched.h> exposes cpu_set_t, the CPU_* macros and sched_getaffinity().
+#define _GNU_SOURCE
+
+#include "bli_affinity.h"
+
+// we need a way to detect oversubscription of the kind where
+// hierarchical parallelism is used and the affinity mask within
+// which BLIS runs does not have enough hardware threads to support
+// the requested software threads.
+//
+// this is motivated by, or related to:
+//    https://github.com/flame/blis/issues/588
+//    https://github.com/flame/blis/pull/607
+//    https://github.com/flame/blis/issues/604
+//    https://github.com/flame/blis/issues/603 
+
+#ifndef BLIS_OS_LINUX
+
+// define the symbol for platforms like Windows and MacOS that do not support the Linux affinity API
+
+dim_t bli_affinity_get_hw_size(bli_affinity_scope_t scope)
+{
+    // No affinity API here: report the maximum, i.e. an unconstrained scope.
+    const dim_t unconstrained_hw_size = 1024;
+    return unconstrained_hw_size;
+}
+
+#else // BLIS_OS_LINUX
+
+#include <sched.h>
+#include <unistd.h>
+
+// scope is either the calling process or the calling thread:
+//  0 = calling process
+//  1 = calling thread
+
+dim_t bli_affinity_get_hw_size(bli_affinity_scope_t scope)
+{
+    // Returns the number of CPUs set in the affinity mask queried for the
+    // given scope; calls bli_print_msg and bli_abort if the query fails.
+    int rc;
+    int active_cpus;
+    pid_t pid;
+    cpu_set_t mask;
+
+    if (scope == process) {
+        pid = getpid();
+    } else {
+        // a pid of zero selects the calling thread
+        pid = 0;
+    }
+
+    CPU_ZERO(&mask);
+
+    // if the CPU mask is larger than 1024 bits, this needs to change.
+    // see https://man7.org/linux/man-pages/man2/sched_getaffinity.2.html for details.
+    rc = sched_getaffinity(pid, sizeof(cpu_set_t), &mask);
+    if (rc) {
+        bli_print_msg( "sched_getaffinity failed",
+                       __FILE__, __LINE__ );
+        bli_abort();
+    }
+
+    // sizeof(cpu_set_t) is the mask size in bytes, not the number of CPU
+    // bits it holds, so it must not bound a CPU_ISSET loop. CPU_COUNT
+    // counts every set bit in the mask.
+    active_cpus = CPU_COUNT(&mask);
+    return active_cpus;
+}
+
+#endif // BLIS_OS_LINUX
diff --git a/frame/thread/bli_affinity.h b/frame/thread/bli_affinity.h
@@ -0,0 +1,44 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022 NVIDIA
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_AFFINITY_H
+#define BLIS_AFFINITY_H
+
+#include "blis.h"
+
+typedef enum { process = 0, thread = 1 } bli_affinity_scope_t; // process = calling process, thread = calling thread
+// reports how many CPUs the affinity mask of the given scope allows (always 1024 without the Linux affinity API).
+dim_t bli_affinity_get_hw_size(bli_affinity_scope_t scope);
+
+#endif // BLIS_AFFINITY_H
diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c
@@ -6,6 +6,7 @@
 
    Copyright (C) 2014, The University of Texas at Austin
    Copyright (C) 2018, Advanced Micro Devices, Inc.
+   Copyright (C) 2022, NVIDIA
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -199,6 +200,12 @@ void bli_l3_thread_decorator_thread_check
      )
 {
 	dim_t n_threads_real = omp_get_num_threads();
+        dim_t n_threads_hwmask;
+        if ( omp_in_parallel() ) {
+            n_threads_hwmask = bli_affinity_get_hw_size(thread);
+        } else {
+            n_threads_hwmask = bli_affinity_get_hw_size(process);
+        }
 
 	// Check if the number of OpenMP threads created within this parallel
 	// region is different from the number of threads that were requested
@@ -241,6 +248,27 @@ void bli_l3_thread_decorator_thread_check
 
 		// Synchronize all threads and continue.
 		_Pragma( "omp barrier" )
+
+                return;
+	}
+
+	// Check if the affinity mask in the calling context provides fewer
+	// hardware threads than the number of threads that were requested or
+	// that OpenMP actually created within this parallel region.
+	if ( n_threads_hwmask < n_threads || n_threads_hwmask < n_threads_real)
+	{
+                bli_print_msg( "The affinity mask on this process does not have "
+                               "enough HW threads for your requested SW threads.",
+                               __FILE__, __LINE__ );
+                bli_abort();
+
+		bli_thrcomm_init( n_threads_hwmask, gl_comm );
+		bli_rntm_set_num_threads_only( n_threads_hwmask, rntm );
+#warning HELP ME HERE
+		bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
+
+		// Synchronize all threads and continue.
+		_Pragma( "omp barrier" )
 	}
 }
 

diff --git a/test/other/test_affinity.c b/test/other/test_affinity.c
@@ -0,0 +1,112 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022 NVIDIA
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+// _GNU_SOURCE must be defined before any header is included so that
+// <sched.h> exposes cpu_set_t, the CPU_* macros and sched_setaffinity().
+#define _GNU_SOURCE
+
+#include <sched.h>
+#include <unistd.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cblas.h>
+
+int main(void)
+{
+    // Runs a small DGEMM, halves the affinity mask obtained for this pid,
+    // then runs the same DGEMM again under the reduced mask.
+    const int m=10, n=10, k=10;
+    double A[100], B[100], C[100];
+
+    for (int i=0; i<100; i++) {
+        A[i] = B[i] = C[i] = 1.0;
+    }
+
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                m, n, k, 1.0, A, k, B, n, 1.0, C, n);
+
+    {
+        int rc;
+        pid_t pid = getpid();
+        cpu_set_t old_mask, new_mask;
+        int active_cpus;
+
+        CPU_ZERO(&old_mask);
+
+        rc = sched_getaffinity(pid, sizeof(cpu_set_t), &old_mask);
+        if (rc) {
+            printf("sched_getaffinity returned %d\n", rc);
+            abort();
+        }
+
+        // CPU_COUNT looks at every bit of the mask; sizeof(cpu_set_t) is a
+        // byte count and must not be used as a bound on CPU indices.
+        active_cpus = CPU_COUNT(&old_mask);
+        printf("active CPUs before = %d\n", active_cpus);
+
+        // keep the first half of the allowed CPUs, but never fewer than one,
+        // because sched_setaffinity rejects an empty mask.
+        int keep = active_cpus / 2;
+        if (keep < 1) {
+            keep = 1;
+        }
+
+        CPU_ZERO(&new_mask);
+
+        for (int i=0, j=0; i<CPU_SETSIZE && j<keep; i++) {
+            if (CPU_ISSET(i, &old_mask)) {
+                CPU_SET(i, &new_mask);
+                j++;
+            }
+        }
+
+        active_cpus = CPU_COUNT(&new_mask);
+        printf("active CPUs after  = %d\n", active_cpus);
+
+        rc = sched_setaffinity(pid, sizeof(cpu_set_t), &new_mask);
+        if (rc) {
+            printf("sched_setaffinity returned %d\n", rc);
+            abort();
+        }
+
+        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                    m, n, k, 1.0, A, k, B, n, 1.0, C, n);
+
+        printf("AFTER\n");
+
+    }
+    return 0;
+}