/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

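// Illustrative sketch (not part of the runtime): regardless of which
// KMPAffinity implementation is active, callers drive masks through the
// KMP_CPU_* macros, exactly as bind_thread() above does. A typical
// pin-to-one-processor sequence looks roughly like:
//
//   KMPAffinity::Mask *mask;
//   KMP_CPU_ALLOC_ON_STACK(mask);          // allocate via the active API
//   KMP_CPU_ZERO(mask);                    // clear all bits
//   KMP_CPU_SET(which, mask);              // select one OS processor
//   __kmp_set_system_affinity(mask, TRUE); // TRUE = abort on error
//   KMP_CPU_FREE_FROM_STACK(mask);
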
#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot change.
   They are carved in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#define GET_NUMBER_SMT_SETS 0x0004
extern "C" int syssmt(int flags, int, int, int *);
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to CPUs specified by the mask not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
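    // Illustrative example (not part of the runtime): on a 64-bit Windows
    // build, mask_t (ULONG_PTR) is 64 bits wide, so on a machine with two
    // processor groups of 64 logical processors each, OS proc 70 lands in
    // bit 6 of mask[1] and get_proc_group() above returns 1. If bits are set
    // in more than one group, the mask cannot be expressed as a single
    // GROUP_AFFINITY and get_proc_group() returns -1.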
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif
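
// Illustrative example (not part of the runtime): contains() treats an
// attribute as matching every constraint its argument specifies. Assuming an
// x86 hybrid build where KMP_HW_CORE_TYPE_ATOM is defined and the efficient
// cores have efficiency 0:
//
//   kmp_hw_attr_t a, b;
//   a.set_core_type(KMP_HW_CORE_TYPE_ATOM);
//   a.set_core_eff(0);
//   b.set_core_type(KMP_HW_CORE_TYPE_ATOM);
//   a.contains(b); // true: 'a' satisfies the only constraint 'b' carries
//   b.contains(a); // false: 'b' specifies no core efficiency to match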
class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios, for non-uniform topologies,
  // this ratio holds the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique, if they are
  // return true, otherwise return false
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; };
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;
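
// Worked example (illustrative, not part of the runtime): for the uniform
// topology sketched in the 'ratio' comment above,
// [ 4 packages | 6 cores / package | 2 threads / core ], the ratio array is
// {4, 6, 2} and count is {4, 24, 48}. With the package layer at level 0 and
// the thread layer at level 2, kmp_topology_t::calculate_ratio(2, 0)
// multiplies the ratios below the package layer, 6 * 2, and yields
// 12 hardware threads per package.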

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parenthesis around max to avoid accidental use of Windows max macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
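  // Illustrative sketch (not part of the runtime): the KMP_HW_SUBSET parser
  // elsewhere in the runtime builds the subset with push_back(); assuming it
  // does so one specifier at a time, a value such as KMP_HW_SUBSET=1s,4c,2t
  // would end up as three items, roughly:
  //
  //   subset->push_back(1, KMP_HW_SOCKET, 0, kmp_hw_attr_t{});
  //   subset->push_back(4, KMP_HW_CORE, 0, kmp_hw_attr_t{});
  //   subset->push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t{});
  //
  // leaving depth == 3 and the socket, core, and thread bits set in 'set'.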
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("  type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("    num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for
   a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy.
      Typical levels are threads/core, cores/package or socket, packages/node,
      nodes/machine, etc. We don't want to get specific with nomenclature. When
      the machine is oversubscribed we add levels to duplicate the hierarchy,
      doubling the thread capacity of the hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we
      exclude all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
                                   // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage
       of dirty value observed when static library is re-initialized multiple
       times (e.g. when non-OpenMP thread repeatedly launches/joins thread
       that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }
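
  // Worked example (illustrative, not part of the runtime): for the machine
  // described above with 4 packages, 4 cores/package, and 2 HT per core,
  // deriveLevels() fills numPerLevel = {2, 4, 4, 1, 1, 1, 1}, which gives
  // depth = 4. The running-product loop in init() then builds
  // skipPerLevel = {1, 2, 8, 32}, i.e. the number of leaf threads spanned by
  // a subtree rooted at each level, and the oversubscription levels double
  // that to {1, 2, 8, 32, 64, 128, 256}.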

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H