/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
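  // Illustrative use of the Mask iteration interface above (a sketch, not new
  // API: the runtime normally goes through the KMP_CPU_* helper macros and the
  // __kmp_affinity_dispatch pointer declared in kmp.h):
  //   KMPAffinity::Mask *m = __kmp_affinity_dispatch->allocate_mask();
  //   m->get_system_affinity(/*abort_on_error=*/false);
  //   for (int proc = m->begin(); proc != m->end(); proc = m->next(proc)) {
  //     // 'proc' is a logical CPU present in the calling thread's binding
  //   }
  //   __kmp_affinity_dispatch->deallocate_mask(m);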
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity_verbose)
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity_verbose)
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support &&
        topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */
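// Sketch of how one of the KMPAffinity implementations in this header is
// selected at runtime (illustrative only; the real selection logic lives in
// KMPAffinity::pick_api() in kmp_affinity.cpp, and the global names below are
// assumed from the rest of the runtime rather than defined here):
//   KMPAffinity *affinity = NULL;
// #if KMP_USE_HWLOC
//   if (__kmp_affinity_top_method == affinity_top_method_hwloc)
//     affinity = new KMPHwlocAffinity();
// #endif
//   if (affinity == NULL)
//     affinity = new KMPNativeAffinity();
//   __kmp_affinity_dispatch = affinity;
//   affinity->determine_capable("KMP_AFFINITY");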

#if KMP_OS_LINUX || KMP_OS_FREEBSD
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot change:
   they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#endif
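// For reference, the raw syscall numbers above back the same operations as the
// libc wrappers sched_getaffinity()/sched_setaffinity(). A minimal libc-level
// equivalent of what the Mask methods below do would be (illustrative only;
// the runtime calls syscall() directly with a runtime-sized
// __kmp_affin_mask_size buffer instead of a fixed cpu_set_t):
//   #include <sched.h>       // requires _GNU_SOURCE
//   cpu_set_t cs;
//   CPU_ZERO(&cs);
//   CPU_SET(3, &cs);                        // logical CPU 3
//   sched_setaffinity(0, sizeof(cs), &cs);  // pid 0 == calling thread
//   sched_getaffinity(0, sizeof(cs), &cs);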
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
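    // Worked example of the indexing above (illustrative): on an LP64 system
    // mask_t is 64 bits wide, so set(70) computes 70 / 64 == 1 and
    // 70 % 64 == 6, i.e., it sets bit 6 of mask[1]; is_set(70) and clear(70)
    // address exactly the same bit.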
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      return __kmp_num_proc_groups * BITS_PER_MASK_T;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
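  // Illustrative note on the Mask layout above: Windows limits a processor
  // group to 64 logical processors, so on (say) a 128-proc machine
  // __kmp_num_proc_groups is typically 2 and mask[] holds one ULONG_PTR word
  // per group. A thread can only be bound within a single group, which is why
  // get_proc_group() returns -1 when bits are set in more than one word and
  // set_system_affinity() then reports AffinityInvalidMask instead of calling
  // __kmp_SetThreadGroupAffinity().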
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
  }
};
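// Example of the layout above (illustrative, not normative): in a depth-3
// topology whose levels are {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD}, the
// hardware thread exposed by the OS as proc 13 that sits on socket 1, core 2,
// thread-context 1 would have os_id == 13 and ids == {1, 2, 1}; unused
// trailing entries stay at UNKNOWN_ID after clear(). The parallel sub_ids
// array is filled in by kmp_topology_t::_set_sub_ids() below with each item's
// index relative to the enclosing level.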

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios; for non-uniform topologies, this ratio holds
  // the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique; if they are, return true,
  // otherwise return false
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() is called
  bool filter_hw_subset();
  bool is_close(int hwt1, int hwt2, int level) const;
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
#if KMP_AFFINITY_SUPPORTED
  void sort_compact() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
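// Typical read-only queries against a canonicalized topology (a sketch;
// __kmp_topology is only usable after one of the create_map() routines and
// canonicalize() have run):
//   int core_level = __kmp_topology->get_level(KMP_HW_CORE);
//   int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
//   if (core_level >= 0 && thread_level >= 0) {
//     // e.g., 4 packages x 6 cores x 2 threads => ratio = {4, 6, 2}
//     int threads_per_core =
//         __kmp_topology->calculate_ratio(thread_level, core_level); // 2
//     int total_cores = __kmp_topology->get_count(core_level); // 24
//   }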

class kmp_hw_subset_t {
public:
  struct item_t {
    int num;
    kmp_hw_t type;
    int offset;
  };

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset) {
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num = num;
    items[depth].type = type;
    items[depth].offset = offset;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("num: %d, type: %s, offset: %d\n", items[i].num,
             __kmp_hw_get_keyword(items[i].type), items[i].offset);
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
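// Example (illustrative): a setting such as KMP_HW_SUBSET=2s,6c@2,2t is parsed
// elsewhere in the runtime into three items pushed in order:
//   subset->push_back(2, KMP_HW_SOCKET, 0); // 2 sockets
//   subset->push_back(6, KMP_HW_CORE, 2);   // 6 cores per socket, offset 2
//   subset->push_back(2, KMP_HW_THREAD, 0); // 2 threads per core
// after which specified(KMP_HW_CORE), etc. return true and
// kmp_topology_t::filter_hw_subset() prunes hardware threads that fall outside
// the requested subset.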

extern kmp_topology_t *__kmp_topology;
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for a
   barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of
      the hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we
      exclude all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
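  /** Companion to numPerLevel (description added for clarity; derived from
      init() below): skipPerLevel[0] = 1 and skipPerLevel[i] =
      numPerLevel[i-1] * skipPerLevel[i-1], i.e., the number of leaf threads
      spanned by one node at level i. Levels past 'depth' double the previous
      entry to leave room for oversubscription. For the example machine above
      this gives {1, 2, 8, 32, 64, ...}. */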
  kmp_uint32 *skipPerLevel;

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Explicitly initialize the data fields here to prevent use of dirty
       values observed when the static library is re-initialized multiple
       times (e.g. when a non-OpenMP thread repeatedly launches/joins a thread
       that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Derive levels from the machine topology if available; otherwise build a
    // generic tree from num_addrs
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }
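  // Worked example of init() above (illustrative): with no topology
  // information and num_addrs = 10, the fallback branch sets
  // numPerLevel = {4, 3, 1, ...} (maxLeaves children per leaf-parent,
  // 10/4 rounded up parents), the depth scan yields depth = 3, and the final
  // loops produce skipPerLevel = {1, 4, 12, 24, 48, ...}, where entries past
  // 'depth' keep doubling so an oversubscribed thread count can be
  // accommodated before resize() is needed.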

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H