/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support &&
        topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */
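
// Illustrative, non-compiled sketch (guarded out with #if 0): regardless of
// which KMPAffinity implementation is active, callers drive affinity through
// the abstract Mask API, typically via the KMP_CPU_* wrapper macros as
// bind_thread() above does. The dispatch object name used below is an
// assumption about how the runtime exposes the active implementation.
#if 0
KMPAffinity::Mask *m = __kmp_affinity_dispatch->allocate_mask();
m->zero();
m->set(os_proc_id); // os_proc_id: a hypothetical OS processor number
m->set_system_affinity(/*abort_on_error=*/TRUE);
__kmp_affinity_dispatch->deallocate_mask(m);
#endif
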
#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change; they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
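    // For illustration (assuming a 64-bit mask_t): set(70) updates word
    // 70 / 64 = 1 and bit 70 % 64 = 6, i.e. mask[1] |= (mask_t)1 << 6;
    // is_set() and clear() address the same word/bit pair.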
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
#if KMP_OS_AIX
    // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to the CPUs specified by the mask and no
      // others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one CPU, instead of a set of CPUs, with
      // the bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
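    // Layout note (for illustration): mask holds one mask_t word per Windows
    // processor group; bit i of mask[g] corresponds to logical processor i of
    // group g, and a group holds at most BITS_PER_MASK_T (64 on 64-bit
    // Windows) processors.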
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] &
              ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      return __kmp_num_proc_groups * BITS_PER_MASK_T;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
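    // For illustration: with two 64-processor groups, a mask whose set bits
    // all fall in positions 64-127 yields get_proc_group() == 1, while a mask
    // with bits set in both group 0 and group 1 yields -1, since a
    // GROUP_AFFINITY can only name a single group.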
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() ||
            (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};
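
// Illustrative, non-compiled sketch (guarded out with #if 0) of the
// containment rule above: an attribute that only names a core type "contains"
// any attribute with the same core type, whether or not the latter also
// carries an efficiency. The enumerator used is the x86 hybrid-core value and
// serves only as an example.
#if 0
kmp_hw_attr_t wanted; // e.g., parsed from a user setting
wanted.set_core_type(KMP_HW_CORE_TYPE_CORE);
kmp_hw_attr_t seen; // e.g., detected for a particular core
seen.set_core_type(KMP_HW_CORE_TYPE_CORE);
seen.set_core_eff(1);
bool matches = seen.contains(wanted); // true: types agree, no eff requested
#endif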

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been allocated to hold
  // up to KMP_HW_LAST number of objects if needed, so layers can be added
  // without reallocating any array.

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios; for non-uniform topologies, this ratio holds
  // the max number of itemAs per itemB,
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;
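  // For illustration, with the example topology above
  // ([ 4 packages | 6 cores / package | 2 threads / core ]):
  //   ratio = { 4, 6, 2 } and count = { 4, 24, 48 }.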

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth,
                                  const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique. Returns true if they are,
  // false otherwise.
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; };
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;
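
// Illustrative, non-compiled sketch (guarded out with #if 0): once
// __kmp_topology has been built and canonicalized, ratios between levels can
// be queried. The level types used below are assumed to be present in the
// canonical topology.
#if 0
int pkg_level = __kmp_topology->get_level(KMP_HW_SOCKET);
int thr_level = __kmp_topology->get_level(KMP_HW_THREAD);
// For the [ 4 packages | 6 cores / package | 2 threads / core ] example
// above, this yields 6 * 2 = 12 hardware threads per package.
int thr_per_pkg = __kmp_topology->calculate_ratio(thr_level, pkg_level);
#endif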

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parenthesis around max to avoid accidental use of Windows max macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sort the KMP_HW_SUBSET items so they follow topology order.
  // All unknown topology types will be at the beginning of the subset.
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items =
        (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("  num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;
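
// Illustrative, non-compiled sketch (guarded out with #if 0) of how a
// KMP_HW_SUBSET value such as "2s,4c" could be recorded with this class; the
// actual string parsing lives elsewhere in the runtime.
#if 0
kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
kmp_hw_attr_t no_attr; // default-constructed: no core type/efficiency filter
subset->push_back(2, KMP_HW_SOCKET, /*offset=*/0, no_attr); // "2s"
subset->push_back(4, KMP_HW_CORE, /*offset=*/0, no_attr); // "4c"
subset->sort(); // requires __kmp_topology to already be built
kmp_hw_subset_t::deallocate(subset);
#endif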

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for
   a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't
      want to get specific with nomenclature. When the machine is
      oversubscribed we add levels to duplicate the hierarchy, doubling the
      thread capacity of the hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy,
      in terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we
      exclude all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
                                   // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;
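
  // For illustration, after init() on the 4 x 4 x 2 example above:
  //   numPerLevel  = { 2, 4, 4, 1, ... }
  //   skipPerLevel = { 1, 2, 8, 32, ... } (number of leaves below a node at
  //                                        each level, doubling past 'depth'
  //                                        to handle oversubscription)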

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage
       of dirty value observed when static library is re-initialized multiple
       times (e.g. when non-OpenMP thread repeatedly launches/joins thread
       that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H