1 /* 2 * kmp_affinity.h -- header for affinity management 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef KMP_AFFINITY_H 14 #define KMP_AFFINITY_H 15 16 #include "kmp.h" 17 #include "kmp_os.h" 18 #include <limits> 19 20 #if KMP_AFFINITY_SUPPORTED 21 #if KMP_HWLOC_ENABLED 22 class KMPHwlocAffinity : public KMPAffinity { 23 public: 24 class Mask : public KMPAffinity::Mask { 25 hwloc_cpuset_t mask; 26 27 public: Mask()28 Mask() { 29 mask = hwloc_bitmap_alloc(); 30 this->zero(); 31 } 32 Mask(const Mask &other) = delete; 33 Mask &operator=(const Mask &other) = delete; ~Mask()34 ~Mask() { hwloc_bitmap_free(mask); } set(int i)35 void set(int i) override { hwloc_bitmap_set(mask, i); } is_set(int i)36 bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); } clear(int i)37 void clear(int i) override { hwloc_bitmap_clr(mask, i); } zero()38 void zero() override { hwloc_bitmap_zero(mask); } empty()39 bool empty() const override { return hwloc_bitmap_iszero(mask); } copy(const KMPAffinity::Mask * src)40 void copy(const KMPAffinity::Mask *src) override { 41 const Mask *convert = static_cast<const Mask *>(src); 42 hwloc_bitmap_copy(mask, convert->mask); 43 } bitwise_and(const KMPAffinity::Mask * rhs)44 void bitwise_and(const KMPAffinity::Mask *rhs) override { 45 const Mask *convert = static_cast<const Mask *>(rhs); 46 hwloc_bitmap_and(mask, mask, convert->mask); 47 } bitwise_or(const KMPAffinity::Mask * rhs)48 void bitwise_or(const KMPAffinity::Mask *rhs) override { 49 const Mask *convert = static_cast<const Mask *>(rhs); 50 hwloc_bitmap_or(mask, mask, convert->mask); 51 } bitwise_not()52 void bitwise_not() override { hwloc_bitmap_not(mask, mask); } is_equal(const KMPAffinity::Mask * rhs)53 bool is_equal(const KMPAffinity::Mask *rhs) const override { 54 const Mask *convert = static_cast<const Mask *>(rhs); 55 return hwloc_bitmap_isequal(mask, convert->mask); 56 } begin()57 int begin() const override { return hwloc_bitmap_first(mask); } end()58 int end() const override { return -1; } next(int previous)59 int next(int previous) const override { 60 return hwloc_bitmap_next(mask, previous); 61 } get_system_affinity(bool abort_on_error)62 int get_system_affinity(bool abort_on_error) override { 63 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), 64 "Illegal get affinity operation when not capable"); 65 long retval = 66 hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD); 67 if (retval >= 0) { 68 return 0; 69 } 70 int error = errno; 71 if (abort_on_error) { 72 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"), 73 KMP_ERR(error), __kmp_msg_null); 74 } 75 return error; 76 } set_system_affinity(bool abort_on_error)77 int set_system_affinity(bool abort_on_error) const override { 78 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), 79 "Illegal set affinity operation when not capable"); 80 long retval = 81 hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD); 82 if (retval >= 0) { 83 return 0; 84 } 85 int error = errno; 86 if (abort_on_error) { 87 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"), 88 KMP_ERR(error), __kmp_msg_null); 89 } 90 return error; 91 } 92 #if KMP_OS_WINDOWS set_process_affinity(bool abort_on_error)93 int set_process_affinity(bool abort_on_error) const override { 94 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), 95 "Illegal set process affinity operation when not capable"); 96 int error = 0; 97 const hwloc_topology_support *support = 98 hwloc_topology_get_support(__kmp_hwloc_topology); 99 if (support->cpubind->set_proc_cpubind) { 100 int retval; 101 retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask, 102 HWLOC_CPUBIND_PROCESS); 103 if (retval >= 0) 104 return 0; 105 error = errno; 106 if (abort_on_error) 107 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"), 108 KMP_ERR(error), __kmp_msg_null); 109 } 110 return error; 111 } 112 #endif // KMP_OS_WINDOWS get_proc_group()113 int get_proc_group() const override { 114 int group = -1; 115 #if KMP_OS_WINDOWS 116 if (__kmp_num_proc_groups == 1) { 117 return 1; 118 } 119 for (int i = 0; i < __kmp_num_proc_groups; i++) { 120 // On windows, the long type is always 32 bits 121 unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2); 122 unsigned long second_32_bits = 123 hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1); 124 if (first_32_bits == 0 && second_32_bits == 0) { 125 continue; 126 } 127 if (group >= 0) { 128 return -1; 129 } 130 group = i; 131 } 132 #endif /* KMP_OS_WINDOWS */ 133 return group; 134 } 135 }; determine_capable(const char * var)136 void determine_capable(const char *var) override { 137 const hwloc_topology_support *topology_support; 138 if (__kmp_hwloc_topology == NULL) { 139 if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) { 140 __kmp_hwloc_error = TRUE; 141 if (__kmp_affinity.flags.verbose) { 142 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()"); 143 } 144 } 145 if (hwloc_topology_load(__kmp_hwloc_topology) < 0) { 146 __kmp_hwloc_error = TRUE; 147 if (__kmp_affinity.flags.verbose) { 148 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()"); 149 } 150 } 151 } 152 topology_support = hwloc_topology_get_support(__kmp_hwloc_topology); 153 // Is the system capable of setting/getting this thread's affinity? 154 // Also, is topology discovery possible? (pu indicates ability to discover 155 // processing units). And finally, were there no errors when calling any 156 // hwloc_* API functions? 157 if (topology_support && topology_support->cpubind->set_thisthread_cpubind && 158 topology_support->cpubind->get_thisthread_cpubind && 159 topology_support->discovery->pu && !__kmp_hwloc_error) { 160 // enables affinity according to KMP_AFFINITY_CAPABLE() macro 161 KMP_AFFINITY_ENABLE(TRUE); 162 } else { 163 // indicate that hwloc didn't work and disable affinity 164 __kmp_hwloc_error = TRUE; 165 KMP_AFFINITY_DISABLE(); 166 } 167 } bind_thread(int which)168 void bind_thread(int which) override { 169 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), 170 "Illegal set affinity operation when not capable"); 171 KMPAffinity::Mask *mask; 172 KMP_CPU_ALLOC_ON_STACK(mask); 173 KMP_CPU_ZERO(mask); 174 KMP_CPU_SET(which, mask); 175 __kmp_set_system_affinity(mask, TRUE); 176 KMP_CPU_FREE_FROM_STACK(mask); 177 } allocate_mask()178 KMPAffinity::Mask *allocate_mask() override { return new Mask(); } deallocate_mask(KMPAffinity::Mask * m)179 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; } allocate_mask_array(int num)180 KMPAffinity::Mask *allocate_mask_array(int num) override { 181 return new Mask[num]; 182 } deallocate_mask_array(KMPAffinity::Mask * array)183 void deallocate_mask_array(KMPAffinity::Mask *array) override { 184 Mask *hwloc_array = static_cast<Mask *>(array); 185 delete[] hwloc_array; 186 } index_mask_array(KMPAffinity::Mask * array,int index)187 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array, 188 int index) override { 189 Mask *hwloc_array = static_cast<Mask *>(array); 190 return &(hwloc_array[index]); 191 } get_api_type()192 api_type get_api_type() const override { return HWLOC; } 193 }; 194 #endif /* KMP_HWLOC_ENABLED */ 195 196 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \ 197 KMP_OS_AIX 198 #if KMP_OS_LINUX 199 /* On some of the older OS's that we build on, these constants aren't present 200 in <asm/unistd.h> #included from <sys.syscall.h>. They must be the same on 201 all systems of the same arch where they are defined, and they cannot change. 202 stone forever. */ 203 #include <sys/syscall.h> 204 #if KMP_ARCH_X86 || KMP_ARCH_ARM 205 #ifndef __NR_sched_setaffinity 206 #define __NR_sched_setaffinity 241 207 #elif __NR_sched_setaffinity != 241 208 #error Wrong code for setaffinity system call. 209 #endif /* __NR_sched_setaffinity */ 210 #ifndef __NR_sched_getaffinity 211 #define __NR_sched_getaffinity 242 212 #elif __NR_sched_getaffinity != 242 213 #error Wrong code for getaffinity system call. 214 #endif /* __NR_sched_getaffinity */ 215 #elif KMP_ARCH_AARCH64 216 #ifndef __NR_sched_setaffinity 217 #define __NR_sched_setaffinity 122 218 #elif __NR_sched_setaffinity != 122 219 #error Wrong code for setaffinity system call. 220 #endif /* __NR_sched_setaffinity */ 221 #ifndef __NR_sched_getaffinity 222 #define __NR_sched_getaffinity 123 223 #elif __NR_sched_getaffinity != 123 224 #error Wrong code for getaffinity system call. 225 #endif /* __NR_sched_getaffinity */ 226 #elif KMP_ARCH_X86_64 227 #ifndef __NR_sched_setaffinity 228 #define __NR_sched_setaffinity 203 229 #elif __NR_sched_setaffinity != 203 230 #error Wrong code for setaffinity system call. 231 #endif /* __NR_sched_setaffinity */ 232 #ifndef __NR_sched_getaffinity 233 #define __NR_sched_getaffinity 204 234 #elif __NR_sched_getaffinity != 204 235 #error Wrong code for getaffinity system call. 236 #endif /* __NR_sched_getaffinity */ 237 #elif KMP_ARCH_PPC64 238 #ifndef __NR_sched_setaffinity 239 #define __NR_sched_setaffinity 222 240 #elif __NR_sched_setaffinity != 222 241 #error Wrong code for setaffinity system call. 242 #endif /* __NR_sched_setaffinity */ 243 #ifndef __NR_sched_getaffinity 244 #define __NR_sched_getaffinity 223 245 #elif __NR_sched_getaffinity != 223 246 #error Wrong code for getaffinity system call. 247 #endif /* __NR_sched_getaffinity */ 248 #elif KMP_ARCH_MIPS 249 #ifndef __NR_sched_setaffinity 250 #define __NR_sched_setaffinity 4239 251 #elif __NR_sched_setaffinity != 4239 252 #error Wrong code for setaffinity system call. 253 #endif /* __NR_sched_setaffinity */ 254 #ifndef __NR_sched_getaffinity 255 #define __NR_sched_getaffinity 4240 256 #elif __NR_sched_getaffinity != 4240 257 #error Wrong code for getaffinity system call. 258 #endif /* __NR_sched_getaffinity */ 259 #elif KMP_ARCH_MIPS64 260 #ifndef __NR_sched_setaffinity 261 #define __NR_sched_setaffinity 5195 262 #elif __NR_sched_setaffinity != 5195 263 #error Wrong code for setaffinity system call. 264 #endif /* __NR_sched_setaffinity */ 265 #ifndef __NR_sched_getaffinity 266 #define __NR_sched_getaffinity 5196 267 #elif __NR_sched_getaffinity != 5196 268 #error Wrong code for getaffinity system call. 269 #endif /* __NR_sched_getaffinity */ 270 #elif KMP_ARCH_LOONGARCH64 271 #ifndef __NR_sched_setaffinity 272 #define __NR_sched_setaffinity 122 273 #elif __NR_sched_setaffinity != 122 274 #error Wrong code for setaffinity system call. 275 #endif /* __NR_sched_setaffinity */ 276 #ifndef __NR_sched_getaffinity 277 #define __NR_sched_getaffinity 123 278 #elif __NR_sched_getaffinity != 123 279 #error Wrong code for getaffinity system call. 280 #endif /* __NR_sched_getaffinity */ 281 #elif KMP_ARCH_RISCV64 282 #ifndef __NR_sched_setaffinity 283 #define __NR_sched_setaffinity 122 284 #elif __NR_sched_setaffinity != 122 285 #error Wrong code for setaffinity system call. 286 #endif /* __NR_sched_setaffinity */ 287 #ifndef __NR_sched_getaffinity 288 #define __NR_sched_getaffinity 123 289 #elif __NR_sched_getaffinity != 123 290 #error Wrong code for getaffinity system call. 291 #endif /* __NR_sched_getaffinity */ 292 #elif KMP_ARCH_VE 293 #ifndef __NR_sched_setaffinity 294 #define __NR_sched_setaffinity 203 295 #elif __NR_sched_setaffinity != 203 296 #error Wrong code for setaffinity system call. 297 #endif /* __NR_sched_setaffinity */ 298 #ifndef __NR_sched_getaffinity 299 #define __NR_sched_getaffinity 204 300 #elif __NR_sched_getaffinity != 204 301 #error Wrong code for getaffinity system call. 302 #endif /* __NR_sched_getaffinity */ 303 #elif KMP_ARCH_S390X 304 #ifndef __NR_sched_setaffinity 305 #define __NR_sched_setaffinity 239 306 #elif __NR_sched_setaffinity != 239 307 #error Wrong code for setaffinity system call. 308 #endif /* __NR_sched_setaffinity */ 309 #ifndef __NR_sched_getaffinity 310 #define __NR_sched_getaffinity 240 311 #elif __NR_sched_getaffinity != 240 312 #error Wrong code for getaffinity system call. 313 #endif /* __NR_sched_getaffinity */ 314 #elif KMP_ARCH_SPARC 315 #ifndef __NR_sched_setaffinity 316 #define __NR_sched_setaffinity 261 317 #elif __NR_sched_setaffinity != 261 318 #error Wrong code for setaffinity system call. 319 #endif /* __NR_sched_setaffinity */ 320 #ifndef __NR_sched_getaffinity 321 #define __NR_sched_getaffinity 260 322 #elif __NR_sched_getaffinity != 260 323 #error Wrong code for getaffinity system call. 324 #endif /* __NR_sched_getaffinity */ 325 #else 326 #error Unknown or unsupported architecture 327 #endif /* KMP_ARCH_* */ 328 #elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY 329 #include <pthread.h> 330 #include <pthread_np.h> 331 #elif KMP_OS_NETBSD 332 #include <pthread.h> 333 #include <sched.h> 334 #elif KMP_OS_AIX 335 #include <sys/dr.h> 336 #include <sys/rset.h> 337 #define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX. 338 #define GET_NUMBER_SMT_SETS 0x0004 339 extern "C" int syssmt(int flags, int, int, int *); 340 #endif 341 class KMPNativeAffinity : public KMPAffinity { 342 class Mask : public KMPAffinity::Mask { 343 typedef unsigned long mask_t; 344 typedef decltype(__kmp_affin_mask_size) mask_size_type; 345 static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT; 346 static const mask_t ONE = 1; get_num_mask_types()347 mask_size_type get_num_mask_types() const { 348 return __kmp_affin_mask_size / sizeof(mask_t); 349 } 350 351 public: 352 mask_t *mask; Mask()353 Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); } ~Mask()354 ~Mask() { 355 if (mask) 356 __kmp_free(mask); 357 } set(int i)358 void set(int i) override { 359 mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T)); 360 } is_set(int i)361 bool is_set(int i) const override { 362 return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T))); 363 } clear(int i)364 void clear(int i) override { 365 mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T)); 366 } zero()367 void zero() override { 368 mask_size_type e = get_num_mask_types(); 369 for (mask_size_type i = 0; i < e; ++i) 370 mask[i] = (mask_t)0; 371 } empty()372 bool empty() const override { 373 mask_size_type e = get_num_mask_types(); 374 for (mask_size_type i = 0; i < e; ++i) 375 if (mask[i] != (mask_t)0) 376 return false; 377 return true; 378 } copy(const KMPAffinity::Mask * src)379 void copy(const KMPAffinity::Mask *src) override { 380 const Mask *convert = static_cast<const Mask *>(src); 381 mask_size_type e = get_num_mask_types(); 382 for (mask_size_type i = 0; i < e; ++i) 383 mask[i] = convert->mask[i]; 384 } bitwise_and(const KMPAffinity::Mask * rhs)385 void bitwise_and(const KMPAffinity::Mask *rhs) override { 386 const Mask *convert = static_cast<const Mask *>(rhs); 387 mask_size_type e = get_num_mask_types(); 388 for (mask_size_type i = 0; i < e; ++i) 389 mask[i] &= convert->mask[i]; 390 } bitwise_or(const KMPAffinity::Mask * rhs)391 void bitwise_or(const KMPAffinity::Mask *rhs) override { 392 const Mask *convert = static_cast<const Mask *>(rhs); 393 mask_size_type e = get_num_mask_types(); 394 for (mask_size_type i = 0; i < e; ++i) 395 mask[i] |= convert->mask[i]; 396 } bitwise_not()397 void bitwise_not() override { 398 mask_size_type e = get_num_mask_types(); 399 for (mask_size_type i = 0; i < e; ++i) 400 mask[i] = ~(mask[i]); 401 } is_equal(const KMPAffinity::Mask * rhs)402 bool is_equal(const KMPAffinity::Mask *rhs) const override { 403 const Mask *convert = static_cast<const Mask *>(rhs); 404 mask_size_type e = get_num_mask_types(); 405 for (mask_size_type i = 0; i < e; ++i) 406 if (mask[i] != convert->mask[i]) 407 return false; 408 return true; 409 } begin()410 int begin() const override { 411 int retval = 0; 412 while (retval < end() && !is_set(retval)) 413 ++retval; 414 return retval; 415 } end()416 int end() const override { 417 int e; 418 __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e); 419 return e; 420 } next(int previous)421 int next(int previous) const override { 422 int retval = previous + 1; 423 while (retval < end() && !is_set(retval)) 424 ++retval; 425 return retval; 426 } 427 #if KMP_OS_AIX 428 // On AIX, we don't have a way to get CPU(s) a thread is bound to. 429 // This routine is only used to get the full mask. get_system_affinity(bool abort_on_error)430 int get_system_affinity(bool abort_on_error) override { 431 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), 432 "Illegal get affinity operation when not capable"); 433 434 (void)abort_on_error; 435 436 // Set the mask with all CPUs that are available. 437 for (int i = 0; i < __kmp_xproc; ++i) 438 KMP_CPU_SET(i, this); 439 return 0; 440 } set_system_affinity(bool abort_on_error)441 int set_system_affinity(bool abort_on_error) const override { 442 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), 443 444 "Illegal set affinity operation when not capable"); 445 446 int location; 447 int gtid = __kmp_entry_gtid(); 448 int tid = thread_self(); 449 450 // Unbind the thread if it was bound to any processors before so that 451 // we can bind the thread to CPUs specified by the mask not others. 452 int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY); 453 454 // On AIX, we can only bind to one instead of a set of CPUs with the 455 // bindprocessor() system call. 456 KMP_CPU_SET_ITERATE(location, this) { 457 if (KMP_CPU_ISSET(location, this)) { 458 retval = bindprocessor(BINDTHREAD, tid, location); 459 if (retval == -1 && errno == 1) { 460 rsid_t rsid; 461 rsethandle_t rsh; 462 // Put something in rsh to prevent compiler warning 463 // about uninitalized use 464 rsh = rs_alloc(RS_EMPTY); 465 rsid.at_pid = getpid(); 466 if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) { 467 retval = ra_detachrset(R_PROCESS, rsid, 0); 468 retval = bindprocessor(BINDTHREAD, tid, location); 469 } 470 } 471 if (retval == 0) { 472 KA_TRACE(10, ("__kmp_set_system_affinity: Done binding " 473 "T#%d to cpu=%d.\n", 474 gtid, location)); 475 continue; 476 } 477 int error = errno; 478 if (abort_on_error) { 479 __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"), 480 KMP_ERR(error), __kmp_msg_null); 481 KA_TRACE(10, ("__kmp_set_system_affinity: Error binding " 482 "T#%d to cpu=%d, errno=%d.\n", 483 gtid, location, error)); 484 return error; 485 } 486 } 487 } 488 return 0; 489 } 490 #else // !KMP_OS_AIX get_system_affinity(bool abort_on_error)491 int get_system_affinity(bool abort_on_error) override { 492 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), 493 "Illegal get affinity operation when not capable"); 494 #if KMP_OS_LINUX 495 long retval = 496 syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask); 497 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY 498 int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size, 499 reinterpret_cast<cpuset_t *>(mask)); 500 int retval = (r == 0 ? 0 : -1); 501 #endif 502 if (retval >= 0) { 503 return 0; 504 } 505 int error = errno; 506 if (abort_on_error) { 507 __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"), 508 KMP_ERR(error), __kmp_msg_null); 509 } 510 return error; 511 } set_system_affinity(bool abort_on_error)512 int set_system_affinity(bool abort_on_error) const override { 513 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), 514 "Illegal set affinity operation when not capable"); 515 #if KMP_OS_LINUX 516 long retval = 517 syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask); 518 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY 519 int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size, 520 reinterpret_cast<cpuset_t *>(mask)); 521 int retval = (r == 0 ? 0 : -1); 522 #endif 523 if (retval >= 0) { 524 return 0; 525 } 526 int error = errno; 527 if (abort_on_error) { 528 __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"), 529 KMP_ERR(error), __kmp_msg_null); 530 } 531 return error; 532 } 533 #endif // KMP_OS_AIX 534 }; determine_capable(const char * env_var)535 void determine_capable(const char *env_var) override { 536 __kmp_affinity_determine_capable(env_var); 537 } bind_thread(int which)538 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); } allocate_mask()539 KMPAffinity::Mask *allocate_mask() override { 540 KMPNativeAffinity::Mask *retval = new Mask(); 541 return retval; 542 } deallocate_mask(KMPAffinity::Mask * m)543 void deallocate_mask(KMPAffinity::Mask *m) override { 544 KMPNativeAffinity::Mask *native_mask = 545 static_cast<KMPNativeAffinity::Mask *>(m); 546 delete native_mask; 547 } allocate_mask_array(int num)548 KMPAffinity::Mask *allocate_mask_array(int num) override { 549 return new Mask[num]; 550 } deallocate_mask_array(KMPAffinity::Mask * array)551 void deallocate_mask_array(KMPAffinity::Mask *array) override { 552 Mask *linux_array = static_cast<Mask *>(array); 553 delete[] linux_array; 554 } index_mask_array(KMPAffinity::Mask * array,int index)555 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array, 556 int index) override { 557 Mask *linux_array = static_cast<Mask *>(array); 558 return &(linux_array[index]); 559 } get_api_type()560 api_type get_api_type() const override { return NATIVE_OS; } 561 }; 562 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \ 563 || KMP_OS_AIX */ 564 565 #if KMP_OS_WINDOWS 566 class KMPNativeAffinity : public KMPAffinity { 567 class Mask : public KMPAffinity::Mask { 568 typedef ULONG_PTR mask_t; 569 static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT; 570 mask_t *mask; 571 572 public: Mask()573 Mask() { 574 mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups); 575 } ~Mask()576 ~Mask() { 577 if (mask) 578 __kmp_free(mask); 579 } set(int i)580 void set(int i) override { 581 mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T)); 582 } is_set(int i)583 bool is_set(int i) const override { 584 return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T))); 585 } clear(int i)586 void clear(int i) override { 587 mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T)); 588 } zero()589 void zero() override { 590 for (int i = 0; i < __kmp_num_proc_groups; ++i) 591 mask[i] = 0; 592 } empty()593 bool empty() const override { 594 for (size_t i = 0; i < __kmp_num_proc_groups; ++i) 595 if (mask[i]) 596 return false; 597 return true; 598 } copy(const KMPAffinity::Mask * src)599 void copy(const KMPAffinity::Mask *src) override { 600 const Mask *convert = static_cast<const Mask *>(src); 601 for (int i = 0; i < __kmp_num_proc_groups; ++i) 602 mask[i] = convert->mask[i]; 603 } bitwise_and(const KMPAffinity::Mask * rhs)604 void bitwise_and(const KMPAffinity::Mask *rhs) override { 605 const Mask *convert = static_cast<const Mask *>(rhs); 606 for (int i = 0; i < __kmp_num_proc_groups; ++i) 607 mask[i] &= convert->mask[i]; 608 } bitwise_or(const KMPAffinity::Mask * rhs)609 void bitwise_or(const KMPAffinity::Mask *rhs) override { 610 const Mask *convert = static_cast<const Mask *>(rhs); 611 for (int i = 0; i < __kmp_num_proc_groups; ++i) 612 mask[i] |= convert->mask[i]; 613 } bitwise_not()614 void bitwise_not() override { 615 for (int i = 0; i < __kmp_num_proc_groups; ++i) 616 mask[i] = ~(mask[i]); 617 } is_equal(const KMPAffinity::Mask * rhs)618 bool is_equal(const KMPAffinity::Mask *rhs) const override { 619 const Mask *convert = static_cast<const Mask *>(rhs); 620 for (size_t i = 0; i < __kmp_num_proc_groups; ++i) 621 if (mask[i] != convert->mask[i]) 622 return false; 623 return true; 624 } begin()625 int begin() const override { 626 int retval = 0; 627 while (retval < end() && !is_set(retval)) 628 ++retval; 629 return retval; 630 } end()631 int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; } next(int previous)632 int next(int previous) const override { 633 int retval = previous + 1; 634 while (retval < end() && !is_set(retval)) 635 ++retval; 636 return retval; 637 } set_process_affinity(bool abort_on_error)638 int set_process_affinity(bool abort_on_error) const override { 639 if (__kmp_num_proc_groups <= 1) { 640 if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) { 641 DWORD error = GetLastError(); 642 if (abort_on_error) { 643 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), 644 __kmp_msg_null); 645 } 646 return error; 647 } 648 } 649 return 0; 650 } set_system_affinity(bool abort_on_error)651 int set_system_affinity(bool abort_on_error) const override { 652 if (__kmp_num_proc_groups > 1) { 653 // Check for a valid mask. 654 GROUP_AFFINITY ga; 655 int group = get_proc_group(); 656 if (group < 0) { 657 if (abort_on_error) { 658 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 659 } 660 return -1; 661 } 662 // Transform the bit vector into a GROUP_AFFINITY struct 663 // and make the system call to set affinity. 664 ga.Group = group; 665 ga.Mask = mask[group]; 666 ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0; 667 668 KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL); 669 if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) { 670 DWORD error = GetLastError(); 671 if (abort_on_error) { 672 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), 673 __kmp_msg_null); 674 } 675 return error; 676 } 677 } else { 678 if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) { 679 DWORD error = GetLastError(); 680 if (abort_on_error) { 681 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), 682 __kmp_msg_null); 683 } 684 return error; 685 } 686 } 687 return 0; 688 } get_system_affinity(bool abort_on_error)689 int get_system_affinity(bool abort_on_error) override { 690 if (__kmp_num_proc_groups > 1) { 691 this->zero(); 692 GROUP_AFFINITY ga; 693 KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL); 694 if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) { 695 DWORD error = GetLastError(); 696 if (abort_on_error) { 697 __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"), 698 KMP_ERR(error), __kmp_msg_null); 699 } 700 return error; 701 } 702 if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) || 703 (ga.Mask == 0)) { 704 return -1; 705 } 706 mask[ga.Group] = ga.Mask; 707 } else { 708 mask_t newMask, sysMask, retval; 709 if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) { 710 DWORD error = GetLastError(); 711 if (abort_on_error) { 712 __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"), 713 KMP_ERR(error), __kmp_msg_null); 714 } 715 return error; 716 } 717 retval = SetThreadAffinityMask(GetCurrentThread(), newMask); 718 if (!retval) { 719 DWORD error = GetLastError(); 720 if (abort_on_error) { 721 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"), 722 KMP_ERR(error), __kmp_msg_null); 723 } 724 return error; 725 } 726 newMask = SetThreadAffinityMask(GetCurrentThread(), retval); 727 if (!newMask) { 728 DWORD error = GetLastError(); 729 if (abort_on_error) { 730 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"), 731 KMP_ERR(error), __kmp_msg_null); 732 } 733 } 734 *mask = retval; 735 } 736 return 0; 737 } get_proc_group()738 int get_proc_group() const override { 739 int group = -1; 740 if (__kmp_num_proc_groups == 1) { 741 return 1; 742 } 743 for (int i = 0; i < __kmp_num_proc_groups; i++) { 744 if (mask[i] == 0) 745 continue; 746 if (group >= 0) 747 return -1; 748 group = i; 749 } 750 return group; 751 } 752 }; determine_capable(const char * env_var)753 void determine_capable(const char *env_var) override { 754 __kmp_affinity_determine_capable(env_var); 755 } bind_thread(int which)756 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); } allocate_mask()757 KMPAffinity::Mask *allocate_mask() override { return new Mask(); } deallocate_mask(KMPAffinity::Mask * m)758 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; } allocate_mask_array(int num)759 KMPAffinity::Mask *allocate_mask_array(int num) override { 760 return new Mask[num]; 761 } deallocate_mask_array(KMPAffinity::Mask * array)762 void deallocate_mask_array(KMPAffinity::Mask *array) override { 763 Mask *windows_array = static_cast<Mask *>(array); 764 delete[] windows_array; 765 } index_mask_array(KMPAffinity::Mask * array,int index)766 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array, 767 int index) override { 768 Mask *windows_array = static_cast<Mask *>(array); 769 return &(windows_array[index]); 770 } get_api_type()771 api_type get_api_type() const override { return NATIVE_OS; } 772 }; 773 #endif /* KMP_OS_WINDOWS */ 774 #endif /* KMP_AFFINITY_SUPPORTED */ 775 776 // Describe an attribute for a level in the machine topology 777 struct kmp_hw_attr_t { 778 int core_type : 8; 779 int core_eff : 8; 780 unsigned valid : 1; 781 unsigned reserved : 15; 782 783 static const int UNKNOWN_CORE_EFF = -1; 784 kmp_hw_attr_tkmp_hw_attr_t785 kmp_hw_attr_t() 786 : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF), 787 valid(0), reserved(0) {} set_core_typekmp_hw_attr_t788 void set_core_type(kmp_hw_core_type_t type) { 789 valid = 1; 790 core_type = type; 791 } set_core_effkmp_hw_attr_t792 void set_core_eff(int eff) { 793 valid = 1; 794 core_eff = eff; 795 } get_core_typekmp_hw_attr_t796 kmp_hw_core_type_t get_core_type() const { 797 return (kmp_hw_core_type_t)core_type; 798 } get_core_effkmp_hw_attr_t799 int get_core_eff() const { return core_eff; } is_core_type_validkmp_hw_attr_t800 bool is_core_type_valid() const { 801 return core_type != KMP_HW_CORE_TYPE_UNKNOWN; 802 } is_core_eff_validkmp_hw_attr_t803 bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; } 804 operator bool() const { return valid; } clearkmp_hw_attr_t805 void clear() { 806 core_type = KMP_HW_CORE_TYPE_UNKNOWN; 807 core_eff = UNKNOWN_CORE_EFF; 808 valid = 0; 809 } containskmp_hw_attr_t810 bool contains(const kmp_hw_attr_t &other) const { 811 if (!valid && !other.valid) 812 return true; 813 if (valid && other.valid) { 814 if (other.is_core_type_valid()) { 815 if (!is_core_type_valid() || (get_core_type() != other.get_core_type())) 816 return false; 817 } 818 if (other.is_core_eff_valid()) { 819 if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff())) 820 return false; 821 } 822 return true; 823 } 824 return false; 825 } 826 #if KMP_AFFINITY_SUPPORTED containskmp_hw_attr_t827 bool contains(const kmp_affinity_attrs_t &attr) const { 828 if (!valid && !attr.valid) 829 return true; 830 if (valid && attr.valid) { 831 if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN) 832 return (is_core_type_valid() && 833 (get_core_type() == (kmp_hw_core_type_t)attr.core_type)); 834 if (attr.core_eff != UNKNOWN_CORE_EFF) 835 return (is_core_eff_valid() && (get_core_eff() == attr.core_eff)); 836 return true; 837 } 838 return false; 839 } 840 #endif // KMP_AFFINITY_SUPPORTED 841 bool operator==(const kmp_hw_attr_t &rhs) const { 842 return (rhs.valid == valid && rhs.core_eff == core_eff && 843 rhs.core_type == core_type); 844 } 845 bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); } 846 }; 847 848 #if KMP_AFFINITY_SUPPORTED 849 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t)); 850 #endif 851 852 class kmp_hw_thread_t { 853 public: 854 static const int UNKNOWN_ID = -1; 855 static const int MULTIPLE_ID = -2; 856 static int compare_ids(const void *a, const void *b); 857 static int compare_compact(const void *a, const void *b); 858 int ids[KMP_HW_LAST]; 859 int sub_ids[KMP_HW_LAST]; 860 bool leader; 861 int os_id; 862 int original_idx; 863 kmp_hw_attr_t attrs; 864 865 void print() const; clear()866 void clear() { 867 for (int i = 0; i < (int)KMP_HW_LAST; ++i) 868 ids[i] = UNKNOWN_ID; 869 leader = false; 870 attrs.clear(); 871 } 872 }; 873 874 class kmp_topology_t { 875 876 struct flags_t { 877 int uniform : 1; 878 int reserved : 31; 879 }; 880 881 int depth; 882 883 // The following arrays are all 'depth' long and have been 884 // allocated to hold up to KMP_HW_LAST number of objects if 885 // needed so layers can be added without reallocation of any array 886 887 // Orderd array of the types in the topology 888 kmp_hw_t *types; 889 890 // Keep quick topology ratios, for non-uniform topologies, 891 // this ratio holds the max number of itemAs per itemB 892 // e.g., [ 4 packages | 6 cores / package | 2 threads / core ] 893 int *ratio; 894 895 // Storage containing the absolute number of each topology layer 896 int *count; 897 898 // The number of core efficiencies. This is only useful for hybrid 899 // topologies. Core efficiencies will range from 0 to num efficiencies - 1 900 int num_core_efficiencies; 901 int num_core_types; 902 kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES]; 903 904 // The hardware threads array 905 // hw_threads is num_hw_threads long 906 // Each hw_thread's ids and sub_ids are depth deep 907 int num_hw_threads; 908 kmp_hw_thread_t *hw_threads; 909 910 // Equivalence hash where the key is the hardware topology item 911 // and the value is the equivalent hardware topology type in the 912 // types[] array, if the value is KMP_HW_UNKNOWN, then there is no 913 // known equivalence for the topology type 914 kmp_hw_t equivalent[KMP_HW_LAST]; 915 916 // Flags describing the topology 917 flags_t flags; 918 919 // Compact value used during sort_compact() 920 int compact; 921 922 #if KMP_GROUP_AFFINITY 923 // Insert topology information about Windows Processor groups 924 void _insert_windows_proc_groups(); 925 #endif 926 927 // Count each item & get the num x's per y 928 // e.g., get the number of cores and the number of threads per core 929 // for each (x, y) in (KMP_HW_* , KMP_HW_*) 930 void _gather_enumeration_information(); 931 932 // Remove layers that don't add information to the topology. 933 // This is done by having the layer take on the id = UNKNOWN_ID (-1) 934 void _remove_radix1_layers(); 935 936 // Find out if the topology is uniform 937 void _discover_uniformity(); 938 939 // Set all the sub_ids for each hardware thread 940 void _set_sub_ids(); 941 942 // Set global affinity variables describing the number of threads per 943 // core, the number of packages, the number of cores per package, and 944 // the number of cores. 945 void _set_globals(); 946 947 // Set the last level cache equivalent type 948 void _set_last_level_cache(); 949 950 // Return the number of cores with a particular attribute, 'attr'. 951 // If 'find_all' is true, then find all cores on the machine, otherwise find 952 // all cores per the layer 'above' 953 int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above, 954 bool find_all = false) const; 955 956 public: 957 // Force use of allocate()/deallocate() 958 kmp_topology_t() = delete; 959 kmp_topology_t(const kmp_topology_t &t) = delete; 960 kmp_topology_t(kmp_topology_t &&t) = delete; 961 kmp_topology_t &operator=(const kmp_topology_t &t) = delete; 962 kmp_topology_t &operator=(kmp_topology_t &&t) = delete; 963 964 static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types); 965 static void deallocate(kmp_topology_t *); 966 967 // Functions used in create_map() routines at(int index)968 kmp_hw_thread_t &at(int index) { 969 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads); 970 return hw_threads[index]; 971 } at(int index)972 const kmp_hw_thread_t &at(int index) const { 973 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads); 974 return hw_threads[index]; 975 } get_num_hw_threads()976 int get_num_hw_threads() const { return num_hw_threads; } sort_ids()977 void sort_ids() { 978 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), 979 kmp_hw_thread_t::compare_ids); 980 } 981 982 // Insert a new topology layer after allocation 983 void insert_layer(kmp_hw_t type, const int *ids); 984 985 // Check if the hardware ids are unique, if they are 986 // return true, otherwise return false 987 bool check_ids() const; 988 989 // Function to call after the create_map() routine 990 void canonicalize(); 991 void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores); 992 993 // Functions used after canonicalize() called 994 995 #if KMP_AFFINITY_SUPPORTED 996 // Set the granularity for affinity settings 997 void set_granularity(kmp_affinity_t &stgs) const; 998 bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const; 999 bool restrict_to_mask(const kmp_affin_mask_t *mask); 1000 bool filter_hw_subset(); 1001 #endif is_uniform()1002 bool is_uniform() const { return flags.uniform; } 1003 // Tell whether a type is a valid type in the topology 1004 // returns KMP_HW_UNKNOWN when there is no equivalent type get_equivalent_type(kmp_hw_t type)1005 kmp_hw_t get_equivalent_type(kmp_hw_t type) const { 1006 if (type == KMP_HW_UNKNOWN) 1007 return KMP_HW_UNKNOWN; 1008 return equivalent[type]; 1009 } 1010 // Set type1 = type2 set_equivalent_type(kmp_hw_t type1,kmp_hw_t type2)1011 void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) { 1012 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1); 1013 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2); 1014 kmp_hw_t real_type2 = equivalent[type2]; 1015 if (real_type2 == KMP_HW_UNKNOWN) 1016 real_type2 = type2; 1017 equivalent[type1] = real_type2; 1018 // This loop is required since any of the types may have been set to 1019 // be equivalent to type1. They all must be checked and reset to type2. 1020 KMP_FOREACH_HW_TYPE(type) { 1021 if (equivalent[type] == type1) { 1022 equivalent[type] = real_type2; 1023 } 1024 } 1025 } 1026 // Calculate number of types corresponding to level1 1027 // per types corresponding to level2 (e.g., number of threads per core) calculate_ratio(int level1,int level2)1028 int calculate_ratio(int level1, int level2) const { 1029 KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth); 1030 KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth); 1031 int r = 1; 1032 for (int level = level1; level > level2; --level) 1033 r *= ratio[level]; 1034 return r; 1035 } get_ratio(int level)1036 int get_ratio(int level) const { 1037 KMP_DEBUG_ASSERT(level >= 0 && level < depth); 1038 return ratio[level]; 1039 } get_depth()1040 int get_depth() const { return depth; }; get_type(int level)1041 kmp_hw_t get_type(int level) const { 1042 KMP_DEBUG_ASSERT(level >= 0 && level < depth); 1043 return types[level]; 1044 } get_level(kmp_hw_t type)1045 int get_level(kmp_hw_t type) const { 1046 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type); 1047 int eq_type = equivalent[type]; 1048 if (eq_type == KMP_HW_UNKNOWN) 1049 return -1; 1050 for (int i = 0; i < depth; ++i) 1051 if (types[i] == eq_type) 1052 return i; 1053 return -1; 1054 } get_count(int level)1055 int get_count(int level) const { 1056 KMP_DEBUG_ASSERT(level >= 0 && level < depth); 1057 return count[level]; 1058 } 1059 // Return the total number of cores with attribute 'attr' get_ncores_with_attr(const kmp_hw_attr_t & attr)1060 int get_ncores_with_attr(const kmp_hw_attr_t &attr) const { 1061 return _get_ncores_with_attr(attr, -1, true); 1062 } 1063 // Return the number of cores with attribute 1064 // 'attr' per topology level 'above' get_ncores_with_attr_per(const kmp_hw_attr_t & attr,int above)1065 int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const { 1066 return _get_ncores_with_attr(attr, above, false); 1067 } 1068 1069 #if KMP_AFFINITY_SUPPORTED 1070 friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b); sort_compact(kmp_affinity_t & affinity)1071 void sort_compact(kmp_affinity_t &affinity) { 1072 compact = affinity.compact; 1073 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t), 1074 kmp_hw_thread_t::compare_compact); 1075 } 1076 #endif 1077 void print(const char *env_var = "KMP_AFFINITY") const; 1078 void dump() const; 1079 }; 1080 extern kmp_topology_t *__kmp_topology; 1081 1082 class kmp_hw_subset_t { 1083 const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS; 1084 1085 public: 1086 // Describe a machine topology item in KMP_HW_SUBSET 1087 struct item_t { 1088 kmp_hw_t type; 1089 int num_attrs; 1090 int num[MAX_ATTRS]; 1091 int offset[MAX_ATTRS]; 1092 kmp_hw_attr_t attr[MAX_ATTRS]; 1093 }; 1094 // Put parenthesis around max to avoid accidental use of Windows max macro. 1095 const static int USE_ALL = (std::numeric_limits<int>::max)(); 1096 1097 private: 1098 int depth; 1099 int capacity; 1100 item_t *items; 1101 kmp_uint64 set; 1102 bool absolute; 1103 // The set must be able to handle up to KMP_HW_LAST number of layers 1104 KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST); 1105 // Sorting the KMP_HW_SUBSET items to follow topology order 1106 // All unknown topology types will be at the beginning of the subset hw_subset_compare(const void * i1,const void * i2)1107 static int hw_subset_compare(const void *i1, const void *i2) { 1108 kmp_hw_t type1 = ((const item_t *)i1)->type; 1109 kmp_hw_t type2 = ((const item_t *)i2)->type; 1110 int level1 = __kmp_topology->get_level(type1); 1111 int level2 = __kmp_topology->get_level(type2); 1112 return level1 - level2; 1113 } 1114 1115 public: 1116 // Force use of allocate()/deallocate() 1117 kmp_hw_subset_t() = delete; 1118 kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete; 1119 kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete; 1120 kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete; 1121 kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete; 1122 allocate()1123 static kmp_hw_subset_t *allocate() { 1124 int initial_capacity = 5; 1125 kmp_hw_subset_t *retval = 1126 (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t)); 1127 retval->depth = 0; 1128 retval->capacity = initial_capacity; 1129 retval->set = 0ull; 1130 retval->absolute = false; 1131 retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity); 1132 return retval; 1133 } deallocate(kmp_hw_subset_t * subset)1134 static void deallocate(kmp_hw_subset_t *subset) { 1135 __kmp_free(subset->items); 1136 __kmp_free(subset); 1137 } set_absolute()1138 void set_absolute() { absolute = true; } is_absolute()1139 bool is_absolute() const { return absolute; } push_back(int num,kmp_hw_t type,int offset,kmp_hw_attr_t attr)1140 void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) { 1141 for (int i = 0; i < depth; ++i) { 1142 // Found an existing item for this layer type 1143 // Add the num, offset, and attr to this item 1144 if (items[i].type == type) { 1145 int idx = items[i].num_attrs++; 1146 if ((size_t)idx >= MAX_ATTRS) 1147 return; 1148 items[i].num[idx] = num; 1149 items[i].offset[idx] = offset; 1150 items[i].attr[idx] = attr; 1151 return; 1152 } 1153 } 1154 if (depth == capacity - 1) { 1155 capacity *= 2; 1156 item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity); 1157 for (int i = 0; i < depth; ++i) 1158 new_items[i] = items[i]; 1159 __kmp_free(items); 1160 items = new_items; 1161 } 1162 items[depth].num_attrs = 1; 1163 items[depth].type = type; 1164 items[depth].num[0] = num; 1165 items[depth].offset[0] = offset; 1166 items[depth].attr[0] = attr; 1167 depth++; 1168 set |= (1ull << type); 1169 } get_depth()1170 int get_depth() const { return depth; } at(int index)1171 const item_t &at(int index) const { 1172 KMP_DEBUG_ASSERT(index >= 0 && index < depth); 1173 return items[index]; 1174 } at(int index)1175 item_t &at(int index) { 1176 KMP_DEBUG_ASSERT(index >= 0 && index < depth); 1177 return items[index]; 1178 } remove(int index)1179 void remove(int index) { 1180 KMP_DEBUG_ASSERT(index >= 0 && index < depth); 1181 set &= ~(1ull << items[index].type); 1182 for (int j = index + 1; j < depth; ++j) { 1183 items[j - 1] = items[j]; 1184 } 1185 depth--; 1186 } sort()1187 void sort() { 1188 KMP_DEBUG_ASSERT(__kmp_topology); 1189 qsort(items, depth, sizeof(item_t), hw_subset_compare); 1190 } specified(kmp_hw_t type)1191 bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); } 1192 1193 // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset. 1194 // This means putting each of {sockets, cores, threads} in the topology if 1195 // they are not specified: 1196 // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc. 1197 // e.g., 3module => *s,3module,*c,*t 1198 // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET 1199 // are expecting the traditional sockets/cores/threads topology. For newer 1200 // hardware, there can be intervening layers like dies/tiles/modules 1201 // (usually corresponding to a cache level). So when a user asks for 1202 // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user 1203 // should get 12 hardware threads across 6 cores and effectively ignore the 1204 // module layer. canonicalize(const kmp_topology_t * top)1205 void canonicalize(const kmp_topology_t *top) { 1206 // Layers to target for KMP_HW_SUBSET canonicalization 1207 kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD}; 1208 1209 // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS 1210 if (is_absolute()) 1211 return; 1212 1213 // Do not target-layer-canonicalize KMP_HW_SUBSETS when the 1214 // topology doesn't have these layers 1215 for (kmp_hw_t type : targeted) 1216 if (top->get_level(type) == KMP_HW_UNKNOWN) 1217 return; 1218 1219 // Put targeted layers in topology if they do not exist 1220 for (kmp_hw_t type : targeted) { 1221 bool found = false; 1222 for (int i = 0; i < get_depth(); ++i) { 1223 if (top->get_equivalent_type(items[i].type) == type) { 1224 found = true; 1225 break; 1226 } 1227 } 1228 if (!found) { 1229 push_back(USE_ALL, type, 0, kmp_hw_attr_t{}); 1230 } 1231 } 1232 sort(); 1233 // Set as an absolute topology that only targets the targeted layers 1234 set_absolute(); 1235 } dump()1236 void dump() const { 1237 printf("**********************\n"); 1238 printf("*** kmp_hw_subset: ***\n"); 1239 printf("* depth: %d\n", depth); 1240 printf("* items:\n"); 1241 for (int i = 0; i < depth; ++i) { 1242 printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type)); 1243 for (int j = 0; j < items[i].num_attrs; ++j) { 1244 printf(" num: %d, offset: %d, attr: ", items[i].num[j], 1245 items[i].offset[j]); 1246 if (!items[i].attr[j]) { 1247 printf(" (none)\n"); 1248 } else { 1249 printf( 1250 " core_type = %s, core_eff = %d\n", 1251 __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()), 1252 items[i].attr[j].get_core_eff()); 1253 } 1254 } 1255 } 1256 printf("* set: 0x%llx\n", set); 1257 printf("* absolute: %d\n", absolute); 1258 printf("**********************\n"); 1259 } 1260 }; 1261 extern kmp_hw_subset_t *__kmp_hw_subset; 1262 1263 /* A structure for holding machine-specific hierarchy info to be computed once 1264 at init. This structure represents a mapping of threads to the actual machine 1265 hierarchy, or to our best guess at what the hierarchy might be, for the 1266 purpose of performing an efficient barrier. In the worst case, when there is 1267 no machine hierarchy information, it produces a tree suitable for a barrier, 1268 similar to the tree used in the hyper barrier. */ 1269 class hierarchy_info { 1270 public: 1271 /* Good default values for number of leaves and branching factor, given no 1272 affinity information. Behaves a bit like hyper barrier. */ 1273 static const kmp_uint32 maxLeaves = 4; 1274 static const kmp_uint32 minBranch = 4; 1275 /** Number of levels in the hierarchy. Typical levels are threads/core, 1276 cores/package or socket, packages/node, nodes/machine, etc. We don't want 1277 to get specific with nomenclature. When the machine is oversubscribed we 1278 add levels to duplicate the hierarchy, doubling the thread capacity of the 1279 hierarchy each time we add a level. */ 1280 kmp_uint32 maxLevels; 1281 1282 /** This is specifically the depth of the machine configuration hierarchy, in 1283 terms of the number of levels along the longest path from root to any 1284 leaf. It corresponds to the number of entries in numPerLevel if we exclude 1285 all but one trailing 1. */ 1286 kmp_uint32 depth; 1287 kmp_uint32 base_num_threads = 0; 1288 enum init_status { initialized = 0, not_initialized = 1, initializing = 2 }; 1289 volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized, 1290 // 2=initialization in progress 1291 volatile kmp_int8 resizing; // 0=not resizing, 1=resizing 1292 1293 /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children 1294 the parent of a node at level i has. For example, if we have a machine 1295 with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel = 1296 {2, 4, 4, 1, 1}. All empty levels are set to 1. */ 1297 kmp_uint32 *numPerLevel = nullptr; 1298 kmp_uint32 *skipPerLevel = nullptr; 1299 deriveLevels()1300 void deriveLevels() { 1301 int hier_depth = __kmp_topology->get_depth(); 1302 for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) { 1303 numPerLevel[level] = __kmp_topology->get_ratio(i); 1304 } 1305 } 1306 hierarchy_info()1307 hierarchy_info() 1308 : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {} 1309 fini()1310 void fini() { 1311 if (!uninitialized && numPerLevel) { 1312 __kmp_free(numPerLevel); 1313 numPerLevel = NULL; 1314 uninitialized = not_initialized; 1315 } 1316 } 1317 init(int num_addrs)1318 void init(int num_addrs) { 1319 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8( 1320 &uninitialized, not_initialized, initializing); 1321 if (bool_result == 0) { // Wait for initialization 1322 while (TCR_1(uninitialized) != initialized) 1323 KMP_CPU_PAUSE(); 1324 return; 1325 } 1326 KMP_DEBUG_ASSERT(bool_result == 1); 1327 1328 /* Added explicit initialization of the data fields here to prevent usage of 1329 dirty value observed when static library is re-initialized multiple times 1330 (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses 1331 OpenMP). */ 1332 depth = 1; 1333 resizing = 0; 1334 maxLevels = 7; 1335 numPerLevel = 1336 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32)); 1337 skipPerLevel = &(numPerLevel[maxLevels]); 1338 for (kmp_uint32 i = 0; i < maxLevels; 1339 ++i) { // init numPerLevel[*] to 1 item per level 1340 numPerLevel[i] = 1; 1341 skipPerLevel[i] = 1; 1342 } 1343 1344 // Sort table by physical ID 1345 if (__kmp_topology && __kmp_topology->get_depth() > 0) { 1346 deriveLevels(); 1347 } else { 1348 numPerLevel[0] = maxLeaves; 1349 numPerLevel[1] = num_addrs / maxLeaves; 1350 if (num_addrs % maxLeaves) 1351 numPerLevel[1]++; 1352 } 1353 1354 base_num_threads = num_addrs; 1355 for (int i = maxLevels - 1; i >= 0; 1356 --i) // count non-empty levels to get depth 1357 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1' 1358 depth++; 1359 1360 kmp_uint32 branch = minBranch; 1361 if (numPerLevel[0] == 1) 1362 branch = num_addrs / maxLeaves; 1363 if (branch < minBranch) 1364 branch = minBranch; 1365 for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width 1366 while (numPerLevel[d] > branch || 1367 (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0! 1368 if (numPerLevel[d] & 1) 1369 numPerLevel[d]++; 1370 numPerLevel[d] = numPerLevel[d] >> 1; 1371 if (numPerLevel[d + 1] == 1) 1372 depth++; 1373 numPerLevel[d + 1] = numPerLevel[d + 1] << 1; 1374 } 1375 if (numPerLevel[0] == 1) { 1376 branch = branch >> 1; 1377 if (branch < 4) 1378 branch = minBranch; 1379 } 1380 } 1381 1382 for (kmp_uint32 i = 1; i < depth; ++i) 1383 skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1]; 1384 // Fill in hierarchy in the case of oversubscription 1385 for (kmp_uint32 i = depth; i < maxLevels; ++i) 1386 skipPerLevel[i] = 2 * skipPerLevel[i - 1]; 1387 1388 uninitialized = initialized; // One writer 1389 } 1390 1391 // Resize the hierarchy if nproc changes to something larger than before resize(kmp_uint32 nproc)1392 void resize(kmp_uint32 nproc) { 1393 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1); 1394 while (bool_result == 0) { // someone else is trying to resize 1395 KMP_CPU_PAUSE(); 1396 if (nproc <= base_num_threads) // happy with other thread's resize 1397 return; 1398 else // try to resize 1399 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1); 1400 } 1401 KMP_DEBUG_ASSERT(bool_result != 0); 1402 if (nproc <= base_num_threads) 1403 return; // happy with other thread's resize 1404 1405 // Calculate new maxLevels 1406 kmp_uint32 old_sz = skipPerLevel[depth - 1]; 1407 kmp_uint32 incs = 0, old_maxLevels = maxLevels; 1408 // First see if old maxLevels is enough to contain new size 1409 for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) { 1410 skipPerLevel[i] = 2 * skipPerLevel[i - 1]; 1411 numPerLevel[i - 1] *= 2; 1412 old_sz *= 2; 1413 depth++; 1414 } 1415 if (nproc > old_sz) { // Not enough space, need to expand hierarchy 1416 while (nproc > old_sz) { 1417 old_sz *= 2; 1418 incs++; 1419 depth++; 1420 } 1421 maxLevels += incs; 1422 1423 // Resize arrays 1424 kmp_uint32 *old_numPerLevel = numPerLevel; 1425 kmp_uint32 *old_skipPerLevel = skipPerLevel; 1426 numPerLevel = skipPerLevel = NULL; 1427 numPerLevel = 1428 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32)); 1429 skipPerLevel = &(numPerLevel[maxLevels]); 1430 1431 // Copy old elements from old arrays 1432 for (kmp_uint32 i = 0; i < old_maxLevels; ++i) { 1433 // init numPerLevel[*] to 1 item per level 1434 numPerLevel[i] = old_numPerLevel[i]; 1435 skipPerLevel[i] = old_skipPerLevel[i]; 1436 } 1437 1438 // Init new elements in arrays to 1 1439 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) { 1440 // init numPerLevel[*] to 1 item per level 1441 numPerLevel[i] = 1; 1442 skipPerLevel[i] = 1; 1443 } 1444 1445 // Free old arrays 1446 __kmp_free(old_numPerLevel); 1447 } 1448 1449 // Fill in oversubscription levels of hierarchy 1450 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) 1451 skipPerLevel[i] = 2 * skipPerLevel[i - 1]; 1452 1453 base_num_threads = nproc; 1454 resizing = 0; // One writer 1455 } 1456 }; 1457 #endif // KMP_AFFINITY_H 1458