/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support &&
        topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

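/* Illustrative sketch (not part of the build): every concrete affinity class in
   this file implements the abstract KMPAffinity / KMPAffinity::Mask interface
   declared in kmp.h. Assuming KMP_AFFINITY_CAPABLE() and an instance
   `affinity` of one of these classes, a typical sequence looks like:

     KMPAffinity::Mask *mask = affinity->allocate_mask();
     mask->zero();
     mask->set(0); // request logical CPU 0
     int err = mask->set_system_affinity(false); // 0 on success, else errno
     affinity->deallocate_mask(mask);

   The runtime itself normally goes through the KMP_CPU_* macros rather than
   calling these methods directly (see bind_thread() above). */
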
#if KMP_OS_LINUX || KMP_OS_FREEBSD
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot change;
   they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#endif
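/* Illustrative sketch (not compiled here): on Linux the Mask class below drives
   the affinity system calls directly through syscall(2) with the __NR_* numbers
   pinned above, rather than through the glibc wrappers. Roughly (assuming
   <sys/syscall.h> and <unistd.h> are available):

     unsigned long bits[16] = {0};
     bits[0] = 1UL; // logical CPU 0
     // The pid argument of 0 selects the calling thread.
     long rc = syscall(__NR_sched_setaffinity, 0, sizeof(bits), bits);
     // rc >= 0 on success; on failure errno holds the error code.

   get_system_affinity()/set_system_affinity() below follow exactly this
   pattern, passing __kmp_affin_mask_size as the size argument. */
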
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
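// Worked example for the bit indexing above, assuming an 8-byte mask_t
// (BITS_PER_MASK_T == 64): logical CPU 70 lives in word 70 / 64 == 1 at bit
// 70 % 64 == 6, so set(70) performs mask[1] |= (ONE << 6). With
// __kmp_affin_mask_size == 128 bytes, the mask holds 16 words and
// end() == 1024 bits.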
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
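// Worked example for the processor-group handling above: on 64-bit Windows
// ULONG_PTR is 64 bits wide, so with two processor groups logical CPU 70 is
// bit 6 of mask[1]. get_proc_group() returns 1 for such a mask because bits
// are set in exactly one group; a mask spanning both groups yields -1, which
// set_system_affinity() rejects as an invalid mask since a GROUP_AFFINITY can
// name only one group.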
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

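/* Illustrative sketch (not part of the build) of the kmp_hw_attr_t matching
   rules above, assuming the hybrid core-type enumerators from kmp.h (e.g.,
   KMP_HW_CORE_TYPE_CORE on x86):

     kmp_hw_attr_t a, b;
     a.set_core_type(KMP_HW_CORE_TYPE_CORE);
     a.set_core_eff(1);
     b.set_core_type(KMP_HW_CORE_TYPE_CORE);
     // a.contains(b) is true: every constraint b carries is matched by a.
     // b.contains(a) is false: a also constrains the core efficiency,
     // which b does not specify.
     // Two default-constructed (invalid) attributes contain each other.
*/
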
class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been allocated to hold
  // up to KMP_HW_LAST objects if needed, so layers can be added without
  // reallocating any array.

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios; for non-uniform topologies,
  // this ratio holds the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array. If the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique. Returns true if they are,
  // false otherwise.
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() is called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
#endif
  bool filter_hw_subset();
  bool is_close(int hwt1, int hwt2, int level) const;
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology;
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

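/* Illustrative sketch (not part of the build): querying a canonicalized
   topology such as the [4 packages | 6 cores/package | 2 threads/core] example
   above, assuming the kmp_hw_t enumerators from kmp.h (KMP_HW_SOCKET,
   KMP_HW_CORE, KMP_HW_THREAD) and that all three layers were detected:

     const kmp_topology_t *topo = __kmp_topology;
     int core_lvl = topo->get_level(KMP_HW_CORE);     // 1 in this example
     int thread_lvl = topo->get_level(KMP_HW_THREAD); // 2 in this example
     topo->get_ratio(core_lvl);                   // 6: cores per package
     topo->get_count(core_lvl);                   // 24: cores in the machine
     topo->calculate_ratio(thread_lvl, core_lvl); // 2: threads per core
     topo->calculate_ratio(thread_lvl, 0);        // 12: threads per package
*/
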
class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("  type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("    num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

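/* Illustrative sketch (not part of the build): roughly how a KMP_HW_SUBSET
   value such as "2s,4c,2t" is recorded, assuming the kmp_hw_t enumerators from
   kmp.h (KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD); the real parser lives
   elsewhere in the runtime:

     kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
     kmp_hw_attr_t no_attr; // invalid attribute: no core-type/efficiency filter
     subset->push_back(2, KMP_HW_SOCKET, 0, no_attr);
     subset->push_back(4, KMP_HW_CORE, 0, no_attr);
     subset->push_back(2, KMP_HW_THREAD, 0, no_attr);
     subset->sort(); // requires __kmp_topology; orders items by topology level
     kmp_hw_subset_t::deallocate(subset);
*/
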
/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of the
      hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1.
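      For that same example, skipPerLevel, which init() below derives as the
      running product skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1]
      with skipPerLevel[0] = 1, begins {1, 2, 8, 32}: a node at level i spans
      skipPerLevel[i] leaves (threads).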
   */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage of
       dirty value observed when static library is re-initialized multiple times
       (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
       OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Derive per-level branching factors from the machine topology if available
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

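  // Worked example (derived from init() above): with no topology information
  // and num_addrs == 16, numPerLevel becomes {4, 4, 1, ...} (maxLeaves leaves
  // per branch, then 16 / 4 branches), depth == 3, and skipPerLevel ==
  // {1, 4, 16, 32, 64, 128, 256}; the entries past depth pre-provision capacity
  // for oversubscription by repeated doubling.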
  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        // carry over the previously computed levels
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H