/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */
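
// A minimal illustrative sketch (comments only, not part of the runtime): the
// concrete affinity classes in this header are used through the abstract
// KMPAffinity / KMPAffinity::Mask interface they override. Assuming `affinity`
// points at one of these implementations and affinity is capable, pinning the
// calling thread to a single logical CPU looks roughly like:
//
//   KMPAffinity::Mask *mask = affinity->allocate_mask();
//   mask->zero();
//   mask->set(cpu);                            // select one logical CPU
//   mask->set_system_affinity(/*abort_on_error=*/true);
//   affinity->deallocate_mask(mask);
//
// Inside the runtime this pattern is wrapped by the KMP_CPU_ALLOC / KMP_CPU_ZERO /
// KMP_CPU_SET style macros used in bind_thread() above.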

#if KMP_OS_LINUX || KMP_OS_FREEBSD
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot change:
   they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
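
// Illustrative note (not part of the runtime): the native mask above is a flat
// array of mask_t (unsigned long) words. Logical CPU i lives in word
// i / BITS_PER_MASK_T at bit i % BITS_PER_MASK_T, so with 64-bit mask_t,
// set(70) is equivalent to
//
//   mask[70 / 64] |= (ONE << (70 % 64));   // i.e., mask[1] |= bit 6
//
// and end() is (__kmp_affin_mask_size / sizeof(mask_t)) * BITS_PER_MASK_T, the
// total number of representable CPUs rather than the highest set bit.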

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */
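
// Illustrative note (not part of the runtime): on Windows, machines with more
// than 64 logical processors expose them as processor groups of at most 64.
// The Windows Mask above therefore keeps one mask_t word per group, so a
// logical CPU id maps to group id / BITS_PER_MASK_T, bit id % BITS_PER_MASK_T
// (e.g., CPU 70 on a two-group machine is group 1, bit 6). A mask can only be
// expressed as a single GROUP_AFFINITY when exactly one group word is nonzero,
// which is what get_proc_group() checks (returning -1 otherwise).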

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif
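
// Illustrative sketch (comments only, not part of the runtime): contains()
// answers "does this attribute satisfy every constraint the other attribute
// specifies?". For example, assuming two attributes built with the setters
// above:
//
//   kmp_hw_attr_t eff_only, eff_and_type;
//   eff_only.set_core_eff(1);          // efficiency 1, any core type
//   eff_and_type.set_core_eff(1);
//   eff_and_type.set_core_type(type);  // plus some valid kmp_hw_core_type_t
//
//   eff_and_type.contains(eff_only);   // true: the core_eff constraint matches
//   eff_only.contains(eff_and_type);   // false: the core type is not pinned
//
// An attribute with valid == 0 contains only another invalid attribute.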

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been allocated to hold
  // up to KMP_HW_LAST objects, so layers can be added without reallocating
  // any array.

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios; for non-uniform topologies, this ratio holds
  // the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique. Return true if they are,
  // false otherwise.
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() is called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
#endif
  bool filter_hw_subset();
  bool is_close(int hwt1, int hwt2, int level) const;
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate the number of objects of the type at level1 per object of the
  // type at level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;
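
// Illustrative note (not part of the runtime): for the example layout in the
// ratio[] comment above, [ 4 packages | 6 cores / package | 2 threads / core ],
// a canonicalized topology would hold roughly
//
//   types = { KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD }   (depth == 3)
//   ratio = { 4, 6, 2 }     // max items per item of the level above
//   count = { 4, 24, 48 }   // absolute number of items at each level
//
// so calculate_ratio(2, 0) == ratio[2] * ratio[1] == 12 hardware threads per
// package, and get_count(2) == 48 is the machine-wide hardware thread count.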

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sort the KMP_HW_SUBSET items to follow topology order.
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf(" num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;
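
// Illustrative note (not part of the runtime): __kmp_hw_subset is populated by
// the KMP_HW_SUBSET environment variable parser. As a rough sketch, a value
// such as KMP_HW_SUBSET=2s,4c,2t would translate into three push_back() calls,
//
//   subset->push_back(2, KMP_HW_SOCKET, 0, kmp_hw_attr_t{});
//   subset->push_back(4, KMP_HW_CORE, 0, kmp_hw_attr_t{});
//   subset->push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t{});
//
// after which sort() reorders the items to match the machine topology and
// kmp_topology_t::filter_hw_subset() prunes hardware threads accordingly.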

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of the
      hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
                                   // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;
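
  // Illustrative note (not part of the runtime): skipPerLevel[i] is the number
  // of leaves covered by one subtree rooted at level i; init() computes
  // skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1] starting from
  // skipPerLevel[0] = 1. For the numPerLevel = {2, 4, 4, 1, 1} example above
  // (and assuming the width-optimization loop in init() leaves those counts
  // unchanged), this gives skipPerLevel = {1, 2, 8, 32}, with the remaining
  // oversubscription levels doubling: 64, 128, ...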

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Explicitly initialize the data fields here to prevent use of dirty
       values observed when the static library is re-initialized multiple times
       (e.g., when a non-OpenMP thread repeatedly launches/joins a thread that
       uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H