xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h (revision 8311bc5f17dec348749f763b82dfe2737bc53cd7)
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_USE_HWLOC
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24   class Mask : public KMPAffinity::Mask {
25     hwloc_cpuset_t mask;
26 
27   public:
28     Mask() {
29       mask = hwloc_bitmap_alloc();
30       this->zero();
31     }
32     ~Mask() { hwloc_bitmap_free(mask); }
33     void set(int i) override { hwloc_bitmap_set(mask, i); }
34     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36     void zero() override { hwloc_bitmap_zero(mask); }
37     void copy(const KMPAffinity::Mask *src) override {
38       const Mask *convert = static_cast<const Mask *>(src);
39       hwloc_bitmap_copy(mask, convert->mask);
40     }
41     void bitwise_and(const KMPAffinity::Mask *rhs) override {
42       const Mask *convert = static_cast<const Mask *>(rhs);
43       hwloc_bitmap_and(mask, mask, convert->mask);
44     }
45     void bitwise_or(const KMPAffinity::Mask *rhs) override {
46       const Mask *convert = static_cast<const Mask *>(rhs);
47       hwloc_bitmap_or(mask, mask, convert->mask);
48     }
49     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
50     int begin() const override { return hwloc_bitmap_first(mask); }
51     int end() const override { return -1; }
52     int next(int previous) const override {
53       return hwloc_bitmap_next(mask, previous);
54     }
55     int get_system_affinity(bool abort_on_error) override {
56       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
57                   "Illegal get affinity operation when not capable");
58       long retval =
59           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
60       if (retval >= 0) {
61         return 0;
62       }
63       int error = errno;
64       if (abort_on_error) {
65         __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
66                     KMP_ERR(error), __kmp_msg_null);
67       }
68       return error;
69     }
70     int set_system_affinity(bool abort_on_error) const override {
71       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
72                   "Illegal set affinity operation when not capable");
73       long retval =
74           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
75       if (retval >= 0) {
76         return 0;
77       }
78       int error = errno;
79       if (abort_on_error) {
80         __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
81                     KMP_ERR(error), __kmp_msg_null);
82       }
83       return error;
84     }
85 #if KMP_OS_WINDOWS
86     int set_process_affinity(bool abort_on_error) const override {
87       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
88                   "Illegal set process affinity operation when not capable");
89       int error = 0;
90       const hwloc_topology_support *support =
91           hwloc_topology_get_support(__kmp_hwloc_topology);
92       if (support->cpubind->set_proc_cpubind) {
93         int retval;
94         retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
95                                    HWLOC_CPUBIND_PROCESS);
96         if (retval >= 0)
97           return 0;
98         error = errno;
99         if (abort_on_error)
100           __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
101                       KMP_ERR(error), __kmp_msg_null);
102       }
103       return error;
104     }
105 #endif
106     int get_proc_group() const override {
107       int group = -1;
108 #if KMP_OS_WINDOWS
109       if (__kmp_num_proc_groups == 1) {
110         return 1;
111       }
112       for (int i = 0; i < __kmp_num_proc_groups; i++) {
113         // On windows, the long type is always 32 bits
114         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
115         unsigned long second_32_bits =
116             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
117         if (first_32_bits == 0 && second_32_bits == 0) {
118           continue;
119         }
120         if (group >= 0) {
121           return -1;
122         }
123         group = i;
124       }
125 #endif /* KMP_OS_WINDOWS */
126       return group;
127     }
128   };
129   void determine_capable(const char *var) override {
130     const hwloc_topology_support *topology_support;
131     if (__kmp_hwloc_topology == NULL) {
132       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
133         __kmp_hwloc_error = TRUE;
134         if (__kmp_affinity.flags.verbose) {
135           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
136         }
137       }
138       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
139         __kmp_hwloc_error = TRUE;
140         if (__kmp_affinity.flags.verbose) {
141           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
142         }
143       }
144     }
145     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
146     // Is the system capable of setting/getting this thread's affinity?
147     // Also, is topology discovery possible? (pu indicates ability to discover
148     // processing units). And finally, were there no errors when calling any
149     // hwloc_* API functions?
150     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
151         topology_support->cpubind->get_thisthread_cpubind &&
152         topology_support->discovery->pu && !__kmp_hwloc_error) {
153       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
154       KMP_AFFINITY_ENABLE(TRUE);
155     } else {
156       // indicate that hwloc didn't work and disable affinity
157       __kmp_hwloc_error = TRUE;
158       KMP_AFFINITY_DISABLE();
159     }
160   }
161   void bind_thread(int which) override {
162     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
163                 "Illegal set affinity operation when not capable");
164     KMPAffinity::Mask *mask;
165     KMP_CPU_ALLOC_ON_STACK(mask);
166     KMP_CPU_ZERO(mask);
167     KMP_CPU_SET(which, mask);
168     __kmp_set_system_affinity(mask, TRUE);
169     KMP_CPU_FREE_FROM_STACK(mask);
170   }
171   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
172   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
173   KMPAffinity::Mask *allocate_mask_array(int num) override {
174     return new Mask[num];
175   }
176   void deallocate_mask_array(KMPAffinity::Mask *array) override {
177     Mask *hwloc_array = static_cast<Mask *>(array);
178     delete[] hwloc_array;
179   }
180   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
181                                       int index) override {
182     Mask *hwloc_array = static_cast<Mask *>(array);
183     return &(hwloc_array[index]);
184   }
185   api_type get_api_type() const override { return HWLOC; }
186 };
187 #endif /* KMP_USE_HWLOC */
188 
189 #if KMP_OS_LINUX || KMP_OS_FREEBSD
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change: they are set in stone forever.
   For each architecture below: define the syscall number when the system
   headers do not provide it, and fail the build when the headers provide a
   different value than expected. */
#include <sys/syscall.h>
/* 32-bit x86 and ARM: setaffinity 241, getaffinity 242 */
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
/* AArch64: setaffinity 122, getaffinity 123 */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
/* x86-64: setaffinity 203, getaffinity 204 */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
/* PPC64: setaffinity 222, getaffinity 223 */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
/* MIPS: setaffinity 4239, getaffinity 4240 */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
/* MIPS64: setaffinity 5195, getaffinity 5196 */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
/* LoongArch64: setaffinity 122, getaffinity 123 */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
/* RISC-V 64: setaffinity 122, getaffinity 123 */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
/* FreeBSD uses the pthread affinity API instead of raw system calls. */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#endif
291 class KMPNativeAffinity : public KMPAffinity {
292   class Mask : public KMPAffinity::Mask {
293     typedef unsigned long mask_t;
294     typedef decltype(__kmp_affin_mask_size) mask_size_type;
295     static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
296     static const mask_t ONE = 1;
297     mask_size_type get_num_mask_types() const {
298       return __kmp_affin_mask_size / sizeof(mask_t);
299     }
300 
301   public:
302     mask_t *mask;
303     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
304     ~Mask() {
305       if (mask)
306         __kmp_free(mask);
307     }
308     void set(int i) override {
309       mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
310     }
311     bool is_set(int i) const override {
312       return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
313     }
314     void clear(int i) override {
315       mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
316     }
317     void zero() override {
318       mask_size_type e = get_num_mask_types();
319       for (mask_size_type i = 0; i < e; ++i)
320         mask[i] = (mask_t)0;
321     }
322     void copy(const KMPAffinity::Mask *src) override {
323       const Mask *convert = static_cast<const Mask *>(src);
324       mask_size_type e = get_num_mask_types();
325       for (mask_size_type i = 0; i < e; ++i)
326         mask[i] = convert->mask[i];
327     }
328     void bitwise_and(const KMPAffinity::Mask *rhs) override {
329       const Mask *convert = static_cast<const Mask *>(rhs);
330       mask_size_type e = get_num_mask_types();
331       for (mask_size_type i = 0; i < e; ++i)
332         mask[i] &= convert->mask[i];
333     }
334     void bitwise_or(const KMPAffinity::Mask *rhs) override {
335       const Mask *convert = static_cast<const Mask *>(rhs);
336       mask_size_type e = get_num_mask_types();
337       for (mask_size_type i = 0; i < e; ++i)
338         mask[i] |= convert->mask[i];
339     }
340     void bitwise_not() override {
341       mask_size_type e = get_num_mask_types();
342       for (mask_size_type i = 0; i < e; ++i)
343         mask[i] = ~(mask[i]);
344     }
345     int begin() const override {
346       int retval = 0;
347       while (retval < end() && !is_set(retval))
348         ++retval;
349       return retval;
350     }
351     int end() const override {
352       int e;
353       __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
354       return e;
355     }
356     int next(int previous) const override {
357       int retval = previous + 1;
358       while (retval < end() && !is_set(retval))
359         ++retval;
360       return retval;
361     }
362     int get_system_affinity(bool abort_on_error) override {
363       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
364                   "Illegal get affinity operation when not capable");
365 #if KMP_OS_LINUX
366       long retval =
367           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
368 #elif KMP_OS_FREEBSD
369       int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
370                                      reinterpret_cast<cpuset_t *>(mask));
371       int retval = (r == 0 ? 0 : -1);
372 #endif
373       if (retval >= 0) {
374         return 0;
375       }
376       int error = errno;
377       if (abort_on_error) {
378         __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
379                     KMP_ERR(error), __kmp_msg_null);
380       }
381       return error;
382     }
383     int set_system_affinity(bool abort_on_error) const override {
384       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
385                   "Illegal set affinity operation when not capable");
386 #if KMP_OS_LINUX
387       long retval =
388           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
389 #elif KMP_OS_FREEBSD
390       int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
391                                      reinterpret_cast<cpuset_t *>(mask));
392       int retval = (r == 0 ? 0 : -1);
393 #endif
394       if (retval >= 0) {
395         return 0;
396       }
397       int error = errno;
398       if (abort_on_error) {
399         __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
400                     KMP_ERR(error), __kmp_msg_null);
401       }
402       return error;
403     }
404   };
405   void determine_capable(const char *env_var) override {
406     __kmp_affinity_determine_capable(env_var);
407   }
408   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
409   KMPAffinity::Mask *allocate_mask() override {
410     KMPNativeAffinity::Mask *retval = new Mask();
411     return retval;
412   }
413   void deallocate_mask(KMPAffinity::Mask *m) override {
414     KMPNativeAffinity::Mask *native_mask =
415         static_cast<KMPNativeAffinity::Mask *>(m);
416     delete native_mask;
417   }
418   KMPAffinity::Mask *allocate_mask_array(int num) override {
419     return new Mask[num];
420   }
421   void deallocate_mask_array(KMPAffinity::Mask *array) override {
422     Mask *linux_array = static_cast<Mask *>(array);
423     delete[] linux_array;
424   }
425   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
426                                       int index) override {
427     Mask *linux_array = static_cast<Mask *>(array);
428     return &(linux_array[index]);
429   }
430   api_type get_api_type() const override { return NATIVE_OS; }
431 };
432 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
433 
434 #if KMP_OS_WINDOWS
435 class KMPNativeAffinity : public KMPAffinity {
436   class Mask : public KMPAffinity::Mask {
437     typedef ULONG_PTR mask_t;
438     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
439     mask_t *mask;
440 
441   public:
442     Mask() {
443       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
444     }
445     ~Mask() {
446       if (mask)
447         __kmp_free(mask);
448     }
449     void set(int i) override {
450       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
451     }
452     bool is_set(int i) const override {
453       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
454     }
455     void clear(int i) override {
456       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
457     }
458     void zero() override {
459       for (int i = 0; i < __kmp_num_proc_groups; ++i)
460         mask[i] = 0;
461     }
462     void copy(const KMPAffinity::Mask *src) override {
463       const Mask *convert = static_cast<const Mask *>(src);
464       for (int i = 0; i < __kmp_num_proc_groups; ++i)
465         mask[i] = convert->mask[i];
466     }
467     void bitwise_and(const KMPAffinity::Mask *rhs) override {
468       const Mask *convert = static_cast<const Mask *>(rhs);
469       for (int i = 0; i < __kmp_num_proc_groups; ++i)
470         mask[i] &= convert->mask[i];
471     }
472     void bitwise_or(const KMPAffinity::Mask *rhs) override {
473       const Mask *convert = static_cast<const Mask *>(rhs);
474       for (int i = 0; i < __kmp_num_proc_groups; ++i)
475         mask[i] |= convert->mask[i];
476     }
477     void bitwise_not() override {
478       for (int i = 0; i < __kmp_num_proc_groups; ++i)
479         mask[i] = ~(mask[i]);
480     }
481     int begin() const override {
482       int retval = 0;
483       while (retval < end() && !is_set(retval))
484         ++retval;
485       return retval;
486     }
487     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
488     int next(int previous) const override {
489       int retval = previous + 1;
490       while (retval < end() && !is_set(retval))
491         ++retval;
492       return retval;
493     }
494     int set_process_affinity(bool abort_on_error) const override {
495       if (__kmp_num_proc_groups <= 1) {
496         if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
497           DWORD error = GetLastError();
498           if (abort_on_error) {
499             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
500                         __kmp_msg_null);
501           }
502           return error;
503         }
504       }
505       return 0;
506     }
507     int set_system_affinity(bool abort_on_error) const override {
508       if (__kmp_num_proc_groups > 1) {
509         // Check for a valid mask.
510         GROUP_AFFINITY ga;
511         int group = get_proc_group();
512         if (group < 0) {
513           if (abort_on_error) {
514             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
515           }
516           return -1;
517         }
518         // Transform the bit vector into a GROUP_AFFINITY struct
519         // and make the system call to set affinity.
520         ga.Group = group;
521         ga.Mask = mask[group];
522         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
523 
524         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
525         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
526           DWORD error = GetLastError();
527           if (abort_on_error) {
528             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
529                         __kmp_msg_null);
530           }
531           return error;
532         }
533       } else {
534         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
535           DWORD error = GetLastError();
536           if (abort_on_error) {
537             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
538                         __kmp_msg_null);
539           }
540           return error;
541         }
542       }
543       return 0;
544     }
545     int get_system_affinity(bool abort_on_error) override {
546       if (__kmp_num_proc_groups > 1) {
547         this->zero();
548         GROUP_AFFINITY ga;
549         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
550         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
551           DWORD error = GetLastError();
552           if (abort_on_error) {
553             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
554                         KMP_ERR(error), __kmp_msg_null);
555           }
556           return error;
557         }
558         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
559             (ga.Mask == 0)) {
560           return -1;
561         }
562         mask[ga.Group] = ga.Mask;
563       } else {
564         mask_t newMask, sysMask, retval;
565         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
566           DWORD error = GetLastError();
567           if (abort_on_error) {
568             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
569                         KMP_ERR(error), __kmp_msg_null);
570           }
571           return error;
572         }
573         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
574         if (!retval) {
575           DWORD error = GetLastError();
576           if (abort_on_error) {
577             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
578                         KMP_ERR(error), __kmp_msg_null);
579           }
580           return error;
581         }
582         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
583         if (!newMask) {
584           DWORD error = GetLastError();
585           if (abort_on_error) {
586             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
587                         KMP_ERR(error), __kmp_msg_null);
588           }
589         }
590         *mask = retval;
591       }
592       return 0;
593     }
594     int get_proc_group() const override {
595       int group = -1;
596       if (__kmp_num_proc_groups == 1) {
597         return 1;
598       }
599       for (int i = 0; i < __kmp_num_proc_groups; i++) {
600         if (mask[i] == 0)
601           continue;
602         if (group >= 0)
603           return -1;
604         group = i;
605       }
606       return group;
607     }
608   };
609   void determine_capable(const char *env_var) override {
610     __kmp_affinity_determine_capable(env_var);
611   }
612   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
613   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
614   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
615   KMPAffinity::Mask *allocate_mask_array(int num) override {
616     return new Mask[num];
617   }
618   void deallocate_mask_array(KMPAffinity::Mask *array) override {
619     Mask *windows_array = static_cast<Mask *>(array);
620     delete[] windows_array;
621   }
622   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
623                                       int index) override {
624     Mask *windows_array = static_cast<Mask *>(array);
625     return &(windows_array[index]);
626   }
627   api_type get_api_type() const override { return NATIVE_OS; }
628 };
629 #endif /* KMP_OS_WINDOWS */
630 #endif /* KMP_AFFINITY_SUPPORTED */
631 
632 // Describe an attribute for a level in the machine topology
633 struct kmp_hw_attr_t {
634   int core_type : 8;
635   int core_eff : 8;
636   unsigned valid : 1;
637   unsigned reserved : 15;
638 
639   static const int UNKNOWN_CORE_EFF = -1;
640 
641   kmp_hw_attr_t()
642       : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
643         valid(0), reserved(0) {}
644   void set_core_type(kmp_hw_core_type_t type) {
645     valid = 1;
646     core_type = type;
647   }
648   void set_core_eff(int eff) {
649     valid = 1;
650     core_eff = eff;
651   }
652   kmp_hw_core_type_t get_core_type() const {
653     return (kmp_hw_core_type_t)core_type;
654   }
655   int get_core_eff() const { return core_eff; }
656   bool is_core_type_valid() const {
657     return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
658   }
659   bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
660   operator bool() const { return valid; }
661   void clear() {
662     core_type = KMP_HW_CORE_TYPE_UNKNOWN;
663     core_eff = UNKNOWN_CORE_EFF;
664     valid = 0;
665   }
666   bool contains(const kmp_hw_attr_t &other) const {
667     if (!valid && !other.valid)
668       return true;
669     if (valid && other.valid) {
670       if (other.is_core_type_valid()) {
671         if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
672           return false;
673       }
674       if (other.is_core_eff_valid()) {
675         if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
676           return false;
677       }
678       return true;
679     }
680     return false;
681   }
682   bool operator==(const kmp_hw_attr_t &rhs) const {
683     return (rhs.valid == valid && rhs.core_eff == core_eff &&
684             rhs.core_type == core_type);
685   }
686   bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
687 };
688 
689 #if KMP_AFFINITY_SUPPORTED
690 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
691 #endif
692 
693 class kmp_hw_thread_t {
694 public:
695   static const int UNKNOWN_ID = -1;
696   static const int MULTIPLE_ID = -2;
697   static int compare_ids(const void *a, const void *b);
698   static int compare_compact(const void *a, const void *b);
699   int ids[KMP_HW_LAST];
700   int sub_ids[KMP_HW_LAST];
701   bool leader;
702   int os_id;
703   kmp_hw_attr_t attrs;
704 
705   void print() const;
706   void clear() {
707     for (int i = 0; i < (int)KMP_HW_LAST; ++i)
708       ids[i] = UNKNOWN_ID;
709     leader = false;
710     attrs.clear();
711   }
712 };
713 
714 class kmp_topology_t {
715 
716   struct flags_t {
717     int uniform : 1;
718     int reserved : 31;
719   };
720 
721   int depth;
722 
723   // The following arrays are all 'depth' long and have been
724   // allocated to hold up to KMP_HW_LAST number of objects if
725   // needed so layers can be added without reallocation of any array
726 
  // Ordered array of the types in the topology
728   kmp_hw_t *types;
729 
730   // Keep quick topology ratios, for non-uniform topologies,
731   // this ratio holds the max number of itemAs per itemB
732   // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
733   int *ratio;
734 
735   // Storage containing the absolute number of each topology layer
736   int *count;
737 
738   // The number of core efficiencies. This is only useful for hybrid
739   // topologies. Core efficiencies will range from 0 to num efficiencies - 1
740   int num_core_efficiencies;
741   int num_core_types;
742   kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
743 
744   // The hardware threads array
745   // hw_threads is num_hw_threads long
746   // Each hw_thread's ids and sub_ids are depth deep
747   int num_hw_threads;
748   kmp_hw_thread_t *hw_threads;
749 
750   // Equivalence hash where the key is the hardware topology item
751   // and the value is the equivalent hardware topology type in the
752   // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
753   // known equivalence for the topology type
754   kmp_hw_t equivalent[KMP_HW_LAST];
755 
756   // Flags describing the topology
757   flags_t flags;
758 
759   // Compact value used during sort_compact()
760   int compact;
761 
762   // Insert a new topology layer after allocation
763   void _insert_layer(kmp_hw_t type, const int *ids);
764 
765 #if KMP_GROUP_AFFINITY
766   // Insert topology information about Windows Processor groups
767   void _insert_windows_proc_groups();
768 #endif
769 
770   // Count each item & get the num x's per y
771   // e.g., get the number of cores and the number of threads per core
772   // for each (x, y) in (KMP_HW_* , KMP_HW_*)
773   void _gather_enumeration_information();
774 
775   // Remove layers that don't add information to the topology.
776   // This is done by having the layer take on the id = UNKNOWN_ID (-1)
777   void _remove_radix1_layers();
778 
779   // Find out if the topology is uniform
780   void _discover_uniformity();
781 
782   // Set all the sub_ids for each hardware thread
783   void _set_sub_ids();
784 
785   // Set global affinity variables describing the number of threads per
786   // core, the number of packages, the number of cores per package, and
787   // the number of cores.
788   void _set_globals();
789 
790   // Set the last level cache equivalent type
791   void _set_last_level_cache();
792 
793   // Return the number of cores with a particular attribute, 'attr'.
794   // If 'find_all' is true, then find all cores on the machine, otherwise find
795   // all cores per the layer 'above'
796   int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
797                             bool find_all = false) const;
798 
799 public:
800   // Force use of allocate()/deallocate()
801   kmp_topology_t() = delete;
802   kmp_topology_t(const kmp_topology_t &t) = delete;
803   kmp_topology_t(kmp_topology_t &&t) = delete;
804   kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
805   kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
806 
807   static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
808   static void deallocate(kmp_topology_t *);
809 
810   // Functions used in create_map() routines
811   kmp_hw_thread_t &at(int index) {
812     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
813     return hw_threads[index];
814   }
815   const kmp_hw_thread_t &at(int index) const {
816     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
817     return hw_threads[index];
818   }
819   int get_num_hw_threads() const { return num_hw_threads; }
820   void sort_ids() {
821     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
822           kmp_hw_thread_t::compare_ids);
823   }
824   // Check if the hardware ids are unique, if they are
825   // return true, otherwise return false
826   bool check_ids() const;
827 
828   // Function to call after the create_map() routine
829   void canonicalize();
830   void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
831 
832 // Functions used after canonicalize() called
833 
834 #if KMP_AFFINITY_SUPPORTED
835   // Set the granularity for affinity settings
836   void set_granularity(kmp_affinity_t &stgs) const;
837 #endif
838   bool filter_hw_subset();
839   bool is_close(int hwt1, int hwt2, int level) const;
840   bool is_uniform() const { return flags.uniform; }
841   // Tell whether a type is a valid type in the topology
842   // returns KMP_HW_UNKNOWN when there is no equivalent type
843   kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
844   // Set type1 = type2
845   void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
846     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
847     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
848     kmp_hw_t real_type2 = equivalent[type2];
849     if (real_type2 == KMP_HW_UNKNOWN)
850       real_type2 = type2;
851     equivalent[type1] = real_type2;
852     // This loop is required since any of the types may have been set to
853     // be equivalent to type1.  They all must be checked and reset to type2.
854     KMP_FOREACH_HW_TYPE(type) {
855       if (equivalent[type] == type1) {
856         equivalent[type] = real_type2;
857       }
858     }
859   }
860   // Calculate number of types corresponding to level1
861   // per types corresponding to level2 (e.g., number of threads per core)
862   int calculate_ratio(int level1, int level2) const {
863     KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
864     KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
865     int r = 1;
866     for (int level = level1; level > level2; --level)
867       r *= ratio[level];
868     return r;
869   }
870   int get_ratio(int level) const {
871     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
872     return ratio[level];
873   }
874   int get_depth() const { return depth; };
875   kmp_hw_t get_type(int level) const {
876     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
877     return types[level];
878   }
879   int get_level(kmp_hw_t type) const {
880     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
881     int eq_type = equivalent[type];
882     if (eq_type == KMP_HW_UNKNOWN)
883       return -1;
884     for (int i = 0; i < depth; ++i)
885       if (types[i] == eq_type)
886         return i;
887     return -1;
888   }
889   int get_count(int level) const {
890     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
891     return count[level];
892   }
893   // Return the total number of cores with attribute 'attr'
894   int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
895     return _get_ncores_with_attr(attr, -1, true);
896   }
897   // Return the number of cores with attribute
898   // 'attr' per topology level 'above'
899   int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
900     return _get_ncores_with_attr(attr, above, false);
901   }
902 
903 #if KMP_AFFINITY_SUPPORTED
904   friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
905   void sort_compact(kmp_affinity_t &affinity) {
906     compact = affinity.compact;
907     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
908           kmp_hw_thread_t::compare_compact);
909   }
910 #endif
911   void print(const char *env_var = "KMP_AFFINITY") const;
912   void dump() const;
913 };
914 extern kmp_topology_t *__kmp_topology;
915 
916 class kmp_hw_subset_t {
917   const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
918 
919 public:
920   // Describe a machine topology item in KMP_HW_SUBSET
921   struct item_t {
922     kmp_hw_t type;
923     int num_attrs;
924     int num[MAX_ATTRS];
925     int offset[MAX_ATTRS];
926     kmp_hw_attr_t attr[MAX_ATTRS];
927   };
928   // Put parenthesis around max to avoid accidental use of Windows max macro.
929   const static int USE_ALL = (std::numeric_limits<int>::max)();
930 
931 private:
932   int depth;
933   int capacity;
934   item_t *items;
935   kmp_uint64 set;
936   bool absolute;
937   // The set must be able to handle up to KMP_HW_LAST number of layers
938   KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
939   // Sorting the KMP_HW_SUBSET items to follow topology order
940   // All unknown topology types will be at the beginning of the subset
941   static int hw_subset_compare(const void *i1, const void *i2) {
942     kmp_hw_t type1 = ((const item_t *)i1)->type;
943     kmp_hw_t type2 = ((const item_t *)i2)->type;
944     int level1 = __kmp_topology->get_level(type1);
945     int level2 = __kmp_topology->get_level(type2);
946     return level1 - level2;
947   }
948 
949 public:
950   // Force use of allocate()/deallocate()
951   kmp_hw_subset_t() = delete;
952   kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
953   kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
954   kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
955   kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
956 
957   static kmp_hw_subset_t *allocate() {
958     int initial_capacity = 5;
959     kmp_hw_subset_t *retval =
960         (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
961     retval->depth = 0;
962     retval->capacity = initial_capacity;
963     retval->set = 0ull;
964     retval->absolute = false;
965     retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
966     return retval;
967   }
968   static void deallocate(kmp_hw_subset_t *subset) {
969     __kmp_free(subset->items);
970     __kmp_free(subset);
971   }
972   void set_absolute() { absolute = true; }
973   bool is_absolute() const { return absolute; }
974   void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
975     for (int i = 0; i < depth; ++i) {
976       // Found an existing item for this layer type
977       // Add the num, offset, and attr to this item
978       if (items[i].type == type) {
979         int idx = items[i].num_attrs++;
980         if ((size_t)idx >= MAX_ATTRS)
981           return;
982         items[i].num[idx] = num;
983         items[i].offset[idx] = offset;
984         items[i].attr[idx] = attr;
985         return;
986       }
987     }
988     if (depth == capacity - 1) {
989       capacity *= 2;
990       item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
991       for (int i = 0; i < depth; ++i)
992         new_items[i] = items[i];
993       __kmp_free(items);
994       items = new_items;
995     }
996     items[depth].num_attrs = 1;
997     items[depth].type = type;
998     items[depth].num[0] = num;
999     items[depth].offset[0] = offset;
1000     items[depth].attr[0] = attr;
1001     depth++;
1002     set |= (1ull << type);
1003   }
1004   int get_depth() const { return depth; }
1005   const item_t &at(int index) const {
1006     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1007     return items[index];
1008   }
1009   item_t &at(int index) {
1010     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1011     return items[index];
1012   }
1013   void remove(int index) {
1014     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1015     set &= ~(1ull << items[index].type);
1016     for (int j = index + 1; j < depth; ++j) {
1017       items[j - 1] = items[j];
1018     }
1019     depth--;
1020   }
1021   void sort() {
1022     KMP_DEBUG_ASSERT(__kmp_topology);
1023     qsort(items, depth, sizeof(item_t), hw_subset_compare);
1024   }
1025   bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1026   void dump() const {
1027     printf("**********************\n");
1028     printf("*** kmp_hw_subset: ***\n");
1029     printf("* depth: %d\n", depth);
1030     printf("* items:\n");
1031     for (int i = 0; i < depth; ++i) {
1032       printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1033       for (int j = 0; j < items[i].num_attrs; ++j) {
1034         printf("  num: %d, offset: %d, attr: ", items[i].num[j],
1035                items[i].offset[j]);
1036         if (!items[i].attr[j]) {
1037           printf(" (none)\n");
1038         } else {
1039           printf(
1040               " core_type = %s, core_eff = %d\n",
1041               __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1042               items[i].attr[j].get_core_eff());
1043         }
1044       }
1045     }
1046     printf("* set: 0x%llx\n", set);
1047     printf("* absolute: %d\n", absolute);
1048     printf("**********************\n");
1049   }
1050 };
1051 extern kmp_hw_subset_t *__kmp_hw_subset;
1052 
1053 /* A structure for holding machine-specific hierarchy info to be computed once
1054    at init. This structure represents a mapping of threads to the actual machine
1055    hierarchy, or to our best guess at what the hierarchy might be, for the
1056    purpose of performing an efficient barrier. In the worst case, when there is
1057    no machine hierarchy information, it produces a tree suitable for a barrier,
1058    similar to the tree used in the hyper barrier. */
1059 class hierarchy_info {
1060 public:
1061   /* Good default values for number of leaves and branching factor, given no
1062      affinity information. Behaves a bit like hyper barrier. */
1063   static const kmp_uint32 maxLeaves = 4;
1064   static const kmp_uint32 minBranch = 4;
1065   /** Number of levels in the hierarchy. Typical levels are threads/core,
1066       cores/package or socket, packages/node, nodes/machine, etc. We don't want
1067       to get specific with nomenclature. When the machine is oversubscribed we
1068       add levels to duplicate the hierarchy, doubling the thread capacity of the
1069       hierarchy each time we add a level. */
1070   kmp_uint32 maxLevels;
1071 
1072   /** This is specifically the depth of the machine configuration hierarchy, in
1073       terms of the number of levels along the longest path from root to any
1074       leaf. It corresponds to the number of entries in numPerLevel if we exclude
1075       all but one trailing 1. */
1076   kmp_uint32 depth;
1077   kmp_uint32 base_num_threads;
1078   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1079   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1080   // 2=initialization in progress
1081   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1082 
1083   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
1084       the parent of a node at level i has. For example, if we have a machine
1085       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
1086       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
1087   kmp_uint32 *numPerLevel;
1088   kmp_uint32 *skipPerLevel;
1089 
1090   void deriveLevels() {
1091     int hier_depth = __kmp_topology->get_depth();
1092     for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1093       numPerLevel[level] = __kmp_topology->get_ratio(i);
1094     }
1095   }
1096 
1097   hierarchy_info()
1098       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1099 
1100   void fini() {
1101     if (!uninitialized && numPerLevel) {
1102       __kmp_free(numPerLevel);
1103       numPerLevel = NULL;
1104       uninitialized = not_initialized;
1105     }
1106   }
1107 
1108   void init(int num_addrs) {
1109     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1110         &uninitialized, not_initialized, initializing);
1111     if (bool_result == 0) { // Wait for initialization
1112       while (TCR_1(uninitialized) != initialized)
1113         KMP_CPU_PAUSE();
1114       return;
1115     }
1116     KMP_DEBUG_ASSERT(bool_result == 1);
1117 
1118     /* Added explicit initialization of the data fields here to prevent usage of
1119        dirty value observed when static library is re-initialized multiple times
1120        (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
1121        OpenMP). */
1122     depth = 1;
1123     resizing = 0;
1124     maxLevels = 7;
1125     numPerLevel =
1126         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1127     skipPerLevel = &(numPerLevel[maxLevels]);
1128     for (kmp_uint32 i = 0; i < maxLevels;
1129          ++i) { // init numPerLevel[*] to 1 item per level
1130       numPerLevel[i] = 1;
1131       skipPerLevel[i] = 1;
1132     }
1133 
1134     // Sort table by physical ID
1135     if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1136       deriveLevels();
1137     } else {
1138       numPerLevel[0] = maxLeaves;
1139       numPerLevel[1] = num_addrs / maxLeaves;
1140       if (num_addrs % maxLeaves)
1141         numPerLevel[1]++;
1142     }
1143 
1144     base_num_threads = num_addrs;
1145     for (int i = maxLevels - 1; i >= 0;
1146          --i) // count non-empty levels to get depth
1147       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1148         depth++;
1149 
1150     kmp_uint32 branch = minBranch;
1151     if (numPerLevel[0] == 1)
1152       branch = num_addrs / maxLeaves;
1153     if (branch < minBranch)
1154       branch = minBranch;
1155     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1156       while (numPerLevel[d] > branch ||
1157              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1158         if (numPerLevel[d] & 1)
1159           numPerLevel[d]++;
1160         numPerLevel[d] = numPerLevel[d] >> 1;
1161         if (numPerLevel[d + 1] == 1)
1162           depth++;
1163         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1164       }
1165       if (numPerLevel[0] == 1) {
1166         branch = branch >> 1;
1167         if (branch < 4)
1168           branch = minBranch;
1169       }
1170     }
1171 
1172     for (kmp_uint32 i = 1; i < depth; ++i)
1173       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1174     // Fill in hierarchy in the case of oversubscription
1175     for (kmp_uint32 i = depth; i < maxLevels; ++i)
1176       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1177 
1178     uninitialized = initialized; // One writer
1179   }
1180 
1181   // Resize the hierarchy if nproc changes to something larger than before
1182   void resize(kmp_uint32 nproc) {
1183     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1184     while (bool_result == 0) { // someone else is trying to resize
1185       KMP_CPU_PAUSE();
1186       if (nproc <= base_num_threads) // happy with other thread's resize
1187         return;
1188       else // try to resize
1189         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1190     }
1191     KMP_DEBUG_ASSERT(bool_result != 0);
1192     if (nproc <= base_num_threads)
1193       return; // happy with other thread's resize
1194 
1195     // Calculate new maxLevels
1196     kmp_uint32 old_sz = skipPerLevel[depth - 1];
1197     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1198     // First see if old maxLevels is enough to contain new size
1199     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1200       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1201       numPerLevel[i - 1] *= 2;
1202       old_sz *= 2;
1203       depth++;
1204     }
1205     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1206       while (nproc > old_sz) {
1207         old_sz *= 2;
1208         incs++;
1209         depth++;
1210       }
1211       maxLevels += incs;
1212 
1213       // Resize arrays
1214       kmp_uint32 *old_numPerLevel = numPerLevel;
1215       kmp_uint32 *old_skipPerLevel = skipPerLevel;
1216       numPerLevel = skipPerLevel = NULL;
1217       numPerLevel =
1218           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1219       skipPerLevel = &(numPerLevel[maxLevels]);
1220 
1221       // Copy old elements from old arrays
1222       for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1223         // init numPerLevel[*] to 1 item per level
1224         numPerLevel[i] = old_numPerLevel[i];
1225         skipPerLevel[i] = old_skipPerLevel[i];
1226       }
1227 
1228       // Init new elements in arrays to 1
1229       for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1230         // init numPerLevel[*] to 1 item per level
1231         numPerLevel[i] = 1;
1232         skipPerLevel[i] = 1;
1233       }
1234 
1235       // Free old arrays
1236       __kmp_free(old_numPerLevel);
1237     }
1238 
1239     // Fill in oversubscription levels of hierarchy
1240     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1241       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1242 
1243     base_num_threads = nproc;
1244     resizing = 0; // One writer
1245   }
1246 };
1247 #endif // KMP_AFFINITY_H
1248