xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h (revision 28f6c2f292806bf31230a959bc4b19d7081669a7)
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_USE_HWLOC
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24   class Mask : public KMPAffinity::Mask {
25     hwloc_cpuset_t mask;
26 
27   public:
28     Mask() {
29       mask = hwloc_bitmap_alloc();
30       this->zero();
31     }
32     ~Mask() { hwloc_bitmap_free(mask); }
33     void set(int i) override { hwloc_bitmap_set(mask, i); }
34     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36     void zero() override { hwloc_bitmap_zero(mask); }
37     void copy(const KMPAffinity::Mask *src) override {
38       const Mask *convert = static_cast<const Mask *>(src);
39       hwloc_bitmap_copy(mask, convert->mask);
40     }
41     void bitwise_and(const KMPAffinity::Mask *rhs) override {
42       const Mask *convert = static_cast<const Mask *>(rhs);
43       hwloc_bitmap_and(mask, mask, convert->mask);
44     }
45     void bitwise_or(const KMPAffinity::Mask *rhs) override {
46       const Mask *convert = static_cast<const Mask *>(rhs);
47       hwloc_bitmap_or(mask, mask, convert->mask);
48     }
49     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
50     int begin() const override { return hwloc_bitmap_first(mask); }
51     int end() const override { return -1; }
52     int next(int previous) const override {
53       return hwloc_bitmap_next(mask, previous);
54     }
55     int get_system_affinity(bool abort_on_error) override {
56       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
57                   "Illegal get affinity operation when not capable");
58       long retval =
59           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
60       if (retval >= 0) {
61         return 0;
62       }
63       int error = errno;
64       if (abort_on_error) {
65         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
66       }
67       return error;
68     }
69     int set_system_affinity(bool abort_on_error) const override {
70       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
71                   "Illegal set affinity operation when not capable");
72       long retval =
73           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
74       if (retval >= 0) {
75         return 0;
76       }
77       int error = errno;
78       if (abort_on_error) {
79         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
80       }
81       return error;
82     }
83 #if KMP_OS_WINDOWS
84     int set_process_affinity(bool abort_on_error) const override {
85       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
86                   "Illegal set process affinity operation when not capable");
87       int error = 0;
88       const hwloc_topology_support *support =
89           hwloc_topology_get_support(__kmp_hwloc_topology);
90       if (support->cpubind->set_proc_cpubind) {
91         int retval;
92         retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
93                                    HWLOC_CPUBIND_PROCESS);
94         if (retval >= 0)
95           return 0;
96         error = errno;
97         if (abort_on_error)
98           __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
99       }
100       return error;
101     }
102 #endif
103     int get_proc_group() const override {
104       int group = -1;
105 #if KMP_OS_WINDOWS
106       if (__kmp_num_proc_groups == 1) {
107         return 1;
108       }
109       for (int i = 0; i < __kmp_num_proc_groups; i++) {
110         // On windows, the long type is always 32 bits
111         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
112         unsigned long second_32_bits =
113             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
114         if (first_32_bits == 0 && second_32_bits == 0) {
115           continue;
116         }
117         if (group >= 0) {
118           return -1;
119         }
120         group = i;
121       }
122 #endif /* KMP_OS_WINDOWS */
123       return group;
124     }
125   };
126   void determine_capable(const char *var) override {
127     const hwloc_topology_support *topology_support;
128     if (__kmp_hwloc_topology == NULL) {
129       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
130         __kmp_hwloc_error = TRUE;
131         if (__kmp_affinity.flags.verbose) {
132           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
133         }
134       }
135       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
136         __kmp_hwloc_error = TRUE;
137         if (__kmp_affinity.flags.verbose) {
138           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
139         }
140       }
141     }
142     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
143     // Is the system capable of setting/getting this thread's affinity?
144     // Also, is topology discovery possible? (pu indicates ability to discover
145     // processing units). And finally, were there no errors when calling any
146     // hwloc_* API functions?
147     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
148         topology_support->cpubind->get_thisthread_cpubind &&
149         topology_support->discovery->pu && !__kmp_hwloc_error) {
150       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
151       KMP_AFFINITY_ENABLE(TRUE);
152     } else {
153       // indicate that hwloc didn't work and disable affinity
154       __kmp_hwloc_error = TRUE;
155       KMP_AFFINITY_DISABLE();
156     }
157   }
158   void bind_thread(int which) override {
159     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
160                 "Illegal set affinity operation when not capable");
161     KMPAffinity::Mask *mask;
162     KMP_CPU_ALLOC_ON_STACK(mask);
163     KMP_CPU_ZERO(mask);
164     KMP_CPU_SET(which, mask);
165     __kmp_set_system_affinity(mask, TRUE);
166     KMP_CPU_FREE_FROM_STACK(mask);
167   }
168   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
169   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
170   KMPAffinity::Mask *allocate_mask_array(int num) override {
171     return new Mask[num];
172   }
173   void deallocate_mask_array(KMPAffinity::Mask *array) override {
174     Mask *hwloc_array = static_cast<Mask *>(array);
175     delete[] hwloc_array;
176   }
177   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
178                                       int index) override {
179     Mask *hwloc_array = static_cast<Mask *>(array);
180     return &(hwloc_array[index]);
181   }
182   api_type get_api_type() const override { return HWLOC; }
183 };
184 #endif /* KMP_USE_HWLOC */
185 
186 #if KMP_OS_LINUX || KMP_OS_FREEBSD
187 #if KMP_OS_LINUX
188 /* On some of the older OS's that we build on, these constants aren't present
189    in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
190    all systems of the same arch where they are defined, and they cannot change:
191    they are set in stone forever. */
192 #include <sys/syscall.h>
193 #if KMP_ARCH_X86 || KMP_ARCH_ARM
194 #ifndef __NR_sched_setaffinity
195 #define __NR_sched_setaffinity 241
196 #elif __NR_sched_setaffinity != 241
197 #error Wrong code for setaffinity system call.
198 #endif /* __NR_sched_setaffinity */
199 #ifndef __NR_sched_getaffinity
200 #define __NR_sched_getaffinity 242
201 #elif __NR_sched_getaffinity != 242
202 #error Wrong code for getaffinity system call.
203 #endif /* __NR_sched_getaffinity */
204 #elif KMP_ARCH_AARCH64
205 #ifndef __NR_sched_setaffinity
206 #define __NR_sched_setaffinity 122
207 #elif __NR_sched_setaffinity != 122
208 #error Wrong code for setaffinity system call.
209 #endif /* __NR_sched_setaffinity */
210 #ifndef __NR_sched_getaffinity
211 #define __NR_sched_getaffinity 123
212 #elif __NR_sched_getaffinity != 123
213 #error Wrong code for getaffinity system call.
214 #endif /* __NR_sched_getaffinity */
215 #elif KMP_ARCH_X86_64
216 #ifndef __NR_sched_setaffinity
217 #define __NR_sched_setaffinity 203
218 #elif __NR_sched_setaffinity != 203
219 #error Wrong code for setaffinity system call.
220 #endif /* __NR_sched_setaffinity */
221 #ifndef __NR_sched_getaffinity
222 #define __NR_sched_getaffinity 204
223 #elif __NR_sched_getaffinity != 204
224 #error Wrong code for getaffinity system call.
225 #endif /* __NR_sched_getaffinity */
226 #elif KMP_ARCH_PPC64
227 #ifndef __NR_sched_setaffinity
228 #define __NR_sched_setaffinity 222
229 #elif __NR_sched_setaffinity != 222
230 #error Wrong code for setaffinity system call.
231 #endif /* __NR_sched_setaffinity */
232 #ifndef __NR_sched_getaffinity
233 #define __NR_sched_getaffinity 223
234 #elif __NR_sched_getaffinity != 223
235 #error Wrong code for getaffinity system call.
236 #endif /* __NR_sched_getaffinity */
237 #elif KMP_ARCH_MIPS
238 #ifndef __NR_sched_setaffinity
239 #define __NR_sched_setaffinity 4239
240 #elif __NR_sched_setaffinity != 4239
241 #error Wrong code for setaffinity system call.
242 #endif /* __NR_sched_setaffinity */
243 #ifndef __NR_sched_getaffinity
244 #define __NR_sched_getaffinity 4240
245 #elif __NR_sched_getaffinity != 4240
246 #error Wrong code for getaffinity system call.
247 #endif /* __NR_sched_getaffinity */
248 #elif KMP_ARCH_MIPS64
249 #ifndef __NR_sched_setaffinity
250 #define __NR_sched_setaffinity 5195
251 #elif __NR_sched_setaffinity != 5195
252 #error Wrong code for setaffinity system call.
253 #endif /* __NR_sched_setaffinity */
254 #ifndef __NR_sched_getaffinity
255 #define __NR_sched_getaffinity 5196
256 #elif __NR_sched_getaffinity != 5196
257 #error Wrong code for getaffinity system call.
258 #endif /* __NR_sched_getaffinity */
259 #elif KMP_ARCH_LOONGARCH64
260 #ifndef __NR_sched_setaffinity
261 #define __NR_sched_setaffinity 122
262 #elif __NR_sched_setaffinity != 122
263 #error Wrong code for setaffinity system call.
264 #endif /* __NR_sched_setaffinity */
265 #ifndef __NR_sched_getaffinity
266 #define __NR_sched_getaffinity 123
267 #elif __NR_sched_getaffinity != 123
268 #error Wrong code for getaffinity system call.
269 #endif /* __NR_sched_getaffinity */
270 #elif KMP_ARCH_RISCV64
271 #ifndef __NR_sched_setaffinity
272 #define __NR_sched_setaffinity 122
273 #elif __NR_sched_setaffinity != 122
274 #error Wrong code for setaffinity system call.
275 #endif /* __NR_sched_setaffinity */
276 #ifndef __NR_sched_getaffinity
277 #define __NR_sched_getaffinity 123
278 #elif __NR_sched_getaffinity != 123
279 #error Wrong code for getaffinity system call.
280 #endif /* __NR_sched_getaffinity */
281 #else
282 #error Unknown or unsupported architecture
283 #endif /* KMP_ARCH_* */
284 #elif KMP_OS_FREEBSD
285 #include <pthread.h>
286 #include <pthread_np.h>
287 #endif
288 class KMPNativeAffinity : public KMPAffinity {
289   class Mask : public KMPAffinity::Mask {
290     typedef unsigned long mask_t;
291     typedef decltype(__kmp_affin_mask_size) mask_size_type;
292     static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
293     static const mask_t ONE = 1;
294     mask_size_type get_num_mask_types() const {
295       return __kmp_affin_mask_size / sizeof(mask_t);
296     }
297 
298   public:
299     mask_t *mask;
300     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
301     ~Mask() {
302       if (mask)
303         __kmp_free(mask);
304     }
305     void set(int i) override {
306       mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
307     }
308     bool is_set(int i) const override {
309       return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
310     }
311     void clear(int i) override {
312       mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
313     }
314     void zero() override {
315       mask_size_type e = get_num_mask_types();
316       for (mask_size_type i = 0; i < e; ++i)
317         mask[i] = (mask_t)0;
318     }
319     void copy(const KMPAffinity::Mask *src) override {
320       const Mask *convert = static_cast<const Mask *>(src);
321       mask_size_type e = get_num_mask_types();
322       for (mask_size_type i = 0; i < e; ++i)
323         mask[i] = convert->mask[i];
324     }
325     void bitwise_and(const KMPAffinity::Mask *rhs) override {
326       const Mask *convert = static_cast<const Mask *>(rhs);
327       mask_size_type e = get_num_mask_types();
328       for (mask_size_type i = 0; i < e; ++i)
329         mask[i] &= convert->mask[i];
330     }
331     void bitwise_or(const KMPAffinity::Mask *rhs) override {
332       const Mask *convert = static_cast<const Mask *>(rhs);
333       mask_size_type e = get_num_mask_types();
334       for (mask_size_type i = 0; i < e; ++i)
335         mask[i] |= convert->mask[i];
336     }
337     void bitwise_not() override {
338       mask_size_type e = get_num_mask_types();
339       for (mask_size_type i = 0; i < e; ++i)
340         mask[i] = ~(mask[i]);
341     }
342     int begin() const override {
343       int retval = 0;
344       while (retval < end() && !is_set(retval))
345         ++retval;
346       return retval;
347     }
348     int end() const override {
349       int e;
350       __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
351       return e;
352     }
353     int next(int previous) const override {
354       int retval = previous + 1;
355       while (retval < end() && !is_set(retval))
356         ++retval;
357       return retval;
358     }
359     int get_system_affinity(bool abort_on_error) override {
360       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
361                   "Illegal get affinity operation when not capable");
362 #if KMP_OS_LINUX
363       long retval =
364           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
365 #elif KMP_OS_FREEBSD
366       int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
367                                      reinterpret_cast<cpuset_t *>(mask));
368       int retval = (r == 0 ? 0 : -1);
369 #endif
370       if (retval >= 0) {
371         return 0;
372       }
373       int error = errno;
374       if (abort_on_error) {
375         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
376       }
377       return error;
378     }
379     int set_system_affinity(bool abort_on_error) const override {
380       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
381                   "Illegal set affinity operation when not capable");
382 #if KMP_OS_LINUX
383       long retval =
384           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
385 #elif KMP_OS_FREEBSD
386       int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
387                                      reinterpret_cast<cpuset_t *>(mask));
388       int retval = (r == 0 ? 0 : -1);
389 #endif
390       if (retval >= 0) {
391         return 0;
392       }
393       int error = errno;
394       if (abort_on_error) {
395         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
396       }
397       return error;
398     }
399   };
400   void determine_capable(const char *env_var) override {
401     __kmp_affinity_determine_capable(env_var);
402   }
403   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
404   KMPAffinity::Mask *allocate_mask() override {
405     KMPNativeAffinity::Mask *retval = new Mask();
406     return retval;
407   }
408   void deallocate_mask(KMPAffinity::Mask *m) override {
409     KMPNativeAffinity::Mask *native_mask =
410         static_cast<KMPNativeAffinity::Mask *>(m);
411     delete native_mask;
412   }
413   KMPAffinity::Mask *allocate_mask_array(int num) override {
414     return new Mask[num];
415   }
416   void deallocate_mask_array(KMPAffinity::Mask *array) override {
417     Mask *linux_array = static_cast<Mask *>(array);
418     delete[] linux_array;
419   }
420   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
421                                       int index) override {
422     Mask *linux_array = static_cast<Mask *>(array);
423     return &(linux_array[index]);
424   }
425   api_type get_api_type() const override { return NATIVE_OS; }
426 };
427 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
428 
429 #if KMP_OS_WINDOWS
430 class KMPNativeAffinity : public KMPAffinity {
431   class Mask : public KMPAffinity::Mask {
432     typedef ULONG_PTR mask_t;
433     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
434     mask_t *mask;
435 
436   public:
437     Mask() {
438       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
439     }
440     ~Mask() {
441       if (mask)
442         __kmp_free(mask);
443     }
444     void set(int i) override {
445       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
446     }
447     bool is_set(int i) const override {
448       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
449     }
450     void clear(int i) override {
451       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
452     }
453     void zero() override {
454       for (int i = 0; i < __kmp_num_proc_groups; ++i)
455         mask[i] = 0;
456     }
457     void copy(const KMPAffinity::Mask *src) override {
458       const Mask *convert = static_cast<const Mask *>(src);
459       for (int i = 0; i < __kmp_num_proc_groups; ++i)
460         mask[i] = convert->mask[i];
461     }
462     void bitwise_and(const KMPAffinity::Mask *rhs) override {
463       const Mask *convert = static_cast<const Mask *>(rhs);
464       for (int i = 0; i < __kmp_num_proc_groups; ++i)
465         mask[i] &= convert->mask[i];
466     }
467     void bitwise_or(const KMPAffinity::Mask *rhs) override {
468       const Mask *convert = static_cast<const Mask *>(rhs);
469       for (int i = 0; i < __kmp_num_proc_groups; ++i)
470         mask[i] |= convert->mask[i];
471     }
472     void bitwise_not() override {
473       for (int i = 0; i < __kmp_num_proc_groups; ++i)
474         mask[i] = ~(mask[i]);
475     }
476     int begin() const override {
477       int retval = 0;
478       while (retval < end() && !is_set(retval))
479         ++retval;
480       return retval;
481     }
482     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
483     int next(int previous) const override {
484       int retval = previous + 1;
485       while (retval < end() && !is_set(retval))
486         ++retval;
487       return retval;
488     }
489     int set_process_affinity(bool abort_on_error) const override {
490       if (__kmp_num_proc_groups <= 1) {
491         if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
492           DWORD error = GetLastError();
493           if (abort_on_error) {
494             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
495                         __kmp_msg_null);
496           }
497           return error;
498         }
499       }
500       return 0;
501     }
502     int set_system_affinity(bool abort_on_error) const override {
503       if (__kmp_num_proc_groups > 1) {
504         // Check for a valid mask.
505         GROUP_AFFINITY ga;
506         int group = get_proc_group();
507         if (group < 0) {
508           if (abort_on_error) {
509             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
510           }
511           return -1;
512         }
513         // Transform the bit vector into a GROUP_AFFINITY struct
514         // and make the system call to set affinity.
515         ga.Group = group;
516         ga.Mask = mask[group];
517         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
518 
519         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
520         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
521           DWORD error = GetLastError();
522           if (abort_on_error) {
523             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
524                         __kmp_msg_null);
525           }
526           return error;
527         }
528       } else {
529         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
530           DWORD error = GetLastError();
531           if (abort_on_error) {
532             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
533                         __kmp_msg_null);
534           }
535           return error;
536         }
537       }
538       return 0;
539     }
540     int get_system_affinity(bool abort_on_error) override {
541       if (__kmp_num_proc_groups > 1) {
542         this->zero();
543         GROUP_AFFINITY ga;
544         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
545         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
546           DWORD error = GetLastError();
547           if (abort_on_error) {
548             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
549                         KMP_ERR(error), __kmp_msg_null);
550           }
551           return error;
552         }
553         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
554             (ga.Mask == 0)) {
555           return -1;
556         }
557         mask[ga.Group] = ga.Mask;
558       } else {
559         mask_t newMask, sysMask, retval;
560         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
561           DWORD error = GetLastError();
562           if (abort_on_error) {
563             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
564                         KMP_ERR(error), __kmp_msg_null);
565           }
566           return error;
567         }
568         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
569         if (!retval) {
570           DWORD error = GetLastError();
571           if (abort_on_error) {
572             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
573                         KMP_ERR(error), __kmp_msg_null);
574           }
575           return error;
576         }
577         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
578         if (!newMask) {
579           DWORD error = GetLastError();
580           if (abort_on_error) {
581             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
582                         KMP_ERR(error), __kmp_msg_null);
583           }
584         }
585         *mask = retval;
586       }
587       return 0;
588     }
589     int get_proc_group() const override {
590       int group = -1;
591       if (__kmp_num_proc_groups == 1) {
592         return 1;
593       }
594       for (int i = 0; i < __kmp_num_proc_groups; i++) {
595         if (mask[i] == 0)
596           continue;
597         if (group >= 0)
598           return -1;
599         group = i;
600       }
601       return group;
602     }
603   };
604   void determine_capable(const char *env_var) override {
605     __kmp_affinity_determine_capable(env_var);
606   }
607   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
608   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
609   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
610   KMPAffinity::Mask *allocate_mask_array(int num) override {
611     return new Mask[num];
612   }
613   void deallocate_mask_array(KMPAffinity::Mask *array) override {
614     Mask *windows_array = static_cast<Mask *>(array);
615     delete[] windows_array;
616   }
617   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
618                                       int index) override {
619     Mask *windows_array = static_cast<Mask *>(array);
620     return &(windows_array[index]);
621   }
622   api_type get_api_type() const override { return NATIVE_OS; }
623 };
624 #endif /* KMP_OS_WINDOWS */
625 #endif /* KMP_AFFINITY_SUPPORTED */
626 
627 // Describe an attribute for a level in the machine topology
628 struct kmp_hw_attr_t {
629   int core_type : 8;
630   int core_eff : 8;
631   unsigned valid : 1;
632   unsigned reserved : 15;
633 
634   static const int UNKNOWN_CORE_EFF = -1;
635 
636   kmp_hw_attr_t()
637       : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
638         valid(0), reserved(0) {}
639   void set_core_type(kmp_hw_core_type_t type) {
640     valid = 1;
641     core_type = type;
642   }
643   void set_core_eff(int eff) {
644     valid = 1;
645     core_eff = eff;
646   }
647   kmp_hw_core_type_t get_core_type() const {
648     return (kmp_hw_core_type_t)core_type;
649   }
650   int get_core_eff() const { return core_eff; }
651   bool is_core_type_valid() const {
652     return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
653   }
654   bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
655   operator bool() const { return valid; }
656   void clear() {
657     core_type = KMP_HW_CORE_TYPE_UNKNOWN;
658     core_eff = UNKNOWN_CORE_EFF;
659     valid = 0;
660   }
661   bool contains(const kmp_hw_attr_t &other) const {
662     if (!valid && !other.valid)
663       return true;
664     if (valid && other.valid) {
665       if (other.is_core_type_valid()) {
666         if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
667           return false;
668       }
669       if (other.is_core_eff_valid()) {
670         if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
671           return false;
672       }
673       return true;
674     }
675     return false;
676   }
677   bool operator==(const kmp_hw_attr_t &rhs) const {
678     return (rhs.valid == valid && rhs.core_eff == core_eff &&
679             rhs.core_type == core_type);
680   }
681   bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
682 };
683 
684 #if KMP_AFFINITY_SUPPORTED
685 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
686 #endif
687 
688 class kmp_hw_thread_t {
689 public:
690   static const int UNKNOWN_ID = -1;
691   static const int MULTIPLE_ID = -2;
692   static int compare_ids(const void *a, const void *b);
693   static int compare_compact(const void *a, const void *b);
694   int ids[KMP_HW_LAST];
695   int sub_ids[KMP_HW_LAST];
696   bool leader;
697   int os_id;
698   kmp_hw_attr_t attrs;
699 
700   void print() const;
701   void clear() {
702     for (int i = 0; i < (int)KMP_HW_LAST; ++i)
703       ids[i] = UNKNOWN_ID;
704     leader = false;
705     attrs.clear();
706   }
707 };
708 
709 class kmp_topology_t {
710 
711   struct flags_t {
712     int uniform : 1;
713     int reserved : 31;
714   };
715 
716   int depth;
717 
718   // The following arrays are all 'depth' long and have been
719   // allocated to hold up to KMP_HW_LAST number of objects if
720   // needed so layers can be added without reallocation of any array
721 
722   // Ordered array of the types in the topology
723   kmp_hw_t *types;
724 
725   // Keep quick topology ratios, for non-uniform topologies,
726   // this ratio holds the max number of itemAs per itemB
727   // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
728   int *ratio;
729 
730   // Storage containing the absolute number of each topology layer
731   int *count;
732 
733   // The number of core efficiencies. This is only useful for hybrid
734   // topologies. Core efficiencies will range from 0 to num efficiencies - 1
735   int num_core_efficiencies;
736   int num_core_types;
737   kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
738 
739   // The hardware threads array
740   // hw_threads is num_hw_threads long
741   // Each hw_thread's ids and sub_ids are depth deep
742   int num_hw_threads;
743   kmp_hw_thread_t *hw_threads;
744 
745   // Equivalence hash where the key is the hardware topology item
746   // and the value is the equivalent hardware topology type in the
747   // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
748   // known equivalence for the topology type
749   kmp_hw_t equivalent[KMP_HW_LAST];
750 
751   // Flags describing the topology
752   flags_t flags;
753 
754   // Compact value used during sort_compact()
755   int compact;
756 
757   // Insert a new topology layer after allocation
758   void _insert_layer(kmp_hw_t type, const int *ids);
759 
760 #if KMP_GROUP_AFFINITY
761   // Insert topology information about Windows Processor groups
762   void _insert_windows_proc_groups();
763 #endif
764 
765   // Count each item & get the num x's per y
766   // e.g., get the number of cores and the number of threads per core
767   // for each (x, y) in (KMP_HW_* , KMP_HW_*)
768   void _gather_enumeration_information();
769 
770   // Remove layers that don't add information to the topology.
771   // This is done by having the layer take on the id = UNKNOWN_ID (-1)
772   void _remove_radix1_layers();
773 
774   // Find out if the topology is uniform
775   void _discover_uniformity();
776 
777   // Set all the sub_ids for each hardware thread
778   void _set_sub_ids();
779 
780   // Set global affinity variables describing the number of threads per
781   // core, the number of packages, the number of cores per package, and
782   // the number of cores.
783   void _set_globals();
784 
785   // Set the last level cache equivalent type
786   void _set_last_level_cache();
787 
788   // Return the number of cores with a particular attribute, 'attr'.
789   // If 'find_all' is true, then find all cores on the machine, otherwise find
790   // all cores per the layer 'above'
791   int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
792                             bool find_all = false) const;
793 
794 public:
795   // Force use of allocate()/deallocate()
796   kmp_topology_t() = delete;
797   kmp_topology_t(const kmp_topology_t &t) = delete;
798   kmp_topology_t(kmp_topology_t &&t) = delete;
799   kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
800   kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
801 
802   static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
803   static void deallocate(kmp_topology_t *);
804 
805   // Functions used in create_map() routines
806   kmp_hw_thread_t &at(int index) {
807     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
808     return hw_threads[index];
809   }
810   const kmp_hw_thread_t &at(int index) const {
811     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
812     return hw_threads[index];
813   }
814   int get_num_hw_threads() const { return num_hw_threads; }
815   void sort_ids() {
816     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
817           kmp_hw_thread_t::compare_ids);
818   }
819   // Check if the hardware ids are unique, if they are
820   // return true, otherwise return false
821   bool check_ids() const;
822 
823   // Function to call after the create_map() routine
824   void canonicalize();
825   void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
826 
827 // Functions used after canonicalize() called
828 
829 #if KMP_AFFINITY_SUPPORTED
830   // Set the granularity for affinity settings
831   void set_granularity(kmp_affinity_t &stgs) const;
832 #endif
833   bool filter_hw_subset();
834   bool is_close(int hwt1, int hwt2, int level) const;
835   bool is_uniform() const { return flags.uniform; }
836   // Tell whether a type is a valid type in the topology
837   // returns KMP_HW_UNKNOWN when there is no equivalent type
838   kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
839   // Set type1 = type2
840   void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
841     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
842     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
843     kmp_hw_t real_type2 = equivalent[type2];
844     if (real_type2 == KMP_HW_UNKNOWN)
845       real_type2 = type2;
846     equivalent[type1] = real_type2;
847     // This loop is required since any of the types may have been set to
848     // be equivalent to type1.  They all must be checked and reset to type2.
849     KMP_FOREACH_HW_TYPE(type) {
850       if (equivalent[type] == type1) {
851         equivalent[type] = real_type2;
852       }
853     }
854   }
855   // Calculate number of types corresponding to level1
856   // per types corresponding to level2 (e.g., number of threads per core)
857   int calculate_ratio(int level1, int level2) const {
858     KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
859     KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
860     int r = 1;
861     for (int level = level1; level > level2; --level)
862       r *= ratio[level];
863     return r;
864   }
865   int get_ratio(int level) const {
866     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
867     return ratio[level];
868   }
869   int get_depth() const { return depth; };
870   kmp_hw_t get_type(int level) const {
871     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
872     return types[level];
873   }
874   int get_level(kmp_hw_t type) const {
875     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
876     int eq_type = equivalent[type];
877     if (eq_type == KMP_HW_UNKNOWN)
878       return -1;
879     for (int i = 0; i < depth; ++i)
880       if (types[i] == eq_type)
881         return i;
882     return -1;
883   }
884   int get_count(int level) const {
885     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
886     return count[level];
887   }
888   // Return the total number of cores with attribute 'attr'
889   int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
890     return _get_ncores_with_attr(attr, -1, true);
891   }
892   // Return the number of cores with attribute
893   // 'attr' per topology level 'above'
894   int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
895     return _get_ncores_with_attr(attr, above, false);
896   }
897 
#if KMP_AFFINITY_SUPPORTED
  // compare_compact is granted friend access to this class.
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  // Copy affinity.compact into the 'compact' member, then reorder the
  // hw_threads array in place with qsort() using
  // kmp_hw_thread_t::compare_compact as the ordering function.
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    const size_t elem_size = sizeof(kmp_hw_thread_t);
    qsort(hw_threads, num_hw_threads, elem_size,
          kmp_hw_thread_t::compare_compact);
  }
#endif
  // Output routines, both defined outside this header.
  // NOTE(review): 'env_var' defaults to "KMP_AFFINITY"; presumably it names
  // the environment variable the printed messages are attributed to --
  // confirm against the definition.
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
908 };
909 extern kmp_topology_t *__kmp_topology;
910 
911 class kmp_hw_subset_t {
912   const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
913 
914 public:
915   // Describe a machine topology item in KMP_HW_SUBSET
916   struct item_t {
917     kmp_hw_t type;
918     int num_attrs;
919     int num[MAX_ATTRS];
920     int offset[MAX_ATTRS];
921     kmp_hw_attr_t attr[MAX_ATTRS];
922   };
923   // Put parenthesis around max to avoid accidental use of Windows max macro.
924   const static int USE_ALL = (std::numeric_limits<int>::max)();
925 
926 private:
927   int depth;
928   int capacity;
929   item_t *items;
930   kmp_uint64 set;
931   bool absolute;
932   // The set must be able to handle up to KMP_HW_LAST number of layers
933   KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
934   // Sorting the KMP_HW_SUBSET items to follow topology order
935   // All unknown topology types will be at the beginning of the subset
936   static int hw_subset_compare(const void *i1, const void *i2) {
937     kmp_hw_t type1 = ((const item_t *)i1)->type;
938     kmp_hw_t type2 = ((const item_t *)i2)->type;
939     int level1 = __kmp_topology->get_level(type1);
940     int level2 = __kmp_topology->get_level(type2);
941     return level1 - level2;
942   }
943 
944 public:
945   // Force use of allocate()/deallocate()
946   kmp_hw_subset_t() = delete;
947   kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
948   kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
949   kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
950   kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
951 
952   static kmp_hw_subset_t *allocate() {
953     int initial_capacity = 5;
954     kmp_hw_subset_t *retval =
955         (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
956     retval->depth = 0;
957     retval->capacity = initial_capacity;
958     retval->set = 0ull;
959     retval->absolute = false;
960     retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
961     return retval;
962   }
963   static void deallocate(kmp_hw_subset_t *subset) {
964     __kmp_free(subset->items);
965     __kmp_free(subset);
966   }
967   void set_absolute() { absolute = true; }
968   bool is_absolute() const { return absolute; }
969   void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
970     for (int i = 0; i < depth; ++i) {
971       // Found an existing item for this layer type
972       // Add the num, offset, and attr to this item
973       if (items[i].type == type) {
974         int idx = items[i].num_attrs++;
975         if ((size_t)idx >= MAX_ATTRS)
976           return;
977         items[i].num[idx] = num;
978         items[i].offset[idx] = offset;
979         items[i].attr[idx] = attr;
980         return;
981       }
982     }
983     if (depth == capacity - 1) {
984       capacity *= 2;
985       item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
986       for (int i = 0; i < depth; ++i)
987         new_items[i] = items[i];
988       __kmp_free(items);
989       items = new_items;
990     }
991     items[depth].num_attrs = 1;
992     items[depth].type = type;
993     items[depth].num[0] = num;
994     items[depth].offset[0] = offset;
995     items[depth].attr[0] = attr;
996     depth++;
997     set |= (1ull << type);
998   }
999   int get_depth() const { return depth; }
1000   const item_t &at(int index) const {
1001     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1002     return items[index];
1003   }
1004   item_t &at(int index) {
1005     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1006     return items[index];
1007   }
1008   void remove(int index) {
1009     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1010     set &= ~(1ull << items[index].type);
1011     for (int j = index + 1; j < depth; ++j) {
1012       items[j - 1] = items[j];
1013     }
1014     depth--;
1015   }
1016   void sort() {
1017     KMP_DEBUG_ASSERT(__kmp_topology);
1018     qsort(items, depth, sizeof(item_t), hw_subset_compare);
1019   }
1020   bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1021   void dump() const {
1022     printf("**********************\n");
1023     printf("*** kmp_hw_subset: ***\n");
1024     printf("* depth: %d\n", depth);
1025     printf("* items:\n");
1026     for (int i = 0; i < depth; ++i) {
1027       printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1028       for (int j = 0; j < items[i].num_attrs; ++j) {
1029         printf("  num: %d, offset: %d, attr: ", items[i].num[j],
1030                items[i].offset[j]);
1031         if (!items[i].attr[j]) {
1032           printf(" (none)\n");
1033         } else {
1034           printf(
1035               " core_type = %s, core_eff = %d\n",
1036               __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1037               items[i].attr[j].get_core_eff());
1038         }
1039       }
1040     }
1041     printf("* set: 0x%llx\n", set);
1042     printf("* absolute: %d\n", absolute);
1043     printf("**********************\n");
1044   }
1045 };
1046 extern kmp_hw_subset_t *__kmp_hw_subset;
1047 
1048 /* A structure for holding machine-specific hierarchy info to be computed once
1049    at init. This structure represents a mapping of threads to the actual machine
1050    hierarchy, or to our best guess at what the hierarchy might be, for the
1051    purpose of performing an efficient barrier. In the worst case, when there is
1052    no machine hierarchy information, it produces a tree suitable for a barrier,
1053    similar to the tree used in the hyper barrier. */
1054 class hierarchy_info {
1055 public:
1056   /* Good default values for number of leaves and branching factor, given no
1057      affinity information. Behaves a bit like hyper barrier. */
1058   static const kmp_uint32 maxLeaves = 4;
1059   static const kmp_uint32 minBranch = 4;
1060   /** Number of levels in the hierarchy. Typical levels are threads/core,
1061       cores/package or socket, packages/node, nodes/machine, etc. We don't want
1062       to get specific with nomenclature. When the machine is oversubscribed we
1063       add levels to duplicate the hierarchy, doubling the thread capacity of the
1064       hierarchy each time we add a level. */
1065   kmp_uint32 maxLevels;
1066 
1067   /** This is specifically the depth of the machine configuration hierarchy, in
1068       terms of the number of levels along the longest path from root to any
1069       leaf. It corresponds to the number of entries in numPerLevel if we exclude
1070       all but one trailing 1. */
1071   kmp_uint32 depth;
1072   kmp_uint32 base_num_threads;
1073   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1074   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1075   // 2=initialization in progress
1076   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1077 
1078   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
1079       the parent of a node at level i has. For example, if we have a machine
1080       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
1081       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
1082   kmp_uint32 *numPerLevel;
1083   kmp_uint32 *skipPerLevel;
1084 
1085   void deriveLevels() {
1086     int hier_depth = __kmp_topology->get_depth();
1087     for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1088       numPerLevel[level] = __kmp_topology->get_ratio(i);
1089     }
1090   }
1091 
1092   hierarchy_info()
1093       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1094 
1095   void fini() {
1096     if (!uninitialized && numPerLevel) {
1097       __kmp_free(numPerLevel);
1098       numPerLevel = NULL;
1099       uninitialized = not_initialized;
1100     }
1101   }
1102 
1103   void init(int num_addrs) {
1104     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1105         &uninitialized, not_initialized, initializing);
1106     if (bool_result == 0) { // Wait for initialization
1107       while (TCR_1(uninitialized) != initialized)
1108         KMP_CPU_PAUSE();
1109       return;
1110     }
1111     KMP_DEBUG_ASSERT(bool_result == 1);
1112 
1113     /* Added explicit initialization of the data fields here to prevent usage of
1114        dirty value observed when static library is re-initialized multiple times
1115        (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
1116        OpenMP). */
1117     depth = 1;
1118     resizing = 0;
1119     maxLevels = 7;
1120     numPerLevel =
1121         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1122     skipPerLevel = &(numPerLevel[maxLevels]);
1123     for (kmp_uint32 i = 0; i < maxLevels;
1124          ++i) { // init numPerLevel[*] to 1 item per level
1125       numPerLevel[i] = 1;
1126       skipPerLevel[i] = 1;
1127     }
1128 
1129     // Sort table by physical ID
1130     if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1131       deriveLevels();
1132     } else {
1133       numPerLevel[0] = maxLeaves;
1134       numPerLevel[1] = num_addrs / maxLeaves;
1135       if (num_addrs % maxLeaves)
1136         numPerLevel[1]++;
1137     }
1138 
1139     base_num_threads = num_addrs;
1140     for (int i = maxLevels - 1; i >= 0;
1141          --i) // count non-empty levels to get depth
1142       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1143         depth++;
1144 
1145     kmp_uint32 branch = minBranch;
1146     if (numPerLevel[0] == 1)
1147       branch = num_addrs / maxLeaves;
1148     if (branch < minBranch)
1149       branch = minBranch;
1150     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1151       while (numPerLevel[d] > branch ||
1152              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1153         if (numPerLevel[d] & 1)
1154           numPerLevel[d]++;
1155         numPerLevel[d] = numPerLevel[d] >> 1;
1156         if (numPerLevel[d + 1] == 1)
1157           depth++;
1158         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1159       }
1160       if (numPerLevel[0] == 1) {
1161         branch = branch >> 1;
1162         if (branch < 4)
1163           branch = minBranch;
1164       }
1165     }
1166 
1167     for (kmp_uint32 i = 1; i < depth; ++i)
1168       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1169     // Fill in hierarchy in the case of oversubscription
1170     for (kmp_uint32 i = depth; i < maxLevels; ++i)
1171       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1172 
1173     uninitialized = initialized; // One writer
1174   }
1175 
1176   // Resize the hierarchy if nproc changes to something larger than before
1177   void resize(kmp_uint32 nproc) {
1178     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1179     while (bool_result == 0) { // someone else is trying to resize
1180       KMP_CPU_PAUSE();
1181       if (nproc <= base_num_threads) // happy with other thread's resize
1182         return;
1183       else // try to resize
1184         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1185     }
1186     KMP_DEBUG_ASSERT(bool_result != 0);
1187     if (nproc <= base_num_threads)
1188       return; // happy with other thread's resize
1189 
1190     // Calculate new maxLevels
1191     kmp_uint32 old_sz = skipPerLevel[depth - 1];
1192     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1193     // First see if old maxLevels is enough to contain new size
1194     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1195       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1196       numPerLevel[i - 1] *= 2;
1197       old_sz *= 2;
1198       depth++;
1199     }
1200     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1201       while (nproc > old_sz) {
1202         old_sz *= 2;
1203         incs++;
1204         depth++;
1205       }
1206       maxLevels += incs;
1207 
1208       // Resize arrays
1209       kmp_uint32 *old_numPerLevel = numPerLevel;
1210       kmp_uint32 *old_skipPerLevel = skipPerLevel;
1211       numPerLevel = skipPerLevel = NULL;
1212       numPerLevel =
1213           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1214       skipPerLevel = &(numPerLevel[maxLevels]);
1215 
1216       // Copy old elements from old arrays
1217       for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1218         // init numPerLevel[*] to 1 item per level
1219         numPerLevel[i] = old_numPerLevel[i];
1220         skipPerLevel[i] = old_skipPerLevel[i];
1221       }
1222 
1223       // Init new elements in arrays to 1
1224       for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1225         // init numPerLevel[*] to 1 item per level
1226         numPerLevel[i] = 1;
1227         skipPerLevel[i] = 1;
1228       }
1229 
1230       // Free old arrays
1231       __kmp_free(old_numPerLevel);
1232     }
1233 
1234     // Fill in oversubscription levels of hierarchy
1235     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1236       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1237 
1238     base_num_threads = nproc;
1239     resizing = 0; // One writer
1240   }
1241 };
1242 #endif // KMP_AFFINITY_H
1243