xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h (revision b4af4f93c682e445bf159f0d1ec90b636296c946)
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 
19 #if KMP_AFFINITY_SUPPORTED
20 #if KMP_USE_HWLOC
21 class KMPHwlocAffinity : public KMPAffinity {
22 public:
23   class Mask : public KMPAffinity::Mask {
24     hwloc_cpuset_t mask;
25 
26   public:
27     Mask() {
28       mask = hwloc_bitmap_alloc();
29       this->zero();
30     }
31     ~Mask() { hwloc_bitmap_free(mask); }
32     void set(int i) override { hwloc_bitmap_set(mask, i); }
33     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
34     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
35     void zero() override { hwloc_bitmap_zero(mask); }
36     void copy(const KMPAffinity::Mask *src) override {
37       const Mask *convert = static_cast<const Mask *>(src);
38       hwloc_bitmap_copy(mask, convert->mask);
39     }
40     void bitwise_and(const KMPAffinity::Mask *rhs) override {
41       const Mask *convert = static_cast<const Mask *>(rhs);
42       hwloc_bitmap_and(mask, mask, convert->mask);
43     }
44     void bitwise_or(const KMPAffinity::Mask *rhs) override {
45       const Mask *convert = static_cast<const Mask *>(rhs);
46       hwloc_bitmap_or(mask, mask, convert->mask);
47     }
48     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
49     int begin() const override { return hwloc_bitmap_first(mask); }
50     int end() const override { return -1; }
51     int next(int previous) const override {
52       return hwloc_bitmap_next(mask, previous);
53     }
54     int get_system_affinity(bool abort_on_error) override {
55       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
56                   "Illegal get affinity operation when not capable");
57       int retval =
58           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
59       if (retval >= 0) {
60         return 0;
61       }
62       int error = errno;
63       if (abort_on_error) {
64         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
65       }
66       return error;
67     }
68     int set_system_affinity(bool abort_on_error) const override {
69       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
70                   "Illegal get affinity operation when not capable");
71       int retval =
72           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
73       if (retval >= 0) {
74         return 0;
75       }
76       int error = errno;
77       if (abort_on_error) {
78         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
79       }
80       return error;
81     }
82     int get_proc_group() const override {
83       int group = -1;
84 #if KMP_OS_WINDOWS
85       if (__kmp_num_proc_groups == 1) {
86         return 1;
87       }
88       for (int i = 0; i < __kmp_num_proc_groups; i++) {
89         // On windows, the long type is always 32 bits
90         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
91         unsigned long second_32_bits =
92             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
93         if (first_32_bits == 0 && second_32_bits == 0) {
94           continue;
95         }
96         if (group >= 0) {
97           return -1;
98         }
99         group = i;
100       }
101 #endif /* KMP_OS_WINDOWS */
102       return group;
103     }
104   };
105   void determine_capable(const char *var) override {
106     const hwloc_topology_support *topology_support;
107     if (__kmp_hwloc_topology == NULL) {
108       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
109         __kmp_hwloc_error = TRUE;
110         if (__kmp_affinity_verbose)
111           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
112       }
113       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
114         __kmp_hwloc_error = TRUE;
115         if (__kmp_affinity_verbose)
116           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
117       }
118     }
119     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
120     // Is the system capable of setting/getting this thread's affinity?
121     // Also, is topology discovery possible? (pu indicates ability to discover
122     // processing units). And finally, were there no errors when calling any
123     // hwloc_* API functions?
124     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
125         topology_support->cpubind->get_thisthread_cpubind &&
126         topology_support->discovery->pu && !__kmp_hwloc_error) {
127       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
128       KMP_AFFINITY_ENABLE(TRUE);
129     } else {
130       // indicate that hwloc didn't work and disable affinity
131       __kmp_hwloc_error = TRUE;
132       KMP_AFFINITY_DISABLE();
133     }
134   }
135   void bind_thread(int which) override {
136     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
137                 "Illegal set affinity operation when not capable");
138     KMPAffinity::Mask *mask;
139     KMP_CPU_ALLOC_ON_STACK(mask);
140     KMP_CPU_ZERO(mask);
141     KMP_CPU_SET(which, mask);
142     __kmp_set_system_affinity(mask, TRUE);
143     KMP_CPU_FREE_FROM_STACK(mask);
144   }
145   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
146   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
147   KMPAffinity::Mask *allocate_mask_array(int num) override {
148     return new Mask[num];
149   }
150   void deallocate_mask_array(KMPAffinity::Mask *array) override {
151     Mask *hwloc_array = static_cast<Mask *>(array);
152     delete[] hwloc_array;
153   }
154   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
155                                       int index) override {
156     Mask *hwloc_array = static_cast<Mask *>(array);
157     return &(hwloc_array[index]);
158   }
159   api_type get_api_type() const override { return HWLOC; }
160 };
161 #endif /* KMP_USE_HWLOC */
162 
163 #if KMP_OS_LINUX || KMP_OS_FREEBSD
164 #if KMP_OS_LINUX
165 /* On some of the older OS's that we build on, these constants aren't present
166    in <asm/unistd.h> #included from <sys.syscall.h>. They must be the same on
167    all systems of the same arch where they are defined, and they cannot change.
168    stone forever. */
169 #include <sys/syscall.h>
170 #if KMP_ARCH_X86 || KMP_ARCH_ARM
171 #ifndef __NR_sched_setaffinity
172 #define __NR_sched_setaffinity 241
173 #elif __NR_sched_setaffinity != 241
174 #error Wrong code for setaffinity system call.
175 #endif /* __NR_sched_setaffinity */
176 #ifndef __NR_sched_getaffinity
177 #define __NR_sched_getaffinity 242
178 #elif __NR_sched_getaffinity != 242
179 #error Wrong code for getaffinity system call.
180 #endif /* __NR_sched_getaffinity */
181 #elif KMP_ARCH_AARCH64
182 #ifndef __NR_sched_setaffinity
183 #define __NR_sched_setaffinity 122
184 #elif __NR_sched_setaffinity != 122
185 #error Wrong code for setaffinity system call.
186 #endif /* __NR_sched_setaffinity */
187 #ifndef __NR_sched_getaffinity
188 #define __NR_sched_getaffinity 123
189 #elif __NR_sched_getaffinity != 123
190 #error Wrong code for getaffinity system call.
191 #endif /* __NR_sched_getaffinity */
192 #elif KMP_ARCH_X86_64
193 #ifndef __NR_sched_setaffinity
194 #define __NR_sched_setaffinity 203
195 #elif __NR_sched_setaffinity != 203
196 #error Wrong code for setaffinity system call.
197 #endif /* __NR_sched_setaffinity */
198 #ifndef __NR_sched_getaffinity
199 #define __NR_sched_getaffinity 204
200 #elif __NR_sched_getaffinity != 204
201 #error Wrong code for getaffinity system call.
202 #endif /* __NR_sched_getaffinity */
203 #elif KMP_ARCH_PPC64
204 #ifndef __NR_sched_setaffinity
205 #define __NR_sched_setaffinity 222
206 #elif __NR_sched_setaffinity != 222
207 #error Wrong code for setaffinity system call.
208 #endif /* __NR_sched_setaffinity */
209 #ifndef __NR_sched_getaffinity
210 #define __NR_sched_getaffinity 223
211 #elif __NR_sched_getaffinity != 223
212 #error Wrong code for getaffinity system call.
213 #endif /* __NR_sched_getaffinity */
214 #elif KMP_ARCH_MIPS
215 #ifndef __NR_sched_setaffinity
216 #define __NR_sched_setaffinity 4239
217 #elif __NR_sched_setaffinity != 4239
218 #error Wrong code for setaffinity system call.
219 #endif /* __NR_sched_setaffinity */
220 #ifndef __NR_sched_getaffinity
221 #define __NR_sched_getaffinity 4240
222 #elif __NR_sched_getaffinity != 4240
223 #error Wrong code for getaffinity system call.
224 #endif /* __NR_sched_getaffinity */
225 #elif KMP_ARCH_MIPS64
226 #ifndef __NR_sched_setaffinity
227 #define __NR_sched_setaffinity 5195
228 #elif __NR_sched_setaffinity != 5195
229 #error Wrong code for setaffinity system call.
230 #endif /* __NR_sched_setaffinity */
231 #ifndef __NR_sched_getaffinity
232 #define __NR_sched_getaffinity 5196
233 #elif __NR_sched_getaffinity != 5196
234 #error Wrong code for getaffinity system call.
235 #endif /* __NR_sched_getaffinity */
236 #error Unknown or unsupported architecture
237 #endif /* KMP_ARCH_* */
238 #elif KMP_OS_FREEBSD
239 #include <pthread.h>
240 #include <pthread_np.h>
241 #endif
242 class KMPNativeAffinity : public KMPAffinity {
243   class Mask : public KMPAffinity::Mask {
244     typedef unsigned char mask_t;
245     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
246 
247   public:
248     mask_t *mask;
249     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
250     ~Mask() {
251       if (mask)
252         __kmp_free(mask);
253     }
254     void set(int i) override {
255       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
256     }
257     bool is_set(int i) const override {
258       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
259     }
260     void clear(int i) override {
261       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
262     }
263     void zero() override {
264       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
265         mask[i] = 0;
266     }
267     void copy(const KMPAffinity::Mask *src) override {
268       const Mask *convert = static_cast<const Mask *>(src);
269       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
270         mask[i] = convert->mask[i];
271     }
272     void bitwise_and(const KMPAffinity::Mask *rhs) override {
273       const Mask *convert = static_cast<const Mask *>(rhs);
274       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
275         mask[i] &= convert->mask[i];
276     }
277     void bitwise_or(const KMPAffinity::Mask *rhs) override {
278       const Mask *convert = static_cast<const Mask *>(rhs);
279       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
280         mask[i] |= convert->mask[i];
281     }
282     void bitwise_not() override {
283       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
284         mask[i] = ~(mask[i]);
285     }
286     int begin() const override {
287       int retval = 0;
288       while (retval < end() && !is_set(retval))
289         ++retval;
290       return retval;
291     }
292     int end() const override { return __kmp_affin_mask_size * BITS_PER_MASK_T; }
293     int next(int previous) const override {
294       int retval = previous + 1;
295       while (retval < end() && !is_set(retval))
296         ++retval;
297       return retval;
298     }
299     int get_system_affinity(bool abort_on_error) override {
300       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
301                   "Illegal get affinity operation when not capable");
302 #if KMP_OS_LINUX
303       int retval =
304           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
305 #elif KMP_OS_FREEBSD
306       int retval =
307           pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask));
308 #endif
309       if (retval >= 0) {
310         return 0;
311       }
312       int error = errno;
313       if (abort_on_error) {
314         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
315       }
316       return error;
317     }
318     int set_system_affinity(bool abort_on_error) const override {
319       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
320                   "Illegal get affinity operation when not capable");
321 #if KMP_OS_LINUX
322       int retval =
323           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
324 #elif KMP_OS_FREEBSD
325       int retval =
326           pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast<cpuset_t *>(mask));
327 #endif
328       if (retval >= 0) {
329         return 0;
330       }
331       int error = errno;
332       if (abort_on_error) {
333         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
334       }
335       return error;
336     }
337   };
338   void determine_capable(const char *env_var) override {
339     __kmp_affinity_determine_capable(env_var);
340   }
341   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
342   KMPAffinity::Mask *allocate_mask() override {
343     KMPNativeAffinity::Mask *retval = new Mask();
344     return retval;
345   }
346   void deallocate_mask(KMPAffinity::Mask *m) override {
347     KMPNativeAffinity::Mask *native_mask =
348         static_cast<KMPNativeAffinity::Mask *>(m);
349     delete native_mask;
350   }
351   KMPAffinity::Mask *allocate_mask_array(int num) override {
352     return new Mask[num];
353   }
354   void deallocate_mask_array(KMPAffinity::Mask *array) override {
355     Mask *linux_array = static_cast<Mask *>(array);
356     delete[] linux_array;
357   }
358   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
359                                       int index) override {
360     Mask *linux_array = static_cast<Mask *>(array);
361     return &(linux_array[index]);
362   }
363   api_type get_api_type() const override { return NATIVE_OS; }
364 };
365 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
366 
367 #if KMP_OS_WINDOWS
368 class KMPNativeAffinity : public KMPAffinity {
369   class Mask : public KMPAffinity::Mask {
370     typedef ULONG_PTR mask_t;
371     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
372     mask_t *mask;
373 
374   public:
375     Mask() {
376       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
377     }
378     ~Mask() {
379       if (mask)
380         __kmp_free(mask);
381     }
382     void set(int i) override {
383       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
384     }
385     bool is_set(int i) const override {
386       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
387     }
388     void clear(int i) override {
389       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
390     }
391     void zero() override {
392       for (int i = 0; i < __kmp_num_proc_groups; ++i)
393         mask[i] = 0;
394     }
395     void copy(const KMPAffinity::Mask *src) override {
396       const Mask *convert = static_cast<const Mask *>(src);
397       for (int i = 0; i < __kmp_num_proc_groups; ++i)
398         mask[i] = convert->mask[i];
399     }
400     void bitwise_and(const KMPAffinity::Mask *rhs) override {
401       const Mask *convert = static_cast<const Mask *>(rhs);
402       for (int i = 0; i < __kmp_num_proc_groups; ++i)
403         mask[i] &= convert->mask[i];
404     }
405     void bitwise_or(const KMPAffinity::Mask *rhs) override {
406       const Mask *convert = static_cast<const Mask *>(rhs);
407       for (int i = 0; i < __kmp_num_proc_groups; ++i)
408         mask[i] |= convert->mask[i];
409     }
410     void bitwise_not() override {
411       for (int i = 0; i < __kmp_num_proc_groups; ++i)
412         mask[i] = ~(mask[i]);
413     }
414     int begin() const override {
415       int retval = 0;
416       while (retval < end() && !is_set(retval))
417         ++retval;
418       return retval;
419     }
420     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
421     int next(int previous) const override {
422       int retval = previous + 1;
423       while (retval < end() && !is_set(retval))
424         ++retval;
425       return retval;
426     }
427     int set_system_affinity(bool abort_on_error) const override {
428       if (__kmp_num_proc_groups > 1) {
429         // Check for a valid mask.
430         GROUP_AFFINITY ga;
431         int group = get_proc_group();
432         if (group < 0) {
433           if (abort_on_error) {
434             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
435           }
436           return -1;
437         }
438         // Transform the bit vector into a GROUP_AFFINITY struct
439         // and make the system call to set affinity.
440         ga.Group = group;
441         ga.Mask = mask[group];
442         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
443 
444         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
445         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
446           DWORD error = GetLastError();
447           if (abort_on_error) {
448             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
449                         __kmp_msg_null);
450           }
451           return error;
452         }
453       } else {
454         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
455           DWORD error = GetLastError();
456           if (abort_on_error) {
457             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
458                         __kmp_msg_null);
459           }
460           return error;
461         }
462       }
463       return 0;
464     }
465     int get_system_affinity(bool abort_on_error) override {
466       if (__kmp_num_proc_groups > 1) {
467         this->zero();
468         GROUP_AFFINITY ga;
469         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
470         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
471           DWORD error = GetLastError();
472           if (abort_on_error) {
473             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
474                         KMP_ERR(error), __kmp_msg_null);
475           }
476           return error;
477         }
478         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
479             (ga.Mask == 0)) {
480           return -1;
481         }
482         mask[ga.Group] = ga.Mask;
483       } else {
484         mask_t newMask, sysMask, retval;
485         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
486           DWORD error = GetLastError();
487           if (abort_on_error) {
488             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
489                         KMP_ERR(error), __kmp_msg_null);
490           }
491           return error;
492         }
493         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
494         if (!retval) {
495           DWORD error = GetLastError();
496           if (abort_on_error) {
497             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
498                         KMP_ERR(error), __kmp_msg_null);
499           }
500           return error;
501         }
502         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
503         if (!newMask) {
504           DWORD error = GetLastError();
505           if (abort_on_error) {
506             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
507                         KMP_ERR(error), __kmp_msg_null);
508           }
509         }
510         *mask = retval;
511       }
512       return 0;
513     }
514     int get_proc_group() const override {
515       int group = -1;
516       if (__kmp_num_proc_groups == 1) {
517         return 1;
518       }
519       for (int i = 0; i < __kmp_num_proc_groups; i++) {
520         if (mask[i] == 0)
521           continue;
522         if (group >= 0)
523           return -1;
524         group = i;
525       }
526       return group;
527     }
528   };
529   void determine_capable(const char *env_var) override {
530     __kmp_affinity_determine_capable(env_var);
531   }
532   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
533   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
534   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
535   KMPAffinity::Mask *allocate_mask_array(int num) override {
536     return new Mask[num];
537   }
538   void deallocate_mask_array(KMPAffinity::Mask *array) override {
539     Mask *windows_array = static_cast<Mask *>(array);
540     delete[] windows_array;
541   }
542   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
543                                       int index) override {
544     Mask *windows_array = static_cast<Mask *>(array);
545     return &(windows_array[index]);
546   }
547   api_type get_api_type() const override { return NATIVE_OS; }
548 };
549 #endif /* KMP_OS_WINDOWS */
550 #endif /* KMP_AFFINITY_SUPPORTED */
551 
552 class Address {
553 public:
554   static const unsigned maxDepth = 32;
555   unsigned labels[maxDepth];
556   unsigned childNums[maxDepth];
557   unsigned depth;
558   unsigned leader;
559   Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
560   Address &operator=(const Address &b) {
561     depth = b.depth;
562     for (unsigned i = 0; i < depth; i++) {
563       labels[i] = b.labels[i];
564       childNums[i] = b.childNums[i];
565     }
566     leader = FALSE;
567     return *this;
568   }
569   bool operator==(const Address &b) const {
570     if (depth != b.depth)
571       return false;
572     for (unsigned i = 0; i < depth; i++)
573       if (labels[i] != b.labels[i])
574         return false;
575     return true;
576   }
577   bool isClose(const Address &b, int level) const {
578     if (depth != b.depth)
579       return false;
580     if ((unsigned)level >= depth)
581       return true;
582     for (unsigned i = 0; i < (depth - level); i++)
583       if (labels[i] != b.labels[i])
584         return false;
585     return true;
586   }
587   bool operator!=(const Address &b) const { return !operator==(b); }
588   void print() const {
589     unsigned i;
590     printf("Depth: %u --- ", depth);
591     for (i = 0; i < depth; i++) {
592       printf("%u ", labels[i]);
593     }
594   }
595 };
596 
597 class AddrUnsPair {
598 public:
599   Address first;
600   unsigned second;
601   AddrUnsPair(Address _first, unsigned _second)
602       : first(_first), second(_second) {}
603   AddrUnsPair &operator=(const AddrUnsPair &b) {
604     first = b.first;
605     second = b.second;
606     return *this;
607   }
608   void print() const {
609     printf("first = ");
610     first.print();
611     printf(" --- second = %u", second);
612   }
613   bool operator==(const AddrUnsPair &b) const {
614     if (first != b.first)
615       return false;
616     if (second != b.second)
617       return false;
618     return true;
619   }
620   bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
621 };
622 
623 static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
624   const Address *aa = &(((const AddrUnsPair *)a)->first);
625   const Address *bb = &(((const AddrUnsPair *)b)->first);
626   unsigned depth = aa->depth;
627   unsigned i;
628   KMP_DEBUG_ASSERT(depth == bb->depth);
629   for (i = 0; i < depth; i++) {
630     if (aa->labels[i] < bb->labels[i])
631       return -1;
632     if (aa->labels[i] > bb->labels[i])
633       return 1;
634   }
635   return 0;
636 }
637 
638 /* A structure for holding machine-specific hierarchy info to be computed once
639    at init. This structure represents a mapping of threads to the actual machine
640    hierarchy, or to our best guess at what the hierarchy might be, for the
641    purpose of performing an efficient barrier. In the worst case, when there is
642    no machine hierarchy information, it produces a tree suitable for a barrier,
643    similar to the tree used in the hyper barrier. */
644 class hierarchy_info {
645 public:
646   /* Good default values for number of leaves and branching factor, given no
647      affinity information. Behaves a bit like hyper barrier. */
648   static const kmp_uint32 maxLeaves = 4;
649   static const kmp_uint32 minBranch = 4;
650   /** Number of levels in the hierarchy. Typical levels are threads/core,
651       cores/package or socket, packages/node, nodes/machine, etc. We don't want
652       to get specific with nomenclature. When the machine is oversubscribed we
653       add levels to duplicate the hierarchy, doubling the thread capacity of the
654       hierarchy each time we add a level. */
655   kmp_uint32 maxLevels;
656 
657   /** This is specifically the depth of the machine configuration hierarchy, in
658       terms of the number of levels along the longest path from root to any
659       leaf. It corresponds to the number of entries in numPerLevel if we exclude
660       all but one trailing 1. */
661   kmp_uint32 depth;
662   kmp_uint32 base_num_threads;
663   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
664   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
665   // 2=initialization in progress
666   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
667 
668   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
669       the parent of a node at level i has. For example, if we have a machine
670       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
671       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
672   kmp_uint32 *numPerLevel;
673   kmp_uint32 *skipPerLevel;
674 
675   void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
676     int hier_depth = adr2os[0].first.depth;
677     int level = 0;
678     for (int i = hier_depth - 1; i >= 0; --i) {
679       int max = -1;
680       for (int j = 0; j < num_addrs; ++j) {
681         int next = adr2os[j].first.childNums[i];
682         if (next > max)
683           max = next;
684       }
685       numPerLevel[level] = max + 1;
686       ++level;
687     }
688   }
689 
690   hierarchy_info()
691       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
692 
693   void fini() {
694     if (!uninitialized && numPerLevel) {
695       __kmp_free(numPerLevel);
696       numPerLevel = NULL;
697       uninitialized = not_initialized;
698     }
699   }
700 
701   void init(AddrUnsPair *adr2os, int num_addrs) {
702     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
703         &uninitialized, not_initialized, initializing);
704     if (bool_result == 0) { // Wait for initialization
705       while (TCR_1(uninitialized) != initialized)
706         KMP_CPU_PAUSE();
707       return;
708     }
709     KMP_DEBUG_ASSERT(bool_result == 1);
710 
711     /* Added explicit initialization of the data fields here to prevent usage of
712        dirty value observed when static library is re-initialized multiple times
713        (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
714        OpenMP). */
715     depth = 1;
716     resizing = 0;
717     maxLevels = 7;
718     numPerLevel =
719         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
720     skipPerLevel = &(numPerLevel[maxLevels]);
721     for (kmp_uint32 i = 0; i < maxLevels;
722          ++i) { // init numPerLevel[*] to 1 item per level
723       numPerLevel[i] = 1;
724       skipPerLevel[i] = 1;
725     }
726 
727     // Sort table by physical ID
728     if (adr2os) {
729       qsort(adr2os, num_addrs, sizeof(*adr2os),
730             __kmp_affinity_cmp_Address_labels);
731       deriveLevels(adr2os, num_addrs);
732     } else {
733       numPerLevel[0] = maxLeaves;
734       numPerLevel[1] = num_addrs / maxLeaves;
735       if (num_addrs % maxLeaves)
736         numPerLevel[1]++;
737     }
738 
739     base_num_threads = num_addrs;
740     for (int i = maxLevels - 1; i >= 0;
741          --i) // count non-empty levels to get depth
742       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
743         depth++;
744 
745     kmp_uint32 branch = minBranch;
746     if (numPerLevel[0] == 1)
747       branch = num_addrs / maxLeaves;
748     if (branch < minBranch)
749       branch = minBranch;
750     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
751       while (numPerLevel[d] > branch ||
752              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
753         if (numPerLevel[d] & 1)
754           numPerLevel[d]++;
755         numPerLevel[d] = numPerLevel[d] >> 1;
756         if (numPerLevel[d + 1] == 1)
757           depth++;
758         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
759       }
760       if (numPerLevel[0] == 1) {
761         branch = branch >> 1;
762         if (branch < 4)
763           branch = minBranch;
764       }
765     }
766 
767     for (kmp_uint32 i = 1; i < depth; ++i)
768       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
769     // Fill in hierarchy in the case of oversubscription
770     for (kmp_uint32 i = depth; i < maxLevels; ++i)
771       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
772 
773     uninitialized = initialized; // One writer
774   }
775 
776   // Resize the hierarchy if nproc changes to something larger than before
777   void resize(kmp_uint32 nproc) {
778     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
779     while (bool_result == 0) { // someone else is trying to resize
780       KMP_CPU_PAUSE();
781       if (nproc <= base_num_threads) // happy with other thread's resize
782         return;
783       else // try to resize
784         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
785     }
786     KMP_DEBUG_ASSERT(bool_result != 0);
787     if (nproc <= base_num_threads)
788       return; // happy with other thread's resize
789 
790     // Calculate new maxLevels
791     kmp_uint32 old_sz = skipPerLevel[depth - 1];
792     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
793     // First see if old maxLevels is enough to contain new size
794     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
795       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
796       numPerLevel[i - 1] *= 2;
797       old_sz *= 2;
798       depth++;
799     }
800     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
801       while (nproc > old_sz) {
802         old_sz *= 2;
803         incs++;
804         depth++;
805       }
806       maxLevels += incs;
807 
808       // Resize arrays
809       kmp_uint32 *old_numPerLevel = numPerLevel;
810       kmp_uint32 *old_skipPerLevel = skipPerLevel;
811       numPerLevel = skipPerLevel = NULL;
812       numPerLevel =
813           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
814       skipPerLevel = &(numPerLevel[maxLevels]);
815 
816       // Copy old elements from old arrays
817       for (kmp_uint32 i = 0; i < old_maxLevels;
818            ++i) { // init numPerLevel[*] to 1 item per level
819         numPerLevel[i] = old_numPerLevel[i];
820         skipPerLevel[i] = old_skipPerLevel[i];
821       }
822 
823       // Init new elements in arrays to 1
824       for (kmp_uint32 i = old_maxLevels; i < maxLevels;
825            ++i) { // init numPerLevel[*] to 1 item per level
826         numPerLevel[i] = 1;
827         skipPerLevel[i] = 1;
828       }
829 
830       // Free old arrays
831       __kmp_free(old_numPerLevel);
832     }
833 
834     // Fill in oversubscription levels of hierarchy
835     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
836       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
837 
838     base_num_threads = nproc;
839     resizing = 0; // One writer
840   }
841 };
842 #endif // KMP_AFFINITY_H
843