xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h (revision 4b50c451720d8b427757a6da1dd2bb4c52cd9e35)
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 
19 #if KMP_AFFINITY_SUPPORTED
20 #if KMP_USE_HWLOC
21 class KMPHwlocAffinity : public KMPAffinity {
22 public:
23   class Mask : public KMPAffinity::Mask {
24     hwloc_cpuset_t mask;
25 
26   public:
27     Mask() {
28       mask = hwloc_bitmap_alloc();
29       this->zero();
30     }
31     ~Mask() { hwloc_bitmap_free(mask); }
32     void set(int i) override { hwloc_bitmap_set(mask, i); }
33     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
34     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
35     void zero() override { hwloc_bitmap_zero(mask); }
36     void copy(const KMPAffinity::Mask *src) override {
37       const Mask *convert = static_cast<const Mask *>(src);
38       hwloc_bitmap_copy(mask, convert->mask);
39     }
40     void bitwise_and(const KMPAffinity::Mask *rhs) override {
41       const Mask *convert = static_cast<const Mask *>(rhs);
42       hwloc_bitmap_and(mask, mask, convert->mask);
43     }
44     void bitwise_or(const KMPAffinity::Mask *rhs) override {
45       const Mask *convert = static_cast<const Mask *>(rhs);
46       hwloc_bitmap_or(mask, mask, convert->mask);
47     }
48     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
49     int begin() const override { return hwloc_bitmap_first(mask); }
50     int end() const override { return -1; }
51     int next(int previous) const override {
52       return hwloc_bitmap_next(mask, previous);
53     }
54     int get_system_affinity(bool abort_on_error) override {
55       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
56                   "Illegal get affinity operation when not capable");
57       int retval =
58           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
59       if (retval >= 0) {
60         return 0;
61       }
62       int error = errno;
63       if (abort_on_error) {
64         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
65       }
66       return error;
67     }
68     int set_system_affinity(bool abort_on_error) const override {
69       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
70                   "Illegal get affinity operation when not capable");
71       int retval =
72           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
73       if (retval >= 0) {
74         return 0;
75       }
76       int error = errno;
77       if (abort_on_error) {
78         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
79       }
80       return error;
81     }
82     int get_proc_group() const override {
83       int group = -1;
84 #if KMP_OS_WINDOWS
85       if (__kmp_num_proc_groups == 1) {
86         return 1;
87       }
88       for (int i = 0; i < __kmp_num_proc_groups; i++) {
89         // On windows, the long type is always 32 bits
90         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
91         unsigned long second_32_bits =
92             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
93         if (first_32_bits == 0 && second_32_bits == 0) {
94           continue;
95         }
96         if (group >= 0) {
97           return -1;
98         }
99         group = i;
100       }
101 #endif /* KMP_OS_WINDOWS */
102       return group;
103     }
104   };
105   void determine_capable(const char *var) override {
106     const hwloc_topology_support *topology_support;
107     if (__kmp_hwloc_topology == NULL) {
108       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
109         __kmp_hwloc_error = TRUE;
110         if (__kmp_affinity_verbose)
111           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
112       }
113       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
114         __kmp_hwloc_error = TRUE;
115         if (__kmp_affinity_verbose)
116           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
117       }
118     }
119     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
120     // Is the system capable of setting/getting this thread's affinity?
121     // Also, is topology discovery possible? (pu indicates ability to discover
122     // processing units). And finally, were there no errors when calling any
123     // hwloc_* API functions?
124     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
125         topology_support->cpubind->get_thisthread_cpubind &&
126         topology_support->discovery->pu && !__kmp_hwloc_error) {
127       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
128       KMP_AFFINITY_ENABLE(TRUE);
129     } else {
130       // indicate that hwloc didn't work and disable affinity
131       __kmp_hwloc_error = TRUE;
132       KMP_AFFINITY_DISABLE();
133     }
134   }
135   void bind_thread(int which) override {
136     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
137                 "Illegal set affinity operation when not capable");
138     KMPAffinity::Mask *mask;
139     KMP_CPU_ALLOC_ON_STACK(mask);
140     KMP_CPU_ZERO(mask);
141     KMP_CPU_SET(which, mask);
142     __kmp_set_system_affinity(mask, TRUE);
143     KMP_CPU_FREE_FROM_STACK(mask);
144   }
145   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
146   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
147   KMPAffinity::Mask *allocate_mask_array(int num) override {
148     return new Mask[num];
149   }
150   void deallocate_mask_array(KMPAffinity::Mask *array) override {
151     Mask *hwloc_array = static_cast<Mask *>(array);
152     delete[] hwloc_array;
153   }
154   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
155                                       int index) override {
156     Mask *hwloc_array = static_cast<Mask *>(array);
157     return &(hwloc_array[index]);
158   }
159   api_type get_api_type() const override { return HWLOC; }
160 };
161 #endif /* KMP_USE_HWLOC */
162 
163 #if KMP_OS_LINUX
164 /* On some of the older OS's that we build on, these constants aren't present
165    in <asm/unistd.h> #included from <sys.syscall.h>. They must be the same on
166    all systems of the same arch where they are defined, and they cannot change.
167    stone forever. */
168 #include <sys/syscall.h>
169 #if KMP_ARCH_X86 || KMP_ARCH_ARM
170 #ifndef __NR_sched_setaffinity
171 #define __NR_sched_setaffinity 241
172 #elif __NR_sched_setaffinity != 241
173 #error Wrong code for setaffinity system call.
174 #endif /* __NR_sched_setaffinity */
175 #ifndef __NR_sched_getaffinity
176 #define __NR_sched_getaffinity 242
177 #elif __NR_sched_getaffinity != 242
178 #error Wrong code for getaffinity system call.
179 #endif /* __NR_sched_getaffinity */
180 #elif KMP_ARCH_AARCH64
181 #ifndef __NR_sched_setaffinity
182 #define __NR_sched_setaffinity 122
183 #elif __NR_sched_setaffinity != 122
184 #error Wrong code for setaffinity system call.
185 #endif /* __NR_sched_setaffinity */
186 #ifndef __NR_sched_getaffinity
187 #define __NR_sched_getaffinity 123
188 #elif __NR_sched_getaffinity != 123
189 #error Wrong code for getaffinity system call.
190 #endif /* __NR_sched_getaffinity */
191 #elif KMP_ARCH_X86_64
192 #ifndef __NR_sched_setaffinity
193 #define __NR_sched_setaffinity 203
194 #elif __NR_sched_setaffinity != 203
195 #error Wrong code for setaffinity system call.
196 #endif /* __NR_sched_setaffinity */
197 #ifndef __NR_sched_getaffinity
198 #define __NR_sched_getaffinity 204
199 #elif __NR_sched_getaffinity != 204
200 #error Wrong code for getaffinity system call.
201 #endif /* __NR_sched_getaffinity */
202 #elif KMP_ARCH_PPC64
203 #ifndef __NR_sched_setaffinity
204 #define __NR_sched_setaffinity 222
205 #elif __NR_sched_setaffinity != 222
206 #error Wrong code for setaffinity system call.
207 #endif /* __NR_sched_setaffinity */
208 #ifndef __NR_sched_getaffinity
209 #define __NR_sched_getaffinity 223
210 #elif __NR_sched_getaffinity != 223
211 #error Wrong code for getaffinity system call.
212 #endif /* __NR_sched_getaffinity */
213 #elif KMP_ARCH_MIPS
214 #ifndef __NR_sched_setaffinity
215 #define __NR_sched_setaffinity 4239
216 #elif __NR_sched_setaffinity != 4239
217 #error Wrong code for setaffinity system call.
218 #endif /* __NR_sched_setaffinity */
219 #ifndef __NR_sched_getaffinity
220 #define __NR_sched_getaffinity 4240
221 #elif __NR_sched_getaffinity != 4240
222 #error Wrong code for getaffinity system call.
223 #endif /* __NR_sched_getaffinity */
224 #elif KMP_ARCH_MIPS64
225 #ifndef __NR_sched_setaffinity
226 #define __NR_sched_setaffinity 5195
227 #elif __NR_sched_setaffinity != 5195
228 #error Wrong code for setaffinity system call.
229 #endif /* __NR_sched_setaffinity */
230 #ifndef __NR_sched_getaffinity
231 #define __NR_sched_getaffinity 5196
232 #elif __NR_sched_getaffinity != 5196
233 #error Wrong code for getaffinity system call.
234 #endif /* __NR_sched_getaffinity */
235 #error Unknown or unsupported architecture
236 #endif /* KMP_ARCH_* */
237 class KMPNativeAffinity : public KMPAffinity {
238   class Mask : public KMPAffinity::Mask {
239     typedef unsigned char mask_t;
240     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
241 
242   public:
243     mask_t *mask;
244     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
245     ~Mask() {
246       if (mask)
247         __kmp_free(mask);
248     }
249     void set(int i) override {
250       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
251     }
252     bool is_set(int i) const override {
253       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
254     }
255     void clear(int i) override {
256       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
257     }
258     void zero() override {
259       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
260         mask[i] = 0;
261     }
262     void copy(const KMPAffinity::Mask *src) override {
263       const Mask *convert = static_cast<const Mask *>(src);
264       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
265         mask[i] = convert->mask[i];
266     }
267     void bitwise_and(const KMPAffinity::Mask *rhs) override {
268       const Mask *convert = static_cast<const Mask *>(rhs);
269       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
270         mask[i] &= convert->mask[i];
271     }
272     void bitwise_or(const KMPAffinity::Mask *rhs) override {
273       const Mask *convert = static_cast<const Mask *>(rhs);
274       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
275         mask[i] |= convert->mask[i];
276     }
277     void bitwise_not() override {
278       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
279         mask[i] = ~(mask[i]);
280     }
281     int begin() const override {
282       int retval = 0;
283       while (retval < end() && !is_set(retval))
284         ++retval;
285       return retval;
286     }
287     int end() const override { return __kmp_affin_mask_size * BITS_PER_MASK_T; }
288     int next(int previous) const override {
289       int retval = previous + 1;
290       while (retval < end() && !is_set(retval))
291         ++retval;
292       return retval;
293     }
294     int get_system_affinity(bool abort_on_error) override {
295       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
296                   "Illegal get affinity operation when not capable");
297       int retval =
298           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
299       if (retval >= 0) {
300         return 0;
301       }
302       int error = errno;
303       if (abort_on_error) {
304         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
305       }
306       return error;
307     }
308     int set_system_affinity(bool abort_on_error) const override {
309       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
310                   "Illegal get affinity operation when not capable");
311       int retval =
312           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
313       if (retval >= 0) {
314         return 0;
315       }
316       int error = errno;
317       if (abort_on_error) {
318         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
319       }
320       return error;
321     }
322   };
323   void determine_capable(const char *env_var) override {
324     __kmp_affinity_determine_capable(env_var);
325   }
326   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
327   KMPAffinity::Mask *allocate_mask() override {
328     KMPNativeAffinity::Mask *retval = new Mask();
329     return retval;
330   }
331   void deallocate_mask(KMPAffinity::Mask *m) override {
332     KMPNativeAffinity::Mask *native_mask =
333         static_cast<KMPNativeAffinity::Mask *>(m);
334     delete native_mask;
335   }
336   KMPAffinity::Mask *allocate_mask_array(int num) override {
337     return new Mask[num];
338   }
339   void deallocate_mask_array(KMPAffinity::Mask *array) override {
340     Mask *linux_array = static_cast<Mask *>(array);
341     delete[] linux_array;
342   }
343   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
344                                       int index) override {
345     Mask *linux_array = static_cast<Mask *>(array);
346     return &(linux_array[index]);
347   }
348   api_type get_api_type() const override { return NATIVE_OS; }
349 };
350 #endif /* KMP_OS_LINUX */
351 
352 #if KMP_OS_WINDOWS
353 class KMPNativeAffinity : public KMPAffinity {
354   class Mask : public KMPAffinity::Mask {
355     typedef ULONG_PTR mask_t;
356     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
357     mask_t *mask;
358 
359   public:
360     Mask() {
361       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
362     }
363     ~Mask() {
364       if (mask)
365         __kmp_free(mask);
366     }
367     void set(int i) override {
368       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
369     }
370     bool is_set(int i) const override {
371       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
372     }
373     void clear(int i) override {
374       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
375     }
376     void zero() override {
377       for (int i = 0; i < __kmp_num_proc_groups; ++i)
378         mask[i] = 0;
379     }
380     void copy(const KMPAffinity::Mask *src) override {
381       const Mask *convert = static_cast<const Mask *>(src);
382       for (int i = 0; i < __kmp_num_proc_groups; ++i)
383         mask[i] = convert->mask[i];
384     }
385     void bitwise_and(const KMPAffinity::Mask *rhs) override {
386       const Mask *convert = static_cast<const Mask *>(rhs);
387       for (int i = 0; i < __kmp_num_proc_groups; ++i)
388         mask[i] &= convert->mask[i];
389     }
390     void bitwise_or(const KMPAffinity::Mask *rhs) override {
391       const Mask *convert = static_cast<const Mask *>(rhs);
392       for (int i = 0; i < __kmp_num_proc_groups; ++i)
393         mask[i] |= convert->mask[i];
394     }
395     void bitwise_not() override {
396       for (int i = 0; i < __kmp_num_proc_groups; ++i)
397         mask[i] = ~(mask[i]);
398     }
399     int begin() const override {
400       int retval = 0;
401       while (retval < end() && !is_set(retval))
402         ++retval;
403       return retval;
404     }
405     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
406     int next(int previous) const override {
407       int retval = previous + 1;
408       while (retval < end() && !is_set(retval))
409         ++retval;
410       return retval;
411     }
412     int set_system_affinity(bool abort_on_error) const override {
413       if (__kmp_num_proc_groups > 1) {
414         // Check for a valid mask.
415         GROUP_AFFINITY ga;
416         int group = get_proc_group();
417         if (group < 0) {
418           if (abort_on_error) {
419             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
420           }
421           return -1;
422         }
423         // Transform the bit vector into a GROUP_AFFINITY struct
424         // and make the system call to set affinity.
425         ga.Group = group;
426         ga.Mask = mask[group];
427         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
428 
429         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
430         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
431           DWORD error = GetLastError();
432           if (abort_on_error) {
433             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
434                         __kmp_msg_null);
435           }
436           return error;
437         }
438       } else {
439         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
440           DWORD error = GetLastError();
441           if (abort_on_error) {
442             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
443                         __kmp_msg_null);
444           }
445           return error;
446         }
447       }
448       return 0;
449     }
450     int get_system_affinity(bool abort_on_error) override {
451       if (__kmp_num_proc_groups > 1) {
452         this->zero();
453         GROUP_AFFINITY ga;
454         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
455         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
456           DWORD error = GetLastError();
457           if (abort_on_error) {
458             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
459                         KMP_ERR(error), __kmp_msg_null);
460           }
461           return error;
462         }
463         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
464             (ga.Mask == 0)) {
465           return -1;
466         }
467         mask[ga.Group] = ga.Mask;
468       } else {
469         mask_t newMask, sysMask, retval;
470         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
471           DWORD error = GetLastError();
472           if (abort_on_error) {
473             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
474                         KMP_ERR(error), __kmp_msg_null);
475           }
476           return error;
477         }
478         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
479         if (!retval) {
480           DWORD error = GetLastError();
481           if (abort_on_error) {
482             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
483                         KMP_ERR(error), __kmp_msg_null);
484           }
485           return error;
486         }
487         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
488         if (!newMask) {
489           DWORD error = GetLastError();
490           if (abort_on_error) {
491             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
492                         KMP_ERR(error), __kmp_msg_null);
493           }
494         }
495         *mask = retval;
496       }
497       return 0;
498     }
499     int get_proc_group() const override {
500       int group = -1;
501       if (__kmp_num_proc_groups == 1) {
502         return 1;
503       }
504       for (int i = 0; i < __kmp_num_proc_groups; i++) {
505         if (mask[i] == 0)
506           continue;
507         if (group >= 0)
508           return -1;
509         group = i;
510       }
511       return group;
512     }
513   };
514   void determine_capable(const char *env_var) override {
515     __kmp_affinity_determine_capable(env_var);
516   }
517   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
518   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
519   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
520   KMPAffinity::Mask *allocate_mask_array(int num) override {
521     return new Mask[num];
522   }
523   void deallocate_mask_array(KMPAffinity::Mask *array) override {
524     Mask *windows_array = static_cast<Mask *>(array);
525     delete[] windows_array;
526   }
527   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
528                                       int index) override {
529     Mask *windows_array = static_cast<Mask *>(array);
530     return &(windows_array[index]);
531   }
532   api_type get_api_type() const override { return NATIVE_OS; }
533 };
534 #endif /* KMP_OS_WINDOWS */
535 #endif /* KMP_AFFINITY_SUPPORTED */
536 
537 class Address {
538 public:
539   static const unsigned maxDepth = 32;
540   unsigned labels[maxDepth];
541   unsigned childNums[maxDepth];
542   unsigned depth;
543   unsigned leader;
544   Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
545   Address &operator=(const Address &b) {
546     depth = b.depth;
547     for (unsigned i = 0; i < depth; i++) {
548       labels[i] = b.labels[i];
549       childNums[i] = b.childNums[i];
550     }
551     leader = FALSE;
552     return *this;
553   }
554   bool operator==(const Address &b) const {
555     if (depth != b.depth)
556       return false;
557     for (unsigned i = 0; i < depth; i++)
558       if (labels[i] != b.labels[i])
559         return false;
560     return true;
561   }
562   bool isClose(const Address &b, int level) const {
563     if (depth != b.depth)
564       return false;
565     if ((unsigned)level >= depth)
566       return true;
567     for (unsigned i = 0; i < (depth - level); i++)
568       if (labels[i] != b.labels[i])
569         return false;
570     return true;
571   }
572   bool operator!=(const Address &b) const { return !operator==(b); }
573   void print() const {
574     unsigned i;
575     printf("Depth: %u --- ", depth);
576     for (i = 0; i < depth; i++) {
577       printf("%u ", labels[i]);
578     }
579   }
580 };
581 
582 class AddrUnsPair {
583 public:
584   Address first;
585   unsigned second;
586   AddrUnsPair(Address _first, unsigned _second)
587       : first(_first), second(_second) {}
588   AddrUnsPair &operator=(const AddrUnsPair &b) {
589     first = b.first;
590     second = b.second;
591     return *this;
592   }
593   void print() const {
594     printf("first = ");
595     first.print();
596     printf(" --- second = %u", second);
597   }
598   bool operator==(const AddrUnsPair &b) const {
599     if (first != b.first)
600       return false;
601     if (second != b.second)
602       return false;
603     return true;
604   }
605   bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
606 };
607 
608 static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
609   const Address *aa = &(((const AddrUnsPair *)a)->first);
610   const Address *bb = &(((const AddrUnsPair *)b)->first);
611   unsigned depth = aa->depth;
612   unsigned i;
613   KMP_DEBUG_ASSERT(depth == bb->depth);
614   for (i = 0; i < depth; i++) {
615     if (aa->labels[i] < bb->labels[i])
616       return -1;
617     if (aa->labels[i] > bb->labels[i])
618       return 1;
619   }
620   return 0;
621 }
622 
623 /* A structure for holding machine-specific hierarchy info to be computed once
624    at init. This structure represents a mapping of threads to the actual machine
625    hierarchy, or to our best guess at what the hierarchy might be, for the
626    purpose of performing an efficient barrier. In the worst case, when there is
627    no machine hierarchy information, it produces a tree suitable for a barrier,
628    similar to the tree used in the hyper barrier. */
629 class hierarchy_info {
630 public:
631   /* Good default values for number of leaves and branching factor, given no
632      affinity information. Behaves a bit like hyper barrier. */
633   static const kmp_uint32 maxLeaves = 4;
634   static const kmp_uint32 minBranch = 4;
635   /** Number of levels in the hierarchy. Typical levels are threads/core,
636       cores/package or socket, packages/node, nodes/machine, etc. We don't want
637       to get specific with nomenclature. When the machine is oversubscribed we
638       add levels to duplicate the hierarchy, doubling the thread capacity of the
639       hierarchy each time we add a level. */
640   kmp_uint32 maxLevels;
641 
642   /** This is specifically the depth of the machine configuration hierarchy, in
643       terms of the number of levels along the longest path from root to any
644       leaf. It corresponds to the number of entries in numPerLevel if we exclude
645       all but one trailing 1. */
646   kmp_uint32 depth;
647   kmp_uint32 base_num_threads;
648   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
649   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
650   // 2=initialization in progress
651   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
652 
653   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
654       the parent of a node at level i has. For example, if we have a machine
655       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
656       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
657   kmp_uint32 *numPerLevel;
658   kmp_uint32 *skipPerLevel;
659 
660   void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
661     int hier_depth = adr2os[0].first.depth;
662     int level = 0;
663     for (int i = hier_depth - 1; i >= 0; --i) {
664       int max = -1;
665       for (int j = 0; j < num_addrs; ++j) {
666         int next = adr2os[j].first.childNums[i];
667         if (next > max)
668           max = next;
669       }
670       numPerLevel[level] = max + 1;
671       ++level;
672     }
673   }
674 
675   hierarchy_info()
676       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
677 
678   void fini() {
679     if (!uninitialized && numPerLevel) {
680       __kmp_free(numPerLevel);
681       numPerLevel = NULL;
682       uninitialized = not_initialized;
683     }
684   }
685 
686   void init(AddrUnsPair *adr2os, int num_addrs) {
687     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
688         &uninitialized, not_initialized, initializing);
689     if (bool_result == 0) { // Wait for initialization
690       while (TCR_1(uninitialized) != initialized)
691         KMP_CPU_PAUSE();
692       return;
693     }
694     KMP_DEBUG_ASSERT(bool_result == 1);
695 
696     /* Added explicit initialization of the data fields here to prevent usage of
697        dirty value observed when static library is re-initialized multiple times
698        (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
699        OpenMP). */
700     depth = 1;
701     resizing = 0;
702     maxLevels = 7;
703     numPerLevel =
704         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
705     skipPerLevel = &(numPerLevel[maxLevels]);
706     for (kmp_uint32 i = 0; i < maxLevels;
707          ++i) { // init numPerLevel[*] to 1 item per level
708       numPerLevel[i] = 1;
709       skipPerLevel[i] = 1;
710     }
711 
712     // Sort table by physical ID
713     if (adr2os) {
714       qsort(adr2os, num_addrs, sizeof(*adr2os),
715             __kmp_affinity_cmp_Address_labels);
716       deriveLevels(adr2os, num_addrs);
717     } else {
718       numPerLevel[0] = maxLeaves;
719       numPerLevel[1] = num_addrs / maxLeaves;
720       if (num_addrs % maxLeaves)
721         numPerLevel[1]++;
722     }
723 
724     base_num_threads = num_addrs;
725     for (int i = maxLevels - 1; i >= 0;
726          --i) // count non-empty levels to get depth
727       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
728         depth++;
729 
730     kmp_uint32 branch = minBranch;
731     if (numPerLevel[0] == 1)
732       branch = num_addrs / maxLeaves;
733     if (branch < minBranch)
734       branch = minBranch;
735     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
736       while (numPerLevel[d] > branch ||
737              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
738         if (numPerLevel[d] & 1)
739           numPerLevel[d]++;
740         numPerLevel[d] = numPerLevel[d] >> 1;
741         if (numPerLevel[d + 1] == 1)
742           depth++;
743         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
744       }
745       if (numPerLevel[0] == 1) {
746         branch = branch >> 1;
747         if (branch < 4)
748           branch = minBranch;
749       }
750     }
751 
752     for (kmp_uint32 i = 1; i < depth; ++i)
753       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
754     // Fill in hierarchy in the case of oversubscription
755     for (kmp_uint32 i = depth; i < maxLevels; ++i)
756       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
757 
758     uninitialized = initialized; // One writer
759   }
760 
761   // Resize the hierarchy if nproc changes to something larger than before
762   void resize(kmp_uint32 nproc) {
763     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
764     while (bool_result == 0) { // someone else is trying to resize
765       KMP_CPU_PAUSE();
766       if (nproc <= base_num_threads) // happy with other thread's resize
767         return;
768       else // try to resize
769         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
770     }
771     KMP_DEBUG_ASSERT(bool_result != 0);
772     if (nproc <= base_num_threads)
773       return; // happy with other thread's resize
774 
775     // Calculate new maxLevels
776     kmp_uint32 old_sz = skipPerLevel[depth - 1];
777     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
778     // First see if old maxLevels is enough to contain new size
779     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
780       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
781       numPerLevel[i - 1] *= 2;
782       old_sz *= 2;
783       depth++;
784     }
785     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
786       while (nproc > old_sz) {
787         old_sz *= 2;
788         incs++;
789         depth++;
790       }
791       maxLevels += incs;
792 
793       // Resize arrays
794       kmp_uint32 *old_numPerLevel = numPerLevel;
795       kmp_uint32 *old_skipPerLevel = skipPerLevel;
796       numPerLevel = skipPerLevel = NULL;
797       numPerLevel =
798           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
799       skipPerLevel = &(numPerLevel[maxLevels]);
800 
801       // Copy old elements from old arrays
802       for (kmp_uint32 i = 0; i < old_maxLevels;
803            ++i) { // init numPerLevel[*] to 1 item per level
804         numPerLevel[i] = old_numPerLevel[i];
805         skipPerLevel[i] = old_skipPerLevel[i];
806       }
807 
808       // Init new elements in arrays to 1
809       for (kmp_uint32 i = old_maxLevels; i < maxLevels;
810            ++i) { // init numPerLevel[*] to 1 item per level
811         numPerLevel[i] = 1;
812         skipPerLevel[i] = 1;
813       }
814 
815       // Free old arrays
816       __kmp_free(old_numPerLevel);
817     }
818 
819     // Fill in oversubscription levels of hierarchy
820     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
821       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
822 
823     base_num_threads = nproc;
824     resizing = 0; // One writer
825   }
826 };
827 #endif // KMP_AFFINITY_H
828