1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 
19 #if KMP_AFFINITY_SUPPORTED
20 #if KMP_USE_HWLOC
21 class KMPHwlocAffinity : public KMPAffinity {
22 public:
23   class Mask : public KMPAffinity::Mask {
24     hwloc_cpuset_t mask;
25 
26   public:
27     Mask() {
28       mask = hwloc_bitmap_alloc();
29       this->zero();
30     }
31     ~Mask() { hwloc_bitmap_free(mask); }
32     void set(int i) override { hwloc_bitmap_set(mask, i); }
33     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
34     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
35     void zero() override { hwloc_bitmap_zero(mask); }
36     void copy(const KMPAffinity::Mask *src) override {
37       const Mask *convert = static_cast<const Mask *>(src);
38       hwloc_bitmap_copy(mask, convert->mask);
39     }
40     void bitwise_and(const KMPAffinity::Mask *rhs) override {
41       const Mask *convert = static_cast<const Mask *>(rhs);
42       hwloc_bitmap_and(mask, mask, convert->mask);
43     }
44     void bitwise_or(const KMPAffinity::Mask *rhs) override {
45       const Mask *convert = static_cast<const Mask *>(rhs);
46       hwloc_bitmap_or(mask, mask, convert->mask);
47     }
48     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
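    // hwloc_bitmap_first()/hwloc_bitmap_next() return -1 once no set bit
    // remains, so -1 doubles as the end-of-iteration sentinel here.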
49     int begin() const override { return hwloc_bitmap_first(mask); }
50     int end() const override { return -1; }
51     int next(int previous) const override {
52       return hwloc_bitmap_next(mask, previous);
53     }
54     int get_system_affinity(bool abort_on_error) override {
55       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
56                   "Illegal get affinity operation when not capable");
57       int retval =
58           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
59       if (retval >= 0) {
60         return 0;
61       }
62       int error = errno;
63       if (abort_on_error) {
64         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
65       }
66       return error;
67     }
68     int set_system_affinity(bool abort_on_error) const override {
69       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
70                   "Illegal set affinity operation when not capable");
71       int retval =
72           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
73       if (retval >= 0) {
74         return 0;
75       }
76       int error = errno;
77       if (abort_on_error) {
78         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
79       }
80       return error;
81     }
82     int get_proc_group() const override {
83       int group = -1;
84 #if KMP_OS_WINDOWS
85       if (__kmp_num_proc_groups == 1) {
86         return 1;
87       }
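      // Each 64-bit Windows processor group corresponds to two 32-bit ulongs
      // of the hwloc bitmap; the group is only well-defined when the mask
      // touches exactly one group, otherwise -1 is returned.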
88       for (int i = 0; i < __kmp_num_proc_groups; i++) {
89         // On Windows, the long type is always 32 bits
90         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
91         unsigned long second_32_bits =
92             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
93         if (first_32_bits == 0 && second_32_bits == 0) {
94           continue;
95         }
96         if (group >= 0) {
97           return -1;
98         }
99         group = i;
100       }
101 #endif /* KMP_OS_WINDOWS */
102       return group;
103     }
104   };
105   void determine_capable(const char *var) override {
106     const hwloc_topology_support *topology_support;
107     if (__kmp_hwloc_topology == NULL) {
108       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
109         __kmp_hwloc_error = TRUE;
110         if (__kmp_affinity_verbose)
111           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
112       }
113       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
114         __kmp_hwloc_error = TRUE;
115         if (__kmp_affinity_verbose)
116           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
117       }
118     }
119     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
120     // Is the system capable of setting/getting this thread's affinity?
121     // Also, is topology discovery possible? (pu indicates ability to discover
122     // processing units). And finally, were there no errors when calling any
123     // hwloc_* API functions?
124     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
125         topology_support->cpubind->get_thisthread_cpubind &&
126         topology_support->discovery->pu && !__kmp_hwloc_error) {
127       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
128       KMP_AFFINITY_ENABLE(TRUE);
129     } else {
130       // indicate that hwloc didn't work and disable affinity
131       __kmp_hwloc_error = TRUE;
132       KMP_AFFINITY_DISABLE();
133     }
134   }
135   void bind_thread(int which) override {
136     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
137                 "Illegal set affinity operation when not capable");
138     KMPAffinity::Mask *mask;
139     KMP_CPU_ALLOC_ON_STACK(mask);
140     KMP_CPU_ZERO(mask);
141     KMP_CPU_SET(which, mask);
142     __kmp_set_system_affinity(mask, TRUE);
143     KMP_CPU_FREE_FROM_STACK(mask);
144   }
145   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
146   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
147   KMPAffinity::Mask *allocate_mask_array(int num) override {
148     return new Mask[num];
149   }
150   void deallocate_mask_array(KMPAffinity::Mask *array) override {
151     Mask *hwloc_array = static_cast<Mask *>(array);
152     delete[] hwloc_array;
153   }
154   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
155                                       int index) override {
156     Mask *hwloc_array = static_cast<Mask *>(array);
157     return &(hwloc_array[index]);
158   }
159   api_type get_api_type() const override { return HWLOC; }
160 };
161 #endif /* KMP_USE_HWLOC */
162 
163 #if KMP_OS_LINUX || KMP_OS_FREEBSD
164 #if KMP_OS_LINUX
165 /* On some of the older OS's that we build on, these constants aren't present
166    in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
167    all systems of the same arch where they are defined, and they cannot
168    change; they are set in stone forever. */
169 #include <sys/syscall.h>
170 #if KMP_ARCH_X86 || KMP_ARCH_ARM
171 #ifndef __NR_sched_setaffinity
172 #define __NR_sched_setaffinity 241
173 #elif __NR_sched_setaffinity != 241
174 #error Wrong code for setaffinity system call.
175 #endif /* __NR_sched_setaffinity */
176 #ifndef __NR_sched_getaffinity
177 #define __NR_sched_getaffinity 242
178 #elif __NR_sched_getaffinity != 242
179 #error Wrong code for getaffinity system call.
180 #endif /* __NR_sched_getaffinity */
181 #elif KMP_ARCH_AARCH64
182 #ifndef __NR_sched_setaffinity
183 #define __NR_sched_setaffinity 122
184 #elif __NR_sched_setaffinity != 122
185 #error Wrong code for setaffinity system call.
186 #endif /* __NR_sched_setaffinity */
187 #ifndef __NR_sched_getaffinity
188 #define __NR_sched_getaffinity 123
189 #elif __NR_sched_getaffinity != 123
190 #error Wrong code for getaffinity system call.
191 #endif /* __NR_sched_getaffinity */
192 #elif KMP_ARCH_X86_64
193 #ifndef __NR_sched_setaffinity
194 #define __NR_sched_setaffinity 203
195 #elif __NR_sched_setaffinity != 203
196 #error Wrong code for setaffinity system call.
197 #endif /* __NR_sched_setaffinity */
198 #ifndef __NR_sched_getaffinity
199 #define __NR_sched_getaffinity 204
200 #elif __NR_sched_getaffinity != 204
201 #error Wrong code for getaffinity system call.
202 #endif /* __NR_sched_getaffinity */
203 #elif KMP_ARCH_PPC64
204 #ifndef __NR_sched_setaffinity
205 #define __NR_sched_setaffinity 222
206 #elif __NR_sched_setaffinity != 222
207 #error Wrong code for setaffinity system call.
208 #endif /* __NR_sched_setaffinity */
209 #ifndef __NR_sched_getaffinity
210 #define __NR_sched_getaffinity 223
211 #elif __NR_sched_getaffinity != 223
212 #error Wrong code for getaffinity system call.
213 #endif /* __NR_sched_getaffinity */
214 #elif KMP_ARCH_MIPS
215 #ifndef __NR_sched_setaffinity
216 #define __NR_sched_setaffinity 4239
217 #elif __NR_sched_setaffinity != 4239
218 #error Wrong code for setaffinity system call.
219 #endif /* __NR_sched_setaffinity */
220 #ifndef __NR_sched_getaffinity
221 #define __NR_sched_getaffinity 4240
222 #elif __NR_sched_getaffinity != 4240
223 #error Wrong code for getaffinity system call.
224 #endif /* __NR_sched_getaffinity */
225 #elif KMP_ARCH_MIPS64
226 #ifndef __NR_sched_setaffinity
227 #define __NR_sched_setaffinity 5195
228 #elif __NR_sched_setaffinity != 5195
229 #error Wrong code for setaffinity system call.
230 #endif /* __NR_sched_setaffinity */
231 #ifndef __NR_sched_getaffinity
232 #define __NR_sched_getaffinity 5196
233 #elif __NR_sched_getaffinity != 5196
234 #error Wrong code for getaffinity system call.
235 #endif /* __NR_sched_getaffinity */
#else
236 #error Unknown or unsupported architecture
237 #endif /* KMP_ARCH_* */
238 #elif KMP_OS_FREEBSD
239 #include <pthread.h>
240 #include <pthread_np.h>
241 #endif
242 class KMPNativeAffinity : public KMPAffinity {
243   class Mask : public KMPAffinity::Mask {
244     typedef unsigned char mask_t;
245     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
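    // With mask_t being unsigned char, BITS_PER_MASK_T is 8, so for example
    // logical CPU 11 maps to mask[11 / 8] == mask[1], bit 11 % 8 == 3.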
246 
247   public:
248     mask_t *mask;
249     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
250     ~Mask() {
251       if (mask)
252         __kmp_free(mask);
253     }
254     void set(int i) override {
255       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
256     }
257     bool is_set(int i) const override {
258       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
259     }
260     void clear(int i) override {
261       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
262     }
263     void zero() override {
264       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
265         mask[i] = 0;
266     }
267     void copy(const KMPAffinity::Mask *src) override {
268       const Mask *convert = static_cast<const Mask *>(src);
269       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
270         mask[i] = convert->mask[i];
271     }
272     void bitwise_and(const KMPAffinity::Mask *rhs) override {
273       const Mask *convert = static_cast<const Mask *>(rhs);
274       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
275         mask[i] &= convert->mask[i];
276     }
277     void bitwise_or(const KMPAffinity::Mask *rhs) override {
278       const Mask *convert = static_cast<const Mask *>(rhs);
279       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
280         mask[i] |= convert->mask[i];
281     }
282     void bitwise_not() override {
283       for (size_t i = 0; i < __kmp_affin_mask_size; ++i)
284         mask[i] = ~(mask[i]);
285     }
286     int begin() const override {
287       int retval = 0;
288       while (retval < end() && !is_set(retval))
289         ++retval;
290       return retval;
291     }
292     int end() const override { return __kmp_affin_mask_size * BITS_PER_MASK_T; }
293     int next(int previous) const override {
294       int retval = previous + 1;
295       while (retval < end() && !is_set(retval))
296         ++retval;
297       return retval;
298     }
299     int get_system_affinity(bool abort_on_error) override {
300       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
301                   "Illegal get affinity operation when not capable");
302 #if KMP_OS_LINUX
303       int retval =
304           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
305 #elif KMP_OS_FREEBSD
306       int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
307                                      reinterpret_cast<cpuset_t *>(mask));
308       int retval = (r == 0 ? 0 : -1);
309 #endif
310       if (retval >= 0) {
311         return 0;
312       }
313       int error = errno;
314       if (abort_on_error) {
315         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
316       }
317       return error;
318     }
319     int set_system_affinity(bool abort_on_error) const override {
320       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
321                   "Illegal set affinity operation when not capable");
322 #if KMP_OS_LINUX
323       int retval =
324           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
325 #elif KMP_OS_FREEBSD
326       int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
327                                      reinterpret_cast<cpuset_t *>(mask));
328       int retval = (r == 0 ? 0 : -1);
329 #endif
330       if (retval >= 0) {
331         return 0;
332       }
333       int error = errno;
334       if (abort_on_error) {
335         __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
336       }
337       return error;
338     }
339   };
340   void determine_capable(const char *env_var) override {
341     __kmp_affinity_determine_capable(env_var);
342   }
343   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
344   KMPAffinity::Mask *allocate_mask() override {
345     KMPNativeAffinity::Mask *retval = new Mask();
346     return retval;
347   }
348   void deallocate_mask(KMPAffinity::Mask *m) override {
349     KMPNativeAffinity::Mask *native_mask =
350         static_cast<KMPNativeAffinity::Mask *>(m);
351     delete native_mask;
352   }
353   KMPAffinity::Mask *allocate_mask_array(int num) override {
354     return new Mask[num];
355   }
356   void deallocate_mask_array(KMPAffinity::Mask *array) override {
357     Mask *linux_array = static_cast<Mask *>(array);
358     delete[] linux_array;
359   }
360   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
361                                       int index) override {
362     Mask *linux_array = static_cast<Mask *>(array);
363     return &(linux_array[index]);
364   }
365   api_type get_api_type() const override { return NATIVE_OS; }
366 };
367 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
368 
369 #if KMP_OS_WINDOWS
370 class KMPNativeAffinity : public KMPAffinity {
371   class Mask : public KMPAffinity::Mask {
372     typedef ULONG_PTR mask_t;
373     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
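    // One mask_t (ULONG_PTR) is kept per Windows processor group; a group
    // contains at most 64 logical processors, matching the 64-bit mask_t on
    // 64-bit Windows.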
374     mask_t *mask;
375 
376   public:
377     Mask() {
378       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
379     }
380     ~Mask() {
381       if (mask)
382         __kmp_free(mask);
383     }
384     void set(int i) override {
385       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
386     }
387     bool is_set(int i) const override {
388       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
389     }
390     void clear(int i) override {
391       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
392     }
393     void zero() override {
394       for (int i = 0; i < __kmp_num_proc_groups; ++i)
395         mask[i] = 0;
396     }
397     void copy(const KMPAffinity::Mask *src) override {
398       const Mask *convert = static_cast<const Mask *>(src);
399       for (int i = 0; i < __kmp_num_proc_groups; ++i)
400         mask[i] = convert->mask[i];
401     }
402     void bitwise_and(const KMPAffinity::Mask *rhs) override {
403       const Mask *convert = static_cast<const Mask *>(rhs);
404       for (int i = 0; i < __kmp_num_proc_groups; ++i)
405         mask[i] &= convert->mask[i];
406     }
407     void bitwise_or(const KMPAffinity::Mask *rhs) override {
408       const Mask *convert = static_cast<const Mask *>(rhs);
409       for (int i = 0; i < __kmp_num_proc_groups; ++i)
410         mask[i] |= convert->mask[i];
411     }
412     void bitwise_not() override {
413       for (int i = 0; i < __kmp_num_proc_groups; ++i)
414         mask[i] = ~(mask[i]);
415     }
416     int begin() const override {
417       int retval = 0;
418       while (retval < end() && !is_set(retval))
419         ++retval;
420       return retval;
421     }
422     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
423     int next(int previous) const override {
424       int retval = previous + 1;
425       while (retval < end() && !is_set(retval))
426         ++retval;
427       return retval;
428     }
429     int set_system_affinity(bool abort_on_error) const override {
430       if (__kmp_num_proc_groups > 1) {
431         // Check for a valid mask.
432         GROUP_AFFINITY ga;
433         int group = get_proc_group();
434         if (group < 0) {
435           if (abort_on_error) {
436             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
437           }
438           return -1;
439         }
440         // Transform the bit vector into a GROUP_AFFINITY struct
441         // and make the system call to set affinity.
442         ga.Group = group;
443         ga.Mask = mask[group];
444         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
445 
446         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
447         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
448           DWORD error = GetLastError();
449           if (abort_on_error) {
450             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
451                         __kmp_msg_null);
452           }
453           return error;
454         }
455       } else {
456         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
457           DWORD error = GetLastError();
458           if (abort_on_error) {
459             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
460                         __kmp_msg_null);
461           }
462           return error;
463         }
464       }
465       return 0;
466     }
467     int get_system_affinity(bool abort_on_error) override {
468       if (__kmp_num_proc_groups > 1) {
469         this->zero();
470         GROUP_AFFINITY ga;
471         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
472         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
473           DWORD error = GetLastError();
474           if (abort_on_error) {
475             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
476                         KMP_ERR(error), __kmp_msg_null);
477           }
478           return error;
479         }
480         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
481             (ga.Mask == 0)) {
482           return -1;
483         }
484         mask[ga.Group] = ga.Mask;
485       } else {
486         mask_t newMask, sysMask, retval;
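        // SetThreadAffinityMask() returns the thread's previous affinity
        // mask, so the current mask is read by temporarily rebinding the
        // thread to the process-wide mask and then restoring it below.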
487         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
488           DWORD error = GetLastError();
489           if (abort_on_error) {
490             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
491                         KMP_ERR(error), __kmp_msg_null);
492           }
493           return error;
494         }
495         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
496         if (!retval) {
497           DWORD error = GetLastError();
498           if (abort_on_error) {
499             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
500                         KMP_ERR(error), __kmp_msg_null);
501           }
502           return error;
503         }
504         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
505         if (!newMask) {
506           DWORD error = GetLastError();
507           if (abort_on_error) {
508             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
509                         KMP_ERR(error), __kmp_msg_null);
510           }
511         }
512         *mask = retval;
513       }
514       return 0;
515     }
516     int get_proc_group() const override {
517       int group = -1;
518       if (__kmp_num_proc_groups == 1) {
519         return 1;
520       }
521       for (int i = 0; i < __kmp_num_proc_groups; i++) {
522         if (mask[i] == 0)
523           continue;
524         if (group >= 0)
525           return -1;
526         group = i;
527       }
528       return group;
529     }
530   };
531   void determine_capable(const char *env_var) override {
532     __kmp_affinity_determine_capable(env_var);
533   }
534   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
535   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
536   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
537   KMPAffinity::Mask *allocate_mask_array(int num) override {
538     return new Mask[num];
539   }
540   void deallocate_mask_array(KMPAffinity::Mask *array) override {
541     Mask *windows_array = static_cast<Mask *>(array);
542     delete[] windows_array;
543   }
544   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
545                                       int index) override {
546     Mask *windows_array = static_cast<Mask *>(array);
547     return &(windows_array[index]);
548   }
549   api_type get_api_type() const override { return NATIVE_OS; }
550 };
551 #endif /* KMP_OS_WINDOWS */
552 #endif /* KMP_AFFINITY_SUPPORTED */
553 
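/* An Address records one hardware thread's position in the machine topology
   as a path of labels from the outermost level down to the thread itself;
   childNums[i] is the ordinal of the thread's ancestor among its siblings at
   level i. */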
554 class Address {
555 public:
556   static const unsigned maxDepth = 32;
557   unsigned labels[maxDepth];
558   unsigned childNums[maxDepth];
559   unsigned depth;
560   unsigned leader;
561   Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
562   Address &operator=(const Address &b) {
563     depth = b.depth;
564     for (unsigned i = 0; i < depth; i++) {
565       labels[i] = b.labels[i];
566       childNums[i] = b.childNums[i];
567     }
568     leader = FALSE;
569     return *this;
570   }
571   bool operator==(const Address &b) const {
572     if (depth != b.depth)
573       return false;
574     for (unsigned i = 0; i < depth; i++)
575       if (labels[i] != b.labels[i])
576         return false;
577     return true;
578   }
579   bool isClose(const Address &b, int level) const {
580     if (depth != b.depth)
581       return false;
582     if ((unsigned)level >= depth)
583       return true;
584     for (unsigned i = 0; i < (depth - level); i++)
585       if (labels[i] != b.labels[i])
586         return false;
587     return true;
588   }
589   bool operator!=(const Address &b) const { return !operator==(b); }
590   void print() const {
591     unsigned i;
592     printf("Depth: %u --- ", depth);
593     for (i = 0; i < depth; i++) {
594       printf("%u ", labels[i]);
595     }
596   }
597 };
598 
599 class AddrUnsPair {
600 public:
601   Address first;
602   unsigned second;
603   AddrUnsPair(Address _first, unsigned _second)
604       : first(_first), second(_second) {}
605   AddrUnsPair &operator=(const AddrUnsPair &b) {
606     first = b.first;
607     second = b.second;
608     return *this;
609   }
610   void print() const {
611     printf("first = ");
612     first.print();
613     printf(" --- second = %u", second);
614   }
615   bool operator==(const AddrUnsPair &b) const {
616     if (first != b.first)
617       return false;
618     if (second != b.second)
619       return false;
620     return true;
621   }
622   bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
623 };
624 
625 static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
626   const Address *aa = &(((const AddrUnsPair *)a)->first);
627   const Address *bb = &(((const AddrUnsPair *)b)->first);
628   unsigned depth = aa->depth;
629   unsigned i;
630   KMP_DEBUG_ASSERT(depth == bb->depth);
631   for (i = 0; i < depth; i++) {
632     if (aa->labels[i] < bb->labels[i])
633       return -1;
634     if (aa->labels[i] > bb->labels[i])
635       return 1;
636   }
637   return 0;
638 }
639 
640 /* A structure for holding machine-specific hierarchy info to be computed once
641    at init. This structure represents a mapping of threads to the actual machine
642    hierarchy, or to our best guess at what the hierarchy might be, for the
643    purpose of performing an efficient barrier. In the worst case, when there is
644    no machine hierarchy information, it produces a tree suitable for a barrier,
645    similar to the tree used in the hyper barrier. */
646 class hierarchy_info {
647 public:
648   /* Good default values for number of leaves and branching factor, given no
649      affinity information. Behaves a bit like hyper barrier. */
650   static const kmp_uint32 maxLeaves = 4;
651   static const kmp_uint32 minBranch = 4;
652   /** Number of levels in the hierarchy. Typical levels are threads/core,
653       cores/package or socket, packages/node, nodes/machine, etc. We don't want
654       to get specific with nomenclature. When the machine is oversubscribed we
655       add levels to duplicate the hierarchy, doubling the thread capacity of the
656       hierarchy each time we add a level. */
657   kmp_uint32 maxLevels;
658 
659   /** This is specifically the depth of the machine configuration hierarchy, in
660       terms of the number of levels along the longest path from root to any
661       leaf. It corresponds to the number of entries in numPerLevel if we exclude
662       all but one trailing 1. */
663   kmp_uint32 depth;
664   kmp_uint32 base_num_threads;
665   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
666   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
667   // 2=initialization in progress
668   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
669 
670   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
671       the parent of a node at level i has. For example, if we have a machine
672       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
673       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
674   kmp_uint32 *numPerLevel;
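  /** skipPerLevel[i] is the number of leaves spanned by a single node at
      level i, i.e. the running product numPerLevel[0] * ... *
      numPerLevel[i-1], with skipPerLevel[0] == 1. */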
675   kmp_uint32 *skipPerLevel;
676 
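  // Derive numPerLevel[] from the address table: walking from the innermost
  // hierarchy level outward, the fan-out of a level is one more than the
  // largest child ordinal observed at that level.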
677   void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
678     int hier_depth = adr2os[0].first.depth;
679     int level = 0;
680     for (int i = hier_depth - 1; i >= 0; --i) {
681       int max = -1;
682       for (int j = 0; j < num_addrs; ++j) {
683         int next = adr2os[j].first.childNums[i];
684         if (next > max)
685           max = next;
686       }
687       numPerLevel[level] = max + 1;
688       ++level;
689     }
690   }
691 
692   hierarchy_info()
693       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
694 
695   void fini() {
696     if (!uninitialized && numPerLevel) {
697       __kmp_free(numPerLevel);
698       numPerLevel = NULL;
699       uninitialized = not_initialized;
700     }
701   }
702 
703   void init(AddrUnsPair *adr2os, int num_addrs) {
704     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
705         &uninitialized, not_initialized, initializing);
706     if (bool_result == 0) { // Wait for initialization
707       while (TCR_1(uninitialized) != initialized)
708         KMP_CPU_PAUSE();
709       return;
710     }
711     KMP_DEBUG_ASSERT(bool_result == 1);
712 
713     /* Added explicit initialization of the data fields here to prevent usage of
714        dirty value observed when static library is re-initialized multiple times
715        (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
716        OpenMP). */
717     depth = 1;
718     resizing = 0;
719     maxLevels = 7;
720     numPerLevel =
721         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
722     skipPerLevel = &(numPerLevel[maxLevels]);
723     for (kmp_uint32 i = 0; i < maxLevels;
724          ++i) { // init numPerLevel[*] to 1 item per level
725       numPerLevel[i] = 1;
726       skipPerLevel[i] = 1;
727     }
728 
729     // Sort table by physical ID
730     if (adr2os) {
731       qsort(adr2os, num_addrs, sizeof(*adr2os),
732             __kmp_affinity_cmp_Address_labels);
733       deriveLevels(adr2os, num_addrs);
734     } else {
735       numPerLevel[0] = maxLeaves;
736       numPerLevel[1] = num_addrs / maxLeaves;
737       if (num_addrs % maxLeaves)
738         numPerLevel[1]++;
739     }
740 
741     base_num_threads = num_addrs;
742     for (int i = maxLevels - 1; i >= 0;
743          --i) // count non-empty levels to get depth
744       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
745         depth++;
746 
747     kmp_uint32 branch = minBranch;
748     if (numPerLevel[0] == 1)
749       branch = num_addrs / maxLeaves;
750     if (branch < minBranch)
751       branch = minBranch;
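    // Narrow any level wider than the branching factor: halve its fan-out
    // (rounding odd counts up) and double the fan-out of the level above, so
    // the total thread capacity of the tree is preserved.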
752     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
753       while (numPerLevel[d] > branch ||
754              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
755         if (numPerLevel[d] & 1)
756           numPerLevel[d]++;
757         numPerLevel[d] = numPerLevel[d] >> 1;
758         if (numPerLevel[d + 1] == 1)
759           depth++;
760         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
761       }
762       if (numPerLevel[0] == 1) {
763         branch = branch >> 1;
764         if (branch < 4)
765           branch = minBranch;
766       }
767     }
768 
769     for (kmp_uint32 i = 1; i < depth; ++i)
770       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
771     // Fill in hierarchy in the case of oversubscription
772     for (kmp_uint32 i = depth; i < maxLevels; ++i)
773       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
774 
775     uninitialized = initialized; // One writer
776   }
777 
778   // Resize the hierarchy if nproc changes to something larger than before
779   void resize(kmp_uint32 nproc) {
780     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
781     while (bool_result == 0) { // someone else is trying to resize
782       KMP_CPU_PAUSE();
783       if (nproc <= base_num_threads) // happy with other thread's resize
784         return;
785       else // try to resize
786         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
787     }
788     KMP_DEBUG_ASSERT(bool_result != 0);
789     if (nproc <= base_num_threads)
790       return; // happy with other thread's resize
791 
792     // Calculate new maxLevels
793     kmp_uint32 old_sz = skipPerLevel[depth - 1];
794     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
795     // First see if old maxLevels is enough to contain new size
796     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
797       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
798       numPerLevel[i - 1] *= 2;
799       old_sz *= 2;
800       depth++;
801     }
802     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
803       while (nproc > old_sz) {
804         old_sz *= 2;
805         incs++;
806         depth++;
807       }
808       maxLevels += incs;
809 
810       // Resize arrays
811       kmp_uint32 *old_numPerLevel = numPerLevel;
812       kmp_uint32 *old_skipPerLevel = skipPerLevel;
813       numPerLevel = skipPerLevel = NULL;
814       numPerLevel =
815           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
816       skipPerLevel = &(numPerLevel[maxLevels]);
817 
818       // Copy old elements from old arrays
819       for (kmp_uint32 i = 0; i < old_maxLevels;
820            ++i) { // init numPerLevel[*] to 1 item per level
821         numPerLevel[i] = old_numPerLevel[i];
822         skipPerLevel[i] = old_skipPerLevel[i];
823       }
824 
825       // Init new elements in arrays to 1
826       for (kmp_uint32 i = old_maxLevels; i < maxLevels;
827            ++i) { // init numPerLevel[*] to 1 item per level
828         numPerLevel[i] = 1;
829         skipPerLevel[i] = 1;
830       }
831 
832       // Free old arrays
833       __kmp_free(old_numPerLevel);
834     }
835 
836     // Fill in oversubscription levels of hierarchy
837     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
838       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
839 
840     base_num_threads = nproc;
841     resizing = 0; // One writer
842   }
843 };
844 #endif // KMP_AFFINITY_H
845