xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h (revision 53120fbb68952b7d620c2c0e1cf05c5017fc1b27)
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_USE_HWLOC
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24   class Mask : public KMPAffinity::Mask {
25     hwloc_cpuset_t mask;
26 
27   public:
28     Mask() {
29       mask = hwloc_bitmap_alloc();
30       this->zero();
31     }
32     ~Mask() { hwloc_bitmap_free(mask); }
33     void set(int i) override { hwloc_bitmap_set(mask, i); }
34     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36     void zero() override { hwloc_bitmap_zero(mask); }
37     bool empty() const override { return hwloc_bitmap_iszero(mask); }
38     void copy(const KMPAffinity::Mask *src) override {
39       const Mask *convert = static_cast<const Mask *>(src);
40       hwloc_bitmap_copy(mask, convert->mask);
41     }
42     void bitwise_and(const KMPAffinity::Mask *rhs) override {
43       const Mask *convert = static_cast<const Mask *>(rhs);
44       hwloc_bitmap_and(mask, mask, convert->mask);
45     }
46     void bitwise_or(const KMPAffinity::Mask *rhs) override {
47       const Mask *convert = static_cast<const Mask *>(rhs);
48       hwloc_bitmap_or(mask, mask, convert->mask);
49     }
50     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
51     bool is_equal(const KMPAffinity::Mask *rhs) const override {
52       const Mask *convert = static_cast<const Mask *>(rhs);
53       return hwloc_bitmap_isequal(mask, convert->mask);
54     }
55     int begin() const override { return hwloc_bitmap_first(mask); }
56     int end() const override { return -1; }
57     int next(int previous) const override {
58       return hwloc_bitmap_next(mask, previous);
59     }
60     int get_system_affinity(bool abort_on_error) override {
61       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
62                   "Illegal get affinity operation when not capable");
63       long retval =
64           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
65       if (retval >= 0) {
66         return 0;
67       }
68       int error = errno;
69       if (abort_on_error) {
70         __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
71                     KMP_ERR(error), __kmp_msg_null);
72       }
73       return error;
74     }
75     int set_system_affinity(bool abort_on_error) const override {
76       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
77                   "Illegal set affinity operation when not capable");
78       long retval =
79           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
80       if (retval >= 0) {
81         return 0;
82       }
83       int error = errno;
84       if (abort_on_error) {
85         __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
86                     KMP_ERR(error), __kmp_msg_null);
87       }
88       return error;
89     }
90 #if KMP_OS_WINDOWS
91     int set_process_affinity(bool abort_on_error) const override {
92       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
93                   "Illegal set process affinity operation when not capable");
94       int error = 0;
95       const hwloc_topology_support *support =
96           hwloc_topology_get_support(__kmp_hwloc_topology);
97       if (support->cpubind->set_proc_cpubind) {
98         int retval;
99         retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
100                                    HWLOC_CPUBIND_PROCESS);
101         if (retval >= 0)
102           return 0;
103         error = errno;
104         if (abort_on_error)
105           __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
106                       KMP_ERR(error), __kmp_msg_null);
107       }
108       return error;
109     }
110 #endif
111     int get_proc_group() const override {
112       int group = -1;
113 #if KMP_OS_WINDOWS
114       if (__kmp_num_proc_groups == 1) {
115         return 1;
116       }
117       for (int i = 0; i < __kmp_num_proc_groups; i++) {
118         // On windows, the long type is always 32 bits
119         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
120         unsigned long second_32_bits =
121             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
122         if (first_32_bits == 0 && second_32_bits == 0) {
123           continue;
124         }
125         if (group >= 0) {
126           return -1;
127         }
128         group = i;
129       }
130 #endif /* KMP_OS_WINDOWS */
131       return group;
132     }
133   };
134   void determine_capable(const char *var) override {
135     const hwloc_topology_support *topology_support;
136     if (__kmp_hwloc_topology == NULL) {
137       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
138         __kmp_hwloc_error = TRUE;
139         if (__kmp_affinity.flags.verbose) {
140           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
141         }
142       }
143       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
144         __kmp_hwloc_error = TRUE;
145         if (__kmp_affinity.flags.verbose) {
146           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
147         }
148       }
149     }
150     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
151     // Is the system capable of setting/getting this thread's affinity?
152     // Also, is topology discovery possible? (pu indicates ability to discover
153     // processing units). And finally, were there no errors when calling any
154     // hwloc_* API functions?
155     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
156         topology_support->cpubind->get_thisthread_cpubind &&
157         topology_support->discovery->pu && !__kmp_hwloc_error) {
158       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
159       KMP_AFFINITY_ENABLE(TRUE);
160     } else {
161       // indicate that hwloc didn't work and disable affinity
162       __kmp_hwloc_error = TRUE;
163       KMP_AFFINITY_DISABLE();
164     }
165   }
166   void bind_thread(int which) override {
167     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
168                 "Illegal set affinity operation when not capable");
169     KMPAffinity::Mask *mask;
170     KMP_CPU_ALLOC_ON_STACK(mask);
171     KMP_CPU_ZERO(mask);
172     KMP_CPU_SET(which, mask);
173     __kmp_set_system_affinity(mask, TRUE);
174     KMP_CPU_FREE_FROM_STACK(mask);
175   }
176   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
177   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
178   KMPAffinity::Mask *allocate_mask_array(int num) override {
179     return new Mask[num];
180   }
181   void deallocate_mask_array(KMPAffinity::Mask *array) override {
182     Mask *hwloc_array = static_cast<Mask *>(array);
183     delete[] hwloc_array;
184   }
185   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
186                                       int index) override {
187     Mask *hwloc_array = static_cast<Mask *>(array);
188     return &(hwloc_array[index]);
189   }
190   api_type get_api_type() const override { return HWLOC; }
191 };
192 #endif /* KMP_USE_HWLOC */
193 
194 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX
195 #if KMP_OS_LINUX
196 /* On some of the older OS's that we build on, these constants aren't present
197    in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
198    all systems of the same arch where they are defined, and they cannot change:
199    they are set in stone forever. */
200 #include <sys/syscall.h>
201 #if KMP_ARCH_X86 || KMP_ARCH_ARM
202 #ifndef __NR_sched_setaffinity
203 #define __NR_sched_setaffinity 241
204 #elif __NR_sched_setaffinity != 241
205 #error Wrong code for setaffinity system call.
206 #endif /* __NR_sched_setaffinity */
207 #ifndef __NR_sched_getaffinity
208 #define __NR_sched_getaffinity 242
209 #elif __NR_sched_getaffinity != 242
210 #error Wrong code for getaffinity system call.
211 #endif /* __NR_sched_getaffinity */
212 #elif KMP_ARCH_AARCH64
213 #ifndef __NR_sched_setaffinity
214 #define __NR_sched_setaffinity 122
215 #elif __NR_sched_setaffinity != 122
216 #error Wrong code for setaffinity system call.
217 #endif /* __NR_sched_setaffinity */
218 #ifndef __NR_sched_getaffinity
219 #define __NR_sched_getaffinity 123
220 #elif __NR_sched_getaffinity != 123
221 #error Wrong code for getaffinity system call.
222 #endif /* __NR_sched_getaffinity */
223 #elif KMP_ARCH_X86_64
224 #ifndef __NR_sched_setaffinity
225 #define __NR_sched_setaffinity 203
226 #elif __NR_sched_setaffinity != 203
227 #error Wrong code for setaffinity system call.
228 #endif /* __NR_sched_setaffinity */
229 #ifndef __NR_sched_getaffinity
230 #define __NR_sched_getaffinity 204
231 #elif __NR_sched_getaffinity != 204
232 #error Wrong code for getaffinity system call.
233 #endif /* __NR_sched_getaffinity */
234 #elif KMP_ARCH_PPC64
235 #ifndef __NR_sched_setaffinity
236 #define __NR_sched_setaffinity 222
237 #elif __NR_sched_setaffinity != 222
238 #error Wrong code for setaffinity system call.
239 #endif /* __NR_sched_setaffinity */
240 #ifndef __NR_sched_getaffinity
241 #define __NR_sched_getaffinity 223
242 #elif __NR_sched_getaffinity != 223
243 #error Wrong code for getaffinity system call.
244 #endif /* __NR_sched_getaffinity */
245 #elif KMP_ARCH_MIPS
246 #ifndef __NR_sched_setaffinity
247 #define __NR_sched_setaffinity 4239
248 #elif __NR_sched_setaffinity != 4239
249 #error Wrong code for setaffinity system call.
250 #endif /* __NR_sched_setaffinity */
251 #ifndef __NR_sched_getaffinity
252 #define __NR_sched_getaffinity 4240
253 #elif __NR_sched_getaffinity != 4240
254 #error Wrong code for getaffinity system call.
255 #endif /* __NR_sched_getaffinity */
256 #elif KMP_ARCH_MIPS64
257 #ifndef __NR_sched_setaffinity
258 #define __NR_sched_setaffinity 5195
259 #elif __NR_sched_setaffinity != 5195
260 #error Wrong code for setaffinity system call.
261 #endif /* __NR_sched_setaffinity */
262 #ifndef __NR_sched_getaffinity
263 #define __NR_sched_getaffinity 5196
264 #elif __NR_sched_getaffinity != 5196
265 #error Wrong code for getaffinity system call.
266 #endif /* __NR_sched_getaffinity */
267 #elif KMP_ARCH_LOONGARCH64
268 #ifndef __NR_sched_setaffinity
269 #define __NR_sched_setaffinity 122
270 #elif __NR_sched_setaffinity != 122
271 #error Wrong code for setaffinity system call.
272 #endif /* __NR_sched_setaffinity */
273 #ifndef __NR_sched_getaffinity
274 #define __NR_sched_getaffinity 123
275 #elif __NR_sched_getaffinity != 123
276 #error Wrong code for getaffinity system call.
277 #endif /* __NR_sched_getaffinity */
278 #elif KMP_ARCH_RISCV64
279 #ifndef __NR_sched_setaffinity
280 #define __NR_sched_setaffinity 122
281 #elif __NR_sched_setaffinity != 122
282 #error Wrong code for setaffinity system call.
283 #endif /* __NR_sched_setaffinity */
284 #ifndef __NR_sched_getaffinity
285 #define __NR_sched_getaffinity 123
286 #elif __NR_sched_getaffinity != 123
287 #error Wrong code for getaffinity system call.
288 #endif /* __NR_sched_getaffinity */
289 #elif KMP_ARCH_VE
290 #ifndef __NR_sched_setaffinity
291 #define __NR_sched_setaffinity 203
292 #elif __NR_sched_setaffinity != 203
293 #error Wrong code for setaffinity system call.
294 #endif /* __NR_sched_setaffinity */
295 #ifndef __NR_sched_getaffinity
296 #define __NR_sched_getaffinity 204
297 #elif __NR_sched_getaffinity != 204
298 #error Wrong code for getaffinity system call.
299 #endif /* __NR_sched_getaffinity */
300 #elif KMP_ARCH_S390X
301 #ifndef __NR_sched_setaffinity
302 #define __NR_sched_setaffinity 239
303 #elif __NR_sched_setaffinity != 239
304 #error Wrong code for setaffinity system call.
305 #endif /* __NR_sched_setaffinity */
306 #ifndef __NR_sched_getaffinity
307 #define __NR_sched_getaffinity 240
308 #elif __NR_sched_getaffinity != 240
309 #error Wrong code for getaffinity system call.
310 #endif /* __NR_sched_getaffinity */
311 #else
312 #error Unknown or unsupported architecture
313 #endif /* KMP_ARCH_* */
314 #elif KMP_OS_FREEBSD
315 #include <pthread.h>
316 #include <pthread_np.h>
317 #elif KMP_OS_AIX
318 #include <sys/dr.h>
319 #include <sys/rset.h>
320 #define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
321 #endif
322 class KMPNativeAffinity : public KMPAffinity {
323   class Mask : public KMPAffinity::Mask {
324     typedef unsigned long mask_t;
325     typedef decltype(__kmp_affin_mask_size) mask_size_type;
326     static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
327     static const mask_t ONE = 1;
328     mask_size_type get_num_mask_types() const {
329       return __kmp_affin_mask_size / sizeof(mask_t);
330     }
331 
332   public:
333     mask_t *mask;
334     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
335     ~Mask() {
336       if (mask)
337         __kmp_free(mask);
338     }
339     void set(int i) override {
340       mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
341     }
342     bool is_set(int i) const override {
343       return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
344     }
345     void clear(int i) override {
346       mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
347     }
348     void zero() override {
349       mask_size_type e = get_num_mask_types();
350       for (mask_size_type i = 0; i < e; ++i)
351         mask[i] = (mask_t)0;
352     }
353     bool empty() const override {
354       mask_size_type e = get_num_mask_types();
355       for (mask_size_type i = 0; i < e; ++i)
356         if (mask[i] != (mask_t)0)
357           return false;
358       return true;
359     }
360     void copy(const KMPAffinity::Mask *src) override {
361       const Mask *convert = static_cast<const Mask *>(src);
362       mask_size_type e = get_num_mask_types();
363       for (mask_size_type i = 0; i < e; ++i)
364         mask[i] = convert->mask[i];
365     }
366     void bitwise_and(const KMPAffinity::Mask *rhs) override {
367       const Mask *convert = static_cast<const Mask *>(rhs);
368       mask_size_type e = get_num_mask_types();
369       for (mask_size_type i = 0; i < e; ++i)
370         mask[i] &= convert->mask[i];
371     }
372     void bitwise_or(const KMPAffinity::Mask *rhs) override {
373       const Mask *convert = static_cast<const Mask *>(rhs);
374       mask_size_type e = get_num_mask_types();
375       for (mask_size_type i = 0; i < e; ++i)
376         mask[i] |= convert->mask[i];
377     }
378     void bitwise_not() override {
379       mask_size_type e = get_num_mask_types();
380       for (mask_size_type i = 0; i < e; ++i)
381         mask[i] = ~(mask[i]);
382     }
383     bool is_equal(const KMPAffinity::Mask *rhs) const override {
384       const Mask *convert = static_cast<const Mask *>(rhs);
385       mask_size_type e = get_num_mask_types();
386       for (mask_size_type i = 0; i < e; ++i)
387         if (mask[i] != convert->mask[i])
388           return false;
389       return true;
390     }
391     int begin() const override {
392       int retval = 0;
393       while (retval < end() && !is_set(retval))
394         ++retval;
395       return retval;
396     }
397     int end() const override {
398       int e;
399       __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
400       return e;
401     }
402     int next(int previous) const override {
403       int retval = previous + 1;
404       while (retval < end() && !is_set(retval))
405         ++retval;
406       return retval;
407     }
408 #if KMP_OS_AIX
409     // On AIX, we don't have a way to get CPU(s) a thread is bound to.
410     // This routine is only used to get the full mask.
411     int get_system_affinity(bool abort_on_error) override {
412       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
413                   "Illegal get affinity operation when not capable");
414 
415       (void)abort_on_error;
416 
417       // Set the mask with all CPUs that are available.
418       for (int i = 0; i < __kmp_xproc; ++i)
419         KMP_CPU_SET(i, this);
420       return 0;
421     }
422     int set_system_affinity(bool abort_on_error) const override {
423       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
424 
425                   "Illegal set affinity operation when not capable");
426 
427       int location;
428       int gtid = __kmp_entry_gtid();
429       int tid = thread_self();
430 
431       // Unbind the thread if it was bound to any processors before so that
432       // we can bind the thread to CPUs specified by the mask not others.
433       int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
434 
435       // On AIX, we can only bind to one instead of a set of CPUs with the
436       // bindprocessor() system call.
437       KMP_CPU_SET_ITERATE(location, this) {
438         if (KMP_CPU_ISSET(location, this)) {
439           retval = bindprocessor(BINDTHREAD, tid, location);
440           if (retval == -1 && errno == 1) {
441             rsid_t rsid;
442             rsethandle_t rsh;
443             // Put something in rsh to prevent compiler warning
444             // about uninitalized use
445             rsh = rs_alloc(RS_EMPTY);
446             rsid.at_pid = getpid();
447             if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
448               retval = ra_detachrset(R_PROCESS, rsid, 0);
449               retval = bindprocessor(BINDTHREAD, tid, location);
450             }
451           }
452           if (retval == 0) {
453             KA_TRACE(10, ("__kmp_set_system_affinity:  Done binding "
454                           "T#%d to cpu=%d.\n",
455                           gtid, location));
456             continue;
457           }
458           int error = errno;
459           if (abort_on_error) {
460             __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
461                         KMP_ERR(error), __kmp_msg_null);
462             KA_TRACE(10, ("__kmp_set_system_affinity:  Error binding "
463                           "T#%d to cpu=%d, errno=%d.\n",
464                           gtid, location, error));
465             return error;
466           }
467         }
468       }
469       return 0;
470     }
471 #else // !KMP_OS_AIX
472     int get_system_affinity(bool abort_on_error) override {
473       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
474                   "Illegal get affinity operation when not capable");
475 #if KMP_OS_LINUX
476       long retval =
477           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
478 #elif KMP_OS_FREEBSD
479       int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
480                                      reinterpret_cast<cpuset_t *>(mask));
481       int retval = (r == 0 ? 0 : -1);
482 #endif
483       if (retval >= 0) {
484         return 0;
485       }
486       int error = errno;
487       if (abort_on_error) {
488         __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
489                     KMP_ERR(error), __kmp_msg_null);
490       }
491       return error;
492     }
493     int set_system_affinity(bool abort_on_error) const override {
494       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
495                   "Illegal set affinity operation when not capable");
496 #if KMP_OS_LINUX
497       long retval =
498           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
499 #elif KMP_OS_FREEBSD
500       int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
501                                      reinterpret_cast<cpuset_t *>(mask));
502       int retval = (r == 0 ? 0 : -1);
503 #endif
504       if (retval >= 0) {
505         return 0;
506       }
507       int error = errno;
508       if (abort_on_error) {
509         __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
510                     KMP_ERR(error), __kmp_msg_null);
511       }
512       return error;
513     }
514 #endif // KMP_OS_AIX
515   };
516   void determine_capable(const char *env_var) override {
517     __kmp_affinity_determine_capable(env_var);
518   }
519   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
520   KMPAffinity::Mask *allocate_mask() override {
521     KMPNativeAffinity::Mask *retval = new Mask();
522     return retval;
523   }
524   void deallocate_mask(KMPAffinity::Mask *m) override {
525     KMPNativeAffinity::Mask *native_mask =
526         static_cast<KMPNativeAffinity::Mask *>(m);
527     delete native_mask;
528   }
529   KMPAffinity::Mask *allocate_mask_array(int num) override {
530     return new Mask[num];
531   }
532   void deallocate_mask_array(KMPAffinity::Mask *array) override {
533     Mask *linux_array = static_cast<Mask *>(array);
534     delete[] linux_array;
535   }
536   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
537                                       int index) override {
538     Mask *linux_array = static_cast<Mask *>(array);
539     return &(linux_array[index]);
540   }
541   api_type get_api_type() const override { return NATIVE_OS; }
542 };
543 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX */
544 
545 #if KMP_OS_WINDOWS
546 class KMPNativeAffinity : public KMPAffinity {
547   class Mask : public KMPAffinity::Mask {
548     typedef ULONG_PTR mask_t;
549     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
550     mask_t *mask;
551 
552   public:
553     Mask() {
554       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
555     }
556     ~Mask() {
557       if (mask)
558         __kmp_free(mask);
559     }
560     void set(int i) override {
561       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
562     }
563     bool is_set(int i) const override {
564       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
565     }
566     void clear(int i) override {
567       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
568     }
569     void zero() override {
570       for (int i = 0; i < __kmp_num_proc_groups; ++i)
571         mask[i] = 0;
572     }
573     bool empty() const override {
574       for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
575         if (mask[i])
576           return false;
577       return true;
578     }
579     void copy(const KMPAffinity::Mask *src) override {
580       const Mask *convert = static_cast<const Mask *>(src);
581       for (int i = 0; i < __kmp_num_proc_groups; ++i)
582         mask[i] = convert->mask[i];
583     }
584     void bitwise_and(const KMPAffinity::Mask *rhs) override {
585       const Mask *convert = static_cast<const Mask *>(rhs);
586       for (int i = 0; i < __kmp_num_proc_groups; ++i)
587         mask[i] &= convert->mask[i];
588     }
589     void bitwise_or(const KMPAffinity::Mask *rhs) override {
590       const Mask *convert = static_cast<const Mask *>(rhs);
591       for (int i = 0; i < __kmp_num_proc_groups; ++i)
592         mask[i] |= convert->mask[i];
593     }
594     void bitwise_not() override {
595       for (int i = 0; i < __kmp_num_proc_groups; ++i)
596         mask[i] = ~(mask[i]);
597     }
598     bool is_equal(const KMPAffinity::Mask *rhs) const override {
599       const Mask *convert = static_cast<const Mask *>(rhs);
600       for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
601         if (mask[i] != convert->mask[i])
602           return false;
603       return true;
604     }
605     int begin() const override {
606       int retval = 0;
607       while (retval < end() && !is_set(retval))
608         ++retval;
609       return retval;
610     }
611     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
612     int next(int previous) const override {
613       int retval = previous + 1;
614       while (retval < end() && !is_set(retval))
615         ++retval;
616       return retval;
617     }
618     int set_process_affinity(bool abort_on_error) const override {
619       if (__kmp_num_proc_groups <= 1) {
620         if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
621           DWORD error = GetLastError();
622           if (abort_on_error) {
623             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
624                         __kmp_msg_null);
625           }
626           return error;
627         }
628       }
629       return 0;
630     }
631     int set_system_affinity(bool abort_on_error) const override {
632       if (__kmp_num_proc_groups > 1) {
633         // Check for a valid mask.
634         GROUP_AFFINITY ga;
635         int group = get_proc_group();
636         if (group < 0) {
637           if (abort_on_error) {
638             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
639           }
640           return -1;
641         }
642         // Transform the bit vector into a GROUP_AFFINITY struct
643         // and make the system call to set affinity.
644         ga.Group = group;
645         ga.Mask = mask[group];
646         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
647 
648         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
649         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
650           DWORD error = GetLastError();
651           if (abort_on_error) {
652             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
653                         __kmp_msg_null);
654           }
655           return error;
656         }
657       } else {
658         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
659           DWORD error = GetLastError();
660           if (abort_on_error) {
661             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
662                         __kmp_msg_null);
663           }
664           return error;
665         }
666       }
667       return 0;
668     }
669     int get_system_affinity(bool abort_on_error) override {
670       if (__kmp_num_proc_groups > 1) {
671         this->zero();
672         GROUP_AFFINITY ga;
673         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
674         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
675           DWORD error = GetLastError();
676           if (abort_on_error) {
677             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
678                         KMP_ERR(error), __kmp_msg_null);
679           }
680           return error;
681         }
682         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
683             (ga.Mask == 0)) {
684           return -1;
685         }
686         mask[ga.Group] = ga.Mask;
687       } else {
688         mask_t newMask, sysMask, retval;
689         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
690           DWORD error = GetLastError();
691           if (abort_on_error) {
692             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
693                         KMP_ERR(error), __kmp_msg_null);
694           }
695           return error;
696         }
697         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
698         if (!retval) {
699           DWORD error = GetLastError();
700           if (abort_on_error) {
701             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
702                         KMP_ERR(error), __kmp_msg_null);
703           }
704           return error;
705         }
706         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
707         if (!newMask) {
708           DWORD error = GetLastError();
709           if (abort_on_error) {
710             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
711                         KMP_ERR(error), __kmp_msg_null);
712           }
713         }
714         *mask = retval;
715       }
716       return 0;
717     }
718     int get_proc_group() const override {
719       int group = -1;
720       if (__kmp_num_proc_groups == 1) {
721         return 1;
722       }
723       for (int i = 0; i < __kmp_num_proc_groups; i++) {
724         if (mask[i] == 0)
725           continue;
726         if (group >= 0)
727           return -1;
728         group = i;
729       }
730       return group;
731     }
732   };
733   void determine_capable(const char *env_var) override {
734     __kmp_affinity_determine_capable(env_var);
735   }
736   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
737   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
738   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
739   KMPAffinity::Mask *allocate_mask_array(int num) override {
740     return new Mask[num];
741   }
742   void deallocate_mask_array(KMPAffinity::Mask *array) override {
743     Mask *windows_array = static_cast<Mask *>(array);
744     delete[] windows_array;
745   }
746   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
747                                       int index) override {
748     Mask *windows_array = static_cast<Mask *>(array);
749     return &(windows_array[index]);
750   }
751   api_type get_api_type() const override { return NATIVE_OS; }
752 };
753 #endif /* KMP_OS_WINDOWS */
754 #endif /* KMP_AFFINITY_SUPPORTED */
755 
756 // Describe an attribute for a level in the machine topology
757 struct kmp_hw_attr_t {
758   int core_type : 8;
759   int core_eff : 8;
760   unsigned valid : 1;
761   unsigned reserved : 15;
762 
763   static const int UNKNOWN_CORE_EFF = -1;
764 
765   kmp_hw_attr_t()
766       : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
767         valid(0), reserved(0) {}
768   void set_core_type(kmp_hw_core_type_t type) {
769     valid = 1;
770     core_type = type;
771   }
772   void set_core_eff(int eff) {
773     valid = 1;
774     core_eff = eff;
775   }
776   kmp_hw_core_type_t get_core_type() const {
777     return (kmp_hw_core_type_t)core_type;
778   }
779   int get_core_eff() const { return core_eff; }
780   bool is_core_type_valid() const {
781     return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
782   }
783   bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
784   operator bool() const { return valid; }
785   void clear() {
786     core_type = KMP_HW_CORE_TYPE_UNKNOWN;
787     core_eff = UNKNOWN_CORE_EFF;
788     valid = 0;
789   }
790   bool contains(const kmp_hw_attr_t &other) const {
791     if (!valid && !other.valid)
792       return true;
793     if (valid && other.valid) {
794       if (other.is_core_type_valid()) {
795         if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
796           return false;
797       }
798       if (other.is_core_eff_valid()) {
799         if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
800           return false;
801       }
802       return true;
803     }
804     return false;
805   }
806 #if KMP_AFFINITY_SUPPORTED
807   bool contains(const kmp_affinity_attrs_t &attr) const {
808     if (!valid && !attr.valid)
809       return true;
810     if (valid && attr.valid) {
811       if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
812         return (is_core_type_valid() &&
813                 (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
814       if (attr.core_eff != UNKNOWN_CORE_EFF)
815         return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
816       return true;
817     }
818     return false;
819   }
820 #endif // KMP_AFFINITY_SUPPORTED
821   bool operator==(const kmp_hw_attr_t &rhs) const {
822     return (rhs.valid == valid && rhs.core_eff == core_eff &&
823             rhs.core_type == core_type);
824   }
825   bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
826 };
827 
828 #if KMP_AFFINITY_SUPPORTED
829 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
830 #endif
831 
832 class kmp_hw_thread_t {
833 public:
834   static const int UNKNOWN_ID = -1;
835   static const int MULTIPLE_ID = -2;
836   static int compare_ids(const void *a, const void *b);
837   static int compare_compact(const void *a, const void *b);
838   int ids[KMP_HW_LAST];
839   int sub_ids[KMP_HW_LAST];
840   bool leader;
841   int os_id;
842   kmp_hw_attr_t attrs;
843 
844   void print() const;
845   void clear() {
846     for (int i = 0; i < (int)KMP_HW_LAST; ++i)
847       ids[i] = UNKNOWN_ID;
848     leader = false;
849     attrs.clear();
850   }
851 };
852 
853 class kmp_topology_t {
854 
855   struct flags_t {
856     int uniform : 1;
857     int reserved : 31;
858   };
859 
860   int depth;
861 
862   // The following arrays are all 'depth' long and have been
863   // allocated to hold up to KMP_HW_LAST number of objects if
864   // needed so layers can be added without reallocation of any array
865 
866   // Orderd array of the types in the topology
867   kmp_hw_t *types;
868 
869   // Keep quick topology ratios, for non-uniform topologies,
870   // this ratio holds the max number of itemAs per itemB
871   // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
872   int *ratio;
873 
874   // Storage containing the absolute number of each topology layer
875   int *count;
876 
877   // The number of core efficiencies. This is only useful for hybrid
878   // topologies. Core efficiencies will range from 0 to num efficiencies - 1
879   int num_core_efficiencies;
880   int num_core_types;
881   kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
882 
883   // The hardware threads array
884   // hw_threads is num_hw_threads long
885   // Each hw_thread's ids and sub_ids are depth deep
886   int num_hw_threads;
887   kmp_hw_thread_t *hw_threads;
888 
889   // Equivalence hash where the key is the hardware topology item
890   // and the value is the equivalent hardware topology type in the
891   // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
892   // known equivalence for the topology type
893   kmp_hw_t equivalent[KMP_HW_LAST];
894 
895   // Flags describing the topology
896   flags_t flags;
897 
898   // Compact value used during sort_compact()
899   int compact;
900 
901   // Insert a new topology layer after allocation
902   void _insert_layer(kmp_hw_t type, const int *ids);
903 
904 #if KMP_GROUP_AFFINITY
905   // Insert topology information about Windows Processor groups
906   void _insert_windows_proc_groups();
907 #endif
908 
909   // Count each item & get the num x's per y
910   // e.g., get the number of cores and the number of threads per core
911   // for each (x, y) in (KMP_HW_* , KMP_HW_*)
912   void _gather_enumeration_information();
913 
914   // Remove layers that don't add information to the topology.
915   // This is done by having the layer take on the id = UNKNOWN_ID (-1)
916   void _remove_radix1_layers();
917 
918   // Find out if the topology is uniform
919   void _discover_uniformity();
920 
921   // Set all the sub_ids for each hardware thread
922   void _set_sub_ids();
923 
924   // Set global affinity variables describing the number of threads per
925   // core, the number of packages, the number of cores per package, and
926   // the number of cores.
927   void _set_globals();
928 
929   // Set the last level cache equivalent type
930   void _set_last_level_cache();
931 
932   // Return the number of cores with a particular attribute, 'attr'.
933   // If 'find_all' is true, then find all cores on the machine, otherwise find
934   // all cores per the layer 'above'
935   int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
936                             bool find_all = false) const;
937 
938 public:
939   // Force use of allocate()/deallocate()
940   kmp_topology_t() = delete;
941   kmp_topology_t(const kmp_topology_t &t) = delete;
942   kmp_topology_t(kmp_topology_t &&t) = delete;
943   kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
944   kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
945 
946   static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
947   static void deallocate(kmp_topology_t *);
948 
949   // Functions used in create_map() routines
950   kmp_hw_thread_t &at(int index) {
951     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
952     return hw_threads[index];
953   }
954   const kmp_hw_thread_t &at(int index) const {
955     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
956     return hw_threads[index];
957   }
958   int get_num_hw_threads() const { return num_hw_threads; }
959   void sort_ids() {
960     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
961           kmp_hw_thread_t::compare_ids);
962   }
963   // Check if the hardware ids are unique, if they are
964   // return true, otherwise return false
965   bool check_ids() const;
966 
967   // Function to call after the create_map() routine
968   void canonicalize();
969   void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
970 
971 // Functions used after canonicalize() called
972 
973 #if KMP_AFFINITY_SUPPORTED
974   // Set the granularity for affinity settings
975   void set_granularity(kmp_affinity_t &stgs) const;
976   bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
977   bool restrict_to_mask(const kmp_affin_mask_t *mask);
978   bool filter_hw_subset();
979 #endif
980   bool is_uniform() const { return flags.uniform; }
981   // Tell whether a type is a valid type in the topology
982   // returns KMP_HW_UNKNOWN when there is no equivalent type
983   kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
984     if (type == KMP_HW_UNKNOWN)
985       return KMP_HW_UNKNOWN;
986     return equivalent[type];
987   }
988   // Set type1 = type2
989   void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
990     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
991     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
992     kmp_hw_t real_type2 = equivalent[type2];
993     if (real_type2 == KMP_HW_UNKNOWN)
994       real_type2 = type2;
995     equivalent[type1] = real_type2;
996     // This loop is required since any of the types may have been set to
997     // be equivalent to type1.  They all must be checked and reset to type2.
998     KMP_FOREACH_HW_TYPE(type) {
999       if (equivalent[type] == type1) {
1000         equivalent[type] = real_type2;
1001       }
1002     }
1003   }
1004   // Calculate number of types corresponding to level1
1005   // per types corresponding to level2 (e.g., number of threads per core)
1006   int calculate_ratio(int level1, int level2) const {
1007     KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1008     KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1009     int r = 1;
1010     for (int level = level1; level > level2; --level)
1011       r *= ratio[level];
1012     return r;
1013   }
1014   int get_ratio(int level) const {
1015     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1016     return ratio[level];
1017   }
1018   int get_depth() const { return depth; };
1019   kmp_hw_t get_type(int level) const {
1020     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1021     return types[level];
1022   }
1023   int get_level(kmp_hw_t type) const {
1024     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1025     int eq_type = equivalent[type];
1026     if (eq_type == KMP_HW_UNKNOWN)
1027       return -1;
1028     for (int i = 0; i < depth; ++i)
1029       if (types[i] == eq_type)
1030         return i;
1031     return -1;
1032   }
1033   int get_count(int level) const {
1034     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1035     return count[level];
1036   }
1037   // Return the total number of cores with attribute 'attr'
1038   int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1039     return _get_ncores_with_attr(attr, -1, true);
1040   }
1041   // Return the number of cores with attribute
1042   // 'attr' per topology level 'above'
1043   int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1044     return _get_ncores_with_attr(attr, above, false);
1045   }
1046 
1047 #if KMP_AFFINITY_SUPPORTED
1048   friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1049   void sort_compact(kmp_affinity_t &affinity) {
1050     compact = affinity.compact;
1051     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1052           kmp_hw_thread_t::compare_compact);
1053   }
1054 #endif
1055   void print(const char *env_var = "KMP_AFFINITY") const;
1056   void dump() const;
1057 };
1058 extern kmp_topology_t *__kmp_topology;
1059 
1060 class kmp_hw_subset_t {
1061   const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1062 
1063 public:
1064   // Describe a machine topology item in KMP_HW_SUBSET
1065   struct item_t {
1066     kmp_hw_t type;
1067     int num_attrs;
1068     int num[MAX_ATTRS];
1069     int offset[MAX_ATTRS];
1070     kmp_hw_attr_t attr[MAX_ATTRS];
1071   };
1072   // Put parenthesis around max to avoid accidental use of Windows max macro.
1073   const static int USE_ALL = (std::numeric_limits<int>::max)();
1074 
1075 private:
1076   int depth;
1077   int capacity;
1078   item_t *items;
1079   kmp_uint64 set;
1080   bool absolute;
1081   // The set must be able to handle up to KMP_HW_LAST number of layers
1082   KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1083   // Sorting the KMP_HW_SUBSET items to follow topology order
1084   // All unknown topology types will be at the beginning of the subset
1085   static int hw_subset_compare(const void *i1, const void *i2) {
1086     kmp_hw_t type1 = ((const item_t *)i1)->type;
1087     kmp_hw_t type2 = ((const item_t *)i2)->type;
1088     int level1 = __kmp_topology->get_level(type1);
1089     int level2 = __kmp_topology->get_level(type2);
1090     return level1 - level2;
1091   }
1092 
1093 public:
1094   // Force use of allocate()/deallocate()
1095   kmp_hw_subset_t() = delete;
1096   kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1097   kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1098   kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1099   kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1100 
1101   static kmp_hw_subset_t *allocate() {
1102     int initial_capacity = 5;
1103     kmp_hw_subset_t *retval =
1104         (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1105     retval->depth = 0;
1106     retval->capacity = initial_capacity;
1107     retval->set = 0ull;
1108     retval->absolute = false;
1109     retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1110     return retval;
1111   }
1112   static void deallocate(kmp_hw_subset_t *subset) {
1113     __kmp_free(subset->items);
1114     __kmp_free(subset);
1115   }
1116   void set_absolute() { absolute = true; }
1117   bool is_absolute() const { return absolute; }
1118   void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1119     for (int i = 0; i < depth; ++i) {
1120       // Found an existing item for this layer type
1121       // Add the num, offset, and attr to this item
1122       if (items[i].type == type) {
1123         int idx = items[i].num_attrs++;
1124         if ((size_t)idx >= MAX_ATTRS)
1125           return;
1126         items[i].num[idx] = num;
1127         items[i].offset[idx] = offset;
1128         items[i].attr[idx] = attr;
1129         return;
1130       }
1131     }
1132     if (depth == capacity - 1) {
1133       capacity *= 2;
1134       item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1135       for (int i = 0; i < depth; ++i)
1136         new_items[i] = items[i];
1137       __kmp_free(items);
1138       items = new_items;
1139     }
1140     items[depth].num_attrs = 1;
1141     items[depth].type = type;
1142     items[depth].num[0] = num;
1143     items[depth].offset[0] = offset;
1144     items[depth].attr[0] = attr;
1145     depth++;
1146     set |= (1ull << type);
1147   }
1148   int get_depth() const { return depth; }
1149   const item_t &at(int index) const {
1150     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1151     return items[index];
1152   }
1153   item_t &at(int index) {
1154     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1155     return items[index];
1156   }
1157   void remove(int index) {
1158     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1159     set &= ~(1ull << items[index].type);
1160     for (int j = index + 1; j < depth; ++j) {
1161       items[j - 1] = items[j];
1162     }
1163     depth--;
1164   }
1165   void sort() {
1166     KMP_DEBUG_ASSERT(__kmp_topology);
1167     qsort(items, depth, sizeof(item_t), hw_subset_compare);
1168   }
1169   bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1170   void dump() const {
1171     printf("**********************\n");
1172     printf("*** kmp_hw_subset: ***\n");
1173     printf("* depth: %d\n", depth);
1174     printf("* items:\n");
1175     for (int i = 0; i < depth; ++i) {
1176       printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1177       for (int j = 0; j < items[i].num_attrs; ++j) {
1178         printf("  num: %d, offset: %d, attr: ", items[i].num[j],
1179                items[i].offset[j]);
1180         if (!items[i].attr[j]) {
1181           printf(" (none)\n");
1182         } else {
1183           printf(
1184               " core_type = %s, core_eff = %d\n",
1185               __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1186               items[i].attr[j].get_core_eff());
1187         }
1188       }
1189     }
1190     printf("* set: 0x%llx\n", set);
1191     printf("* absolute: %d\n", absolute);
1192     printf("**********************\n");
1193   }
1194 };
1195 extern kmp_hw_subset_t *__kmp_hw_subset;
1196 
1197 /* A structure for holding machine-specific hierarchy info to be computed once
1198    at init. This structure represents a mapping of threads to the actual machine
1199    hierarchy, or to our best guess at what the hierarchy might be, for the
1200    purpose of performing an efficient barrier. In the worst case, when there is
1201    no machine hierarchy information, it produces a tree suitable for a barrier,
1202    similar to the tree used in the hyper barrier. */
1203 class hierarchy_info {
1204 public:
1205   /* Good default values for number of leaves and branching factor, given no
1206      affinity information. Behaves a bit like hyper barrier. */
1207   static const kmp_uint32 maxLeaves = 4;
1208   static const kmp_uint32 minBranch = 4;
1209   /** Number of levels in the hierarchy. Typical levels are threads/core,
1210       cores/package or socket, packages/node, nodes/machine, etc. We don't want
1211       to get specific with nomenclature. When the machine is oversubscribed we
1212       add levels to duplicate the hierarchy, doubling the thread capacity of the
1213       hierarchy each time we add a level. */
1214   kmp_uint32 maxLevels;
1215 
1216   /** This is specifically the depth of the machine configuration hierarchy, in
1217       terms of the number of levels along the longest path from root to any
1218       leaf. It corresponds to the number of entries in numPerLevel if we exclude
1219       all but one trailing 1. */
1220   kmp_uint32 depth;
1221   kmp_uint32 base_num_threads;
1222   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1223   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1224   // 2=initialization in progress
1225   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1226 
1227   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
1228       the parent of a node at level i has. For example, if we have a machine
1229       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
1230       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
1231   kmp_uint32 *numPerLevel;
1232   kmp_uint32 *skipPerLevel;
1233 
1234   void deriveLevels() {
1235     int hier_depth = __kmp_topology->get_depth();
1236     for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1237       numPerLevel[level] = __kmp_topology->get_ratio(i);
1238     }
1239   }
1240 
1241   hierarchy_info()
1242       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1243 
1244   void fini() {
1245     if (!uninitialized && numPerLevel) {
1246       __kmp_free(numPerLevel);
1247       numPerLevel = NULL;
1248       uninitialized = not_initialized;
1249     }
1250   }
1251 
1252   void init(int num_addrs) {
1253     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1254         &uninitialized, not_initialized, initializing);
1255     if (bool_result == 0) { // Wait for initialization
1256       while (TCR_1(uninitialized) != initialized)
1257         KMP_CPU_PAUSE();
1258       return;
1259     }
1260     KMP_DEBUG_ASSERT(bool_result == 1);
1261 
1262     /* Added explicit initialization of the data fields here to prevent usage of
1263        dirty value observed when static library is re-initialized multiple times
1264        (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
1265        OpenMP). */
1266     depth = 1;
1267     resizing = 0;
1268     maxLevels = 7;
1269     numPerLevel =
1270         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1271     skipPerLevel = &(numPerLevel[maxLevels]);
1272     for (kmp_uint32 i = 0; i < maxLevels;
1273          ++i) { // init numPerLevel[*] to 1 item per level
1274       numPerLevel[i] = 1;
1275       skipPerLevel[i] = 1;
1276     }
1277 
1278     // Sort table by physical ID
1279     if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1280       deriveLevels();
1281     } else {
1282       numPerLevel[0] = maxLeaves;
1283       numPerLevel[1] = num_addrs / maxLeaves;
1284       if (num_addrs % maxLeaves)
1285         numPerLevel[1]++;
1286     }
1287 
1288     base_num_threads = num_addrs;
1289     for (int i = maxLevels - 1; i >= 0;
1290          --i) // count non-empty levels to get depth
1291       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1292         depth++;
1293 
1294     kmp_uint32 branch = minBranch;
1295     if (numPerLevel[0] == 1)
1296       branch = num_addrs / maxLeaves;
1297     if (branch < minBranch)
1298       branch = minBranch;
1299     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1300       while (numPerLevel[d] > branch ||
1301              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1302         if (numPerLevel[d] & 1)
1303           numPerLevel[d]++;
1304         numPerLevel[d] = numPerLevel[d] >> 1;
1305         if (numPerLevel[d + 1] == 1)
1306           depth++;
1307         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1308       }
1309       if (numPerLevel[0] == 1) {
1310         branch = branch >> 1;
1311         if (branch < 4)
1312           branch = minBranch;
1313       }
1314     }
1315 
1316     for (kmp_uint32 i = 1; i < depth; ++i)
1317       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1318     // Fill in hierarchy in the case of oversubscription
1319     for (kmp_uint32 i = depth; i < maxLevels; ++i)
1320       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1321 
1322     uninitialized = initialized; // One writer
1323   }
1324 
1325   // Resize the hierarchy if nproc changes to something larger than before
1326   void resize(kmp_uint32 nproc) {
1327     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1328     while (bool_result == 0) { // someone else is trying to resize
1329       KMP_CPU_PAUSE();
1330       if (nproc <= base_num_threads) // happy with other thread's resize
1331         return;
1332       else // try to resize
1333         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1334     }
1335     KMP_DEBUG_ASSERT(bool_result != 0);
1336     if (nproc <= base_num_threads)
1337       return; // happy with other thread's resize
1338 
1339     // Calculate new maxLevels
1340     kmp_uint32 old_sz = skipPerLevel[depth - 1];
1341     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1342     // First see if old maxLevels is enough to contain new size
1343     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1344       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1345       numPerLevel[i - 1] *= 2;
1346       old_sz *= 2;
1347       depth++;
1348     }
1349     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1350       while (nproc > old_sz) {
1351         old_sz *= 2;
1352         incs++;
1353         depth++;
1354       }
1355       maxLevels += incs;
1356 
1357       // Resize arrays
1358       kmp_uint32 *old_numPerLevel = numPerLevel;
1359       kmp_uint32 *old_skipPerLevel = skipPerLevel;
1360       numPerLevel = skipPerLevel = NULL;
1361       numPerLevel =
1362           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1363       skipPerLevel = &(numPerLevel[maxLevels]);
1364 
1365       // Copy old elements from old arrays
1366       for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1367         // init numPerLevel[*] to 1 item per level
1368         numPerLevel[i] = old_numPerLevel[i];
1369         skipPerLevel[i] = old_skipPerLevel[i];
1370       }
1371 
1372       // Init new elements in arrays to 1
1373       for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1374         // init numPerLevel[*] to 1 item per level
1375         numPerLevel[i] = 1;
1376         skipPerLevel[i] = 1;
1377       }
1378 
1379       // Free old arrays
1380       __kmp_free(old_numPerLevel);
1381     }
1382 
1383     // Fill in oversubscription levels of hierarchy
1384     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1385       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1386 
1387     base_num_threads = nproc;
1388     resizing = 0; // One writer
1389   }
1390 };
1391 #endif // KMP_AFFINITY_H
1392