xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h (revision 35c0a8c449fd2b7f75029ebed5e10852240f0865)
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_USE_HWLOC
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24   class Mask : public KMPAffinity::Mask {
25     hwloc_cpuset_t mask;
26 
27   public:
28     Mask() {
29       mask = hwloc_bitmap_alloc();
30       this->zero();
31     }
32     ~Mask() { hwloc_bitmap_free(mask); }
33     void set(int i) override { hwloc_bitmap_set(mask, i); }
34     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36     void zero() override { hwloc_bitmap_zero(mask); }
37     bool empty() const override { return hwloc_bitmap_iszero(mask); }
38     void copy(const KMPAffinity::Mask *src) override {
39       const Mask *convert = static_cast<const Mask *>(src);
40       hwloc_bitmap_copy(mask, convert->mask);
41     }
42     void bitwise_and(const KMPAffinity::Mask *rhs) override {
43       const Mask *convert = static_cast<const Mask *>(rhs);
44       hwloc_bitmap_and(mask, mask, convert->mask);
45     }
46     void bitwise_or(const KMPAffinity::Mask *rhs) override {
47       const Mask *convert = static_cast<const Mask *>(rhs);
48       hwloc_bitmap_or(mask, mask, convert->mask);
49     }
50     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
51     bool is_equal(const KMPAffinity::Mask *rhs) const override {
52       const Mask *convert = static_cast<const Mask *>(rhs);
53       return hwloc_bitmap_isequal(mask, convert->mask);
54     }
55     int begin() const override { return hwloc_bitmap_first(mask); }
56     int end() const override { return -1; }
57     int next(int previous) const override {
58       return hwloc_bitmap_next(mask, previous);
59     }
60     int get_system_affinity(bool abort_on_error) override {
61       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
62                   "Illegal get affinity operation when not capable");
63       long retval =
64           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
65       if (retval >= 0) {
66         return 0;
67       }
68       int error = errno;
69       if (abort_on_error) {
70         __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
71                     KMP_ERR(error), __kmp_msg_null);
72       }
73       return error;
74     }
75     int set_system_affinity(bool abort_on_error) const override {
76       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
77                   "Illegal set affinity operation when not capable");
78       long retval =
79           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
80       if (retval >= 0) {
81         return 0;
82       }
83       int error = errno;
84       if (abort_on_error) {
85         __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
86                     KMP_ERR(error), __kmp_msg_null);
87       }
88       return error;
89     }
90 #if KMP_OS_WINDOWS
91     int set_process_affinity(bool abort_on_error) const override {
92       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
93                   "Illegal set process affinity operation when not capable");
94       int error = 0;
95       const hwloc_topology_support *support =
96           hwloc_topology_get_support(__kmp_hwloc_topology);
97       if (support->cpubind->set_proc_cpubind) {
98         int retval;
99         retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
100                                    HWLOC_CPUBIND_PROCESS);
101         if (retval >= 0)
102           return 0;
103         error = errno;
104         if (abort_on_error)
105           __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
106                       KMP_ERR(error), __kmp_msg_null);
107       }
108       return error;
109     }
110 #endif
111     int get_proc_group() const override {
112       int group = -1;
113 #if KMP_OS_WINDOWS
114       if (__kmp_num_proc_groups == 1) {
115         return 1;
116       }
117       for (int i = 0; i < __kmp_num_proc_groups; i++) {
118         // On windows, the long type is always 32 bits
119         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
120         unsigned long second_32_bits =
121             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
122         if (first_32_bits == 0 && second_32_bits == 0) {
123           continue;
124         }
125         if (group >= 0) {
126           return -1;
127         }
128         group = i;
129       }
130 #endif /* KMP_OS_WINDOWS */
131       return group;
132     }
133   };
134   void determine_capable(const char *var) override {
135     const hwloc_topology_support *topology_support;
136     if (__kmp_hwloc_topology == NULL) {
137       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
138         __kmp_hwloc_error = TRUE;
139         if (__kmp_affinity.flags.verbose) {
140           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
141         }
142       }
143       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
144         __kmp_hwloc_error = TRUE;
145         if (__kmp_affinity.flags.verbose) {
146           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
147         }
148       }
149     }
150     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
151     // Is the system capable of setting/getting this thread's affinity?
152     // Also, is topology discovery possible? (pu indicates ability to discover
153     // processing units). And finally, were there no errors when calling any
154     // hwloc_* API functions?
155     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
156         topology_support->cpubind->get_thisthread_cpubind &&
157         topology_support->discovery->pu && !__kmp_hwloc_error) {
158       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
159       KMP_AFFINITY_ENABLE(TRUE);
160     } else {
161       // indicate that hwloc didn't work and disable affinity
162       __kmp_hwloc_error = TRUE;
163       KMP_AFFINITY_DISABLE();
164     }
165   }
166   void bind_thread(int which) override {
167     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
168                 "Illegal set affinity operation when not capable");
169     KMPAffinity::Mask *mask;
170     KMP_CPU_ALLOC_ON_STACK(mask);
171     KMP_CPU_ZERO(mask);
172     KMP_CPU_SET(which, mask);
173     __kmp_set_system_affinity(mask, TRUE);
174     KMP_CPU_FREE_FROM_STACK(mask);
175   }
176   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
177   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
178   KMPAffinity::Mask *allocate_mask_array(int num) override {
179     return new Mask[num];
180   }
181   void deallocate_mask_array(KMPAffinity::Mask *array) override {
182     Mask *hwloc_array = static_cast<Mask *>(array);
183     delete[] hwloc_array;
184   }
185   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
186                                       int index) override {
187     Mask *hwloc_array = static_cast<Mask *>(array);
188     return &(hwloc_array[index]);
189   }
190   api_type get_api_type() const override { return HWLOC; }
191 };
192 #endif /* KMP_USE_HWLOC */
193 
194 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||     \
195     KMP_OS_AIX
196 #if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot change:
   they are set in stone forever. */
201 #include <sys/syscall.h>
202 #if KMP_ARCH_X86 || KMP_ARCH_ARM
203 #ifndef __NR_sched_setaffinity
204 #define __NR_sched_setaffinity 241
205 #elif __NR_sched_setaffinity != 241
206 #error Wrong code for setaffinity system call.
207 #endif /* __NR_sched_setaffinity */
208 #ifndef __NR_sched_getaffinity
209 #define __NR_sched_getaffinity 242
210 #elif __NR_sched_getaffinity != 242
211 #error Wrong code for getaffinity system call.
212 #endif /* __NR_sched_getaffinity */
213 #elif KMP_ARCH_AARCH64
214 #ifndef __NR_sched_setaffinity
215 #define __NR_sched_setaffinity 122
216 #elif __NR_sched_setaffinity != 122
217 #error Wrong code for setaffinity system call.
218 #endif /* __NR_sched_setaffinity */
219 #ifndef __NR_sched_getaffinity
220 #define __NR_sched_getaffinity 123
221 #elif __NR_sched_getaffinity != 123
222 #error Wrong code for getaffinity system call.
223 #endif /* __NR_sched_getaffinity */
224 #elif KMP_ARCH_X86_64
225 #ifndef __NR_sched_setaffinity
226 #define __NR_sched_setaffinity 203
227 #elif __NR_sched_setaffinity != 203
228 #error Wrong code for setaffinity system call.
229 #endif /* __NR_sched_setaffinity */
230 #ifndef __NR_sched_getaffinity
231 #define __NR_sched_getaffinity 204
232 #elif __NR_sched_getaffinity != 204
233 #error Wrong code for getaffinity system call.
234 #endif /* __NR_sched_getaffinity */
235 #elif KMP_ARCH_PPC64
236 #ifndef __NR_sched_setaffinity
237 #define __NR_sched_setaffinity 222
238 #elif __NR_sched_setaffinity != 222
239 #error Wrong code for setaffinity system call.
240 #endif /* __NR_sched_setaffinity */
241 #ifndef __NR_sched_getaffinity
242 #define __NR_sched_getaffinity 223
243 #elif __NR_sched_getaffinity != 223
244 #error Wrong code for getaffinity system call.
245 #endif /* __NR_sched_getaffinity */
246 #elif KMP_ARCH_MIPS
247 #ifndef __NR_sched_setaffinity
248 #define __NR_sched_setaffinity 4239
249 #elif __NR_sched_setaffinity != 4239
250 #error Wrong code for setaffinity system call.
251 #endif /* __NR_sched_setaffinity */
252 #ifndef __NR_sched_getaffinity
253 #define __NR_sched_getaffinity 4240
254 #elif __NR_sched_getaffinity != 4240
255 #error Wrong code for getaffinity system call.
256 #endif /* __NR_sched_getaffinity */
257 #elif KMP_ARCH_MIPS64
258 #ifndef __NR_sched_setaffinity
259 #define __NR_sched_setaffinity 5195
260 #elif __NR_sched_setaffinity != 5195
261 #error Wrong code for setaffinity system call.
262 #endif /* __NR_sched_setaffinity */
263 #ifndef __NR_sched_getaffinity
264 #define __NR_sched_getaffinity 5196
265 #elif __NR_sched_getaffinity != 5196
266 #error Wrong code for getaffinity system call.
267 #endif /* __NR_sched_getaffinity */
268 #elif KMP_ARCH_LOONGARCH64
269 #ifndef __NR_sched_setaffinity
270 #define __NR_sched_setaffinity 122
271 #elif __NR_sched_setaffinity != 122
272 #error Wrong code for setaffinity system call.
273 #endif /* __NR_sched_setaffinity */
274 #ifndef __NR_sched_getaffinity
275 #define __NR_sched_getaffinity 123
276 #elif __NR_sched_getaffinity != 123
277 #error Wrong code for getaffinity system call.
278 #endif /* __NR_sched_getaffinity */
279 #elif KMP_ARCH_RISCV64
280 #ifndef __NR_sched_setaffinity
281 #define __NR_sched_setaffinity 122
282 #elif __NR_sched_setaffinity != 122
283 #error Wrong code for setaffinity system call.
284 #endif /* __NR_sched_setaffinity */
285 #ifndef __NR_sched_getaffinity
286 #define __NR_sched_getaffinity 123
287 #elif __NR_sched_getaffinity != 123
288 #error Wrong code for getaffinity system call.
289 #endif /* __NR_sched_getaffinity */
290 #elif KMP_ARCH_VE
291 #ifndef __NR_sched_setaffinity
292 #define __NR_sched_setaffinity 203
293 #elif __NR_sched_setaffinity != 203
294 #error Wrong code for setaffinity system call.
295 #endif /* __NR_sched_setaffinity */
296 #ifndef __NR_sched_getaffinity
297 #define __NR_sched_getaffinity 204
298 #elif __NR_sched_getaffinity != 204
299 #error Wrong code for getaffinity system call.
300 #endif /* __NR_sched_getaffinity */
301 #elif KMP_ARCH_S390X
302 #ifndef __NR_sched_setaffinity
303 #define __NR_sched_setaffinity 239
304 #elif __NR_sched_setaffinity != 239
305 #error Wrong code for setaffinity system call.
306 #endif /* __NR_sched_setaffinity */
307 #ifndef __NR_sched_getaffinity
308 #define __NR_sched_getaffinity 240
309 #elif __NR_sched_getaffinity != 240
310 #error Wrong code for getaffinity system call.
311 #endif /* __NR_sched_getaffinity */
312 #else
313 #error Unknown or unsupported architecture
314 #endif /* KMP_ARCH_* */
315 #elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
316 #include <pthread.h>
317 #include <pthread_np.h>
318 #elif KMP_OS_NETBSD
319 #include <pthread.h>
320 #include <sched.h>
321 #elif KMP_OS_AIX
322 #include <sys/dr.h>
323 #include <sys/rset.h>
324 #define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
325 #define GET_NUMBER_SMT_SETS 0x0004
326 extern "C" int syssmt(int flags, int, int, int *);
327 #endif
328 class KMPNativeAffinity : public KMPAffinity {
329   class Mask : public KMPAffinity::Mask {
330     typedef unsigned long mask_t;
331     typedef decltype(__kmp_affin_mask_size) mask_size_type;
332     static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
333     static const mask_t ONE = 1;
334     mask_size_type get_num_mask_types() const {
335       return __kmp_affin_mask_size / sizeof(mask_t);
336     }
337 
338   public:
339     mask_t *mask;
340     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
341     ~Mask() {
342       if (mask)
343         __kmp_free(mask);
344     }
345     void set(int i) override {
346       mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
347     }
348     bool is_set(int i) const override {
349       return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
350     }
351     void clear(int i) override {
352       mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
353     }
354     void zero() override {
355       mask_size_type e = get_num_mask_types();
356       for (mask_size_type i = 0; i < e; ++i)
357         mask[i] = (mask_t)0;
358     }
359     bool empty() const override {
360       mask_size_type e = get_num_mask_types();
361       for (mask_size_type i = 0; i < e; ++i)
362         if (mask[i] != (mask_t)0)
363           return false;
364       return true;
365     }
366     void copy(const KMPAffinity::Mask *src) override {
367       const Mask *convert = static_cast<const Mask *>(src);
368       mask_size_type e = get_num_mask_types();
369       for (mask_size_type i = 0; i < e; ++i)
370         mask[i] = convert->mask[i];
371     }
372     void bitwise_and(const KMPAffinity::Mask *rhs) override {
373       const Mask *convert = static_cast<const Mask *>(rhs);
374       mask_size_type e = get_num_mask_types();
375       for (mask_size_type i = 0; i < e; ++i)
376         mask[i] &= convert->mask[i];
377     }
378     void bitwise_or(const KMPAffinity::Mask *rhs) override {
379       const Mask *convert = static_cast<const Mask *>(rhs);
380       mask_size_type e = get_num_mask_types();
381       for (mask_size_type i = 0; i < e; ++i)
382         mask[i] |= convert->mask[i];
383     }
384     void bitwise_not() override {
385       mask_size_type e = get_num_mask_types();
386       for (mask_size_type i = 0; i < e; ++i)
387         mask[i] = ~(mask[i]);
388     }
389     bool is_equal(const KMPAffinity::Mask *rhs) const override {
390       const Mask *convert = static_cast<const Mask *>(rhs);
391       mask_size_type e = get_num_mask_types();
392       for (mask_size_type i = 0; i < e; ++i)
393         if (mask[i] != convert->mask[i])
394           return false;
395       return true;
396     }
397     int begin() const override {
398       int retval = 0;
399       while (retval < end() && !is_set(retval))
400         ++retval;
401       return retval;
402     }
403     int end() const override {
404       int e;
405       __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
406       return e;
407     }
408     int next(int previous) const override {
409       int retval = previous + 1;
410       while (retval < end() && !is_set(retval))
411         ++retval;
412       return retval;
413     }
414 #if KMP_OS_AIX
415     // On AIX, we don't have a way to get CPU(s) a thread is bound to.
416     // This routine is only used to get the full mask.
417     int get_system_affinity(bool abort_on_error) override {
418       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
419                   "Illegal get affinity operation when not capable");
420 
421       (void)abort_on_error;
422 
423       // Set the mask with all CPUs that are available.
424       for (int i = 0; i < __kmp_xproc; ++i)
425         KMP_CPU_SET(i, this);
426       return 0;
427     }
428     int set_system_affinity(bool abort_on_error) const override {
429       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
430 
431                   "Illegal set affinity operation when not capable");
432 
433       int location;
434       int gtid = __kmp_entry_gtid();
435       int tid = thread_self();
436 
437       // Unbind the thread if it was bound to any processors before so that
438       // we can bind the thread to CPUs specified by the mask not others.
439       int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
440 
441       // On AIX, we can only bind to one instead of a set of CPUs with the
442       // bindprocessor() system call.
443       KMP_CPU_SET_ITERATE(location, this) {
444         if (KMP_CPU_ISSET(location, this)) {
445           retval = bindprocessor(BINDTHREAD, tid, location);
446           if (retval == -1 && errno == 1) {
447             rsid_t rsid;
448             rsethandle_t rsh;
449             // Put something in rsh to prevent compiler warning
450             // about uninitalized use
451             rsh = rs_alloc(RS_EMPTY);
452             rsid.at_pid = getpid();
453             if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
454               retval = ra_detachrset(R_PROCESS, rsid, 0);
455               retval = bindprocessor(BINDTHREAD, tid, location);
456             }
457           }
458           if (retval == 0) {
459             KA_TRACE(10, ("__kmp_set_system_affinity:  Done binding "
460                           "T#%d to cpu=%d.\n",
461                           gtid, location));
462             continue;
463           }
464           int error = errno;
465           if (abort_on_error) {
466             __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
467                         KMP_ERR(error), __kmp_msg_null);
468             KA_TRACE(10, ("__kmp_set_system_affinity:  Error binding "
469                           "T#%d to cpu=%d, errno=%d.\n",
470                           gtid, location, error));
471             return error;
472           }
473         }
474       }
475       return 0;
476     }
477 #else // !KMP_OS_AIX
478     int get_system_affinity(bool abort_on_error) override {
479       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
480                   "Illegal get affinity operation when not capable");
481 #if KMP_OS_LINUX
482       long retval =
483           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
484 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
485       int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
486                                      reinterpret_cast<cpuset_t *>(mask));
487       int retval = (r == 0 ? 0 : -1);
488 #endif
489       if (retval >= 0) {
490         return 0;
491       }
492       int error = errno;
493       if (abort_on_error) {
494         __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
495                     KMP_ERR(error), __kmp_msg_null);
496       }
497       return error;
498     }
499     int set_system_affinity(bool abort_on_error) const override {
500       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
501                   "Illegal set affinity operation when not capable");
502 #if KMP_OS_LINUX
503       long retval =
504           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
505 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
506       int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
507                                      reinterpret_cast<cpuset_t *>(mask));
508       int retval = (r == 0 ? 0 : -1);
509 #endif
510       if (retval >= 0) {
511         return 0;
512       }
513       int error = errno;
514       if (abort_on_error) {
515         __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
516                     KMP_ERR(error), __kmp_msg_null);
517       }
518       return error;
519     }
520 #endif // KMP_OS_AIX
521   };
522   void determine_capable(const char *env_var) override {
523     __kmp_affinity_determine_capable(env_var);
524   }
525   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
526   KMPAffinity::Mask *allocate_mask() override {
527     KMPNativeAffinity::Mask *retval = new Mask();
528     return retval;
529   }
530   void deallocate_mask(KMPAffinity::Mask *m) override {
531     KMPNativeAffinity::Mask *native_mask =
532         static_cast<KMPNativeAffinity::Mask *>(m);
533     delete native_mask;
534   }
535   KMPAffinity::Mask *allocate_mask_array(int num) override {
536     return new Mask[num];
537   }
538   void deallocate_mask_array(KMPAffinity::Mask *array) override {
539     Mask *linux_array = static_cast<Mask *>(array);
540     delete[] linux_array;
541   }
542   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
543                                       int index) override {
544     Mask *linux_array = static_cast<Mask *>(array);
545     return &(linux_array[index]);
546   }
547   api_type get_api_type() const override { return NATIVE_OS; }
548 };
549 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY  \
550           || KMP_OS_AIX */
551 
552 #if KMP_OS_WINDOWS
553 class KMPNativeAffinity : public KMPAffinity {
554   class Mask : public KMPAffinity::Mask {
555     typedef ULONG_PTR mask_t;
556     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
557     mask_t *mask;
558 
559   public:
560     Mask() {
561       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
562     }
563     ~Mask() {
564       if (mask)
565         __kmp_free(mask);
566     }
567     void set(int i) override {
568       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
569     }
570     bool is_set(int i) const override {
571       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
572     }
573     void clear(int i) override {
574       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
575     }
576     void zero() override {
577       for (int i = 0; i < __kmp_num_proc_groups; ++i)
578         mask[i] = 0;
579     }
580     bool empty() const override {
581       for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
582         if (mask[i])
583           return false;
584       return true;
585     }
586     void copy(const KMPAffinity::Mask *src) override {
587       const Mask *convert = static_cast<const Mask *>(src);
588       for (int i = 0; i < __kmp_num_proc_groups; ++i)
589         mask[i] = convert->mask[i];
590     }
591     void bitwise_and(const KMPAffinity::Mask *rhs) override {
592       const Mask *convert = static_cast<const Mask *>(rhs);
593       for (int i = 0; i < __kmp_num_proc_groups; ++i)
594         mask[i] &= convert->mask[i];
595     }
596     void bitwise_or(const KMPAffinity::Mask *rhs) override {
597       const Mask *convert = static_cast<const Mask *>(rhs);
598       for (int i = 0; i < __kmp_num_proc_groups; ++i)
599         mask[i] |= convert->mask[i];
600     }
601     void bitwise_not() override {
602       for (int i = 0; i < __kmp_num_proc_groups; ++i)
603         mask[i] = ~(mask[i]);
604     }
605     bool is_equal(const KMPAffinity::Mask *rhs) const override {
606       const Mask *convert = static_cast<const Mask *>(rhs);
607       for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
608         if (mask[i] != convert->mask[i])
609           return false;
610       return true;
611     }
612     int begin() const override {
613       int retval = 0;
614       while (retval < end() && !is_set(retval))
615         ++retval;
616       return retval;
617     }
618     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
619     int next(int previous) const override {
620       int retval = previous + 1;
621       while (retval < end() && !is_set(retval))
622         ++retval;
623       return retval;
624     }
625     int set_process_affinity(bool abort_on_error) const override {
626       if (__kmp_num_proc_groups <= 1) {
627         if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
628           DWORD error = GetLastError();
629           if (abort_on_error) {
630             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
631                         __kmp_msg_null);
632           }
633           return error;
634         }
635       }
636       return 0;
637     }
638     int set_system_affinity(bool abort_on_error) const override {
639       if (__kmp_num_proc_groups > 1) {
640         // Check for a valid mask.
641         GROUP_AFFINITY ga;
642         int group = get_proc_group();
643         if (group < 0) {
644           if (abort_on_error) {
645             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
646           }
647           return -1;
648         }
649         // Transform the bit vector into a GROUP_AFFINITY struct
650         // and make the system call to set affinity.
651         ga.Group = group;
652         ga.Mask = mask[group];
653         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
654 
655         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
656         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
657           DWORD error = GetLastError();
658           if (abort_on_error) {
659             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
660                         __kmp_msg_null);
661           }
662           return error;
663         }
664       } else {
665         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
666           DWORD error = GetLastError();
667           if (abort_on_error) {
668             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
669                         __kmp_msg_null);
670           }
671           return error;
672         }
673       }
674       return 0;
675     }
676     int get_system_affinity(bool abort_on_error) override {
677       if (__kmp_num_proc_groups > 1) {
678         this->zero();
679         GROUP_AFFINITY ga;
680         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
681         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
682           DWORD error = GetLastError();
683           if (abort_on_error) {
684             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
685                         KMP_ERR(error), __kmp_msg_null);
686           }
687           return error;
688         }
689         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
690             (ga.Mask == 0)) {
691           return -1;
692         }
693         mask[ga.Group] = ga.Mask;
694       } else {
695         mask_t newMask, sysMask, retval;
696         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
697           DWORD error = GetLastError();
698           if (abort_on_error) {
699             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
700                         KMP_ERR(error), __kmp_msg_null);
701           }
702           return error;
703         }
704         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
705         if (!retval) {
706           DWORD error = GetLastError();
707           if (abort_on_error) {
708             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
709                         KMP_ERR(error), __kmp_msg_null);
710           }
711           return error;
712         }
713         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
714         if (!newMask) {
715           DWORD error = GetLastError();
716           if (abort_on_error) {
717             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
718                         KMP_ERR(error), __kmp_msg_null);
719           }
720         }
721         *mask = retval;
722       }
723       return 0;
724     }
725     int get_proc_group() const override {
726       int group = -1;
727       if (__kmp_num_proc_groups == 1) {
728         return 1;
729       }
730       for (int i = 0; i < __kmp_num_proc_groups; i++) {
731         if (mask[i] == 0)
732           continue;
733         if (group >= 0)
734           return -1;
735         group = i;
736       }
737       return group;
738     }
739   };
740   void determine_capable(const char *env_var) override {
741     __kmp_affinity_determine_capable(env_var);
742   }
743   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
744   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
745   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
746   KMPAffinity::Mask *allocate_mask_array(int num) override {
747     return new Mask[num];
748   }
749   void deallocate_mask_array(KMPAffinity::Mask *array) override {
750     Mask *windows_array = static_cast<Mask *>(array);
751     delete[] windows_array;
752   }
753   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
754                                       int index) override {
755     Mask *windows_array = static_cast<Mask *>(array);
756     return &(windows_array[index]);
757   }
758   api_type get_api_type() const override { return NATIVE_OS; }
759 };
760 #endif /* KMP_OS_WINDOWS */
761 #endif /* KMP_AFFINITY_SUPPORTED */
762 
763 // Describe an attribute for a level in the machine topology
764 struct kmp_hw_attr_t {
765   int core_type : 8;
766   int core_eff : 8;
767   unsigned valid : 1;
768   unsigned reserved : 15;
769 
770   static const int UNKNOWN_CORE_EFF = -1;
771 
772   kmp_hw_attr_t()
773       : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
774         valid(0), reserved(0) {}
775   void set_core_type(kmp_hw_core_type_t type) {
776     valid = 1;
777     core_type = type;
778   }
779   void set_core_eff(int eff) {
780     valid = 1;
781     core_eff = eff;
782   }
783   kmp_hw_core_type_t get_core_type() const {
784     return (kmp_hw_core_type_t)core_type;
785   }
786   int get_core_eff() const { return core_eff; }
787   bool is_core_type_valid() const {
788     return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
789   }
790   bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
791   operator bool() const { return valid; }
792   void clear() {
793     core_type = KMP_HW_CORE_TYPE_UNKNOWN;
794     core_eff = UNKNOWN_CORE_EFF;
795     valid = 0;
796   }
797   bool contains(const kmp_hw_attr_t &other) const {
798     if (!valid && !other.valid)
799       return true;
800     if (valid && other.valid) {
801       if (other.is_core_type_valid()) {
802         if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
803           return false;
804       }
805       if (other.is_core_eff_valid()) {
806         if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
807           return false;
808       }
809       return true;
810     }
811     return false;
812   }
813 #if KMP_AFFINITY_SUPPORTED
814   bool contains(const kmp_affinity_attrs_t &attr) const {
815     if (!valid && !attr.valid)
816       return true;
817     if (valid && attr.valid) {
818       if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
819         return (is_core_type_valid() &&
820                 (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
821       if (attr.core_eff != UNKNOWN_CORE_EFF)
822         return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
823       return true;
824     }
825     return false;
826   }
827 #endif // KMP_AFFINITY_SUPPORTED
828   bool operator==(const kmp_hw_attr_t &rhs) const {
829     return (rhs.valid == valid && rhs.core_eff == core_eff &&
830             rhs.core_type == core_type);
831   }
832   bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
833 };
834 
#if KMP_AFFINITY_SUPPORTED
// Compile-time check that kmp_hw_attr_t and kmp_affinity_attrs_t occupy the
// same number of bytes.
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif
838 
839 class kmp_hw_thread_t {
840 public:
841   static const int UNKNOWN_ID = -1;
842   static const int MULTIPLE_ID = -2;
843   static int compare_ids(const void *a, const void *b);
844   static int compare_compact(const void *a, const void *b);
845   int ids[KMP_HW_LAST];
846   int sub_ids[KMP_HW_LAST];
847   bool leader;
848   int os_id;
849   kmp_hw_attr_t attrs;
850 
851   void print() const;
852   void clear() {
853     for (int i = 0; i < (int)KMP_HW_LAST; ++i)
854       ids[i] = UNKNOWN_ID;
855     leader = false;
856     attrs.clear();
857   }
858 };
859 
860 class kmp_topology_t {
861 
862   struct flags_t {
863     int uniform : 1;
864     int reserved : 31;
865   };
866 
867   int depth;
868 
869   // The following arrays are all 'depth' long and have been
870   // allocated to hold up to KMP_HW_LAST number of objects if
871   // needed so layers can be added without reallocation of any array
872 
873   // Orderd array of the types in the topology
874   kmp_hw_t *types;
875 
876   // Keep quick topology ratios, for non-uniform topologies,
877   // this ratio holds the max number of itemAs per itemB
878   // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
879   int *ratio;
880 
881   // Storage containing the absolute number of each topology layer
882   int *count;
883 
884   // The number of core efficiencies. This is only useful for hybrid
885   // topologies. Core efficiencies will range from 0 to num efficiencies - 1
886   int num_core_efficiencies;
887   int num_core_types;
888   kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
889 
890   // The hardware threads array
891   // hw_threads is num_hw_threads long
892   // Each hw_thread's ids and sub_ids are depth deep
893   int num_hw_threads;
894   kmp_hw_thread_t *hw_threads;
895 
896   // Equivalence hash where the key is the hardware topology item
897   // and the value is the equivalent hardware topology type in the
898   // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
899   // known equivalence for the topology type
900   kmp_hw_t equivalent[KMP_HW_LAST];
901 
902   // Flags describing the topology
903   flags_t flags;
904 
905   // Compact value used during sort_compact()
906   int compact;
907 
908   // Insert a new topology layer after allocation
909   void _insert_layer(kmp_hw_t type, const int *ids);
910 
911 #if KMP_GROUP_AFFINITY
912   // Insert topology information about Windows Processor groups
913   void _insert_windows_proc_groups();
914 #endif
915 
916   // Count each item & get the num x's per y
917   // e.g., get the number of cores and the number of threads per core
918   // for each (x, y) in (KMP_HW_* , KMP_HW_*)
919   void _gather_enumeration_information();
920 
921   // Remove layers that don't add information to the topology.
922   // This is done by having the layer take on the id = UNKNOWN_ID (-1)
923   void _remove_radix1_layers();
924 
925   // Find out if the topology is uniform
926   void _discover_uniformity();
927 
928   // Set all the sub_ids for each hardware thread
929   void _set_sub_ids();
930 
931   // Set global affinity variables describing the number of threads per
932   // core, the number of packages, the number of cores per package, and
933   // the number of cores.
934   void _set_globals();
935 
936   // Set the last level cache equivalent type
937   void _set_last_level_cache();
938 
939   // Return the number of cores with a particular attribute, 'attr'.
940   // If 'find_all' is true, then find all cores on the machine, otherwise find
941   // all cores per the layer 'above'
942   int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
943                             bool find_all = false) const;
944 
945 public:
946   // Force use of allocate()/deallocate()
947   kmp_topology_t() = delete;
948   kmp_topology_t(const kmp_topology_t &t) = delete;
949   kmp_topology_t(kmp_topology_t &&t) = delete;
950   kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
951   kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
952 
953   static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
954   static void deallocate(kmp_topology_t *);
955 
956   // Functions used in create_map() routines
957   kmp_hw_thread_t &at(int index) {
958     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
959     return hw_threads[index];
960   }
961   const kmp_hw_thread_t &at(int index) const {
962     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
963     return hw_threads[index];
964   }
965   int get_num_hw_threads() const { return num_hw_threads; }
966   void sort_ids() {
967     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
968           kmp_hw_thread_t::compare_ids);
969   }
970   // Check if the hardware ids are unique, if they are
971   // return true, otherwise return false
972   bool check_ids() const;
973 
974   // Function to call after the create_map() routine
975   void canonicalize();
976   void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
977 
978 // Functions used after canonicalize() called
979 
980 #if KMP_AFFINITY_SUPPORTED
981   // Set the granularity for affinity settings
982   void set_granularity(kmp_affinity_t &stgs) const;
983   bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
984   bool restrict_to_mask(const kmp_affin_mask_t *mask);
985   bool filter_hw_subset();
986 #endif
987   bool is_uniform() const { return flags.uniform; }
988   // Tell whether a type is a valid type in the topology
989   // returns KMP_HW_UNKNOWN when there is no equivalent type
990   kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
991     if (type == KMP_HW_UNKNOWN)
992       return KMP_HW_UNKNOWN;
993     return equivalent[type];
994   }
995   // Set type1 = type2
996   void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
997     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
998     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
999     kmp_hw_t real_type2 = equivalent[type2];
1000     if (real_type2 == KMP_HW_UNKNOWN)
1001       real_type2 = type2;
1002     equivalent[type1] = real_type2;
1003     // This loop is required since any of the types may have been set to
1004     // be equivalent to type1.  They all must be checked and reset to type2.
1005     KMP_FOREACH_HW_TYPE(type) {
1006       if (equivalent[type] == type1) {
1007         equivalent[type] = real_type2;
1008       }
1009     }
1010   }
1011   // Calculate number of types corresponding to level1
1012   // per types corresponding to level2 (e.g., number of threads per core)
1013   int calculate_ratio(int level1, int level2) const {
1014     KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1015     KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1016     int r = 1;
1017     for (int level = level1; level > level2; --level)
1018       r *= ratio[level];
1019     return r;
1020   }
1021   int get_ratio(int level) const {
1022     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1023     return ratio[level];
1024   }
1025   int get_depth() const { return depth; };
1026   kmp_hw_t get_type(int level) const {
1027     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1028     return types[level];
1029   }
1030   int get_level(kmp_hw_t type) const {
1031     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1032     int eq_type = equivalent[type];
1033     if (eq_type == KMP_HW_UNKNOWN)
1034       return -1;
1035     for (int i = 0; i < depth; ++i)
1036       if (types[i] == eq_type)
1037         return i;
1038     return -1;
1039   }
1040   int get_count(int level) const {
1041     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1042     return count[level];
1043   }
1044   // Return the total number of cores with attribute 'attr'
1045   int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1046     return _get_ncores_with_attr(attr, -1, true);
1047   }
1048   // Return the number of cores with attribute
1049   // 'attr' per topology level 'above'
1050   int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1051     return _get_ncores_with_attr(attr, above, false);
1052   }
1053 
1054 #if KMP_AFFINITY_SUPPORTED
1055   friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1056   void sort_compact(kmp_affinity_t &affinity) {
1057     compact = affinity.compact;
1058     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1059           kmp_hw_thread_t::compare_compact);
1060   }
1061 #endif
1062   void print(const char *env_var = "KMP_AFFINITY") const;
1063   void dump() const;
1064 };
1065 extern kmp_topology_t *__kmp_topology;
1066 
1067 class kmp_hw_subset_t {
1068   const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1069 
1070 public:
1071   // Describe a machine topology item in KMP_HW_SUBSET
1072   struct item_t {
1073     kmp_hw_t type;
1074     int num_attrs;
1075     int num[MAX_ATTRS];
1076     int offset[MAX_ATTRS];
1077     kmp_hw_attr_t attr[MAX_ATTRS];
1078   };
1079   // Put parenthesis around max to avoid accidental use of Windows max macro.
1080   const static int USE_ALL = (std::numeric_limits<int>::max)();
1081 
1082 private:
1083   int depth;
1084   int capacity;
1085   item_t *items;
1086   kmp_uint64 set;
1087   bool absolute;
1088   // The set must be able to handle up to KMP_HW_LAST number of layers
1089   KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1090   // Sorting the KMP_HW_SUBSET items to follow topology order
1091   // All unknown topology types will be at the beginning of the subset
1092   static int hw_subset_compare(const void *i1, const void *i2) {
1093     kmp_hw_t type1 = ((const item_t *)i1)->type;
1094     kmp_hw_t type2 = ((const item_t *)i2)->type;
1095     int level1 = __kmp_topology->get_level(type1);
1096     int level2 = __kmp_topology->get_level(type2);
1097     return level1 - level2;
1098   }
1099 
1100 public:
1101   // Force use of allocate()/deallocate()
1102   kmp_hw_subset_t() = delete;
1103   kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1104   kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1105   kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1106   kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1107 
1108   static kmp_hw_subset_t *allocate() {
1109     int initial_capacity = 5;
1110     kmp_hw_subset_t *retval =
1111         (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1112     retval->depth = 0;
1113     retval->capacity = initial_capacity;
1114     retval->set = 0ull;
1115     retval->absolute = false;
1116     retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1117     return retval;
1118   }
1119   static void deallocate(kmp_hw_subset_t *subset) {
1120     __kmp_free(subset->items);
1121     __kmp_free(subset);
1122   }
1123   void set_absolute() { absolute = true; }
1124   bool is_absolute() const { return absolute; }
1125   void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1126     for (int i = 0; i < depth; ++i) {
1127       // Found an existing item for this layer type
1128       // Add the num, offset, and attr to this item
1129       if (items[i].type == type) {
1130         int idx = items[i].num_attrs++;
1131         if ((size_t)idx >= MAX_ATTRS)
1132           return;
1133         items[i].num[idx] = num;
1134         items[i].offset[idx] = offset;
1135         items[i].attr[idx] = attr;
1136         return;
1137       }
1138     }
1139     if (depth == capacity - 1) {
1140       capacity *= 2;
1141       item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1142       for (int i = 0; i < depth; ++i)
1143         new_items[i] = items[i];
1144       __kmp_free(items);
1145       items = new_items;
1146     }
1147     items[depth].num_attrs = 1;
1148     items[depth].type = type;
1149     items[depth].num[0] = num;
1150     items[depth].offset[0] = offset;
1151     items[depth].attr[0] = attr;
1152     depth++;
1153     set |= (1ull << type);
1154   }
1155   int get_depth() const { return depth; }
1156   const item_t &at(int index) const {
1157     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1158     return items[index];
1159   }
1160   item_t &at(int index) {
1161     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1162     return items[index];
1163   }
1164   void remove(int index) {
1165     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1166     set &= ~(1ull << items[index].type);
1167     for (int j = index + 1; j < depth; ++j) {
1168       items[j - 1] = items[j];
1169     }
1170     depth--;
1171   }
1172   void sort() {
1173     KMP_DEBUG_ASSERT(__kmp_topology);
1174     qsort(items, depth, sizeof(item_t), hw_subset_compare);
1175   }
1176   bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1177 
1178   // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
1179   // This means putting each of {sockets, cores, threads} in the topology if
1180   // they are not specified:
1181   // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
1182   // e.g., 3module => *s,3module,*c,*t
1183   // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
1184   // are expecting the traditional sockets/cores/threads topology. For newer
1185   // hardware, there can be intervening layers like dies/tiles/modules
1186   // (usually corresponding to a cache level). So when a user asks for
1187   // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
1188   // should get 12 hardware threads across 6 cores and effectively ignore the
1189   // module layer.
1190   void canonicalize(const kmp_topology_t *top) {
1191     // Layers to target for KMP_HW_SUBSET canonicalization
1192     kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1193 
1194     // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
1195     if (is_absolute())
1196       return;
1197 
1198     // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
1199     // topology doesn't have these layers
1200     for (kmp_hw_t type : targeted)
1201       if (top->get_level(type) == KMP_HW_UNKNOWN)
1202         return;
1203 
1204     // Put targeted layers in topology if they do not exist
1205     for (kmp_hw_t type : targeted) {
1206       bool found = false;
1207       for (int i = 0; i < get_depth(); ++i) {
1208         if (top->get_equivalent_type(items[i].type) == type) {
1209           found = true;
1210           break;
1211         }
1212       }
1213       if (!found) {
1214         push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
1215       }
1216     }
1217     sort();
1218     // Set as an absolute topology that only targets the targeted layers
1219     set_absolute();
1220   }
1221   void dump() const {
1222     printf("**********************\n");
1223     printf("*** kmp_hw_subset: ***\n");
1224     printf("* depth: %d\n", depth);
1225     printf("* items:\n");
1226     for (int i = 0; i < depth; ++i) {
1227       printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1228       for (int j = 0; j < items[i].num_attrs; ++j) {
1229         printf("  num: %d, offset: %d, attr: ", items[i].num[j],
1230                items[i].offset[j]);
1231         if (!items[i].attr[j]) {
1232           printf(" (none)\n");
1233         } else {
1234           printf(
1235               " core_type = %s, core_eff = %d\n",
1236               __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1237               items[i].attr[j].get_core_eff());
1238         }
1239       }
1240     }
1241     printf("* set: 0x%llx\n", set);
1242     printf("* absolute: %d\n", absolute);
1243     printf("**********************\n");
1244   }
1245 };
1246 extern kmp_hw_subset_t *__kmp_hw_subset;
1247 
1248 /* A structure for holding machine-specific hierarchy info to be computed once
1249    at init. This structure represents a mapping of threads to the actual machine
1250    hierarchy, or to our best guess at what the hierarchy might be, for the
1251    purpose of performing an efficient barrier. In the worst case, when there is
1252    no machine hierarchy information, it produces a tree suitable for a barrier,
1253    similar to the tree used in the hyper barrier. */
1254 class hierarchy_info {
1255 public:
1256   /* Good default values for number of leaves and branching factor, given no
1257      affinity information. Behaves a bit like hyper barrier. */
1258   static const kmp_uint32 maxLeaves = 4;
1259   static const kmp_uint32 minBranch = 4;
1260   /** Number of levels in the hierarchy. Typical levels are threads/core,
1261       cores/package or socket, packages/node, nodes/machine, etc. We don't want
1262       to get specific with nomenclature. When the machine is oversubscribed we
1263       add levels to duplicate the hierarchy, doubling the thread capacity of the
1264       hierarchy each time we add a level. */
1265   kmp_uint32 maxLevels;
1266 
1267   /** This is specifically the depth of the machine configuration hierarchy, in
1268       terms of the number of levels along the longest path from root to any
1269       leaf. It corresponds to the number of entries in numPerLevel if we exclude
1270       all but one trailing 1. */
1271   kmp_uint32 depth;
1272   kmp_uint32 base_num_threads;
1273   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1274   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1275   // 2=initialization in progress
1276   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1277 
1278   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
1279       the parent of a node at level i has. For example, if we have a machine
1280       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
1281       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
1282   kmp_uint32 *numPerLevel;
1283   kmp_uint32 *skipPerLevel;
1284 
1285   void deriveLevels() {
1286     int hier_depth = __kmp_topology->get_depth();
1287     for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1288       numPerLevel[level] = __kmp_topology->get_ratio(i);
1289     }
1290   }
1291 
1292   hierarchy_info()
1293       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1294 
1295   void fini() {
1296     if (!uninitialized && numPerLevel) {
1297       __kmp_free(numPerLevel);
1298       numPerLevel = NULL;
1299       uninitialized = not_initialized;
1300     }
1301   }
1302 
1303   void init(int num_addrs) {
1304     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1305         &uninitialized, not_initialized, initializing);
1306     if (bool_result == 0) { // Wait for initialization
1307       while (TCR_1(uninitialized) != initialized)
1308         KMP_CPU_PAUSE();
1309       return;
1310     }
1311     KMP_DEBUG_ASSERT(bool_result == 1);
1312 
1313     /* Added explicit initialization of the data fields here to prevent usage of
1314        dirty value observed when static library is re-initialized multiple times
1315        (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
1316        OpenMP). */
1317     depth = 1;
1318     resizing = 0;
1319     maxLevels = 7;
1320     numPerLevel =
1321         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1322     skipPerLevel = &(numPerLevel[maxLevels]);
1323     for (kmp_uint32 i = 0; i < maxLevels;
1324          ++i) { // init numPerLevel[*] to 1 item per level
1325       numPerLevel[i] = 1;
1326       skipPerLevel[i] = 1;
1327     }
1328 
1329     // Sort table by physical ID
1330     if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1331       deriveLevels();
1332     } else {
1333       numPerLevel[0] = maxLeaves;
1334       numPerLevel[1] = num_addrs / maxLeaves;
1335       if (num_addrs % maxLeaves)
1336         numPerLevel[1]++;
1337     }
1338 
1339     base_num_threads = num_addrs;
1340     for (int i = maxLevels - 1; i >= 0;
1341          --i) // count non-empty levels to get depth
1342       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1343         depth++;
1344 
1345     kmp_uint32 branch = minBranch;
1346     if (numPerLevel[0] == 1)
1347       branch = num_addrs / maxLeaves;
1348     if (branch < minBranch)
1349       branch = minBranch;
1350     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1351       while (numPerLevel[d] > branch ||
1352              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1353         if (numPerLevel[d] & 1)
1354           numPerLevel[d]++;
1355         numPerLevel[d] = numPerLevel[d] >> 1;
1356         if (numPerLevel[d + 1] == 1)
1357           depth++;
1358         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1359       }
1360       if (numPerLevel[0] == 1) {
1361         branch = branch >> 1;
1362         if (branch < 4)
1363           branch = minBranch;
1364       }
1365     }
1366 
1367     for (kmp_uint32 i = 1; i < depth; ++i)
1368       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1369     // Fill in hierarchy in the case of oversubscription
1370     for (kmp_uint32 i = depth; i < maxLevels; ++i)
1371       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1372 
1373     uninitialized = initialized; // One writer
1374   }
1375 
1376   // Resize the hierarchy if nproc changes to something larger than before
1377   void resize(kmp_uint32 nproc) {
1378     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1379     while (bool_result == 0) { // someone else is trying to resize
1380       KMP_CPU_PAUSE();
1381       if (nproc <= base_num_threads) // happy with other thread's resize
1382         return;
1383       else // try to resize
1384         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1385     }
1386     KMP_DEBUG_ASSERT(bool_result != 0);
1387     if (nproc <= base_num_threads)
1388       return; // happy with other thread's resize
1389 
1390     // Calculate new maxLevels
1391     kmp_uint32 old_sz = skipPerLevel[depth - 1];
1392     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1393     // First see if old maxLevels is enough to contain new size
1394     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1395       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1396       numPerLevel[i - 1] *= 2;
1397       old_sz *= 2;
1398       depth++;
1399     }
1400     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1401       while (nproc > old_sz) {
1402         old_sz *= 2;
1403         incs++;
1404         depth++;
1405       }
1406       maxLevels += incs;
1407 
1408       // Resize arrays
1409       kmp_uint32 *old_numPerLevel = numPerLevel;
1410       kmp_uint32 *old_skipPerLevel = skipPerLevel;
1411       numPerLevel = skipPerLevel = NULL;
1412       numPerLevel =
1413           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1414       skipPerLevel = &(numPerLevel[maxLevels]);
1415 
1416       // Copy old elements from old arrays
1417       for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1418         // init numPerLevel[*] to 1 item per level
1419         numPerLevel[i] = old_numPerLevel[i];
1420         skipPerLevel[i] = old_skipPerLevel[i];
1421       }
1422 
1423       // Init new elements in arrays to 1
1424       for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1425         // init numPerLevel[*] to 1 item per level
1426         numPerLevel[i] = 1;
1427         skipPerLevel[i] = 1;
1428       }
1429 
1430       // Free old arrays
1431       __kmp_free(old_numPerLevel);
1432     }
1433 
1434     // Fill in oversubscription levels of hierarchy
1435     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1436       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1437 
1438     base_num_threads = nproc;
1439     resizing = 0; // One writer
1440   }
1441 };
1442 #endif // KMP_AFFINITY_H
1443