xref: /freebsd/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h (revision e64bea71c21eb42e97aa615188ba91f6cce0d36d)
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_HWLOC_ENABLED
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24   class Mask : public KMPAffinity::Mask {
25     hwloc_cpuset_t mask;
26 
27   public:
Mask()28     Mask() {
29       mask = hwloc_bitmap_alloc();
30       this->zero();
31     }
32     Mask(const Mask &other) = delete;
33     Mask &operator=(const Mask &other) = delete;
~Mask()34     ~Mask() { hwloc_bitmap_free(mask); }
set(int i)35     void set(int i) override { hwloc_bitmap_set(mask, i); }
is_set(int i)36     bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
clear(int i)37     void clear(int i) override { hwloc_bitmap_clr(mask, i); }
zero()38     void zero() override { hwloc_bitmap_zero(mask); }
empty()39     bool empty() const override { return hwloc_bitmap_iszero(mask); }
copy(const KMPAffinity::Mask * src)40     void copy(const KMPAffinity::Mask *src) override {
41       const Mask *convert = static_cast<const Mask *>(src);
42       hwloc_bitmap_copy(mask, convert->mask);
43     }
bitwise_and(const KMPAffinity::Mask * rhs)44     void bitwise_and(const KMPAffinity::Mask *rhs) override {
45       const Mask *convert = static_cast<const Mask *>(rhs);
46       hwloc_bitmap_and(mask, mask, convert->mask);
47     }
bitwise_or(const KMPAffinity::Mask * rhs)48     void bitwise_or(const KMPAffinity::Mask *rhs) override {
49       const Mask *convert = static_cast<const Mask *>(rhs);
50       hwloc_bitmap_or(mask, mask, convert->mask);
51     }
bitwise_not()52     void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
is_equal(const KMPAffinity::Mask * rhs)53     bool is_equal(const KMPAffinity::Mask *rhs) const override {
54       const Mask *convert = static_cast<const Mask *>(rhs);
55       return hwloc_bitmap_isequal(mask, convert->mask);
56     }
begin()57     int begin() const override { return hwloc_bitmap_first(mask); }
end()58     int end() const override { return -1; }
next(int previous)59     int next(int previous) const override {
60       return hwloc_bitmap_next(mask, previous);
61     }
get_system_affinity(bool abort_on_error)62     int get_system_affinity(bool abort_on_error) override {
63       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
64                   "Illegal get affinity operation when not capable");
65       long retval =
66           hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
67       if (retval >= 0) {
68         return 0;
69       }
70       int error = errno;
71       if (abort_on_error) {
72         __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
73                     KMP_ERR(error), __kmp_msg_null);
74       }
75       return error;
76     }
set_system_affinity(bool abort_on_error)77     int set_system_affinity(bool abort_on_error) const override {
78       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
79                   "Illegal set affinity operation when not capable");
80       long retval =
81           hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
82       if (retval >= 0) {
83         return 0;
84       }
85       int error = errno;
86       if (abort_on_error) {
87         __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
88                     KMP_ERR(error), __kmp_msg_null);
89       }
90       return error;
91     }
92 #if KMP_OS_WINDOWS
set_process_affinity(bool abort_on_error)93     int set_process_affinity(bool abort_on_error) const override {
94       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
95                   "Illegal set process affinity operation when not capable");
96       int error = 0;
97       const hwloc_topology_support *support =
98           hwloc_topology_get_support(__kmp_hwloc_topology);
99       if (support->cpubind->set_proc_cpubind) {
100         int retval;
101         retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
102                                    HWLOC_CPUBIND_PROCESS);
103         if (retval >= 0)
104           return 0;
105         error = errno;
106         if (abort_on_error)
107           __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
108                       KMP_ERR(error), __kmp_msg_null);
109       }
110       return error;
111     }
112 #endif // KMP_OS_WINDOWS
get_proc_group()113     int get_proc_group() const override {
114       int group = -1;
115 #if KMP_OS_WINDOWS
116       if (__kmp_num_proc_groups == 1) {
117         return 1;
118       }
119       for (int i = 0; i < __kmp_num_proc_groups; i++) {
120         // On windows, the long type is always 32 bits
121         unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
122         unsigned long second_32_bits =
123             hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
124         if (first_32_bits == 0 && second_32_bits == 0) {
125           continue;
126         }
127         if (group >= 0) {
128           return -1;
129         }
130         group = i;
131       }
132 #endif /* KMP_OS_WINDOWS */
133       return group;
134     }
135   };
determine_capable(const char * var)136   void determine_capable(const char *var) override {
137     const hwloc_topology_support *topology_support;
138     if (__kmp_hwloc_topology == NULL) {
139       if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
140         __kmp_hwloc_error = TRUE;
141         if (__kmp_affinity.flags.verbose) {
142           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
143         }
144       }
145       if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
146         __kmp_hwloc_error = TRUE;
147         if (__kmp_affinity.flags.verbose) {
148           KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
149         }
150       }
151     }
152     topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
153     // Is the system capable of setting/getting this thread's affinity?
154     // Also, is topology discovery possible? (pu indicates ability to discover
155     // processing units). And finally, were there no errors when calling any
156     // hwloc_* API functions?
157     if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
158         topology_support->cpubind->get_thisthread_cpubind &&
159         topology_support->discovery->pu && !__kmp_hwloc_error) {
160       // enables affinity according to KMP_AFFINITY_CAPABLE() macro
161       KMP_AFFINITY_ENABLE(TRUE);
162     } else {
163       // indicate that hwloc didn't work and disable affinity
164       __kmp_hwloc_error = TRUE;
165       KMP_AFFINITY_DISABLE();
166     }
167   }
bind_thread(int which)168   void bind_thread(int which) override {
169     KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
170                 "Illegal set affinity operation when not capable");
171     KMPAffinity::Mask *mask;
172     KMP_CPU_ALLOC_ON_STACK(mask);
173     KMP_CPU_ZERO(mask);
174     KMP_CPU_SET(which, mask);
175     __kmp_set_system_affinity(mask, TRUE);
176     KMP_CPU_FREE_FROM_STACK(mask);
177   }
allocate_mask()178   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
deallocate_mask(KMPAffinity::Mask * m)179   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
allocate_mask_array(int num)180   KMPAffinity::Mask *allocate_mask_array(int num) override {
181     return new Mask[num];
182   }
deallocate_mask_array(KMPAffinity::Mask * array)183   void deallocate_mask_array(KMPAffinity::Mask *array) override {
184     Mask *hwloc_array = static_cast<Mask *>(array);
185     delete[] hwloc_array;
186   }
index_mask_array(KMPAffinity::Mask * array,int index)187   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
188                                       int index) override {
189     Mask *hwloc_array = static_cast<Mask *>(array);
190     return &(hwloc_array[index]);
191   }
get_api_type()192   api_type get_api_type() const override { return HWLOC; }
193 };
194 #endif /* KMP_HWLOC_ENABLED */
195 
196 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||     \
197     KMP_OS_AIX
198 #if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change: they are set in stone forever. */
203 #include <sys/syscall.h>
204 #if KMP_ARCH_X86 || KMP_ARCH_ARM
205 #ifndef __NR_sched_setaffinity
206 #define __NR_sched_setaffinity 241
207 #elif __NR_sched_setaffinity != 241
208 #error Wrong code for setaffinity system call.
209 #endif /* __NR_sched_setaffinity */
210 #ifndef __NR_sched_getaffinity
211 #define __NR_sched_getaffinity 242
212 #elif __NR_sched_getaffinity != 242
213 #error Wrong code for getaffinity system call.
214 #endif /* __NR_sched_getaffinity */
215 #elif KMP_ARCH_AARCH64
216 #ifndef __NR_sched_setaffinity
217 #define __NR_sched_setaffinity 122
218 #elif __NR_sched_setaffinity != 122
219 #error Wrong code for setaffinity system call.
220 #endif /* __NR_sched_setaffinity */
221 #ifndef __NR_sched_getaffinity
222 #define __NR_sched_getaffinity 123
223 #elif __NR_sched_getaffinity != 123
224 #error Wrong code for getaffinity system call.
225 #endif /* __NR_sched_getaffinity */
226 #elif KMP_ARCH_X86_64
227 #ifndef __NR_sched_setaffinity
228 #define __NR_sched_setaffinity 203
229 #elif __NR_sched_setaffinity != 203
230 #error Wrong code for setaffinity system call.
231 #endif /* __NR_sched_setaffinity */
232 #ifndef __NR_sched_getaffinity
233 #define __NR_sched_getaffinity 204
234 #elif __NR_sched_getaffinity != 204
235 #error Wrong code for getaffinity system call.
236 #endif /* __NR_sched_getaffinity */
237 #elif KMP_ARCH_PPC64
238 #ifndef __NR_sched_setaffinity
239 #define __NR_sched_setaffinity 222
240 #elif __NR_sched_setaffinity != 222
241 #error Wrong code for setaffinity system call.
242 #endif /* __NR_sched_setaffinity */
243 #ifndef __NR_sched_getaffinity
244 #define __NR_sched_getaffinity 223
245 #elif __NR_sched_getaffinity != 223
246 #error Wrong code for getaffinity system call.
247 #endif /* __NR_sched_getaffinity */
248 #elif KMP_ARCH_MIPS
249 #ifndef __NR_sched_setaffinity
250 #define __NR_sched_setaffinity 4239
251 #elif __NR_sched_setaffinity != 4239
252 #error Wrong code for setaffinity system call.
253 #endif /* __NR_sched_setaffinity */
254 #ifndef __NR_sched_getaffinity
255 #define __NR_sched_getaffinity 4240
256 #elif __NR_sched_getaffinity != 4240
257 #error Wrong code for getaffinity system call.
258 #endif /* __NR_sched_getaffinity */
259 #elif KMP_ARCH_MIPS64
260 #ifndef __NR_sched_setaffinity
261 #define __NR_sched_setaffinity 5195
262 #elif __NR_sched_setaffinity != 5195
263 #error Wrong code for setaffinity system call.
264 #endif /* __NR_sched_setaffinity */
265 #ifndef __NR_sched_getaffinity
266 #define __NR_sched_getaffinity 5196
267 #elif __NR_sched_getaffinity != 5196
268 #error Wrong code for getaffinity system call.
269 #endif /* __NR_sched_getaffinity */
270 #elif KMP_ARCH_LOONGARCH64
271 #ifndef __NR_sched_setaffinity
272 #define __NR_sched_setaffinity 122
273 #elif __NR_sched_setaffinity != 122
274 #error Wrong code for setaffinity system call.
275 #endif /* __NR_sched_setaffinity */
276 #ifndef __NR_sched_getaffinity
277 #define __NR_sched_getaffinity 123
278 #elif __NR_sched_getaffinity != 123
279 #error Wrong code for getaffinity system call.
280 #endif /* __NR_sched_getaffinity */
281 #elif KMP_ARCH_RISCV64
282 #ifndef __NR_sched_setaffinity
283 #define __NR_sched_setaffinity 122
284 #elif __NR_sched_setaffinity != 122
285 #error Wrong code for setaffinity system call.
286 #endif /* __NR_sched_setaffinity */
287 #ifndef __NR_sched_getaffinity
288 #define __NR_sched_getaffinity 123
289 #elif __NR_sched_getaffinity != 123
290 #error Wrong code for getaffinity system call.
291 #endif /* __NR_sched_getaffinity */
292 #elif KMP_ARCH_VE
293 #ifndef __NR_sched_setaffinity
294 #define __NR_sched_setaffinity 203
295 #elif __NR_sched_setaffinity != 203
296 #error Wrong code for setaffinity system call.
297 #endif /* __NR_sched_setaffinity */
298 #ifndef __NR_sched_getaffinity
299 #define __NR_sched_getaffinity 204
300 #elif __NR_sched_getaffinity != 204
301 #error Wrong code for getaffinity system call.
302 #endif /* __NR_sched_getaffinity */
303 #elif KMP_ARCH_S390X
304 #ifndef __NR_sched_setaffinity
305 #define __NR_sched_setaffinity 239
306 #elif __NR_sched_setaffinity != 239
307 #error Wrong code for setaffinity system call.
308 #endif /* __NR_sched_setaffinity */
309 #ifndef __NR_sched_getaffinity
310 #define __NR_sched_getaffinity 240
311 #elif __NR_sched_getaffinity != 240
312 #error Wrong code for getaffinity system call.
313 #endif /* __NR_sched_getaffinity */
314 #elif KMP_ARCH_SPARC
315 #ifndef __NR_sched_setaffinity
316 #define __NR_sched_setaffinity 261
317 #elif __NR_sched_setaffinity != 261
318 #error Wrong code for setaffinity system call.
319 #endif /* __NR_sched_setaffinity */
320 #ifndef __NR_sched_getaffinity
321 #define __NR_sched_getaffinity 260
322 #elif __NR_sched_getaffinity != 260
323 #error Wrong code for getaffinity system call.
324 #endif /* __NR_sched_getaffinity */
325 #else
326 #error Unknown or unsupported architecture
327 #endif /* KMP_ARCH_* */
328 #elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
329 #include <pthread.h>
330 #include <pthread_np.h>
331 #elif KMP_OS_NETBSD
332 #include <pthread.h>
333 #include <sched.h>
334 #elif KMP_OS_AIX
335 #include <sys/dr.h>
336 #include <sys/rset.h>
337 #define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
338 #define GET_NUMBER_SMT_SETS 0x0004
339 extern "C" int syssmt(int flags, int, int, int *);
340 #endif
341 class KMPNativeAffinity : public KMPAffinity {
342   class Mask : public KMPAffinity::Mask {
343     typedef unsigned long mask_t;
344     typedef decltype(__kmp_affin_mask_size) mask_size_type;
345     static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
346     static const mask_t ONE = 1;
get_num_mask_types()347     mask_size_type get_num_mask_types() const {
348       return __kmp_affin_mask_size / sizeof(mask_t);
349     }
350 
351   public:
352     mask_t *mask;
Mask()353     Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
~Mask()354     ~Mask() {
355       if (mask)
356         __kmp_free(mask);
357     }
set(int i)358     void set(int i) override {
359       mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
360     }
is_set(int i)361     bool is_set(int i) const override {
362       return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
363     }
clear(int i)364     void clear(int i) override {
365       mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
366     }
zero()367     void zero() override {
368       mask_size_type e = get_num_mask_types();
369       for (mask_size_type i = 0; i < e; ++i)
370         mask[i] = (mask_t)0;
371     }
empty()372     bool empty() const override {
373       mask_size_type e = get_num_mask_types();
374       for (mask_size_type i = 0; i < e; ++i)
375         if (mask[i] != (mask_t)0)
376           return false;
377       return true;
378     }
copy(const KMPAffinity::Mask * src)379     void copy(const KMPAffinity::Mask *src) override {
380       const Mask *convert = static_cast<const Mask *>(src);
381       mask_size_type e = get_num_mask_types();
382       for (mask_size_type i = 0; i < e; ++i)
383         mask[i] = convert->mask[i];
384     }
bitwise_and(const KMPAffinity::Mask * rhs)385     void bitwise_and(const KMPAffinity::Mask *rhs) override {
386       const Mask *convert = static_cast<const Mask *>(rhs);
387       mask_size_type e = get_num_mask_types();
388       for (mask_size_type i = 0; i < e; ++i)
389         mask[i] &= convert->mask[i];
390     }
bitwise_or(const KMPAffinity::Mask * rhs)391     void bitwise_or(const KMPAffinity::Mask *rhs) override {
392       const Mask *convert = static_cast<const Mask *>(rhs);
393       mask_size_type e = get_num_mask_types();
394       for (mask_size_type i = 0; i < e; ++i)
395         mask[i] |= convert->mask[i];
396     }
bitwise_not()397     void bitwise_not() override {
398       mask_size_type e = get_num_mask_types();
399       for (mask_size_type i = 0; i < e; ++i)
400         mask[i] = ~(mask[i]);
401     }
is_equal(const KMPAffinity::Mask * rhs)402     bool is_equal(const KMPAffinity::Mask *rhs) const override {
403       const Mask *convert = static_cast<const Mask *>(rhs);
404       mask_size_type e = get_num_mask_types();
405       for (mask_size_type i = 0; i < e; ++i)
406         if (mask[i] != convert->mask[i])
407           return false;
408       return true;
409     }
begin()410     int begin() const override {
411       int retval = 0;
412       while (retval < end() && !is_set(retval))
413         ++retval;
414       return retval;
415     }
end()416     int end() const override {
417       int e;
418       __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
419       return e;
420     }
next(int previous)421     int next(int previous) const override {
422       int retval = previous + 1;
423       while (retval < end() && !is_set(retval))
424         ++retval;
425       return retval;
426     }
427 #if KMP_OS_AIX
428     // On AIX, we don't have a way to get CPU(s) a thread is bound to.
429     // This routine is only used to get the full mask.
get_system_affinity(bool abort_on_error)430     int get_system_affinity(bool abort_on_error) override {
431       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
432                   "Illegal get affinity operation when not capable");
433 
434       (void)abort_on_error;
435 
436       // Set the mask with all CPUs that are available.
437       for (int i = 0; i < __kmp_xproc; ++i)
438         KMP_CPU_SET(i, this);
439       return 0;
440     }
set_system_affinity(bool abort_on_error)441     int set_system_affinity(bool abort_on_error) const override {
442       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
443 
444                   "Illegal set affinity operation when not capable");
445 
446       int location;
447       int gtid = __kmp_entry_gtid();
448       int tid = thread_self();
449 
450       // Unbind the thread if it was bound to any processors before so that
451       // we can bind the thread to CPUs specified by the mask not others.
452       int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
453 
454       // On AIX, we can only bind to one instead of a set of CPUs with the
455       // bindprocessor() system call.
456       KMP_CPU_SET_ITERATE(location, this) {
457         if (KMP_CPU_ISSET(location, this)) {
458           retval = bindprocessor(BINDTHREAD, tid, location);
459           if (retval == -1 && errno == 1) {
460             rsid_t rsid;
461             rsethandle_t rsh;
462             // Put something in rsh to prevent compiler warning
463             // about uninitalized use
464             rsh = rs_alloc(RS_EMPTY);
465             rsid.at_pid = getpid();
466             if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
467               retval = ra_detachrset(R_PROCESS, rsid, 0);
468               retval = bindprocessor(BINDTHREAD, tid, location);
469             }
470           }
471           if (retval == 0) {
472             KA_TRACE(10, ("__kmp_set_system_affinity:  Done binding "
473                           "T#%d to cpu=%d.\n",
474                           gtid, location));
475             continue;
476           }
477           int error = errno;
478           if (abort_on_error) {
479             __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
480                         KMP_ERR(error), __kmp_msg_null);
481             KA_TRACE(10, ("__kmp_set_system_affinity:  Error binding "
482                           "T#%d to cpu=%d, errno=%d.\n",
483                           gtid, location, error));
484             return error;
485           }
486         }
487       }
488       return 0;
489     }
490 #else // !KMP_OS_AIX
get_system_affinity(bool abort_on_error)491     int get_system_affinity(bool abort_on_error) override {
492       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
493                   "Illegal get affinity operation when not capable");
494 #if KMP_OS_LINUX
495       long retval =
496           syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
497 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
498       int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
499                                      reinterpret_cast<cpuset_t *>(mask));
500       int retval = (r == 0 ? 0 : -1);
501 #endif
502       if (retval >= 0) {
503         return 0;
504       }
505       int error = errno;
506       if (abort_on_error) {
507         __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
508                     KMP_ERR(error), __kmp_msg_null);
509       }
510       return error;
511     }
set_system_affinity(bool abort_on_error)512     int set_system_affinity(bool abort_on_error) const override {
513       KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
514                   "Illegal set affinity operation when not capable");
515 #if KMP_OS_LINUX
516       long retval =
517           syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
518 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
519       int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
520                                      reinterpret_cast<cpuset_t *>(mask));
521       int retval = (r == 0 ? 0 : -1);
522 #endif
523       if (retval >= 0) {
524         return 0;
525       }
526       int error = errno;
527       if (abort_on_error) {
528         __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
529                     KMP_ERR(error), __kmp_msg_null);
530       }
531       return error;
532     }
533 #endif // KMP_OS_AIX
534   };
determine_capable(const char * env_var)535   void determine_capable(const char *env_var) override {
536     __kmp_affinity_determine_capable(env_var);
537   }
bind_thread(int which)538   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
allocate_mask()539   KMPAffinity::Mask *allocate_mask() override {
540     KMPNativeAffinity::Mask *retval = new Mask();
541     return retval;
542   }
deallocate_mask(KMPAffinity::Mask * m)543   void deallocate_mask(KMPAffinity::Mask *m) override {
544     KMPNativeAffinity::Mask *native_mask =
545         static_cast<KMPNativeAffinity::Mask *>(m);
546     delete native_mask;
547   }
allocate_mask_array(int num)548   KMPAffinity::Mask *allocate_mask_array(int num) override {
549     return new Mask[num];
550   }
deallocate_mask_array(KMPAffinity::Mask * array)551   void deallocate_mask_array(KMPAffinity::Mask *array) override {
552     Mask *linux_array = static_cast<Mask *>(array);
553     delete[] linux_array;
554   }
index_mask_array(KMPAffinity::Mask * array,int index)555   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
556                                       int index) override {
557     Mask *linux_array = static_cast<Mask *>(array);
558     return &(linux_array[index]);
559   }
get_api_type()560   api_type get_api_type() const override { return NATIVE_OS; }
561 };
562 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY  \
563           || KMP_OS_AIX */
564 
565 #if KMP_OS_WINDOWS
566 class KMPNativeAffinity : public KMPAffinity {
567   class Mask : public KMPAffinity::Mask {
568     typedef ULONG_PTR mask_t;
569     static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
570     mask_t *mask;
571 
572   public:
Mask()573     Mask() {
574       mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
575     }
~Mask()576     ~Mask() {
577       if (mask)
578         __kmp_free(mask);
579     }
set(int i)580     void set(int i) override {
581       mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
582     }
is_set(int i)583     bool is_set(int i) const override {
584       return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
585     }
clear(int i)586     void clear(int i) override {
587       mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
588     }
zero()589     void zero() override {
590       for (int i = 0; i < __kmp_num_proc_groups; ++i)
591         mask[i] = 0;
592     }
empty()593     bool empty() const override {
594       for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
595         if (mask[i])
596           return false;
597       return true;
598     }
copy(const KMPAffinity::Mask * src)599     void copy(const KMPAffinity::Mask *src) override {
600       const Mask *convert = static_cast<const Mask *>(src);
601       for (int i = 0; i < __kmp_num_proc_groups; ++i)
602         mask[i] = convert->mask[i];
603     }
bitwise_and(const KMPAffinity::Mask * rhs)604     void bitwise_and(const KMPAffinity::Mask *rhs) override {
605       const Mask *convert = static_cast<const Mask *>(rhs);
606       for (int i = 0; i < __kmp_num_proc_groups; ++i)
607         mask[i] &= convert->mask[i];
608     }
bitwise_or(const KMPAffinity::Mask * rhs)609     void bitwise_or(const KMPAffinity::Mask *rhs) override {
610       const Mask *convert = static_cast<const Mask *>(rhs);
611       for (int i = 0; i < __kmp_num_proc_groups; ++i)
612         mask[i] |= convert->mask[i];
613     }
bitwise_not()614     void bitwise_not() override {
615       for (int i = 0; i < __kmp_num_proc_groups; ++i)
616         mask[i] = ~(mask[i]);
617     }
is_equal(const KMPAffinity::Mask * rhs)618     bool is_equal(const KMPAffinity::Mask *rhs) const override {
619       const Mask *convert = static_cast<const Mask *>(rhs);
620       for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
621         if (mask[i] != convert->mask[i])
622           return false;
623       return true;
624     }
begin()625     int begin() const override {
626       int retval = 0;
627       while (retval < end() && !is_set(retval))
628         ++retval;
629       return retval;
630     }
end()631     int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
next(int previous)632     int next(int previous) const override {
633       int retval = previous + 1;
634       while (retval < end() && !is_set(retval))
635         ++retval;
636       return retval;
637     }
set_process_affinity(bool abort_on_error)638     int set_process_affinity(bool abort_on_error) const override {
639       if (__kmp_num_proc_groups <= 1) {
640         if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
641           DWORD error = GetLastError();
642           if (abort_on_error) {
643             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
644                         __kmp_msg_null);
645           }
646           return error;
647         }
648       }
649       return 0;
650     }
set_system_affinity(bool abort_on_error)651     int set_system_affinity(bool abort_on_error) const override {
652       if (__kmp_num_proc_groups > 1) {
653         // Check for a valid mask.
654         GROUP_AFFINITY ga;
655         int group = get_proc_group();
656         if (group < 0) {
657           if (abort_on_error) {
658             KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
659           }
660           return -1;
661         }
662         // Transform the bit vector into a GROUP_AFFINITY struct
663         // and make the system call to set affinity.
664         ga.Group = group;
665         ga.Mask = mask[group];
666         ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
667 
668         KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
669         if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
670           DWORD error = GetLastError();
671           if (abort_on_error) {
672             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
673                         __kmp_msg_null);
674           }
675           return error;
676         }
677       } else {
678         if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
679           DWORD error = GetLastError();
680           if (abort_on_error) {
681             __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
682                         __kmp_msg_null);
683           }
684           return error;
685         }
686       }
687       return 0;
688     }
get_system_affinity(bool abort_on_error)689     int get_system_affinity(bool abort_on_error) override {
690       if (__kmp_num_proc_groups > 1) {
691         this->zero();
692         GROUP_AFFINITY ga;
693         KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
694         if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
695           DWORD error = GetLastError();
696           if (abort_on_error) {
697             __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
698                         KMP_ERR(error), __kmp_msg_null);
699           }
700           return error;
701         }
702         if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
703             (ga.Mask == 0)) {
704           return -1;
705         }
706         mask[ga.Group] = ga.Mask;
707       } else {
708         mask_t newMask, sysMask, retval;
709         if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
710           DWORD error = GetLastError();
711           if (abort_on_error) {
712             __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
713                         KMP_ERR(error), __kmp_msg_null);
714           }
715           return error;
716         }
717         retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
718         if (!retval) {
719           DWORD error = GetLastError();
720           if (abort_on_error) {
721             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
722                         KMP_ERR(error), __kmp_msg_null);
723           }
724           return error;
725         }
726         newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
727         if (!newMask) {
728           DWORD error = GetLastError();
729           if (abort_on_error) {
730             __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
731                         KMP_ERR(error), __kmp_msg_null);
732           }
733         }
734         *mask = retval;
735       }
736       return 0;
737     }
get_proc_group()738     int get_proc_group() const override {
739       int group = -1;
740       if (__kmp_num_proc_groups == 1) {
741         return 1;
742       }
743       for (int i = 0; i < __kmp_num_proc_groups; i++) {
744         if (mask[i] == 0)
745           continue;
746         if (group >= 0)
747           return -1;
748         group = i;
749       }
750       return group;
751     }
752   };
determine_capable(const char * env_var)753   void determine_capable(const char *env_var) override {
754     __kmp_affinity_determine_capable(env_var);
755   }
bind_thread(int which)756   void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
allocate_mask()757   KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
deallocate_mask(KMPAffinity::Mask * m)758   void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
allocate_mask_array(int num)759   KMPAffinity::Mask *allocate_mask_array(int num) override {
760     return new Mask[num];
761   }
deallocate_mask_array(KMPAffinity::Mask * array)762   void deallocate_mask_array(KMPAffinity::Mask *array) override {
763     Mask *windows_array = static_cast<Mask *>(array);
764     delete[] windows_array;
765   }
index_mask_array(KMPAffinity::Mask * array,int index)766   KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
767                                       int index) override {
768     Mask *windows_array = static_cast<Mask *>(array);
769     return &(windows_array[index]);
770   }
get_api_type()771   api_type get_api_type() const override { return NATIVE_OS; }
772 };
773 #endif /* KMP_OS_WINDOWS */
774 #endif /* KMP_AFFINITY_SUPPORTED */
775 
776 // Describe an attribute for a level in the machine topology
777 struct kmp_hw_attr_t {
778   int core_type : 8;
779   int core_eff : 8;
780   unsigned valid : 1;
781   unsigned reserved : 15;
782 
783   static const int UNKNOWN_CORE_EFF = -1;
784 
kmp_hw_attr_tkmp_hw_attr_t785   kmp_hw_attr_t()
786       : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
787         valid(0), reserved(0) {}
set_core_typekmp_hw_attr_t788   void set_core_type(kmp_hw_core_type_t type) {
789     valid = 1;
790     core_type = type;
791   }
set_core_effkmp_hw_attr_t792   void set_core_eff(int eff) {
793     valid = 1;
794     core_eff = eff;
795   }
get_core_typekmp_hw_attr_t796   kmp_hw_core_type_t get_core_type() const {
797     return (kmp_hw_core_type_t)core_type;
798   }
get_core_effkmp_hw_attr_t799   int get_core_eff() const { return core_eff; }
is_core_type_validkmp_hw_attr_t800   bool is_core_type_valid() const {
801     return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
802   }
is_core_eff_validkmp_hw_attr_t803   bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
804   operator bool() const { return valid; }
clearkmp_hw_attr_t805   void clear() {
806     core_type = KMP_HW_CORE_TYPE_UNKNOWN;
807     core_eff = UNKNOWN_CORE_EFF;
808     valid = 0;
809   }
containskmp_hw_attr_t810   bool contains(const kmp_hw_attr_t &other) const {
811     if (!valid && !other.valid)
812       return true;
813     if (valid && other.valid) {
814       if (other.is_core_type_valid()) {
815         if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
816           return false;
817       }
818       if (other.is_core_eff_valid()) {
819         if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
820           return false;
821       }
822       return true;
823     }
824     return false;
825   }
826 #if KMP_AFFINITY_SUPPORTED
containskmp_hw_attr_t827   bool contains(const kmp_affinity_attrs_t &attr) const {
828     if (!valid && !attr.valid)
829       return true;
830     if (valid && attr.valid) {
831       if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
832         return (is_core_type_valid() &&
833                 (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
834       if (attr.core_eff != UNKNOWN_CORE_EFF)
835         return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
836       return true;
837     }
838     return false;
839   }
840 #endif // KMP_AFFINITY_SUPPORTED
841   bool operator==(const kmp_hw_attr_t &rhs) const {
842     return (rhs.valid == valid && rhs.core_eff == core_eff &&
843             rhs.core_type == core_type);
844   }
845   bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
846 };
847 
848 #if KMP_AFFINITY_SUPPORTED
849 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
850 #endif
851 
852 class kmp_hw_thread_t {
853 public:
854   static const int UNKNOWN_ID = -1;
855   static const int MULTIPLE_ID = -2;
856   static int compare_ids(const void *a, const void *b);
857   static int compare_compact(const void *a, const void *b);
858   int ids[KMP_HW_LAST];
859   int sub_ids[KMP_HW_LAST];
860   bool leader;
861   int os_id;
862   int original_idx;
863   kmp_hw_attr_t attrs;
864 
865   void print() const;
clear()866   void clear() {
867     for (int i = 0; i < (int)KMP_HW_LAST; ++i)
868       ids[i] = UNKNOWN_ID;
869     leader = false;
870     attrs.clear();
871   }
872 };
873 
874 class kmp_topology_t {
875 
876   struct flags_t {
877     int uniform : 1;
878     int reserved : 31;
879   };
880 
881   int depth;
882 
883   // The following arrays are all 'depth' long and have been
884   // allocated to hold up to KMP_HW_LAST number of objects if
885   // needed so layers can be added without reallocation of any array
886 
887   // Orderd array of the types in the topology
888   kmp_hw_t *types;
889 
890   // Keep quick topology ratios, for non-uniform topologies,
891   // this ratio holds the max number of itemAs per itemB
892   // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
893   int *ratio;
894 
895   // Storage containing the absolute number of each topology layer
896   int *count;
897 
898   // The number of core efficiencies. This is only useful for hybrid
899   // topologies. Core efficiencies will range from 0 to num efficiencies - 1
900   int num_core_efficiencies;
901   int num_core_types;
902   kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
903 
904   // The hardware threads array
905   // hw_threads is num_hw_threads long
906   // Each hw_thread's ids and sub_ids are depth deep
907   int num_hw_threads;
908   kmp_hw_thread_t *hw_threads;
909 
910   // Equivalence hash where the key is the hardware topology item
911   // and the value is the equivalent hardware topology type in the
912   // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
913   // known equivalence for the topology type
914   kmp_hw_t equivalent[KMP_HW_LAST];
915 
916   // Flags describing the topology
917   flags_t flags;
918 
919   // Compact value used during sort_compact()
920   int compact;
921 
922 #if KMP_GROUP_AFFINITY
923   // Insert topology information about Windows Processor groups
924   void _insert_windows_proc_groups();
925 #endif
926 
927   // Count each item & get the num x's per y
928   // e.g., get the number of cores and the number of threads per core
929   // for each (x, y) in (KMP_HW_* , KMP_HW_*)
930   void _gather_enumeration_information();
931 
932   // Remove layers that don't add information to the topology.
933   // This is done by having the layer take on the id = UNKNOWN_ID (-1)
934   void _remove_radix1_layers();
935 
936   // Find out if the topology is uniform
937   void _discover_uniformity();
938 
939   // Set all the sub_ids for each hardware thread
940   void _set_sub_ids();
941 
942   // Set global affinity variables describing the number of threads per
943   // core, the number of packages, the number of cores per package, and
944   // the number of cores.
945   void _set_globals();
946 
947   // Set the last level cache equivalent type
948   void _set_last_level_cache();
949 
950   // Return the number of cores with a particular attribute, 'attr'.
951   // If 'find_all' is true, then find all cores on the machine, otherwise find
952   // all cores per the layer 'above'
953   int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
954                             bool find_all = false) const;
955 
956 public:
957   // Force use of allocate()/deallocate()
958   kmp_topology_t() = delete;
959   kmp_topology_t(const kmp_topology_t &t) = delete;
960   kmp_topology_t(kmp_topology_t &&t) = delete;
961   kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
962   kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
963 
964   static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
965   static void deallocate(kmp_topology_t *);
966 
967   // Functions used in create_map() routines
at(int index)968   kmp_hw_thread_t &at(int index) {
969     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
970     return hw_threads[index];
971   }
at(int index)972   const kmp_hw_thread_t &at(int index) const {
973     KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
974     return hw_threads[index];
975   }
get_num_hw_threads()976   int get_num_hw_threads() const { return num_hw_threads; }
sort_ids()977   void sort_ids() {
978     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
979           kmp_hw_thread_t::compare_ids);
980   }
981 
982   // Insert a new topology layer after allocation
983   void insert_layer(kmp_hw_t type, const int *ids);
984 
985   // Check if the hardware ids are unique, if they are
986   // return true, otherwise return false
987   bool check_ids() const;
988 
989   // Function to call after the create_map() routine
990   void canonicalize();
991   void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
992 
993 // Functions used after canonicalize() called
994 
995 #if KMP_AFFINITY_SUPPORTED
996   // Set the granularity for affinity settings
997   void set_granularity(kmp_affinity_t &stgs) const;
998   bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
999   bool restrict_to_mask(const kmp_affin_mask_t *mask);
1000   bool filter_hw_subset();
1001 #endif
is_uniform()1002   bool is_uniform() const { return flags.uniform; }
1003   // Tell whether a type is a valid type in the topology
1004   // returns KMP_HW_UNKNOWN when there is no equivalent type
get_equivalent_type(kmp_hw_t type)1005   kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
1006     if (type == KMP_HW_UNKNOWN)
1007       return KMP_HW_UNKNOWN;
1008     return equivalent[type];
1009   }
1010   // Set type1 = type2
set_equivalent_type(kmp_hw_t type1,kmp_hw_t type2)1011   void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
1012     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
1013     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
1014     kmp_hw_t real_type2 = equivalent[type2];
1015     if (real_type2 == KMP_HW_UNKNOWN)
1016       real_type2 = type2;
1017     equivalent[type1] = real_type2;
1018     // This loop is required since any of the types may have been set to
1019     // be equivalent to type1.  They all must be checked and reset to type2.
1020     KMP_FOREACH_HW_TYPE(type) {
1021       if (equivalent[type] == type1) {
1022         equivalent[type] = real_type2;
1023       }
1024     }
1025   }
1026   // Calculate number of types corresponding to level1
1027   // per types corresponding to level2 (e.g., number of threads per core)
calculate_ratio(int level1,int level2)1028   int calculate_ratio(int level1, int level2) const {
1029     KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1030     KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1031     int r = 1;
1032     for (int level = level1; level > level2; --level)
1033       r *= ratio[level];
1034     return r;
1035   }
get_ratio(int level)1036   int get_ratio(int level) const {
1037     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1038     return ratio[level];
1039   }
get_depth()1040   int get_depth() const { return depth; };
get_type(int level)1041   kmp_hw_t get_type(int level) const {
1042     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1043     return types[level];
1044   }
get_level(kmp_hw_t type)1045   int get_level(kmp_hw_t type) const {
1046     KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1047     int eq_type = equivalent[type];
1048     if (eq_type == KMP_HW_UNKNOWN)
1049       return -1;
1050     for (int i = 0; i < depth; ++i)
1051       if (types[i] == eq_type)
1052         return i;
1053     return -1;
1054   }
get_count(int level)1055   int get_count(int level) const {
1056     KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1057     return count[level];
1058   }
1059   // Return the total number of cores with attribute 'attr'
get_ncores_with_attr(const kmp_hw_attr_t & attr)1060   int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1061     return _get_ncores_with_attr(attr, -1, true);
1062   }
1063   // Return the number of cores with attribute
1064   // 'attr' per topology level 'above'
get_ncores_with_attr_per(const kmp_hw_attr_t & attr,int above)1065   int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1066     return _get_ncores_with_attr(attr, above, false);
1067   }
1068 
1069 #if KMP_AFFINITY_SUPPORTED
1070   friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
sort_compact(kmp_affinity_t & affinity)1071   void sort_compact(kmp_affinity_t &affinity) {
1072     compact = affinity.compact;
1073     qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1074           kmp_hw_thread_t::compare_compact);
1075   }
1076 #endif
1077   void print(const char *env_var = "KMP_AFFINITY") const;
1078   void dump() const;
1079 };
1080 extern kmp_topology_t *__kmp_topology;
1081 
1082 class kmp_hw_subset_t {
1083   const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1084 
1085 public:
1086   // Describe a machine topology item in KMP_HW_SUBSET
1087   struct item_t {
1088     kmp_hw_t type;
1089     int num_attrs;
1090     int num[MAX_ATTRS];
1091     int offset[MAX_ATTRS];
1092     kmp_hw_attr_t attr[MAX_ATTRS];
1093   };
1094   // Put parenthesis around max to avoid accidental use of Windows max macro.
1095   const static int USE_ALL = (std::numeric_limits<int>::max)();
1096 
1097 private:
1098   int depth;
1099   int capacity;
1100   item_t *items;
1101   kmp_uint64 set;
1102   bool absolute;
1103   // The set must be able to handle up to KMP_HW_LAST number of layers
1104   KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1105   // Sorting the KMP_HW_SUBSET items to follow topology order
1106   // All unknown topology types will be at the beginning of the subset
hw_subset_compare(const void * i1,const void * i2)1107   static int hw_subset_compare(const void *i1, const void *i2) {
1108     kmp_hw_t type1 = ((const item_t *)i1)->type;
1109     kmp_hw_t type2 = ((const item_t *)i2)->type;
1110     int level1 = __kmp_topology->get_level(type1);
1111     int level2 = __kmp_topology->get_level(type2);
1112     return level1 - level2;
1113   }
1114 
1115 public:
1116   // Force use of allocate()/deallocate()
1117   kmp_hw_subset_t() = delete;
1118   kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1119   kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1120   kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1121   kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1122 
allocate()1123   static kmp_hw_subset_t *allocate() {
1124     int initial_capacity = 5;
1125     kmp_hw_subset_t *retval =
1126         (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1127     retval->depth = 0;
1128     retval->capacity = initial_capacity;
1129     retval->set = 0ull;
1130     retval->absolute = false;
1131     retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1132     return retval;
1133   }
deallocate(kmp_hw_subset_t * subset)1134   static void deallocate(kmp_hw_subset_t *subset) {
1135     __kmp_free(subset->items);
1136     __kmp_free(subset);
1137   }
set_absolute()1138   void set_absolute() { absolute = true; }
is_absolute()1139   bool is_absolute() const { return absolute; }
push_back(int num,kmp_hw_t type,int offset,kmp_hw_attr_t attr)1140   void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1141     for (int i = 0; i < depth; ++i) {
1142       // Found an existing item for this layer type
1143       // Add the num, offset, and attr to this item
1144       if (items[i].type == type) {
1145         int idx = items[i].num_attrs++;
1146         if ((size_t)idx >= MAX_ATTRS)
1147           return;
1148         items[i].num[idx] = num;
1149         items[i].offset[idx] = offset;
1150         items[i].attr[idx] = attr;
1151         return;
1152       }
1153     }
1154     if (depth == capacity - 1) {
1155       capacity *= 2;
1156       item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1157       for (int i = 0; i < depth; ++i)
1158         new_items[i] = items[i];
1159       __kmp_free(items);
1160       items = new_items;
1161     }
1162     items[depth].num_attrs = 1;
1163     items[depth].type = type;
1164     items[depth].num[0] = num;
1165     items[depth].offset[0] = offset;
1166     items[depth].attr[0] = attr;
1167     depth++;
1168     set |= (1ull << type);
1169   }
get_depth()1170   int get_depth() const { return depth; }
at(int index)1171   const item_t &at(int index) const {
1172     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1173     return items[index];
1174   }
at(int index)1175   item_t &at(int index) {
1176     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1177     return items[index];
1178   }
remove(int index)1179   void remove(int index) {
1180     KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1181     set &= ~(1ull << items[index].type);
1182     for (int j = index + 1; j < depth; ++j) {
1183       items[j - 1] = items[j];
1184     }
1185     depth--;
1186   }
sort()1187   void sort() {
1188     KMP_DEBUG_ASSERT(__kmp_topology);
1189     qsort(items, depth, sizeof(item_t), hw_subset_compare);
1190   }
specified(kmp_hw_t type)1191   bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1192 
1193   // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
1194   // This means putting each of {sockets, cores, threads} in the topology if
1195   // they are not specified:
1196   // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
1197   // e.g., 3module => *s,3module,*c,*t
1198   // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
1199   // are expecting the traditional sockets/cores/threads topology. For newer
1200   // hardware, there can be intervening layers like dies/tiles/modules
1201   // (usually corresponding to a cache level). So when a user asks for
1202   // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
1203   // should get 12 hardware threads across 6 cores and effectively ignore the
1204   // module layer.
canonicalize(const kmp_topology_t * top)1205   void canonicalize(const kmp_topology_t *top) {
1206     // Layers to target for KMP_HW_SUBSET canonicalization
1207     kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1208 
1209     // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
1210     if (is_absolute())
1211       return;
1212 
1213     // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
1214     // topology doesn't have these layers
1215     for (kmp_hw_t type : targeted)
1216       if (top->get_level(type) == KMP_HW_UNKNOWN)
1217         return;
1218 
1219     // Put targeted layers in topology if they do not exist
1220     for (kmp_hw_t type : targeted) {
1221       bool found = false;
1222       for (int i = 0; i < get_depth(); ++i) {
1223         if (top->get_equivalent_type(items[i].type) == type) {
1224           found = true;
1225           break;
1226         }
1227       }
1228       if (!found) {
1229         push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
1230       }
1231     }
1232     sort();
1233     // Set as an absolute topology that only targets the targeted layers
1234     set_absolute();
1235   }
dump()1236   void dump() const {
1237     printf("**********************\n");
1238     printf("*** kmp_hw_subset: ***\n");
1239     printf("* depth: %d\n", depth);
1240     printf("* items:\n");
1241     for (int i = 0; i < depth; ++i) {
1242       printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1243       for (int j = 0; j < items[i].num_attrs; ++j) {
1244         printf("  num: %d, offset: %d, attr: ", items[i].num[j],
1245                items[i].offset[j]);
1246         if (!items[i].attr[j]) {
1247           printf(" (none)\n");
1248         } else {
1249           printf(
1250               " core_type = %s, core_eff = %d\n",
1251               __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1252               items[i].attr[j].get_core_eff());
1253         }
1254       }
1255     }
1256     printf("* set: 0x%llx\n", set);
1257     printf("* absolute: %d\n", absolute);
1258     printf("**********************\n");
1259   }
1260 };
1261 extern kmp_hw_subset_t *__kmp_hw_subset;
1262 
1263 /* A structure for holding machine-specific hierarchy info to be computed once
1264    at init. This structure represents a mapping of threads to the actual machine
1265    hierarchy, or to our best guess at what the hierarchy might be, for the
1266    purpose of performing an efficient barrier. In the worst case, when there is
1267    no machine hierarchy information, it produces a tree suitable for a barrier,
1268    similar to the tree used in the hyper barrier. */
1269 class hierarchy_info {
1270 public:
1271   /* Good default values for number of leaves and branching factor, given no
1272      affinity information. Behaves a bit like hyper barrier. */
1273   static const kmp_uint32 maxLeaves = 4;
1274   static const kmp_uint32 minBranch = 4;
1275   /** Number of levels in the hierarchy. Typical levels are threads/core,
1276       cores/package or socket, packages/node, nodes/machine, etc. We don't want
1277       to get specific with nomenclature. When the machine is oversubscribed we
1278       add levels to duplicate the hierarchy, doubling the thread capacity of the
1279       hierarchy each time we add a level. */
1280   kmp_uint32 maxLevels;
1281 
1282   /** This is specifically the depth of the machine configuration hierarchy, in
1283       terms of the number of levels along the longest path from root to any
1284       leaf. It corresponds to the number of entries in numPerLevel if we exclude
1285       all but one trailing 1. */
1286   kmp_uint32 depth;
1287   kmp_uint32 base_num_threads = 0;
1288   enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1289   volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1290   // 2=initialization in progress
1291   volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1292 
1293   /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
1294       the parent of a node at level i has. For example, if we have a machine
1295       with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
1296       {2, 4, 4, 1, 1}. All empty levels are set to 1. */
1297   kmp_uint32 *numPerLevel = nullptr;
1298   kmp_uint32 *skipPerLevel = nullptr;
1299 
deriveLevels()1300   void deriveLevels() {
1301     int hier_depth = __kmp_topology->get_depth();
1302     for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1303       numPerLevel[level] = __kmp_topology->get_ratio(i);
1304     }
1305   }
1306 
hierarchy_info()1307   hierarchy_info()
1308       : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1309 
fini()1310   void fini() {
1311     if (!uninitialized && numPerLevel) {
1312       __kmp_free(numPerLevel);
1313       numPerLevel = NULL;
1314       uninitialized = not_initialized;
1315     }
1316   }
1317 
init(int num_addrs)1318   void init(int num_addrs) {
1319     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1320         &uninitialized, not_initialized, initializing);
1321     if (bool_result == 0) { // Wait for initialization
1322       while (TCR_1(uninitialized) != initialized)
1323         KMP_CPU_PAUSE();
1324       return;
1325     }
1326     KMP_DEBUG_ASSERT(bool_result == 1);
1327 
1328     /* Added explicit initialization of the data fields here to prevent usage of
1329        dirty value observed when static library is re-initialized multiple times
1330        (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
1331        OpenMP). */
1332     depth = 1;
1333     resizing = 0;
1334     maxLevels = 7;
1335     numPerLevel =
1336         (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1337     skipPerLevel = &(numPerLevel[maxLevels]);
1338     for (kmp_uint32 i = 0; i < maxLevels;
1339          ++i) { // init numPerLevel[*] to 1 item per level
1340       numPerLevel[i] = 1;
1341       skipPerLevel[i] = 1;
1342     }
1343 
1344     // Sort table by physical ID
1345     if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1346       deriveLevels();
1347     } else {
1348       numPerLevel[0] = maxLeaves;
1349       numPerLevel[1] = num_addrs / maxLeaves;
1350       if (num_addrs % maxLeaves)
1351         numPerLevel[1]++;
1352     }
1353 
1354     base_num_threads = num_addrs;
1355     for (int i = maxLevels - 1; i >= 0;
1356          --i) // count non-empty levels to get depth
1357       if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1358         depth++;
1359 
1360     kmp_uint32 branch = minBranch;
1361     if (numPerLevel[0] == 1)
1362       branch = num_addrs / maxLeaves;
1363     if (branch < minBranch)
1364       branch = minBranch;
1365     for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1366       while (numPerLevel[d] > branch ||
1367              (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1368         if (numPerLevel[d] & 1)
1369           numPerLevel[d]++;
1370         numPerLevel[d] = numPerLevel[d] >> 1;
1371         if (numPerLevel[d + 1] == 1)
1372           depth++;
1373         numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1374       }
1375       if (numPerLevel[0] == 1) {
1376         branch = branch >> 1;
1377         if (branch < 4)
1378           branch = minBranch;
1379       }
1380     }
1381 
1382     for (kmp_uint32 i = 1; i < depth; ++i)
1383       skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1384     // Fill in hierarchy in the case of oversubscription
1385     for (kmp_uint32 i = depth; i < maxLevels; ++i)
1386       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1387 
1388     uninitialized = initialized; // One writer
1389   }
1390 
1391   // Resize the hierarchy if nproc changes to something larger than before
resize(kmp_uint32 nproc)1392   void resize(kmp_uint32 nproc) {
1393     kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1394     while (bool_result == 0) { // someone else is trying to resize
1395       KMP_CPU_PAUSE();
1396       if (nproc <= base_num_threads) // happy with other thread's resize
1397         return;
1398       else // try to resize
1399         bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1400     }
1401     KMP_DEBUG_ASSERT(bool_result != 0);
1402     if (nproc <= base_num_threads)
1403       return; // happy with other thread's resize
1404 
1405     // Calculate new maxLevels
1406     kmp_uint32 old_sz = skipPerLevel[depth - 1];
1407     kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1408     // First see if old maxLevels is enough to contain new size
1409     for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1410       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1411       numPerLevel[i - 1] *= 2;
1412       old_sz *= 2;
1413       depth++;
1414     }
1415     if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1416       while (nproc > old_sz) {
1417         old_sz *= 2;
1418         incs++;
1419         depth++;
1420       }
1421       maxLevels += incs;
1422 
1423       // Resize arrays
1424       kmp_uint32 *old_numPerLevel = numPerLevel;
1425       kmp_uint32 *old_skipPerLevel = skipPerLevel;
1426       numPerLevel = skipPerLevel = NULL;
1427       numPerLevel =
1428           (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1429       skipPerLevel = &(numPerLevel[maxLevels]);
1430 
1431       // Copy old elements from old arrays
1432       for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1433         // init numPerLevel[*] to 1 item per level
1434         numPerLevel[i] = old_numPerLevel[i];
1435         skipPerLevel[i] = old_skipPerLevel[i];
1436       }
1437 
1438       // Init new elements in arrays to 1
1439       for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1440         // init numPerLevel[*] to 1 item per level
1441         numPerLevel[i] = 1;
1442         skipPerLevel[i] = 1;
1443       }
1444 
1445       // Free old arrays
1446       __kmp_free(old_numPerLevel);
1447     }
1448 
1449     // Fill in oversubscription levels of hierarchy
1450     for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1451       skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1452 
1453     base_num_threads = nproc;
1454     resizing = 0; // One writer
1455   }
1456 };
1457 #endif // KMP_AFFINITY_H
1458